diff --git a/tools/bazel.rc b/.bazelrc
similarity index 95%
rename from tools/bazel.rc
rename to .bazelrc
index 1fdf51f53e29c7111cf89c016400b710051cf9c6..cd7e13ddfc146208f79be900917b05b694869d72 100644
--- a/tools/bazel.rc
+++ b/.bazelrc
@@ -76,7 +76,6 @@ build:nonccl --define=no_nccl_support=true
 
 build --define=use_fast_cpp_protos=true
 build --define=allow_oversize_protos=true
-build --define=grpc_no_ares=true
 
 build --spawn_strategy=standalone
 build --genrule_strategy=standalone
@@ -93,3 +92,11 @@ build:dynamic_kernels --copt=-DAUTOLOAD_DYNAMIC_KERNELS
 build --define=PREFIX=/usr
 build --define=LIBDIR=$(PREFIX)/lib
 build --define=INCLUDEDIR=$(PREFIX)/include
+
+# Default options should come above this line
+
+# Options from ./configure
+try-import %workspace%/.tf_configure.bazelrc
+
+# Put user-specific options in .bazelrc.user
+try-import %workspace%/.bazelrc.user
diff --git a/.gitignore b/.gitignore
index 90324058600bee46af56e49028977971848a80de..e1d352c238a1b2d4febe0f5d4a30cfa0c942f7e7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,7 @@
 .DS_Store
 .ipynb_checkpoints
 node_modules
-/.bazelrc
+/.bazelrc.user
 /.tf_configure.bazelrc
 /bazel-*
 /bazel_pip
diff --git a/CODEOWNERS b/CODEOWNERS
index 54a61a4d72c40d297d90d53e223f64f813d9167d..cb3fa2312405ce44d5dfc30ea4164740f436e07e 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -1,7 +1,7 @@
 # Where component owners are known, add them here.
 
 /tenosrflow/core/debug @caisq
-/tensorflow/core/nccl/ @azaks @csigg
+/tensorflow/core/nccl/ @azaks2 @chsigg
 /tensorflow/core/platform/windows/ @mrry
 /tensorflow/core/platform/s3 @yongtang
 /tensorflow/go @asimshankar
@@ -51,13 +51,13 @@
 /tensorflow/contrib/pi_examples/ @maciekcc
 /tensorflow/contrib/quantization/ @petewarden
 /tensorflow/contrib/rnn/ @ebrevdo @scottzhu
-/tensorflow/contrib/saved_model/ @nfiedel @sukritiramesh @allenl
+/tensorflow/contrib/saved_model/ @nfiedel @sukritiramesh @allenlavoie
 /tensorflow/contrib/seq2seq/ @ebrevdo @lmthang
 /tensorflow/contrib/session_bundle/ @nfiedel @sukritiramesh
 /tensorflow/contrib/slim/ @sguada @thenbasilmanran
 /tensorflow/contrib/stateless/ @girving @alextp
 /tensorflow/contrib/tensor_forest/ @gilberthendry @thomascolthurst @yupbank
-/tensorflow/contrib/tensorrt/ @aaroey
+/tensorflow/contrib/tensorrt/ @aaroey @smit-hinsu @azaks2
 # NEED OWNER: /tensorflow/contrib/testing/
 /tensorflow/contrib/timeseries/ @allenlavoie
 /tensorflow/contrib/tpu/ @frankchn @saeta @jhseu @sourabhbajaj
diff --git a/README.md b/README.md
index 044174947a094d43a51f7140dd40ec0f17801d40..519815d006cc33be10132909baf414a4bd843435 100644
--- a/README.md
+++ b/README.md
@@ -113,11 +113,12 @@ The TensorFlow project strives to abide by generally accepted best practices in
 Build Type                                                                                                                                                                                      | Status                                                                                                                                                                                   | Artifacts
 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------
 **IBM s390x**                                                                                                                                                                                   | [![Build Status](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_CI/badge/icon)](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_CI/)                                                        | TBA
-**IBM ppc64le CPU**                                                                                                                                                                             | [![Build Status](http://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Build/badge/icon)](http://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Build/)                                    | TBA
-**IBM ppc64le GPU** Nightly                                                                                                                                                                     | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Nightly_Artifact/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Nightly_Artifact/)            | [Nightly](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Nightly_Artifact/)
-**IBM ppc64le GPU** Stable Release                                                                                                                                                              | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/)                  | [Release](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/)
+**Linux ppc64le CPU** Nightly                                                                                                                                                                   | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Build/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Build/)                                  | [Nightly](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Nightly_Artifact/)
+**Linux ppc64le CPU** Stable Release                                                                                                                                                            | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Release_Build/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Release_Build/)                  | [Release](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Release_Build/)
+**Linux ppc64le GPU** Nightly                                                                                                                                                                   | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Build/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Build/)                                  | [Nightly](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Nightly_Artifact/)
+**Linux ppc64le GPU** Stable Release                                                                                                                                                            | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/)                  | [Release](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/)
 **Linux CPU with Intel® MKL-DNN** Nightly                                                                                                                                                       | [![Build Status](https://tensorflow-ci.intel.com/job/tensorflow-mkl-linux-cpu/badge/icon)](https://tensorflow-ci.intel.com/job/tensorflow-mkl-linux-cpu/)                                | [Nightly](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-whl-nightly/)
-**Linux CPU with Intel® MKL-DNN** Python 2.7<br> **Linux CPU with Intel® MKL-DNN** Python 3.4<br> **Linux CPU with Intel® MKL-DNN** Python 3.5<br> **Linux CPU with Intel® MKL-DNN** Python 3.6 | [![Build Status](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-release-whl/badge/icon)](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-release-whl/lastStableBuild) | [1.11.0 py2.7](https://storage.googleapis.com/intel-optimized-tensorflow/tensorflow-1.11.0-cp27-cp27mu-linux_x86_64.whl)<br>[1.11.0 py3.4](https://storage.googleapis.com/intel-optimized-tensorflow/tensorflow-1.11.0-cp34-cp34m-linux_x86_64.whl)<br>[1.11.0 py3.5](https://storage.googleapis.com/intel-optimized-tensorflow/tensorflow-1.11.0-cp35-cp35m-linux_x86_64.whl)<br>[1.11.0 py3.6](https://storage.googleapis.com/intel-optimized-tensorflow/tensorflow-1.11.0-cp36-cp36m-linux_x86_64.whl)
+**Linux CPU with Intel® MKL-DNN** Python 2.7<br> **Linux CPU with Intel® MKL-DNN** Python 3.4<br> **Linux CPU with Intel® MKL-DNN** Python 3.5<br> **Linux CPU with Intel® MKL-DNN** Python 3.6 | [![Build Status](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-release-whl/badge/icon)](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-release-whl/lastStableBuild) | [1.12.0 py2.7](https://storage.googleapis.com/intel-optimized-tensorflow/tensorflow-1.12.0-cp27-cp27mu-linux_x86_64.whl)<br>[1.12.0 py3.4](https://storage.googleapis.com/intel-optimized-tensorflow/tensorflow-1.12.0-cp34-cp34m-linux_x86_64.whl)<br>[1.12.0 py3.5](https://storage.googleapis.com/intel-optimized-tensorflow/tensorflow-1.12.0-cp35-cp35m-linux_x86_64.whl)<br>[1.12.0 py3.6](https://storage.googleapis.com/intel-optimized-tensorflow/tensorflow-1.12.0-cp36-cp36m-linux_x86_64.whl)
 
 ## For more information
 
diff --git a/RELEASE.md b/RELEASE.md
index b13b071bd6cf4d3a260c8e248a67d23e1a688498..32abdcea497618918964174a661a6ba872598f65 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -7,6 +7,8 @@
     Serving.
 *   Keras models now support evaluating with a `tf.data.Dataset`.
 *   TensorFlow binaries are built with XLA support linked in by default.
+*   Ignite Dataset added to contrib/ignite that allows to work with Apache
+    Ignite.
 
 ## Bug Fixes and Other Changes
 
diff --git a/WORKSPACE b/WORKSPACE
index 0c7bc085b512b084b9470abe17326d7c119aa327..7057d3f149e766cd2983ecc89509f84c37075602 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -1,5 +1,7 @@
 workspace(name = "org_tensorflow")
 
+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+
 http_archive(
     name = "io_bazel_rules_closure",
     sha256 = "a38539c5b5c358548e75b44141b4ab637bba7c4dc02b46b1f62a96d6433f56ae",
@@ -14,30 +16,27 @@ load("@io_bazel_rules_closure//closure:defs.bzl", "closure_repositories")
 
 closure_repositories()
 
-http_archive(
-    name = "base_images_docker",
-    sha256 = "e2b1b7254270bb7605e814a9dbf6d1e4ae04a11136ff1714fbfdabe3f87f7cf9",
-    strip_prefix = "base-images-docker-12801524f867e657fbb5d1a74f31618aff181ac6",
-    urls = ["https://github.com/GoogleCloudPlatform/base-images-docker/archive/12801524f867e657fbb5d1a74f31618aff181ac6.tar.gz"],
-)
+load("//third_party/toolchains/preconfig/generate:archives.bzl",
+     "bazel_toolchains_archive")
 
-http_archive(
-    name = "bazel_toolchains",
-    sha256 = "15b5858b1b5541ec44df31b94c3b8672815b31d71215a98398761ea9f4c4eedb",
-    strip_prefix = "bazel-toolchains-6200b238c9c2d137c0d9a7262c80cc71d98e692b",
-    urls = [
-        "https://github.com/bazelbuild/bazel-toolchains/archive/6200b238c9c2d137c0d9a7262c80cc71d98e692b.tar.gz",
-    ],
+bazel_toolchains_archive()
+
+load(
+    "@bazel_toolchains//repositories:repositories.bzl",
+    bazel_toolchains_repositories = "repositories",
 )
 
-http_archive(
-    name = "io_bazel_rules_docker",
-    sha256 = "29d109605e0d6f9c892584f07275b8c9260803bf0c6fcb7de2623b2bedc910bd",
-    strip_prefix = "rules_docker-0.5.1",
-    urls = ["https://github.com/bazelbuild/rules_docker/archive/v0.5.1.tar.gz"],
+bazel_toolchains_repositories()
+
+load(
+    "@io_bazel_rules_docker//container:container.bzl",
+    container_repositories = "repositories",
 )
 
-load("//third_party/toolchains/preconfig/generate:workspace.bzl", "remote_config_workspace")
+container_repositories()
+
+load("//third_party/toolchains/preconfig/generate:workspace.bzl",
+     "remote_config_workspace")
 
 remote_config_workspace()
 
@@ -45,7 +44,7 @@ remote_config_workspace()
 # files, in case the parsing of those build files depends on the bazel
 # version we require here.
 load("//tensorflow:version_check.bzl", "check_bazel_version_at_least")
-check_bazel_version_at_least("0.15.0")
+check_bazel_version_at_least("0.18.0")
 
 load("//tensorflow:workspace.bzl", "tf_workspace")
 
@@ -57,9 +56,9 @@ android_workspace()
 # Please add all new TensorFlow dependencies in workspace.bzl.
 tf_workspace()
 
-new_http_archive(
+http_archive(
     name = "inception_v1",
-    build_file = "models.BUILD",
+    build_file = "//:models.BUILD",
     sha256 = "7efe12a8363f09bc24d7b7a450304a15655a57a7751929b2c1593a71183bb105",
     urls = [
         "http://storage.googleapis.com/download.tensorflow.org/models/inception_v1.zip",
@@ -67,9 +66,9 @@ new_http_archive(
     ],
 )
 
-new_http_archive(
+http_archive(
     name = "mobile_ssd",
-    build_file = "models.BUILD",
+    build_file = "//:models.BUILD",
     sha256 = "bddd81ea5c80a97adfac1c9f770e6f55cbafd7cce4d3bbe15fbeb041e6b8f3e8",
     urls = [
         "http://storage.googleapis.com/download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_android_export.zip",
@@ -77,9 +76,9 @@ new_http_archive(
     ],
 )
 
-new_http_archive(
+http_archive(
     name = "mobile_multibox",
-    build_file = "models.BUILD",
+    build_file = "//:models.BUILD",
     sha256 = "859edcddf84dddb974c36c36cfc1f74555148e9c9213dedacf1d6b613ad52b96",
     urls = [
         "http://storage.googleapis.com/download.tensorflow.org/models/mobile_multibox_v1a.zip",
@@ -87,9 +86,9 @@ new_http_archive(
     ],
 )
 
-new_http_archive(
+http_archive(
     name = "stylize",
-    build_file = "models.BUILD",
+    build_file = "//:models.BUILD",
     sha256 = "3d374a730aef330424a356a8d4f04d8a54277c425e274ecb7d9c83aa912c6bfa",
     urls = [
         "http://storage.googleapis.com/download.tensorflow.org/models/stylize_v1.zip",
@@ -97,9 +96,9 @@ new_http_archive(
     ],
 )
 
-new_http_archive(
+http_archive(
     name = "speech_commands",
-    build_file = "models.BUILD",
+    build_file = "//:models.BUILD",
     sha256 = "c3ec4fea3158eb111f1d932336351edfe8bd515bb6e87aad4f25dbad0a600d0c",
     urls = [
         "http://storage.googleapis.com/download.tensorflow.org/models/speech_commands_v0.01.zip",
diff --git a/configure.py b/configure.py
index f087da002d534e1f0f4c1598e87217168c892dbe..1e732db26404906901a9eeab97a5e75137ee8388 100644
--- a/configure.py
+++ b/configure.py
@@ -255,18 +255,6 @@ def setup_python(environ_cp):
 def reset_tf_configure_bazelrc():
   """Reset file that contains customized config settings."""
   open(_TF_BAZELRC, 'w').close()
-  bazelrc_path = os.path.join(_TF_WORKSPACE_ROOT, '.bazelrc')
-
-  data = []
-  if os.path.exists(bazelrc_path):
-    with open(bazelrc_path, 'r') as f:
-      data = f.read().splitlines()
-  with open(bazelrc_path, 'w') as f:
-    for l in data:
-      if _TF_BAZELRC_FILENAME in l:
-        continue
-      f.write('%s\n' % l)
-    f.write('import %%workspace%%/%s\n' % _TF_BAZELRC_FILENAME)
 
 def cleanup_makefile():
   """Delete any leftover BUILD files from the Makefile build.
@@ -452,11 +440,12 @@ def convert_version_to_int(version):
   return int(version_str)
 
 
-def check_bazel_version(min_version):
-  """Check installed bazel version is at least min_version.
+def check_bazel_version(min_version, max_version):
+  """Check installed bazel version is between min_version and max_version.
 
   Args:
     min_version: string for minimum bazel version.
+    max_version: string for maximum bazel version.
 
   Returns:
     The bazel version detected.
@@ -474,6 +463,7 @@ def check_bazel_version(min_version):
 
   min_version_int = convert_version_to_int(min_version)
   curr_version_int = convert_version_to_int(curr_version)
+  max_version_int = convert_version_to_int(max_version)
 
   # Check if current bazel version can be detected properly.
   if not curr_version_int:
@@ -486,7 +476,12 @@ def check_bazel_version(min_version):
   if curr_version_int < min_version_int:
     print('Please upgrade your bazel installation to version %s or higher to '
           'build TensorFlow!' % min_version)
-    sys.exit(0)
+    sys.exit(1)
+  if (curr_version_int > max_version_int and
+      'TF_IGNORE_MAX_BAZEL_VERSION' not in os.environ):
+    print('Please downgrade your bazel installation to version %s or lower to '
+          'build TensorFlow!' % max_version)
+    sys.exit(1)
   return curr_version
 
 
@@ -1559,11 +1554,9 @@ def main():
   # environment variables.
   environ_cp = dict(os.environ)
 
-  check_bazel_version('0.15.0')
+  check_bazel_version('0.19.0', '0.20.0')
 
   reset_tf_configure_bazelrc()
-  # Explicitly import tools/bazel.rc, this is needed for Bazel 0.19.0 or later
-  write_to_bazelrc('import %workspace%/tools/bazel.rc')
 
   cleanup_makefile()
   setup_python(environ_cp)
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index fd4b94202aad24a82abef8abd16431f61a8326f0..449a1372edb031c68786d8672e2a1499c2b3d047 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -267,6 +267,15 @@ config_setting(
     visibility = ["//visibility:public"],
 )
 
+# By default, XLA GPU is compiled into tensorflow when building with
+# --config=cuda even when `with_xla_support` is false. The config setting
+# here allows us to override the behavior if needed.
+config_setting(
+    name = "no_xla_deps_in_cuda",
+    define_values = {"no_xla_deps_in_cuda": "true"},
+    visibility = ["//visibility:public"],
+)
+
 config_setting(
     name = "with_gdr_support",
     define_values = {"with_gdr_support": "true"},
@@ -606,9 +615,11 @@ py_library(
     name = "tensorflow_py",
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
-    deps = [
+    deps = select({
+        "api_version_2": [],
+        "//conditions:default": ["//tensorflow/contrib:contrib_py"],
+    }) + [
         ":tensorflow_py_no_contrib",
-        "//tensorflow/contrib:contrib_py",
         "//tensorflow/python/estimator:estimator_py",
     ],
 )
diff --git a/tensorflow/api_template.__init__.py b/tensorflow/api_template.__init__.py
index f13623b0d57d3b59bb9455a46a9fab29fee25784..4eba763129a6aef40e3c130d56bf8ab19638b7ca 100644
--- a/tensorflow/api_template.__init__.py
+++ b/tensorflow/api_template.__init__.py
@@ -20,14 +20,14 @@ from __future__ import print_function as _print_function
 
 import os as _os
 
+# API IMPORTS PLACEHOLDER
+
 # pylint: disable=g-bad-import-order
 from tensorflow.python.tools import component_api_helper as _component_api_helper
 _component_api_helper.package_hook(
     parent_package_str=__name__,
     child_package_str=('tensorflow_estimator.python.estimator.api.estimator'))
 
-# API IMPORTS PLACEHOLDER
-
 # Make sure directory containing top level submodules is in
 # the __path__ so that "from tensorflow.foo import bar" works.
 # We're using bitwise, but there's nothing special about that.
@@ -35,8 +35,9 @@ _tf_api_dir = _os.path.dirname(_os.path.dirname(bitwise.__file__))  # pylint: di
 if _tf_api_dir not in __path__:
   __path__.append(_tf_api_dir)
 
-# Calls to enable and disable features.
-enable_eager_execution()  # pylint: disable=undefined-variable
+# Enable TF2 behaviors
+from tensorflow.python.compat import compat as _compat  # pylint: disable=g-import-not-at-top
+_compat.enable_v2_behavior()
 
 # These symbols appear because we import the python package which
 # in turn imports from tensorflow.core and tensorflow.python. They
diff --git a/tensorflow/api_template_v1.__init__.py b/tensorflow/api_template_v1.__init__.py
index 65bdb6cb1b5e6fb0656a12b932d767aeacfccd29..21b5277614667bdbd7271ac3e57f5b69d5a19264 100644
--- a/tensorflow/api_template_v1.__init__.py
+++ b/tensorflow/api_template_v1.__init__.py
@@ -23,13 +23,13 @@ import os as _os
 # pylint: disable=g-bad-import-order
 from tensorflow.python import pywrap_tensorflow  # pylint: disable=unused-import
 
+# API IMPORTS PLACEHOLDER
+
 from tensorflow.python.tools import component_api_helper as _component_api_helper
 _component_api_helper.package_hook(
     parent_package_str=__name__,
     child_package_str=('tensorflow_estimator.python.estimator.api.estimator'))
 
-# API IMPORTS PLACEHOLDER
-
 from tensorflow.python.util.lazy_loader import LazyLoader  # pylint: disable=g-import-not-at-top
 contrib = LazyLoader('contrib', globals(), 'tensorflow.contrib')
 del LazyLoader
diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD
index f653e581bf3beda9fdbf8fb7905a4f9fe170e7fb..25df970ecab0757f23465ab19e7f45de0c759458 100644
--- a/tensorflow/c/BUILD
+++ b/tensorflow/c/BUILD
@@ -175,6 +175,34 @@ tf_cuda_library(
     ],
 )
 
+tf_cuda_library(
+    name = "env",
+    srcs = [
+        "env.cc",
+    ],
+    hdrs = [
+        "env.h",
+    ],
+    copts = tf_copts(),
+    visibility = ["//visibility:public"],
+    deps = select({
+        "//tensorflow:android": [
+            ":c_api",
+            ":tf_status_helper",
+            "//tensorflow/core:android_tensorflow_lib_lite",
+            "//tensorflow/core:platform_env",
+            "//tensorflow/core:lib",
+        ],
+        "//conditions:default": [
+            ":c_api",
+            ":tf_status_helper",
+            "//tensorflow/core:framework",
+            "//tensorflow/core:platform_env",
+            "//tensorflow/core:lib",
+        ],
+    }) + [":c_api_internal"],
+)
+
 tf_cuda_library(
     name = "kernels",
     srcs = [
@@ -188,10 +216,14 @@ tf_cuda_library(
     deps = select({
         "//tensorflow:android": [
             ":c_api",
+            ":c_api_internal",
+            ":tf_status_helper",
             "//tensorflow/core:android_tensorflow_lib_lite",
         ],
         "//conditions:default": [
             ":c_api",
+            ":c_api_internal",
+            ":tf_status_helper",
             "//tensorflow/core:framework",
         ],
     }),
@@ -330,6 +362,27 @@ tf_kernel_library(
     alwayslink = 1,
 )
 
+tf_cuda_cc_test(
+    name = "env_test",
+    size = "small",
+    srcs = ["env_test.cc"],
+    linkopts = select({
+        "//tensorflow:darwin": ["-headerpad_max_install_names"],
+        "//conditions:default": [],
+    }),
+    tags = ["noasan"],
+    # We must ensure that the dependencies can be dynamically linked since
+    # the shared library must be able to use core:framework.
+    # linkstatic = tf_kernel_tests_linkstatic(),
+    deps = [
+        ":c_api",
+        ":env",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
 tf_cuda_cc_test(
     name = "kernels_test",
     size = "small",
diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc
index f13e8777dff164bcd8eedf46310ae846abd0c804..9580215a317b1a6b1cdacbd430a1764af61be990 100644
--- a/tensorflow/c/c_api.cc
+++ b/tensorflow/c/c_api.cc
@@ -136,16 +136,22 @@ const char* TF_Message(const TF_Status* s) {
 namespace {
 class TF_ManagedBuffer : public TensorBuffer {
  public:
-  void* data_;
-  size_t len_;
-  void (*deallocator_)(void* data, size_t len, void* arg);
-  void* deallocator_arg_;
+  TF_ManagedBuffer(void* data, size_t len,
+                   void (*deallocator)(void* data, size_t len, void* arg),
+                   void* deallocator_arg)
+      : TensorBuffer(data),
+        len_(len),
+        deallocator_(deallocator),
+        deallocator_arg_(deallocator_arg) {}
+
+  const size_t len_;
+  void (*const deallocator_)(void* data, size_t len, void* arg);
+  void* const deallocator_arg_;
 
   ~TF_ManagedBuffer() override {
-    (*deallocator_)(data_, len_, deallocator_arg_);
+    (*deallocator_)(data(), len_, deallocator_arg_);
   }
 
-  void* data() const override { return data_; }
   size_t size() const override { return len_; }
   TensorBuffer* root_buffer() override { return this; }
   void FillAllocationDescription(AllocationDescription* proto) const override {
@@ -199,8 +205,7 @@ TF_Tensor* TF_NewTensor(TF_DataType dtype, const int64_t* dims, int num_dims,
     dimvec[i] = static_cast<tensorflow::int64>(dims[i]);
   }
 
-  TF_ManagedBuffer* buf = new TF_ManagedBuffer;
-  buf->len_ = len;
+  TF_ManagedBuffer* buf = nullptr;
   if (dtype != TF_STRING && dtype != TF_RESOURCE &&
       tensorflow::DataTypeCanUseMemcpy(static_cast<DataType>(dtype)) &&
       reinterpret_cast<intptr_t>(data) % std::max(1, EIGEN_MAX_ALIGN_BYTES) !=
@@ -212,17 +217,15 @@ TF_Tensor* TF_NewTensor(TF_DataType dtype, const int64_t* dims, int num_dims,
     //
     // Other types have the same representation, so copy only if it is safe to
     // do so.
-    buf->data_ = allocate_tensor("TF_NewTensor", len);
-    std::memcpy(buf->data_, data, len);
-    buf->deallocator_ = deallocate_buffer;
-    buf->deallocator_arg_ = nullptr;
+    buf = new TF_ManagedBuffer(allocate_tensor("TF_NewTensor", len), len,
+                               deallocate_buffer, nullptr);
+    std::memcpy(buf->data(), data, len);
     // Free the original buffer.
     deallocator(data, len, deallocator_arg);
   } else {
-    buf->data_ = data;
-    buf->deallocator_ = deallocator;
-    buf->deallocator_arg_ = deallocator_arg;
+    buf = new TF_ManagedBuffer(data, len, deallocator, deallocator_arg);
   }
+
   TF_Tensor* ret = new TF_Tensor{dtype, TensorShape(dimvec), buf};
   size_t elem_size = TF_DataTypeSize(dtype);
   if (elem_size > 0 && len < (elem_size * ret->shape.num_elements())) {
@@ -477,14 +480,15 @@ static TF_Tensor* EmptyTensor(TF_DataType dtype, const TensorShape& shape) {
   CHECK_EQ(nelems, 0);
   static_assert(sizeof(int64_t) == sizeof(tensorflow::int64),
                 "64-bit int types should match in size");
-  return TF_NewTensor(dtype, reinterpret_cast<const int64_t*>(dims.data()),
-                      shape.dims(), reinterpret_cast<void*>(&empty), 0,
-                      [](void*, size_t, void*) {}, nullptr);
+  return TF_NewTensor(
+      dtype, reinterpret_cast<const int64_t*>(dims.data()), shape.dims(),
+      reinterpret_cast<void*>(&empty), 0, [](void*, size_t, void*) {}, nullptr);
 }
 
 // Non-static for testing.
 TF_Tensor* TF_TensorFromTensor(const tensorflow::Tensor& src,
                                TF_Status* status) {
+  TF_SetStatus(status, TF_OK, "");
   if (!src.IsInitialized()) {
     status->status = FailedPrecondition(
         "attempt to use a tensor with an uninitialized value");
@@ -1592,18 +1596,20 @@ TF_AttrMetadata TF_OperationGetAttrMetadata(TF_Operation* oper,
     break;                                            \
   }
 
-      LIST_CASE(s, TF_ATTR_STRING, metadata.total_size = 0;
-                for (int i = 0; i < attr->list().s_size();
-                     ++i) { metadata.total_size += attr->list().s(i).size(); });
+      LIST_CASE(
+          s, TF_ATTR_STRING, metadata.total_size = 0;
+          for (int i = 0; i < attr->list().s_size();
+               ++i) { metadata.total_size += attr->list().s(i).size(); });
       LIST_CASE(i, TF_ATTR_INT);
       LIST_CASE(f, TF_ATTR_FLOAT);
       LIST_CASE(b, TF_ATTR_BOOL);
       LIST_CASE(type, TF_ATTR_TYPE);
-      LIST_CASE(shape, TF_ATTR_SHAPE, metadata.total_size = 0;
-                for (int i = 0; i < attr->list().shape_size(); ++i) {
-                  const auto& s = attr->list().shape(i);
-                  metadata.total_size += s.unknown_rank() ? 0 : s.dim_size();
-                });
+      LIST_CASE(
+          shape, TF_ATTR_SHAPE, metadata.total_size = 0;
+          for (int i = 0; i < attr->list().shape_size(); ++i) {
+            const auto& s = attr->list().shape(i);
+            metadata.total_size += s.unknown_rank() ? 0 : s.dim_size();
+          });
       LIST_CASE(tensor, TF_ATTR_TENSOR);
       LIST_CASE(tensor, TF_ATTR_FUNC);
 #undef LIST_CASE
diff --git a/tensorflow/c/c_api.h b/tensorflow/c/c_api.h
index 3d56268110edbe96616201d15a69cc8c84d3115a..c7abba85521fccec07983cd5ab4f94a8368d6181 100644
--- a/tensorflow/c/c_api.h
+++ b/tensorflow/c/c_api.h
@@ -91,7 +91,7 @@ extern "C" {
 // --------------------------------------------------------------------------
 // TF_Version returns a string describing version information of the
 // TensorFlow library. TensorFlow using semantic versioning.
-TF_CAPI_EXPORT extern const char* TF_Version();
+TF_CAPI_EXPORT extern const char* TF_Version(void);
 
 // --------------------------------------------------------------------------
 // TF_DataType holds the type for a scalar value.  E.g., one slot in a tensor.
@@ -157,7 +157,7 @@ typedef enum TF_Code {
 typedef struct TF_Status TF_Status;
 
 // Return a new status object.
-TF_CAPI_EXPORT extern TF_Status* TF_NewStatus();
+TF_CAPI_EXPORT extern TF_Status* TF_NewStatus(void);
 
 // Delete a previously created status object.
 TF_CAPI_EXPORT extern void TF_DeleteStatus(TF_Status*);
@@ -196,7 +196,7 @@ TF_CAPI_EXPORT extern TF_Buffer* TF_NewBufferFromString(const void* proto,
                                                         size_t proto_len);
 
 // Useful for passing *out* a protobuf.
-TF_CAPI_EXPORT extern TF_Buffer* TF_NewBuffer();
+TF_CAPI_EXPORT extern TF_Buffer* TF_NewBuffer(void);
 
 TF_CAPI_EXPORT extern void TF_DeleteBuffer(TF_Buffer*);
 
@@ -305,7 +305,7 @@ TF_CAPI_EXPORT extern size_t TF_StringEncodedSize(size_t len);
 typedef struct TF_SessionOptions TF_SessionOptions;
 
 // Return a new options object.
-TF_CAPI_EXPORT extern TF_SessionOptions* TF_NewSessionOptions();
+TF_CAPI_EXPORT extern TF_SessionOptions* TF_NewSessionOptions(void);
 
 // Set the target in TF_SessionOptions.options.
 // target can be empty, a single entry, or a comma separated list of entries.
@@ -338,7 +338,7 @@ TF_CAPI_EXPORT extern void TF_DeleteSessionOptions(TF_SessionOptions*);
 typedef struct TF_Graph TF_Graph;
 
 // Return a new graph object.
-TF_CAPI_EXPORT extern TF_Graph* TF_NewGraph();
+TF_CAPI_EXPORT extern TF_Graph* TF_NewGraph(void);
 
 // Destroy an options object.  Graph will be deleted once no more
 // TFSession's are referencing it.
@@ -890,7 +890,8 @@ TF_CAPI_EXPORT extern void TF_GraphVersions(TF_Graph* graph,
 // TF_GraphImportGraphDef.
 typedef struct TF_ImportGraphDefOptions TF_ImportGraphDefOptions;
 
-TF_CAPI_EXPORT extern TF_ImportGraphDefOptions* TF_NewImportGraphDefOptions();
+TF_CAPI_EXPORT extern TF_ImportGraphDefOptions* TF_NewImportGraphDefOptions(
+    void);
 TF_CAPI_EXPORT extern void TF_DeleteImportGraphDefOptions(
     TF_ImportGraphDefOptions* opts);
 
@@ -1611,7 +1612,7 @@ TF_CAPI_EXPORT extern void TF_DeleteLibraryHandle(TF_Library* lib_handle);
 //
 // The data in the buffer will be the serialized OpList proto for ops registered
 // in this address space.
-TF_CAPI_EXPORT extern TF_Buffer* TF_GetAllOpList();
+TF_CAPI_EXPORT extern TF_Buffer* TF_GetAllOpList(void);
 
 // TF_ApiDefMap encapsulates a collection of API definitions for an operation.
 //
diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc
index 3693cc85996365360253c8a94c29272a16e11e9a..81343f7bc027be82d28164be51011c794715d03a 100644
--- a/tensorflow/c/c_api_experimental.cc
+++ b/tensorflow/c/c_api_experimental.cc
@@ -66,7 +66,8 @@ void TF_EnableXLACompilation(TF_SessionOptions* options, unsigned char enable) {
 }
 
 TF_Buffer* TF_CreateConfig(unsigned char enable_xla_compilation,
-                           unsigned char gpu_memory_allow_growth) {
+                           unsigned char gpu_memory_allow_growth,
+                           unsigned int num_cpu_devices) {
   tensorflow::ConfigProto config;
   auto* optimizer_options =
       config.mutable_graph_options()->mutable_optimizer_options();
@@ -87,6 +88,8 @@ TF_Buffer* TF_CreateConfig(unsigned char enable_xla_compilation,
   auto* gpu_options = config.mutable_gpu_options();
   gpu_options->set_allow_growth(gpu_memory_allow_growth);
 
+  (*config.mutable_device_count())["CPU"] = num_cpu_devices;
+
   // TODO(b/113217601): This is needed for EagerContext::runner_ to use a
   // threadpool, so that we avoid the possibility of running the runner_ in the
   // threadpool of GPU event mgr, as that can trigger more callbacks to be
@@ -6530,7 +6533,7 @@ library {
       }
     }
     node_def {
-      name: "ParallelInterleaveDataset/cycle_length"
+      name: "ExperimentalParallelInterleaveDataset/cycle_length"
       op: "Const"
       attr {
         key: "dtype"
@@ -6551,7 +6554,7 @@ library {
       }
     }
     node_def {
-      name: "ParallelInterleaveDataset/block_length"
+      name: "ExperimentalParallelInterleaveDataset/block_length"
       op: "Const"
       attr {
         key: "dtype"
@@ -6572,7 +6575,7 @@ library {
       }
     }
     node_def {
-      name: "ParallelInterleaveDataset/sloppy"
+      name: "ExperimentalParallelInterleaveDataset/sloppy"
       op: "Const"
       attr {
         key: "dtype"
@@ -6593,7 +6596,7 @@ library {
       }
     }
     node_def {
-      name: "ParallelInterleaveDataset/buffer_output_elements"
+      name: "ExperimentalParallelInterleaveDataset/buffer_output_elements"
       op: "Const"
       attr {
         key: "dtype"
@@ -6614,7 +6617,7 @@ library {
       }
     }
     node_def {
-      name: "ParallelInterleaveDataset/prefetch_input_elements"
+      name: "ExperimentalParallelInterleaveDataset/prefetch_input_elements"
       op: "Const"
       attr {
         key: "dtype"
@@ -6635,14 +6638,14 @@ library {
       }
     }
     node_def {
-      name: "ParallelInterleaveDataset"
-      op: "ParallelInterleaveDataset"
+      name: "ExperimentalParallelInterleaveDataset"
+      op: "ExperimentalParallelInterleaveDataset"
       input: "RepeatDataset:handle:0"
-      input: "ParallelInterleaveDataset/cycle_length:output:0"
-      input: "ParallelInterleaveDataset/block_length:output:0"
-      input: "ParallelInterleaveDataset/sloppy:output:0"
-      input: "ParallelInterleaveDataset/buffer_output_elements:output:0"
-      input: "ParallelInterleaveDataset/prefetch_input_elements:output:0"
+      input: "ExperimentalParallelInterleaveDataset/cycle_length:output:0"
+      input: "ExperimentalParallelInterleaveDataset/block_length:output:0"
+      input: "ExperimentalParallelInterleaveDataset/sloppy:output:0"
+      input: "ExperimentalParallelInterleaveDataset/buffer_output_elements:output:0"
+      input: "ExperimentalParallelInterleaveDataset/prefetch_input_elements:output:0"
       attr {
         key: "Targuments"
         value {
@@ -6742,7 +6745,7 @@ library {
     node_def {
       name: "ShuffleDataset_2"
       op: "ShuffleDataset"
-      input: "ParallelInterleaveDataset:handle:0"
+      input: "ExperimentalParallelInterleaveDataset:handle:0"
       input: "ShuffleDataset_2/buffer_size_1:output:0"
       input: "ShuffleDataset_2/seed_2:output:0"
       input: "ShuffleDataset_2/seed2_2:output:0"
@@ -8535,8 +8538,9 @@ TFE_Context* TFE_CreateContextFromSession(TF_Session* session,
 
   // Reduce GPU memory allocation, and set appropriate config options for TFE
   // context.
-  auto* config =
-      TF_CreateConfig(/*xla*/ false, /* gpu_memory_allow_growth */ true);
+  auto* config = TF_CreateConfig(
+      /*xla*/ false, /* gpu_memory_allow_growth */ true, /* num_cpu_devices */
+      10);
   TFE_ContextOptionsSetConfig(opts, config->data, config->length, status);
   if (!status->status.ok()) {
     CHECK(!config);
diff --git a/tensorflow/c/c_api_experimental.h b/tensorflow/c/c_api_experimental.h
index 80c8bfe594c4c89606efd01bec7f50e7a86b5bda..cb7a146846ff0bdac09f4a90765f78e0ada75718 100644
--- a/tensorflow/c/c_api_experimental.h
+++ b/tensorflow/c/c_api_experimental.h
@@ -67,9 +67,10 @@ TF_CAPI_EXPORT extern void TF_EnableXLACompilation(TF_SessionOptions* options,
 // a) ConfigProto.optimizer_options.global_jit_level is set to to ON_1 if
 // `enable_xla_compilation` is non-zero, and OFF otherwise.
 // b) ConfigProto.gpu_options.allow_growth is set to `gpu_memory_allow_growth`.
+// c) ConfigProto.device_count is set to `num_cpu_devices`.
 TF_CAPI_EXPORT extern TF_Buffer* TF_CreateConfig(
-    unsigned char enable_xla_compilation,
-    unsigned char gpu_memory_allow_growth);
+    unsigned char enable_xla_compilation, unsigned char gpu_memory_allow_growth,
+    unsigned int num_cpu_devices);
 
 // Create a serialized tensorflow.RunOptions proto, where RunOptions.trace_level
 // is set to FULL_TRACE if `enable_full_trace` is non-zero, and NO_TRACE
@@ -239,7 +240,7 @@ TF_CAPI_EXPORT void TF_InitMain(const char* usage, int* argc, char*** argv);
 
 // Platform-specific implementation to return an unused port. (This should used
 // in tests only.)
-TF_CAPI_EXPORT int TF_PickUnusedPortOrDie();
+TF_CAPI_EXPORT int TF_PickUnusedPortOrDie(void);
 
 // Fast path method that makes constructing a single scalar tensor require less
 // overhead and copies.
diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h
index 8d6c8d958d5961fce817156a14eb2b2940c1f2f0..120748ab763a3358b6e38e64bb3b6fd2ea32f7c3 100755
--- a/tensorflow/c/eager/c_api.h
+++ b/tensorflow/c/eager/c_api.h
@@ -48,7 +48,7 @@ extern "C" {
 typedef struct TFE_ContextOptions TFE_ContextOptions;
 
 // Return a new options object.
-TF_CAPI_EXPORT extern TFE_ContextOptions* TFE_NewContextOptions();
+TF_CAPI_EXPORT extern TFE_ContextOptions* TFE_NewContextOptions(void);
 
 // Set the config in TF_ContextOptions.options.
 // config should be a serialized tensorflow.ConfigProto proto.
@@ -170,23 +170,11 @@ TF_CAPI_EXPORT extern int64_t TFE_TensorHandleDim(TFE_TensorHandle* h,
                                                   int dim_index,
                                                   TF_Status* status);
 
-// Returns the device of the operation that produced `h`.
-// If `h` was produced by a copy, returns the destination device of
-// the copy. Note that returned device name is not always the device
-// holding the tensor handle's memory. If you want the latter, use
-// TFE_TensorHandleBackingDeviceName.
-// This function will block till the operation that produces `h` has completed.
-//
-// Device on which the kernel of the operation that produced `h` ran.
-//
-// If `h` was produced by a copy, returns the destination device of
-// the copy.
-//
-// Note that returned device name is not always the device that owns the memory
-// that backs the tensor handle. For the latter see
-// TFE_TensorHandleBackingDeviceName.
-//
-// This function will block till the operation that produces `h` has completed.
+// Returns the device of the operation that produced `h`. If `h` was produced by
+// a copy, returns the destination device of the copy. Note that the returned
+// device name is not always the device holding the tensor handle's memory. If
+// you want the latter, use TFE_TensorHandleBackingDeviceName. This function
+// will block till the operation that produces `h` has completed.
 TF_CAPI_EXPORT extern const char* TFE_TensorHandleDeviceName(
     TFE_TensorHandle* h, TF_Status* status);
 
diff --git a/tensorflow/c/env.cc b/tensorflow/c/env.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1c35ff9001d0ee1ab0fbae9e1bcc07116fab1065
--- /dev/null
+++ b/tensorflow/c/env.cc
@@ -0,0 +1,183 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/c/env.h"
+
+#include "tensorflow/c/c_api_internal.h"
+#include "tensorflow/c/tf_status_helper.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/types.h"
+
+struct TF_StringStream {
+  std::vector<::tensorflow::string>* list;
+  size_t position;
+};
+
+void TF_CreateDir(const char* dirname, TF_Status* status) {
+  TF_SetStatus(status, TF_OK, "");
+  ::tensorflow::Set_TF_Status_from_Status(
+      status, ::tensorflow::Env::Default()->CreateDir(dirname));
+}
+
+void TF_DeleteDir(const char* dirname, TF_Status* status) {
+  TF_SetStatus(status, TF_OK, "");
+  ::tensorflow::Set_TF_Status_from_Status(
+      status, ::tensorflow::Env::Default()->DeleteDir(dirname));
+}
+
+void TF_DeleteRecursively(const char* dirname, int64_t* undeleted_file_count,
+                          int64_t* undeleted_dir_count, TF_Status* status) {
+  ::tensorflow::int64 f, d;
+
+  TF_SetStatus(status, TF_OK, "");
+  ::tensorflow::Set_TF_Status_from_Status(
+      status, ::tensorflow::Env::Default()->DeleteRecursively(dirname, &f, &d));
+  *undeleted_file_count = f;
+  *undeleted_dir_count = d;
+}
+
+void TF_FileStat(const char* filename, TF_FileStatistics* stats,
+                 TF_Status* status) {
+  ::tensorflow::FileStatistics cc_stats;
+  TF_SetStatus(status, TF_OK, "");
+  ::tensorflow::Status s =
+      ::tensorflow::Env::Default()->Stat(filename, &cc_stats);
+  ::tensorflow::Set_TF_Status_from_Status(status, s);
+  if (s.ok()) {
+    stats->length = cc_stats.length;
+    stats->mtime_nsec = cc_stats.mtime_nsec;
+    stats->is_directory = cc_stats.is_directory;
+  }
+}
+
+void TF_NewWritableFile(const char* filename, TF_WritableFileHandle** handle,
+                        TF_Status* status) {
+  std::unique_ptr<::tensorflow::WritableFile> f;
+  TF_SetStatus(status, TF_OK, "");
+  ::tensorflow::Status s =
+      ::tensorflow::Env::Default()->NewWritableFile(filename, &f);
+  ::tensorflow::Set_TF_Status_from_Status(status, s);
+
+  if (s.ok()) {
+    *handle = reinterpret_cast<TF_WritableFileHandle*>(f.release());
+  }
+}
+
+void TF_CloseWritableFile(TF_WritableFileHandle* handle, TF_Status* status) {
+  auto* cc_file = reinterpret_cast<::tensorflow::WritableFile*>(handle);
+  TF_SetStatus(status, TF_OK, "");
+  ::tensorflow::Set_TF_Status_from_Status(status, cc_file->Close());
+  delete cc_file;
+}
+
+void TF_SyncWritableFile(TF_WritableFileHandle* handle, TF_Status* status) {
+  auto* cc_file = reinterpret_cast<::tensorflow::WritableFile*>(handle);
+  TF_SetStatus(status, TF_OK, "");
+  ::tensorflow::Set_TF_Status_from_Status(status, cc_file->Sync());
+}
+
+void TF_FlushWritableFile(TF_WritableFileHandle* handle, TF_Status* status) {
+  auto* cc_file = reinterpret_cast<::tensorflow::WritableFile*>(handle);
+  TF_SetStatus(status, TF_OK, "");
+  ::tensorflow::Set_TF_Status_from_Status(status, cc_file->Flush());
+}
+
+void TF_AppendWritableFile(TF_WritableFileHandle* handle, const char* data,
+                           size_t length, TF_Status* status) {
+  auto* cc_file = reinterpret_cast<::tensorflow::WritableFile*>(handle);
+  TF_SetStatus(status, TF_OK, "");
+  ::tensorflow::Set_TF_Status_from_Status(
+      status, cc_file->Append(::tensorflow::StringPiece{data, length}));
+}
+
+void TF_DeleteFile(const char* filename, TF_Status* status) {
+  TF_SetStatus(status, TF_OK, "");
+  ::tensorflow::Set_TF_Status_from_Status(
+      status, ::tensorflow::Env::Default()->DeleteFile(filename));
+}
+
+bool TF_StringStreamNext(TF_StringStream* list, const char** result) {
+  if (list->position >= list->list->size()) {
+    *result = nullptr;
+    return false;
+  }
+
+  *result = list->list->at(list->position++).c_str();
+  return true;
+}
+
+void TF_StringStreamDone(TF_StringStream* list) {
+  delete list->list;
+  delete list;
+}
+TF_StringStream* TF_GetChildren(const char* dirname, TF_Status* status) {
+  auto* children = new std::vector<::tensorflow::string>;
+
+  TF_SetStatus(status, TF_OK, "");
+  ::tensorflow::Set_TF_Status_from_Status(
+      status, ::tensorflow::Env::Default()->GetChildren(dirname, children));
+
+  auto* list = new TF_StringStream;
+  list->list = children;
+  list->position = 0;
+  return list;
+}
+
+TF_StringStream* TF_GetLocalTempDirectories() {
+  auto* tmpdirs = new std::vector<::tensorflow::string>;
+
+  ::tensorflow::Env::Default()->GetLocalTempDirectories(tmpdirs);
+
+  auto* list = new TF_StringStream;
+  list->list = tmpdirs;
+  list->position = 0;
+  return list;
+}
+
+TF_CAPI_EXPORT extern uint64_t TF_NowNanos(void) {
+  return ::tensorflow::Env::Default()->NowNanos();
+}
+
+// Returns the number of microseconds since the Unix epoch.
+TF_CAPI_EXPORT extern uint64_t TF_NowMicros(void) {
+  return ::tensorflow::Env::Default()->NowMicros();
+}
+
+// Returns the number of seconds since the Unix epoch.
+TF_CAPI_EXPORT extern uint64_t TF_NowSeconds(void) {
+  return ::tensorflow::Env::Default()->NowSeconds();
+}
+
+void TF_DefaultThreadOptions(TF_ThreadOptions* options) {
+  options->stack_size = 0;
+  options->guard_size = 0;
+  options->numa_node = -1;
+}
+
+TF_Thread* TF_StartThread(const TF_ThreadOptions* options,
+                          const char* thread_name, void (*work_func)(void*),
+                          void* param) {
+  ::tensorflow::ThreadOptions cc_options;
+  cc_options.stack_size = options->stack_size;
+  cc_options.guard_size = options->guard_size;
+  cc_options.numa_node = options->numa_node;
+  return reinterpret_cast<TF_Thread*>(::tensorflow::Env::Default()->StartThread(
+      cc_options, thread_name, [=]() { (*work_func)(param); }));
+}
+
+void TF_JoinThread(TF_Thread* thread) {
+  // ::tensorflow::Thread joins on destruction
+  delete reinterpret_cast<::tensorflow::Thread*>(thread);
+}
diff --git a/tensorflow/c/env.h b/tensorflow/c/env.h
new file mode 100644
index 0000000000000000000000000000000000000000..15652353cd7e1f1e7d7a4c665703c0166682d790
--- /dev/null
+++ b/tensorflow/c/env.h
@@ -0,0 +1,194 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <stddef.h>
+#include <stdint.h>
+
+#ifndef TENSORFLOW_C_ENV_H_
+#define TENSORFLOW_C_ENV_H_
+
+#include "tensorflow/c/c_api.h"
+
+// --------------------------------------------------------------------------
+// C API for tensorflow::Env.
+
+struct TF_WritableFileHandle;
+struct TF_StringStream;
+struct TF_Thread;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct TF_FileStatistics {
+  // The length of the file in bytes.
+  int64_t length;
+  // The last modified time in nanoseconds.
+  int64_t mtime_nsec;
+  // Whether the name refers to a directory.
+  bool is_directory;
+} TF_FileStatistics;
+
+typedef struct TF_ThreadOptions {
+  // Thread stack size to use (in bytes), zero implies that the system default
+  // will be used.
+  size_t stack_size;
+
+  // Guard area size to use near thread stacks to use (in bytes), zero implies
+  // that the system default will be used.
+  size_t guard_size;
+
+  // The NUMA node to use, -1 implies that there should be no NUMA affinity for
+  // this thread.
+  int numa_node;
+} TF_ThreadOptions;
+
+// Creates the specified directory. Typical status code are:
+//  * TF_OK - successfully created the directory
+//  * TF_ALREADY_EXISTS - directory already exists
+//  * TF_PERMISSION_DENIED - dirname is not writable
+TF_CAPI_EXPORT extern void TF_CreateDir(const char* dirname, TF_Status* status);
+
+// Deletes the specified directory. Typical status codes are:
+//  * TF_OK - successfully deleted the directory
+//  * TF_FAILED_PRECONDITION - the directory is not empty
+TF_CAPI_EXPORT extern void TF_DeleteDir(const char* dirname, TF_Status* status);
+
+// Deletes the specified directory and all subdirectories and files underneath
+// it. This is accomplished by traversing the directory tree rooted at dirname
+// and deleting entries as they are encountered.
+//
+// If dirname itself is not readable or does not exist, *undeleted_dir_count is
+// set to 1, *undeleted_file_count is set to 0 and an appropriate status (e.g.
+// TF_NOT_FOUND) is returned.
+//
+// If dirname and all its descendants were successfully deleted, TF_OK is
+// returned and both error counters are set to zero.
+//
+// Otherwise, while traversing the tree, undeleted_file_count and
+// undeleted_dir_count are updated if an entry of the corresponding type could
+// not be deleted. The returned error status represents the reason that any one
+// of these entries could not be deleted.
+//
+// Typical status codes:
+//  * TF_OK - dirname exists and we were able to delete everything underneath
+//  * TF_NOT_FOUND - dirname doesn't exist
+//  * TF_PERMISSION_DENIED - dirname or some descendant is not writable
+//  * TF_UNIMPLEMENTED - some underlying functions (like Delete) are not
+//    implemented
+TF_CAPI_EXPORT extern void TF_DeleteRecursively(const char* dirname,
+                                                int64_t* undeleted_file_count,
+                                                int64_t* undeleted_dir_count,
+                                                TF_Status* status);
+
+// Obtains statistics for the given path. If status is TF_OK, *stats is
+// updated, otherwise it is not touched.
+TF_CAPI_EXPORT extern void TF_FileStat(const char* filename,
+                                       TF_FileStatistics* stats,
+                                       TF_Status* status);
+
+// Creates or truncates the given filename and returns a handle to be used for
+// appending data to the file. If status is TF_OK, *handle is updated and the
+// caller is responsible for freeing it (see TF_CloseWritableFile).
+TF_CAPI_EXPORT extern void TF_NewWritableFile(const char* filename,
+                                              TF_WritableFileHandle** handle,
+                                              TF_Status* status);
+
+// Closes the given handle and frees its memory. If there was a problem closing
+// the file, it is indicated by status. Memory is freed in any case.
+TF_CAPI_EXPORT extern void TF_CloseWritableFile(TF_WritableFileHandle* handle,
+                                                TF_Status* status);
+
+// Syncs content of the handle to the filesystem. Blocks waiting for the
+// filesystem to indicate that the content has been persisted.
+TF_CAPI_EXPORT extern void TF_SyncWritableFile(TF_WritableFileHandle* handle,
+                                               TF_Status* status);
+
+// Flush local buffers to the filesystem. If the process terminates after a
+// successful flush, the contents may still be persisted, since the underlying
+// filesystem may eventually flush the contents.  If the OS or machine crashes
+// after a successful flush, the contents may or may not be persisted, depending
+// on the implementation.
+TF_CAPI_EXPORT extern void TF_FlushWritableFile(TF_WritableFileHandle* handle,
+                                                TF_Status* status);
+
+// Appends the given bytes to the file. Any failure to do so is indicated in
+// status.
+TF_CAPI_EXPORT extern void TF_AppendWritableFile(TF_WritableFileHandle* handle,
+                                                 const char* data,
+                                                 size_t length,
+                                                 TF_Status* status);
+
+// Deletes the named file and indicates whether successful in *status.
+TF_CAPI_EXPORT extern void TF_DeleteFile(const char* filename,
+                                         TF_Status* status);
+
+// Retrieves the next item from the given TF_StringStream and places a pointer
+// to it in *result. If no more items are in the list, *result is set to NULL
+// and false is returned.
+//
+// Ownership of the items retrieved with this function remains with the library.
+// Item points are invalidated after a call to TF_StringStreamDone.
+TF_CAPI_EXPORT extern bool TF_StringStreamNext(TF_StringStream* list,
+                                               const char** result);
+
+// Frees the resources associated with given string list. All pointers returned
+// by TF_StringStreamNext are invalid after this call.
+TF_CAPI_EXPORT extern void TF_StringStreamDone(TF_StringStream* list);
+
+// Retrieves the list of children of the given directory. You can iterate
+// through the list with TF_StringStreamNext. The caller is responsible for
+// freeing the list (see TF_StringStreamDone).
+TF_CAPI_EXPORT extern TF_StringStream* TF_GetChildren(const char* filename,
+                                                      TF_Status* status);
+
+// Retrieves a list of directory names on the local machine that may be used for
+// temporary storage. You can iterate through the list with TF_StringStreamNext.
+// The caller is responsible for freeing the list (see TF_StringStreamDone).
+TF_CAPI_EXPORT extern TF_StringStream* TF_GetLocalTempDirectories(void);
+
+// Returns the number of nanoseconds since the Unix epoch.
+TF_CAPI_EXPORT extern uint64_t TF_NowNanos(void);
+
+// Returns the number of microseconds since the Unix epoch.
+TF_CAPI_EXPORT extern uint64_t TF_NowMicros(void);
+
+// Returns the number of seconds since the Unix epoch.
+TF_CAPI_EXPORT extern uint64_t TF_NowSeconds(void);
+
+// Populates a TF_ThreadOptions struct with system-default values.
+TF_CAPI_EXPORT extern void TF_DefaultThreadOptions(TF_ThreadOptions* options);
+
+// Returns a new thread that is running work_func and is identified
+// (for debugging/performance-analysis) by thread_name.
+//
+// The given param (which may be null) is passed to work_func when the thread
+// starts. In this way, data may be passed from the thread back to the caller.
+//
+// Caller takes ownership of the result and must call TF_JoinThread on it
+// eventually.
+TF_CAPI_EXPORT extern TF_Thread* TF_StartThread(const TF_ThreadOptions* options,
+                                                const char* thread_name,
+                                                void (*work_func)(void*),
+                                                void* param);
+
+// Waits for the given thread to finish execution, then deletes it.
+TF_CAPI_EXPORT extern void TF_JoinThread(TF_Thread* thread);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // TENSORFLOW_C_ENV_H_
diff --git a/tensorflow/c/env_test.cc b/tensorflow/c/env_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..687ad024137352662759ec1f43df87e89faca353
--- /dev/null
+++ b/tensorflow/c/env_test.cc
@@ -0,0 +1,127 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/c/env.h"
+
+#include "tensorflow/c/c_api.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/types.h"
+
+#define ASSERT_TF_OK(x) ASSERT_EQ(TF_OK, TF_GetCode(x))
+
+TEST(TestEnv, TestDirHandling) {
+  TF_StringStream* tempdirs = TF_GetLocalTempDirectories();
+  const char* tempdir;
+  bool found = false;
+  while (TF_StringStreamNext(tempdirs, &tempdir)) {
+    found = true;
+
+    TF_Status* s = TF_NewStatus();
+
+    ::tensorflow::string dirpath =
+        ::tensorflow::io::JoinPath(tempdir, "somedir");
+    TF_CreateDir(dirpath.c_str(), s);
+    ASSERT_TF_OK(s) << "TF_CreateDir failed for " << dirpath << ": "
+                    << TF_Message(s);
+
+    ::tensorflow::string filepath =
+        ::tensorflow::io::JoinPath(dirpath, "somefile.txt");
+    TF_WritableFileHandle* handle;
+    TF_NewWritableFile(filepath.c_str(), &handle, s);
+    ASSERT_TF_OK(s) << "NewWritableFile failed for " << filepath << ": "
+                    << TF_Message(s);
+
+    const char* data = "Hello, world!\n";
+    TF_AppendWritableFile(handle, data, strlen(data), s);
+    ASSERT_TF_OK(s) << "TF_AppendWritableFile failed to append data to file at "
+                    << filepath << ": " << TF_Message(s);
+
+    TF_CloseWritableFile(handle, s);
+    ASSERT_TF_OK(s) << "TF_CloseWritableFile failed to close handle to "
+                    << filepath << ": " << TF_Message(s);
+
+    TF_StringStream* children = TF_GetChildren(dirpath.c_str(), s);
+    ASSERT_TF_OK(s) << "TF_GetChildren failed for " << dirpath;
+    const char* childpath;
+    ASSERT_TRUE(TF_StringStreamNext(children, &childpath));
+    ASSERT_EQ(::tensorflow::string(childpath), "somefile.txt");
+    // There should only be one file in this directory.
+    ASSERT_FALSE(TF_StringStreamNext(children, &childpath));
+    ASSERT_EQ(childpath, nullptr);
+    TF_StringStreamDone(children);
+
+    TF_FileStatistics stats;
+    TF_FileStat(filepath.c_str(), &stats, s);
+    ASSERT_EQ(stats.length, strlen(data));
+    ASSERT_FALSE(stats.is_directory);
+    ASSERT_GT(stats.mtime_nsec, 0);
+
+    // Trying to delete a non-empty directory should fail.
+    TF_DeleteDir(dirpath.c_str(), s);
+    ASSERT_NE(TF_OK, TF_GetCode(s))
+        << "TF_DeleteDir unexpectedly succeeded with a non-empty directory "
+        << dirpath;
+
+    TF_DeleteFile(filepath.c_str(), s);
+    ASSERT_TF_OK(s) << "TF_DeleteFile failed for " << filepath << ": "
+                    << TF_Message(s);
+
+    // Now deleting the directory should work.
+    TF_DeleteDir(dirpath.c_str(), s);
+    ASSERT_TF_OK(s) << "TF_DeleteDir failed for " << dirpath << ": "
+                    << TF_Message(s);
+
+    TF_DeleteStatus(s);
+    break;
+  }
+
+  ASSERT_TRUE(found) << "expected at least one temp dir";
+
+  TF_StringStreamDone(tempdirs);
+}
+
+TEST(TestEnv, TestTimeFunctions) {
+  ASSERT_GE(TF_NowSeconds(), 946684800);  // Midnight Jan 1, 2000
+  ASSERT_GE(TF_NowMicros(), 946684800 * 1e6);
+  ASSERT_GE(TF_NowNanos(), 946684800 * 1e9);
+}
+
+namespace {
+
+struct SomeThreadData {
+  ::tensorflow::mutex mu;
+  bool did_work = false;
+};
+
+void SomeThreadFunc(void* data) {
+  auto* real_data = static_cast<SomeThreadData*>(data);
+  ::tensorflow::mutex_lock l(real_data->mu);
+  real_data->did_work = true;
+}
+
+}  // namespace
+
+TEST(TestEnv, TestThreads) {
+  TF_ThreadOptions options;
+  TF_DefaultThreadOptions(&options);
+  SomeThreadData data;
+  TF_Thread* thread =
+      TF_StartThread(&options, "SomeThreadName", &SomeThreadFunc, &data);
+  TF_JoinThread(thread);
+  ::tensorflow::mutex_lock l(data.mu);
+  ASSERT_TRUE(data.did_work);
+}
diff --git a/tensorflow/c/kernels.cc b/tensorflow/c/kernels.cc
index ca69345264607ac689fb556b4f5c9bc08ea5eb88..2a4eaecb6cf2740a522b1e849d1306ebde6c4577 100644
--- a/tensorflow/c/kernels.cc
+++ b/tensorflow/c/kernels.cc
@@ -15,7 +15,9 @@ limitations under the License.
 
 #include <memory>
 
+#include "tensorflow/c/c_api_internal.h"
 #include "tensorflow/c/kernels.h"
+#include "tensorflow/c/tf_status_helper.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
 
@@ -116,3 +118,43 @@ void TF_RegisterKernelBuilder(const char* name, TF_KernelBuilder* builder,
 
   TF_SetStatus(status, TF_OK, "");
 }
+
+int TF_NumInputs(TF_OpKernelContext* ctx) {
+  auto* cc_ctx = reinterpret_cast<::tensorflow::OpKernelContext*>(ctx);
+  return cc_ctx->num_inputs();
+}
+
+int TF_NumOutputs(TF_OpKernelContext* ctx) {
+  auto* cc_ctx = reinterpret_cast<::tensorflow::OpKernelContext*>(ctx);
+  return cc_ctx->num_outputs();
+}
+
+void TF_GetInput(TF_OpKernelContext* ctx, int i, TF_Tensor** tensor,
+                 TF_Status* status) {
+  auto* cc_ctx = reinterpret_cast<::tensorflow::OpKernelContext*>(ctx);
+  if (i < 0 || i >= cc_ctx->num_inputs()) {
+    TF_SetStatus(status, TF_OUT_OF_RANGE, "input index out of range");
+    return;
+  }
+  const ::tensorflow::Tensor& cc_tensor(cc_ctx->input(i));
+  TF_Tensor* result = ::tensorflow::TF_TensorFromTensor(cc_tensor, status);
+  if (TF_GetCode(status) == TF_OK) {
+    *tensor = result;
+  }
+}
+
+void TF_SetOutput(TF_OpKernelContext* ctx, int i, const TF_Tensor* tensor,
+                  TF_Status* status) {
+  auto* cc_ctx = reinterpret_cast<::tensorflow::OpKernelContext*>(ctx);
+  if (i < 0 || i >= cc_ctx->num_inputs()) {
+    TF_SetStatus(status, TF_OUT_OF_RANGE, "input index out of range");
+    return;
+  }
+  ::tensorflow::Tensor cc_tensor;
+  ::tensorflow::Status s = ::tensorflow::TF_TensorToTensor(tensor, &cc_tensor);
+  TF_SetStatus(status, TF_OK, "");
+  ::tensorflow::Set_TF_Status_from_Status(status, s);
+  if (s.ok()) {
+    cc_ctx->set_output(i, cc_tensor);
+  }
+}
diff --git a/tensorflow/c/kernels.h b/tensorflow/c/kernels.h
index 2518789a3c141755d0b3373d53642c487331f68b..1a91aa184f11ac8e45b38a1d106c7b445747a7c1 100644
--- a/tensorflow/c/kernels.h
+++ b/tensorflow/c/kernels.h
@@ -85,6 +85,32 @@ TF_CAPI_EXPORT extern void TF_RegisterKernelBuilder(const char* kernel_name,
 // builder is not registered with TensorFlow via TF_RegisterKernelBuilder.
 TF_CAPI_EXPORT extern void TF_DeleteKernelBuilder(TF_KernelBuilder* builder);
 
+// --------------------------------------------------------------------------
+// OpKernelContext routines
+
+// TF_NumInputs returns the number of inputs available in ctx.
+TF_CAPI_EXPORT extern int TF_NumInputs(TF_OpKernelContext* ctx);
+
+// TF_NumOutputs returns the number of outputs to be placed in *ctx by the
+// kernel.
+TF_CAPI_EXPORT extern int TF_NumOutputs(TF_OpKernelContext* ctx);
+
+// Retrieves the ith input from ctx. If TF_GetCode(status) is TF_OK, *tensor is
+// populated and its ownership is passed to the caller. In any other case,
+// *tensor is not modified.
+//
+// If i < 0 or i >= TF_NumInputs(ctx), *status is set to TF_OUT_OF_RANGE.
+TF_CAPI_EXPORT extern void TF_GetInput(TF_OpKernelContext* ctx, int i,
+                                       TF_Tensor** tensor, TF_Status* status);
+
+// Sets the ith output of ctx to tensor. If TF_GetCode(status) is anything but
+// TF_OK, ctx is left unmodified.
+//
+// If i < 0 or i >= TF_NumOutputs(ctx), *status is set to TF_OUT_OF_RANGE.
+TF_CAPI_EXPORT extern void TF_SetOutput(TF_OpKernelContext* ctx, int i,
+                                        const TF_Tensor* tensor,
+                                        TF_Status* status);
+
 #ifdef __cplusplus
 } /* end extern "C" */
 #endif
diff --git a/tensorflow/c/kernels_test.cc b/tensorflow/c/kernels_test.cc
index e706c7c1d96ee1781d8efc0f28c5e0cbcbc80861..e659ee3c3d258a626ccf03a782ec031b5a703a48 100644
--- a/tensorflow/c/kernels_test.cc
+++ b/tensorflow/c/kernels_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/c/kernels.h"
 
+#include "tensorflow/c/c_api.h"
 #include "tensorflow/core/framework/kernel_def.pb.h"
 #include "tensorflow/core/framework/node_def.pb_text.h"
 #include "tensorflow/core/framework/op.h"
@@ -31,7 +32,6 @@ struct MyCustomKernel {
 static bool delete_called = false;
 
 static void* MyCreateFunc(TF_OpKernelConstruction* ctx) {
-  LOG(INFO) << "Wow, actually got into creation";
   struct MyCustomKernel* s = new struct MyCustomKernel;
   s->created = true;
   s->compute_called = false;
@@ -51,12 +51,31 @@ static void MyDeleteFunc(void* kernel) {
   delete s;
 }
 
+namespace tensorflow {
+
+static std::unique_ptr<OpKernel> GetFakeKernel(const char* device_name,
+                                               const char* op_name,
+                                               Status* status) {
+  NodeDef def;
+  def.set_op(op_name);
+  def.set_device(device_name);
+  def.add_input("input1");
+  def.add_input("input2");
+  return CreateOpKernel(DeviceType(device_name), nullptr, nullptr, def, 1,
+                        status);
+}
+
 // Tests registration of a single C kernel and checks that calls through the
 // C/C++ boundary are being made.
 TEST(TestKernel, TestRegisterKernelBuilder) {
   const char* kernel_name = "SomeKernelName";
   const char* op_name = "FooOp";
-  const char* device_name = "barDev";
+  const char* device_name = "FakeDeviceName1";
+
+  REGISTER_OP(op_name)
+      .Input("input1: double")
+      .Input("input2: uint8")
+      .Output("output1: uint8");
 
   TF_KernelBuilder* builder = TF_NewKernelBuilder(
       op_name, device_name, &MyCreateFunc, &MyComputeFunc, &MyDeleteFunc);
@@ -65,35 +84,120 @@ TEST(TestKernel, TestRegisterKernelBuilder) {
     TF_Status* status = TF_NewStatus();
     TF_RegisterKernelBuilder(kernel_name, builder, status);
     EXPECT_EQ(TF_OK, TF_GetCode(status));
-    TF_Buffer* buf = TF_GetRegisteredKernelsForOp("FooOp", status);
+    TF_Buffer* buf = TF_GetRegisteredKernelsForOp(op_name, status);
     EXPECT_EQ(TF_OK, TF_GetCode(status));
-    ::tensorflow::KernelList list;
+    KernelList list;
     list.ParseFromArray(buf->data, buf->length);
     ASSERT_EQ(1, list.kernel_size());
-    ASSERT_EQ("barDev", list.kernel(0).device_type());
+    ASSERT_EQ(device_name, list.kernel(0).device_type());
     TF_DeleteBuffer(buf);
     TF_DeleteStatus(status);
   }
 
-  REGISTER_OP("FooOp")
+  {
+    Status status;
+    std::unique_ptr<OpKernel> kernel =
+        GetFakeKernel(device_name, op_name, &status);
+    TF_EXPECT_OK(status);
+    ASSERT_NE(nullptr, kernel.get());
+    kernel->Compute(nullptr);
+  }
+
+  ASSERT_TRUE(delete_called);
+}
+
+class DummyDevice : public DeviceBase {
+ public:
+  DummyDevice(Env* env, bool save) : DeviceBase(env), save_(save) {}
+  bool RequiresRecordingAccessedTensors() const override { return save_; }
+  Allocator* GetAllocator(AllocatorAttributes /*attr*/) override {
+    return cpu_allocator();
+  }
+
+ private:
+  bool save_;
+};
+
+TEST(TestKernel, TestInputAndOutputCount) {
+  const char* kernel_name = "InputOutputCounterKernel";
+  const char* op_name = "BarOp";
+  const char* device_name = "FakeDeviceName2";
+
+  REGISTER_OP(op_name)
       .Input("input1: double")
       .Input("input2: uint8")
       .Output("output1: uint8");
 
+  static int num_inputs = 0;
+  static int num_outputs = 0;
+
+  // A kernel whose Compute function has a side-effect of updating num_inputs
+  // and num_outputs. Various functions on TF_OpKernelContext are also
+  // exercised.
+  auto my_compute_func = [](void* kernel, TF_OpKernelContext* ctx) {
+    num_inputs = TF_NumInputs(ctx);
+    num_outputs = TF_NumOutputs(ctx);
+
+    TF_Tensor* input = nullptr;
+    TF_Status* s = TF_NewStatus();
+    TF_GetInput(ctx, 0, &input, s);
+    EXPECT_EQ(TF_OK, TF_GetCode(s)) << "Failed to get input: " << TF_Message(s);
+    EXPECT_EQ(123, *static_cast<tensorflow::uint8*>(TF_TensorData(input)));
+    TF_GetInput(ctx, -1, &input, s);
+    EXPECT_EQ(TF_OUT_OF_RANGE, TF_GetCode(s));
+    TF_GetInput(ctx, 3, &input, s);
+    EXPECT_EQ(TF_OUT_OF_RANGE, TF_GetCode(s));
+
+    // Copy the input tensor to output.
+    TF_SetOutput(ctx, 0, input, s);
+    EXPECT_EQ(TF_OK, TF_GetCode(s));
+
+    TF_SetOutput(ctx, 24, input, s);
+    EXPECT_EQ(TF_OUT_OF_RANGE, TF_GetCode(s));
+
+    TF_DeleteStatus(s);
+    if (input != nullptr) {
+      TF_DeleteTensor(input);
+    }
+  };
+
+  TF_KernelBuilder* builder = TF_NewKernelBuilder(op_name, device_name, nullptr,
+                                                  my_compute_func, nullptr);
+
+  {
+    TF_Status* status = TF_NewStatus();
+    TF_RegisterKernelBuilder(kernel_name, builder, status);
+    EXPECT_EQ(TF_OK, TF_GetCode(status));
+    TF_DeleteStatus(status);
+  }
+
   {
-    ::tensorflow::NodeDef def;
-    def.set_op("FooOp");
-    def.set_device("bar");
-    def.add_input("input1");
-    def.add_input("input2");
-    ::tensorflow::Status status;
-    std::unique_ptr<::tensorflow::OpKernel> kernel =
-        ::tensorflow::CreateOpKernel(::tensorflow::DeviceType("barDev"),
-                                     nullptr, nullptr, def, 1, &status);
+    OpKernelContext::Params p;
+    DummyDevice dummy_device(nullptr, false);
+    p.device = &dummy_device;
+
+    Tensor t(tensorflow::uint8(123));
+
+    gtl::InlinedVector<TensorValue, 4> inputs;
+    // Simulate 2 inputs
+    inputs.emplace_back(&t);
+    inputs.emplace_back();
+    p.inputs = &inputs;
+
+    Status status;
+    std::unique_ptr<OpKernel> kernel =
+        GetFakeKernel(device_name, op_name, &status);
     TF_EXPECT_OK(status);
     ASSERT_NE(nullptr, kernel.get());
-    kernel->Compute(nullptr);
-  }
 
-  ASSERT_TRUE(delete_called);
+    p.op_kernel = kernel.get();
+    OpKernelContext ctx(&p);
+    kernel->Compute(&ctx);
+
+    ASSERT_EQ(2, num_inputs);
+    ASSERT_EQ(1, num_outputs);
+    ASSERT_EQ(123, ctx.mutable_output(0)->scalar<tensorflow::uint8>()());
+  }
 }
+
+}  // namespace tensorflow
diff --git a/tensorflow/c/python_api.cc b/tensorflow/c/python_api.cc
index 247236b760dd8c07bbb08426100b6a4d34296d2e..98d8393332269ae349cf8aa5c0b612c6f17172e6 100644
--- a/tensorflow/c/python_api.cc
+++ b/tensorflow/c/python_api.cc
@@ -160,4 +160,17 @@ void SetHandleShapeAndType(TF_Graph* graph, TF_Output output, const void* proto,
   ic->set_output_handle_shapes_and_types(output.index, shapes_and_types);
 }
 
+void AddWhileInputHack(TF_Graph* graph, TF_Output new_src, TF_Operation* dst,
+                       TF_Status* status) {
+  mutex_lock l(graph->mu);
+  status->status = graph->graph.AddWhileInputHack(&new_src.oper->node,
+                                                  new_src.index, &dst->node);
+  if (status->status.ok()) {
+    // This modification only updates the destination node for
+    // the purposes of running this graph in a session. Thus, we don't
+    // record the source node as being modified.
+    RecordMutation(graph, *dst, "adding input tensor");
+  }
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/c/python_api.h b/tensorflow/c/python_api.h
index 5cce84020bc68d912d259f51512341eb5f464a2c..44779ca656165dd65590cb5e9ea3ccf71165ed63 100644
--- a/tensorflow/c/python_api.h
+++ b/tensorflow/c/python_api.h
@@ -34,6 +34,7 @@ void SetAttr(TF_Graph* graph, TF_Operation* op, const char* attr_name,
 
 void SetRequestedDevice(TF_Graph* graph, TF_Operation* op, const char* device);
 
+// Updates 'dst' to consume 'new_src'.
 void UpdateEdge(TF_Graph* graph, TF_Output new_src, TF_Input dst,
                 TF_Status* status);
 
@@ -65,6 +66,13 @@ std::string GetHandleShapeAndType(TF_Graph* graph, TF_Output output);
 // because I couldn't get SWIG to work otherwise.
 void SetHandleShapeAndType(TF_Graph* graph, TF_Output output, const void* proto,
                            size_t proto_len, TF_Status* status);
+
+// This method is used to add a new input edge to 'dst', which must be a While
+// op. The While op's "T" attribute must have already been updated to include
+// the new edge. This is used to construct tf.while_loop gradients.
+void AddWhileInputHack(TF_Graph* graph, TF_Output new_src, TF_Operation* dst,
+                       TF_Status* status);
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_C_PYTHON_API_H_
diff --git a/tensorflow/compat_template_v1.__init__.py b/tensorflow/compat_template_v1.__init__.py
index 7df80ec01245a7fe820c79d5879458c4cd0a93cb..d58acde09f007bc9df40b08b0ef79c6031ca7941 100644
--- a/tensorflow/compat_template_v1.__init__.py
+++ b/tensorflow/compat_template_v1.__init__.py
@@ -23,12 +23,12 @@ import os as _os
 # pylint: disable=g-bad-import-order
 from tensorflow.python import pywrap_tensorflow  # pylint: disable=unused-import
 
+# API IMPORTS PLACEHOLDER
+
 from tensorflow.python.tools import component_api_helper as _component_api_helper
 _component_api_helper.package_hook(
     parent_package_str=__name__,
     child_package_str=('tensorflow_estimator.python.estimator.api.estimator'))
 
-# API IMPORTS PLACEHOLDER
-
 from tensorflow.python.platform import flags  # pylint: disable=g-import-not-at-top
 app.flags = flags  # pylint: disable=undefined-variable
diff --git a/tensorflow/compiler/aot/codegen.cc b/tensorflow/compiler/aot/codegen.cc
index e0ac7130a64d3928c39440c0e10a2d2e1990b9cd..ab1c1be344e2257721507543bc7647d4ff4becb2 100644
--- a/tensorflow/compiler/aot/codegen.cc
+++ b/tensorflow/compiler/aot/codegen.cc
@@ -178,7 +178,7 @@ Status GenArgMethods(const tf2xla::Config& config,
     TF_RETURN_IF_ERROR(
         AddRewritesForShape(i, xla::Shape(ps.parameters(i)), &rewrites));
     const string code = R"(
-  void set_arg{{NAME}}_data(void* data) {
+  void set_arg{{NAME}}_data(const void* data) {
     set_arg_data({{I}}, data);
   }
   {{TYPE}}* arg{{NAME}}_data() {
diff --git a/tensorflow/compiler/aot/codegen_test_h.golden b/tensorflow/compiler/aot/codegen_test_h.golden
index a2cdab5d1a8e72504ca11b789287d4efd07a59e9..968afad65ed6d4b5510687df484b7ce6743f6a85 100644
--- a/tensorflow/compiler/aot/codegen_test_h.golden
+++ b/tensorflow/compiler/aot/codegen_test_h.golden
@@ -114,7 +114,7 @@ class MyClass : public tensorflow::XlaCompiledCpuFunction {
   //   with dim indices specifying which value. No bounds checking is performed
   //   on dim indices.
 
-  void set_arg0_data(void* data) {
+  void set_arg0_data(const void* data) {
     set_arg_data(0, data);
   }
   float* arg0_data() {
@@ -132,7 +132,7 @@ class MyClass : public tensorflow::XlaCompiledCpuFunction {
         arg_data(0)))[dim0][dim1];
   }
 
-  void set_arg_myfeed_data(void* data) {
+  void set_arg_myfeed_data(const void* data) {
     set_arg_data(0, data);
   }
   float* arg_myfeed_data() {
@@ -150,7 +150,7 @@ class MyClass : public tensorflow::XlaCompiledCpuFunction {
         arg_data(0)))[dim0][dim1];
   }
 
-  void set_arg1_data(void* data) {
+  void set_arg1_data(const void* data) {
     set_arg_data(1, data);
   }
   tensorflow::int64* arg1_data() {
diff --git a/tensorflow/compiler/aot/tfcompile.bzl b/tensorflow/compiler/aot/tfcompile.bzl
index 2dc3e8c9113b37bf9d575ad66783f4ab49478af4..4051664c24cacad4a2d151ad3ac9009015900609 100644
--- a/tensorflow/compiler/aot/tfcompile.bzl
+++ b/tensorflow/compiler/aot/tfcompile.bzl
@@ -283,7 +283,7 @@ def tf_library(
     )
 
     # Variables used for gen_test and gen_benchmark.
-    cpp_class_split = cpp_class.rsplit("::", maxsplit = 2)
+    cpp_class_split = cpp_class.rsplit("::", 2)
     if len(cpp_class_split) == 1:
         no_ns_name = cpp_class_split[0]
     else:
diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index be91ed4f432b1890c22900f293fd4196e5c9d970..d8c88a9fca2db74265b4962e07a66ab214b1d994 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -76,6 +76,7 @@ cc_library(
     srcs = ["xla_cpu_device.cc"],
     visibility = [":friends"],
     deps = [
+        ":create_xla_launch_op",  # buildcleaner: keep
         ":flags",
         ":jit_compilation_passes",
         ":xla_device",
@@ -95,6 +96,7 @@ cc_library(
     srcs = ["xla_gpu_device.cc"],
     visibility = [":friends"],
     deps = [
+        ":create_xla_launch_op",  # buildcleaner: keep
         ":jit_compilation_passes",
         ":xla_device",
         "//tensorflow/compiler/jit/kernels:xla_ops",
@@ -104,6 +106,7 @@ cc_library(
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
     ],
     alwayslink = 1,
 )
@@ -512,6 +515,7 @@ cc_library(
         "//tensorflow/compiler/jit/ops:xla_ops",
         "//tensorflow/compiler/tf2xla:dump_graph",
         "//tensorflow/compiler/tf2xla:resource_operation_table",
+        "//tensorflow/compiler/tf2xla:side_effect_util",
         "//tensorflow/compiler/tf2xla:tf2xla_util",
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/tf2xla/cc:xla_jit_ops",
@@ -610,6 +614,7 @@ tf_cc_test(
         "//tensorflow/cc:cc_ops",
         "//tensorflow/cc:cc_ops_internal",
         "//tensorflow/cc:function_ops",
+        "//tensorflow/cc:functional_ops",
         "//tensorflow/cc:ops",
         "//tensorflow/cc:resource_variable_ops",
         "//tensorflow/cc:scope",
@@ -622,6 +627,7 @@ tf_cc_test(
         "//tensorflow/compiler/tf2xla/cc:xla_ops",
         "//tensorflow/compiler/tf2xla/kernels:xla_dummy_ops",
         "//tensorflow/compiler/tf2xla/kernels:xla_ops",
+        "//tensorflow/compiler/xla:test",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
index f478832781cb1dc045d9163d4a6f5e5f64a8a705..03aba97bbe81a11f6366d118ee5bc573d0c6b31b 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
@@ -779,7 +779,8 @@ Status Encapsulator::Subgraph::RecordArg(
   if (inserted) {
     NodeDef arg_def;
     NodeDefBuilder builder(
-        absl::StrCat(src_node->name(), "_", src_slot, "_arg"), kArgOp);
+        absl::StrCat(src_node->name(), "_", src_slot, "_arg"), kArgOp,
+        NodeDebugInfo(src_node->def()));
     DataType dtype = edge->dst()->input_type(edge->dst_input());
     builder.Attr("T", dtype);
     builder.Attr("index", arg_index);
@@ -814,7 +815,8 @@ Status Encapsulator::Subgraph::RecordResult(
   if (inserted) {
     NodeDef ret_def;
     NodeDefBuilder builder(
-        absl::StrCat(src_node->name(), "_", src_slot, "_retval"), kRetValOp);
+        absl::StrCat(src_node->name(), "_", src_slot, "_retval"), kRetValOp,
+        NodeDebugInfo(src_node->def()));
     DataType dtype = src_node->output_type(src_slot);
     builder.Attr("T", dtype);
     builder.Attr("index", ret_index);
@@ -974,6 +976,7 @@ Status Encapsulator::Subgraph::AddHostComputes(
       }
 
       NodeDef host_compute_def;
+      // TODO(shikharagarwal): What source node should we use for errors?
       NodeDefBuilder builder(absl::StrCat("outside_compilation_",
                                           oc_subgraph_name, "_host_compute"),
                              kHostComputeOp);
@@ -1040,6 +1043,7 @@ Status Encapsulator::Subgraph::MakeSequencingNode(const string& subgraph_name,
                                                   Graph* graph_out) {
   if (sequencer_ == nullptr) {
     NodeDef seq_def;
+    // TODO(shikharagarwal): What source node should we use for errors?
     NodeDefBuilder builder(absl::StrCat(subgraph_name, "_sequencer"), "NoOp");
     builder.Attr(kXlaHostTransferSequencerAttr, subgraph_name);
     builder.Device(device_);
@@ -1214,7 +1218,8 @@ Status Encapsulator::Subgraph::AddHostComputeKeyPlaceholder(
   GraphDefBuilder::Options options(graph_out, /*status=*/nullptr);
   NodeDef key_def;
   NodeDefBuilder builder(
-      absl::StrCat(call_node_def_.name(), "_key_placeholder"), "Placeholder");
+      absl::StrCat(call_node_def_.name(), "_key_placeholder"), "Placeholder",
+      NodeDebugInfo(call_node_def_));
   builder.Attr("dtype", DT_STRING);
   builder.Attr("shape", shape_proto);
   builder.Attr("_host_compute_call_node", call_node_def_.name());
@@ -1248,6 +1253,7 @@ Status Encapsulator::Subgraph::AddRecvAtHostNode(
   }
 
   NodeDef recv_def;
+  // TODO(shikharagarwal): What source node should we use for errors?
   NodeDefBuilder builder(absl::StrCat("outside_compilation_", subgraph_name,
                                       "_", oc_subgraph_name, "_recv"),
                          kRecvAtHostOp);
@@ -1303,6 +1309,7 @@ Status Encapsulator::Subgraph::AddSendFromHostNode(
   }
 
   NodeDef send_def;
+  // TODO(shikharagarwal): What source node should we use for errors?
   NodeDefBuilder builder(absl::StrCat("outside_compilation_", subgraph_name,
                                       "_", oc_subgraph_name, "_send"),
                          kSendFromHostOp);
@@ -1833,8 +1840,9 @@ Node* AddDummyShapedNode(const Node* src_node, int src_port,
   // Add any Enter nodes required to bring the constant to the correct control
   // flow frame.
   while (!control_flow_info[src_node->id()].frame_name.empty()) {
+    NodeDebugInfo debug_info(*src_node);
     NodeBuilder enter_builder(options.GetNameForOp("Enter"), "Enter",
-                              options.op_registry());
+                              options.op_registry(), &debug_info);
     enter_builder.Attr("frame_name",
                        control_flow_info[src_node->id()].frame_name);
     enter_builder.Attr("is_constant", true);
@@ -2018,7 +2026,8 @@ Status Encapsulator::DoStaticShapeInferenceForOutsideCompilationSend(
             return errors::InvalidArgument(
                 "Shape inference is not possible for outside_compilation "
                 "SendFromHost node ",
-                send_node->name(), " because shape of node ", n->name(),
+                send_node->name(), " because shape of node ",
+                FormatNodeForError(*n),
                 " will not be known at compilation time.");
           }
         }
@@ -2047,8 +2056,7 @@ Status Encapsulator::DoStaticShapeInferenceForOutsideCompilationSend(
         return errors::Internal(
             "Internal assumption failed while rewriting an outside_compilation "
             "cluster that contains a while loop. Logic assumes back-edge is to "
-            "port 1 of a 2-input "
-            "Merge node.");
+            "port 1 of a 2-input Merge node.");
       }
       // Connect the existing edge to both inputs of the Merge node so that the
       // graph will be well-formed.
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
index de89be9a3555960dabe7bacd17226c15ae888ae6..8617beec004d0fe912155f054442c5b6249bb6b5 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
@@ -299,7 +299,7 @@ REGISTER_OP("XlaHostCompute")
     .Attr("Toutputs: list(type) >= 0")
     .Attr("ancestors: list(string) >= 0")
     .Attr("key: string")
-    .Attr("shape_inference_graph: string = ''")
+    .Attr("shape_inference_graph: func")
     .Attr("shapes: list(shape) >= 0")
     .SetShapeFn(::tensorflow::shape_inference::UnknownShape);
 
@@ -510,11 +510,7 @@ Status Encapsulate(GraphDef* graphdef, FunctionDefLibrary* library,
   s = ConvertGraphDefToGraph(options, *graphdef, graph.get());
   if (!s.ok()) return s;
 
-  s = PerformStaticShapeInferenceBeforeEncapsulation(
-      graph.get(), "_encapsulate", "_outside");
-  if (!s.ok()) return s;
-
-  s = PreprocessForEncapsulation(graph.get(), "_encapsulate", "_outside");
+  s = PerformStaticShapeInferenceBeforeEncapsulation(graph.get());
   if (!s.ok()) return s;
 
   std::unique_ptr<Graph> graph_out;
@@ -550,6 +546,14 @@ Status Encapsulate(GraphDef* graphdef, FunctionDefLibrary* library,
   graphdef->Swap(&graphdef_out);
 
   *library = lib_def->ToProto();
+  // Remove "_xla_inferred_shapes" attr. They are added by
+  // `PerformStaticShapeInferenceBeforeEncapsulation`.
+  for (FunctionDef& fdef : *library->mutable_function()) {
+    for (NodeDef& node_def : *fdef.mutable_node_def()) {
+      node_def.mutable_attr()->erase("_xla_inferred_shapes");
+    }
+  }
+
   return s;
 }
 
@@ -901,18 +905,22 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) {
   {
     GraphDefBuilder shape(GraphDefBuilder::kFailImmediately);
     Node* key_constant = KeyPlaceholder("F1", shape.opts());
-    Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
-                            {DT_FLOAT, DT_FLOAT}, shape.opts());
+    Node* recv = RecvAtHost(
+        ops::NodeOut(key_constant, 0), "F1", "O1", {DT_FLOAT, DT_FLOAT},
+        shape.opts().WithAttr(kXlaHasHostTransferAttrName, true));
     Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1),
                      shape.opts()
                          .WithName("E")
                          .WithAttr("_encapsulate", "F1")
                          .WithAttr("_outside", "O1"));
-    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, shape.opts());
+    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e},
+                 shape.opts().WithAttr(kXlaHasHostTransferAttrName, true));
     TF_EXPECT_OK(
         AddGraphDefToFunctionLibrary(shape, "F1_O1", &library_expected));
   }
 
+  NameAttrList shape_inference_graph;
+  shape_inference_graph.set_name("_outside_compilation_shape_inference_F1_O1");
   *library_expected.add_function() = test::function::XTimesTwo();
   *library_expected.add_function() = FunctionDefHelper::Create(
       "F1", {"a_0_arg:float", "b_0_arg:float"}, {"f_0_retval_retval:float"}, {},
@@ -931,8 +939,7 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) {
             {"Toutputs", absl::Span<const DataType>({DT_FLOAT})},
             {"ancestors", absl::Span<const string>({})},
             {"key", "host_compute_channel_F1_O1"},
-            {"shape_inference_graph",
-             "_outside_compilation_shape_inference_F1_O1"},
+            {"shape_inference_graph", shape_inference_graph},
             {"shapes", absl::Span<const DataType>({})},
             {"_outside_compilation_subgraph", "O1"}},
            {"c"}},
@@ -948,16 +955,18 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) {
 
     Node* key_constant =
         KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
-    Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
-                            {DT_FLOAT, DT_FLOAT}, b2.opts());
+    Node* recv = RecvAtHost(
+        ops::NodeOut(key_constant, 0), "F1", "O1", {DT_FLOAT, DT_FLOAT},
+        b2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
     Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1),
                      b2.opts()
                          .WithName("E")
-                         .WithControlInputs({recv, b})
+                         .WithControlInputs({recv})
                          .WithAttr("_encapsulate", "F1")
                          .WithAttr("_outside", "O1"));
     Node* send = SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e},
-                              b2.opts().WithControlInput(e));
+                              b2.opts().WithControlInput(e).WithAttr(
+                                  kXlaHasHostTransferAttrName, true));
 
     Node* s = Sequencer(
         b2.opts().WithName("F1_sequencer").WithControlInputs({recv, send}),
@@ -966,9 +975,9 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) {
     NodeBuilder node_builder("F1", "F1", lib_def.get());
     node_builder.Input(a).Input(b);
     Node* call =
-        b2.opts().WithControlInputs({s}).FinalizeBuilder(&node_builder);
+        b2.opts().WithControlInputs({s, b}).FinalizeBuilder(&node_builder);
 
-    Binary(a, call, b2.opts().WithName("G").WithControlInputs({e}));
+    Binary(a, call, b2.opts().WithName("G").WithControlInputs({call}));
     TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
   }
 
@@ -1022,14 +1031,16 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) {
   {
     GraphDefBuilder shape1(GraphDefBuilder::kFailImmediately);
     Node* key_constant = KeyPlaceholder("F1", shape1.opts());
-    Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
-                            {DT_FLOAT, DT_FLOAT}, shape1.opts());
+    Node* recv = RecvAtHost(
+        ops::NodeOut(key_constant, 0), "F1", "O1", {DT_FLOAT, DT_FLOAT},
+        shape1.opts().WithAttr(kXlaHasHostTransferAttrName, true));
     Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1),
                      shape1.opts()
                          .WithName("E")
                          .WithAttr("_encapsulate", "F1")
                          .WithAttr("_outside", "O1"));
-    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, shape1.opts());
+    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e},
+                 shape1.opts().WithAttr(kXlaHasHostTransferAttrName, true));
     TF_EXPECT_OK(
         AddGraphDefToFunctionLibrary(shape1, "F1_O1", &library_expected));
   }
@@ -1037,33 +1048,45 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) {
   {
     GraphDefBuilder shape2(GraphDefBuilder::kFailImmediately);
     Node* key_constant = KeyPlaceholder("F1", shape2.opts());
-    Node* recv1 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
-                             {DT_FLOAT, DT_FLOAT}, shape2.opts());
+    Node* recv1 = RecvAtHost(
+        ops::NodeOut(key_constant, 0), "F1", "O1", {DT_FLOAT, DT_FLOAT},
+        shape2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
     Node* e = Binary(ops::NodeOut(recv1, 0), ops::NodeOut(recv1, 1),
                      shape2.opts()
                          .WithName("E")
                          .WithAttr("_encapsulate", "F1")
                          .WithAttr("_outside", "O1"));
-    Node* recv2 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O2",
-                             {DT_FLOAT, DT_FLOAT}, shape2.opts());
+    Node* recv2 = RecvAtHost(
+        ops::NodeOut(key_constant, 0), "F1", "O2", {DT_FLOAT, DT_FLOAT},
+        shape2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
+    Node* g = Binary(e, ops::NodeOut(recv2, 0),
+                     shape2.opts()
+                         .WithName("G")
+                         .WithAttr("_encapsulate", "F1")
+                         .WithAttr("_outside", "O2"));
     Node* h = Binary(ops::NodeOut(recv2, 1), e,
                      shape2.opts()
                          .WithName("H")
                          .WithAttr("_encapsulate", "F1")
                          .WithAttr("_outside", "O2"));
-    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O2", {h}, shape2.opts());
+    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O2", {g, h},
+                 shape2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
     TF_EXPECT_OK(
         AddGraphDefToFunctionLibrary(shape2, "F1_O2", &library_expected));
   }
 
+  NameAttrList shape_inference_graph1, shape_inference_graph2;
+  shape_inference_graph1.set_name("_outside_compilation_shape_inference_F1_O1");
+  shape_inference_graph2.set_name("_outside_compilation_shape_inference_F1_O2");
   *library_expected.add_function() = FunctionDefHelper::Create(
-      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"i_0_retval_retval:float"}, {},
+      "F1", {"a_0_arg:float", "b_0_arg:float"},
+      {"g_0_retval_retval:float", "i_0_retval_retval:float"}, {},
       {
           {{"C"}, "UnaryTest", {"a_0_arg"}},
           {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}, {}},
           {{"I"},
            "UnaryTest",
-           {"outside_compilation_O2_host_compute:outputs:0"}},
+           {"outside_compilation_O2_host_compute:outputs:1"}},
           {{"F"},
            "BinaryTest",
            {"C:o:0", "outside_compilation_O1_host_compute:outputs:0"},
@@ -1073,11 +1096,10 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) {
            "XlaHostCompute",
            {"F:o:0", "D:o:0"},
            {{"Tinputs", absl::Span<const DataType>({DT_FLOAT, DT_FLOAT})},
-            {"Toutputs", absl::Span<const DataType>({DT_FLOAT})},
+            {"Toutputs", absl::Span<const DataType>({DT_FLOAT, DT_FLOAT})},
             {"ancestors", absl::Span<const string>({})},
             {"key", "host_compute_channel_F1_O2"},
-            {"shape_inference_graph",
-             "_outside_compilation_shape_inference_F1_O2"},
+            {"shape_inference_graph", shape_inference_graph2},
             {"shapes", absl::Span<const DataType>({})},
             {"_outside_compilation_subgraph", "O2"}},
            {"F"}},
@@ -1088,13 +1110,13 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) {
             {"Toutputs", absl::Span<const DataType>({DT_FLOAT})},
             {"ancestors", absl::Span<const string>({})},
             {"key", "host_compute_channel_F1_O1"},
-            {"shape_inference_graph",
-             "_outside_compilation_shape_inference_F1_O1"},
+            {"shape_inference_graph", shape_inference_graph1},
             {"shapes", absl::Span<const DataType>({})},
             {"_outside_compilation_subgraph", "O1"}},
            {"D"}},
       },
-      {{"i_0_retval_retval", "I:o:0"}});
+      {{"g_0_retval_retval", "outside_compilation_O2_host_compute:outputs:0"},
+       {"i_0_retval_retval", "I:o:0"}});
 
   {
     std::unique_ptr<FunctionLibraryDefinition> lib_def(
@@ -1105,19 +1127,22 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) {
 
     Node* key_constant =
         KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
-    Node* recv1 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
-                             {DT_FLOAT, DT_FLOAT}, b2.opts());
+    Node* recv1 = RecvAtHost(
+        ops::NodeOut(key_constant, 0), "F1", "O1", {DT_FLOAT, DT_FLOAT},
+        b2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
     Node* e = Binary(ops::NodeOut(recv1, 0), ops::NodeOut(recv1, 1),
                      b2.opts()
                          .WithName("E")
-                         .WithControlInputs({recv1, b})
+                         .WithControlInputs({recv1})
                          .WithAttr("_encapsulate", "F1")
                          .WithAttr("_outside", "O1"));
     Node* send1 = SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e},
-                               b2.opts().WithControlInput(e));
+                               b2.opts().WithControlInput(e).WithAttr(
+                                   kXlaHasHostTransferAttrName, true));
 
-    Node* recv2 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O2",
-                             {DT_FLOAT, DT_FLOAT}, b2.opts());
+    Node* recv2 = RecvAtHost(
+        ops::NodeOut(key_constant, 0), "F1", "O2", {DT_FLOAT, DT_FLOAT},
+        b2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
     Node* g = Binary(e, ops::NodeOut(recv2, 0),
                      b2.opts()
                          .WithName("G")
@@ -1130,7 +1155,8 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) {
                          .WithAttr("_encapsulate", "F1")
                          .WithAttr("_outside", "O2"));
     Node* send2 =
-        SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O2", {h}, b2.opts());
+        SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O2", {g, h},
+                     b2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
 
     Node* s = Sequencer(b2.opts()
                             .WithName("F1_sequencer")
@@ -1139,12 +1165,13 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) {
 
     NodeBuilder node_builder("F1", "F1", lib_def.get());
     node_builder.Input(a).Input(b);
-    Node* call = b2.opts().WithControlInput(s).FinalizeBuilder(&node_builder);
+    Node* call =
+        b2.opts().WithControlInputs({s, b}).FinalizeBuilder(&node_builder);
 
-    Binary(g, call, b2.opts().WithName("J"));
+    Binary(ops::NodeOut(call, 0), ops::NodeOut(call, 1),
+           b2.opts().WithName("J"));
     TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
   }
-
   TF_EXPECT_GRAPH_EQ(graphdef_expected, graphdef);
   TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library);
 }
@@ -1196,7 +1223,9 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
 
   *library_expected.add_function() = FunctionDefHelper::Create(
       "F1", {"a_0_arg:float", "b_0_arg:float"},
-      {"f_0_retval_retval:float", "d_0_retval_retval:float"}, {},
+      {"e_0_retval_retval:float", "f_0_retval_retval:float",
+       "d_0_retval_retval:float"},
+      {},
       {
           {{"C"}, "UnaryTest", {"a_0_arg"}},
           {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
@@ -1212,35 +1241,37 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
             {"Toutputs", absl::Span<const DataType>({DT_FLOAT})},
             {"ancestors", absl::Span<const string>({})},
             {"key", "host_compute_channel_F1_O1"},
-            {"shape_inference_graph", ""},
+            {"shape_inference_graph", NameAttrList()},
             {"shapes",
              absl::Span<const TensorShapeProto>({shape_proto_expected})},
             {"_outside_compilation_subgraph", "O1"}},
            {"D"}},
       },
-      {{"d_0_retval_retval", "D:o:0"}, {"f_0_retval_retval", "F:o:0"}});
+      {{"e_0_retval_retval", "outside_compilation_O1_host_compute:outputs:0"},
+       {"d_0_retval_retval", "D:o:0"},
+       {"f_0_retval_retval", "F:o:0"}});
 
   *library_expected.add_function() = FunctionDefHelper::Create(
-      "F2", {"f_0_arg:float", "bridge_e_g_0_arg:float"},
-      {"i_0_retval_retval:float", "g_0_retval_retval:float"}, {},
+      "F2", {"e_0_arg:float", "f_0_arg:float", "d_0_arg:float"},
+      {"g_0_retval_retval:float", "i_0_retval_retval:float"}, {},
       {
-          {{"G"}, "BinaryTest", {"bridge_e_g_0_arg", "f_0_arg"}},
+          {{"G"}, "BinaryTest", {"e_0_arg", "f_0_arg"}},
           {{"I"},
            "BinaryTest",
            {"f_0_arg", "outside_compilation_O1_host_compute:outputs:0"}},
           {{"outside_compilation_O1_host_compute"},
            "XlaHostCompute",
-           {"G:o:0"},
-           {{"Tinputs", absl::Span<const DataType>({DT_FLOAT})},
+           {"d_0_arg", "G:o:0"},
+           {{"Tinputs", absl::Span<const DataType>({DT_FLOAT, DT_FLOAT})},
             {"Toutputs", absl::Span<const DataType>({DT_FLOAT})},
             {"ancestors", absl::Span<const string>({})},
             {"key", "host_compute_channel_F2_O1"},
-            {"shape_inference_graph", ""},
+            {"shape_inference_graph", NameAttrList()},
             {"shapes",
              absl::Span<const TensorShapeProto>({shape_proto_expected})},
             {"_outside_compilation_subgraph", "O1"}}},
       },
-      {{"i_0_retval_retval", "I:o:0"}, {"g_0_retval_retval", "G:o:0"}});
+      {{"g_0_retval_retval", "G:o:0"}, {"i_0_retval_retval", "I:o:0"}});
 
   {
     std::unique_ptr<FunctionLibraryDefinition> lib_def(
@@ -1251,16 +1282,18 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
 
     Node* key_constant1 =
         KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
-    Node* recv1 = RecvAtHost(ops::NodeOut(key_constant1, 0), "F1", "O1",
-                             {DT_FLOAT, DT_FLOAT}, b2.opts());
+    Node* recv1 = RecvAtHost(
+        ops::NodeOut(key_constant1, 0), "F1", "O1", {DT_FLOAT, DT_FLOAT},
+        b2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
     Node* e = Binary(ops::NodeOut(recv1, 0), ops::NodeOut(recv1, 1),
                      b2.opts()
                          .WithName("E")
-                         .WithControlInputs({recv1, b})
+                         .WithControlInputs({recv1})
                          .WithAttr("_encapsulate", "F1")
                          .WithAttr("_outside", "O1"));
     Node* send1 = SendFromHost(ops::NodeOut(key_constant1, 0), "F1", "O1", {e},
-                               b2.opts().WithControlInput(e));
+                               b2.opts().WithControlInput(e).WithAttr(
+                                   kXlaHasHostTransferAttrName, true));
     Node* s1 = Sequencer(
         b2.opts().WithName("F1_sequencer").WithControlInputs({recv1, send1}),
         "F1");
@@ -1268,29 +1301,33 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
     NodeBuilder node_builder1("F1", "F1", lib_def.get());
     node_builder1.Input(a).Input(b);
     Node* call1 =
-        b2.opts().WithControlInput(s1).FinalizeBuilder(&node_builder1);
+        b2.opts().WithControlInputs({s1, b}).FinalizeBuilder(&node_builder1);
 
     Node* key_constant2 =
         KeyPlaceholder("F2", b2.opts().WithName("F2_key_placeholder"));
-    Node* recv2 = RecvAtHost(ops::NodeOut(key_constant2, 0), "F2", "O1",
-                             {DT_FLOAT}, b2.opts());
-    Node* h = Binary(ops::NodeOut(call1, 1), recv2,
+    Node* recv2 = RecvAtHost(
+        ops::NodeOut(key_constant2, 0), "F2", "O1", {DT_FLOAT, DT_FLOAT},
+        b2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
+    Node* h = Binary(recv2, ops::NodeOut(recv2, 1),
                      b2.opts()
                          .WithName("H")
                          .WithAttr("_encapsulate", "F2")
                          .WithAttr("_outside", "O1"));
-    Node* send2 = SendFromHost(ops::NodeOut(key_constant2, 0), "F2", "O1", {h},
-                               b2.opts());
+    Node* send2 =
+        SendFromHost(ops::NodeOut(key_constant2, 0), "F2", "O1", {h},
+                     b2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
 
     Node* s2 = Sequencer(
         b2.opts().WithName("F2_sequencer").WithControlInputs({recv2, send2}),
         "F2");
     NodeBuilder node_builder2("F2", "F2", lib_def.get());
-    node_builder2.Input(call1).Input(e);
+    node_builder2.Input(call1)
+        .Input(ops::NodeOut(call1, 1))
+        .Input(ops::NodeOut(call1, 2));
     Node* call2 = b2.opts()
-                      .WithControlInputs({s2, e, call1})
+                      .WithControlInputs({s2, call1})
                       .FinalizeBuilder(&node_builder2);
-    Binary(ops::NodeOut(call2, 1), call2, b2.opts().WithName("J"));
+    Binary(call2, ops::NodeOut(call2, 1), b2.opts().WithName("J"));
     TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
   }
 
@@ -1326,8 +1363,7 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutsideDependencyFromOutside) {
     Node* h = Unary(g, b1.opts()
                            .WithName("H")
                            .WithAttr("_encapsulate", "F2")
-                           .WithAttr("_outside", "O1")
-                           .WithControlInput(e));
+                           .WithAttr("_outside", "O1"));
     Node* i = Unary(h, b1.opts().WithName("I").WithAttr("_encapsulate", "F2"));
     Binary(f, i, b1.opts().WithName("J"));
     TF_EXPECT_OK(b1.ToGraphDef(&graphdef));
@@ -1358,7 +1394,7 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutsideDependencyFromOutside) {
             {"Toutputs", absl::Span<const DataType>({DT_FLOAT})},
             {"ancestors", absl::Span<const string>({})},
             {"key", "host_compute_channel_F1_O1"},
-            {"shape_inference_graph", ""},
+            {"shape_inference_graph", NameAttrList()},
             {"shapes",
              absl::Span<const TensorShapeProto>({shape_proto_expected})},
             {"_outside_compilation_subgraph", "O1"}},
@@ -1380,7 +1416,7 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutsideDependencyFromOutside) {
             {"Toutputs", absl::Span<const DataType>({DT_FLOAT})},
             {"ancestors", absl::Span<const string>({})},
             {"key", "host_compute_channel_F2_O1"},
-            {"shape_inference_graph", ""},
+            {"shape_inference_graph", NameAttrList()},
             {"shapes",
              absl::Span<const TensorShapeProto>({shape_proto_expected})},
             {"_outside_compilation_subgraph", "O1"}}},
@@ -1401,7 +1437,7 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutsideDependencyFromOutside) {
     Node* e = Binary(ops::NodeOut(recv1, 0), ops::NodeOut(recv1, 1),
                      b2.opts()
                          .WithName("E")
-                         .WithControlInputs({recv1, b})
+                         .WithControlInputs({recv1})
                          .WithAttr("_encapsulate", "F1")
                          .WithAttr("_outside", "O1"));
     Node* send1 = SendFromHost(ops::NodeOut(key_constant1, 0), "F1", "O1", {e},
@@ -1413,7 +1449,7 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutsideDependencyFromOutside) {
     NodeBuilder node_builder1("F1", "F1", lib_def.get());
     node_builder1.Input(a).Input(b);
     Node* call1 =
-        b2.opts().WithControlInput(s1).FinalizeBuilder(&node_builder1);
+        b2.opts().WithControlInputs({s1, b}).FinalizeBuilder(&node_builder1);
 
     Node* key_constant2 =
         KeyPlaceholder("F2", b2.opts().WithName("F2_key_placeholder"));
@@ -1422,8 +1458,7 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutsideDependencyFromOutside) {
     Node* h = Unary(recv2, b2.opts()
                                .WithName("H")
                                .WithAttr("_encapsulate", "F2")
-                               .WithAttr("_outside", "O1")
-                               .WithControlInput(e));
+                               .WithAttr("_outside", "O1"));
     Node* send2 = SendFromHost(ops::NodeOut(key_constant2, 0), "F2", "O1", {h},
                                b2.opts());
 
@@ -1484,12 +1519,12 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) {
            {"D:o:0", "outside_compilation_O1_host_compute:outputs:0"}},
           {{"outside_compilation_O1_host_compute"},
            "XlaHostCompute",
-           {},
-           {{"Tinputs", absl::Span<const DataType>({})},
+           {"a_0_arg"},
+           {{"Tinputs", absl::Span<const DataType>({DT_FLOAT})},
             {"Toutputs", absl::Span<const DataType>({DT_FLOAT})},
             {"ancestors", absl::Span<const string>({})},
             {"key", "host_compute_channel_F1_O1"},
-            {"shape_inference_graph", ""},
+            {"shape_inference_graph", NameAttrList()},
             {"shapes",
              absl::Span<const TensorShapeProto>({shape_proto_expected})},
             {"_outside_compilation_subgraph", "O1"}}},
@@ -1503,16 +1538,19 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) {
     Node* a = InputShaped(b2.opts().WithName("A"));
     Node* b = Input(b2.opts().WithName("B"));
 
-    Node* e = Unary(a, b2.opts()
-                           .WithName("E")
-                           .WithAttr("_encapsulate", "F1")
-                           .WithAttr("_outside", "O1"));
     Node* key_constant =
         KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
+    Node* recv1 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
+                             {DT_FLOAT}, b2.opts());
+    Node* e = Unary(recv1, b2.opts()
+                               .WithName("E")
+                               .WithAttr("_encapsulate", "F1")
+                               .WithAttr("_outside", "O1"));
     Node* send1 =
         SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, b2.opts());
     Node* s1 = Sequencer(
-        b2.opts().WithName("F1_sequencer").WithControlInput(send1), "F1");
+        b2.opts().WithName("F1_sequencer").WithControlInputs({send1, recv1}),
+        "F1");
     NodeBuilder node_builder1("F1", "F1", lib_def.get());
     node_builder1.Input(a).Input(b);
     Node* call1 =
@@ -1569,12 +1607,12 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlInput) {
            {"D:o:0", "outside_compilation_O1_host_compute:outputs:0"}},
           {{"outside_compilation_O1_host_compute"},
            "XlaHostCompute",
-           {},
-           {{"Tinputs", absl::Span<const DataType>({})},
+           {"a_0_arg"},
+           {{"Tinputs", absl::Span<const DataType>({DT_FLOAT})},
             {"Toutputs", absl::Span<const DataType>({DT_FLOAT})},
             {"ancestors", absl::Span<const string>({})},
             {"key", "host_compute_channel_F1_O1"},
-            {"shape_inference_graph", ""},
+            {"shape_inference_graph", NameAttrList()},
             {"shapes",
              absl::Span<const TensorShapeProto>({shape_proto_expected})},
             {"_outside_compilation_subgraph", "O1"}},
@@ -1591,13 +1629,13 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlInput) {
 
     Node* key_constant =
         KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
-    Node* recv1 =
-        RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1", {}, b2.opts());
-    Node* e = Unary(a, b2.opts()
-                           .WithName("E")
-                           .WithControlInput(recv1)
-                           .WithAttr("_encapsulate", "F1")
-                           .WithAttr("_outside", "O1"));
+    Node* recv1 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
+                             {DT_FLOAT}, b2.opts());
+    Node* e = Unary(recv1, b2.opts()
+                               .WithName("E")
+                               .WithControlInput(recv1)
+                               .WithAttr("_encapsulate", "F1")
+                               .WithAttr("_outside", "O1"));
     Node* send1 =
         SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, b2.opts());
     Node* s1 = Sequencer(
@@ -1644,8 +1682,27 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoOutputs) {
   FunctionDefLibrary library_expected;
   GraphDef graphdef_expected;
 
+  {
+    GraphDefBuilder shape1(GraphDefBuilder::kFailImmediately);
+    Node* key_constant = KeyPlaceholder("F1", shape1.opts());
+    Node* recv1 =
+        RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1", {DT_FLOAT},
+                   shape1.opts().WithAttr(kXlaHasHostTransferAttrName, true));
+    Node* e = Unary(ops::NodeOut(recv1, 0), shape1.opts()
+                                                .WithName("E")
+                                                .WithAttr("_encapsulate", "F1")
+                                                .WithAttr("_outside", "O1"));
+    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e},
+                 shape1.opts().WithAttr(kXlaHasHostTransferAttrName, true));
+    TF_EXPECT_OK(
+        AddGraphDefToFunctionLibrary(shape1, "F1_O1", &library_expected));
+  }
+
+  NameAttrList shape_inference_graph;
+  shape_inference_graph.set_name("_outside_compilation_shape_inference_F1_O1");
   *library_expected.add_function() = FunctionDefHelper::Create(
-      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"f_0_retval_retval:float"}, {},
+      "F1", {"a_0_arg:float", "b_0_arg:float"},
+      {"e_0_retval_retval:float", "f_0_retval_retval:float"}, {},
       {
           {{"C"}, "UnaryTest", {"a_0_arg"}},
           {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
@@ -1654,14 +1711,15 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoOutputs) {
            "XlaHostCompute",
            {"D:o:0"},
            {{"Tinputs", absl::Span<const DataType>({DT_FLOAT})},
-            {"Toutputs", absl::Span<const DataType>({})},
+            {"Toutputs", absl::Span<const DataType>({DT_FLOAT})},
             {"ancestors", absl::Span<const string>({})},
             {"key", "host_compute_channel_F1_O1"},
-            {"shape_inference_graph", ""},
+            {"shape_inference_graph", shape_inference_graph},
             {"shapes", absl::Span<const TensorShapeProto>({})},
             {"_outside_compilation_subgraph", "O1"}}},
       },
-      {{"f_0_retval_retval", "F:o:0"}});
+      {{"e_0_retval_retval", "outside_compilation_O1_host_compute:outputs:0"},
+       {"f_0_retval_retval", "F:o:0"}});
 
   {
     std::unique_ptr<FunctionLibraryDefinition> lib_def(
@@ -1678,14 +1736,17 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoOutputs) {
                                .WithName("E")
                                .WithAttr("_encapsulate", "F1")
                                .WithAttr("_outside", "O1"));
+    Node* send1 =
+        SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, b2.opts());
     Node* s1 = Sequencer(
-        b2.opts().WithName("F1_sequencer").WithControlInput(recv1), "F1");
+        b2.opts().WithName("F1_sequencer").WithControlInputs({recv1, send1}),
+        "F1");
     NodeBuilder node_builder1("F1", "F1", lib_def.get());
     node_builder1.Input(a).Input(b);
     Node* call1 =
         b2.opts().WithControlInput(s1).FinalizeBuilder(&node_builder1);
 
-    Binary(e, call1, b2.opts().WithName("G"));
+    Binary(call1, ops::NodeOut(call1, 1), b2.opts().WithName("G"));
     TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
   }
 
@@ -1722,8 +1783,27 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlOutput) {
   FunctionDefLibrary library_expected;
   GraphDef graphdef_expected;
 
+  {
+    GraphDefBuilder shape1(GraphDefBuilder::kFailImmediately);
+    Node* key_constant = KeyPlaceholder("F1", shape1.opts());
+    Node* recv1 =
+        RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1", {DT_FLOAT},
+                   shape1.opts().WithAttr(kXlaHasHostTransferAttrName, true));
+    Node* e = Unary(ops::NodeOut(recv1, 0), shape1.opts()
+                                                .WithName("E")
+                                                .WithAttr("_encapsulate", "F1")
+                                                .WithAttr("_outside", "O1"));
+    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e},
+                 shape1.opts().WithAttr(kXlaHasHostTransferAttrName, true));
+    TF_EXPECT_OK(
+        AddGraphDefToFunctionLibrary(shape1, "F1_O1", &library_expected));
+  }
+
+  NameAttrList shape_inference_graph;
+  shape_inference_graph.set_name("_outside_compilation_shape_inference_F1_O1");
   *library_expected.add_function() = FunctionDefHelper::Create(
-      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"f_0_retval_retval:float"}, {},
+      "F1", {"a_0_arg:float", "b_0_arg:float"},
+      {"e_0_retval_retval:float", "f_0_retval_retval:float"}, {},
       {
           {{"C"}, "UnaryTest", {"a_0_arg"}},
           {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
@@ -1736,14 +1816,15 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlOutput) {
            "XlaHostCompute",
            {"D:o:0"},
            {{"Tinputs", absl::Span<const DataType>({DT_FLOAT})},
-            {"Toutputs", absl::Span<const DataType>({})},
+            {"Toutputs", absl::Span<const DataType>({DT_FLOAT})},
             {"ancestors", absl::Span<const string>({})},
             {"key", "host_compute_channel_F1_O1"},
-            {"shape_inference_graph", ""},
+            {"shape_inference_graph", shape_inference_graph},
             {"shapes", absl::Span<const TensorShapeProto>({})},
             {"_outside_compilation_subgraph", "O1"}}},
       },
-      {{"f_0_retval_retval", "F:o:0"}});
+      {{"e_0_retval_retval", "outside_compilation_O1_host_compute:outputs:0"},
+       {"f_0_retval_retval", "F:o:0"}});
 
   {
     std::unique_ptr<FunctionLibraryDefinition> lib_def(
@@ -1760,7 +1841,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlOutput) {
                                .WithName("E")
                                .WithAttr("_encapsulate", "F1")
                                .WithAttr("_outside", "O1"));
-    Node* send1 = SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {},
+    Node* send1 = SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e},
                                b2.opts().WithControlInput(e));
     Node* s1 = Sequencer(
         b2.opts().WithName("F1_sequencer").WithControlInputs({recv1, send1}),
@@ -1770,7 +1851,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlOutput) {
     Node* call1 =
         b2.opts().WithControlInput(s1).FinalizeBuilder(&node_builder1);
 
-    Binary(e, call1, b2.opts().WithName("G"));
+    Binary(call1, ops::NodeOut(call1, 1), b2.opts().WithName("G"));
     TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
   }
 
@@ -1813,22 +1894,45 @@ TEST(EncapsulateSubgraphsTest,
   FunctionDefLibrary library_expected;
   GraphDef graphdef_expected;
 
+  {
+    GraphDefBuilder shape1(GraphDefBuilder::kFailImmediately);
+    Node* key_constant = KeyPlaceholder("F1", shape1.opts());
+    Node* recv1 =
+        RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1", {DT_FLOAT},
+                   shape1.opts().WithAttr(kXlaHasHostTransferAttrName, true));
+    Node* e = Unary(ops::NodeOut(recv1, 0), shape1.opts()
+                                                .WithName("E")
+                                                .WithAttr("_encapsulate", "F1")
+                                                .WithAttr("_outside", "O1"));
+    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e},
+                 shape1.opts().WithAttr(kXlaHasHostTransferAttrName, true));
+    TF_EXPECT_OK(
+        AddGraphDefToFunctionLibrary(shape1, "F1_O1", &library_expected));
+  }
+
   {
     GraphDefBuilder shape2(GraphDefBuilder::kFailImmediately);
     Node* key_constant = KeyPlaceholder("F1", shape2.opts());
-    Node* recv2 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O2",
-                             {DT_FLOAT}, shape2.opts());
+    Node* recv2 =
+        RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O2", {DT_FLOAT},
+                   shape2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
     Node* g = Unary(ops::NodeOut(recv2, 0), shape2.opts()
                                                 .WithName("G")
                                                 .WithAttr("_encapsulate", "F1")
                                                 .WithAttr("_outside", "O2"));
-    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O2", {g}, shape2.opts());
+    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O2", {g},
+                 shape2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
     TF_EXPECT_OK(
         AddGraphDefToFunctionLibrary(shape2, "F1_O2", &library_expected));
   }
 
+  NameAttrList shape_inference_graph1;
+  shape_inference_graph1.set_name("_outside_compilation_shape_inference_F1_O1");
+  NameAttrList shape_inference_graph2;
+  shape_inference_graph2.set_name("_outside_compilation_shape_inference_F1_O2");
   *library_expected.add_function() = FunctionDefHelper::Create(
-      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"h_0_retval_retval:float"}, {},
+      "F1", {"a_0_arg:float", "b_0_arg:float"},
+      {"e_0_retval_retval:float", "h_0_retval_retval:float"}, {},
       {
           {{"C"}, "UnaryTest", {"a_0_arg"}},
           {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
@@ -1836,6 +1940,16 @@ TEST(EncapsulateSubgraphsTest,
           {{"H"},
            "UnaryTest",
            {"outside_compilation_O2_host_compute:outputs:0"}},
+          {{"outside_compilation_O1_host_compute"},
+           "XlaHostCompute",
+           {"a_0_arg"},
+           {{"Tinputs", absl::Span<const DataType>({DT_FLOAT})},
+            {"Toutputs", absl::Span<const DataType>({DT_FLOAT})},
+            {"ancestors", absl::Span<const string>({})},
+            {"key", "host_compute_channel_F1_O1"},
+            {"shape_inference_graph", shape_inference_graph1},
+            {"shapes", absl::Span<const TensorShapeProto>({})},
+            {"_outside_compilation_subgraph", "O1"}}},
           {{"outside_compilation_O2_host_compute"},
            "XlaHostCompute",
            {"F:o:0"},
@@ -1843,12 +1957,12 @@ TEST(EncapsulateSubgraphsTest,
             {"Toutputs", absl::Span<const DataType>({DT_FLOAT})},
             {"ancestors", absl::Span<const string>({})},
             {"key", "host_compute_channel_F1_O2"},
-            {"shape_inference_graph",
-             "_outside_compilation_shape_inference_F1_O2"},
+            {"shape_inference_graph", shape_inference_graph2},
             {"shapes", absl::Span<const TensorShapeProto>({})},
             {"_outside_compilation_subgraph", "O2"}}},
       },
-      {{"h_0_retval_retval", "H:o:0"}});
+      {{"e_0_retval_retval", "outside_compilation_O1_host_compute:outputs:0"},
+       {"h_0_retval_retval", "H:o:0"}});
 
   {
     std::unique_ptr<FunctionLibraryDefinition> lib_def(
@@ -1856,30 +1970,39 @@ TEST(EncapsulateSubgraphsTest,
     GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get());
     Node* a = Input(b2.opts().WithName("A"));
     Node* b = Input(b2.opts().WithName("B"));
-
-    Node* e = Unary(a, b2.opts()
-                           .WithName("E")
-                           .WithAttr("_encapsulate", "F1")
-                           .WithAttr("_outside", "O1"));
     Node* key_constant =
         KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
-    Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O2",
-                            {DT_FLOAT}, b2.opts());
-    Node* g = Unary(recv, b2.opts()
-                              .WithName("G")
-                              .WithAttr("_encapsulate", "F1")
-                              .WithAttr("_outside", "O2")
-                              .WithControlInput(e));
-    Node* send =
-        SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O2", {g}, b2.opts());
-    Node* s1 = Sequencer(
-        b2.opts().WithName("F1_sequencer").WithControlInputs({recv, send}),
-        "F1");
+    Node* recv1 =
+        RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1", {DT_FLOAT},
+                   b2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
+
+    Node* e = Unary(recv1, b2.opts()
+                               .WithName("E")
+                               .WithAttr("_encapsulate", "F1")
+                               .WithAttr("_outside", "O1"));
+    Node* send1 =
+        SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e},
+                     b2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
+    Node* recv2 =
+        RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O2", {DT_FLOAT},
+                   b2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
+    Node* g = Unary(recv2, b2.opts()
+                               .WithName("G")
+                               .WithAttr("_encapsulate", "F1")
+                               .WithAttr("_outside", "O2")
+                               .WithControlInput(e));
+    Node* send2 =
+        SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O2", {g},
+                     b2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
+    Node* s1 = Sequencer(b2.opts()
+                             .WithName("F1_sequencer")
+                             .WithControlInputs({recv1, send1, recv2, send2}),
+                         "F1");
     NodeBuilder node_builder1("F1", "F1", lib_def.get());
     node_builder1.Input(a).Input(b).ControlInput(s1);
     Node* call1 = b2.opts().FinalizeBuilder(&node_builder1);
 
-    Binary(e, call1, b2.opts().WithName("I"));
+    Binary(call1, ops::NodeOut(call1, 1), b2.opts().WithName("I"));
     TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
   }
 
@@ -1925,19 +2048,24 @@ TEST(EncapsulateSubgraphsTest,
   {
     GraphDefBuilder shape1(GraphDefBuilder::kFailImmediately);
     Node* key_constant = KeyPlaceholder("F1", shape1.opts());
-    Node* recv2 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
-                             {DT_FLOAT}, shape1.opts());
+    Node* recv2 =
+        RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1", {DT_FLOAT},
+                   shape1.opts().WithAttr(kXlaHasHostTransferAttrName, true));
     Node* e = Unary(ops::NodeOut(recv2, 0), shape1.opts()
                                                 .WithName("E")
                                                 .WithAttr("_encapsulate", "F1")
                                                 .WithAttr("_outside", "O1"));
-    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, shape1.opts());
+    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e},
+                 shape1.opts().WithAttr(kXlaHasHostTransferAttrName, true));
     TF_EXPECT_OK(
         AddGraphDefToFunctionLibrary(shape1, "F1_O1", &library_expected));
   }
 
+  NameAttrList shape_inference_graph;
+  shape_inference_graph.set_name("_outside_compilation_shape_inference_F1_O1");
   *library_expected.add_function() = FunctionDefHelper::Create(
-      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"h_0_retval_retval:float"}, {},
+      "F1", {"a_0_arg:float", "b_0_arg:float"},
+      {"e_0_retval_retval:float", "h_0_retval_retval:float"}, {},
       {
           {{"C"}, "UnaryTest", {"a_0_arg"}},
           {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
@@ -1945,6 +2073,16 @@ TEST(EncapsulateSubgraphsTest,
            "UnaryTest",
            {"outside_compilation_O1_host_compute:outputs:0"}},
           {{"H"}, "UnaryTest", {"F:o:0"}},
+          {{"outside_compilation_O2_host_compute"},
+           "XlaHostCompute",
+           {"a_0_arg"},
+           {{"Tinputs", absl::Span<const DataType>({DT_FLOAT})},
+            {"Toutputs", absl::Span<const DataType>({})},
+            {"ancestors", absl::Span<const string>({})},
+            {"key", "host_compute_channel_F1_O2"},
+            {"shape_inference_graph", NameAttrList()},
+            {"shapes", absl::Span<const TensorShapeProto>({})},
+            {"_outside_compilation_subgraph", "O2"}}},
           {{"outside_compilation_O1_host_compute"},
            "XlaHostCompute",
            {"D:o:0"},
@@ -1952,12 +2090,12 @@ TEST(EncapsulateSubgraphsTest,
             {"Toutputs", absl::Span<const DataType>({DT_FLOAT})},
             {"ancestors", absl::Span<const string>({})},
             {"key", "host_compute_channel_F1_O1"},
-            {"shape_inference_graph",
-             "_outside_compilation_shape_inference_F1_O1"},
+            {"shape_inference_graph", shape_inference_graph},
             {"shapes", absl::Span<const TensorShapeProto>({})},
             {"_outside_compilation_subgraph", "O1"}}},
       },
-      {{"h_0_retval_retval", "H:o:0"}});
+      {{"e_0_retval_retval", "outside_compilation_O1_host_compute:outputs:0"},
+       {"h_0_retval_retval", "H:o:0"}});
 
   {
     std::unique_ptr<FunctionLibraryDefinition> lib_def(
@@ -1968,27 +2106,33 @@ TEST(EncapsulateSubgraphsTest,
 
     Node* key_constant =
         KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
-    Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
-                            {DT_FLOAT}, b2.opts());
-    Node* e = Unary(recv, b2.opts()
-                              .WithName("E")
-                              .WithAttr("_encapsulate", "F1")
-                              .WithAttr("_outside", "O1"));
+    Node* recv1 =
+        RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1", {DT_FLOAT},
+                   b2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
+    Node* e = Unary(recv1, b2.opts()
+                               .WithName("E")
+                               .WithAttr("_encapsulate", "F1")
+                               .WithAttr("_outside", "O1"));
     Node* send =
-        SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, b2.opts());
-    /*Node* g =*/Unary(a, b2.opts()
-                              .WithName("G")
-                              .WithAttr("_encapsulate", "F1")
-                              .WithAttr("_outside", "O2")
-                              .WithControlInput(e));
-    Node* s1 = Sequencer(
-        b2.opts().WithName("F1_sequencer").WithControlInputs({recv, send}),
-        "F1");
+        SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e},
+                     b2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
+    Node* recv2 =
+        RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O2", {DT_FLOAT},
+                   b2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
+    /*Node* g =*/Unary(recv2, b2.opts()
+                                  .WithName("G")
+                                  .WithAttr("_encapsulate", "F1")
+                                  .WithAttr("_outside", "O2")
+                                  .WithControlInput(e));
+    Node* s1 = Sequencer(b2.opts()
+                             .WithName("F1_sequencer")
+                             .WithControlInputs({recv1, recv2, send}),
+                         "F1");
     NodeBuilder node_builder1("F1", "F1", lib_def.get());
     node_builder1.Input(a).Input(b).ControlInput(s1);
     Node* call1 = b2.opts().FinalizeBuilder(&node_builder1);
 
-    Binary(e, call1, b2.opts().WithName("I"));
+    Binary(call1, ops::NodeOut(call1, 1), b2.opts().WithName("I"));
     TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
   }
 
@@ -2039,19 +2183,24 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) {
   {
     GraphDefBuilder shape1(GraphDefBuilder::kFailImmediately);
     Node* key_constant = KeyPlaceholder("F1", shape1.opts());
-    Node* recv2 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
-                             {DT_FLOAT}, shape1.opts());
+    Node* recv2 =
+        RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1", {DT_FLOAT},
+                   shape1.opts().WithAttr(kXlaHasHostTransferAttrName, true));
     Node* e = Unary(ops::NodeOut(recv2, 0), shape1.opts()
                                                 .WithName("E")
                                                 .WithAttr("_encapsulate", "F1")
                                                 .WithAttr("_outside", "O1"));
-    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, shape1.opts());
+    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e},
+                 shape1.opts().WithAttr(kXlaHasHostTransferAttrName, true));
     TF_EXPECT_OK(
         AddGraphDefToFunctionLibrary(shape1, "F1_O1", &library_expected));
   }
 
+  NameAttrList shape_inference_graph;
+  shape_inference_graph.set_name("_outside_compilation_shape_inference_F1_O1");
   *library_expected.add_function() = FunctionDefHelper::Create(
-      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"h_0_retval_retval:float"}, {},
+      "F1", {"a_0_arg:float", "b_0_arg:float"},
+      {"e_0_retval_retval:float", "h_0_retval_retval:float"}, {},
       {{{"C"}, "UnaryTest", {"a_0_arg"}},
        {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
        {{"F"}, "UnaryTest", {"outside_compilation_O1_host_compute:outputs:0"}},
@@ -2063,8 +2212,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) {
          {"Toutputs", absl::Span<const DataType>({DT_FLOAT})},
          {"ancestors", absl::Span<const string>({})},
          {"key", "host_compute_channel_F1_O1"},
-         {"shape_inference_graph",
-          "_outside_compilation_shape_inference_F1_O1"},
+         {"shape_inference_graph", shape_inference_graph},
          {"shapes", absl::Span<const TensorShapeProto>({})},
          {"_outside_compilation_subgraph", "O1"}}},
        {{"outside_compilation_O2_host_compute"},
@@ -2074,7 +2222,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) {
          {"Toutputs", absl::Span<const DataType>({})},
          {"ancestors", absl::Span<const string>({})},
          {"key", "host_compute_channel_F1_O2"},
-         {"shape_inference_graph", ""},
+         {"shape_inference_graph", NameAttrList()},
          {"shapes", absl::Span<const TensorShapeProto>({})},
          {"_outside_compilation_subgraph", "O2"}},
         {}},
@@ -2085,11 +2233,12 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) {
          {"Toutputs", absl::Span<const DataType>({})},
          {"ancestors", absl::Span<const string>({})},
          {"key", "host_compute_channel_F1_O3"},
-         {"shape_inference_graph", ""},
+         {"shape_inference_graph", NameAttrList()},
          {"shapes", absl::Span<const TensorShapeProto>({})},
          {"_outside_compilation_subgraph", "O3"}},
         {}}},
-      {{"h_0_retval_retval", "H:o:0"}});
+      {{"e_0_retval_retval", "outside_compilation_O1_host_compute:outputs:0"},
+       {"h_0_retval_retval", "H:o:0"}});
 
   {
     std::unique_ptr<FunctionLibraryDefinition> lib_def(
@@ -2100,23 +2249,27 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) {
 
     Node* key_constant =
         KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
-    Node* recv1 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
-                             {DT_FLOAT}, b2.opts());
+    Node* recv1 =
+        RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1", {DT_FLOAT},
+                   b2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
     Node* e = Unary(recv1, b2.opts()
                                .WithName("E")
                                .WithAttr("_encapsulate", "F1")
                                .WithAttr("_outside", "O1"));
     Node* send =
-        SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, b2.opts());
-    Node* recv2 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O2",
-                             {DT_FLOAT}, b2.opts());
+        SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e},
+                     b2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
+    Node* recv2 =
+        RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O2", {DT_FLOAT},
+                   b2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
     Node* g = Unary(recv2, b2.opts()
                                .WithName("G")
                                .WithAttr("_encapsulate", "F1")
                                .WithAttr("_outside", "O2")
                                .WithControlInput(e));
-    Node* recv3 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O3",
-                             {DT_FLOAT}, b2.opts());
+    Node* recv3 =
+        RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O3", {DT_FLOAT},
+                   b2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
     /*Node* i =*/Binary(recv3, e,
                         b2.opts()
                             .WithName("I")
@@ -2131,7 +2284,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) {
     node_builder1.Input(a).Input(b).ControlInput(s1);
     Node* call1 = b2.opts().FinalizeBuilder(&node_builder1);
 
-    Binary(e, call1, b2.opts().WithName("J"));
+    Binary(call1, ops::NodeOut(call1, 1), b2.opts().WithName("J"));
     TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
   }
 
@@ -2167,14 +2320,44 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputsOrOutputs) {
   FunctionDefLibrary library_expected;
   GraphDef graphdef_expected;
 
+  {
+    GraphDefBuilder shape1(GraphDefBuilder::kFailImmediately);
+    Node* key_constant = KeyPlaceholder("F1", shape1.opts());
+    Node* recv2 =
+        RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1", {DT_FLOAT},
+                   shape1.opts().WithAttr(kXlaHasHostTransferAttrName, true));
+    Node* e = Unary(ops::NodeOut(recv2, 0), shape1.opts()
+                                                .WithName("E")
+                                                .WithAttr("_encapsulate", "F1")
+                                                .WithAttr("_outside", "O1"));
+    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e},
+                 shape1.opts().WithAttr(kXlaHasHostTransferAttrName, true));
+    TF_EXPECT_OK(
+        AddGraphDefToFunctionLibrary(shape1, "F1_O1", &library_expected));
+  }
+
+  NameAttrList shape_inference_graph;
+  shape_inference_graph.set_name("_outside_compilation_shape_inference_F1_O1");
   *library_expected.add_function() = FunctionDefHelper::Create(
-      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"f_0_retval_retval:float"}, {},
+      "F1", {"a_0_arg:float", "b_0_arg:float"},
+      {"e_0_retval_retval:float", "f_0_retval_retval:float"}, {},
       {
           {{"C"}, "UnaryTest", {"a_0_arg"}},
           {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
           {{"F"}, "UnaryTest", {"D:o:0"}},
+          {{"outside_compilation_O1_host_compute"},
+           "XlaHostCompute",
+           {"a_0_arg"},
+           {{"Tinputs", absl::Span<const DataType>({DT_FLOAT})},
+            {"Toutputs", absl::Span<const DataType>({DT_FLOAT})},
+            {"ancestors", absl::Span<const string>({})},
+            {"key", "host_compute_channel_F1_O1"},
+            {"shape_inference_graph", shape_inference_graph},
+            {"shapes", absl::Span<const TensorShapeProto>({})},
+            {"_outside_compilation_subgraph", "O1"}}},
       },
-      {{"f_0_retval_retval", "F:o:0"}});
+      {{"e_0_retval_retval", "outside_compilation_O1_host_compute:outputs:0"},
+       {"f_0_retval_retval", "F:o:0"}});
 
   {
     std::unique_ptr<FunctionLibraryDefinition> lib_def(
@@ -2183,15 +2366,26 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputsOrOutputs) {
     Node* a = Input(b2.opts().WithName("A"));
     Node* b = Input(b2.opts().WithName("B"));
 
-    Node* e = Unary(a, b2.opts()
-                           .WithName("E")
-                           .WithAttr("_encapsulate", "F1")
-                           .WithAttr("_outside", "O1"));
+    Node* key_constant =
+        KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
+    Node* recv =
+        RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1", {DT_FLOAT},
+                   b2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
+    Node* e = Unary(recv, b2.opts()
+                              .WithName("E")
+                              .WithAttr("_encapsulate", "F1")
+                              .WithAttr("_outside", "O1"));
+    Node* send =
+        SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e},
+                     b2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
+    Node* s = Sequencer(
+        b2.opts().WithName("F1_sequencer").WithControlInputs({recv, send}),
+        "F1");
     NodeBuilder node_builder1("F1", "F1", lib_def.get());
-    node_builder1.Input(a).Input(b);
+    node_builder1.Input(a).Input(b).ControlInput(s);
     Node* call1 = b2.opts().FinalizeBuilder(&node_builder1);
 
-    Binary(e, call1, b2.opts().WithName("G"));
+    Binary(call1, ops::NodeOut(call1, 1), b2.opts().WithName("G"));
     TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
   }
 
@@ -2236,20 +2430,22 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationShapeInference) {
   {
     GraphDefBuilder shape(GraphDefBuilder::kFailImmediately);
     Node* key_constant = KeyPlaceholder("F1", shape.opts());
-    Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
-                            {DT_FLOAT}, shape.opts());
-    Node* a = InputShaped(shape.opts().WithName("A"));
-    Node* c = Unary(a, shape.opts().WithName("C"));
-    Node* e = BinaryUnknownShape(c, recv,
+    Node* recv = RecvAtHost(
+        ops::NodeOut(key_constant, 0), "F1", "O1", {DT_FLOAT, DT_FLOAT},
+        shape.opts().WithAttr(kXlaHasHostTransferAttrName, true));
+    Node* e = BinaryUnknownShape(recv, ops::NodeOut(recv, 1),
                                  shape.opts()
                                      .WithName("E")
                                      .WithAttr("_encapsulate", "F1")
                                      .WithAttr("_outside", "O1"));
-    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, shape.opts());
+    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e},
+                 shape.opts().WithAttr(kXlaHasHostTransferAttrName, true));
     TF_EXPECT_OK(
         AddGraphDefToFunctionLibrary(shape, "F1_O1", &library_expected));
   }
 
+  NameAttrList shape_inference_graph;
+  shape_inference_graph.set_name("_outside_compilation_shape_inference_F1_O1");
   *library_expected.add_function() = test::function::XTimesTwo();
   *library_expected.add_function() = FunctionDefHelper::Create(
       "F1", {"b_0_arg:float", "c_0_arg:float"}, {"f_0_retval_retval:float"}, {},
@@ -2262,13 +2458,12 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationShapeInference) {
            {"outside_compilation_O1_host_compute"}},
           {{"outside_compilation_O1_host_compute"},
            "XlaHostCompute",
-           {"c:o:0"},
-           {{"Tinputs", absl::Span<const DataType>({DT_FLOAT})},
+           {"c_0_arg", "c:o:0"},
+           {{"Tinputs", absl::Span<const DataType>({DT_FLOAT, DT_FLOAT})},
             {"Toutputs", absl::Span<const DataType>({DT_FLOAT})},
             {"ancestors", absl::Span<const string>({})},
             {"key", "host_compute_channel_F1_O1"},
-            {"shape_inference_graph",
-             "_outside_compilation_shape_inference_F1_O1"},
+            {"shape_inference_graph", shape_inference_graph},
             {"shapes", absl::Span<const DataType>({})},
             {"_outside_compilation_subgraph", "O1"}},
            {"c"}},
@@ -2285,16 +2480,18 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationShapeInference) {
 
     Node* key_constant =
         KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
-    Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
-                            {DT_FLOAT}, b2.opts());
-    Node* e = BinaryUnknownShape(c, ops::NodeOut(recv, 0),
+    Node* recv = RecvAtHost(
+        ops::NodeOut(key_constant, 0), "F1", "O1", {DT_FLOAT, DT_FLOAT},
+        b2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
+    Node* e = BinaryUnknownShape(recv, ops::NodeOut(recv, 1),
                                  b2.opts()
                                      .WithName("E")
-                                     .WithControlInputs({recv, b})
+                                     .WithControlInputs({recv})
                                      .WithAttr("_encapsulate", "F1")
                                      .WithAttr("_outside", "O1"));
     Node* send = SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e},
-                              b2.opts().WithControlInput(e));
+                              b2.opts().WithControlInput(e).WithAttr(
+                                  kXlaHasHostTransferAttrName, true));
 
     Node* s = Sequencer(
         b2.opts().WithName("F1_sequencer").WithControlInputs({recv, send}),
@@ -2303,9 +2500,9 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationShapeInference) {
     NodeBuilder node_builder("F1", "F1", lib_def.get());
     node_builder.Input(b).Input(c);
     Node* call =
-        b2.opts().WithControlInputs({s, c}).FinalizeBuilder(&node_builder);
+        b2.opts().WithControlInputs({s, b, c}).FinalizeBuilder(&node_builder);
 
-    Binary(a, call, b2.opts().WithName("G").WithControlInputs({e}));
+    Binary(a, call, b2.opts().WithName("G").WithControlInputs({call}));
     TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
   }
 
diff --git a/tensorflow/compiler/jit/encapsulate_util.cc b/tensorflow/compiler/jit/encapsulate_util.cc
index bcc3213285bee2a2094bd6c39b37ba95874d90ed..2264806d6bdabd9f26d9f83b681524399f996317 100644
--- a/tensorflow/compiler/jit/encapsulate_util.cc
+++ b/tensorflow/compiler/jit/encapsulate_util.cc
@@ -62,516 +62,6 @@ void ReplaceAttr(Node* n, const string& attr_name, const T& value) {
   n->AddAttr(attr_name, value);
 }
 
-// Step 1a ~ 1d for PreprocessForEncapsulation(). See comments of
-// PreprocessForEncapsulation() for details.
-Status ProcessControlEdges(Graph* g, const string& xla_computation_attr_name,
-                           const string& outside_compilation_attr_name) {
-  // Gather edges to remove. We should not remove the edge while iterating.
-  std::vector<const Edge*> edges_to_remove;
-  for (const Edge* e : g->edges()) {
-    if (!e->IsControlEdge()) {
-      continue;
-    }
-
-    auto src_xla_computation =
-        GetStringAttr(*e->src(), xla_computation_attr_name);
-    auto dst_xla_computation =
-        GetStringAttr(*e->dst(), xla_computation_attr_name);
-    auto src_outside_compilation =
-        GetStringAttr(*e->src(), outside_compilation_attr_name);
-    auto dst_outside_compilation =
-        GetStringAttr(*e->dst(), outside_compilation_attr_name);
-
-    if (!src_xla_computation && !dst_xla_computation) {
-      continue;
-    } else if (src_xla_computation && !dst_xla_computation) {
-      if (src_outside_compilation) {
-        // Case 1c: outside compilation to host computation control edge.
-        edges_to_remove.push_back(e);
-
-        TF_RETURN_IF_ERROR(AppendToListAttr<string>(
-            e->dst(), kXlaControlDependenciesAttrName, e->src()->name()));
-      }
-    } else if (!src_xla_computation && dst_xla_computation) {
-      if (dst_outside_compilation) {
-        // Case 1c: host computation control to outside compilation edge.
-        edges_to_remove.push_back(e);
-
-        TF_RETURN_IF_ERROR(AppendToListAttr<string>(
-            e->dst(), kXlaControlDependenciesAttrName, e->src()->name()));
-      }
-    } else {  // src_xla_computation && dst_xla_computation
-      if (*src_xla_computation != *dst_xla_computation) {
-        if (src_outside_compilation && dst_outside_compilation) {
-          // Case 1b: outside compilation to outside compilation control edge.
-          edges_to_remove.push_back(e);
-
-          TF_RETURN_IF_ERROR(AppendToListAttr<string>(
-              e->dst(), kXlaControlDependenciesAttrName, e->src()->name()));
-        } else if (src_outside_compilation && !dst_outside_compilation) {
-          // Case 1a: outside compilation to another XLA computaition control
-          // edge.
-          TF_RETURN_IF_ERROR(AppendToListAttr<string>(
-              e->src(), kXlaConnectedToOtherXlaComputationAttrName,
-              *dst_xla_computation));
-        } else if (!src_outside_compilation && dst_outside_compilation) {
-          // Case 1a: another XLA computaition to outside compilation control
-          // edge.
-          TF_RETURN_IF_ERROR(AppendToListAttr<string>(
-              e->dst(), kXlaConnectedFromOtherXlaComputationAttrName,
-              *src_xla_computation));
-        }
-      }
-    }
-  }
-
-  for (auto e : edges_to_remove) {
-    g->RemoveEdge(e);
-  }
-  return Status::OK();
-}
-
-// Step 2 for PreprocessForEncapsulation(). See comments of
-// PreprocessForEncapsulation() for details.
-Status ProcessXlaToXlaDataEdges(Graph* g,
-                                const string& xla_computation_attr_name,
-                                const string& outside_compilation_attr_name) {
-  // Gather edges between XLA computations. Notice that we do not store `Edge*`
-  // directly because we remove some nodes while adding Identity nodes, and
-  // those Edge pointers might be invalidated.
-  struct EdgeInfo {
-    int dst_input, dst_node_id;
-  };
-  std::vector<EdgeInfo> edges;
-  for (const Edge* e : g->edges()) {
-    if (e->IsControlEdge()) {
-      continue;
-    }
-
-    auto src_xla_computation =
-        GetStringAttr(*e->src(), xla_computation_attr_name);
-    auto dst_xla_computation =
-        GetStringAttr(*e->dst(), xla_computation_attr_name);
-    auto src_outside_compilation =
-        GetStringAttr(*e->src(), outside_compilation_attr_name);
-    auto dst_outside_compilation =
-        GetStringAttr(*e->dst(), outside_compilation_attr_name);
-    if (!src_xla_computation || !dst_xla_computation) {
-      continue;
-    }
-
-    if (*src_xla_computation != *dst_xla_computation) {
-      if (src_outside_compilation || dst_outside_compilation) {
-        edges.push_back(EdgeInfo{e->dst_input(), e->dst()->id()});
-        VLOG(4) << "XLA -> XLA edge: " << e->DebugString();
-      }
-    }
-  }
-
-  // For each XLA -> XLA edge, add an Identity node between src and dst.
-  for (int i = 0; i < edges.size(); i++) {
-    Node* dst = g->FindNodeId(edges[i].dst_node_id);
-    const Edge* e;
-    TF_RETURN_IF_ERROR(dst->input_edge(edges[i].dst_input, &e));
-    Node* src = e->src();
-    int src_output = e->src_output(), dst_input = e->dst_input();
-    g->RemoveEdge(e);
-
-    // Create Identity node, and connect it between `src` and `dst`.
-    string identity_node_name =
-        absl::StrCat("bridge_", src->name(), "_", dst->name());
-    DataType dtype = src->output_type(src_output);
-    TF_ASSIGN_OR_RETURN(Node * identity_node,
-                        BuildIdentityNode(g, identity_node_name, dtype, src,
-                                          /*requested_device=*/absl::nullopt));
-    identity_node->AddAttr(kBridgeSourceNodeAttrName, src->name());
-    g->AddEdge(src, src_output, identity_node, 0);
-    g->AddEdge(identity_node, 0, dst, dst_input);
-
-    // Replace `e->dst()` because its input node changed.
-    NodeDef new_def = dst->def();
-    *new_def.mutable_input(dst_input) = identity_node->name();
-    TF_ASSIGN_OR_RETURN(Node * dst_replace_node, ReplaceNode(g, dst, new_def));
-
-    // Other edge in `edges` might have `e->dst()` as src or dst
-    // node. Before removing `e->dst()`, replace those edges with corresponding
-    // edges for `dst_replace_node`.
-    for (int j = i + 1; j < edges.size(); j++) {
-      if (edges[j].dst_node_id == edges[i].dst_node_id) {
-        edges[j].dst_node_id = dst_replace_node->id();
-      }
-    }
-  }
-  return Status::OK();
-}
-
-// Step 3 for PreprocessForEncapsulation(). See comments of
-// PreprocessForEncapsulation() for details.
-Status ProcessDataEdgeBetweenOutsideCompilationAndHostComputation(
-    Graph* g, const string& xla_computation_attr_name,
-    const string& outside_compilation_attr_name) {
-  // Gather edges between outside compilation and host computation. Notice that
-  // we do not store `Edge*` directly because we remove some nodes while adding
-  // Identity nodes, and those Edge pointers might be invalidated.
-  struct EdgeInfo {
-    int dst_input, dst_node_id;
-    bool is_host_to_outside_compilation;
-  };
-  std::vector<EdgeInfo> edges;
-  for (const Edge* e : g->edges()) {
-    if (e->IsControlEdge()) {
-      continue;
-    }
-
-    if (e->src()->attrs().Find(xla_computation_attr_name) == nullptr &&
-        e->dst()->attrs().Find(xla_computation_attr_name) != nullptr &&
-        e->dst()->attrs().Find(outside_compilation_attr_name) != nullptr) {
-      edges.push_back(EdgeInfo{e->dst_input(), e->dst()->id(),
-                               /*is_host_to_outside_compilation=*/true});
-      VLOG(4) << "Host -> oc edge: " << e->DebugString();
-    } else if (e->dst()->attrs().Find(xla_computation_attr_name) == nullptr &&
-               e->src()->attrs().Find(xla_computation_attr_name) != nullptr &&
-               e->src()->attrs().Find(outside_compilation_attr_name) !=
-                   nullptr) {
-      edges.push_back(EdgeInfo{e->dst_input(), e->dst()->id(),
-                               /*is_host_to_outside_compilation=*/false});
-      VLOG(4) << "Oc -> host edge: " << e->DebugString();
-    }
-  }
-
-  // Remove the edge from host to outside compilation. Add a placeholder as
-  // outside compilation node input.
-  std::map<string, Node*> placeholders;
-  for (int i = 0; i < edges.size(); i++) {
-    Node* dst = g->FindNodeId(edges[i].dst_node_id);
-    const Edge* e;
-    TF_RETURN_IF_ERROR(dst->input_edge(edges[i].dst_input, &e));
-    Node* src = e->src();
-    int src_output = e->src_output(), dst_input = e->dst_input();
-    g->RemoveEdge(e);
-
-    // Find or create placeholder node.
-    string new_name =
-        edges[i].is_host_to_outside_compilation
-            ? absl::StrCat(src->name(), "_host_to_oc_placeholder")
-            : absl::StrCat(src->name(), "_oc_to_host_placeholder");
-    auto iter = placeholders.find(new_name);
-    Node* placeholder_node;
-    if (iter == placeholders.end()) {
-      NodeDefBuilder placeholder_builder(new_name, "Placeholder");
-      placeholder_builder.Attr("dtype", src->output_type(src_output));
-      if (edges[i].is_host_to_outside_compilation) {
-        placeholder_builder.Attr(kHostToOutsideCompilationOriginalNodeAttrName,
-                                 src->name());
-        placeholder_builder.Attr(kHostToOutsideCompilationSrcOutputAttrName,
-                                 src_output);
-        // If this placeholder node is in outside compilation, we need to set
-        // `xla_computation_attr_name` and `outside_compilation_attr_name`.
-        string xla_computation_attr, outside_compilation_attr;
-        TF_RETURN_IF_ERROR(GetNodeAttr(dst->attrs(), xla_computation_attr_name,
-                                       &xla_computation_attr));
-        TF_RETURN_IF_ERROR(GetNodeAttr(dst->attrs(),
-                                       outside_compilation_attr_name,
-                                       &outside_compilation_attr));
-        placeholder_builder.Attr(xla_computation_attr_name,
-                                 xla_computation_attr);
-        placeholder_builder.Attr(outside_compilation_attr_name,
-                                 outside_compilation_attr);
-      } else {
-        placeholder_builder.Attr(kOutsideCompilationToHostOriginalNodeAttrName,
-                                 src->name());
-        placeholder_builder.Attr(kOutsideCompilationToHostSrcOutputAttrName,
-                                 src_output);
-      }
-      NodeDef placeholder_def;
-      TF_RETURN_IF_ERROR(placeholder_builder.Finalize(&placeholder_def));
-      Status s;
-      placeholder_node = g->AddNode(placeholder_def, &s);
-      TF_RETURN_IF_ERROR(s);
-      placeholders[new_name] = placeholder_node;
-    } else {
-      placeholder_node = iter->second;
-    }
-    g->AddEdge(placeholder_node, 0, dst, dst_input);
-
-    // Replace `e->dst()` because its input node changed.
-    NodeDef new_def = dst->def();
-    *new_def.mutable_input(dst_input) = placeholder_node->name();
-    TF_ASSIGN_OR_RETURN(Node * dst_replace_node, ReplaceNode(g, dst, new_def));
-
-    // Other edge in `edges` might have `e->dst()` as src or dst
-    // node. Before removing `e->dst()`, replace those edges with corresponding
-    // edges for `dst_replace_node`.
-    for (int j = i + 1; j < edges.size(); j++) {
-      if (edges[j].dst_node_id == edges[i].dst_node_id) {
-        edges[j].dst_node_id = dst_replace_node->id();
-      }
-    }
-  }
-  return Status::OK();
-}
-
-// Step 1 for `PostprocessForEncapsulation`. See comments of
-// `PostprocessForEncapsulation` for details.
-Status RemovePlaceholderBetweenOutsideCompilationAndHostComputation(Graph* g) {
-  // Gather all outside compilation to host computation nodes.
-  struct PlaceHolderNodeInfo {
-    Node* n;
-    bool is_host_to_oc;
-  };
-  std::vector<PlaceHolderNodeInfo> placeholder_nodes;
-  for (Node* n : g->nodes()) {
-    if (n->type_string() == "Placeholder") {
-      if (HasNodeAttr(n->def(),
-                      kOutsideCompilationToHostOriginalNodeAttrName)) {
-        placeholder_nodes.push_back({n, false});
-      } else if (HasNodeAttr(n->def(),
-                             kHostToOutsideCompilationOriginalNodeAttrName)) {
-        placeholder_nodes.push_back({n, true});
-      }
-    }
-  }
-
-  // Remove the placeholder nodes, and reconnect original edge.
-  auto node_name_index = g->BuildNodeNameIndex();
-  for (auto placeholder_iter : placeholder_nodes) {
-    Node* n = placeholder_iter.n;
-
-    string node_name;
-    int node_src_output;
-    if (placeholder_iter.is_host_to_oc) {
-      TF_RETURN_IF_ERROR(
-          GetNodeAttr(n->attrs(), kHostToOutsideCompilationOriginalNodeAttrName,
-                      &node_name));
-      TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(),
-                                     kHostToOutsideCompilationSrcOutputAttrName,
-                                     &node_src_output));
-    } else {
-      TF_RETURN_IF_ERROR(
-          GetNodeAttr(n->attrs(), kOutsideCompilationToHostOriginalNodeAttrName,
-                      &node_name));
-      TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(),
-                                     kOutsideCompilationToHostSrcOutputAttrName,
-                                     &node_src_output));
-    }
-    auto iter = node_name_index.find(node_name);
-    if (iter == node_name_index.end()) {
-      return errors::Internal(
-          "Cannot find original node for oc -> host placeholder node ",
-          node_name);
-    }
-
-    // Change all usage node to use the original node instead.
-    Node* original_node = iter->second;
-    std::vector<const Edge*> control_edges;
-    std::vector<OutEdgeInfo> data_edges;
-    for (auto e : n->out_edges()) {
-      if (e->IsControlEdge()) {
-        control_edges.push_back(e);
-      } else {
-        data_edges.push_back({e->dst(), e->src_output(), e->dst_input()});
-      }
-    }
-    for (const Edge* e : control_edges) {
-      g->AddControlEdge(original_node, e->dst());
-      g->RemoveEdge(e);
-    }
-    for (int i = 0; i < data_edges.size(); i++) {
-      Node* dst = data_edges[i].dst;
-      NodeDef new_def = dst->def();
-      int dst_input = data_edges[i].dst_input;
-      *new_def.mutable_input(dst_input) =
-          absl::StrCat(original_node->name(), ":", node_src_output);
-      TF_ASSIGN_OR_RETURN(Node * replace_node, ReplaceNode(g, dst, new_def));
-
-      const Edge* edge_to_replace = nullptr;
-      TF_RETURN_IF_ERROR(replace_node->input_edge(dst_input, &edge_to_replace));
-      g->RemoveEdge(edge_to_replace);
-      g->AddEdge(original_node, node_src_output, replace_node, dst_input);
-
-      // Other edges might have `dst` as dst node. Update those edges with
-      // `replace_node`.
-      for (int j = i + 1; j < data_edges.size(); j++) {
-        if (data_edges[j].dst == dst) {
-          data_edges[j].dst = replace_node;
-        }
-      }
-
-      // Other placeholder node might have `dst` as original node. Update
-      // `node_name_index` with `replace_node`.
-      node_name_index[replace_node->name()] = replace_node;
-    }
-
-    // Remove placeholder node.
-    g->RemoveNode(n);
-  }
-  return Status::OK();
-}
-
-// Step 2 for `PostprocessForEncapsulation`. See comments of
-// `PostprocessForEncapsulation` for details.
-Status RemoveIdentityBetweenDifferentXlaComputation(Graph* g) {
-  // Gather Identity nodes to remove.
-  std::vector<Node*> bridge_nodes;
-  for (Node* n : g->nodes()) {
-    if (n->type_string() == "Identity" &&
-        HasNodeAttr(n->def(), kBridgeSourceNodeAttrName)) {
-      bridge_nodes.push_back(n);
-    }
-  }
-
-  // Remove the identity nodes, and reconnect the original edge.
-  for (int i = 0; i < bridge_nodes.size(); i++) {
-    Node* n = bridge_nodes[i];
-    const Edge* src_edge = nullptr;
-    TF_RETURN_IF_ERROR(n->input_edge(0, &src_edge));
-
-    // Change all usage node to use the original node instead.
-    std::vector<const Edge*> control_edges;
-    std::vector<OutEdgeInfo> data_edges;
-    for (auto e : n->out_edges()) {
-      if (e->IsControlEdge()) {
-        control_edges.push_back(e);
-      } else {
-        data_edges.push_back({e->dst(), e->src_output(), e->dst_input()});
-      }
-    }
-    for (const Edge* e : control_edges) {
-      g->AddControlEdge(src_edge->src(), e->dst());
-      g->RemoveEdge(e);
-    }
-    for (int j = 0; j < data_edges.size(); j++) {
-      Node* dst = data_edges[j].dst;
-      NodeDef new_def = dst->def();
-      int dst_input = data_edges[j].dst_input;
-      *new_def.mutable_input(dst_input) =
-          absl::StrCat(src_edge->src()->name(), ":", src_edge->src_output());
-      TF_ASSIGN_OR_RETURN(Node * replace_node, ReplaceNode(g, dst, new_def));
-
-      const Edge* edge_to_replace = nullptr;
-      TF_RETURN_IF_ERROR(replace_node->input_edge(dst_input, &edge_to_replace));
-      g->RemoveEdge(edge_to_replace);
-      g->AddEdge(src_edge->src(), src_edge->src_output(), replace_node,
-                 dst_input);
-
-      // Other edges might have `dst` as dst node. Update those edges with
-      // `replace_node`.
-      for (int k = j + 1; k < data_edges.size(); k++) {
-        if (data_edges[k].dst == dst) {
-          data_edges[k].dst = replace_node;
-        }
-      }
-
-      // The node we replaced might be in `bridge_nodes`. If so, update
-      // `bridge_nodes` to use the replaced node.
-      for (int k = i + 1; k < bridge_nodes.size(); k++) {
-        if (bridge_nodes[k] == dst) {
-          bridge_nodes[k] = replace_node;
-        }
-      }
-    }
-
-    // Remove Identity node.
-    g->RemoveNode(n);
-  }
-  return Status::OK();
-}
-
-// Step 3 for `PostprocessForEncapsulation`. See comments of
-// `PostprocessForEncapsulation` for details.
-// We do not need to worry about removed nodes in step 1 and 2;
-// `PreprocessForEncapsulation` will not record control dependencies for those
-// remvoed nodes in the first place.
-Status AddControlDependencies(
-    Graph* g, const std::unordered_map<string, string>& cluster_node_names) {
-  auto node_name_index = g->BuildNodeNameIndex();
-
-  // Reconnect outside compilation to outside compilation control edge.
-  for (Node* n : g->nodes()) {
-    std::vector<string> control_deps;
-    Status s =
-        GetNodeAttr(n->attrs(), kXlaControlDependenciesAttrName, &control_deps);
-    if (!s.ok()) {
-      if (s.code() != error::NOT_FOUND) {
-        return s;
-      } else {
-        continue;
-      }
-    } else {
-      n->ClearAttr(kXlaControlDependenciesAttrName);
-      for (const string& control_input : control_deps) {
-        auto iter = node_name_index.find(control_input);
-        if (iter == node_name_index.end()) {
-          return errors::Internal("Cannot find original node for ",
-                                  control_input);
-        }
-        g->AddControlEdge(iter->second, n);
-      }
-    }
-  }
-
-  // Reconnect outside compilation to XLA computation control edge.
-  for (Node* n : g->nodes()) {
-    std::vector<string> control_deps;
-    Status s = GetNodeAttr(
-        n->attrs(), kXlaConnectedToOtherXlaComputationAttrName, &control_deps);
-    if (!s.ok()) {
-      if (s.code() != error::NOT_FOUND) {
-        return s;
-      } else {
-        continue;
-      }
-    } else {
-      n->ClearAttr(kXlaConnectedToOtherXlaComputationAttrName);
-      for (const string& control_input : control_deps) {
-        auto iter = cluster_node_names.find(control_input);
-        if (iter == cluster_node_names.end()) {
-          return errors::Internal("Cannot find cluster node for ",
-                                  control_input);
-        }
-        auto iter2 = node_name_index.find(iter->second);
-        if (iter2 == node_name_index.end()) {
-          return errors::Internal("Cannot find cluster node for ",
-                                  iter->second);
-        }
-        g->AddControlEdge(n, iter2->second);
-      }
-    }
-  }
-
-  // Reconnect XLA computation to outside compilation control edge.
-  for (Node* n : g->nodes()) {
-    std::vector<string> control_deps;
-    Status s =
-        GetNodeAttr(n->attrs(), kXlaConnectedFromOtherXlaComputationAttrName,
-                    &control_deps);
-    if (!s.ok()) {
-      if (s.code() != error::NOT_FOUND) {
-        return s;
-      } else {
-        continue;
-      }
-    } else {
-      n->ClearAttr(kXlaConnectedFromOtherXlaComputationAttrName);
-      for (const string& control_input : control_deps) {
-        auto iter = cluster_node_names.find(control_input);
-        if (iter == cluster_node_names.end()) {
-          return errors::Internal("Cannot find cluster node for ",
-                                  control_input);
-        }
-        auto iter2 = node_name_index.find(iter->second);
-        if (iter2 == node_name_index.end()) {
-          return errors::Internal("Cannot find cluster node for ",
-                                  iter->second);
-        }
-        g->AddControlEdge(iter2->second, n);
-      }
-    }
-  }
-
-  return Status::OK();
-}
-
 // Step 1 for `PreprocessEdgesBetweenOutsideCompilations`. See comments of
 // `PreprocessEdgesBetweenOutsideCompilations` for details.
 Status PreprocessControlEdgesBetweenOutsideCompilations(
@@ -642,7 +132,7 @@ Status PreprocessDataEdgesBetweenOutsideCompilations(
 
   // Remove the edge from host to outside compilation. Add a placeholder as
   // outside compilation node input.
-  std::map<string, Node*> placeholders;
+  std::map<std::pair<string, int>, Node*> placeholders;
   for (int i = 0; i < edges.size(); i++) {
     Node* dst = g->FindNodeId(edges[i].dst_node_id);
     const Edge* e;
@@ -652,8 +142,10 @@ Status PreprocessDataEdgesBetweenOutsideCompilations(
     g->RemoveEdge(e);
 
     // Find or create placeholder node.
-    string new_name = absl::StrCat(src->name(), "_oc_to_oc_placeholder");
-    auto iter = placeholders.find(new_name);
+    string new_name =
+        absl::StrCat(src->name(), "_oc_to_oc_placeholder_", src_output);
+    auto placeholder_index = std::make_pair(src->name(), src_output);
+    auto iter = placeholders.find(placeholder_index);
     Node* placeholder_node;
     if (iter == placeholders.end()) {
       NodeDefBuilder placeholder_builder(new_name, "Placeholder");
@@ -673,7 +165,7 @@ Status PreprocessDataEdgesBetweenOutsideCompilations(
       Status s;
       placeholder_node = g->AddNode(placeholder_def, &s);
       TF_RETURN_IF_ERROR(s);
-      placeholders[new_name] = placeholder_node;
+      placeholders[placeholder_index] = placeholder_node;
     } else {
       placeholder_node = iter->second;
     }
@@ -808,20 +300,6 @@ Status PostprocessControlEdgesBetweenOutsideCompilations(
 
 const char kXlaInferredShapesAttrName[] = "_xla_inferred_shapes";
 
-const char kXlaConnectedToOtherXlaComputationAttrName[] =
-    "_xla_connected_to_other_xla_computation";
-const char kXlaConnectedFromOtherXlaComputationAttrName[] =
-    "_xla_connected_from_other_xla_computation";
-const char kXlaControlDependenciesAttrName[] = "_xla_control_dependencies";
-const char kBridgeSourceNodeAttrName[] = "_xla_bridge_src";
-const char kOutsideCompilationToHostOriginalNodeAttrName[] =
-    "_xla_oc_to_host_node_name";
-const char kOutsideCompilationToHostSrcOutputAttrName[] =
-    "_xla_oc_to_host_src_output";
-const char kHostToOutsideCompilationOriginalNodeAttrName[] =
-    "_xla_host_to_oc_node_name";
-const char kHostToOutsideCompilationSrcOutputAttrName[] =
-    "_xla_host_to_oc_src_output";
 const char kXlaConnectedToXlaComputationAttrName[] =
     "_xla_connected_to_xla_computation";
 const char kXlaConnectedFromXlaComputationAttrName[] =
@@ -832,32 +310,7 @@ const char kOutsideCompilationSrcOutputAttrName[] = "_xla_oc_to_oc_src_output";
 const char kXlaControlDependenciesWithinXlaClusterAttrName[] =
     "_xla_control_dependencies_within_xla_cluster";
 
-Status PerformStaticShapeInferenceBeforeEncapsulation(
-    Graph* g, const string& xla_computation_attr_name,
-    const string& outside_compilation_attr_name) {
-  // Find all outside compilation to XLA computation data edges.
-  std::unordered_set<Node*> outside_compilation_send_nodes;
-  for (auto e : g->edges()) {
-    if (e->IsControlEdge()) {
-      continue;
-    }
-
-    auto src_computation = GetStringAttr(*e->src(), xla_computation_attr_name);
-    auto dst_computation = GetStringAttr(*e->dst(), xla_computation_attr_name);
-    if (!src_computation || !dst_computation ||
-        *src_computation != *dst_computation) {
-      continue;
-    }
-
-    auto src_outside_compilation =
-        GetStringAttr(*e->src(), outside_compilation_attr_name);
-    auto dst_outside_compilation =
-        GetStringAttr(*e->dst(), outside_compilation_attr_name);
-    if (src_outside_compilation && !dst_outside_compilation) {
-      outside_compilation_send_nodes.insert(e->src());
-    }
-  }
-
+Status PerformStaticShapeInferenceBeforeEncapsulation(Graph* g) {
   // Perform shape inference.
   std::map<int, InferredShape> arg_shapes;
   GraphShapeInfo shape_info;
@@ -865,55 +318,21 @@ Status PerformStaticShapeInferenceBeforeEncapsulation(
       InferShapes(g, arg_shapes, /*fnlib_def=*/nullptr, &shape_info));
 
   // Add attribute for output shapes.
-  for (Node* n : outside_compilation_send_nodes) {
-    auto iter = shape_info.find(n->name());
-    if (iter == shape_info.end()) {
-      continue;
-    }
-
+  auto node_name_index = g->BuildNodeNameIndex();
+  for (auto iter : shape_info) {
     std::vector<PartialTensorShape> output_shapes;
-    std::transform(iter->second.begin(), iter->second.end(),
+    std::transform(iter.second.begin(), iter.second.end(),
                    std::back_inserter(output_shapes),
                    [](const InferredShape& inferred_shape) {
                      return inferred_shape.shape;
                    });
+    Node* n = node_name_index[iter.first];
     n->AddAttr(kXlaInferredShapesAttrName, output_shapes);
   }
 
   return Status::OK();
 }
 
-Status PreprocessForEncapsulation(Graph* g,
-                                  const string& xla_computation_attr_name,
-                                  const string& outside_compilation_attr_name) {
-  TF_RETURN_IF_ERROR(ProcessControlEdges(g, xla_computation_attr_name,
-                                         outside_compilation_attr_name));
-  TF_RETURN_IF_ERROR(ProcessXlaToXlaDataEdges(g, xla_computation_attr_name,
-                                              outside_compilation_attr_name));
-  TF_RETURN_IF_ERROR(ProcessDataEdgeBetweenOutsideCompilationAndHostComputation(
-      g, xla_computation_attr_name, outside_compilation_attr_name));
-  return Status::OK();
-}
-
-Status PostprocessForEncapsulation(
-    Graph* g, const string& xla_computation_attr_name,
-    const string& outside_compilation_attr_name,
-    const std::unordered_map<string, XlaClusterInfo>& clusters) {
-  // The `node` pointer in `XlaClusterInfo` might be invalidated in step 1/2,
-  // but the node name won't change. Record cluster node name for
-  // `AddControlDependencies`.
-  std::unordered_map<string, string> cluster_node_names;
-  for (const auto& iter : clusters) {
-    cluster_node_names[iter.first] = iter.second.node->name();
-  }
-
-  TF_RETURN_IF_ERROR(
-      RemovePlaceholderBetweenOutsideCompilationAndHostComputation(g));
-  TF_RETURN_IF_ERROR(RemoveIdentityBetweenDifferentXlaComputation(g));
-  TF_RETURN_IF_ERROR(AddControlDependencies(g, cluster_node_names));
-  return Status::OK();
-}
-
 Status PreprocessEdgesBetweenOutsideCompilations(
     Graph* g, const string& outside_compilation_attr_name) {
   // Remove edges from source node to outside compilation nodes, and edges
diff --git a/tensorflow/compiler/jit/encapsulate_util.h b/tensorflow/compiler/jit/encapsulate_util.h
index e363bc5754ac395bae262dc67a780a0173efaf5e..c9f16d14168163e11bb19092f566f1de8724aca3 100644
--- a/tensorflow/compiler/jit/encapsulate_util.h
+++ b/tensorflow/compiler/jit/encapsulate_util.h
@@ -27,51 +27,13 @@ namespace tensorflow {
 // a list of PartialTensorShape objects.
 extern const char kXlaInferredShapesAttrName[];
 
-// Infer output shapes for outside compilation nodes which have output data
-// edges to XLA computation nodes. These shapes will be used later by XLA
-// compiler as output shapes of the outside compilation's XlaHostCompute op.
-// XLA computation nodes will be mark by attr `xla_computation_attr_name`;
-// outside compilation nodes will be marked by both attr
-// `xla_computation_attr_name` and `outside_compilation_attr_name`.
-//
-// Those outside compilation nodes will be marked with attribute
-// `kXlaInferredShapesAttrName`.
+// Infers output shapes for all nodes in graph `g`. The output shapes will be
+// stored in node attribute `kXlaInferredShapesAttrName`.
 //
 // We have to perform shape inference before encapsulation because after
 // encapsulation, some nodes will be encapsulated into function call, and shape
 // inference does not handle function call at the moment.
-Status PerformStaticShapeInferenceBeforeEncapsulation(
-    Graph* g, const string& xla_computation_attr_name,
-    const string& outside_compilation_attr_name);
-
-// Attribute indicating that some ops in other XLA computation has control
-// dependency on this node. Attribute value will be a list of string (XLA
-// computation names).
-extern const char kXlaConnectedToOtherXlaComputationAttrName[];
-
-// Attribute indicating that this node has control dependency on some ops in
-// other XLA computation. Attribute value will be a list of string (XLA
-// computation names).
-extern const char kXlaConnectedFromOtherXlaComputationAttrName[];
-
-// Attribute indicating that this node has control dependencies on some other
-// nodes. Attribute value will be a list of string (node names).
-extern const char kXlaControlDependenciesAttrName[];
-
-// Attribute indicating that this is an Identity node added to act as a bridge
-// between different XLA computations. Attribute value will be string (source
-// node name).
-extern const char kBridgeSourceNodeAttrName[];
-
-// Attribute indicating that this is an Placeholder node added to act as a
-// temporary input node for an outside compilation node. Attribute value will be
-// string (original input node name).
-extern const char kOutsideCompilationToHostOriginalNodeAttrName[];
-
-// Attribute indicating that this is an Placeholder node added to act as a
-// temporary input node for an outside compilation node. Attribute value will be
-// int (src_output for original edge).
-extern const char kOutsideCompilationToHostSrcOutputAttrName[];
+Status PerformStaticShapeInferenceBeforeEncapsulation(Graph* g);
 
 // Attribute indicating that some ops in this node's XLA computation has control
 // dependency on this node. Attribute value will always be "true".
@@ -81,16 +43,6 @@ extern const char kXlaConnectedToXlaComputationAttrName[];
 // this node's XLA computation. Attribute value will always be "true".
 extern const char kXlaConnectedFromXlaComputationAttrName[];
 
-// Attribute indicating that this is an Placeholder node added to act as a
-// temporary input node for an host node. Attribute value will be string
-// (original input node name).
-extern const char kHostToOutsideCompilationOriginalNodeAttrName[];
-
-// Attribute indicating that this is an Placeholder node added to act as a
-// temporary input node for a host node. Attribute value will be int (src_output
-// for original edge).
-extern const char kHostToOutsideCompilationSrcOutputAttrName[];
-
 // Attribute indicating that this is an Placeholder node added to act as a
 // temporary input node for an outside compilation node. Attribute value will be
 // string (original input node name).
@@ -106,27 +58,6 @@ extern const char kOutsideCompilationSrcOutputAttrName[];
 // (node names).
 extern const char kXlaControlDependenciesWithinXlaClusterAttrName[];
 
-// Preprocesses edges between different XLA clusters for encapsulation. It will
-// perform the following operations in order:
-//
-// 1a. For control edges between outside compilation and another XLA
-//     computation, add attr "kXlaConnected{From, To}OtherXlaComputationAttrName
-//     = XLA computation node name" to the outside compilation node.
-// 1b. For control edges between different outside compilations (in different
-//     XLA computations), remove the edge and add attr
-//     "kXlaControlDependenciesAttrName = src node name" to dst node.
-// 1c. For control edges between outside compilation and host computation,
-//     remove the edge and add attr "kXlaControlDependenciesAttrName = src node
-//     name" to dst node.
-// 2. For data edges between different XLA computations, if either src or dst
-//    is outside compilation, add an Identity node in between the edge. The
-//    identity node will have attr kBridgeSourceNodeAttrName.
-// 3. For data edges between outside compilation and host computation, remove
-//    the edge and create a Placeholder node as dst node's input.
-Status PreprocessForEncapsulation(Graph* g,
-                                  const string& xla_computation_attr_name,
-                                  const string& outside_compilation_attr_name);
-
 // Information for XLA computation.
 struct XlaClusterInfo {
   // Add an explicitly-defined default constructor for this class.
@@ -158,24 +89,6 @@ struct XlaClusterInfo {
   const std::map<string, int> host_compute_core;
 };
 
-// Postprocesses edges between different XLA clusters for encapsulation. This
-// function reverts what `PreprocessForEncapsulation` did. It will perform the
-// following operations in order:
-//
-// 1. Remove Placeholder nodes between outside compilation and host computation
-//     (created in `PreprocessForEncapsulation` step 3).
-// 2. Remove Identity nodes created in `PreprocessForEncapsulation` step 2.
-// 3a. Reconnect control edges between outside compilation and another XLA
-//     computation (marked by `PreprocessForEncapsulation` step 1a).
-// 3b. Reconnect control edges between different outside compilations (marked by
-//     `PreprocessForEncapsulation` step 1b).
-// 3c. Reconnect control edges between outside compilation and host computation
-//     (marked by `PreprocessForEncapsulation` step 1c).
-Status PostprocessForEncapsulation(
-    Graph* g, const string& xla_computation_attr_name,
-    const string& outside_compilation_attr_name,
-    const std::unordered_map<string, XlaClusterInfo>& clusters);
-
 // Preprocesses edges within the same XLA cluster. It will perform the following
 // operations in order:
 //
diff --git a/tensorflow/compiler/jit/encapsulate_util_test.cc b/tensorflow/compiler/jit/encapsulate_util_test.cc
index 25c32cef01d7f9877a35001457539f2ad189192f..3bb979e0698d2d6be42ed5bae66c25267928192c 100644
--- a/tensorflow/compiler/jit/encapsulate_util_test.cc
+++ b/tensorflow/compiler/jit/encapsulate_util_test.cc
@@ -38,24 +38,11 @@ TEST(PerformStaticShapeInferenceBeforeEncapsulationTest, Basic) {
   Graph g(OpRegistry::Global());
   TF_CHECK_OK(s.ToGraph(&g));
 
-  // "add" node is outside compilation node, "identity" node is XLA node.
-  auto node_index = g.BuildNodeNameIndex();
-  Node *add_node = node_index["add"], *identity_node = node_index["identity"];
-  add_node->AddAttr("_xla", "cluster");
-  add_node->AddAttr("_oc", "cluster");
-  identity_node->AddAttr("_xla", "cluster");
-  TF_CHECK_OK(
-      PerformStaticShapeInferenceBeforeEncapsulation(&g, "_xla", "_oc"));
+  TF_CHECK_OK(PerformStaticShapeInferenceBeforeEncapsulation(&g));
 
-  // Check that only "add" node now has _xla_inferred_shapes attr.
-  std::vector<Node *> nodes_with_inferred_shape;
-  for (Node *n : g.nodes()) {
-    if (HasNodeAttr(n->def(), kXlaInferredShapesAttrName)) {
-      nodes_with_inferred_shape.push_back(n);
-    }
-  }
-  EXPECT_EQ(nodes_with_inferred_shape.size(), 1);
-  EXPECT_EQ(nodes_with_inferred_shape[0], add_node);
+  // Check that "add" node now has _xla_inferred_shapes attr.
+  auto node_index = g.BuildNodeNameIndex();
+  Node *add_node = node_index["add"];
   std::vector<PartialTensorShape> output_shapes;
   TF_CHECK_OK(GetNodeAttr(add_node->attrs(), kXlaInferredShapesAttrName,
                           &output_shapes));
@@ -66,293 +53,4 @@ TEST(PerformStaticShapeInferenceBeforeEncapsulationTest, Basic) {
   EXPECT_EQ(shape_proto.dim(0).size(), 2);
 }
 
-TEST(PreprocessForEncapsulationTest, ControlEdges) {
-  // Build the graph:
-  // "const_0" and "const_1" in host computation
-  // "add" = "const_0" + "const_1" in XLA computation 0
-  // "identity0" = "add" in XLA computation 0 & outside compilation 0
-  // "identity1" = "identity0" in XLA computation 0
-  // "identity2" = "identity1" in host computation
-  // "identity3" = "identity2" in XLA computation 1
-  // "identity4" = "identity3" in XLA computation 1 & outside compilation 1
-  // "identity5" = "identity4" in XLA computation 1
-  // "identity6" = "identity5" in host computation
-  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  Output const_0 = ops::Const(s.WithOpName("const_0"), 1, {});
-  Output const_1 = ops::Const(s.WithOpName("const_1"), 2, {});
-  Output add = ops::Add(s.WithOpName("add"), const_0, const_1);
-  Output identity0 = ops::Identity(s.WithOpName("identity0"), add);
-  Output identity1 = ops::Identity(s.WithOpName("identity1"), identity0);
-  Output identity2 = ops::Identity(s.WithOpName("identity2"), identity1);
-  Output identity3 = ops::Identity(s.WithOpName("identity3"), identity2);
-  Output identity4 = ops::Identity(s.WithOpName("identity4"), identity3);
-  Output identity5 = ops::Identity(s.WithOpName("identity5"), identity4);
-  Graph g(OpRegistry::Global());
-  TF_CHECK_OK(s.ToGraph(&g));
-  auto node_index = g.BuildNodeNameIndex();
-
-  // Set XLA computation/outside compilation attr, and add control edges.
-  Node *const0_node = node_index["const_0"], *add_node = node_index["add"],
-       *identity0_node = node_index["identity0"],
-       *identity1_node = node_index["identity1"],
-       *identity2_node = node_index["identity2"],
-       *identity3_node = node_index["identity3"],
-       *identity4_node = node_index["identity4"],
-       *identity5_node = node_index["identity5"];
-  add_node->AddAttr("_xla", "0");
-  identity0_node->AddAttr("_xla", "0");
-  identity0_node->AddAttr("_oc", "0");
-  identity1_node->AddAttr("_xla", "0");
-  identity3_node->AddAttr("_xla", "1");
-  identity4_node->AddAttr("_xla", "1");
-  identity4_node->AddAttr("_oc", "0");
-  identity5_node->AddAttr("_xla", "1");
-  // Case 1a: control edges between outside compilation and another XLA
-  // computation.
-  g.AddControlEdge(identity0_node, identity3_node);
-  g.AddControlEdge(identity1_node, identity4_node);
-  // Case 1b: control edges between different outside compilations.
-  g.AddControlEdge(identity0_node, identity4_node);
-  // Case 1c: control edges between outside compilation and host computation.
-  g.AddControlEdge(const0_node, identity0_node);
-  g.AddControlEdge(identity0_node, identity2_node);
-
-  TF_CHECK_OK(PreprocessForEncapsulation(&g, "_xla", "_oc"));
-
-  // Case 1a: add attr "_xla_control_deps_{from/to} = XLA computation node name"
-  // to the outside compilation node.
-  std::vector<string> attr;
-  TF_CHECK_OK(GetNodeAttr(identity0_node->def(),
-                          kXlaConnectedToOtherXlaComputationAttrName, &attr));
-  EXPECT_EQ(attr.size(), 1);
-  EXPECT_EQ(attr[0], "1");
-  attr.clear();
-  TF_CHECK_OK(GetNodeAttr(identity4_node->def(),
-                          kXlaConnectedFromOtherXlaComputationAttrName, &attr));
-  EXPECT_EQ(attr.size(), 1);
-  EXPECT_EQ(attr[0], "0");
-  // Case 1b: add attr "_xla_control_deps = src node name" to dst node.
-  attr.clear();
-  TF_CHECK_OK(GetNodeAttr(identity4_node->def(),
-                          kXlaControlDependenciesAttrName, &attr));
-  EXPECT_EQ(attr.size(), 1);
-  EXPECT_EQ(attr[0], "identity0");
-  // Case 1c: add attr "_xla_control_deps = src node name" to dst node.
-  attr.clear();
-  TF_CHECK_OK(GetNodeAttr(identity0_node->def(),
-                          kXlaControlDependenciesAttrName, &attr));
-  EXPECT_EQ(attr.size(), 1);
-  EXPECT_EQ(attr[0], "const_0");
-  attr.clear();
-  TF_CHECK_OK(GetNodeAttr(identity2_node->def(),
-                          kXlaControlDependenciesAttrName, &attr));
-  EXPECT_EQ(attr.size(), 1);
-  EXPECT_EQ(attr[0], "identity0");
-}
-
-TEST(PreprocessForEncapsulationTest, DataEdges) {
-  // Build the graph:
-  // "const_0" and "const_1" in host computation
-  // "add0" = "const_0" + "const_1" in XLA computation 0
-  // "add1" = "add0" + "const_0" in XLA computation 0 & outside compilation 0
-  // "identity0" = "add1" in XLA computation 0
-  // "add2" = "add1" + "identity0" in host computation
-  // "add3" = "add1" + "add2" in XLA computation 1
-  // "add4" = "identity0" + "add2" in XLA computation 1 & outside compilation 1
-  // "identity1" = "add4" in XLA computation 1
-  // "identity2" = "identity1" in host computation
-  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  Output const_0 = ops::Const(s.WithOpName("const_0"), 1, {});
-  Output const_1 = ops::Const(s.WithOpName("const_1"), 2, {});
-  Output add0 = ops::Add(s.WithOpName("add0"), const_0, const_1);
-  Output add1 = ops::Add(s.WithOpName("add1"), add0, const_0);
-  Output identity0 = ops::Identity(s.WithOpName("identity0"), add1);
-  Output add2 = ops::Add(s.WithOpName("add2"), add1, identity0);
-  Output add3 = ops::Add(s.WithOpName("add3"), add1, add2);
-  Output add4 = ops::Add(s.WithOpName("add4"), identity0, add2);
-  Output identity1 = ops::Identity(s.WithOpName("identity1"), add4);
-  Output identity2 = ops::Identity(s.WithOpName("identity2"), add4);
-  Graph g(OpRegistry::Global());
-  TF_CHECK_OK(s.ToGraph(&g));
-  auto node_index = g.BuildNodeNameIndex();
-
-  // Set XLA computation/outside compilation attr.
-  Node *add0_node = node_index["add0"], *add1_node = node_index["add1"],
-       *identity0_node = node_index["identity0"],
-       *add3_node = node_index["add3"], *add4_node = node_index["add4"],
-       *identity1_node = node_index["identity1"];
-  add0_node->AddAttr("_xla", "0");
-  add1_node->AddAttr("_xla", "0");
-  add1_node->AddAttr("_oc", "0");
-  identity0_node->AddAttr("_xla", "0");
-  add3_node->AddAttr("_xla", "1");
-  add4_node->AddAttr("_xla", "1");
-  add4_node->AddAttr("_oc", "0");
-  identity1_node->AddAttr("_xla", "1");
-
-  TF_CHECK_OK(PreprocessForEncapsulation(&g, "_xla", "_oc"));
-
-  // Check input nodes for related data edges.
-  node_index = g.BuildNodeNameIndex();
-  // Step 2: add an Identity node between different XLA computations.
-  Node *bridge_add1_add3 = node_index["bridge_add1_add3"];
-  EXPECT_NE(bridge_add1_add3, nullptr);
-  string str;
-  TF_CHECK_OK(
-      GetNodeAttr(bridge_add1_add3->attrs(), kBridgeSourceNodeAttrName, &str));
-  EXPECT_EQ(str, "add1");
-  Node *bridge_identity0_add4 = node_index["bridge_identity0_add4"];
-  EXPECT_NE(bridge_identity0_add4, nullptr);
-  // Step 3: add placeholder for edges between host computation and outside
-  // compilation.
-  EXPECT_EQ(bridge_add1_add3->def().input(0), "add1_oc_to_host_placeholder");
-  Node *add1_oc_to_host_placeholder = node_index["add1_oc_to_host_placeholder"];
-  TF_CHECK_OK(GetNodeAttr(add1_oc_to_host_placeholder->attrs(),
-                          kOutsideCompilationToHostOriginalNodeAttrName, &str));
-  EXPECT_EQ(str, "add1");
-  int i;
-  TF_CHECK_OK(GetNodeAttr(add1_oc_to_host_placeholder->attrs(),
-                          kOutsideCompilationToHostSrcOutputAttrName, &i));
-  EXPECT_EQ(i, 0);
-  add4_node = node_index["add4"];
-  ASSERT_NE(add4_node, nullptr);
-  EXPECT_EQ(add4_node->def().input(0),
-            "bridge_identity0_add4_host_to_oc_placeholder");
-  Node *identity0_host_to_oc_placeholder =
-      node_index["bridge_identity0_add4_host_to_oc_placeholder"];
-  TF_CHECK_OK(GetNodeAttr(identity0_host_to_oc_placeholder->attrs(),
-                          kHostToOutsideCompilationOriginalNodeAttrName, &str));
-  EXPECT_EQ(str, "bridge_identity0_add4");
-  TF_CHECK_OK(GetNodeAttr(identity0_host_to_oc_placeholder->attrs(),
-                          kHostToOutsideCompilationSrcOutputAttrName, &i));
-  EXPECT_EQ(i, 0);
-}
-
-TEST(PostprocessForEncapsulationTest, ControlEdges) {
-  // Build the graph:
-  // "const0"
-  // "identity0" = "const0" (XLA computation 0)
-  // "identity1" = "identity0"
-  // "identity2" = "identity1" (XLA computation 1)
-  // "identity3" = "identity2"
-  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  Output const0 = ops::Const(s.WithOpName("const0"), 1, {});
-  Output identity0 = ops::Identity(s.WithOpName("identity0"), const0);
-  Output identity1 = ops::Identity(s.WithOpName("identity1"), identity0);
-  Output identity2 = ops::Identity(s.WithOpName("identity2"), identity1);
-  Output identity3 = ops::Identity(s.WithOpName("identity3"), identity2);
-  Graph g(OpRegistry::Global());
-  TF_CHECK_OK(s.ToGraph(&g));
-  auto node_index = g.BuildNodeNameIndex();
-
-  // Set XLA computation/outside compilation attr, and add control edges.
-  Node *const0_node = node_index["const0"],
-       *identity0_node = node_index["identity0"],
-       *identity1_node = node_index["identity1"],
-       *identity2_node = node_index["identity2"],
-       *identity3_node = node_index["identity3"];
-  identity1_node->AddAttr(kXlaConnectedFromOtherXlaComputationAttrName,
-                          std::vector<string>{"0"});
-  identity1_node->AddAttr(kXlaConnectedToOtherXlaComputationAttrName,
-                          std::vector<string>{"1"});
-  identity3_node->AddAttr(kXlaControlDependenciesAttrName,
-                          std::vector<string>{"const0", "identity1"});
-
-  std::unordered_map<string, XlaClusterInfo> clusters;
-  clusters["0"].node = identity0_node;
-  clusters["1"].node = identity2_node;
-  TF_CHECK_OK(PostprocessForEncapsulation(&g, "_xla", "_oc", clusters));
-
-  // Case 3a: we have control edge identity0 -> identity1, and identity1 ->
-  // identity2.
-  bool edge_identity0_identity1 = false, edge_identity1_identity2 = false;
-  for (const Edge *e : g.edges()) {
-    if (!e->IsControlEdge()) {
-      continue;
-    }
-    if (e->src() == identity0_node && e->dst() == identity1_node) {
-      edge_identity0_identity1 = true;
-    } else if (e->src() == identity1_node && e->dst() == identity2_node) {
-      edge_identity1_identity2 = true;
-    }
-  }
-  EXPECT_TRUE(edge_identity0_identity1);
-  EXPECT_TRUE(edge_identity1_identity2);
-  // Case 3b: we have control edge const0 -> identity3, and identity1 ->
-  // identity3.
-  bool edge_const0_identity3 = false, edge_identity1_identity3 = false;
-  for (const Edge *e : g.edges()) {
-    if (!e->IsControlEdge()) {
-      continue;
-    }
-    if (e->src() == const0_node && e->dst() == identity3_node) {
-      edge_const0_identity3 = true;
-    } else if (e->src() == identity1_node && e->dst() == identity3_node) {
-      edge_identity1_identity3 = true;
-    }
-  }
-  EXPECT_TRUE(edge_const0_identity3);
-  EXPECT_TRUE(edge_identity1_identity3);
-}
-
-TEST(PostprocessForEncapsulationTest, DataEdges) {
-  // Build the graph:
-  // "const0" in outside compilation "0"
-  // "placeholder0" (for "const0") in host computation
-  // "add0" = "placeholder0" + "placeholder0" in host computation
-  // "placeholder1" (for "add0") in outside compilation 1
-  // "add1" = "placeholder1" + "placeholder1" in outside compilation 1
-  //
-  // "bridge" = "placeholder0" in host computation
-  // "placeholder2" (for "bridge") in outside compilation 1
-  // "add2" = "placeholder2" + "placeholder2" in outside compilation 1
-  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  Output const0 = ops::Const(s.WithOpName("const0"), 1, {});
-  Output placeholder0 =
-      ops::Placeholder(s.WithOpName("placeholder0"), DT_INT32);
-  Output add0 = ops::Add(s.WithOpName("add0"), placeholder0, placeholder0);
-  Output placeholder1 =
-      ops::Placeholder(s.WithOpName("placeholder1"), DT_INT32);
-  Output add1 = ops::Add(s.WithOpName("add1"), placeholder1, placeholder1);
-  Output bridge = ops::Identity(s.WithOpName("bridge"), placeholder0);
-  Output placeholder2 =
-      ops::Placeholder(s.WithOpName("placeholder2"), DT_INT32);
-  Output add2 = ops::Add(s.WithOpName("add2"), placeholder2, placeholder2);
-  Graph g(OpRegistry::Global());
-  TF_CHECK_OK(s.ToGraph(&g));
-  auto node_index = g.BuildNodeNameIndex();
-
-  // Set related attributes.
-  Node *placeholder0_node = node_index["placeholder0"];
-  placeholder0_node->AddAttr(kOutsideCompilationToHostOriginalNodeAttrName,
-                             "const0");
-  placeholder0_node->AddAttr(kOutsideCompilationToHostSrcOutputAttrName, 0);
-  Node *placeholder1_node = node_index["placeholder1"];
-  placeholder1_node->AddAttr(kHostToOutsideCompilationOriginalNodeAttrName,
-                             "add0");
-  placeholder1_node->AddAttr(kHostToOutsideCompilationSrcOutputAttrName, 0);
-  Node *bridge_node = node_index["bridge"];
-  bridge_node->AddAttr(kBridgeSourceNodeAttrName, "const0");
-  Node *placeholder2_node = node_index["placeholder2"];
-  placeholder2_node->AddAttr(kHostToOutsideCompilationOriginalNodeAttrName,
-                             "bridge");
-  placeholder2_node->AddAttr(kHostToOutsideCompilationSrcOutputAttrName, 0);
-
-  std::unordered_map<string, XlaClusterInfo> clusters;
-  TF_CHECK_OK(PostprocessForEncapsulation(&g, "_xla", "_oc", clusters));
-
-  // Result graph should be:
-  // "add0" = "const0" + "const0"
-  // "add1" = "add0" + "add0"
-  // "add2" = "const0" + "const0"
-  node_index = g.BuildNodeNameIndex();
-  EXPECT_EQ(node_index.size(), 6);
-  EXPECT_EQ(node_index["add0"]->def().input(0), "const0:0");
-  EXPECT_EQ(node_index["add0"]->def().input(1), "const0:0");
-  EXPECT_EQ(node_index["add1"]->def().input(0), "add0:0");
-  EXPECT_EQ(node_index["add1"]->def().input(1), "add0:0");
-  EXPECT_EQ(node_index["add2"]->def().input(0), "const0:0");
-  EXPECT_EQ(node_index["add2"]->def().input(1), "const0:0");
-}
-
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc b/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc
index d334100aa4a915a87fb05d371e0e3379a7ee05f2..ec745cdbb7e237f8b4935dd41e9791fc75f5355d 100644
--- a/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc
+++ b/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc
@@ -297,6 +297,7 @@ Status RewriteSubgraph(const std::vector<OutputTensor>& arg_source_tensors,
 
     NodeDef def;
     def.set_name(launch->name());
+    MergeDebugInfo(NodeDebugInfo(launch->def()), &def);
 
     // Target the XLA CPU/GPU backends.
     VLOG(2) << "Replacing with XlaLaunch";
diff --git a/tensorflow/compiler/jit/extract_outside_compilation_pass.cc b/tensorflow/compiler/jit/extract_outside_compilation_pass.cc
index e3c7e2f89be9b37b51a633dabb099969c181013f..8b01768c49422b331b52a8ba31bade000c95722e 100644
--- a/tensorflow/compiler/jit/extract_outside_compilation_pass.cc
+++ b/tensorflow/compiler/jit/extract_outside_compilation_pass.cc
@@ -20,8 +20,10 @@ limitations under the License.
 #include "tensorflow/compiler/jit/encapsulate_subgraphs_pass.h"
 #include "tensorflow/compiler/jit/encapsulate_util.h"
 #include "tensorflow/compiler/tf2xla/dump_graph.h"
+#include "tensorflow/compiler/tf2xla/side_effect_util.h"
 #include "tensorflow/compiler/tf2xla/tf2xla_util.h"
 #include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/graph_to_functiondef.h"
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/node_def_util.h"
@@ -98,9 +100,12 @@ xla::StatusOr<Node*> BuildRecvAtHostNode(
   recv_at_host_builder.Attr("Toutputs", recv_at_host_dtypes);
   // The correct device_ordinal will be inserted during replication in a
   // subsequent rewrite.
-  recv_at_host_builder.Attr("device_ordinal", 0);
+  AttrValue device_ordinal_value;
+  device_ordinal_value.set_placeholder("device_ordinal");
+  recv_at_host_builder.Attr("device_ordinal", device_ordinal_value);
   recv_at_host_builder.Attr(
       "key", absl::StrCat("host_compute_channel_", oc_cluster_name));
+  recv_at_host_builder.Attr(kXlaHasHostTransferAttrName, true);
   recv_at_host_builder.Input(key_placeholder->name(), 0, DT_STRING);
   TF_RETURN_IF_ERROR(recv_at_host_builder.Finalize(&recv_at_host_def));
   Status s;
@@ -197,9 +202,12 @@ xla::StatusOr<Node*> BuildSendFromHostNode(
   send_from_host_builder.Attr("Tinputs", send_from_host_dtypes);
   // The correct device_ordinal will be inserted during replication in a
   // subsequent rewrite.
-  send_from_host_builder.Attr("device_ordinal", 0);
+  AttrValue device_ordinal_value;
+  device_ordinal_value.set_placeholder("device_ordinal");
+  send_from_host_builder.Attr("device_ordinal", device_ordinal_value);
   send_from_host_builder.Attr(
       "key", absl::StrCat("host_compute_channel_", oc_cluster_name));
+  send_from_host_builder.Attr(kXlaHasHostTransferAttrName, true);
   std::vector<NodeDefBuilder::NodeOut> inputs(send_from_host_dtypes.size());
   for (auto* n : ret_nodes) {
     int index;
@@ -322,6 +330,38 @@ xla::StatusOr<NodeDef> BuildXlaHostComputeNodeDef(
   return new_def;
 }
 
+Status ValidateOutsideCompilationCallNode(Node* call_node) {
+  // DT_INT64 as input/output for outside compilation is not supported yet:
+  // b/120809951.
+  for (const Edge* e : call_node->in_edges()) {
+    if (e->IsControlEdge()) {
+      continue;
+    }
+    DataType dtype = e->src()->output_type(e->src_output());
+    if (dtype == DT_INT64) {
+      return errors::Unimplemented(
+          "int64 input for outside compilation is not supported yet: "
+          "b/120809951. Please cast output of node ",
+          e->src()->DebugString(),
+          " to int32 before feeding it into outside compilation.");
+    }
+  }
+  for (const Edge* e : call_node->out_edges()) {
+    if (e->IsControlEdge()) {
+      continue;
+    }
+    DataType dtype = e->dst()->input_type(e->dst_input());
+    if (dtype == DT_INT64) {
+      return errors::Unimplemented(
+          "int64 output for outside compilation is not supported yet: "
+          "b/120809951. Please cast input of node ",
+          e->dst()->DebugString(),
+          " to int32 before returning it from outside compilation.");
+    }
+  }
+  return Status::OK();
+}
+
 // Replace outside compilation function call node with XlaHostCompute node.
 // If the function call node has no input/output edges, we will just remove it
 // and not create a XlaHostCompute node.
@@ -357,6 +397,47 @@ Status ReplaceOrRemoveOutsideCompilationCallNode(
   return Status::OK();
 }
 
+// Resets "device_ordinal" attr to placeholder value for related nodes
+// (XlaRecvAtHost nodes; XlaSendFromHost nodes; If nodes containing
+// XlaRecvAtHost/XlaSendFromHost).
+Status ResetDeviceOrdinalToPlaceholderValue(Graph* g) {
+  AttrValue device_ordinal_value;
+  device_ordinal_value.set_placeholder("device_ordinal");
+  for (Node* n : g->nodes()) {
+    if (!HasNodeAttr(n->def(), kXlaHasHostTransferAttrName)) {
+      continue;
+    }
+
+    if (n->type_string() == "_XlaRecvAtHost" ||
+        n->type_string() == "_XlaSendFromHost") {
+      n->ClearAttr("device_ordinal");
+      n->AddAttr("device_ordinal", device_ordinal_value);
+    } else if (n->type_string() == "If") {
+      for (const string& attr_name :
+           std::vector<string>{"then_branch", "else_branch"}) {
+        NameAttrList branch_func;
+        TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), attr_name, &branch_func));
+        (*branch_func.mutable_attr())["device_ordinal"] = device_ordinal_value;
+        n->ClearAttr(attr_name);
+        n->AddAttr(attr_name, branch_func);
+      }
+    } else if (n->type_string() == "While") {
+      for (const string& attr_name : std::vector<string>{"cond", "body"}) {
+        NameAttrList branch_func;
+        TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), attr_name, &branch_func));
+        (*branch_func.mutable_attr())["device_ordinal"] = device_ordinal_value;
+        n->ClearAttr(attr_name);
+        n->AddAttr(attr_name, branch_func);
+      }
+    } else {
+      return errors::Internal("Unknown node marked with ",
+                              kXlaHasHostTransferAttrName, ": ",
+                              n->DebugString());
+    }
+  }
+  return Status::OK();
+}
+
 // For an XLA computation, builds host side graph given all outside compilation
 // graphs inside it. The host side graph contains:
 // 1) a "sequencer" node (we will add control edge between XlaRecvAtHost and
@@ -368,8 +449,8 @@ Status ReplaceOrRemoveOutsideCompilationCallNode(
 Status ConstructHostGraph(
     const string& xla_cluster_name, const string& outside_compilation_attr_name,
     const std::vector<string>& outside_compilation_host_graphs,
-    FunctionLibraryDefinition* fld, std::unique_ptr<Graph>* host_graph) {
-  host_graph->reset(new Graph(fld));
+    FunctionLibraryDefinition* fld, const string& host_graph_func_name) {
+  Graph host_graph(fld);
 
   // Create sequencer node in host graph.
   NodeDefBuilder sequencer_builder(absl::StrCat(xla_cluster_name, "_sequencer"),
@@ -378,24 +459,34 @@ Status ConstructHostGraph(
   NodeDef sequencer_def;
   TF_RETURN_IF_ERROR(sequencer_builder.Finalize(&sequencer_def));
   Status s;
-  Node* sequencer = (*host_graph)->AddNode(sequencer_def, &s);
+  Node* sequencer = host_graph.AddNode(sequencer_def, &s);
   TF_RETURN_IF_ERROR(s);
 
   // Create key placeholder in host graph.
   TF_ASSIGN_OR_RETURN(
       Node * key_placeholder,
-      AddHostComputeKeyPlaceholder(xla_cluster_name, host_graph->get()));
+      AddHostComputeKeyPlaceholder(xla_cluster_name, &host_graph));
 
   // For each outside compilation graph, copy them to host graph with the
   // following changes:
   // a) Use key_placeholder in host graph instead of its own.
-  // b) Add control edge from RecvAtHost/SendFromHost to sequencer.
+  // b) Add control edge from host transfer nodes (XlaRecvAtHost,
+  //    XlaSendFromHost, If/While nodes containing
+  //    XlaRecvAtHost/XlaSendFromHost) to sequencer node.
   // c) Clear node_def.device(), so device placer won't get confused.
   for (const string& host_func : outside_compilation_host_graphs) {
     VLOG(4) << "Expanding host graph " << host_func;
+    // Temporarily use "0" as "device_ordinal". It will be reset to placeholder
+    // value after we expanded all host graphs. We cannot just use placeholder
+    // value here because FunctionDef instantiation does not allow placeholder
+    // value for attributes.
+    AttrValue device_ordinal_attr;
+    device_ordinal_attr.set_i(0);
+    protobuf::Map<string, AttrValue> attrs;
+    attrs["device_ordinal"] = device_ordinal_attr;
     FunctionBody* host_fbody = nullptr;
     TF_RETURN_IF_ERROR(FunctionDefToBodyHelper(
-        *fld->Find(host_func), AttrSlice(), fld,
+        *fld->Find(host_func), AttrSlice(&attrs), fld,
         [&](const string& op, const OpDef** sig) {
           return fld->LookUpOpDef(op, sig);
         },
@@ -408,8 +499,8 @@ Status ConstructHostGraph(
     FixupSourceAndSinkEdges(host_fbody->graph);
 
     std::map<const Node*, Node*> node_map;
-    node_map[host_fbody->graph->source_node()] = (*host_graph)->source_node();
-    node_map[host_fbody->graph->sink_node()] = (*host_graph)->sink_node();
+    node_map[host_fbody->graph->source_node()] = host_graph.source_node();
+    node_map[host_fbody->graph->sink_node()] = host_graph.sink_node();
     Status s;
     ReverseDFS(
         *host_fbody->graph, /*enter=*/nullptr,
@@ -431,7 +522,7 @@ Status ConstructHostGraph(
             NodeDef copy_def = n->def();
             // Change c).
             copy_def.clear_device();
-            copy = (*host_graph)->AddNode(copy_def, &s);
+            copy = host_graph.AddNode(copy_def, &s);
             if (!s.ok()) {
               return;
             }
@@ -446,22 +537,23 @@ Status ConstructHostGraph(
                                    e->src()->DebugString());
               return;
             }
-            (*host_graph)
-                ->AddEdge(node_map[e->src()], e->src_output(), copy,
-                          e->dst_input());
+            host_graph.AddEdge(node_map[e->src()], e->src_output(), copy,
+                               e->dst_input());
           }
 
           // Change b).
-          if (copy->type_string() == "_XlaRecvAtHost" ||
-              copy->type_string() == "_XlaSendFromHost") {
-            (*host_graph)->AddControlEdge(copy, sequencer);
+          if (HasNodeAttr(copy->def(), kXlaHasHostTransferAttrName)) {
+            host_graph.AddControlEdge(copy, sequencer);
           }
         },
         NodeComparatorID());
+
     if (!s.ok()) {
       return s;
     }
   }
+  // Reset "device_ordinal" to placeholder value.
+  TF_RETURN_IF_ERROR(ResetDeviceOrdinalToPlaceholderValue(&host_graph));
 
   // sequencer and key_placeholder might be dead nodes. Prune them if necessary.
   // - sequencer should be pruned iff it has no input control edges from
@@ -470,21 +562,30 @@ Status ConstructHostGraph(
   // - key_placeholder should be pruned iff there's no RecvAtHost/SendFromHost.
   //   We don't need to do anything special.
   if (!sequencer->in_edges().empty()) {
-    (*host_graph)->AddControlEdge(sequencer, (*host_graph)->sink_node());
+    host_graph.AddControlEdge(sequencer, host_graph.sink_node());
   }
   PruneForReverseReachability(
-      host_graph->get(),
-      std::unordered_set<const Node*>{(*host_graph)->sink_node()});
+      &host_graph, std::unordered_set<const Node*>{host_graph.sink_node()});
 
   // Postprocess edges between different outside compilations.
   TF_RETURN_IF_ERROR(PostprocessEdgesBetweenOutsideCompilations(
-      host_graph->get(), outside_compilation_attr_name));
+      &host_graph, outside_compilation_attr_name));
 
   if (VLOG_IS_ON(4)) {
     dump_graph::DumpGraphToFile(
         absl::StrCat("extract_outside_compilation_host_graph_for_",
                      xla_cluster_name),
-        **host_graph, fld);
+        host_graph, fld);
+  }
+
+  FunctionDef host_graph_fdef;
+  TF_RETURN_IF_ERROR(
+      GraphToFunctionDef(host_graph, host_graph_func_name, &host_graph_fdef));
+  if (fld->Find(host_graph_func_name)) {
+    TF_RETURN_IF_ERROR(
+        fld->ReplaceFunction(host_graph_func_name, host_graph_fdef));
+  } else {
+    TF_RETURN_IF_ERROR(fld->AddFunctionDef(host_graph_fdef));
   }
 
   return Status::OK();
@@ -492,8 +593,28 @@ Status ConstructHostGraph(
 
 // Expand XLA computation's outside compilation host side graph into main graph.
 // Add a control edge between sequencer node and the XLA computation node.
-Status ExpandHostGraphIntoMainGraph(Graph* main_graph, Graph* host_graph,
+Status ExpandHostGraphIntoMainGraph(Graph* main_graph,
+                                    FunctionLibraryDefinition* fld,
+                                    const string& host_graph_func_name,
                                     Node* xla_computation_node) {
+  // Temporarily use "0" as "device_ordinal". It will be rewritten with the
+  // correct value in a later pass. We cannot just use placeholder value here
+  // because FunctionDef instantiation does not allow placeholder value for
+  // attributes.
+  AttrValue device_ordinal_attr;
+  device_ordinal_attr.set_i(0);
+  protobuf::Map<string, AttrValue> attrs;
+  attrs["device_ordinal"] = device_ordinal_attr;
+  FunctionBody* fbody = nullptr;
+  TF_RETURN_IF_ERROR(FunctionDefToBodyHelper(
+      *fld->Find(host_graph_func_name), AttrSlice(&attrs), fld,
+      [&](const string& op, const OpDef** sig) {
+        return fld->LookUpOpDef(op, sig);
+      },
+      &fbody));
+  std::unique_ptr<FunctionBody> fbody_deleter(fbody);
+  Graph* host_graph = fbody->graph;
+
   // We use ReverseDFS() to copy nodes. Make sure all nodes are reverse
   // reachable from sink node so all nodes will be copied.
   // TODO(b/77601805): consolidate copy graph functions.
@@ -545,23 +666,25 @@ Status ExpandHostGraphIntoMainGraph(Graph* main_graph, Graph* host_graph,
   return s;
 }
 
-// Rewrites shape inference graph for outside compilation.
-// 1. If the outside compilation is a "top-level" one (not in a function of any
-//    If/While/etc.), this shape inference graph might have host computation to
-//    outside compilation placeholder nodes, which will cause shape inference to
-//    fail. However, those nodes are not in `host_graph` any more (because we
-//    have executed `PostprocessForEncapsultion`). In this case, we clear the
-//    graph, and copy SendFromHost with all its predecessors from `host_graph`.
-//    This case is detected by whether the SendFromHost node exists in
-//    `host_graph` as well.
-// 2. Remove control edges, and prune nodes that are not useful for shape
-//    inference.
+// Rewrites shape inference graph for outside compilation:
+// 1) If XlaSendFromHost also exists in `host_graph`, copy nodes from
+//    `host_graph`. Because we might still have outside compilation to outside
+//    compilation placeholder nodes in shape inference graph, which will prevent
+//    us from inferring XlaSendFromHost shape. But in `host_graph`, we already
+//    removed those placeholder nodes.
+// 2) Remove control edges.
+// 3) Prune nodes that are not useful for shape inference.
 Status RewriteShapeInferenceGraph(const string& shape_inference_graph_name,
                                   Graph* host_graph,
                                   FunctionLibraryDefinition* fld) {
+  // Use "0" as "device_ordinal". It does not matter for shape inference.
+  AttrValue device_ordinal_attr;
+  device_ordinal_attr.set_i(0);
+  protobuf::Map<string, AttrValue> attrs;
+  attrs["device_ordinal"] = device_ordinal_attr;
   FunctionBody* fbody = nullptr;
   TF_RETURN_IF_ERROR(FunctionDefToBodyHelper(
-      *fld->Find(shape_inference_graph_name), AttrSlice(), fld,
+      *fld->Find(shape_inference_graph_name), AttrSlice(&attrs), fld,
       [&](const string& op, const OpDef** sig) {
         return fld->LookUpOpDef(op, sig);
       },
@@ -650,6 +773,7 @@ Status RewriteShapeInferenceGraph(const string& shape_inference_graph_name,
       g->RemoveEdge(e);
     }
   }
+
   // Nodes that are not reverse reachable from SendFromHost are not useful for
   // shape inference. Prune them.
   PruneForReverseReachability(g,
@@ -669,6 +793,572 @@ Status RewriteShapeInferenceGraph(const string& shape_inference_graph_name,
   return Status::OK();
 }
 
+// Builds XlaSendToHost node which sends cond predicate to host.
+xla::StatusOr<Node*> BuildSendIfPredNode(const string& name,
+                                         const string& host_transfer_key,
+                                         Node* pred_node, Graph* g) {
+  NodeDefBuilder send_pred_builder(name, "XlaSendToHost");
+  send_pred_builder.Attr("Tinput", DT_BOOL);
+  send_pred_builder.Attr("key", absl::StrCat(host_transfer_key, "_dtoh_0"));
+  send_pred_builder.Attr(kXlaTokenInputNodesAttrName,
+                         std::vector<string>{kXlaTokenArgNodeName});
+  send_pred_builder.Input(pred_node->name(), 0, DT_BOOL);
+  NodeDef send_pred_def;
+  TF_RETURN_IF_ERROR(send_pred_builder.Finalize(&send_pred_def));
+  Status s;
+  Node* send_pred_node = g->AddNode(send_pred_def, &s);
+  TF_RETURN_IF_ERROR(s);
+  g->AddEdge(pred_node, 0, send_pred_node, 0);
+  return send_pred_node;
+}
+
+// Replaces key placeholder node with an _Arg node.
+Status ReplaceKeyPlaceholderWithArgNode(const string& xla_cluster_name,
+                                        const string& func_name,
+                                        FunctionLibraryDefinition* fld) {
+  // Temporarily use "0" as "device_ordinal". It will be reset to placeholder
+  // value after rewriting.
+  AttrValue device_ordinal_attr;
+  device_ordinal_attr.set_i(0);
+  protobuf::Map<string, AttrValue> attrs;
+  attrs["device_ordinal"] = device_ordinal_attr;
+  FunctionBody* fbody = nullptr;
+  TF_RETURN_IF_ERROR(FunctionDefToBodyHelper(
+      *fld->Find(func_name), AttrSlice(&attrs), fld,
+      [&](const string& op, const OpDef** sig) {
+        return fld->LookUpOpDef(op, sig);
+      },
+      &fbody));
+  std::unique_ptr<FunctionBody> fbody_deleter(fbody);
+  Graph* g = fbody->graph;
+
+  // Find or create the key placeholder node.
+  Node* key_placeholder = nullptr;
+  for (Node* n : g->nodes()) {
+    if (IsKeyPlaceholderNode(*n)) {
+      key_placeholder = n;
+      break;
+    }
+  }
+  if (!key_placeholder) {
+    TF_ASSIGN_OR_RETURN(key_placeholder,
+                        AddHostComputeKeyPlaceholder(xla_cluster_name, g));
+  }
+
+  // Build the _Arg node, and replace key placeholder node with it.
+  NodeDefBuilder arg_builder("key_arg", FunctionLibraryDefinition::kArgOp);
+  arg_builder.Attr("T", DT_STRING);
+  arg_builder.Attr("index", 0);
+  NodeDef arg_def;
+  TF_RETURN_IF_ERROR(arg_builder.Finalize(&arg_def));
+  TF_RETURN_IF_ERROR(ReplaceNode(g, key_placeholder, arg_def).status());
+
+  // Reset "device_ordinal" to placeholder value.
+  TF_RETURN_IF_ERROR(ResetDeviceOrdinalToPlaceholderValue(g));
+
+  FunctionDef replace_fdef;
+  TF_RETURN_IF_ERROR(GraphToFunctionDef(*g, func_name, &replace_fdef));
+  TF_RETURN_IF_ERROR(fld->ReplaceFunction(func_name, replace_fdef));
+  return Status::OK();
+}
+
+// Builds host side graph for If node.
+Status BuildHostGraphForIfNode(const string& xla_cluster_attr_name,
+                               const string& outside_compilation_attr_name,
+                               const string& xla_cluster_name,
+                               const string& if_node_name,
+                               const string& host_transfer_key,
+                               const string& host_graph_func_name,
+                               FunctionLibraryDefinition* fld,
+                               const string& then_branch_host_func_name,
+                               const string& else_branch_host_func_name) {
+  Graph host_graph(fld);
+  string outside_compilation_name = absl::StrCat("oc_if_", if_node_name);
+  AttrValue device_ordinal_value;
+  device_ordinal_value.set_placeholder("device_ordinal");
+
+  // Step 1: add key placeholder node.
+  TF_ASSIGN_OR_RETURN(
+      Node * key_placeholder,
+      AddHostComputeKeyPlaceholder(xla_cluster_name, &host_graph));
+
+  // Step 2: build XlaRecvAtHost node to recv predicate.
+  NodeDefBuilder recv_pred_builder(
+      absl::StrCat("recv_oc_if_pred_", if_node_name), "_XlaRecvAtHost");
+  recv_pred_builder.Attr("Toutputs", std::vector<DataType>{DT_BOOL});
+  recv_pred_builder.Attr("key", host_transfer_key);
+  recv_pred_builder.Attr("device_ordinal", device_ordinal_value);
+  recv_pred_builder.Attr(xla_cluster_attr_name, xla_cluster_name);
+  recv_pred_builder.Attr(outside_compilation_attr_name,
+                         outside_compilation_name);
+  recv_pred_builder.Attr(kXlaHasHostTransferAttrName, true);
+  recv_pred_builder.Input(key_placeholder->name(), 0, DT_STRING);
+  NodeDef recv_pred_def;
+  TF_RETURN_IF_ERROR(recv_pred_builder.Finalize(&recv_pred_def));
+  Status s;
+  Node* recv_pred_node = host_graph.AddNode(recv_pred_def, &s);
+  TF_RETURN_IF_ERROR(s);
+  host_graph.AddEdge(key_placeholder, 0, recv_pred_node, 0);
+
+  // Step 3: rewrite `{then, else}_branch_host_func_name`, replace key
+  // placeholder with an _Arg node.
+  TF_RETURN_IF_ERROR(ReplaceKeyPlaceholderWithArgNode(
+      xla_cluster_name, then_branch_host_func_name, fld));
+  TF_RETURN_IF_ERROR(ReplaceKeyPlaceholderWithArgNode(
+      xla_cluster_name, else_branch_host_func_name, fld));
+
+  // Step 4: build If node to choose between `{then, else}_branch_host_graph`.
+  NodeDefBuilder if_builder(absl::StrCat("oc_if_", if_node_name), "If");
+  if_builder.Attr("Tcond", DT_BOOL);
+  if_builder.Attr("Tin", std::vector<DataType>{DT_STRING});
+  if_builder.Attr("Tout", std::vector<DataType>{});
+  NameAttrList host_then_branch, host_else_branch;
+  host_then_branch.set_name(then_branch_host_func_name);
+  (*host_then_branch.mutable_attr())["device_ordinal"] = device_ordinal_value;
+  host_else_branch.set_name(else_branch_host_func_name);
+  (*host_else_branch.mutable_attr())["device_ordinal"] = device_ordinal_value;
+  if_builder.Attr("then_branch", host_then_branch);
+  if_builder.Attr("else_branch", host_else_branch);
+  if_builder.Attr(kXlaHasHostTransferAttrName, true);
+  if_builder.Attr(xla_cluster_attr_name, xla_cluster_name);
+  if_builder.Attr(outside_compilation_attr_name, outside_compilation_name);
+  if_builder.Input(recv_pred_node->name(), 0, DT_BOOL);
+  std::vector<NodeDefBuilder::NodeOut> if_inputs{
+      {key_placeholder->name(), 0, DT_STRING}};
+  if_builder.Input(if_inputs);
+  NodeDef if_def;
+  TF_RETURN_IF_ERROR(if_builder.Finalize(&if_def));
+  Node* if_node = host_graph.AddNode(if_def, &s);
+  TF_RETURN_IF_ERROR(s);
+  host_graph.AddEdge(recv_pred_node, 0, if_node, 0);
+  host_graph.AddEdge(key_placeholder, 0, if_node, 1);
+
+  // Convert `host_graph` to function, and add a "device_ordinal" attr.
+  FunctionDef oc_host_graph_fdef;
+  TF_RETURN_IF_ERROR(GraphToFunctionDef(host_graph, host_graph_func_name,
+                                        &oc_host_graph_fdef));
+  if (fld->Find(host_graph_func_name)) {
+    TF_RETURN_IF_ERROR(
+        fld->ReplaceFunction(host_graph_func_name, oc_host_graph_fdef));
+  } else {
+    TF_RETURN_IF_ERROR(fld->AddFunctionDef(oc_host_graph_fdef));
+  }
+
+  return Status::OK();
+}
+
+// Rewrites loop cond to add a node which sends loop cond to host.
+Status AddSendLoopPredToLoopCond(FunctionLibraryDefinition* fld,
+                                 const NameAttrList& loop_cond_func,
+                                 const string& while_node_name,
+                                 const string& host_transfer_key) {
+  // Instantiate the loop cond function.
+  FunctionBody* fbody = nullptr;
+  TF_RETURN_IF_ERROR(FunctionDefToBodyHelper(
+      *fld->Find(loop_cond_func.name()), AttrSlice(&loop_cond_func.attr()), fld,
+      [&](const string& op, const OpDef** sig) {
+        return fld->LookUpOpDef(op, sig);
+      },
+      &fbody));
+  std::unique_ptr<FunctionBody> fbody_deleter(fbody);
+  Graph* g = fbody->graph;
+
+  // Find the _Retval node and the loop cond node.
+  Node* ret_node = nullptr;
+  for (Node* n : g->nodes()) {
+    if (n->type_string() == "_Retval") {
+      if (ret_node) {
+        return errors::Internal("Multiple return node for loop cond function ",
+                                loop_cond_func.name(), ": ",
+                                ret_node->DebugString(), " and ",
+                                n->DebugString());
+      } else {
+        ret_node = n;
+      }
+    }
+  }
+  if (!ret_node) {
+    return errors::Internal("No _Retval node for loop cond function ",
+                            loop_cond_func.name());
+  }
+  Node* loop_cond;
+  TF_RETURN_IF_ERROR(ret_node->input_node(0, &loop_cond));
+
+  // Build the XlaSendToHost node.
+  NodeDefBuilder send_loop_cond_builder(
+      absl::StrCat("send_oc_while_cond_", while_node_name), "XlaSendToHost");
+  send_loop_cond_builder.Attr("Tinput", DT_BOOL);
+  send_loop_cond_builder.Attr("key",
+                              absl::StrCat(host_transfer_key, "_dtoh_0"));
+  send_loop_cond_builder.Attr(kXlaTokenInputNodesAttrName,
+                              std::vector<string>{kXlaTokenArgNodeName});
+  send_loop_cond_builder.Input(loop_cond->name(), 0, DT_BOOL);
+  NodeDef send_loop_cond_def;
+  TF_RETURN_IF_ERROR(send_loop_cond_builder.Finalize(&send_loop_cond_def));
+  Status s;
+  Node* send_loop_cond_node = g->AddNode(send_loop_cond_def, &s);
+  TF_RETURN_IF_ERROR(s);
+  g->AddEdge(loop_cond, 0, send_loop_cond_node, 0);
+
+  // Replace original function.
+  FunctionDef replace_fdef;
+  TF_RETURN_IF_ERROR(
+      GraphToFunctionDef(*g, loop_cond_func.name(), &replace_fdef));
+  TF_RETURN_IF_ERROR(fld->ReplaceFunction(loop_cond_func.name(), replace_fdef));
+
+  return Status::OK();
+}
+
+// Rewrites while loop cond function for host.
+Status RewriteHostWhileLoopCond(
+    const string& cond_host_func_name, const string& while_node_name,
+    const string& host_transfer_key, const string& xla_cluster_attr_name,
+    const string& xla_cluster_name, const string& outside_compilation_attr_name,
+    const string& outside_compilation_name, FunctionLibraryDefinition* fld) {
+  // Replace key placeholder node with _Arg node.
+  TF_RETURN_IF_ERROR(ReplaceKeyPlaceholderWithArgNode(
+      xla_cluster_name, cond_host_func_name, fld));
+
+  // Instantiate cond function.
+  AttrValue device_ordinal_temp_value;
+  device_ordinal_temp_value.set_i(0);
+  protobuf::Map<string, AttrValue> attrs;
+  attrs["device_ordinal"] = device_ordinal_temp_value;
+  FunctionBody* cond_fbody = nullptr;
+  TF_RETURN_IF_ERROR(FunctionDefToBodyHelper(
+      *fld->Find(cond_host_func_name), AttrSlice(&attrs), fld,
+      [&](const string& op, const OpDef** sig) {
+        return fld->LookUpOpDef(op, sig);
+      },
+      &cond_fbody));
+  std::unique_ptr<FunctionBody> cond_fbody_deleter(cond_fbody);
+  Graph* cond_graph = cond_fbody->graph;
+  Node* key_arg = nullptr;
+  for (Node* n : cond_graph->nodes()) {
+    if (n->type_string() == "_Arg") {
+      key_arg = n;
+    }
+  }
+  if (!key_arg) {
+    return errors::Internal(
+        "No _Arg node found for host compute key in function ",
+        cond_host_func_name);
+  }
+
+  // Add an XlaRecvAtHost node to use as cond function return value.
+  // We don't need to set kXlaHasHostTransferAttrName for this node, because
+  // it's already added for the "While" node on the host.
+  NodeDefBuilder recv_pred_builder(
+      absl::StrCat("recv_oc_while_cond_", while_node_name), "_XlaRecvAtHost");
+  recv_pred_builder.Attr("Toutputs", std::vector<DataType>{DT_BOOL});
+  recv_pred_builder.Attr("key", host_transfer_key);
+  AttrValue device_ordinal_value;
+  device_ordinal_value.set_placeholder("device_ordinal");
+  recv_pred_builder.Attr("device_ordinal", device_ordinal_value);
+  recv_pred_builder.Attr(xla_cluster_attr_name, xla_cluster_name);
+  recv_pred_builder.Attr(outside_compilation_attr_name,
+                         outside_compilation_name);
+  recv_pred_builder.Input(key_arg->name(), 0, DT_STRING);
+  NodeDef recv_pred_def;
+  TF_RETURN_IF_ERROR(recv_pred_builder.Finalize(&recv_pred_def));
+  Status s;
+  Node* recv_pred_node = cond_graph->AddNode(recv_pred_def, &s);
+  TF_RETURN_IF_ERROR(s);
+  cond_graph->AddEdge(key_arg, 0, recv_pred_node, 0);
+  NodeDefBuilder ret_builder(
+      absl::StrCat("recv_oc_while_cond_ret_", while_node_name), "_Retval");
+  ret_builder.Attr("T", DT_BOOL);
+  ret_builder.Attr("index", 0);
+  ret_builder.Input(recv_pred_node->name(), 0, DT_BOOL);
+  NodeDef ret_def;
+  TF_RETURN_IF_ERROR(ret_builder.Finalize(&ret_def));
+  Node* ret_node = cond_graph->AddNode(ret_def, &s);
+  TF_RETURN_IF_ERROR(s);
+  cond_graph->AddEdge(recv_pred_node, 0, ret_node, 0);
+
+  // Reset device_ordinal to placeholder value.
+  TF_RETURN_IF_ERROR(ResetDeviceOrdinalToPlaceholderValue(cond_graph));
+
+  // Replace original function.
+  FunctionDef cond_replace_fdef;
+  TF_RETURN_IF_ERROR(
+      GraphToFunctionDef(*cond_graph, cond_host_func_name, &cond_replace_fdef));
+  TF_RETURN_IF_ERROR(
+      fld->ReplaceFunction(cond_host_func_name, cond_replace_fdef));
+
+  return Status::OK();
+}
+
+// Rewrites while loop body function for host.
+Status RewriteHostWhileLoopBody(
+    const string& body_host_func_name, const string& while_node_name,
+    const string& host_transfer_key, const string& xla_cluster_attr_name,
+    const string& xla_cluster_name, const string& outside_compilation_attr_name,
+    const string& outside_compilation_name, FunctionLibraryDefinition* fld) {
+  // Replace key placeholder node with _Arg node.
+  TF_RETURN_IF_ERROR(ReplaceKeyPlaceholderWithArgNode(
+      xla_cluster_name, body_host_func_name, fld));
+
+  // Instantiate body function.
+  AttrValue device_ordinal_temp_value;
+  device_ordinal_temp_value.set_i(0);
+  protobuf::Map<string, AttrValue> attrs;
+  attrs["device_ordinal"] = device_ordinal_temp_value;
+  FunctionBody* body_fbody = nullptr;
+  TF_RETURN_IF_ERROR(FunctionDefToBodyHelper(
+      *fld->Find(body_host_func_name), AttrSlice(&attrs), fld,
+      [&](const string& op, const OpDef** sig) {
+        return fld->LookUpOpDef(op, sig);
+      },
+      &body_fbody));
+  std::unique_ptr<FunctionBody> body_fbody_deleter(body_fbody);
+  Graph* body_graph = body_fbody->graph;
+  Node* key_arg = nullptr;
+  for (Node* n : body_graph->nodes()) {
+    if (n->type_string() == "_Arg") {
+      key_arg = n;
+    }
+  }
+  if (!key_arg) {
+    return errors::Internal(
+        "No _Arg node found for host compute key in function ",
+        body_host_func_name);
+  }
+
+  // Add a _Retval node to loop body.
+  NodeDefBuilder ret_builder(
+      absl::StrCat("recv_oc_while_body_ret_", while_node_name), "_Retval");
+  ret_builder.Attr("T", DT_STRING);
+  ret_builder.Attr("index", 0);
+  ret_builder.Input(key_arg->name(), 0, DT_STRING);
+  NodeDef ret_def;
+  TF_RETURN_IF_ERROR(ret_builder.Finalize(&ret_def));
+  Status s;
+  Node* ret_node = body_graph->AddNode(ret_def, &s);
+  TF_RETURN_IF_ERROR(s);
+  body_graph->AddEdge(key_arg, 0, ret_node, 0);
+
+  // Reset device_ordinal to placeholder value.
+  TF_RETURN_IF_ERROR(ResetDeviceOrdinalToPlaceholderValue(body_graph));
+
+  // Replace original function.
+  FunctionDef body_replace_fdef;
+  TF_RETURN_IF_ERROR(
+      GraphToFunctionDef(*body_graph, body_host_func_name, &body_replace_fdef));
+  TF_RETURN_IF_ERROR(
+      fld->ReplaceFunction(body_host_func_name, body_replace_fdef));
+
+  return Status::OK();
+}
+
+// Builds host side graph for while node.
+Status BuildHostGraphForWhileNode(
+    const string& xla_cluster_attr_name,
+    const string& outside_compilation_attr_name, const string& xla_cluster_name,
+    const string& while_node_name, const string& host_transfer_key,
+    const string& host_graph_func_name, FunctionLibraryDefinition* fld,
+    const string& cond_host_func_name, const string& body_host_func_name) {
+  Graph host_graph(fld);
+  string outside_compilation_name = absl::StrCat("oc_while_", while_node_name);
+
+  // Step 1: add key placeholder node.
+  TF_ASSIGN_OR_RETURN(
+      Node * key_placeholder,
+      AddHostComputeKeyPlaceholder(xla_cluster_name, &host_graph));
+
+  // Step 2: rewrite cond function.
+  TF_RETURN_IF_ERROR(RewriteHostWhileLoopCond(
+      cond_host_func_name, while_node_name, host_transfer_key,
+      xla_cluster_attr_name, xla_cluster_name, outside_compilation_attr_name,
+      outside_compilation_name, fld));
+
+  // Step 3: rewrite body function.
+  TF_RETURN_IF_ERROR(RewriteHostWhileLoopBody(
+      body_host_func_name, while_node_name, host_transfer_key,
+      xla_cluster_attr_name, xla_cluster_name, outside_compilation_attr_name,
+      outside_compilation_name, fld));
+
+  // Step 4: build While node.
+  NodeDefBuilder while_builder(absl::StrCat("oc_while_", while_node_name),
+                               "While");
+  while_builder.Attr("T", std::vector<DataType>{DT_STRING});
+  NameAttrList func;
+  AttrValue device_ordinal_value;
+  device_ordinal_value.set_placeholder("device_ordinal");
+  (*func.mutable_attr())["device_ordinal"] = device_ordinal_value;
+  func.set_name(cond_host_func_name);
+  while_builder.Attr("cond", func);
+  func.set_name(body_host_func_name);
+  while_builder.Attr("body", func);
+  while_builder.Attr(kXlaHasHostTransferAttrName, true);
+  while_builder.Attr(xla_cluster_attr_name, xla_cluster_name);
+  while_builder.Attr(outside_compilation_attr_name, outside_compilation_name);
+  std::vector<NodeDefBuilder::NodeOut> while_inputs{
+      {key_placeholder->name(), 0, DT_STRING}};
+  while_builder.Input(while_inputs);
+  NodeDef while_def;
+  TF_RETURN_IF_ERROR(while_builder.Finalize(&while_def));
+  Status s;
+  Node* while_node = host_graph.AddNode(while_def, &s);
+  TF_RETURN_IF_ERROR(s);
+  host_graph.AddEdge(key_placeholder, 0, while_node, 0);
+
+  // Convert `host_graph` to function.
+  FunctionDef oc_host_graph_fdef;
+  TF_RETURN_IF_ERROR(GraphToFunctionDef(host_graph, host_graph_func_name,
+                                        &oc_host_graph_fdef));
+  if (fld->Find(host_graph_func_name)) {
+    TF_RETURN_IF_ERROR(
+        fld->ReplaceFunction(host_graph_func_name, oc_host_graph_fdef));
+  } else {
+    TF_RETURN_IF_ERROR(fld->AddFunctionDef(oc_host_graph_fdef));
+  }
+
+  return Status::OK();
+}
+
+Status ExtractOutsideCompilationForNodesWithAssociatedFunctions(
+    Graph* g, const string& xla_cluster_attr_name,
+    const string& outside_compilation_attr_name, const string& xla_cluster_name,
+    const std::map<string, int>& host_compute_core,
+    FunctionLibraryDefinition* fld, std::vector<string>* host_graphs,
+    std::vector<string>* shape_inference_graphs,
+    bool* has_outside_compilation) {
+  std::vector<Node*> if_nodes, while_nodes;
+  for (Node* n : g->nodes()) {
+    if (n->type_string() == "If") {
+      if_nodes.push_back(n);
+    } else if (n->type_string() == "While") {
+      while_nodes.push_back(n);
+    }
+  }
+
+  for (Node* n : if_nodes) {
+    // Instantiate "then_branch" and "else_branch".
+    NameAttrList then_branch, else_branch;
+    TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "then_branch", &then_branch));
+    TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "else_branch", &else_branch));
+
+    // Extract outside compilation for then_branch and else_branch.
+    bool then_branch_has_outside_compilation = false;
+    bool else_branch_has_outside_compilation = false;
+    string then_branch_host_func_name =
+               absl::StrCat("oc_then_branch_host_if_", n->name()),
+           else_branch_host_func_name =
+               absl::StrCat("oc_else_branch_host_if_", n->name());
+    string then_branch_xla_func_name = absl::StrCat(then_branch.name(), "_oc"),
+           else_branch_xla_func_name = absl::StrCat(else_branch.name(), "_oc");
+    TF_RETURN_IF_ERROR(ExtractOutsideCompilationForFunction(
+        xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name,
+        then_branch, then_branch_xla_func_name, then_branch_host_func_name,
+        host_compute_core, fld, shape_inference_graphs,
+        &then_branch_has_outside_compilation));
+    TF_RETURN_IF_ERROR(ExtractOutsideCompilationForFunction(
+        xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name,
+        else_branch, else_branch_xla_func_name, else_branch_host_func_name,
+        host_compute_core, fld, shape_inference_graphs,
+        &else_branch_has_outside_compilation));
+
+    // If then/else branch do not have outside compilation, nothing to do.
+    if (!then_branch_has_outside_compilation &&
+        !else_branch_has_outside_compilation) {
+      continue;
+    }
+
+    *has_outside_compilation = true;
+
+    // Change If node to call the new functions.
+    then_branch.set_name(then_branch_xla_func_name);
+    n->ClearAttr("then_branch");
+    n->AddAttr("then_branch", then_branch);
+    else_branch.set_name(else_branch_xla_func_name);
+    n->ClearAttr("else_branch");
+    n->AddAttr("else_branch", else_branch);
+
+    string host_transfer_key = absl::StrCat("oc_if_pred_", n->name());
+
+    // XLA computation: add a SendToHost node to send cond predicate.
+    Node* pred_node;
+    TF_RETURN_IF_ERROR(n->input_node(0, &pred_node));
+    TF_ASSIGN_OR_RETURN(
+        Node * send_pred_node,
+        BuildSendIfPredNode(absl::StrCat("send_oc_if_pred_", n->name()),
+                            host_transfer_key, pred_node, g));
+    n->AddAttr(kXlaTokenInputNodesAttrName,
+               std::vector<string>{send_pred_node->name()});
+
+    // Add a control edge from `send_pred_node` to If node, so XlaCompiler will
+    // visit If node after `send_pred_node`, thus the token output for
+    // `send_pred_node` has been generated.
+    g->AddControlEdge(send_pred_node, n);
+
+    // Build host side graph for the "If" node.
+    string oc_host_graph_name = absl::StrCat("oc_if_host_graph_", n->name());
+    TF_RETURN_IF_ERROR(BuildHostGraphForIfNode(
+        xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name,
+        n->name(), host_transfer_key, oc_host_graph_name, fld,
+        then_branch_host_func_name, else_branch_host_func_name));
+    host_graphs->push_back(oc_host_graph_name);
+  }
+
+  for (Node* n : while_nodes) {
+    // Instantiate "cond" and "body".
+    NameAttrList cond, body;
+    TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "cond", &cond));
+    TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "body", &body));
+
+    // Extract outside compilation for cond and body.
+    bool cond_has_outside_compilation = false;
+    bool body_has_outside_compilation = false;
+    string cond_host_func_name = absl::StrCat("oc_cond_host_while_", n->name()),
+           body_host_func_name = absl::StrCat("oc_body_host_while_", n->name());
+    string cond_xla_func_name = absl::StrCat(cond.name(), "_oc"),
+           body_xla_func_name = absl::StrCat(body.name(), "_oc");
+    TF_RETURN_IF_ERROR(ExtractOutsideCompilationForFunction(
+        xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name,
+        cond, cond_xla_func_name, cond_host_func_name, host_compute_core, fld,
+        shape_inference_graphs, &cond_has_outside_compilation));
+    TF_RETURN_IF_ERROR(ExtractOutsideCompilationForFunction(
+        xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name,
+        body, body_xla_func_name, body_host_func_name, host_compute_core, fld,
+        shape_inference_graphs, &body_has_outside_compilation));
+
+    // If cond/body do not have outside compilation, nothing to do.
+    if (!cond_has_outside_compilation && !body_has_outside_compilation) {
+      continue;
+    }
+
+    *has_outside_compilation = true;
+
+    // Change While node to call the new functions.
+    cond.set_name(cond_xla_func_name);
+    n->ClearAttr("cond");
+    n->AddAttr("cond", cond);
+    body.set_name(body_xla_func_name);
+    n->ClearAttr("body");
+    n->AddAttr("body", body);
+
+    string host_transfer_key = absl::StrCat("oc_while_pred_", n->name());
+
+    // XLA computation: rewrite cond function to add a SendToHost node to send
+    // loop predicate.
+    TF_RETURN_IF_ERROR(
+        AddSendLoopPredToLoopCond(fld, cond, n->name(), host_transfer_key));
+    n->AddAttr(kXlaTokenInputNodesAttrName,
+               std::vector<string>{kXlaTokenArgNodeName});
+
+    // Build host side graph for the "While" node.
+    string oc_host_graph_name = absl::StrCat("oc_while_host_graph_", n->name());
+    TF_RETURN_IF_ERROR(BuildHostGraphForWhileNode(
+        xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name,
+        n->name(), host_transfer_key, oc_host_graph_name, fld,
+        cond_host_func_name, body_host_func_name));
+    host_graphs->push_back(oc_host_graph_name);
+  }
+
+  return Status::OK();
+}
+
 }  // namespace
 
 Status RewriteOutsideCompilationSubgraphFn::operator()(
@@ -755,12 +1445,15 @@ Status RewriteOutsideCompilationSubgraphFn::operator()(
   // it with HostCompute node later.
   AddNodeAttr("_outside_compilation_subgraph", old_name, node_def);
   if (shapes) {
-    AddNodeAttr("shape_inference_graph", "", node_def);
+    NameAttrList shape_inference_graph;
+    AddNodeAttr("shape_inference_graph", shape_inference_graph, node_def);
     AddNodeAttr("shapes", *shapes, node_def);
   } else {
     string shape_inference_func_name =
         absl::StrCat("_outside_compilation_shape_inference_", new_name);
-    AddNodeAttr("shape_inference_graph", shape_inference_func_name, node_def);
+    NameAttrList shape_inference_graph;
+    shape_inference_graph.set_name(shape_inference_func_name);
+    AddNodeAttr("shape_inference_graph", shape_inference_graph, node_def);
     AddNodeAttr("shapes", std::vector<TensorShapeProto>{}, node_def);
   }
   AddNodeAttr("ancestors", std::vector<string>{}, node_def);
@@ -775,11 +1468,10 @@ Status ExtractOutsideCompilationForFunction(
     const string& xla_cluster_attr_name,
     const string& outside_compilation_attr_name, const string& xla_cluster_name,
     const NameAttrList& func_name_attrs, const string& new_func_name,
+    const string& host_graph_func_name,
     const std::map<string, int>& host_compute_core,
-    FunctionLibraryDefinition* fld, std::unique_ptr<Graph>* host_graph,
-    std::vector<string>* shape_inference_graphs,
+    FunctionLibraryDefinition* fld, std::vector<string>* shape_inference_graphs,
     bool* has_outside_compilation) {
-  // Early return if function does not have any outside compilation nodes.
   const string& func_name = func_name_attrs.name();
   const FunctionDef* fdef = fld->Find(func_name);
   if (!fdef) {
@@ -792,9 +1484,8 @@ Status ExtractOutsideCompilationForFunction(
       break;
     }
   }
-  if (!has_outside_compilation) {
-    return Status::OK();
-  }
+  // We cannot early return here, because we might have outside compilation in
+  // If/While function body.
 
   // Convert the function to graph.
   FunctionBody* fbody = nullptr;
@@ -835,11 +1526,11 @@ Status ExtractOutsideCompilationForFunction(
       // If we could not infer shapes for XlaSendFromHost inputs statically, we
       // will set the "shape_inference_graph" attribute. In that case, copy
       // outside compilation subgraph as shape inference graph in `fld`.
-      string shape_inference_graph;
+      NameAttrList shape_inference_graph;
       TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "shape_inference_graph",
                                      &shape_inference_graph));
-      if (!shape_inference_graph.empty()) {
-        shape_inference_graphs->push_back(shape_inference_graph);
+      if (!shape_inference_graph.name().empty()) {
+        shape_inference_graphs->push_back(shape_inference_graph.name());
 
         const FunctionDef* xla_fdef = fld->Find(n->name());
         if (!xla_fdef) {
@@ -847,9 +1538,9 @@ Status ExtractOutsideCompilationForFunction(
         }
         FunctionDef shape_inference_fdef = *xla_fdef;
         shape_inference_fdef.mutable_signature()->set_name(
-            shape_inference_graph);
-        if (fld->Find(shape_inference_graph)) {
-          TF_RETURN_IF_ERROR(fld->ReplaceFunction(shape_inference_graph,
+            shape_inference_graph.name());
+        if (fld->Find(shape_inference_graph.name())) {
+          TF_RETURN_IF_ERROR(fld->ReplaceFunction(shape_inference_graph.name(),
                                                   shape_inference_fdef));
         } else {
           TF_RETURN_IF_ERROR(fld->AddFunctionDef(shape_inference_fdef));
@@ -858,6 +1549,7 @@ Status ExtractOutsideCompilationForFunction(
     }
   }
   for (Node* n : outside_compilation_nodes) {
+    TF_RETURN_IF_ERROR(ValidateOutsideCompilationCallNode(n));
     TF_RETURN_IF_ERROR(ReplaceOrRemoveOutsideCompilationCallNode(
         graph_out.get(), n, host_compute_core));
   }
@@ -867,12 +1559,17 @@ Status ExtractOutsideCompilationForFunction(
         *graph_out, fld);
   }
 
+  // Handle nodes with associated functions.
+  TF_RETURN_IF_ERROR(ExtractOutsideCompilationForNodesWithAssociatedFunctions(
+      graph_out.get(), xla_cluster_attr_name, outside_compilation_attr_name,
+      xla_cluster_name, host_compute_core, fld,
+      &outside_compilation_host_graphs, shape_inference_graphs,
+      has_outside_compilation));
+
   // Construct host graph.
-  if (!outside_compilation_host_graphs.empty()) {
-    TF_RETURN_IF_ERROR(
-        ConstructHostGraph(xla_cluster_name, outside_compilation_attr_name,
-                           outside_compilation_host_graphs, fld, host_graph));
-  }
+  TF_RETURN_IF_ERROR(ConstructHostGraph(
+      xla_cluster_name, outside_compilation_attr_name,
+      outside_compilation_host_graphs, fld, host_graph_func_name));
 
   // Remove the outside compilation graphs from function library.
   for (const string& func : outside_compilation_host_graphs) {
@@ -909,24 +1606,17 @@ Status ExtractOutsideCompilation(
     auto const& host_compute_core = iter.second.host_compute_core;
 
     bool has_outside_compilation;
-    std::unique_ptr<Graph> host_graph;
+    string host_graph_func_name = absl::StrCat("oc_host_graph_", n->name());
     TF_RETURN_IF_ERROR(ExtractOutsideCompilationForFunction(
         xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name,
-        func_name_attrs, func_name_attrs.name(), host_compute_core, fld,
-        &host_graph, &shape_inference_graphs, &has_outside_compilation));
-    if (host_graph) {
-      TF_RETURN_IF_ERROR(ExpandHostGraphIntoMainGraph(g, host_graph.get(), n));
-    }
-  }
-
-  if (VLOG_IS_ON(4)) {
-    dump_graph::DumpGraphToFile("extract_outside_compilation_expanded", *g,
-                                fld);
+        func_name_attrs, func_name_attrs.name(), host_graph_func_name,
+        host_compute_core, fld, &shape_inference_graphs,
+        &has_outside_compilation));
+    TF_RETURN_IF_ERROR(
+        ExpandHostGraphIntoMainGraph(g, fld, host_graph_func_name, n));
+    TF_RETURN_IF_ERROR(fld->RemoveFunction(host_graph_func_name));
   }
 
-  TF_RETURN_IF_ERROR(PostprocessForEncapsulation(
-      g, xla_cluster_attr_name, outside_compilation_attr_name, clusters));
-
   for (auto shape_inference_graph_name : shape_inference_graphs) {
     TF_RETURN_IF_ERROR(
         RewriteShapeInferenceGraph(shape_inference_graph_name, g, fld));
diff --git a/tensorflow/compiler/jit/extract_outside_compilation_pass.h b/tensorflow/compiler/jit/extract_outside_compilation_pass.h
index 2a4f07cca213d999202024294f5d8f94527059c3..e07e7c5dd0cd42ddd4d643d8b36583c82056bbb2 100644
--- a/tensorflow/compiler/jit/extract_outside_compilation_pass.h
+++ b/tensorflow/compiler/jit/extract_outside_compilation_pass.h
@@ -88,9 +88,10 @@ Status ExtractOutsideCompilationForFunction(
     const string& xla_cluster_attr_name,
     const string& outside_compilation_attr_name, const string& xla_cluster_name,
     const NameAttrList& func_name_attrs, const string& new_func_name,
+    const string& host_graph_func_name,
     const std::map<string, int>& host_compute_core,
-    FunctionLibraryDefinition* fld, std::unique_ptr<Graph>* host_graph,
-    std::vector<string>* shape_inference_graphs, bool* has_outside_compilation);
+    FunctionLibraryDefinition* fld, std::vector<string>* shape_inference_graphs,
+    bool* has_outside_compilation);
 
 // Rewrites XLA computation in `clusters` to replace outside compilation nodes
 // with XlaHostCompute, and moves those outside compilations into `g`. If shapes
diff --git a/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc b/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc
index bff956100da661b679b4557fce53671e6cef88c5..e9a89e34e0c7b04b4be34e367b2d0bf627c0061a 100644
--- a/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc
+++ b/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc
@@ -19,8 +19,10 @@ limitations under the License.
 #include "tensorflow/cc/framework/scope.h"
 #include "tensorflow/cc/ops/array_ops.h"
 #include "tensorflow/cc/ops/function_ops.h"
+#include "tensorflow/cc/ops/functional_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/compiler/jit/encapsulate_util.h"
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/function.h"
@@ -109,10 +111,10 @@ TEST(RewriteOutsideCompilationSubgraphFnTest, Basic) {
   }
   EXPECT_TRUE(has_control_edge_to_send_from_host);
   // Verify step 7: necessary attrs added to call_node_def.
-  string shape_inference_graph;
+  NameAttrList shape_inference_graph;
   TF_CHECK_OK(GetNodeAttr(AttrSlice(&call_node_def.attr()),
                           "shape_inference_graph", &shape_inference_graph));
-  EXPECT_EQ(shape_inference_graph,
+  EXPECT_EQ(shape_inference_graph.name(),
             "_outside_compilation_shape_inference_cluster_0");
 }
 
@@ -249,27 +251,26 @@ TEST(ExtractOutsideCompilationForFunctionTest, Basic) {
 
   protobuf::Map<string, tensorflow::AttrValue> attrs;
   std::map<string, int> host_compute_core = {{"0", 1}, {"1", 0}};
-  std::unique_ptr<Graph> host_graph;
   std::vector<string> shape_inference_graphs;
   bool has_outside_compilation;
   NameAttrList name_attrs;
   name_attrs.set_name("cluster");
   *name_attrs.mutable_attr() = attrs;
   TF_CHECK_OK(ExtractOutsideCompilationForFunction(
-      "_xla", "_oc", "cluster", name_attrs, "cluster_rewritten",
-      host_compute_core, &fld, &host_graph, &shape_inference_graphs,
+      "_xla", "_oc", "cluster", name_attrs, "cluster_rewritten", "host_graph",
+      host_compute_core, &fld, &shape_inference_graphs,
       &has_outside_compilation));
 
   // Get rewritten XLA computation function.
-  FunctionBody *fbody = nullptr;
-  TF_CHECK_OK(FunctionDefToBodyHelper(*fld.Find("cluster_rewritten"),
-                                      AttrSlice(), &fld,
-                                      [&](const string &op, const OpDef **sig) {
-                                        return fld.LookUpOpDef(op, sig);
-                                      },
-                                      &fbody));
-  std::unique_ptr<FunctionBody> fbody_deleter(fbody);
-  auto node_name_index = fbody->graph->BuildNodeNameIndex();
+  FunctionBody *xla_fbody = nullptr;
+  TF_CHECK_OK(FunctionDefToBodyHelper(
+      *fld.Find("cluster_rewritten"), AttrSlice(), &fld,
+      [&](const string &op, const OpDef **sig) {
+        return fld.LookUpOpDef(op, sig);
+      },
+      &xla_fbody));
+  std::unique_ptr<FunctionBody> xla_fbody_deleter(xla_fbody);
+  auto node_name_index = xla_fbody->graph->BuildNodeNameIndex();
 
   // Check XlaHostCompute nodes.
   Node *host_compute_0 = node_name_index["outside_compilation_0_host_compute"];
@@ -292,18 +293,31 @@ TEST(ExtractOutsideCompilationForFunctionTest, Basic) {
   EXPECT_EQ(shapes[0].dim_size(), 1);
   // Check XlaHostCompute nodes' "shape_inference_graph" attr. Both should have
   // empty values.
-  string shape_inference_graph;
+  NameAttrList shape_inference_graph;
   TF_CHECK_OK(GetNodeAttr(host_compute_0->attrs(), "shape_inference_graph",
                           &shape_inference_graph));
-  EXPECT_EQ(shape_inference_graph, "");
+  EXPECT_EQ(shape_inference_graph.name(), "");
   TF_CHECK_OK(GetNodeAttr(host_compute_1->attrs(), "shape_inference_graph",
                           &shape_inference_graph));
-  EXPECT_EQ(shape_inference_graph, "");
+  EXPECT_EQ(shape_inference_graph.name(), "");
 
   // Check `shape_inference_graphs`.
   EXPECT_EQ(shape_inference_graphs.size(), 0);
 
-  // Check `host_graph`: verify we have key placeholder and sequencer.
+  // Check host graph: verify we have key placeholder and sequencer.
+  FunctionBody *host_fbody = nullptr;
+  AttrValue device_ordinal_temp_value;
+  device_ordinal_temp_value.set_i(0);
+  protobuf::Map<string, AttrValue> host_func_attrs;
+  host_func_attrs["device_ordinal"] = device_ordinal_temp_value;
+  TF_CHECK_OK(FunctionDefToBodyHelper(
+      *fld.Find("host_graph"), AttrSlice(&host_func_attrs), &fld,
+      [&](const string &op, const OpDef **sig) {
+        return fld.LookUpOpDef(op, sig);
+      },
+      &host_fbody));
+  std::unique_ptr<FunctionBody> host_fbody_deleter(host_fbody);
+  Graph *host_graph = host_fbody->graph;
   Node *key_placeholder = nullptr, *sequencer = nullptr;
   for (Node *n : host_graph->nodes()) {
     if (n->type_string() == "Placeholder" &&
@@ -365,25 +379,37 @@ TEST(ExtractOutsideCompilationForFunctionTest, NoHostGraph) {
 
   protobuf::Map<string, tensorflow::AttrValue> attrs;
   std::map<string, int> host_compute_core = {{"0", 1}, {"1", 0}};
-  std::unique_ptr<Graph> host_graph;
   std::vector<string> shape_inference_graphs;
   bool has_outside_compilation;
   NameAttrList name_attrs;
   name_attrs.set_name("cluster");
   *name_attrs.mutable_attr() = attrs;
   TF_CHECK_OK(ExtractOutsideCompilationForFunction(
-      "_xla", "_oc", "cluster", name_attrs, "cluster_rewritten",
-      host_compute_core, &fld, &host_graph, &shape_inference_graphs,
+      "_xla", "_oc", "cluster", name_attrs, "cluster_rewritten", "host_graph",
+      host_compute_core, &fld, &shape_inference_graphs,
       &has_outside_compilation));
 
-  // Check `host_graph` is empty.
-  EXPECT_FALSE(host_graph);
+  // Check host graph is empty.
+  FunctionBody *host_fbody = nullptr;
+  AttrValue device_ordinal_temp_value;
+  device_ordinal_temp_value.set_i(0);
+  protobuf::Map<string, AttrValue> host_func_attrs;
+  host_func_attrs["device_ordinal"] = device_ordinal_temp_value;
+  TF_CHECK_OK(FunctionDefToBodyHelper(
+      *fld.Find("host_graph"), AttrSlice(&host_func_attrs), &fld,
+      [&](const string &op, const OpDef **sig) {
+        return fld.LookUpOpDef(op, sig);
+      },
+      &host_fbody));
+  std::unique_ptr<FunctionBody> host_fbody_deleter(host_fbody);
+  Graph *host_graph = host_fbody->graph;
+  EXPECT_EQ(host_graph->num_nodes(), 2);
 }
 
 TEST(ExtractOutsideCompilationForFunctionTest, XlaHostComputeRemoved) {
   // Build the XLA computation func.
   // "const0"
-  // "const1" (outside compilation clsuter "0")
+  // "const1" (outside compilation cluster "0")
   FunctionDefLibrary fdl;
   {
     tensorflow::Scope s = tensorflow::Scope::NewRootScope();
@@ -401,31 +427,43 @@ TEST(ExtractOutsideCompilationForFunctionTest, XlaHostComputeRemoved) {
 
   protobuf::Map<string, tensorflow::AttrValue> attrs;
   std::map<string, int> host_compute_core = {{"0", 1}, {"1", 0}};
-  std::unique_ptr<Graph> host_graph;
   std::vector<string> shape_inference_graphs;
   bool has_outside_compilation;
   NameAttrList name_attrs;
   name_attrs.set_name("cluster");
   *name_attrs.mutable_attr() = attrs;
   TF_CHECK_OK(ExtractOutsideCompilationForFunction(
-      "_xla", "_oc", "cluster", name_attrs, "cluster_rewritten",
-      host_compute_core, &fld, &host_graph, &shape_inference_graphs,
+      "_xla", "_oc", "cluster", name_attrs, "cluster_rewritten", "host_graph",
+      host_compute_core, &fld, &shape_inference_graphs,
       &has_outside_compilation));
 
   // Check rewritten XLA graph: verify that we have no XlaHostCompute.
-  FunctionBody *fbody = nullptr;
-  TF_CHECK_OK(FunctionDefToBodyHelper(*fld.Find("cluster_rewritten"),
-                                      AttrSlice(), &fld,
-                                      [&](const string &op, const OpDef **sig) {
-                                        return fld.LookUpOpDef(op, sig);
-                                      },
-                                      &fbody));
-  std::unique_ptr<FunctionBody> fbody_deleter(fbody);
-  for (Node *n : fbody->graph->nodes()) {
+  FunctionBody *xla_fbody = nullptr;
+  TF_CHECK_OK(FunctionDefToBodyHelper(
+      *fld.Find("cluster_rewritten"), AttrSlice(), &fld,
+      [&](const string &op, const OpDef **sig) {
+        return fld.LookUpOpDef(op, sig);
+      },
+      &xla_fbody));
+  std::unique_ptr<FunctionBody> xla_fbody_deleter(xla_fbody);
+  for (Node *n : xla_fbody->graph->nodes()) {
     EXPECT_NE(n->type_string(), "XlaHostCompute");
   }
 
-  // Check `host_graph`: verify we have no placeholder, but we have "const1".
+  // Check host graph: verify we have no placeholder, but we have "const1".
+  FunctionBody *host_fbody = nullptr;
+  AttrValue device_ordinal_temp_value;
+  device_ordinal_temp_value.set_i(0);
+  protobuf::Map<string, AttrValue> host_func_attrs;
+  host_func_attrs["device_ordinal"] = device_ordinal_temp_value;
+  TF_CHECK_OK(FunctionDefToBodyHelper(
+      *fld.Find("host_graph"), AttrSlice(&host_func_attrs), &fld,
+      [&](const string &op, const OpDef **sig) {
+        return fld.LookUpOpDef(op, sig);
+      },
+      &host_fbody));
+  std::unique_ptr<FunctionBody> host_fbody_deleter(host_fbody);
+  Graph *host_graph = host_fbody->graph;
   int num_key_placeholders = 0;
   for (Node *n : host_graph->nodes()) {
     if (n->type_string() == "Placeholder" &&
@@ -438,4 +476,310 @@ TEST(ExtractOutsideCompilationForFunctionTest, XlaHostComputeRemoved) {
   EXPECT_NE(node_name_index.find("const1"), node_name_index.end());
 }
 
+REGISTER_OP("XlaSendToHost")
+    .Input("input: Tinput")
+    .Attr("Tinput: type")
+    .Attr("key: string")
+    .SetIsStateful();
+
+REGISTER_OP("XlaRecvFromHost")
+    .Output("output: Toutput")
+    .Attr("Toutput: type")
+    .Attr("shape: shape")
+    .Attr("key: string")
+    .SetIsStateful();
+
+TEST(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInIf) {
+  // Build the XLA computation func.
+  // "const0" (bool)
+  // "const1" (int32)
+  // "if0" (pred = "const0", input = "const1", then_branch = "true_fn",
+  //        else_branch = "false_fn")
+  FunctionDefLibrary fdl;
+  {
+    tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+    Output arg = ops::_Arg(s.WithOpName("arg"), DT_INT32, 0);
+    Output identity = ops::Identity(s.WithOpName("identity_true_fn"), arg);
+    ops::_Retval retval(s.WithOpName("retval"), identity, 0);
+    std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
+    TF_CHECK_OK(s.ToGraph(g.get()));
+    auto node_name_image = g->BuildNodeNameIndex();
+    node_name_image["identity_true_fn"]->AddAttr("_oc", "0");
+    PartialTensorShape shape({2});
+    node_name_image["identity_true_fn"]->AddAttr(
+        kXlaInferredShapesAttrName, std::vector<PartialTensorShape>{shape});
+
+    FunctionDef *true_fn_fdef = fdl.add_function();
+    TF_CHECK_OK(GraphToFunctionDef(*g, "true_fn", true_fn_fdef));
+  }
+  {
+    tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+    Output arg = ops::_Arg(s.WithOpName("arg"), DT_INT32, 0);
+    Output identity = ops::Identity(s.WithOpName("identity_false_fn"), arg);
+    ops::_Retval retval(s.WithOpName("retval"), identity, 0);
+    std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
+    TF_CHECK_OK(s.ToGraph(g.get()));
+    auto node_name_image = g->BuildNodeNameIndex();
+    node_name_image["identity_false_fn"]->AddAttr("_oc", "0");
+    PartialTensorShape shape({2});
+    node_name_image["identity_false_fn"]->AddAttr(
+        kXlaInferredShapesAttrName, std::vector<PartialTensorShape>{shape});
+
+    FunctionDef *false_fn_fdef = fdl.add_function();
+    TF_CHECK_OK(GraphToFunctionDef(*g, "false_fn", false_fn_fdef));
+  }
+  {
+    tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+    Output cond = ops::Const(s.WithOpName("const0"), true, {2});
+    Output input = ops::Const(s.WithOpName("const1"), 1, {2});
+    NameAttrList true_fn;
+    true_fn.set_name("true_fn");
+    NameAttrList false_fn;
+    false_fn.set_name("false_fn");
+    auto if_op = ops::If(s.WithOpName("if"), cond,
+                         std::initializer_list<Input>{cond, input}, {DT_INT32},
+                         true_fn, false_fn);
+    ops::_Retval retval(s.WithOpName("retval"), if_op.output[0], 0);
+    std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
+    TF_CHECK_OK(s.ToGraph(g.get()));
+
+    FunctionDef *xla_fdef = fdl.add_function();
+    TF_CHECK_OK(GraphToFunctionDef(*g, "cluster", xla_fdef));
+  }
+  FunctionLibraryDefinition fld(OpRegistry::Global(), fdl);
+
+  protobuf::Map<string, tensorflow::AttrValue> attrs;
+  std::map<string, int> host_compute_core;
+  std::vector<string> shape_inference_graphs;
+  bool has_outside_compilation;
+  NameAttrList name_attrs;
+  name_attrs.set_name("cluster");
+  *name_attrs.mutable_attr() = attrs;
+  TF_CHECK_OK(ExtractOutsideCompilationForFunction(
+      "_xla", "_oc", "cluster", name_attrs, "cluster_rewritten", "host_graph",
+      host_compute_core, &fld, &shape_inference_graphs,
+      &has_outside_compilation));
+
+  // Check host graph.
+  {
+    FunctionBody *host_fbody = nullptr;
+    AttrValue device_ordinal_temp_value;
+    device_ordinal_temp_value.set_i(0);
+    protobuf::Map<string, AttrValue> host_func_attrs;
+    host_func_attrs["device_ordinal"] = device_ordinal_temp_value;
+    TF_CHECK_OK(FunctionDefToBodyHelper(
+        *fld.Find("host_graph"), AttrSlice(&host_func_attrs), &fld,
+        [&](const string &op, const OpDef **sig) {
+          return fld.LookUpOpDef(op, sig);
+        },
+        &host_fbody));
+    std::unique_ptr<FunctionBody> host_fbody_deleter(host_fbody);
+    Graph *host_graph = host_fbody->graph;
+    auto node_name_index = host_graph->BuildNodeNameIndex();
+
+    // Verify we have XlaRecvAtHost to receive "If" predicate.
+    Node *recv_if_pred_node = node_name_index["recv_oc_if_pred_if"];
+    EXPECT_NE(recv_if_pred_node, nullptr);
+
+    // Verify we have an "If" to choose outside compilation between then_branch
+    // and else_branch, and it has `recv_if_pred_node` as cond input.
+    Node *if_oc_node = node_name_index["oc_if_if"];
+    EXPECT_NE(if_oc_node, nullptr);
+    Node *if_oc_node_cond_input;
+    TF_CHECK_OK(if_oc_node->input_node(0, &if_oc_node_cond_input));
+    EXPECT_EQ(if_oc_node_cond_input, recv_if_pred_node);
+
+    // Check that then_branch outside compilation has node "identity_true_fn".
+    const FunctionDef *true_def = fld.Find("oc_then_branch_host_if_if");
+    EXPECT_NE(true_def, nullptr);
+    bool has_identity_true_fn_node = false;
+    for (const auto &node_def : true_def->node_def()) {
+      if (node_def.name() == "identity_true_fn") {
+        has_identity_true_fn_node = true;
+        break;
+      }
+    }
+    EXPECT_TRUE(has_identity_true_fn_node);
+
+    // Check that else_branch outside compilation has node "identity_false_fn".
+    const FunctionDef *false_def = fld.Find("oc_else_branch_host_if_if");
+    EXPECT_NE(false_def, nullptr);
+    bool has_identity_false_fn_node = false;
+    for (const auto &node_def : false_def->node_def()) {
+      if (node_def.name() == "identity_false_fn") {
+        has_identity_false_fn_node = true;
+        break;
+      }
+    }
+    EXPECT_TRUE(has_identity_false_fn_node);
+  }
+
+  // Check XLA graph.
+  {
+    FunctionBody *xla_fbody = nullptr;
+    TF_CHECK_OK(FunctionDefToBodyHelper(
+        *fld.Find("cluster_rewritten"), AttrSlice(), &fld,
+        [&](const string &op, const OpDef **sig) {
+          return fld.LookUpOpDef(op, sig);
+        },
+        &xla_fbody));
+    std::unique_ptr<FunctionBody> xla_fbody_deleter(xla_fbody);
+    Graph *xla_graph = xla_fbody->graph;
+    auto node_name_index = xla_graph->BuildNodeNameIndex();
+
+    // Check that we have XlaSendToHost to send cond predicate to host, and
+    // there is a control edge to If node.
+    Node *send_if_pred_node = node_name_index["send_oc_if_pred_if"];
+    EXPECT_NE(send_if_pred_node, nullptr);
+    bool has_control_edge_to_if = false;
+    for (const Edge *e : send_if_pred_node->out_edges()) {
+      if (e->IsControlEdge() && e->dst()->name() == "if") {
+        has_control_edge_to_if = true;
+        break;
+      }
+    }
+    EXPECT_TRUE(has_control_edge_to_if);
+
+    // Check that the "If" node now has `send_if_pred_node` as attribute
+    // _xla_token_input_nodes.
+    Node *if_node = node_name_index["if"];
+    EXPECT_NE(if_node, nullptr);
+    std::vector<string> token_inputs;
+    TF_CHECK_OK(
+        GetNodeAttr(if_node->def(), "_xla_token_input_nodes", &token_inputs));
+    EXPECT_THAT(token_inputs, ::testing::ElementsAre("send_oc_if_pred_if"));
+  }
+}
+
+TEST(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInWhile) {
+  // Build the XLA computation func.
+  // "const0" (bool)
+  // "while0" (input = "const0", cond = "cond_fn", body = "body_fn")
+  FunctionDefLibrary fdl;
+  {
+    tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+    Output arg = ops::_Arg(s.WithOpName("arg"), DT_BOOL, 0);
+    Output identity = ops::Identity(s.WithOpName("identity_cond_fn"), arg);
+    ops::_Retval retval(s.WithOpName("retval"), identity, 0);
+    std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
+    TF_CHECK_OK(s.ToGraph(g.get()));
+    auto node_name_image = g->BuildNodeNameIndex();
+    node_name_image["identity_cond_fn"]->AddAttr("_oc", "0");
+    PartialTensorShape shape({2});
+    node_name_image["identity_cond_fn"]->AddAttr(
+        kXlaInferredShapesAttrName, std::vector<PartialTensorShape>{shape});
+
+    FunctionDef *cond_fn_fdef = fdl.add_function();
+    TF_CHECK_OK(GraphToFunctionDef(*g, "cond_fn", cond_fn_fdef));
+  }
+  {
+    tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+    Output arg = ops::_Arg(s.WithOpName("arg"), DT_BOOL, 0);
+    Output identity = ops::Identity(s.WithOpName("identity_body_fn"), arg);
+    ops::_Retval retval(s.WithOpName("retval"), identity, 0);
+    std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
+    TF_CHECK_OK(s.ToGraph(g.get()));
+    auto node_name_image = g->BuildNodeNameIndex();
+    node_name_image["identity_body_fn"]->AddAttr("_oc", "0");
+    PartialTensorShape shape({2});
+    node_name_image["identity_body_fn"]->AddAttr(
+        kXlaInferredShapesAttrName, std::vector<PartialTensorShape>{shape});
+
+    FunctionDef *body_fn_fdef = fdl.add_function();
+    TF_CHECK_OK(GraphToFunctionDef(*g, "body_fn", body_fn_fdef));
+  }
+  {
+    tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+    Output input = ops::Const(s.WithOpName("const0"), true, {2});
+    NameAttrList cond_fn;
+    cond_fn.set_name("cond_fn");
+    NameAttrList body_fn;
+    body_fn.set_name("body_fn");
+    auto while_op =
+        ops::While(s.WithOpName("while"), std::initializer_list<Input>{input},
+                   cond_fn, body_fn);
+    ops::_Retval retval(s.WithOpName("retval"), while_op.output[0], 0);
+    std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
+    TF_CHECK_OK(s.ToGraph(g.get()));
+
+    FunctionDef *xla_fdef = fdl.add_function();
+    TF_CHECK_OK(GraphToFunctionDef(*g, "cluster", xla_fdef));
+  }
+  FunctionLibraryDefinition fld(OpRegistry::Global(), fdl);
+
+  protobuf::Map<string, tensorflow::AttrValue> attrs;
+  std::map<string, int> host_compute_core;
+  std::vector<string> shape_inference_graphs;
+  bool has_outside_compilation;
+  NameAttrList name_attrs;
+  name_attrs.set_name("cluster");
+  *name_attrs.mutable_attr() = attrs;
+  TF_CHECK_OK(ExtractOutsideCompilationForFunction(
+      "_xla", "_oc", "cluster", name_attrs, "cluster_rewritten", "host_graph",
+      host_compute_core, &fld, &shape_inference_graphs,
+      &has_outside_compilation));
+
+  // Check host graph.
+  {
+    FunctionBody *host_fbody = nullptr;
+    AttrValue device_ordinal_temp_value;
+    device_ordinal_temp_value.set_i(0);
+    protobuf::Map<string, AttrValue> host_func_attrs;
+    host_func_attrs["device_ordinal"] = device_ordinal_temp_value;
+    TF_CHECK_OK(FunctionDefToBodyHelper(
+        *fld.Find("host_graph"), AttrSlice(&host_func_attrs), &fld,
+        [&](const string &op, const OpDef **sig) {
+          return fld.LookUpOpDef(op, sig);
+        },
+        &host_fbody));
+    std::unique_ptr<FunctionBody> host_fbody_deleter(host_fbody);
+    Graph *host_graph = host_fbody->graph;
+    auto node_name_index = host_graph->BuildNodeNameIndex();
+
+    // Verify we have an "While" to execute outside compilation.
+    Node *while_oc_node = node_name_index["oc_while_while"];
+    EXPECT_NE(while_oc_node, nullptr);
+
+    // Check that cond outside compilation has node "identity_cond_fn".
+    const FunctionDef *cond_def = fld.Find("oc_cond_host_while_while");
+    EXPECT_NE(cond_def, nullptr);
+    bool has_identity_cond_fn_node = false;
+    for (const auto &node_def : cond_def->node_def()) {
+      if (node_def.name() == "identity_cond_fn") {
+        has_identity_cond_fn_node = true;
+        break;
+      }
+    }
+    EXPECT_TRUE(has_identity_cond_fn_node);
+
+    // Check that body outside compilation has node "identity_body_fn".
+    const FunctionDef *body_def = fld.Find("oc_body_host_while_while");
+    EXPECT_NE(body_def, nullptr);
+    bool has_identity_body_fn_node = false;
+    for (const auto &node_def : body_def->node_def()) {
+      if (node_def.name() == "identity_body_fn") {
+        has_identity_body_fn_node = true;
+        break;
+      }
+    }
+    EXPECT_TRUE(has_identity_body_fn_node);
+  }
+
+  // Check XLA graph.
+  {
+    // Verify that rewritten cond fn has XlaSendToHost to send loop predicate to
+    // host.
+    const FunctionDef *cond_def = fld.Find("cond_fn_oc");
+    EXPECT_NE(cond_def, nullptr);
+    bool has_send_oc_while_cond_node = false;
+    for (const auto &node_def : cond_def->node_def()) {
+      if (node_def.name() == "send_oc_while_cond_while") {
+        has_send_oc_while_cond_node = true;
+        break;
+      }
+    }
+    EXPECT_TRUE(has_send_oc_while_cond_node);
+  }
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
index 25796435a5c87af5e252981abf96833f4cda9a5e..6618e3a58ab7b6374ed775cd6e4e18a6a4975588 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
@@ -86,7 +86,7 @@ bool IsDummyImplOp(absl::string_view op_name) {
 bool IsStatefulRandomOp(absl::string_view op_name) {
   return op_name == "RandomUniform" || op_name == "RandomShuffle" ||
          op_name == "RandomUniformInt" || op_name == "RandomStandardNormal" ||
-         op_name == "TruncatedNormal";
+         op_name == "TruncatedNormal" || op_name == "Multinomial";
 }
 
 bool OpProducesOrConsumesVariant(const Node& node) {
diff --git a/tensorflow/compiler/jit/partially_decluster_pass.cc b/tensorflow/compiler/jit/partially_decluster_pass.cc
index 42ea3926e16ae791dbe1bede3b8742383db7667c..e1fd2aaee2822daeffb415d053c9c4f56002a856 100644
--- a/tensorflow/compiler/jit/partially_decluster_pass.cc
+++ b/tensorflow/compiler/jit/partially_decluster_pass.cc
@@ -120,6 +120,7 @@ Status PartiallyDeclusterNode(Graph* graph, Node* n) {
 
   NodeDef ndef = n->def();
   ndef.set_name(absl::StrCat(n->name(), "/declustered"));
+  MergeDebugInfo(NodeDebugInfo(n->def()), &ndef);
   RemoveFromXlaCluster(&ndef);
   Status s;
   Node* cloned_node = graph->AddNode(ndef, &s);
diff --git a/tensorflow/compiler/jit/shape_inference.cc b/tensorflow/compiler/jit/shape_inference.cc
index 80c691fe490c1092315708a2da754d367d585300..a27e0d9f2a6ecddfdbdb29be673084d77a178d8a 100644
--- a/tensorflow/compiler/jit/shape_inference.cc
+++ b/tensorflow/compiler/jit/shape_inference.cc
@@ -53,7 +53,15 @@ Status PropagateShapes(const Graph& graph,
     // shapes, even if no shape function is registered for a node.
     Status status = shape_refiner->AddNode(n);
     if (!status.ok()) {
-      VLOG(1) << "Shape inference failed for node: " << status;
+      VLOG(1) << "Shape inference failed for node " << n->name() << ": "
+              << status;
+    } else {
+      shape_inference::InferenceContext* context = shape_refiner->GetContext(n);
+      for (int i = 0; i < n->num_outputs(); i++) {
+        shape_inference::ShapeHandle handle = context->output(i);
+        VLOG(4) << "Output " << i << " for node " << n->name() << ": "
+                << context->DebugString(handle);
+      }
     }
 
     if (n->type_string() == "_Arg") {
diff --git a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
index 1fe612d43d10030675cf307b109e4dcc89cb2d79..c7e8d61d280a33a83c3386d8ef801018634d31ec 100644
--- a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
+++ b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
@@ -142,11 +142,22 @@ Status XlaCompileOnDemandOp::Compile(
         TF_RETURN_IF_ERROR(ctx->allocate_temp(
             device_tensor.dtype(), device_tensor.shape(), &host_tensor, attrs));
         Notification n;
+        Status status;
         ctx->op_device_context()->CopyDeviceTensorToCPU(
             &device_tensor, "ConstantArgument",
             reinterpret_cast<Device*>(ctx->device()), &host_tensor,
-            [&](Status status) { n.Notify(); });
+            [&](Status s) {
+              status = s;
+              n.Notify();
+            });
         n.WaitForNotification();
+        if (!status.ok()) {
+          LOG(ERROR) << "Copying tensor of shape "
+                     << device_tensor.shape().DebugString() << " from "
+                     << ctx->device()->name() << "to CPU failed with "
+                     << status.ToString();
+          return status;
+        }
         constant_arguments[i] = host_tensor;
       }
     }
@@ -189,6 +200,7 @@ Status XlaCompileOnDemandOp::Compile(
   std::map<int, OptionalTensor> variable_args = GetVariables(ctx);
 
   std::vector<XlaCompiler::Argument> args;
+
   TF_RETURN_IF_ERROR(XlaComputationLaunchContext::BuildXlaCompilerArguments(
       constant_arguments, variable_args, ctx, &args));
 
diff --git a/tensorflow/compiler/jit/xla_cpu_device.cc b/tensorflow/compiler/jit/xla_cpu_device.cc
index 7df898ad12a15345f45fc96e0ec3d42b6e51731b..e9770647e7ba96cc1db026d12d5f11f52ce98d35 100644
--- a/tensorflow/compiler/jit/xla_cpu_device.cc
+++ b/tensorflow/compiler/jit/xla_cpu_device.cc
@@ -63,7 +63,19 @@ Status XlaCpuDeviceFactory::CreateDevices(
   options.device_ordinal = 0;
   options.compilation_device_name = DEVICE_CPU_XLA_JIT;
   options.use_multiple_streams = false;
-  devices->push_back(absl::make_unique<XlaDevice>(session_options, options));
+  auto device = absl::make_unique<XlaDevice>(session_options, options);
+
+  // Setting GpuDeviceInfo because eager runtime relies on the device
+  // context in tensorflow_gpu_device_info(). Also,
+  // tensorflow_gpu_device_info() == nullptr is used as an IsCPU test.
+  // We need XlaCpuDevice to be treated not as CPU because it allocates
+  // XlaTensors, not regular Tensors.
+  Status status = device->UseGpuDeviceInfo();
+  if (!status.ok()) {
+    errors::AppendToMessage(&status, "while setting up ", DEVICE_GPU_XLA_JIT);
+    return status;
+  }
+  devices->push_back(std::move(device));
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/jit/xla_device_context.cc b/tensorflow/compiler/jit/xla_device_context.cc
index 6e6532731e64bd42ee56aa719748988f321e0f17..1f3afe8822d441a5ce37617fe18d7767e9bc72e4 100644
--- a/tensorflow/compiler/jit/xla_device_context.cc
+++ b/tensorflow/compiler/jit/xla_device_context.cc
@@ -79,6 +79,13 @@ XlaDeviceContext::XlaDeviceContext(
   }
 }
 
+void XlaDeviceContext::CopyTensorInSameDevice(const Tensor* input_tensor,
+                                              Device* device,
+                                              Tensor* output_tensor,
+                                              StatusCallback done) const {
+  done(errors::Unimplemented("XLA->XLA same-device copies not implemented."));
+}
+
 void XlaDeviceContext::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
                                              Device* device,
                                              Tensor* device_tensor,
diff --git a/tensorflow/compiler/jit/xla_device_context.h b/tensorflow/compiler/jit/xla_device_context.h
index 1e18df197a2dd65590c5181b4dae4481dca36641..e45db989fac720df6c3458c93a6b8dbb0919f930 100644
--- a/tensorflow/compiler/jit/xla_device_context.h
+++ b/tensorflow/compiler/jit/xla_device_context.h
@@ -62,6 +62,9 @@ class XlaDeviceContext : public DeviceContext {
   void CopyDeviceTensorToCPU(const Tensor* device_tensor,
                              absl::string_view tensor_name, Device* device,
                              Tensor* cpu_tensor, StatusCallback done) override;
+  void CopyTensorInSameDevice(const Tensor* input_tensor, Device* device,
+                              Tensor* output_tensor,
+                              StatusCallback done) const override;
 
   xla::LocalClient* client() const { return client_; }
   se::Stream* stream() const { return stream_.get(); }
diff --git a/tensorflow/compiler/jit/xla_device_ops.h b/tensorflow/compiler/jit/xla_device_ops.h
index adf0f994b84d9fbf918a5b2478aa7d106853e038..927f983ba9ef23c8509523f42366c0c89c29db9f 100644
--- a/tensorflow/compiler/jit/xla_device_ops.h
+++ b/tensorflow/compiler/jit/xla_device_ops.h
@@ -203,6 +203,8 @@ class XlaAssignVariableOp : public OpKernel {
                               .HostMemory("output")                            \
                               .TypeConstraint<ResourceHandle>("T"),            \
                           ArgOp);                                              \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name(kArgOp).Device(DEVICE).TypeConstraint<Variant>("T"), ArgOp);        \
                                                                                \
   REGISTER_KERNEL_BUILDER(Name(kRetOp)                                         \
                               .Device(DEVICE)                                  \
diff --git a/tensorflow/compiler/jit/xla_gpu_device.cc b/tensorflow/compiler/jit/xla_gpu_device.cc
index 944f732b99c0924a08932eda0aedd8c815cc51d0..0191315a66f4d331e54fadc9dc6a073a05fd67ef 100644
--- a/tensorflow/compiler/jit/xla_gpu_device.cc
+++ b/tensorflow/compiler/jit/xla_gpu_device.cc
@@ -16,7 +16,10 @@ limitations under the License.
 // Registers the XLA_GPU device, which is an XlaDevice instantiation that runs
 // operators using XLA via the XLA "CUDA" (GPU) backend.
 
+#include <set>
 #include "absl/memory/memory.h"
+#include "absl/strings/numbers.h"
+#include "absl/strings/str_split.h"
 #include "tensorflow/compiler/jit/kernels/xla_ops.h"
 #include "tensorflow/compiler/jit/xla_device.h"
 #include "tensorflow/compiler/jit/xla_device_ops.h"
@@ -52,8 +55,35 @@ Status XlaGpuDeviceFactory::CreateDevices(
     VLOG(1) << "Failed to create XLA_GPU device: " << platform.status();
     return Status::OK();
   }
-
-  for (int i = 0; i < platform.ValueOrDie()->VisibleDeviceCount(); ++i) {
+  string allowed_gpus =
+      session_options.config.gpu_options().visible_device_list();
+  std::set<int> gpu_ids;
+  int num_visible_devices = platform.ValueOrDie()->VisibleDeviceCount();
+  if (allowed_gpus.empty()) {
+    for (int i = 0; i < num_visible_devices; ++i) {
+      gpu_ids.insert(i);
+    }
+  } else {
+    // For loop below is copied from gpu/gpu_device.cc. It validates
+    // the visible_device_list and populates gpu_ids set.
+    const std::vector<string> visible_devices =
+        absl::StrSplit(allowed_gpus, ',');
+    for (const string& platform_gpu_id_str : visible_devices) {
+      int32 platform_gpu_id;
+      if (!absl::SimpleAtoi(platform_gpu_id_str, &platform_gpu_id)) {
+        return errors::InvalidArgument(
+            "Could not parse entry in 'visible_device_list': '",
+            platform_gpu_id_str, "'. visible_device_list = ", allowed_gpus);
+      }
+      if (platform_gpu_id < 0 || platform_gpu_id >= num_visible_devices) {
+        return errors::InvalidArgument(
+            "'visible_device_list' listed an invalid GPU id '", platform_gpu_id,
+            "' but visible device count is ", num_visible_devices);
+      }
+      gpu_ids.insert(platform_gpu_id);
+    }
+  }
+  for (int i : gpu_ids) {
     XlaDevice::Options options;
     options.platform = platform.ValueOrDie();
     options.device_name_prefix = name_prefix;
diff --git a/tensorflow/compiler/jit/xla_launch_util.h b/tensorflow/compiler/jit/xla_launch_util.h
index 437db019a0eabe66417725148d8b121842e90479..554227f09de0ab4d9e07f199b957657f3121ff06 100644
--- a/tensorflow/compiler/jit/xla_launch_util.h
+++ b/tensorflow/compiler/jit/xla_launch_util.h
@@ -199,19 +199,17 @@ class XlaTensorBuffer : public TensorBuffer {
  public:
   XlaTensorBuffer(const void* ptr, size_t expected_size, size_t actual_size,
                   Allocator* allocator)
-      : expected_size_(expected_size),
+      : TensorBuffer(const_cast<void*>(ptr)),
+        expected_size_(expected_size),
         actual_size_(actual_size),
-        allocator_(allocator) {
-    data_ = const_cast<void*>(ptr);
-  }
+        allocator_(allocator) {}
 
   ~XlaTensorBuffer() override {
-    if (data_) {
-      allocator_->DeallocateRaw(data_);
+    if (data()) {
+      allocator_->DeallocateRaw(data());
     }
   }
 
-  void* data() const override { return data_; }
   size_t size() const override { return expected_size_; }
 
   TensorBuffer* root_buffer() override { return this; }
@@ -231,7 +229,6 @@ class XlaTensorBuffer : public TensorBuffer {
   }
 
  private:
-  void* data_;
   size_t expected_size_;
   size_t actual_size_;
   Allocator* allocator_;
diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index bc3d60b90e58b4018f1c52b09941dedba7ef348a..093b61629cd0b04d5d8488139b8d7262b739f86d 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -408,13 +408,6 @@ tf_xla_py_test(
     name = "eager_test",
     size = "large",
     srcs = ["eager_test.py"],
-    disabled_backends = [
-        # TODO(b/78199195) Support XLA CPU devices in eager runtime
-        "cpu",
-        "cpu_ondemand",
-        # TODO(b/78468222) Enable GPU backend
-        "gpu",
-    ],
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
diff --git a/tensorflow/compiler/tests/depthwise_conv_op_test.py b/tensorflow/compiler/tests/depthwise_conv_op_test.py
index 174bfa9efbcd7dcb4f895237eb01c17bc4a3a6b4..90146e6b27ca31304a2549ec247412341efe390c 100644
--- a/tensorflow/compiler/tests/depthwise_conv_op_test.py
+++ b/tensorflow/compiler/tests/depthwise_conv_op_test.py
@@ -350,8 +350,13 @@ class DepthwiseConv2DTest(xla_test.XLATestCase):
       self._CompareBackpropInput(input_size, filter_size, output_size, stride,
                                  padding)
 
-  def _CompareBackpropFilter(self, input_sizes, filter_sizes, output_sizes,
-                             stride, padding):
+  def _CompareBackpropFilter(self,
+                             input_sizes,
+                             filter_sizes,
+                             output_sizes,
+                             stride,
+                             padding,
+                             data_format="NHWC"):
     x0 = np.random.rand(*input_sizes).astype(np.float32)
     x2 = np.random.rand(*output_sizes).astype(np.float32)
 
@@ -360,13 +365,30 @@ class DepthwiseConv2DTest(xla_test.XLATestCase):
         t0 = array_ops.placeholder(np.float32, shape=input_sizes)
         t1 = constant_op.constant(filter_sizes, shape=[len(filter_sizes)])
         t2 = array_ops.placeholder(np.float32, shape=output_sizes)
+        native_t0 = t0
+        native_t2 = t2
+        strides = [1, stride, stride, 1]
+
         if use_xla:
+          if data_format == "NCHW":
+            # Transpose from NWHC input to NCHW
+            # Ex. [4, 5, 5, 48] to [4, 48, 5, 5]
+            native_t0 = array_ops.transpose(t0, [0, 3, 1, 2])
+            native_t2 = array_ops.transpose(t2, [0, 3, 1, 2])
+            strides = [1, 1, stride, stride]
           with self.test_scope():
             backprop = nn_ops.depthwise_conv2d_native_backprop_filter(
-                t0, t1, t2, strides=[1, stride, stride, 1], padding=padding)
+                native_t0,
+                t1,
+                native_t2,
+                strides=strides,
+                padding=padding,
+                data_format=data_format)
         else:
+          # For CPU, the format NCHW is not supported. Therefore we always use
+          # NHWC here.
           backprop = nn_ops.depthwise_conv2d_native_backprop_filter(
-              t0, t1, t2, strides=[1, stride, stride, 1], padding=padding)
+              native_t0, t1, native_t2, strides=strides, padding=padding)
         ret = backprop.eval({t0: x0, t2: x2})
         self.assertShapeEqual(ret, backprop)
         return ret
@@ -379,11 +401,24 @@ class DepthwiseConv2DTest(xla_test.XLATestCase):
     for index, (input_size, filter_size, output_size, stride,
                 padding) in enumerate(ConfigsToTest()):
       print("Testing DepthwiseConv2DFilterGradCompare,", index, "th config:",
-            input_size, "*", filter_size, "stride:", stride, "padding:",
-            padding)
+            input_size, "*", filter_size, "producing output", output_size,
+            "stride:", stride, "padding:", padding)
       self._CompareBackpropFilter(input_size, filter_size, output_size,
                                   stride, padding)
 
+  def testDepthwiseConv2DFilterGradFormatNCHWCompare(self):
+    for index, (input_size, filter_size, output_size, stride,
+                padding) in enumerate(ConfigsToTest()):
+      print("Testing DepthwiseConv2DFilterGradFormatNCHWCompare,", index,
+            "th config:", input_size, "*", filter_size, "producing output",
+            output_size, "stride:", stride, "padding:", padding)
+      self._CompareBackpropFilter(
+          input_size,
+          filter_size,
+          output_size,
+          stride,
+          padding,
+          data_format="NCHW")
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/compiler/tests/xla_ops_test.py b/tensorflow/compiler/tests/xla_ops_test.py
index 4cf88fc523735cc2d22e085afb83790c7ebb48e4..28274ff799de2c85e1e80512cadbe0206cb640a4 100644
--- a/tensorflow/compiler/tests/xla_ops_test.py
+++ b/tensorflow/compiler/tests/xla_ops_test.py
@@ -319,7 +319,7 @@ class XlaOpsTest(xla_test.XLATestCase, parameterized.TestCase):
         session.run(output)
       self.assertRegexpMatches(
           invalid_arg_error.exception.message,
-          (r'^start_indices must be a vector with length equal to input rank, '
+          (r'start_indices must be a vector with length equal to input rank, '
            r'but input rank is 3 and start_indices has shape \[2\].*'))
 
   def testDynamicSliceWithIncorrectSizeIndicesShape(self):
@@ -332,7 +332,7 @@ class XlaOpsTest(xla_test.XLATestCase, parameterized.TestCase):
         session.run(output)
       self.assertRegexpMatches(
           invalid_arg_error.exception.message,
-          (r'^size_indices must be a vector with length equal to input rank, '
+          (r'size_indices must be a vector with length equal to input rank, '
            r'but input rank is 3 and size_indices has shape \[2\].*'))
 
 
diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD
index 25a84fb1b6609106213231db1ca1ce54da8bd960..5a0d9b9af9d55a8dee809d3cf909bce39c3b8b6c 100644
--- a/tensorflow/compiler/tf2xla/BUILD
+++ b/tensorflow/compiler/tf2xla/BUILD
@@ -445,14 +445,9 @@ cc_library(
     ],
     deps = [
         "//tensorflow/compiler/jit:flags",
-        "//tensorflow/compiler/xla:parse_flags_from_env",
-        "//tensorflow/core:core_cpu",
-        "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
-        "//tensorflow/core:framework_internal",
-        "//tensorflow/core:lib",
+        "//tensorflow/core:graph",
         "//tensorflow/core:protos_all_cc",
-        "@com_google_absl//absl/strings",
     ],
 )
 
diff --git a/tensorflow/compiler/tf2xla/dump_graph.cc b/tensorflow/compiler/tf2xla/dump_graph.cc
index 1de85004a51bea464f8f0166511402e5dd85ac14..64fdbbebc65bff4ed0b965fcdd534cc9696472b6 100644
--- a/tensorflow/compiler/tf2xla/dump_graph.cc
+++ b/tensorflow/compiler/tf2xla/dump_graph.cc
@@ -18,86 +18,26 @@ limitations under the License.
 
 #include "tensorflow/compiler/tf2xla/dump_graph.h"
 
-#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/jit/flags.h"
-#include "tensorflow/core/platform/env.h"
-#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/util/dump_graph.h"
 
 namespace tensorflow {
 namespace dump_graph {
 
-namespace {
-
-struct NameCounts {
-  mutex counts_mutex;
-  std::unordered_map<string, int> counts;
-};
-
-string MakeUniqueFilename(string name) {
-  static NameCounts& instance = *new NameCounts;
-
-  // Remove illegal characters from `name`.
-  for (int i = 0; i < name.size(); ++i) {
-    char ch = name[i];
-    if (ch == '/' || ch == '[' || ch == ']' || ch == '*' || ch == '?') {
-      name[i] = '_';
-    }
-  }
-
-  int count;
-  {
-    mutex_lock lock(instance.counts_mutex);
-    count = instance.counts[name]++;
-  }
-
-  string filename = name;
-  if (count > 0) {
-    absl::StrAppend(&filename, "_", count);
-  }
-  absl::StrAppend(&filename, ".pbtxt");
-  return filename;
-}
-
-string WriteTextProtoToUniqueFile(
-    Env* env, const string& name, const char* proto_type,
-    const ::tensorflow::protobuf::Message& proto) {
-  const string& dirname = GetDumpGraphFlags()->tf_dump_graph_prefix;
-  Status status = env->RecursivelyCreateDir(dirname);
-  if (!status.ok()) {
-    LOG(WARNING) << "Failed to create " << dirname << " for dumping "
-                 << proto_type << ": " << status;
-    return "(unavailable)";
-  }
-  string filepath = absl::StrCat(dirname, "/", MakeUniqueFilename(name));
-  status = WriteTextProto(Env::Default(), filepath, proto);
-  if (!status.ok()) {
-    LOG(WARNING) << "Failed to dump " << proto_type << " to file: " << filepath
-                 << " : " << status;
-    return "(unavailable)";
-  }
-  LOG(INFO) << "Dumped " << proto_type << " to " << filepath;
-  return filepath;
-}
-
-}  // anonymous namespace
-
 string DumpGraphDefToFile(const string& name, GraphDef const& graph_def) {
-  return WriteTextProtoToUniqueFile(Env::Default(), name, "GraphDef",
-                                    graph_def);
+  return tensorflow::DumpGraphDefToFile(
+      name, graph_def, GetDumpGraphFlags()->tf_dump_graph_prefix);
 }
 
 string DumpGraphToFile(const string& name, Graph const& graph,
                        const FunctionLibraryDefinition* flib_def) {
-  GraphDef graph_def;
-  graph.ToGraphDef(&graph_def);
-  if (flib_def) {
-    *graph_def.mutable_library() = flib_def->ToProto();
-  }
-  return DumpGraphDefToFile(name, graph_def);
+  return tensorflow::DumpGraphToFile(name, graph, flib_def,
+                                     GetDumpGraphFlags()->tf_dump_graph_prefix);
 }
 
 string DumpFunctionDefToFile(const string& name, FunctionDef const& fdef) {
-  return WriteTextProtoToUniqueFile(Env::Default(), name, "FunctionDef", fdef);
+  return tensorflow::DumpFunctionDefToFile(
+      name, fdef, GetDumpGraphFlags()->tf_dump_graph_prefix);
 }
 
 }  // namespace dump_graph
diff --git a/tensorflow/compiler/tf2xla/functionalize_cond.cc b/tensorflow/compiler/tf2xla/functionalize_cond.cc
index c693e42d26712d55852f45c806215fc1f1b9a030..7ae96e1d484900e28e8c23c3bb2232401144ad82 100644
--- a/tensorflow/compiler/tf2xla/functionalize_cond.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_cond.cc
@@ -640,7 +640,8 @@ Status Conditional::ExtractBodies(Graph* graph) {
 Status Conditional::BuildIfNode(Graph* graph,
                                 FunctionLibraryDefinition* library) {
   VLOG(2) << "Build cond function for " << name();
-  NodeDefBuilder builder(name(), "If", library);
+  NodeDebugInfo debug_info((*merges_.begin())->def());
+  NodeDefBuilder builder(name(), "If", library, &debug_info);
   const string branch_name[] = {"else_branch", "then_branch"};
   for (auto branch : {BranchType::kElseBranch, BranchType::kThenBranch}) {
     int branch_index = static_cast<int>(branch);
diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD
index d85b4f5ae0cb9c7d2476158a5830f921742ae980..a18a4e92d62787051f6ab92e72ee8bf0d1060dca 100644
--- a/tensorflow/compiler/tf2xla/kernels/BUILD
+++ b/tensorflow/compiler/tf2xla/kernels/BUILD
@@ -1,16 +1,11 @@
+load("//tensorflow:tensorflow.bzl", "tf_copts", "tf_kernel_library")
+
 licenses(["notice"])  # Apache 2.0
 
 package(
     default_visibility = ["//tensorflow/compiler/tf2xla:internal"],
 )
 
-load("//tensorflow:tensorflow.bzl", "tf_copts")
-load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
-load(
-    "//third_party/mkl:build_defs.bzl",
-    "if_mkl",
-)
-
 tf_kernel_library(
     name = "xla_ops",
     srcs = [
@@ -121,15 +116,10 @@ tf_kernel_library(
         ":while_op",
         "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/tf2xla:xla_compiler",
-        "//tensorflow/compiler/tf2xla/lib:batch_dot",
         "//tensorflow/compiler/tf2xla/lib:broadcast",
-        "//tensorflow/compiler/tf2xla/lib:cholesky",
-        "//tensorflow/compiler/tf2xla/lib:qr",
         "//tensorflow/compiler/tf2xla/lib:random",
         "//tensorflow/compiler/tf2xla/lib:scatter",
-        "//tensorflow/compiler/tf2xla/lib:triangular_solve",
         "//tensorflow/compiler/tf2xla/lib:util",
-        "//tensorflow/compiler/tf2xla/lib:while_loop",
         "//tensorflow/compiler/tf2xla/ops:xla_ops",
         "//tensorflow/compiler/xla:array4d",
         "//tensorflow/compiler/xla:literal",
@@ -142,12 +132,16 @@ tf_kernel_library(
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/client/lib:cholesky",
         "//tensorflow/compiler/xla/client/lib:constants",
+        "//tensorflow/compiler/xla/client/lib:loops",
         "//tensorflow/compiler/xla/client/lib:math",
-        "//tensorflow/compiler/xla/client/lib:numeric",
+        "//tensorflow/compiler/xla/client/lib:matrix",
         "//tensorflow/compiler/xla/client/lib:pooling",
         "//tensorflow/compiler/xla/client/lib:prng",
+        "//tensorflow/compiler/xla/client/lib:qr",
         "//tensorflow/compiler/xla/client/lib:sorting",
+        "//tensorflow/compiler/xla/client/lib:triangular_solve",
         "//tensorflow/core:framework",
         "//tensorflow/core:image_ops_op_lib",
         "//tensorflow/core:lib",
@@ -196,7 +190,6 @@ cc_library(
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
         "//tensorflow/compiler/xla/client/lib:constants",
-        "//tensorflow/compiler/xla/client/lib:numeric",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
@@ -216,7 +209,6 @@ cc_library(
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
         "//tensorflow/compiler/xla/client/lib:constants",
-        "//tensorflow/compiler/xla/client/lib:numeric",
         "//tensorflow/core:framework",
         "//tensorflow/core/kernels:bounds_check",
         "//tensorflow/core/kernels:conv_ops",
diff --git a/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc b/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc
index 4cfe946b2e6146f034867c06e996ffae42b90705..1b254e328a8c71bd81a0ec700e2af1d81a5fa67a 100644
--- a/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc
@@ -13,9 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/tf2xla/lib/batch_dot.h"
+#include "tensorflow/compiler/tf2xla/lib/util.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/lib/math.h"
+#include "tensorflow/compiler/xla/client/lib/matrix.h"
 
 namespace tensorflow {
 namespace {
@@ -28,9 +30,11 @@ class BatchMatMulOp : public XlaOpKernel {
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
-    auto result = BatchDot(ctx->Input(0), ctx->Input(1),
-                           /*transpose_x=*/adj_x_, /*transpose_y=*/adj_y_,
-                           /*conjugate_x=*/adj_x_, /*conjugate_y=*/adj_y_);
+    auto result =
+        xla::BatchDot(MaybeTransposeInMinorDims(
+                          MaybeConjugate(ctx->Input(0), adj_x_), adj_x_),
+                      MaybeTransposeInMinorDims(
+                          MaybeConjugate(ctx->Input(1), adj_y_), adj_y_));
     ctx->SetOutput(0, result);
   }
 
diff --git a/tensorflow/compiler/tf2xla/kernels/cholesky_op.cc b/tensorflow/compiler/tf2xla/kernels/cholesky_op.cc
index 9fcbc86adc0967cbb7fb73da8bdabc58b60953da..0ed3044efa5b1060d2b0ad2d5563b0e02ebf66ec 100644
--- a/tensorflow/compiler/tf2xla/kernels/cholesky_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/cholesky_op.cc
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/tf2xla/lib/cholesky.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/lib/cholesky.h"
 
 namespace tensorflow {
 namespace {
@@ -24,7 +24,7 @@ class CholeskyOp : public XlaOpKernel {
  public:
   explicit CholeskyOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
   void Compile(XlaOpKernelContext* ctx) override {
-    ctx->SetOutput(0, Cholesky(ctx->Input(0)));
+    ctx->SetOutput(0, xla::Cholesky(ctx->Input(0)));
   }
 };
 
diff --git a/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc b/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc
index 641fefafb357f6ad10483c454600f3dadd4f8cb7..4124b258c7788e3850f07cbf4d53930784c635fd 100644
--- a/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc
+++ b/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc
@@ -392,23 +392,31 @@ xla::StatusOr<xla::XlaOp> MakeXlaBackpropFilterConvOp(
                       builder->GetShape(activations));
   TF_ASSIGN_OR_RETURN(xla::Shape out_backprop_shape,
                       builder->GetShape(gradients));
+  xla::XlaOp filter_backprop;
+
+  xla::Shape input_shape = activations_shape;
+  xla::Shape output_shape = out_backprop_shape;
+
+  TensorShape input_tensor_shape, filter_tensor_shape, output_tensor_shape;
+  TF_RETURN_IF_ERROR(XLAShapeToTensorShape(filter_shape, &filter_tensor_shape));
+  TF_RETURN_IF_ERROR(XLAShapeToTensorShape(input_shape, &input_tensor_shape));
+  TF_RETURN_IF_ERROR(XLAShapeToTensorShape(output_shape, &output_tensor_shape));
+
   const xla::Shape expanded_filter_shape =
       attrs.depthwise ? ExpandedFilterShapeForDepthwiseConvolution(filter_shape)
                       : filter_shape;
-
   // Reuse dimension computation logic from conv_grad_ops.cc.
   ConvBackpropDimensions dims;
-  TF_RETURN_IF_ERROR(ConvBackpropComputeDimensionsV2XlaShapes(
-      type_string, attrs.num_spatial_dims, activations_shape,
-      expanded_filter_shape, out_backprop_shape, attrs.dilations, attrs.strides,
-      attrs.padding, attrs.data_format, &dims));
-
   // The filter gradients are computed by a convolution of the input
   // activations and the output gradients, with some appropriate padding.
   // See the comment at the top of conv_grad_ops.h for details.
-
   xla::ConvolutionDimensionNumbers dnums;
 
+  TF_RETURN_IF_ERROR(ConvBackpropComputeDimensionsV2XlaShapes(
+      type_string, attrs.num_spatial_dims, activations_shape,
+      expanded_filter_shape, out_backprop_shape, attrs.dilations, attrs.strides,
+      attrs.padding, attrs.data_format, &dims));
+
   // The activations (inputs) form the LHS of the convolution.
   // Activations have shape: [batch, in_rows, in_cols, ..., in_depth]
   // For the gradient computation, we flip the roles of the batch and
@@ -420,29 +428,99 @@ xla::StatusOr<xla::XlaOp> MakeXlaBackpropFilterConvOp(
   int n_dim = GetTensorBatchDimIndex(num_dims, attrs.data_format);
   int c_dim = GetTensorFeatureDimIndex(num_dims, attrs.data_format);
 
-  // Swap n_dim and c_dim in the activations.
-  dnums.set_input_batch_dimension(c_dim);
-  dnums.set_input_feature_dimension(n_dim);
+  int64 total_spatial_size = 1;
+  for (int i = 0; i < attrs.num_spatial_dims; ++i) {
+    total_spatial_size *= dims.input_size(i);
+  }
 
-  // The gradients become the RHS of the convolution.
-  // The gradients have shape [batch, out_rows, out_cols, ..., out_depth]
-  // where the batch becomes the input feature for the convolution.
-  dnums.set_kernel_input_feature_dimension(n_dim);
-  dnums.set_kernel_output_feature_dimension(c_dim);
+  // We use this approach only for depthwise convolutions where feature counts
+  // are large but space dimensions are small. The conversion logic below
+  // assumes that the data format is NHWC, so we also check that here.
+  bool should_perform_depthwise_conv =
+      attrs.data_format == FORMAT_NHWC &&
+      (total_spatial_size < dims.in_depth) &&
+      filter_tensor_shape.dim_size(num_dims - 1) == 1 && attrs.depthwise;
+
+  int64 num_spatial_dims =
+      attrs.num_spatial_dims + (should_perform_depthwise_conv ? 1 : 0);
+
+  std::vector<std::pair<int64, int64>> padding(num_spatial_dims);
+  std::vector<int64> rhs_dilation(num_spatial_dims);
+  std::vector<int64> window_strides(num_spatial_dims);
+  std::vector<int64> ones(num_spatial_dims, 1);
+
+  if (should_perform_depthwise_conv) {
+    // This approach is similar to handling of grouped convolutions in
+    // the convolution_feature_group_converter.cc. Please refer to it for
+    // details.
+
+    // Add spatial dimension to the activation, and reshape.
+    std::vector<int64> activations_reshape_sizes, gradients_reshape_sizes;
+
+    activations_reshape_sizes.push_back(dims.batch_size);
+    gradients_reshape_sizes.push_back(dims.batch_size);
+    for (int i = 0; i < attrs.num_spatial_dims; i++) {
+      activations_reshape_sizes.push_back(dims.input_size(i));
+      gradients_reshape_sizes.push_back(dims.output_size(i));
+    }
+    activations_reshape_sizes.push_back(dims.in_depth);
+    activations_reshape_sizes.push_back(1);
+    gradients_reshape_sizes.push_back(dims.out_depth);
+    gradients_reshape_sizes.push_back(1);
+
+    activations = xla::Reshape(activations, activations_reshape_sizes);
+    gradients = xla::Reshape(gradients, gradients_reshape_sizes);
+
+    int64 new_spatial_dim = activations_reshape_sizes.size() - 1;
+
+    // Set the newly added dimension to be the batch.
+    dnums.set_input_batch_dimension(new_spatial_dim);
+    dnums.set_input_feature_dimension(c_dim);
+
+    // The gradients become the RHS of the convolution.
+    // The gradients have shape [batch, out_rows, out_cols, ..., out_depth, 1]
+    // where the batch becomes a spatial dimension, and 1 becomes
+    // the input feature for the convolution.
+    dnums.set_kernel_input_feature_dimension(new_spatial_dim);
+    dnums.set_kernel_output_feature_dimension(c_dim);
+
+    // Treat original batch dimension as a spatial dimension.
+    dnums.add_input_spatial_dimensions(n_dim);
+    dnums.add_kernel_spatial_dimensions(n_dim);
+  } else {
+    // The activations (inputs) form the LHS of the convolution.
+    // Activations have shape: [batch, in_rows, in_cols, ..., in_depth]
+    // For the gradient computation, we flip the roles of the batch and
+    // feature dimensions.
+    // Each spatial entry has size in_depth * batch
+
+    // Swap n_dim and c_dim in the activations.
+    dnums.set_input_batch_dimension(c_dim);
+    dnums.set_input_feature_dimension(n_dim);
+
+    // The gradients become the RHS of the convolution.
+    // The gradients have shape [batch, out_rows, out_cols, ..., out_depth]
+    // where the batch becomes the input feature for the convolution.
+    dnums.set_kernel_input_feature_dimension(n_dim);
+    dnums.set_kernel_output_feature_dimension(c_dim);
+  }
 
-  std::vector<std::pair<int64, int64>> padding(attrs.num_spatial_dims);
-  std::vector<int64> rhs_dilation(attrs.num_spatial_dims);
-  std::vector<int64> window_strides(attrs.num_spatial_dims);
-  std::vector<int64> ones(attrs.num_spatial_dims, 1);
+  dnums.set_output_batch_dimension(num_spatial_dims);
+  dnums.set_output_feature_dimension(num_spatial_dims + 1);
 
   // Tensorflow filter shape is [ H, W, ..., inC, outC ].
-  for (int i = 0; i < attrs.num_spatial_dims; ++i) {
+  for (int i = 0; i < num_spatial_dims; ++i) {
     dnums.add_output_spatial_dimensions(i);
   }
-  dnums.set_output_batch_dimension(attrs.num_spatial_dims);
-  dnums.set_output_feature_dimension(attrs.num_spatial_dims + 1);
 
-  for (int i = 0; i < attrs.num_spatial_dims; ++i) {
+  if (should_perform_depthwise_conv) {
+    // Set the right parameters for the newly created spatial dimension.
+    padding[0] = {0, 0};
+    rhs_dilation[0] = 1;
+    window_strides[0] = 1;
+  }
+
+  for (int64 i = 0; i < attrs.num_spatial_dims; ++i) {
     int64 dim = GetTensorSpatialDimIndex(num_dims, attrs.data_format, i);
     dnums.add_input_spatial_dimensions(dim);
     dnums.add_kernel_spatial_dimensions(dim);
@@ -483,9 +561,10 @@ xla::StatusOr<xla::XlaOp> MakeXlaBackpropFilterConvOp(
     const int64 pad_before =
         attrs.padding == Padding::SAME ? std::max<int64>(pad_total / 2, 0) : 0;
 
-    padding[i] = {pad_before, pad_total - pad_before};
-    rhs_dilation[i] = dims.spatial_dims[i].stride;
-    window_strides[i] = attrs.dilations[dim];
+    int64 dim_being_operated = should_perform_depthwise_conv ? i + 1 : i;
+    padding[dim_being_operated] = {pad_before, pad_total - pad_before};
+    rhs_dilation[dim_being_operated] = dims.spatial_dims[i].stride;
+    window_strides[dim_being_operated] = attrs.dilations[dim];
   }
 
   // Besides padding the input, we will also expand output_rows to
@@ -496,13 +575,19 @@ xla::StatusOr<xla::XlaOp> MakeXlaBackpropFilterConvOp(
   //
   // This is done by specifying the window dilation factors in the
   // convolution HLO below.
-  auto filter_backprop =
-      xla::ConvGeneralDilated(activations, gradients, window_strides, padding,
-                              /*lhs_dilation=*/ones, rhs_dilation, dnums);
-
-  if (attrs.depthwise) {
-    filter_backprop = ContractFilterForDepthwiseBackprop(
-        filter_shape, filter_backprop, activations.builder());
+  filter_backprop = xla::ConvGeneralDilated(
+      activations, gradients, window_strides, padding,
+      /*lhs_dilation=*/ones, rhs_dilation, dnums,
+      /*feature_group_count=*/
+      should_perform_depthwise_conv ? dims.in_depth : 1);
+
+  if (should_perform_depthwise_conv) {
+    filter_backprop = xla::Reshape(filter_backprop, filter_shape.dimensions());
+  } else {
+    if (attrs.depthwise) {
+      filter_backprop = ContractFilterForDepthwiseBackprop(
+          filter_shape, filter_backprop, activations.builder());
+    }
   }
 
   return filter_backprop;
diff --git a/tensorflow/compiler/tf2xla/kernels/conv_ops.cc b/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
index d820528a43064e327cb90e5a2889f77ab1f3f3e2..eafdba876ae9e2c38694f065cf83bb3725b8460e 100644
--- a/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
@@ -22,7 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
+#include "tensorflow/compiler/xla/client/lib/matrix.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/core/framework/node_def_util.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/diag_op.cc b/tensorflow/compiler/tf2xla/kernels/diag_op.cc
index 49c12fc232092873b69961644a059abc6035f64f..ee79cbc70da269be7586c47b4fd33c901f4fd581 100644
--- a/tensorflow/compiler/tf2xla/kernels/diag_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/diag_op.cc
@@ -19,7 +19,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
+#include "tensorflow/compiler/xla/client/lib/matrix.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/framework/op_kernel.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op.cc b/tensorflow/compiler/tf2xla/kernels/gather_op.cc
index 20b0de193dc060197f3062d3be0b8d45f7dcb9b1..41c31d0ed58fe9bc9bbde0bd58993c975f04fd60 100644
--- a/tensorflow/compiler/tf2xla/kernels/gather_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/gather_op.cc
@@ -14,7 +14,6 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h"
-#include "tensorflow/compiler/tf2xla/lib/while_loop.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/tf2xla/xla_context.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/if_op.cc b/tensorflow/compiler/tf2xla/kernels/if_op.cc
index b5e083912555c865b5eadc7697075c9ca4451ca9..4f0f0fd9aefecc3d31f8bd9c8ca40ebb0860c82d 100644
--- a/tensorflow/compiler/tf2xla/kernels/if_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/if_op.cc
@@ -56,6 +56,7 @@ void XlaIfOp::Compile(XlaOpKernelContext* ctx) {
   VLOG(1) << "Building If: " << input_types_.size() << " inputs";
 
   std::vector<XlaCompiler::Argument> arguments(input_types_.size());
+  int num_resource_args = 0;
   for (int i = 0; i < input_types_.size(); ++i) {
     XlaCompiler::Argument& arg = arguments[i];
     DataType type = ctx->input_type(i + 1);
@@ -81,6 +82,8 @@ void XlaIfOp::Compile(XlaOpKernelContext* ctx) {
               << " type: " << DataTypeString(arg.type)
               << " shape: " << arg.shape.DebugString()
               << " initialized: " << arg.initialized;
+
+      num_resource_args++;
     } else {
       arg.kind = XlaCompiler::Argument::kParameter;
       arg.type = input_types_[i];
@@ -236,9 +239,13 @@ void XlaIfOp::Compile(XlaOpKernelContext* ctx) {
     ctx->SetOutput(i, output_handle);
   }
   if (has_token_input_output_) {
-    // Set token output for this "if" op.
+    // Set token output for this "If" op. Token output is the last output of
+    // XLA computation, which comes after all "normal" TF outputs and resource
+    // updates. For "If" node, num of resource updates equals to number of
+    // resource args because we set `return_updated_values_for_all_resources`
+    // to true in XlaCompiler option.
     xla::XlaOp token_output =
-        xla::GetTupleElement(outputs, output_types_.size());
+        xla::GetTupleElement(outputs, output_types_.size() + num_resource_args);
     auto shape_or = b->GetShape(token_output);
     OP_REQUIRES_OK(ctx, shape_or.status());
     OP_REQUIRES(ctx, xla::ShapeUtil::IsToken(shape_or.ValueOrDie()),
diff --git a/tensorflow/compiler/tf2xla/kernels/image_ops.cc b/tensorflow/compiler/tf2xla/kernels/image_ops.cc
index e9bb0a77e99d144863b027bd214081316d61c314..96ddd42e2ae04d454e4fb85628d139e17a543d2e 100644
--- a/tensorflow/compiler/tf2xla/kernels/image_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/image_ops.cc
@@ -15,12 +15,12 @@ limitations under the License.
 
 #include "tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h"
 #include "tensorflow/compiler/tf2xla/lib/util.h"
-#include "tensorflow/compiler/tf2xla/lib/while_loop.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/client/lib/loops.h"
 #include "tensorflow/compiler/xla/client/lib/sorting.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -505,9 +505,9 @@ class NonMaxSuppressionOp : public XlaOpKernel {
     init_values.push_back(included_iou);
 
     auto suppress_loop_result =
-        XlaWhileLoop(WhileCondFn(num_boxes, output_size),
-                     SuppressBodyFn(num_boxes), init_values, "suppress_loop",
-                     builder)
+        xla::WhileLoopHelper(WhileCondFn(num_boxes, output_size),
+                             SuppressBodyFn(num_boxes), init_values,
+                             "suppress_loop", builder)
             .ValueOrDie();
 
     xla::XlaOp included_score =
diff --git a/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc b/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc
index 0c7ca602bfacd598dada0303d3a3e77fe7f1b0fc..5a10c52ba8b6d4fab73f0dda67cbd52fd625e76b 100644
--- a/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/array4d.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/register_types.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc b/tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc
index f4def11d08c31513aec5aad15187016a7294c2fd..90c0ebefb24ec2c4378782e9b15d3f57c33032a4 100644
--- a/tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/tf2xla/lib/triangular_solve.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/lib/triangular_solve.h"
 
 namespace tensorflow {
 namespace {
@@ -29,7 +29,7 @@ class MatrixTriangularSolveOp : public XlaOpKernel {
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
-    auto result = TriangularSolve(
+    auto result = xla::TriangularSolve(
         ctx->Input(0), ctx->Input(1), /*left_side=*/true,
         /*lower=*/lower_, /*transpose_a=*/adjoint_, /*conjugate_a=*/adjoint_);
     ctx->SetOutput(0, result);
diff --git a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
index a259da6383d461fd11b0d79096bf66aae7ddef06..06c6cc37ec90192486ba15010bfeb763a9ffb987 100644
--- a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
@@ -152,7 +152,12 @@ class MaxPoolOp : public PoolingOp {
  public:
   MaxPoolOp(OpKernelConstruction* ctx, int num_spatial_dims)
       : PoolingOp(ctx, /*num_spatial_dims=*/num_spatial_dims,
-                  /*reduction_type=*/ctx->input_type(0)) {}
+                  /*reduction_type=*/ctx->input_type(0)) {
+    string data_format_str;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format_str));
+    OP_REQUIRES(ctx, FormatFromString(data_format_str, &data_format_),
+                errors::InvalidArgument("Invalid data format"));
+  }
 
   void Compile(XlaOpKernelContext* ctx) override {
     auto ksize_or_error = GetKernelSize(ctx);
@@ -180,10 +185,6 @@ class MaxPool2DOp : public MaxPoolOp {
  public:
   explicit MaxPool2DOp(OpKernelConstruction* ctx)
       : MaxPoolOp(ctx, /*num_spatial_dims=*/2) {
-    string data_format_str;
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format_str));
-    OP_REQUIRES(ctx, FormatFromString(data_format_str, &data_format_),
-                errors::InvalidArgument("Invalid data format"));
   }
 };
 REGISTER_XLA_OP(Name("MaxPool"), MaxPool2DOp);
@@ -204,7 +205,12 @@ class AvgPoolOp : public PoolingOp {
   AvgPoolOp(OpKernelConstruction* ctx, int num_spatial_dims)
       : PoolingOp(ctx, /*num_spatial_dims=*/num_spatial_dims,
                   /*reduction_type=*/
-                  XlaHelpers::SumAccumulationType(ctx->input_type(0))) {}
+                  XlaHelpers::SumAccumulationType(ctx->input_type(0))) {
+    string data_format_str;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format_str));
+    OP_REQUIRES(ctx, FormatFromString(data_format_str, &data_format_),
+                errors::InvalidArgument("Invalid data format"));
+  }
 
   void Compile(XlaOpKernelContext* ctx) override {
     auto ksize_or_error = GetKernelSize(ctx);
@@ -241,10 +247,6 @@ class AvgPool2DOp : public AvgPoolOp {
  public:
   explicit AvgPool2DOp(OpKernelConstruction* ctx)
       : AvgPoolOp(ctx, /*num_spatial_dims=*/2) {
-    string data_format_str;
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format_str));
-    OP_REQUIRES(ctx, FormatFromString(data_format_str, &data_format_),
-                errors::InvalidArgument("Invalid data format"));
   }
 };
 REGISTER_XLA_OP(Name("AvgPool"), AvgPool2DOp);
@@ -390,6 +392,11 @@ class AvgPoolGradOp : public XlaOpKernel {
     OP_REQUIRES(ctx, ksize_[0] == 1 && stride_[0] == 1,
                 errors::Unimplemented(
                     "Pooling is not yet supported on the batch dimension."));
+
+    string data_format;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format));
+    OP_REQUIRES(ctx, FormatFromString(data_format, &data_format_),
+                errors::InvalidArgument("Invalid data format"));
   }
 
   int num_dims() const { return num_spatial_dims_ + 2; }
@@ -449,10 +456,6 @@ class AvgPool2DGradOp : public AvgPoolGradOp {
  public:
   explicit AvgPool2DGradOp(OpKernelConstruction* ctx)
       : AvgPoolGradOp(ctx, /*num_spatial_dims=*/2) {
-    string data_format;
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format));
-    OP_REQUIRES(ctx, FormatFromString(data_format, &data_format_),
-                errors::InvalidArgument("Invalid data format"));
   }
 };
 REGISTER_XLA_OP(
diff --git a/tensorflow/compiler/tf2xla/kernels/qr_op.cc b/tensorflow/compiler/tf2xla/kernels/qr_op.cc
index 7ea0afc1f53cbe4cfcc3f6121a4ecd55864c1b52..66ec40a946b8a063d84acd33daf81f52ea2c35ed 100644
--- a/tensorflow/compiler/tf2xla/kernels/qr_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/qr_op.cc
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/tf2xla/lib/qr.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/lib/qr.h"
 
 namespace tensorflow {
 namespace {
@@ -26,7 +26,7 @@ class QROp : public XlaOpKernel {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("full_matrices", &full_matrices_));
   }
   void Compile(XlaOpKernelContext* ctx) override {
-    auto result = QRDecomposition(ctx->Input(0), full_matrices_);
+    auto result = xla::QRDecomposition(ctx->Input(0), full_matrices_);
     if (!result.ok()) {
       ctx->SetStatus(result.status());
       return;
diff --git a/tensorflow/compiler/tf2xla/kernels/random_ops.cc b/tensorflow/compiler/tf2xla/kernels/random_ops.cc
index 8822e29f7e77b1cbc6fa6ca61d0062d9b1b0c36e..2d92056e4f522f6206e7d632f0fa1e8b793fd6e3 100644
--- a/tensorflow/compiler/tf2xla/kernels/random_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/random_ops.cc
@@ -20,12 +20,12 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h"
 #include "tensorflow/compiler/tf2xla/lib/random.h"
 #include "tensorflow/compiler/tf2xla/lib/util.h"
-#include "tensorflow/compiler/tf2xla/lib/while_loop.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
+#include "tensorflow/compiler/xla/client/lib/loops.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -175,8 +175,8 @@ class RandomShuffleOp : public XlaOpKernel {
     };
     // for i in range(n):
     auto swap_loop_result =
-        XlaForEachIndex(n, xla::S32, swap_body_fn, {swaps, indices},
-                        "indices_swap_loop", builder)
+        xla::ForEachIndex(n, xla::S32, swap_body_fn, {swaps, indices},
+                          "indices_swap_loop", builder)
             .ValueOrDie();
     auto swapped_indices = swap_loop_result[1];
 
diff --git a/tensorflow/compiler/tf2xla/kernels/resampler_ops.cc b/tensorflow/compiler/tf2xla/kernels/resampler_ops.cc
index 769e0cd1409dd7e8099178c8d80b5a9adb0b20b3..f9985d526033ca675c701a508a3d1576e46bc5f7 100644
--- a/tensorflow/compiler/tf2xla/kernels/resampler_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/resampler_ops.cc
@@ -24,7 +24,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array4d.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -126,7 +125,7 @@ XlaOp ConcatenateIota(xla::XlaBuilder* b, XlaOp indices,
   dimensions.back() = 1;
 
   auto batch_indices =
-      xla::Iota(b, xla::ShapeUtil::MakeShape(xla::U32, dimensions),
+      xla::Iota(b, xla::ShapeUtil::MakeShape(xla::S32, dimensions),
                 /*iota_dimension=*/0);
 
   return xla::ConcatInDim(b, {batch_indices, indices}, dimensions.size() - 1);
@@ -190,11 +189,53 @@ XlaOp ScatterToGradData(XlaOpKernelContext* ctx, XlaOp grad_data, XlaOp indices,
                       scatter_dim_numbers);
 }
 
+// Bounds samples to 0 if the warp image indices are out of the (-1, image_size)
+// bound.
+// The resulting dimension is given by 'result_dims'.
+XlaOp BoundSamples(XlaOpKernelContext* ctx, XlaOp warp,
+                   xla::PrimitiveType warp_type, TensorShape warp_shape,
+                   std::vector<int64> result_dims,
+                   std::vector<int64> broadcasted_dims, int64 last_warp_dim,
+                   xla::Shape data_shape, XlaOp sample) {
+  auto is_gt_minus_one =
+      xla::Gt(warp,
+              xla::ConvertElementType(
+                  xla::ConstantR1<float>(ctx->builder(), {-1, -1}), warp_type),
+              /*broadcast_dimensions=*/{warp_shape.dims() - 1});
+  auto is_lt_image_size = xla::Lt(
+      warp,
+      xla::ConvertElementType(
+          xla::ConstantR1<float>(
+              ctx->builder(),
+              {/*width=*/static_cast<float>(data_shape.dimensions(2)),
+               /*height=*/static_cast<float>(data_shape.dimensions(1))}),
+          warp_type),
+      /*broadcast_dimensions=*/{warp_shape.dims() - 1});
+
+  auto is_in_bound_padded_x_y = xla::And(is_gt_minus_one, is_lt_image_size);
+  // Reduce along last dimension. The resulting dimension is:
+  // [batch, dim_0, ...dim_n].
+  auto is_in_bound = xla::Reduce(
+      is_in_bound_padded_x_y, xla::ConstantR0<bool>(ctx->builder(), true),
+      xla::CreateScalarAndComputation(xla::PrimitiveType::PRED, ctx->builder()),
+      {last_warp_dim});
+
+  // Broadcast 'is_in_bound' to the same dimension as 'result_dims'.
+  auto broadcasted_is_in_bound =
+      xla::BroadcastInDim(is_in_bound, result_dims, broadcasted_dims);
+
+  // Set out of bound samples to zero.
+  auto zeros =
+      xla::Broadcast(xla::Zero(ctx->builder(), warp_type), result_dims);
+  return xla::Select(broadcasted_is_in_bound, sample, zeros);
+}
+
 // Build computation the backprop into input 'data'.
 // Where input:
 // grad_output is of dimension [batch, dim_0, ...dim_n, channel]
 // ratio is of dimension [batch, dim_0, ...dim_n, 2]
 // gather_indices is of dimension [batch, dim_0, ...dim_n, 3]
+// data_shape is of dimension [batch, x(width), y(height), channel]
 //
 // Output:
 // scatter-add to each 2x2 grad_data neighbor:
@@ -202,10 +243,12 @@ XlaOp ScatterToGradData(XlaOpKernelContext* ctx, XlaOp grad_data, XlaOp indices,
 //  grad_data[cx, fy, chan] += output_grad * (1 - dx) * dy
 //  grad_data[fx, cy, chan] += output_grad * dx * (1 - dy)
 //  grad_data[cx, cy, chan] += output_grad * (1 - dx) * (1 - dy)
-// where (dx, dy) is (1 - ratio).
+// where (dx, dy) is (1 - ratio). If (dx, dy) is out of bound, then the their
+// contribution is 0 to 'grad_data'.
 XlaOp CalculateGradData(XlaOpKernelContext* ctx, XlaOp grad_output, XlaOp ratio,
-                        XlaOp gather_indices, xla::PrimitiveType warp_type,
-                        TensorShape warp_shape, int64 data_channels,
+                        XlaOp gather_indices, XlaOp warp,
+                        xla::PrimitiveType warp_type, TensorShape warp_shape,
+                        int64 last_warp_dim, int64 data_channels,
                         xla::Shape data_shape) {
   // Weights tensor has dimension [batch, dim_0, ... dim_n, 4].
   auto weights = BilinearWeights(ctx, ratio, warp_shape, warp_type);
@@ -230,6 +273,18 @@ XlaOp CalculateGradData(XlaOpKernelContext* ctx, XlaOp grad_output, XlaOp ratio,
   std::iota(reshaped_weights_indices.begin(), reshaped_weights_indices.end(),
             0);
 
+  // Set out of bound weights to 0.
+  // The dimension of the reshaped_weight: [batch, dim_0, ...dim_n, 2, 2].
+  std::vector<int64> reshaped_result_dims(warp_dims.begin(),
+                                          warp_dims.end() - 1);
+  reshaped_result_dims.push_back(2);
+  reshaped_result_dims.push_back(2);
+  std::vector<int64> broadcasted_dims(warp_dims.size() - 1);
+  std::iota(broadcasted_dims.begin(), broadcasted_dims.end(), 0);
+  reshaped_weights = BoundSamples(ctx, warp, warp_type, warp_shape,
+                                  reshaped_result_dims, broadcasted_dims,
+                                  last_warp_dim, data_shape, reshaped_weights);
+
   // The dimension is [batch, dim_0, ..., dim_n, 2, 2, data_channel].
   auto broadcast_reshaped_weights = xla::BroadcastInDim(
       reshaped_weights, weights_with_channels_dims, reshaped_weights_indices);
@@ -246,18 +301,41 @@ XlaOp CalculateGradData(XlaOpKernelContext* ctx, XlaOp grad_output, XlaOp ratio,
   auto grad_data = xla::ConstantLiteral(
       ctx->builder(), xla::Literal::CreateFromShape(data_shape));
 
-  return ScatterToGradData(ctx, grad_data, gather_indices,
-                           grad_output_multiply_weights, warp_shape.dims(),
-                           warp_type);
+  // Pad grad data then slice it back.
+  //
+  // After left and right column 0-padding, the new dimension of padded data
+  // will be [batch, x+2, y+2, channel].
+  auto padded_grad_data =
+      xla::Pad(grad_data, xla::Zero(ctx->builder(), warp_type),
+               xla::MakeEdgePaddingConfig({{0, 0}, {1, 1}, {1, 1}, {0, 0}}));
+
+  auto shifting_value = xla::ConstantR1<int32>(
+      ctx->builder(), {/*batch=*/0, /*x(width)=*/1, /*y(height)=*/1});
+  auto shifted_gather_indices =
+      xla::Add(gather_indices, shifting_value, {last_warp_dim});
+
+  auto updated_grad_data = ScatterToGradData(
+      ctx, padded_grad_data, shifted_gather_indices,
+      grad_output_multiply_weights, warp_shape.dims(), warp_type);
+
+  const int64 batch_size = data_shape.dimensions(0);
+  const int64 width = data_shape.dimensions(1);
+  const int64 height = data_shape.dimensions(2);
+  // Slice out the result accounting for the padding.
+  return xla::Slice(
+      updated_grad_data, /*start_indices=*/{0, 1, 1, 0},
+      /*limit_indices=*/{batch_size, width + 1, height + 1, data_channels},
+      /*strides=*/{1, 1, 1, 1});
 }
 
 // Build computation for the backprop into input 'warp'.
 // Where input:
-// warp is of dimension [batch, dim_0, ...dim_n, 2]
-// grad_output is of dimension [batch, dim_0, ...dim_n, channel]
-// ratio is of dimension [batch, dim_0, ...dim_n, 2]
-// gather_indices is of dimension [batch, dim_0, ...dim_n, 3]
-// data is of dimension [batch, x, y, channel]
+//  warp is of dimension [batch, dim_0, ...dim_n, 2]
+//  grad_output is of dimension [batch, dim_0, ...dim_n, channel]
+//  ratio is of dimension [batch, dim_0, ...dim_n, 2]
+//  gather_indices is of dimension [batch, dim_0, ...dim_n, 3] where the last
+//  dimension of size 3 is for {batch, x(width), y(height)}.
+//  data is of dimension [batch, x, y, channel]
 //
 // Output (simplified by ignoring the batch dimensions):
 // Since the forward path has:
@@ -276,12 +354,12 @@ XlaOp CalculateGradData(XlaOpKernelContext* ctx, XlaOp grad_output, XlaOp ratio,
 //    grad_warp_x = py * (img_cxcy - img_fxcy) + (1-py) * (img_cxfy-img_fxfy)
 //    grad_warp_y = px * (img_cxcy - img_cxfy) + (1-px) * (img_fxcy-img_fxfy)
 //
-// where (px, py) is warp, (fx, fy) is the left top corner and (cx, cy) is the
+// where (px, py) is warp, (fx, fy) is the top left corner and (cx, cy) is the
 // bottom right corner in a 2x2 neighborhood.
 XlaOp CalculateGradWarp(XlaOpKernelContext* ctx, XlaOp grad_output, XlaOp ratio,
                         XlaOp gather_indices, XlaOp data,
                         TensorShape warp_shape, int64 data_channels,
-                        xla::PrimitiveType data_type) {
+                        xla::PrimitiveType data_type, xla::Shape data_shape) {
   auto warp_dims = warp_shape.dim_sizes();
   std::vector<int64> warp_dims_without_last_dims(warp_dims.begin(),
                                                  warp_dims.end() - 1);
@@ -290,12 +368,30 @@ XlaOp CalculateGradWarp(XlaOpKernelContext* ctx, XlaOp grad_output, XlaOp ratio,
   std::vector<int64> neighbor_broadcast_dims = warp_dims_without_last_dims;
   neighbor_broadcast_dims.push_back(4);
 
-  // The dimension is [batch, dim_0, ... dim_n, 4, data_channels]
-  auto neighbors_data = Gather2by2Neighbors(
-      ctx->builder(), data, gather_indices, data_channels, warp_shape.dims());
+  // With dimension [batch, dim_0, ...dim_n, 4]
+  auto neighbor_broadcast_shape =
+      xla::ShapeUtil::MakeShape(data_type, neighbor_broadcast_dims);
 
   const int64 last_warp_dim = warp_shape.dims() - 1;
 
+  // Pad data with 0, before gathering such that 0 will be returned for samples
+  // in the range of (-1, 0) or (image_dimension-1, image_dimension).
+  // After left and right column 0-padding, the new dimension of padded data
+  // will be [batch, x+2, y+2, channel].
+  auto padded_data =
+      xla::Pad(data, xla::Zero(ctx->builder(), data_type),
+               xla::MakeEdgePaddingConfig({{0, 0}, {1, 1}, {1, 1}, {0, 0}}));
+
+  auto shifting_value = xla::ConstantR1<int32>(
+      ctx->builder(), {/*batch=*/0, /*x(width)=*/1, /*y(height)=*/1});
+  auto shifted_gather_indices =
+      xla::Add(gather_indices, shifting_value, {last_warp_dim});
+
+  // The dimension is [batch, dim_0, ... dim_n, 4, data_channels]
+  auto neighbors_data =
+      Gather2by2Neighbors(ctx->builder(), padded_data, shifted_gather_indices,
+                          data_channels, warp_shape.dims());
+
   // Since we will be creating the dot product of:
   //  lhs: [batch, dim_0, ...dim_n, 4]
   // and
@@ -418,7 +514,7 @@ class ResamplerOp : public XlaOpKernel {
     // Find the coordinates of the top left corner for the 2x2 region to be
     // sampled from. The dimensions are [batch, dim_0, ... dim_n, 2] where the
     // last dimension of size 2 in turn is [x, y].
-    XlaOp top_left = xla::ConvertElementType(warp, xla::U32);
+    XlaOp top_left = xla::ConvertElementType(warp, xla::S32);
 
     auto gather_indices = ConcatenateIota(ctx->builder(), top_left, warp_shape);
 
@@ -527,7 +623,8 @@ class ResamplerGradOp : public XlaOpKernel {
                                           size, "]"));
     }
     // Last dimension of warp shape must be of size 2.
-    OP_REQUIRES(ctx, warp_shape.dim_size(warp_shape.dims() - 1) == 2,
+    const int64 last_warp_dim = warp_shape.dims() - 1;
+    OP_REQUIRES(ctx, warp_shape.dim_size(last_warp_dim) == 2,
                 errors::InvalidArgument(
                     "the last dimension of warp must be exactly size 2."));
     xla::PrimitiveType warp_type = ctx->input_xla_type(1);
@@ -550,24 +647,32 @@ class ResamplerGradOp : public XlaOpKernel {
     // Find the top left corner coordinate for the region to be sampled from.
     // The dimensions are [batch, dim_0, ... dim_n, 2] where the last dimension
     // of size 2 in turn is [x, y].
-    XlaOp top_left = xla::ConvertElementType(warp, xla::U32);
+    XlaOp top_left = xla::ConvertElementType(xla::Floor(warp), xla::S32);
 
-    // Dimensions are [batch, dim_0, ... dim_n, 2]
+    // Dimensions are [batch, dim_0, ... dim_n, 2].
     XlaOp ratio = warp - xla::ConvertElementType(top_left, warp_type);
 
     // Indices for gathering neighboring pixels.
     auto gather_indices = ConcatenateIota(ctx->builder(), top_left, warp_shape);
 
-    auto grad_data =
-        CalculateGradData(ctx, grad_output, ratio, gather_indices, warp_type,
-                          warp_shape, data_channels, data_shape);
+    auto grad_data = CalculateGradData(
+        ctx, grad_output, ratio, gather_indices, warp, warp_type, warp_shape,
+        last_warp_dim, data_channels, data_shape);
 
     auto grad_warp =
         CalculateGradWarp(ctx, grad_output, ratio, gather_indices, data,
-                          warp_shape, data_channels, data_type);
+                          warp_shape, data_channels, data_type, data_shape);
+    auto warp_dims = warp_shape.dim_sizes();
+    std::vector<int64> result_dims(warp_dims.begin(), warp_dims.end() - 1);
+    result_dims.push_back(2);
+    std::vector<int64> broadcasted_dims(warp_dims.size() - 1);
+    std::iota(broadcasted_dims.begin(), broadcasted_dims.end(), 0);
+    auto grad_warp_bounded =
+        BoundSamples(ctx, warp, warp_type, warp_shape, result_dims,
+                     broadcasted_dims, last_warp_dim, data_shape, grad_warp);
 
     ctx->SetOutput(0, grad_data);
-    ctx->SetOutput(1, grad_warp);
+    ctx->SetOutput(1, grad_warp_bounded);
   }
 };
 
diff --git a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc
index 5db52781be473a9a1aef0adf105e3edf69ccd306..50653d7b3973b73d580cdeec5d71943b575d7cc9 100644
--- a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc
@@ -23,7 +23,6 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
 #include "tensorflow/compiler/xla/client/lib/math.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
 #include "tensorflow/compiler/xla/client/lib/prng.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/topk_op.cc b/tensorflow/compiler/tf2xla/kernels/topk_op.cc
index 8a0c94cfae1b298bd62a3231caf39ecf9b32880e..ee3bdf3394e37c757f31724e73e95417becaa534 100644
--- a/tensorflow/compiler/tf2xla/kernels/topk_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/topk_op.cc
@@ -15,7 +15,6 @@ limitations under the License.
 
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
 #include "tensorflow/compiler/xla/client/lib/sorting.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/training_ops.cc b/tensorflow/compiler/tf2xla/kernels/training_ops.cc
index 960c1462ceb8c00a2d6c96564f6c985fd1caef0f..26d4214099d1d07c1b2e275d783654d9cd948e28 100644
--- a/tensorflow/compiler/tf2xla/kernels/training_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/training_ops.cc
@@ -172,6 +172,65 @@ class ResourceApplyMomentum : public XlaOpKernel {
 REGISTER_XLA_OP(Name("ResourceApplyMomentum").TypeConstraint("T", kFloatTypes),
                 ResourceApplyMomentum);
 
+class ResourceApplyKerasMomentum : public XlaOpKernel {
+ public:
+  explicit ResourceApplyKerasMomentum(OpKernelConstruction* ctx)
+      : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_nesterov", &use_nesterov_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    DataType type = ctx->input_type(2);
+
+    TensorShape var_shape, accum_shape;
+    xla::XlaOp var, accum;
+    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, type, &var_shape, &var));
+    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, type, &accum_shape, &accum));
+
+    OP_REQUIRES(ctx, var_shape.IsSameSize(accum_shape),
+                errors::InvalidArgument(
+                    "var and accum do not have the same shape",
+                    var_shape.DebugString(), " ", accum_shape.DebugString()));
+
+    TensorShape lr_shape = ctx->InputShape(2);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr_shape),
+                errors::InvalidArgument("lr is not a scalar: ",
+                                        lr_shape.DebugString()));
+
+    TensorShape grad_shape = ctx->InputShape(3);
+    OP_REQUIRES(ctx, var_shape.IsSameSize(grad_shape),
+                errors::InvalidArgument(
+                    "var and grad do not have the same shape",
+                    var_shape.DebugString(), " ", grad_shape.DebugString()));
+
+    TensorShape momentum_shape = ctx->InputShape(4);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(momentum_shape),
+                errors::InvalidArgument("momentum is not a scalar: ",
+                                        momentum_shape.DebugString()));
+
+    xla::XlaOp lr = ctx->Input(2);
+    xla::XlaOp grad = ctx->Input(3);
+    xla::XlaOp momentum = ctx->Input(4);
+
+    accum = accum * momentum - grad * lr;
+    if (use_nesterov_) {
+      // See https://github.com/tensorflow/tensorflow/pull/2798 for an
+      // explanation of the reparameterization used here.
+      var = var + accum * momentum - grad * lr;
+    } else {
+      var = var + accum;
+    }
+    OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, type, var));
+    OP_REQUIRES_OK(ctx, ctx->AssignVariable(1, type, accum));
+  }
+
+ private:
+  bool use_nesterov_;
+};
+REGISTER_XLA_OP(
+    Name("ResourceApplyKerasMomentum").TypeConstraint("T", kFloatTypes),
+    ResourceApplyKerasMomentum);
+
 class ResourceApplyAdagrad : public XlaOpKernel {
  public:
   explicit ResourceApplyAdagrad(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
diff --git a/tensorflow/compiler/tf2xla/kernels/while_op.cc b/tensorflow/compiler/tf2xla/kernels/while_op.cc
index ce007fc04a818869686b9936a1607cee42665e87..89b577bfc05b4665d492f4ea5cf6f869af2fa9a9 100644
--- a/tensorflow/compiler/tf2xla/kernels/while_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/while_op.cc
@@ -41,8 +41,7 @@ Status MakeXlaCompilerArgumentsFromInputs(
   *has_uninitialized_vars = false;
   *has_tensor_arrays = false;
   for (int i = 0; i < ctx->num_inputs(); ++i) {
-    VLOG(2) << " Input " << i
-            << " type: " << DataTypeString(ctx->input_type(i))
+    VLOG(2) << " Input " << i << " type: " << DataTypeString(ctx->input_type(i))
             << " shape: " << ctx->InputShape(i).DebugString();
     XlaCompiler::Argument& arg = (*args)[i];
     DataType type = ctx->input_type(i);
@@ -233,13 +232,22 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) {
           xla::ShapeUtil::HumanString(body_input_shape), " vs. ",
           xla::ShapeUtil::HumanString(body.xla_output_shape)));
 
-  xla::Shape expected_cond_output_shape = xla::ShapeUtil::MakeTupleShape(
-      {xla::ShapeUtil::MakeShape(xla::PRED, {})});
+  xla::Shape expected_cond_output_shape_without_side_effect =
+      xla::ShapeUtil::MakeTupleShape(
+          {xla::ShapeUtil::MakeShape(xla::PRED, {})});
+  xla::Shape expected_cond_output_shape_with_side_effect =
+      xla::ShapeUtil::MakeTupleShape({xla::ShapeUtil::MakeShape(xla::PRED, {}),
+                                      xla::ShapeUtil::MakeTokenShape()});
   OP_REQUIRES(ctx,
-              xla::ShapeUtil::Compatible(cond.xla_output_shape,
-                                         expected_cond_output_shape),
+              xla::ShapeUtil::Compatible(
+                  cond.xla_output_shape,
+                  expected_cond_output_shape_without_side_effect) ||
+                  xla::ShapeUtil::Compatible(
+                      cond.xla_output_shape,
+                      expected_cond_output_shape_with_side_effect),
               errors::InvalidArgument(
-                  "Output shape of loop condition should be (pred[]), got: ",
+                  "Output shape of loop condition should be (pred[]) or "
+                  "(pred[], token[]), got: ",
                   xla::ShapeUtil::HumanString(cond.xla_output_shape)));
 
   int num_inputs = body.input_mapping.size();
diff --git a/tensorflow/compiler/tf2xla/lib/BUILD b/tensorflow/compiler/tf2xla/lib/BUILD
index 1ce3930fd1cd91f8e8dfb765b49be2dc969d1bd7..3d7b0bc959f9dbf3c1b9749379e2ea0d285b302b 100644
--- a/tensorflow/compiler/tf2xla/lib/BUILD
+++ b/tensorflow/compiler/tf2xla/lib/BUILD
@@ -15,22 +15,6 @@ filegroup(
     ]),
 )
 
-load("//tensorflow/compiler/xla/tests:build_defs.bzl", "xla_test")
-
-cc_library(
-    name = "batch_dot",
-    srcs = ["batch_dot.cc"],
-    hdrs = ["batch_dot.h"],
-    deps = [
-        "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla:status_macros",
-        "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:xla_builder",
-        "//tensorflow/core:lib",
-    ],
-)
-
 cc_library(
     name = "broadcast",
     srcs = ["broadcast.cc"],
@@ -47,26 +31,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "cholesky",
-    srcs = ["cholesky.cc"],
-    hdrs = ["cholesky.h"],
-    deps = [
-        ":batch_dot",
-        ":triangular_solve",
-        ":util",
-        ":while_loop",
-        "//tensorflow/compiler/xla:literal",
-        "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla:status_macros",
-        "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:xla_builder",
-        "//tensorflow/compiler/xla/client/lib:constants",
-        "//tensorflow/core:lib",
-    ],
-)
-
 cc_library(
     name = "random",
     srcs = ["random.cc"],
@@ -82,35 +46,12 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "qr",
-    srcs = ["qr.cc"],
-    hdrs = ["qr.h"],
-    deps = [
-        ":batch_dot",
-        ":util",
-        ":while_loop",
-        "//tensorflow/compiler/xla:literal_util",
-        "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla:status_macros",
-        "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:xla_builder",
-        "//tensorflow/compiler/xla/client/lib:arithmetic",
-        "//tensorflow/compiler/xla/client/lib:constants",
-        "//tensorflow/compiler/xla/client/lib:math",
-        "//tensorflow/compiler/xla/client/lib:numeric",
-        "//tensorflow/core:lib",
-    ],
-)
-
 cc_library(
     name = "scatter",
     srcs = ["scatter.cc"],
     hdrs = ["scatter.h"],
     deps = [
         ":util",
-        ":while_loop",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
@@ -124,51 +65,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "triangular_solve",
-    srcs = ["triangular_solve.cc"],
-    hdrs = ["triangular_solve.h"],
-    deps = [
-        ":batch_dot",
-        ":util",
-        "//tensorflow/compiler/xla:literal",
-        "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla:status_macros",
-        "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:xla_builder",
-        "//tensorflow/compiler/xla/client:xla_computation",
-        "//tensorflow/compiler/xla/client/lib:constants",
-        "//tensorflow/compiler/xla/client/lib:numeric",
-        "//tensorflow/core:lib",
-    ],
-)
-
-xla_test(
-    name = "triangular_solve_test",
-    srcs = ["triangular_solve_test.cc"],
-    tags = ["noasan"],  # sometimes times out, http://b/78650012
-    deps = [
-        ":triangular_solve",
-        "//tensorflow/compiler/xla:array2d",
-        "//tensorflow/compiler/xla:literal",
-        "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla:test",
-        "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:global_data",
-        "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client:xla_builder",
-        "//tensorflow/compiler/xla/tests:client_library_test_base",
-        "//tensorflow/compiler/xla/tests:literal_test_util",
-        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:test",
-    ],
-)
-
 cc_library(
     name = "util",
     srcs = ["util.cc"],
@@ -186,42 +82,3 @@ cc_library(
         "@com_google_absl//absl/types:span",
     ],
 )
-
-xla_test(
-    name = "util_test",
-    srcs = ["util_test.cc"],
-    deps = [
-        ":batch_dot",
-        ":util",
-        "//tensorflow/compiler/xla:array2d",
-        "//tensorflow/compiler/xla:literal",
-        "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla:test",
-        "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:global_data",
-        "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/tests:client_library_test_base",
-        "//tensorflow/compiler/xla/tests:literal_test_util",
-        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:test",
-    ],
-)
-
-cc_library(
-    name = "while_loop",
-    srcs = ["while_loop.cc"],
-    hdrs = ["while_loop.h"],
-    deps = [
-        ":util",
-        "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla:status_macros",
-        "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla/client:xla_builder",
-        "//tensorflow/compiler/xla/client:xla_computation",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/types:span",
-    ],
-)
diff --git a/tensorflow/compiler/tf2xla/lib/batch_dot.cc b/tensorflow/compiler/tf2xla/lib/batch_dot.cc
deleted file mode 100644
index 5400e8834cb9807f6dd71abe7789b2672e29e905..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/tf2xla/lib/batch_dot.cc
+++ /dev/null
@@ -1,115 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/tf2xla/lib/batch_dot.h"
-
-#include <memory>
-#include <vector>
-
-#include "tensorflow/compiler/xla/client/xla_builder.h"
-#include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/status_macros.h"
-#include "tensorflow/compiler/xla/statusor.h"
-#include "tensorflow/core/lib/core/errors.h"
-
-namespace tensorflow {
-
-xla::XlaOp BatchDot(xla::XlaOp x, xla::XlaOp y, bool transpose_x,
-                    bool transpose_y, bool conjugate_x, bool conjugate_y,
-                    xla::PrecisionConfig::Precision precision) {
-  xla::XlaBuilder* builder = x.builder();
-  return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
-    TF_ASSIGN_OR_RETURN(xla::Shape x_shape, builder->GetShape(x));
-    TF_ASSIGN_OR_RETURN(xla::Shape y_shape, builder->GetShape(y));
-
-    // Check that both tensors have the same number of dimensions. There must be
-    // at least two (the batch dimensions can be empty).
-    if (xla::ShapeUtil::Rank(x_shape) != xla::ShapeUtil::Rank(y_shape)) {
-      return errors::InvalidArgument(
-          "Arguments to BatchedDot have different ranks: ",
-          xla::ShapeUtil::HumanString(x_shape), " vs. ",
-          xla::ShapeUtil::HumanString(y_shape));
-    }
-    const int ndims = xla::ShapeUtil::Rank(x_shape);
-    if (ndims < 2) {
-      return errors::InvalidArgument(
-          "Arguments to BatchedDot must have rank >= 2: ", ndims);
-    }
-
-    // The batch dimensions must be equal and the matrix dimensions must be
-    // valid.
-    std::vector<int64> batch_dimension_numbers;
-    for (int i = 0; i < ndims - 2; ++i) {
-      if (x_shape.dimensions(i) != y_shape.dimensions(i)) {
-        return errors::InvalidArgument(
-            "Dimension ", i, " of inputs to BatchedDot must be equal: ",
-            xla::ShapeUtil::HumanString(x_shape), " vs ",
-            xla::ShapeUtil::HumanString(y_shape));
-      }
-      batch_dimension_numbers.push_back(i);
-    }
-
-    int x_inner_dim = transpose_x ? (ndims - 2) : (ndims - 1);
-    int y_inner_dim = transpose_y ? (ndims - 1) : (ndims - 2);
-    if (x_shape.dimensions(x_inner_dim) != y_shape.dimensions(y_inner_dim)) {
-      return errors::InvalidArgument(
-          "Dimensions ", x_inner_dim, " and ", y_inner_dim,
-          " of arguments to BatchedDot must be equal: ",
-          xla::ShapeUtil::HumanString(x_shape), " transpose: ", transpose_x,
-          " vs. ", xla::ShapeUtil::HumanString(y_shape),
-          " transpose: ", transpose_y);
-    }
-
-    // Check for zero lhs/rhs dim size.
-    if (xla::ShapeUtil::IsZeroElementArray(x_shape) ||
-        xla::ShapeUtil::IsZeroElementArray(y_shape)) {
-      std::vector<int64> dimensions(batch_dimension_numbers.size());
-      for (int i = 0; i < batch_dimension_numbers.size(); ++i) {
-        dimensions[i] = x_shape.dimensions(batch_dimension_numbers[i]);
-      }
-      int x_outer_dim = transpose_x ? (ndims - 1) : (ndims - 2);
-      int y_outer_dim = transpose_y ? (ndims - 2) : (ndims - 1);
-      dimensions.push_back(x_shape.dimensions(x_outer_dim));
-      dimensions.push_back(y_shape.dimensions(y_outer_dim));
-      return xla::Broadcast(
-          xla::ConstantLiteral(builder,
-                               xla::LiteralUtil::Zero(x_shape.element_type())),
-          dimensions);
-    }
-
-    if (x_shape.element_type() == xla::C64 && conjugate_x) {
-      x = xla::Conj(x);
-    }
-    if (y_shape.element_type() == xla::C64 && conjugate_y) {
-      y = xla::Conj(y);
-    }
-
-    xla::PrecisionConfig precision_proto;
-    precision_proto.add_operand_precision(precision);
-    precision_proto.add_operand_precision(precision);
-
-    xla::DotDimensionNumbers dot_dnums;
-    dot_dnums.add_lhs_contracting_dimensions(x_inner_dim);
-    dot_dnums.add_rhs_contracting_dimensions(y_inner_dim);
-    for (auto batch_dimension_number : batch_dimension_numbers) {
-      dot_dnums.add_lhs_batch_dimensions(batch_dimension_number);
-      dot_dnums.add_rhs_batch_dimensions(batch_dimension_number);
-    }
-
-    return xla::DotGeneral(x, y, dot_dnums, &precision_proto);
-  });
-}
-
-}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/lib/batch_dot.h b/tensorflow/compiler/tf2xla/lib/batch_dot.h
deleted file mode 100644
index 6edd63a4d3b66c21aa4cce8c9f36eef0dc363cd8..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/tf2xla/lib/batch_dot.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_TF2XLA_LIB_BATCH_DOT_H_
-#define TENSORFLOW_COMPILER_TF2XLA_LIB_BATCH_DOT_H_
-
-#include "tensorflow/compiler/xla/client/xla_builder.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
-
-namespace tensorflow {
-
-// Multiplies slices of two tensors in batches.
-
-// Multiplies all slices of `Tensor` `x` and `y` (each slice can be
-// viewed as an element of a batch), and arranges the individual results
-// in a single output tensor of the same batch size. Each of the
-// individual slices can optionally be transposed before multiplication by
-// setting the `transpose_x` or `transpose_y` flag to `true`. Similarly, each
-// can be elementwise-complex-conjugated by setting the `conjugate_x` or
-// `conjugate_y` flag to `true`. To apply a Hermitian adjoint to `x`, set both
-// `transpose_x` and `conjugate_x` to `true`, and analogously for `y`.
-//
-// The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]`
-// and `[..., r_y, c_y]`.
-//
-// The output tensor is 2-D or higher with shape `[..., r_o, c_o]`, where:
-//
-//     r_o = c_x if transpose_x else r_x
-//     c_o = r_y if transpose_y else c_y
-//
-// It is computed as:
-//
-//     output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :])
-xla::XlaOp BatchDot(
-    xla::XlaOp x, xla::XlaOp y, bool transpose_x = false,
-    bool transpose_y = false, bool conjugate_x = false,
-    bool conjugate_y = false,
-    xla::PrecisionConfig::Precision precision = xla::PrecisionConfig::DEFAULT);
-
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_COMPILER_TF2XLA_LIB_BATCH_DOT_H_
diff --git a/tensorflow/compiler/tf2xla/lib/scatter.cc b/tensorflow/compiler/tf2xla/lib/scatter.cc
index 2b1c2ced925d9fee7392986015a6e716a94d356f..688056791f9750e6b22df4b2cd4643de0b780651 100644
--- a/tensorflow/compiler/tf2xla/lib/scatter.cc
+++ b/tensorflow/compiler/tf2xla/lib/scatter.cc
@@ -20,7 +20,6 @@ limitations under the License.
 
 #include "absl/types/span.h"
 #include "tensorflow/compiler/tf2xla/lib/util.h"
-#include "tensorflow/compiler/tf2xla/lib/while_loop.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
diff --git a/tensorflow/compiler/tf2xla/lib/util.cc b/tensorflow/compiler/tf2xla/lib/util.cc
index 804671fbc75b0a5a6e04b204822b6f084013cd8b..c0bd172d17c192435ba8ee196f9def0491c0bf5c 100644
--- a/tensorflow/compiler/tf2xla/lib/util.cc
+++ b/tensorflow/compiler/tf2xla/lib/util.cc
@@ -113,36 +113,6 @@ xla::XlaOp IntegerLiteral(xla::XlaBuilder* builder, xla::PrimitiveType type,
   return xla::ConstantLiteral(builder, literal);
 }
 
-xla::XlaOp SliceInMinorDims(xla::XlaOp x, absl::Span<const int64> start,
-                            absl::Span<const int64> end) {
-  xla::XlaBuilder* builder = x.builder();
-  return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
-    TF_RET_CHECK(start.size() == end.size());
-    int64 n_minor_dims = start.size();
-
-    TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(x));
-
-    const int64 n_dims = xla::ShapeUtil::Rank(shape);
-    TF_RET_CHECK(n_minor_dims <= n_dims);
-    auto major_dims = xla::AsInt64Slice(shape.dimensions())
-                          .subspan(
-                              /*pos=*/0,
-                              /*len=*/n_dims - n_minor_dims);
-
-    // Prepends 0s in the major dim
-    std::vector<int64> padded_start(n_dims, 0);
-    std::copy(start.begin(), start.end(),
-              padded_start.begin() + major_dims.size());
-
-    // Prepends the shape of the major dims.
-    std::vector<int64> padded_end(n_dims);
-    std::copy(major_dims.begin(), major_dims.end(), padded_end.begin());
-    std::copy(end.begin(), end.end(), padded_end.begin() + major_dims.size());
-
-    std::vector<int64> strides(n_dims, 1);
-    return xla::Slice(x, padded_start, padded_end, strides);
-  });
-}
 
 std::vector<int64> ConcatVectors(absl::Span<const int64> xs,
                                  absl::Span<const int64> ys) {
@@ -152,100 +122,4 @@ std::vector<int64> ConcatVectors(absl::Span<const int64> xs,
   return output;
 }
 
-xla::XlaOp DynamicSliceInMinorDims(xla::XlaOp x,
-                                   absl::Span<const xla::XlaOp> starts,
-                                   absl::Span<const int64> sizes) {
-  xla::XlaBuilder* builder = x.builder();
-  return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
-    TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(x));
-    const int64 n_dims = xla::ShapeUtil::Rank(shape);
-    int64 n_minor_dims = starts.size();
-    TF_RET_CHECK(n_minor_dims == sizes.size());
-    TF_RET_CHECK(n_minor_dims <= n_dims);
-    auto major_dims = xla::AsInt64Slice(shape.dimensions())
-                          .subspan(
-                              /*pos=*/0,
-                              /*len=*/n_dims - sizes.size());
-    auto padded_starts = PrependZerosInMajorDims(x, starts);
-    auto padded_sizes = ConcatVectors(major_dims, sizes);
-    return xla::DynamicSlice(x, padded_starts, padded_sizes);
-  });
-}
-
-xla::XlaOp UpdateSlice(xla::XlaOp x, xla::XlaOp update,
-                       absl::Span<const int64> start) {
-  xla::XlaBuilder* builder = x.builder();
-  return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
-    // TODO(phawkins): make int64 work on all backends, remove the int32 cast.
-    std::vector<int32> start_as_int32(start.begin(), start.end());
-    auto start_constant = xla::ConstantR1<int32>(builder, start_as_int32);
-    TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(x));
-    const int64 n_dims = xla::ShapeUtil::Rank(shape);
-    TF_ASSIGN_OR_RETURN(xla::Shape start_constant_shape,
-                        builder->GetShape(start_constant));
-    const int64 start_length =
-        xla::ShapeUtil::GetDimension(start_constant_shape, -1);
-    TF_RET_CHECK(start_length == n_dims);
-    return xla::DynamicUpdateSlice(x, update, start_constant);
-  });
-}
-
-xla::XlaOp UpdateSliceInMinorDims(xla::XlaOp x, xla::XlaOp update,
-                                  absl::Span<const int64> start) {
-  xla::XlaBuilder* builder = x.builder();
-  return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
-    TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(x));
-    const int64 n_dims = xla::ShapeUtil::Rank(shape);
-    const int64 n_minor_dims = start.size();
-    TF_RET_CHECK(n_minor_dims <= n_dims);
-    std::vector<int64> padded_start(n_dims, 0);
-    std::copy(start.begin(), start.end(),
-              padded_start.begin() + (n_dims - n_minor_dims));
-    return UpdateSlice(x, update, padded_start);
-  });
-}
-
-xla::XlaOp DynamicUpdateSliceInMinorDims(xla::XlaOp x, xla::XlaOp update,
-                                         absl::Span<const xla::XlaOp> starts) {
-  auto padded_starts = PrependZerosInMajorDims(x, starts);
-  return xla::DynamicUpdateSlice(x, update, padded_starts);
-}
-
-xla::XlaOp PrependZerosInMajorDims(xla::XlaOp x,
-                                   absl::Span<const xla::XlaOp> starts) {
-  xla::XlaBuilder* builder = x.builder();
-  return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
-    TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(x));
-    const int64 n_dims = xla::ShapeUtil::Rank(shape);
-    auto zero = xla::Reshape(xla::ConstantR0<int32>(builder, 0), {1});
-    std::vector<xla::XlaOp> padded_starts(n_dims, zero);
-    for (int i = 0; i < starts.size(); ++i) {
-      padded_starts[n_dims - starts.size() + i] = xla::Reshape(starts[i], {1});
-    }
-    return xla::ConcatInDim(builder, padded_starts, 0);
-  });
-}
-
-xla::XlaOp TransposeInMinorDims(xla::XlaOp x) {
-  xla::XlaBuilder* builder = x.builder();
-  return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
-    TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(x));
-    const int64 n_dims = xla::ShapeUtil::Rank(shape);
-    TF_RET_CHECK(n_dims >= 2);
-    std::vector<int64> permutation(n_dims);
-    std::iota(permutation.begin(), permutation.end(), 0);
-    std::swap(permutation[n_dims - 1], permutation[n_dims - 2]);
-    return xla::Transpose(x, permutation);
-  });
-}
-
-xla::XlaOp MaybeConjugate(xla::XlaOp x, bool conjugate) {
-  xla::XlaBuilder* builder = x.builder();
-  return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
-    TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(x));
-    auto perform_conj = shape.element_type() == xla::C64 && conjugate;
-    return perform_conj ? xla::Conj(x) : x;
-  });
-}
-
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/lib/util.h b/tensorflow/compiler/tf2xla/lib/util.h
index 80e9e5b002d49581209e608b98606e02709c5876..aec8061cb4322b8d315b6cdc80c7fff1e0cb4cb1 100644
--- a/tensorflow/compiler/tf2xla/lib/util.h
+++ b/tensorflow/compiler/tf2xla/lib/util.h
@@ -38,44 +38,10 @@ xla::XlaOp PrependZerosInMajorDims(xla::XlaOp x,
 xla::XlaOp IntegerLiteral(xla::XlaBuilder* builder, xla::PrimitiveType type,
                           int64 value);
 
-// Builds a vector of zeros of length rank(x) with the last values being
-// those in `starts`.
-xla::XlaOp PrependZerosInMajorDims(xla::XlaOp x,
-                                   absl::Span<const xla::XlaOp> starts);
-
-// Performs a slice in the minor dimensions of a Tensor.
-xla::XlaOp SliceInMinorDims(xla::XlaOp x, absl::Span<const int64> start,
-                            absl::Span<const int64> end);
-
 // Returns the concatenation of `xs` and `ys`.
 std::vector<int64> ConcatVectors(absl::Span<const int64> xs,
                                  absl::Span<const int64> ys);
 
-// Performs a dynamic slice in the minor dimensions of a Tensor.
-xla::XlaOp DynamicSliceInMinorDims(xla::XlaOp x,
-                                   absl::Span<const xla::XlaOp> starts,
-                                   absl::Span<const int64> sizes);
-
-// Updates a slice of 'x', i.e.,
-// x[start[0], ..., start[n]] = update
-xla::XlaOp UpdateSlice(xla::XlaOp x, xla::XlaOp update,
-                       absl::Span<const int64> start);
-
-// Updates a slice of 'x', where 'start' contains a list of minor dimensions:
-// x[..., start[0], ..., start[n]] = update
-xla::XlaOp UpdateSliceInMinorDims(xla::XlaOp x, xla::XlaOp update,
-                                  absl::Span<const int64> start);
-
-xla::XlaOp DynamicUpdateSliceInMinorDims(xla::XlaOp x, xla::XlaOp update,
-                                         absl::Span<const xla::XlaOp> starts);
-
-// Transposes a stack of matrices `x` by swapping the last two dimensions.
-xla::XlaOp TransposeInMinorDims(xla::XlaOp x);
-
-// Applies a complex conjugation operation if `a` is complex and `conjugate_a`
-// is true, otherwise returns its argument.
-xla::XlaOp MaybeConjugate(xla::XlaOp x, bool conjugate);
-
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_COMPILER_TF2XLA_LIB_UTIL_H_
diff --git a/tensorflow/compiler/tf2xla/resource_operation_table.cc b/tensorflow/compiler/tf2xla/resource_operation_table.cc
index 72b240996fb4d9dcb5f5dfd919da618cbae08c16..ff9f1b9ccba2c4f3307890d5aac4ddb6cfaafcd9 100644
--- a/tensorflow/compiler/tf2xla/resource_operation_table.cc
+++ b/tensorflow/compiler/tf2xla/resource_operation_table.cc
@@ -65,6 +65,7 @@ CreateResourceOpInfoMap() {
   add("ResourceApplyFtrlV2"                  , kReadWrite, kVariable);
   add("ResourceApplyGradientDescent"         , kReadWrite, kVariable);
   add("ResourceApplyMomentum"                , kReadWrite, kVariable);
+  add("ResourceApplyKerasMomentum"           , kReadWrite, kVariable);
   add("ResourceApplyPowerSign"               , kReadWrite, kVariable);
   add("ResourceApplyProximalAdagrad"         , kReadWrite, kVariable);
   add("ResourceApplyProximalGradientDescent" , kReadWrite, kVariable);
diff --git a/tensorflow/compiler/tf2xla/side_effect_util.cc b/tensorflow/compiler/tf2xla/side_effect_util.cc
index b233e6b2c28e1968bb74901fc684e808ae45ab60..b62f8e9115229ac35c657d374c68336f1168ff77 100644
--- a/tensorflow/compiler/tf2xla/side_effect_util.cc
+++ b/tensorflow/compiler/tf2xla/side_effect_util.cc
@@ -24,6 +24,8 @@ const char kXlaTokenInputNodesAttrName[] = "_xla_token_input_nodes";
 
 const char kXlaTokenArgNodeName[] = "_xla_token_arg_node";
 
+const char kXlaHasHostTransferAttrName[] = "_xla_has_host_transfer";
+
 std::set<std::string> CalculateTokenInputsForOutputToken(const Graph& g) {
   std::set<std::string> results;
   Node* first_side_effecting_node_on_path = nullptr;
diff --git a/tensorflow/compiler/tf2xla/side_effect_util.h b/tensorflow/compiler/tf2xla/side_effect_util.h
index f22ddb2f58e1fa5c10ca0fdb956d9136942388b7..7081b362c36c4785164b29003a5f89cd73bcf3af 100644
--- a/tensorflow/compiler/tf2xla/side_effect_util.h
+++ b/tensorflow/compiler/tf2xla/side_effect_util.h
@@ -35,6 +35,9 @@ extern const char kXlaTokenInputNodesAttrName[];
 // node has side-effect dependency on current graph's token input.
 extern const char kXlaTokenArgNodeName[];
 
+// This node have XlaRecvAtHost/XlaSendFromHost in its associated functions.
+extern const char kXlaHasHostTransferAttrName[];
+
 // Calculates side-effect dependencies for the graph's token output.
 // Returns a set of node names representing these dependencies.
 std::set<std::string> CalculateTokenInputsForOutputToken(const Graph& g);
diff --git a/tensorflow/compiler/tf2xla/tf2xla_test.cc b/tensorflow/compiler/tf2xla/tf2xla_test.cc
index ab26d939ccba75ce58609ffd71c7ccadbe90cfa8..24afe595b18b823818bd8fe65bc599af8bce040a 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_test.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla_test.cc
@@ -91,7 +91,7 @@ TEST(ConvertGraphDefToXla, Sum) {
       client->ExecuteAndTransfer(computation, {x_global.get(), y_global.get()});
   TF_EXPECT_OK(result_or.status());
   xla::Literal result = std::move(result_or.ValueOrDie());
-  EXPECT_EQ("(s32[]) (\n42\n)", result.ToString());
+  EXPECT_EQ("(\ns32[] 42\n)", result.ToString());
 
   config.mutable_feed(0)->mutable_id()->set_output_index(
       123); /* invalid output_index */
diff --git a/tensorflow/compiler/tf2xla/tf2xla_util.cc b/tensorflow/compiler/tf2xla/tf2xla_util.cc
index cc81772e8c5da710bc733f7e4f5fe820b2c2d110..18d87727c500619bf386be7d8c7085724f44aba3 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_util.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla_util.cc
@@ -364,6 +364,7 @@ Status AddPlaceholdersForFeeds(
       GraphDef gd;
       *gd.mutable_versions() = graph_def->versions();
       *gd.add_node() = *existing;
+      MergeDebugInfo(NodeDebugInfo(*existing), gd.mutable_node(0));
       TF_RETURN_IF_ERROR(
           AddDefaultAttrsToGraphDef(&gd, *op_registry, 0 /*node_offset*/));
 
@@ -390,6 +391,7 @@ Status AddPlaceholdersForFeeds(
   // in this code.
   for (auto it = placeholder_info.begin(); it != placeholder_info.end(); ++it) {
     const PlaceholderInfo& info = it->second;
+    // TODO(shikharagarwal): Add original node information.
     NodeDef* d = graph_def->add_node();
     d->set_name(info.placeholder_name);
     d->set_op("PlaceholderV2");
@@ -557,6 +559,12 @@ bool HasAssociatedFunction(const NodeDef& node_def,
     return true;
   }
 
+  if (node_def.op() == "XlaHostCompute") {
+    // XlaHostCompute has "shape_inference_graph" func attr, but that's not
+    // related to graph execution.
+    return false;
+  }
+
   for (const auto& iter : node_def.attr()) {
     if (iter.second.has_func()) {
       return true;
@@ -578,6 +586,9 @@ std::vector<AssociatedFunctionInfo> GetAssociatedFunctions(
     // This is a SymbolicGradient op.
     AttrValueMap attrs(node.attrs().begin(), node.attrs().end());
     results.emplace_back(AssociatedFunctionInfo::SymbolicGradient(op, attrs));
+  } else if (node.type_string() == "XlaHostCompute") {
+    // XlaHostCompute has "shape_inference_graph" func attr, but that's not
+    // related to graph execution.
   } else {
     // Collect all function attrs for the node.
     for (auto& iter : node.attrs()) {
@@ -599,7 +610,9 @@ Status RewriteAssociatedFunction(
   switch (associated_function.type()) {
     case AssociatedFunctionInfo::kFunctionCallNode: {
       // Change this node to call the new function.
-      NodeDefBuilder builder(node->name(), rewritten_function_name, fld);
+      NodeDebugInfo debug_info(*node);
+      NodeDefBuilder builder(node->name(), rewritten_function_name, fld,
+                             &debug_info);
       for (auto attr : node->attrs()) {
         builder.Attr(attr.first, attr.second);
       }
diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h
index a1d359e97c4fad3ca74d44a358cba0e8190cdc22..c7341cf8b9e8d7a06fd304ae8766420d20f0c16e 100644
--- a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h
+++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h
@@ -206,8 +206,14 @@ class XlaCompiledCpuFunction {
   //
   // Aliasing of argument and result buffers is not allowed, and results in
   // undefined behavior.
-  void set_arg_data(size_t index, void* data) {
-    buffer_table_[arg_index_table_[index]] = data;
+  void set_arg_data(size_t index, const void* data) {
+    // The const_cast is safe because the generated code does not write to arg
+    // buffers.
+    //
+    // buffer_table_ contains pointers to buffers that _will_ be written to by
+    // generated code so it would be misleading to make buffer_table_ a `const
+    // void**`.
+    buffer_table_[arg_index_table_[index]] = const_cast<void*>(data);
   }
 
   // ------------------------------
diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD
index 4360e0857964b0ac63fc887e269b04a4b00d854a..722d1376687efa1c04158e3fd9ce539aac9d0122 100644
--- a/tensorflow/compiler/xla/BUILD
+++ b/tensorflow/compiler/xla/BUILD
@@ -109,7 +109,7 @@ cc_library(
     name = "status_macros",
     srcs = ["status_macros.cc"],
     hdrs = ["status_macros.h"],
-    visibility = [":friends"],
+    visibility = ["//visibility:public"],
     deps = [
         ":statusor",
         ":types",
@@ -224,6 +224,7 @@ cc_library(
     name = "shape_util",
     srcs = [
         "index_util.cc",
+        "layout.cc",
         "layout_util.cc",
         "primitive_util.cc",
         "shape.cc",
@@ -231,6 +232,7 @@ cc_library(
     ],
     hdrs = [
         "index_util.h",
+        "layout.h",
         "layout_util.h",
         "primitive_util.h",
         "shape.h",
@@ -290,6 +292,22 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "primitive_util_test",
+    srcs = ["primitive_util_test.cc"],
+    deps = [
+        ":shape_util",
+        ":status_macros",
+        ":test",
+        ":test_helpers",
+        ":types",
+        ":util",
+        ":xla_data_proto",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test_main",
+    ],
+)
+
 tf_cc_test(
     name = "layout_util_test",
     srcs = ["layout_util_test.cc"],
@@ -301,6 +319,22 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "layout_test",
+    srcs = ["layout_test.cc"],
+    deps = [
+        ":shape_util",
+        ":status_macros",
+        ":test",
+        ":test_helpers",
+        ":types",
+        ":util",
+        ":xla_data_proto",
+        "//tensorflow/core:test_main",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
 tf_cc_test(
     name = "index_util_test",
     srcs = ["index_util_test.cc"],
@@ -575,6 +609,7 @@ cc_library(
         ":types",
         ":util",
         ":xla_data_proto",
+        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "@com_google_absl//absl/memory",
@@ -705,7 +740,6 @@ cc_library(
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_evaluator",
         "//tensorflow/compiler/xla/service:shape_inference",
-        "//tensorflow/compiler/xla/service/cpu:runtime_single_threaded_matmul",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/types:span",
diff --git a/tensorflow/compiler/xla/client/BUILD b/tensorflow/compiler/xla/client/BUILD
index fe99564d3c671cd7890e1fa26fcd2e3384972983..e61d9d2520366f3f21a18b6c62ba924fba23308a 100644
--- a/tensorflow/compiler/xla/client/BUILD
+++ b/tensorflow/compiler/xla/client/BUILD
@@ -3,7 +3,7 @@
 
 licenses(["notice"])  # Apache 2.0
 
-package(default_visibility = [":friends"])
+package(default_visibility = ["//visibility:public"])
 
 package_group(
     name = "friends",
diff --git a/tensorflow/compiler/xla/client/client.cc b/tensorflow/compiler/xla/client/client.cc
index 74b76f929949d3300a5d0ff45d5fa4cd9f162642..43127cae1e5d81521003a28288e27d291e33c9b9 100644
--- a/tensorflow/compiler/xla/client/client.cc
+++ b/tensorflow/compiler/xla/client/client.cc
@@ -186,7 +186,7 @@ StatusOr<Literal> Client::ComputeConstant(const XlaComputation& computation,
   ComputeConstantGraphRequest request;
   *request.mutable_computation() = computation.proto();
   if (output_layout != nullptr) {
-    *request.mutable_output_layout() = *output_layout;
+    *request.mutable_output_layout() = output_layout->ToProto();
   }
 
   ComputeConstantResponse response;
diff --git a/tensorflow/compiler/xla/client/lib/BUILD b/tensorflow/compiler/xla/client/lib/BUILD
index c5733bc66deb8d55a9186ad1893abaf17ed6909e..970f00759f630f30f1c1321231fd9e0199026142 100644
--- a/tensorflow/compiler/xla/client/lib/BUILD
+++ b/tensorflow/compiler/xla/client/lib/BUILD
@@ -1,5 +1,7 @@
 # Common computation builders for XLA.
 
+load("//tensorflow/compiler/xla/tests:build_defs.bzl", "generate_backend_suites", "xla_test")
+
 licenses(["notice"])  # Apache 2.0
 
 package(default_visibility = ["//tensorflow/compiler/xla/client:friends"])
@@ -13,9 +15,6 @@ filegroup(
     ]),
 )
 
-load("//tensorflow/compiler/xla/tests:build_defs.bzl", "xla_test")
-load("//tensorflow/compiler/xla/tests:build_defs.bzl", "generate_backend_suites")
-
 # Generate test_suites for all backends, named "${backend}_tests".
 generate_backend_suites()
 
@@ -35,6 +34,48 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "cholesky",
+    srcs = ["cholesky.cc"],
+    hdrs = ["cholesky.h"],
+    deps = [
+        ":math",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client/lib:constants",
+        "//tensorflow/compiler/xla/client/lib:loops",
+        "//tensorflow/compiler/xla/client/lib:matrix",
+        "//tensorflow/compiler/xla/client/lib:slicing",
+        "//tensorflow/compiler/xla/client/lib:triangular_solve",
+        "//tensorflow/core:lib",
+    ],
+)
+
+xla_test(
+    name = "cholesky_test",
+    srcs = ["cholesky_test.cc"],
+    tags = ["optonly"],
+    deps = [
+        ":arithmetic",
+        ":cholesky",
+        ":matrix",
+        "//tensorflow/compiler/xla:array2d",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:test",
+    ],
+)
+
 cc_library(
     name = "constants",
     srcs = ["constants.cc"],
@@ -75,6 +116,22 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "loops",
+    srcs = ["loops.cc"],
+    hdrs = ["loops.h"],
+    deps = [
+        ":constants",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
 cc_library(
     name = "math",
     srcs = ["math.cc"],
@@ -104,13 +161,17 @@ xla_test(
 )
 
 cc_library(
-    name = "numeric",
-    srcs = ["numeric.cc"],
-    hdrs = ["numeric.h"],
+    name = "matrix",
+    srcs = ["matrix.cc"],
+    hdrs = ["matrix.h"],
     deps = [
         ":arithmetic",
         ":constants",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:xla_builder",
         "@com_google_absl//absl/types:span",
@@ -118,11 +179,12 @@ cc_library(
 )
 
 xla_test(
-    name = "numeric_test",
-    srcs = ["numeric_test.cc"],
+    name = "matrix_test",
+    srcs = ["matrix_test.cc"],
     tags = ["enable_for_xla_interpreter"],
     deps = [
-        ":numeric",
+        ":matrix",
+        ":slicing",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
@@ -172,6 +234,80 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "qr",
+    srcs = ["qr.cc"],
+    hdrs = ["qr.h"],
+    deps = [
+        ":arithmetic",
+        ":constants",
+        ":loops",
+        ":math",
+        ":matrix",
+        ":slicing",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/core:lib",
+    ],
+)
+
+xla_test(
+    name = "qr_test",
+    srcs = ["qr_test.cc"],
+    tags = ["optonly"],
+    deps = [
+        ":matrix",
+        ":qr",
+        "//tensorflow/compiler/xla:array2d",
+        "//tensorflow/compiler/xla:array3d",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:test",
+    ],
+)
+
+cc_library(
+    name = "slicing",
+    srcs = ["slicing.cc"],
+    hdrs = ["slicing.h"],
+    deps = [
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+xla_test(
+    name = "slicing_test",
+    srcs = ["slicing_test.cc"],
+    tags = ["enable_for_xla_interpreter"],
+    deps = [
+        ":slicing",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+    ],
+)
+
 cc_library(
     name = "sorting",
     srcs = ["sorting.cc"],
@@ -200,6 +336,34 @@ xla_test(
     ],
 )
 
+cc_library(
+    name = "quantize",
+    hdrs = ["quantize.h"],
+    deps = [
+        ":constants",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/core:lib",
+    ],
+)
+
+xla_test(
+    name = "quantize_test",
+    srcs = ["quantize_test.cc"],
+    tags = ["enable_for_xla_interpreter"],
+    deps = [
+        ":quantize",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+    ],
+)
+
 cc_library(
     name = "testing",
     srcs = ["testing.cc"],
@@ -221,3 +385,48 @@ cc_library(
         "@com_google_absl//absl/strings",
     ],
 )
+
+cc_library(
+    name = "triangular_solve",
+    srcs = ["triangular_solve.cc"],
+    hdrs = ["triangular_solve.h"],
+    deps = [
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
+        "//tensorflow/compiler/xla/client/lib:constants",
+        "//tensorflow/compiler/xla/client/lib:math",
+        "//tensorflow/compiler/xla/client/lib:matrix",
+        "//tensorflow/compiler/xla/client/lib:slicing",
+        "//tensorflow/core:lib",
+    ],
+)
+
+xla_test(
+    name = "triangular_solve_test",
+    srcs = ["triangular_solve_test.cc"],
+    tags = ["noasan"],  # sometimes times out, http://b/78650012
+    deps = [
+        ":triangular_solve",
+        "//tensorflow/compiler/xla:array2d",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:global_data",
+        "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+    ],
+)
diff --git a/tensorflow/compiler/tf2xla/lib/cholesky.cc b/tensorflow/compiler/xla/client/lib/cholesky.cc
similarity index 61%
rename from tensorflow/compiler/tf2xla/lib/cholesky.cc
rename to tensorflow/compiler/xla/client/lib/cholesky.cc
index ab3d0a566839343828d176d9a46672824e425613..fd98049968491d80b9717a2de1f34997bd9d18c1 100644
--- a/tensorflow/compiler/tf2xla/lib/cholesky.cc
+++ b/tensorflow/compiler/xla/client/lib/cholesky.cc
@@ -13,16 +13,17 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/tf2xla/lib/cholesky.h"
+#include "tensorflow/compiler/xla/client/lib/cholesky.h"
 
 #include <memory>
 #include <vector>
 
-#include "tensorflow/compiler/tf2xla/lib/batch_dot.h"
-#include "tensorflow/compiler/tf2xla/lib/triangular_solve.h"
-#include "tensorflow/compiler/tf2xla/lib/util.h"
-#include "tensorflow/compiler/tf2xla/lib/while_loop.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/client/lib/loops.h"
+#include "tensorflow/compiler/xla/client/lib/math.h"
+#include "tensorflow/compiler/xla/client/lib/matrix.h"
+#include "tensorflow/compiler/xla/client/lib/slicing.h"
+#include "tensorflow/compiler/xla/client/lib/triangular_solve.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -30,7 +31,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/core/lib/core/errors.h"
 
-namespace tensorflow {
+namespace xla {
 
 namespace {
 
@@ -49,26 +50,25 @@ namespace {
 //     l[..., j+1:, j] = (a[..., j+1:, j] - np.dot(l[..., j+1:, :j], row_t)) /
 //                       l[..., j, j]
 //   return l
-xla::XlaOp CholeskyUnblocked(xla::XlaOp a,
-                             xla::PrecisionConfig::Precision precision) {
-  xla::XlaBuilder* builder = a.builder();
-  return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
-    TF_ASSIGN_OR_RETURN(xla::Shape a_shape, builder->GetShape(a));
-    const int n_dims = xla::ShapeUtil::Rank(a_shape);
-    const int64 n = xla::ShapeUtil::GetDimension(a_shape, -1);
-    auto major_dims = xla::AsInt64Slice(a_shape.dimensions())
+XlaOp CholeskyUnblocked(XlaOp a, PrecisionConfig::Precision precision) {
+  XlaBuilder* builder = a.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape a_shape, builder->GetShape(a));
+    const int n_dims = ShapeUtil::Rank(a_shape);
+    const int64 n = ShapeUtil::GetDimension(a_shape, -1);
+    auto major_dims = AsInt64Slice(a_shape.dimensions())
                           .subspan(
                               /*pos=*/0,
                               /*len=*/n_dims - 2);
 
-    xla::XlaOp l = xla::ZerosLike(a);
+    XlaOp l = ZerosLike(a);
 
     // Construct the for loop body to iterate over rows.
-    auto body_fn = [&](xla::XlaOp i, absl::Span<const xla::XlaOp> loop_vars,
-                       xla::XlaBuilder* body_builder)
-        -> xla::StatusOr<std::vector<xla::XlaOp>> {
-      xla::Shape col_shape;
-      xla::Shape row_shape;
+    auto body_fn =
+        [&](XlaOp i, absl::Span<const XlaOp> loop_vars,
+            XlaBuilder* body_builder) -> StatusOr<std::vector<XlaOp>> {
+      Shape col_shape;
+      Shape row_shape;
       for (int64 d : major_dims) {
         row_shape.add_dimensions(d);
         col_shape.add_dimensions(d);
@@ -76,59 +76,49 @@ xla::XlaOp CholeskyUnblocked(xla::XlaOp a,
       row_shape.add_dimensions(1);
       row_shape.add_dimensions(n);
       row_shape.set_element_type(a_shape.element_type());
-      auto mask_zeros_row = xla::Zeros(body_builder, row_shape);
+      auto mask_zeros_row = Zeros(body_builder, row_shape);
 
       col_shape.add_dimensions(n);
       col_shape.add_dimensions(1);
       col_shape.set_element_type(a_shape.element_type());
-      auto mask_zeros_col = xla::Zeros(body_builder, col_shape);
+      auto mask_zeros_col = Zeros(body_builder, col_shape);
 
       std::vector<int32> mask_vector(n);
       std::iota(mask_vector.begin(), mask_vector.end(), 0);
-      auto mask_range = xla::ConstantR1<int32>(body_builder, mask_vector);
+      auto mask_range = ConstantR1<int32>(body_builder, mask_vector);
       auto mask_range_row =
-          xla::Broadcast(xla::Reshape(mask_range, {0}, {1, n}), major_dims);
+          Broadcast(Reshape(mask_range, {0}, {1, n}), major_dims);
       auto mask_range_col =
-          xla::Broadcast(xla::Reshape(mask_range, {0}, {n, 1}), major_dims);
+          Broadcast(Reshape(mask_range, {0}, {n, 1}), major_dims);
       auto body_a = loop_vars[0];
       auto body_l = loop_vars[1];
 
       // row = l[..., i, :i]
       // select the whole i-th row, then mask out all columns past i-1
-      auto zero = xla::ConstantR0<int32>(body_builder, 0);
+      auto zero = ConstantR0<int32>(body_builder, 0);
       auto l_i = DynamicSliceInMinorDims(body_l, {i, zero}, {1, n});
-      auto row = xla::Select(xla::Ge(mask_range_row, i), mask_zeros_row, l_i);
+      auto row = Select(Ge(mask_range_row, i), mask_zeros_row, l_i);
       // a[..., i, i]
       auto a_ii = DynamicSliceInMinorDims(body_a, {i, i}, {1, 1});
       // np.dot(row, np.swapaxes(row, -1, -2))
-      auto diag_dot = BatchDot(row, row,
-                               /*transpose_x=*/false,
-                               /*transpose_y=*/true, /*conjugate_x=*/false,
-                               /*conjugate_y=*/false, precision);
+      auto diag_dot = BatchDot(row, TransposeInMinorDims(row), precision);
       // l[..., i, i] = np.sqrt(a[..., i, i] - np.dot(row,
       //                                              np.swapaxes(row, -1, -2)))
-      auto l_ii =
-          xla::Pow(a_ii - diag_dot,
-                   FloatLiteral(body_builder, a_shape.element_type(), 0.5));
+      auto l_ii = Sqrt(a_ii - diag_dot);
 
       // a[..., i+1:, i]
       // select the whole i-th column, then mask out all rows above i+1
       auto a_0i = DynamicSliceInMinorDims(body_a, {i}, {1});
-      auto a_ip1i =
-          xla::Select(xla::Le(mask_range_col, i), mask_zeros_col, a_0i);
+      auto a_ip1i = Select(Le(mask_range_col, i), mask_zeros_col, a_0i);
 
       // l[..., i+1:, i] = (a[..., i+1:, i] - np.dot(l[..., i+1:, :i], r.T)) /
       //                   l[..., i, i]
       // The columns in [i, n] are zeroed out in `row`, so we just have to
       // zero out rows above i+1 after the BatchDot. np.dot(l[..., :, :i],
       // r.T)
-      auto dot = BatchDot(body_l, row,
-                          /*transpose_x=*/false,
-                          /*transpose_y=*/true, /*conjugate_x=*/false,
-                          /*conjugate_y=*/false, precision);
+      auto dot = BatchDot(body_l, TransposeInMinorDims(row), precision);
       // np.dot(l[..., i+1:, :i], r.T)
-      auto dot_ip1 =
-          xla::Select(xla::Le(mask_range_col, i), mask_zeros_col, dot);
+      auto dot_ip1 = Select(Le(mask_range_col, i), mask_zeros_col, dot);
 
       body_l =
           DynamicUpdateSliceInMinorDims(body_l, (a_ip1i - dot_ip1) / l_ii, {i});
@@ -136,12 +126,12 @@ xla::XlaOp CholeskyUnblocked(xla::XlaOp a,
       // column assign will wrap around and overwrite the diagonal assign.
       body_l = DynamicUpdateSliceInMinorDims(body_l, l_ii, {i, i});
 
-      return std::vector<xla::XlaOp>{body_a, body_l};
+      return std::vector<XlaOp>{body_a, body_l};
     };
 
     TF_ASSIGN_OR_RETURN(
         auto cholesky_while,
-        XlaForEachIndex(n, xla::S32, body_fn, {a, l}, "unblocked", builder));
+        ForEachIndex(n, S32, body_fn, {a, l}, "unblocked", builder));
 
     return cholesky_while[1];
   });
@@ -149,34 +139,35 @@ xla::XlaOp CholeskyUnblocked(xla::XlaOp a,
 
 }  // namespace
 
-xla::XlaOp Cholesky(xla::XlaOp a, int64 block_size,
-                    xla::PrecisionConfig::Precision precision) {
-  xla::XlaBuilder* builder = a.builder();
-  return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
-    TF_ASSIGN_OR_RETURN(xla::Shape a_shape, builder->GetShape(a));
-    const int ndims = xla::ShapeUtil::Rank(a_shape);
+XlaOp Cholesky(XlaOp a, int64 block_size,
+               PrecisionConfig::Precision precision) {
+  XlaBuilder* builder = a.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape a_shape, builder->GetShape(a));
+    const int ndims = ShapeUtil::Rank(a_shape);
     if (ndims < 2) {
-      return errors::InvalidArgument(
-          "Arguments to Cholesky must have rank >= 2: ", ndims);
+      return InvalidArgument(
+          "Argument to Cholesky must have rank >= 2; shape was %s",
+          a_shape.ToString());
     }
 
-    const int64 n = xla::ShapeUtil::GetDimension(a_shape, -1);
-    if (n != xla::ShapeUtil::GetDimension(a_shape, -2)) {
-      return errors::InvalidArgument(
-          "Arguments to Cholesky must be square matrices: ",
-          xla::ShapeUtil::HumanString(a_shape));
+    const int64 n = ShapeUtil::GetDimension(a_shape, -1);
+    if (n != ShapeUtil::GetDimension(a_shape, -2)) {
+      return InvalidArgument(
+          "Argument to Cholesky must be batched square matrices; got shape %s",
+          ShapeUtil::HumanString(a_shape));
     }
 
     if (block_size < 1) {
-      return errors::InvalidArgument(
-          "block_size argument to Cholesky must be >= 1; got ", block_size);
+      return InvalidArgument(
+          "block_size argument to Cholesky must be >= 1; got %d", block_size);
     }
 
     // Blocked left-looking Cholesky factorization.
     // Algorithm 1 from
     // Haidar, Azzam, et al. "High-performance Cholesky factorization for
     // GPU-only execution." Proceedings of General Purpose GPUs. ACM, 2017.
-    xla::XlaOp l = xla::ZerosLike(a);
+    XlaOp l = ZerosLike(a);
     for (int64 i = 0; i < n; i += block_size) {
       int64 k = std::min(block_size, n - i);
       if (i > 0) {
@@ -185,9 +176,7 @@ xla::XlaOp Cholesky(xla::XlaOp a, int64 block_size,
         // a[i:, i:i+k] -= np.dot(l[i:, :i], np.transpose(l[i:i+k, :i]))
         auto lhs = SliceInMinorDims(l, {i, 0}, {n, i});
         auto rhs = SliceInMinorDims(l, {i, 0}, {i + k, i});
-        auto delta = BatchDot(lhs, rhs, /*transpose_x=*/false,
-                              /*transpose_y=*/true, /*conjugate_x=*/false,
-                              /*conjugate_y=*/false, precision);
+        auto delta = BatchDot(lhs, TransposeInMinorDims(rhs), precision);
         auto before = SliceInMinorDims(a, {i, i}, {n, i + k});
         a = UpdateSliceInMinorDims(a, before - delta, {i, i});
       }
@@ -214,4 +203,4 @@ xla::XlaOp Cholesky(xla::XlaOp a, int64 block_size,
   });
 }
 
-}  // namespace tensorflow
+}  // namespace xla
diff --git a/tensorflow/compiler/tf2xla/lib/cholesky.h b/tensorflow/compiler/xla/client/lib/cholesky.h
similarity index 87%
rename from tensorflow/compiler/tf2xla/lib/cholesky.h
rename to tensorflow/compiler/xla/client/lib/cholesky.h
index 9a561c34b92ee45059f2a05336e682838f8e36e2..0bae26837c0f14dd0cfab82cf426becc787ec11c 100644
--- a/tensorflow/compiler/tf2xla/lib/cholesky.h
+++ b/tensorflow/compiler/xla/client/lib/cholesky.h
@@ -13,13 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_TF2XLA_LIB_CHOLESKY_H_
-#define TENSORFLOW_COMPILER_TF2XLA_LIB_CHOLESKY_H_
+#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_CHOLESKY_H_
+#define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_CHOLESKY_H_
 
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
-namespace tensorflow {
+namespace xla {
 
 // Computes the Cholesky decompositions of a batch of symmetric positive
 // definite matrices.
@@ -34,6 +34,6 @@ xla::XlaOp Cholesky(
     xla::XlaOp a, int64 block_size = 256,
     xla::PrecisionConfig::Precision precision = xla::PrecisionConfig::HIGHEST);
 
-}  // namespace tensorflow
+}  // namespace xla
 
-#endif  // TENSORFLOW_COMPILER_TF2XLA_LIB_CHOLESKY_H_
+#endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_CHOLESKY_H_
diff --git a/tensorflow/compiler/xla/client/lib/cholesky_test.cc b/tensorflow/compiler/xla/client/lib/cholesky_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ba9580a3d32225625acc1447344b7d2c16c5d8a5
--- /dev/null
+++ b/tensorflow/compiler/xla/client/lib/cholesky_test.cc
@@ -0,0 +1,166 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/lib/cholesky.h"
+
+#include <memory>
+#include <numeric>
+#include <vector>
+
+#include "tensorflow/compiler/xla/array2d.h"
+#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
+#include "tensorflow/compiler/xla/client/lib/matrix.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace {
+
+using xla::int64;
+
+using CholeskyTest = xla::ClientLibraryTestBase;
+
+XLA_TEST_F(CholeskyTest, Simple) {
+  xla::XlaBuilder builder(TestName());
+
+  xla::Array2D<float> a_vals({
+      {4, 6, 8, 10},
+      {6, 45, 54, 63},
+      {8, 54, 146, 166},
+      {10, 63, 166, 310},
+  });
+
+  xla::XlaOp a;
+  auto a_data = CreateR2Parameter<float>(a_vals, 0, "a", &builder, &a);
+  xla::Cholesky(a, /*block_size=*/2);
+
+  xla::Array2D<float> expected({
+      {2, 0, 0, 0},
+      {3, 6, 0, 0},
+      {4, 7, 9, 0},
+      {5, 8, 10, 11},
+  });
+
+  ComputeAndCompareR2<float>(&builder, expected, {a_data.get()},
+                             xla::ErrorSpec(1e-4, 1e-4));
+}
+
+XLA_TEST_F(CholeskyTest, Simple2) {
+  xla::XlaBuilder builder(TestName());
+
+  xla::Array2D<float> a_vals({
+      {16, 24, 8, 12},
+      {24, 61, 82, 48},
+      {8, 82, 456, 106},
+      {12, 48, 106, 62},
+  });
+
+  xla::XlaOp a;
+  auto a_data = CreateR2Parameter<float>(a_vals, 0, "a", &builder, &a);
+  xla::Cholesky(a);
+
+  xla::Array2D<float> expected(
+      {{4, 0, 0, 0}, {6, 5, 0, 0}, {2, 14, 16, 0}, {3, 6, 1, 4}});
+
+  ComputeAndCompareR2<float>(&builder, expected, {a_data.get()},
+                             xla::ErrorSpec(1e-4, 1e-4));
+}
+
+XLA_TEST_F(CholeskyTest, SimpleBatched) {
+  xla::XlaBuilder builder(TestName());
+
+  xla::Array3D<float> a_vals({
+      {
+          {4, 6, 8, 10},
+          {6, 45, 54, 63},
+          {8, 54, 146, 166},
+          {10, 63, 166, 310},
+      },
+      {
+          {16, 24, 8, 12},
+          {24, 61, 82, 48},
+          {8, 82, 456, 106},
+          {12, 48, 106, 62},
+      },
+  });
+
+  xla::XlaOp a;
+  auto a_data = CreateR3Parameter<float>(a_vals, 0, "a", &builder, &a);
+  xla::Cholesky(a);
+
+  xla::Array3D<float> expected({
+      {
+          {2, 0, 0, 0},
+          {3, 6, 0, 0},
+          {4, 7, 9, 0},
+          {5, 8, 10, 11},
+      },
+      {{4, 0, 0, 0}, {6, 5, 0, 0}, {2, 14, 16, 0}, {3, 6, 1, 4}},
+  });
+
+  ComputeAndCompareR3<float>(&builder, expected, {a_data.get()},
+                             xla::ErrorSpec(1e-4, 1e-4));
+}
+
+using CholeskyTestCase = std::tuple<int64, int64>;
+
+class RandomCholeskyTest
+    : public xla::ClientLibraryTestBase,
+      public ::testing::WithParamInterface<CholeskyTestCase> {};
+
+XLA_TEST_P(RandomCholeskyTest, Random) {
+  xla::XlaBuilder builder(TestName());
+
+  auto test_params = GetParam();
+  std::vector<int64> dimensions = {std::get<0>(test_params),
+                                   std::get<1>(test_params),
+                                   std::get<1>(test_params)};
+  xla::Shape shape = xla::ShapeUtil::MakeShape(xla::F32, dimensions);
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto literal,
+      xla::LiteralUtil::CreateRandomLiteral<xla::F32>(shape, 0.0, 1.0));
+
+  auto input = xla::Parameter(&builder, 0, shape, "input");
+  // Form a random positive definite matrix.
+  auto matrix = xla::BatchDot(input, TransposeInMinorDims(input),
+                              xla::PrecisionConfig::HIGHEST);
+
+  auto cholesky = xla::Cholesky(matrix, /*block_size=*/4);
+
+  // Verify that ||matrix - cholesky * cholesky_t||_2 ~= 0
+  auto verification = xla::BatchDot(cholesky, TransposeInMinorDims(cholesky),
+                                    xla::PrecisionConfig::HIGHEST);
+  auto delta = matrix - verification;
+  xla::Reduce(delta * delta, xla::ConstantR0<float>(&builder, 0.0),
+              CreateScalarAddComputation(xla::F32, &builder), {0, 1, 2});
+
+  TF_ASSERT_OK_AND_ASSIGN(auto input_data, client_->TransferToServer(literal));
+  ComputeAndCompareR0<float>(&builder, 0.0, {input_data.get()},
+                             xla::ErrorSpec(1e-4, 1e-4));
+}
+
+INSTANTIATE_TEST_CASE_P(RandomCholeskyTestInstance, RandomCholeskyTest,
+                        ::testing::Values(CholeskyTestCase{1, 1},
+                                          CholeskyTestCase{1, 2},
+                                          CholeskyTestCase{10, 5},
+                                          CholeskyTestCase{2, 20}));
+
+}  // namespace
diff --git a/tensorflow/compiler/tf2xla/lib/while_loop.cc b/tensorflow/compiler/xla/client/lib/loops.cc
similarity index 50%
rename from tensorflow/compiler/tf2xla/lib/while_loop.cc
rename to tensorflow/compiler/xla/client/lib/loops.cc
index 594ab1dfd0700f47501712183f6efe62d17e15e7..721f987628a8ac7da3f3f872939c3f0457d6bbe2 100644
--- a/tensorflow/compiler/tf2xla/lib/while_loop.cc
+++ b/tensorflow/compiler/xla/client/lib/loops.cc
@@ -13,44 +13,43 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/tf2xla/lib/while_loop.h"
-#include "tensorflow/compiler/tf2xla/lib/util.h"
+#include "tensorflow/compiler/xla/client/lib/loops.h"
+
+#include "tensorflow/compiler/xla/client/lib/constants.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 
-namespace tensorflow {
+namespace xla {
 
-xla::StatusOr<std::vector<xla::XlaOp>> XlaWhileLoop(
-    const LoopConditionFunction& condition_function,
-    const LoopBodyFunction& body_function,
-    absl::Span<const xla::XlaOp> initial_values, absl::string_view name,
-    xla::XlaBuilder* builder) {
+StatusOr<std::vector<XlaOp>> WhileLoopHelper(
+    const WhileLoopHelperConditionFunction& condition_function,
+    const WhileLoopHelperBodyFunction& body_function,
+    absl::Span<const XlaOp> initial_values, absl::string_view name,
+    XlaBuilder* builder) {
   int arity = initial_values.size();
-  std::vector<xla::Shape> var_shapes;
+  std::vector<Shape> var_shapes;
   var_shapes.reserve(arity);
-  for (const xla::XlaOp& input : initial_values) {
+  for (const XlaOp& input : initial_values) {
     TF_ASSIGN_OR_RETURN(auto shape, builder->GetShape(input));
     var_shapes.push_back(std::move(shape));
   }
-  xla::Shape tuple_shape = xla::ShapeUtil::MakeTupleShape(var_shapes);
+  Shape tuple_shape = ShapeUtil::MakeTupleShape(var_shapes);
 
   // Unpacks a tuple into its component parts.
-  auto unpack_tuple = [](xla::XlaOp tuple, int arity,
-                         xla::XlaBuilder* builder) {
-    std::vector<xla::XlaOp> elements(arity);
+  auto unpack_tuple = [](XlaOp tuple, int arity, XlaBuilder* builder) {
+    std::vector<XlaOp> elements(arity);
     for (int i = 0; i < arity; ++i) {
-      elements[i] = xla::GetTupleElement(tuple, i);
+      elements[i] = GetTupleElement(tuple, i);
     }
     return elements;
   };
 
   // Build the condition.
-  std::unique_ptr<xla::XlaBuilder> cond_builder =
+  std::unique_ptr<XlaBuilder> cond_builder =
       builder->CreateSubBuilder(absl::StrCat(name, "_condition"));
   {
-    auto parameter =
-        xla::Parameter(cond_builder.get(), 0, tuple_shape, "parameter");
+    auto parameter = Parameter(cond_builder.get(), 0, tuple_shape, "parameter");
 
     TF_RETURN_IF_ERROR(
         condition_function(unpack_tuple(parameter, arity, cond_builder.get()),
@@ -60,11 +59,10 @@ xla::StatusOr<std::vector<xla::XlaOp>> XlaWhileLoop(
   TF_ASSIGN_OR_RETURN(auto cond, cond_builder->Build());
 
   // Build the body.
-  std::unique_ptr<xla::XlaBuilder> body_builder =
+  std::unique_ptr<XlaBuilder> body_builder =
       builder->CreateSubBuilder(absl::StrCat(name, "_body"));
   {
-    auto parameter =
-        xla::Parameter(body_builder.get(), 0, tuple_shape, "parameter");
+    auto parameter = Parameter(body_builder.get(), 0, tuple_shape, "parameter");
 
     TF_ASSIGN_OR_RETURN(
         auto result,
@@ -72,56 +70,54 @@ xla::StatusOr<std::vector<xla::XlaOp>> XlaWhileLoop(
                       body_builder.get()));
 
     TF_RET_CHECK(result.size() == initial_values.size());
-    xla::Tuple(body_builder.get(), result);
+    Tuple(body_builder.get(), result);
   }
   TF_ASSIGN_OR_RETURN(auto body, body_builder->Build());
 
-  auto outputs = xla::While(cond, body, xla::Tuple(builder, initial_values));
+  auto outputs = While(cond, body, Tuple(builder, initial_values));
 
   return unpack_tuple(outputs, arity, builder);
 }
 
-xla::StatusOr<std::vector<xla::XlaOp>> XlaForEachIndex(
-    int64 num_iterations, xla::PrimitiveType num_iterations_type,
+StatusOr<std::vector<XlaOp>> ForEachIndex(
+    int64 num_iterations, PrimitiveType num_iterations_type,
     const ForEachIndexBodyFunction& body_function,
-    absl::Span<const xla::XlaOp> initial_values, absl::string_view name,
-    xla::XlaBuilder* builder) {
-  auto while_cond_fn =
-      [&](absl::Span<const xla::XlaOp> values,
-          xla::XlaBuilder* cond_builder) -> xla::StatusOr<xla::XlaOp> {
-    return xla::Lt(values[0], IntegerLiteral(cond_builder, num_iterations_type,
-                                             num_iterations));
+    absl::Span<const XlaOp> initial_values, absl::string_view name,
+    XlaBuilder* builder) {
+  auto while_cond_fn = [&](absl::Span<const XlaOp> values,
+                           XlaBuilder* cond_builder) -> StatusOr<XlaOp> {
+    return Lt(values[0], ConstantR0WithType(cond_builder, num_iterations_type,
+                                            num_iterations));
   };
-  auto while_body_fn = [&](absl::Span<const xla::XlaOp> values,
-                           xla::XlaBuilder* body_builder)
-      -> xla::StatusOr<std::vector<xla::XlaOp>> {
-    xla::XlaOp iteration = values[0];
+  auto while_body_fn =
+      [&](absl::Span<const XlaOp> values,
+          XlaBuilder* body_builder) -> StatusOr<std::vector<XlaOp>> {
+    XlaOp iteration = values[0];
 
-    std::vector<xla::XlaOp> updated_values;
+    std::vector<XlaOp> updated_values;
     updated_values.reserve(values.size());
-    updated_values.push_back(xla::Add(
+    updated_values.push_back(Add(
         iteration,
-        xla::ConstantLiteral(body_builder,
-                             xla::LiteralUtil::One(num_iterations_type))));
+        ConstantLiteral(body_builder, LiteralUtil::One(num_iterations_type))));
 
     values.remove_prefix(1);
-    TF_ASSIGN_OR_RETURN(std::vector<xla::XlaOp> body_outputs,
+    TF_ASSIGN_OR_RETURN(std::vector<XlaOp> body_outputs,
                         body_function(iteration, values, body_builder));
     updated_values.insert(updated_values.end(), body_outputs.begin(),
                           body_outputs.end());
     return updated_values;
   };
 
-  std::vector<xla::XlaOp> values;
+  std::vector<XlaOp> values;
   values.reserve(initial_values.size() + 1);
-  values.push_back(xla::ConstantLiteral(
-      builder, xla::LiteralUtil::Zero(num_iterations_type)));
+  values.push_back(
+      ConstantLiteral(builder, LiteralUtil::Zero(num_iterations_type)));
   values.insert(values.end(), initial_values.begin(), initial_values.end());
 
-  TF_ASSIGN_OR_RETURN(values, XlaWhileLoop(while_cond_fn, while_body_fn, values,
-                                           name, builder));
+  TF_ASSIGN_OR_RETURN(values, WhileLoopHelper(while_cond_fn, while_body_fn,
+                                              values, name, builder));
   values.erase(values.begin(), values.begin() + 1);
   return values;
 }
 
-}  // namespace tensorflow
+}  // namespace xla
diff --git a/tensorflow/compiler/tf2xla/lib/while_loop.h b/tensorflow/compiler/xla/client/lib/loops.h
similarity index 62%
rename from tensorflow/compiler/tf2xla/lib/while_loop.h
rename to tensorflow/compiler/xla/client/lib/loops.h
index f2134bb4495a12b8342961d96f70e7737f816c7d..e11de59493e9c1de51fbdb6c45dab6d82b85a62a 100644
--- a/tensorflow/compiler/tf2xla/lib/while_loop.h
+++ b/tensorflow/compiler/xla/client/lib/loops.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_TF2XLA_LIB_WHILE_LOOP_H_
-#define TENSORFLOW_COMPILER_TF2XLA_LIB_WHILE_LOOP_H_
+#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_LOOPS_H_
+#define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_LOOPS_H_
 
 #include <functional>
 #include <vector>
@@ -25,19 +25,18 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/statusor.h"
 
-namespace tensorflow {
+namespace xla {
 
 // Function that builds a loop condition. Takes as input a sequence of input
 // values, and returns a boolean value representing if the condition succeeds.
-typedef std::function<xla::StatusOr<xla::XlaOp>(absl::Span<const xla::XlaOp>,
-                                                xla::XlaBuilder*)>
-    LoopConditionFunction;
+typedef std::function<StatusOr<XlaOp>(absl::Span<const XlaOp>, XlaBuilder*)>
+    WhileLoopHelperConditionFunction;
 
 // Function that builds a loop body. Takes as input a sequence of input values
 // and returns a sequence of output values.
-typedef std::function<xla::StatusOr<std::vector<xla::XlaOp>>(
-    absl::Span<const xla::XlaOp>, xla::XlaBuilder*)>
-    LoopBodyFunction;
+typedef std::function<StatusOr<std::vector<XlaOp>>(absl::Span<const XlaOp>,
+                                                   XlaBuilder*)>
+    WhileLoopHelperBodyFunction;
 
 // Helper function for building an XLA while loop, where the values carried by
 // the loop are a tuple of values, e.g., (a, b, c):
@@ -47,27 +46,27 @@ typedef std::function<xla::StatusOr<std::vector<xla::XlaOp>>(
 //   init: (a, b, c)
 // )
 // 'name' is a descriptive name for the loop.
-xla::StatusOr<std::vector<xla::XlaOp>> XlaWhileLoop(
-    const LoopConditionFunction& condition_function,
-    const LoopBodyFunction& body_function,
-    absl::Span<const xla::XlaOp> initial_values, absl::string_view name,
-    xla::XlaBuilder* builder);
+StatusOr<std::vector<XlaOp>> WhileLoopHelper(
+    const WhileLoopHelperConditionFunction& condition_function,
+    const WhileLoopHelperBodyFunction& body_function,
+    absl::Span<const XlaOp> initial_values, absl::string_view name,
+    XlaBuilder* builder);
 
 // Builds an XLA loop that repeats a computation `num_iterations` times.
 //
 // The body function (ForEachIndexBodyFunction) takes as input a pair of
 // (current iteration number, loop-carried values), and returns an updated
 // vector of the loop-carried values.
-typedef std::function<xla::StatusOr<std::vector<xla::XlaOp>>(
-    xla::XlaOp, absl::Span<const xla::XlaOp>, xla::XlaBuilder*)>
+typedef std::function<StatusOr<std::vector<XlaOp>>(
+    XlaOp, absl::Span<const XlaOp>, XlaBuilder*)>
     ForEachIndexBodyFunction;
 
-xla::StatusOr<std::vector<xla::XlaOp>> XlaForEachIndex(
-    int64 num_iterations, xla::PrimitiveType num_iterations_type,
+StatusOr<std::vector<XlaOp>> ForEachIndex(
+    int64 num_iterations, PrimitiveType num_iterations_type,
     const ForEachIndexBodyFunction& body_function,
-    absl::Span<const xla::XlaOp> initial_values, absl::string_view name,
-    xla::XlaBuilder* builder);
+    absl::Span<const XlaOp> initial_values, absl::string_view name,
+    XlaBuilder* builder);
 
-}  // namespace tensorflow
+}  // namespace xla
 
-#endif  // TENSORFLOW_COMPILER_TF2XLA_LIB_WHILE_LOOP_H_
+#endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_LOOPS_H_
diff --git a/tensorflow/compiler/xla/client/lib/math.cc b/tensorflow/compiler/xla/client/lib/math.cc
index 08a887a6e4660cb2528f0ec7244b7ccc540808d2..36fdda39b4124b9100c6054160f9c17bdf787d6f 100644
--- a/tensorflow/compiler/xla/client/lib/math.cc
+++ b/tensorflow/compiler/xla/client/lib/math.cc
@@ -268,17 +268,16 @@ XlaOp Digamma(XlaOp input) {
 // Implements Banker's rounding: numbers that are equidistant between two
 // integers are rounded towards even.
 XlaOp RoundToEven(XlaOp x) {
-  auto half = xla::ScalarLike(x, 0.5);
-  auto one = xla::ScalarLike(x, 1.0);
-  auto two = xla::ScalarLike(x, 2.0);
+  auto half = ScalarLike(x, 0.5);
+  auto one = ScalarLike(x, 1.0);
+  auto two = ScalarLike(x, 2.0);
 
-  auto round_val = xla::Floor(x);
+  auto round_val = Floor(x);
   auto fraction = x - round_val;
-  auto nearest_even_int = round_val - two * xla::Floor(half * x);
-  auto is_odd = xla::Eq(nearest_even_int, one);
-  return xla::Select(xla::Or(xla::Gt(fraction, half),
-                             xla::And(xla::Eq(fraction, half), is_odd)),
-                     round_val + one, round_val);
+  auto nearest_even_int = round_val - two * Floor(half * x);
+  auto is_odd = Eq(nearest_even_int, one);
+  return Select(Or(Gt(fraction, half), And(Eq(fraction, half), is_odd)),
+                round_val + one, round_val);
 }
 
 // Trigonometric functions.
@@ -320,4 +319,13 @@ XlaOp Cosh(XlaOp x) { return (Exp(x) + Exp(-x)) * ScalarLike(x, 0.5); }
 
 XlaOp Sinh(XlaOp x) { return (Exp(x) - Exp(-x)) * ScalarLike(x, 0.5); }
 
+XlaOp MaybeConjugate(XlaOp x, bool conjugate) {
+  XlaBuilder* builder = x.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
+    auto perform_conj = shape.element_type() == C64 && conjugate;
+    return perform_conj ? Conj(x) : x;
+  });
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/math.h b/tensorflow/compiler/xla/client/lib/math.h
index 3f06d04b9ae98b3aa75e68cd07810b2b4c24d280..17612bf9fdc0f1eabb338671c93c025c5b268872 100644
--- a/tensorflow/compiler/xla/client/lib/math.h
+++ b/tensorflow/compiler/xla/client/lib/math.h
@@ -86,6 +86,10 @@ XlaOp Cosh(XlaOp x);
 // Computes the hyperbolic sine of 'x'.
 XlaOp Sinh(XlaOp x);
 
+// Applies a complex conjugation operation if `a` is complex and `conjugate`
+// is true, otherwise returns its argument.
+xla::XlaOp MaybeConjugate(xla::XlaOp x, bool conjugate);
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_MATH_H_
diff --git a/tensorflow/compiler/xla/client/lib/matrix.cc b/tensorflow/compiler/xla/client/lib/matrix.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ffd744d190885b8e3f4149a48a706498b3787618
--- /dev/null
+++ b/tensorflow/compiler/xla/client/lib/matrix.cc
@@ -0,0 +1,185 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/lib/matrix.h"
+
+#include <numeric>
+#include <vector>
+
+#include "absl/types/span.h"
+#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
+#include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/util.h"
+
+namespace xla {
+
+XlaOp IdentityMatrix(XlaBuilder* builder, PrimitiveType type, int64 m,
+                     int64 n) {
+  auto a = Iota(builder, type, m);
+  auto b = Iota(builder, type, n);
+  auto indicator = Eq(a, Broadcast(b, {m}), /*broadcast_dimensions=*/{0});
+  return ConvertElementType(indicator, type);
+}
+
+XlaOp GetMatrixDiagonal(XlaOp x) {
+  XlaBuilder* builder = x.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
+    const int64 n_dims = ShapeUtil::Rank(shape);
+    TF_RET_CHECK(n_dims >= 2);
+    const int64 m = shape.dimensions(n_dims - 2);
+    const int64 n = shape.dimensions(n_dims - 1);
+    absl::Span<const int64> major_dims =
+        AsInt64Slice(shape.dimensions()).subspan(/*pos=*/0, /*len=*/n_dims - 2);
+    auto a = Iota(builder, U32, n);
+    auto b = Iota(builder, U32, m);
+    auto indicator = Eq(b, Broadcast(a, {m}), /*broadcast_dimensions=*/{0});
+    auto mask = Broadcast(indicator, major_dims);
+
+    // TPUs don't support S64 add reduction at the moment. But fortunately
+    // OR-reductions work just as well for integers.
+    XlaComputation reducer =
+        primitive_util::IsIntegralType(shape.element_type())
+            ? CreateScalarOrComputation(shape.element_type(), builder)
+            : CreateScalarAddComputation(shape.element_type(), builder);
+
+    return Reduce(Select(mask, x, Zeros(builder, shape)), ScalarLike(x, 0),
+                  reducer, {m >= n ? n_dims - 2 : n_dims - 1});
+  });
+}
+
+XlaOp Triangle(XlaOp x, bool lower) {
+  XlaBuilder* builder = x.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
+    const int64 n_dims = ShapeUtil::Rank(shape);
+    TF_RET_CHECK(n_dims >= 2);
+    const int64 m = shape.dimensions(n_dims - 2);
+    const int64 n = shape.dimensions(n_dims - 1);
+    absl::Span<const int64> major_dims =
+        AsInt64Slice(shape.dimensions()).subspan(/*pos=*/0, /*len=*/n_dims - 2);
+    auto a = Iota(builder, U32, n);
+    auto b = Iota(builder, U32, m);
+    XlaOp indicator;
+    if (lower) {
+      indicator = Ge(b, Broadcast(a, {m}), /*broadcast_dimensions=*/{0});
+    } else {
+      indicator = Le(b, Broadcast(a, {m}), /*broadcast_dimensions=*/{0});
+    }
+    auto mask = Broadcast(indicator, major_dims);
+
+    return Select(mask, x, Zeros(builder, shape));
+  });
+}
+
+XlaOp UpperTriangle(XlaOp x) { return Triangle(x, false); }
+
+XlaOp LowerTriangle(XlaOp x) { return Triangle(x, true); }
+
+XlaOp BatchDot(XlaOp x, XlaOp y, PrecisionConfig::Precision precision) {
+  XlaBuilder* builder = x.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape x_shape, builder->GetShape(x));
+    TF_ASSIGN_OR_RETURN(Shape y_shape, builder->GetShape(y));
+
+    // Check that both tensors have the same number of dimensions. There must be
+    // at least two (the batch dimensions can be empty).
+    if (ShapeUtil::Rank(x_shape) != ShapeUtil::Rank(y_shape)) {
+      return InvalidArgument(
+          "Arguments to BatchDot have different ranks: %s vs. %s",
+          ShapeUtil::HumanString(x_shape), ShapeUtil::HumanString(y_shape));
+    }
+    const int ndims = ShapeUtil::Rank(x_shape);
+    if (ndims < 2) {
+      return InvalidArgument(
+          "Arguments to BatchDot must have rank >= 2: got %d", ndims);
+    }
+
+    // The batch dimensions must be equal and the matrix dimensions must be
+    // valid.
+    std::vector<int64> batch_dimension_numbers;
+    for (int i = 0; i < ndims - 2; ++i) {
+      if (x_shape.dimensions(i) != y_shape.dimensions(i)) {
+        return InvalidArgument(
+            "Dimension %d of inputs to BatchDot must be equal: shapes %s vs %s",
+            i, ShapeUtil::HumanString(x_shape),
+            ShapeUtil::HumanString(y_shape));
+      }
+      batch_dimension_numbers.push_back(i);
+    }
+
+    int x_inner_dim = ndims - 1;
+    int y_inner_dim = ndims - 2;
+    if (x_shape.dimensions(x_inner_dim) != y_shape.dimensions(y_inner_dim)) {
+      return InvalidArgument(
+          "Dimensions %d and %d of arguments to BatchDot must be equal: "
+          "shapes %s vs %s",
+          x_inner_dim, y_inner_dim, ShapeUtil::HumanString(x_shape),
+          ShapeUtil::HumanString(y_shape));
+    }
+
+    // Check for zero lhs/rhs dim size.
+    if (ShapeUtil::IsZeroElementArray(x_shape) ||
+        ShapeUtil::IsZeroElementArray(y_shape)) {
+      std::vector<int64> dimensions(batch_dimension_numbers.size());
+      for (int i = 0; i < batch_dimension_numbers.size(); ++i) {
+        dimensions[i] = x_shape.dimensions(batch_dimension_numbers[i]);
+      }
+      int x_outer_dim = ndims - 2;
+      int y_outer_dim = ndims - 1;
+      dimensions.push_back(x_shape.dimensions(x_outer_dim));
+      dimensions.push_back(y_shape.dimensions(y_outer_dim));
+      return Broadcast(
+          ConstantLiteral(builder, LiteralUtil::Zero(x_shape.element_type())),
+          dimensions);
+    }
+
+    PrecisionConfig precision_proto;
+    precision_proto.add_operand_precision(precision);
+    precision_proto.add_operand_precision(precision);
+
+    DotDimensionNumbers dot_dnums;
+    dot_dnums.add_lhs_contracting_dimensions(x_inner_dim);
+    dot_dnums.add_rhs_contracting_dimensions(y_inner_dim);
+    for (auto batch_dimension_number : batch_dimension_numbers) {
+      dot_dnums.add_lhs_batch_dimensions(batch_dimension_number);
+      dot_dnums.add_rhs_batch_dimensions(batch_dimension_number);
+    }
+
+    return DotGeneral(x, y, dot_dnums, &precision_proto);
+  });
+}
+
+XlaOp TransposeInMinorDims(XlaOp x) {
+  XlaBuilder* builder = x.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
+    const int64 n_dims = ShapeUtil::Rank(shape);
+    TF_RET_CHECK(n_dims >= 2);
+    std::vector<int64> permutation(n_dims);
+    std::iota(permutation.begin(), permutation.end(), 0);
+    std::swap(permutation[n_dims - 1], permutation[n_dims - 2]);
+    return Transpose(x, permutation);
+  });
+}
+
+XlaOp MaybeTransposeInMinorDims(XlaOp x, bool transpose) {
+  return transpose ? TransposeInMinorDims(x) : x;
+}
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/numeric.h b/tensorflow/compiler/xla/client/lib/matrix.h
similarity index 56%
rename from tensorflow/compiler/xla/client/lib/numeric.h
rename to tensorflow/compiler/xla/client/lib/matrix.h
index f62fdab4b0e5e84347cfaa1424a8c2e5c58dd3ce..8856f99c7a0fee8f315aac11fab392cf5536f57b 100644
--- a/tensorflow/compiler/xla/client/lib/numeric.h
+++ b/tensorflow/compiler/xla/client/lib/matrix.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_NUMERIC_H_
-#define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_NUMERIC_H_
+#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_MATRIX_H_
+#define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_MATRIX_H_
 
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -40,6 +40,34 @@ XlaOp UpperTriangle(XlaOp x);
 // Get the lower triangle part of the last two dimensions
 XlaOp LowerTriangle(XlaOp x);
 
+// Multiplies slices of two tensors in batches.
+
+// Multiplies all slices of `Tensor` `x` and `y` (each slice can be
+// viewed as an element of a batch), and arranges the individual results
+// in a single output tensor of the same batch size.
+//
+// The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]`
+// and `[..., r_y, c_y]`.
+//
+// The output tensor is 2-D or higher with shape `[..., r_o, c_o]`, where:
+//
+//     r_o = c_x if transpose_x else r_x
+//     c_o = r_y if transpose_y else c_y
+//
+// It is computed as:
+//
+//     output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :])
+xla::XlaOp BatchDot(
+    xla::XlaOp x, xla::XlaOp y,
+    xla::PrecisionConfig::Precision precision = xla::PrecisionConfig::DEFAULT);
+
+// Transposes a stack of matrices `x` by swapping the last two dimensions.
+xla::XlaOp TransposeInMinorDims(xla::XlaOp x);
+
+// Transposes `x` in its minor dimensions if `transpose` is true, otherwise
+// returns `x` unchanged.
+xla::XlaOp MaybeTransposeInMinorDims(xla::XlaOp x, bool transpose);
+
 }  // namespace xla
 
-#endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_NUMERIC_H_
+#endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_MATRIX_H_
diff --git a/tensorflow/compiler/xla/client/lib/numeric_test.cc b/tensorflow/compiler/xla/client/lib/matrix_test.cc
similarity index 53%
rename from tensorflow/compiler/xla/client/lib/numeric_test.cc
rename to tensorflow/compiler/xla/client/lib/matrix_test.cc
index 7d6aedd49462bd4f075f90d0b0f85c40f1191aa1..0593a7517ac125ca8dc5395cee76f6bc23232cd3 100644
--- a/tensorflow/compiler/xla/client/lib/numeric_test.cc
+++ b/tensorflow/compiler/xla/client/lib/matrix_test.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
+#include "tensorflow/compiler/xla/client/lib/matrix.h"
+
+#include "tensorflow/compiler/xla/client/lib/slicing.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
@@ -24,13 +26,13 @@ limitations under the License.
 namespace xla {
 namespace {
 
-class NumericTest : public ClientLibraryTestBase {
+class MatrixTest : public ClientLibraryTestBase {
  protected:
   template <typename T>
   void TestMatrixDiagonal();
 };
 
-XLA_TEST_F(NumericTest, Triangle) {
+XLA_TEST_F(MatrixTest, Triangle) {
   XlaBuilder builder(TestName());
   Array3D<int32> input(2, 3, 4);
   input.FillIota(0);
@@ -45,7 +47,7 @@ XLA_TEST_F(NumericTest, Triangle) {
 }
 
 template <typename T>
-void NumericTest::TestMatrixDiagonal() {
+void MatrixTest::TestMatrixDiagonal() {
   XlaBuilder builder("GetMatrixDiagonal");
   Array3D<T> input(2, 3, 4);
   input.FillIota(0);
@@ -58,11 +60,46 @@ void NumericTest::TestMatrixDiagonal() {
   ComputeAndCompareR2<T>(&builder, expected, {a_data.get()});
 }
 
-XLA_TEST_F(NumericTest, GetMatrixDiagonal_S32) { TestMatrixDiagonal<int32>(); }
+XLA_TEST_F(MatrixTest, GetMatrixDiagonal_S32) { TestMatrixDiagonal<int32>(); }
+
+XLA_TEST_F(MatrixTest, GetMatrixDiagonal_S64) { TestMatrixDiagonal<int64>(); }
+
+XLA_TEST_F(MatrixTest, GetMatrixDiagonal_F32) { TestMatrixDiagonal<float>(); }
+
+Array3D<float> BatchedAValsFull() {
+  return {{
+              {2, 0, 1, 2},
+              {3, 6, 0, 1},
+              {4, 7, 9, 0},
+              {5, 8, 10, 11},
+          },
+          {
+              {16, 24, 8, 12},
+              {24, 61, 82, 48},
+              {8, 82, 456, 106},
+              {12, 48, 106, 62},
+          }};
+}
+
+XLA_TEST_F(MatrixTest, RowBatchDot) {
+  XlaBuilder builder(TestName());
+
+  int n = 4;
 
-XLA_TEST_F(NumericTest, GetMatrixDiagonal_S64) { TestMatrixDiagonal<int64>(); }
+  XlaOp a, row, index;
+  auto a_data =
+      CreateR3Parameter<float>(BatchedAValsFull(), 0, "a", &builder, &a);
+  auto row_data = CreateR3Parameter<float>({{{9, 1, 0, 0}}, {{2, 4, 0, 0}}}, 1,
+                                           "row", &builder, &row);
+  // Select {{3, 6, 0, 1}, {24, 61,  82,  48}} out of BatchedAValsFull().
+  auto index_data = CreateR0Parameter<int>(1, 2, "index", &builder, &index);
 
-XLA_TEST_F(NumericTest, GetMatrixDiagonal_F32) { TestMatrixDiagonal<float>(); }
+  auto l_index = DynamicSliceInMinorDims(
+      a, {index, ConstantR0<int32>(&builder, 0)}, {1, n});
+  BatchDot(l_index, TransposeInMinorDims(row));
 
+  ComputeAndCompareR3<float>(&builder, {{{33}}, {{292}}},
+                             {a_data.get(), row_data.get(), index_data.get()});
+}
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/numeric.cc b/tensorflow/compiler/xla/client/lib/numeric.cc
deleted file mode 100644
index 377654220b5df4487e9e194361473d54ff46a54e..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/client/lib/numeric.cc
+++ /dev/null
@@ -1,89 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <numeric>
-#include <vector>
-
-#include "absl/types/span.h"
-#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
-#include "tensorflow/compiler/xla/client/lib/constants.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
-
-namespace xla {
-
-XlaOp IdentityMatrix(XlaBuilder* builder, PrimitiveType type, int64 m,
-                     int64 n) {
-  auto a = Iota(builder, type, m);
-  auto b = Iota(builder, type, n);
-  auto indicator = Eq(a, Broadcast(b, {m}), /*broadcast_dimensions=*/{0});
-  return ConvertElementType(indicator, type);
-}
-
-XlaOp GetMatrixDiagonal(XlaOp x) {
-  XlaBuilder* builder = x.builder();
-  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
-    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
-    const int64 n_dims = ShapeUtil::Rank(shape);
-    TF_RET_CHECK(n_dims >= 2);
-    const int64 m = shape.dimensions(n_dims - 2);
-    const int64 n = shape.dimensions(n_dims - 1);
-    absl::Span<const int64> major_dims =
-        AsInt64Slice(shape.dimensions()).subspan(/*pos=*/0, /*len=*/n_dims - 2);
-    auto a = Iota(builder, U32, n);
-    auto b = Iota(builder, U32, m);
-    auto indicator = Eq(b, Broadcast(a, {m}), /*broadcast_dimensions=*/{0});
-    auto mask = Broadcast(indicator, major_dims);
-
-    // TPUs don't support S64 add reduction at the moment. But fortunately
-    // OR-reductions work just as well for integers.
-    XlaComputation reducer =
-        primitive_util::IsIntegralType(shape.element_type())
-            ? CreateScalarOrComputation(shape.element_type(), builder)
-            : CreateScalarAddComputation(shape.element_type(), builder);
-
-    return Reduce(Select(mask, x, Zeros(builder, shape)), ScalarLike(x, 0),
-                  reducer, {m >= n ? n_dims - 2 : n_dims - 1});
-  });
-}
-
-XlaOp Triangle(XlaOp x, bool lower) {
-  XlaBuilder* builder = x.builder();
-  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
-    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
-    const int64 n_dims = ShapeUtil::Rank(shape);
-    TF_RET_CHECK(n_dims >= 2);
-    const int64 m = shape.dimensions(n_dims - 2);
-    const int64 n = shape.dimensions(n_dims - 1);
-    absl::Span<const int64> major_dims =
-        AsInt64Slice(shape.dimensions()).subspan(/*pos=*/0, /*len=*/n_dims - 2);
-    auto a = Iota(builder, U32, n);
-    auto b = Iota(builder, U32, m);
-    xla::XlaOp indicator;
-    if (lower) {
-      indicator = Ge(b, Broadcast(a, {m}), /*broadcast_dimensions=*/{0});
-    } else {
-      indicator = Le(b, Broadcast(a, {m}), /*broadcast_dimensions=*/{0});
-    }
-    auto mask = Broadcast(indicator, major_dims);
-
-    return Select(mask, x, Zeros(builder, shape));
-  });
-}
-
-XlaOp UpperTriangle(XlaOp x) { return Triangle(x, false); }
-
-XlaOp LowerTriangle(XlaOp x) { return Triangle(x, true); }
-
-}  // namespace xla
diff --git a/tensorflow/compiler/tf2xla/lib/qr.cc b/tensorflow/compiler/xla/client/lib/qr.cc
similarity index 55%
rename from tensorflow/compiler/tf2xla/lib/qr.cc
rename to tensorflow/compiler/xla/client/lib/qr.cc
index 6b3f2b6e065b5c99e2d0248237369ecc30188aa5..72ca653173b78d9338f632c41779f2a30db1e978 100644
--- a/tensorflow/compiler/tf2xla/lib/qr.cc
+++ b/tensorflow/compiler/xla/client/lib/qr.cc
@@ -13,18 +13,17 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/tf2xla/lib/qr.h"
+#include "tensorflow/compiler/xla/client/lib/qr.h"
 
 #include <memory>
 #include <vector>
 
-#include "tensorflow/compiler/tf2xla/lib/batch_dot.h"
-#include "tensorflow/compiler/tf2xla/lib/util.h"
-#include "tensorflow/compiler/tf2xla/lib/while_loop.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/client/lib/loops.h"
 #include "tensorflow/compiler/xla/client/lib/math.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
+#include "tensorflow/compiler/xla/client/lib/matrix.h"
+#include "tensorflow/compiler/xla/client/lib/slicing.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -32,10 +31,18 @@ limitations under the License.
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/core/lib/core/errors.h"
 
-namespace tensorflow {
+namespace xla {
 
 namespace {
 
+std::vector<int64> ConcatVectors(absl::Span<const int64> xs,
+                                 absl::Span<const int64> ys) {
+  std::vector<int64> output(xs.size() + ys.size());
+  std::copy(xs.begin(), xs.end(), output.begin());
+  std::copy(ys.begin(), ys.end(), output.begin() + xs.size());
+  return output;
+}
+
 // Computes a Householder reflection of the form:
 // H = I - tau v v.T.
 // such that
@@ -65,52 +72,47 @@ namespace {
 //   return (v, tau, beta)
 // TODO(phawkins): LAPACK's xLARFG implementation has code for handling
 // overflows in the norm/beta calculations. Perhaps do the same here.
-xla::Status House(xla::XlaOp x, xla::XlaOp k,
-                  absl::Span<const int64> batch_dims, const int64 m,
-                  xla::XlaOp* v, xla::XlaOp* tau, xla::XlaOp* beta) {
-  xla::XlaBuilder* const builder = x.builder();
-  TF_ASSIGN_OR_RETURN(xla::Shape x_shape, builder->GetShape(x));
-  const xla::PrimitiveType type = x_shape.element_type();
+Status House(XlaOp x, XlaOp k, absl::Span<const int64> batch_dims,
+             const int64 m, XlaOp* v, XlaOp* tau, XlaOp* beta) {
+  XlaBuilder* const builder = x.builder();
+  TF_ASSIGN_OR_RETURN(Shape x_shape, builder->GetShape(x));
+  const PrimitiveType type = x_shape.element_type();
 
   std::vector<int64> batch_dim_ids(batch_dims.size());
   std::iota(batch_dim_ids.begin(), batch_dim_ids.end(), 0);
   const int64 minor_dim = batch_dims.size();
 
-  xla::XlaOp zero = xla::ScalarLike(x, 0.0);
-  xla::XlaOp one = xla::ScalarLike(x, 1.0);
+  XlaOp zero = ScalarLike(x, 0.0);
+  XlaOp one = ScalarLike(x, 1.0);
 
   // alpha = x[k]
-  xla::XlaOp alpha =
-      xla::Reshape(DynamicSliceInMinorDims(x, {k}, {1}), batch_dims);
+  XlaOp alpha = Reshape(DynamicSliceInMinorDims(x, {k}, {1}), batch_dims);
 
   // Compute x[k+1:] (padded with zeros in elements 0..k)
-  xla::XlaOp iota = xla::Iota(builder, xla::S32, m);
-  xla::XlaOp x_after_k =
-      xla::Mul(x, xla::ConvertElementType(xla::Gt(iota, k), type),
-               /*broadcast_dimensions=*/{minor_dim});
+  XlaOp iota = Iota(builder, S32, m);
+  XlaOp x_after_k = Mul(x, ConvertElementType(Gt(iota, k), type),
+                        /*broadcast_dimensions=*/{minor_dim});
 
   // sigma = np.dot(x[k+1:], x[k+1:])
-  auto sigma =
-      xla::Reduce(x_after_k * x_after_k, zero,
-                  xla::CreateScalarAddComputation(type, builder), {minor_dim});
+  auto sigma = Reduce(x_after_k * x_after_k, zero,
+                      CreateScalarAddComputation(type, builder), {minor_dim});
   // mu = np.sqrt(x[k]*x[k] + sigma)
-  auto mu = xla::Sqrt(xla::Square(alpha) + sigma);
+  auto mu = Sqrt(Square(alpha) + sigma);
 
-  auto sigma_is_zero = xla::Eq(sigma, zero);
+  auto sigma_is_zero = Eq(sigma, zero);
 
-  *beta = xla::Select(sigma_is_zero, alpha, -xla::Sign(alpha) * mu);
-  *tau = xla::Select(sigma_is_zero, xla::Broadcast(zero, batch_dims),
-                     (*beta - alpha) / *beta);
-  auto divisor = xla::Select(sigma_is_zero, xla::Broadcast(one, batch_dims),
-                             alpha - *beta);
+  *beta = Select(sigma_is_zero, alpha, -Sign(alpha) * mu);
+  *tau = Select(sigma_is_zero, Broadcast(zero, batch_dims),
+                (*beta - alpha) / *beta);
+  auto divisor =
+      Select(sigma_is_zero, Broadcast(one, batch_dims), alpha - *beta);
 
-  auto e_k = xla::Broadcast(xla::ConvertElementType(xla::Eq(iota, k), type),
-                            std::vector<int64>(batch_dims.size(), 1));
+  auto e_k = Broadcast(ConvertElementType(Eq(iota, k), type),
+                       std::vector<int64>(batch_dims.size(), 1));
 
   // Form v as [0, 0, ..., 1] ++ x[k+1:] / divisor
   // If sigma is zero, x[k+1:] is zero, so use any non-zero divisor.
-  *v = e_k +
-       xla::Div(x_after_k, divisor, /*broadcast_dimensions=*/batch_dim_ids);
+  *v = e_k + Div(x_after_k, divisor, /*broadcast_dimensions=*/batch_dim_ids);
   return Status::OK();
 }
 
@@ -143,94 +145,86 @@ xla::Status House(xla::XlaOp x, xla::XlaOp k,
 //   return (q, vs, taus)
 struct QRBlockResult {
   // The factored R value
-  xla::XlaOp r;
+  XlaOp r;
 
   // Representation of the Householder matrices I - beta v v.T
-  xla::XlaOp taus;  // Shape: [..., n]
-  xla::XlaOp vs;    // Shape: [..., m, n]
+  XlaOp taus;  // Shape: [..., n]
+  XlaOp vs;    // Shape: [..., m, n]
 };
-xla::StatusOr<QRBlockResult> QRBlock(
-    xla::XlaOp a, xla::PrecisionConfig::Precision precision) {
-  xla::XlaBuilder* builder = a.builder();
-  TF_ASSIGN_OR_RETURN(xla::Shape a_shape, builder->GetShape(a));
-  const int num_dims = xla::ShapeUtil::Rank(a_shape);
+StatusOr<QRBlockResult> QRBlock(XlaOp a, PrecisionConfig::Precision precision) {
+  XlaBuilder* builder = a.builder();
+  TF_ASSIGN_OR_RETURN(Shape a_shape, builder->GetShape(a));
+  const int num_dims = ShapeUtil::Rank(a_shape);
   if (num_dims < 2) {
-    return errors::InvalidArgument("Arguments to QR must have rank >= 2: ",
-                                   num_dims);
+    return InvalidArgument("Argument to QR must have rank >= 2; got shape %s",
+                           a_shape.ToString());
   }
-  xla::PrimitiveType type = a_shape.element_type();
+  PrimitiveType type = a_shape.element_type();
 
-  const int64 m = xla::ShapeUtil::GetDimension(a_shape, -2);
-  const int64 n = xla::ShapeUtil::GetDimension(a_shape, -1);
+  const int64 m = ShapeUtil::GetDimension(a_shape, -2);
+  const int64 n = ShapeUtil::GetDimension(a_shape, -1);
 
   const int64 num_batch_dims = num_dims - 2;
   std::vector<int64> batch_dims(num_batch_dims);
   for (int i = 0; i < num_batch_dims; ++i) {
-    batch_dims[i] = xla::ShapeUtil::GetDimension(a_shape, i);
+    batch_dims[i] = ShapeUtil::GetDimension(a_shape, i);
   }
 
   std::vector<int64> batch_dim_indices(num_batch_dims);
   std::iota(batch_dim_indices.begin(), batch_dim_indices.end(), 0);
 
-  auto qr_body_fn =
-      [&](xla::XlaOp j, absl::Span<const xla::XlaOp> values,
-          xla::XlaBuilder* builder) -> xla::StatusOr<std::vector<xla::XlaOp>> {
+  auto qr_body_fn = [&](XlaOp j, absl::Span<const XlaOp> values,
+                        XlaBuilder* builder) -> StatusOr<std::vector<XlaOp>> {
     auto a = values[0];
     auto vs = values[1];
     auto taus = values[2];
 
     // v, beta = house(a[:, j], j)
     auto x = DynamicSliceInMinorDims(a, {j}, {1});
-    xla::XlaOp v, tau, beta;
-    TF_RETURN_IF_ERROR(House(xla::Collapse(x, {num_dims - 2, num_dims - 1}), j,
+    XlaOp v, tau, beta;
+    TF_RETURN_IF_ERROR(House(Collapse(x, {num_dims - 2, num_dims - 1}), j,
                              batch_dims, m, &v, &tau, &beta));
 
     std::vector<int64> shape = batch_dims;
     shape.push_back(1);
     shape.push_back(m);
-    auto v_broadcast = xla::Reshape(v, shape);
+    auto v_broadcast = Reshape(v, shape);
     // a[:, :] -= tau * np.dot(v[:, np.newaxis],
     //                          np.dot(v[np.newaxis, :], a[:, :]))
-    auto vva =
-        BatchDot(v_broadcast, a, /*transpose_x=*/false, /*transpose_y=*/false,
-                 /*conjugate_x=*/false, /*conjugate_y=*/false, precision);
-    vva =
-        BatchDot(v_broadcast, vva, /*transpose_x=*/true, /*transpose_y=*/false,
-                 /*conjugate_x=*/false, /*conjugate_y=*/false, precision);
-    a = a - xla::Mul(tau, vva,
-                     /*broadcast_dimensions=*/batch_dim_indices);
+    auto vva = BatchDot(v_broadcast, a, precision);
+    vva = BatchDot(TransposeInMinorDims(v_broadcast), vva, precision);
+    a = a - Mul(tau, vva,
+                /*broadcast_dimensions=*/batch_dim_indices);
 
     // It is more precise to populate column 'k' explicitly, rather than
     // computing it implicitly by applying the Householder transformation.
     // a[k,k] = beta
     // a[k+1:,k] = np.zeros([m-k-1], dtype=a.dtype)
-    auto iota = xla::Reshape(xla::Iota(a.builder(), xla::S32, m), {m, 1});
-    auto predecessor_mask = xla::ConvertElementType(xla::Lt(iota, j), type);
-    auto mask = xla::Broadcast(xla::ConvertElementType(xla::Eq(iota, j), type),
-                               std::vector<int64>(batch_dims.size(), 1));
-    auto new_x =
-        xla::Mul(x, predecessor_mask,
-                 /*broadcast_dimensions=*/{num_dims - 2, num_dims - 1}) +
-        xla::Mul(beta, mask, /*broadcast_dimensions=*/batch_dim_indices);
+    auto iota = Reshape(Iota(a.builder(), S32, m), {m, 1});
+    auto predecessor_mask = ConvertElementType(Lt(iota, j), type);
+    auto mask = Broadcast(ConvertElementType(Eq(iota, j), type),
+                          std::vector<int64>(batch_dims.size(), 1));
+    auto new_x = Mul(x, predecessor_mask,
+                     /*broadcast_dimensions=*/{num_dims - 2, num_dims - 1}) +
+                 Mul(beta, mask, /*broadcast_dimensions=*/batch_dim_indices);
     a = DynamicUpdateSliceInMinorDims(a, new_x, {j});
 
     // vs[:, j] = v
     vs = DynamicUpdateSliceInMinorDims(
-        vs, xla::Reshape(v, ConcatVectors(batch_dims, {m, 1})), {j});
+        vs, Reshape(v, ConcatVectors(batch_dims, {m, 1})), {j});
     // taus[j] = tau
     taus = DynamicUpdateSliceInMinorDims(
-        taus, xla::Reshape(tau, ConcatVectors(batch_dims, {1})), {j});
-    return std::vector<xla::XlaOp>{a, vs, taus};
+        taus, Reshape(tau, ConcatVectors(batch_dims, {1})), {j});
+    return std::vector<XlaOp>{a, vs, taus};
   };
 
-  auto vs = xla::Zeros(builder, xla::ShapeUtil::MakeShape(
-                                    type, ConcatVectors(batch_dims, {m, n})));
-  auto taus = xla::Zeros(
-      builder, xla::ShapeUtil::MakeShape(type, ConcatVectors(batch_dims, {n})));
+  auto vs = Zeros(
+      builder, ShapeUtil::MakeShape(type, ConcatVectors(batch_dims, {m, n})));
+  auto taus = Zeros(builder,
+                    ShapeUtil::MakeShape(type, ConcatVectors(batch_dims, {n})));
 
-  TF_ASSIGN_OR_RETURN(auto values,
-                      XlaForEachIndex(std::min(m, n), xla::S32, qr_body_fn,
-                                      {a, vs, taus}, "qr", builder));
+  TF_ASSIGN_OR_RETURN(auto values, ForEachIndex(std::min(m, n), S32, qr_body_fn,
+                                                {a, vs, taus}, "qr", builder));
 
   QRBlockResult result;
   result.r = values[0];
@@ -254,62 +248,58 @@ xla::StatusOr<QRBlockResult> QRBlock(
 // return W
 // There is no need to return Y since at termination of the loop it is equal to
 // vs.
-xla::StatusOr<xla::XlaOp> ComputeWYRepresentation(
-    xla::PrimitiveType type, absl::Span<const int64> batch_dims, xla::XlaOp vs,
-    xla::XlaOp taus, int64 m, int64 n,
-    xla::PrecisionConfig::Precision precision) {
+StatusOr<XlaOp> ComputeWYRepresentation(PrimitiveType type,
+                                        absl::Span<const int64> batch_dims,
+                                        XlaOp vs, XlaOp taus, int64 m, int64 n,
+                                        PrecisionConfig::Precision precision) {
   std::vector<int64> batch_dim_indices(batch_dims.size());
   std::iota(batch_dim_indices.begin(), batch_dim_indices.end(), 0);
   int64 n_index = batch_dims.size() + 1;
 
-  auto body_fn =
-      [&](xla::XlaOp j, absl::Span<const xla::XlaOp> values,
-          xla::XlaBuilder* builder) -> xla::StatusOr<std::vector<xla::XlaOp>> {
+  auto body_fn = [&](XlaOp j, absl::Span<const XlaOp> values,
+                     XlaBuilder* builder) -> StatusOr<std::vector<XlaOp>> {
     auto w = values[0];
     auto y = values[1];
     const auto vs = values[2];
     const auto taus = values[3];
 
     // Want j values in range [1, ... n).
-    j = j + xla::ConstantR0<int32>(builder, 1);
+    j = j + ConstantR0<int32>(builder, 1);
     // vs has shape [..., m, 1]
     auto v = DynamicSliceInMinorDims(vs, {j}, {1});
     // beta has shape [..., 1]
     auto beta = DynamicSliceInMinorDims(taus, {j}, {1});
 
     // yv has shape [..., n, 1]
-    auto yv = BatchDot(y, v, /*transpose_x=*/true, /*transpose_y=*/false,
-                       /*conjugate_x=*/false, /*conjugate_y=*/false, precision);
+    auto yv = BatchDot(TransposeInMinorDims(y), v, precision);
     // wyv has shape [..., m, 1]
-    auto wyv =
-        BatchDot(w, yv, /*transpose_x=*/false, /*transpose_y=*/false,
-                 /*conjugate_x=*/false, /*conjugate_y=*/false, precision);
+    auto wyv = BatchDot(w, yv, precision);
 
-    auto z = xla::Mul(
+    auto z = Mul(
         -beta, v + wyv,
         /*broadcast_dimensions=*/ConcatVectors(batch_dim_indices, {n_index}));
 
     w = DynamicUpdateSliceInMinorDims(w, z, {j});
     y = DynamicUpdateSliceInMinorDims(y, v, {j});
 
-    return std::vector<xla::XlaOp>{w, y, vs, taus};
+    return std::vector<XlaOp>{w, y, vs, taus};
   };
 
-  xla::XlaBuilder* builder = vs.builder();
-  auto w = xla::Zeros(builder, xla::ShapeUtil::MakeShape(
-                                   type, ConcatVectors(batch_dims, {m, n})));
+  XlaBuilder* builder = vs.builder();
+  auto w = Zeros(builder,
+                 ShapeUtil::MakeShape(type, ConcatVectors(batch_dims, {m, n})));
   auto y = w;
   auto v = SliceInMinorDims(vs, {0}, {1});
   auto beta = SliceInMinorDims(taus, {0}, {1});
   y = UpdateSliceInMinorDims(y, v, {0});
-  auto bv = xla::Mul(
-      -beta, v,
-      /*broadcast_dimensions=*/ConcatVectors(batch_dim_indices, {n_index}));
+  auto bv =
+      Mul(-beta, v,
+          /*broadcast_dimensions=*/ConcatVectors(batch_dim_indices, {n_index}));
   w = UpdateSliceInMinorDims(w, bv, {0});
 
   TF_ASSIGN_OR_RETURN(
-      auto values, XlaForEachIndex(n - 1, xla::S32, body_fn, {w, y, vs, taus},
-                                   "wy", builder));
+      auto values,
+      ForEachIndex(n - 1, S32, body_fn, {w, y, vs, taus}, "wy", builder));
   return values[0];
 }
 
@@ -330,34 +320,34 @@ xla::StatusOr<xla::XlaOp> ComputeWYRepresentation(
 //   return (q, a)
 // TODO(phawkins): consider using UT transformations (in the form I - V U V')
 // rather than WY transformations.
-xla::StatusOr<QRDecompositionResult> QRDecomposition(
-    xla::XlaOp a, bool full_matrices, int64 block_size,
-    xla::PrecisionConfig::Precision precision) {
-  xla::XlaBuilder* builder = a.builder();
-  TF_ASSIGN_OR_RETURN(xla::Shape a_shape, builder->GetShape(a));
-  const int num_dims = xla::ShapeUtil::Rank(a_shape);
+StatusOr<QRDecompositionResult> QRDecomposition(
+    XlaOp a, bool full_matrices, int64 block_size,
+    PrecisionConfig::Precision precision) {
+  XlaBuilder* builder = a.builder();
+  TF_ASSIGN_OR_RETURN(Shape a_shape, builder->GetShape(a));
+  const int num_dims = ShapeUtil::Rank(a_shape);
   if (num_dims < 2) {
-    return errors::InvalidArgument("Arguments to QR must have rank >= 2: ",
-                                   num_dims);
+    return InvalidArgument("Arguments to QR must have rank >= 2: got shape %s",
+                           a_shape.ToString());
   }
-  xla::PrimitiveType type = a_shape.element_type();
+  PrimitiveType type = a_shape.element_type();
 
-  const int64 m = xla::ShapeUtil::GetDimension(a_shape, -2);
-  const int64 n = xla::ShapeUtil::GetDimension(a_shape, -1);
+  const int64 m = ShapeUtil::GetDimension(a_shape, -2);
+  const int64 n = ShapeUtil::GetDimension(a_shape, -1);
   const int64 p = std::min(m, n);
 
   if (block_size < 1) {
-    return errors::InvalidArgument(
-        "block_size argument to QR must be >= 1; got ", block_size);
+    return InvalidArgument("block_size argument to QR must be >= 1; got %d",
+                           block_size);
   }
 
   const int64 num_batch_dims = num_dims - 2;
   std::vector<int64> batch_dims(num_batch_dims);
   for (int i = 0; i < num_batch_dims; ++i) {
-    batch_dims[i] = xla::ShapeUtil::GetDimension(a_shape, i);
+    batch_dims[i] = ShapeUtil::GetDimension(a_shape, i);
   }
 
-  auto q = xla::Broadcast(xla::IdentityMatrix(builder, type, m, m), batch_dims);
+  auto q = Broadcast(IdentityMatrix(builder, type, m, m), batch_dims);
   for (int64 i = 0; i < p; i += block_size) {
     int64 k = std::min(block_size, p - i);
 
@@ -375,23 +365,15 @@ xla::StatusOr<QRDecompositionResult> QRDecomposition(
 
     // a[i:, i+k:] += np.dot(Y, np.dot(W.T, a[i:, i+k:]))
     auto a_panel = SliceInMinorDims(a, {i, i + k}, {m, n});
-    auto a_update =
-        BatchDot(w, a_panel, /*transpose_x=*/true, /*transpose_y=*/false,
-                 /*conjugate_x=*/false, /*conjugate_y=*/false, precision);
-    a_update =
-        BatchDot(y, a_update, /*transpose_x=*/false, /*transpose_y=*/false,
-                 /*conjugate_x=*/false, /*conjugate_y=*/false, precision);
+    auto a_update = BatchDot(TransposeInMinorDims(w), a_panel, precision);
+    a_update = BatchDot(y, a_update, precision);
     a_panel = a_panel + a_update;
     a = UpdateSliceInMinorDims(a, a_panel, {i, i + k});
 
     // q[:, i:] += np.dot(np.dot(q[:, i:], W), Y.T))
     auto q_panel = SliceInMinorDims(q, {0, i}, {m, m});
-    auto q_update =
-        BatchDot(q_panel, w, /*transpose_x=*/false, /*transpose_y=*/false,
-                 /*conjugate_x=*/false, /*conjugate_y=*/false, precision);
-    q_update = BatchDot(q_update, y, /*transpose_x=*/false,
-                        /*transpose_y=*/true, /*conjugate_x=*/false,
-                        /*conjugate_y=*/false, precision);
+    auto q_update = BatchDot(q_panel, w, precision);
+    q_update = BatchDot(q_update, TransposeInMinorDims(y), precision);
     q_panel = q_panel + q_update;
     q = UpdateSliceInMinorDims(q, q_panel, {0, i});
   }
@@ -408,4 +390,4 @@ xla::StatusOr<QRDecompositionResult> QRDecomposition(
   return result;
 }
 
-}  // namespace tensorflow
+}  // namespace xla
diff --git a/tensorflow/compiler/tf2xla/lib/qr.h b/tensorflow/compiler/xla/client/lib/qr.h
similarity index 74%
rename from tensorflow/compiler/tf2xla/lib/qr.h
rename to tensorflow/compiler/xla/client/lib/qr.h
index 24b537ac8b63b93e734c3d0e335ea455f7d51a54..827c8eeca05ef09a0d77363eb3c40961b95813d8 100644
--- a/tensorflow/compiler/tf2xla/lib/qr.h
+++ b/tensorflow/compiler/xla/client/lib/qr.h
@@ -13,13 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_TF2XLA_LIB_QR_H_
-#define TENSORFLOW_COMPILER_TF2XLA_LIB_QR_H_
+#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_QR_H_
+#define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_QR_H_
 
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
-namespace tensorflow {
+namespace xla {
 
 // Computes the QR decompositions of a batch of matrices. That is,
 // given a (batched) matrix a, computes an orthonormal matrix Q and an
@@ -29,14 +29,14 @@ namespace tensorflow {
 // the block size to use.
 // TODO(phawkins): handle the complex case.
 struct QRDecompositionResult {
-  xla::XlaOp q;
-  xla::XlaOp r;
+  XlaOp q;
+  XlaOp r;
 };
 
-xla::StatusOr<QRDecompositionResult> QRDecomposition(
-    xla::XlaOp a, bool full_matrices, int64 block_size = 128,
-    xla::PrecisionConfig::Precision precision = xla::PrecisionConfig::HIGHEST);
+StatusOr<QRDecompositionResult> QRDecomposition(
+    XlaOp a, bool full_matrices, int64 block_size = 128,
+    PrecisionConfig::Precision precision = PrecisionConfig::HIGHEST);
 
-}  // namespace tensorflow
+}  // namespace xla
 
-#endif  // TENSORFLOW_COMPILER_TF2XLA_LIB_QR_H_
+#endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_QR_H_
diff --git a/tensorflow/compiler/xla/client/lib/qr_test.cc b/tensorflow/compiler/xla/client/lib/qr_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b27d364b62444d6d5fb1278b6e6461affc15b2e6
--- /dev/null
+++ b/tensorflow/compiler/xla/client/lib/qr_test.cc
@@ -0,0 +1,93 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/lib/qr.h"
+
+#include "tensorflow/compiler/xla/array2d.h"
+#include "tensorflow/compiler/xla/array3d.h"
+#include "tensorflow/compiler/xla/client/lib/matrix.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace {
+
+using QrTest = xla::ClientLibraryTestBase;
+
+XLA_TEST_F(QrTest, Simple) {
+  xla::XlaBuilder builder(TestName());
+
+  xla::Array2D<float> a_vals({
+      {4, 6, 8, 10},
+      {6, 45, 54, 63},
+      {8, 54, 146, 166},
+      {10, 63, 166, 310},
+  });
+
+  xla::XlaOp a;
+  auto a_data = CreateR2Parameter<float>(a_vals, 0, "a", &builder, &a);
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto result,
+      xla::QRDecomposition(a, /*full_matrices=*/true, /*block_size=*/2));
+
+  // Verifies that the decomposition composes back to the original matrix.
+  //
+  // This isn't a terribly demanding test, (e.g., we should verify that Q is
+  // orthonormal and R is upper-triangular) but it's awkward to write such tests
+  // without more linear algebra libraries. It's easier to test the numerics
+  // from Python, anyway, where we have access to numpy and scipy.
+  xla::BatchDot(result.q, result.r, xla::PrecisionConfig::HIGHEST);
+
+  ComputeAndCompareR2<float>(&builder, a_vals, {a_data.get()},
+                             xla::ErrorSpec(1e-4, 1e-4));
+}
+
+XLA_TEST_F(QrTest, SimpleBatched) {
+  xla::XlaBuilder builder(TestName());
+
+  xla::Array3D<float> a_vals({
+      {
+          {4, 6, 8, 10},
+          {6, 45, 54, 63},
+          {8, 54, 146, 166},
+          {10, 63, 166, 310},
+      },
+      {
+          {16, 24, 8, 12},
+          {24, 61, 82, 48},
+          {8, 82, 456, 106},
+          {12, 48, 106, 62},
+      },
+  });
+
+  xla::XlaOp a;
+  auto a_data = CreateR3Parameter<float>(a_vals, 0, "a", &builder, &a);
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto result,
+      xla::QRDecomposition(a, /*full_matrices=*/true, /*block_size=*/2));
+
+  xla::BatchDot(result.q, result.r, xla::PrecisionConfig::HIGHEST);
+
+  ComputeAndCompareR3<float>(&builder, a_vals, {a_data.get()},
+                             xla::ErrorSpec(1e-4, 1e-4));
+}
+
+}  // namespace
diff --git a/tensorflow/compiler/xla/client/lib/quantize.h b/tensorflow/compiler/xla/client/lib/quantize.h
new file mode 100644
index 0000000000000000000000000000000000000000..26dbbd5b00bd1a29f4047c9a4294fcac7340cf6c
--- /dev/null
+++ b/tensorflow/compiler/xla/client/lib/quantize.h
@@ -0,0 +1,186 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_QUANTIZE_H_
+#define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_QUANTIZE_H_
+
+#include <limits>
+#include <numeric>
+#include <vector>
+
+#include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/bfloat16/bfloat16.h"
+
+namespace xla {
+
+constexpr int64 kBitsOfByte = 8;
+
+// Represents the range used for quantization
+struct QuantizedRange {
+  QuantizedRange() = default;
+  QuantizedRange(float min_in, float max_in) : min(min_in), max(max_in) {}
+
+  bool operator==(const QuantizedRange& rhs) const {
+    return this->min == rhs.min && this->max == rhs.max;
+  }
+
+  bool operator!=(const QuantizedRange& rhs) const { return !(*this == rhs); }
+
+  tensorflow::bfloat16 min = tensorflow::bfloat16(0.0f);
+  tensorflow::bfloat16 max = tensorflow::bfloat16(0.0f);
+};
+
+template <typename T>
+inline std::vector<uint32> PackToUint32(absl::Span<const T> input) {
+  const int64 kElementsPerPack = sizeof(uint32) / sizeof(T);
+  const int64 input_size = input.size();
+  const int64 output_size = CeilOfRatio(input_size, kElementsPerPack);
+
+  std::vector<uint32> output_vec;
+  constexpr int64 kShiftBits = sizeof(T) / sizeof(uint8) * kBitsOfByte;
+
+  for (int64 i = 0; i < output_size; i++) {
+    uint32 result = 0;
+    for (int64 p = 0; p < kElementsPerPack; p++) {
+      int64 index = i * kElementsPerPack + p;
+      if (index < input_size) {
+        int64 total_shift_bits = kShiftBits * (kElementsPerPack - p - 1);
+        result |= (input[index] << total_shift_bits);
+      }
+    }
+    output_vec.push_back(result);
+  }
+
+  return output_vec;
+}
+
+// Dequantize the quantized input of packed uint32 to bfloat16.
+// Only uint8 or uint16 is supported for the original unpacked input.
+// Returns a tensor of shape [d0,..., dn * unpack_size] if
+// input shape is [d0, ..., dn], where unpack_size = sizeof(unit32) / sizeof(T).
+// If transpose_output is true, will return a tensor of shape
+// [dn * unpack_size, dn-1, ..., d1, d0]. transpose_output is faster when
+// input's rank higher than 1. The input needs to be transposed to use
+// transpose_output feature.
+template <typename T>
+inline XlaOp Dequantize(XlaOp input, const QuantizedRange& range,
+                        absl::string_view mode_string = "MIN_COMBINED",
+                        bool transpose_output = false) {
+  XlaBuilder* const builder = input.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    float half_range =
+        !std::is_signed<T>::value
+            ? 0.0f
+            : (static_cast<float>(std::numeric_limits<T>::max()) -
+               std::numeric_limits<T>::min() + 1) /
+                  2.0f;
+    const int64 unpack_size = sizeof(uint32) / sizeof(T);
+    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(input));
+
+    auto element_type = shape.element_type();
+    if (element_type != U32) {
+      return InvalidArgument(
+          "Only U32 is supported for input type of xla::Dequantize Op.");
+    }
+
+    // Broadcast the input to [unpack_size, d0, ..., dn] if input size is
+    // [d0, ..., dn].
+    auto broadcast_input = Broadcast(input, {unpack_size});
+
+    XlaOp iota_r1 = Iota(builder, U32, unpack_size);
+    // Highest significant bytes needs to shift more bytes than lower
+    // significant bytes.
+    XlaOp shift_bytes =
+        xla::ConstantR0<uint32>(builder, unpack_size - 1) - iota_r1;
+
+    const int bytes_of_type = sizeof(T) / sizeof(uint8);
+    std::vector<uint32> shift_vec(unpack_size, kBitsOfByte * bytes_of_type);
+    XlaOp shift_bits =
+        shift_bytes * xla::ConstantR1<uint32>(builder, shift_vec);
+
+    // Make bit_mask for different data type T.
+    uint32 bit_mask = 0x00000000;
+    for (int i = 0; i < bytes_of_type; i++) {
+      bit_mask <<= kBitsOfByte;
+      bit_mask |= 0x000000ff;
+    }
+
+    std::vector<int64> shift_transpose_dimensions(shape.dimensions_size());
+    std::iota(shift_transpose_dimensions.begin(),
+              shift_transpose_dimensions.end(), 0);
+    shift_transpose_dimensions.insert(shift_transpose_dimensions.begin(), 1,
+                                      shape.dimensions_size());
+
+    // Shift the input by sizeof(T) bytes and apply bit_mask to unpack.
+    XlaOp shifted_input = ShiftRightLogical(
+        broadcast_input, Transpose(Broadcast(shift_bits, shape.dimensions()),
+                                   shift_transpose_dimensions));
+    XlaOp unpack_input =
+        And(shifted_input, xla::ConstantR0<uint32>(builder, bit_mask));
+
+    XlaOp result;
+
+    if (mode_string == "MIN_COMBINED") {
+      const tensorflow::bfloat16 scale_factor =
+          (range.max - range.min) /
+          (static_cast<tensorflow::bfloat16>(std::numeric_limits<T>::max() -
+                                             std::numeric_limits<T>::min()));
+      // result = bfloat16(input + half_range) * scale_factor + range.min
+      XlaOp unpack_input_bf16 = ConvertElementType(unpack_input, BF16);
+      XlaOp half_range_bf16 = xla::ConstantR0<tensorflow::bfloat16>(
+          builder, static_cast<bfloat16>(half_range));
+      XlaOp sum = unpack_input_bf16 + half_range_bf16;
+
+      result =
+          sum * xla::ConstantR0<tensorflow::bfloat16>(builder, scale_factor) +
+          xla::ConstantR0<tensorflow::bfloat16>(builder, range.min);
+    } else {
+      // TODO(wangtao): support other modes.
+      return InvalidArgument(
+          "Only MIN_COMBINED mode is supported in xla::Dequantize Op.");
+    }
+
+    std::vector<int64> transpose_dimensions(shape.dimensions_size());
+    std::iota(transpose_dimensions.begin(), transpose_dimensions.end(), 1);
+    std::reverse(transpose_dimensions.begin(), transpose_dimensions.end());
+    transpose_dimensions.insert(transpose_dimensions.begin() + 1, 1, 0);
+
+    // Transpose the result to be [dn, unpack_size, dn-1, ..., d1, d0].
+    XlaOp transposed_result = Transpose(result, transpose_dimensions);
+
+    // Reshape to be [dn * unpack_size, dn-1, ..., d1, d0].
+    XlaOp reshaped_result = Collapse(transposed_result, {0, 1});
+
+    // Return the transpose result if transpose_output is true.
+    if (transpose_output) {
+      return reshaped_result;
+    }
+
+    // Transpose the result to be [d0, d1, ..., dn-1, dn * unpack_size].
+    std::vector<int64> result_dimensions(shape.dimensions_size());
+    std::iota(result_dimensions.begin(), result_dimensions.end(), 0);
+    std::reverse(result_dimensions.begin(), result_dimensions.end());
+
+    return Transpose(reshaped_result, result_dimensions);
+  });
+}
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_QUANTIZE_H_
diff --git a/tensorflow/compiler/xla/client/lib/quantize_test.cc b/tensorflow/compiler/xla/client/lib/quantize_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..be3603d9e11670913c21a834d2216a999306d582
--- /dev/null
+++ b/tensorflow/compiler/xla/client/lib/quantize_test.cc
@@ -0,0 +1,337 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/lib/quantize.h"
+
+#include <limits>
+
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
+
+namespace xla {
+namespace {
+
+using bfloat16 = tensorflow::bfloat16;
+
+template <typename NativeT>
+std::vector<NativeT> GenerateInput() {
+  std::vector<NativeT> input;
+
+  for (int64 i = std::numeric_limits<NativeT>::min();
+       i < std::numeric_limits<NativeT>::max(); ++i) {
+    input.push_back(static_cast<NativeT>(i));
+  }
+
+  return input;
+}
+
+template <typename NativeT>
+Array2D<NativeT> GenerateLargeSizeInput(int num_columns, int num_rows) {
+  Array2D<NativeT> input(num_columns, num_rows);
+
+  input.FillRandom(6, 128);
+
+  return input;
+}
+
+template <typename NativeT>
+Array2D<uint32> PackLargeInput(Array2D<NativeT> &input) {
+  const int64 size_per_pack = sizeof(uint32) / sizeof(NativeT);
+  int64 width = input.width();
+
+  int64 padded_output_width = CeilOfRatio(width, size_per_pack);
+
+  Array2D<uint32> pack_input(input.height(), padded_output_width);
+
+  for (int h = 0; h < input.height(); h++) {
+    std::vector<NativeT> input_row;
+    for (int w = 0; w < width; w++) {
+      input_row.push_back(input({h, w}));
+    }
+
+    auto pack_input_vec = PackToUint32<uint8>(input_row);
+
+    for (int w = 0; w < padded_output_width; w++) {
+      pack_input(h, w) = pack_input_vec[w];
+    }
+  }
+
+  return pack_input;
+}
+
+template <typename NativeT>
+Array2D<bfloat16> GenerateLargeSizeMinCombinedOutput(
+    Array2D<NativeT> &input, const QuantizedRange &range,
+    bool transpose_output = false) {
+  const int64 size_per_pack = sizeof(uint32) / sizeof(NativeT);
+  int64 width = input.width();
+
+  int64 padded_output_width = CeilOfRatio(width, size_per_pack) * size_per_pack;
+
+  int64 output_height;
+  int64 output_width;
+
+  if (transpose_output) {
+    output_height = padded_output_width;
+    output_width = input.height();
+  } else {
+    output_height = input.height();
+    output_width = padded_output_width;
+  }
+
+  Array2D<bfloat16> output(output_height, output_width, bfloat16(0.0));
+
+  float half_range =
+      !std::is_signed<NativeT>::value
+          ? 0.0f
+          : (static_cast<float>(std::numeric_limits<NativeT>::max() -
+                                std::numeric_limits<NativeT>::min() + 1)) /
+                2.0f;
+  const bfloat16 scale_factor =
+      (range.max - range.min) /
+      (static_cast<bfloat16>(std::numeric_limits<NativeT>::max() -
+                             std::numeric_limits<NativeT>::min()));
+
+  for (int h = 0; h < input.height(); h++) {
+    std::vector<NativeT> input_row;
+    for (int w = 0; w < width; w++) {
+      bfloat16 result =
+          static_cast<bfloat16>(input(h, w) + half_range) * scale_factor +
+          range.min;
+      if (transpose_output) {
+        output(w, h) = result;
+      } else {
+        output(h, w) = result;
+      }
+    }
+  }
+
+  return output;
+}
+
+template <typename NativeT>
+std::vector<bfloat16> GenerateMinCombinedOutput(const QuantizedRange &range) {
+  float half_range =
+      !std::is_signed<NativeT>::value
+          ? 0.0f
+          : (static_cast<float>(std::numeric_limits<NativeT>::max() -
+                                std::numeric_limits<NativeT>::min() + 1)) /
+                2.0f;
+  const bfloat16 scale_factor =
+      (range.max - range.min) /
+      (static_cast<bfloat16>(std::numeric_limits<NativeT>::max() -
+                             std::numeric_limits<NativeT>::min()));
+  std::vector<bfloat16> output;
+  for (int64 i = std::numeric_limits<NativeT>::min();
+       i < std::numeric_limits<NativeT>::max(); ++i) {
+    bfloat16 result =
+        static_cast<bfloat16>(i + half_range) * scale_factor + range.min;
+    output.push_back(result);
+  }
+
+  const int64 pack_size = sizeof(uint32) / sizeof(NativeT);
+  const int64 output_size = output.size();
+
+  int64 num_tailing_zeros =
+      CeilOfRatio(output_size, pack_size) * pack_size - output_size;
+
+  output.insert(output.end(), num_tailing_zeros, bfloat16(0.0));
+  return output;
+}
+
+// TODO(wangtao): add a test to make sure this op is the inverse of the existing
+// TF quantize op defined in: third_party/tensorflow/core/kernels/quantize_op.cc
+
+using DequantizeTest = ClientLibraryTestBase;
+
+TEST(PackTest, PackUint8ToUint32) {
+  std::vector<uint8> input = {0xAB, 0x0B, 0x00, 0xF0, 0x01};
+  auto output = PackToUint32<uint8>(input);
+  EXPECT_THAT(output, ::testing::ElementsAre(0xAB0B00F0, 0x01000000));
+}
+
+TEST(PackTest, PackInt8ToUint32) {
+  std::vector<int8> input = {static_cast<signed char>(0x81), 0x0B, 0x00, 0x20,
+                             0x01};
+  auto output = PackToUint32<int8>(input);
+  EXPECT_THAT(output, ::testing::ElementsAre(0x810B0020, 0x01000000));
+}
+
+TEST(PackTest, PackUint8ToUint32PerfectSize) {
+  std::vector<uint8> input = {3, 2, 1, 0};
+  auto output = PackToUint32<uint8>(input);
+  EXPECT_THAT(output, ::testing::ElementsAre(0x03020100));
+}
+
+XLA_TEST_F(DequantizeTest, MinCombinedUint16R1) {
+  XlaBuilder builder(TestName());
+  auto input = GenerateInput<uint16>();
+  auto x = ConstantR1<uint32>(&builder, PackToUint32<uint16>(input));
+  QuantizedRange range(0, 255.0f);
+  xla::Dequantize<uint16>(x, range, "MIN_COMBINED");
+  auto expected = GenerateMinCombinedOutput<uint16>(range);
+  ComputeAndCompareR1<bfloat16>(&builder, expected, {});
+}
+
+XLA_TEST_F(DequantizeTest, MinCombinedUint8R1) {
+  XlaBuilder builder(TestName());
+  auto input = GenerateInput<uint8>();
+  auto x = ConstantR1<uint32>(&builder, PackToUint32<uint8>(input));
+  QuantizedRange range(0, 127.0f);
+  xla::Dequantize<uint8>(x, range, "MIN_COMBINED");
+  auto expected = GenerateMinCombinedOutput<uint8>(range);
+  ComputeAndCompareR1<bfloat16>(&builder, expected, {});
+}
+
+XLA_TEST_F(DequantizeTest, MinCombinedUint8R2) {
+  XlaBuilder builder(TestName());
+  std::vector<std::vector<uint8>> input = {
+      {0, 1, 2, 3},
+      {4, 5, 6, 7},
+      {8, 9, 10, 11},
+      {12, 13, 16, 15},
+  };
+  auto x = ConstantR2<uint32>(&builder, {{PackToUint32<uint8>(input[0])[0]},
+                                         {PackToUint32<uint8>(input[1])[0]},
+                                         {PackToUint32<uint8>(input[2])[0]},
+                                         {PackToUint32<uint8>(input[3])[0]}});
+  QuantizedRange range(0, 255.0f);
+  xla::Dequantize<uint8>(x, range, "MIN_COMBINED");
+  const Array2D<bfloat16> expected = {
+      {bfloat16(0.0), bfloat16(1.0), bfloat16(2.0), bfloat16(3.0)},
+      {bfloat16(4.0), bfloat16(5.0), bfloat16(6.0), bfloat16(7.0)},
+      {bfloat16(8.0), bfloat16(9.0), bfloat16(10.0), bfloat16(11.0)},
+      {bfloat16(12.0), bfloat16(13.0), bfloat16(16.0), bfloat16(15.0)},
+  };
+  ComputeAndCompareR2<bfloat16>(&builder, expected, {});
+}
+
+XLA_TEST_F(DequantizeTest, MinCombinedUint8R2TransposeOutput) {
+  XlaBuilder builder(TestName());
+  std::vector<std::vector<uint8>> input = {
+      {0, 1, 2, 3},
+      {4, 5, 6, 7},
+      {8, 9, 10, 11},
+      {12, 13, 16, 15},
+  };
+  auto x = ConstantR2<uint32>(&builder, {{PackToUint32<uint8>(input[0])[0]},
+                                         {PackToUint32<uint8>(input[1])[0]},
+                                         {PackToUint32<uint8>(input[2])[0]},
+                                         {PackToUint32<uint8>(input[3])[0]}});
+  QuantizedRange range(0, 255.0f);
+  xla::Dequantize<uint8>(x, range, "MIN_COMBINED", /*transpose_output=*/true);
+  const Array2D<bfloat16> expected = {
+      {bfloat16(0.0), bfloat16(4.0), bfloat16(8.0), bfloat16(12.0)},
+      {bfloat16(1.0), bfloat16(5.0), bfloat16(9.0), bfloat16(13.0)},
+      {bfloat16(2.0), bfloat16(6.0), bfloat16(10.0), bfloat16(16.0)},
+      {bfloat16(3.0), bfloat16(7.0), bfloat16(11.0), bfloat16(15.0)},
+  };
+  ComputeAndCompareR2<bfloat16>(&builder, expected, {});
+}
+
+XLA_TEST_F(DequantizeTest, MinCombinedUint8R2TailingZero) {
+  XlaBuilder builder(TestName());
+  std::vector<std::vector<uint8>> input = {
+      {0, 1, 2, 3, 16},
+      {4, 5, 6, 7, 17},
+      {8, 9, 10, 11, 18},
+      {12, 13, 16, 15, 19},
+  };
+  auto x = ConstantR2<uint32>(
+      &builder,
+      {{PackToUint32<uint8>(input[0])[0], PackToUint32<uint8>(input[0])[1]},
+       {PackToUint32<uint8>(input[1])[0], PackToUint32<uint8>(input[1])[1]},
+       {PackToUint32<uint8>(input[2])[0], PackToUint32<uint8>(input[2])[1]},
+       {PackToUint32<uint8>(input[3])[0], PackToUint32<uint8>(input[3])[1]}});
+  QuantizedRange range(0, 255.0f);
+  xla::Dequantize<uint8>(x, range, "MIN_COMBINED");
+
+  const Array2D<bfloat16> expected = {
+      {bfloat16(0.0), bfloat16(1.0), bfloat16(2.0), bfloat16(3.0),
+       bfloat16(16.0), bfloat16(0.0), bfloat16(0.0), bfloat16(0.0)},
+      {bfloat16(4.0), bfloat16(5.0), bfloat16(6.0), bfloat16(7.0),
+       bfloat16(17.0), bfloat16(0.0), bfloat16(0.0), bfloat16(0.0)},
+      {bfloat16(8.0), bfloat16(9.0), bfloat16(10.0), bfloat16(11.0),
+       bfloat16(18.0), bfloat16(0.0), bfloat16(0.0), bfloat16(0.0)},
+      {bfloat16(12.0), bfloat16(13.0), bfloat16(16.0), bfloat16(15.0),
+       bfloat16(19.0), bfloat16(0.0), bfloat16(0.0), bfloat16(0.0)},
+  };
+  ComputeAndCompareR2<bfloat16>(&builder, expected, {});
+}
+
+XLA_TEST_F(DequantizeTest, MinCombinedUint8R2TailingZeroTransposeOutput) {
+  XlaBuilder builder(TestName());
+  std::vector<std::vector<uint8>> input = {
+      {0, 1, 2, 3, 16},
+      {4, 5, 6, 7, 17},
+      {8, 9, 10, 11, 18},
+      {12, 13, 16, 15, 19},
+  };
+  auto x = ConstantR2<uint32>(
+      &builder,
+      {{PackToUint32<uint8>(input[0])[0], PackToUint32<uint8>(input[0])[1]},
+       {PackToUint32<uint8>(input[1])[0], PackToUint32<uint8>(input[1])[1]},
+       {PackToUint32<uint8>(input[2])[0], PackToUint32<uint8>(input[2])[1]},
+       {PackToUint32<uint8>(input[3])[0], PackToUint32<uint8>(input[3])[1]}});
+  QuantizedRange range(0, 255.0f);
+  xla::Dequantize<uint8>(x, range, "MIN_COMBINED", /*transpose_output=*/true);
+
+  const Array2D<bfloat16> expected = {
+      {bfloat16(0.0), bfloat16(4.0), bfloat16(8.0), bfloat16(12.0)},
+      {bfloat16(1.0), bfloat16(5.0), bfloat16(9.0), bfloat16(13.0)},
+      {bfloat16(2.0), bfloat16(6.0), bfloat16(10.0), bfloat16(16.0)},
+      {bfloat16(3.0), bfloat16(7.0), bfloat16(11.0), bfloat16(15.0)},
+      {bfloat16(16.0), bfloat16(17.0), bfloat16(18.0), bfloat16(19.0)},
+      {bfloat16(0.0), bfloat16(0.0), bfloat16(0.0), bfloat16(0.0)},
+      {bfloat16(0.0), bfloat16(0.0), bfloat16(0.0), bfloat16(0.0)},
+      {bfloat16(0.0), bfloat16(0.0), bfloat16(0.0), bfloat16(0.0)},
+  };
+  ComputeAndCompareR2<bfloat16>(&builder, expected, {});
+}
+
+XLA_TEST_F(DequantizeTest, MinCombinedUint8LargeSizeTest) {
+  XlaBuilder builder(TestName());
+  Array2D<uint8> input = GenerateLargeSizeInput<uint8>(500, 3547);
+  Array2D<uint32> input_packed = PackLargeInput<uint8>(input);
+
+  auto x = ConstantR2FromArray2D<uint32>(&builder, input_packed);
+  QuantizedRange range(0, 255.0f);
+  xla::Dequantize<uint8>(x, range, "MIN_COMBINED");
+
+  const Array2D<bfloat16> expected =
+      GenerateLargeSizeMinCombinedOutput<uint8>(input, range);
+  ComputeAndCompareR2<bfloat16>(&builder, expected, {});
+}
+
+XLA_TEST_F(DequantizeTest, MinCombinedUint8LargeSizeTestTransposeOutput) {
+  XlaBuilder builder(TestName());
+  Array2D<uint8> input = GenerateLargeSizeInput<uint8>(500, 3547);
+  Array2D<uint32> input_packed = PackLargeInput<uint8>(input);
+
+  auto x = ConstantR2FromArray2D<uint32>(&builder, input_packed);
+  QuantizedRange range(0, 255.0f);
+  xla::Dequantize<uint8>(x, range, "MIN_COMBINED", /*transpose_output=*/true);
+
+  const Array2D<bfloat16> expected = GenerateLargeSizeMinCombinedOutput<uint8>(
+      input, range, /*transpose_output=*/true);
+  ComputeAndCompareR2<bfloat16>(&builder, expected, {});
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/slicing.cc b/tensorflow/compiler/xla/client/lib/slicing.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f8c7df3ff5189c817202eaf39adb572f7e232ec2
--- /dev/null
+++ b/tensorflow/compiler/xla/client/lib/slicing.cc
@@ -0,0 +1,134 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/lib/slicing.h"
+
+namespace xla {
+
+XlaOp SliceInMinorDims(XlaOp x, absl::Span<const int64> start,
+                       absl::Span<const int64> end) {
+  XlaBuilder* builder = x.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_RET_CHECK(start.size() == end.size());
+    int64 n_minor_dims = start.size();
+
+    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
+
+    const int64 n_dims = ShapeUtil::Rank(shape);
+    TF_RET_CHECK(n_minor_dims <= n_dims);
+    auto major_dims = AsInt64Slice(shape.dimensions())
+                          .subspan(
+                              /*pos=*/0,
+                              /*len=*/n_dims - n_minor_dims);
+
+    // Prepends 0s in the major dim
+    std::vector<int64> padded_start(n_dims, 0);
+    std::copy(start.begin(), start.end(),
+              padded_start.begin() + major_dims.size());
+
+    // Prepends the shape of the major dims.
+    std::vector<int64> padded_end(n_dims);
+    std::copy(major_dims.begin(), major_dims.end(), padded_end.begin());
+    std::copy(end.begin(), end.end(), padded_end.begin() + major_dims.size());
+
+    std::vector<int64> strides(n_dims, 1);
+    return Slice(x, padded_start, padded_end, strides);
+  });
+}
+
+XlaOp UpdateSlice(XlaOp x, XlaOp update, absl::Span<const int64> start) {
+  XlaBuilder* builder = x.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    // TODO(phawkins): make int64 work on all backends, remove the int32 cast.
+    std::vector<int32> start_as_int32(start.begin(), start.end());
+    auto start_constant = ConstantR1<int32>(builder, start_as_int32);
+    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
+    const int64 n_dims = ShapeUtil::Rank(shape);
+    TF_ASSIGN_OR_RETURN(Shape start_constant_shape,
+                        builder->GetShape(start_constant));
+    const int64 start_length =
+        ShapeUtil::GetDimension(start_constant_shape, -1);
+    TF_RET_CHECK(start_length == n_dims);
+    return DynamicUpdateSlice(x, update, start_constant);
+  });
+}
+
+XlaOp UpdateSliceInMinorDims(XlaOp x, XlaOp update,
+                             absl::Span<const int64> start) {
+  XlaBuilder* builder = x.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
+    const int64 n_dims = ShapeUtil::Rank(shape);
+    const int64 n_minor_dims = start.size();
+    TF_RET_CHECK(n_minor_dims <= n_dims);
+    std::vector<int64> padded_start(n_dims, 0);
+    std::copy(start.begin(), start.end(),
+              padded_start.begin() + (n_dims - n_minor_dims));
+    return UpdateSlice(x, update, padded_start);
+  });
+}
+
+namespace {
+
+std::vector<int64> ConcatVectors(absl::Span<const int64> xs,
+                                 absl::Span<const int64> ys) {
+  std::vector<int64> output(xs.size() + ys.size());
+  std::copy(xs.begin(), xs.end(), output.begin());
+  std::copy(ys.begin(), ys.end(), output.begin() + xs.size());
+  return output;
+}
+
+XlaOp PrependZerosInMajorDims(XlaOp x, absl::Span<const XlaOp> starts) {
+  XlaBuilder* builder = x.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
+    const int64 n_dims = ShapeUtil::Rank(shape);
+    auto zero = Reshape(ConstantR0<int32>(builder, 0), {1});
+    std::vector<XlaOp> padded_starts(n_dims, zero);
+    for (int i = 0; i < starts.size(); ++i) {
+      padded_starts[n_dims - starts.size() + i] = Reshape(starts[i], {1});
+    }
+    return ConcatInDim(builder, padded_starts, 0);
+  });
+}
+
+}  // namespace
+
+XlaOp DynamicSliceInMinorDims(XlaOp x, absl::Span<const XlaOp> starts,
+                              absl::Span<const int64> sizes) {
+  XlaBuilder* builder = x.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
+    const int64 n_dims = ShapeUtil::Rank(shape);
+    int64 n_minor_dims = starts.size();
+    TF_RET_CHECK(n_minor_dims == sizes.size());
+    TF_RET_CHECK(n_minor_dims <= n_dims);
+    auto major_dims = AsInt64Slice(shape.dimensions())
+                          .subspan(
+                              /*pos=*/0,
+                              /*len=*/n_dims - sizes.size());
+    auto padded_starts = PrependZerosInMajorDims(x, starts);
+    auto padded_sizes = ConcatVectors(major_dims, sizes);
+    return DynamicSlice(x, padded_starts, padded_sizes);
+  });
+}
+
+XlaOp DynamicUpdateSliceInMinorDims(XlaOp x, XlaOp update,
+                                    absl::Span<const XlaOp> starts) {
+  auto padded_starts = PrependZerosInMajorDims(x, starts);
+  return DynamicUpdateSlice(x, update, padded_starts);
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/slicing.h b/tensorflow/compiler/xla/client/lib/slicing.h
new file mode 100644
index 0000000000000000000000000000000000000000..6c482a38b5489c9fb17c3dca9ee3d2a1b8fd1890
--- /dev/null
+++ b/tensorflow/compiler/xla/client/lib/slicing.h
@@ -0,0 +1,48 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "absl/types/span.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/types.h"
+
+#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_SLICING_H_
+#define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_SLICING_H_
+
+namespace xla {
+
+// Updates a slice of 'x', i.e.,
+// x[start[0], ..., start[n]] = update
+XlaOp UpdateSlice(XlaOp x, XlaOp update, absl::Span<const int64> start);
+
+// Performs a slice in the minor dimensions of a tensor.
+// x[..., start[0]:end[0], ..., start[n]:end[n]]
+XlaOp SliceInMinorDims(XlaOp x, absl::Span<const int64> start,
+                       absl::Span<const int64> end);
+
+// Updates a slice of 'x', where 'start' contains a list of minor dimensions:
+// x[..., start[0]:..., ..., start[n]:...] = update
+XlaOp UpdateSliceInMinorDims(XlaOp x, XlaOp update,
+                             absl::Span<const int64> start);
+
+// Performs a dynamic slice in the minor dimensions of a tensor.
+XlaOp DynamicSliceInMinorDims(XlaOp x, absl::Span<const XlaOp> starts,
+                              absl::Span<const int64> sizes);
+
+XlaOp DynamicUpdateSliceInMinorDims(XlaOp x, XlaOp update,
+                                    absl::Span<const XlaOp> starts);
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_SLICING_H_
diff --git a/tensorflow/compiler/tf2xla/lib/util_test.cc b/tensorflow/compiler/xla/client/lib/slicing_test.cc
similarity index 67%
rename from tensorflow/compiler/tf2xla/lib/util_test.cc
rename to tensorflow/compiler/xla/client/lib/slicing_test.cc
index 442fe92c34ca26cb1a854cc90da8dc034bca79bb..8d362119e01006555db0f82d02626175936e1d05 100644
--- a/tensorflow/compiler/tf2xla/lib/util_test.cc
+++ b/tensorflow/compiler/xla/client/lib/slicing_test.cc
@@ -13,28 +13,19 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/tf2xla/lib/util.h"
+#include "tensorflow/compiler/xla/client/lib/slicing.h"
 
-#include <memory>
-#include <numeric>
-#include <vector>
-
-#include "tensorflow/compiler/tf2xla/lib/batch_dot.h"
-#include "tensorflow/compiler/xla/array2d.h"
-#include "tensorflow/compiler/xla/literal.h"
-#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
-#include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/lib/core/status_test_util.h"
 
-namespace tensorflow {
+namespace xla {
 namespace {
 
-using UtilTest = xla::ClientLibraryTestBase;
-using UtilLeftLookingTest = xla::ClientLibraryTestBase;
+using SlicingTest = xla::ClientLibraryTestBase;
 
 xla::Array2D<float> BValsRight() {
   return {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}};
@@ -63,7 +54,7 @@ xla::Array3D<float> BatchedAValsFull() {
           }};
 }
 
-XLA_TEST_F(UtilTest, Simple2dLookup) {
+XLA_TEST_F(SlicingTest, Simple2dLookup) {
   xla::XlaBuilder builder(TestName());
 
   xla::XlaOp a, x, y;
@@ -77,7 +68,7 @@ XLA_TEST_F(UtilTest, Simple2dLookup) {
                              xla::ErrorSpec(1e-2, 1e-2));
 }
 
-XLA_TEST_F(UtilTest, Simple3dLookup) {
+XLA_TEST_F(SlicingTest, Simple3dLookup) {
   xla::XlaBuilder builder(TestName());
 
   xla::XlaOp a, index;
@@ -92,7 +83,7 @@ XLA_TEST_F(UtilTest, Simple3dLookup) {
                              {a_data.get(), index_data.get()});
 }
 
-XLA_TEST_F(UtilTest, SimpleSliceUpdate) {
+XLA_TEST_F(SlicingTest, SimpleSliceUpdate) {
   xla::XlaBuilder builder(TestName());
 
   xla::XlaOp a, b, x, y;
@@ -111,26 +102,5 @@ XLA_TEST_F(UtilTest, SimpleSliceUpdate) {
       {a_data.get(), b_data.get(), x_data.get(), y_data.get()});
 }
 
-XLA_TEST_F(UtilTest, RowBatchDot) {
-  xla::XlaBuilder builder(TestName());
-
-  int n = 4;
-
-  xla::XlaOp a, row, index;
-  auto a_data =
-      CreateR3Parameter<float>(BatchedAValsFull(), 0, "a", &builder, &a);
-  auto row_data = CreateR3Parameter<float>({{{9, 1, 0, 0}}, {{2, 4, 0, 0}}}, 1,
-                                           "row", &builder, &row);
-  // Select {{3, 6, 0, 1}, {24, 61,  82,  48}} out of BatchedAValsFull().
-  auto index_data = CreateR0Parameter<int>(1, 2, "index", &builder, &index);
-
-  auto l_index = DynamicSliceInMinorDims(
-      a, {index, xla::ConstantR0<int32>(&builder, 0)}, {1, n});
-  BatchDot(l_index, row, /*transpose_x=*/false, /*transpose_y=*/true);
-
-  ComputeAndCompareR3<float>(&builder, {{{33}}, {{292}}},
-                             {a_data.get(), row_data.get(), index_data.get()});
-}
-
 }  // namespace
-}  // namespace tensorflow
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/testing.cc b/tensorflow/compiler/xla/client/lib/testing.cc
index a95bbf2c8c860914877d3195b97342097dafc725..5db9d10dff4c50d71cde934b3f3c345bee571f29 100644
--- a/tensorflow/compiler/xla/client/lib/testing.cc
+++ b/tensorflow/compiler/xla/client/lib/testing.cc
@@ -59,22 +59,25 @@ XlaOp BuildFakeDataOpOnDevice(const Shape& shape, XlaBuilder* builder) {
   return Tuple(builder, parts);
 }
 
-std::unique_ptr<GlobalData> MakeFakeDataViaDeviceOrDie(const Shape& shape,
-                                                       Client* client) {
+std::unique_ptr<GlobalData> MakeFakeDataViaDeviceOrDie(
+    const Shape& shape, Client* client, DebugOptions* debug_opts) {
   XlaBuilder b(absl::StrCat("make_fake_", ShapeUtil::HumanString(shape)));
   BuildFakeDataOpOnDevice(shape, &b);
   XlaComputation computation = b.Build().ConsumeValueOrDie();
 
   auto execution_options = CreateDefaultExecutionOptions();
   *execution_options.mutable_shape_with_output_layout() = shape.ToProto();
+  if (debug_opts) {
+    *execution_options.mutable_debug_options() = *debug_opts;
+  }
   return client->Execute(computation, /*arguments=*/{}, &execution_options)
       .ConsumeValueOrDie();
 }
 
 }  // namespace
 
-std::unique_ptr<GlobalData> MakeFakeDataOrDie(const Shape& shape,
-                                              Client* client) {
+std::unique_ptr<GlobalData> MakeFakeDataOrDie(
+    const Shape& shape, Client* client, DebugOptions* debug_opts /*=nullptr*/) {
   if (DataSizeOfShape(shape) < (1LL << 20)) {
     StatusOr<Literal> literal_status = MakeFakeLiteral(shape);
     if (!literal_status.ok()) {
@@ -82,24 +85,25 @@ std::unique_ptr<GlobalData> MakeFakeDataOrDie(const Shape& shape,
       // an on-device computation.
       CHECK_EQ(literal_status.status().code(),
                tensorflow::error::UNIMPLEMENTED);
-      return MakeFakeDataViaDeviceOrDie(shape, client);
+      return MakeFakeDataViaDeviceOrDie(shape, client, debug_opts);
     }
     return client->TransferToServer(literal_status.ValueOrDie()).ValueOrDie();
   }
 
   // If the data is large, generate it on-device.
-  return MakeFakeDataViaDeviceOrDie(shape, client);
+  return MakeFakeDataViaDeviceOrDie(shape, client, debug_opts);
 }
 
 std::vector<std::unique_ptr<GlobalData>> MakeFakeArgumentsOrDie(
-    const XlaComputation& computation, Client* client) {
+    const XlaComputation& computation, Client* client,
+    DebugOptions* debug_opts /*=nullptr*/) {
   CHECK(computation.proto().has_host_program_shape())
       << "Computation should have progran shape.";
   auto program_shape = computation.proto().host_program_shape();
 
   std::vector<std::unique_ptr<GlobalData>> results;
   for (const ShapeProto& shape : program_shape.parameters()) {
-    results.push_back(MakeFakeDataOrDie(Shape(shape), client));
+    results.push_back(MakeFakeDataOrDie(Shape(shape), client, debug_opts));
   }
   return results;
 }
diff --git a/tensorflow/compiler/xla/client/lib/testing.h b/tensorflow/compiler/xla/client/lib/testing.h
index 03695ce2a339735e3e49522f4fe1bbf2d83a3834..428fa3e93d1b46983aae60176e7c2242d2552fdb 100644
--- a/tensorflow/compiler/xla/client/lib/testing.h
+++ b/tensorflow/compiler/xla/client/lib/testing.h
@@ -29,14 +29,19 @@ namespace xla {
 // Generates fake data of the given shape on the device or dies. The fake data
 // is created by performing a computation on the device rather than transferring
 // data from the host to the device.
-std::unique_ptr<GlobalData> MakeFakeDataOrDie(const Shape& shape,
-                                              Client* client);
+//
+// The optional DebugOptions are used when generating fake data on the device.
+std::unique_ptr<GlobalData> MakeFakeDataOrDie(
+    const Shape& shape, Client* client, DebugOptions* debug_opts = nullptr);
 
 // Returns vector of GlobalData handles of fake data (created using
 // MakeFakeDataOrDie) that are correctly shaped arguments for the given
 // xla computation.
+//
+// The optional DebugOptions are used when generating fake data on the device.
 std::vector<std::unique_ptr<GlobalData>> MakeFakeArgumentsOrDie(
-    const XlaComputation& computation, Client* client);
+    const XlaComputation& computation, Client* client,
+    DebugOptions* debug_opts = nullptr);
 
 }  // namespace xla
 
diff --git a/tensorflow/compiler/tf2xla/lib/triangular_solve.cc b/tensorflow/compiler/xla/client/lib/triangular_solve.cc
similarity index 62%
rename from tensorflow/compiler/tf2xla/lib/triangular_solve.cc
rename to tensorflow/compiler/xla/client/lib/triangular_solve.cc
index 6524c2a9b1ada632d80edd234272760c2b545cc4..ac58090dfe33a8ae350019771e0b970d6f26e476 100644
--- a/tensorflow/compiler/tf2xla/lib/triangular_solve.cc
+++ b/tensorflow/compiler/xla/client/lib/triangular_solve.cc
@@ -13,15 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/tf2xla/lib/triangular_solve.h"
+#include "tensorflow/compiler/xla/client/lib/triangular_solve.h"
 
 #include <memory>
 #include <vector>
 
-#include "tensorflow/compiler/tf2xla/lib/batch_dot.h"
-#include "tensorflow/compiler/tf2xla/lib/util.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
+#include "tensorflow/compiler/xla/client/lib/math.h"
+#include "tensorflow/compiler/xla/client/lib/matrix.h"
+#include "tensorflow/compiler/xla/client/lib/slicing.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal.h"
@@ -29,21 +29,20 @@ limitations under the License.
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/math/math_util.h"
 
-namespace tensorflow {
+namespace xla {
 
 // Get the diagonal blocks of the coefficient matrix
-xla::XlaOp DiagonalBlocks(xla::XlaOp a, int64 block_size) {
-  xla::XlaBuilder* builder = a.builder();
-  return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
-    TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(a));
-    int ndims = xla::ShapeUtil::Rank(shape);
-    int64 n = xla::ShapeUtil::GetDimension(shape, -1);
+XlaOp DiagonalBlocks(XlaOp a, int64 block_size) {
+  XlaBuilder* builder = a.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(a));
+    int ndims = ShapeUtil::Rank(shape);
+    int64 n = ShapeUtil::GetDimension(shape, -1);
     int64 num_blocks = n / block_size;
 
-    xla::XlaOp diag_blocks;
+    XlaOp diag_blocks;
 
     // If the coefficient matrix is exactly the block size, we just add a
     // singleton dimension i.e. [..., n, n] -> [..., 1, n, n]
@@ -58,13 +57,13 @@ xla::XlaOp DiagonalBlocks(xla::XlaOp a, int64 block_size) {
     if (n > block_size) {
       // Construct the starting indices of the diagonal blocks
       auto start_indices =
-          Transpose(Broadcast(Mul(Iota(builder, xla::S32, num_blocks),
-                                  xla::ConstantR0<int32>(builder, block_size)),
+          Transpose(Broadcast(Mul(Iota(builder, S32, num_blocks),
+                                  ConstantR0<int32>(builder, block_size)),
                               /*broadcast_sizes=*/{2}),
                     /*permutation=*/{1, 0});
 
       // Gather the diagonal blocks
-      xla::GatherDimensionNumbers dim_numbers;
+      GatherDimensionNumbers dim_numbers;
       dim_numbers.add_offset_dims(ndims - 1);
       dim_numbers.add_offset_dims(ndims);
       dim_numbers.add_start_index_map(ndims - 2);
@@ -80,7 +79,7 @@ xla::XlaOp DiagonalBlocks(xla::XlaOp a, int64 block_size) {
       // Pad with zeros
       auto last_blocks =
           SliceInMinorDims(a, {n - n % block_size, n - n % block_size}, {n, n});
-      xla::PaddingConfig config = xla::MakeNoPaddingConfig(ndims);
+      PaddingConfig config = MakeNoPaddingConfig(ndims);
       int64 padding = block_size - n % block_size;
       config.mutable_dimensions(ndims - 1)->set_edge_padding_high(padding);
       config.mutable_dimensions(ndims - 2)->set_edge_padding_high(padding);
@@ -89,9 +88,8 @@ xla::XlaOp DiagonalBlocks(xla::XlaOp a, int64 block_size) {
 
       // Add a singleton dimension
       // i.e. [..., block_size, block_size] -> [..., 1, block_size, block_size]
-      TF_ASSIGN_OR_RETURN(xla::Shape blocks_shape,
-                          builder->GetShape(last_blocks));
-      auto shape_dims = xla::AsInt64Slice(blocks_shape.dimensions());
+      TF_ASSIGN_OR_RETURN(Shape blocks_shape, builder->GetShape(last_blocks));
+      auto shape_dims = AsInt64Slice(blocks_shape.dimensions());
       auto last_blocks_dims = std::vector<int64>(ndims);
       std::copy(shape_dims.begin(), shape_dims.end(), last_blocks_dims.begin());
       last_blocks_dims.insert(last_blocks_dims.end() - 2, 1);
@@ -100,7 +98,7 @@ xla::XlaOp DiagonalBlocks(xla::XlaOp a, int64 block_size) {
       // Concatenate with the other blocks if necessary
       if (n > block_size) {
         diag_blocks =
-            xla::ConcatInDim(builder, {diag_blocks, last_blocks}, ndims - 2);
+            ConcatInDim(builder, {diag_blocks, last_blocks}, ndims - 2);
       } else {
         diag_blocks = last_blocks;
       }
@@ -110,16 +108,16 @@ xla::XlaOp DiagonalBlocks(xla::XlaOp a, int64 block_size) {
   });
 }
 
-xla::XlaOp InvertDiagonalBlocks(xla::XlaOp diag_blocks, bool lower,
-                                bool transpose_a, bool conjugate_a,
-                                xla::PrecisionConfig::Precision precision) {
-  xla::XlaBuilder* builder = diag_blocks.builder();
-  return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
+XlaOp InvertDiagonalBlocks(XlaOp diag_blocks, bool lower, bool transpose_a,
+                           bool conjugate_a,
+                           PrecisionConfig::Precision precision) {
+  XlaBuilder* builder = diag_blocks.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     // Input is a batch of square lower triangular square matrices. Its shape is
     // (..., size, size). We resize this to (num_blocks, size, size).
-    TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(diag_blocks));
-    int64 block_size = xla::ShapeUtil::GetDimension(shape, -1);
-    int64 num_blocks = xla::ShapeUtil::ElementsIn(shape) /
+    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(diag_blocks));
+    int64 block_size = ShapeUtil::GetDimension(shape, -1);
+    int64 num_blocks = ShapeUtil::ElementsIn(shape) /
                        tensorflow::MathUtil::IPow(block_size, 2);
     diag_blocks = Reshape(diag_blocks, {num_blocks, block_size, block_size});
 
@@ -131,9 +129,9 @@ xla::XlaOp InvertDiagonalBlocks(xla::XlaOp diag_blocks, bool lower,
     // zero (which can happen if the last block was padded) otherwise it will
     // introduce nans which will propagate
     auto diags = GetMatrixDiagonal(diag_blocks);
-    TF_ASSIGN_OR_RETURN(xla::Shape diags_shape, builder->GetShape(diags));
+    TF_ASSIGN_OR_RETURN(Shape diags_shape, builder->GetShape(diags));
     auto one = ScalarLike(diags, 1);
-    auto ones = Broadcast(one, xla::AsInt64Slice(diags_shape.dimensions()));
+    auto ones = Broadcast(one, AsInt64Slice(diags_shape.dimensions()));
     diags = Select(Eq(diags, Zero(builder, shape.element_type())), ones, diags);
     auto scaled_diag_blocks = Div(diag_blocks, diags, {0, 2});
 
@@ -159,40 +157,40 @@ xla::XlaOp InvertDiagonalBlocks(xla::XlaOp diag_blocks, bool lower,
     auto start_index = (lower) ? 0 : block_size - 1;
     auto output_block = DynamicUpdateSlice(
         neg_identity, pos_one,
-        /*start_indices=*/xla::ConstantR1<int>(builder, 2, start_index));
+        /*start_indices=*/ConstantR1<int>(builder, 2, start_index));
 
     // Broadcast diag([1, -1, -1, ...]) to every block
-    xla::XlaOp output = Broadcast(output_block,
-                                  /*broadcast_sizes=*/{num_blocks});
+    XlaOp output = Broadcast(output_block,
+                             /*broadcast_sizes=*/{num_blocks});
 
     // Now we construct a loop that performs matrix-vector multiplications
     // inverting the blocks one row at a time
-    std::vector<xla::Shape> tuple_shapes = {
+    std::vector<Shape> tuple_shapes = {
         // The loop iteration counter is a scalar, incremented each iteration.
-        xla::ShapeUtil::MakeShape(xla::S32, {}),
+        ShapeUtil::MakeShape(S32, {}),
         // The output has the shape of A, with one row updated each iteration.
-        xla::ShapeUtil::MakeShape(shape.element_type(),
-                                  {num_blocks, block_size, block_size}),
+        ShapeUtil::MakeShape(shape.element_type(),
+                             {num_blocks, block_size, block_size}),
         // The input is a loop invariant.
-        xla::ShapeUtil::MakeShape(shape.element_type(),
-                                  {num_blocks, block_size, block_size})};
-    xla::Shape tuple_shape = xla::ShapeUtil::MakeTupleShape(tuple_shapes);
+        ShapeUtil::MakeShape(shape.element_type(),
+                             {num_blocks, block_size, block_size})};
+    Shape tuple_shape = ShapeUtil::MakeTupleShape(tuple_shapes);
 
-    auto init_i = One(builder, xla::S32);
-    auto init = xla::Tuple(builder, {init_i, output, scaled_diag_blocks});
+    auto init_i = One(builder, S32);
+    auto init = Tuple(builder, {init_i, output, scaled_diag_blocks});
 
     // Construct the loop condition function.
-    std::unique_ptr<xla::XlaBuilder> condb =
+    std::unique_ptr<XlaBuilder> condb =
         builder->CreateSubBuilder("InvertDiagCond");
     {
       auto i = GetTupleElement(
           Parameter(condb.get(), 0, tuple_shape, "InvertDiagCondTuple"), 0);
-      Lt(i, xla::ConstantR0<int32>(condb.get(), block_size));
+      Lt(i, ConstantR0<int32>(condb.get(), block_size));
     }
     TF_ASSIGN_OR_RETURN(auto cond, condb->Build());
 
     // Construct the loop body function.
-    std::unique_ptr<xla::XlaBuilder> bodyb =
+    std::unique_ptr<XlaBuilder> bodyb =
         builder->CreateSubBuilder("InvertDiagBody");
     {
       auto input_tuple =
@@ -202,21 +200,21 @@ xla::XlaOp InvertDiagonalBlocks(xla::XlaOp diag_blocks, bool lower,
       auto body_out = GetTupleElement(input_tuple, 1);
       auto body_input = GetTupleElement(input_tuple, 2);
 
-      auto zero = xla::ConstantR1<int32>(bodyb.get(), 1, 0);
+      auto zero = ConstantR1<int32>(bodyb.get(), 1, 0);
       auto j = (lower) ? i : ScalarLike(i, block_size - 1) - i;
       auto start_indices =
-          xla::ConcatInDim(bodyb.get(), {zero, Reshape(j, {1}), zero}, 0);
+          ConcatInDim(bodyb.get(), {zero, Reshape(j, {1}), zero}, 0);
       auto input_row =
           DynamicSlice(body_input, start_indices,
                        /*slice_sizes=*/{num_blocks, 1, block_size});
 
       // We want -L21 L11^{-1}
-      xla::DotDimensionNumbers dnums;
+      DotDimensionNumbers dnums;
       dnums.add_lhs_batch_dimensions(0);
       dnums.add_rhs_batch_dimensions(0);
       dnums.add_lhs_contracting_dimensions(2);
       dnums.add_rhs_contracting_dimensions(1);
-      xla::PrecisionConfig precision_proto;
+      PrecisionConfig precision_proto;
       precision_proto.add_operand_precision(precision);
       precision_proto.add_operand_precision(precision);
       auto update = -DotGeneral(input_row, body_out, dnums, &precision_proto);
@@ -224,7 +222,7 @@ xla::XlaOp InvertDiagonalBlocks(xla::XlaOp diag_blocks, bool lower,
       body_out = DynamicUpdateSlice(body_out, update, start_indices);
 
       auto next_i = i + ScalarLike(i, 1);
-      xla::Tuple(bodyb.get(), {next_i, body_out, body_input});
+      Tuple(bodyb.get(), {next_i, body_out, body_input});
     }
     TF_ASSIGN_OR_RETURN(auto body, bodyb->Build());
 
@@ -238,27 +236,26 @@ xla::XlaOp InvertDiagonalBlocks(xla::XlaOp diag_blocks, bool lower,
                           /*broadcast_dimensions=*/{0, 1});
 
     // Reshape back to original batch major dimensions
-    return Reshape(inv_diag_blocks, xla::AsInt64Slice(shape.dimensions()));
+    return Reshape(inv_diag_blocks, AsInt64Slice(shape.dimensions()));
   });
 }
 
-xla::XlaOp SolveWithInvertedDiagonalBlocks(
-    xla::XlaOp a, xla::XlaOp b, xla::XlaOp inv_diag_blocks, bool left_side,
-    bool lower, bool transpose_a, bool conjugate_a,
-    xla::PrecisionConfig::Precision precision) {
-  xla::XlaBuilder* builder = a.builder();
-  return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
-    TF_ASSIGN_OR_RETURN(xla::Shape blocks_shape,
-                        builder->GetShape(inv_diag_blocks));
-    TF_ASSIGN_OR_RETURN(xla::Shape b_shape, builder->GetShape(b));
-    int64 block_size = xla::ShapeUtil::GetDimension(blocks_shape, -1);
-
-    TF_ASSIGN_OR_RETURN(xla::Shape a_shape, builder->GetShape(a));
-    int64 ndims = xla::ShapeUtil::Rank(a_shape);
-    int64 n = xla::ShapeUtil::GetDimension(a_shape, -1);
+XlaOp SolveWithInvertedDiagonalBlocks(XlaOp a, XlaOp b, XlaOp inv_diag_blocks,
+                                      bool left_side, bool lower,
+                                      bool transpose_a, bool conjugate_a,
+                                      PrecisionConfig::Precision precision) {
+  XlaBuilder* builder = a.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape blocks_shape, builder->GetShape(inv_diag_blocks));
+    TF_ASSIGN_OR_RETURN(Shape b_shape, builder->GetShape(b));
+    int64 block_size = ShapeUtil::GetDimension(blocks_shape, -1);
+
+    TF_ASSIGN_OR_RETURN(Shape a_shape, builder->GetShape(a));
+    int64 ndims = ShapeUtil::Rank(a_shape);
+    int64 n = ShapeUtil::GetDimension(a_shape, -1);
     int64 num_blocks = n / block_size + (n % block_size != 0);
     int64 m_dim = (left_side) ? -1 : -2;
-    int64 m = xla::ShapeUtil::GetDimension(b_shape, m_dim);
+    int64 m = ShapeUtil::GetDimension(b_shape, m_dim);
 
     // Initialize the solution
     auto x = ZerosLike(b);
@@ -294,7 +291,7 @@ xla::XlaOp SolveWithInvertedDiagonalBlocks(
       }
       auto b_row = SliceInMinorDims(b, start, end);
 
-      xla::XlaOp remainder;
+      XlaOp remainder;
       if (i == 0) {
         remainder = b_row;
       } else {
@@ -311,29 +308,27 @@ xla::XlaOp SolveWithInvertedDiagonalBlocks(
         auto a_row =
             MaybeConjugate(SliceInMinorDims(a, start, end), conjugate_a);
         if (left_side) {
-          remainder = b_row - BatchDot(a_row, x, transpose_a, false,
-                                       /*conjugate_x=*/false,
-                                       /*conjugate_y=*/false, precision);
+          remainder =
+              b_row - BatchDot(MaybeTransposeInMinorDims(a_row, transpose_a), x,
+                               precision);
         } else {
-          remainder = b_row - BatchDot(x, a_row, false, transpose_a,
-                                       /*conjugate_x=*/false,
-                                       /*conjugate_y=*/false, precision);
+          remainder =
+              b_row - BatchDot(x, MaybeTransposeInMinorDims(a_row, transpose_a),
+                               precision);
         }
       }
 
-      xla::XlaOp x_update;
-      auto zero = Zero(builder, xla::S32);
-      auto start_index =
-          xla::ConstantR0WithType(builder, xla::S32, j * block_size);
-      std::vector<xla::XlaOp> update_starts = {start_index, zero};
+      XlaOp x_update;
+      auto zero = Zero(builder, S32);
+      auto start_index = ConstantR0WithType(builder, S32, j * block_size);
+      std::vector<XlaOp> update_starts = {start_index, zero};
       if (left_side) {
-        x_update =
-            BatchDot(inv_block, remainder, transpose_a, false,
-                     /*conjugate_x=*/false, /*conjugate_y=*/false, precision);
+        x_update = BatchDot(MaybeTransposeInMinorDims(inv_block, transpose_a),
+                            remainder, precision);
       } else {
-        x_update =
-            BatchDot(remainder, inv_block, false, transpose_a,
-                     /*conjugate_x=*/false, /*conjugate_y=*/false, precision);
+        x_update = BatchDot(remainder,
+                            MaybeTransposeInMinorDims(inv_block, transpose_a),
+                            precision);
         std::swap(update_starts[0], update_starts[1]);
       }
       x = DynamicUpdateSliceInMinorDims(x, x_update, /*starts=*/update_starts);
@@ -343,24 +338,24 @@ xla::XlaOp SolveWithInvertedDiagonalBlocks(
   });
 }
 
-xla::XlaOp TriangularSolve(xla::XlaOp a, xla::XlaOp b, bool left_side,
-                           bool lower, bool transpose_a, bool conjugate_a,
-                           int64 block_size,
-                           xla::PrecisionConfig::Precision precision) {
-  xla::XlaBuilder* builder = a.builder();
-  return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
-    TF_ASSIGN_OR_RETURN(xla::Shape a_shape, builder->GetShape(a));
-    TF_ASSIGN_OR_RETURN(xla::Shape b_shape, builder->GetShape(b));
-    if (xla::ShapeUtil::Rank(a_shape) != xla::ShapeUtil::Rank(b_shape)) {
-      return errors::InvalidArgument(
-          "Arguments to TriangularSolve have different ranks: ",
-          xla::ShapeUtil::HumanString(a_shape), " vs. ",
-          xla::ShapeUtil::HumanString(b_shape));
+XlaOp TriangularSolve(XlaOp a, XlaOp b, bool left_side, bool lower,
+                      bool transpose_a, bool conjugate_a, int64 block_size,
+                      PrecisionConfig::Precision precision) {
+  XlaBuilder* builder = a.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape a_shape, builder->GetShape(a));
+    TF_ASSIGN_OR_RETURN(Shape b_shape, builder->GetShape(b));
+    if (ShapeUtil::Rank(a_shape) != ShapeUtil::Rank(b_shape)) {
+      return InvalidArgument(
+          "Arguments to TriangularSolve have shapes with different ranks: "
+          "%s vs. %s",
+          ShapeUtil::HumanString(a_shape), ShapeUtil::HumanString(b_shape));
     }
-    const int64 ndims = xla::ShapeUtil::Rank(a_shape);
+    const int64 ndims = ShapeUtil::Rank(a_shape);
     if (ndims < 2) {
-      return errors::InvalidArgument(
-          "Arguments to TriangularSolve must have rank >= 2: ", ndims);
+      return InvalidArgument(
+          "Arguments to TriangularSolve was rank %d but must have rank >= 2.",
+          ndims);
     }
     // The batch dimensions must be equal.
     std::vector<int64> batch_dimensions;
@@ -368,35 +363,42 @@ xla::XlaOp TriangularSolve(xla::XlaOp a, xla::XlaOp b, bool left_side,
       int64 a_size = a_shape.dimensions(i);
       int64 b_size = b_shape.dimensions(i);
       if (a_size != b_size) {
-        return errors::InvalidArgument(
-            "Batch dimensions of arguments to TriangularSolve must be equal: ",
-            xla::ShapeUtil::HumanString(a_shape), " vs ",
-            xla::ShapeUtil::HumanString(b_shape));
+        return InvalidArgument(
+            "Batch dimensions of arguments to TriangularSolve must be equal; "
+            "shapes were %s and %s.",
+            ShapeUtil::HumanString(a_shape), ShapeUtil::HumanString(b_shape));
       }
       batch_dimensions.push_back(a_size);
     }
 
-    if (xla::ShapeUtil::GetDimension(a_shape, -1) !=
-        xla::ShapeUtil::GetDimension(a_shape, -2)) {
-      return errors::InvalidArgument(
-          "The 'a' arguments to TriangularSolve must be square matrices: ",
-          xla::ShapeUtil::HumanString(a_shape));
+    if (ShapeUtil::GetDimension(a_shape, -1) !=
+        ShapeUtil::GetDimension(a_shape, -2)) {
+      return InvalidArgument(
+          "The 'a' argument to TriangularSolve must be a batched square matrix;"
+          " shape was: %s",
+          ShapeUtil::HumanString(a_shape));
     }
-    const int64 m = xla::ShapeUtil::GetDimension(b_shape, -2);
-    const int64 n = xla::ShapeUtil::GetDimension(b_shape, -1);
-    if ((left_side ? m : n) != xla::ShapeUtil::GetDimension(a_shape, -1)) {
-      return errors::InvalidArgument(
-          "Arguments to TriangularSolve have incompatible matrix shapes: ",
-          xla::ShapeUtil::HumanString(a_shape), " vs ",
-          xla::ShapeUtil::HumanString(b_shape));
+    const int64 m = ShapeUtil::GetDimension(b_shape, -2);
+    const int64 n = ShapeUtil::GetDimension(b_shape, -1);
+    if ((left_side ? m : n) != ShapeUtil::GetDimension(a_shape, -1)) {
+      return InvalidArgument(
+          "Arguments to TriangularSolve have incompatible matrix shapes %s and "
+          "%s",
+          ShapeUtil::HumanString(a_shape), ShapeUtil::HumanString(b_shape));
     }
 
     if (block_size < 1) {
-      return errors::InvalidArgument(
-          "block_size argument to TriangularSolve must be >= 1; got ",
+      return InvalidArgument(
+          "block_size argument to TriangularSolve must be >= 1; got %d",
           block_size);
     }
 
+    if (ShapeUtil::IsZeroElementArray(b_shape)) {
+      // The output has the same shape as 'b', and since the output has zero
+      // elements, any such array will do.
+      return b;
+    }
+
     // We find the diagonal blocks of the coefficient matrix
     auto diag_blocks = DiagonalBlocks(a, block_size);
 
@@ -413,4 +415,4 @@ xla::XlaOp TriangularSolve(xla::XlaOp a, xla::XlaOp b, bool left_side,
   });
 }
 
-}  // namespace tensorflow
+}  // namespace xla
diff --git a/tensorflow/compiler/tf2xla/lib/triangular_solve.h b/tensorflow/compiler/xla/client/lib/triangular_solve.h
similarity index 88%
rename from tensorflow/compiler/tf2xla/lib/triangular_solve.h
rename to tensorflow/compiler/xla/client/lib/triangular_solve.h
index 2303234f361e54cd2a0ad495cb03b371bed76877..50a3b30ebd1c15eb6d2ace4e351cb41f21db7093 100644
--- a/tensorflow/compiler/tf2xla/lib/triangular_solve.h
+++ b/tensorflow/compiler/xla/client/lib/triangular_solve.h
@@ -13,13 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_TF2XLA_LIB_TRIANGULAR_SOLVE_H_
-#define TENSORFLOW_COMPILER_TF2XLA_LIB_TRIANGULAR_SOLVE_H_
+#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_TRIANGULAR_SOLVE_H_
+#define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_TRIANGULAR_SOLVE_H_
 
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
-namespace tensorflow {
+namespace xla {
 
 // Solves systems of linear equations with lower or upper triangular coefficient
 // matrices by forward- or back-substitution. Broadcasting along leading
@@ -57,11 +57,11 @@ namespace tensorflow {
 //
 // Uses a blocked algorithm if `block_size` is > 1; if block_size == 1 then no
 // blocking is used.
-xla::XlaOp TriangularSolve(
-    xla::XlaOp a, xla::XlaOp b, bool left_side, bool lower, bool transpose_a,
+XlaOp TriangularSolve(
+    XlaOp a, XlaOp b, bool left_side, bool lower, bool transpose_a,
     bool conjugate_a, int64 block_size = 128,
-    xla::PrecisionConfig::Precision precision = xla::PrecisionConfig::HIGHEST);
+    PrecisionConfig::Precision precision = PrecisionConfig::HIGHEST);
 
-}  // namespace tensorflow
+}  // namespace xla
 
-#endif  // TENSORFLOW_COMPILER_TF2XLA_LIB_TRIANGULAR_SOLVE_H_
+#endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_TRIANGULAR_SOLVE_H_
diff --git a/tensorflow/compiler/tf2xla/lib/triangular_solve_test.cc b/tensorflow/compiler/xla/client/lib/triangular_solve_test.cc
similarity index 78%
rename from tensorflow/compiler/tf2xla/lib/triangular_solve_test.cc
rename to tensorflow/compiler/xla/client/lib/triangular_solve_test.cc
index aeebf16028d40189203cdfd815f06a339ee72902..d0188e8ea06d0edacdba330f46647af201747abf 100644
--- a/tensorflow/compiler/tf2xla/lib/triangular_solve_test.cc
+++ b/tensorflow/compiler/xla/client/lib/triangular_solve_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/tf2xla/lib/triangular_solve.h"
+#include "tensorflow/compiler/xla/client/lib/triangular_solve.h"
 
 #include <memory>
 #include <numeric>
@@ -30,59 +30,71 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 
-namespace tensorflow {
+namespace xla {
 namespace {
 
-using TriangularSolveTest = xla::ClientLibraryTestBase;
-using TriangularSolveLeftLookingTest = xla::ClientLibraryTestBase;
-using complex64 = xla::complex64;
+using TriangularSolveTest = ClientLibraryTestBase;
+using TriangularSolveLeftLookingTest = ClientLibraryTestBase;
 
-xla::Array2D<float> AValsLower() {
+Array2D<float> AValsLower() {
   return {{2, 0, 0, 0}, {3, 6, 0, 0}, {4, 7, 9, 0}, {5, 8, 10, 11}};
 }
 
-xla::Array2D<float> AValsUpper() {
+Array2D<float> AValsUpper() {
   return {{2, 3, 4, 5}, {0, 6, 7, 8}, {0, 0, 9, 10}, {0, 0, 0, 11}};
 }
 
-xla::Array2D<float> BValsRight() {
+Array2D<float> BValsRight() {
   return {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}};
 }
 
-xla::Array2D<float> BValsLeft() {
+Array2D<float> BValsLeft() {
   return {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}, {10, 11, 12}};
 }
 
-xla::Array2D<complex64> AValsLowerComplex() {
+Array2D<complex64> AValsLowerComplex() {
   return {{2, 0, 0, 0},
           {complex64(3, 1), 6, 0, 0},
           {4, complex64(7, 2), 9, 0},
           {5, 8, complex64(10, 3), 11}};
 }
 
-xla::Array2D<complex64> AValsUpperComplex() {
+Array2D<complex64> AValsUpperComplex() {
   return {{2, 3, complex64(4, 3), 5},
           {0, 6, complex64(7, 2), 8},
           {0, 0, complex64(9, 1), 10},
           {0, 0, 0, 11}};
 }
 
-xla::Array2D<complex64> BValsRightComplex() {
+Array2D<complex64> BValsRightComplex() {
   return {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}};
 }
 
-xla::Array2D<complex64> BValsLeftComplex() {
+Array2D<complex64> BValsLeftComplex() {
   return {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}, {10, 11, 12}};
 }
 
-xla::Array2D<float> AValsFull() {
-  return {{2, 0, 1, 2}, {3, 6, 0, 1}, {4, 7, 9, 0}, {5, 8, 10, 11}};
+XLA_TEST_F(TriangularSolveTest, EmptyArrays) {
+  XlaBuilder builder(TestName());
+
+  XlaOp a, b;
+  auto a_data =
+      CreateR2Parameter<float>(Array2D<float>(0, 0), 0, "a", &builder, &a);
+  auto b_data =
+      CreateR2Parameter<float>(Array2D<float>(0, 10), 1, "b", &builder, &b);
+  TriangularSolve(a, b,
+                  /*left_side=*/true, /*lower=*/true,
+                  /*transpose_a=*/true, /*conjugate_a=*/false,
+                  /*block_size=*/2);
+
+  ComputeAndCompareR2<float>(&builder, Array2D<float>(0, 10),
+                             {a_data.get(), b_data.get()});
 }
 
 XLA_TEST_F(TriangularSolveTest, SimpleRightLowerTranspose) {
-  xla::XlaBuilder builder(TestName());
+  XlaBuilder builder(TestName());
 
-  xla::XlaOp a, b;
+  XlaOp a, b;
   auto a_data = CreateR2Parameter<float>(AValsLower(), 0, "a", &builder, &a);
   auto b_data = CreateR2Parameter<float>(BValsRight(), 1, "b", &builder, &b);
   TriangularSolve(a, b,
@@ -90,20 +102,20 @@ XLA_TEST_F(TriangularSolveTest, SimpleRightLowerTranspose) {
                   /*transpose_a=*/true, /*conjugate_a=*/false,
                   /*block_size=*/2);
 
-  xla::Array2D<float> expected({
+  Array2D<float> expected({
       {0.5, 0.08333334, 0.04629629, 0.03367003},
       {2.5, -0.25, -0.1388889, -0.1010101},
       {4.5, -0.58333331, -0.32407406, -0.23569024},
   });
 
   ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
-                             xla::ErrorSpec(1e-2, 1e-2));
+                             ErrorSpec(1e-2, 1e-2));
 }
 
 XLA_TEST_F(TriangularSolveTest, SimpleRightLowerNotranspose) {
-  xla::XlaBuilder builder(TestName());
+  XlaBuilder builder(TestName());
 
-  xla::XlaOp a, b;
+  XlaOp a, b;
   auto a_data = CreateR2Parameter<float>(AValsLower(), 0, "a", &builder, &a);
   auto b_data = CreateR2Parameter<float>(BValsRight(), 1, "b", &builder, &b);
   TriangularSolve(a, b,
@@ -111,20 +123,20 @@ XLA_TEST_F(TriangularSolveTest, SimpleRightLowerNotranspose) {
                   /*transpose_a=*/false, /*conjugate_a=*/false,
                   /*block_size=*/2);
 
-  xla::Array2D<float> expected({
+  Array2D<float> expected({
       {-0.16414141, -0.06902357, -0.07070707, 0.36363636},
       {0.64393939, 0.06565657, -0.03030303, 0.72727273},
       {1.4520202, 0.2003367, 0.01010101, 1.09090909},
   });
 
   ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
-                             xla::ErrorSpec(1e-2, 1e-2));
+                             ErrorSpec(1e-2, 1e-2));
 }
 
 XLA_TEST_F(TriangularSolveTest, SimpleRightUpperTranspose) {
-  xla::XlaBuilder builder(TestName());
+  XlaBuilder builder(TestName());
 
-  xla::XlaOp a, b;
+  XlaOp a, b;
   auto a_data = CreateR2Parameter<float>(AValsUpper(), 0, "a", &builder, &a);
   auto b_data = CreateR2Parameter<float>(BValsRight(), 1, "b", &builder, &b);
   TriangularSolve(a, b,
@@ -132,20 +144,20 @@ XLA_TEST_F(TriangularSolveTest, SimpleRightUpperTranspose) {
                   /*transpose_a=*/true, /*conjugate_a=*/false,
                   /*block_size=*/2);
 
-  xla::Array2D<float> expected({
+  Array2D<float> expected({
       {-0.16414141, -0.06902357, -0.07070707, 0.36363636},
       {0.64393939, 0.06565657, -0.03030303, 0.72727273},
       {1.4520202, 0.2003367, 0.01010101, 1.09090909},
   });
 
   ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
-                             xla::ErrorSpec(1e-2, 1e-2));
+                             ErrorSpec(1e-2, 1e-2));
 }
 
 XLA_TEST_F(TriangularSolveTest, SimpleRightUpperNotranspose) {
-  xla::XlaBuilder builder(TestName());
+  XlaBuilder builder(TestName());
 
-  xla::XlaOp a, b;
+  XlaOp a, b;
   auto a_data = CreateR2Parameter<float>(AValsUpper(), 0, "a", &builder, &a);
   auto b_data = CreateR2Parameter<float>(BValsRight(), 1, "b", &builder, &b);
   TriangularSolve(a, b,
@@ -153,20 +165,20 @@ XLA_TEST_F(TriangularSolveTest, SimpleRightUpperNotranspose) {
                   /*transpose_a=*/false, /*conjugate_a=*/false,
                   /*block_size=*/2);
 
-  xla::Array2D<float> expected({
+  Array2D<float> expected({
       {0.5, 0.08333334, 0.04629629, 0.03367003},
       {2.5, -0.25, -0.1388889, -0.1010101},
       {4.5, -0.58333331, -0.32407406, -0.23569024},
   });
 
   ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
-                             xla::ErrorSpec(1e-2, 1e-2));
+                             ErrorSpec(1e-2, 1e-2));
 }
 
 XLA_TEST_F(TriangularSolveTest, SimpleLeftLowerTranspose) {
-  xla::XlaBuilder builder(TestName());
+  XlaBuilder builder(TestName());
 
-  xla::XlaOp a, b;
+  XlaOp a, b;
   auto a_data = CreateR2Parameter<float>(AValsLower(), 0, "a", &builder, &a);
   auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
   TriangularSolve(a, b,
@@ -174,7 +186,7 @@ XLA_TEST_F(TriangularSolveTest, SimpleLeftLowerTranspose) {
                   /*transpose_a=*/true, /*conjugate_a=*/false,
                   /*block_size=*/2);
 
-  xla::Array2D<float> expected({
+  Array2D<float> expected({
       {-0.89646465, -0.69444444, -0.49242424},
       {-0.27441077, -0.24074074, -0.20707071},
       {-0.23232323, -0.22222222, -0.21212121},
@@ -182,13 +194,13 @@ XLA_TEST_F(TriangularSolveTest, SimpleLeftLowerTranspose) {
   });
 
   ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
-                             xla::ErrorSpec(1e-2, 1e-2));
+                             ErrorSpec(1e-2, 1e-2));
 }
 
 XLA_TEST_F(TriangularSolveTest, SimpleLeftLowerNotranspose) {
-  xla::XlaBuilder builder(TestName());
+  XlaBuilder builder(TestName());
 
-  xla::XlaOp a, b;
+  XlaOp a, b;
   auto a_data = CreateR2Parameter<float>(AValsLower(), 0, "a", &builder, &a);
   auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
   TriangularSolve(a, b,
@@ -196,7 +208,7 @@ XLA_TEST_F(TriangularSolveTest, SimpleLeftLowerNotranspose) {
                   /*transpose_a=*/false, /*conjugate_a=*/false,
                   /*block_size=*/2);
 
-  xla::Array2D<float> expected({
+  Array2D<float> expected({
       {0.5, 1.0, 1.5},
       {0.41666667, 0.33333333, 0.25},
       {0.23148148, 0.18518519, 0.13888889},
@@ -204,13 +216,13 @@ XLA_TEST_F(TriangularSolveTest, SimpleLeftLowerNotranspose) {
   });
 
   ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
-                             xla::ErrorSpec(1e-2, 1e-2));
+                             ErrorSpec(1e-2, 1e-2));
 }
 
 XLA_TEST_F(TriangularSolveTest, SimpleLeftLowerNotransposeIrregularblock) {
-  xla::XlaBuilder builder(TestName());
+  XlaBuilder builder(TestName());
 
-  xla::XlaOp a, b;
+  XlaOp a, b;
   auto a_data = CreateR2Parameter<float>(AValsLower(), 0, "a", &builder, &a);
   auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
   TriangularSolve(a, b,
@@ -218,7 +230,7 @@ XLA_TEST_F(TriangularSolveTest, SimpleLeftLowerNotransposeIrregularblock) {
                   /*transpose_a=*/false, /*conjugate_a=*/false,
                   /*block_size=*/3);
 
-  xla::Array2D<float> expected({
+  Array2D<float> expected({
       {0.5, 1.0, 1.5},
       {0.41666667, 0.33333333, 0.25},
       {0.23148148, 0.18518519, 0.13888889},
@@ -226,13 +238,13 @@ XLA_TEST_F(TriangularSolveTest, SimpleLeftLowerNotransposeIrregularblock) {
   });
 
   ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
-                             xla::ErrorSpec(1e-2, 1e-2));
+                             ErrorSpec(1e-2, 1e-2));
 }
 
 XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperTranspose) {
-  xla::XlaBuilder builder(TestName());
+  XlaBuilder builder(TestName());
 
-  xla::XlaOp a, b;
+  XlaOp a, b;
   auto a_data = CreateR2Parameter<float>(AValsUpper(), 0, "a", &builder, &a);
   auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
   TriangularSolve(a, b,
@@ -240,7 +252,7 @@ XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperTranspose) {
                   /*transpose_a=*/true, /*conjugate_a=*/false,
                   /*block_size=*/2);
 
-  xla::Array2D<float> expected({
+  Array2D<float> expected({
       {0.5, 1.0, 1.5},
       {0.41666667, 0.33333333, 0.25},
       {0.23148148, 0.18518519, 0.13888889},
@@ -248,13 +260,13 @@ XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperTranspose) {
   });
 
   ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
-                             xla::ErrorSpec(1e-2, 1e-2));
+                             ErrorSpec(1e-2, 1e-2));
 }
 
 XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperNotranspose) {
-  xla::XlaBuilder builder(TestName());
+  XlaBuilder builder(TestName());
 
-  xla::XlaOp a, b;
+  XlaOp a, b;
   auto a_data = CreateR2Parameter<float>(AValsUpper(), 0, "a", &builder, &a);
   auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
   TriangularSolve(a, b,
@@ -262,7 +274,7 @@ XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperNotranspose) {
                   /*transpose_a=*/false, /*conjugate_a=*/false,
                   /*block_size=*/2);
 
-  xla::Array2D<float> expected({
+  Array2D<float> expected({
       {-0.89646465, -0.69444444, -0.49242424},
       {-0.27441077, -0.24074074, -0.20707071},
       {-0.23232323, -0.22222222, -0.21212121},
@@ -270,13 +282,13 @@ XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperNotranspose) {
   });
 
   ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
-                             xla::ErrorSpec(1e-2, 1e-2));
+                             ErrorSpec(1e-2, 1e-2));
 }
 
 XLA_TEST_F(TriangularSolveTest, SimpleRightLowerTransposeConjugate) {
-  xla::XlaBuilder builder(TestName());
+  XlaBuilder builder(TestName());
 
-  xla::XlaOp a, b;
+  XlaOp a, b;
   auto a_data =
       CreateR2Parameter<complex64>(AValsLowerComplex(), 0, "a", &builder, &a);
   auto b_data =
@@ -286,7 +298,7 @@ XLA_TEST_F(TriangularSolveTest, SimpleRightLowerTransposeConjugate) {
                   /*transpose_a=*/true, /*conjugate_a=*/true,
                   /*block_size=*/2);
 
-  xla::Array2D<complex64> expected({
+  Array2D<complex64> expected({
       {0.5, complex64(0.08333333, 0.08333333),
        complex64(0.02777778, -0.0462963), complex64(0.06313131, -0.01094276)},
       {2.5, complex64(-0.25, 0.41666667), complex64(-0.23148148, -0.37962963),
@@ -295,15 +307,14 @@ XLA_TEST_F(TriangularSolveTest, SimpleRightLowerTransposeConjugate) {
        complex64(0.11026936, -0.03114478)},
   });
 
-  ComputeAndCompareR2<complex64>(&builder, expected,
-                                 {a_data.get(), b_data.get()},
-                                 xla::ErrorSpec(1e-2, 1e-2));
+  ComputeAndCompareR2<complex64>(
+      &builder, expected, {a_data.get(), b_data.get()}, ErrorSpec(1e-2, 1e-2));
 }
 
 XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperTransposeNoconjugate) {
-  xla::XlaBuilder builder(TestName());
+  XlaBuilder builder(TestName());
 
-  xla::XlaOp a, b;
+  XlaOp a, b;
   auto a_data =
       CreateR2Parameter<complex64>(AValsUpperComplex(), 0, "a", &builder, &a);
   auto b_data =
@@ -313,7 +324,7 @@ XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperTransposeNoconjugate) {
                   /*transpose_a=*/true, /*conjugate_a=*/false,
                   /*block_size=*/2);
 
-  xla::Array2D<complex64> expected({
+  Array2D<complex64> expected({
       {0.5, 1., 1.5},
       {0.41666667, 0.33333333, 0.25},
       {complex64(0.20020325, -2.81504065e-01),
@@ -324,10 +335,9 @@ XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperTransposeNoconjugate) {
        complex64(0.15798226, 5.12749446e-01)},
   });
 
-  ComputeAndCompareR2<complex64>(&builder, expected,
-                                 {a_data.get(), b_data.get()},
-                                 xla::ErrorSpec(1e-2, 1e-2));
+  ComputeAndCompareR2<complex64>(
+      &builder, expected, {a_data.get(), b_data.get()}, ErrorSpec(1e-2, 1e-2));
 }
 
 }  // namespace
-}  // namespace tensorflow
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc
index aaa5d6989eefb94edb8921d13f96e3705aa3e3a4..049cd15738a619294b19d5cf74ca514d7b4a00ad 100644
--- a/tensorflow/compiler/xla/client/local_client.cc
+++ b/tensorflow/compiler/xla/client/local_client.cc
@@ -71,9 +71,9 @@ Status LocalExecutable::ValidateExecutionOptions(
           "parameter "
           "%d: want %s, got %s",
           i,
-          ShapeUtil::HumanString(
+          ShapeUtil::HumanStringWithLayout(
               computation_layout.parameter_layout(i).shape()),
-          ShapeUtil::HumanString(arguments[i]->on_host_shape()));
+          ShapeUtil::HumanStringWithLayout(arguments[i]->on_host_shape()));
     }
   }
 
diff --git a/tensorflow/compiler/xla/debug_options_flags.cc b/tensorflow/compiler/xla/debug_options_flags.cc
index a40330a9b1fe201b6ec83d1bfe1a21e294e18f55..a9a91648ac377987e7f226116e11c9c697ace103 100644
--- a/tensorflow/compiler/xla/debug_options_flags.cc
+++ b/tensorflow/compiler/xla/debug_options_flags.cc
@@ -22,49 +22,49 @@ limitations under the License.
 #include "tensorflow/compiler/xla/parse_flags_from_env.h"
 
 namespace xla {
-namespace {
 
-DebugOptions* flag_values;
-std::vector<tensorflow::Flag>* flag_objects;
-std::once_flag flags_init;
-
-void SetDebugOptionsDefaults(DebugOptions* flags) {
-  flags->set_xla_llvm_enable_alias_scope_metadata(true);
-  flags->set_xla_llvm_enable_noalias_metadata(true);
-  flags->set_xla_llvm_enable_invariant_load_metadata(true);
-  flags->set_xla_llvm_disable_expensive_passes(false);
-  flags->set_xla_backend_optimization_level(3);
-  flags->set_xla_cpu_multi_thread_eigen(true);
-  flags->set_xla_gpu_cuda_data_dir("./cuda_sdk_lib");
-  flags->set_xla_eliminate_hlo_implicit_broadcast(true);
+DebugOptions DefaultDebugOptionsIgnoringFlags() {
+  DebugOptions opts;
+  opts.set_xla_llvm_enable_alias_scope_metadata(true);
+  opts.set_xla_llvm_enable_noalias_metadata(true);
+  opts.set_xla_llvm_enable_invariant_load_metadata(true);
+  opts.set_xla_llvm_disable_expensive_passes(false);
+  opts.set_xla_backend_optimization_level(3);
+  opts.set_xla_cpu_multi_thread_eigen(true);
+  opts.set_xla_gpu_cuda_data_dir("./cuda_sdk_lib");
+  opts.set_xla_eliminate_hlo_implicit_broadcast(true);
+  opts.set_xla_hlo_dump_as_html(false);
 #ifdef INTEL_MKL
-  flags->set_xla_cpu_use_mkl_dnn(true);
+  opts.set_xla_cpu_use_mkl_dnn(true);
 #endif  // INTEL_MKL
-  flags->set_xla_gpu_max_kernel_unroll_factor(4);
+  opts.set_xla_gpu_max_kernel_unroll_factor(4);
   // Set cudnn batchnorm off by default; it does not provide a performance win
   // on average.
-  flags->set_xla_gpu_use_cudnn_batchnorm(false);
+  opts.set_xla_gpu_use_cudnn_batchnorm(false);
 
   // Run all GPU work on one stream by default.  Using multiple streams
   // increases memory usage and we lack strong motivating benchmarks for tuning
   // the heuristics needed to decide when to run on multiple streams.  See
   // b/77879207.
-  flags->set_xla_gpu_disable_multi_streaming(true);
+  opts.set_xla_gpu_disable_multi_streaming(true);
 
   // TODO(jlebar): Disable fastmath once doing so is not a performance
   // regression.
-  flags->set_xla_cpu_enable_fast_math(true);
-  flags->set_xla_gpu_enable_fast_math(true);
+  opts.set_xla_cpu_enable_fast_math(true);
+  opts.set_xla_gpu_enable_fast_min_max(true);
 
-  flags->set_xla_force_host_platform_device_count(1);
+  opts.set_xla_force_host_platform_device_count(1);
+  return opts;
 }
 
+static DebugOptions* flag_values;
+static std::vector<tensorflow::Flag>* flag_objects;
+static std::once_flag flags_init;
+
 // Allocates flag_values and flag_objects; this function must not be called more
 // than once - its call done via call_once.
-void AllocateFlags() {
-  flag_values = new DebugOptions;
-
-  SetDebugOptionsDefaults(flag_values);
+static void AllocateFlags() {
+  flag_values = new DebugOptions(DefaultDebugOptionsIgnoringFlags());
 
   // Returns a lambda that calls "member_setter" on "flag_values" with the
   // argument passed in to the lambda.
@@ -133,6 +133,11 @@ void AllocateFlags() {
           bool_setter_for(&DebugOptions::set_xla_hlo_dump_as_graphdef),
           flag_values->xla_hlo_dump_as_graphdef(),
           "Dump HLO graphs as TensorFlow GraphDefs."),
+      tensorflow::Flag("xla_hlo_dump_as_html",
+                       bool_setter_for(&DebugOptions::set_xla_hlo_dump_as_html),
+                       flag_values->xla_hlo_dump_as_html(),
+                       "Dump HLO graphs as an HTML (DOT rendered into SVG "
+                       "inlined in HTML)."),
       tensorflow::Flag(
           "xla_hlo_graph_sharding_color",
           bool_setter_for(&DebugOptions::set_xla_hlo_graph_sharding_color),
@@ -160,11 +165,11 @@ void AllocateFlags() {
           "Enable unsafe fast-math optimizations in the CPU compiler; "
           "this may produce faster code at the expense of some accuracy."),
       tensorflow::Flag(
-          "xla_gpu_enable_fast_math",
-          bool_setter_for(&DebugOptions::set_xla_cpu_enable_fast_math),
-          flag_values->xla_cpu_enable_fast_math(),
-          "Enable unsafe fast-math optimizations in the GPU compiler; "
-          "this may produce faster code at the expense of some accuracy."),
+          "xla_gpu_enable_fast_min_max",
+          bool_setter_for(&DebugOptions::set_xla_gpu_enable_fast_min_max),
+          flag_values->xla_gpu_enable_fast_min_max(),
+          "Enable fast floating point min/max lowering that does not propagate "
+          "NaNs."),
       tensorflow::Flag(
           "xla_llvm_enable_alias_scope_metadata",
           bool_setter_for(
@@ -202,6 +207,16 @@ void AllocateFlags() {
           "Comma-separated list of hlo passes to be disabled. These names "
           "must exactly match the passes' names; no whitespace around "
           "commas."),
+      tensorflow::Flag(
+          "xla_disable_all_hlo_passes",
+          bool_setter_for(&DebugOptions::set_xla_disable_all_hlo_passes), false,
+          "Disables all HLO passes.  Notes that some passes are necessary for "
+          "correctness and the invariants that must be satisfied by 'fully "
+          "optimized' HLO are different for different devices and may change "
+          "over time.  The only 'guarantee', such as it is, is that if you "
+          "compile XLA and dump the optimized HLO for some graph, you should "
+          "be able to run it again on the same device with the same build of "
+          "XLA."),
       tensorflow::Flag(
           "xla_embed_ir_in_executable",
           bool_setter_for(&DebugOptions::set_xla_embed_ir_in_executable),
@@ -334,12 +349,16 @@ void AllocateFlags() {
           "overhead from context switching but we let the user override this "
           "behavior to help run tests on the host that run models in parallel "
           "across multiple devices."),
+      tensorflow::Flag(
+          "xla_gpu_disable_ptxas_optimizations",
+          bool_setter_for(
+              &DebugOptions::set_xla_gpu_disable_ptxas_optimizations),
+          flag_values->xla_gpu_disable_ptxas_optimizations(),
+          "In XLA:GPU run ptxas in -O0 (default is -O3)."),
   });
   ParseFlagsFromEnvAndDieIfUnknown("XLA_FLAGS", *flag_objects);
 }
 
-}  // namespace
-
 void AppendDebugOptionsFlags(std::vector<tensorflow::Flag>* flag_list) {
   std::call_once(flags_init, &AllocateFlags);
   flag_list->insert(flag_list->end(), flag_objects->begin(),
diff --git a/tensorflow/compiler/xla/debug_options_flags.h b/tensorflow/compiler/xla/debug_options_flags.h
index 60e59abc2a2e0f1cce3de1afc928f9fe36f75b33..dbf86a40f052af09c61da0e1abb3116ef5214357 100644
--- a/tensorflow/compiler/xla/debug_options_flags.h
+++ b/tensorflow/compiler/xla/debug_options_flags.h
@@ -29,7 +29,10 @@ void AppendDebugOptionsFlags(std::vector<tensorflow::Flag>* flag_list);
 // Fetches a DebugOptions proto message from flags provided to the program.
 // Flags must be registered with the flags parser using AppendDebugOptionsFlags
 // first.
-xla::DebugOptions GetDebugOptionsFromFlags();
+DebugOptions GetDebugOptionsFromFlags();
+
+// Gets a DebugOptions proto that reflects the defaults as if no flags were set.
+DebugOptions DefaultDebugOptionsIgnoringFlags();
 
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/g3doc/_book.yaml b/tensorflow/compiler/xla/g3doc/_book.yaml
index 12b7094705e75305dc43a013576f4549dd5f4185..267701e9c0e42a21d2cda6238520f6a9692e7e76 100644
--- a/tensorflow/compiler/xla/g3doc/_book.yaml
+++ b/tensorflow/compiler/xla/g3doc/_book.yaml
@@ -31,3 +31,5 @@ upper_tabs:
       - title: XLA compile API
         path: /xla/tutorials/xla_compile
         status: experimental
+
+- include: /_upper_tabs_right.yaml
diff --git a/tensorflow/compiler/xla/g3doc/operation_semantics.md b/tensorflow/compiler/xla/g3doc/operation_semantics.md
index e0807518bc401808266cd3b198efa9697d6804de..002ebc31b992826b4dfc53f31a9e3625cde3c5d0 100644
--- a/tensorflow/compiler/xla/g3doc/operation_semantics.md
+++ b/tensorflow/compiler/xla/g3doc/operation_semantics.md
@@ -38,25 +38,25 @@ Alltoall is a collective operation that sends data from all cores to all cores.
 It has two phases:
 
 1.  the scatter phase. On each core, the operand is split into `split_count`
-    number of blocks along the `split_dimensions`, and the blocks are scattered
-    to all cores, e.g., the ith block is send to the ith core.
+number of blocks along the `split_dimensions`, and the blocks are scattered
+to all cores, e.g., the ith block is send to the ith core.
 2.  the gather phase. Each core concatenates the received blocks along the
-    `concat_dimension`.
+`concat_dimension`.
 
 The participating cores can be configured by:
 
 -   `replica_groups`: each ReplicaGroup contains a list of replica id. If empty,
-    all replicas belong to one group in the order of 0 - (n-1). Alltoall will be
-    applied within subgroups in the specified order. For example, replica
-    groups = {{1,2,3},{4,5,0}} means, an Alltoall will be applied within replica
-    1, 2, 3, and in the gather phase, the received blocks will be concatenated
-    in the order of 1, 2, 3; another Alltoall will be applied within replica 4,
-    5, 0, and the concatenation order is 4, 5, 0.
+all replicas belong to one group in the order of 0 - (n-1). Alltoall will be
+applied within subgroups in the specified order. For example, replica
+groups = {{1,2,3},{4,5,0}} means, an Alltoall will be applied within replica
+1, 2, 3, and in the gather phase, the received blocks will be concatenated
+in the order of 1, 2, 3; another Alltoall will be applied within replica 4,
+5, 0, and the concatenation order is 4, 5, 0.
 
 Prerequisites:
 
 -   The dimension size of the operand on the split_dimension is divisible by
-    split_count.
+split_count.
 -   The operand's shape is not tuple.
 
 <b> `AllToAll(operand, split_dimension, concat_dimension, split_count,
@@ -93,7 +93,7 @@ AllToAll(x, /*split_dimension=*/1, /*concat_dimension=*/0, /*split_count=*/4);
 ```
 
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="./images/ops_alltoall.png">
+<img style="width:100%" src="./images/ops_alltoall.png">
 </div>
 
 In this example, there are 4 cores participating the Alltoall. On each core, the
@@ -387,34 +387,34 @@ For example, let v be an array of 24 elements:
 
 ```
 let v = f32[4x2x3] {{{10, 11, 12},  {15, 16, 17}},
-                    {{20, 21, 22},  {25, 26, 27}},
-                    {{30, 31, 32},  {35, 36, 37}},
-                    {{40, 41, 42},  {45, 46, 47}}};
+{{20, 21, 22},  {25, 26, 27}},
+{{30, 31, 32},  {35, 36, 37}},
+{{40, 41, 42},  {45, 46, 47}}};
 
 // Collapse to a single dimension, leaving one dimension.
 let v012 = Collapse(v, {0,1,2});
 then v012 == f32[24] {10, 11, 12, 15, 16, 17,
-                      20, 21, 22, 25, 26, 27,
-                      30, 31, 32, 35, 36, 37,
-                      40, 41, 42, 45, 46, 47};
+20, 21, 22, 25, 26, 27,
+30, 31, 32, 35, 36, 37,
+40, 41, 42, 45, 46, 47};
 
 // Collapse the two lower dimensions, leaving two dimensions.
 let v01 = Collapse(v, {0,1});
 then v01 == f32[4x6] {{10, 11, 12, 15, 16, 17},
-                      {20, 21, 22, 25, 26, 27},
-                      {30, 31, 32, 35, 36, 37},
-                      {40, 41, 42, 45, 46, 47}};
+{20, 21, 22, 25, 26, 27},
+{30, 31, 32, 35, 36, 37},
+{40, 41, 42, 45, 46, 47}};
 
 // Collapse the two higher dimensions, leaving two dimensions.
 let v12 = Collapse(v, {1,2});
 then v12 == f32[8x3] {{10, 11, 12},
-                      {15, 16, 17},
-                      {20, 21, 22},
-                      {25, 26, 27},
-                      {30, 31, 32},
-                      {35, 36, 37},
-                      {40, 41, 42},
-                      {45, 46, 47}};
+{15, 16, 17},
+{20, 21, 22},
+{25, 26, 27},
+{30, 31, 32},
+{35, 36, 37},
+{40, 41, 42},
+{45, 46, 47}};
 
 ```
 
@@ -441,9 +441,9 @@ replicas.
 Note that there are the following restrictions on the `source_target_pair`:
 
 -   Any two pairs should not have the same target replica id, and they should
-    not have the same source replica id.
+not have the same source replica id.
 -   If a replica id is not a target in any pair, then the output on that replica
-    is a tensor consists of 0(s) with the same shape as the input.
+is a tensor consists of 0(s) with the same shape as the input.
 
 ## Concatenate
 
@@ -480,25 +480,25 @@ Concat({{2, 3}, {4, 5}, {6, 7}}, 0)
 
 ```
 let a = {
-  {1, 2},
-  {3, 4},
-  {5, 6},
+{1, 2},
+{3, 4},
+{5, 6},
 };
 let b = {
-  {7, 8},
+{7, 8},
 };
 Concat({a, b}, 0)
 >>> {
-  {1, 2},
-  {3, 4},
-  {5, 6},
-  {7, 8},
+{1, 2},
+{3, 4},
+{5, 6},
+{7, 8},
 }
 ```
 
 Diagram:
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="./images/ops_concatenate.png">
+<img style="width:100%" src="./images/ops_concatenate.png">
 </div>
 
 ## Conditional
@@ -566,20 +566,20 @@ the rhs is also an input. In a neural network, these are the input activations.
 The n+2 dimensions are, in this order:
 
 *   `batch`: Each coordinate in this dimension represents an independent input
-    for which convolution is carried out.
+for which convolution is carried out.
 *   `z/depth/features`: Each (y,x) position in the base area has a vector
-    associated to it, which goes into this dimension.
+associated to it, which goes into this dimension.
 *   `spatial_dims`: Describes the `n` spatial dimensions that define the base
-    area that the window moves across.
+area that the window moves across.
 
 The `rhs` argument is a rank n+2 array describing the convolutional
 filter/kernel/window. The dimensions are, in this order:
 
 *   `output-z`: The `z` dimension of the output.
 *   `input-z`: The size of this dimension times `feature_group_count` should
-    equal the size of the `z` dimension in lhs.
+equal the size of the `z` dimension in lhs.
 *   `spatial_dims`: Describes the `n` spatial dimensions that define the n-d
-    window that moves across the base area.
+window that moves across the base area.
 
 The `window_strides` argument specifies the stride of the convolutional window
 in the spatial dimensions. For example, if the stride in the first spatial
@@ -633,7 +633,7 @@ The output shape has these dimensions, in this order:
 *   `batch`: Same size as `batch` on the input (`lhs`).
 *   `z`: Same size as `output-z` on the kernel (`rhs`).
 *   `spatial_dims`: One value for each valid placement of the convolutional
-    window.
+window.
 
 The valid placements of the convolutional window are determined by the strides
 and the size of the base area after padding.
@@ -658,15 +658,15 @@ Here is pseudo-code for a 2d convolution with padding and striding:
 
 ```
 for (b, oz, oy, ox) {  // output coordinates
-  value = 0;
-  for (iz, ky, kx) {  // kernel coordinates and input z
-    iy = oy*stride_y + ky - pad_low_y;
-    ix = ox*stride_x + kx - pad_low_x;
-    if ((iy, ix) inside the base area considered without padding) {
-      value += input(b, iz, iy, ix) * kernel(oz, iz, ky, kx);
-    }
-  }
-  output(b, oz, oy, ox) = value;
+value = 0;
+for (iz, ky, kx) {  // kernel coordinates and input z
+iy = oy*stride_y + ky - pad_low_y;
+ix = ox*stride_x + kx - pad_low_x;
+if ((iy, ix) inside the base area considered without padding) {
+value += input(b, iz, iy, ix) * kernel(oz, iz, ky, kx);
+}
+}
+output(b, oz, oy, ox) = value;
 }
 ```
 
@@ -777,19 +777,19 @@ Here is an example of an implementation of `myfunc`:
 
 ```
 extern "C" void myfunc(void* out, void** in) {
-  float (&x)[2] = *static_cast<float(*)[2]>(in[0]);
-  float (&y)[2][3] = *static_cast<float(*)[2][3]>(in[1]);
-  EXPECT_EQ(1, x[0]);
-  EXPECT_EQ(2, x[1]);
-  EXPECT_EQ(10, y[0][0]);
-  EXPECT_EQ(20, y[0][1]);
-  EXPECT_EQ(30, y[0][2]);
-  EXPECT_EQ(40, y[1][0]);
-  EXPECT_EQ(50, y[1][1]);
-  EXPECT_EQ(60, y[1][2]);
-  float (&z)[3][3] = *static_cast<float(*)[3][3]>(out);
-  z[0][0] = x[1] + y[1][0];
-  // ...
+float (&x)[2] = *static_cast<float(*)[2]>(in[0]);
+float (&y)[2][3] = *static_cast<float(*)[2][3]>(in[1]);
+EXPECT_EQ(1, x[0]);
+EXPECT_EQ(2, x[1]);
+EXPECT_EQ(10, y[0][0]);
+EXPECT_EQ(20, y[0][1]);
+EXPECT_EQ(30, y[0][2]);
+EXPECT_EQ(40, y[1][0]);
+EXPECT_EQ(50, y[1][1]);
+EXPECT_EQ(60, y[1][2]);
+float (&z)[3][3] = *static_cast<float(*)[3][3]>(out);
+z[0][0] = x[1] + y[1][0];
+// ...
 }
 ```
 
@@ -864,17 +864,17 @@ Example with contracting dimension numbers:
 
 ```
 lhs = { {1.0, 2.0, 3.0},
-        {4.0, 5.0, 6.0} }
+{4.0, 5.0, 6.0} }
 
 rhs = { {1.0, 1.0, 1.0},
-        {2.0, 2.0, 2.0} }
+{2.0, 2.0, 2.0} }
 
 DotDimensionNumbers dnums;
 dnums.add_lhs_contracting_dimensions(1);
 dnums.add_rhs_contracting_dimensions(1);
 
 DotGeneral(lhs, rhs, dnums) -> { {6.0, 12.0},
-                                 {15.0, 30.0} }
+{15.0, 30.0} }
 ```
 
 Associated batch dimension numbers from the 'lhs' and 'rhs' must have the same
@@ -886,14 +886,14 @@ Example with batch dimension numbers (batch size 2, 2x2 matrices):
 
 ```
 lhs = { { {1.0, 2.0},
-          {3.0, 4.0} },
-        { {5.0, 6.0},
-          {7.0, 8.0} } }
+{3.0, 4.0} },
+{ {5.0, 6.0},
+{7.0, 8.0} } }
 
 rhs = { { {1.0, 0.0},
-          {0.0, 1.0} },
-        { {1.0, 0.0},
-          {0.0, 1.0} } }
+{0.0, 1.0} },
+{ {1.0, 0.0},
+{0.0, 1.0} } }
 
 DotDimensionNumbers dnums;
 dnums.add_lhs_contracting_dimensions(2);
@@ -902,9 +902,9 @@ dnums.add_lhs_batch_dimensions(0);
 dnums.add_rhs_batch_dimensions(0);
 
 DotGeneral(lhs, rhs, dnums) -> { { {1.0, 2.0},
-                                   {3.0, 4.0} },
-                                 { {5.0, 6.0},
-                                   {7.0, 8.0} } }
+{3.0, 4.0} },
+{ {5.0, 6.0},
+{7.0, 8.0} } }
 ```
 
 | Input                               | Output            | Semantics        |
@@ -963,22 +963,22 @@ let a = {0.0, 1.0, 2.0, 3.0, 4.0}
 let s = {2}
 
 DynamicSlice(a, s, {2}) produces:
-  {2.0, 3.0}
+{2.0, 3.0}
 ```
 
 2-dimensional example:
 
 ```
 let b =
- { {0.0,  1.0,  2.0},
-   {3.0,  4.0,  5.0},
-   {6.0,  7.0,  8.0},
-   {9.0, 10.0, 11.0} }
+{ {0.0,  1.0,  2.0},
+{3.0,  4.0,  5.0},
+{6.0,  7.0,  8.0},
+{9.0, 10.0, 11.0} }
 let s = {2, 1}
 
 DynamicSlice(b, s, {2, 2}) produces:
-  { { 7.0,  8.0},
-    {10.0, 11.0} }
+{ { 7.0,  8.0},
+{10.0, 11.0} }
 ```
 ## DynamicUpdateSlice
 
@@ -1027,29 +1027,29 @@ let u = {5.0, 6.0}
 let s = {2}
 
 DynamicUpdateSlice(a, u, s) produces:
-  {0.0, 1.0, 5.0, 6.0, 4.0}
+{0.0, 1.0, 5.0, 6.0, 4.0}
 ```
 
 2-dimensional example:
 
 ```
 let b =
- { {0.0,  1.0,  2.0},
-   {3.0,  4.0,  5.0},
-   {6.0,  7.0,  8.0},
-   {9.0, 10.0, 11.0} }
+{ {0.0,  1.0,  2.0},
+{3.0,  4.0,  5.0},
+{6.0,  7.0,  8.0},
+{9.0, 10.0, 11.0} }
 let u =
- { {12.0,  13.0},
-   {14.0,  15.0},
-   {16.0,  17.0} }
+{ {12.0,  13.0},
+{14.0,  15.0},
+{16.0,  17.0} }
 
 let s = {1, 1}
 
 DynamicUpdateSlice(b, u, s) produces:
- { {0.0,  1.0,  2.0},
-   {3.0, 12.0, 13.0},
-   {6.0, 14.0, 15.0},
-   {9.0, 16.0, 17.0} }
+{ {0.0,  1.0,  2.0},
+{3.0, 12.0, 13.0},
+{6.0, 14.0, 15.0},
+{9.0, 16.0, 17.0} }
 ```
 
 ## Element-wise binary arithmetic operations
@@ -1235,42 +1235,42 @@ shape of `start_indices` to be `[6,7,1]`).
 
 The bounds for the output array along dimension `i` is computed as follows:
 
-  1. If `i` is present in `batch_dims` (i.e. is equal to `batch_dims[k]` for
-     some `k`) then we pick the corresponding dimension bounds out of
-     `start_indices.shape`, skipping `index_vector_dim` (i.e. pick
-     `start_indices.shape.dims`[`k`] if `k` < `index_vector_dim` and
-     `start_indices.shape.dims`[`k`+`1`] otherwise).
+1. If `i` is present in `batch_dims` (i.e. is equal to `batch_dims[k]` for
+some `k`) then we pick the corresponding dimension bounds out of
+`start_indices.shape`, skipping `index_vector_dim` (i.e. pick
+`start_indices.shape.dims`[`k`] if `k` < `index_vector_dim` and
+`start_indices.shape.dims`[`k`+`1`] otherwise).
 
-  2. If `i` is present in `offset_dims` (i.e. equal to `offset_dims`[`k`] for
-     some `k`) then we pick the corresponding bound out of `slice_sizes` after
-     accounting for `collapsed_slice_dims` (i.e. we pick
-     `adjusted_slice_sizes`[`k`] where `adjusted_slice_sizes` is `slice_sizes`
-     with the bounds at indices `collapsed_slice_dims` removed).
+2. If `i` is present in `offset_dims` (i.e. equal to `offset_dims`[`k`] for
+some `k`) then we pick the corresponding bound out of `slice_sizes` after
+accounting for `collapsed_slice_dims` (i.e. we pick
+`adjusted_slice_sizes`[`k`] where `adjusted_slice_sizes` is `slice_sizes`
+with the bounds at indices `collapsed_slice_dims` removed).
 
 Formally, the operand index `In` corresponding to an output index `Out` is
 computed as follows:
 
-  1. Let `G` = { `Out`[`k`] for `k` in `batch_dims` }.  Use `G` to slice out
-     vector `S` such that `S`[`i`] = `start_indices`[Combine(`G`, `i`)] where
-     Combine(A, b) inserts b at position `index_vector_dim` into A.  Note that
-     this is well defined even if `G` is empty -- if `G` is empty then `S` =
-     `start_indices`.
-
-  2. Create a starting index, `S`<sub>`in`</sub>, into `operand` using `S` by
-     scattering `S` using `start_index_map`.  More precisely:
-       1. `S`<sub>`in`</sub>[`start_index_map`[`k`]] = `S`[`k`] if `k` <
-          `start_index_map.size`.
-       2. `S`<sub>`in`</sub>[`_`] = `0` otherwise.
-
-  3. Create an index `O`<sub>`in`</sub> into `operand` by scattering the indices
-     at the offset dimensions in `Out` according to the `collapsed_slice_dims`
-     set.  More precisely:
-       1. `O`<sub>`in`</sub>[`expand_offset_dims`(`k`)] =
-          `Out`[`offset_dims`[`k`]] if `k` < `offset_dims.size`
-          (`expand_offset_dims` is defined below).
-       2. `O`<sub>`in`</sub>[`_`] = `0` otherwise.
-  4. `In` is `O`<sub>`in`</sub> + `S`<sub>`in`</sub> where + is element-wise
-     addition.
+1. Let `G` = { `Out`[`k`] for `k` in `batch_dims` }.  Use `G` to slice out
+vector `S` such that `S`[`i`] = `start_indices`[Combine(`G`, `i`)] where
+Combine(A, b) inserts b at position `index_vector_dim` into A.  Note that
+this is well defined even if `G` is empty -- if `G` is empty then `S` =
+`start_indices`.
+
+2. Create a starting index, `S`<sub>`in`</sub>, into `operand` using `S` by
+scattering `S` using `start_index_map`.  More precisely:
+1. `S`<sub>`in`</sub>[`start_index_map`[`k`]] = `S`[`k`] if `k` <
+`start_index_map.size`.
+2. `S`<sub>`in`</sub>[`_`] = `0` otherwise.
+
+3. Create an index `O`<sub>`in`</sub> into `operand` by scattering the indices
+at the offset dimensions in `Out` according to the `collapsed_slice_dims`
+set.  More precisely:
+1. `O`<sub>`in`</sub>[`expand_offset_dims`(`k`)] =
+`Out`[`offset_dims`[`k`]] if `k` < `offset_dims.size`
+(`expand_offset_dims` is defined below).
+2. `O`<sub>`in`</sub>[`_`] = `0` otherwise.
+4. `In` is `O`<sub>`in`</sub> + `S`<sub>`in`</sub> where + is element-wise
+addition.
 
 `expand_offset_dims` is the monotonic function with domain [`0`, `offset.size`)
 and range [`0`, `operand.rank`) \ `collapsed_slice_dims`.  So if, e.g.,
@@ -1282,21 +1282,21 @@ and range [`0`, `operand.rank`) \ `collapsed_slice_dims`.  So if, e.g.,
 Informally, every index `Out` in the output array corresponds to an element `E`
 in the operand array, computed as follows:
 
-  - We use the batch dimensions in `Out` to look up a starting index from
-    `start_indices`.
+- We use the batch dimensions in `Out` to look up a starting index from
+`start_indices`.
 
-  - We use `start_index_map` to map the starting index (which may have size less
-    than operand.rank) to a "full" starting index into operand.
+- We use `start_index_map` to map the starting index (which may have size less
+than operand.rank) to a "full" starting index into operand.
 
-  - We dynamic-slice out a slice with size `slice_sizes` using the full starting
-    index.
+- We dynamic-slice out a slice with size `slice_sizes` using the full starting
+index.
 
-  - We reshape the slice by collapsing the `collapsed_slice_dims` dimensions.
-    Since all collapsed slice dimensions have to have bound 1 this reshape is
-    always legal.
+- We reshape the slice by collapsing the `collapsed_slice_dims` dimensions.
+Since all collapsed slice dimensions have to have bound 1 this reshape is
+always legal.
 
-  - We use the offset dimensions in `Out` to index into this slice to get the
-    input element, `E`, corresponding to output index `Out`.
+- We use the offset dimensions in `Out` to index into this slice to get the
+input element, `E`, corresponding to output index `Out`.
 
 `index_vector_dim` is set to `start_indices.rank` - `1` in all of the
 examples that follow.  More interesting values for `index_vector_dim` does not
@@ -1315,7 +1315,7 @@ the output shape, and maps it to an element in the input array in the following
 way:
 
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="./images/ops_xla_gather_0.svg">
+<img style="width:100%" src="./images/ops_xla_gather_0.svg">
 </div>
 
 We first select an (`X`,`Y`) vector from the gather indices array using `G`.
@@ -1334,7 +1334,7 @@ version of the example above using a "gather indices" array of shape `[4,5,2]`
 would translate indices like this:
 
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="./images/ops_xla_gather_1.svg">
+<img style="width:100%" src="./images/ops_xla_gather_1.svg">
 </div>
 
 Again, this acts as a batch dynamic slice `G`<sub>`0`</sub> and
@@ -1343,27 +1343,27 @@ Again, this acts as a batch dynamic slice `G`<sub>`0`</sub> and
 The gather operation in XLA generalizes the informal semantics outlined above in
 the following ways:
 
- 1. We can configure which dimensions in the output shape are the offset
-    dimensions (dimensions containing `O`<sub>`0`</sub>, `O`<sub>`1`</sub> in
-    the last example).  The output batch dimensions (dimensions containing
-    `G`<sub>`0`</sub>, `G`<sub>`1`</sub> in the last example) are defined to be
-    the output dimensions that are not offset dimensions.
+1. We can configure which dimensions in the output shape are the offset
+dimensions (dimensions containing `O`<sub>`0`</sub>, `O`<sub>`1`</sub> in
+the last example).  The output batch dimensions (dimensions containing
+`G`<sub>`0`</sub>, `G`<sub>`1`</sub> in the last example) are defined to be
+the output dimensions that are not offset dimensions.
 
- 2. The number of output offset dimensions explicitly present in the output
-    shape may be smaller than the input rank.  These "missing" dimensions, which
-    are listed explicitly as `collapsed_slice_dims`, must have a slice size of
-    `1`.  Since they have a slice size of `1` the only valid index for them is
-    `0` and eliding them does not introduce ambiguity.
+2. The number of output offset dimensions explicitly present in the output
+shape may be smaller than the input rank.  These "missing" dimensions, which
+are listed explicitly as `collapsed_slice_dims`, must have a slice size of
+`1`.  Since they have a slice size of `1` the only valid index for them is
+`0` and eliding them does not introduce ambiguity.
 
- 3. The slice extracted from the "Gather Indices" array ((`X`, `Y`) in the last
-    example) may have fewer elements than the input array rank, and an explicit
-    mapping dictates how the index should be expanded to have the same rank as
-    the input.
+3. The slice extracted from the "Gather Indices" array ((`X`, `Y`) in the last
+example) may have fewer elements than the input array rank, and an explicit
+mapping dictates how the index should be expanded to have the same rank as
+the input.
 
 As a final example, we use (2) and (3) to implement `tf.gather_nd`:
 
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="./images/ops_xla_gather_2.svg">
+<img style="width:100%" src="./images/ops_xla_gather_2.svg">
 </div>
 
 `G`<sub>`0`</sub> and `G`<sub>`1`</sub> are used to slice out a starting index
@@ -1442,11 +1442,11 @@ dependency between the while loops.
 
 ```
 result1 = while (condition, init = init_value) {
-  Infeed(shape)
+Infeed(shape)
 }
 
 result2 = while (condition, init = result1) {
-  Infeed(shape)
+Infeed(shape)
 }
 ```
 
@@ -1464,12 +1464,15 @@ Infeed of the device.
 
 Builds a constant literal on device rather than a potentially large host
 transfer. Creates a rank 1 array of values starting at zero and incrementing by
-one.
+one. For floating-point types, the produced array is equivalent to
+`ConvertElementType(Iota(...))` where the `Iota` is of integral type and the
+conversion is to the floating-point type.
 
-Arguments | Type            | Semantics
---------- | --------------- | ------------------------------------
-`type`    | `PrimitiveType` | type U
-`size`    | `int64`         | The number of elements in the array.
+Arguments        | Type            | Semantics
+---------------- | --------------- | ------------------------------------
+`type`           | `PrimitiveType` | type U
+`size`           | `int64`         | The number of elements in the array.
+`iota_dimension` | `int64`         | The dimension to increment along.
 
 ## Map
 
diff --git a/tensorflow/compiler/xla/layout.cc b/tensorflow/compiler/xla/layout.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e3b5fcd5274881cec31ecf906e3461685f82a1f4
--- /dev/null
+++ b/tensorflow/compiler/xla/layout.cc
@@ -0,0 +1,96 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/layout.h"
+
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
+#include "tensorflow/compiler/xla/layout_util.h"
+
+namespace xla {
+
+TileProto Tile::ToProto() const {
+  TileProto tile_proto;
+  for (int64 i : dimensions()) {
+    tile_proto.add_dimensions(i);
+  }
+  return tile_proto;
+}
+
+string Tile::ToString() const {
+  return absl::StrCat("(", absl::StrJoin(dimensions(), ","), ")");
+}
+
+/* static */ Layout Layout::CreateFromProto(const LayoutProto& proto) {
+  Layout layout;
+  layout.set_format(proto.format());
+  layout.minor_to_major_.reserve(proto.minor_to_major_size());
+  for (const int64 dimension : proto.minor_to_major()) {
+    layout.add_minor_to_major(dimension);
+  }
+  layout.set_max_sparse_elements(proto.max_sparse_elements());
+  for (const TileProto& tile_proto : proto.tiles()) {
+    *layout.add_tiles() = Tile::CreateFromProto(tile_proto);
+  }
+  layout.set_element_size_in_bits(proto.element_size_in_bits());
+  return layout;
+}
+
+LayoutProto Layout::ToProto() const {
+  LayoutProto proto;
+  proto.set_format(format_);
+  proto.mutable_minor_to_major()->Reserve(minor_to_major_size());
+  for (const int64 dimension : minor_to_major()) {
+    proto.add_minor_to_major(dimension);
+  }
+  proto.set_max_sparse_elements(max_sparse_elements_);
+  for (const Tile& tile : tiles()) {
+    *proto.add_tiles() = tile.ToProto();
+  }
+  proto.set_element_size_in_bits(element_size_in_bits());
+  return proto;
+}
+
+string Layout::ToString() const {
+  // TODO(b/119839262): Emit tiles in string.
+  if (format() == SPARSE) {
+    return absl::StrCat("sparse{", max_sparse_elements(), "}");
+  } else if (format() == DENSE) {
+    return absl::StrCat("{", absl::StrJoin(minor_to_major(), ","), "}");
+  } else {
+    CHECK_EQ(format(), INVALID_FORMAT);
+    return "invalid{}";
+  }
+}
+
+bool Layout::operator==(const Layout& other) const {
+  return (other.format() == format() &&
+          other.minor_to_major() == minor_to_major() &&
+          other.element_size_in_bits() == element_size_in_bits() &&
+          other.max_sparse_elements() == max_sparse_elements() &&
+          other.tiles() == tiles());
+}
+
+std::ostream& operator<<(std::ostream& out, const Tile& tile) {
+  out << tile.ToString();
+  return out;
+}
+
+std::ostream& operator<<(std::ostream& out, const Layout& layout) {
+  out << layout.ToString();
+  return out;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/layout.h b/tensorflow/compiler/xla/layout.h
new file mode 100644
index 0000000000000000000000000000000000000000..313368c39e4c976fc481941eb17325101f2ba69a
--- /dev/null
+++ b/tensorflow/compiler/xla/layout.h
@@ -0,0 +1,187 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_LAYOUT_H_
+#define TENSORFLOW_COMPILER_XLA_LAYOUT_H_
+
+#include <vector>
+
+#include "absl/types/span.h"
+
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+
+// Describes a tile used in tiling-based layout. Refer to
+// g3doc/third_party/tensorflow/compiler/xla/g3doc/layout_with_tiling.md for
+// details.
+class Tile {
+ public:
+  Tile() = default;
+  explicit Tile(absl::Span<const int64> dimensions)
+      : dimensions_(dimensions.begin(), dimensions.end()) {}
+
+  // De/Serialize a Tile to and from a TileProto.
+  static Tile CreateFromProto(const TileProto& tile_proto) {
+    return Tile(AsInt64Slice(tile_proto.dimensions()));
+  }
+  TileProto ToProto() const;
+
+  bool operator==(const Tile& other) const {
+    return dimensions() == other.dimensions();
+  }
+  bool operator!=(const Tile& other) const { return !(*this == other); }
+
+  string ToString() const;
+
+  // Returns the bound of the tile in the given dimension index.
+  int64 dimension(int i) const { return dimensions_.at(i); }
+
+  // Returns the dimensions of the tile.
+  const std::vector<int64>& dimensions() const { return dimensions_; }
+
+ private:
+  // The bounds of the tile.
+  std::vector<int64> dimensions_;
+};
+
+class Layout {
+ public:
+  Layout() = default;
+
+  // Constructs a dense layout with the given minor-to-major order.
+  explicit Layout(absl::Span<const int64> minor_to_major)
+      : format_(DENSE),
+        minor_to_major_(minor_to_major.begin(), minor_to_major.end()) {}
+
+  // Constructs a dense tiled layout with the given minor-to-major order and
+  // tiles.
+  Layout(absl::Span<const int64> minor_to_major, absl::Span<const Tile> tiles)
+      : format_(DENSE),
+        minor_to_major_(minor_to_major.begin(), minor_to_major.end()),
+        tiles_(tiles.begin(), tiles.end()) {}
+
+  // Construct a shape from a LayoutProto.
+  static Layout CreateFromProto(const LayoutProto& proto);
+
+  // Returns a LayoutProto representation of the Layout.
+  LayoutProto ToProto() const;
+
+  // Returns a human-readable string that represents this layout.
+  string ToString() const;
+
+  bool operator==(const Layout& other) const;
+  bool operator!=(const Layout& other) const { return !(*this == other); }
+
+  // The following methods mirror the protobuf generated code interface for the
+  // message LayoutProto. This enabled easy migration of this data structure
+  // from a proto to a proper C++ class.
+  //
+  // TODO(b/29771030): Replace or augment these methods with a more ergonomic
+  // interface.
+
+  // Methods for accessing the format.
+  Format format() const { return format_; }
+  Layout& set_format(Format value) {
+    format_ = value;
+    return *this;
+  }
+
+  // Methods for accessing the minor-to-major array.
+  int minor_to_major_size() const { return minor_to_major_.size(); }
+  int64 minor_to_major(int index) const { return minor_to_major_.at(index); }
+  Layout& set_minor_to_major(int index, int64 value) {
+    minor_to_major_.at(index) = value;
+    return *this;
+  }
+  Layout& add_minor_to_major(int64 value) {
+    minor_to_major_.push_back(value);
+    return *this;
+  }
+  Layout& clear_minor_to_major() {
+    minor_to_major_.clear();
+    return *this;
+  }
+  const std::vector<int64>& minor_to_major() const { return minor_to_major_; }
+  std::vector<int64>* mutable_minor_to_major() { return &minor_to_major_; }
+
+  // Methods for accessing the tile field.
+  int tiles_size() const { return tiles_.size(); }
+  const Tile& tiles(int index) const { return tiles_.at(index); }
+  Tile* mutable_tiles(int index) { return &tiles_.at(index); }
+  Tile* add_tiles() {
+    tiles_.push_back(Tile());
+    return &tiles_.back();
+  }
+  Layout& clear_tiles() {
+    tiles_.clear();
+    return *this;
+  }
+  const std::vector<Tile>& tiles() const { return tiles_; }
+  std::vector<Tile>* mutable_tiles() { return &tiles_; }
+
+  // Methods for accessing the int64 fields.
+  int64 max_sparse_elements() const { return max_sparse_elements_; }
+  Layout& set_max_sparse_elements(int64 value) {
+    max_sparse_elements_ = value;
+    return *this;
+  }
+  int64 element_size_in_bits() const { return element_size_in_bits_; }
+  Layout& set_element_size_in_bits(int64 value) {
+    element_size_in_bits_ = value;
+    return *this;
+  }
+
+  void Swap(Layout* other) {
+    using std::swap;
+    swap(*this, *other);
+  }
+
+  void Clear() {
+    format_ = INVALID_FORMAT;
+    minor_to_major_.clear();
+    max_sparse_elements_ = 0;
+    element_size_in_bits_ = 0;
+  }
+
+ public:
+  // The format of this layout.
+  Format format_ = INVALID_FORMAT;
+
+  // Sequence of dimension numbers, from minor (fastest varying index) to major
+  // (slowest varying index).
+  std::vector<int64> minor_to_major_;
+
+  // The maximum number of elements that can be stored for SPARSE formats.  This
+  // can be used to determine the maximum size in bytes of arrays stored in
+  // memory.  This field must be zero unless the format is SPARSE.
+  int64 max_sparse_elements_ = 0;
+
+  // The number of bits used to store an individual array element.
+  int64 element_size_in_bits_ = 0;
+
+  // The tiles used in tiling-based layout.
+  std::vector<Tile> tiles_;
+};
+
+std::ostream& operator<<(std::ostream& out, const Tile& Tile);
+std::ostream& operator<<(std::ostream& out, const Layout& layout);
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_LAYOUT_H_
diff --git a/tensorflow/compiler/xla/layout_test.cc b/tensorflow/compiler/xla/layout_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fb6abd3f6523b978e72b21ec082ae06973e86243
--- /dev/null
+++ b/tensorflow/compiler/xla/layout_test.cc
@@ -0,0 +1,104 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/layout.h"
+
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
+#include "tensorflow/compiler/xla/layout_util.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+
+namespace xla {
+namespace {
+
+class LayoutTest : public ::testing::Test {};
+
+TEST_F(LayoutTest, ToString) {
+  EXPECT_EQ(Layout().ToString(), "invalid{}");
+  EXPECT_EQ(Layout({4, 5, 6}).ToString(), "{4,5,6}");
+  EXPECT_EQ(Layout().set_format(SPARSE).set_max_sparse_elements(123).ToString(),
+            "sparse{123}");
+  EXPECT_EQ(Layout({4, 5, 6}).ToString(), "{4,5,6}");
+  EXPECT_EQ(Layout({3, 2, 1, 0}, {Tile({42, 123}), Tile({4, 5})}).ToString(),
+            "{3,2,1,0}");
+  EXPECT_EQ(
+      Layout({1, 0}, {Tile({2, 55})}).set_element_size_in_bits(42).ToString(),
+      "{1,0}");
+}
+
+TEST_F(LayoutTest, StreamOut) {
+  {
+    std::ostringstream oss;
+    oss << Tile({7, 8});
+    EXPECT_EQ(oss.str(), "(7,8)");
+  }
+
+  {
+    std::ostringstream oss;
+    oss << Layout({0, 1, 2});
+    EXPECT_EQ(oss.str(), "{0,1,2}");
+  }
+}
+
+TEST_F(LayoutTest, SparseLayoutMaxElements) {
+  EXPECT_EQ(LayoutUtil::MaxSparseElements(LayoutUtil::MakeSparseLayout(101)),
+            101);
+}
+
+TEST_F(LayoutTest, Equality) {
+  EXPECT_EQ(Layout(), Layout());
+  const std::vector<int64> empty_dims;
+  EXPECT_EQ(Layout(empty_dims), Layout(empty_dims));
+  EXPECT_NE(Layout(), Layout(empty_dims));
+  EXPECT_EQ(Layout({0, 1, 2, 3}), Layout({0, 1, 2, 3}));
+  EXPECT_NE(Layout({0, 1, 2, 3}), Layout({0, 1, 2}));
+  EXPECT_EQ(Layout({0, 1, 2}, {Tile({42, 44})}),
+            Layout({0, 1, 2}, {Tile({42, 44})}));
+  EXPECT_NE(Layout({0, 1, 2}, {Tile({42, 44})}),
+            Layout({0, 1, 2}, {Tile({42, 45})}));
+  EXPECT_NE(Layout({0, 1, 2}, {Tile({42, 44})}), Layout({0, 1, 2, 3}));
+  EXPECT_EQ(Layout({0, 1, 2}).set_element_size_in_bits(33),
+            Layout({0, 1, 2}).set_element_size_in_bits(33));
+  EXPECT_NE(Layout({0, 1, 2}).set_element_size_in_bits(33),
+            Layout({0, 1, 2}).set_element_size_in_bits(7));
+  EXPECT_EQ(Layout().set_format(SPARSE), Layout().set_format(SPARSE));
+  EXPECT_EQ(Layout().set_format(SPARSE).set_max_sparse_elements(42),
+            Layout().set_format(SPARSE).set_max_sparse_elements(42));
+  EXPECT_NE(Layout().set_format(SPARSE).set_max_sparse_elements(42),
+            Layout().set_format(SPARSE).set_max_sparse_elements(24));
+}
+
+TEST_F(LayoutTest, LayoutToFromProto) {
+  // Round-trips a Layout through proto de/serialization.
+  auto expect_unchanged = [](const Layout& layout) {
+    EXPECT_EQ(layout, Layout::CreateFromProto(layout.ToProto()));
+  };
+
+  expect_unchanged(Layout());
+  expect_unchanged(Layout({1, 3, 2, 0}));
+  expect_unchanged(Layout().set_format(SPARSE));
+  expect_unchanged(Layout().set_format(SPARSE).set_max_sparse_elements(123));
+  expect_unchanged(Layout({0, 1}).set_element_size_in_bits(42));
+  expect_unchanged(Layout({3, 2, 1, 0}, {Tile({42, 123}), Tile({4, 5})}));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/layout_util.cc b/tensorflow/compiler/xla/layout_util.cc
index dbb81381acde645f08639737b6e7b6f6ad971f9b..ddccd8c798df5b926d2e5aea8975cb6cb6640824 100644
--- a/tensorflow/compiler/xla/layout_util.cc
+++ b/tensorflow/compiler/xla/layout_util.cc
@@ -41,15 +41,13 @@ namespace {
 
 // Internal helper for GetDefaultLayoutForShape and SetToDefaultLayout. Sets
 // minor_to_major to the value that represents the default layout.
-void SetDefaultLayoutToContainer(
-    tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>*
-        minor_to_major) {
+void SetDefaultLayoutToContainer(std::vector<int64>* minor_to_major) {
   // The default XLA layout is major-to-minor (dim 0 is major).
   // For more information on XLA layouts, see:
   // https://www.tensorflow.org/performance/xla/shapes
   const int64 size = minor_to_major->size();
   for (int64 i = 0; i < size; ++i) {
-    minor_to_major->Set(i, size - 1 - i);
+    (*minor_to_major)[i] = size - 1 - i;
   }
 }
 
@@ -94,9 +92,8 @@ namespace {
 Layout CreateDefaultLayoutForRank(int64 rank) {
   Layout layout;
   layout.set_format(DENSE);
-  tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>*
-      minor_to_major = layout.mutable_minor_to_major();
-  minor_to_major->Resize(rank, 0);
+  std::vector<int64>* minor_to_major = layout.mutable_minor_to_major();
+  minor_to_major->resize(rank, 0);
   SetDefaultLayoutToContainer(minor_to_major);
   return layout;
 }
@@ -139,9 +136,8 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
     shape->clear_layout();
   } else if (ShapeUtil::IsArray(*shape)) {
     shape->mutable_layout()->set_format(DENSE);
-    tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>*
-        minor_to_major = shape->mutable_layout()->mutable_minor_to_major();
-    minor_to_major->Resize(shape->dimensions_size(), 0);
+    auto* minor_to_major = shape->mutable_layout()->mutable_minor_to_major();
+    minor_to_major->resize(shape->dimensions_size(), 0);
     SetDefaultLayoutToContainer(minor_to_major);
   } else {
     // Opaque, token types etc. have no layout.
@@ -210,9 +206,8 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
   }
 
   if (layout.format() == INVALID_FORMAT || !Format_IsValid(layout.format())) {
-    return InvalidArgument(
-        "Layout has an invalid format (%d) in layout {%s}, shape {%s}",
-        layout.format(), layout.ShortDebugString(), shape.ShortDebugString());
+    return InvalidArgument("Layout has an invalid format (%d)",
+                           layout.format());
   }
 
   if (layout.format() == DENSE) {
@@ -316,7 +311,7 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
 }
 
 /* static */ bool LayoutUtil::Equal(const Layout& lhs, const Layout& rhs) {
-  return protobuf_util::ProtobufEquals(lhs, rhs);
+  return lhs == rhs;
 }
 
 /* static */ absl::Span<const int64> LayoutUtil::MinorToMajor(
@@ -358,11 +353,7 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
 }
 
 /* static */ string LayoutUtil::HumanString(const Layout& layout) {
-  if (IsSparse(layout)) {
-    return absl::StrCat("sparse{", layout.max_sparse_elements(), "}");
-  }
-  CHECK(IsDense(layout));
-  return absl::StrCat("{", absl::StrJoin(layout.minor_to_major(), ","), "}");
+  return layout.ToString();
 }
 
 namespace {
@@ -444,11 +435,6 @@ Status LayoutUtil::CopyLayoutBetweenShapes(const Shape& src, Shape* dst) {
   return true;
 }
 
-std::ostream& operator<<(std::ostream& out, const Layout& layout) {
-  out << LayoutUtil::HumanString(layout);
-  return out;
-}
-
 /*static*/ size_t LayoutUtil::Hash(const Layout& layout) {
   using tensorflow::hash;
   using tensorflow::Hash64Combine;
diff --git a/tensorflow/compiler/xla/layout_util.h b/tensorflow/compiler/xla/layout_util.h
index 6c298e57252449ce3f1f9055436e918f2d9f17f1..609dba67bcdbcb11be0906b7d87a52a17ba0dfbd 100644
--- a/tensorflow/compiler/xla/layout_util.h
+++ b/tensorflow/compiler/xla/layout_util.h
@@ -21,6 +21,7 @@ limitations under the License.
 #include <string>
 
 #include "absl/types/span.h"
+#include "tensorflow/compiler/xla/layout.h"
 #include "tensorflow/compiler/xla/shape.h"
 #include "tensorflow/compiler/xla/status.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -195,8 +196,6 @@ class LayoutUtil {
   TF_DISALLOW_COPY_AND_ASSIGN(LayoutUtil);
 };
 
-std::ostream& operator<<(std::ostream& out, const Layout& layout);
-
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_LAYOUT_UTIL_H_
diff --git a/tensorflow/compiler/xla/layout_util_test.cc b/tensorflow/compiler/xla/layout_util_test.cc
index 12ce2d2d7c6fa8c590035f9ff2af50001ccf80d8..4cc94c270cd64eb19761cc1044861c7d185b7888 100644
--- a/tensorflow/compiler/xla/layout_util_test.cc
+++ b/tensorflow/compiler/xla/layout_util_test.cc
@@ -317,17 +317,6 @@ TEST_F(LayoutUtilTest, DefaultLayoutGettersMajorToMinor) {
                             ShapeUtil::MakeShape(F32, {10, 20, 30, 15, 25}))));
 }
 
-TEST_F(LayoutUtilTest, SparseLayoutMaxElements) {
-  EXPECT_EQ(LayoutUtil::MaxSparseElements(LayoutUtil::MakeSparseLayout(101)),
-            101);
-}
-
-TEST_F(LayoutUtilTest, StreamOut) {
-  std::ostringstream oss;
-  oss << LayoutUtil::MakeLayout({0, 1, 2});
-  EXPECT_EQ(oss.str(), "{0,1,2}");
-}
-
 TEST_F(LayoutUtilTest, ValidateLayout_ValidArrayLayout) {
   Shape shape = ShapeUtil::MakeShapeWithLayout(F32, {2, 3}, {0, 1});
   auto status =
diff --git a/tensorflow/compiler/xla/literal.cc b/tensorflow/compiler/xla/literal.cc
index 8f480c1f1079b4e1a5be53958ebdf6e004ad9ebe..277c98721e59ac12965392500fdfdc3d91e59a8b 100644
--- a/tensorflow/compiler/xla/literal.cc
+++ b/tensorflow/compiler/xla/literal.cc
@@ -1028,20 +1028,21 @@ string ShapeToString(bool print_layout, const Shape& shape) {
 }
 
 void ToStringHelper(const LiteralBase& literal, const ShapeIndex& shape_index,
-                    bool print_layout, std::vector<string>* pieces);
+                    bool print_shape, bool print_layout,
+                    std::vector<string>* pieces);
 
 void TupleToStringHelper(const LiteralBase& literal,
-                         const ShapeIndex& shape_index, bool print_layout,
-                         std::vector<string>* pieces) {
+                         const ShapeIndex& shape_index, bool print_shape,
+                         bool print_layout, std::vector<string>* pieces) {
   const Shape& subshape = ShapeUtil::GetSubshape(literal.shape(), shape_index);
-  pieces->push_back(ShapeToString(print_layout, subshape));
-  pieces->push_back(" (\n");
+  pieces->push_back("(\n");
   std::vector<string> tuple_pieces;
   for (int i = 0; i < ShapeUtil::TupleElementCount(subshape); ++i) {
     ShapeIndex element_index = shape_index;
     element_index.push_back(i);
     std::vector<string> element_pieces;
-    ToStringHelper(literal, element_index, print_layout, &element_pieces);
+    ToStringHelper(literal, element_index, print_shape, print_layout,
+                   &element_pieces);
     tuple_pieces.push_back(absl::StrJoin(element_pieces, ""));
   }
   pieces->push_back(absl::StrJoin(tuple_pieces, ",\n"));
@@ -1049,9 +1050,11 @@ void TupleToStringHelper(const LiteralBase& literal,
 }
 
 void SparseArrayToStringHelper(const LiteralBase& literal,
-                               const Shape& subshape, bool print_layout,
-                               std::vector<string>* pieces) {
-  pieces->push_back(ShapeToString(print_layout, subshape));
+                               const Shape& subshape, bool print_shape,
+                               bool print_layout, std::vector<string>* pieces) {
+  if (print_shape) {
+    pieces->push_back(ShapeToString(print_layout, subshape));
+  }
   pieces->push_back("{");
   int64 rank = ShapeUtil::Rank(subshape);
   int64 num_elements = literal.sparse_element_count();
@@ -1073,8 +1076,8 @@ void SparseArrayToStringHelper(const LiteralBase& literal,
 }
 
 void DenseArrayToStringHelper(const LiteralBase& literal,
-                              const ShapeIndex& shape_index, bool print_layout,
-                              std::vector<string>* pieces) {
+                              const ShapeIndex& shape_index, bool print_shape,
+                              bool print_layout, std::vector<string>* pieces) {
   const Shape& subshape = ShapeUtil::GetSubshape(literal.shape(), shape_index);
   int64 rank = ShapeUtil::Rank(subshape);
 
@@ -1135,7 +1138,7 @@ void DenseArrayToStringHelper(const LiteralBase& literal,
         }
       };
 
-  if (rank > 1) {
+  if (print_shape) {
     pieces->push_back(ShapeToString(print_layout, subshape));
     pieces->push_back(" ");
   }
@@ -1146,19 +1149,23 @@ void DenseArrayToStringHelper(const LiteralBase& literal,
 }
 
 void ToStringHelper(const LiteralBase& literal, const ShapeIndex& shape_index,
-                    bool print_layout, std::vector<string>* pieces) {
+                    bool print_shape, bool print_layout,
+                    std::vector<string>* pieces) {
   const Shape& subshape = ShapeUtil::GetSubshape(literal.shape(), shape_index);
   CHECK(LayoutUtil::HasLayout(literal.shape()));
   CHECK(LayoutUtil::HasLayout(subshape));
   if (ShapeUtil::IsTuple(subshape)) {
-    TupleToStringHelper(literal, shape_index, print_layout, pieces);
+    TupleToStringHelper(literal, shape_index, print_shape, print_layout,
+                        pieces);
   } else if (ShapeUtil::IsToken(subshape)) {
     pieces->push_back("token");
   } else if (LayoutUtil::IsSparseArray(subshape)) {
-    SparseArrayToStringHelper(literal, subshape, print_layout, pieces);
+    SparseArrayToStringHelper(literal, subshape, print_shape, print_layout,
+                              pieces);
   } else {
     CHECK(LayoutUtil::IsDenseArray(subshape));
-    DenseArrayToStringHelper(literal, shape_index, print_layout, pieces);
+    DenseArrayToStringHelper(literal, shape_index, print_shape, print_layout,
+                             pieces);
   }
 }
 
@@ -1169,10 +1176,27 @@ int64 LiteralBase::sparse_element_count() const {
   return sparse_indices()->index_count();
 }
 
-string LiteralBase::ToString(bool print_layout) const {
+string LiteralBase::ToString() const {
+  std::vector<string> pieces;
+  CHECK(LayoutUtil::HasLayout(this->shape()));
+  ToStringHelper(*this, {}, /*print_shape=*/true,
+                 /*print_layout=*/false, &pieces);
+  return absl::StrJoin(pieces, "");
+}
+
+string LiteralBase::ToStringWithoutShape() const {
+  std::vector<string> pieces;
+  CHECK(LayoutUtil::HasLayout(this->shape()));
+  ToStringHelper(*this, {}, /*print_shape=*/false,
+                 /*print_layout=*/false, &pieces);
+  return absl::StrJoin(pieces, "");
+}
+
+string LiteralBase::ToStringWithLayout() const {
   std::vector<string> pieces;
   CHECK(LayoutUtil::HasLayout(this->shape()));
-  ToStringHelper(*this, {}, print_layout, &pieces);
+  ToStringHelper(*this, {}, /*print_shape=*/true,
+                 /*print_layout=*/true, &pieces);
   return absl::StrJoin(pieces, "");
 }
 
diff --git a/tensorflow/compiler/xla/literal.h b/tensorflow/compiler/xla/literal.h
index fa9a71af4ceb998a7a289443cbef70eb52cb1a11..67e908e7ec4d4346f4e26a99a42aac26928ec0c2 100644
--- a/tensorflow/compiler/xla/literal.h
+++ b/tensorflow/compiler/xla/literal.h
@@ -92,9 +92,20 @@ class LiteralBase {
   // array.
   string GetR1U8AsString() const;
 
-  // Returns a string representation of the literal value.
-  // Warning: this function can take minutes for multi-million element Literals.
-  string ToString(bool print_layout = false) const;
+  // Returns a string representation of the literal value. The Shape of the
+  // literal is a prefix of the literal value in the string.
+
+  // Warning: this function can take minutes for multi-million
+  // element Literals.
+  string ToString() const;
+
+  // Returns a string representation of the literal value which does *not*
+  // include the shape string.
+  string ToStringWithoutShape() const;
+
+  // Returns a string representation of the literal value which includes the
+  // shape string with its layout.does *not* include the shape string.
+  string ToStringWithLayout() const;
 
   // Gets an element in the literal at the given index. The multi_index is
   // CHECKed against the dimension sizes.
diff --git a/tensorflow/compiler/xla/literal_comparison.cc b/tensorflow/compiler/xla/literal_comparison.cc
index b044f0ad73f13a0599e77f1f43888bc974e31f73..1ac9a48e805daa86f0dc65b54626195c89241020 100644
--- a/tensorflow/compiler/xla/literal_comparison.cc
+++ b/tensorflow/compiler/xla/literal_comparison.cc
@@ -46,68 +46,102 @@ uint16 GetRawValue(Eigen::half val) { return val.x; }
 // between the left-hand-side and right-hand-side, by bit-casting to UnsignedT
 // -- on miscompare, a nice error message is given in the AssertionFailure.
 template <typename FloatT, typename UnsignedT>
-Status CompareFloatsBitwiseEqual(FloatT lhs, FloatT rhs,
-                                 absl::Span<const int64> multi_index) {
+bool CompareFloatsBitwiseEqual(FloatT lhs, FloatT rhs,
+                               absl::Span<const int64> multi_index) {
+  auto ulhs = absl::bit_cast<UnsignedT>(GetRawValue(lhs));
+  auto urhs = absl::bit_cast<UnsignedT>(GetRawValue(rhs));
+  return ulhs == urhs;
+}
+
+// Templated comparator that specializes for float equality comparison with the
+// bitwise helper above (this is the un-specialized fallback, to just use the
+// default gunit implementation).
+template <typename NativeT>
+bool CompareEqual(NativeT lhs, NativeT rhs,
+                  absl::Span<const int64> multi_index) {
+  return lhs == rhs;
+}
+
+// Specializations for floating types that do bitwise comparisons when equality
+// comparison is requested.
+template <>
+bool CompareEqual<bfloat16>(bfloat16 lhs, bfloat16 rhs,
+                            absl::Span<const int64> multi_index) {
+  return CompareFloatsBitwiseEqual<bfloat16, uint16>(lhs, rhs, multi_index);
+}
+template <>
+bool CompareEqual<Eigen::half>(Eigen::half lhs, Eigen::half rhs,
+                               absl::Span<const int64> multi_index) {
+  return CompareFloatsBitwiseEqual<Eigen::half, uint16>(lhs, rhs, multi_index);
+}
+template <>
+bool CompareEqual<float>(float lhs, float rhs,
+                         absl::Span<const int64> multi_index) {
+  return CompareFloatsBitwiseEqual<float, uint32>(lhs, rhs, multi_index);
+}
+template <>
+bool CompareEqual<double>(double lhs, double rhs,
+                          absl::Span<const int64> multi_index) {
+  return CompareFloatsBitwiseEqual<double, uint64>(lhs, rhs, multi_index);
+}
+template <>
+bool CompareEqual<complex64>(complex64 lhs, complex64 rhs,
+                             absl::Span<const int64> multi_index) {
+  return CompareEqual<float>(lhs.real(), rhs.real(), multi_index) &&
+         CompareEqual<float>(lhs.imag(), rhs.imag(), multi_index);
+}
+
+template <typename NativeT, typename UnsignedT>
+Status MakeBitwiseErrorStatus(NativeT lhs, NativeT rhs,
+                              absl::Span<const int64> multi_index) {
   auto ulhs = absl::bit_cast<UnsignedT>(GetRawValue(lhs));
   auto urhs = absl::bit_cast<UnsignedT>(GetRawValue(rhs));
   auto lhs_double = static_cast<double>(lhs);
   auto rhs_double = static_cast<double>(rhs);
-  if (ulhs != urhs) {
     return InvalidArgument(
         "floating values are not bitwise-equal; and equality testing "
         "was requested: %s=%g=%a vs %s=%g=%a at array index %s",
         StrCat(absl::Hex(ulhs)), lhs_double, lhs_double,
         StrCat(absl::Hex(urhs)), rhs_double, rhs_double,
         LiteralUtil::MultiIndexAsString(multi_index));
-  }
-  return Status::OK();
 }
 
-// Templated comparator that specializes for float equality comparison with the
-// bitwise helper above (this is the un-specialized fallback, to just use the
-// default gunit implementation).
 template <typename NativeT>
-Status CompareEqual(NativeT lhs, NativeT rhs,
-                    absl::Span<const int64> multi_index) {
-  if (lhs == rhs) {
-    return Status::OK();
-  }
+Status MakeErrorStatus(NativeT lhs, NativeT rhs,
+                       absl::Span<const int64> multi_index) {
   return InvalidArgument(
       "first mismatch at array index %s:\n  expected value: %s\n  actual "
       "value:   %s",
       LiteralUtil::MultiIndexAsString(multi_index), StrCat(lhs), StrCat(rhs));
 }
 
-// Specializations for floating types that do bitwise comparisons when equality
-// comparison is requested.
 template <>
-Status CompareEqual<bfloat16>(bfloat16 lhs, bfloat16 rhs,
-                              absl::Span<const int64> multi_index) {
-  return CompareFloatsBitwiseEqual<bfloat16, uint16>(lhs, rhs, multi_index);
+Status MakeErrorStatus(bfloat16 lhs, bfloat16 rhs,
+                       absl::Span<const int64> multi_index) {
+  return MakeBitwiseErrorStatus<bfloat16, uint16>(lhs, rhs, multi_index);
 }
 template <>
-Status CompareEqual<Eigen::half>(Eigen::half lhs, Eigen::half rhs,
-                                 absl::Span<const int64> multi_index) {
-  return CompareFloatsBitwiseEqual<Eigen::half, uint16>(lhs, rhs, multi_index);
+Status MakeErrorStatus(Eigen::half lhs, Eigen::half rhs,
+                       absl::Span<const int64> multi_index) {
+  return MakeBitwiseErrorStatus<Eigen::half, uint16>(lhs, rhs, multi_index);
 }
 template <>
-Status CompareEqual<float>(float lhs, float rhs,
-                           absl::Span<const int64> multi_index) {
-  return CompareFloatsBitwiseEqual<float, uint32>(lhs, rhs, multi_index);
+Status MakeErrorStatus(float lhs, float rhs,
+                       absl::Span<const int64> multi_index) {
+  return MakeBitwiseErrorStatus<float, uint32>(lhs, rhs, multi_index);
 }
 template <>
-Status CompareEqual<double>(double lhs, double rhs,
-                            absl::Span<const int64> multi_index) {
-  return CompareFloatsBitwiseEqual<double, uint64>(lhs, rhs, multi_index);
+Status MakeErrorStatus(double lhs, double rhs,
+                       absl::Span<const int64> multi_index) {
+  return MakeBitwiseErrorStatus<double, uint64>(lhs, rhs, multi_index);
 }
 template <>
-Status CompareEqual<complex64>(complex64 lhs, complex64 rhs,
-                               absl::Span<const int64> multi_index) {
-  auto res = CompareEqual<float>(lhs.real(), rhs.real(), multi_index);
-  if (!res.ok()) {
-    return res;
+Status MakeErrorStatus(complex64 lhs, complex64 rhs,
+                       absl::Span<const int64> multi_index) {
+  if (!CompareEqual<float>(lhs.real(), rhs.real(), multi_index)) {
+    return MakeErrorStatus(lhs.real(), rhs.real(), multi_index);
   }
-  return CompareEqual<float>(lhs.imag(), rhs.imag(), multi_index);
+  return MakeErrorStatus(lhs.imag(), rhs.imag(), multi_index);
 }
 
 // A recursive function which iterates through every index of expected and
@@ -119,7 +153,11 @@ Status Equal(LiteralSlice expected, LiteralSlice actual,
   if (dimension == expected.shape().dimensions_size()) {
     NativeT expected_value = expected.Get<NativeT>(multi_index);
     NativeT actual_value = actual.Get<NativeT>(multi_index);
-    return CompareEqual<NativeT>(expected_value, actual_value, multi_index);
+    bool result =
+        CompareEqual<NativeT>(expected_value, actual_value, multi_index);
+    return result ? Status::OK()
+                  : MakeErrorStatus<NativeT>(expected_value, actual_value,
+                                             multi_index);
   }
 
   Status result;
@@ -330,7 +368,7 @@ class NearComparator {
         NanMismatch(expected, actual, error_.relaxed_nans);
     float abs_error;
     float rel_error;
-    if (CompareEqual<T>(expected, actual, {linear_index}).ok()) {
+    if (CompareEqual<T>(expected, actual, {linear_index})) {
       abs_error = 0;
       rel_error = 0;
     } else if (is_nan_mismatch) {
@@ -344,7 +382,7 @@ class NearComparator {
     } else if (IsInf(expected) || IsInf(actual)) {
       // If either the expected or actual value is infinity but not both,
       // then both absolute and relative error are regarded as inifity.
-      CHECK(!CompareEqual(expected, actual, {linear_index}).ok());
+      CHECK(!CompareEqual(expected, actual, {linear_index}));
       abs_error = std::numeric_limits<float>::infinity();
       rel_error = std::numeric_limits<float>::infinity();
     } else {
diff --git a/tensorflow/compiler/xla/literal_test.cc b/tensorflow/compiler/xla/literal_test.cc
index 49363ad802ddb9520f89b53257216bc7ddaf8ff5..d8c7141cacb8f60cb4ce56d07ac5827a8dbf9b20 100644
--- a/tensorflow/compiler/xla/literal_test.cc
+++ b/tensorflow/compiler/xla/literal_test.cc
@@ -98,42 +98,42 @@ class LiteralUtilTest : public ::testing::Test {
 
 TEST_F(LiteralUtilTest, LiteralScalarToString) {
   auto true_lit = LiteralUtil::CreateR0<bool>(true);
-  EXPECT_EQ("true", true_lit.ToString());
+  EXPECT_EQ("pred[] true", true_lit.ToString());
 
   auto false_lit = LiteralUtil::CreateR0<bool>(false);
-  EXPECT_EQ("false", false_lit.ToString());
+  EXPECT_EQ("pred[] false", false_lit.ToString());
 
   auto u32_lit = LiteralUtil::CreateR0<uint32>(42);
-  EXPECT_EQ("42", u32_lit.ToString());
+  EXPECT_EQ("u32[] 42", u32_lit.ToString());
 
   auto s32_lit = LiteralUtil::CreateR0<int32>(-999);
-  EXPECT_EQ("-999", s32_lit.ToString());
+  EXPECT_EQ("s32[] -999", s32_lit.ToString());
 
   auto f32_lit = LiteralUtil::CreateR0<float>(3.14f);
-  EXPECT_EQ("3.14", f32_lit.ToString());
+  EXPECT_EQ("f32[] 3.14", f32_lit.ToString());
 
   auto f16_lit = LiteralUtil::CreateR0<half>(static_cast<half>(0.5f));
-  EXPECT_EQ("0.5", f16_lit.ToString());
+  EXPECT_EQ("f16[] 0.5", f16_lit.ToString());
 
   auto c64_lit = LiteralUtil::CreateR0<complex64>({3.14f, 2.78f});
-  EXPECT_EQ("(3.14, 2.78)", c64_lit.ToString());
+  EXPECT_EQ("c64[] (3.14, 2.78)", c64_lit.ToString());
 
   auto bf16_lit = LiteralUtil::CreateR0<bfloat16>(static_cast<bfloat16>(0.5f));
-  EXPECT_EQ("0.5", bf16_lit.ToString());
+  EXPECT_EQ("bf16[] 0.5", bf16_lit.ToString());
 
   // 3.14 will be rounded to 3.14062 in bfloat16 format.
   auto bf16_lit_truncated =
       LiteralUtil::CreateR0<bfloat16>(static_cast<bfloat16>(3.14f));
-  ASSERT_EQ("3.14062", bf16_lit_truncated.ToString());
+  ASSERT_EQ("bf16[] 3.14062", bf16_lit_truncated.ToString());
 
   auto bf16_lit_truncated2 =
       LiteralUtil::CreateR0<bfloat16>(static_cast<bfloat16>(9.001f));
-  EXPECT_EQ("9", bf16_lit_truncated2.ToString());
+  EXPECT_EQ("bf16[] 9", bf16_lit_truncated2.ToString());
 }
 
 TEST_F(LiteralUtilTest, LiteralVectorToString) {
   auto pred_vec = LiteralUtil::CreateR1<bool>({true, false, true});
-  EXPECT_EQ("{1, 0, 1}", pred_vec.ToString());
+  EXPECT_EQ("pred[3] {1, 0, 1}", pred_vec.ToString());
 }
 
 TEST_F(LiteralUtilTest, R2ToString) {
@@ -210,8 +210,8 @@ TEST_F(LiteralUtilTest, TupleToString) {
   auto scalar = LiteralUtil::CreateR0<float>(1.0);
   auto matrix = LiteralUtil::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}});
   auto tuple = LiteralUtil::MakeTuple({&scalar, &matrix});
-  const string expected = R"((f32[], f32[2,2]) (
-1,
+  const string expected = R"((
+f32[] 1,
 f32[2,2] {
   { 1, 2 },
   { 3, 4 }
@@ -1890,7 +1890,7 @@ TEST_F(LiteralUtilTest, SortSparseElements) {
   literal.AppendSparseElement<float>({3, 4, 5}, 3.0);
   literal.AppendSparseElement<float>({1, 2, 3}, 1.0);
   literal.SortSparseElements();
-  EXPECT_EQ(literal.ToString(false),
+  EXPECT_EQ(literal.ToString(),
             "f32[10,10,10]{[1, 2, 3]: 1, [2, 3, 4]: 2, [3, 4, 5]: 3}");
 }
 
diff --git a/tensorflow/compiler/xla/packed_literal_reader.cc b/tensorflow/compiler/xla/packed_literal_reader.cc
index 0f86f9f35e105713aa3072a9ebf572d33d35d66d..339660cf44fd64fc5859e72255d63762fcf20efe 100644
--- a/tensorflow/compiler/xla/packed_literal_reader.cc
+++ b/tensorflow/compiler/xla/packed_literal_reader.cc
@@ -42,8 +42,7 @@ PackedLiteralReader::~PackedLiteralReader() { delete file_; }
 StatusOr<Literal> PackedLiteralReader::Read(const Shape& shape,
                                             const Layout* layout) {
   VLOG(3) << "reading shape from file: " << ShapeUtil::HumanString(shape)
-          << " layout: "
-          << (layout == nullptr ? "<none>" : layout->ShortDebugString());
+          << " layout: " << (layout == nullptr ? "<none>" : layout->ToString());
   Shape literal_shape = shape;
   if (layout != nullptr) {
     TF_RETURN_IF_ERROR(
diff --git a/tensorflow/compiler/xla/primitive_util.cc b/tensorflow/compiler/xla/primitive_util.cc
index b16147e3be71771269d8b7a18528bef3a8c72d99..00ad01fc407017624a9183d69e61cb0d382e3f11 100644
--- a/tensorflow/compiler/xla/primitive_util.cc
+++ b/tensorflow/compiler/xla/primitive_util.cc
@@ -15,6 +15,9 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/primitive_util.h"
 
+#include "absl/strings/ascii.h"
+#include "absl/strings/numbers.h"
+#include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
@@ -90,5 +93,65 @@ bool IsArrayType(PrimitiveType primitive_type) {
          primitive_type != OPAQUE && primitive_type != TOKEN;
 }
 
+// Class to memoize the computation of
+//   absl::AsciiStrToLower(PrimitiveType_Name(p))
+// for all PrimitiveType values "p"
+class PrimitiveTypeNameGenerator {
+ public:
+  PrimitiveTypeNameGenerator() {
+    for (int i = 0; i < PrimitiveType_ARRAYSIZE; i++) {
+      if (PrimitiveType_IsValid(i)) {
+        lowercase_name_[i] = absl::AsciiStrToLower(
+            PrimitiveType_Name(static_cast<PrimitiveType>(i)));
+      }
+    }
+  }
+  const string& LowercaseName(PrimitiveType t) {
+    return lowercase_name_[static_cast<int>(t)];
+  }
+
+ private:
+  string lowercase_name_[PrimitiveType_ARRAYSIZE];
+};
+
+const string& LowercasePrimitiveTypeName(PrimitiveType s) {
+  static auto* gen = new PrimitiveTypeNameGenerator();
+  return gen->LowercaseName(s);
+}
+
+namespace {
+
+// Returns a map from lower-case primitive type name to primitive type.
+const std::unordered_map<string, PrimitiveType>& GetPrimitiveTypeStringMap() {
+  static std::unordered_map<string, PrimitiveType>* name_to_type = [] {
+    static auto* map = new std::unordered_map<string, PrimitiveType>;
+    for (int i = 0; i < PrimitiveType_ARRAYSIZE; i++) {
+      if (PrimitiveType_IsValid(i) && i != PRIMITIVE_TYPE_INVALID) {
+        auto value = static_cast<PrimitiveType>(i);
+        (*map)[LowercasePrimitiveTypeName(value)] = value;
+      }
+    }
+    return map;
+  }();
+  return *name_to_type;
+}
+
+}  // namespace
+
+StatusOr<PrimitiveType> StringToPrimitiveType(absl::string_view name) {
+  const auto& map = GetPrimitiveTypeStringMap();
+  auto found = map.find(string(name));
+  if (found == map.end()) {
+    return InvalidArgument("Invalid element type string: \"%s\".", name);
+  }
+  return found->second;
+}
+
+bool IsPrimitiveTypeName(absl::string_view name) {
+  const auto& map = GetPrimitiveTypeStringMap();
+  auto found = map.find(string(name));
+  return found != map.end();
+}
+
 }  // namespace primitive_util
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/primitive_util.h b/tensorflow/compiler/xla/primitive_util.h
index 889e9a1ceca675689406d255d348c82c398563aa..70603b6fed1be50c427799e6dce7b8bf9631a6f4 100644
--- a/tensorflow/compiler/xla/primitive_util.h
+++ b/tensorflow/compiler/xla/primitive_util.h
@@ -20,6 +20,9 @@ limitations under the License.
 
 #include <type_traits>
 
+#include "absl/strings/string_view.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
@@ -221,6 +224,17 @@ template <>
 struct PrimitiveTypeToNative<C64> {
   using type = complex64;
 };
+
+// Returns the lower-case name of the given primitive type.
+const string& LowercasePrimitiveTypeName(PrimitiveType s);
+
+// Returns the PrimitiveType matching the given name. The given name is expected
+// to be lower-case.
+StatusOr<PrimitiveType> StringToPrimitiveType(absl::string_view name);
+
+// Returns true if the given name is a primitive type string (lower-case).
+bool IsPrimitiveTypeName(absl::string_view name);
+
 }  // namespace primitive_util
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/primitive_util_test.cc b/tensorflow/compiler/xla/primitive_util_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1f765d6da9ef65849fe8ede56ced7597d623cb59
--- /dev/null
+++ b/tensorflow/compiler/xla/primitive_util_test.cc
@@ -0,0 +1,46 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/primitive_util.h"
+
+#include <numeric>
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+
+namespace xla {
+namespace {
+
+TEST(PrimitiveUtilTest, StringToPrimitiveType) {
+  auto expect_ok_and_equal = [](const string& str, PrimitiveType expected) {
+    TF_ASSERT_OK_AND_ASSIGN(PrimitiveType actual,
+                            primitive_util::StringToPrimitiveType(str));
+    EXPECT_EQ(expected, actual);
+  };
+  expect_ok_and_equal("f32", F32);
+  expect_ok_and_equal("tuple", TUPLE);
+  expect_ok_and_equal("pred", PRED);
+  expect_ok_and_equal("s32", S32);
+
+  EXPECT_IS_NOT_OK(primitive_util::StringToPrimitiveType("F32").status());
+  EXPECT_IS_NOT_OK(primitive_util::StringToPrimitiveType("Pred").status());
+  EXPECT_IS_NOT_OK(primitive_util::StringToPrimitiveType("preD").status());
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD
index 63ac1c6649210cbae9e238a74e0a45fb8ee4da63..4a57b1051e081a706267df66e239dc9d330c57ba 100644
--- a/tensorflow/compiler/xla/python/BUILD
+++ b/tensorflow/compiler/xla/python/BUILD
@@ -66,7 +66,10 @@ cc_library(
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/client:xla_computation",
+        "//tensorflow/compiler/xla/client/lib:cholesky",
         "//tensorflow/compiler/xla/client/lib:math",
+        "//tensorflow/compiler/xla/client/lib:qr",
+        "//tensorflow/compiler/xla/client/lib:triangular_solve",
         "//tensorflow/compiler/xla/service:platform_util",
         "//tensorflow/compiler/xla/service:shaped_buffer",
         "//tensorflow/compiler/xrt:xrt_proto",
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.cc b/tensorflow/compiler/xla/python/local_computation_builder.cc
index c0b57e7d26581662476fb64ddaedafe4d55d8619..5d191f5a18ebad8213c29fcc08f317db9626e4ed 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.cc
+++ b/tensorflow/compiler/xla/python/local_computation_builder.cc
@@ -24,7 +24,10 @@ limitations under the License.
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/framework/scope.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/compiler/xla/client/lib/cholesky.h"
 #include "tensorflow/compiler/xla/client/lib/math.h"
+#include "tensorflow/compiler/xla/client/lib/qr.h"
+#include "tensorflow/compiler/xla/client/lib/triangular_solve.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/executable_run_options.h"
@@ -148,14 +151,19 @@ static StatusOr<ScopedShapedBuffer> ToBuffer(LocalClient* client,
 
 /* static */
 StatusOr<LocalShapedBuffer*> LocalShapedBuffer::FromLiteral(
-    const Literal& argument, const absl::optional<Shape>& shape_with_layout) {
+    const Literal& argument, const absl::optional<Shape>& shape_with_layout,
+    int replica_number) {
   LocalClient* client = GetOrCreateLocalClient();
+  TF_ASSIGN_OR_RETURN(int device_ordinal,
+                      client->ReplicaNumberToDeviceOrdinal(replica_number));
+  VLOG(1) << "Creating shaped buffer from literal on replica/ordinal: "
+          << replica_number << "/" << device_ordinal;
   StatusOr<ScopedShapedBuffer> buf = [&] {
     if (shape_with_layout) {
       Literal relaid = argument.Relayout(shape_with_layout.value());
-      return ToBuffer(client, /*device_ordinal=*/0, relaid);
+      return ToBuffer(client, device_ordinal, relaid);
     }
-    return ToBuffer(client, /*device_ordinal=*/0, argument);
+    return ToBuffer(client, device_ordinal, argument);
   }();
   TF_RETURN_IF_ERROR(buf.status());
   return new LocalShapedBuffer(std::move(buf).ValueOrDie());
@@ -312,66 +320,127 @@ CompiledLocalComputation::CompiledLocalComputation(
 StatusOr<LocalShapedBuffer*> CompiledLocalComputation::Execute(
     absl::Span<LocalShapedBuffer* const> argument_handles) {
   LocalClient* client = GetOrCreateLocalClient();
+  StatusOr<int> device_ordinal_status = client->ReplicaNumberToDeviceOrdinal(0);
+  StatusOr<ScopedShapedBuffer> result_buffer_status;
+  if (!device_ordinal_status.ok()) {
+    result_buffer_status = device_ordinal_status.status();
+  } else {
+    const int device_ordinal = device_ordinal_status.ValueOrDie();
+    VLOG(3) << "Replica 0 mapped to device ordinal for execution: "
+            << device_ordinal;
+
+    std::vector<const ShapedBuffer*> argument_buffers;
+    argument_buffers.reserve(argument_handles.size());
+    for (auto& handle : argument_handles) {
+      argument_buffers.push_back(handle->shaped_buffer());
+    }
+
+    DeviceAssignment device_assignment =
+        client->backend()
+            .computation_placer()
+            ->AssignDevices(1, /*computation_count=*/1)
+            .ConsumeValueOrDie();
+
+    ExecutableRunOptions options;
+    options.set_device_ordinal(device_ordinal);
+    options.set_allocator(client->backend().memory_allocator());
+    options.set_intra_op_thread_pool(
+        client->backend().eigen_intra_op_thread_pool_device());
+    options.set_device_assignment(&device_assignment);
+
+    result_buffer_status = executable_->Run(argument_buffers, options);
+  }
+
+  if (!result_buffer_status.ok()) {
+    return InternalError(
+        "Failed running replica 0 (other replicas may have failed as well): "
+        "%s.",
+        result_buffer_status.status().ToString());
+  }
+  return new LocalShapedBuffer(std::move(result_buffer_status).ValueOrDie());
+}
+
+StatusOr<LocalShapedBufferTuple*> CompiledLocalComputation::ExecutePerReplica(
+    absl::Span<const std::vector<LocalShapedBuffer*>> argument_handles) {
+  LocalClient* client = GetOrCreateLocalClient();
+  const int num_replicas = GetReplicaCount();
 
-  VLOG(1) << "Execution requested with " << GetReplicaCount() << " replicas.";
+  if (argument_handles.size() != num_replicas) {
+    return InvalidArgument(
+        "Attempted to execute with %d replicas when replica count is %d",
+        argument_handles.size(), num_replicas);
+  }
+
+  VLOG(1) << "Executing with " << num_replicas << " replicas.";
 
   // Each replica populates a StatusOr result, but only the output value of
   // replica zero is returned.
-  std::vector<StatusOr<ScopedShapedBuffer>> results(GetReplicaCount());
-  {
+  std::vector<StatusOr<ScopedShapedBuffer>> results(num_replicas);
+  auto execute = [this, client, num_replicas, &argument_handles,
+                  &results](int replica) {
+    StatusOr<int> device_ordinal_status =
+        client->ReplicaNumberToDeviceOrdinal(replica);
+    if (!device_ordinal_status.ok()) {
+      results[replica] = device_ordinal_status.status();
+      return;
+    }
+    const int device_ordinal = device_ordinal_status.ValueOrDie();
+    VLOG(3) << "Replica " << replica
+            << " mapped to device ordinal for execution: " << device_ordinal;
+
+    std::vector<const ShapedBuffer*> argument_buffers;
+    argument_buffers.reserve(argument_handles[replica].size());
+    for (auto& handle : argument_handles[replica]) {
+      argument_buffers.push_back(handle->shaped_buffer());
+    }
+
+    DeviceAssignment device_assignment =
+        client->backend()
+            .computation_placer()
+            ->AssignDevices(num_replicas, /*computation_count=*/1)
+            .ConsumeValueOrDie();
+
+    ExecutableRunOptions options;
+    options.set_device_ordinal(device_ordinal);
+    options.set_allocator(client->backend().memory_allocator());
+    options.set_intra_op_thread_pool(
+        client->backend().eigen_intra_op_thread_pool_device());
+    options.set_device_assignment(&device_assignment);
+    StatusOr<ScopedShapedBuffer> result_buffer_status =
+        executable_->Run(argument_buffers, options);
+
+    results[replica] = std::move(result_buffer_status);
+  };
+
+  if (num_replicas == 1) {
+    // Fast-path if there is only one replica — run the computation on the
+    // current thread.
+    execute(0);
+  } else {
+    // TODO(phawkins): don't recreate the threadpool for each execution.
     tensorflow::thread::ThreadPool pool(tensorflow::Env::Default(), "xlarun",
-                                        GetReplicaCount());
-
-    for (int replica = 0; replica < GetReplicaCount(); ++replica) {
-      pool.Schedule([this, client, replica, &argument_handles, &results] {
-        StatusOr<int> device_ordinal_status =
-            client->ReplicaNumberToDeviceOrdinal(replica);
-        if (!device_ordinal_status.ok()) {
-          results[replica] = device_ordinal_status.status();
-          return;
-        }
-        const int device_ordinal = device_ordinal_status.ValueOrDie();
-        VLOG(3) << "Replica " << replica
-                << " mapped to device ordinal for execution: "
-                << device_ordinal;
-
-        std::vector<const ShapedBuffer*> argument_buffers;
-        argument_buffers.reserve(argument_handles.size());
-        for (auto& handle : argument_handles) {
-          argument_buffers.push_back(handle->shaped_buffer());
-        }
-
-        DeviceAssignment device_assignment =
-            client->backend()
-                .computation_placer()
-                ->AssignDevices(GetReplicaCount(), /*computation_count=*/1)
-                .ConsumeValueOrDie();
-
-        ExecutableRunOptions options;
-        options.set_device_ordinal(device_ordinal);
-        options.set_allocator(client->backend().memory_allocator());
-        options.set_intra_op_thread_pool(
-            client->backend().eigen_intra_op_thread_pool_device());
-        options.set_device_assignment(&device_assignment);
-        StatusOr<ScopedShapedBuffer> result_buffer_status =
-            executable_->Run(argument_buffers, options);
-
-        results[replica] = std::move(result_buffer_status);
-      });
+                                        num_replicas - 1);
+
+    for (int replica = 0; replica < num_replicas - 1; ++replica) {
+      pool.Schedule([&execute, replica] { execute(replica); });
     }
+    execute(num_replicas - 1);
   }
 
-  for (int replica = 0; replica < GetReplicaCount(); ++replica) {
-    const auto& statusor = results[replica];
+  std::vector<LocalShapedBuffer*> wrapped_results(num_replicas);
+  for (int replica = 0; replica < num_replicas; ++replica) {
+    auto& statusor = results[replica];
     if (!statusor.ok()) {
       return InternalError(
           "Failed running replica %d (other replicas may have failed as well): "
           "%s.",
           replica, statusor.status().ToString());
     }
+    wrapped_results[replica] =
+        new LocalShapedBuffer(std::move(statusor).ValueOrDie());
   }
 
-  return new LocalShapedBuffer(std::move(results[0]).ValueOrDie());
+  return new LocalShapedBufferTuple(std::move(wrapped_results));
 }
 
 static StatusOr<Shape> GetReturnValueShape(const XlaComputation& computation) {
@@ -578,6 +647,15 @@ LocalOp LocalComputationBuilder::ConstantLiteral(const Literal& literal) {
   return xla::ConstantLiteral(&builder_, literal);
 }
 
+LocalOp LocalComputationBuilder::Iota(PrimitiveType element_type, int64 size) {
+  return xla::Iota(&builder_, element_type, size);
+}
+
+LocalOp LocalComputationBuilder::BroadcastedIota(const Shape& shape,
+                                                 int64 dimension) {
+  return xla::Iota(&builder_, shape, dimension);
+}
+
 LocalOp LocalComputationBuilder::Broadcast(
     const LocalOp& operand, absl::Span<const int64> broadcast_sizes) {
   return xla::Broadcast(operand.op(), broadcast_sizes);
@@ -714,6 +792,21 @@ LocalOp LocalComputationBuilder::Call(const LocalComputation& local_computation,
   return xla::Call(&builder_, local_computation.computation(), xla_ops);
 }
 
+LocalOp LocalComputationBuilder::CustomCall(
+    const string& call_target_name, absl::Span<const LocalOp> operands,
+    const Shape& shape_with_layout,
+    const std::vector<Shape>& operand_shapes_with_layout,
+    const string& opaque) {
+  std::vector<XlaOp> xla_ops;
+  xla_ops.reserve(operands.size());
+  for (const auto& op : operands) {
+    xla_ops.push_back(op.op());
+  }
+  return xla::CustomCallWithLayout(&builder_, call_target_name, xla_ops,
+                                   shape_with_layout,
+                                   operand_shapes_with_layout, opaque);
+}
+
 LocalOp LocalComputationBuilder::Transpose(
     const LocalOp& operand, absl::Span<const int64> permutation) {
   return xla::Transpose(operand.op(), permutation);
@@ -799,6 +892,27 @@ LocalOp LocalComputationBuilder::SortKeyVal(const LocalOp& keys,
   return xla::Sort(keys.op(), {values.op()}, dimension);
 }
 
+LocalOp LocalComputationBuilder::Cholesky(const LocalOp& a) {
+  return xla::Cholesky(a.op());
+}
+
+LocalOp LocalComputationBuilder::QR(const LocalOp& a, bool full_matrices) {
+  XlaBuilder* builder = a.op().builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(auto qr, xla::QRDecomposition(a.op(), full_matrices));
+    return xla::Tuple(builder, {qr.q, qr.r});
+  });
+}
+
+LocalOp LocalComputationBuilder::TriangularSolve(const LocalOp& a,
+                                                 const LocalOp& b,
+                                                 bool left_side, bool lower,
+                                                 bool transpose_a,
+                                                 bool conjugate_a) {
+  return xla::TriangularSolve(a.op(), b.op(), left_side, lower, transpose_a,
+                              conjugate_a);
+}
+
 StatusOr<LocalComputation*> LocalComputationBuilder::BuildConstantSubGraph(
     const LocalOp& operand) {
   TF_ASSIGN_OR_RETURN(XlaComputation computation,
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.h b/tensorflow/compiler/xla/python/local_computation_builder.h
index c9b7ae824a4e5dac3360de0f95859d7c1deb360f..c6e58ac971d93662c41fc7a6001f94fb26d2eff5 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.h
+++ b/tensorflow/compiler/xla/python/local_computation_builder.h
@@ -71,7 +71,8 @@ StatusOr<Literal> TransferFromOutfeedLocalReplica(const Shape& shape,
 class LocalShapedBuffer {
  public:
   static StatusOr<LocalShapedBuffer*> FromLiteral(
-      const Literal& argument, const absl::optional<Shape>& shape_with_layout);
+      const Literal& argument, const absl::optional<Shape>& shape_with_layout,
+      int replica_number);
 
   LocalShapedBuffer(ScopedShapedBuffer shaped_buffer);
   StatusOr<Literal> ToLiteral() const;
@@ -175,6 +176,12 @@ class CompiledLocalComputation {
   StatusOr<LocalShapedBuffer*> Execute(
       absl::Span<LocalShapedBuffer* const> argument_handles);
 
+  // Execute on many replicas. Takes a sequence of argument lists (one argument
+  // list per replica) and returns a tuple of results (one result per replica).
+  // The number of argument lists must be equal to the replica count.
+  StatusOr<LocalShapedBufferTuple*> ExecutePerReplica(
+      absl::Span<const std::vector<LocalShapedBuffer*> > argument_handles);
+
  private:
   std::unique_ptr<LocalExecutable> executable_;
 };
@@ -279,6 +286,10 @@ class LocalComputationBuilder {
 
   LocalOp ConstantLiteral(const Literal& literal);
 
+  LocalOp Iota(PrimitiveType element_type, int64 size);
+
+  LocalOp BroadcastedIota(const Shape& shape, int64 dimension);
+
   LocalOp Broadcast(const LocalOp& operand,
                     absl::Span<const int64> broadcast_sizes);
 
@@ -345,6 +356,12 @@ class LocalComputationBuilder {
   LocalOp Call(const LocalComputation& local_computation,
                absl::Span<const LocalOp> operands);
 
+  LocalOp CustomCall(const string& call_target_name,
+                     absl::Span<const LocalOp> operands,
+                     const Shape& shape_with_layout,
+                     const std::vector<Shape>& operand_shapes_with_layout,
+                     const string& opaque);
+
   LocalOp Transpose(const LocalOp& operand,
                     absl::Span<const int64> permutation);
 
@@ -387,6 +404,13 @@ class LocalComputationBuilder {
   LocalOp SortKeyVal(const LocalOp& keys, const LocalOp& values,
                      int64 dimension);
 
+  LocalOp QR(const LocalOp& a, bool full_matrices);
+
+  LocalOp Cholesky(const LocalOp& a);
+
+  LocalOp TriangularSolve(const LocalOp& a, const LocalOp& b, bool left_side,
+                          bool lower, bool transpose_a, bool conjugate_a);
+
   StatusOr<LocalComputation*> BuildConstantSubGraph(const LocalOp& operand);
 
 #define _FORWARD(method_name, return_sig, args_sig) \
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.i b/tensorflow/compiler/xla/python/local_computation_builder.i
index 5c2538dcc36d93008382a517fd4dc680caaa4347..11fb00e616ad410fd1e5b0225ca3cd5362fef59b 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.i
+++ b/tensorflow/compiler/xla/python/local_computation_builder.i
@@ -363,6 +363,37 @@ tensorflow::ImportNumpy();
   $1 = temps;
 }
 
+%typemap(in) absl::Span<const std::vector<xla::swig::LocalShapedBuffer*> >
+    (std::vector<std::vector<LocalShapedBuffer*> > temps) {
+  if (!PySequence_Check($input)) {
+    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
+    SWIG_fail;
+  }
+  const int size = PySequence_Size($input);
+  temps.reserve(size);
+  for (int i = 0; i < size; ++i) {
+    PyObject* o = PySequence_GetItem($input, i);
+    std::vector<LocalShapedBuffer*> vec;
+    const int vec_size = PySequence_Size(o);
+    vec.reserve(vec_size);
+    for (int j = 0; j < vec_size; ++j) {
+      PyObject* vec_elt = PySequence_GetItem(o, j);
+      LocalShapedBuffer* lsbp;
+      if ((SWIG_ConvertPtr(vec_elt, (void**) &lsbp, $descriptor(xla::swig::LocalShapedBuffer*),
+                           SWIG_POINTER_EXCEPTION)) == -1) {
+        Py_DECREF(vec_elt);
+        Py_DECREF(o);
+        SWIG_fail;
+      }
+      vec.push_back(lsbp);
+      Py_DECREF(vec_elt);
+    }
+    temps.push_back(vec);
+    Py_DECREF(o);
+  }
+  $1 = temps;
+}
+
 %typemap(in) absl::Span<xla::swig::XrtAllocation* const>
     (std::vector<XrtAllocation*> temps) {
   if (!PySequence_Check($input)) {
@@ -998,6 +1029,7 @@ tensorflow::ImportNumpy();
 %unignore xla::swig::XrtAllocationTuple::size;
 %unignore xla::swig::CompiledLocalComputation;
 %unignore xla::swig::CompiledLocalComputation::Execute;
+%unignore xla::swig::CompiledLocalComputation::ExecutePerReplica;
 %unignore xla::swig::CompiledXrtComputation;
 %unignore xla::swig::CompiledXrtComputation::Execute;
 %unignore xla::swig::LocalComputation;
@@ -1019,6 +1051,8 @@ tensorflow::ImportNumpy();
 %unignore xla::swig::LocalComputationBuilder::Outfeed;
 %unignore xla::swig::LocalComputationBuilder::ConstantLiteral;
 %unignore xla::swig::LocalComputationBuilder::ConstantR0;
+%unignore xla::swig::LocalComputationBuilder::Iota;
+%unignore xla::swig::LocalComputationBuilder::BroadcastedIota;
 %unignore xla::swig::LocalComputationBuilder::Broadcast;
 %unignore xla::swig::LocalComputationBuilder::BroadcastInDim;
 %unignore xla::swig::LocalComputationBuilder::Pad;
@@ -1112,6 +1146,10 @@ tensorflow::ImportNumpy();
 %unignore xla::swig::LocalComputationBuilder::Imag;
 %unignore xla::swig::LocalComputationBuilder::Conj;
 %unignore xla::swig::LocalComputationBuilder::Complex;
+%unignore xla::swig::LocalComputationBuilder::Cholesky;
+%unignore xla::swig::LocalComputationBuilder::QR;
+%unignore xla::swig::LocalComputationBuilder::TriangularSolve;
+%unignore xla::swig::LocalComputationBuilder::CustomCall;
 %unignore xla::swig::DeleteLocalComputation;
 %unignore xla::swig::DestructureLocalShapedBufferTuple;
 %unignore xla::swig::DestructureXrtAllocationTuple;
diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py
index e5fba0d7acb838788f8e7e05a4634e807d9d21d0..4166fa0327eba5edd0dee030e283c86ade627040 100644
--- a/tensorflow/compiler/xla/python/xla_client.py
+++ b/tensorflow/compiler/xla/python/xla_client.py
@@ -222,24 +222,33 @@ class LocalBuffer(object):
   means the referent is in device memory.
   """
 
-  def __init__(self, c_buffer, backend):
+  def __init__(self, c_buffer, backend, replica):
     self.c_buffer = c_buffer
     self._backend = backend
+    self._replica = replica
     if backend.backend_type == BackendType.XRT:
       self._delete = c_api.DeleteXrtAllocation
     else:
       self._delete = c_api.DeleteLocalShapedBuffer
 
   @staticmethod
-  def from_pyval(pyval, backend=XLA_LOCAL_BACKEND):
+  def from_pyval(pyval, replica=0, backend=XLA_LOCAL_BACKEND):
     """Allocate and copy to XLA the given python value."""
     pyval = require_numpy_array_layout(pyval)
+    num_replicas = get_replica_count()
+    if not 0 <= replica < num_replicas:
+      raise ValueError(
+          'Attempt to place buffer on replica {} when the replica count is {}'
+          .format(replica, num_replicas))
     if backend.backend_type == BackendType.XRT:
+      if replica != 0:
+        raise NotImplementedError(
+            'Multi-replica execution is not yet supported via the XRT backend.')
       cbuf = c_api.XrtAllocation.FromLiteral(
           pyval, _maybe_encode_string(backend.target))
     else:
-      cbuf = c_api.LocalShapedBuffer.FromLiteral(pyval, None)
-    return LocalBuffer(cbuf, backend)
+      cbuf = c_api.LocalShapedBuffer.FromLiteral(pyval, None, replica)
+    return LocalBuffer(cbuf, backend, replica)
 
   def to_py(self):
     return self.c_buffer.ToLiteral()
@@ -247,6 +256,9 @@ class LocalBuffer(object):
   def shape(self):
     return _wrap_shape(self.c_buffer.shape())
 
+  def replica(self):
+    return self._replica
+
   def delete(self):
     if self.c_buffer is not None:
       self._delete(self.c_buffer)
@@ -263,7 +275,8 @@ class LocalBuffer(object):
     self.delete()
     size = result.size()
     destructured = tuple(
-        LocalBuffer(result.Release(i), backend=self._backend)
+        LocalBuffer(
+            result.Release(i), replica=self._replica, backend=self._backend)
         for i in xrange(size))
     return destructured
 
@@ -575,23 +588,87 @@ class LocalComputation(object):
         compile_options=compile_options,
         layout_fn=layout_fn)
 
-  def Execute(self, arguments=()):
-    """Execute with LocalBuffer arguments and return value."""
+  def GetReturnValueShape(self):
+    return _wrap_shape(self._c_computation.GetReturnValueShape())
+
+  def Execute(self, arguments=(), check_for_deleted_args=True):
+    """Execute on one replica with LocalBuffer arguments and return value."""
+    if check_for_deleted_args and any(arg.is_deleted() for arg in arguments):
+      raise ValueError('Executing with deleted local buffer argument')
+    raw_args = [arg.c_buffer for arg in arguments]
+    output_buffer = self._c_computation.Execute(raw_args)
+    return LocalBuffer(output_buffer, backend=self._backend, replica=0)
+
+  def ExecutePerReplica(self, arguments=None):
+    """Execute on many replicas with LocalBuffer arguments and return value.
+
+    Args:
+      arguments: A sequence of sequences of LocalBuffers. The i'th inner
+        sequence comprises the arguments for execution on the i'th replica.
+
+    Returns:
+      A list of the computation's outputs on each replica, as a LocalBuffer. If
+      a shallow sequence of arguments was passed in for `arguments`, then the
+      sole, zero'th replica's output is returned instead, as a LocalBuffer.
+    """
     if not self._is_compiled:
       raise ValueError('Cannot execute an uncompiled local XLA computation.')
-    arguments = tuple(arguments)
-    if any(arg.is_deleted() for arg in arguments):
-      raise ValueError('Executing with deleted local buffer argument')
-    return LocalBuffer(
-        self._c_computation.Execute([arg.c_buffer for arg in arguments]),
-        backend=self._backend)
+    if arguments is None:
+      arguments = ((),) * get_replica_count()
+    else:
+      arguments = [list(replica_args) for replica_args in arguments]
+
+    # Check arguments
+    for replica, replica_args in enumerate(arguments):
+      for arg in replica_args:
+        if arg.is_deleted():
+          raise ValueError('Executing with deleted local buffer argument')
+        if arg.replica() != replica:
+          raise ValueError(
+              'Executing on replica {} with argument from replica {}'.format(
+                  replica, arg.replica()))
+
+    # Pull out argument buffer handles
+    stripped_args = [
+        [arg.c_buffer for arg in replica_args] for replica_args in arguments
+    ]
+
+    # Execute
+    if self._backend.backend_type == BackendType.XRT:
+      if len(stripped_args) > 1:
+        raise NotImplementedError(
+            'Multi-replica execution is not yet supported via the XRT backend.')
+      output_buffers = [self._c_computation.Execute(stripped_args[0])]
+    else:
+      output_buffer_tup = self._c_computation.ExecutePerReplica(stripped_args)
+      size = output_buffer_tup.size()
+      output_buffers = [output_buffer_tup.Release(i) for i in xrange(size)]
+
+    # Wrap output handles in LocalBuffer instances
+    return tuple(
+        LocalBuffer(output_buffer, backend=self._backend, replica=replica)
+        for replica, output_buffer in enumerate(output_buffers))
 
   def ExecuteWithPythonValues(self, arguments=()):
-    """Execute with Python values as arguments and return value."""
-    arguments = tuple(
-        LocalBuffer.from_pyval(arg, backend=self._backend) for arg in arguments)
+    """Execute on one replica with Python values as arguments and output."""
+
+    def put(arg):
+      return LocalBuffer.from_pyval(arg, backend=self._backend)
+
+    arguments = [put(arg) for arg in arguments]
     return self.Execute(arguments).to_py()
 
+  def ExecuteWithPythonValuesPerReplica(self, arguments):
+    """Execute on many replicas with Python values as arguments and output."""
+
+    def put(arg, replica):
+      return LocalBuffer.from_pyval(arg, replica, backend=self._backend)
+
+    arguments = [[put(arg, replica)
+                  for arg in replica_args]
+                 for replica, replica_args in enumerate(arguments)]
+    return [out.to_py() for out in self.ExecutePerReplica(arguments)]
+
   def __del__(self):
     self._delete(self._c_computation)
 
@@ -754,6 +831,33 @@ class ComputationBuilder(object):
     return self.ParameterWithShape(
         Shape.from_pyval(value), name=name, parameter_num=parameter_num)
 
+  def Iota(self, dtype, size):
+    """Enqueues an iota constant onto the computation.
+
+    Args:
+      dtype: expected numpy dtype of the output.
+      size: integer, the number of elements in the array.
+
+    Returns:
+      A LocalOp representing the added iota constant.
+    """
+    element_type = DTYPE_TO_XLA_ELEMENT_TYPE[str(np.dtype(dtype))]
+    return self._client.Iota(element_type, size)
+
+  def BroadcastedIota(self, dtype, shape, dimension):
+    """Enqueues a broadcasted iota constant onto the computation.
+
+    Args:
+      dtype: expected numpy dtype of the output.
+      shape: tuple of integers, the expected output shape (dimensions).
+      dimension: positive integer, dimension along which to increment values.
+
+    Returns:
+      A LocalOp representing the added broadcasted iota constant.
+    """
+    xla_shape = Shape.array_shape(dtype, shape)
+    return self._client.BroadcastedIota(xla_shape, dimension)
+
   def Broadcast(self, operand, sizes):
     """Enqueues a broadcast operation onto the computation.
 
@@ -1025,6 +1129,31 @@ class ComputationBuilder(object):
     """
     return self._client.Call(computation_to_apply.computation, operands)
 
+  def CustomCall(self,
+                 call_target_name,
+                 operands,
+                 shape_with_layout,
+                 operand_shapes_with_layout,
+                 opaque=None):
+    """Enqueues a custom call operation onto the computation.
+
+    Args:
+      call_target_name: the name of the function to call.
+      operands: an iterable of LocalOp. The number and types of operands must
+        match the arity of `operand_shapes_with_layout`.
+      shape_with_layout: the shape of the operator's output, with layout.
+      operand_shapes_with_layout: the shapes of `operands`, including the
+        expected layouts.
+      opaque: an opaque string passed to the backend.
+
+    Returns:
+      A LocalOp representing the added custom call op.
+    """
+    opaque = opaque or ''
+    return self._client.CustomCall(call_target_name, operands,
+                                   shape_with_layout,
+                                   operand_shapes_with_layout, opaque)
+
   def Map(self, operands, computation_to_apply, dimensions):
     """Enqueues a map operation onto the computation.
 
@@ -1334,6 +1463,20 @@ class ComputationBuilder(object):
     """Enqueues a key-value sort operation onto the computation."""
     return self._client.SortKeyVal(keys, values, dimension)
 
+  def Cholesky(self, a):
+    """Enqueues a Cholesky decomposition onto the computation."""
+    return self._client.Cholesky(a)
+
+  def QR(self, a, full_matrices=True):
+    """Enqueues a QR decomposition onto the computation."""
+    return self._client.QR(a, full_matrices)
+
+  def TriangularSolve(self, a, b, left_side=False, lower=False,
+                      transpose_a=False, conjugate_a=False):
+    """Enqueues a triangular-solve operation onto the computation."""
+    return self._client.TriangularSolve(
+        a, b, left_side, lower, transpose_a, conjugate_a)
+
 
 def _forward_methods_to_local_builder():
   """Forward remaining ComputationBuilder methods to the C API.
diff --git a/tensorflow/compiler/xla/python/xla_client_test.py b/tensorflow/compiler/xla/python/xla_client_test.py
index 21b5c93b615ec429a5da0b4ffe89e8f75f59ef1b..95c6dc8c4570564e361c27fd2bca5c90eebb4661 100644
--- a/tensorflow/compiler/xla/python/xla_client_test.py
+++ b/tensorflow/compiler/xla/python/xla_client_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
 import itertools
 import threading
 
@@ -51,9 +52,11 @@ class LocalComputationTest(unittest.TestCase):
   def _ExecuteAndCompareExact(self, c, arguments=(), expected=None):
     self._ExecuteAndAssertWith(np.testing.assert_equal, c, arguments, expected)
 
-  def _ExecuteAndCompareClose(self, c, arguments=(), expected=None):
-    self._ExecuteAndAssertWith(np.testing.assert_allclose, c, arguments,
-                               expected)
+  def _ExecuteAndCompareClose(self, c, arguments=(), expected=None, rtol=1e-7,
+                              atol=0):
+    self._ExecuteAndAssertWith(
+        functools.partial(np.testing.assert_allclose, rtol=rtol, atol=atol),
+        c, arguments, expected)
 
 
 def NumpyArrayF32(*args, **kwargs):
@@ -143,6 +146,17 @@ class ComputationsWithConstantsTest(LocalComputationTest):
     c.Pow(c.Constant(NumpyArrayF64([1.5, 2.5, 3.0])), c.ConstantF64Scalar(2.))
     self._ExecuteAndCompareClose(c, expected=[2.25, 6.25, 9.])
 
+  def testIota(self):
+    c = self._NewComputation()
+    c.Iota(np.float32, 10)
+    self._ExecuteAndCompareExact(c, expected=np.arange(10, dtype=np.float32))
+
+  def testBroadcastedIota(self):
+    c = self._NewComputation()
+    c.BroadcastedIota(np.int64, (2, 3), 1)
+    expected = np.array([[0, 1, 2], [0, 1, 2]], dtype=np.int64)
+    self._ExecuteAndCompareExact(c, expected=expected)
+
   def testBooleanAnd(self):
     c = self._NewComputation()
     c.And(
@@ -1057,6 +1071,38 @@ class SingleOpTest(LocalComputationTest):
     self.assertTrue(np.all(lo <= result))
     self.assertTrue(np.all(result < hi))
 
+  def testCholesky(self):
+    l = np.array([[4, 0, 0, 0], [6, 5, 0, 0], [2, 14, 16, 0], [3, 6, 1, 4]],
+                 dtype=np.float32)
+    c = self._NewComputation()
+    c.Cholesky(c.Constant(np.dot(l, l.T)))
+    self._ExecuteAndCompareClose(c, expected=l, rtol=1e-4)
+
+  def testQR(self):
+    a = np.array(
+        [[4, 6, 8, 10], [6, 45, 54, 63], [8, 54, 146, 166], [10, 63, 166, 310]],
+        dtype=np.float32)
+    c = self._NewComputation()
+    c.QR(c.Constant(a), full_matrices=True)
+    q, r = self._Execute(c, ())
+    np.testing.assert_allclose(np.dot(q, r), a, rtol=1e-4)
+
+  def testTriangularSolve(self):
+    a_vals = np.array(
+        [[2, 0, 0, 0], [3, 6, 0, 0], [4, 7, 9, 0], [5, 8, 10, 11]],
+        dtype=np.float32)
+    b_vals = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]],
+                      dtype=np.float32)
+
+    c = self._NewComputation()
+    c.TriangularSolve(c.Constant(a_vals), c.Constant(b_vals), left_side=False,
+                      lower=True, transpose_a=True)
+    self._ExecuteAndCompareClose(c, expected=np.array([
+        [0.5, 0.08333334, 0.04629629, 0.03367003],
+        [2.5, -0.25, -0.1388889, -0.1010101],
+        [4.5, -0.58333331, -0.32407406, -0.23569024],
+    ], dtype=np.float32), rtol=1e-4)
+
   def testIsConstant(self):
     c = self._NewComputation()
     a = c.ConstantS32Scalar(3)
diff --git a/tensorflow/compiler/xla/python_api/xla_shape.py b/tensorflow/compiler/xla/python_api/xla_shape.py
index 95b2bf300ec67e9f034f77450416544cb088ae55..bdcd4abd6cc708795416b15412f37dde10d7fe97 100644
--- a/tensorflow/compiler/xla/python_api/xla_shape.py
+++ b/tensorflow/compiler/xla/python_api/xla_shape.py
@@ -20,6 +20,8 @@ from __future__ import print_function
 
 import numpy as _np  # Avoids becoming a part of public Tensorflow API.
 
+from six.moves import xrange
+
 from tensorflow.compiler.xla import xla_data_pb2
 from tensorflow.compiler.xla.python_api import types
 
diff --git a/tensorflow/compiler/xla/reference_util.cc b/tensorflow/compiler/xla/reference_util.cc
index ceb5e74db7c3b9305e9d77068df9ae0a3690af8a..a27e2005dae3a44f4e49032e70f62d633f64779a 100644
--- a/tensorflow/compiler/xla/reference_util.cc
+++ b/tensorflow/compiler/xla/reference_util.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
-#include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h"
 #include "tensorflow/compiler/xla/service/hlo_evaluator.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/shape_inference.h"
@@ -32,48 +31,19 @@ limitations under the License.
 
 namespace xla {
 
-namespace {
-
-template <typename T>
-std::unique_ptr<Array2D<T>> MatmulArray2DImpl(
-    const Array2D<T>& lhs, const Array2D<T>& rhs,
-    const std::function<void(
-        const void* run_options_ptr, T* out, T* lhs, T* rhs, int64 m, int64 n,
-        int64 k, int32 transpose_lhs, int32 transpose_rhs)>& impl_fn) {
-  CHECK_EQ(lhs.width(), rhs.height());
-  int m = lhs.height();
-  int n = rhs.width();
-  int k = lhs.width();
-  auto result = absl::make_unique<Array2D<T>>(m, n);
-  // Because Eigen is a header-oriented library, make sure that the Eigen code
-  // is the same as the code used by the CPU backend (otherwise the linker will
-  // randomly pick *some* definition).
-  impl_fn(
-      /*run_options_ptr=*/nullptr, result->data(), rhs.data(), lhs.data(), n, m,
-      k,
-      /*transpose_lhs=*/0,
-      /*transpose_rhs=*/0);
-  return result;
-}
-
-}  // namespace
-
 /* static */ std::unique_ptr<Array2D<Eigen::half>> ReferenceUtil::MatmulArray2D(
     const Array2D<Eigen::half>& lhs, const Array2D<Eigen::half>& rhs) {
-  return MatmulArray2DImpl<Eigen::half>(
-      lhs, rhs, __xla_cpu_runtime_EigenSingleThreadedMatMulF16);
+  return HloEvaluator::MatmulArray2D(lhs, rhs);
 }
 
 /* static */ std::unique_ptr<Array2D<float>> ReferenceUtil::MatmulArray2D(
     const Array2D<float>& lhs, const Array2D<float>& rhs) {
-  return MatmulArray2DImpl<float>(
-      lhs, rhs, __xla_cpu_runtime_EigenSingleThreadedMatMulF32);
+  return HloEvaluator::MatmulArray2D(lhs, rhs);
 }
 
 /* static */ std::unique_ptr<Array2D<double>> ReferenceUtil::MatmulArray2D(
     const Array2D<double>& lhs, const Array2D<double>& rhs) {
-  return MatmulArray2DImpl<double>(
-      lhs, rhs, __xla_cpu_runtime_EigenSingleThreadedMatMulF64);
+  return HloEvaluator::MatmulArray2D(lhs, rhs);
 }
 
 /* static */ std::unique_ptr<Array2D<double>> ReferenceUtil::Array2DF32ToF64(
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 429b4e490cc2f1ab894924e95db3ad7e80342a72..55cadfdec64047a1d8cd4e2cd1d649d4c3f717e2 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -241,6 +241,7 @@ cc_library(
         ":hlo_casting_utils",
         ":hlo_query",
         ":shape_inference",
+        "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
@@ -249,6 +250,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:window_util",
         "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service/cpu:runtime_single_threaded_matmul",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base",
@@ -281,10 +283,12 @@ tf_cc_test(
         "//tensorflow/compiler/xla/service:hlo_element_type_converter",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",  # fixdeps: keep
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
         "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings:str_format",
     ],
 )
 
@@ -1010,6 +1014,7 @@ cc_library(
     srcs = ["name_uniquer.cc"],
     hdrs = ["name_uniquer.h"],
     deps = [
+        "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/container:flat_hash_map",
@@ -1410,6 +1415,7 @@ cc_library(
         "//tensorflow/core:lib",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/memory",
     ],
 )
@@ -1574,6 +1580,7 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:optional",
@@ -1589,7 +1596,10 @@ tf_cc_test(
         ":hlo",
         ":hlo_casting_utils",
         ":hlo_matchers",
+        ":hlo_parser",
         ":hlo_pass",
+        ":pattern_matcher",
+        ":pattern_matcher_gmock",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test",
@@ -1777,6 +1787,7 @@ tf_cc_test(
         ":hlo_cse",
         ":hlo_dce",
         ":hlo_matchers",
+        ":hlo_parser",
         ":hlo_pass",
         ":hlo_pass_pipeline",
         ":tuple_simplifier",
@@ -1905,6 +1916,41 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "dynamic_dimension_inference",
+    srcs = ["dynamic_dimension_inference.cc"],
+    hdrs = ["dynamic_dimension_inference.h"],
+    deps = [
+        ":hlo",
+        "//tensorflow/compiler/xla:status",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+tf_cc_test(
+    name = "dynamic_dimension_inference_test",
+    srcs = ["dynamic_dimension_inference_test.cc"],
+    deps = [
+        ":dynamic_dimension_inference",
+        "//tensorflow/compiler/xla:debug_options_flags",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_matchers",
+        "//tensorflow/compiler/xla/service:hlo_runner",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/core:test",
+    ],
+)
+
 tf_cc_test(
     name = "reshape_mover_test",
     srcs = ["reshape_mover_test.cc"],
@@ -2062,7 +2108,8 @@ tf_cc_test(
     srcs = ["hlo_computation_test.cc"],
     deps = [
         ":hlo",
-        ":hlo_matchers",
+        ":pattern_matcher",
+        ":pattern_matcher_gmock",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test",
@@ -2656,7 +2703,6 @@ tf_cc_test(
         ":algebraic_simplifier",
         ":computation_layout",
         ":hlo",
-        ":hlo_matchers",
         ":layout_assignment",
         ":pattern_matcher",
         ":pattern_matcher_gmock",
@@ -2670,6 +2716,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:test_utils",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
         "@com_google_absl//absl/types:span",
@@ -3122,6 +3169,7 @@ cc_library(
     name = "hlo_graph_dumper",
     srcs = [
         "hlo_graph_dumper.cc",
+        "hlo_graph_html_renderer.cc",
     ],
     hdrs = ["hlo_graph_dumper.h"],
     deps = [
@@ -3129,6 +3177,7 @@ cc_library(
         ":hlo_casting_utils",
         ":hlo_execution_profile",
         ":hlo_tfgraph_builder",
+        ":pattern_matcher",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:types",
@@ -3582,7 +3631,6 @@ cc_library(
     srcs = ["hlo_lexer.cc"],
     hdrs = [
         "hlo_lexer.h",
-        "hlo_token.h",
     ],
     deps = [
         "//tensorflow/compiler/xla:shape_util",
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
index a348bcf0a232994a046df51563a9167faac08190..1287dcf546d9fe575dd440d48323ed8efbf1de9d 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
@@ -16,6 +16,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/algebraic_simplifier.h"
 
 #include <algorithm>
+#include <cmath>
+#include <functional>
 #include <iterator>
 #include <memory>
 #include <numeric>
@@ -24,6 +26,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/algorithm/container.h"
+#include "absl/container/inlined_vector.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/str_cat.h"
 #include "absl/types/optional.h"
@@ -68,6 +71,45 @@ bool IsAll(const HloInstruction* op, int8 value) {
   }
 }
 
+// Checks whether `op` is a floating-point constant or broadcast of a constant
+// of the form +/- 2^k for some integer k positive, negative, or zero.  Such
+// values are interesting because multiplying by a power of 2 just moves the
+// exponent.
+bool IsAllFpConstantPowerOf2(const HloInstruction* op) {
+  // Unwrap the broadcast if necessary.
+  const HloInstruction* c;
+  if (!Match(op, m::ConstantEffectiveScalar(&c)) &&
+      !Match(op, m::Broadcast(m::Constant(&c).WithShape(
+                     m::Shape().IsEffectiveScalar())))) {
+    return false;
+  }
+  auto val = [&]() -> absl::optional<double> {
+    switch (c->shape().element_type()) {
+      case BF16:
+        return static_cast<double>(c->literal().GetFirstElement<bfloat16>());
+      case F16:
+        return static_cast<double>(c->literal().GetFirstElement<Eigen::half>());
+      case F32:
+        return c->literal().GetFirstElement<float>();
+      case F64:
+        return c->literal().GetFirstElement<double>();
+      default:
+        // Cowardly refuse to consider complex types.
+        return absl::nullopt;
+    }
+  }();
+  if (!val) {
+    return false;
+  }
+
+  int exp;
+  double mantissa = std::frexp(*val, &exp);
+  // frexp returns a value in the range (-1; -0.5] U [0.5, 1).  A return value
+  // of +/-0.5 therefore indicates that the floating point value is a power of
+  // 2.
+  return mantissa == 0.5 || mantissa == -0.5;
+}
+
 // Returns whether the given transpose produces a result which is bit-wise
 // identical to its operand and thus may be replaced with a bitcast.
 bool TransposeIsBitcast(const HloInstruction* transpose) {
@@ -199,6 +241,13 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
   // more fusion than leaving the nodes as Dot operations.
   StatusOr<bool> HandleDotStrengthReduction(HloInstruction* dot);
 
+  // Removes dimension dim from hlo.
+  HloInstruction* StripDim(HloInstruction* hlo, int64 dim) {
+    CHECK_EQ(hlo->shape().dimensions(dim), 1);
+    return computation_->AddInstruction(HloInstruction::CreateReshape(
+        ShapeUtil::DeleteDimension(dim, hlo->shape()), hlo));
+  }
+
   // Reshapes an instruction to rank 1 if it is not already rank 1.
   HloInstruction* Flatten(HloInstruction* hlo) {
     if (ShapeUtil::Rank(hlo->shape()) == 1) {
@@ -415,6 +464,40 @@ Status AlgebraicSimplifierVisitor::HandleAdd(HloInstruction* add) {
                                           sum_of_constants));
   }
 
+  // A*C + B*C => (A+B)*C
+  //
+  //  - If A, B, and C are integers, do this unconditionally. Proof of
+  //    correctness: https://rise4fun.com/Alive/u9X.
+  //
+  //  - If A, B, and C are floating point, do this if C is a scalar constant or
+  //    broadcast of scalar constant and is equal to +/- 2^k for some (possibly
+  //    negative) integer k.
+  //
+  //    Multiplying by a power of 2 just moves the exponent, so our answer is
+  //    exact modulo rounding of intermediate results so long as
+  //
+  //     - none of the three products has an exponent which underflows (so the
+  //       result is 0 or denormal), and
+  //     - none of the three products overflows to inf.
+  //
+  //    Proof: See algebraic_simplifier_proof_distributive_property.py.
+  //
+  //    We deem these differences in rounding, underflow, and overflow
+  //    acceptable in the ML context.
+  HloInstruction *b, *c;
+  if (((Match(lhs, m::Multiply(m::Op(&a), m::Op(&c))) &&
+        Match(rhs, m::MultiplyAnyOrder(m::Op().Is(c), m::Op(&b)))) ||
+       (Match(lhs, m::Multiply(m::Op(&c), m::Op(&a))) &&
+        Match(rhs, m::MultiplyAnyOrder(m::Op().Is(c), m::Op(&b))))) &&
+      (ShapeUtil::ElementIsIntegral(add->shape()) ||
+       IsAllFpConstantPowerOf2(c))) {
+    return ReplaceWithNewInstruction(
+        add, HloInstruction::CreateBinary(
+                 add->shape(), HloOpcode::kMultiply,
+                 computation_->AddInstruction(HloInstruction::CreateBinary(
+                     add->shape(), HloOpcode::kAdd, a, b)),
+                 c));
+  }
   return Status::OK();
 }
 
@@ -834,21 +917,51 @@ StatusOr<bool> AlgebraicSimplifierVisitor::HandleDotStrengthReduction(
     HloInstruction* dot) {
   HloInstruction *lhs, *rhs;
   CHECK(Match(dot, m::Dot(m::Op(&lhs), m::Op(&rhs))));
-  int64 lhs_collapsing_dim =
-      dot->dot_dimension_numbers().lhs_contracting_dimensions(0);
+
+  const auto kept_dim = [](int64 rank, int64 contracting_dimension,
+                           absl::Span<const int64> batch_dimensions) -> int64 {
+    for (int64 i = 0; i < rank; ++i) {
+      if (i != contracting_dimension &&
+          !absl::c_linear_search(batch_dimensions, i)) {
+        return i;
+      }
+    }
+    return -1;
+  };
+
+  const int64 dot_rank = ShapeUtil::Rank(dot->shape());
+  const int64 rhs_rank = ShapeUtil::Rank(rhs->shape());
+  const int64 lhs_rank = ShapeUtil::Rank(lhs->shape());
+  const auto& dnums = dot->dot_dimension_numbers();
+  if (dnums.rhs_contracting_dimensions_size() > 1) {
+    return false;
+  }
+  if (dot_rank > 2 && (lhs_rank != rhs_rank || lhs_rank != dot_rank)) {
+    return false;
+  }
+  int64 lhs_collapsing_dim = dnums.lhs_contracting_dimensions(0);
+  int64 lhs_kept_dim = kept_dim(lhs_rank, lhs_collapsing_dim,
+                                AsInt64Slice(dnums.lhs_batch_dimensions()));
+  // If there is no non-contracting dimension in rank 2, do not strength reduce.
+  if (lhs_kept_dim == -1 && lhs_rank > 1) {
+    return false;
+  }
   if (lhs->IsRank2Transpose()) {
     lhs = lhs->mutable_operand(0);
-    lhs_collapsing_dim = 1 - lhs_collapsing_dim;
+    std::swap(lhs_collapsing_dim, lhs_kept_dim);
   }
-  const int64 lhs_kept_dim = 1 - lhs_collapsing_dim;
 
-  int64 rhs_collapsing_dim =
-      dot->dot_dimension_numbers().rhs_contracting_dimensions(0);
+  int64 rhs_collapsing_dim = dnums.rhs_contracting_dimensions(0);
+  int64 rhs_kept_dim = kept_dim(rhs_rank, rhs_collapsing_dim,
+                                AsInt64Slice(dnums.rhs_batch_dimensions()));
+  // If there is no non-contracting dimension in rank 2, do not strength reduce.
+  if (rhs_kept_dim == -1 && rhs_rank > 1) {
+    return false;
+  }
   if (rhs->IsRank2Transpose()) {
     rhs = rhs->mutable_operand(0);
-    rhs_collapsing_dim = 1 - rhs_collapsing_dim;
+    std::swap(rhs_collapsing_dim, rhs_kept_dim);
   }
-  const int64 rhs_kept_dim = 1 - rhs_collapsing_dim;
 
   auto as_type = [&](HloInstruction* hlo, const PrimitiveType element_type) {
     if (hlo->shape().element_type() == element_type) {
@@ -871,10 +984,15 @@ StatusOr<bool> AlgebraicSimplifierVisitor::HandleDotStrengthReduction(
     return AddReduce(as_type(hlo, F32), dim);
   };
 
+  auto broadcast = [&](HloInstruction* hlo, const Shape& shape,
+                       absl::Span<const int64> dims) {
+    return computation_->AddInstruction(
+        HloInstruction::CreateBroadcast(shape, hlo, dims));
+  };
+
   auto broadcast_to_dim = [&](HloInstruction* hlo, const Shape& shape,
                               int64 dim) {
-    return computation_->AddInstruction(
-        HloInstruction::CreateBroadcast(shape, hlo, {dim}));
+    return broadcast(hlo, shape, {dim});
   };
 
   auto multiply = [&](HloInstruction* local_lhs, HloInstruction* local_rhs) {
@@ -885,11 +1003,9 @@ StatusOr<bool> AlgebraicSimplifierVisitor::HandleDotStrengthReduction(
   // Strength reduce dot(a[K] , b[K]) =
   //  reshape(result.shape,
   //          reduce_sum(multiply(a, b), {0}))
-  if (ShapeUtil::Rank(rhs->shape()) == 1 &&
-      ShapeUtil::Rank(lhs->shape()) == 1) {
-    TF_RETURN_IF_ERROR(
-        ReplaceInstruction(dot, reshape_if_necessary(add_reduce_in_f32(
-                                    multiply(Flatten(lhs), Flatten(rhs)), 0))));
+  if (rhs_rank == 1 && lhs_rank == 1) {
+    TF_RETURN_IF_ERROR(ReplaceInstruction(
+        dot, reshape_if_necessary(add_reduce_in_f32(multiply(lhs, rhs), 0))));
     return true;
   }
 
@@ -903,8 +1019,7 @@ StatusOr<bool> AlgebraicSimplifierVisitor::HandleDotStrengthReduction(
   // Simplify outer product into multiply with implicit broadcasting.
   //
   // A dot(a[M, 1], b[1, N]) = multiply(a [M,1], b [1, N])
-  if (ShapeUtil::Rank(rhs->shape()) == 2 &&
-      rhs->shape().dimensions(rhs_collapsing_dim) == 1) {
+  if (rhs_rank == 2 && rhs->shape().dimensions(rhs_collapsing_dim) == 1) {
     TF_RETURN_IF_ERROR(ReplaceInstruction(
         dot, multiply(broadcast_to_dim(Flatten(lhs), dot->shape(), 0),
                       broadcast_to_dim(Flatten(rhs), dot->shape(), 1))));
@@ -918,9 +1033,8 @@ StatusOr<bool> AlgebraicSimplifierVisitor::HandleDotStrengthReduction(
   //        {0})
   //      )
   //    )
-  if (ShapeUtil::Rank(lhs->shape()) == 1 ||
-      (ShapeUtil::Rank(lhs->shape()) == 2 &&
-       lhs->shape().dimensions(lhs_kept_dim) == 1)) {
+  if (lhs_rank == 1 ||
+      (lhs_rank == 2 && lhs->shape().dimensions(lhs_kept_dim) == 1)) {
     if (ShapeUtil::Rank(rhs->shape()) == 1) {
       TF_RETURN_IF_ERROR(
           ReplaceInstruction(dot, reshape_if_necessary(add_reduce_in_f32(
@@ -940,9 +1054,8 @@ StatusOr<bool> AlgebraicSimplifierVisitor::HandleDotStrengthReduction(
   //  reshape(result.shape,
   //    reduce_sum(multiply(a, broadcast(reshape([K],b), {1})), {0})
   //  )
-  if (ShapeUtil::Rank(rhs->shape()) == 1 ||
-      (ShapeUtil::Rank(rhs->shape()) == 2 &&
-       rhs->shape().dimensions(rhs_kept_dim) == 1)) {
+  if (rhs_rank == 1 ||
+      (rhs_rank == 2 && rhs->shape().dimensions(rhs_kept_dim) == 1)) {
     TF_RETURN_IF_ERROR(ReplaceInstruction(
         dot, reshape_if_necessary(add_reduce_in_f32(
                  multiply(lhs, broadcast_to_dim(Flatten(rhs), lhs->shape(),
@@ -950,6 +1063,97 @@ StatusOr<bool> AlgebraicSimplifierVisitor::HandleDotStrengthReduction(
                  lhs_collapsing_dim))));
     return true;
   }
+
+  // Only consider kDot with batch dimension.
+  if (dot_rank <= 2) {
+    return false;
+  }
+
+  CHECK_EQ(rhs_rank, lhs_rank);
+  CHECK_EQ(dot_rank, lhs_rank);
+  // If there is more than one non-contracting dimension or the batch dimensions
+  // are not equal, bail out since transposes may be required to do a strength
+  // reduction.
+  if (dnums.rhs_batch_dimensions_size() + 2 != dot_rank ||
+      !absl::c_equal(dnums.lhs_batch_dimensions(),
+                     dnums.rhs_batch_dimensions())) {
+    return false;
+  }
+
+  auto broadcast_dims = [](int64 rank, int64 non_broadcast_dim) {
+    absl::InlinedVector<int64, 8> dims;
+    for (int64 i = 0; i < rank; ++i) {
+      if (i != non_broadcast_dim) {
+        dims.push_back(i);
+      }
+    }
+    return dims;
+  };
+
+  // If the contracting dimension is 1, remove the degnerate dimnesions from the
+  // lhs and rhs, broadcast each to the result shape and multiply.
+  if (lhs->shape().dimensions(lhs_collapsing_dim) == 1 &&
+      (rhs_kept_dim == rhs_rank - 1 ||
+       (rhs_collapsing_dim == rhs_rank - 1 && rhs_kept_dim == rhs_rank - 2))) {
+    CHECK_EQ(rhs->shape().dimensions(rhs_collapsing_dim), 1);
+    const int64 lhs_kept_dim_in_output =
+        lhs_kept_dim > lhs_collapsing_dim ? (lhs_kept_dim - 1) : lhs_kept_dim;
+    absl::InlinedVector<int64, 8> lhs_broadcast_dims;
+    for (const int64 dim : dnums.lhs_batch_dimensions()) {
+      lhs_broadcast_dims.push_back(dim > lhs_collapsing_dim ? (dim - 1) : dim);
+    }
+    absl::InlinedVector<int64, 8> rhs_broadcast_dims = lhs_broadcast_dims;
+    lhs_broadcast_dims.push_back(lhs_kept_dim_in_output);
+    absl::c_sort(lhs_broadcast_dims);
+    rhs_broadcast_dims.push_back(dot_rank - 1);
+    absl::c_sort(rhs_broadcast_dims);
+    TF_RETURN_IF_ERROR(ReplaceInstruction(
+        dot, reshape_if_necessary(
+                 multiply(broadcast(StripDim(lhs, lhs_collapsing_dim),
+                                    dot->shape(), lhs_broadcast_dims),
+                          broadcast(StripDim(rhs, rhs_collapsing_dim),
+                                    dot->shape(), rhs_broadcast_dims)))));
+    return true;
+  }
+
+  // If the lhs and rhs non-contracting dimensions are both one, strip each one,
+  // multiply and then reduce the collapsing dimension
+  if (lhs->shape().dimensions(lhs_kept_dim) == 1 &&
+      rhs->shape().dimensions(rhs_kept_dim) == 1 &&
+      lhs_kept_dim == rhs_kept_dim) {
+    auto new_lhs = StripDim(lhs, lhs_kept_dim);
+    auto new_rhs = StripDim(rhs, rhs_kept_dim);
+    const int64 reduce_dim = rhs_kept_dim < rhs_collapsing_dim
+                                 ? (rhs_collapsing_dim - 1)
+                                 : rhs_collapsing_dim;
+    TF_RETURN_IF_ERROR(
+        ReplaceInstruction(dot, reshape_if_necessary(add_reduce_in_f32(
+                                    multiply(new_lhs, new_rhs), reduce_dim))));
+    return true;
+  }
+
+  // If the lhs  non-contracting dimensions is one, strip the one, brodcast to
+  // the rhs shape, multiply and then reduce the collapsing dimension
+  if (lhs->shape().dimensions(lhs_kept_dim) == 1) {
+    auto new_lhs = broadcast(StripDim(lhs, lhs_kept_dim), rhs->shape(),
+                             broadcast_dims(rhs_rank, rhs_kept_dim));
+    TF_RETURN_IF_ERROR(ReplaceInstruction(
+        dot, reshape_if_necessary(add_reduce_in_f32(multiply(new_lhs, rhs),
+                                                    rhs_collapsing_dim))));
+    return true;
+  }
+
+  // If the rhs  non-contracting dimensions is one, strip the one, brodcast to
+  // the lhs shape, multiply and then reduce the collapsing dimension
+  if (rhs->shape().dimensions(rhs_kept_dim) == 1) {
+    auto new_rhs = broadcast(StripDim(rhs, rhs_kept_dim), lhs->shape(),
+                             broadcast_dims(lhs_rank, lhs_kept_dim));
+    TF_RETURN_IF_ERROR(ReplaceInstruction(
+        dot, reshape_if_necessary(add_reduce_in_f32(multiply(lhs, new_rhs),
+                                                    lhs_collapsing_dim))));
+    return true;
+  }
+
   return false;
 }
 
@@ -1228,25 +1432,31 @@ Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot) {
   HloInstruction *lhs, *rhs;
   CHECK(Match(dot, m::Dot(m::Op(&lhs), m::Op(&rhs))));
 
-  // Only optimize F32 or BF16 dot operations where the dot, rhs and lhs are
-  // rank 2 or below.
-  if ((dot->shape().element_type() != F32 &&
-       dot->shape().element_type() != BF16) ||
-      ShapeUtil::Rank(lhs->shape()) > 2 || ShapeUtil::Rank(rhs->shape()) > 2 ||
-      ShapeUtil::Rank(dot->shape()) > 2) {
-    return Status::OK();
-  }
-
   // Replace a zero element dot with a broadcast of the constant 0.
   if (ShapeUtil::IsZeroElementArray(dot->shape()) ||
       ShapeUtil::IsZeroElementArray(lhs->shape()) ||
       ShapeUtil::IsZeroElementArray(rhs->shape())) {
-    auto zero = computation_->AddInstruction(
-        HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0f)));
+    auto zero = computation_->AddInstruction(HloInstruction::CreateConstant(
+        LiteralUtil::Zero(dot->shape().element_type())));
     return ReplaceWithNewInstruction(
         dot, HloInstruction::CreateBroadcast(dot->shape(), zero, {}));
   }
 
+  // Only optimize F32 or BF16 dot operations where the dot, rhs and lhs are
+  // rank 2 or below.
+  if (dot->shape().element_type() != F32 &&
+      dot->shape().element_type() != BF16) {
+    return Status::OK();
+  }
+  if (ShapeUtil::Rank(lhs->shape()) > 2 || ShapeUtil::Rank(rhs->shape()) > 2 ||
+      ShapeUtil::Rank(dot->shape()) > 2) {
+    if (options_.enable_dot_strength_reduction() &&
+        !options_.is_layout_sensitive()) {
+      TF_RETURN_IF_ERROR(HandleDotStrengthReduction(dot).status());
+    }
+    return Status::OK();
+  }
+
   TF_ASSIGN_OR_RETURN(HloInstruction * dot_of_concat_optimized,
                       OptimizeDotOfConcat(dot));
   if (dot_of_concat_optimized) {
@@ -1952,6 +2162,7 @@ Status AlgebraicSimplifierVisitor::HandleReshape(HloInstruction* reshape) {
         reshape, HloInstruction::CreateReshape(reshape->shape(),
                                                operand->mutable_operand(0)));
   }
+
   if (operand->opcode() == HloOpcode::kRng && operand->user_count() == 1) {
     *operand->mutable_shape() = reshape->shape();
     return ReplaceInstruction(reshape, operand);
@@ -2674,6 +2885,22 @@ Status AlgebraicSimplifierVisitor::HandleSort(HloInstruction* sort) {
   return Status::OK();
 }
 
+namespace {
+bool OnlyPermutesMoreThanOneDegenerateDim(const Shape& shape,
+                                          absl::Span<const int64> perm) {
+  std::vector<int64> new_permutation;
+  int64 degenerate_count = 0;
+  for (int64 i = 0; i < perm.size(); ++i) {
+    if (shape.dimensions(i) != 1) {
+      new_permutation.push_back(perm[i]);
+    } else {
+      ++degenerate_count;
+    }
+  }
+  return degenerate_count > 1 && absl::c_is_sorted(new_permutation);
+}
+}  // namespace
+
 Status AlgebraicSimplifierVisitor::HandleTranspose(HloInstruction* transpose) {
   auto operand = transpose->mutable_operand(0);
   if (std::is_sorted(transpose->dimensions().begin(),
@@ -2690,6 +2917,15 @@ Status AlgebraicSimplifierVisitor::HandleTranspose(HloInstruction* transpose) {
                                            transpose->dimensions())));
   }
 
+  // Replace transpose with a reshape if more than one degenerate method is
+  // permuted.
+  if (OnlyPermutesMoreThanOneDegenerateDim(transpose->shape(),
+                                           transpose->dimensions())) {
+    return ReplaceWithNewInstruction(
+        transpose, HloInstruction::CreateReshape(
+                       transpose->shape(), transpose->mutable_operand(0)));
+  }
+
   if (operand->opcode() == HloOpcode::kRng && operand->user_count() == 1) {
     *operand->mutable_shape() = transpose->shape();
     return ReplaceInstruction(transpose, operand);
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_proof_distributive_property.py b/tensorflow/compiler/xla/service/algebraic_simplifier_proof_distributive_property.py
new file mode 100644
index 0000000000000000000000000000000000000000..5da13da041b4ded813876af7ca379025187545ab
--- /dev/null
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier_proof_distributive_property.py
@@ -0,0 +1,82 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Proof that transforming (A*C)+(B*C) <=> (A+B)*C is "safe" if C=2^k.
+
+Specifically, for all floating-point values A, B, and C, if
+
+ - C is equal to +/- 2^k for some (possibly negative) integer k, and
+ - A, B, C, A*C, B*C, and A+B are not subnormal, zero, or inf,
+
+then there exists a rounding mode rm in [RTZ, RNE] such that
+
+ (A*C) + (B*C) == (A+B) * C  (computed with rounding mode rm).
+
+Informally, this means that the equivalence holds for powers of 2 C, modulo
+flushing to zero or inf, and modulo rounding of intermediate results.
+
+Requires z3 python bindings; try `pip install z3-solver`.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import z3
+
+# We do float16 because it lets the solver run much faster.  These results
+# should generalize to fp32 and fp64, and you can verify this by changing the
+# value of FLOAT_TY (and then waiting a while).
+FLOAT_TY = z3.Float16
+
+a = z3.FP("a", FLOAT_TY())
+b = z3.FP("b", FLOAT_TY())
+c = z3.FP("c", FLOAT_TY())
+
+s = z3.Solver()
+
+# C must be a power of 2, i.e. significand bits must all be 0.
+s.add(z3.Extract(FLOAT_TY().sbits() - 1, 0, z3.fpToIEEEBV(c)) == 0)
+
+for rm in [z3.RTZ(), z3.RNE()]:
+  z3.set_default_rounding_mode(rm)
+  before = a * c + b * c
+  after = (a + b) * c
+
+  # Check that before == after, allowing that 0 == -0.
+  s.add(
+      z3.Not(
+          z3.Or(
+              before == after,  #
+              z3.And(z3.fpIsZero(before), z3.fpIsZero(after)))))
+
+  for x in [
+      (a * c),
+      (b * c),
+      (a + b),
+  ]:
+    s.add(z3.Not(z3.fpIsSubnormal(x)))
+    s.add(z3.Not(z3.fpIsZero(x)))
+    s.add(z3.Not(z3.fpIsInf(x)))
+
+if s.check() == z3.sat:
+  m = s.model()
+  print("Counterexample found!")
+  print(m)
+  print("a*c:       ", z3.simplify(m[a] * m[c]))
+  print("b*c:       ", z3.simplify(m[b] * m[c]))
+  print("a+b:       ", z3.simplify(m[a] + m[b]))
+  print("a*c + b*c: ", z3.simplify(m[a] * m[c] + m[b] * m[c]))
+  print("(a+b) * c: ", z3.simplify((m[a] + m[b]) * m[c]))
+else:
+  print("Proved!")
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
index 48f689c96a98065498818aa081d4a5a911aea5a6..cfb4c48277605a6f90ef51debac1c3bc26bed070 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
@@ -27,9 +27,11 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_instructions.h"
-#include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_fix.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher_gmock.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
@@ -42,8 +44,7 @@ namespace xla {
 namespace {
 
 using ::testing::ElementsAre;
-
-namespace op = xla::testing::opcode_matchers;
+namespace m = match;
 
 AlgebraicSimplifierOptions::ValidBitcastCallback bitcasting_callback() {
   return [](const Shape&, const Shape&) { return true; };
@@ -79,6 +80,128 @@ TEST_F(AlgebraicSimplifierTest, AddZero) {
   EXPECT_EQ(root, param0);
 }
 
+TEST_F(AlgebraicSimplifierTest, FactorIntegerAddition) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p0 = s32[8] parameter(0)
+      p1 = s32[8] parameter(1)
+      p2 = s32[8] parameter(2)
+      x = s32[8] multiply(p0, p2)
+      y = s32[8] multiply(p1, p2)
+      ROOT sum = s32[8] add(x, y)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
+  EXPECT_THAT(
+      m->entry_computation()->root_instruction(),
+      GmockMatch(m::MultiplyAnyOrder(
+          m::AddAnyOrder(m::Parameter(0), m::Parameter(1)), m::Parameter(2))));
+}
+
+// A*C + B*C => (A+B)*C if C is a floating-point power of 2.
+TEST_F(AlgebraicSimplifierTest, FactorFpAddition) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p0 = f32[] parameter(0)
+      p1 = f32[] parameter(1)
+      c = f32[] constant(0.125)
+      x = f32[] multiply(p0, c)
+      y = f32[] multiply(p1, c)
+      ROOT sum = f32[] add(x, y)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie());
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::MultiplyAnyOrder(
+                  m::AddAnyOrder(m::Parameter(0), m::Parameter(1)),
+                  m::ConstantScalar(0.125))));
+}
+
+// A*C + B*C => (A+B)*C if C is a broadcast of a floating-point power of 2.
+TEST_F(AlgebraicSimplifierTest, FactorFpAdditionWithBroadcast) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p0 = f32[4] parameter(0)
+      p1 = f32[4] parameter(1)
+      c = f32[] constant(0.125)
+      b = f32[4] broadcast(c), dimensions={}
+      x = f32[4] multiply(p0, b)
+      y = f32[4] multiply(p1, b)
+      ROOT sum = f32[4] add(x, y)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie());
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::MultiplyAnyOrder(
+                  m::AddAnyOrder(m::Parameter(0), m::Parameter(1)),
+                  m::Broadcast(m::ConstantScalar(0.125)))));
+}
+
+// A*C + B*C => (A+B)*C simplification should not happen if C is not a
+// floating-point power of 2.
+TEST_F(AlgebraicSimplifierTest, FactorFpAdditionNotPowerOf2) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p0 = f32[] parameter(0)
+      p1 = f32[] parameter(1)
+      c = f32[] constant(0.3)
+      x = f32[] multiply(p0, c)
+      y = f32[] multiply(p1, c)
+      ROOT sum = f32[] add(x, y)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  EXPECT_FALSE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie());
+}
+
+// A*C + B*C => (A+B)*C simplification should not happen if A, B, and C are
+// complex numbers.
+TEST_F(AlgebraicSimplifierTest, FactorFpAdditionComplex) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p0 = c64[8] parameter(0)
+      p1 = c64[8] parameter(1)
+      p2 = c64[8] parameter(2)
+      x = c64[8] multiply(p0, p2)
+      y = c64[8] multiply(p1, p2)
+      ROOT sum = c64[8] add(x, y)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  EXPECT_FALSE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie());
+}
+
+// A*C + B*C => (A+B)*C simplification is OK if A, B, and C are complex.
+TEST_F(AlgebraicSimplifierTest, FactorFpAdditionBfloat16) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p0 = bf16[4] parameter(0)
+      p1 = bf16[4] parameter(1)
+      c = bf16[] constant(0.125)
+      b = bf16[4] broadcast(c), dimensions={}
+      x = bf16[4] multiply(p0, b)
+      y = bf16[4] multiply(p1, b)
+      ROOT sum = bf16[4] add(x, y)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie());
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::MultiplyAnyOrder(
+                  m::AddAnyOrder(m::Parameter(0), m::Parameter(1)),
+                  m::Broadcast(m::ConstantScalar(0.125)))));
+}
+
 // Test that A * 0 is simplified to 0
 TEST_F(AlgebraicSimplifierTest, MulZero) {
   auto m = CreateNewVerifiedModule();
@@ -197,7 +320,7 @@ TEST_F(AlgebraicSimplifierTest, TwoReducesToOne) {
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   HloInstruction* root = m->entry_computation()->root_instruction();
-  EXPECT_THAT(root, op::Reduce(param, zero));
+  EXPECT_THAT(root, GmockMatch(m::Reduce(m::Parameter(0), m::Op().Is(zero))));
   EXPECT_EQ(root->dimensions(), std::vector<int64>({0, 2, 3}));
 }
 
@@ -219,7 +342,7 @@ TEST_F(AlgebraicSimplifierTest, AddConstOnLHS) {
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
-  EXPECT_THAT(root, op::Add(param0, op::Constant()));
+  EXPECT_THAT(root, GmockMatch(m::Add(m::Parameter(0), m::Constant())));
 }
 
 // Test that [(A + C1) + C2] => [A + (C1 + C2)] for constants C1 and C2.
@@ -245,7 +368,9 @@ TEST_F(AlgebraicSimplifierTest, AddReassociateMergeConstants) {
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
-  EXPECT_THAT(root, op::Add(param0, op::Add(constant1, constant2)));
+  EXPECT_THAT(root, GmockMatch(m::Add(
+                        m::Op().Is(param0),
+                        m::Add(m::Op().Is(constant1), m::Op().Is(constant2)))));
 }
 
 TEST_F(AlgebraicSimplifierTest, AddBroadcastZeroR0Operand) {
@@ -303,7 +428,8 @@ TEST_F(AlgebraicSimplifierTest, InlineTrivialMap) {
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
-  EXPECT_THAT(root, op::Add(param0, op::Broadcast(zero)));
+  EXPECT_THAT(root, GmockMatch(m::Add(m::Parameter(0),
+                                      m::Broadcast(m::Op().Is(zero)))));
 }
 
 TEST_F(AlgebraicSimplifierTest, AddBroadcastZeroR1Operand) {
@@ -336,11 +462,11 @@ TEST_F(AlgebraicSimplifierTest, ConstantToBroadcast) {
 
   auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
-  EXPECT_THAT(root, op::Constant());
+  EXPECT_THAT(root, GmockMatch(m::Constant()));
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
-  EXPECT_THAT(root, op::Broadcast(op::Constant()));
+  EXPECT_THAT(root, GmockMatch(m::Broadcast(m::Constant())));
   EXPECT_EQ(3.14f, root->operand(0)->literal().GetFirstElement<float>());
 }
 
@@ -352,11 +478,11 @@ TEST_F(AlgebraicSimplifierTest, ConstantNotToBroadcast) {
 
   auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
-  EXPECT_THAT(root, op::Constant());
+  EXPECT_THAT(root, GmockMatch(m::Constant()));
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_FALSE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
-  EXPECT_THAT(root, op::Constant());
+  EXPECT_THAT(root, GmockMatch(m::Constant()));
 }
 
 TEST_F(AlgebraicSimplifierTest, IotaToBroadcast) {
@@ -367,11 +493,11 @@ TEST_F(AlgebraicSimplifierTest, IotaToBroadcast) {
 
   auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
-  EXPECT_THAT(root, op::Constant());
+  EXPECT_THAT(root, GmockMatch(m::Constant()));
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
-  EXPECT_THAT(root, op::Iota());
+  EXPECT_THAT(root, GmockMatch(m::Iota()));
 }
 
 // Test that A - 0 is simplified to A
@@ -413,7 +539,8 @@ TEST_F(AlgebraicSimplifierTest, SubConstCanonicalization) {
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
-  EXPECT_THAT(root, op::Add(param0, op::Negate(constant)));
+  EXPECT_THAT(root, GmockMatch(m::Add(m::Parameter(0),
+                                      m::Negate(m::Op().Is(constant)))));
 }
 
 // Test that (A/B)/C is simplified to A/(B*C).
@@ -435,13 +562,16 @@ TEST_F(AlgebraicSimplifierTest, LhsDivOfDiv) {
   auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Divide(op::Divide(param0, param1), param2));
+              GmockMatch(m::Divide(m::Divide(m::Parameter(0), m::Parameter(1)),
+                                   m::Parameter(2))));
 
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(),
-              op::Divide(param0, op::Multiply(param1, param2)));
+  EXPECT_THAT(
+      computation->root_instruction(),
+      GmockMatch(m::Divide(m::Parameter(0),
+                           m::Multiply(m::Parameter(1), m::Parameter(2)))));
 }
 
 // Test that A/(B/C) is simplified to (A*C)/B.
@@ -462,14 +592,18 @@ TEST_F(AlgebraicSimplifierTest, RhsDivOfDiv) {
 
   auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(),
-              op::Divide(param0, op::Divide(param1, param2)));
+  EXPECT_THAT(
+      computation->root_instruction(),
+      GmockMatch(m::Divide(m::Parameter(0),
+                           m::Divide(m::Parameter(1), m::Parameter(2)))));
 
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(),
-              op::Divide(op::Multiply(param0, param2), param1));
+  EXPECT_THAT(
+      computation->root_instruction(),
+      GmockMatch(m::Divide(m::Multiply(m::Parameter(0), m::Parameter(2)),
+                           m::Parameter(1))));
 }
 
 // Test that (A/B)/(C/D) is simplified to (A*D)/(B*C).
@@ -496,14 +630,16 @@ TEST_F(AlgebraicSimplifierTest, DivOfDivAndDiv) {
 
   EXPECT_THAT(
       computation->root_instruction(),
-      op::Divide(op::Divide(param0, param1), op::Divide(param2, param3)));
+      GmockMatch(m::Divide(m::Divide(m::Parameter(0), m::Parameter(1)),
+                           m::Divide(m::Parameter(2), m::Parameter(3)))));
 
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(
       computation->root_instruction(),
-      op::Divide(op::Multiply(param0, param3), op::Multiply(param1, param2)));
+      GmockMatch(m::Divide(m::Multiply(m::Parameter(0), m::Parameter(3)),
+                           m::Multiply(m::Parameter(1), m::Parameter(2)))));
 }
 
 // Test that A/exp(B) is simplified to A*exp(-B).
@@ -523,13 +659,14 @@ TEST_F(AlgebraicSimplifierTest, DivOfExp) {
   auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Divide(param0, op::Exp(param1)));
+              GmockMatch(m::Divide(m::Parameter(0), m::Exp(m::Parameter(1)))));
 
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Multiply(param0, op::Exp(op::Negate(param1))));
+              GmockMatch(m::Multiply(m::Parameter(0),
+                                     m::Exp(m::Negate(m::Parameter(1))))));
 }
 
 // Test that A/pow(B,C) is simplified to A*pow(B,-C).
@@ -550,14 +687,18 @@ TEST_F(AlgebraicSimplifierTest, DivOfPower) {
 
   auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(),
-              op::Divide(param0, op::Power(param1, param2)));
+  EXPECT_THAT(
+      computation->root_instruction(),
+      GmockMatch(m::Divide(m::Parameter(0),
+                           m::Power(m::Parameter(1), m::Parameter(2)))));
 
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Multiply(param0, op::Power(param1, op::Negate(param2))));
+              GmockMatch(m::Multiply(
+                  m::Parameter(0),
+                  m::Power(m::Parameter(1), m::Negate(m::Parameter(2))))));
 }
 
 // Test that broadcasting is done on the right step when simplifying A/pow(B,C)
@@ -579,14 +720,18 @@ TEST_F(AlgebraicSimplifierTest, DivOfBroadcastingPower) {
 
   auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(),
-              op::Divide(param0, op::Power(param1, param2)));
+  EXPECT_THAT(
+      computation->root_instruction(),
+      GmockMatch(m::Divide(m::Parameter(0),
+                           m::Power(m::Parameter(1), m::Parameter(2)))));
 
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   ASSERT_THAT(computation->root_instruction(),
-              op::Multiply(param0, op::Power(param1, op::Negate(param2))));
+              GmockMatch(m::Multiply(
+                  m::Parameter(0),
+                  m::Power(m::Parameter(1), m::Negate(m::Parameter(2))))));
 }
 
 // A / Const => A * InvertedConst
@@ -608,7 +753,7 @@ TEST_F(AlgebraicSimplifierTest, DivideByConstant) {
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Multiply(param0, op::Constant()));
+              GmockMatch(m::Multiply(m::Parameter(0), m::Constant())));
 }
 
 // pow(pow(A, X), Y) => pow(A, X*Y)
@@ -630,8 +775,10 @@ TEST_F(AlgebraicSimplifierTest, PowerOfPower) {
   auto computation = m->AddEntryComputation(builder.Build());
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
-  EXPECT_THAT(computation->root_instruction(),
-              op::Power(base, op::Multiply(exp1, exp2)));
+  EXPECT_THAT(
+      computation->root_instruction(),
+      GmockMatch(m::Power(m::Op().Is(base),
+                          m::Multiply(m::Op().Is(exp1), m::Op().Is(exp2)))));
 }
 
 // Don't simplify pow(pow(A, X), Y) => pow(A, X*Y) if X and Y are complex
@@ -794,7 +941,7 @@ TEST_F(AlgebraicSimplifierTest, SelectMakeTuple) {
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
-  EXPECT_THAT(root, op::Add(param1, param2));
+  EXPECT_THAT(root, GmockMatch(m::Add(m::Parameter(1), m::Parameter(2))));
 }
 
 // Test that exp(A)/exp(B) is simplified to exp(A-B)
@@ -815,14 +962,16 @@ TEST_F(AlgebraicSimplifierTest, ExpDiv) {
 
   auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(),
-              op::Divide(op::Exp(param0), op::Exp(param1)));
+  EXPECT_THAT(
+      computation->root_instruction(),
+      GmockMatch(m::Divide(m::Exp(m::Parameter(0)), m::Exp(m::Parameter(1)))));
 
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(),
-              op::Exp(op::Subtract(param0, param1)));
+  EXPECT_THAT(
+      computation->root_instruction(),
+      GmockMatch(m::Exp(m::Subtract(m::Parameter(0), m::Parameter(1)))));
 }
 
 // Test that exp(A)*exp(B) is simplified to exp(A+B)
@@ -844,13 +993,14 @@ TEST_F(AlgebraicSimplifierTest, ExpMul) {
   auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Multiply(op::Exp(param0), op::Exp(param1)));
+              GmockMatch(m::Multiply(m::Exp(m::Parameter(0)),
+                                     m::Exp(m::Parameter(1)))));
 
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Exp(op::Add(param0, param1)));
+              GmockMatch(m::Exp(m::Add(m::Parameter(0), m::Parameter(1)))));
 }
 
 // Test that pow(exp(A), B) is simplified to exp(A*B)
@@ -870,13 +1020,14 @@ TEST_F(AlgebraicSimplifierTest, PowExp) {
   auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Power(op::Exp(param0), param1));
+              GmockMatch(m::Power(m::Exp(m::Parameter(0)), m::Parameter(1))));
 
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(),
-              op::Exp(op::Multiply(param0, param1)));
+  EXPECT_THAT(
+      computation->root_instruction(),
+      GmockMatch(m::Exp(m::Multiply(m::Parameter(0), m::Parameter(1)))));
 }
 
 // Test that ln(pow(A, B)) is simplified to ln(A)*B
@@ -896,13 +1047,14 @@ TEST_F(AlgebraicSimplifierTest, LnPow) {
   auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Log(op::Power(param0, param1)));
+              GmockMatch(m::Log(m::Power(m::Parameter(0), m::Parameter(1)))));
 
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(),
-              op::Multiply(op::Log(param0), param1));
+  EXPECT_THAT(
+      computation->root_instruction(),
+      GmockMatch(m::Multiply(m::Log(m::Parameter(0)), m::Parameter(1))));
 }
 
 // Test that ln(exp(A)) is simplified to A
@@ -919,7 +1071,8 @@ TEST_F(AlgebraicSimplifierTest, LnExp) {
 
   auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Log(op::Exp(param0)));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Log(m::Exp(m::Parameter(0)))));
 
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
@@ -948,12 +1101,14 @@ TEST_F(AlgebraicSimplifierTest, LnExpDiv) {
   auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Log(op::Divide(op::Exp(param0), op::Exp(param1))));
+              GmockMatch(m::Log(m::Divide(m::Exp(m::Parameter(0)),
+                                          m::Exp(m::Parameter(1))))));
 
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Subtract(param0, param1));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Subtract(m::Parameter(0), m::Parameter(1))));
 }
 
 // Test that pow(A, 0) where A is a scalar is simplified to the scalar
@@ -971,13 +1126,14 @@ TEST_F(AlgebraicSimplifierTest, Pow0Scalar) {
 
   auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Power(param0, zero));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Power(m::Parameter(0), m::Op().Is(zero))));
 
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   HloInstruction* root = computation->root_instruction();
-  EXPECT_THAT(root, op::Constant());
+  EXPECT_THAT(root, GmockMatch(m::Constant()));
   EXPECT_EQ(root->literal().GetFirstElement<float>(), 1);
 }
 
@@ -995,13 +1151,14 @@ TEST_F(AlgebraicSimplifierTest, Pow0Vector) {
 
   auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Power(param0, zero));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Power(m::Parameter(0), m::Op().Is(zero))));
 
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   HloInstruction* root = computation->root_instruction();
-  EXPECT_THAT(root, op::Broadcast());
+  EXPECT_THAT(root, GmockMatch(m::Broadcast()));
   EXPECT_TRUE(ShapeUtil::Equal(root->shape(), r1f32))
       << ShapeUtil::HumanString(root->shape());
   EXPECT_EQ(root->dimensions().size(), 0);
@@ -1023,7 +1180,8 @@ TEST_F(AlgebraicSimplifierTest, Pow1) {
 
   auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Power(param0, one));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Power(m::Parameter(0), m::Op().Is(one))));
 
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
@@ -1045,12 +1203,14 @@ TEST_F(AlgebraicSimplifierTest, Pow2) {
 
   auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Power(param0, two));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Power(m::Parameter(0), m::Op().Is(two))));
 
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Multiply(param0, param0));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Multiply(m::Parameter(0), m::Parameter(0))));
 }
 
 // Test that pow(A, -1) is simplified to 1/A.
@@ -1067,13 +1227,14 @@ TEST_F(AlgebraicSimplifierTest, PowNegative1) {
 
   auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Power(param0, negative_one));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Power(m::Parameter(0), m::Op().Is(negative_one))));
 
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   HloInstruction* root = computation->root_instruction();
-  EXPECT_THAT(root, op::Divide(op::Broadcast(), param0));
+  EXPECT_THAT(root, GmockMatch(m::Divide(m::Broadcast(), m::Parameter(0))));
   EXPECT_EQ(root->operand(0)->opcode(), HloOpcode::kBroadcast);
   EXPECT_EQ(root->operand(0)->operand(0)->literal().GetFirstElement<float>(),
             1);
@@ -1116,10 +1277,10 @@ TEST_F(AlgebraicSimplifierTest, ZeroSizedConvolution) {
   m->AddEntryComputation(builder.Build());
   HloPassFix<AlgebraicSimplifier> simplifier(default_options_);
   EXPECT_THAT(m->entry_computation()->root_instruction(),
-              op::Convolution(lhs, rhs));
+              GmockMatch(m::Convolution(m::Op().Is(lhs), m::Op().Is(rhs))));
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   EXPECT_THAT(m->entry_computation()->root_instruction(),
-              op::Broadcast(op::Constant()));
+              GmockMatch(m::Broadcast(m::Constant())));
 }
 
 TEST_F(AlgebraicSimplifierTest, ZeroSizedReduceWindow) {
@@ -1158,10 +1319,10 @@ TEST_F(AlgebraicSimplifierTest, ZeroSizedReduceWindow) {
   m->AddEntryComputation(builder.Build());
   HloPassFix<AlgebraicSimplifier> simplifier(default_options_);
   EXPECT_THAT(m->entry_computation()->root_instruction(),
-              op::ReduceWindow(param, op::Constant()));
+              GmockMatch(m::ReduceWindow(m::Parameter(0), m::Constant())));
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   EXPECT_THAT(m->entry_computation()->root_instruction(),
-              op::Broadcast(op::Constant()));
+              GmockMatch(m::Broadcast(m::Constant())));
 }
 
 TEST_F(AlgebraicSimplifierTest, ZeroSizedPad) {
@@ -1184,11 +1345,11 @@ TEST_F(AlgebraicSimplifierTest, ZeroSizedPad) {
       padding));
   m->AddEntryComputation(builder.Build());
   EXPECT_THAT(m->entry_computation()->root_instruction(),
-              op::Pad(param, op::Constant()));
+              GmockMatch(m::Pad(m::Parameter(0), m::Constant())));
   HloPassFix<AlgebraicSimplifier> simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   EXPECT_THAT(m->entry_computation()->root_instruction(),
-              op::Broadcast(op::Constant()));
+              GmockMatch(m::Broadcast(m::Constant())));
 }
 
 TEST_F(AlgebraicSimplifierTest, ReshapeBroadcast) {
@@ -1209,7 +1370,7 @@ TEST_F(AlgebraicSimplifierTest, ReshapeBroadcast) {
   m->AddEntryComputation(std::move(computation));
 
   EXPECT_THAT(m->entry_computation()->root_instruction(),
-              op::Reshape(op::Broadcast(op::Reshape(op))));
+              GmockMatch(m::Reshape(m::Broadcast(m::Reshape(m::Op().Is(op))))));
 
   HloPassFix<AlgebraicSimplifier> simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
@@ -1228,7 +1389,8 @@ TEST_F(AlgebraicSimplifierTest, ConvertBetweenSameType) {
 
   auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Convert(input));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Convert(m::Op().Is(input))));
 
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
@@ -1248,7 +1410,8 @@ TEST_F(AlgebraicSimplifierTest, RemoveCopy) {
 
   auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Copy(param0));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Copy(m::Parameter(0))));
 
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
@@ -1269,21 +1432,24 @@ TEST_F(AlgebraicSimplifierTest, CopyEqualsBitcast) {
   *copy->mutable_shape()->mutable_layout() =
       LayoutUtil::MakeLayout({1, 2, 0, 3});
   auto computation = m->AddEntryComputation(builder.Build());
-  EXPECT_THAT(computation->root_instruction(), op::Copy(param));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Copy(m::Parameter(0))));
 
   AlgebraicSimplifierOptions options(non_bitcasting_callback());
   options.set_is_layout_sensitive(true);
   AlgebraicSimplifier simplifier1(options);
   ASSERT_FALSE(simplifier1.Run(m.get()).ValueOrDie());
   // Verify that the copy is not replaced.
-  EXPECT_THAT(computation->root_instruction(), op::Copy(param));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Copy(m::Parameter(0))));
 
   AlgebraicSimplifierOptions options2(bitcasting_callback());
   options2.set_is_layout_sensitive(true);
   AlgebraicSimplifier simplifier2(options2);
   ASSERT_TRUE(simplifier2.Run(m.get()).ValueOrDie());
   // Verify that the copy is replaced.
-  EXPECT_THAT(computation->root_instruction(), op::Bitcast(param));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Bitcast(m::Parameter(0))));
 }
 
 // Test that unary concatenates are removed.
@@ -1298,7 +1464,8 @@ TEST_F(AlgebraicSimplifierTest, RemoveUnaryConcatenate) {
 
   auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Concatenate(param0));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Concatenate(m::Parameter(0))));
 
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
@@ -1327,15 +1494,17 @@ TEST_F(AlgebraicSimplifierTest, RemoveEmptyConcatenateOperands) {
 
   auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(
-      computation->root_instruction(),
-      op::Concatenate(empty_literal, param0, param0, empty_slice, param1));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Concatenate(
+                  m::Op().Is(empty_literal), m::Parameter(0), m::Parameter(0),
+                  m::Op().Is(empty_slice), m::Parameter(1))));
 
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Concatenate(param0, param0, param1));
+              GmockMatch(m::Concatenate(m::Parameter(0), m::Parameter(0),
+                                        m::Parameter(1))));
 }
 
 // Test that reduce of concat is simplified.
@@ -1383,8 +1552,9 @@ TEST_F(AlgebraicSimplifierTest, SimplifyReduceOfConcat) {
 
   EXPECT_THAT(
       computation->root_instruction(),
-      op::Map(op::Map(op::Reduce(param0, zero), op::Reduce(param1, zero)),
-              op::Reduce(param2, zero)));
+      GmockMatch(m::Map(m::Map(m::Reduce(m::Parameter(0), m::Op().Is(zero)),
+                               m::Reduce(m::Parameter(1), m::Op().Is(zero))),
+                        m::Reduce(m::Parameter(2), m::Op().Is(zero)))));
 }
 
 // Test a concatenate with only empty operands is removed.
@@ -1407,7 +1577,8 @@ TEST_F(AlgebraicSimplifierTest, OnlyEmptyConcatenateOperands) {
   auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Concatenate(empty_literal, empty_slice));
+              GmockMatch(m::Concatenate(m::Op().Is(empty_literal),
+                                        m::Op().Is(empty_slice))));
 
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
@@ -1434,7 +1605,8 @@ TEST_F(AlgebraicSimplifierTest, ConcatenateOfBroadcastBecomesPad) {
 
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
-  EXPECT_THAT(computation->root_instruction(), op::Pad(param0, param1));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Pad(m::Parameter(0), m::Parameter(1))));
 }
 
 TEST_F(AlgebraicSimplifierTest, SimplifyConcatenateOfSlices) {
@@ -1495,10 +1667,10 @@ TEST_F(AlgebraicSimplifierTest, SimplifyConcatenateOfSlices) {
 
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
+  auto s = m::Slice(m::Parameter(0));
   EXPECT_THAT(
       computation->root_instruction(),
-      op::Concatenate(op::Slice(param0), op::Slice(param0), op::Slice(param0),
-                      op::Slice(param0), op::Slice(param0), op::Slice(param1)));
+      GmockMatch(m::Concatenate(s, s, s, s, s, m::Slice(m::Parameter(1)))));
   // The operand 3 should be a merge of 'slice3', 'slice4' and 'slice5', so its
   // shape should have dimensions {50, 30}.
   EXPECT_TRUE(
@@ -1524,7 +1696,8 @@ TEST_F(AlgebraicSimplifierTest, CopyWithDifferentLayout) {
   *param0->mutable_shape()->mutable_layout() = LayoutUtil::MakeLayout({0, 1});
   *copy->mutable_shape()->mutable_layout() = LayoutUtil::MakeLayout({1, 0});
 
-  EXPECT_THAT(computation->root_instruction(), op::Copy(param0));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Copy(m::Parameter(0))));
 
   AlgebraicSimplifierOptions options(non_bitcasting_callback());
   options.set_is_layout_sensitive(true);
@@ -1532,7 +1705,8 @@ TEST_F(AlgebraicSimplifierTest, CopyWithDifferentLayout) {
   EXPECT_FALSE(simplifier.Run(m.get()).ValueOrDie());
 
   // Copy has not been removed.
-  EXPECT_THAT(computation->root_instruction(), op::Copy(param0));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Copy(m::Parameter(0))));
 }
 
 // Test that a simplification which preserves layouts is performed if layout
@@ -1552,7 +1726,8 @@ TEST_F(AlgebraicSimplifierTest, CopyWithSameLayout) {
   *param0->mutable_shape()->mutable_layout() = LayoutUtil::MakeLayout({0, 1});
   *copy->mutable_shape()->mutable_layout() = LayoutUtil::MakeLayout({0, 1});
 
-  EXPECT_THAT(computation->root_instruction(), op::Copy(param0));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Copy(m::Parameter(0))));
 
   AlgebraicSimplifierOptions options(non_bitcasting_callback());
   options.set_is_layout_sensitive(true);
@@ -1581,7 +1756,8 @@ TEST_F(AlgebraicSimplifierTest, NoBitcastAdded) {
 
   auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Reshape(param0));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Reshape(m::Parameter(0))));
 
   AlgebraicSimplifierOptions options(non_bitcasting_callback());
   options.set_is_layout_sensitive(true);
@@ -1589,7 +1765,8 @@ TEST_F(AlgebraicSimplifierTest, NoBitcastAdded) {
   EXPECT_FALSE(simplifier.Run(m.get()).ValueOrDie());
 
   // Reshape is not replaced with a bitcast.
-  EXPECT_THAT(computation->root_instruction(), op::Reshape(param0));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Reshape(m::Parameter(0))));
 }
 
 // Test transforming reshapes and transposes of rng.
@@ -1617,9 +1794,9 @@ TEST_F(AlgebraicSimplifierTest, ReshapeOfTransposeOfRngToRng) {
       (AlgebraicSimplifierOptions(bitcasting_callback())));
   EXPECT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  // Verify that that reshape(transpose(rng)) is replace by a single rng of the
+  // Verify that reshape(transpose(rng)) is replace by a single rng of the
   // same shape as the reshape.
-  EXPECT_THAT(computation->root_instruction(), op::Rng());
+  EXPECT_THAT(computation->root_instruction(), GmockMatch(m::Rng()));
   EXPECT_TRUE(ShapeUtil::Equal(computation->root_instruction()->shape(),
                                reshape_shape));
 }
@@ -1661,8 +1838,9 @@ TEST_F(AlgebraicSimplifierTest, ReshapeReplacedWithBitcast) {
   auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Tuple(transformable_reshape, dimensions_wrong_reshape,
-                        layout_wrong_reshape));
+              GmockMatch(m::Tuple(m::Op().Is(transformable_reshape),
+                                  m::Op().Is(dimensions_wrong_reshape),
+                                  m::Op().Is(layout_wrong_reshape))));
 
   AlgebraicSimplifierOptions options(bitcasting_callback());
   options.set_is_layout_sensitive(true);
@@ -1672,7 +1850,8 @@ TEST_F(AlgebraicSimplifierTest, ReshapeReplacedWithBitcast) {
   // Verify that only the first reshape is replaced.
   EXPECT_THAT(
       computation->root_instruction(),
-      op::Tuple(op::Bitcast(), dimensions_wrong_reshape, layout_wrong_reshape));
+      GmockMatch(m::Tuple(m::Bitcast(), m::Op().Is(dimensions_wrong_reshape),
+                          m::Op().Is(layout_wrong_reshape))));
 }
 
 // Regression test for a bug where if we failed to sink a reshape, we'd set the
@@ -1741,7 +1920,8 @@ TEST_F(AlgebraicSimplifierTest, TransposeEqualsBitcast1) {
 
   auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Transpose(param));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Transpose(m::Parameter(0))));
 
   AlgebraicSimplifierOptions options(bitcasting_callback());
   options.set_is_layout_sensitive(true);
@@ -1749,7 +1929,8 @@ TEST_F(AlgebraicSimplifierTest, TransposeEqualsBitcast1) {
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   // Verify that the reshape is replaced.
-  EXPECT_THAT(computation->root_instruction(), op::Bitcast(param));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Bitcast(m::Parameter(0))));
 }
 
 TEST_F(AlgebraicSimplifierTest, TransposeEqualsBitcast2) {
@@ -1769,7 +1950,8 @@ TEST_F(AlgebraicSimplifierTest, TransposeEqualsBitcast2) {
 
   auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Transpose(param));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Transpose(m::Parameter(0))));
 
   AlgebraicSimplifierOptions options(bitcasting_callback());
   options.set_is_layout_sensitive(true);
@@ -1777,7 +1959,8 @@ TEST_F(AlgebraicSimplifierTest, TransposeEqualsBitcast2) {
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   // Verify that the reshape is replaced.
-  EXPECT_THAT(computation->root_instruction(), op::Bitcast(param));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Bitcast(m::Parameter(0))));
 }
 
 TEST_F(AlgebraicSimplifierTest, ReshapesMerged) {
@@ -1797,12 +1980,13 @@ TEST_F(AlgebraicSimplifierTest, ReshapesMerged) {
   auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Reshape(op::Reshape(param0)));
+              GmockMatch(m::Reshape(m::Reshape(m::Parameter(0)))));
 
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Reshape(param0));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Reshape(m::Parameter(0))));
 }
 
 TEST_F(AlgebraicSimplifierTest, CopiesMerged) {
@@ -1823,14 +2007,16 @@ TEST_F(AlgebraicSimplifierTest, CopiesMerged) {
 
   auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Copy(op::Copy(param0)));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Copy(m::Copy(m::Parameter(0)))));
 
   AlgebraicSimplifierOptions options(non_bitcasting_callback());
   options.set_is_layout_sensitive(true);
   AlgebraicSimplifier simplifier(options);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Copy(param0));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Copy(m::Parameter(0))));
 }
 
 TEST_F(AlgebraicSimplifierTest, TransposesMerged) {
@@ -1849,16 +2035,39 @@ TEST_F(AlgebraicSimplifierTest, TransposesMerged) {
 
   auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Transpose(transpose1));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Transpose(m::Op().Is(transpose1))));
 
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Transpose(param0));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Transpose(m::Parameter(0))));
   EXPECT_EQ(std::vector<int64>({2, 1, 0}),
             computation->root_instruction()->dimensions());
 }
 
+TEST_F(AlgebraicSimplifierTest, TransposeIsReshape) {
+  const char* hlo_string = R"(
+    HloModule module
+
+    ENTRY test {
+      param = f32[10] parameter(0)
+      reshaped = f32[1,1,10] reshape(f32[10] param)
+      transposed = f32[10,1,1] transpose(f32[1,1,10] reshaped), dimensions={2,1,0}
+      ROOT reshaped_again = f32[10] reshape(f32[10,1,1] transposed)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto module,
+      HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest()));
+
+  HloPassFix<AlgebraicSimplifier> simplifier(default_options_);
+  EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, GmockMatch(m::Parameter()));
+}
+
 // Test merging reshape and broadcast.
 TEST_F(AlgebraicSimplifierTest, ReshapeAndBroadcastMerged) {
   auto m = CreateNewVerifiedModule();
@@ -1873,12 +2082,13 @@ TEST_F(AlgebraicSimplifierTest, ReshapeAndBroadcastMerged) {
   auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Broadcast(op::Reshape(param0)));
+              GmockMatch(m::Broadcast(m::Reshape(m::Parameter(0)))));
 
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Broadcast(param0));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Broadcast(m::Parameter(0))));
 }
 
 // Test merging broadcast and reshape.
@@ -1895,12 +2105,13 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshapeMerged) {
   auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Reshape(op::Broadcast(param0)));
+              GmockMatch(m::Reshape(m::Broadcast(m::Parameter(0)))));
 
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Broadcast(param0));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Broadcast(m::Parameter(0))));
 }
 
 TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_1_3x1_3) {
@@ -1916,13 +2127,13 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_1_3x1_3) {
   auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Reshape(op::Broadcast(param)));
+              GmockMatch(m::Reshape(m::Broadcast(m::Parameter(0)))));
 
   AlgebraicSimplifier simplifier(default_options_);
   EXPECT_FALSE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Reshape(op::Broadcast(param)));
+              GmockMatch(m::Reshape(m::Broadcast(m::Parameter(0)))));
 }
 
 TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_4_3x2x4_6x1x1x4) {
@@ -1938,12 +2149,13 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_4_3x2x4_6x1x1x4) {
   HloComputation* computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Reshape(op::Broadcast(param)));
+              GmockMatch(m::Reshape(m::Broadcast(m::Parameter(0)))));
 
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Broadcast(param));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Broadcast(m::Parameter(0))));
   EXPECT_THAT(computation->root_instruction()->dimensions(),
               ::testing::ElementsAre(3));
 }
@@ -1961,12 +2173,13 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_1_3x2x1_6x1x1x1) {
   HloComputation* computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Reshape(op::Broadcast(param)));
+              GmockMatch(m::Reshape(m::Broadcast(m::Parameter(0)))));
 
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Broadcast(param));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Broadcast(m::Parameter(0))));
   const std::vector<int64> broadcast_dims =
       computation->root_instruction()->dimensions();
   EXPECT_EQ(1, broadcast_dims.size());
@@ -1986,13 +2199,13 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_4_3x2x4x2_6x8) {
   HloComputation* computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Reshape(op::Broadcast(param)));
+              GmockMatch(m::Reshape(m::Broadcast(m::Parameter(0)))));
 
   AlgebraicSimplifier simplifier(default_options_);
   EXPECT_FALSE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Reshape(op::Broadcast(param)));
+              GmockMatch(m::Reshape(m::Broadcast(m::Parameter(0)))));
 }
 
 TEST_F(AlgebraicSimplifierTest, IotaAndReshapeMerged) {
@@ -2005,12 +2218,13 @@ TEST_F(AlgebraicSimplifierTest, IotaAndReshapeMerged) {
 
   auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota()));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Reshape(m::Iota())));
 
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Iota());
+  EXPECT_THAT(computation->root_instruction(), GmockMatch(m::Iota()));
   EXPECT_TRUE(
       ShapeUtil::Equal(computation->root_instruction()->shape(), result_shape));
 }
@@ -2024,13 +2238,13 @@ TEST_F(AlgebraicSimplifierTest, IotaEffectiveScalar) {
 
   auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Iota());
+  EXPECT_THAT(computation->root_instruction(), GmockMatch(m::Iota()));
 
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   auto root = computation->root_instruction();
-  EXPECT_THAT(root, op::Broadcast(op::Constant()));
+  EXPECT_THAT(root, GmockMatch(m::Broadcast(m::Constant())));
   EXPECT_EQ(0.0f, root->operand(0)->literal().GetFirstElement<float>());
   EXPECT_TRUE(
       ShapeUtil::Equal(computation->root_instruction()->shape(), result_shape));
@@ -2046,12 +2260,14 @@ TEST_F(AlgebraicSimplifierTest, IotaAndReshape_1_3x2_6) {
 
   auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota()));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Reshape(m::Iota())));
 
   AlgebraicSimplifier simplifier(default_options_);
   EXPECT_FALSE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota()));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Reshape(m::Iota())));
 }
 
 TEST_F(AlgebraicSimplifierTest, IotaAndReshape_4_3x2x4_6x1x1x4) {
@@ -2064,12 +2280,13 @@ TEST_F(AlgebraicSimplifierTest, IotaAndReshape_4_3x2x4_6x1x1x4) {
 
   HloComputation* computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota()));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Reshape(m::Iota())));
 
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Iota());
+  EXPECT_THAT(computation->root_instruction(), GmockMatch(m::Iota()));
   EXPECT_EQ(Cast<HloIotaInstruction>(computation->root_instruction())
                 ->iota_dimension(),
             3);
@@ -2085,12 +2302,13 @@ TEST_F(AlgebraicSimplifierTest, IotaAndReshape_1_3x2x2_6x1x1x2) {
 
   HloComputation* computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota()));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Reshape(m::Iota())));
 
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Iota());
+  EXPECT_THAT(computation->root_instruction(), GmockMatch(m::Iota()));
   const int64 iota_dim =
       Cast<HloIotaInstruction>(computation->root_instruction())
           ->iota_dimension();
@@ -2107,12 +2325,14 @@ TEST_F(AlgebraicSimplifierTest, IotaAndReshape_4_3x2x4x2_6x8) {
 
   HloComputation* computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota()));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Reshape(m::Iota())));
 
   AlgebraicSimplifier simplifier(default_options_);
   EXPECT_FALSE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota()));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Reshape(m::Iota())));
 }
 
 TEST_F(AlgebraicSimplifierTest, RemoveNoopPad) {
@@ -2135,7 +2355,8 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopPad) {
   auto module = CreateNewVerifiedModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Pad(param, zero));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Pad(m::Parameter(0), m::Op().Is(zero))));
 
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
@@ -2179,12 +2400,14 @@ TEST_F(AlgebraicSimplifierTest, NegativePadding) {
     return false;
   };
 
-  EXPECT_THAT(computation->root_instruction(), op::Pad(param, zero));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Pad(m::Parameter(0), m::Op().Is(zero))));
   EXPECT_TRUE(has_negative_padding(pad));
 
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Slice(op::Pad(param, zero)));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Slice(m::Pad(m::Parameter(0), m::Op().Is(zero)))));
   EXPECT_FALSE(
       has_negative_padding(computation->root_instruction()->operand(0)));
 }
@@ -2213,12 +2436,14 @@ TEST_F(AlgebraicSimplifierTest, TrivialInteriorPadding) {
 
   AlgebraicSimplifier simplifier(default_options_);
 
-  ASSERT_THAT(computation->root_instruction(), op::Pad(param, zero));
+  ASSERT_THAT(computation->root_instruction(),
+              GmockMatch(m::Pad(m::Parameter(0), m::Op().Is(zero))));
   ASSERT_TRUE(HasInteriorPadding(pad->padding_config()));
 
   EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Pad(param, zero));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Pad(m::Parameter(0), m::Op().Is(zero))));
   EXPECT_FALSE(
       HasInteriorPadding(computation->root_instruction()->padding_config()));
 }
@@ -2234,7 +2459,8 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopReshape) {
   auto module = CreateNewVerifiedModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Reshape(param));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Reshape(m::Parameter(0))));
 
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
@@ -2256,7 +2482,8 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopSlice) {
   auto module = CreateNewVerifiedModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Slice(param));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Slice(m::Parameter(0))));
 
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
@@ -2284,12 +2511,14 @@ TEST_F(AlgebraicSimplifierTest, SliceOfSliceToSlice) {
   auto module = CreateNewVerifiedModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Slice(op::Slice(param)));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Slice(m::Slice(m::Parameter(0)))));
 
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Slice(param));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Slice(m::Parameter(0))));
   EXPECT_EQ(computation->root_instruction()->slice_starts(0), 3);
   EXPECT_EQ(computation->root_instruction()->slice_starts(1), 5);
   EXPECT_EQ(computation->root_instruction()->slice_limits(0), dim0 - 2);
@@ -2315,12 +2544,14 @@ TEST_F(AlgebraicSimplifierTest, SliceOfReshapeToReshapeOfSlice) {
   auto module = CreateNewVerifiedModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Slice(op::Reshape(param)));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Slice(m::Reshape(m::Parameter(0)))));
 
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Slice(param)));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Reshape(m::Slice(m::Parameter(0)))));
 }
 
 TEST_F(AlgebraicSimplifierTest, SliceOfReshapeUnchanged) {
@@ -2339,7 +2570,8 @@ TEST_F(AlgebraicSimplifierTest, SliceOfReshapeUnchanged) {
   auto module = CreateNewVerifiedModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Slice(op::Reshape(param)));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Slice(m::Reshape(m::Parameter(0)))));
 
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_FALSE(simplifier.Run(module.get()).ValueOrDie());
@@ -2380,10 +2612,10 @@ TEST_F(AlgebraicSimplifierTest, ReplacePermutationSortWithScatter) {
   EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
   auto root = module->entry_computation()->root_instruction();
   EXPECT_THAT(root,
-              op::Tuple(op::Iota(),
-                        op::Scatter(op::Iota(),
-                                    op::Concatenate(op::Iota(), op::Reshape()),
-                                    op::Reshape())));
+              GmockMatch(m::Tuple(
+                  m::Iota(),
+                  m::Scatter(m::Iota(), m::Concatenate(m::Iota(), m::Reshape()),
+                             m::Reshape()))));
 }
 
 TEST_F(AlgebraicSimplifierTest, DontReplacePermutationSortIfNonIntegral) {
@@ -2451,7 +2683,8 @@ TEST_F(AlgebraicSimplifierTest, ReplaceEffectiveScalarKeyValueSortWithTuple) {
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
   EXPECT_THAT(computation->root_instruction(),
-              op::Tuple(keys, values0, values1));
+              GmockMatch(m::Tuple(m::Op().Is(keys), m::Op().Is(values0),
+                                  m::Op().Is(values1))));
 }
 
 // Test that A && True is simplified to A
@@ -2753,7 +2986,8 @@ TEST_P(ConvInputPaddingTest, DoTest) {
     ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
     auto* conv = module->entry_computation()->root_instruction();
     SCOPED_TRACE(module->ToString());
-    ASSERT_THAT(conv, op::Convolution(op::Parameter(), op::Parameter()));
+    ASSERT_THAT(conv,
+                GmockMatch(m::Convolution(m::Parameter(), m::Parameter())));
     EXPECT_EQ(window_util::ToString(conv->window()),
               absl::StrCat("size=3x3 ", testcase.expected_conv_window));
   }
@@ -2870,7 +3104,8 @@ TEST_P(ConvFilterPaddingTest, DoIt) {
     ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
     auto* conv = module->entry_computation()->root_instruction();
     SCOPED_TRACE(module->ToString());
-    ASSERT_THAT(conv, op::Convolution(op::Parameter(), op::Parameter()));
+    ASSERT_THAT(conv,
+                GmockMatch(m::Convolution(m::Parameter(), m::Parameter())));
     EXPECT_EQ(window_util::ToString(conv->window()),
               absl::StrFormat("size=%dx%d %s",
                               conv->operand(1)->shape().dimensions(2),
@@ -3142,10 +3377,9 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToSlice) {
 
   // Running simplification again should not result in any further changes.
   ASSERT_FALSE(simplifier.Run(module.get()).ValueOrDie());
-
-  root = computation->root_instruction();
-  EXPECT_THAT(root, op::Broadcast(scalar_param));
-  EXPECT_TRUE(ShapeUtil::Equal(root->shape(), slice_shape));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Broadcast(m::Op().Is(scalar_param))
+                             .WithShapeEqualTo(&slice_shape)));
 }
 
 // Test that reshape(transpose(broadcast(/*scalar value*/))) simplifies to a
@@ -3176,10 +3410,9 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToTransposeReshape) {
 
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
-
-  root = computation->root_instruction();
-  EXPECT_THAT(root, op::Broadcast(forty_two));
-  EXPECT_TRUE(ShapeUtil::Equal(root->shape(), reshape_shape));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Broadcast(m::Op().Is(forty_two))
+                             .WithShapeEqualTo(&reshape_shape)));
 }
 
 // Test that ReduceWindow(Pad(op, x), y) can simplify to ReduceWindow(op, x).
@@ -3248,7 +3481,8 @@ TEST_F(AlgebraicSimplifierTest, FoldPadIntoReduceWindow) {
 
   // Verify the result
   root = computation->root_instruction();
-  EXPECT_THAT(root, op::ReduceWindow(operand, op::Constant()));
+  EXPECT_THAT(root,
+              GmockMatch(m::ReduceWindow(m::Op().Is(operand), m::Constant())));
   EXPECT_TRUE(ShapeUtil::Equal(root->shape(), reduce_window_shape))
       << ShapeUtil::HumanString(root->shape()) << " vs "
       << ShapeUtil::HumanString(reduce_window_shape);
@@ -3333,7 +3567,8 @@ TEST_F(AlgebraicSimplifierTest, FoldConvertedPadIntoReduceWindow) {
 
   // Verify the result
   root = computation->root_instruction();
-  EXPECT_THAT(root, op::ReduceWindow(op::Convert(parameter), op::Constant()));
+  EXPECT_THAT(root, GmockMatch(m::ReduceWindow(m::Convert(m::Parameter(0)),
+                                               m::Constant())));
   EXPECT_TRUE(ShapeUtil::Equal(root->shape(), reduce_window_shape))
       << ShapeUtil::HumanString(root->shape()) << " vs "
       << ShapeUtil::HumanString(reduce_window_shape);
@@ -3414,7 +3649,7 @@ TEST_F(AlgebraicSimplifierTest, ConstantTupleBecomesTupleOfConstants) {
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   EXPECT_THAT(computation->root_instruction(),
-              op::Tuple(op::Constant(), op::Constant()));
+              GmockMatch(m::Tuple(m::Constant(), m::Constant())));
 }
 
 // A dynamic-slice is trivial if its start indices are all zeroes and the size
@@ -3436,7 +3671,7 @@ TEST_F(AlgebraicSimplifierTest, TrivialDynamicSlice) {
   auto computation = m->AddEntryComputation(builder.Build());
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
-  EXPECT_THAT(computation->root_instruction(), op::Parameter());
+  EXPECT_THAT(computation->root_instruction(), GmockMatch(m::Parameter()));
 }
 
 // A dynamic-update-slice is trivial if its start indices are all zeroes and the
@@ -3470,7 +3705,7 @@ TEST_F(AlgebraicSimplifierTest, TrivialDynamicUpdateSlice) {
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   EXPECT_THAT(computation->root_instruction(),
-              op::DynamicSlice(op::Parameter(), op::Parameter()));
+              GmockMatch(m::DynamicSlice(m::Parameter(), m::Parameter())));
 }
 
 // Test that two consecutive broadcasts can be merged to one.
@@ -3492,7 +3727,7 @@ TEST_F(AlgebraicSimplifierTest, MergeBroadcasts) {
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
-  EXPECT_THAT(root, op::Broadcast(op::Constant()));
+  EXPECT_THAT(root, GmockMatch(m::Broadcast(m::Constant())));
   EXPECT_THAT(root->dimensions(), ElementsAre(2));
 }
 
@@ -3518,7 +3753,7 @@ TEST_F(AlgebraicSimplifierTest, MergeBroadcasts2) {
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
-  EXPECT_THAT(root, op::Broadcast(op::Parameter(0)));
+  EXPECT_THAT(root, GmockMatch(m::Broadcast(m::Parameter(0))));
   EXPECT_THAT(root->dimensions(), ElementsAre(1, 3));
 }
 
@@ -3538,7 +3773,7 @@ TEST_F(AlgebraicSimplifierTest, MergeBroadcastAndIota) {
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
-  EXPECT_THAT(root, op::Iota());
+  EXPECT_THAT(root, GmockMatch(m::Iota()));
   EXPECT_EQ(Cast<HloIotaInstruction>(root)->iota_dimension(), 2);
 }
 
@@ -3559,7 +3794,7 @@ TEST_F(AlgebraicSimplifierTest, MergeBroadcastAndIota2) {
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
-  EXPECT_THAT(root, op::Iota());
+  EXPECT_THAT(root, GmockMatch(m::Iota()));
   EXPECT_EQ(Cast<HloIotaInstruction>(root)->iota_dimension(), 2);
 }
 
@@ -3581,7 +3816,7 @@ TEST_F(AlgebraicSimplifierTest, SliceOfPadLow) {
   AlgebraicSimplifier simplifier(options);
   EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
   auto root = module->entry_computation()->root_instruction();
-  EXPECT_THAT(root, op::Reshape(op::Constant()));
+  EXPECT_THAT(root, GmockMatch(m::Reshape(m::Constant())));
 }
 
 TEST_F(AlgebraicSimplifierTest, SliceOfPadHigh) {
@@ -3602,7 +3837,7 @@ TEST_F(AlgebraicSimplifierTest, SliceOfPadHigh) {
   AlgebraicSimplifier simplifier(options);
   EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
   auto root = module->entry_computation()->root_instruction();
-  EXPECT_THAT(root, op::Reshape(op::Constant()));
+  EXPECT_THAT(root, GmockMatch(m::Reshape(m::Constant())));
 }
 
 TEST_F(AlgebraicSimplifierTest, SliceOfPadMidNonScalar) {
@@ -3642,7 +3877,7 @@ TEST_F(AlgebraicSimplifierTest, SliceOfPadMidScalar) {
   AlgebraicSimplifier simplifier(options);
   EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
   auto root = module->entry_computation()->root_instruction();
-  EXPECT_THAT(root, op::Parameter());
+  EXPECT_THAT(root, GmockMatch(m::Parameter()));
 }
 
 TEST_F(AlgebraicSimplifierTest, SliceOfConcatScalarInput) {
@@ -3664,7 +3899,7 @@ TEST_F(AlgebraicSimplifierTest, SliceOfConcatScalarInput) {
   AlgebraicSimplifier simplifier(options);
   EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
   auto root = module->entry_computation()->root_instruction();
-  EXPECT_THAT(root, op::Parameter(1));
+  EXPECT_THAT(root, GmockMatch(m::Parameter(1)));
 }
 
 TEST_F(AlgebraicSimplifierTest, SliceOfConcatNonScalarInput) {
@@ -3686,7 +3921,7 @@ TEST_F(AlgebraicSimplifierTest, SliceOfConcatNonScalarInput) {
   AlgebraicSimplifier simplifier(options);
   EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
   auto root = module->entry_computation()->root_instruction();
-  EXPECT_THAT(root, op::Slice(op::Parameter(2)));
+  EXPECT_THAT(root, GmockMatch(m::Slice(m::Parameter(2))));
   EXPECT_EQ(root->slice_starts(0), 1);
   EXPECT_EQ(root->slice_limits(0), 2);
 }
@@ -3708,7 +3943,7 @@ TEST_F(AlgebraicSimplifierTest, NegateNegate) {
   AlgebraicSimplifier simplifier(options);
   EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
   auto root = module->entry_computation()->root_instruction();
-  EXPECT_THAT(root, op::Parameter(0));
+  EXPECT_THAT(root, GmockMatch(m::Parameter(0)));
 }
 
 TEST_F(AlgebraicSimplifierTest, NotNot) {
@@ -3728,7 +3963,7 @@ TEST_F(AlgebraicSimplifierTest, NotNot) {
   AlgebraicSimplifier simplifier(options);
   EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
   auto root = module->entry_computation()->root_instruction();
-  EXPECT_THAT(root, op::Parameter(0));
+  EXPECT_THAT(root, GmockMatch(m::Parameter(0)));
 }
 
 struct PadReduceWindowEffectiveBroadcastCase {
@@ -3832,10 +4067,10 @@ TEST_P(PadReduceWindowEffectiveBroadcastTest, DoIt) {
       ShapeUtil::Equal(computation->root_instruction()->shape(), output_shape));
 
   if (param.should_become_broadcast) {
-    EXPECT_THAT(computation->root_instruction(), op::Broadcast(::testing::_));
+    EXPECT_THAT(computation->root_instruction(), GmockMatch(m::Broadcast()));
   } else {
     EXPECT_THAT(computation->root_instruction(),
-                op::ReduceWindow(::testing::_, zero));
+                GmockMatch(m::ReduceWindow(m::Op(), m::Op().Is(zero))));
   }
 }
 
@@ -3869,6 +4104,57 @@ INSTANTIATE_TEST_CASE_P(
     PadReduceWindowEffectiveBroadcastTest,
     ::testing::ValuesIn(PadReduceWindowEffectiveBroadcastCases()));
 
+class BatchDotStrengthReductionTest
+    : public AlgebraicSimplifierTest,
+      public ::testing::WithParamInterface<
+          ::testing::tuple<int, int, int, PrimitiveType>> {};
+TEST_P(BatchDotStrengthReductionTest, BatchDotStrengthReduction) {
+  auto module = CreateNewVerifiedModule();
+  int m, k, n;
+  PrimitiveType element_type;
+  std::tie(m, k, n, element_type) = GetParam();
+
+  Shape dot_shape = ShapeUtil::MakeShape(element_type, {1, 3, 5, m, n});
+  Shape lhs_shape = ShapeUtil::MakeShape(element_type, {1, 3, 5, m, k});
+  Shape rhs_shape = ShapeUtil::MakeShape(element_type, {1, 3, 5, k, n});
+  HloComputation::Builder builder(TestName());
+
+  auto lhs = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, lhs_shape, "lhs"));
+  auto rhs = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, rhs_shape, "rhs"));
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_batch_dimensions(0);
+  dot_dnums.add_lhs_batch_dimensions(1);
+  dot_dnums.add_lhs_batch_dimensions(2);
+  dot_dnums.add_rhs_batch_dimensions(0);
+  dot_dnums.add_rhs_batch_dimensions(1);
+  dot_dnums.add_rhs_batch_dimensions(2);
+  dot_dnums.add_lhs_contracting_dimensions(4);
+  dot_dnums.add_rhs_contracting_dimensions(3);
+  builder.AddInstruction(HloInstruction::CreateDot(
+      dot_shape, lhs, rhs, dot_dnums, DefaultPrecisionConfig(2)));
+  auto computation = module->AddEntryComputation(builder.Build());
+  AlgebraicSimplifier simplifier(default_options_);
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, simplifier.Run(module.get()));
+  const bool dot_should_be_transformed = m == 1 || k == 1 || n == 1;
+  const bool computation_should_be_modified = dot_should_be_transformed;
+  EXPECT_EQ(changed, computation_should_be_modified);
+  bool has_no_dot = true;
+  for (const auto& hlo : computation->instructions()) {
+    if (hlo->opcode() == HloOpcode::kDot) {
+      has_no_dot = false;
+      break;
+    }
+  }
+  EXPECT_EQ(has_no_dot, dot_should_be_transformed);
+}
+
+INSTANTIATE_TEST_CASE_P(
+    BatchDotStrengthReductionTestInstantiation, BatchDotStrengthReductionTest,
+    ::testing::Combine(::testing::Values(1, 2), ::testing::Values(1, 2),
+                       ::testing::Values(1, 2), ::testing::Values(F32, BF16)));
+
 class DotStrengthReductionTest
     : public AlgebraicSimplifierTest,
       public ::testing::WithParamInterface<
@@ -3989,11 +4275,12 @@ TEST_P(DotOfConcatSimplificationTest, ConstantLHS) {
   EXPECT_TRUE(
       ShapeUtil::Equal(computation->root_instruction()->shape(), dot_shape));
 
-  auto match_dot_0 = op::Dot(op::Slice(op::Constant()), op::Parameter(0));
-  auto match_dot_1 = op::Dot(op::Slice(op::Constant()), op::Parameter(1));
-  auto match_dot_2 = op::Dot(op::Slice(op::Constant()), op::Parameter(2));
-  EXPECT_THAT(computation->root_instruction(),
-              op::Add(op::Add(match_dot_0, match_dot_1), match_dot_2));
+  auto match_dot_0 = m::Dot(m::Slice(m::Constant()), m::Parameter(0));
+  auto match_dot_1 = m::Dot(m::Slice(m::Constant()), m::Parameter(1));
+  auto match_dot_2 = m::Dot(m::Slice(m::Constant()), m::Parameter(2));
+  EXPECT_THAT(
+      computation->root_instruction(),
+      GmockMatch(m::Add(m::Add(match_dot_0, match_dot_1), match_dot_2)));
 }
 
 // Test that we transform
@@ -4052,13 +4339,14 @@ TEST_P(DotOfConcatSimplificationTest, ConstantRHS) {
   EXPECT_TRUE(
       ShapeUtil::Equal(computation->root_instruction()->shape(), dot_shape));
 
-  auto match_dot_0 = op::Dot(op::Parameter(0), op::Slice(op::Constant()));
-  auto match_dot_1 = op::Dot(op::Parameter(1), op::Slice(op::Constant()));
-  auto match_dot_2 = op::Dot(op::Parameter(2), op::Slice(op::Constant()));
-  auto match_dot_3 = op::Dot(op::Parameter(3), op::Slice(op::Constant()));
-  EXPECT_THAT(computation->root_instruction(),
-              op::Add(op::Add(op::Add(match_dot_0, match_dot_1), match_dot_2),
-                      match_dot_3));
+  auto match_dot_0 = m::Dot(m::Parameter(0), m::Slice(m::Constant()));
+  auto match_dot_1 = m::Dot(m::Parameter(1), m::Slice(m::Constant()));
+  auto match_dot_2 = m::Dot(m::Parameter(2), m::Slice(m::Constant()));
+  auto match_dot_3 = m::Dot(m::Parameter(3), m::Slice(m::Constant()));
+  EXPECT_THAT(
+      computation->root_instruction(),
+      GmockMatch(m::Add(m::Add(m::Add(match_dot_0, match_dot_1), match_dot_2),
+                        match_dot_3)));
 }
 
 DotOfConcatTestSpec kDotOfConcatTestSpecs[] = {
@@ -4175,8 +4463,8 @@ TEST_P(DotOfGatherSimplificationTest, ConstantRHS) {
               HloOpcode::kDynamicSlice);
   } else {
     EXPECT_THAT(computation->root_instruction(),
-                op::DynamicSlice(op::Dot(op::Constant(), op::Constant()),
-                                 op::Concatenate()));
+                GmockMatch(m::DynamicSlice(m::Dot(m::Constant(), m::Constant()),
+                                           m::Concatenate())));
   }
 }
 
@@ -4245,8 +4533,8 @@ TEST_P(DotOfGatherSimplificationTest, ConstantLHS) {
               HloOpcode::kDynamicSlice);
   } else {
     EXPECT_THAT(computation->root_instruction(),
-                op::DynamicSlice(op::Dot(op::Constant(), op::Constant()),
-                                 op::Concatenate()));
+                GmockMatch(m::DynamicSlice(m::Dot(m::Constant(), m::Constant()),
+                                           m::Concatenate())));
   }
 }
 
diff --git a/tensorflow/compiler/xla/service/ar_crs_combiner.cc b/tensorflow/compiler/xla/service/ar_crs_combiner.cc
index c11452a6fbd49a1fc382d11d24a7d7b7eeab0bcc..47d2c7e35705698d49950c2fa042af1c6327d521 100644
--- a/tensorflow/compiler/xla/service/ar_crs_combiner.cc
+++ b/tensorflow/compiler/xla/service/ar_crs_combiner.cc
@@ -36,31 +36,47 @@ namespace {
 
 namespace m = match;
 
-// If the argument instruction is a CRS in the sequence
-// AR -> Convert -> Add -> CRS
-// then return the AR in the sequence.
-// TODO(b/117554291): Rewrite this to recognize more general patterns,
-// not just the specific one of AR -> Add -> Convert -> CRS.
-absl::optional<HloInstruction*> MatchesArCrsPattern(
-    HloInstruction* instruction) {
-  HloInstruction *ar, *convert, *add, *crs;
-  if (Match(instruction,
-            m::CrossReplicaSum(
-                &crs, m::Add(&add, m::Op(),
-                             m::Convert(&convert,
-                                        m::CrossReplicaSum(&ar, m::Op()))))) &&
-      ar->users().size() == 1 && ar->shape().element_type() == BF16 &&
-      convert->shape().element_type() == F32 && !crs->all_reduce_id()) {
-    return ar;
+// Returns true iff the argument instruction is an AllReduce, followed by a
+// certain sequence of instructions and then a CRS. It must be possible to move
+// the AR past each instruction in the sequence.
+bool MatchesArCrsPattern(HloInstruction* instruction) {
+  auto can_ar_move_past_instruction = [](HloInstruction* instruction) -> bool {
+    if (instruction->user_count() != 1) {
+      return false;
+    }
+    auto opcode = instruction->opcode();
+    return opcode == HloOpcode::kBitcast || opcode == HloOpcode::kTranspose ||
+           opcode == HloOpcode::kReshape || opcode == HloOpcode::kConvert ||
+           opcode == HloOpcode::kAdd || opcode == HloOpcode::kSubtract ||
+           opcode == HloOpcode::kMultiply;
+  };
+
+  auto computation_is_addition = [](HloComputation* c) {
+    return c->instruction_count() == 3 &&
+           Match(c->root_instruction(), m::Add(m::Parameter(), m::Parameter()));
+  };
+
+  if (!instruction->IsCrossModuleAllReduce() ||
+      !computation_is_addition(instruction->called_computations()[0]) ||
+      instruction->user_count() != 1) {
+    return false;
   }
-  return absl::optional<HloInstruction*>();
+  auto next = instruction->users()[0];
+  while (!next->IsCrossReplicaAllReduce()) {
+    if (can_ar_move_past_instruction(next)) {
+      next = next->users()[0];
+    } else {
+      return false;
+    }
+  }
+  return computation_is_addition(next->called_computations()[0]);
 }
 
 }  // namespace
 
 absl::optional<HloInstruction*> ArCrsCombiner::WhileFromBodyParameter(
     HloInstruction* instruction) {
-  CHECK(HloOpcode::kParameter == instruction->opcode());
+  CHECK_EQ(HloOpcode::kParameter, instruction->opcode());
   HloComputation* computation = instruction->parent();
   auto caller_instructions = call_graph_->GetComputationCallers(computation);
   if (caller_instructions.size() == 1) {
@@ -120,7 +136,7 @@ bool ArCrsCombiner::TupleElementsComputeSameValue(
     return false;
   }
   for (auto tuple : tuples) {
-    CHECK(tuple->opcode() == HloOpcode::kTuple);
+    CHECK_EQ(tuple->opcode(), HloOpcode::kTuple);
     if (!InstructionsComputeSameValue(tuple->mutable_operand(i1),
                                       tuple->mutable_operand(i2),
                                       visited_pairs)) {
@@ -160,13 +176,6 @@ bool ArCrsCombiner::InstructionsComputeSameValue(
   if (opcode1 != i2->opcode() || operands1.size() != i2->operands().size()) {
     return false;
   }
-  if (opcode1 == HloOpcode::kConstant || i1->IsCrossModuleAllReduce()) {
-    return i1->Identical(
-        *i2,
-        /*eq_operands=*/std::equal_to<const HloInstruction*>(),
-        /*eq_computations=*/std::equal_to<const HloComputation*>(),
-        /*layout_sensitive=*/false);
-  }
   visited_pairs->emplace(min_uid, max_uid);
   for (int i = 0; i < operands1.size(); ++i) {
     auto operand1 = operands1[i];
@@ -175,22 +184,35 @@ bool ArCrsCombiner::InstructionsComputeSameValue(
       return false;
     }
   }
+  if (opcode1 == HloOpcode::kParameter) {
+    // In the general case, we don't try to prove equality of parameters.
+    // We only try in the context of get-tuple-element
+    // (see TupleElementsComputeSameValue).
+    return false;
+  }
   if (opcode1 == HloOpcode::kGetTupleElement) {
-    if (i1->tuple_index() == i2->tuple_index()) {
-      return true;
-    }
-    return TupleElementsComputeSameValue(operands1[0], i1->tuple_index(),
+    return i1->tuple_index() == i2->tuple_index() ||
+           TupleElementsComputeSameValue(operands1[0], i1->tuple_index(),
                                          i2->tuple_index(), visited_pairs);
   }
-  return true;
+  // Don't check that the operands are identical, because Identical can
+  // return false for instructions that compute the same value but are not
+  // identical, which we don't want. We have checked the arguments with
+  // InstructionsComputeSameValue earlier.
+  auto eq_instructions = [](const HloInstruction* i1,
+                            const HloInstruction* i2) -> bool { return true; };
+  auto eq_computations = [](const HloComputation* a, const HloComputation* b) {
+    return *a == *b;
+  };
+  return i1->Identical(*i2, eq_instructions, eq_computations,
+                       /*layout_sensitive=*/false);
 }
 
 void ArCrsCombiner::GroupAllReducesById(HloModule* module) {
   for (HloComputation* computation : module->MakeNonfusionComputations()) {
     for (HloInstruction* instruction : computation->instructions()) {
-      auto ar = MatchesArCrsPattern(instruction);
-      if (ar) {
-        all_reduce_map_[*((*ar)->all_reduce_id())].push_back(*ar);
+      if (MatchesArCrsPattern(instruction)) {
+        all_reduce_map_[*(instruction->all_reduce_id())].push_back(instruction);
       }
     }
   }
@@ -198,21 +220,23 @@ void ArCrsCombiner::GroupAllReducesById(HloModule* module) {
 
 void ArCrsCombiner::KeepProvablyEqualInstructionGroups() {
   for (auto it : all_reduce_map_) {
+    auto all_reduce_id = it.first;
     auto instruction_vec = it.second;
     CHECK_EQ(instruction_vec.size(), num_spatial_partitions_);
-
     auto instr_0 = instruction_vec[0];
-    auto add_0 = instr_0->users()[0]->users()[0];
-    CHECK(HloOpcode::kAdd == add_0->opcode());
-
     for (int i = 1; i < instruction_vec.size(); ++i) {
       auto instr_i = instruction_vec[i];
-      auto add_i = instr_i->users()[0]->users()[0];
-      CHECK(HloOpcode::kAdd == add_i->opcode());
+      auto next_0 = instr_0->users()[0];
+      auto next_i = instr_i->users()[0];
       absl::flat_hash_map<int64, int64> visited_pairs;
-      if (!InstructionsComputeSameValue(add_0, add_i, &visited_pairs)) {
-        all_reduce_map_.erase(it.first);
-      }
+      do {
+        if (!InstructionsComputeSameValue(next_0, next_i, &visited_pairs)) {
+          all_reduce_map_.erase(all_reduce_id);
+          break;
+        }
+        next_0 = next_0->users()[0];
+        next_i = next_i->users()[0];
+      } while (!next_0->IsCrossReplicaAllReduce());
     }
   }
 }
@@ -221,55 +245,51 @@ StatusOr<bool> ArCrsCombiner::RewriteGraph() {
   if (all_reduce_map_.empty()) {
     return false;
   }
-
-  auto computation_is_addition = [](HloComputation* c) {
-    return c->instruction_count() == 3 &&
-           Match(c->root_instruction(), m::Add(m::Parameter(), m::Parameter()));
-  };
-
   for (auto it : all_reduce_map_) {
     auto instruction_vec = it.second;
     for (auto all_reduce : instruction_vec) {
       auto parent_computation = all_reduce->parent();
-      auto convert = all_reduce->users()[0];
-      auto add = convert->users()[0];
-      auto crs = add->users()[0];
-
-      if (!computation_is_addition(all_reduce->called_computations()[0]) ||
-          !computation_is_addition(crs->called_computations()[0])) {
-        continue;
-      }
-      HloInstruction* other_summand = (add->operands()[0] == convert)
-                                          ? add->operands()[1]
-                                          : add->operands()[0];
-      // Remove the AllReduce and replace the CRS with:
-      // AllReduce - (other_summand * (num_spatial_partitions_ - 1))
-      TF_CHECK_OK(
-          all_reduce->ReplaceAllUsesWith(all_reduce->mutable_operand(0)));
-      crs->set_all_reduce_id(all_reduce->all_reduce_id());
-      auto new_shape = crs->shape();
-      HloInstruction* to_subtract;
-      if (num_spatial_partitions_ == 2) {
-        to_subtract = other_summand;
-      } else {
-        Literal partitions_minus_1_lit = Literal(new_shape);
-        partitions_minus_1_lit.PopulateWithValue<float>(
-            num_spatial_partitions_ - 1);
-        auto partitions_minus_1_const = parent_computation->AddInstruction(
-            HloInstruction::CreateConstant(partitions_minus_1_lit.Clone()));
-        to_subtract =
-            parent_computation->AddInstruction(HloInstruction::CreateBinary(
-                new_shape, HloOpcode::kMultiply, other_summand,
-                partitions_minus_1_const));
-      }
-      auto sub =
-          parent_computation->AddInstruction(HloInstruction::CreateBinary(
-              new_shape, HloOpcode::kSubtract, crs, to_subtract));
-      TF_CHECK_OK(crs->ReplaceAllUsesWith(sub));
+      auto all_reduce_id = all_reduce->all_reduce_id();
+      auto prev = all_reduce->mutable_operand(0);
+      auto next = all_reduce->users()[0];
+      TF_CHECK_OK(all_reduce->ReplaceUseWith(next, prev));
       TF_CHECK_OK(parent_computation->RemoveInstruction(all_reduce));
+      while (!next->IsCrossReplicaAllReduce()) {
+        switch (next->opcode()) {
+          case HloOpcode::kBitcast:
+          case HloOpcode::kTranspose:
+          case HloOpcode::kReshape:
+          case HloOpcode::kConvert:
+          case HloOpcode::kMultiply:
+            break;
+          case HloOpcode::kAdd:
+          case HloOpcode::kSubtract: {
+            auto other_operand = (next->operands()[0] == prev)
+                                     ? next->operands()[1]
+                                     : next->operands()[0];
+            // To move the AR past the addition/subtraction, we need to divide
+            // other_operand by the number of spatial partitions.
+            auto shape = other_operand->shape();
+            Literal lit(shape);
+            lit.PopulateWithValue<float>(num_spatial_partitions_);
+            auto divisor = parent_computation->AddInstruction(
+                HloInstruction::CreateConstant(lit.Clone()));
+            auto division =
+                parent_computation->AddInstruction(HloInstruction::CreateBinary(
+                    shape, HloOpcode::kDivide, other_operand, divisor));
+            TF_CHECK_OK(other_operand->ReplaceUseWith(next, division));
+            break;
+          }
+          default:
+            LOG(FATAL) << "Unexpected instruction: " << next->ToShortString();
+        }
+        prev = next;
+        next = next->users()[0];
+      }
+      // The AllReduce and the CRS are combined to an all-core AllReduce.
+      next->set_all_reduce_id(all_reduce_id);
     }
   }
-
   return true;
 }
 
diff --git a/tensorflow/compiler/xla/service/ar_crs_combiner.h b/tensorflow/compiler/xla/service/ar_crs_combiner.h
index f6a7ef76ec3b76972d1b2c7fb548cecfb9423160..6be7e1002dc6822bf0b563721f00896da171c0a9 100644
--- a/tensorflow/compiler/xla/service/ar_crs_combiner.h
+++ b/tensorflow/compiler/xla/service/ar_crs_combiner.h
@@ -25,9 +25,12 @@ limitations under the License.
 
 namespace xla {
 
-// Combine an AllReduce and a CrossReplicaSum when they are close to each other
-// in the graph, to use an efficient CrossReplicaSum implementation that
-// fully utilizes the interconnect bandwidth.
+// When the HLO graph contains an AllReduce, followed by some simple linear
+// operations, followed by a CrossReplicaSum, we can combine the AR and the CRS,
+// to use an efficient CrossReplicaSum implementation that fully utilizes the
+// interconnect bandwidth.
+// Such sequences appear in spatially partitioned models.
+// This pass must run right after spatial partitioning.
 class ArCrsCombiner : public HloModulePass {
  public:
   ArCrsCombiner(int num_spatial_partitions)
diff --git a/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc b/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc
index 9d5eaf63ccf32cd78b8c11f12f9bccdfd1fec3e0..8a4fd0ee1b25ec82f5dadfc8446af185914d4033 100644
--- a/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc
+++ b/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc
@@ -32,8 +32,8 @@ HloModule foobar
 
 ENTRY %entrycomp (p: f32[2,2]) -> (f32[2,2], f32[2,2]) {
   %p = f32[2,2] parameter(0)
-  %constant.f32.1 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
-  %constant.f32.2 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
+  %constant.f32.1 = f32[2,2] constant({{1, 2}, {3, 4}})
+  %constant.f32.2 = f32[2,2] constant({{1, 2}, {3, 4}})
   ROOT %tuple = (f32[2,2], f32[2,2]) tuple(%constant.f32.1, %constant.f32.2)
 }
 )";
@@ -48,13 +48,50 @@ ENTRY %entrycomp (p: f32[2,2]) -> (f32[2,2], f32[2,2]) {
   EXPECT_TRUE(ArCrsCombiner::TestInstructionsComputeSameValue(i1, i2));
 }
 
+TEST_F(ArCrsCombinerTest, SameValueTestBasecase2) {
+  const char* module_str = R"(
+HloModule foobar
+
+ENTRY %entrycomp (x: f32[]) -> (f32[], f32[]) {
+  %x = f32[] parameter(0)
+  ROOT %tuple = (f32[], f32[]) tuple(%x, %x)
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto root_tuple = module->entry_computation()->root_instruction();
+  auto i1 = root_tuple->operands()[0];
+  auto i2 = root_tuple->operands()[1];
+  EXPECT_TRUE(ArCrsCombiner::TestInstructionsComputeSameValue(i1, i2));
+}
+
+TEST_F(ArCrsCombinerTest, SameValueTestBasecase3) {
+  const char* module_str = R"(
+HloModule foobar
+
+ENTRY %entrycomp (x: f32[], y: f32[]) -> (f32[], f32[]) {
+  %x = f32[] parameter(0)
+  %y = f32[] parameter(1)
+  ROOT %tuple = (f32[], f32[]) tuple(%x, %y)
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto root_tuple = module->entry_computation()->root_instruction();
+  auto i1 = root_tuple->operands()[0];
+  auto i2 = root_tuple->operands()[1];
+  EXPECT_FALSE(ArCrsCombiner::TestInstructionsComputeSameValue(i1, i2));
+}
+
 TEST_F(ArCrsCombinerTest, SameValueTestNumOperands) {
   const char* module_str = R"(
 HloModule foobar
 
 ENTRY %entrycomp (p: f32[2,2]) -> ((f32[2,2]), (f32[2,2], f32[2,2])) {
   %p = f32[2,2] parameter(0)
-  %constant.f32 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
+  %constant.f32 = f32[2,2] constant({{1, 2}, {3, 4}})
   %tuple1 = (f32[2,2]) tuple(%constant.f32)
   %tuple2 = (f32[2,2], f32[2,2]) tuple(%constant.f32, %constant.f32)
   ROOT %tuple = ((f32[2,2]), (f32[2,2], f32[2,2])) tuple(%tuple1, %tuple2)
@@ -69,13 +106,53 @@ ENTRY %entrycomp (p: f32[2,2]) -> ((f32[2,2]), (f32[2,2], f32[2,2])) {
   EXPECT_FALSE(ArCrsCombiner::TestInstructionsComputeSameValue(i1, i2));
 }
 
+TEST_F(ArCrsCombinerTest, SameValueTestSliceIndicesMatch) {
+  const char* module_str = R"(
+HloModule foobar
+
+ENTRY %entrycomp (p: f32[2]) -> (f32[1], f32[1]) {
+  %p = f32[2] parameter(0)
+  %slice.1 = f32[1] slice(f32[2] %p), slice={[0:1]}
+  %slice.2 = f32[1] slice(f32[2] %p), slice={[0:1]}
+  ROOT %tuple = (f32[1], f32[1]) tuple(%slice.1, %slice.2)
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto root_tuple = module->entry_computation()->root_instruction();
+  auto i1 = root_tuple->operands()[0];
+  auto i2 = root_tuple->operands()[1];
+  EXPECT_TRUE(ArCrsCombiner::TestInstructionsComputeSameValue(i1, i2));
+}
+
+TEST_F(ArCrsCombinerTest, SameValueTestSliceIndicesDontMatch) {
+  const char* module_str = R"(
+HloModule foobar
+
+ENTRY %entrycomp (p: f32[2]) -> (f32[1], f32[1]) {
+  %p = f32[2] parameter(0)
+  %slice.1 = f32[1] slice(f32[2] %p), slice={[0:1]}
+  %slice.2 = f32[1] slice(f32[2] %p), slice={[1:2]}
+  ROOT %tuple = (f32[1], f32[1]) tuple(%slice.1, %slice.2)
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto root_tuple = module->entry_computation()->root_instruction();
+  auto i1 = root_tuple->operands()[0];
+  auto i2 = root_tuple->operands()[1];
+  EXPECT_FALSE(ArCrsCombiner::TestInstructionsComputeSameValue(i1, i2));
+}
+
 TEST_F(ArCrsCombinerTest, SameValueTestTupleElementSameIndex) {
   const char* module_str = R"(
 HloModule foobar
 
 ENTRY %entrycomp (p: f32[2,2]) -> (f32[2,2], f32[2,2]) {
   %p = f32[2,2] parameter(0)
-  %constant.f32 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
+  %constant.f32 = f32[2,2] constant({{1, 2}, {3, 4}})
   %tuple.1 = (f32[2,2], f32[2,2]) tuple(%constant.f32, %constant.f32)
   %get-tuple-element.1 = f32[2,2] get-tuple-element(%tuple.1), index=0
   %get-tuple-element.2 = f32[2,2] get-tuple-element(%tuple.1), index=0
@@ -97,7 +174,7 @@ HloModule foobar
 
 ENTRY %entrycomp (p: f32[2,2]) -> (f32[2,2], f32[2,2]) {
   %p = f32[2,2] parameter(0)
-  %constant.f32 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
+  %constant.f32 = f32[2,2] constant({{1, 2}, {3, 4}})
   %tuple.1 = (f32[2,2], f32[2,2]) tuple(%constant.f32, %constant.f32)
   %get-tuple-element.1 = f32[2,2] get-tuple-element(%tuple.1), index=0
   %get-tuple-element.2 = f32[2,2] get-tuple-element(%tuple.1), index=1
@@ -119,8 +196,8 @@ HloModule foobar
 
 ENTRY %entrycomp (p: f32[2,2]) -> (f32[2,2], f32[2,2]) {
   %p = f32[2,2] parameter(0)
-  %constant.f32.1 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
-  %constant.f32.2 = f32[2,2] constant(f32[2,2] {{2, 3}, {4, 5}})
+  %constant.f32.1 = f32[2,2] constant({{1, 2}, {3, 4}})
+  %constant.f32.2 = f32[2,2] constant({{2, 3}, {4, 5}})
   %tuple.1 = (f32[2,2], f32[2,2]) tuple(%constant.f32.1, %constant.f32.2)
   %get-tuple-element.1 = f32[2,2] get-tuple-element(%tuple.1), index=0
   %get-tuple-element.2 = f32[2,2] get-tuple-element(%tuple.1), index=1
@@ -149,7 +226,7 @@ HloModule foobar
 
 %body (x: (f32[2,2], f32[2,2])) -> (f32[2,2], f32[2,2]) {
   %x = (f32[2,2], f32[2,2]) parameter(0)
-  %constant.f32 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
+  %constant.f32 = f32[2,2] constant({{1, 2}, {3, 4}})
   %get-tuple-element.1 = f32[2,2] get-tuple-element(%x), index=0
   %get-tuple-element.2 = f32[2,2] get-tuple-element(%x), index=1
   %add.1 = f32[2,2] add(%get-tuple-element.1, %constant.f32)
@@ -158,7 +235,7 @@ HloModule foobar
 }
 
 ENTRY %WhileLoop () -> (f32[2,2], f32[2,2]) {
-  %constant.f32 = f32[2,2] constant(f32[2,2] {{3, 4}, {5, 6}})
+  %constant.f32 = f32[2,2] constant({{3, 4}, {5, 6}})
   %init.tuple = (f32[2,2], f32[2,2]) tuple(%constant.f32, %constant.f32)
   ROOT %while = (f32[2,2], f32[2,2]) while(%init.tuple), condition=%condition, body=%body
 }
@@ -186,7 +263,7 @@ HloModule foobar
 
 %body (x: (f32[2,2], f32[2,2])) -> (f32[2,2], f32[2,2]) {
   %x = (f32[2,2], f32[2,2]) parameter(0)
-  %constant.f32 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
+  %constant.f32 = f32[2,2] constant({{1, 2}, {3, 4}})
   %get-tuple-element.1 = f32[2,2] get-tuple-element(%x), index=0
   %get-tuple-element.2 = f32[2,2] get-tuple-element(%x), index=1
   %add.1 = f32[2,2] add(%get-tuple-element.1, %constant.f32)
@@ -195,8 +272,8 @@ HloModule foobar
 }
 
 ENTRY %WhileLoop () -> (f32[2,2], f32[2,2]) {
-  %constant.f32.1 = f32[2,2] constant(f32[2,2] {{3, 4}, {5, 6}})
-  %constant.f32.2 = f32[2,2] constant(f32[2,2] {{3, 4}, {7, 8}})
+  %constant.f32.1 = f32[2,2] constant({{3, 4}, {5, 6}})
+  %constant.f32.2 = f32[2,2] constant({{3, 4}, {7, 8}})
   %init.tuple = (f32[2,2], f32[2,2]) tuple(%constant.f32.1, %constant.f32.2)
   ROOT %while = (f32[2,2], f32[2,2]) while(%init.tuple), condition=%condition, body=%body
 }
@@ -224,8 +301,8 @@ HloModule foobar
 
 %body (x: (f32[2,2], f32[2,2])) -> (f32[2,2], f32[2,2]) {
   %x = (f32[2,2], f32[2,2]) parameter(0)
-  %constant.f32.1 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
-  %constant.f32.2 = f32[2,2] constant(f32[2,2] {{3, 4}, {1, 2}})
+  %constant.f32.1 = f32[2,2] constant({{1, 2}, {3, 4}})
+  %constant.f32.2 = f32[2,2] constant({{3, 4}, {1, 2}})
   %get-tuple-element.1 = f32[2,2] get-tuple-element(%x), index=0
   %get-tuple-element.2 = f32[2,2] get-tuple-element(%x), index=1
   %add.1 = f32[2,2] add(%get-tuple-element.1, %constant.f32.1)
@@ -234,7 +311,7 @@ HloModule foobar
 }
 
 ENTRY %WhileLoop () -> (f32[2,2], f32[2,2]) {
-  %constant.f32 = f32[2,2] constant(f32[2,2] {{3, 4}, {5, 6}})
+  %constant.f32 = f32[2,2] constant({{3, 4}, {5, 6}})
   %init.tuple = (f32[2,2], f32[2,2]) tuple(%constant.f32, %constant.f32)
   ROOT %while = (f32[2,2], f32[2,2]) while(%init.tuple), condition=%condition, body=%body
 }
@@ -249,11 +326,27 @@ ENTRY %WhileLoop () -> (f32[2,2], f32[2,2]) {
   EXPECT_FALSE(ArCrsCombiner::TestInstructionsComputeSameValue(i1, i2));
 }
 
-TEST_F(ArCrsCombinerTest, RewritePatternArConvertAddCrs) {
+void CompareReplicaGroups(const std::vector<ReplicaGroup>& groups_before,
+                          const std::vector<ReplicaGroup>& groups_after) {
+  ASSERT_EQ(groups_before.size(), groups_after.size());
+  for (int i = 0; i < groups_before.size(); ++i) {
+    // Somewhat verbose way to compare the replica_ids, because EqualsProto
+    // is not available in the open-source build.
+    auto group_before = groups_before[i];
+    std::vector<int64> ids_before(group_before.replica_ids().begin(),
+                                  group_before.replica_ids().end());
+    auto group_after = groups_after[i];
+    std::vector<int64> ids_after(group_after.replica_ids().begin(),
+                                 group_after.replica_ids().end());
+    EXPECT_EQ(ids_before, ids_after);
+  }
+}
+
+TEST_F(ArCrsCombinerTest, RewriteArConvertCrs) {
   const char* module_str = R"(
 HloModule foobar
 
-%binary_add (a: bf16[], b: bf16[]) -> bf16[] {
+%sum.bf16 (a: bf16[], b: bf16[]) -> bf16[] {
   %a = bf16[] parameter(0)
   %b = bf16[] parameter(1)
   ROOT %add = bf16[] add(%a, %b)
@@ -265,48 +358,257 @@ HloModule foobar
   ROOT %add = f32[] add(%x, %y)
 }
 
-ENTRY %entrycomp (p: f32[2,2]) -> (f32[2,2], f32[2,2]) {
-  %p = f32[2,2] parameter(0)
-  %constant.bf16 = bf16[2,2] constant(bf16[2,2] {{1, 2}, {3, 4}})
-  %constant.f32 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
+ENTRY %entrycomp (p: bf16[]) -> (f32[], f32[]) {
+  %p = bf16[] parameter(0)
+
+  %cross-replica-sum.ar.1 = bf16[]
+      cross-replica-sum(%p),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%sum.bf16,
+      sharding={maximal device=0}
+  %convert.1 = f32[]
+      convert(%cross-replica-sum.ar.1),
+      sharding={maximal device=0}
+  %cross-replica-sum.1 = f32[]
+      cross-replica-sum(%convert.1),
+      replica_groups={{0,1}},
+      to_apply=%sum.f32,
+      sharding={maximal device=0}
+
+  %cross-replica-sum.ar.2 = bf16[]
+      cross-replica-sum(%p),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%sum.bf16,
+      sharding={maximal device=1}
+  %convert.2 = f32[]
+      convert(%cross-replica-sum.ar.2),
+      sharding={maximal device=1}
+  %cross-replica-sum.2 = f32[]
+      cross-replica-sum(%convert.2),
+      replica_groups={{0,1}},
+      to_apply=%sum.f32,
+      sharding={maximal device=1}
 
-  %cross-replica-sum.ar.1 = bf16[2,2]
+  ROOT %tuple = (f32[], f32[])
+      tuple(%cross-replica-sum.1, %cross-replica-sum.2),
+      sharding={{maximal device=0}, {maximal device=1}}
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto crs_before =
+      module->entry_computation()->root_instruction()->operands()[0];
+  auto replica_groups_before = crs_before->replica_groups();
+  ArCrsCombiner combiner(2);
+  auto changed = combiner.Run(module.get()).ValueOrDie();
+  EXPECT_TRUE(changed);
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              op::Tuple(op::CrossReplicaSum(op::Convert(op::Parameter())),
+                        op::CrossReplicaSum(op::Convert(op::Parameter()))));
+  auto crs_after =
+      module->entry_computation()->root_instruction()->operands()[0];
+  auto replica_groups_after = crs_after->replica_groups();
+  CompareReplicaGroups(replica_groups_before, replica_groups_after);
+}
+
+TEST_F(ArCrsCombinerTest, RewriteArBitcastCrs) {
+  const char* module_str = R"(
+HloModule foobar
+
+%sum.1 (a: f32[2,1], b: f32[2,1]) -> f32[2,1] {
+  %a = f32[2,1] parameter(0)
+  %b = f32[2,1] parameter(1)
+  ROOT %add = f32[2,1] add(%a, %b)
+}
+
+%sum.2 (x: f32[2], y: f32[2]) -> f32[2] {
+  %x = f32[2] parameter(0)
+  %y = f32[2] parameter(1)
+  ROOT %add = f32[2] add(%x, %y)
+}
+
+ENTRY %entrycomp (p: f32[2,1]) -> (f32[2], f32[2]) {
+  %p = f32[2,1] parameter(0)
+
+  %cross-replica-sum.ar.1 = f32[2,1]
+      cross-replica-sum(%p),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%sum.1,
+      sharding={maximal device=0}
+  %bitcast.1 = f32[2]{0} bitcast(f32[2,1]{1,0} %cross-replica-sum.ar.1)
+  %cross-replica-sum.1 = f32[2]
+      cross-replica-sum(%bitcast.1),
+      replica_groups={{0,1}},
+      to_apply=%sum.2,
+      sharding={maximal device=0}
+
+  %cross-replica-sum.ar.2 = f32[2,1]
+      cross-replica-sum(%p),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%sum.1,
+      sharding={maximal device=1}
+  %bitcast.2 = f32[2]{0} bitcast(f32[2,1]{1,0} %cross-replica-sum.ar.2)
+  %cross-replica-sum.2 = f32[2]
+      cross-replica-sum(%bitcast.2),
+      replica_groups={{0,1}},
+      to_apply=%sum.2,
+      sharding={maximal device=1}
+
+  ROOT %tuple = (f32[], f32[])
+      tuple(%cross-replica-sum.1, %cross-replica-sum.2),
+      sharding={{maximal device=0}, {maximal device=1}}
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto crs_before =
+      module->entry_computation()->root_instruction()->operands()[0];
+  auto replica_groups_before = crs_before->replica_groups();
+  ArCrsCombiner combiner(2);
+  auto changed = combiner.Run(module.get()).ValueOrDie();
+  EXPECT_TRUE(changed);
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              op::Tuple(op::CrossReplicaSum(op::Bitcast(op::Parameter())),
+                        op::CrossReplicaSum(op::Bitcast(op::Parameter()))));
+  auto crs_after =
+      module->entry_computation()->root_instruction()->operands()[0];
+  auto replica_groups_after = crs_after->replica_groups();
+  CompareReplicaGroups(replica_groups_before, replica_groups_after);
+}
+
+TEST_F(ArCrsCombinerTest, RewriteArMultiplyCrs) {
+  const char* module_str = R"(
+HloModule foobar
+
+%sum.f32 (x: f32[], y: f32[]) -> f32[] {
+  %x = f32[] parameter(0)
+  %y = f32[] parameter(1)
+  ROOT %add = f32[] add(%x, %y)
+}
+
+ENTRY %entrycomp (p: f32[]) -> (f32[], f32[]) {
+  %p = f32[] parameter(0)
+  %constant.f32 = f32[] constant(123)
+
+  %cross-replica-sum.ar.1 = f32[]
+      cross-replica-sum(%p),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%sum.f32,
+      sharding={maximal device=0}
+  %multiply.1 = f32[]
+      multiply(%cross-replica-sum.ar.1, %constant.f32),
+      sharding={maximal device=0}
+  %cross-replica-sum.1 = f32[]
+      cross-replica-sum(%multiply.1),
+      replica_groups={{0,1}},
+      to_apply=%sum.f32,
+      sharding={maximal device=0}
+
+  %cross-replica-sum.ar.2 = f32[]
+      cross-replica-sum(%p),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%sum.f32,
+      sharding={maximal device=1}
+  %multiply.2 = f32[]
+      multiply(%cross-replica-sum.ar.2, %constant.f32),
+      sharding={maximal device=1}
+  %cross-replica-sum.2 = f32[]
+      cross-replica-sum(%multiply.2),
+      replica_groups={{0,1}},
+      to_apply=%sum.f32,
+      sharding={maximal device=1}
+
+  ROOT %tuple = (f32[], f32[])
+      tuple(%cross-replica-sum.1, %cross-replica-sum.2),
+      sharding={{maximal device=0}, {maximal device=1}}
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto crs_before =
+      module->entry_computation()->root_instruction()->operands()[0];
+  auto replica_groups_before = crs_before->replica_groups();
+  ArCrsCombiner combiner(2);
+  auto changed = combiner.Run(module.get()).ValueOrDie();
+  EXPECT_TRUE(changed);
+  EXPECT_THAT(
+      module->entry_computation()->root_instruction(),
+      op::Tuple(
+          op::CrossReplicaSum(op::Multiply(op::Parameter(), op::Constant())),
+          op::CrossReplicaSum(op::Multiply(op::Parameter(), op::Constant()))));
+  auto crs_after =
+      module->entry_computation()->root_instruction()->operands()[0];
+  auto replica_groups_after = crs_after->replica_groups();
+  CompareReplicaGroups(replica_groups_before, replica_groups_after);
+}
+
+TEST_F(ArCrsCombinerTest, RewriteArConvertAddCrs) {
+  const char* module_str = R"(
+HloModule foobar
+
+%sum.bf16 (a: bf16[], b: bf16[]) -> bf16[] {
+  %a = bf16[] parameter(0)
+  %b = bf16[] parameter(1)
+  ROOT %add = bf16[] add(%a, %b)
+}
+
+%sum.f32 (x: f32[], y: f32[]) -> f32[] {
+  %x = f32[] parameter(0)
+  %y = f32[] parameter(1)
+  ROOT %add = f32[] add(%x, %y)
+}
+
+ENTRY %entrycomp (p: f32[]) -> (f32[], f32[]) {
+  %p = f32[] parameter(0)
+  %constant.bf16 = bf16[] constant(1)
+  %constant.f32 = f32[] constant(2)
+
+  %cross-replica-sum.ar.1 = bf16[]
       cross-replica-sum(%constant.bf16),
       replica_groups={{0},{1}},
       all_reduce_id=1,
-      to_apply=%binary_add,
+      to_apply=%sum.bf16,
       sharding={maximal device=0}
-  %convert.1 = f32[2,2]
+  %convert.1 = f32[]
       convert(%cross-replica-sum.ar.1),
       sharding={maximal device=0}
-  %add.1 = f32[2,2]
+  %add.1 = f32[]
       add(%constant.f32, %convert.1),
       sharding={maximal device=0}
-  %cross-replica-sum.1 = f32[2,2]
+  %cross-replica-sum.1 = f32[]
       cross-replica-sum(%add.1),
       replica_groups={{0,1}},
       to_apply=%sum.f32,
       sharding={maximal device=0}
 
-  %cross-replica-sum.ar.2 = bf16[2,2]
+  %cross-replica-sum.ar.2 = bf16[]
       cross-replica-sum(%constant.bf16),
       replica_groups={{0},{1}},
       all_reduce_id=1,
-      to_apply=%binary_add,
+      to_apply=%sum.bf16,
       sharding={maximal device=1}
-  %convert.2 = f32[2,2]
+  %convert.2 = f32[]
       convert(%cross-replica-sum.ar.2),
       sharding={maximal device=1}
-  %add.2 = f32[2,2]
+  %add.2 = f32[]
       add(%constant.f32, %convert.2),
       sharding={maximal device=1}
-  %cross-replica-sum.2 = f32[2,2]
+  %cross-replica-sum.2 = f32[]
       cross-replica-sum(%add.2),
       replica_groups={{0,1}},
       to_apply=%sum.f32,
       sharding={maximal device=1}
 
-  ROOT %tuple = (f32[2,2], f32[2,2])
+  ROOT %tuple = (f32[], f32[])
       tuple(%cross-replica-sum.1, %cross-replica-sum.2),
       sharding={{maximal device=0}, {maximal device=1}}
 }
@@ -320,31 +622,24 @@ ENTRY %entrycomp (p: f32[2,2]) -> (f32[2,2], f32[2,2]) {
   ArCrsCombiner combiner(2);
   auto changed = combiner.Run(module.get()).ValueOrDie();
   EXPECT_TRUE(changed);
-  EXPECT_THAT(module->entry_computation()->root_instruction(),
-              op::Tuple(op::Subtract(op::CrossReplicaSum(), op::Constant()),
-                        op::Subtract(op::CrossReplicaSum(), op::Constant())));
-  auto sub = module->entry_computation()->root_instruction()->operands()[0];
-  auto crs_after = sub->operands()[0];
+  EXPECT_THAT(
+      module->entry_computation()->root_instruction(),
+      op::Tuple(
+          op::CrossReplicaSum(op::Add(
+              op::Divide(op::Constant(), op::Constant()), op::Convert())),
+          op::CrossReplicaSum(op::Add(
+              op::Divide(op::Constant(), op::Constant()), op::Convert()))));
+  auto crs_after =
+      module->entry_computation()->root_instruction()->operands()[0];
   auto replica_groups_after = crs_after->replica_groups();
-  ASSERT_EQ(replica_groups_before.size(), replica_groups_after.size());
-  for (int i = 0; i < replica_groups_before.size(); ++i) {
-    // Somewhat verbose way to compare the replica_ids, because EqualsProto
-    // is not available in the open-source build.
-    auto group_before = replica_groups_before[i];
-    std::vector<int64> ids_before(group_before.replica_ids().begin(),
-                                  group_before.replica_ids().end());
-    auto group_after = replica_groups_after[i];
-    std::vector<int64> ids_after(group_after.replica_ids().begin(),
-                                 group_after.replica_ids().end());
-    EXPECT_EQ(ids_before, ids_after);
-  }
+  CompareReplicaGroups(replica_groups_before, replica_groups_after);
 }
 
 TEST_F(ArCrsCombinerTest, OtherSummandNotTheSameDontRewrite) {
   const char* module_str = R"(
 HloModule foobar
 
-%binary_add (a: bf16[], b: bf16[]) -> bf16[] {
+%sum.bf16 (a: bf16[], b: bf16[]) -> bf16[] {
   %a = bf16[] parameter(0)
   %b = bf16[] parameter(1)
   ROOT %add = bf16[] add(%a, %b)
@@ -356,49 +651,49 @@ HloModule foobar
   ROOT %add = f32[] add(%x, %y)
 }
 
-ENTRY %entrycomp (p: f32[2,2]) -> (f32[2,2], f32[2,2]) {
-  %p = f32[2,2] parameter(0)
-  %constant.bf16 = bf16[2,2] constant(bf16[2,2] {{1, 2}, {3, 4}})
-  %constant.f32.1 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
-  %constant.f32.2 = f32[2,2] constant(f32[2,2] {{3, 4}, {5, 6}})
+ENTRY %entrycomp (p: f32[]) -> (f32[], f32[]) {
+  %p = f32[] parameter(0)
+  %constant.bf16 = bf16[] constant(1)
+  %constant.f32.1 = f32[] constant(2)
+  %constant.f32.2 = f32[] constant(3)
 
-  %cross-replica-sum.ar.1 = bf16[2,2]
+  %cross-replica-sum.ar.1 = bf16[]
       cross-replica-sum(%constant.bf16),
       replica_groups={{0},{1}},
       all_reduce_id=1,
-      to_apply=%binary_add,
+      to_apply=%sum.bf16,
       sharding={maximal device=0}
-  %convert.1 = f32[2,2]
+  %convert.1 = f32[]
       convert(%cross-replica-sum.ar.1),
       sharding={maximal device=0}
-  %add.1 = f32[2,2]
+  %add.1 = f32[]
       add(%constant.f32.1, %convert.1),
       sharding={maximal device=0}
-  %cross-replica-sum.1 = f32[2,2]
+  %cross-replica-sum.1 = f32[]
       cross-replica-sum(%add.1),
       replica_groups={{0,1}},
       to_apply=%sum.f32,
       sharding={maximal device=0}
 
-  %cross-replica-sum.ar.2 = bf16[2,2]
+  %cross-replica-sum.ar.2 = bf16[]
       cross-replica-sum(%constant.bf16),
       replica_groups={{0},{1}},
       all_reduce_id=1,
-      to_apply=%binary_add,
+      to_apply=%sum.bf16,
       sharding={maximal device=1}
-  %convert.2 = f32[2,2]
+  %convert.2 = f32[]
       convert(%cross-replica-sum.ar.2),
       sharding={maximal device=1}
-  %add.2 = f32[2,2]
+  %add.2 = f32[]
       add(%constant.f32.2, %convert.2),
       sharding={maximal device=1}
-  %cross-replica-sum.2 = f32[2,2]
+  %cross-replica-sum.2 = f32[]
       cross-replica-sum(%add.2),
       replica_groups={{0,1}},
       to_apply=%sum.f32,
       sharding={maximal device=1}
 
-  ROOT %tuple = (f32[2,2], f32[2,2])
+  ROOT %tuple = (f32[], f32[])
       tuple(%cross-replica-sum.1, %cross-replica-sum.2),
       sharding={{maximal device=0}, {maximal device=1}}
 }
diff --git a/tensorflow/compiler/xla/service/batchnorm_expander.cc b/tensorflow/compiler/xla/service/batchnorm_expander.cc
index f70f6ddfec69c0113a1afe2073a2392098f49456..0e6ca1871b379a2f55b92207133822fc6258b007 100644
--- a/tensorflow/compiler/xla/service/batchnorm_expander.cc
+++ b/tensorflow/compiler/xla/service/batchnorm_expander.cc
@@ -107,19 +107,37 @@ class BatchNormExpanderVisitor : public DfsHloVisitorWithDefault {
   }
 
   std::unique_ptr<HloInstruction> Mean(
-      int64 element_count, HloInstruction* operand,
+      HloInstruction* element_count, HloInstruction* operand,
       const std::function<HloInstruction*(std::unique_ptr<HloInstruction>)>&
           add_instruction) {
-    HloInstruction* elem_count_recip =
-        add_instruction(HloInstruction::CreateBroadcast(
-            operand->shape(),
-            add_instruction(HloInstruction::CreateConvert(
-                ShapeUtil::MakeShape(operand->shape().element_type(), {}),
-                add_instruction(HloInstruction::CreateConstant(
-                    LiteralUtil::CreateR0<float>(1.0 / element_count))))),
-            {}));
-    return HloInstruction::CreateBinary(operand->shape(), HloOpcode::kMultiply,
-                                        operand, elem_count_recip);
+    auto broadcast = add_instruction(
+        HloInstruction::CreateBroadcast(operand->shape(), element_count, {}));
+    return HloInstruction::CreateBinary(operand->shape(), HloOpcode::kDivide,
+                                        operand, broadcast);
+  }
+
+  std::unique_ptr<HloInstruction> DynamicElementCountPerFeature(
+      HloInstruction* operand, int64 feature_index,
+      const std::function<HloInstruction*(std::unique_ptr<HloInstruction>)>&
+          add_instruction) {
+    auto elements_per_feature_u32 = add_instruction(
+        HloInstruction::CreateConstant(LiteralUtil::CreateR0<uint32>(1)));
+
+    for (int64 i = 0; i < ShapeUtil::Rank(operand->shape()); ++i) {
+      if (i == feature_index) {
+        continue;
+      }
+      auto dynamic_dimension_size =
+          add_instruction(HloInstruction::CreateGetDimensionSize(
+              ShapeUtil::MakeShape(U32, {}), operand, i));
+      elements_per_feature_u32 = add_instruction(HloInstruction::CreateBinary(
+          ShapeUtil::MakeShape(U32, {}), HloOpcode::kMultiply,
+          dynamic_dimension_size, elements_per_feature_u32));
+    }
+
+    return HloInstruction::CreateConvert(
+        ShapeUtil::MakeShape(operand->shape().element_type(), {}),
+        elements_per_feature_u32);
   }
 
   // Replaces the existing HLO instruction old_instruction, with
@@ -195,9 +213,6 @@ Status BatchNormExpanderVisitor::HandleBatchNormTraining(
   const Shape operand_shape = operand->shape();
   PrimitiveType ptype = operand_shape.element_type();
   int64 feature_index = batch_norm->feature_index();
-  const int64 feature_count = operand_shape.dimensions(feature_index);
-  const int64 size_in_elements = ShapeUtil::ElementsIn(operand_shape);
-  int64 elements_per_feature_int64 = size_in_elements / feature_count;
 
   HloInstruction* scale = batch_norm->mutable_operand(1);
   HloInstruction* offset = batch_norm->mutable_operand(2);
@@ -220,6 +235,9 @@ Status BatchNormExpanderVisitor::HandleBatchNormTraining(
     }
   }
 
+  auto elements_per_feature =
+      add(DynamicElementCountPerFeature(operand, feature_index, add));
+
   auto scale_broadcasted = add(
       HloInstruction::CreateBroadcast(operand_shape, scale, {feature_index}));
 
@@ -243,13 +261,13 @@ Status BatchNormExpanderVisitor::HandleBatchNormTraining(
       add_reduce_computation));
 
   // E[X].
-  auto mean = add(Mean(elements_per_feature_int64, sum, add));
+  auto mean = add(Mean(elements_per_feature, sum, add));
 
   auto mean_broadcasted = add(
       HloInstruction::CreateBroadcast(operand_shape, mean, {feature_index}));
 
   // E[X^2].
-  auto square_mean = add(Mean(elements_per_feature_int64, squared_sum, add));
+  auto square_mean = add(Mean(elements_per_feature, squared_sum, add));
 
   // E^2[X].
   auto mean_square =
@@ -458,9 +476,8 @@ Status BatchNormExpanderVisitor::HandleBatchNormGrad(
 
   int64 feature_index = batch_norm->feature_index();
 
-  const int64 size_in_elements = ShapeUtil::ElementsIn(activation_shape);
-  const int64 feature_count = activation_shape.dimensions(feature_index);
-  const int64 elements_per_feature_int64 = size_in_elements / feature_count;
+  auto elements_per_feature =
+      add(DynamicElementCountPerFeature(activation, feature_index, add));
 
   auto zero_literal = LiteralUtil::CreateR0(0.0f);
   TF_ASSIGN_OR_RETURN(zero_literal, zero_literal.Convert(ptype));
@@ -553,15 +570,9 @@ Status BatchNormExpanderVisitor::HandleBatchNormGrad(
       add_binary(activation_shape, HloOpcode::kMultiply, scale_broadcasted,
                  rsqrt_var_add_epsilon_broadcasted);
 
-  scale_times_rsqrt_var_add_epsilon = add(
-      Mean(elements_per_feature_int64, scale_times_rsqrt_var_add_epsilon, add));
+  scale_times_rsqrt_var_add_epsilon =
+      add(Mean(elements_per_feature, scale_times_rsqrt_var_add_epsilon, add));
 
-  auto elements_per_feature_literal =
-      LiteralUtil::CreateR0<float>(elements_per_feature_int64);
-  TF_ASSIGN_OR_RETURN(elements_per_feature_literal,
-                      elements_per_feature_literal.Convert(ptype));
-  auto elements_per_feature = add(
-      HloInstruction::CreateConstant(std::move(elements_per_feature_literal)));
   auto i1 = add_binary(activation_shape, HloOpcode::kMultiply, grad_output,
                        add(HloInstruction::CreateBroadcast(
                            activation_shape, elements_per_feature, {})));
diff --git a/tensorflow/compiler/xla/service/batchnorm_expander_test.cc b/tensorflow/compiler/xla/service/batchnorm_expander_test.cc
index 08cf8026177d77ff98cca5e5d168ac3194936b35..8e8fbbd935b154e5a77d68e60d861601d740bf03 100644
--- a/tensorflow/compiler/xla/service/batchnorm_expander_test.cc
+++ b/tensorflow/compiler/xla/service/batchnorm_expander_test.cc
@@ -36,7 +36,21 @@ limitations under the License.
 namespace xla {
 namespace {
 
-using BatchNormExpanderTest = HloTestBase;
+class BatchNormExpanderTest : public HloTestBase {
+ protected:
+  // BatchNorm should have a dynamic sized dividor for mean operations.
+  int64 CountGetDimensionSize(const HloModule& module) {
+    int64 count = 0;
+    for (HloComputation* comp : module.computations()) {
+      for (HloInstruction* inst : comp->instructions()) {
+        if (inst->opcode() == HloOpcode::kGetDimensionSize) {
+          count++;
+        }
+      }
+    }
+    return count;
+  }
+};
 
 // Test that we expand BatchNormTraining.
 TEST_F(BatchNormExpanderTest, BatchNormTraining) {
@@ -68,6 +82,7 @@ TEST_F(BatchNormExpanderTest, BatchNormTraining) {
                              /*rewrite_grad_op=*/true);
   ASSERT_TRUE(rewriter.Run(module.get()).ValueOrDie());
   root = computation->root_instruction();
+  EXPECT_EQ(CountGetDimensionSize(*module), 3);
   // Make sure this operation is expanded.
   EXPECT_EQ(root->opcode(), HloOpcode::kTuple);
 }
@@ -110,6 +125,7 @@ TEST_F(BatchNormExpanderTest, BatchNormGrad) {
                              /*rewrite_grad_op=*/true);
   ASSERT_TRUE(rewriter.Run(module.get()).ValueOrDie());
   root = computation->root_instruction();
+  EXPECT_EQ(CountGetDimensionSize(*module), 3);
   // Make sure this operation is expanded.
   EXPECT_EQ(root->opcode(), HloOpcode::kTuple);
 }
diff --git a/tensorflow/compiler/xla/service/convolution_feature_group_converter.cc b/tensorflow/compiler/xla/service/convolution_feature_group_converter.cc
index 09c3f32860b3176ee5afbb147872ddafc51af256..95c7724c3c93507ae61a984301ecfc0111bef192 100644
--- a/tensorflow/compiler/xla/service/convolution_feature_group_converter.cc
+++ b/tensorflow/compiler/xla/service/convolution_feature_group_converter.cc
@@ -205,38 +205,6 @@ Status ConvolutionVisitor::HandleConvolution(HloInstruction* convolution) {
     // If the code generator handles depthwise separable convolutions
     // inherently, then no filter expansion is needed.
     if (!filter_expansion_ && depthwise_separable) {
-      const int64 old_kernel_input_feature_dimension =
-          dim_numbers.kernel_input_feature_dimension();
-      const int64 old_kernel_output_feature_dimension =
-          dim_numbers.kernel_output_feature_dimension();
-
-      // For depthwise convolutions, we want the kernel input feature dimension
-      // to be smaller than the output feature dimension. If that's not the
-      // case, we swap the dimensions.
-      if (old_kernel_input_feature_dimension >
-          old_kernel_output_feature_dimension) {
-        Shape reshaped_filter_shape = filter->shape();
-        auto& dimensions = *reshaped_filter_shape.mutable_dimensions();
-        std::swap(dimensions[old_kernel_input_feature_dimension],
-                  dimensions[old_kernel_output_feature_dimension]);
-
-        auto reshaped_filter =
-            add(HloInstruction::CreateReshape(reshaped_filter_shape, filter));
-
-        dim_numbers.set_kernel_input_feature_dimension(
-            old_kernel_output_feature_dimension);
-
-        dim_numbers.set_kernel_output_feature_dimension(
-            old_kernel_input_feature_dimension);
-
-        auto new_convolution = HloInstruction::CreateConvolve(
-            convolution->shape(), convolution->mutable_operand(0),
-            reshaped_filter, group_count, convolution->window(), dim_numbers,
-            convolution->precision_config());
-
-        TF_RETURN_IF_ERROR(computation_->ReplaceWithNewInstruction(
-            convolution, std::move(new_convolution)));
-      }
       return Status::OK();
     }
     // We want to repeat 'filter' in the 'input_feature_dim' dimension
@@ -271,130 +239,72 @@ Status ConvolutionVisitor::HandleConvolution(HloInstruction* convolution) {
         convolution, std::move(new_convolution)));
   } else {
     int64 activation_input_feature_dim = dim_numbers.input_feature_dimension();
-    auto activation = convolution->mutable_operand(0);
 
     int64 output_feature =
         filter->shape().dimensions(kernel_output_feature_dim);
 
-    int64 input_feature =
-        activation->shape().dimensions(activation_input_feature_dim);
-
     // If group_count == output_feature, then we map those grouped convolutions
-    // onto depthwise convolution + reduce. E.g., we would turn
+    // onto depthwise convolution. This is done by adding an additional spatial
+    // dimension to the activations, kernel, and the output.
+    // E.g., we would turn
     // [2, 12]{B, IF} conv [3, 4]{IF, OF} into
-    // [2, 12]{B, IF} depth conv [1, 12]{IF, OF}, and then use a reduce window
-    // of {1, 3} on the generated [2, 12] output to produce the final result of
-    // [2, 4].
+    // [3, 2, 4]{S, B, IF} depth conv [3, 1, 4]{S, IF, OF}, where S is the
+    // additional spatial dimension. The generated convolution output will be
+    // [1, 2, 4]{S, B, OF} and then reshape the output back to [2, 4] {B, OF}.
+
     if (group_count == output_feature && !filter_expansion_) {
-      Shape reshaped_filter_shape = filter->shape();
+      auto filter = convolution->mutable_operand(1);
+      auto activation = convolution->mutable_operand(0);
 
-      if (kernel_input_feature_dim < kernel_output_feature_dim) {
-        // Transpose IF and OF on the kernel.
-        std::vector<int64> filter_dims;
-        for (int64 i = 0; i < dim_numbers.kernel_spatial_dimensions().size();
-             ++i) {
-          filter_dims.push_back(dim_numbers.kernel_spatial_dimensions(i));
-        }
-        filter_dims.push_back(kernel_output_feature_dim);
-        filter_dims.push_back(kernel_input_feature_dim);
-
-        Shape transposed_filter = filter->shape();
-        auto& dimensions = *transposed_filter.mutable_dimensions();
-        std::swap(dimensions[kernel_input_feature_dim],
-                  dimensions[kernel_output_feature_dim]);
-
-        filter = add(HloInstruction::CreateTranspose(transposed_filter, filter,
-                                                     filter_dims));
-      } else {
-        // For depthwise convolutions, we want the kernel input feature
-        // dimension to be smaller than the output feature dimension. If that's
-        // not the case, we swap the dimensions.
-
-        auto& dimensions = *reshaped_filter_shape.mutable_dimensions();
-        std::swap(dimensions[kernel_input_feature_dim],
-                  dimensions[kernel_output_feature_dim]);
-
-        dim_numbers.set_kernel_input_feature_dimension(
-            kernel_output_feature_dim);
-
-        dim_numbers.set_kernel_output_feature_dimension(
-            kernel_input_feature_dim);
-        std::swap(kernel_output_feature_dim, kernel_input_feature_dim);
-      }
+      // Add spatial dimension to the activation, and reshape.
+      Shape reshaped_activation_shape = activation->shape();
+      ShapeUtil::AppendMajorDimension(group_size, &reshaped_activation_shape);
 
-      reshaped_filter_shape.set_dimensions(kernel_input_feature_dim, 1);
-      reshaped_filter_shape.set_dimensions(kernel_output_feature_dim,
-                                           group_count * group_size);
-      auto reshaped_filter =
-          add(HloInstruction::CreateReshape(reshaped_filter_shape, filter));
+      int64 new_spatial_dim = reshaped_activation_shape.dimensions().size() - 1;
 
-      Shape reshaped_convolution_shape = convolution->shape();
-      reshaped_convolution_shape.set_dimensions(
-          dim_numbers.output_feature_dimension(), group_count * group_size);
-      auto new_convolution = add(HloInstruction::CreateConvolve(
-          reshaped_convolution_shape, convolution->mutable_operand(0),
-          reshaped_filter, /*feature_group_count=*/input_feature,
-          convolution->window(), dim_numbers, convolution->precision_config()));
-
-      // Create the reduce window.
-      Window window;
-      for (int64 i = 0; i < new_convolution->shape().dimensions_size(); ++i) {
-        auto* dim = window.add_dimensions();
-        dim->set_padding_low(0);
-        dim->set_padding_high(0);
-        dim->set_window_dilation(1);
-        dim->set_base_dilation(1);
-        if (i == dim_numbers.output_feature_dimension()) {
-          dim->set_stride(group_size);
-          dim->set_size(group_size);
-        } else {
-          dim->set_stride(1);
-          dim->set_size(1);
-        }
-      }
+      reshaped_activation_shape.set_dimensions(activation_input_feature_dim,
+                                               group_count);
+      activation = add(
+          HloInstruction::CreateReshape(reshaped_activation_shape, activation));
 
-      auto reduce_window_shape = new_convolution->shape();
-      reduce_window_shape.set_dimensions(dim_numbers.output_feature_dimension(),
-                                         group_count);
-
-      auto zero_literal = LiteralUtil::CreateR0(0.0f);
-      TF_ASSIGN_OR_RETURN(zero_literal, zero_literal.Convert(F32));
-      auto zero = add(HloInstruction::CreateConstant(std::move(zero_literal)));
-
-      auto reduce_function = [&]() -> HloComputation* {
-        HloComputation::Builder b("add_computation");
-        Shape shape = ShapeUtil::MakeShape(F32, {});
-        auto lhs =
-            b.AddInstruction(HloInstruction::CreateParameter(0, shape, "lhs"));
-        auto rhs =
-            b.AddInstruction(HloInstruction::CreateParameter(1, shape, "rhs"));
-        auto scalar_op = b.AddInstruction(
-            HloInstruction::CreateBinary(shape, HloOpcode::kAdd, lhs, rhs));
-        return computation_->parent()->AddEmbeddedComputation(
-            b.Build(scalar_op));
-      };
-
-      // Ensure that data input to reduce window is of type F32.
-      if (primitive_util::BitWidth(new_convolution->shape().element_type()) <
-          primitive_util::BitWidth(F32)) {
-        Shape convert_shape = new_convolution->shape();
-        convert_shape.set_element_type(F32);
-        new_convolution = add(HloInstruction::CreateBitcastConvert(
-            convert_shape, new_convolution));
-      }
+      // Add spatial dimension to the filter, and reshape.
+      Shape reshaped_filter_shape = filter->shape();
+      ShapeUtil::AppendMajorDimension(1, &reshaped_filter_shape);
 
-      auto reduce_window = add(HloInstruction::CreateReduceWindow(
-          reduce_window_shape, new_convolution, zero, window,
-          reduce_function()));
+      filter =
+          add(HloInstruction::CreateReshape(reshaped_filter_shape, filter));
+
+      Shape new_output_shape = convolution->shape();
+      ShapeUtil::AppendMajorDimension(1, &new_output_shape);
+
+      // Edit convolution dimension numbers. Note that kernel_input_feature_dim
+      // now becomes a spatial dimension, and the newly added dimension of size
+      // 1 is the new kernel_input_feature_dim.
+      dim_numbers.add_input_spatial_dimensions(new_spatial_dim);
+      dim_numbers.add_kernel_spatial_dimensions(kernel_input_feature_dim);
+      dim_numbers.set_kernel_input_feature_dimension(new_spatial_dim);
+      dim_numbers.add_output_spatial_dimensions(new_spatial_dim);
+
+      // Add window for the new spatial dimension.
+      Window new_window = convolution->window();
+      auto* dim = new_window.add_dimensions();
+      dim->set_window_dilation(1);
+      dim->set_base_dilation(1);
+      dim->set_stride(1);
+      dim->set_size(group_size);
+
+      auto new_convolution = add(HloInstruction::CreateConvolve(
+          new_output_shape, activation, filter, group_count, new_window,
+          dim_numbers, convolution->precision_config()));
 
-      Shape convert_back_shape = reduce_window->shape();
-      convert_back_shape.set_element_type(activation->shape().element_type());
+      // Delete the extra spatial dimension, and reshape.
+      Shape reshaped_convolution_shape =
+          ShapeUtil::DeleteDimension(new_spatial_dim, new_convolution->shape());
+      auto reshaped_convolution = HloInstruction::CreateReshape(
+          reshaped_convolution_shape, new_convolution);
 
-      // Convert reduced data back to the original data type.
-      auto reduce_window_converted = HloInstruction::CreateBitcastConvert(
-          convert_back_shape, reduce_window);
       TF_RETURN_IF_ERROR(computation_->ReplaceWithNewInstruction(
-          convolution, std::move(reduce_window_converted)));
+          convolution, std::move(reshaped_convolution)));
 
     } else {
       // The filter expansion mechanism adds zeroes in the kernel.
diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD
index ce4c2a9cc69240b9565b35a3f2504d7fc9373917..4173af5179ba096523db973ca7e0466faefda38a 100644
--- a/tensorflow/compiler/xla/service/cpu/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/BUILD
@@ -572,6 +572,7 @@ cc_library(
         ":runtime_matvec",
         "//tensorflow/compiler/xla:executable_run_options",
         "//tensorflow/core:framework_lite",
+        "//tensorflow/core/kernels:eigen_contraction_kernel",
         "//third_party/eigen3",
     ],
 )
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
index 2bf24c15c1f050b200b1d9af2d95286f9a9dbe4c..f3dfa4d64264808e0d5c9f86693bb844b2011964 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -250,7 +250,6 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn(
       &pipeline, module->config().debug_options(),
       ReducePrecisionInsertion::PassTiming::BEFORE_OPTIMIZATION);
 
-  pipeline.AddPass<HloGetDimensionSizeRewriter>();
   pipeline.AddPass<MapInliner>();
 
   // TODO(b/65775800): Fix wrong output bug in Call and remove the CallInliner
@@ -270,6 +269,7 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn(
         /*rewrite_training_op=*/true,
         /*rewrite_inference_op=*/true,
         /*rewrite_grad_op=*/true);
+    pipeline.AddPass<HloGetDimensionSizeRewriter>();
     AlgebraicSimplifierOptions options(
         [](const Shape&, const Shape&) { return false; });
     options.set_enable_dot_strength_reduction(false);
@@ -635,18 +635,17 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
             .EmitComputation(
                 embedded_computation, embedded_computation->name(),
                 /*is_top_level_computation=*/false,
-                &schedule.sequence(embedded_computation).instructions())
+                schedule.sequence(embedded_computation).instructions())
             .status());
   }
   string function_name_prefix = entry_computation->name().empty()
                                     ? "__compute"
                                     : entry_computation->name();
-  TF_ASSIGN_OR_RETURN(
-      llvm::Function * entry_function,
-      ir_emitter.EmitComputation(
-          entry_computation, function_name_prefix,
-          /*is_top_level_computation=*/true,
-          &schedule.sequence(entry_computation).instructions()));
+  TF_ASSIGN_OR_RETURN(llvm::Function * entry_function,
+                      ir_emitter.EmitComputation(
+                          entry_computation, function_name_prefix,
+                          /*is_top_level_computation=*/true,
+                          schedule.sequence(entry_computation).instructions()));
 
   string function_name = [&]() {
     llvm::SmallVector<char, 40> function_name_vector;
@@ -835,7 +834,7 @@ CpuCompiler::CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
               .EmitComputation(
                   embedded_computation, embedded_computation->name(),
                   /*is_top_level_computation=*/false,
-                  &schedule.sequence(embedded_computation).instructions())
+                  schedule.sequence(embedded_computation).instructions())
               .status());
     }
     const string& entry_point_name = options.entry_point_name();
@@ -843,7 +842,7 @@ CpuCompiler::CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
                         ir_emitter.EmitComputation(
                             computation, entry_point_name,
                             /*is_top_level_computation=*/true,
-                            &schedule.sequence(computation).instructions()));
+                            schedule.sequence(computation).instructions()));
 
     CHECK(entry_function->getName() == llvm_ir::AsStringRef(entry_point_name));
 
diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
index 97f9b85a606e140fd7f3b1e3ecfb0dd5ba289f03..a33035ad1081d7d73ceed6ce3a208af5910d2d2c 100644
--- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
@@ -323,11 +323,11 @@ void ColumnMajorMatrixVectorProductEmitter::Emit() {
   int64 column_remainder = k() % tile_cols();
   int64 column_limit = k() - column_remainder;
 
-  ksl_.ForReturnVoid("dot.outer.tiled",
-                     /*start=*/0, /*end=*/column_limit, /*step=*/tile_cols(),
-                     [&](llvm::Value* column, bool is_first_column) {
-                       EmitOuterLoopBody(column, tile_cols(), is_first_column);
-                     });
+  ksl_.For("dot.outer.tiled",
+           /*start=*/0, /*end=*/column_limit, /*step=*/tile_cols(),
+           [&](llvm::Value* column, bool is_first_column) {
+             EmitOuterLoopBody(column, tile_cols(), is_first_column);
+           });
 
   if (column_remainder != 0) {
     EmitOuterLoopBody(b_->getInt64(column_limit), column_remainder,
@@ -340,7 +340,7 @@ void ColumnMajorMatrixVectorProductEmitter::EmitInnerLoopTiled(
     int64 columns, bool is_first_column) {
   int64 row_limit = m() - (m() % tile_rows());
 
-  ksl_.ForReturnVoid(
+  ksl_.For(
       "dot.inner.tiled", /*start=*/0, /*end=*/row_limit,
       /*step=*/tile_rows(), [&](llvm::Value* row) {
         std::vector<llvm::Value*> lhs_tile =
@@ -372,7 +372,7 @@ void ColumnMajorMatrixVectorProductEmitter::EmitInnerLoopEpilogue(
   //     // initialized.
   //   }
 
-  ksl_.ForReturnVoid(
+  ksl_.For(
       "dot.inner.epilg.outer", /*start=*/current_tile_col,
       /*end=*/b_->CreateAdd(columns_llvm, current_tile_col),
       /*step=*/1, /*peel_first_iteration=*/false,
@@ -381,14 +381,14 @@ void ColumnMajorMatrixVectorProductEmitter::EmitInnerLoopEpilogue(
         llvm::Value* total_offset = b_->CreateMul(col, b_->getInt64(m()));
         llvm::Value* lhs_base_pointer =
             vsl_.ComputeOffsetPointer(lhs_, total_offset);
-        ksl_.ForReturnVoid(
+        ksl_.For(
             "dot.inner.epilg.inner", /*start=*/row_start, /*end=*/m(),
             /*step=*/1, [&](llvm::Value* scalar_row) {
               llvm::Value* product = vsl_.Mul(
                   vsl_.LoadScalar(lhs_base_pointer, scalar_row), rhs_element);
               llvm::Value* setting_result_first_time = b_->CreateAnd(
                   is_first_scalar_col, b_->getInt1(is_first_tiled_column));
-              ksl_.IfReturnVoid(
+              ksl_.If(
                   setting_result_first_time,
                   /*true_block_generator=*/
                   [&]() {
@@ -568,10 +568,9 @@ void RowMajorMatrixVectorProductEmitter::Emit() {
   int64 row_remainder = m() % tile_rows();
   int64 row_limit = m() - row_remainder;
 
-  ksl_.ForReturnVoid(
-      "dot.outer.tiled",
-      /*start=*/0, /*end=*/row_limit, /*step=*/tile_rows(),
-      [&](llvm::Value* row) { EmitOuterLoopBody(row, tile_rows()); });
+  ksl_.For("dot.outer.tiled",
+           /*start=*/0, /*end=*/row_limit, /*step=*/tile_rows(),
+           [&](llvm::Value* row) { EmitOuterLoopBody(row, tile_rows()); });
 
   if (row_remainder != 0) {
     EmitOuterLoopBody(b_->getInt64(row_limit), row_remainder);
@@ -583,17 +582,17 @@ void RowMajorMatrixVectorProductEmitter::EmitInnerLoopTiled(
     std::vector<VectorVariable>* vector_accumulators) {
   int64 column_limit = k() - (k() % tile_cols());
 
-  ksl_.ForReturnVoid("dot.inner.tiled", /*start=*/0, /*end=*/column_limit,
-                     /*step=*/tile_cols(), [&](llvm::Value* col) {
-                       std::vector<llvm::Value*> lhs_tile =
-                           lhs_memory_tile->LoadTile(/*minor_dim_offset=*/col);
-                       llvm::Value* rhs_value = vsl_.LoadVector(rhs_, col);
-                       for (int i = 0; i < rows; i++) {
-                         llvm::Value* old_sum = (*vector_accumulators)[i].Get();
-                         (*vector_accumulators)[i].Set(vsl_.Add(
-                             old_sum, vsl_.Mul(rhs_value, lhs_tile[i])));
-                       }
-                     });
+  ksl_.For("dot.inner.tiled", /*start=*/0, /*end=*/column_limit,
+           /*step=*/tile_cols(), [&](llvm::Value* col) {
+             std::vector<llvm::Value*> lhs_tile =
+                 lhs_memory_tile->LoadTile(/*minor_dim_offset=*/col);
+             llvm::Value* rhs_value = vsl_.LoadVector(rhs_, col);
+             for (int i = 0; i < rows; i++) {
+               llvm::Value* old_sum = (*vector_accumulators)[i].Get();
+               (*vector_accumulators)[i].Set(
+                   vsl_.Add(old_sum, vsl_.Mul(rhs_value, lhs_tile[i])));
+             }
+           });
 }
 
 void RowMajorMatrixVectorProductEmitter::EmitInnerLoopEpilogue(
@@ -609,7 +608,7 @@ void RowMajorMatrixVectorProductEmitter::EmitInnerLoopEpilogue(
         b_->CreateAdd(b_->getInt64(r), current_tile_row), b_->getInt64(k()));
     llvm::Value* lhs_base_pointer =
         vsl_.ComputeOffsetPointer(lhs_, total_offset);
-    ksl_.ForReturnVoid(
+    ksl_.For(
         "dot.inner.epilg.inner", /*start=*/column_start, /*end=*/k(),
         /*step=*/1, [&](llvm::Value* scalar_col) {
           llvm::Value* product =
@@ -813,7 +812,7 @@ void TiledSmallGemmEmitter::HandleResiduesOnN() {
 
   if (n_start != dims().n()) {
     VectorSupportLibrary vsl(scalar_type(), 1, b_, "gemm");
-    ksl_.ForReturnVoid("epi.n", n_start, dims().n(), 1, [&](llvm::Value* n_i) {
+    ksl_.For("epi.n", n_start, dims().n(), 1, [&](llvm::Value* n_i) {
       llvm::Value* n_i_next = b_->CreateAdd(n_i, b_->getInt64(1));
       HandleResiduesOnK(&vsl, n_i, n_i_next);
     });
@@ -924,7 +923,7 @@ void TiledSmallGemmEmitter::EmitTiledGemm(
     VectorSupportLibrary* vsl, int64 tile_size_k, llvm::Value* k_start,
     llvm::Value* k_end, llvm::Value* n_start, llvm::Value* n_end,
     int64 tile_size_m, llvm::Value* m_start, llvm::Value* m_end) {
-  ksl_.ForReturnVoid(
+  ksl_.For(
       "dot.m", m_start, m_end, tile_size_m, [&](llvm::Value* m_i) {
         MemoryTile result_memory_tile(
             vsl, b_, /*matrix=*/result_,
@@ -935,11 +934,11 @@ void TiledSmallGemmEmitter::EmitTiledGemm(
                                    /*matrix_size_along_minor_dim=*/dims().k(),
                                    /*major_dim_offset=*/m_i,
                                    /*tile_size_along_major_dim=*/tile_size_m);
-        ksl_.ForReturnVoid(
+        ksl_.For(
             "dot.n", n_start, n_end, vsl->vector_size(), [&](llvm::Value* n_i) {
               TileVariable result_tile_var(vsl,
                                            result_memory_tile.LoadTile(n_i));
-              ksl_.ForReturnVoid(
+              ksl_.For(
                   "dot.k", k_start, k_end, tile_size_k, [&](llvm::Value* k_i) {
                     MemoryTile rhs_memory_tile(vsl, b_, rhs_, dims().n(), k_i,
                                                tile_size_k);
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
index 4032c2da2f33ee61da8771ae6225a14172cbe6e8..62a4e8d3507a4e678e80c1abea680c030d048de5 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
@@ -111,10 +111,9 @@ IrEmitter::IrEmitter(
 StatusOr<llvm::Function*> IrEmitter::EmitComputation(
     HloComputation* computation, const string& function_name_prefix,
     bool is_top_level_computation,
-    const std::vector<HloInstruction*>* instruction_order) {
+    absl::Span<HloInstruction* const> instruction_order) {
   string function_name = name_uniquer_.GetUniqueName(function_name_prefix);
-  VLOG(2) << "Emitting IR for CPU function [" << function_name_prefix
-          << "]; ordered? " << (instruction_order != nullptr);
+  VLOG(2) << "Emitting IR for CPU function [" << function_name_prefix << "]";
   is_top_level_computation_ = is_top_level_computation;
   num_dynamic_loop_bounds_ = 0;
   if (!computation->root_instruction()->outer_dimension_partitions().empty()) {
@@ -141,11 +140,7 @@ StatusOr<llvm::Function*> IrEmitter::EmitComputation(
   bool use_rdtscp = arch_type_ == llvm::Triple::ArchType::x86 ||
                     arch_type_ == llvm::Triple::ArchType::x86_64;
   profiling_state_ = ProfilingState(use_rdtscp);
-  if (instruction_order == nullptr) {
-    TF_RETURN_IF_ERROR(computation->Accept(this));
-  } else {
-    TF_RETURN_IF_ERROR(computation->AcceptOrdered(this, *instruction_order));
-  }
+  TF_RETURN_IF_ERROR(computation->AcceptOrdered(this, instruction_order));
   llvm::Function* ir_function = compute_function_->function();
   InsertOrDie(&emitted_functions_, computation, ir_function);
   // Delete 'compute_function', finalizing 'ir_function' and restoring caller
@@ -2271,6 +2266,22 @@ Status IrEmitter::HandleCustomCall(HloInstruction* custom_call) {
               /*isVarArg=*/false)));
 
   TF_RETURN_IF_ERROR(EmitTargetAddressForOp(custom_call));
+  // Write the tuple table if the output is a tuple.
+  if (ShapeUtil::IsTuple(custom_call->shape())) {
+    std::vector<llvm::Value*> base_ptrs;
+    for (int i = 0; i < ShapeUtil::TupleElementCount(custom_call->shape());
+         ++i) {
+      const Shape& elem_shape =
+          ShapeUtil::GetTupleElementShape(custom_call->shape(), i);
+      TF_RET_CHECK(!ShapeUtil::IsTuple(elem_shape))
+          << "Nested tuples not implemented";
+      TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice slice,
+                          assignment_.GetUniqueSlice(custom_call, {i}));
+      llvm::Value* addr = EmitBufferPointer(slice, elem_shape);
+      base_ptrs.push_back(addr);
+    }
+    llvm_ir::EmitTuple(GetIrArrayFor(custom_call), base_ptrs, &b_, module_);
+  }
   auto* output_address_arg =
       PointerCast(GetEmittedValueFor(custom_call), i8_ptr_type);
 
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.h b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
index 559a8162a2d53f28ea6817653503c216af90a610..1db75cc8becea80f121289a843d4eb16ee9a8c8a 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
@@ -101,7 +101,7 @@ class IrEmitter : public DfsHloVisitorWithDefault,
   StatusOr<llvm::Function*> EmitComputation(
       HloComputation* computation, const string& function_name_prefix,
       bool is_top_level_computation,
-      const std::vector<HloInstruction*>* instruction_order);
+      absl::Span<HloInstruction* const> instruction_order);
 
   llvm::IRBuilder<>* b() { return &b_; }
 
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc
index f0b65046c14ccec5336abf7c4d05d1d755f783bd..35ae62b42dfa768c6abd0508097d6b235b2ebf54 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc
@@ -112,10 +112,10 @@ TEST_F(ParallelTaskAssignmentTest, InfeedOutfeedOperationNotParallelized) {
   const string hlo_string = R"(
     HloModule TestTaskParallel_infeed_outfeed
     ENTRY InfeedOutfeed {
-      token = token[] after-all()
-      infeed0 = (u32[12345678,2]{1,0}, token[]) infeed(token)
+      token0 = token[] after-all()
+      infeed0 = (u32[12345678,2]{1,0}, token[]) infeed(token0)
       infeed0.data = u32[12345678,2]{1,0} get-tuple-element((u32[12345678,2]{1,0}, token[]) infeed0), index=0
-      ROOT outfeed0 = token[] outfeed(infeed0.data, token)
+      ROOT outfeed0 = token[] outfeed(infeed0.data, token0)
     }
   )";
 
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_matmul.cc b/tensorflow/compiler/xla/service/cpu/runtime_matmul.cc
index a71a85913cfef271bc2a226cb0cf2dd4204499a4..56f018abdd496e804dc4dea5420d400175491db3 100644
--- a/tensorflow/compiler/xla/service/cpu/runtime_matmul.cc
+++ b/tensorflow/compiler/xla/service/cpu/runtime_matmul.cc
@@ -23,6 +23,10 @@ limitations under the License.
 #include "tensorflow/core/platform/dynamic_annotations.h"
 #include "tensorflow/core/platform/types.h"
 
+#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
+#include "tensorflow/core/kernels/eigen_contraction_kernel.h"
+#endif
+
 using tensorflow::int32;
 using tensorflow::int64;
 
diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
index efccadedf27181a4cddf4f1dc3610f7c6db1d821..296f39a4853f2d3f7030209a921001e92c39d609 100644
--- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
+++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
@@ -139,7 +139,7 @@ llvm::JITSymbol SimpleOrcJIT::ResolveRuntimeSymbol(const std::string& name) {
   }
 
   if (func_addr == nullptr) {
-    VLOG(2) << "Unable to resolve runtime symbol: " << name;
+    LOG(ERROR) << "Unable to resolve runtime symbol: " << name;
     return nullptr;
   }
   llvm::JITEvaluatedSymbol symbol_info(reinterpret_cast<uint64_t>(func_addr),
@@ -296,6 +296,9 @@ bool RegisterKnownJITSymbols() {
   REGISTER_LIBM_SYMBOL(sin, double (*)(double));
 #ifdef __APPLE__
   REGISTER_LIBM_SYMBOL(__sincos, void (*)(double, double*, double*));
+  registry->Register("__sincosf_stret",
+                     reinterpret_cast<void*>(__sincosf_stret));
+  registry->Register("__sincos_stret", reinterpret_cast<void*>(__sincos_stret));
 #else
   REGISTER_LIBM_SYMBOL(sincos, void (*)(double, double*, double*));
 #endif
@@ -311,6 +314,13 @@ bool RegisterKnownJITSymbols() {
   registry->Register("memcpy", reinterpret_cast<void*>(memcpy));
   registry->Register("memmove", reinterpret_cast<void*>(memmove));
   registry->Register("memset", reinterpret_cast<void*>(memset));
+
+#ifdef __APPLE__
+  registry->Register("__bzero", reinterpret_cast<void*>(bzero));
+  registry->Register("memset_pattern16",
+                     reinterpret_cast<void*>(memset_pattern16));
+#endif
+
   return true;
 }
 
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc
index fa0e09ff6b5694c0e97963b83c6e541b858a1376..0584c0484f810a03ccccd522163f54535440ef8b 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc
@@ -31,29 +31,27 @@ HloModule RepeatedConstants
 while_body {
   arg_body = f32[2,3,2] parameter(0)
   ROOT const = f32[2,3,2] constant(
-  f32[2,3,2]
     {{{1, 2}, {1001, 1002}, {2001, 2002}},
      {{2, 1}, {2001, 3002}, {2001, 2002}}})
 }
 
 while_cond {
   arg_cond = f32[2,3,2] parameter(0)
-  token = token[] after-all()
-  infeed = (pred[], token[]) infeed(token)
+  token0 = token[] after-all()
+  infeed = (pred[], token[]) infeed(token0)
   ROOT unknown = pred[] get-tuple-element((pred[], token[]) infeed), index=0
 }
 
 ENTRY main {
   param = f32[2,3,2] parameter(0)
   const_a = f32[2,3,2] constant(
-  f32[2,3,2]
     {{{1, 2}, {1001, 1002}, {2001, 2002}},
      {{2, 1}, {2001, 3002}, {2001, 2002}}})
   const_b = f32[2,3,2] while(f32[2,3,2] const_a), condition=while_cond, body=while_body
 
-  token = token[] after-all()
-  out0 = token[] outfeed(f32[2,3,2] const_a, token[] token)
-  ROOT out1 = token[] outfeed(f32[2,3,2] const_b, token[] token)
+  token0 = token[] after-all()
+  out0 = token[] outfeed(f32[2,3,2] const_a, token[] token0)
+  ROOT out1 = token[] outfeed(f32[2,3,2] const_b, token[] token0)
 }
 )";
 
@@ -82,24 +80,24 @@ HloModule RepeatedConstants
 
 while_body {
   arg_body = (f32[2,1]{1,0}, f32[1]{0}) parameter(0)
-  ROOT const = (f32[2,1]{1,0}, f32[1]{0}) constant((f32[2,1], f32[1]) ( f32[2,1] { { 1 }, { 2 } }, {2} ))
+  ROOT const = (f32[2,1]{1,0}, f32[1]{0}) constant(({ { 1 }, { 2 } }, {2} ))
 }
 
 while_cond {
   arg_cond = (f32[2,1]{1,0}, f32[1]{0}) parameter(0)
-  token = token[] after-all()
-  infeed = (pred[], token[]) infeed(token)
+  token0 = token[] after-all()
+  infeed = (pred[], token[]) infeed(token0)
   ROOT unknown = pred[] get-tuple-element((pred[], token[]) infeed), index=0
 }
 
 ENTRY main {
   param = f32[2,3,2] parameter(0)
-  const_a = (f32[2,1]{1,0}, f32[1]{0}) constant((f32[2,1], f32[1]) ( f32[2,1] { { 1 }, { 2 } }, {2} ))
+  const_a = (f32[2,1]{1,0}, f32[1]{0}) constant(( { { 1 }, { 2 } }, {2} ))
   const_b = (f32[2,1]{1,0}, f32[1]{0}) while((f32[2,1]{1,0}, f32[1]{0}) const_a), condition=while_cond, body=while_body
 
-  token = token[] after-all()
-  out0 = () outfeed((f32[2,1]{1,0}, f32[1]{0}) const_a, token[] token)
-  ROOT out1 = () outfeed((f32[2,1]{1,0}, f32[1]{0}) const_b, token[] token)
+  token0 = token[] after-all()
+  out0 = () outfeed((f32[2,1]{1,0}, f32[1]{0}) const_a, token[] token0)
+  ROOT out1 = () outfeed((f32[2,1]{1,0}, f32[1]{0}) const_b, token[] token0)
 }
 )";
 
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_outfeed_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_outfeed_test.cc
index e2c7af541eede5265f274c72f55305549f059839..aab7f0b393881642437f1891256bd138823a3b87 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_outfeed_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_outfeed_test.cc
@@ -28,12 +28,11 @@ HloModule Outfeed
 
 ENTRY main {
   const_a = f32[2,3,2] constant(
-  f32[2,3,2]
     {{{1, 2}, {1001, 1002}, {2001, 2002}},
      {{2, 1}, {2001, 3002}, {2001, 2002}}})
 
-  token = token[] after-all()
-  outfeed = token[] outfeed(f32[2,3,2] const_a, token)
+  token0 = token[] after-all()
+  outfeed = token[] outfeed(f32[2,3,2] const_a, token0)
   ROOT root = () tuple()
 }
 )";
diff --git a/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc b/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6d0472689bf48092ceef2e9792c1358687d707ec
--- /dev/null
+++ b/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc
@@ -0,0 +1,459 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/dynamic_dimension_inference.h"
+#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+
+namespace xla {
+
+namespace {
+bool IsTrivialWindowDimension(const WindowDimension& window_dimension) {
+  return window_dimension.size() == 1 && window_dimension.stride() == 1 &&
+         window_dimension.padding_low() == 0 &&
+         window_dimension.padding_high() == 0 &&
+         window_dimension.window_dilation() == 1 &&
+         window_dimension.base_dilation() == 1;
+}
+}  // namespace
+
+class DynamicDimensionInferenceVisitor : public DfsHloVisitorWithDefault {
+ public:
+  explicit DynamicDimensionInferenceVisitor(
+      const DynamicParameterBinding& param_bindings,
+      DynamicDimensionInference* parent)
+      : param_bindings_(param_bindings), parent_(parent) {}
+
+  Status DefaultAction(HloInstruction* hlo) override;
+
+  static Status Run(HloComputation* computation,
+                    const DynamicParameterBinding& param_bindings,
+                    DynamicDimensionInference* parent) {
+    DynamicDimensionInferenceVisitor visitor(param_bindings, parent);
+    return computation->Accept(&visitor);
+  }
+
+  Status HandleParameter(HloInstruction* hlo) override;
+
+  Status HandleReduce(HloInstruction* hlo) override;
+
+  Status HandleDot(HloInstruction* hlo) override;
+
+  Status HandleTranspose(HloInstruction* hlo) override;
+
+  Status HandleReshape(HloInstruction* hlo) override;
+
+  Status HandlePad(HloInstruction* hlo) override;
+
+  Status HandleBroadcast(HloInstruction* hlo) override;
+
+  Status HandleGetDimensionSize(HloInstruction* hlo) override;
+
+  Status HandleSelect(HloInstruction* hlo) override;
+
+  Status HandleConvolution(HloInstruction* hlo) override;
+
+  Status HandleReduceWindow(HloInstruction* hlo) override;
+
+  Status HandleSelectAndScatter(HloInstruction* hlo) override;
+
+  Status HandleGetTupleElement(HloInstruction* hlo) override;
+
+  Status HandleElementwiseUnary(HloInstruction* hlo) override;
+
+  Status HandleElementwiseBinary(HloInstruction* hlo) override;
+
+ private:
+  using OperandDynamicDimensionFn = std::function<Status(
+      HloInstruction* operand, ShapeIndex index, int64 dimension,
+      int64 operand_index, HloInstruction* dynamic_size)>;
+
+  Status ForEachOperandDynamicDimension(HloInstruction* inst,
+                                        const OperandDynamicDimensionFn&);
+
+  // Pass through a dynamic dimension from the input to the output with the same
+  // value and index in the shape. This is a helper function to handle trivial
+  // instructions like elementwise operations.
+  Status PassThroughDynamicDimension(HloInstruction*);
+
+  // The dynamic parameter bindings of this computation.
+  const DynamicParameterBinding& param_bindings_;
+
+  // A pointer to DynamicDimensionInference, used to update the dynamic mapping.
+  DynamicDimensionInference* parent_;
+};
+
+Status DynamicDimensionInferenceVisitor::DefaultAction(HloInstruction* hlo) {
+  return ForEachOperandDynamicDimension(
+      hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension,
+               int64 operand_index, HloInstruction* dynamic_size) {
+        return UnimplementedStrCat(
+            "Asked to propagate a dynamic dimension from hlo ",
+            operand->ToString(), "@", index.ToString(), "@", dimension,
+            " to hlo ", hlo->ToString(), ", which is not implemented.");
+      });
+}
+
+Status DynamicDimensionInferenceVisitor::HandleGetTupleElement(
+    HloInstruction* hlo) {
+  return ForEachOperandDynamicDimension(
+      hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension,
+               int64 operand_index, HloInstruction* dynamic_size) {
+        if (hlo->tuple_index() == index[0]) {
+          ShapeIndex new_index =
+              ShapeIndexView(index).ConsumeFront().ToShapeIndex();
+          parent_->SetDynamicSize(hlo, new_index, dimension, dynamic_size);
+        }
+        return Status::OK();
+      });
+}
+
+Status DynamicDimensionInferenceVisitor::HandleBroadcast(HloInstruction* hlo) {
+  return ForEachOperandDynamicDimension(
+      hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension,
+               int64 operand_index, HloInstruction* dynamic_size) {
+        int64 broadcast_dim = hlo->dimensions(dimension);
+        parent_->SetDynamicSize(hlo, index, broadcast_dim, dynamic_size);
+        return Status::OK();
+      });
+}
+
+Status DynamicDimensionInferenceVisitor::HandlePad(HloInstruction* hlo) {
+  return ForEachOperandDynamicDimension(
+      hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension,
+               int64 operand_index, HloInstruction* dynamic_size) {
+        if (operand_index != 0) {
+          return Unimplemented(
+              "Dynamic dimension on padding value is not supported");
+        }
+        const PaddingConfig_PaddingConfigDimension& padding_config =
+            hlo->padding_config().dimensions(dimension);
+        if (padding_config.interior_padding() == 0 &&
+            padding_config.edge_padding_low() == 0 &&
+            padding_config.edge_padding_high() == 0) {
+          parent_->SetDynamicSize(hlo, {}, dimension, dynamic_size);
+          return Status::OK();
+        } else {
+          return Unimplemented(
+              "Dynamic dimension propagation on padding dimension is not "
+              "supported.");
+        }
+      });
+}
+
+Status DynamicDimensionInferenceVisitor::HandleReduce(HloInstruction* hlo) {
+  return ForEachOperandDynamicDimension(
+      hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension,
+               int64 operand_index, HloInstruction* dynamic_size) {
+        HloInstruction* reduce = hlo;
+        int64 operand_count = reduce->operand_count();
+        CHECK_EQ(operand_count % 2, 0);
+        if (operand_index >= operand_count / 2) {
+          // Init values doesn't have dynamic size.
+          return Status::OK();
+        }
+        if ((absl::c_count(reduce->dimensions(), dimension) != 0)) {
+          // Dimension is to be reduce, stop tracing.
+          return Status::OK();
+        }
+
+        // Find out the new dynamic dimension after reduce.
+        int64 dimensions_not_reduced_count = 0;
+        for (int i = 0; i < ShapeUtil::Rank(operand->shape()); ++i) {
+          if (dimension == i) {
+            parent_->SetDynamicSize(reduce, {}, dimensions_not_reduced_count,
+                                    dynamic_size);
+
+            return Status::OK();
+          }
+          if (absl::c_count(reduce->dimensions(), i) == 0) {
+            dimensions_not_reduced_count++;
+          }
+        }
+
+        return Status::OK();
+      });
+}
+
+Status DynamicDimensionInferenceVisitor::HandleDot(HloInstruction* hlo) {
+  return ForEachOperandDynamicDimension(
+      hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension,
+               int64 operand_index, HloInstruction* dynamic_size) {
+        HloInstruction* dot = hlo;
+        const DotDimensionNumbers& dimension_numbers =
+            dot->dot_dimension_numbers();
+        // A map from the operand dimensions to result dimension.
+        absl::flat_hash_map<int64, int64> result_dim_mapping;
+        int64 current_result_dims = 0;
+        std::unordered_set<int64> batch_dims(
+            dimension_numbers.rhs_batch_dimensions().begin(),
+            dimension_numbers.rhs_batch_dimensions().end());
+
+        for (int64 i : dimension_numbers.rhs_batch_dimensions()) {
+          result_dim_mapping[i] = current_result_dims++;
+        }
+
+        for (int64 i = 0; i < ShapeUtil::Rank(dot->operand(0)->shape()); i++) {
+          if (!absl::c_linear_search(
+                  dimension_numbers.lhs_contracting_dimensions(), i)) {
+            if (operand_index == 0) {
+              result_dim_mapping[i] = current_result_dims;
+            }
+            current_result_dims++;
+          }
+        }
+
+        for (int64 i = 0; i < ShapeUtil::Rank(dot->operand(1)->shape()); i++) {
+          if (!absl::c_linear_search(
+                  dimension_numbers.rhs_contracting_dimensions(), i) &&
+              !absl::c_linear_search(dimension_numbers.rhs_batch_dimensions(),
+                                     i)) {
+            if (operand_index == 1) {
+              result_dim_mapping[i] = current_result_dims;
+            }
+            current_result_dims++;
+          }
+        }
+
+        // Check if the operand dim is in the result shape. If so, add another
+        // work item to trace that dimension.
+        auto iter = result_dim_mapping.find(dimension);
+        if (iter != result_dim_mapping.end()) {
+          parent_->SetDynamicSize(dot, {}, iter->second, dynamic_size);
+        }
+
+        return Status::OK();
+      });
+}
+
+Status DynamicDimensionInferenceVisitor::HandleTranspose(HloInstruction* hlo) {
+  return ForEachOperandDynamicDimension(
+      hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension,
+               int64 operand_index, HloInstruction* dynamic_size) {
+        parent_->SetDynamicSize(hlo, {}, hlo->dimensions()[dimension],
+                                dynamic_size);
+        return Status::OK();
+      });
+}
+
+Status DynamicDimensionInferenceVisitor::HandleConvolution(
+    HloInstruction* hlo) {
+  return ForEachOperandDynamicDimension(
+      hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension,
+               int64 operand_index, HloInstruction* dynamic_size) {
+        HloInstruction* conv = hlo;
+        const ConvolutionDimensionNumbers& dimension_numbers =
+            conv->convolution_dimension_numbers();
+
+        if (operand_index == 0) {
+          if (dimension == dimension_numbers.input_batch_dimension()) {
+            parent_->SetDynamicSize(conv, {},
+                                    dimension_numbers.output_batch_dimension(),
+                                    dynamic_size);
+            return Status::OK();
+          }
+
+          if (dimension == dimension_numbers.input_feature_dimension()) {
+            return Status::OK();
+          }
+        } else {
+          if (dimension == dimension_numbers.kernel_input_feature_dimension()) {
+            return Status::OK();
+          }
+        }
+
+        return Unimplemented("Dynamic Spatial Convolution is not supported: %s",
+                             conv->ToString());
+      });
+}
+
+Status DynamicDimensionInferenceVisitor::HandleGetDimensionSize(
+    HloInstruction*) {
+  // Dynamic dimension doesn't propagate through GetDimensionSize:
+  //
+  //   Input: F32[x, y, z]
+  //     |
+  //   GetDimensionSize(1): U32[]
+  //
+  // The returned value is a scalar, which doesn't have any dynamic dimension in
+  // the shape (although the value contains the real size of the dynamic
+  // dimension of the input).
+  return Status::OK();
+}
+
+Status DynamicDimensionInferenceVisitor::PassThroughDynamicDimension(
+    HloInstruction* hlo) {
+  return ForEachOperandDynamicDimension(
+      hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension,
+               int64 operand_index, HloInstruction* dynamic_size) {
+        parent_->SetDynamicSize(hlo, index, dimension, dynamic_size);
+        return Status::OK();
+      });
+}
+
+Status DynamicDimensionInferenceVisitor::HandleElementwiseUnary(
+    HloInstruction* hlo) {
+  return PassThroughDynamicDimension(hlo);
+}
+
+Status DynamicDimensionInferenceVisitor::HandleSelect(HloInstruction* hlo) {
+  return PassThroughDynamicDimension(hlo);
+}
+
+Status DynamicDimensionInferenceVisitor::HandleElementwiseBinary(
+    HloInstruction* hlo) {
+  return PassThroughDynamicDimension(hlo);
+}
+
+Status DynamicDimensionInferenceVisitor::HandleReshape(HloInstruction* hlo) {
+  return ForEachOperandDynamicDimension(
+      hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension,
+               int64 operand_index, HloInstruction* dynamic_size) {
+        HloInstruction* reshape = hlo;
+        std::vector<std::pair<int64, int64>> unmodified_dims =
+            ShapeUtil::DimensionsUnmodifiedByReshape(operand->shape(),
+                                                     reshape->shape());
+        for (auto& unmodified : unmodified_dims) {
+          if (unmodified.first == dimension) {
+            parent_->SetDynamicSize(reshape, {}, unmodified.second,
+                                    dynamic_size);
+            return Status::OK();
+          }
+        }
+        return Unimplemented(
+            "Dynamic Reshape on modified dimensions is yet not supported: %s",
+            reshape->ToString());
+      });
+}
+
+Status DynamicDimensionInferenceVisitor::HandleReduceWindow(
+    HloInstruction* hlo) {
+  return ForEachOperandDynamicDimension(
+      hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension,
+               int64 operand_index, HloInstruction* dynamic_size) {
+        HloInstruction* reduce_window = hlo;
+        const WindowDimension& window_dimension =
+            reduce_window->window().dimensions(dimension);
+
+        if (!IsTrivialWindowDimension(window_dimension)) {
+          return Unimplemented(
+              "Dynamic Spatial reduce window is not supported: %s",
+              reduce_window->ToString());
+        }
+
+        parent_->SetDynamicSize(reduce_window, {}, dimension, dynamic_size);
+
+        return Status::OK();
+      });
+}
+
+Status DynamicDimensionInferenceVisitor::HandleSelectAndScatter(
+    HloInstruction* hlo) {
+  return ForEachOperandDynamicDimension(
+      hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension,
+               int64 operand_index, HloInstruction* dynamic_size) {
+        HloInstruction* select_and_scatter = hlo;
+        const WindowDimension& window_dimension =
+            select_and_scatter->window().dimensions(dimension);
+
+        if (!IsTrivialWindowDimension(window_dimension)) {
+          return Unimplemented(
+              "Dynamic Spatial select and scatter is not supported: %s",
+              select_and_scatter->ToString());
+        }
+
+        parent_->SetDynamicSize(select_and_scatter, {}, dimension,
+                                dynamic_size);
+
+        return Status::OK();
+      });
+}
+
+Status DynamicDimensionInferenceVisitor::HandleParameter(HloInstruction* hlo) {
+  return param_bindings_.ForEachBinding(
+      [&](const DynamicParameterBinding::DynamicParameter& dynamic_parameter,
+          const DynamicParameterBinding::DynamicDimension& dynamic_dimension) {
+        if (dynamic_dimension.parameter_num != hlo->parameter_number()) {
+          return Status::OK();
+        }
+        HloComputation* computation = hlo->parent();
+        HloInstruction* target_parameter =
+            computation->parameter_instruction(dynamic_dimension.parameter_num);
+
+        HloInstruction* dynamic_size =
+            computation->parameter_instruction(dynamic_parameter.parameter_num);
+        for (int64 i : dynamic_parameter.parameter_index) {
+          dynamic_size =
+              computation->AddInstruction(HloInstruction::CreateGetTupleElement(
+                  ShapeUtil::GetSubshape(dynamic_size->shape(), {i}),
+                  dynamic_size, i));
+        }
+
+        parent_->SetDynamicSize(target_parameter,
+                                dynamic_dimension.parameter_index,
+                                dynamic_dimension.dimension, dynamic_size);
+        return Status::OK();
+      });
+}
+
+Status DynamicDimensionInferenceVisitor::ForEachOperandDynamicDimension(
+    HloInstruction* inst, const OperandDynamicDimensionFn& fn) {
+  for (int64 operand_index = 0; operand_index < inst->operand_count();
+       ++operand_index) {
+    auto iter =
+        parent_->per_hlo_dynamic_dimensions_.find(inst->operand(operand_index));
+    if (iter != parent_->per_hlo_dynamic_dimensions_.end()) {
+      for (auto& dynamic_dimension : iter->second) {
+        HloInstruction* dynamic_size = parent_->GetDynamicSize(
+            dynamic_dimension.inst, dynamic_dimension.index,
+            dynamic_dimension.dim);
+        TF_RETURN_IF_ERROR(fn(dynamic_dimension.inst, dynamic_dimension.index,
+                              dynamic_dimension.dim, operand_index,
+                              dynamic_size));
+      }
+    }
+  }
+  return Status::OK();
+}
+
+/* static */
+StatusOr<DynamicDimensionInference> DynamicDimensionInference::Run(
+    HloModule* module) {
+  VLOG(0) << "Param Config " << module->dynamic_parameter_binding().ToString();
+  DynamicDimensionInference inference(module);
+  TF_RETURN_IF_ERROR(inference.AnalyzeDynamicDimensions());
+  return inference;
+}
+
+DynamicDimensionInference::DynamicDimensionInference(HloModule* module)
+    : module_(module) {}
+
+Status DynamicDimensionInference::AnalyzeDynamicDimensions() {
+  return DynamicDimensionInferenceVisitor::Run(
+      module_->entry_computation(), module_->dynamic_parameter_binding(), this);
+}
+
+HloInstruction* DynamicDimensionInference::GetDynamicSize(
+    HloInstruction* inst, const ShapeIndex& index, int64 dim) const {
+  auto iter = dynamic_mapping_.find(DynamicDimension{inst, index, dim});
+  if (iter != dynamic_mapping_.end()) {
+    return iter->second;
+  }
+  return nullptr;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/dynamic_dimension_inference.h b/tensorflow/compiler/xla/service/dynamic_dimension_inference.h
new file mode 100644
index 0000000000000000000000000000000000000000..164d15bf111a92e3da957f609b54ee0662ef18b1
--- /dev/null
+++ b/tensorflow/compiler/xla/service/dynamic_dimension_inference.h
@@ -0,0 +1,112 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_DYNAMIC_DIMENSION_INFERENCE_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_DYNAMIC_DIMENSION_INFERENCE_H_
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/types/span.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/status.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/platform/macros.h"
+
+namespace xla {
+
+// DynamicDimensionInference analyzes each HLO instruction in a graph and
+// inferences which dimensions are dynamic and which scalar instructions
+// represent the runtime real size of those dynamic dimensions.
+class DynamicDimensionInference {
+ public:
+  static StatusOr<DynamicDimensionInference> Run(HloModule* module);
+
+  string ToString() const;
+
+  // If the dimension `dim` of instruction `inst` at `index` has a dynamic size,
+  // returns a scalar HloInstruction that represents the runtime size of that
+  // dimension. Otherwise returns nullptr.
+  HloInstruction* GetDynamicSize(HloInstruction* inst, const ShapeIndex& index,
+                                 int64 dim) const;
+
+  friend class DynamicDimensionInferenceVisitor;
+
+ private:
+  explicit DynamicDimensionInference(HloModule* module);
+
+  // DynamicDimension is used as a key in the dynamic key-value mapping. It
+  // unambiguously represents a dynamic dimension of a instruction at a given
+  // index.
+  struct DynamicDimension {
+    // HloInstruction that holds the dimension.
+    HloInstruction* inst;
+    // Subshape of the instruction that holds the dimension.
+    ShapeIndex index;
+    // The dimension number of the dynamic dimension at given index of a given
+    // instruction.
+    int64 dim;
+
+    // Artifacts needed to make this struct able to be used as a `key` in absl
+    // maps. "friend" keywords are added so these functions can be found through
+    // ADL.
+    template <typename H>
+    friend H AbslHashValue(H h, const DynamicDimension& m) {
+      return H::combine(std::move(h), m.inst, m.index, m.dim);
+    }
+
+    friend bool operator==(const DynamicDimension& lhs,
+                           const DynamicDimension& rhs) {
+      return lhs.inst == rhs.inst && lhs.index == rhs.index &&
+             lhs.dim == rhs.dim;
+    }
+  };
+
+  // Update the dynamic mapping so that we know dimension `dim` of instruction
+  // `inst` at `index` has a dynamic size, and its runtime size is represented
+  // by a scalar instruction `size`.
+  void SetDynamicSize(HloInstruction* inst, const ShapeIndex& index, int64 dim,
+                      HloInstruction* size) {
+    dynamic_mapping_.try_emplace(DynamicDimension{inst, index, dim}, size);
+    auto iter = per_hlo_dynamic_dimensions_.try_emplace(inst);
+    iter.first->second.emplace(DynamicDimension{inst, index, dim});
+  }
+
+  // AnalyzeDynamicDimensions starts the analysis of the dynamic dimensions in
+  // module_.
+  Status AnalyzeDynamicDimensions();
+
+  // HloModule being analyzed.
+  HloModule* module_;
+
+  // dynamic_mapping_ holds the result of the analysis. It maps a dynamic
+  // dimension to a scalar HloInstruction that represents the real dynamic size
+  // of the dynamic dimension.
+  using DynamicMapping = absl::flat_hash_map<DynamicDimension, HloInstruction*>;
+  DynamicMapping dynamic_mapping_;
+
+  using PerHloDynamicDimensions =
+      absl::flat_hash_map<HloInstruction*,
+                          absl::flat_hash_set<DynamicDimension>>;
+  PerHloDynamicDimensions per_hlo_dynamic_dimensions_;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_DYNAMIC_DIMENSION_INFERENCE_H_
diff --git a/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc b/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ea9ebed45d99797ce4f80376ec3d0b758da3ca17
--- /dev/null
+++ b/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc
@@ -0,0 +1,535 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/dynamic_dimension_inference.h"
+
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_runner.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace op = xla::testing::opcode_matchers;
+
+namespace xla {
+namespace {
+
+class DynamicDimensionInferenceTest : public HloTestBase {
+ protected:
+  DynamicDimensionInferenceTest() : HloTestBase() {
+    module_ = CreateNewVerifiedModule();
+  }
+
+  Status RunInference() {
+    hlo_graph_dumper::MaybeDumpHloModule(*module_, "Before alias analysis");
+    TF_ASSIGN_OR_RETURN(DynamicDimensionInference inference,
+                        DynamicDimensionInference::Run(module_.get()));
+
+    inference_ = absl::make_unique<DynamicDimensionInference>(inference);
+    return Status::OK();
+  }
+
+  HloComputation* GetAdd() {
+    auto embedded_builder = HloComputation::Builder("add");
+    auto lhs = embedded_builder.AddInstruction(HloInstruction::CreateParameter(
+        0, ShapeUtil::MakeShape(F32, {}), "lhs"));
+    auto rhs = embedded_builder.AddInstruction(HloInstruction::CreateParameter(
+        1, ShapeUtil::MakeShape(F32, {}), "rhs"));
+    embedded_builder.AddInstruction(
+        HloInstruction::CreateBinary(lhs->shape(), HloOpcode::kAdd, lhs, rhs));
+    return module_->AddEmbeddedComputation(embedded_builder.Build());
+  }
+
+  std::unique_ptr<HloModule> module_;
+  std::unique_ptr<DynamicDimensionInference> inference_;
+  const Shape scalar_shape_ = ShapeUtil::MakeShape(S32, {});
+};
+
+TEST_F(DynamicDimensionInferenceTest, ParamTest) {
+  auto builder = HloComputation::Builder(TestName());
+  auto input_shape = ShapeUtil::MakeShape(F32, {1, 2, 2});
+
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, input_shape, "param"));
+  auto param2 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, scalar_shape_, "param"));
+
+  module_->AddEntryComputation(builder.Build());
+  // Set up dynamic parameter binding.
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{1, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 1}));
+
+  TF_ASSERT_OK(RunInference());
+  EXPECT_EQ(inference_->GetDynamicSize(param, {}, 1), param2);
+  EXPECT_EQ(inference_->GetDynamicSize(param, {}, 0), nullptr);
+  EXPECT_EQ(inference_->GetDynamicSize(param2, {}, 0), nullptr);
+}
+
+TEST_F(DynamicDimensionInferenceTest, ParamTestTuple) {
+  auto builder = HloComputation::Builder(TestName());
+  auto input_shape = ShapeUtil::MakeShape(F32, {1, 2, 2});
+
+  auto param = builder.AddInstruction(HloInstruction::CreateParameter(
+      0, ShapeUtil::MakeTupleShape({input_shape, scalar_shape_}), "param"));
+
+  module_->AddEntryComputation(builder.Build());
+  // Set up dynamic parameter binding.
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{0, {1}},
+      DynamicParameterBinding::DynamicDimension{0, {0}, 1}));
+
+  TF_ASSERT_OK(RunInference());
+  EXPECT_THAT(inference_->GetDynamicSize(param, {0}, 1),
+              op::GetTupleElement(param, 1));
+
+  EXPECT_EQ(inference_->GetDynamicSize(param, {0}, 0), nullptr);
+}
+
+TEST_F(DynamicDimensionInferenceTest, GetTupleElement) {
+  // When data flows through GTE, the dynamic dimension size keeps the
+  // same, and the index has its front popped.
+  auto builder = HloComputation::Builder(TestName());
+  auto input_shape = ShapeUtil::MakeShape(F32, {1, 2, 2});
+
+  auto param = builder.AddInstruction(HloInstruction::CreateParameter(
+      0, ShapeUtil::MakeTupleShape({input_shape, scalar_shape_}), "param"));
+
+  auto gte = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(input_shape, param, 0));
+
+  module_->AddEntryComputation(builder.Build());
+  // Set up dynamic parameter binding.
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{0, {1}},
+      DynamicParameterBinding::DynamicDimension{0, {0}, 1}));
+
+  TF_ASSERT_OK(RunInference());
+  EXPECT_THAT(inference_->GetDynamicSize(param, {0}, 1),
+              op::GetTupleElement(param, 1));
+
+  EXPECT_THAT(inference_->GetDynamicSize(gte, {}, 1),
+              op::GetTupleElement(param, 1));
+
+  EXPECT_EQ(inference_->GetDynamicSize(param, {0}, 0), nullptr);
+}
+
+TEST_F(DynamicDimensionInferenceTest, ElementwiseTest) {
+  // When data flows through elementwise, the dynamic dimension size keeps the
+  // same.
+  auto builder = HloComputation::Builder(TestName());
+  auto input_shape = ShapeUtil::MakeShape(F32, {1, 2, 2});
+
+  auto data_param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, input_shape, "data_param"));
+  auto size_param = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, scalar_shape_, "size_param"));
+
+  auto* negate = builder.AddInstruction(
+      HloInstruction::CreateUnary(input_shape, HloOpcode::kNegate, data_param));
+
+  module_->AddEntryComputation(builder.Build());
+  // Set up dynamic parameter binding.
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{1, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 1}));
+
+  TF_ASSERT_OK(RunInference());
+  EXPECT_EQ(inference_->GetDynamicSize(negate, {}, 1), size_param);
+}
+
+TEST_F(DynamicDimensionInferenceTest, ReduceTestI) {
+  auto builder = HloComputation::Builder(TestName());
+  auto input_shape = ShapeUtil::MakeShape(F32, {1, 2, 2});
+  auto reduce_shape = ShapeUtil::MakeShape(F32, {2});
+
+  auto data_param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, input_shape, "data_param"));
+  auto size_param = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, scalar_shape_, "size_param"));
+
+  auto negate = builder.AddInstruction(
+      HloInstruction::CreateUnary(input_shape, HloOpcode::kNegate, data_param));
+
+  auto init = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0)));
+
+  auto reduce = builder.AddInstruction(HloInstruction::CreateReduce(
+      reduce_shape, negate, init, {0, 2}, GetAdd()));
+
+  module_->AddEntryComputation(builder.Build());
+
+  // Set up dynamic parameter binding.
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{1, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 1}));
+
+  TF_ASSERT_OK(RunInference());
+  EXPECT_EQ(inference_->GetDynamicSize(reduce, {}, 0), size_param);
+}
+
+TEST_F(DynamicDimensionInferenceTest, ReduceTestII) {
+  // Same as ReduceTestI, but only reduce one dimension.
+  auto builder = HloComputation::Builder(TestName());
+  auto input_shape = ShapeUtil::MakeShape(F32, {1, 2, 2});
+  auto reduce_shape = ShapeUtil::MakeShape(F32, {1, 2});
+
+  auto data_param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, input_shape, "data_param"));
+  auto size_param = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, scalar_shape_, "size_param"));
+
+  auto negate = builder.AddInstruction(
+      HloInstruction::CreateUnary(input_shape, HloOpcode::kNegate, data_param));
+
+  auto init = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0)));
+
+  auto reduce = builder.AddInstruction(
+      HloInstruction::CreateReduce(reduce_shape, negate, init, {1}, GetAdd()));
+
+  module_->AddEntryComputation(builder.Build());
+
+  // Set up dynamic parameter binding.
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{1, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 2}));
+
+  TF_ASSERT_OK(RunInference());
+  EXPECT_EQ(inference_->GetDynamicSize(reduce, {}, 1), size_param);
+  EXPECT_EQ(inference_->GetDynamicSize(reduce, {}, 0), nullptr);
+}
+
+TEST_F(DynamicDimensionInferenceTest, DotTest) {
+  auto builder = HloComputation::Builder(TestName());
+  constexpr int xdim = 3;
+  constexpr int ydim = 2;
+  constexpr int zdim = 1;
+  auto xy_shape = ShapeUtil::MakeShape(F32, {xdim, ydim});
+  auto yz_shape = ShapeUtil::MakeShape(F32, {ydim, zdim});
+  auto xz_shape = ShapeUtil::MakeShape(F32, {xdim, zdim});
+
+  auto* a_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/0, xy_shape, "A"));
+  auto* b_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/1, yz_shape, "B"));
+  auto* size_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/2, scalar_shape_, "size_param"));
+
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  auto dot = builder.AddInstruction(
+      HloInstruction::CreateDot(xz_shape, a_param, b_param, dot_dnums,
+                                HloTestBase::DefaultPrecisionConfig(2)));
+
+  module_->AddEntryComputation(builder.Build());
+
+  // Set up dynamic parameter binding for non-contracting dimension.
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{2, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 0}));
+
+  // Set up binding for contracting dimensions.
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{2, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 1}));
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{2, {}},
+      DynamicParameterBinding::DynamicDimension{1, {}, 0}));
+
+  TF_ASSERT_OK(RunInference());
+  EXPECT_EQ(inference_->GetDynamicSize(dot, {}, 0), size_param);
+  EXPECT_EQ(inference_->GetDynamicSize(dot, {}, 1), nullptr);
+}
+
+TEST_F(DynamicDimensionInferenceTest, ConvolutionTest) {
+  auto builder = HloComputation::Builder(TestName());
+  constexpr int xdim = 3;
+  constexpr int ydim = 2;
+  constexpr int zdim = 1;
+  auto xy_shape = ShapeUtil::MakeShape(F32, {xdim, ydim});
+  auto yz_shape = ShapeUtil::MakeShape(F32, {ydim, zdim});
+  auto zx_shape = ShapeUtil::MakeShape(F32, {zdim, xdim});
+
+  auto* a_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/0, xy_shape, "A"));
+  auto* b_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/1, yz_shape, "B"));
+  auto* size_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/2, scalar_shape_, "size_param"));
+
+  auto dnums = XlaBuilder::CreateDefaultConvDimensionNumbers(0);
+
+  dnums.set_kernel_input_feature_dimension(0);
+  dnums.set_kernel_output_feature_dimension(1);
+  dnums.set_input_batch_dimension(0);
+  dnums.set_output_batch_dimension(1);
+  dnums.set_output_feature_dimension(0);
+
+  Window window;
+
+  auto* conv = builder.AddInstruction(HloInstruction::CreateConvolve(
+      zx_shape, a_param, b_param, /*feature_group_count=*/1, window, dnums,
+      HloTestBase::DefaultPrecisionConfig(2)));
+
+  module_->AddEntryComputation(builder.Build());
+
+  // Set up dynamic parameter binding for non-contracting dimension.
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{2, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 0}));
+
+  // Set up binding for contracting dimensions.
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{2, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 1}));
+
+  TF_ASSERT_OK(RunInference());
+  EXPECT_EQ(inference_->GetDynamicSize(conv, {}, 1), size_param);
+  EXPECT_EQ(inference_->GetDynamicSize(conv, {}, 0), nullptr);
+}
+
+TEST_F(DynamicDimensionInferenceTest, TransposeTest) {
+  // Test the ability to trace unmodified dimensions
+  auto builder = HloComputation::Builder(TestName());
+  auto input_shape = ShapeUtil::MakeShape(F32, {1, 2, 3});
+  auto output_shape = ShapeUtil::MakeShape(F32, {3, 2, 1});
+
+  auto* a_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/0, input_shape, "A"));
+  auto* size_param_1 = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/1, scalar_shape_, "size_param"));
+  auto* size_param_2 = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/2, scalar_shape_, "size_param"));
+  auto* size_param_3 = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/3, scalar_shape_, "size_param"));
+
+  auto* transpose = builder.AddInstruction(
+      HloInstruction::CreateTranspose(output_shape, a_param, {2, 1, 0}));
+
+  module_->AddEntryComputation(builder.Build());
+
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{1, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 0}));
+
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{2, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 1}));
+
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{3, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 2}));
+
+  TF_ASSERT_OK(RunInference());
+  EXPECT_EQ(inference_->GetDynamicSize(transpose, {}, 0), size_param_3);
+  EXPECT_EQ(inference_->GetDynamicSize(transpose, {}, 1), size_param_2);
+  EXPECT_EQ(inference_->GetDynamicSize(transpose, {}, 2), size_param_1);
+}
+
+TEST_F(DynamicDimensionInferenceTest, ReshapeTest) {
+  // Test the ability to trace unmodified reshape dimensions.
+  auto builder = HloComputation::Builder(TestName());
+  auto input_shape = ShapeUtil::MakeShape(F32, {2, 3, 4, 5, 6});
+  auto output_shape = ShapeUtil::MakeShape(F32, {6, 4, 1, 5, 2, 3});
+
+  auto* a_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/0, input_shape, "A"));
+  auto* size_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/1, scalar_shape_, "size_param"));
+
+  auto* reshape = builder.AddInstruction(
+      HloInstruction::CreateReshape(output_shape, a_param));
+
+  module_->AddEntryComputation(builder.Build());
+
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{1, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 2}));
+
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{1, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 3}));
+
+  TF_ASSERT_OK(RunInference());
+  EXPECT_EQ(inference_->GetDynamicSize(reshape, {}, 0), nullptr);
+  EXPECT_EQ(inference_->GetDynamicSize(reshape, {}, 1), size_param);
+  EXPECT_EQ(inference_->GetDynamicSize(reshape, {}, 2), nullptr);
+  EXPECT_EQ(inference_->GetDynamicSize(reshape, {}, 3), size_param);
+  EXPECT_EQ(inference_->GetDynamicSize(reshape, {}, 4), nullptr);
+  EXPECT_EQ(inference_->GetDynamicSize(reshape, {}, 5), nullptr);
+}
+
+TEST_F(DynamicDimensionInferenceTest, ReshapeTestUnimplemented) {
+  // Test the ability to trace unmodified reshape dimensions.
+  auto builder = HloComputation::Builder(TestName());
+  auto input_shape = ShapeUtil::MakeShape(F32, {2, 3, 4, 5, 6});
+  auto output_shape = ShapeUtil::MakeShape(F32, {6, 4, 1, 5, 2, 3});
+
+  auto* a_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/0, input_shape, "A"));
+
+  builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/1, scalar_shape_, "size_param"));
+
+  builder.AddInstruction(HloInstruction::CreateReshape(output_shape, a_param));
+
+  module_->AddEntryComputation(builder.Build());
+
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{1, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 1}));
+
+  Status status = RunInference();
+  EXPECT_EQ(status.code(), tensorflow::error::UNIMPLEMENTED);
+}
+
+TEST_F(DynamicDimensionInferenceTest, BroadcastTest) {
+  // Test the ability to trace broadcast dimension.
+  auto builder = HloComputation::Builder(TestName());
+  auto input_shape = ShapeUtil::MakeShape(F32, {2});
+  auto output_shape = ShapeUtil::MakeShape(F32, {3, 2, 4});
+
+  auto* a_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/0, input_shape, "A"));
+  auto* size_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/1, scalar_shape_, "size_param"));
+
+  auto* broadcast = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(output_shape, a_param, {1}));
+
+  module_->AddEntryComputation(builder.Build());
+
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{1, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 0}));
+
+  TF_ASSERT_OK(RunInference());
+  EXPECT_EQ(inference_->GetDynamicSize(broadcast, {}, 0), nullptr);
+  EXPECT_EQ(inference_->GetDynamicSize(broadcast, {}, 1), size_param);
+  EXPECT_EQ(inference_->GetDynamicSize(broadcast, {}, 2), nullptr);
+}
+
+TEST_F(DynamicDimensionInferenceTest, ReduceWindowBatchTest) {
+  // Test the ability to trace reduce window batch dimensions.
+  auto builder = HloComputation::Builder(TestName());
+  auto input_shape = ShapeUtil::MakeShape(F32, {2, 4, 4});
+  auto output_shape = ShapeUtil::MakeShape(F32, {2, 2, 2});
+
+  Window window;
+  // First dimension is unchanged.
+  WindowDimension* batch_dim = window.add_dimensions();
+  batch_dim->set_size(1);
+  batch_dim->set_stride(1);
+  batch_dim->set_padding_low(0);
+  batch_dim->set_padding_high(0);
+  batch_dim->set_window_dilation(1);
+  batch_dim->set_base_dilation(1);
+
+  // Second and third dimension are reduced.
+  for (int64 i = 0; i < 2; ++i) {
+    WindowDimension* dim = window.add_dimensions();
+    dim->set_size(2);
+    dim->set_stride(2);
+    dim->set_padding_low(0);
+    dim->set_padding_high(0);
+    dim->set_window_dilation(1);
+    dim->set_base_dilation(1);
+  }
+
+  auto* a_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/0, input_shape, "A"));
+  auto* size_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/1, scalar_shape_, "size_param"));
+
+  auto init = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0)));
+
+  auto* reduce_window =
+      builder.AddInstruction(HloInstruction::CreateReduceWindow(
+          output_shape, a_param, init, window, GetAdd()));
+
+  module_->AddEntryComputation(builder.Build());
+
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{1, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 0}));
+
+  TF_ASSERT_OK(RunInference());
+  EXPECT_EQ(inference_->GetDynamicSize(reduce_window, {}, 0), size_param);
+}
+
+TEST_F(DynamicDimensionInferenceTest, SelectAndScatterTest) {
+  // Test the ability to trace select and scatter batch dimensions.
+  auto builder = HloComputation::Builder(TestName());
+  auto input_shape = ShapeUtil::MakeShape(F32, {2, 4, 4});
+  auto output_shape = ShapeUtil::MakeShape(F32, {2, 2, 2});
+
+  Window window;
+  // First dimension is unchanged.
+  WindowDimension* batch_dim = window.add_dimensions();
+  batch_dim->set_size(1);
+  batch_dim->set_stride(1);
+  batch_dim->set_padding_low(0);
+  batch_dim->set_padding_high(0);
+  batch_dim->set_window_dilation(1);
+  batch_dim->set_base_dilation(1);
+
+  // Second and third dimension are reduced.
+  for (int64 i = 0; i < 2; ++i) {
+    WindowDimension* dim = window.add_dimensions();
+    dim->set_size(2);
+    dim->set_stride(2);
+    dim->set_padding_low(0);
+    dim->set_padding_high(0);
+    dim->set_window_dilation(1);
+    dim->set_base_dilation(1);
+  }
+
+  auto* a_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/0, input_shape, "A"));
+  auto* size_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/1, scalar_shape_, "size_param"));
+
+  auto init = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0)));
+
+  auto* reduce_window =
+      builder.AddInstruction(HloInstruction::CreateReduceWindow(
+          output_shape, a_param, init, window, GetAdd()));
+
+  module_->AddEntryComputation(builder.Build());
+
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{1, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 0}));
+
+  TF_ASSERT_OK(RunInference());
+  EXPECT_EQ(inference_->GetDynamicSize(reduce_window, {}, 0), size_param);
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_padding_legalization.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_padding_legalization.cc
index d7829045cc127deaa4c2c9b705dca5285d704af2..3a09d4d4716950a09d65dd093272482d55ac5c27 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_padding_legalization.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_padding_legalization.cc
@@ -43,13 +43,14 @@ bool IsForwardConvolutionCanonical(const HloInstruction& conv) {
 // dilation), returns kPad and/or kSlice instructions that explicitly apply the
 // padding; otherwise returns the original input operand. When there is both
 // positive padding (including dilation) and negative padding, we insert both
-// kPad and kSlice.
+// kPad and kSlice. Modifies 'conv_window' accordingly if any padding was moved
+// into a kPad or kSlice op.
 HloInstruction* MaybePaddedAndSlicedInput(
-    const Window& conv_window, const ConvolutionDimensionNumbers& conv_dnums,
+    Window* conv_window, const ConvolutionDimensionNumbers& conv_dnums,
     HloInstruction* input) {
   HloComputation* computation = input->parent();
-  if (!window_util::HasSymmetricPadding(conv_window) ||
-      window_util::HasBaseDilation(conv_window)) {
+  if (!window_util::HasSymmetricPadding(*conv_window) ||
+      window_util::HasBaseDilation(*conv_window)) {
     // If padding is uneven or has dilation, we insert a kPad instruction that
     // applies positive padding and dilation.
     //
@@ -62,12 +63,21 @@ HloInstruction* MaybePaddedAndSlicedInput(
         MakeNoPaddingConfig(input->shape().dimensions_size());
     for (size_t i = 0; i < conv_dnums.input_spatial_dimensions().size(); ++i) {
       int64 dim = conv_dnums.input_spatial_dimensions(i);
-      padding_config.mutable_dimensions(dim)->set_edge_padding_low(
-          std::max<int64>(0LL, conv_window.dimensions(i).padding_low()));
-      padding_config.mutable_dimensions(dim)->set_edge_padding_high(
-          std::max<int64>(0LL, conv_window.dimensions(i).padding_high()));
-      padding_config.mutable_dimensions(dim)->set_interior_padding(
-          conv_window.dimensions(i).base_dilation() - 1);
+      if (conv_window->dimensions(i).padding_low() > 0) {
+        padding_config.mutable_dimensions(dim)->set_edge_padding_low(
+            conv_window->dimensions(i).padding_low());
+        conv_window->mutable_dimensions(i)->set_padding_low(0);
+      }
+      if (conv_window->dimensions(i).padding_high() > 0) {
+        padding_config.mutable_dimensions(dim)->set_edge_padding_high(
+            conv_window->dimensions(i).padding_high());
+        conv_window->mutable_dimensions(i)->set_padding_high(0);
+      }
+      if (conv_window->dimensions(i).base_dilation() != 1) {
+        padding_config.mutable_dimensions(dim)->set_interior_padding(
+            conv_window->dimensions(i).base_dilation() - 1);
+        conv_window->mutable_dimensions(i)->set_base_dilation(1);
+      }
     }
     PrimitiveType element_type = input->shape().element_type();
     HloInstruction* padding = computation->AddInstruction(
@@ -75,7 +85,7 @@ HloInstruction* MaybePaddedAndSlicedInput(
     input = MakePadHlo(input, padding, padding_config).ValueOrDie();
   }
 
-  if (window_util::HasNegativePadding(conv_window)) {
+  if (window_util::HasNegativePadding(*conv_window)) {
     // If the window has negative padding, insert a kSlice that explicitly
     // applies negative padding.
     //
@@ -89,10 +99,14 @@ HloInstruction* MaybePaddedAndSlicedInput(
       int64 dim = conv_dnums.input_spatial_dimensions(i);
       // If dimension "dim" has negative padding, increase the start index or
       // decrement the limit index by the amount of negative padding.
-      start_indices[dim] +=
-          std::max<int64>(0LL, -conv_window.dimensions(i).padding_low());
-      limit_indices[dim] -=
-          std::max<int64>(0LL, -conv_window.dimensions(i).padding_high());
+      if (conv_window->dimensions(i).padding_low() < 0) {
+        start_indices[dim] += -conv_window->dimensions(i).padding_low();
+        conv_window->mutable_dimensions(i)->set_padding_low(0);
+      }
+      if (conv_window->dimensions(i).padding_high() < 0) {
+        limit_indices[dim] -= -conv_window->dimensions(i).padding_high();
+        conv_window->mutable_dimensions(i)->set_padding_high(0);
+      }
     }
 
     input =
@@ -140,25 +154,22 @@ bool CudnnConvPaddingLegalization::CanonicalizeForwardConvolution(
 
   // Insert slices and/or pads between the convolution and its input and/or
   // kernel operand.
+  Window new_conv_window = conv->window();
   HloInstruction* new_input = MaybePaddedAndSlicedInput(
-      conv->window(), conv->convolution_dimension_numbers(),
+      &new_conv_window, conv->convolution_dimension_numbers(),
       conv->mutable_operand(0));
   HloInstruction* new_kernel =
-      MaybePaddedKernel(conv->window(), conv->convolution_dimension_numbers(),
+      MaybePaddedKernel(new_conv_window, conv->convolution_dimension_numbers(),
                         conv->mutable_operand(1));
 
-  // Remove the padding from convolution's window field. These paddings are
-  // made explicit with the inserted pads.
-  Window new_conv_window = conv->window();
+  // Remove the window dilation from convolution's window field. These paddings
+  // are made explicit with the pads inserted by MaybePaddedKernel().
   for (size_t i = 0; i < new_conv_window.dimensions_size(); ++i) {
     WindowDimension* dim = new_conv_window.mutable_dimensions(i);
 
     // The size of the kernel may have changed so update the Window to match.
     dim->set_size(new_kernel->shape().dimensions(
         conv->convolution_dimension_numbers().kernel_spatial_dimensions(i)));
-    dim->set_padding_low(0);
-    dim->set_padding_high(0);
-    dim->set_base_dilation(1);
     dim->set_window_dilation(1);
   }
 
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter_test.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter_test.cc
index 443883a89f66a747def1049bc5afb53fec3c2409..73af18f87aeeedaefac4fc37fb7b6f78f506bb4f 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter_test.cc
@@ -599,7 +599,7 @@ TEST_F(CudnnConvRewriterTest, BackwardInputConvolveConstantFilter) {
   Array4D<float> constant_arr(4, 4, 2, 2);
   constant_arr.FillIota(0);
   string constant_str =
-      LiteralUtil::CreateR4FromArray4D(constant_arr).ToString();
+      LiteralUtil::CreateR4FromArray4D(constant_arr).ToStringWithoutShape();
 
   const string module_str = absl::StrFormat(R"(
     HloModule test
diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
index 6dcdaf1cfe06e446deed847aaf29088a7ed10e13..2ab754a471070d5f90a3eaebd0600ff180d2fe5d 100644
--- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
@@ -161,6 +161,16 @@ StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitFloatBinaryOp(
   PrimitiveType lhs_input_type = op->operand(0)->shape().element_type();
   PrimitiveType rhs_input_type = op->operand(1)->shape().element_type();
   PrimitiveType output_type = op->shape().element_type();
+  HloOpcode opcode = op->opcode();
+
+  if (hlo_module_config_.debug_options().xla_gpu_enable_fast_min_max() &&
+      (opcode == HloOpcode::kMaximum || opcode == HloOpcode::kMinimum)) {
+    return llvm_ir::EmitCallToIntrinsic(
+        opcode == HloOpcode::kMaximum ? llvm::Intrinsic::maxnum
+                                      : llvm::Intrinsic::minnum,
+        {lhs_value, rhs_value}, {lhs_value->getType()}, b_);
+  }
+
   switch (op->opcode()) {
     case HloOpcode::kRemainder: {
       return EmitLibdeviceMathCall("__nv_fmod", {lhs_value, rhs_value},
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc
index 2ffc8bfb49b205dced0d540ba72426e72d95e596..29756d27260b0f41b2dd4b649ea9b1610ff90268 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc
@@ -369,7 +369,7 @@ TEST_F(LayoutAssignmentTest, SortLayout) {
   const char* hlo_text = R"(
   HloModule SortLayout
   ENTRY sort {
-    keys = f32[3,2]{0,1} constant(f32[3,2]{0,1}{{0,1},{0,1},{0,1}})
+    keys = f32[3,2]{0,1} constant({{0,1},{0,1},{0,1}})
     values = f32[2,3]{1,0} parameter(0)
     transpose = f32[3,2]{1,0} transpose(values), dimensions={1,0}
     ROOT sort = (f32[3,2]{1,0}, f32[3,2]{1,0}) sort(keys, transpose),
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
index 42fb38dffae31b0f4322216545027e067cab250d..33e41a2782b5932430eea621d3cea2c6634f292f 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
@@ -268,5 +268,17 @@ string CudnnConvKindToString(CudnnConvKind kind) {
   }
 }
 
+llvm::Value* IsBlock0Thread0(llvm::IRBuilder<>* b) {
+  return b->CreateAnd(
+      b->CreateICmpEQ(
+          b->getInt32(0),
+          llvm_ir::EmitCallToIntrinsic(
+              llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x, {}, {}, b)),
+      b->CreateICmpEQ(
+          b->getInt32(0),
+          llvm_ir::EmitCallToIntrinsic(
+              llvm::Intrinsic::nvvm_read_ptx_sreg_ctaid_x, {}, {}, b)));
+}
+
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h
index f373d4a8393a047aba599b0fae954e98a740161e..ebf4d926b7a280e10b09a2532caba7ad6ab3ceb2 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h
@@ -155,6 +155,10 @@ llvm::Value* EmitPrintf(absl::string_view fmt,
 llvm::Value* EmitFullWarpShuffleDown(llvm::Value* value, llvm::Value* offset,
                                      llvm::IRBuilder<>* builder);
 
+// Emits code that determines whether the current thread is thread 0 within
+// block 0 of the kernel.
+llvm::Value* IsBlock0Thread0(llvm::IRBuilder<>* b);
+
 }  // namespace gpu
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
index 31591914cc553f0f5ecd81cb514faa1dc56ea041..6693f66d62d8b04d1b78e001fdb515b34539c67f 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
@@ -63,9 +63,6 @@ IrEmitter::IrEmitter(const HloModuleConfig& hlo_module_config,
                 &ir_emitter_context->buffer_assignment(), &b_, module_,
                 is_nested),
       hlo_module_config_(hlo_module_config) {
-  b_.setFastMathFlags(llvm_ir::GetFastMathFlags(
-      /*fast_math_enabled=*/hlo_module_config.debug_options()
-          .xla_gpu_enable_fast_math()));
 }
 
 Status IrEmitter::DefaultAction(HloInstruction* hlo) {
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index bbe1583c01167b3fbb50e066ad59a48e45f5e683..87d16c0afcc3c115f652558b5d8c24606ff56733 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -2059,8 +2059,16 @@ Status IrEmitterUnnested::EmitTargetElementLoopInThunk(
             GetIndexTypeForKernel(&hlo, launch_dimensions.launch_bound(), &b_));
   }
 
-  // For multioutput fusion, we need to emit each operand and the root.
+  // Emit the tuple pointers in one thread.  We could do this at any point in
+  // the kernel, but we do it at the beginning in the hopes of reducing register
+  // pressure, since we touch threadIdx.x and blockIdx.x at the beginning of the
+  // kernel *anyway*.
   std::vector<IrArray> output_arrays = ConstructIrArrayForOutputs(hlo);
+  KernelSupportLibrary{&b_}.If("emit_mof_tuple", IsBlock0Thread0(&b_), [&] {
+    llvm_ir::EmitTuple(GetIrArray(hlo, hlo), output_arrays, &b_, module_);
+  });
+
+  // For multioutput fusion, we need to emit each operand and the root.
   TF_RETURN_IF_ERROR(
       ParallelLoopEmitter(element_generator, output_arrays, launch_dimensions,
                           &b_, unroll_factor)
@@ -2069,8 +2077,6 @@ Status IrEmitterUnnested::EmitTargetElementLoopInThunk(
                         &hlo, launch_dimensions.launch_bound(), &b_)));
 
   b_.SetInsertPoint(b_.GetInsertBlock()->getTerminator());
-  llvm_ir::EmitTuple(GetIrArray(hlo, hlo), output_arrays, &b_, module_);
-
   return Status::OK();
 }
 
@@ -2130,65 +2136,36 @@ int IrEmitterUnnested::ConstructInputReducedShapeAndCastInputIrArrayToShape(
 
 namespace {
 
-// Reads thread_idx.x and converts it to a (y,x) coordinate, assuming that the
-// thread lives within a square tile of size tile_size (so thread blocks are of
-// size tile_size * tile_size).
-std::tuple<llvm::Value*, llvm::Value*> CalculateYXCoordinateWithinTile(
-    llvm::IRBuilder<>* builder, llvm::Value* tile_size,
-    int64 threads_per_tile) {
-  // Calculate the starting element coordinate within a tile for the current
-  // thread, (y, x) from thread_id.
-  llvm::Value* thread_id = llvm_ir::EmitCallToIntrinsic(
-      llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x, {}, {}, builder);
-  llvm_ir::AddRangeMetadata(0, threads_per_tile,
-                            llvm::cast<llvm::Instruction>(thread_id));
-  thread_id = builder->CreateIntCast(thread_id, tile_size->getType(),
-                                     /*isSigned=*/true, "thread.id.x");
-  auto x = builder->CreateURem(thread_id, tile_size);
-  auto y = builder->CreateUDiv(thread_id, tile_size);
-  return std::make_tuple(y, x);
-}
-
-// Reads block_idx.x, casts it to type index_ty, and adds the assumption that
-// it's in the range [0, num_blocks].
-llvm::Value* GetBlockIdx(llvm::IRBuilder<>* builder, llvm::Type* index_ty,
-                         int64 num_blocks) {
-  llvm::Value* block_id = llvm_ir::EmitCallToIntrinsic(
-      llvm::Intrinsic::nvvm_read_ptx_sreg_ctaid_x, {}, {}, builder);
-  llvm_ir::AddRangeMetadata(0, num_blocks,
-                            llvm::cast<llvm::Instruction>(block_id));
-  return builder->CreateIntCast(block_id, index_ty, /*isSigned=*/true,
-                                "block.id.x");
-}
-
-void EmitFullTile(const KernelMappingScheme* mapping_scheme,
-                  const IrArray::Index& tile_origin_index,
-                  llvm::IRBuilder<>* builder, llvm::Value* y, llvm::Value* x,
-                  llvm::Type* index_ty,
-                  const std::function<void(const IrArray::Index&, llvm::Value*,
-                                           llvm::Value*)>& emit_elem_function) {
+void EmitFullElementalTile(
+    const KernelMappingScheme* mapping_scheme,
+    const IrArray::Index& tile_origin_index, const string& loop_name,
+    KernelSupportLibrary* ksl, llvm::IRBuilder<>* builder, llvm::Value* y,
+    llvm::Value* x, llvm::Type* index_ty,
+    const std::function<void(const IrArray::Index&, llvm::Value*,
+                             llvm::Value*)>& emit_elem_function) {
   int64 num_threads_x = mapping_scheme->GetNumberOfThreadsForDimensionX();
   int64 num_threads_y = mapping_scheme->GetNumberOfThreadsForDimensionY();
   int64 tile_size_x = mapping_scheme->GetTileSizeForDimensionX();
   int64 tile_size_y = mapping_scheme->GetTileSizeForDimensionY();
-  for (int64 i = 0; i < tile_size_y; i += num_threads_y) {
-    IrArray::Index source_idx_y =
-        tile_origin_index.AddOffsetToDim(llvm::ConstantInt::get(index_ty, i),
-                                         KernelMappingScheme::DimY, builder);
-    llvm::Value* y_loc =
-        builder->CreateAdd(llvm::ConstantInt::get(index_ty, i), y);
-    for (int64 j = 0; j < tile_size_x; j += num_threads_x) {
-      IrArray::Index source_idx =
-          source_idx_y.AddOffsetToDim(llvm::ConstantInt::get(index_ty, j),
-                                      KernelMappingScheme::DimX, builder);
-      llvm::Value* x_loc =
-          builder->CreateAdd(llvm::ConstantInt::get(index_ty, j), x);
-      emit_elem_function(source_idx, y_loc, x_loc);
-    }
-  }
-}
-
-void EmitPartialTile(
+  ksl->For(loop_name + "_y", /*start=*/llvm::ConstantInt::get(index_ty, 0),
+           /*end=*/llvm::ConstantInt::get(index_ty, tile_size_y),
+           /*step=*/llvm::ConstantInt::get(index_ty, num_threads_y),
+           [&](llvm::Value* y_indvar) {
+             IrArray::Index source_idx_y = tile_origin_index.AddOffsetToDim(
+                 y_indvar, KernelMappingScheme::DimY, builder);
+             llvm::Value* y_loc = builder->CreateAdd(y_indvar, y);
+             for (int64 j = 0; j < tile_size_x; j += num_threads_x) {
+               IrArray::Index source_idx = source_idx_y.AddOffsetToDim(
+                   llvm::ConstantInt::get(index_ty, j),
+                   KernelMappingScheme::DimX, builder);
+               llvm::Value* x_loc =
+                   builder->CreateAdd(llvm::ConstantInt::get(index_ty, j), x);
+               emit_elem_function(source_idx, y_loc, x_loc);
+             }
+           });
+}
+
+void EmitPartialElementalTile(
     const KernelMappingScheme* mapping_scheme,
     const IrArray::Index& tile_origin_index, const string& loop_name,
     KernelSupportLibrary* ksl, llvm::IRBuilder<>* builder, llvm::Value* y,
@@ -2207,7 +2184,7 @@ void EmitPartialTile(
     llvm::Value* x_loc =
         builder->CreateAdd(llvm::ConstantInt::get(index_ty, j), x);
 
-    ksl->IfReturnVoid(
+    ksl->If(
         loop_name + "_x_in_tile", builder->CreateICmpULT(x_loc, tile_width),
         [&] {
           // tile_height_bound =
@@ -2219,13 +2196,13 @@ void EmitPartialTile(
           llvm::Value* tile_height_bound = builder->CreateMul(
               ceiling_of_ratio,
               llvm::ConstantInt::get(index_ty, num_threads_y));
-          ksl->ForReturnVoid(
+          ksl->For(
               loop_name, /*start=*/llvm::ConstantInt::get(index_ty, 0),
               /*end=*/tile_height_bound,
               /*step=*/llvm::ConstantInt::get(index_ty, num_threads_y),
               [&](llvm::Value* y_indvar) {
                 llvm::Value* y_loc = builder->CreateAdd(y_indvar, y);
-                ksl->IfReturnVoid(
+                ksl->If(
                     loop_name + "_y_in_tile",
                     builder->CreateICmpULT(y_loc, tile_height), [&] {
                       emit_elem_function(
@@ -2257,7 +2234,7 @@ void EmitTiledElementalCodeWithBoundsCheck(
   int64 tile_size_y = mapping_scheme->GetTileSizeForDimensionY();
   llvm::Type* index_ty = tile_width->getType();
 
-  ksl->IfReturnVoid(
+  ksl->If(
       loop_name + "_full_tile",
       builder->CreateAnd(
           builder->CreateICmpEQ(llvm::ConstantInt::get(index_ty, tile_size_x),
@@ -2265,13 +2242,13 @@ void EmitTiledElementalCodeWithBoundsCheck(
           builder->CreateICmpEQ(llvm::ConstantInt::get(index_ty, tile_size_y),
                                 tile_height)),
       [&] {
-        EmitFullTile(mapping_scheme, tile_origin_index, builder, y, x, index_ty,
-                     emit_elem_function);
+        EmitFullElementalTile(mapping_scheme, tile_origin_index, loop_name, ksl,
+                              builder, y, x, index_ty, emit_elem_function);
       },
       [&] {
-        EmitPartialTile(mapping_scheme, tile_origin_index, loop_name, ksl,
-                        builder, y, x, tile_height, tile_width, index_ty,
-                        emit_elem_function);
+        EmitPartialElementalTile(mapping_scheme, tile_origin_index, loop_name,
+                                 ksl, builder, y, x, tile_height, tile_width,
+                                 index_ty, emit_elem_function);
       });
 }
 }  // namespace
@@ -2381,14 +2358,14 @@ class ReductionCodegenInfo : public IrEmitterUnnested::KernelCodegenInfo {
   AddressVector* GetMutablePartialResultAddresses() {
     return &partial_result_addresses_;
   }
-  const AddressVector& GetPartialResultAddresses() const {
+  absl::Span<llvm::AllocaInst* const> GetPartialResultAddresses() const {
     return partial_result_addresses_;
   }
 
   AddressVector* GetMutableReductionInputAddresses() {
     return &reduction_input_addresses_;
   }
-  const AddressVector& GetReductionInputAddresses() const {
+  absl::Span<llvm::AllocaInst* const> GetReductionInputAddresses() const {
     return reduction_input_addresses_;
   }
 
@@ -2401,7 +2378,7 @@ class ReductionCodegenInfo : public IrEmitterUnnested::KernelCodegenInfo {
   InlinedVector<ShapeIndex, 1>* GetMutableReductionOutputShapeIndices() {
     return &reduction_output_shape_indices_;
   }
-  const InlinedVector<ShapeIndex, 1>& GetReductionOutputShapeIndices() const {
+  absl::Span<const ShapeIndex> GetReductionOutputShapeIndices() const {
     return reduction_output_shape_indices_;
   }
 
@@ -2556,8 +2533,8 @@ void IrEmitterUnnested::EmitPrologueForReduction(
 }
 
 void IrEmitterUnnested::EmitFullWarpShuffleDownLoopForAllReduces(
-    const InlinedVector<HloComputation*, 1>& reducers,
-    const AddressVector& partial_result_addresses) {
+    absl::Span<HloComputation* const> reducers,
+    absl::Span<llvm::AllocaInst* const> partial_result_addresses) {
   for (int distance = 16; distance >= 1; distance /= 2) {
     for (int i = 0; i != reducers.size(); ++i) {
       llvm::Type* element_type =
@@ -2589,11 +2566,11 @@ void IrEmitterUnnested::EmitEpilogueForReduction(
   ReductionCodegenInfo* reduction_info =
       static_cast<ReductionCodegenInfo*>(kernel_info);
   int num_reduces = reduction_info->GetNumberOfReduces();
-  const AddressVector& partial_result_addresses =
+  absl::Span<llvm::AllocaInst* const> partial_result_addresses =
       reduction_info->GetPartialResultAddresses();
   const InlinedVector<HloComputation*, 1>& reducers =
       reduction_info->GetReducers();
-  const InlinedVector<ShapeIndex, 1>& reduction_output_shape_indices =
+  absl::Span<const ShapeIndex> reduction_output_shape_indices =
       reduction_info->GetReductionOutputShapeIndices();
 
   if (reduction_info->IsRowReduction()) {
@@ -2713,9 +2690,9 @@ void IrEmitterUnnested::EmitTileElementForReduction(
       reduction_info->GetKernelMappingScheme()->GetUnnormalizedIndex(
           index,
           GetFirstReduceInstruction(output_instructions)->operand(0)->shape());
-  const AddressVector& partial_reduction_result_addresses =
+  absl::Span<llvm::AllocaInst* const> partial_reduction_result_addresses =
       reduction_info->GetPartialResultAddresses();
-  const AddressVector& reduction_input_addresses =
+  absl::Span<llvm::AllocaInst* const> reduction_input_addresses =
       reduction_info->GetReductionInputAddresses();
   const InlinedVector<HloComputation*, 1>& reducers =
       reduction_info->GetReducers();
@@ -2774,15 +2751,14 @@ void IrEmitterUnnested::EmitBlock(const TileGenerator& emit_one_tile,
               Select(ICmpEQ(last_block_for_dim, block_id_for_dim),
                      last_block_size_for_dim, block_size_for_dim);
 
-          ksl.ForReturnVoid(
-              loop_name,
-              /*start=*/index_typed_constant(0),
-              /*end=*/num_tiles_in_block,
-              /*step=*/1, [&](llvm::Value* block_dim_induction_var) {
-                IrArray::Index tile_index = starting_tile.AddOffsetToDim(
-                    block_dim_induction_var, dim_id, &b_);
-                emit_next_block_dim(tile_index);
-              });
+          ksl.For(loop_name,
+                  /*start=*/index_typed_constant(0),
+                  /*end=*/num_tiles_in_block,
+                  /*step=*/1, [&](llvm::Value* block_dim_induction_var) {
+                    IrArray::Index tile_index = starting_tile.AddOffsetToDim(
+                        block_dim_induction_var, dim_id, &b_);
+                    emit_next_block_dim(tile_index);
+                  });
         }
       };
 
@@ -2864,14 +2840,40 @@ LaunchDimensions IrEmitterUnnested::EmitKernel(
             << llvm_ir::DumpToString(*param_shmem_buffers[id]);
   }
 
-  LaunchDimensions launch_dimensions = LaunchDimensions(
-      mapping_scheme->GetNumberOfBlocks(), mapping_scheme->GetThreadsPerTile());
-  llvm::Type* index_ty = GetIndexTypeForKernel(
-      unnested_hlo, launch_dimensions.launch_bound(), &b_);
+  const ReductionCodegenInfo* reduction_info =
+      dynamic_cast<const ReductionCodegenInfo*>(kernel_info);
+  bool is_column_reduction =
+      (reduction_info && !reduction_info->IsRowReduction());
+
+  LaunchDimensions launch_dimensions =
+      LaunchDimensions(mapping_scheme->GetNumberOfBlocks(),
+                       mapping_scheme->GetThreadsPerBlock());
+
+  // TODO(b/110211620): Enable int32 index type for column reduction.
+  llvm::Type* index_ty =
+      is_column_reduction
+          ? b_.getInt64Ty()
+          : GetIndexTypeForKernel(unnested_hlo,
+                                  launch_dimensions.launch_bound(), &b_);
+
   auto index_typed_constant = [&](uint64 c) -> llvm::Constant* {
     return llvm::ConstantInt::get(index_ty, c);
   };
 
+  // For multioutput fusion, one thread needs to output a tuple with pointers to
+  // all the individual outputs.  We could do this at any point in the kernel,
+  // but we do it at the beginning in the hopes of reducing register pressure,
+  // since we touch threadIdx.x and blockIdx.x at the beginning of the kernel
+  // *anyway*.
+  if (!reduction_info && unnested_hlo->IsMultiOutputFusion()) {
+    KernelSupportLibrary{&b_}.If(
+        "emit_mof_tuple", IsBlock0Thread0(&b_), [&] {
+          llvm_ir::EmitTuple(GetIrArray(*unnested_hlo, *unnested_hlo),
+                             ConstructIrArrayForOutputs(*unnested_hlo), &b_,
+                             module_);
+        });
+  }
+
   // For each tiled parameter, cast its input IrArray to the corresponding
   // reduced shape and keep the reduced shape live during IR emission.
   std::vector<IrArray> param_in_reduced_shape_arrays;
@@ -2985,15 +2987,6 @@ LaunchDimensions IrEmitterUnnested::EmitKernel(
     block_epilogue_generator(unnested_hlo, kernel_info);
   }
 
-  // For multioutput fusion, emit a tuple with pointers to all the individual
-  // outputs.
-  if (unnested_hlo->IsMultiOutputFusion()) {
-    std::vector<IrArray> output_arrays =
-        ConstructIrArrayForOutputs(*unnested_hlo);
-    llvm_ir::EmitTuple(GetIrArray(*unnested_hlo, *unnested_hlo), output_arrays,
-                       &b_, module_);
-  }
-
   return launch_dimensions;
 }
 
@@ -3260,15 +3253,17 @@ std::tuple<int64, int64, int64> GetReductionToVectorDimensions(
   return std::make_tuple(num_reduced_major, num_kept, num_reduced_minor);
 }
 
-std::tuple<KernelMappingScheme, bool> ComputeMappingSchemeAndReductionKind(
-    const HloInstruction* first_reduce, llvm::IRBuilder<>* b) {
+}  // namespace
+
+std::tuple<KernelMappingScheme, bool>
+IrEmitterUnnested::ComputeMappingSchemeAndReductionKind(
+    const HloInstruction* first_reduce) {
   int64 depth = 1;
   int64 height = 1;
   int64 width = 1;
   bool is_row_reduction = true;
   int64 tile_size_x = 1;
   int64 tile_size_y = 1;
-  int64 block_size_y = 1;
   int64 block_size_z = 1;
   int64 num_threads_x = 1;
   int64 num_threads_y = 1;
@@ -3291,14 +3286,17 @@ std::tuple<KernelMappingScheme, bool> ComputeMappingSchemeAndReductionKind(
     height = num_reduced_major;
     width = num_kept;
     is_row_reduction = false;
-    tile_size_x = std::min(kWarpSize, num_kept);
-    // The old Column reduction algorithm uses kTileHeight = 128. We choose
-    // tile_size_y * block_size_y = 128 to match the value of kTileHeight. Using
-    // a non-trivial block_size_y here is a way to avoid unrolling all the 128
-    // iterations.
-    tile_size_y = 32;
-    block_size_y = 4;
+    // Column reduction without transpose doesn't require communication among
+    // threads processing elements in the same tile. The current implementation
+    // only support the use of on hardware thread block to process one block of
+    // tiles in the KernelMappingScheme. We try to maximize the values of
+    // num_threads_x and tile_size_x to allow a bigger hardware thread block.
+    int64 hw_threads_per_block_limit =
+        ThreadsPerBlockLimit(ir_emitter_context_->device_description());
+    tile_size_x = std::min(hw_threads_per_block_limit, num_kept);
     num_threads_x = tile_size_x;
+    int64 kNumElementsPerPartialSum = 128;
+    tile_size_y = kNumElementsPerPartialSum;
   } else {
     // Row reduction reduces inputs with dimension [depth, height, width],
     // where width is the most minor dimension, to dimension [height] .
@@ -3321,15 +3319,13 @@ std::tuple<KernelMappingScheme, bool> ComputeMappingSchemeAndReductionKind(
            << " " << width;
 
   DimensionVector dims_in_elem{depth, height, width};
-  DimensionVector req_block_sizes{block_size_z, block_size_y, 1};
-  llvm_ir::KernelMappingScheme mapping_scheme(dims_in_elem, tile_size_y,
-                                              tile_size_x, req_block_sizes,
-                                              num_threads_y, num_threads_x, b);
+  DimensionVector req_block_sizes{block_size_z, 1, 1};
+  llvm_ir::KernelMappingScheme mapping_scheme(
+      dims_in_elem, tile_size_y, tile_size_x, req_block_sizes, num_threads_y,
+      num_threads_x, &b_);
   return std::make_tuple(mapping_scheme, is_row_reduction);
 }
 
-}  // namespace
-
 Status IrEmitterUnnested::EmitReductionToVector(HloInstruction* unnested_hlo) {
   VLOG(10) << "Emitting reduction to vector " << unnested_hlo->ToString();
 
@@ -3375,7 +3371,7 @@ Status IrEmitterUnnested::EmitReductionToVector(HloInstruction* unnested_hlo) {
   bool is_row_reduction;
   llvm_ir::KernelMappingScheme mapping_scheme;
   std::tie(mapping_scheme, is_row_reduction) =
-      ComputeMappingSchemeAndReductionKind(first_reduce, &b_);
+      ComputeMappingSchemeAndReductionKind(first_reduce);
   ReductionCodegenInfo reduction_info(&mapping_scheme, is_row_reduction);
   KernelCodeGenerator kernel_generator(
       /*tile_element_generator=*/
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
index 85a0e5328c4e436d4522593b38421efc87c42d32..1ebea7ab48664e693937b45561d096f7ec15132f 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
@@ -215,6 +215,11 @@ class IrEmitterUnnested : public IrEmitter {
   // Prerequisite: `IsReductionToVector(*unnested_hlo)`
   Status EmitReductionToVector(HloInstruction* unnested_hlo);
 
+  // Computes the KernelMappingScheme for the reduce HLO and indicates whether
+  // the reduction is a row reduction.
+  std::tuple<llvm_ir::KernelMappingScheme, bool>
+  ComputeMappingSchemeAndReductionKind(const HloInstruction* first_reduce);
+
   // Emits code for an in-place scatter, modifying `thunk`s launch dimensions in
   // the process. `scatter` may be fused, scatter indices are taken from
   // `scatter_indices_gen`, updates from`updates_gen`. The output buffer is
@@ -272,9 +277,8 @@ class IrEmitterUnnested : public IrEmitter {
   // For each reducer, emits the shuffle-down loop to accumulate the partial
   // result to the global result.
   void EmitFullWarpShuffleDownLoopForAllReduces(
-      const absl::InlinedVector<HloComputation*, 1>& reducers,
-      const absl::InlinedVector<llvm::AllocaInst*, 1>&
-          partial_result_addresses);
+      absl::Span<HloComputation* const> reducers,
+      absl::Span<llvm::AllocaInst* const> partial_result_addresses);
 
   // Generates the IrArray for each input of an hlo and returns a vector that
   // constains such IrArrays.
diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc
index 364f69a69d47644b383af9cf6865c93360b82bab..bd53b90b42d8e657a3ee58e7ca03fb60522aae28 100644
--- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc
+++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc
@@ -177,13 +177,6 @@ std::unique_ptr<llvm::TargetMachine> GetTargetMachine(
   }
 
   TargetOptions target_options = InitTargetOptionsFromCodeGenFlags();
-  llvm_ir::SetTargetOptions(
-      /*fast_math_enabled=*/hlo_module_config.debug_options()
-          .xla_gpu_enable_fast_math(),
-      &target_options);
-
-  // Enable FMA synthesis.
-  target_options.AllowFPOpFusion = FPOpFusion::Fast;
 
   // Set the verbose assembly options.
   target_options.MCOptions.AsmVerbose = false;
@@ -206,8 +199,7 @@ std::unique_ptr<llvm::TargetMachine> GetTargetMachine(
   }
   return absl::WrapUnique(target->createTargetMachine(
       triple.str(), llvm_ir::AsStringRef(cpu_name), "+ptx60", target_options,
-      Optional<Reloc::Model>(RelocModel), Optional<CodeModel::Model>(CMModel),
-      codegen_opt_level));
+      getRelocModel(), getCodeModel(), codegen_opt_level));
 }
 
 // Adds the standard LLVM optimization passes, based on the speed optimization
@@ -401,8 +393,16 @@ StatusOr<string> CompileModuleToPtx(llvm::Module* module,
   int32 opt_level =
       hlo_module_config.debug_options().xla_backend_optimization_level();
 
-  CHECK_GE(opt_level, 2)
-      << "The XLA GPU backend doesn't support unoptimized code generation";
+  if (opt_level < 2) {
+    LOG(ERROR) << std::string(80, '*');
+    LOG(ERROR) << "The XLA GPU backend doesn't support unoptimized code "
+                  "generation but ";
+    LOG(ERROR) << "--xla_backend_optimization_level is set to " << opt_level
+               << "!";
+    LOG(ERROR) << "(Supported configuration is "
+                  "--xla_backend_optimization_level >= 2.)";
+    LOG(ERROR) << std::string(80, '*');
+  }
 
   AddOptimizationPasses(opt_level,
                         /*size_level=*/0, target_machine.get(), &module_passes,
@@ -465,6 +465,9 @@ void GPUBackendInit(const HloModuleConfig& hlo_module_config) {
   // between those loads.
   FeedLLVMWithFlags({"-memdep-block-scan-limit=500"});
 
+  // Use div.approx -- it matters for some float-division heavy benchmarks.
+  FeedLLVMWithFlags({"-nvptx-prec-divf32=0"});
+
   llvm_ir::InitializeLLVMCommandLineOptions(hlo_module_config);
 
   // Initialize the NVPTX target; it's the only target we link with, so call its
diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
index 637b861f70235f17e8e739907a3f262b7004ee7c..60f2116e6088fd2c5d3400b4463cb7fa8bbadfdc 100644
--- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
@@ -108,27 +108,33 @@ namespace {
 
 namespace tracing = tensorflow::tracing;
 
-// Returns the directory containing nvvm libdevice files.  config_cuda_data_dir
-// should be equal to config().debug_options().xla_gpu_cuda_data_dir() of the
-// HloModule being compiled.
-string GetLibdeviceDir(const string& config_cuda_data_dir) {
-  std::vector<string> potential_libdevice_dirs;
-  if (!config_cuda_data_dir.empty()) {
-    potential_libdevice_dirs.push_back(config_cuda_data_dir);
-  }
-  potential_libdevice_dirs.push_back(tensorflow::LibdeviceRoot());
-
-  // Tries all potential libdevice directories in the order they are inserted.
-  // Returns the first directory that exists in the file system.
-  for (const string& potential_libdevice_dir : potential_libdevice_dirs) {
-    if (tensorflow::Env::Default()->IsDirectory(potential_libdevice_dir).ok()) {
-      VLOG(2) << "Found libdevice dir " << potential_libdevice_dir;
-      return potential_libdevice_dir;
+// Returns a vector of potential locations of the CUDA root directory.
+std::vector<string> GetCudaRootCandidates(
+    const HloModuleConfig& hlo_module_config) {
+  std::vector<string> potential_cuda_roots = tensorflow::CandidateCudaRoots();
+
+  // CUDA location explicitly specified by user via --xla_gpu_cuda_data_dir has
+  // highest priority.
+  string xla_gpu_cuda_data_dir =
+      hlo_module_config.debug_options().xla_gpu_cuda_data_dir();
+  if (!xla_gpu_cuda_data_dir.empty()) {
+    potential_cuda_roots.insert(potential_cuda_roots.begin(),
+                                xla_gpu_cuda_data_dir);
+  }
+  return potential_cuda_roots;
+}
+
+// Returns the directory containing nvvm libdevice files.
+string GetLibdeviceDir(const HloModuleConfig& hlo_module_config) {
+  for (const string& cuda_root : GetCudaRootCandidates(hlo_module_config)) {
+    string libdevice_dir =
+        tensorflow::io::JoinPath(cuda_root, "nvvm", "libdevice");
+    VLOG(2) << "Looking for libdevice at " << libdevice_dir;
+    if (tensorflow::Env::Default()->IsDirectory(libdevice_dir).ok()) {
+      VLOG(2) << "Found libdevice dir " << libdevice_dir;
+      return libdevice_dir;
     }
-    VLOG(2) << "Unable to find potential libdevice dir "
-            << potential_libdevice_dir;
   }
-
   LOG(WARNING) << "Unable to find libdevice dir. Using '.'";
   // Last resort: maybe in the current folder.
   return ".";
@@ -143,7 +149,6 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
                          Compiler* compiler) {
   {
     HloPassPipeline pipeline("optimization");
-    pipeline.AddPass<HloGetDimensionSizeRewriter>();
     pipeline.AddInvariantChecker<HloVerifier>(/*layout_sensitive=*/false,
                                               /*allow_mixed_precision=*/false);
     pipeline.AddPass<GpuHloSupportChecker>();
@@ -175,6 +180,8 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
           /*rewrite_inference_op=*/true,
           /*rewrite_grad_op=*/true);
 
+      pipeline.AddPass<HloGetDimensionSizeRewriter>();
+
       // BatchNormExpander can create zero-sized ops, so zero-sized HLO
       // elimination has to come after that pass.
       pipeline.AddPass<ZeroSizedHloElimination>();
@@ -477,13 +484,19 @@ void WarnIfBadDriverJITVersion() {
 
 // Compiles the given PTX string using ptxas and returns the resulting machine
 // code (i.e. a cubin) as a byte array.
-StatusOr<std::vector<uint8>> CompilePtx(const string& ptx, int cc_major,
-                                        int cc_minor) {
+StatusOr<std::vector<uint8>> CompilePtx(
+    const string& ptx, int cc_major, int cc_minor,
+    const HloModuleConfig& hlo_module_config) {
   tracing::ScopedActivity activity("Compile PTX", /*is_expensive=*/true);
-  const string ptxas_path =
-      tensorflow::io::JoinPath(tensorflow::CudaRoot(), "bin", "ptxas");
-  VLOG(2) << "Checking ptxas at " << ptxas_path;
   auto env = tensorflow::Env::Default();
+  string ptxas_path;
+  for (const string& cuda_root : GetCudaRootCandidates(hlo_module_config)) {
+    ptxas_path = tensorflow::io::JoinPath(cuda_root, "bin", "ptxas");
+    VLOG(2) << "Looking for ptxas at " << ptxas_path;
+    if (env->FileExists(ptxas_path).ok()) {
+      break;
+    }
+  }
   TF_RETURN_IF_ERROR(env->FileExists(ptxas_path));
   VLOG(2) << "Using ptxas at " << ptxas_path;
 
@@ -518,6 +531,9 @@ StatusOr<std::vector<uint8>> CompilePtx(const string& ptx, int cc_major,
   if (VLOG_IS_ON(2)) {
     ptxas_args.push_back("-v");
   }
+  if (hlo_module_config.debug_options().xla_gpu_disable_ptxas_optimizations()) {
+    ptxas_args.push_back("-O0");
+  }
   ptxas_info_dumper.SetProgram(ptxas_path, ptxas_args);
   ptxas_info_dumper.SetChannelAction(tensorflow::CHAN_STDERR,
                                      tensorflow::ACTION_PIPE);
@@ -680,12 +696,8 @@ StatusOr<std::unique_ptr<Executable>> NVPTXCompiler::RunBackend(
     // Find the directory containing libdevice.  To avoid searching for it every
     // time, we have a one-element cache, keyed on the module's config's
     // cuda_data_dir.
-    const auto& config_cuda_data_dir =
-        module->config().debug_options().xla_gpu_cuda_data_dir();
-    if (cached_libdevice_dir_.empty() ||
-        cached_cuda_data_dir_ != config_cuda_data_dir) {
-      cached_cuda_data_dir_ = config_cuda_data_dir;
-      cached_libdevice_dir_ = GetLibdeviceDir(config_cuda_data_dir);
+    if (cached_libdevice_dir_.empty()) {
+      cached_libdevice_dir_ = GetLibdeviceDir(module->config());
     }
     libdevice_dir = cached_libdevice_dir_;
   }
@@ -739,7 +751,7 @@ StatusOr<std::unique_ptr<Executable>> NVPTXCompiler::RunBackend(
   }
 
   const std::vector<uint8> cubin =
-      CompilePtxOrGetCachedResult(ptx, cc_major, cc_minor);
+      CompilePtxOrGetCachedResult(ptx, cc_major, cc_minor, module->config());
 
   auto thunk_schedule = absl::make_unique<ThunkSchedule>(
       ir_emitter.ConsumeThunkSequence(), std::move(stream_assignment),
@@ -771,9 +783,9 @@ StatusOr<std::unique_ptr<Executable>> NVPTXCompiler::RunBackend(
   return std::unique_ptr<Executable>(gpu_executable);
 }
 
-std::vector<uint8> NVPTXCompiler::CompilePtxOrGetCachedResult(const string& ptx,
-                                                              int cc_major,
-                                                              int cc_minor) {
+std::vector<uint8> NVPTXCompiler::CompilePtxOrGetCachedResult(
+    const string& ptx, int cc_major, int cc_minor,
+    const HloModuleConfig& hlo_module_config) {
   XLA_SCOPED_LOGGING_TIMER("NVPTXCompiler::CompilePtxOrGetCachedResult");
   tracing::ScopedActivity activity("PTX->CUBIN", /*is_expensive=*/true);
   bool inserted;
@@ -802,7 +814,7 @@ std::vector<uint8> NVPTXCompiler::CompilePtxOrGetCachedResult(const string& ptx,
       CHECK(!cache_value->compilation_done);
       if (!ptx.empty()) {
         StatusOr<std::vector<uint8>> maybe_cubin =
-            CompilePtx(*cache_ptx, cc_major, cc_minor);
+            CompilePtx(*cache_ptx, cc_major, cc_minor, hlo_module_config);
         if (maybe_cubin.ok()) {
           cache_value->cubin_data = std::move(maybe_cubin).ValueOrDie();
           VLOG(2) << "Compiled PTX size:" << ptx.size()
diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h
index f79ae2990ae7d6e6985b15727a72358289121aa9..b2077f42fd097330703fde063d80a20704fa48e2 100644
--- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h
+++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h
@@ -97,8 +97,9 @@ class NVPTXCompiler : public LLVMCompiler {
 
   // Tries to compile the given ptx string to cubin.  Returns a vector with the
   // compiled cubin.  If compilation was unsuccessful, returns an empty vector.
-  std::vector<uint8> CompilePtxOrGetCachedResult(const string& ptx,
-                                                 int cc_major, int cc_minor);
+  std::vector<uint8> CompilePtxOrGetCachedResult(
+      const string& ptx, int cc_major, int cc_minor,
+      const HloModuleConfig& hlo_module_config);
 
   // The compilation_cache_ map is a cache from {ptx string, cc_major, cc_minor}
   // -> cubin so we don't recompile the same ptx twice.  This is important for
diff --git a/tensorflow/compiler/xla/service/gpu/partition_assignment.cc b/tensorflow/compiler/xla/service/gpu/partition_assignment.cc
index 375f68a15957936151aee068582a714b62694af2..bfed4f5230dfe37bca48560ce83a2dd82c8950a4 100644
--- a/tensorflow/compiler/xla/service/gpu/partition_assignment.cc
+++ b/tensorflow/compiler/xla/service/gpu/partition_assignment.cc
@@ -39,6 +39,25 @@ std::ostream& operator<<(std::ostream& out,
   return out;
 }
 
+int64 ThreadsPerBlockLimit(const se::DeviceDescription& device_desc) {
+  int64 threads_per_block = device_desc.threads_per_block_limit();
+  if (threads_per_block == 0) {
+    static std::atomic<int64> log_count{0};
+    if (log_count.fetch_add(1) < 8) {
+      LOG(WARNING) << "Attempting to calculate launch dimensions for GPU "
+                      "without full information about its capabilities.  "
+                      "StreamExecutor's PopulateDeviceDescription should be "
+                      "updated for this device.";
+    }
+    threads_per_block = device_desc.threads_per_warp();
+    if (threads_per_block == 0) {
+      // Fall back to *something* if we can't even get num threads per warp.
+      threads_per_block = 32;
+    }
+  }
+  return threads_per_block;
+}
+
 // Calculates the launch dimensions used to invoke `hlo`.
 LaunchDimensions CalculateLaunchDimensions(
     const Shape& shape, const se::DeviceDescription& device_desc,
@@ -62,21 +81,7 @@ LaunchDimensions CalculateLaunchDimensions(
   //
   //   <num threads per block> * <max blocks per core> = <max threads per core>
 
-  int64 threads_per_block = device_desc.threads_per_block_limit();
-  if (threads_per_block == 0) {
-    static std::atomic<int64> log_count{0};
-    if (log_count.fetch_add(1) < 8) {
-      LOG(WARNING) << "Attempting to calculate launch dimensions for GPU "
-                      "without full information about its capabilities.  "
-                      "StreamExecutor's PopulateDeviceDescription should be "
-                      "updated for this device.";
-    }
-    threads_per_block = device_desc.threads_per_warp();
-    if (threads_per_block == 0) {
-      // Fall back to *something* if we can't even get num threads per warp.
-      threads_per_block = 32;
-    }
-  }
+  int64 threads_per_block = ThreadsPerBlockLimit(device_desc);
 
   if (num_elements < threads_per_block) {
     threads_per_block = num_elements;
diff --git a/tensorflow/compiler/xla/service/gpu/partition_assignment.h b/tensorflow/compiler/xla/service/gpu/partition_assignment.h
index 02471129e004b4876ce20a62cade34060c65b478..eb41dcccb938ccc088c2371def96ca73276771ab 100644
--- a/tensorflow/compiler/xla/service/gpu/partition_assignment.h
+++ b/tensorflow/compiler/xla/service/gpu/partition_assignment.h
@@ -57,6 +57,9 @@ class LaunchDimensions {
 std::ostream& operator<<(std::ostream& out,
                          const LaunchDimensions& launch_dims);
 
+// Returns the maximum number of threads per block allowed by the device.
+int64 ThreadsPerBlockLimit(const se::DeviceDescription& device_desc);
+
 LaunchDimensions CalculateLaunchDimensions(
     const Shape& shape, const se::DeviceDescription& device_desc,
     int unroll_factor = 1);
diff --git a/tensorflow/compiler/xla/service/gpu/stream_executor_util.h b/tensorflow/compiler/xla/service/gpu/stream_executor_util.h
index 1fc46bafa10e7ba6c896f081d5c836bd400886c9..92e4d6dbbc1bd564657f8a5de09d23d5ae81a93e 100644
--- a/tensorflow/compiler/xla/service/gpu/stream_executor_util.h
+++ b/tensorflow/compiler/xla/service/gpu/stream_executor_util.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_STREAM_EXECUTOR_UTIL_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_STREAM_EXECUTOR_UTIL_H_
 
+#include "tensorflow/compiler/xla/layout.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc
index d0ccd8619bde9ddd560989380b403efed5c5f42c..5e524faab18947f5793dc2ae34e9329a446d4235 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc
@@ -75,16 +75,16 @@ class GpuFtzDisabledTest : public GpuFtzTest {
 // Check that we emit mul.ftz.f32 when in ftz mode, and plain mul.f32 otherwise.
 TEST_F(GpuFtzEnabledTest, MultiplyFtz) {
   CompileAndVerifyPtx(CreateBinaryOpModule(HloOpcode::kMultiply), R"(
-    CHECK-NOT: mul.f32
-    CHECK: mul.ftz.f32
-    CHECK-NOT: mul.f32
+    CHECK-NOT: mul.rn.f32
+    CHECK: mul.rn.ftz.f32
+    CHECK-NOT: mul.rn.f32
   )");
 }
 TEST_F(GpuFtzDisabledTest, MultiplyFtz) {
   CompileAndVerifyPtx(CreateBinaryOpModule(HloOpcode::kMultiply), R"(
-    CHECK-NOT: mul.ftz.f32
-    CHECK: mul.f32
-    CHECK-NOT: mul.ftz.f32
+    CHECK-NOT: mul.rn.ftz.f32
+    CHECK: mul.rn.f32
+    CHECK-NOT: mul.rn.ftz.f32
   )");
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc
index ff122b529bdcdcc69d2245136e19101902dbf957..ca663b8b4a970900a4a899a7ad9d33dc45af9d99 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.cc
+++ b/tensorflow/compiler/xla/service/hlo_computation.cc
@@ -711,8 +711,6 @@ bool HloComputation::operator==(const HloComputation& other) const {
   return eq(root_instruction(), other.root_instruction());
 }
 
-uint64 HloComputation::Hash() const { return root_instruction()->Hash(); }
-
 Status HloComputation::ReplaceWithNewInstruction(
     HloInstruction* old_instruction,
     std::unique_ptr<HloInstruction> new_instruction) {
@@ -797,7 +795,7 @@ Status HloComputation::AcceptWithOperandOrder(
 template <typename HloInstructionPtr>
 Status HloComputation::AcceptOrdered(
     DfsHloVisitorBase<HloInstructionPtr>* visitor,
-    const std::vector<HloInstruction*>& order) const {
+    absl::Span<HloInstruction* const> order) const {
   VLOG(3) << "Accepting visitor with order.";
   for (HloInstruction* root : CollectUnreachableRoots()) {
     TF_RET_CHECK(std::find(order.begin(), order.end(), root) != order.end())
@@ -827,9 +825,9 @@ Status HloComputation::AcceptOrdered(
 
 // Explicit instantiations.
 template Status HloComputation::AcceptOrdered(
-    DfsHloVisitor*, const std::vector<HloInstruction*>&) const;
+    DfsHloVisitor*, absl::Span<HloInstruction* const>) const;
 template Status HloComputation::AcceptOrdered(
-    ConstDfsHloVisitor*, const std::vector<HloInstruction*>&) const;
+    ConstDfsHloVisitor*, absl::Span<HloInstruction* const>) const;
 
 Status HloComputation::Accept(
     const std::function<Status(HloInstruction*)>& visitor_func) {
diff --git a/tensorflow/compiler/xla/service/hlo_computation.h b/tensorflow/compiler/xla/service/hlo_computation.h
index c584e4c7ca5770533f28352b0df9dadd9dbe1860..5467d0a68b18170891dcd9f67e44d3bb269bf920 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.h
+++ b/tensorflow/compiler/xla/service/hlo_computation.h
@@ -264,12 +264,6 @@ class HloComputation {
   // Return whether `*this` and `other` are functionally equivalent.
   bool operator==(const HloComputation& other) const;
 
-  // Generates a hash value of an HLO computation. Hash considers
-  // information on opcode, shape, operands, and typically a root instruction.
-  // This function returns the same hash value for equivalent HLO computations,
-  // with respect to HloInstruction::Identical() method.
-  uint64 Hash() const;
-
   // Replaces old instruction with newly created instruction. Removes old
   // instruction from computation. Updates uses and root instruction.
   Status ReplaceWithNewInstruction(
@@ -307,7 +301,7 @@ class HloComputation {
   // be a topological sort of all instructions in the computation.
   template <typename HloInstructionPtr>
   Status AcceptOrdered(DfsHloVisitorBase<HloInstructionPtr>* visitor,
-                       const std::vector<HloInstruction*>& order) const;
+                       absl::Span<HloInstruction* const> order) const;
 
   // Same as Accept() above, but the visitor is given as a function.
   Status Accept(const std::function<Status(HloInstruction*)>& visitor_func);
diff --git a/tensorflow/compiler/xla/service/hlo_computation_test.cc b/tensorflow/compiler/xla/service/hlo_computation_test.cc
index 8b50cfa9aed90091cfbedc1df902440ec9bf2a80..0361c87428f6e4c031d95492a5bc782ad388e5b5 100644
--- a/tensorflow/compiler/xla/service/hlo_computation_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_computation_test.cc
@@ -20,19 +20,19 @@ limitations under the License.
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
-#include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher_gmock.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 
-namespace op = xla::testing::opcode_matchers;
-
 namespace xla {
 
 namespace {
 
+namespace m = match;
 using ::testing::ElementsAre;
 using ::testing::UnorderedElementsAre;
 
@@ -261,7 +261,7 @@ TEST_F(HloComputationTest, DeepCopyArray) {
   auto computation = module->AddEntryComputation(builder.Build());
   auto copy = computation->DeepCopyInstruction(constant).ValueOrDie();
 
-  EXPECT_THAT(copy, op::Copy(constant));
+  EXPECT_THAT(copy, GmockMatch(m::Copy(m::Op().Is(constant))));
 }
 
 TEST_F(HloComputationTest, DeepCopyTuple) {
@@ -278,8 +278,9 @@ TEST_F(HloComputationTest, DeepCopyTuple) {
   auto computation = module->AddEntryComputation(builder.Build());
   auto tuple_copy = computation->DeepCopyInstruction(tuple).ValueOrDie();
 
-  EXPECT_THAT(tuple_copy, op::Tuple(op::Copy(op::GetTupleElement(tuple)),
-                                    op::Copy(op::GetTupleElement(tuple))));
+  EXPECT_THAT(tuple_copy, GmockMatch(m::Tuple(
+                              m::Copy(m::GetTupleElement(m::Op().Is(tuple))),
+                              m::Copy(m::GetTupleElement(m::Op().Is(tuple))))));
   EXPECT_EQ(0, tuple_copy->operand(0)->operand(0)->tuple_index());
   EXPECT_EQ(1, tuple_copy->operand(1)->operand(0)->tuple_index());
 }
@@ -297,7 +298,7 @@ TEST_F(HloComputationTest, DeepCopyArrayAtIndices) {
     ShapeTree<bool> indices_to_copy(constant->shape(), /*init_value=*/true);
     EXPECT_THAT(computation->DeepCopyInstruction(constant, &indices_to_copy)
                     .ValueOrDie(),
-                op::Copy(constant));
+                GmockMatch(m::Copy(m::Op().Is(constant))));
   }
 
   {
@@ -330,10 +331,11 @@ TEST_F(HloComputationTest, DeepCopyTupleAtIndices) {
         computation->DeepCopyInstruction(tuple, &indices_to_copy, &copies_added)
             .ValueOrDie();
 
-    EXPECT_THAT(deep_copy, op::Tuple(op::Copy(op::GetTupleElement(tuple)),
-                                     op::Copy(op::GetTupleElement(tuple))));
-    EXPECT_THAT(deep_copy, op::Tuple(copies_added.element({0}),
-                                     copies_added.element({1})));
+    EXPECT_THAT(deep_copy, GmockMatch(m::Tuple(
+                               m::Copy(m::GetTupleElement(m::Op().Is(tuple)))
+                                   .Is(copies_added.element({0})),
+                               m::Copy(m::GetTupleElement(m::Op().Is(tuple)))
+                                   .Is(copies_added.element({1})))));
   }
 
   {
@@ -346,8 +348,9 @@ TEST_F(HloComputationTest, DeepCopyTupleAtIndices) {
         computation->DeepCopyInstruction(tuple, &indices_to_copy, &copies_added)
             .ValueOrDie();
 
-    EXPECT_THAT(deep_copy, op::Tuple(op::GetTupleElement(tuple),
-                                     op::GetTupleElement(tuple)));
+    EXPECT_THAT(deep_copy,
+                GmockMatch(m::Tuple(m::GetTupleElement(m::Op().Is(tuple)),
+                                    m::GetTupleElement(m::Op().Is(tuple)))));
     EXPECT_TRUE(copies_added.element({}) == nullptr);
     EXPECT_TRUE(copies_added.element({0}) == nullptr);
     EXPECT_TRUE(copies_added.element({1}) == nullptr);
@@ -363,8 +366,9 @@ TEST_F(HloComputationTest, DeepCopyTupleAtIndices) {
         computation->DeepCopyInstruction(tuple, &indices_to_copy, &copies_added)
             .ValueOrDie();
 
-    EXPECT_THAT(deep_copy, op::Tuple(op::Copy(op::GetTupleElement(tuple)),
-                                     op::GetTupleElement(tuple)));
+    EXPECT_THAT(deep_copy, GmockMatch(m::Tuple(
+                               m::Copy(m::GetTupleElement(m::Op().Is(tuple))),
+                               m::GetTupleElement(m::Op().Is(tuple)))));
     EXPECT_TRUE(copies_added.element({}) == nullptr);
     EXPECT_TRUE(copies_added.element({0}) != nullptr);
     EXPECT_TRUE(copies_added.element({1}) == nullptr);
@@ -381,7 +385,7 @@ TEST_F(HloComputationTest, DeepCopyToken) {
   auto copy = computation->DeepCopyInstruction(token).ValueOrDie();
 
   // No copy should be added.
-  EXPECT_THAT(copy, op::AfterAll());
+  EXPECT_THAT(copy, GmockMatch(m::AfterAll()));
 }
 
 TEST_F(HloComputationTest, DeepCopyTokenTuple) {
@@ -399,8 +403,9 @@ TEST_F(HloComputationTest, DeepCopyTokenTuple) {
 
   // Only the array (second tuple element) should be copied. The token is passed
   // through transparently.
-  EXPECT_THAT(copy, op::Tuple(op::GetTupleElement(tuple),
-                              op::Copy(op::GetTupleElement(tuple))));
+  EXPECT_THAT(copy, GmockMatch(m::Tuple(
+                        m::GetTupleElement(m::Op().Is(tuple)),
+                        m::Copy(m::GetTupleElement(m::Op().Is(tuple))))));
 }
 
 TEST_F(HloComputationTest, CycleDetection) {
@@ -443,13 +448,15 @@ TEST_F(HloComputationTest, RemoveInstructionWithDuplicateOperand) {
   auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_EQ(4, computation->instruction_count());
-  EXPECT_THAT(computation->root_instruction(), op::Negate(constant));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Negate(m::Op().Is(constant))));
   EXPECT_EQ(negate, computation->root_instruction());
 
   ASSERT_IS_OK(computation->RemoveInstructionAndUnusedOperands(dead_add));
 
   EXPECT_EQ(2, computation->instruction_count());
-  EXPECT_THAT(computation->root_instruction(), op::Negate(constant));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Negate(m::Op().Is(constant))));
   EXPECT_EQ(negate, computation->root_instruction());
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc b/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc
index 4f81dc94e577a63c09ae4019e5e8158252c712ce..92b748d813c3efef83ef0155f1d5d3c637ce2c57 100644
--- a/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc
@@ -252,7 +252,7 @@ const char* const kConstantFoldLargePad = R"(
   HloModule ConstantFoldLargePad
 
   ENTRY r {
-    a = f32[1,1,1] constant(f32[1,1,1]{{{7}}})
+    a = f32[1,1,1] constant({{{7}}})
     b = f32[] constant(42)
     ROOT pad = f32[2048,2048,128] pad(a, b), padding=1024_1023x1024_1023x64_63
   })";
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
index f7a1f19a6f52befd58a405d0e406d7d0d37a8e57..94de7c55dd2402e55ec344b79c24af2d8283fe73 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
@@ -1882,8 +1882,8 @@ TEST_P(HloDataflowAnalysisTest, AddDependency) {
 HloModule AddDependency
 ENTRY %AddDependency (p: f32[3]) -> f32[3] {
   %p = f32[3] parameter(0)
-  %token = token[] after-all()
-  ROOT %add_dep = f32[3] add-dependency(f32[3] %p, token[] %token)
+  %token0 = token[] after-all()
+  ROOT %add_dep = f32[3] add-dependency(f32[3] %p, token[] %token0)
 }
 )";
   TF_ASSERT_OK_AND_ASSIGN(
diff --git a/tensorflow/compiler/xla/service/hlo_domain_test.cc b/tensorflow/compiler/xla/service/hlo_domain_test.cc
index acdb42128e3d9a1fb912a466c9c2c3cbbe3d3f83..fd4fb0246d8d42ab7329c05dc23e386303cdce3c 100644
--- a/tensorflow/compiler/xla/service/hlo_domain_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_domain_test.cc
@@ -195,10 +195,10 @@ HloModule Module
 ENTRY entry {
   p0 = (f32[4]) parameter(0)
   a = f32[4] get-tuple-element(p0), index=0
-  token = token[] after-all()
-  b = (f32[4], u32[], token[]) send(a, token), channel_id=1, sharding={maximal device=0}
+  token0 = token[] after-all()
+  b = (f32[4], u32[], token[]) send(a, token0), channel_id=1, sharding={maximal device=0}
   c = token[] send-done(b), channel_id=1, sharding={maximal device=0}
-  d = (f32[4], u32[], token[]) recv(token), channel_id=2, sharding={maximal device=0}
+  d = (f32[4], u32[], token[]) recv(token0), channel_id=2, sharding={maximal device=0}
   e = (f32[4], token[]) recv-done(d), channel_id=2, sharding={maximal device=0}
   e_element = f32[4] get-tuple-element(e), index=0, sharding={maximal device=0}
   f = f32[4] add(a, e_element)
@@ -235,12 +235,12 @@ TEST_F(HloDomainTest, CheckNoDomainAddedOnPureIOComputation) {
 HloModule Module
 
 ENTRY entry {
-  token = token[] after-all(), sharding={maximal device=-1}
-  a = (f32[4], u32[], token[]) recv(token), channel_id=1, sharding={maximal device=-1}
+  token0 = token[] after-all(), sharding={maximal device=-1}
+  a = (f32[4], u32[], token[]) recv(token0), channel_id=1, sharding={maximal device=-1}
   b = (f32[4], token[]) recv-done(a), channel_id=1, sharding={maximal device=-1}
   b_element = f32[4] get-tuple-element(b), index=0, sharding={maximal device=-1}
   c = f32[4] add(b_element, b_element), sharding={maximal device=-1}
-  d = (f32[4], u32[], token[]) send(c, token), channel_id=2, sharding={maximal device=-1}
+  d = (f32[4], u32[], token[]) send(c, token0), channel_id=2, sharding={maximal device=-1}
   ROOT e = token[] send-done(d), channel_id=2, sharding={maximal device=-1}
 }
 )";
@@ -259,12 +259,12 @@ TEST_F(HloDomainTest, CheckNormalizationOnPureIOComputation) {
 HloModule Module
 
 ENTRY entry {
-  token = token[] after-all(), sharding={maximal device=0}
-  a = (f32[4], u32[], token[]) recv(token), channel_id=1, sharding={maximal device=0}
+  token0 = token[] after-all(), sharding={maximal device=0}
+  a = (f32[4], u32[], token[]) recv(token0), channel_id=1, sharding={maximal device=0}
   b = (f32[4], token[]) recv-done(a), channel_id=1, sharding={maximal device=0}
   b_element = f32[4] get-tuple-element(b), index=0, sharding={maximal device=0}
   c = f32[4] add(b_element, b_element)
-  d = (f32[4], u32[], token[]) send(c, token), channel_id=2, sharding={maximal device=0}
+  d = (f32[4], u32[], token[]) send(c, token0), channel_id=2, sharding={maximal device=0}
   ROOT e = token[] send-done(d), channel_id=2, sharding={maximal device=0}
 }
 )";
@@ -344,8 +344,8 @@ TEST_F(HloDomainTest, CheckNormalizationOnInfeedTuple) {
 HloModule Module
 
 ENTRY entry {
-  token = token[] after-all()
-  infeed = ((f32[4], f32[4]), token[]) infeed(token),
+  token0 = token[] after-all()
+  infeed = ((f32[4], f32[4]), token[]) infeed(token0),
     sharding={{maximal device=1}, {maximal device=0}, {maximal device=0}}
   infeed.data = (f32[4], f32[4]) get-tuple-element(infeed), index=0,
     sharding={{maximal device=1}, {maximal device=0}}
diff --git a/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc b/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc
index c170e36c73ad2bef830e528de3ec72d38683d888..a3b56a44a0b02923585c1dcb69571479236188a3 100644
--- a/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc
@@ -57,10 +57,10 @@ TEST_F(HloElementTypeConverterTest, InfeedsOutfeedsNotConverted) {
   const string& hlo_string = R"(
     HloModule InfeedOutfeed
     ENTRY RoundTrip16MiBR1.v2 {
-      token = token[] after-all()
-      infeed = (bf16[4]{0}, token[]) infeed(token)
+      token0 = token[] after-all()
+      infeed = (bf16[4]{0}, token[]) infeed(token0)
       ROOT infeed.data = bf16[4]{0} get-tuple-element(infeed), index=0
-      outfeed = token[] outfeed(infeed.data, token)
+      outfeed = token[] outfeed(infeed.data, token0)
     }
   )";
   auto module = CreateModuleFromHloString(hlo_string);
@@ -96,13 +96,13 @@ TEST_F(HloElementTypeConverterTest, BatchNormGradBF16Converted) {
   const string& hlo_string = R"(
     HloModule BatchNormGrad
     ENTRY BatchNormGrad.v6 {
-      constant.4 = bf16[2,2,2,1]{3,2,1,0} constant(bf16[2,2,2,1] { { /*i0=0*/ 
+      constant.4 = bf16[2,2,2,1]{3,2,1,0} constant({ { /*i0=0*/
       { /*i1=0*/ {0}, {0} }, { /*i1=1*/ {0}, {0} } }, { /*i0=1*/ { /*i1=0*/ {0},
       {0} }, { /*i1=1*/ {0}, {0} } } })
       constant.5 = bf16[2]{0} constant({1, 1})
       constant.6 = bf16[2]{0} constant({0, 0})
       constant.7 = bf16[2]{0} constant({1, 1})
-      constant.8 = bf16[2,2,2,1]{3,2,1,0} constant(bf16[2,2,2,1] { { /*i0=0*/
+      constant.8 = bf16[2,2,2,1]{3,2,1,0} constant({ { /*i0=0*/
       { /*i1=0*/ {1}, {2} }, { /*i1=1*/ {3}, {4} } }, { /*i0=1*/ { /*i1=0*/
       {5}, {6} }, { /*i1=1*/ {7}, {8} } } })
       ROOT batch-norm-grad = (bf16[2,2,2,1]{3,2,1,0}, bf16[2]{0}, bf16[2]{0})
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc
index 51a3fba1768aaf219b78ddc09a1c526448389d9e..934c082bb9f003b1d2d80835f09a8f4109c7e7fd 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc
@@ -33,12 +33,14 @@ limitations under the License.
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
+#include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h"
 #include "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_query.h"
 #include "tensorflow/compiler/xla/service/shape_inference.h"
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/window_util.h"
@@ -396,6 +398,16 @@ StatusOr<Literal> HloEvaluator::EvaluateDotOp(
   return Evaluate(cloned_instruction.get());
 }
 
+Status HloEvaluator::HandleBitcast(HloInstruction* bitcast) {
+  const Literal& operand_literal = GetEvaluatedLiteralFor(bitcast->operand(0));
+  Literal result(bitcast->shape());
+  TF_RET_CHECK(operand_literal.size_bytes() == result.size_bytes());
+  memcpy(result.untyped_data(), operand_literal.untyped_data(),
+         operand_literal.size_bytes());
+  evaluated_[bitcast] = std::move(result);
+  return Status::OK();
+}
+
 Status HloEvaluator::HandleParameter(HloInstruction* parameter) {
   CHECK_LT(parameter->parameter_number(), arg_literals_.size());
   const Literal* input_literal = arg_literals_[parameter->parameter_number()];
@@ -618,8 +630,11 @@ Status HloEvaluator::HandleCompare(HloInstruction* compare) {
           evaluated_[compare],
           Compare<int64>(compare->shape(), opcode, lhs_literal, rhs_literal));
     } break;
-    case F16:
-      return Unimplemented("unhandled primitive type: F16.");
+    case F16: {
+      TF_ASSIGN_OR_RETURN(
+          evaluated_[compare],
+          Compare<half>(compare->shape(), opcode, lhs_literal, rhs_literal));
+    } break;
     case BF16: {
       TF_ASSIGN_OR_RETURN(evaluated_[compare],
                           Compare<bfloat16>(compare->shape(), opcode,
@@ -1438,4 +1453,46 @@ template StatusOr<Literal> HloEvaluator::Evaluate<const Literal*>(
 template StatusOr<Literal> HloEvaluator::Evaluate<const Literal*>(
     HloInstruction* instruction, absl::Span<const Literal* const> arg_literals);
 
+namespace {
+template <typename T>
+std::unique_ptr<Array2D<T>> MatmulArray2DImpl(
+    const Array2D<T>& lhs, const Array2D<T>& rhs,
+    const std::function<void(
+        const void* run_options_ptr, T* out, T* lhs, T* rhs, int64 m, int64 n,
+        int64 k, int32 transpose_lhs, int32 transpose_rhs)>& impl_fn) {
+  CHECK_EQ(lhs.width(), rhs.height());
+  int m = lhs.height();
+  int n = rhs.width();
+  int k = lhs.width();
+  auto result = absl::make_unique<Array2D<T>>(m, n);
+  // Because Eigen is a header-oriented library, make sure that the Eigen code
+  // is the same as the code used by the CPU backend (otherwise the linker will
+  // randomly pick *some* definition).
+  impl_fn(
+      /*run_options_ptr=*/nullptr, result->data(), rhs.data(), lhs.data(), n, m,
+      k,
+      /*transpose_lhs=*/0,
+      /*transpose_rhs=*/0);
+  return result;
+}
+}  // namespace
+
+std::unique_ptr<Array2D<Eigen::half>> HloEvaluator::MatmulArray2D(
+    const Array2D<Eigen::half>& lhs, const Array2D<Eigen::half>& rhs) {
+  return MatmulArray2DImpl<Eigen::half>(
+      lhs, rhs, __xla_cpu_runtime_EigenSingleThreadedMatMulF16);
+}
+
+std::unique_ptr<Array2D<float>> HloEvaluator::MatmulArray2D(
+    const Array2D<float>& lhs, const Array2D<float>& rhs) {
+  return MatmulArray2DImpl<float>(
+      lhs, rhs, __xla_cpu_runtime_EigenSingleThreadedMatMulF32);
+}
+
+std::unique_ptr<Array2D<double>> HloEvaluator::MatmulArray2D(
+    const Array2D<double>& lhs, const Array2D<double>& rhs) {
+  return MatmulArray2DImpl<double>(
+      lhs, rhs, __xla_cpu_runtime_EigenSingleThreadedMatMulF64);
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.h b/tensorflow/compiler/xla/service/hlo_evaluator.h
index d847900010c697d7d280ed8e4a9502f1c465ee07..d363a51c63de6fd4246c4970f580b68f4a627df8 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.h
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.h
@@ -21,6 +21,7 @@ limitations under the License.
 #include "absl/container/node_hash_map.h"
 #include "absl/memory/memory.h"
 #include "absl/types/span.h"
+#include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
@@ -119,6 +120,17 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
                                   const PrecisionConfig& precision_config,
                                   const Literal& lhs, const Literal& rhs);
 
+  // Enable the fast path for certain operations like dot or convolution.
+  void set_use_fast_path(bool value) { use_fast_path_ = value; }
+
+  // Returns the result of a matrix multiply `lhs x rhs`.
+  static std::unique_ptr<Array2D<Eigen::half>> MatmulArray2D(
+      const Array2D<Eigen::half>& lhs, const Array2D<Eigen::half>& rhs);
+  static std::unique_ptr<Array2D<float>> MatmulArray2D(
+      const Array2D<float>& lhs, const Array2D<float>& rhs);
+  static std::unique_ptr<Array2D<double>> MatmulArray2D(
+      const Array2D<double>& lhs, const Array2D<double>& rhs);
+
  protected:
   // Make HloEvaluatorTypedVisitor a friend because it is logically part of this
   // class.
@@ -144,6 +156,8 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
   // Operations that are type-agnostic or always return a specific type, such as
   // HandleIsFinite where boolean is always returned.
   //
+  Status HandleBitcast(HloInstruction* bitcast) override;
+
   Status HandleParameter(HloInstruction* parameter) override;
 
   Status HandleConstant(HloInstruction* constant) override;
@@ -215,6 +229,9 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
   // we cannot use flat_hash_map any more.
   absl::node_hash_map<const HloInstruction*, Literal> evaluated_;
 
+  // Use fast path that uses eigen in the evaluator.
+  bool use_fast_path_ = false;
+
  private:
   template <typename ReturnT, typename NativeT>
   static StatusOr<Literal> ElementWiseUnaryOpImpl(
@@ -248,6 +265,8 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
   TF_DISALLOW_COPY_AND_ASSIGN(HloEvaluator);
 };
 
+std::unique_ptr<Array2D<float>> MatmulArray2D(const Array2D<float>& lhs,
+                                              const Array2D<float>& rhs);
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_EVALUATOR_H_
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
index d95b6ad04f2c446b423a3aaef4de333ed2968883..4eaaab20ea0add17d9b49b1b2b97991af0438dcc 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/memory/memory.h"
+#include "absl/strings/str_format.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/reference_util.h"
@@ -35,6 +36,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/tests/test_utils.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -2765,6 +2767,33 @@ ENTRY main {
   EXPECT_TRUE(LiteralTestUtil::Equal(arg, actual));
 }
 
+TEST_P(HloEvaluatorTest, Bitcast) {
+  // Regression test for b/114735354.
+  constexpr absl::string_view hlo_text_base = R"(
+HloModule Bitcast
+
+ENTRY main {
+  param = %s[32,121]{1,0} parameter(0)
+  ROOT bitcast = %s[121,32,1]{0,1,2} bitcast(%s[32,121]{1,0} param)
+}
+)";
+  string hlo_text;
+  if (use_bfloat16_) {
+    hlo_text = absl::StrFormat(hlo_text_base, "bf16", "bf16", "bf16");
+  } else {
+    hlo_text = absl::StrFormat(hlo_text_base, "f32", "f32", "f32");
+  }
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  auto args = MakeFakeArguments(m_.get()).ConsumeValueOrDie();
+  Literal actual = Evaluate({&args[0]});
+  if (use_bfloat16_) {
+    EXPECT_TRUE(
+        absl::c_equal(args[0].data<bfloat16>(), actual.data<bfloat16>()));
+  } else {
+    EXPECT_TRUE(absl::c_equal(args[0].data<float>(), actual.data<float>()));
+  }
+}
+
 INSTANTIATE_TEST_CASE_P(HloEvaluatorTest_Instantiation, HloEvaluatorTest,
                         ::testing::ValuesIn(use_bf16_params));
 
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
index b87fc3e34012e75ee07bff6c1e113dce404f83cb..03d42990ce9dcd3f689831078354f878bcb0800f 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
@@ -23,6 +23,7 @@ limitations under the License.
 #include "absl/container/inlined_vector.h"
 #include "absl/memory/memory.h"
 #include "absl/types/optional.h"
+#include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_evaluator.h"
@@ -105,6 +106,12 @@ bool SafeLess(const NativeT& a, const NativeT& b) {
 template <typename ReturnT, typename ElementwiseT = ReturnT>
 class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
  private:
+  Status UnsupportedTypeError(HloInstruction* instruction) {
+    return InvalidArgument(
+        "Unsupported type for %s: %s", HloOpcodeString(instruction->opcode()),
+        PrimitiveType_Name(instruction->shape().element_type()));
+  }
+
   // Get the value in the given literal static_cast as a double.
   template <
       typename NativeT,
@@ -224,7 +231,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
       typename NativeT,
       typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
   Status HandleRound(HloInstruction* round) {
-    return InvalidArgument("Unsupported type for Round");
+    return UnsupportedTypeError(round);
   }
 
   Status HandleRound(HloInstruction* round) override {
@@ -246,7 +253,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
       typename NativeT,
       typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
   Status HandleCeil(HloInstruction* ceil) {
-    return InvalidArgument("Unsupported type for Ceil");
+    return UnsupportedTypeError(ceil);
   }
 
   Status HandleCeil(HloInstruction* ceil) override {
@@ -297,8 +304,8 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
   template <
       typename NativeT,
       typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
-  Status HandleExpm1(HloInstruction* floor) {
-    return InvalidArgument("Unsupported type for Expm1");
+  Status HandleExpm1(HloInstruction* expm1) {
+    return UnsupportedTypeError(expm1);
   }
 
   Status HandleExpm1(HloInstruction* floor) override {
@@ -321,7 +328,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
       typename NativeT,
       typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
   Status HandleFloor(HloInstruction* floor) {
-    return InvalidArgument("Unsupported type for Floor");
+    return UnsupportedTypeError(floor);
   }
 
   Status HandleFloor(HloInstruction* floor) override {
@@ -351,12 +358,12 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
   template <
       typename NativeT,
       typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
-  Status HandleLog1p(HloInstruction* floor) {
-    return InvalidArgument("Unsupported type for Log1p");
+  Status HandleLog1p(HloInstruction* log1p) {
+    return UnsupportedTypeError(log1p);
   }
 
-  Status HandleLog1p(HloInstruction* floor) override {
-    return HandleLog1p<ReturnT>(floor);
+  Status HandleLog1p(HloInstruction* log1p) override {
+    return HandleLog1p<ReturnT>(log1p);
   }
 
   template <typename NativeT,
@@ -396,7 +403,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
       typename NativeT,
       typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
   Status HandleNot(HloInstruction* not_) {
-    return InvalidArgument("Unsupported type for Not");
+    return UnsupportedTypeError(not_);
   }
 
   Status HandleNot(HloInstruction* not_) override {
@@ -476,7 +483,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
   template <typename NativeT, typename std::enable_if<!std::is_floating_point<
                                   NativeT>::value>::type* = nullptr>
   Status HandleAtan2(HloInstruction* atan2) {
-    return InvalidArgument("Unsupported type for Atan2");
+    return UnsupportedTypeError(atan2);
   }
 
   Status HandleAtan2(HloInstruction* atan2) override {
@@ -624,7 +631,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
       typename NativeT,
       typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
   Status HandleMaximum(HloInstruction* maximum) {
-    return InvalidArgument("Unsupported type for Maximum");
+    return UnsupportedTypeError(maximum);
   }
 
   Status HandleMaximum(HloInstruction* maximum) override {
@@ -659,7 +666,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
       typename NativeT,
       typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
   Status HandleMinimum(HloInstruction* minimum) {
-    return InvalidArgument("Unsupported type for Minimum");
+    return UnsupportedTypeError(minimum);
   }
 
   Status HandleMinimum(HloInstruction* minimum) override {
@@ -724,7 +731,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
       typename NativeT,
       typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
   Status HandleRemainder(HloInstruction* remainder) {
-    return InvalidArgument("Unsupported type for Remainder");
+    return UnsupportedTypeError(remainder);
   }
 
   Status HandleRemainder(HloInstruction* remainder) override {
@@ -746,14 +753,14 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
   template <typename NativeT, typename std::enable_if<std::is_floating_point<
                                   NativeT>::value>::type* = nullptr>
   Status HandleAnd(HloInstruction* and_) {
-    return InvalidArgument("Unsupported type for And");
+    return UnsupportedTypeError(and_);
   }
 
   template <
       typename NativeT,
       typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
   Status HandleAnd(HloInstruction* and_) {
-    return InvalidArgument("Unsupported type for And");
+    return UnsupportedTypeError(and_);
   }
 
   Status HandleAnd(HloInstruction* and_) override {
@@ -775,7 +782,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
   template <typename NativeT, typename std::enable_if<std::is_floating_point<
                                   NativeT>::value>::type* = nullptr>
   Status HandleOr(HloInstruction* or_) {
-    return InvalidArgument("Unsupported type for Or");
+    return UnsupportedTypeError(or_);
   }
 
   template <
@@ -804,14 +811,14 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
   template <typename NativeT, typename std::enable_if<std::is_floating_point<
                                   NativeT>::value>::type* = nullptr>
   Status HandleXor(HloInstruction* xor_) {
-    return InvalidArgument("Unsupported type for Xor");
+    return UnsupportedTypeError(xor_);
   }
 
   template <
       typename NativeT,
       typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
   Status HandleXor(HloInstruction* xor_) {
-    return InvalidArgument("Unsupported type for Xor");
+    return UnsupportedTypeError(xor_);
   }
 
   Status HandleXor(HloInstruction* xor_) override {
@@ -836,8 +843,8 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
             typename std::enable_if<!std::is_integral<NativeT>::value ||
                                     std::is_same<NativeT, bool>::value>::type* =
                 nullptr>
-  Status HandleShiftLeft(HloInstruction*) {
-    return InvalidArgument("Unsupported type for ShiftLeft");
+  Status HandleShiftLeft(HloInstruction* shift) {
+    return UnsupportedTypeError(shift);
   }
 
   Status HandleShiftLeft(HloInstruction* shl) override {
@@ -866,8 +873,8 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
             typename std::enable_if<!std::is_integral<NativeT>::value ||
                                     std::is_same<NativeT, bool>::value>::type* =
                 nullptr>
-  Status HandleShiftRightArithmetic(HloInstruction*) {
-    return InvalidArgument("Unsupported type for ShiftRightArithmetic");
+  Status HandleShiftRightArithmetic(HloInstruction* shift) {
+    return UnsupportedTypeError(shift);
   }
 
   Status HandleShiftRightArithmetic(HloInstruction* shra) override {
@@ -897,8 +904,8 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
             typename std::enable_if<!std::is_integral<NativeT>::value ||
                                     std::is_same<NativeT, bool>::value>::type* =
                 nullptr>
-  Status HandleShiftRightLogical(HloInstruction*) {
-    return InvalidArgument("Unsupported type for ShiftRightLogical");
+  Status HandleShiftRightLogical(HloInstruction* shift) {
+    return UnsupportedTypeError(shift);
   }
 
   Status HandleShiftRightLogical(HloInstruction* shrl) override {
@@ -923,8 +930,8 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
   template <
       typename NativeT,
       typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
-  Status HandleClamp(HloInstruction*) {
-    return InvalidArgument("Unsupported type for Clamp");
+  Status HandleClamp(HloInstruction* clamp) {
+    return UnsupportedTypeError(clamp);
   }
 
   Status HandleClamp(HloInstruction* clamp) override {
@@ -1148,6 +1155,78 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
   }
 
   Status HandleDot(HloInstruction* dot) override {
+    if (parent_->use_fast_path_) {
+      return HandleDot<ReturnT>(dot);
+    }
+    return HandleDotSlowPath(dot);
+  }
+
+  template <typename NativeT, typename std::enable_if<std::is_same<
+                                  NativeT, float>::value>::type* = nullptr>
+  Status HandleDot(HloInstruction* dot) {
+    const HloInstruction* lhs = dot->operand(0);
+    const HloInstruction* rhs = dot->operand(1);
+    CHECK(ShapeUtil::IsArray(dot->shape()));
+    CHECK(ShapeUtil::IsArray(lhs->shape()));
+    CHECK(ShapeUtil::IsArray(rhs->shape()));
+
+    const auto& dnums = dot->dot_dimension_numbers();
+
+    const int64 lhs_rank = ShapeUtil::Rank(lhs->shape());
+    const int64 rhs_rank = ShapeUtil::Rank(rhs->shape());
+
+    CHECK(ShapeUtil::SameElementType(lhs->shape(), rhs->shape()));
+    CHECK(ShapeUtil::SameElementType(lhs->shape(), dot->shape()));
+
+    // There must be 1 and only 1 Contracting dimension for lhs and rhs.
+    CHECK_EQ(dnums.lhs_contracting_dimensions_size(), 1);
+    CHECK_EQ(dnums.rhs_contracting_dimensions_size(), 1);
+    const int64 lhs_contracting_dimension = dnums.lhs_contracting_dimensions(0);
+    const int64 rhs_contracting_dimension = dnums.rhs_contracting_dimensions(0);
+    // Contracted dimension sizes must be the same.
+    CHECK_EQ(lhs->shape().dimensions(lhs_contracting_dimension),
+             rhs->shape().dimensions(rhs_contracting_dimension))
+        << "lhs contracted dimension: "
+        << lhs->shape().dimensions(lhs_contracting_dimension)
+        << " rhs contracted dimension: "
+        << rhs->shape().dimensions(rhs_contracting_dimension);
+
+    // The fast path is for a simple rank 2 dot with default layout operands.
+    if (lhs_rank == 2 && rhs_rank == 2 && lhs_contracting_dimension == 1 &&
+        rhs_contracting_dimension == 0 &&
+        LayoutUtil::Equal(lhs->shape().layout(),
+                          LayoutUtil::GetDefaultLayoutForR2()) &&
+        LayoutUtil::Equal(rhs->shape().layout(),
+                          LayoutUtil::GetDefaultLayoutForR2()) &&
+        LayoutUtil::Equal(dot->shape().layout(),
+                          LayoutUtil::GetDefaultLayoutForR2())) {
+      const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs);
+      const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs);
+      const int64 contracted_dimension_size =
+          lhs->shape().dimensions(lhs_contracting_dimension);
+      Array2D<NativeT> lhs_array(lhs->shape().dimensions(0),
+                                 contracted_dimension_size);
+      lhs_array.SetValues(lhs_literal.data<NativeT>());
+      Array2D<NativeT> rhs_array(contracted_dimension_size,
+                                 rhs->shape().dimensions(1));
+      rhs_array.SetValues(rhs_literal.data<NativeT>());
+      std::unique_ptr<Array2D<NativeT>> result_array =
+          HloEvaluator::MatmulArray2D(lhs_array, rhs_array);
+      Literal result(dot->shape());
+      result.PopulateR2FromArray2D(*result_array);
+      parent_->evaluated_[dot] = std::move(result);
+      return Status::OK();
+    }
+    return HandleDotSlowPath(dot);
+  }
+
+  template <typename NativeT, typename std::enable_if<!std::is_same<
+                                  NativeT, float>::value>::type* = nullptr>
+  Status HandleDot(HloInstruction* dot) {
+    return HandleDotSlowPath(dot);
+  }
+
+  Status HandleDotSlowPath(HloInstruction* dot) {
     auto lhs = dot->operand(0);
     auto rhs = dot->operand(1);
     CHECK(ShapeUtil::IsArray(dot->shape()));
@@ -1578,7 +1657,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
                                     std::is_same<NativeT, bool>::value>::type* =
                 nullptr>
   Status HandleSort(HloInstruction* sort) {
-    return InvalidArgument("Unsupported type for Sort");
+    return UnsupportedTypeError(sort);
   }
 
   Status HandleSort(HloInstruction* sort) override {
@@ -2357,7 +2436,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
             std::is_same<NativeT, int64>::value ||
             std::is_same<NativeT, uint64>::value)>::type* = nullptr>
   Status HandleClz(HloInstruction* clz) {
-    return InvalidArgument("Unsupported type for Clz");
+    return UnsupportedTypeError(clz);
   }
 
   template <typename NativeT,
@@ -2403,7 +2482,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
       typename std::enable_if<std::is_integral<NativeT>::value ||
                               is_complex_t<NativeT>::value>::type* = nullptr>
   Status HandleSin(HloInstruction* sin) {
-    return InvalidArgument("Unsupported type for Sin");
+    return UnsupportedTypeError(sin);
   }
 
   Status HandleSin(HloInstruction* sin) override {
@@ -2425,7 +2504,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
       typename std::enable_if<std::is_integral<NativeT>::value ||
                               is_complex_t<NativeT>::value>::type* = nullptr>
   Status HandleCos(HloInstruction* cos) {
-    return InvalidArgument("Unsupported type for Cos");
+    return UnsupportedTypeError(cos);
   }
 
   Status HandleCos(HloInstruction* cos) override {
@@ -2534,7 +2613,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
       typename std::enable_if<std::is_integral<NativeT>::value ||
                               is_complex_t<NativeT>::value>::type* = nullptr>
   Status HandleReducePrecision(HloInstruction* reduce_precision) {
-    return InvalidArgument("Unsupported type for reduce precision");
+    return UnsupportedTypeError(reduce_precision);
   }
 
   Status HandleReducePrecision(HloInstruction* reduce_precision) override {
@@ -2543,15 +2622,27 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
 
   template <typename NativeT,
             typename std::enable_if<
+                std::is_same<NativeT, bfloat16>::value ||
+                std::is_same<NativeT, Eigen::half>::value ||
                 std::is_integral<NativeT>::value ||
                 std::is_floating_point<NativeT>::value>::type* = nullptr>
   Status HandleIota(HloInstruction* instruction) {
     auto* iota = Cast<HloIotaInstruction>(instruction);
+    const int64 iota_size = iota->shape().dimensions(iota->iota_dimension());
     // Avoid using std::vector since std::vector<bool> does not convert to
     // absl::Span<bool>.
-    absl::InlinedVector<NativeT, 1> data(
-        iota->shape().dimensions(iota->iota_dimension()));
-    std::iota(data.begin(), data.end(), 0);
+    absl::InlinedVector<NativeT, 1> data(iota_size);
+    // We don't use std::iota for two reasons:
+    //
+    // (1) std:iota does not support bfloat16 and float16.
+    //
+    // (2) std::iota saturates for floating point types when the value is not
+    //     representable, but the definition of HLO iota is the value as a
+    //     64-bit integer cast to the native type.
+    for (int64 i = 0; i < iota_size; ++i) {
+      // static_cast is required for Eigen::half (F16).
+      data[i] = static_cast<NativeT>(i);
+    }
     auto result = LiteralUtil::CreateR1<NativeT>(data);
 
     if (ShapeUtil::Rank(iota->shape()) > 1) {
@@ -2567,10 +2658,12 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
   }
   template <typename NativeT,
             typename std::enable_if<
-                !(std::is_integral<NativeT>::value ||
+                !(std::is_same<NativeT, bfloat16>::value ||
+                  std::is_same<NativeT, Eigen::half>::value ||
+                  std::is_integral<NativeT>::value ||
                   std::is_floating_point<NativeT>::value)>::type* = nullptr>
   Status HandleIota(HloInstruction* iota) {
-    return InvalidArgument("Unsupported type for iota");
+    return UnsupportedTypeError(iota);
   }
   Status HandleIota(HloInstruction* iota) override {
     return HandleIota<ReturnT>(iota);
diff --git a/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.cc b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.cc
index 631b3ad735f369922d10b37d11e2a1b1ba117e6b..c919dbd82d3668c477bf37074f1d56f8cb7d9506 100644
--- a/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.cc
+++ b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.cc
@@ -39,7 +39,7 @@ StatusOr<bool> ReplaceGetSize(HloInstruction* instr) {
   uint32 size = instr->operand(0)->shape().dimensions(instr->dimension());
   HloInstruction* new_instr = computation->AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<uint32>(size)));
-  TF_RETURN_IF_ERROR(computation->ReplaceInstruction(instr, new_instr));
+  TF_RETURN_IF_ERROR(instr->ReplaceAllUsesWith(new_instr));
   return true;
 }
 
@@ -50,12 +50,7 @@ StatusOr<bool> HloGetDimensionSizeRewriter::Run(HloModule* module) {
   HloProto proto;
   *proto.mutable_hlo_module() = module->ToProto();
   for (auto* computation : module->computations()) {
-    // Replacing instructions will change the instruction list in the
-    // computation. So instead of iterating computation->instructions()
-    // directly, we make a copy of the list to avoid use-after-free.
-    std::vector<HloInstruction*> instrs(computation->instruction_count());
-    absl::c_copy(computation->instructions(), instrs.begin());
-    for (auto instruction : instrs) {
+    for (auto instruction : computation->instructions()) {
       TF_ASSIGN_OR_RETURN(bool replaced, ReplaceGetSize(instruction));
       changed = changed || replaced;
     }
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
index 804feff290a1c0800a8e6bf209b042241b6cb759..5db21e47ca94af3b017e0401237692913365a48c 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include <deque>
 #include <map>
 #include <memory>
+#include <queue>
 #include <string>
 #include <tuple>
 #include <unordered_map>
@@ -38,6 +39,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_instructions.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_tfgraph_builder.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/window_util.h"
@@ -111,11 +113,6 @@ class NodeFilter {
            result == kSomeUsersOmitted;
   }
 
-  bool ShowFusionSubcomputation(const HloInstruction* instr) const {
-    CHECK_EQ(instr->opcode(), HloOpcode::kFusion);
-    return Show(instr) && !SomeOrAllOperandsOmitted(instr);
-  }
-
  private:
   std::function<NodeFilterResult(const HloInstruction* instr)> filter_;
 };
@@ -240,34 +237,28 @@ string HtmlLikeStringSanitize(absl::string_view s) {
 // it to a short string lets us tell the user what the subcomputation is without
 // drawing it as a graph.
 optional<string> MatchTrivialComputation(const HloComputation* computation) {
+  namespace m = match;
+
   if (computation->instruction_count() != 3) {
     return nullopt;
   }
-
   HloInstruction* root = computation->root_instruction();
-  if (root->operand_count() != 2) {
-    return nullopt;
-  }
-
-  // Check that both of the operands to the root are parameters.
-  const HloInstruction* operand0 = root->operand(0);
-  const HloInstruction* operand1 = root->operand(1);
-  if (operand0->opcode() != HloOpcode::kParameter ||
-      operand1->opcode() != HloOpcode::kParameter) {
+  const HloInstruction *param0, *param1;
+  if (!Match(root, m::Op()
+                       .WithNumOperands(2)
+                       .WithShape(m::Shape().IsEffectiveScalar())
+                       .WithBinaryOperandsAnyOrder(
+                           m::Parameter(&param0, 0)
+                               .WithShape(m::Shape().IsEffectiveScalar()),
+                           m::Parameter(&param1, 1)
+                               .WithShape(m::Shape().IsEffectiveScalar())))) {
     return nullopt;
   }
 
-  // Check that the two operands of root are param0 and param1.  All of the
-  // opcodes we recognize are commutative, so we're OK with either order.
-  auto n0 = operand0->parameter_number();
-  auto n1 = operand1->parameter_number();
-  if (!(n0 == 0 && n1 == 1) && !(n1 == 0 && n0 == 1)) {
-    return nullopt;
-  }
-
-  // If the params are reversed, check that the operation being performed is
-  // commutative.
-  if (n0 == 1) {
+  // If the params are reversed (i.e. operand0 is param1 and operand1 is
+  // param0), check that the operation being performed is commutative.
+  if (root->operand(0) == param1) {
+    CHECK_EQ(root->operand(1), param0);
     switch (root->opcode()) {
       case HloOpcode::kLe:
       case HloOpcode::kGe:
@@ -279,13 +270,6 @@ optional<string> MatchTrivialComputation(const HloComputation* computation) {
     }
   }
 
-  // Check that the root and params are all effective scalars.
-  if (!ShapeUtil::IsEffectiveScalar(root->shape()) ||
-      !ShapeUtil::IsEffectiveScalar(operand0->shape()) ||
-      !ShapeUtil::IsEffectiveScalar(operand1->shape())) {
-    return nullopt;
-  }
-
   // If we recognize the root's opcode, we've successfully pattern-matched!
   switch (root->opcode()) {
     case HloOpcode::kAdd:
@@ -578,7 +562,7 @@ bool HloDotDumper::ShouldShowSubcomputation(const HloComputation* subcomp) {
 
   // Show the subcomputation if we're showing any of its members.
   return std::any_of(
-      computation_->instructions().begin(), computation_->instructions().end(),
+      subcomp->instructions().begin(), subcomp->instructions().end(),
       [&](const HloInstruction* instr) { return filter_.Show(instr); });
 }
 
@@ -1298,7 +1282,8 @@ namespace {
 
 // Gets a NodeFilter that includes roughly all instructions whose distance from
 // root is <= radius.
-NodeFilter MakeNodeFilter(const HloInstruction* root, int64 radius) {
+NodeFilter MakeNodeRadiusAroundFilter(const HloInstruction* root,
+                                      int64 radius) {
   // First, find the neighborhood of nodes with distance from root <= radius.
   // These nodes are our initial set of "normal" nodes.
   std::unordered_map<const HloInstruction*, NodeFilterResult> nodes;
@@ -1405,6 +1390,56 @@ NodeFilter MakeNodeFilter(const HloInstruction* root, int64 radius) {
   });
 }
 
+// Gets a node filter that includes nodes on all paths from `from` to `to`.  If
+// the all-paths set contains more than max_nodes elements, includes the nodes
+// on the shortest paths and sets hit_limit to true.
+NodeFilter MakeNodeFromToFilter(const HloInstruction* from,
+                                const HloInstruction* to, int64 max_nodes,
+                                bool* hit_limit) {
+  *hit_limit = false;
+
+  // Elements in the queue are paths through the graph.
+  std::deque<std::vector<const HloInstruction*>> queue;
+  queue.push_front({from});
+
+  // Compute the set of nodes we want to show using a slightly-modified
+  // Djikstra's algorithm.  The only real difference is, rather than stopping
+  // when we find a (shortest) path, we continue until we've found max_nodes
+  // nodes on some path.
+  std::unordered_set<const HloInstruction*> visited;
+  std::unordered_set<const HloInstruction*> to_display = {from, to};
+  while (!queue.empty() && to_display.size() < max_nodes) {
+    std::vector<const HloInstruction*> path = std::move(queue.front());
+    queue.pop_front();
+    if (!visited.insert(path.back()).second) {
+      continue;
+    }
+
+    for (const auto* user : path.back()->users()) {
+      if (user == to) {
+        auto it = path.begin();
+        for (; it != path.end() && to_display.size() < max_nodes; ++it) {
+          to_display.insert(*it);
+        }
+        if (it != path.end()) {
+          *hit_limit = true;
+        }
+      } else if (!visited.count(user)) {
+        auto new_path = path;
+        new_path.push_back(user);
+        queue.push_back(std::move(new_path));
+      }
+    }
+  }
+
+  return NodeFilter([=](const HloInstruction* instr) {
+    if (instr == from || instr == to) {
+      return kHighlightNode;
+    }
+    return to_display.count(instr) ? kNormalNode : kHideNode;
+  });
+}
+
 string SaveGraph(const string& graph,
                  GraphRendererInterface::GraphKind graph_kind,
                  const string& dest_path) {
@@ -1439,14 +1474,15 @@ string ExportGraph(const string& graph,
                    GraphRendererInterface::GraphKind graph_kind,
                    const DebugOptions& debug_options) {
   string path = debug_options.xla_hlo_graph_path();
-  if (!path.empty()) {
+  if (!path.empty() && !debug_options.xla_hlo_dump_as_html()) {
     return SaveGraph(graph, graph_kind, path);
   } else {
     auto graph_renderer =
         GraphRendererRegistry::Default()->GetDefaultRenderer();
     CHECK(graph_renderer != nullptr)
         << "No registered renderer for the HLO graph. "
-           "Use --xla_hlo_graph_path=PATH to export to local file system";
+           "Use --xla_hlo_graph_path=PATH --xla_hlo_dump_as_html=false to "
+           "export to local file system";
     return graph_renderer->RenderGraph(graph, graph_kind, debug_options);
   }
 }
@@ -1484,7 +1520,7 @@ string DumpNeighborhoodAround(const HloInstruction& node, int radius,
   auto debug_options = node.GetModule()->config().debug_options();
   string label =
       StrCat("Neighborhood of ", radius, " nodes around ", node.name());
-  NodeFilter filter = MakeNodeFilter(&node, radius);
+  NodeFilter filter = MakeNodeRadiusAroundFilter(&node, radius);
   string graph =
       HloDotDumper(node.parent(), label, debug_options, show_backend_config,
                    /*profile=*/nullptr, filter)
@@ -1492,6 +1528,29 @@ string DumpNeighborhoodAround(const HloInstruction& node, int radius,
   return ExportGraph(graph, GraphRendererInterface::DOT_GRAPH, debug_options);
 }
 
+string DumpAllPathsFromTo(const HloInstruction& from, const HloInstruction& to,
+                          int64 max_nodes, bool show_backend_config) {
+  CHECK_EQ(from.parent(), to.parent()) << "Nodes must be in same computation!";
+  auto debug_options = from.GetModule()->config().debug_options();
+
+  bool hit_limit = false;
+  NodeFilter filter = MakeNodeFromToFilter(&from, &to, max_nodes, &hit_limit);
+  string label;
+  if (!hit_limit) {
+    label = StrCat("All paths from ", from.name(), " to ", to.name());
+  } else {
+    label = StrCat(max_nodes, " nodes on the shortest paths from ", from.name(),
+                   " to ", to.name(),
+                   "<br/><br/>***SHOWING ONLY A SUBSET OF ALL PATHS BETWEEN "
+                   "NODES***<br/><br/>");
+  }
+  string graph =
+      HloDotDumper(from.parent(), label, debug_options, show_backend_config,
+                   /*profile=*/nullptr, filter)
+          .Dump();
+  return ExportGraph(graph, GraphRendererInterface::DOT_GRAPH, debug_options);
+}
+
 void DumpText(const HloModule& module, const string& label,
               const string& directory_path, bool do_prefix) {
   Env* env = Env::Default();
@@ -1531,5 +1590,143 @@ string MaybeDumpHloModule(const HloModule& module, const string& label,
   return graph_url;
 }
 
+string WrapDotInHTML(const string& dot) {
+  static const char html_prefix[] = R"html(
+<!DOCTYPE html>
+<html>
+<head>
+  <meta charset="utf-8">
+  <style type="text/css">
+    html, body { height: 100%; }
+    body { margin: 0; }
+  </style>
+</head>
+<body>
+  <!-- Integrity hash is generated by https://www.srihash.org/ -->
+  <script src="https://cdn.jsdelivr.net/npm/viz.js@2.1.1/viz.js"
+     integrity="sha384-aD1MJYb0WKIUT+CtwJp5LTuV3U4pLAS6B/nUxL7ECimC2pN9N8vjlMr/yQCAkzxE"
+     crossorigin="anonymous"></script>
+  <script src="https://cdn.jsdelivr.net/npm/viz.js@2.1.1/full.render.js"
+     integrity="sha384-bAixY275aIpCj6Te19y0MILZ4V+VEC8CVFujFEH+Lf7W+4XYYeYLwW5IBI6yQmMT"
+     crossorigin="anonymous"></script>
+  <script src="https://cdn.jsdelivr.net/npm/svg-pan-zoom@3.6.0/dist/svg-pan-zoom.min.js"
+     integrity="sha384-3008WpYB2pOBvE7lwkrKf+qTmbTPGGPYxA9C1YVhvbPukns4ZFj7E98QPLkNW9dS"
+     crossorigin="anonymous"></script>
+  <div id="container" style="height:95%; border:1px solid black; "></div>
+  <script>
+    var data = `
+)html";
+
+  static const char html_suffix[] = R"html(
+`;
+    var cssregex = new RegExp('stylesheet=<([^]*)\n>\n', 'gm');
+    var results = cssregex.exec(data)
+    // graphviz has problem dealing with large stylesheets.
+    // https://github.com/tensorflow/tensorflow/issues/17220#issuecomment-369228492
+    // In order to avoid the problem, remove the stylesheet from the dot and
+    // insert it directly info the rendered SVG.
+    var dot_data = data;
+    var css_data = ''
+    if (results !== null) {
+        css_data = results[1].replace(/\s*data:.*\s*,/,''); // Strip content-type field.
+        dot_data = data.replace(cssregex, ''); // Remove the stylesheet
+    }
+
+    var render_start = performance.now()
+    function add_controls(svg) {
+        var htmlblob = new Blob([document.documentElement.innerHTML],
+                                {type: 'text/html'});
+        var savehtml = document.createElement('a');
+        savehtml.setAttribute('href', URL.createObjectURL(htmlblob));
+        savehtml.setAttribute('download', 'graph.html');
+        savehtml.innerHTML = " [Save HTML+SVG] ";
+        document.body.append(savehtml);
+        var svgblob = new Blob([svg.outerHTML], {type: 'image/svg'});
+        var savesvg = document.createElement('a');
+        savesvg.setAttribute('href', URL.createObjectURL(svgblob));
+        savesvg.setAttribute('download', 'graph.svg');
+        savesvg.innerHTML = " [Save SVG] ";
+        document.body.append(savesvg);
+        var dotblob =  new Blob([data], {type: 'text/dot'});
+        var savedot = document.createElement('a');
+        savedot.setAttribute('href', URL.createObjectURL(dotblob));
+        savedot.setAttribute('download', 'graph.dot');
+        savedot.innerHTML = " [Save DOT] ";
+        document.body.append(savedot);
+        // Will get called after embed element was loaded
+        var panzoom = svgPanZoom(svg, {
+            zoomEnabled: true,
+            controlIconsEnabled: true,
+        });
+        document.getElementsByTagName("BODY")[0].onresize = function() {
+            panzoom.resize();
+            panzoom.fit();
+            panzoom.center();
+        };
+        var render_end = performance.now();
+        var render_note = document.createElement('div')
+        render_note.innerHTML = 'Rendering took '
+                                + (render_end - render_start).toFixed(2) + "ms."
+        document.body.append(render_note);
+    }
+    var svg = document.getElementById('graph')
+    if (svg == null) {
+        // Need to render SVG first.
+        var viz = new Viz();
+        viz.renderSVGElement(dot_data)
+            .then(function(svg){
+                var container = document.getElementById('container')
+                var style = document.createElementNS('http://www.w3.org/2000/svg', 'style');
+                var node = document.createTextNode(css_data);
+                style.appendChild(node);
+                svg.setAttribute('width', '100%');
+                svg.setAttribute('height', 'auto');
+                svg.setAttribute('id', 'graph');
+                svg.appendChild(style);
+                container.appendChild(svg);
+                add_controls(svg);
+            })
+    } else {
+        // HTML already has rendered SVG embedded, so we just need to add
+        // controls.
+        add_controls(svg);
+    }
+  </script>
+</body>
+</html>
+)html";
+
+  return html_prefix + dot + html_suffix;
+}
+
+string RenderDotAsHTMLFile(const string& dot,
+                           const DebugOptions& debug_options) {
+  string html = WrapDotInHTML(dot);
+
+  auto env = tensorflow::Env::Default();
+  std::vector<string> dirs;
+  string output_dir = debug_options.xla_hlo_graph_path();
+  if (output_dir.empty()) {
+    env->GetLocalTempDirectories(&dirs);
+  } else {
+    dirs.push_back(output_dir);
+  }
+  // Try each directory, as they might be full, have inappropriate
+  // permissions or have different problems at times.
+  string output;
+  for (const string& dir : dirs) {
+    string filename = tensorflow::io::JoinPath(dir, "graph-");
+    if (env->CreateUniqueFileName(&filename, ".html")) {
+      output = filename;
+      break;
+    }
+  }
+  if (output.empty()) {
+    LOG(FATAL) << "Failed to create unique output file name.";
+  }
+  TF_CHECK_OK(tensorflow::WriteStringToFile(env, output, html));
+  return "file://" + output;
+}
+
 }  // namespace hlo_graph_dumper
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.h b/tensorflow/compiler/xla/service/hlo_graph_dumper.h
index 8d5945aba8cb0a7426597f07173e83c4574f3365..8e51454ef1cf992386cc7325e32705c08bf7712f 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.h
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.h
@@ -66,6 +66,12 @@ string DumpGraph(const HloComputation& computation, const string& label,
 string DumpNeighborhoodAround(const HloInstruction& node, int radius,
                               bool show_backend_config = false);
 
+// Dumps nodes on any of the paths from `from` to `to`.  If there are more than
+// max_nodes on all paths, restricts to the max_nodes nodes on the shortest
+// paths.
+string DumpAllPathsFromTo(const HloInstruction& from, const HloInstruction& to,
+                          int64 max_nodes, bool show_backend_config = false);
+
 // Dumps the HloModule::ToString() as a file into the provided directory path
 // suffixed with the provided label.
 //
@@ -75,6 +81,12 @@ string DumpNeighborhoodAround(const HloInstruction& node, int radius,
 void DumpText(const HloModule& module, const string& label,
               const string& directory_path, bool do_prefix = true);
 
+// Renders DOT graph as inline SVG and saves it in an HTML file in a temprary
+// directory or directory specified via --xla_hlo_graph_path. Returns the file
+// URI pointing to the file.
+string RenderDotAsHTMLFile(const string& dot,
+                           const DebugOptions& debug_options);
+
 // Graph renderers may be added using a registration mechanism, e.g.:
 // XLA_REGISTER_GRAPH_RENDERER(AGraphRendererClass, 100)
 // The renderer with the highest numeric priority value is used.
diff --git a/tensorflow/compiler/xla/service/hlo_graph_html_renderer.cc b/tensorflow/compiler/xla/service/hlo_graph_html_renderer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..84c4cf18df69816c611f4eb159ba247320ebc20e
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_graph_html_renderer.cc
@@ -0,0 +1,43 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Implementation of an DOT graph renderer that uses Javascript to render DOT to
+// SVG in a browser.
+
+#include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+
+namespace xla {
+namespace hlo_graph_dumper {
+namespace {
+
+class GraphHtmlRenderer : public GraphRendererInterface {
+ public:
+  string RenderGraph(const string& graph, GraphKind graph_kind,
+                     const DebugOptions& debug_options) override {
+    switch (graph_kind) {
+      case DOT_GRAPH:
+        return RenderDotAsHTMLFile(graph, debug_options);
+      default:
+        LOG(FATAL) << "Only DOT graphs can be rendered";
+    }
+  }
+};
+
+XLA_REGISTER_GRAPH_RENDERER(GraphHtmlRenderer);
+
+}  // namespace
+}  // namespace hlo_graph_dumper
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index 21b1dbc1676cccd2fe5b331a1f9d6ff5e3a73fcd..8b2ace1e82eff250f4d9f0d5630e9e6d646cfe6d 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -569,6 +569,11 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
   instruction->SetAndSanitizeName(proto.name());
   instruction->metadata_ = proto.metadata();
   instruction->backend_config_ = proto.backend_config();
+
+  TF_RET_CHECK(proto.id() >= 0)
+      << "Instruction with negative id: " << proto.id();
+  TF_RET_CHECK(proto.id() <= INT_MAX)
+      << "Instruction with id > INT_MAX: " << proto.id();
   instruction->unique_id_ = proto.id();
 
   if (proto.has_sharding()) {
@@ -914,12 +919,8 @@ HloInstruction::CreateDynamicUpdateSlice(const Shape& shape,
                                          HloInstruction* operand,
                                          HloInstruction* update,
                                          HloInstruction* start_indices) {
-  auto instruction = absl::WrapUnique(
-      new HloInstruction(HloOpcode::kDynamicUpdateSlice, shape));
-  instruction->AppendOperand(operand);
-  instruction->AppendOperand(update);
-  instruction->AppendOperand(start_indices);
-  return instruction;
+  return absl::make_unique<HloDynamicUpdateSliceInstruction>(
+      shape, operand, update, start_indices);
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateConcatenate(
@@ -1760,7 +1761,12 @@ bool HloInstruction::IdenticalSlowPath(
   return false;
 }
 
-uint64 HloInstruction::Hash() const {
+static uint64 HashOperand(const HloInstruction* hlo) {
+  return ShapeUtil::Hash(hlo->shape());
+}
+
+uint64 HloInstruction::Hash(
+    const std::function<uint64(const HloInstruction*)>& hash_operand) const {
   using tensorflow::Hash64Combine;
 
   uint64 hash_value = Hash64Combine(0, static_cast<uint64>(opcode()));
@@ -1769,7 +1775,7 @@ uint64 HloInstruction::Hash() const {
   if (!IsCrossModuleAllReduce()) {
     if (!operands().empty()) {
       for (size_t i = 0; i < operands().size(); ++i) {
-        hash_value = Hash64Combine(hash_value, operand(i)->Hash());
+        hash_value = Hash64Combine(hash_value, hash_operand(operand(i)));
       }
     }
   }
@@ -1778,6 +1784,11 @@ uint64 HloInstruction::Hash() const {
   return hash_value;
 }
 
+uint64 HloInstruction::Hash() const {
+  // Use HashOperand as an argument to prevent non-termination.
+  return Hash(HashOperand);
+}
+
 uint64 HloInstruction::InnerHash() const { return 13; }
 
 void HloInstruction::RemoveUser(HloInstruction* user) {
@@ -2059,6 +2070,10 @@ bool HloInstruction::IsCrossModuleAllReduce() const {
   return opcode() == HloOpcode::kCrossReplicaSum && all_reduce_id();
 }
 
+bool HloInstruction::IsCrossReplicaAllReduce() const {
+  return opcode() == HloOpcode::kCrossReplicaSum && !all_reduce_id();
+}
+
 string HloInstruction::ToStringWithCanonicalNameMap(
     const HloPrintOptions& options,
     CanonicalNameMap* canonical_name_map) const {
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index a54716217d6bbc5c0601f5d9ff7bf4072a6b30f5..dd77f101a049d7247dcf571d2d19cb4f74e2f8ea 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -909,6 +909,14 @@ class HloInstruction {
   // information on opcode, shape, operands, and typically a root instruction.
   // This function returns the same hash value for equivalent HLO instructions,
   // with respect to HloInstruction::Identical() method.
+  //
+  // Uses hash_operand function to compute hash values of its operands.
+  // At the very top level, hash_operand should be non-recursive to prevent
+  // non-termination.
+  uint64 Hash(
+      const std::function<uint64(const HloInstruction*)>& hash_operand) const;
+
+  // Calls the above method with non-recursive hash_operand function.
   uint64 Hash() const;
 
   // Returns whether the instruction has a constant operand.
@@ -1174,9 +1182,12 @@ class HloInstruction {
   // Returns true if this instruction is elementwise on all its operands.
   bool IsElementwise() const;
 
-  // Returns true if this is an cross module all-reduce instrucion.
+  // Returns true if this is a cross module all-reduce instruction.
   bool IsCrossModuleAllReduce() const;
 
+  // Returns true if this is a cross-replica all-reduce instruction.
+  bool IsCrossReplicaAllReduce() const;
+
   // Returns true if this elementwise instruction implicitly broadcasts operand
   // `operand_idx`.
   //
diff --git a/tensorflow/compiler/xla/service/hlo_instructions.cc b/tensorflow/compiler/xla/service/hlo_instructions.cc
index 1ea02cf9c03866a598bec0e5356f0eb31ad27755..5521e5bd9acefcd1cb7721ed55fe987189623404 100644
--- a/tensorflow/compiler/xla/service/hlo_instructions.cc
+++ b/tensorflow/compiler/xla/service/hlo_instructions.cc
@@ -905,7 +905,7 @@ string HloConstantInstruction::OperandsToStringWithCanonicalNameMap(
        options.print_large_constants())) {
     // Literal::ToString emits multidimensional arrays over multiple
     // lines. Compact this into one line by stripping out white space.
-    string tmp = literal().ToString();
+    string tmp = literal().ToStringWithoutShape();
     std::replace(tmp.begin(), tmp.end(), '\n', ' ');
     std::vector<string> v = absl::StrSplit(tmp, ' ');
     bool first = true;
@@ -1372,8 +1372,14 @@ bool HloFusionInstruction::IdenticalSlowPath(
                          other.fused_instructions_computation());
 }
 
+static uint64 HashOperandRecursive(const HloInstruction* hlo) {
+  return hlo->Hash(HashOperandRecursive);
+}
+
 uint64 HloFusionInstruction::InnerHash() const {
-  return fused_instructions_computation()->Hash();
+  // Use HashOperandRecursive to recursively compute hash on inner operands.
+  return fused_instructions_computation()->root_instruction()->Hash(
+      HashOperandRecursive);
 }
 
 std::unique_ptr<HloInstruction> HloFusionInstruction::CloneWithNewOperandsImpl(
@@ -1994,12 +2000,21 @@ std::unique_ptr<HloInstruction> HloPadInstruction::CloneWithNewOperandsImpl(
 HloDynamicSliceInstruction::HloDynamicSliceInstruction(
     const Shape& shape, HloInstruction* operand, HloInstruction* start_indices,
     absl::Span<const int64> slice_sizes)
-    : HloInstruction(HloOpcode::kDynamicSlice, shape),
+    : HloDynamicIndexInstruction(HloOpcode::kDynamicSlice, shape),
       dynamic_slice_sizes_(slice_sizes.begin(), slice_sizes.end()) {
   AppendOperand(operand);
   AppendOperand(start_indices);
 }
 
+HloDynamicUpdateSliceInstruction::HloDynamicUpdateSliceInstruction(
+    const Shape& shape, HloInstruction* operand, HloInstruction* update,
+    HloInstruction* start_indices)
+    : HloDynamicIndexInstruction(HloOpcode::kDynamicUpdateSlice, shape) {
+  AppendOperand(operand);
+  AppendOperand(update);
+  AppendOperand(start_indices);
+}
+
 HloInstructionProto HloDynamicSliceInstruction::ToProto() const {
   HloInstructionProto proto = HloInstruction::ToProto();
   for (int64 slice_size : dynamic_slice_sizes_) {
diff --git a/tensorflow/compiler/xla/service/hlo_instructions.h b/tensorflow/compiler/xla/service/hlo_instructions.h
index b5c28137a145667a977d39c9d3c40c6d36a8436e..5420d4ce11f4bdd068e82f208a98e9943ad4479e 100644
--- a/tensorflow/compiler/xla/service/hlo_instructions.h
+++ b/tensorflow/compiler/xla/service/hlo_instructions.h
@@ -1171,7 +1171,14 @@ class HloPadInstruction : public HloInstruction {
   PaddingConfig padding_config_;
 };
 
-class HloDynamicSliceInstruction : public HloInstruction {
+class HloDynamicIndexInstruction : public HloInstruction {
+ public:
+  explicit HloDynamicIndexInstruction(HloOpcode opcode, const Shape& shape)
+      : HloInstruction(opcode, shape) {}
+  virtual int64 index_operand_number() const = 0;
+};
+
+class HloDynamicSliceInstruction : public HloDynamicIndexInstruction {
  public:
   explicit HloDynamicSliceInstruction(const Shape& shape,
                                       HloInstruction* operand,
@@ -1189,6 +1196,8 @@ class HloDynamicSliceInstruction : public HloInstruction {
   // Returns a serialized representation of this instruction.
   HloInstructionProto ToProto() const override;
 
+  int64 index_operand_number() const override { return 1; }
+
  private:
   std::vector<string> ExtraAttributesToStringImpl(
       const HloPrintOptions& options) const override;
@@ -1206,6 +1215,16 @@ class HloDynamicSliceInstruction : public HloInstruction {
   std::vector<int64> dynamic_slice_sizes_;
 };
 
+class HloDynamicUpdateSliceInstruction : public HloDynamicIndexInstruction {
+ public:
+  explicit HloDynamicUpdateSliceInstruction(const Shape& shape,
+                                            HloInstruction* operand,
+                                            HloInstruction* update,
+                                            HloInstruction* start_indices);
+
+  int64 index_operand_number() const override { return 2; }
+};
+
 class HloGatherInstruction : public HloInstruction {
  public:
   explicit HloGatherInstruction(
diff --git a/tensorflow/compiler/xla/service/hlo_lexer.cc b/tensorflow/compiler/xla/service/hlo_lexer.cc
index 1390537101e95a08e4ba4eef7ae8d6059a40e916..dc712e5e42c449737bf4415f3a5e3eb9d81d9be4 100644
--- a/tensorflow/compiler/xla/service/hlo_lexer.cc
+++ b/tensorflow/compiler/xla/service/hlo_lexer.cc
@@ -19,6 +19,7 @@ limitations under the License.
 
 #include "absl/strings/escaping.h"
 #include "absl/strings/numbers.h"
+#include "absl/strings/str_split.h"
 #include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -82,9 +83,23 @@ tensorflow::RegexpStringPiece HloLexer::RegexpStringPieceFromPointers(
   return tensorflow::RegexpStringPiece(begin, end - begin);
 }
 
+TokKind HloLexer::LookAhead() {
+  if (GetKind() == TokKind::kEof || GetKind() == TokKind::kError) {
+    return GetKind();
+  }
+
+  const char* old_current_ptr = current_ptr_;
+  TokenState old_token_state = token_state_;
+  Lex();
+  TokKind kind = GetKind();
+  token_state_ = old_token_state;
+  current_ptr_ = old_current_ptr;
+  return kind;
+}
+
 TokKind HloLexer::LexToken() {
   while (true) {
-    token_start_ = current_ptr_;
+    token_state_.token_start = current_ptr_;
 
     int current_char = GetNextChar();
     switch (current_char) {
@@ -206,43 +221,37 @@ TokKind HloLexer::LexToken() {
 // dim_labels_pattern ::= [0-9bf]{2,}_[0-9io]{2,}->[0-9bf]{2,}
 // identifiers ::= other cases that match [a-zA-Z_][a-zA-Z0-9_.-]*
 TokKind HloLexer::LexIdentifier() {
-  {
-    auto consumable = RegexpStringPieceFromPointers(token_start_, buf_.end());
-    // 'consumable' will be advanced iff its prefix matches the pattern.
-    static LazyRE2 shape_pattern = {
-        R"(^(\w*\d*)\[([\d,\s]*)\](?:(dense|sparse)?{([\d,\s]+)})?)"};
-    if (RE2::Consume(&consumable, *shape_pattern)) {
-      auto status_or_shape = ShapeUtil::ParseShapeString(
-          StringPieceFromPointers(token_start_, consumable.begin()));
-      if (status_or_shape.ok()) {
-        // This is a shape string.
-        shape_val_ = status_or_shape.ValueOrDie();
-        current_ptr_ = consumable.begin();
-        return TokKind::kShape;
-      }
-    }
-  }
-
   while (IsIdentifierChar(PeekCurrentChar())) {
     current_ptr_++;
   }
 
   // If followed by ':', it's a name.
   if (PeekCurrentChar() == ':') {
-    str_val_.assign(token_start_, current_ptr_);
+    token_state_.str_val.assign(token_state_.token_start, current_ptr_);
     current_ptr_++;  // skip ':'
     return TokKind::kName;
   }
 
   // If followed by '=', it's a attribute name.
   if (PeekCurrentChar() == '=') {
-    str_val_.assign(token_start_, current_ptr_);
+    token_state_.str_val.assign(token_state_.token_start, current_ptr_);
     current_ptr_++;  // skip '='
     return TokKind::kAttributeName;
   }
 
   absl::string_view identifier =
-      StringPieceFromPointers(token_start_, current_ptr_);
+      StringPieceFromPointers(token_state_.token_start, current_ptr_);
+
+  // Primitive type strings are reserved words. The exception is 'tuple' whose
+  // type is represented using nested parentheses without the string 'tuple'.
+  if (primitive_util::IsPrimitiveTypeName(identifier)) {
+    PrimitiveType primitive_type =
+        primitive_util::StringToPrimitiveType(identifier).ValueOrDie();
+    if (primitive_type != TUPLE) {
+      token_state_.primitive_type_val = primitive_type;
+      return TokKind::kPrimitiveType;
+    }
+  }
 
   // See if this is a keyword.
 #define KEYWORD(STR)            \
@@ -261,21 +270,23 @@ TokKind HloLexer::LexIdentifier() {
   KEYWORD(ROOT);
   KEYWORD(maximal);
   KEYWORD(replicated);
+  KEYWORD(sparse);
 
 #undef KEYWORD
 
   {
-    auto consumable = RegexpStringPieceFromPointers(token_start_, buf_.end());
+    auto consumable =
+        RegexpStringPieceFromPointers(token_state_.token_start, buf_.end());
     static LazyRE2 dim_labels_pattern = {
         R"([0-9bf]{2,}_[0-9io]{2,}->[0-9bf]{2,})"};
     if (RE2::Consume(&consumable, *dim_labels_pattern)) {
       current_ptr_ = consumable.begin();
-      str_val_.assign(token_start_, current_ptr_);
+      token_state_.str_val.assign(token_state_.token_start, current_ptr_);
       return TokKind::kDimLabels;
     }
   }
 
-  str_val_ = string(identifier);
+  token_state_.str_val = string(identifier);
   return TokKind::kIdent;
 }
 
@@ -289,7 +300,7 @@ TokKind HloLexer::LexPercent() {
     while (IsIdentifierChar(PeekCurrentChar())) {
       current_ptr_++;
     }
-    str_val_.assign(name_start, current_ptr_);
+    token_state_.str_val.assign(name_start, current_ptr_);
     return TokKind::kName;
   }
   return TokKind::kError;
@@ -307,12 +318,14 @@ TokKind HloLexer::LexPercent() {
 // int ::=  [-]?[0-9]+
 // negative inf ::= '-inf'
 TokKind HloLexer::LexNumberOrPattern() {
-  auto consumable = RegexpStringPieceFromPointers(token_start_, buf_.end());
+  auto consumable =
+      RegexpStringPieceFromPointers(token_state_.token_start, buf_.end());
   static LazyRE2 float_pattern = {
       R"([-]?((\d+|\d+[.]\d*|\d*[.]\d+)([eE][+-]?\d+))|[-]?(\d+[.]\d*|\d*[.]\d+))"};
   if (RE2::Consume(&consumable, *float_pattern)) {
     current_ptr_ = consumable.begin();
-    CHECK(absl::SimpleAtod(string(token_start_, current_ptr_), &decimal_val_));
+    CHECK(absl::SimpleAtod(string(token_state_.token_start, current_ptr_),
+                           &token_state_.decimal_val));
     return TokKind::kDecimal;
   }
 
@@ -324,27 +337,28 @@ TokKind HloLexer::LexNumberOrPattern() {
 
   if (RE2::Consume(&consumable, *dim_labels_pattern)) {
     current_ptr_ = consumable.begin();
-    str_val_.assign(token_start_, current_ptr_);
+    token_state_.str_val.assign(token_state_.token_start, current_ptr_);
     return TokKind::kDimLabels;
   }
 
   if (RE2::Consume(&consumable, *dxd_pattern)) {
     current_ptr_ = consumable.begin();
-    str_val_.assign(token_start_, current_ptr_);
+    token_state_.str_val.assign(token_state_.token_start, current_ptr_);
     return TokKind::kDxD;
   }
 
   if (RE2::Consume(&consumable, *pad_pattern)) {
     current_ptr_ = consumable.begin();
-    str_val_.assign(token_start_, current_ptr_);
+    token_state_.str_val.assign(token_state_.token_start, current_ptr_);
     return TokKind::kPad;
   }
 
   static LazyRE2 int_pattern = {R"([-]?\d+)"};
   if (RE2::Consume(&consumable, *int_pattern)) {
     current_ptr_ = consumable.begin();
-    auto slice = StringPieceFromPointers(token_start_, current_ptr_);
-    if (absl::SimpleAtoi(slice, &int64_val_)) {
+    auto slice =
+        StringPieceFromPointers(token_state_.token_start, current_ptr_);
+    if (absl::SimpleAtoi(slice, &token_state_.int64_val)) {
       return TokKind::kInt;
     }
     LOG(ERROR) << "Failed to parse int literal: " << slice;
@@ -403,16 +417,17 @@ absl::string_view HloLexer::GetLine(LocTy loc) const {
 }
 
 // Lexes quoted string with escaping characters. If matched, the quoted string
-// will be unescaped and stored to str_val_.
+// will be unescaped and stored to token_state_.str_val.
 TokKind HloLexer::LexString() {
-  auto consumable = RegexpStringPieceFromPointers(token_start_, buf_.end());
+  auto consumable =
+      RegexpStringPieceFromPointers(token_state_.token_start, buf_.end());
   static LazyRE2 escaping_pattern = {R"("([^"\\]|\\.)*")"};
   if (RE2::Consume(&consumable, *escaping_pattern)) {
     current_ptr_ = consumable.begin();
     absl::string_view raw =
-        StringPieceFromPointers(token_start_ + 1, current_ptr_ - 1);
+        StringPieceFromPointers(token_state_.token_start + 1, current_ptr_ - 1);
     string error;
-    if (!absl::CUnescape(raw, &str_val_, &error)) {
+    if (!absl::CUnescape(raw, &token_state_.str_val, &error)) {
       LOG(ERROR) << "Failed unescaping string: " << raw << ". error: " << error;
       return TokKind::kError;
     }
@@ -467,6 +482,10 @@ string TokKindToString(TokKind kind) {
       return "kw_inf";
     case TokKind::kNegInf:
       return "kNegInf";
+    case TokKind::kw_sparse:
+      return "kw_sparse";
+    case TokKind::kPrimitiveType:
+      return "kPrimitiveType";
     case TokKind::kName:
       return "kName";
     case TokKind::kAttributeName:
@@ -481,8 +500,6 @@ string TokKindToString(TokKind kind) {
       return "kIdent";
     case TokKind::kString:
       return "kString";
-    case TokKind::kShape:
-      return "kShape";
     case TokKind::kInt:
       return "kInt";
     case TokKind::kDecimal:
diff --git a/tensorflow/compiler/xla/service/hlo_lexer.h b/tensorflow/compiler/xla/service/hlo_lexer.h
index d6a2b292a3916b2ff85f278cf5cb9f1567df88fa..41f5043904a2622814154693679a0e27cb92f642 100644
--- a/tensorflow/compiler/xla/service/hlo_lexer.h
+++ b/tensorflow/compiler/xla/service/hlo_lexer.h
@@ -19,7 +19,6 @@ limitations under the License.
 #include <string>
 
 #include "absl/strings/string_view.h"
-#include "tensorflow/compiler/xla/service/hlo_token.h"
 #include "tensorflow/compiler/xla/shape.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -29,6 +28,57 @@ limitations under the License.
 
 namespace xla {
 
+// Defines different kinds of tokens used by the HLO lexer.
+//
+// You shouldn't need to use this directly unless you're using HloLexer
+// directly, and you probably don't need to do that.  Use hlo_parser instead.
+enum class TokKind {
+  // Markers
+  kEof,
+  kError,
+
+  // Tokens with no info.
+  kEqual,  // =
+  kComma,  // ,
+  kColon,  // :
+  kLsquare,
+  kRsquare,  // [  ]
+  kLbrace,
+  kRbrace,  // {  }
+  kLparen,
+  kRparen,  // (  )
+
+  kArrow,  // ->
+
+  // Keywords
+  kw_HloModule,
+  kw_ENTRY,
+  kw_ROOT,
+  kw_true,
+  kw_false,
+  kw_maximal,
+  kw_replicated,
+  kw_nan,
+  kw_inf,
+  kw_sparse,
+
+  kNegInf,  // -inf
+
+  // Typed tokens.
+  kPrimitiveType,  // F32, PRED, etc.
+  kName,           // %foo
+  kAttributeName,  // dimensions=
+  kDimLabels,      // [0-9bf]{2,}_[0-9io]{2,}->[0-9bf]{2,}
+  kDxD,            // [0-9]+(x[0-9]+)+
+  kPad,            // [0-9]+_[0-9]+(_[0-9]+)?(x[0-9]+_[0-9]+(_[0-9]+)?)*
+  kIdent,          // other identifiers
+  kString,         // "abcd\"\n"
+  kInt,            // 42
+  kDecimal,        // 4.2
+};
+
+string TokKindToString(TokKind kind);
+
 // Lexer for the HloModule::ToString() format text.
 //
 // This class is meant to be used by hlo_parser.cc.  You shouldn't need to use
@@ -39,9 +89,9 @@ class HloLexer {
     current_ptr_ = buf_.begin();
   }
 
-  TokKind Lex() { return current_kind_ = LexToken(); }
+  TokKind Lex() { return token_state_.current_kind = LexToken(); }
 
-  TokKind GetKind() const { return current_kind_; }
+  TokKind GetKind() const { return token_state_.current_kind; }
   string GetStrVal() const {
     switch (GetKind()) {
       case TokKind::kName:
@@ -51,28 +101,28 @@ class HloLexer {
       case TokKind::kPad:
       case TokKind::kString:
       case TokKind::kIdent:
-        return str_val_;
+        return token_state_.str_val;
       default:
         LOG(FATAL) << "This token does not have string value";
     }
   }
-  Shape GetShapeVal() const {
-    CHECK(GetKind() == TokKind::kShape);
-    return shape_val_;
-  }
   tensorflow::int64 GetInt64Val() const {
     CHECK(GetKind() == TokKind::kInt);
-    return int64_val_;
+    return token_state_.int64_val;
   }
   double GetDecimalVal() const {
     CHECK(GetKind() == TokKind::kDecimal);
-    return decimal_val_;
+    return token_state_.decimal_val;
+  }
+  PrimitiveType GetPrimitiveTypeVal() const {
+    CHECK(GetKind() == TokKind::kPrimitiveType);
+    return token_state_.primitive_type_val;
   }
 
   typedef const char* LocTy;
 
   // Returns the location of the current token.
-  LocTy GetLoc() const { return token_start_; }
+  LocTy GetLoc() const { return token_state_.token_start; }
 
   // Returns the line and column of a location in the buffer.
   std::pair<unsigned, unsigned> GetLineAndColumn(LocTy location) const;
@@ -80,6 +130,9 @@ class HloLexer {
   // Returns the whole line given the location.
   absl::string_view GetLine(LocTy loc) const;
 
+  // Looks ahead one token and returns it. Lexer state is unchanged.
+  TokKind LookAhead();
+
  private:
   // Returns the current character. If it's neither the end of input buffer nor
   // an invalid character, moves the pointer forward.
@@ -112,12 +165,15 @@ class HloLexer {
   const char* current_ptr_;
 
   // Information about the current token.
-  const char* token_start_ = nullptr;
-  TokKind current_kind_;
-  string str_val_;
-  Shape shape_val_;
-  tensorflow::int64 int64_val_;
-  double decimal_val_;
+  struct TokenState {
+    const char* token_start = nullptr;
+    TokKind current_kind;
+    string str_val;
+    tensorflow::int64 int64_val;
+    double decimal_val;
+    PrimitiveType primitive_type_val;
+  };
+  TokenState token_state_;
 
   struct LineNoCacheTy {
     const char* last_query;
diff --git a/tensorflow/compiler/xla/service/hlo_liveness_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_liveness_analysis_test.cc
index e0ae1173c6114f0bc6ef18b2cfff9d54ccfe2faf..436cccb1fb9ecf6f4efad772c700c611b28ce628 100644
--- a/tensorflow/compiler/xla/service/hlo_liveness_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_liveness_analysis_test.cc
@@ -403,9 +403,9 @@ TEST_F(HloLivenessAnalysisTest, WhileWithOutfeed) {
   HloModule OutfeedLoop
   WhileBody {
     body_param = (s32[]) parameter(0)
-    token = token[] after-all()
+    token0 = token[] after-all()
     constant.2 = s32[] constant(2)
-    outfeed_tuple = (s32[]) outfeed(constant.2, token)
+    outfeed_tuple = (s32[]) outfeed(constant.2, token0)
     get-tuple-element.1 = s32[] get-tuple-element(body_param), index=0
     constant.1 = s32[] constant(1)
     add = s32[] add(get-tuple-element.1, constant.1)
@@ -436,9 +436,9 @@ TEST_F(HloLivenessAnalysisTest, NestedWhileWithOutfeed) {
   HloModule OutfeedLoop
   InnerWhileBody {
     body_param = (s32[]) parameter(0)
-    token = token[] after-all()
+    token0 = token[] after-all()
     constant.2 = s32[] constant(2)
-    outfeed_tuple = (s32[]) outfeed(constant.2, token)
+    outfeed_tuple = (s32[]) outfeed(constant.2, token0)
     get-tuple-element.1 = s32[] get-tuple-element(body_param), index=0
     constant.1 = s32[] constant(1)
     add = s32[] add(get-tuple-element.1, constant.1)
diff --git a/tensorflow/compiler/xla/service/hlo_matchers.h b/tensorflow/compiler/xla/service/hlo_matchers.h
index 235efb19ce4ed28a5cd9fe5ca52ae5d8e9e5ba3d..1fbcbdf98d68204b1c6269d51d9b19363761ee04 100644
--- a/tensorflow/compiler/xla/service/hlo_matchers.h
+++ b/tensorflow/compiler/xla/service/hlo_matchers.h
@@ -312,8 +312,8 @@ inline ::testing::Matcher<const ::xla::HloInstruction*> Shape(
 }
 inline ::testing::Matcher<const ::xla::HloInstruction*> Shape(
     absl::string_view shape) {
-  return ::testing::MakeMatcher(new ::xla::testing::HloShapeMatcher(
-      ShapeUtil::ParseShapeString(shape).ValueOrDie()));
+  return ::testing::MakeMatcher(
+      new ::xla::testing::HloShapeMatcher(ParseShape(shape).ValueOrDie()));
 }
 inline ::testing::Matcher<const ::xla::HloInstruction*> ShapeWithLayout(
     const class Shape& shape) {
@@ -323,7 +323,7 @@ inline ::testing::Matcher<const ::xla::HloInstruction*> ShapeWithLayout(
 inline ::testing::Matcher<const ::xla::HloInstruction*> ShapeWithLayout(
     absl::string_view shape) {
   return ::testing::MakeMatcher(new ::xla::testing::HloShapeAndLayoutMatcher(
-      ShapeUtil::ParseShapeString(shape).ValueOrDie()));
+      ParseShape(shape).ValueOrDie()));
 }
 
 // Verifies the value of the HloSharing against the provided sharding object.
diff --git a/tensorflow/compiler/xla/service/hlo_module.h b/tensorflow/compiler/xla/service/hlo_module.h
index 7b9cbf9a53a2201b1312405bbd7ed2b88f65c9be..f1310e4b270898a21dbb4f86123edde4ba8993d0 100644
--- a/tensorflow/compiler/xla/service/hlo_module.h
+++ b/tensorflow/compiler/xla/service/hlo_module.h
@@ -136,7 +136,9 @@ class HloModule {
   // information on opcode, shape, operands, and typically a root instruction.
   // This function returns the same hash value for equivalent HLO modules,
   // with respect to HloInstruction::Identical() method.
-  uint64 Hash() const { return entry_computation()->Hash(); }
+  uint64 Hash() const {
+    return entry_computation()->root_instruction()->Hash();
+  }
 
   // Gets the computations in this module.
   //
diff --git a/tensorflow/compiler/xla/service/hlo_module_dce_test.cc b/tensorflow/compiler/xla/service/hlo_module_dce_test.cc
index bf66cc6bc37a5e11c9ecfc07a62ba0ea5ca11a03..e535b7d74943943069b4d795cf999a3b1e963360 100644
--- a/tensorflow/compiler/xla/service/hlo_module_dce_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_module_dce_test.cc
@@ -373,9 +373,9 @@ TEST_F(HloModuleDceTest, WhileWithOutfeed) {
   HloModule OutfeedLoop
   WhileBody {
     body_param = (s32[]) parameter(0)
-    token = token[] after-all()
+    token0 = token[] after-all()
     constant.2 = s32[] constant(2)
-    outfeed_tuple = (s32[]) outfeed(constant.2, token)
+    outfeed_tuple = (s32[]) outfeed(constant.2, token0)
     get-tuple-element.1 = s32[] get-tuple-element(body_param), index=0
     constant.1 = s32[] constant(1)
     add = s32[] add(get-tuple-element.1, constant.1)
diff --git a/tensorflow/compiler/xla/service/hlo_parser.cc b/tensorflow/compiler/xla/service/hlo_parser.cc
index 9b5bb5d0bd6af104ef62eaa5d3e53cedbe0213d3..29bb088f6de9a5113d253b7e5559a8e66e7e408b 100644
--- a/tensorflow/compiler/xla/service/hlo_parser.cc
+++ b/tensorflow/compiler/xla/service/hlo_parser.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "absl/strings/str_split.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/service/hlo_domain_metadata.h"
 #include "tensorflow/compiler/xla/service/hlo_instructions.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
@@ -74,6 +75,7 @@ class HloParser {
   string GetError() const { return StrJoin(error_, "\n"); }
 
   // Stand alone parsing utils for various aggregate data types.
+  StatusOr<Shape> ParseShapeOnly();
   StatusOr<HloSharding> ParseShardingOnly();
   StatusOr<Window> ParseWindowOnly();
   StatusOr<ConvolutionDimensionNumbers> ParseConvolutionDimensionNumbersOnly();
@@ -255,7 +257,9 @@ class HloParser {
   bool ParseName(string* result);
   bool ParseAttributeName(string* result);
   bool ParseString(string* result);
+  bool ParseDimensionSizes(std::vector<int64>* dimension_sizes);
   bool ParseShape(Shape* result);
+  bool ParseLayout(Layout* layout);
   bool ParseOpcode(HloOpcode* result);
   bool ParseFftType(FftType* result);
   bool ParseFusionKind(HloInstruction::FusionKind* result);
@@ -279,9 +283,6 @@ class HloParser {
   // If the current token is 'kind', eats it (i.e. lexes the next token) and
   // returns true.
   bool EatIfPresent(TokKind kind);
-  // Parses a shape, and returns true if the result is compatible with the given
-  // shape.
-  bool EatShapeAndCheckCompatible(const Shape& shape);
 
   // Adds the instruction to the pool. Returns false and emits an error if the
   // instruction already exists.
@@ -1697,11 +1698,6 @@ bool HloParser::ParseSingleSharding(OpSharding* sharding,
         }
         break;
       }
-      case TokKind::kShape:
-        // TODO(b/112302613): Left here for backward compatibility to ignore the
-        // removed tile shape data.
-        lexer_.Lex();
-        break;
       case TokKind::kRbrace:
         break;
       default:
@@ -1925,19 +1921,6 @@ bool HloParser::SetValueInLiteralHelper(ParsedElemT value,
   return true;
 }
 
-bool HloParser::EatShapeAndCheckCompatible(const Shape& shape) {
-  Shape new_shape;
-  if (!ParseShape(&new_shape)) {
-    return TokenError(StrCat("expects shape ", ShapeUtil::HumanString(shape)));
-  }
-  if (!ShapeUtil::Compatible(shape, new_shape)) {
-    return TokenError(StrCat(
-        "expects shape ", ShapeUtil::HumanString(shape),
-        ", but sees a different shape: ", ShapeUtil::HumanString(new_shape)));
-  }
-  return true;
-}
-
 // literal
 //  ::= tuple
 //  ::= non_tuple
@@ -1952,10 +1935,6 @@ bool HloParser::ParseLiteral(Literal* literal, const Shape& shape) {
 //  ::= /*empty*/
 //  ::= literal (',' literal)*
 bool HloParser::ParseTupleLiteral(Literal* literal, const Shape& shape) {
-  if (!EatShapeAndCheckCompatible(shape)) {
-    return TokenError(StrCat("expects tuple constant in shape ",
-                             ShapeUtil::HumanString(shape)));
-  }
   if (!ParseToken(TokKind::kLparen, "expects '(' in front of tuple elements")) {
     return false;
   }
@@ -1990,16 +1969,12 @@ bool HloParser::ParseNonTupleLiteral(Literal* literal, const Shape& shape) {
     return ParseSparseLiteral(literal, shape);
   }
 
-  CHECK(LayoutUtil::IsDenseArray(shape));
+  CHECK(LayoutUtil::IsDenseArray(shape)) << shape.ToString(true);
   return ParseDenseLiteral(literal, shape);
 }
 
 bool HloParser::ParseDenseLiteral(Literal* literal, const Shape& shape) {
   const tensorflow::int64 rank = ShapeUtil::Rank(shape);
-  if (rank > 1 && !EatShapeAndCheckCompatible(shape)) {
-    return false;
-  }
-
   // Create a literal with the given shape in default layout.
   *literal = LiteralUtil::CreateFromDimensions(
       shape.element_type(), AsInt64Slice(shape.dimensions()));
@@ -2126,10 +2101,6 @@ bool HloParser::ParseDenseLiteral(Literal* literal, const Shape& shape) {
 }
 
 bool HloParser::ParseSparseLiteral(Literal* literal, const Shape& shape) {
-  if (!EatShapeAndCheckCompatible(shape)) {
-    return false;
-  }
-
   switch (shape.element_type()) {
     case PRED:
       return ParseSparseLiteralHelper<tensorflow::uint8>(literal, shape);
@@ -2994,6 +2965,39 @@ bool HloParser::ParseParamList() {
   return ParseToken(TokKind::kRparen, "expects ')' at the end of param list");
 }
 
+// dimension_sizes ::= '[' int64_list ']'
+bool HloParser::ParseDimensionSizes(std::vector<int64>* dimension_sizes) {
+  auto parse_and_add_item = [&]() {
+    tensorflow::int64 i;
+    if (!ParseInt64(&i)) {
+      return false;
+    }
+    dimension_sizes->push_back(i);
+    return true;
+  };
+  return ParseList(TokKind::kLsquare, TokKind::kRsquare, TokKind::kComma,
+                   parse_and_add_item);
+}
+
+// layout ::= '{' int64_list '}'
+bool HloParser::ParseLayout(Layout* layout) {
+  std::vector<int64> minor_to_major;
+  auto parse_and_add_item = [&]() {
+    tensorflow::int64 i;
+    if (!ParseInt64(&i)) {
+      return false;
+    }
+    minor_to_major.push_back(i);
+    return true;
+  };
+  if (!ParseList(TokKind::kLbrace, TokKind::kRbrace, TokKind::kComma,
+                 parse_and_add_item)) {
+    return false;
+  }
+  *layout = LayoutUtil::MakeLayout(minor_to_major);
+  return true;
+}
+
 // shape ::= shape_val_
 // shape ::= '(' tuple_elements ')'
 // tuple_elements
@@ -3017,19 +3021,61 @@ bool HloParser::ParseShape(Shape* result) {
     return ParseToken(TokKind::kRparen, "expects ')' at the end of tuple.");
   }
 
-  if (lexer_.GetKind() != TokKind::kShape) {
-    return TokenError(absl::StrCat("expected shape, saw ",
+  if (lexer_.GetKind() != TokKind::kPrimitiveType) {
+    return TokenError(absl::StrCat("expected primitive type, saw ",
                                    TokKindToString(lexer_.GetKind())));
   }
-  *result = lexer_.GetShapeVal();
+  PrimitiveType primitive_type = lexer_.GetPrimitiveTypeVal();
   lexer_.Lex();
+
+  std::vector<int64> dimension_sizes;
+  if (!ParseDimensionSizes(&dimension_sizes)) {
+    return false;
+  }
+  result->set_element_type(primitive_type);
+  *result->mutable_dimensions() = dimension_sizes;
+  LayoutUtil::SetToDefaultLayout(result);
+
+  if (lexer_.GetKind() == TokKind::kw_sparse) {
+    lexer_.Lex();
+    const string message =
+        "expects a brace-bracketed integer for sparse layout";
+    tensorflow::int64 max_sparse_elements;
+    if (!ParseToken(TokKind::kLbrace, message) ||
+        !ParseInt64(&max_sparse_elements) ||
+        !ParseToken(TokKind::kRbrace, message)) {
+      return false;
+    }
+    *result->mutable_layout() =
+        LayoutUtil::MakeSparseLayout(max_sparse_elements);
+    return true;
+  }
+
+  // We need to lookahead to see if a following open brace is the start of a
+  // layout. The specific problematic case is:
+  //
+  // ENTRY %foo (x: f32[42]) -> f32[123] {
+  //  ...
+  // }
+  //
+  // The open brace could either be the start of a computation or the start of a
+  // layout for the f32[123] shape. We consider it the start of a layout if the
+  // next token after the open brace is a integer
+  if (lexer_.GetKind() == TokKind::kLbrace &&
+      lexer_.LookAhead() == TokKind::kInt) {
+    Layout layout;
+    if (!ParseLayout(&layout)) {
+      return false;
+    }
+    *result->mutable_layout() = layout;
+  }
   return true;
 }
 
 bool HloParser::CanBeShape() {
-  // A non-tuple shape starts with a kShape token; a tuple shape starts with
-  // '('.
-  return lexer_.GetKind() == TokKind::kShape ||
+  // A non-tuple shape starts with a kPrimitiveType token; a tuple shape starts
+  // with '('.
+  return lexer_.GetKind() == TokKind::kPrimitiveType ||
          lexer_.GetKind() == TokKind::kLparen;
 }
 
@@ -3332,6 +3378,18 @@ bool HloParser::AddComputation(const string& name, HloComputation* computation,
   return true;
 }
 
+StatusOr<Shape> HloParser::ParseShapeOnly() {
+  lexer_.Lex();
+  Shape shape;
+  if (!ParseShape(&shape)) {
+    return InvalidArgument("Syntax error:\n%s", GetError());
+  }
+  if (lexer_.GetKind() != TokKind::kEof) {
+    return InvalidArgument("Syntax error:\nExtra content after shape");
+  }
+  return shape;
+}
+
 StatusOr<HloSharding> HloParser::ParseShardingOnly() {
   lexer_.Lex();
   OpSharding op_sharding;
@@ -3475,4 +3533,9 @@ StatusOr<PaddingConfig> ParsePaddingConfig(absl::string_view str) {
   return parser.ParsePaddingConfigOnly();
 }
 
+StatusOr<Shape> ParseShape(absl::string_view str) {
+  HloParser parser(str);
+  return parser.ParseShapeOnly();
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_parser.h b/tensorflow/compiler/xla/service/hlo_parser.h
index d830fa61438239005875f785f85cf2486123ebc9..450a54c54c156c2ae27475d145a8e83dc841b431 100644
--- a/tensorflow/compiler/xla/service/hlo_parser.h
+++ b/tensorflow/compiler/xla/service/hlo_parser.h
@@ -60,6 +60,9 @@ StatusOr<ConvolutionDimensionNumbers> ParseConvolutionDimensionNumbers(
 // Parses the result of PaddingConfigToString(), e.g. "0_0x1_1".
 StatusOr<PaddingConfig> ParsePaddingConfig(absl::string_view str);
 
+// Parses and returns a Shape::ToString-format string.
+StatusOr<Shape> ParseShape(absl::string_view str);
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_PARSER_H_
diff --git a/tensorflow/compiler/xla/service/hlo_parser_test.cc b/tensorflow/compiler/xla/service/hlo_parser_test.cc
index ab71f011ac9d77d00ddfb41aca7a224d26d416b7..80882d490d6b477403f87a4eb266d3ba2fdb3378 100644
--- a/tensorflow/compiler/xla/service/hlo_parser_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_parser_test.cc
@@ -82,7 +82,7 @@ ENTRY %constant_pred () -> pred[] {
 R"(HloModule module
 
 ENTRY %constant_pred_array () -> pred[2,3] {
-  ROOT %constant = pred[2,3]{1,0} constant(pred[2,3] { { 0, 1, 0 }, { 1, 0, 1 } })
+  ROOT %constant = pred[2,3]{1,0} constant({ { 0, 1, 0 }, { 1, 0, 1 } })
 }
 
 )"
@@ -128,7 +128,7 @@ ENTRY %ConstantF32Empty.v4 () -> f32[0] {
 R"(HloModule ConstantF32R4Empty_module
 
 ENTRY %ConstantF32R4Empty.v4 () -> f32[2,0,4,3] {
-  ROOT %constant = f32[2,0,4,3]{3,2,1,0} constant(f32[2,0,4,3] { { /*i0=0*/ }, { /*i0=1*/ } })
+  ROOT %constant = f32[2,0,4,3]{3,2,1,0} constant({ { /*i0=0*/ }, { /*i0=1*/ } })
 }
 
 )"
@@ -139,7 +139,7 @@ ENTRY %ConstantF32R4Empty.v4 () -> f32[2,0,4,3] {
 R"(HloModule Small_3x2x1x1_module
 
 ENTRY %Small_3x2x1x1.v1 () -> f32[3,2,1,1] {
-  ROOT %constant = f32[3,2,1,1]{3,2,1,0} constant(f32[3,2,1,1] { { /*i0=0*/ { /*i1=0*/ {-1} }, { /*i1=1*/ {4.1} } }, { /*i0=1*/ { /*i1=0*/ {2} }, { /*i1=1*/ {4.1} } }, { /*i0=2*/ { /*i1=0*/ {5} }, { /*i1=1*/ {4.4} } } })
+  ROOT %constant = f32[3,2,1,1]{3,2,1,0} constant({ { /*i0=0*/ { /*i1=0*/ {-1} }, { /*i1=1*/ {4.1} } }, { /*i0=1*/ { /*i1=0*/ {2} }, { /*i1=1*/ {4.1} } }, { /*i0=2*/ { /*i1=0*/ {5} }, { /*i1=1*/ {4.4} } } })
 }
 
 )"
@@ -196,7 +196,7 @@ ENTRY %add_constants () -> f32[] {
 R"(HloModule TupleConstant_module
 
 ENTRY %TupleConstant.v1 () -> (f32[2,1], f32[2]) {
-  ROOT %constant = (f32[2,1]{1,0}, f32[2]{0}) constant((f32[2,1], f32[2]) ( f32[2,1] { {1}, {2} }, {2, 42} ))
+  ROOT %constant = (f32[2,1]{1,0}, f32[2]{0}) constant(( { {1}, {2} }, {2, 42} ))
 }
 
 )"
@@ -295,11 +295,11 @@ ENTRY %WhileWithScalarS32Result.v2 () -> s32[] {
 R"(HloModule TwoSendRecvBothWayRecvFist_module
 
 ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> (f32[], token[]) {
-  %token = token[] after-all()
-  %recv = (f32[], u32[], token[]) recv(token[] %token), channel_id=15, sharding={maximal device=1}
+  %token0 = token[] after-all()
+  %recv = (f32[], u32[], token[]) recv(token[] %token0), channel_id=15, sharding={maximal device=1}
   ROOT %recv-done = (f32[], token[]) recv-done((f32[], u32[], token[]) %recv), channel_id=15, sharding={maximal device=1}
   %constant = f32[] constant(2.1), sharding={maximal device=0}
-  %send = (f32[], u32[], token[]) send(f32[] %constant, token[] %token), channel_id=16, sharding={maximal device=0}, control-predecessors={%recv}
+  %send = (f32[], u32[], token[]) send(f32[] %constant, token[] %token0), channel_id=16, sharding={maximal device=0}, control-predecessors={%recv}
   %send-done = token[] send-done((f32[], u32[], token[]) %send), channel_id=16, sharding={maximal device=0}
 }
 
@@ -310,11 +310,11 @@ ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> (f32[], token[]) {
 R"(HloModule HostTransferSendRecv_module
 
 ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> (f32[], token[]) {
-  %token = token[] after-all()
-  %recv = (f32[], u32[], token[]) recv(token[] %token), channel_id=15, is_host_transfer=true
+  %token0 = token[] after-all()
+  %recv = (f32[], u32[], token[]) recv(token[] %token0), channel_id=15, is_host_transfer=true
   ROOT %recv-done = (f32[], token[]) recv-done((f32[], u32[], token[]) %recv), channel_id=15, is_host_transfer=true
   %constant = f32[] constant(2.1), sharding={maximal device=0}
-  %send = (f32[], u32[], token[]) send(f32[] %constant, token[] %token), channel_id=16, is_host_transfer=true
+  %send = (f32[], u32[], token[]) send(f32[] %constant, token[] %token0), channel_id=16, is_host_transfer=true
   %send-done = token[] send-done((f32[], u32[], token[]) %send), channel_id=16, is_host_transfer=true
 }
 
@@ -327,7 +327,7 @@ R"(HloModule GetTupleElement_module
 
 ENTRY %GetTupleElement.v4 () -> s32[2,3] {
   %constant = f32[3]{0} constant({1, 2, 3})
-  %constant.1 = s32[2,3]{1,0} constant(s32[2,3] { { 1, 2, 3 }, { 4, 5, 6 } })
+  %constant.1 = s32[2,3]{1,0} constant({ { 1, 2, 3 }, { 4, 5, 6 } })
   %tuple = (f32[3]{0}, s32[2,3]{1,0}) tuple(f32[3]{0} %constant, s32[2,3]{1,0} %constant.1)
   ROOT %get-tuple-element = s32[2,3]{1,0} get-tuple-element((f32[3]{0}, s32[2,3]{1,0}) %tuple), index=1, sharding={maximal device=0}
 }
@@ -434,7 +434,7 @@ ENTRY %ConvolveBackward (input: f32[128,7,7,512], filter: f32[3,3,512,512]) -> f
 R"(HloModule Reverse4DFloatArrayOnDim01_module
 
 ENTRY %Reverse4DFloatArrayOnDim01.v2 () -> f32[4,3,2,1] {
-  %constant = f32[4,3,2,1]{0,1,2,3} constant(f32[4,3,2,1] { { /*i0=0*/ { /*i1=0*/ {1}, {2} }, { /*i1=1*/ {3}, {4} }, { /*i1=2*/ {5}, {6} } }, { /*i0=1*/ { /*i1=0*/ {7}, {8} }, { /*i1=1*/ {9}, {10} }, { /*i1=2*/ {11}, {12} } }, { /*i0=2*/ { /*i1=0*/ {13}, {14} }, { /*i1=1*/ {15}, {16} }, { /*i1=2*/ {17}, {18} } }, { /*i0=3*/ { /*i1=0*/ {19}, {20} }, { /*i1=1*/ {21}, {22} }, { /*i1=2*/ {23}, {24} } } })
+  %constant = f32[4,3,2,1]{0,1,2,3} constant({ { /*i0=0*/ { /*i1=0*/ {1}, {2} }, { /*i1=1*/ {3}, {4} }, { /*i1=2*/ {5}, {6} } }, { /*i0=1*/ { /*i1=0*/ {7}, {8} }, { /*i1=1*/ {9}, {10} }, { /*i1=2*/ {11}, {12} } }, { /*i0=2*/ { /*i1=0*/ {13}, {14} }, { /*i1=1*/ {15}, {16} }, { /*i1=2*/ {17}, {18} } }, { /*i0=3*/ { /*i1=0*/ {19}, {20} }, { /*i1=1*/ {21}, {22} }, { /*i1=2*/ {23}, {24} } } })
   ROOT %reverse = f32[4,3,2,1]{0,1,2,3} reverse(f32[4,3,2,1]{0,1,2,3} %constant), dimensions={0,1}
 }
 
@@ -446,8 +446,8 @@ ENTRY %Reverse4DFloatArrayOnDim01.v2 () -> f32[4,3,2,1] {
 R"(HloModule Concat2x3With2x5_module
 
 ENTRY %Concat2x3With2x5.v3 () -> f32[2,8] {
-  %constant = f32[2,3]{1,0} constant(f32[2,3] { { 0, 1, 2 }, { 1000, 1001, 1002 } })
-  %constant.1 = f32[2,5]{1,0} constant(f32[2,5] { { 64, 65, 66, 67, 68 }, { 1064, 1065, 1066, 1067, 1068 } })
+  %constant = f32[2,3]{1,0} constant({ { 0, 1, 2 }, { 1000, 1001, 1002 } })
+  %constant.1 = f32[2,5]{1,0} constant({ { 64, 65, 66, 67, 68 }, { 1064, 1065, 1066, 1067, 1068 } })
   ROOT %concatenate = f32[2,8]{1,0} concatenate(f32[2,3]{1,0} %constant, f32[2,5]{1,0} %constant.1), dimensions={1}
 }
 
@@ -471,8 +471,8 @@ R"(HloModule R4F32OverlapSmall_module
 }
 
 ENTRY %R4F32OverlapSmall.v4 () -> f32[4,5,1,1] {
-  %constant = f32[4,5,1,1]{3,2,1,0} constant(f32[4,5,1,1] { { /*i0=0*/ { /*i1=0*/ {7} }, { /*i1=1*/ {2} }, { /*i1=2*/ {5} }, { /*i1=3*/ {3} }, { /*i1=4*/ {8} } }, { /*i0=1*/ { /*i1=0*/ {3} }, { /*i1=1*/ {8} }, { /*i1=2*/ {9} }, { /*i1=3*/ {3} }, { /*i1=4*/ {4} } }, { /*i0=2*/ { /*i1=0*/ {1} }, { /*i1=1*/ {5} }, { /*i1=2*/ {7} }, { /*i1=3*/ {5} }, { /*i1=4*/ {6} } }, { /*i0=3*/ { /*i1=0*/ {0} }, { /*i1=1*/ {6} }, { /*i1=2*/ {2} }, { /*i1=3*/ {10} }, { /*i1=4*/ {2} } } })
-  %constant.1 = f32[2,2,1,1]{3,2,1,0} constant(f32[2,2,1,1] { { /*i0=0*/ { /*i1=0*/ {2} }, { /*i1=1*/ {6} } }, { /*i0=1*/ { /*i1=0*/ {3} }, { /*i1=1*/ {1} } } })
+  %constant = f32[4,5,1,1]{3,2,1,0} constant({ { /*i0=0*/ { /*i1=0*/ {7} }, { /*i1=1*/ {2} }, { /*i1=2*/ {5} }, { /*i1=3*/ {3} }, { /*i1=4*/ {8} } }, { /*i0=1*/ { /*i1=0*/ {3} }, { /*i1=1*/ {8} }, { /*i1=2*/ {9} }, { /*i1=3*/ {3} }, { /*i1=4*/ {4} } }, { /*i0=2*/ { /*i1=0*/ {1} }, { /*i1=1*/ {5} }, { /*i1=2*/ {7} }, { /*i1=3*/ {5} }, { /*i1=4*/ {6} } }, { /*i0=3*/ { /*i1=0*/ {0} }, { /*i1=1*/ {6} }, { /*i1=2*/ {2} }, { /*i1=3*/ {10} }, { /*i1=4*/ {2} } } })
+  %constant.1 = f32[2,2,1,1]{3,2,1,0} constant({ { /*i0=0*/ { /*i1=0*/ {2} }, { /*i1=1*/ {6} } }, { /*i0=1*/ { /*i1=0*/ {3} }, { /*i1=1*/ {1} } } })
   %constant.2 = f32[] constant(0)
   ROOT %select-and-scatter = f32[4,5,1,1]{3,2,1,0} select-and-scatter(f32[4,5,1,1]{3,2,1,0} %constant, f32[2,2,1,1]{3,2,1,0} %constant.1, f32[] %constant.2), window={size=2x3x1x1 stride=2x2x1x1}, select=%ge_F32.v3, scatter=%add_F32.v3
 }
@@ -523,7 +523,7 @@ ENTRY %slice.v2 (p0: f32[3,3,4,4]) -> f32[3,3,2,4] {
 R"(HloModule Slice3x3x3_To_1x3x3_F32_module
 
 ENTRY %Slice3x3x3_To_1x3x3_F32.v2 () -> f32[1,3,3] {
-  %constant = f32[3,3,3]{2,1,0} constant(f32[3,3,3] { { { 0, 1, 2 }, { 3, 4, 5 }, { 6, 7, 8 } }, { { 9, 10, 11 }, { 12, 13, 14 }, { 15, 16, 17 } }, { { 18, 19, 20 }, { 21, 22, 23 }, { 24, 25, 26 } } })
+  %constant = f32[3,3,3]{2,1,0} constant({ { { 0, 1, 2 }, { 3, 4, 5 }, { 6, 7, 8 } }, { { 9, 10, 11 }, { 12, 13, 14 }, { 15, 16, 17 } }, { { 18, 19, 20 }, { 21, 22, 23 }, { 24, 25, 26 } } })
   ROOT %slice = f32[1,3,3]{2,1,0} slice(f32[3,3,3]{2,1,0} %constant), slice={[0:1], [0:3], [0:3]}
 }
 
@@ -547,7 +547,7 @@ ENTRY %SliceR0.v2 () -> s32[] {
 R"(HloModule Transpose_module
 
 ENTRY %Transpose.v2 () -> s32[1,2,3] {
-  %constant = s32[1,2,3]{2,1,0} constant(s32[1,2,3] { { { 1, 2, 3 }, { 4, 5, 6 } } })
+  %constant = s32[1,2,3]{2,1,0} constant({ { { 1, 2, 3 }, { 4, 5, 6 } } })
   ROOT %transpose = s32[1,2,3]{2,1,0} transpose(s32[1,2,3]{2,1,0} %constant), dimensions={0,1,2}
 }
 
@@ -588,7 +588,7 @@ ENTRY %DynamicUpdateSlice.v4 (input: s32[1,1,25,1], update: s32[1,1,2,1], start_
 R"(HloModule BasicTraining_module
 
 ENTRY %BasicTraining.v4 () -> (f32[2,2,1,2], f32[2], f32[2]) {
-  %constant = f32[2,2,1,2]{3,2,1,0} constant(f32[2,2,1,2] { { /*i0=0*/ { /*i1=0*/ { 1, 2 } }, { /*i1=1*/ { 3, 4 } } }, { /*i0=1*/ { /*i1=0*/ { 5, 6 } }, { /*i1=1*/ { 7, 8 } } } })
+  %constant = f32[2,2,1,2]{3,2,1,0} constant({ { /*i0=0*/ { /*i1=0*/ { 1, 2 } }, { /*i1=1*/ { 3, 4 } } }, { /*i0=1*/ { /*i1=0*/ { 5, 6 } }, { /*i1=1*/ { 7, 8 } } } })
   %constant.1 = f32[2]{0} constant({2, 3})
   %constant.2 = f32[2]{0} constant({1, 2})
   ROOT %batch-norm-training = (f32[2,2,1,2]{3,2,1,0}, f32[2]{0}, f32[2]{0}) batch-norm-training(f32[2,2,1,2]{3,2,1,0} %constant, f32[2]{0} %constant.1, f32[2]{0} %constant.2), epsilon=0.001, feature_index=3
@@ -728,7 +728,7 @@ R"(HloModule fusion_module
 }
 
 ENTRY %fusion.v3 () -> f32[3,2,1,1] {
-  %constant = f32[3,2,1,1]{3,2,1,0} constant(f32[3,2,1,1] { { /*i0=0*/ { /*i1=0*/ {-1} }, { /*i1=1*/ {4.1} } }, { /*i0=1*/ { /*i1=0*/ {2} }, { /*i1=1*/ {4.1} } }, { /*i0=2*/ { /*i1=0*/ {5} }, { /*i1=1*/ {4.4} } } })
+  %constant = f32[3,2,1,1]{3,2,1,0} constant({ { /*i0=0*/ { /*i1=0*/ {-1} }, { /*i1=1*/ {4.1} } }, { /*i0=1*/ { /*i1=0*/ {2} }, { /*i1=1*/ {4.1} } }, { /*i0=2*/ { /*i1=0*/ {5} }, { /*i1=1*/ {4.4} } } })
   %constant.1 = f32[2]{0} constant({3.14, 4.25})
   ROOT %fusion = f32[3,2,1,1]{3,2,1,0} fusion(f32[3,2,1,1]{3,2,1,0} %constant, f32[2]{0} %constant.1), kind=kLoop, calls=%fused_computation
 }
@@ -740,7 +740,7 @@ ENTRY %fusion.v3 () -> f32[3,2,1,1] {
 R"(HloModule sparse_f32
 
 ENTRY %sparse () -> f32[2,3,4] {
-  ROOT %foo = f32[2,3,4]sparse{10} constant(f32[2,3,4]{[0, 1, 2]: 1, [1, 2, 3]: 2, [2, 3, 4]: 3})
+  ROOT %foo = f32[2,3,4]sparse{10} constant({[0, 1, 2]: 1, [1, 2, 3]: 2, [2, 3, 4]: 3})
 }
 
 )"
@@ -750,7 +750,7 @@ ENTRY %sparse () -> f32[2,3,4] {
 R"(HloModule sparse_f32_empty
 
 ENTRY %sparse_f32_empty () -> f32[2,3,4] {
-  ROOT %foo = f32[2,3,4]sparse{10} constant(f32[2,3,4]{})
+  ROOT %foo = f32[2,3,4]sparse{10} constant({})
 }
 
 )"
@@ -760,7 +760,7 @@ ENTRY %sparse_f32_empty () -> f32[2,3,4] {
 R"(HloModule sparse_f32_r1
 
 ENTRY %sparse_f32_r1 () -> f32[9] {
-  ROOT %foo = f32[9]sparse{10} constant(f32[9]{1: 2, 3: 4, 5: 6})
+  ROOT %foo = f32[9]sparse{10} constant({1: 2, 3: 4, 5: 6})
 }
 
 )"
@@ -931,11 +931,11 @@ ENTRY reduce_entry {
 R"(HloModule outfeed_module
 
 ENTRY InfeedToOutfeed {
-  token = token[] after-all()
-  infeed = ((u32[3]{0}, pred[]), token[]) infeed(token)
+  token0 = token[] after-all()
+  infeed = ((u32[3]{0}, pred[]), token[]) infeed(token0)
   infeed.data = (u32[3]{0}, pred[]) get-tuple-element(infeed), index=0
-  outfeed = token[] outfeed(infeed.data, token)
-  ROOT infeed.1 = ((u32[3]{0}, pred[]), token[]) infeed(token)
+  outfeed = token[] outfeed(infeed.data, token0)
+  ROOT infeed.1 = ((u32[3]{0}, pred[]), token[]) infeed(token0)
   infeed.1.data = (u32[3]{0}, pred[]) get-tuple-element(infeed.1), index=0
   infeed.1.token = token[] get-tuple-element(infeed.1), index=1
   outfeed.1 = token[] outfeed(infeed.1.data, infeed.1.token)
@@ -1266,8 +1266,8 @@ R"(HloModule AddDependency
 ENTRY AddDependency {
   p = f32[] parameter(0)
   neg = f32[] negate(p)
-  token = token[] after-all(neg)
-  p_after_token = f32[] add-dependency(p, token)
+  token0 = token[] after-all(neg)
+  p_after_token = f32[] add-dependency(p, token0)
   exp = f32[] exponential(p_after_token)
   ROOT sum = f32[] add(neg, exp)
 }
@@ -1419,7 +1419,7 @@ TEST_F(HloParserTest, MoreConstants) {
 
 ENTRY %SelectScalarS32True.v4 () -> s32[] {
   %constant.2 = pred[] constant(true)
-  %constant.1 = s32[] constant(-42), sharding={s32[5,6] devices=[2,2]1,2,3,4}
+  %constant.1 = s32[] constant(-42), sharding={devices=[2,2]1,2,3,4}
   %constant = s32[] constant(42)
   %select = s32[] select(pred[] %constant.2, s32[] %constant.1, s32[] %constant)
 }
@@ -1462,7 +1462,7 @@ TEST_F(HloParserTest, LiteralDimensionsMismatch_2) {
   const string original = R"(HloModule some_2x3_module
 
 ENTRY %some_2x3 () -> f32[2,3] {
-  ROOT %constant = f32[2,3]{1,0} constant(f32[2,3] {1, 2, 3, 4, 5, 6})
+  ROOT %constant = f32[2,3]{1,0} constant({1, 2, 3, 4, 5, 6})
 }
 
 )";
@@ -1476,7 +1476,7 @@ TEST_F(HloParserTest, LiteralDimensionsMismatch_3) {
   const string original = R"(HloModule some_2x3x2_module
 
 ENTRY %some_2x3x2 () -> f32[2,3,2] {
-  ROOT %constant = f32[2,3,2]{2,1,0} constant(f32[2,3,2] {{{1, 2}, {3, 4}, {5, 6}, {7, 8}, {9, 10}, {11, 12}}})
+  ROOT %constant = f32[2,3,2]{2,1,0} constant({{{1, 2}, {3, 4}, {5, 6}, {7, 8}, {9, 10}, {11, 12}}})
 }
 
 )";
@@ -1594,11 +1594,11 @@ TEST_F(HloParserTest, UnexpectedAttribute) {
   const string original = R"(HloModule unexpected_attr_module
 
 ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> f32[] {
-  %token = token[] after-all()
-  %recv = (f32[], u32[], token[]) recv(token[] %token), channel_id=15
+  %token0 = token[] after-all()
+  %recv = (f32[], u32[], token[]) recv(token[] %token0), channel_id=15
   %recv-done = (f32[], token[]) recv-done((f32[], u32[], token[]) %recv), channel_id=15
   ROOT %constant = f32[] constant(2.1)
-  %send = (f32[], u32[], token[]) send(f32[] %constant, token[] %token), channel_id=16, calls=%recv
+  %send = (f32[], u32[], token[]) send(f32[] %constant, token[] %token0), channel_id=16, calls=%recv
   %send-done = token[] send-done((f32[], u32[], token[]) %send), channel_id=16
 }
 
@@ -1611,11 +1611,11 @@ TEST_F(HloParserTest, MissingAttribute) {
   const string original = R"(HloModule missing_attr_module
 
 ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> f32[] {
-  %token = token[] after-all()
-  %recv = (f32[], u32[], token[]) recv(token[] %token), channel_id=15
+  %token0 = token[] after-all()
+  %recv = (f32[], u32[], token[]) recv(token[] %token0), channel_id=15
   %recv-done = (f32[], token[]) recv-done((f32[], u32[], token[]) %recv), channel_id=15
   ROOT %constant = f32[] constant(-2.1)
-  %send = (f32[], u32[], token[]) send(f32[] %constant, token[] %token)
+  %send = (f32[], u32[], token[]) send(f32[] %constant, token[] %token0)
   %send-done = token[] send-done((f32[], u32[], token[]) %send), channel_id=16
 }
 
@@ -1628,11 +1628,11 @@ TEST_F(HloParserTest, PredecessorUndefined) {
   const string original = R"(HloModule pre_not_found_module
 
 ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> f32[] {
-  %token = token[] after-all()
-  %recv = (f32[], u32[], token[]) recv(token[] %token), channel_id=15
+  %token0 = token[] after-all()
+  %recv = (f32[], u32[], token[]) recv(token[] %token0), channel_id=15
   %recv-done = (f32[], token[]) recv-done((f32[], u32[], token[]) %recv), channel_id=15
   ROOT %constant = f32[] constant(2.1)
-  %send = (f32[], u32[], token[]) send(f32[] %constant, token[] %token), channel_id=16, control-predecessors={%done}
+  %send = (f32[], u32[], token[]) send(f32[] %constant, token[] %token0), channel_id=16, control-predecessors={%done}
   %send-done = token[] send-done((f32[], u32[], token[]) %send), channel_id=16
 }
 
@@ -1940,8 +1940,8 @@ TEST_F(HloParserTest, ParsePaddingConfigInteriorPaddingImplicitZeroDim) {
 TEST_F(HloParserTest, NontupleInfeed) {
   const string original = R"(HloModule nontuple_infeed:
 ENTRY nontuple_infeed {
-  token = token[] after-all()
-  ROOT infeed = pred[] infeed(token)
+  token0 = token[] after-all()
+  ROOT infeed = pred[] infeed(token0)
 })";
   ExpectHasSubstr(ParseHloString(original).status().error_message(),
                   "infeed must have a non-empty tuple shape");
@@ -2239,7 +2239,7 @@ HloModule foobar
 
 ENTRY %entrycomp (p: f32[2,2]) -> f32[2,2] {
   %p = f32[2,2] parameter(0)
-  %constant.1 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
+  %constant.1 = f32[2,2] constant({{1, 2}, {3, 4}})
   ROOT %add.1 = f32[2,2] add(f32[2,2] %p, f32[2,5] %constant.1)
 }
 )";
@@ -2249,7 +2249,85 @@ ENTRY %entrycomp (p: f32[2,2]) -> f32[2,2] {
                   " with the shape of the operand instruction f32[2,2]{1,0}.");
 }
 
-// custom call incompatible shape.
+TEST_F(HloParserTest, ParseShapeStringR2F32) {
+  string shape_string = "f32[123,456]";
+  TF_ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape(shape_string));
+  Shape expected = ShapeUtil::MakeShape(F32, {123, 456});
+  ASSERT_TRUE(ShapeUtil::Equal(expected, actual))
+      << "expected: " << ShapeUtil::HumanString(expected)
+      << "actual:   " << ShapeUtil::HumanString(actual);
+}
+
+TEST_F(HloParserTest, ParseShapeStringTupleOfArrays) {
+  string shape_string = "(f32[1572864],s8[5120,1024])";
+  TF_ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape(shape_string));
+  Shape expected =
+      ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {1572864}),
+                                 ShapeUtil::MakeShape(S8, {5120, 1024})});
+  ASSERT_TRUE(ShapeUtil::Equal(expected, actual))
+      << "expected: " << ShapeUtil::HumanString(expected)
+      << "actual:   " << ShapeUtil::HumanString(actual);
+}
+
+TEST_F(HloParserTest, ParseShapeStringNestedTuple) {
+  string shape_string = "(f32[1],(f32[2], token[]), opaque[], f32[3])";
+  TF_ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape(shape_string));
+  Shape expected = ShapeUtil::MakeTupleShape({
+      ShapeUtil::MakeShape(F32, {1}),
+      ShapeUtil::MakeTupleShape(
+          {ShapeUtil::MakeShape(F32, {2}), ShapeUtil::MakeTokenShape()}),
+      ShapeUtil::MakeOpaqueShape(),
+      ShapeUtil::MakeShape(F32, {3}),
+  });
+  ASSERT_TRUE(ShapeUtil::Equal(expected, actual))
+      << "expected: " << ShapeUtil::HumanString(expected)
+      << "actual:   " << ShapeUtil::HumanString(actual);
+}
+
+TEST_F(HloParserTest, ParseShapeStringWithLayout) {
+  string shape_string = "f32[123,456]{0,1}";
+  TF_ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape(shape_string));
+  Shape expected = ShapeUtil::MakeShapeWithLayout(F32, {123, 456}, {0, 1});
+  ASSERT_TRUE(ShapeUtil::Equal(expected, actual))
+      << "expected: " << ShapeUtil::HumanString(expected)
+      << "actual:   " << ShapeUtil::HumanString(actual);
+}
+
+TEST_F(HloParserTest, ParseShapeStringWithSparseLayout) {
+  string shape_string = "f32[123,456]sparse{10}";
+  TF_ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape(shape_string));
+  Shape expected = ShapeUtil::MakeShapeWithSparseLayout(F32, {123, 456}, 10);
+  ASSERT_TRUE(ShapeUtil::Equal(expected, actual))
+      << "expected: " << ShapeUtil::HumanString(expected)
+      << "actual: " << ShapeUtil::HumanString(actual);
+}
+
+TEST_F(HloParserTest, ParseOpaqueType) {
+  TF_ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape("opaque[]"));
+  Shape expected = ShapeUtil::MakeOpaqueShape();
+  ASSERT_TRUE(ShapeUtil::Equal(expected, actual))
+      << "expected: " << ShapeUtil::HumanString(expected)
+      << "actual:   " << ShapeUtil::HumanString(actual);
+}
+
+TEST_F(HloParserTest, ParseTokenType) {
+  TF_ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape("token[]"));
+  Shape expected = ShapeUtil::MakeTokenShape();
+  ASSERT_TRUE(ShapeUtil::Equal(expected, actual))
+      << "expected: " << ShapeUtil::HumanString(expected)
+      << "actual:   " << ShapeUtil::HumanString(actual);
+}
+
+TEST_F(HloParserTest, ParseInvalidShapeString) {
+  string shape_strings[] = {
+      "f32[123,456]foobar{0,1}", "f32[123,456]sparse{0,1}", "f32[123,456]{foo}",
+      "f32[123,456]dense{foo}",  "f32[123,456]sparse{foo}",
+  };
+  for (const string& shape_string : shape_strings) {
+    StatusOr<Shape> result = ParseShape(shape_string);
+    ASSERT_FALSE(result.ok()) << "shape: " << shape_string;
+  }
+}
 
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
index 312b5d020c398feb7738d14a9cfa0928d5178948..33ce7e23a82d840676bba5f1ca9c0ffc4433465d 100644
--- a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
+++ b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
@@ -77,6 +77,11 @@ std::vector<HloPassInterface*> HloPassPipeline::GetEnabledPasses(
   auto repeated_field = debug_options.xla_disable_hlo_passes();
   absl::flat_hash_set<string> disabled_pass_names(repeated_field.begin(),
                                                   repeated_field.end());
+  if (debug_options.xla_disable_all_hlo_passes()) {
+    VLOG(1) << "*All* passes disabled by --xla_disable_all_hlo_passes.";
+    return {};
+  }
+
   if (!disabled_pass_names.empty()) {
     VLOG(1) << "Passes disabled by --xla_disable_hlo_passes: "
             << absl::StrJoin(disabled_pass_names, ", ");
@@ -113,7 +118,7 @@ void HloPassPipeline::MaybeDumpHlo(const HloModule& module,
   }
 
   const string message =
-      StrCat("after ", after_pass_name, ", before ", before_pass_name);
+      absl::StrCat("after ", after_pass_name, ", before ", before_pass_name);
   hlo_graph_dumper::MaybeDumpHloModule(module, message);
   VLOG(3) << "HLO " << message << ":";
   VLOG(3) << module.entry_computation_layout().ToString();
diff --git a/tensorflow/compiler/xla/service/hlo_proto_util.cc b/tensorflow/compiler/xla/service/hlo_proto_util.cc
index 981d06ce101644ecce587c4bd2f7a12c8edf6548..3a9ee57e5551ae5b608f02d9f8bd0428ff16db13 100644
--- a/tensorflow/compiler/xla/service/hlo_proto_util.cc
+++ b/tensorflow/compiler/xla/service/hlo_proto_util.cc
@@ -39,6 +39,7 @@ HloProto MakeHloProto(const HloModule& module) {
 
 StatusOr<std::unique_ptr<HloModule>> CreateModuleFromProto(
     const HloModuleProto& proto, const HloModuleConfig& module_config) {
+  VLOG(4) << proto.ShortDebugString();
   TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> module,
                       HloModule::CreateFromProto(proto, module_config));
   TF_RETURN_IF_ERROR(
diff --git a/tensorflow/compiler/xla/service/hlo_token.h b/tensorflow/compiler/xla/service/hlo_token.h
deleted file mode 100644
index 4458c251dee4af365e39027dd4289925c8890efd..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/hlo_token.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_TOKEN_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_TOKEN_H_
-
-#include <string>
-
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/platform/types.h"
-
-namespace xla {
-
-// Defines different kinds of tokens in a hlo module string.
-//
-// You shouldn't need to use this directly unless you're using HloLexer
-// directly, and you probably don't need to do that.  Use hlo_parser instead.
-enum class TokKind {
-  // Markers
-  kEof,
-  kError,
-
-  // Tokens with no info.
-  kEqual,  // =
-  kComma,  // ,
-  kColon,  // :
-  kLsquare,
-  kRsquare,  // [  ]
-  kLbrace,
-  kRbrace,  // {  }
-  kLparen,
-  kRparen,  // (  )
-
-  kArrow,    // ->
-
-  // Keywords
-  kw_HloModule,
-  kw_ENTRY,
-  kw_ROOT,
-  kw_true,
-  kw_false,
-  kw_maximal,
-  kw_replicated,
-  kw_nan,
-  kw_inf,
-
-  kNegInf,  // -inf
-
-  // Typed tokens.
-  kName,           // %foo
-  kAttributeName,  // dimensions=
-  kDimLabels,      // [0-9bf]{2,}_[0-9io]{2,}->[0-9bf]{2,}
-  kDxD,            // [0-9]+(x[0-9]+)+
-  kPad,            // [0-9]+_[0-9]+(_[0-9]+)?(x[0-9]+_[0-9]+(_[0-9]+)?)*
-  kIdent,          // other identifiers
-  kString,         // "abcd\"\n"
-  kShape,          // f32[2,3]{1,0}
-  kInt,            // 42
-  kDecimal,        // 4.2
-};
-
-string TokKindToString(TokKind kind);
-
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_TOKEN_H_
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc
index 77db7b098a38ff4efdcc7447935fae61561c9ff4..ace854ed6a243c3788a46333f41cb85d90c8e174 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier.cc
@@ -481,7 +481,9 @@ Status ShapeVerifier::HandleCustomCall(HloInstruction* instruction) {
       const Shape& operand_shape_with_layout =
           custom_call->operand_shapes_with_layout()[i];
       TF_RET_CHECK(ShapeUtil::Compatible(custom_call->operand(i)->shape(),
-                                         operand_shape_with_layout));
+                                         operand_shape_with_layout))
+          << custom_call->operand(i)->shape().ToString() << " operand "
+          << operand_shape_with_layout.ToString();
       TF_RET_CHECK(LayoutUtil::HasLayout(operand_shape_with_layout));
     }
   }
diff --git a/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc b/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc
index 98246d5403e4aebc2f4d81e52145706355ddd9a9..295465c8481bcb7d1385192febe0d09614e393b3 100644
--- a/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc
@@ -99,7 +99,7 @@ TEST_F(IndexedArrayAnalysisTest, SimpleOneToOneConstantGather) {
 HloModule SimpleGather
 
 ENTRY main {
-  operand = s32[3,3] constant(s32[3,3]{{1,2,3},{1,2,3},{1,2,3}})
+  operand = s32[3,3] constant({{1,2,3},{1,2,3},{1,2,3}})
   indices = s32[5] parameter(0)
   ROOT gather = s32[5,3] gather(operand, indices),
       offset_dims={1},
@@ -119,7 +119,7 @@ TEST_F(IndexedArrayAnalysisTest, GatherIsNotScalarIndexed0) {
 HloModule SimpleGather
 
 ENTRY main {
-  operand = s32[3,3] constant(s32[3,3]{{1,2,3},{1,2,3},{1,2,3}})
+  operand = s32[3,3] constant({{1,2,3},{1,2,3},{1,2,3}})
   indices = s32[5,2] parameter(0)
   ROOT gather = s32[5] gather(operand, indices),
       offset_dims={},
@@ -195,7 +195,7 @@ TEST_F(IndexedArrayAnalysisTest, GatherOfGather_OneToOne) {
 HloModule SimpleGather
 
 ENTRY main {
-  operand = s32[3,3] constant(s32[3,3]{{1,2,3},{1,2,3},{1,2,3}})
+  operand = s32[3,3] constant({{1,2,3},{1,2,3},{1,2,3}})
   indices_a = s32[5] parameter(0)
   indices_b = s32[2] parameter(1)
   gather_a = s32[5,3] gather(operand, indices_a),
@@ -309,7 +309,7 @@ TEST_F(IndexedArrayAnalysisTest, ReshapeOfGather0) {
 HloModule ReshapeOfGather
 
 ENTRY main {
-  operand = s32[3,4] constant(s32[3,4]{{1,2,3,4},{1,2,3,4},{1,2,3,4}})
+  operand = s32[3,4] constant({{1,2,3,4},{1,2,3,4},{1,2,3,4}})
   indices = s32[5] parameter(0)
   gather = s32[5,4] gather(operand, indices),
       offset_dims={1},
@@ -330,7 +330,7 @@ TEST_F(IndexedArrayAnalysisTest, ReshapeOfGather1) {
 HloModule ReshapeOfGather
 
 ENTRY main {
-  operand = s32[3,4] constant(s32[3,4]{{1,2,3,4},{1,2,3,4},{1,2,3,4}})
+  operand = s32[3,4] constant({{1,2,3,4},{1,2,3,4},{1,2,3,4}})
   indices = s32[5,7] parameter(0)
   gather = s32[5,4,7] gather(operand, indices),
       offset_dims={1},
@@ -352,7 +352,7 @@ TEST_F(IndexedArrayAnalysisTest, ReshapeOfGather2) {
 HloModule ReshapeOfGather
 
 ENTRY main {
-  operand = s32[3,2,6] constant(s32[3,2,6]{
+  operand = s32[3,2,6] constant({
       {{1,2,3,4,5,6},{1,2,3,4,5,6}},
       {{1,2,3,4,5,6},{1,2,3,4,5,6}},
       {{1,2,3,4,5,6},{1,2,3,4,5,6}}})
@@ -377,7 +377,7 @@ TEST_F(IndexedArrayAnalysisTest, ReshapeOfGather3) {
 HloModule ReshapeOfGather
 
 ENTRY main {
-  operand = s32[2,6] constant(s32[2,6]{
+  operand = s32[2,6] constant({
       {1,2,3,4,5,6},{1,2,3,4,5,6}})
   indices = s32[1] parameter(0)
   gather = s32[1,6] gather(operand, indices),
@@ -405,7 +405,7 @@ TEST_F(IndexedArrayAnalysisTest, ReshapeOfGather4) {
 HloModule ReshapeOfGather
 
 ENTRY main {
-  operand = s32[2,3]{1,0} constant(s32[2,3] { { 1, 2, 3 }, { 1, 2, 3 } })
+  operand = s32[2,3]{1,0} constant({ { 1, 2, 3 }, { 1, 2, 3 } })
 
   i.0 = s64[1,3]{1,0} parameter(0)
   g.0 = s32[1,3,3]{2,1,0} gather(operand, i.0), offset_dims={2},
@@ -438,7 +438,7 @@ TEST_F(IndexedArrayAnalysisTest, ReshapeOfGather5) {
 HloModule ReshapeOfGather
 
 ENTRY main {
-  operand = s32[1,6] constant(s32[1,6]{{1,2,3,4,5,6}})
+  operand = s32[1,6] constant({{1,2,3,4,5,6}})
   indices = s32[1] parameter(0)
   gather = s32[1,6] gather(operand, indices),
       offset_dims={1},
@@ -465,7 +465,7 @@ TEST_F(IndexedArrayAnalysisTest, ReshapeOfGather6) {
 HloModule ReshapeOfGather
 
 ENTRY main {
-  operand = s32[1,2,6] constant(s32[1,2,6]{{
+  operand = s32[1,2,6] constant({{
       {1,2,3,4,5,6},{1,2,3,4,5,6}}})
   indices = s32[1] parameter(0)
   gather = s32[1,1,6] gather(operand, indices),
@@ -496,7 +496,7 @@ TEST_F(IndexedArrayAnalysisTest, ReshapeOfGather7) {
 HloModule ReshapeOfGather
 
 ENTRY main {
-  operand = s32[2,6] constant(s32[2,6]{
+  operand = s32[2,6] constant({
       {1,2,3,4,5,6},{1,2,3,4,5,6}})
   indices = s32[1,5] parameter(0)
   gather = s32[1,5,6] gather(operand, indices),
@@ -527,7 +527,7 @@ TEST_F(IndexedArrayAnalysisTest, ReshapeOfGatherNoFold0) {
 HloModule ReshapeOfGather
 
 ENTRY main {
-  operand = s32[3,4] constant(s32[3,4]{{1,2,3,4},{1,2,3,4},{1,2,3,4}})
+  operand = s32[3,4] constant({{1,2,3,4},{1,2,3,4},{1,2,3,4}})
   indices = s32[5,6] parameter(0)
   gather = s32[5,4,6] gather(operand, indices),
       offset_dims={1},
@@ -556,7 +556,7 @@ TEST_F(IndexedArrayAnalysisTest, ReshapeOfGatherNoFold1) {
 HloModule ReshapeOfGather
 
 ENTRY main {
-  operand = s32[3,5,2] constant(s32[3,5,2]{
+  operand = s32[3,5,2] constant({
       {{1,2},{3,4},{5,6},{7,8},{9,10}},
       {{1,2},{3,4},{5,6},{7,8},{9,10}},
       {{1,2},{3,4},{5,6},{7,8},{9,10}}})
@@ -588,7 +588,7 @@ TEST_F(IndexedArrayAnalysisTest, ReshapeOfGatherNoFold2) {
 HloModule ReshapeOfGather
 
 ENTRY main {
-  operand = s32[3,4,1] constant(s32[3,4,1]{
+  operand = s32[3,4,1] constant({
     {{1},{2},{3},{4}},
     {{1},{2},{3},{4}},
     {{1},{2},{3},{4}}})
@@ -620,7 +620,7 @@ TEST_F(IndexedArrayAnalysisTest, UnaryOpOfGather) {
 HloModule UnaryOpOfGather
 
 ENTRY main {
-  operand = f32[3,4] constant(f32[3,4]{{1,2,3,4},{1,3,2,4},{4,3,2,1}})
+  operand = f32[3,4] constant({{1,2,3,4},{1,3,2,4},{4,3,2,1}})
   indices = s32[5] parameter(0)
   gather = f32[5,4] gather(operand, indices),
       offset_dims={1},
@@ -645,7 +645,7 @@ TEST_F(IndexedArrayAnalysisTest, AddBroadcastedScalarWithGather) {
 HloModule AddBroadcastedScalarWithGather
 
 ENTRY main {
-  gather_operand = s32[3,4] constant(s32[3,4]{{1,2,3,4},{1,3,2,4},{4,3,2,1}})
+  gather_operand = s32[3,4] constant({{1,2,3,4},{1,3,2,4},{4,3,2,1}})
   constant = s32[] constant(5)
   constant_broadcasted = s32[5,4] broadcast(constant), dimensions={}
   indices = s32[5] parameter(0)
@@ -673,7 +673,7 @@ TEST_F(IndexedArrayAnalysisTest,
 HloModule SubtractBroadcastedScalarWithGather
 
 ENTRY main {
-  gather_operand = s32[3,4] constant(s32[3,4]{{1,2,3,4},{1,3,2,4},{4,3,2,1}})
+  gather_operand = s32[3,4] constant({{1,2,3,4},{1,3,2,4},{4,3,2,1}})
   constant = s32[] constant(5)
   constant_broadcasted = s32[5,4] broadcast(constant), dimensions={}
   indices = s32[5] parameter(0)
@@ -701,7 +701,7 @@ TEST_F(IndexedArrayAnalysisTest,
 HloModule SubtractBroadcastedScalarWithGather
 
 ENTRY main {
-  gather_operand = s32[3,4] constant(s32[3,4]{{1,2,3,4},{1,3,2,4},{4,3,2,1}})
+  gather_operand = s32[3,4] constant({{1,2,3,4},{1,3,2,4},{4,3,2,1}})
   constant = s32[] constant(5)
   constant_broadcasted = s32[5,4] broadcast(constant), dimensions={}
   indices = s32[5] parameter(0)
@@ -728,7 +728,7 @@ TEST_F(IndexedArrayAnalysisTest, AddBroadcastedVectorWithGather) {
 HloModule AddBroadcastedVectorWithGather
 
 ENTRY main {
-  gather_operand = s32[3,4] constant(s32[3,4]{{1,2,3,4},{1,3,2,4},{4,3,2,1}})
+  gather_operand = s32[3,4] constant({{1,2,3,4},{1,3,2,4},{4,3,2,1}})
   constant_vect = s32[4] constant({10,11,12,13})
   constant_broadcasted = s32[5,4] broadcast(constant_vect), dimensions={1}
   indices = s32[5] parameter(0)
@@ -755,7 +755,7 @@ TEST_F(IndexedArrayAnalysisTest, AddBroadcastedVectorWithGather_Negative) {
 HloModule AddBroadcastedVectorWithGather
 
 ENTRY main {
-  gather_operand = s32[3,4] constant(s32[3,4]{{1,2,3,4},{1,3,2,4},{4,3,2,1}})
+  gather_operand = s32[3,4] constant({{1,2,3,4},{1,3,2,4},{4,3,2,1}})
   constant_vect = s32[5] constant({10,11,12,13,14})
   constant_broadcasted = s32[5,4] broadcast(constant_vect), dimensions={0}
   indices = s32[5] parameter(0)
@@ -804,8 +804,8 @@ TEST_F(IndexedArrayAnalysisTest, DotOpBasic_0) {
 HloModule DotOp
 
 ENTRY main {
-  gather_operand = s32[3,4] constant(s32[3,4]{{1,2,3,4},{5,6,7,8},{9,10,11,12}})
-  dot_rhs_constant = s32[4,3] constant(s32[4,3]{{1,2,3},{4,5,6},{7,8,9},{10,11,12}})
+  gather_operand = s32[3,4] constant({{1,2,3,4},{5,6,7,8},{9,10,11,12}})
+  dot_rhs_constant = s32[4,3] constant({{1,2,3},{4,5,6},{7,8,9},{10,11,12}})
   indices = s32[5] parameter(0)
   dot_lhs = s32[5,4] gather(gather_operand, indices),
       offset_dims={1},
@@ -831,8 +831,8 @@ TEST_F(IndexedArrayAnalysisTest, DotOpBasic_1) {
 HloModule DotOp
 
 ENTRY main {
-  gather_operand = s32[3,4] constant(s32[3,4]{{1,2,3,4},{5,6,7,8},{9,10,11,12}})
-  dot_rhs_constant = s32[3,3] constant(s32[3,3]{{1,2,3},{4,5,6},{7,8,9}})
+  gather_operand = s32[3,4] constant({{1,2,3,4},{5,6,7,8},{9,10,11,12}})
+  dot_rhs_constant = s32[3,3] constant({{1,2,3},{4,5,6},{7,8,9}})
   indices = s32[5] parameter(0)
   dot_lhs = s32[3,5] gather(gather_operand, indices),
       offset_dims={0},
@@ -859,8 +859,8 @@ TEST_F(IndexedArrayAnalysisTest, DotOpBasic_2) {
 HloModule DotOp
 
 ENTRY main {
-  gather_operand = s32[3,4] constant(s32[3,4]{{1,2,3,4},{5,6,7,8},{9,10,11,12}})
-  dot_lhs_constant = s32[4,3] constant(s32[4,3]{{1,2,3},{4,5,6},{7,8,9},{10,11,12}})
+  gather_operand = s32[3,4] constant({{1,2,3,4},{5,6,7,8},{9,10,11,12}})
+  dot_lhs_constant = s32[4,3] constant({{1,2,3},{4,5,6},{7,8,9},{10,11,12}})
   indices = s32[5] parameter(0)
   dot_rhs = s32[3,5] gather(gather_operand, indices),
       offset_dims={0},
@@ -888,8 +888,8 @@ TEST_F(IndexedArrayAnalysisTest, DotOpBasic_3) {
 HloModule DotOp
 
 ENTRY main {
-  gather_operand = s32[4,3] constant(s32[4,3]{{1,2,3},{4,5,6},{7,8,9},{10,11,12}})
-  dot_lhs_constant = s32[4,3] constant(s32[4,3]{{1,2,3},{4,5,6},{7,8,9},{10,11,12}})
+  gather_operand = s32[4,3] constant({{1,2,3},{4,5,6},{7,8,9},{10,11,12}})
+  dot_lhs_constant = s32[4,3] constant({{1,2,3},{4,5,6},{7,8,9},{10,11,12}})
   indices = s32[5] parameter(0)
   dot_rhs = s32[5,3] gather(gather_operand, indices),
       offset_dims={1},
@@ -917,8 +917,8 @@ TEST_F(IndexedArrayAnalysisTest, DotOpWithBatch) {
 HloModule DotOp
 
 ENTRY main {
-  gather_operand = s32[2,3,2] constant(s32[2,3,2]{{{1,2},{3,4},{5,6}},{{7,8},{9,10},{11,12}}})
-  dot_lhs_constant = s32[2,2,3] constant(s32[2,2,3]{{{1,2,3},{4,5,6}},{{7,8,9},{10,11,12}}})
+  gather_operand = s32[2,3,2] constant({{{1,2},{3,4},{5,6}},{{7,8},{9,10},{11,12}}})
+  dot_lhs_constant = s32[2,2,3] constant({{{1,2,3},{4,5,6}},{{7,8,9},{10,11,12}}})
   indices = s32[4] parameter(0)
   dot_rhs = s32[2,3,4] gather(gather_operand, indices),
       offset_dims={0,1},
@@ -948,8 +948,8 @@ TEST_F(IndexedArrayAnalysisTest, DotOpNegative) {
 HloModule DotOp
 
 ENTRY main {
-  gather_operand = s32[3,4] constant(s32[3,4]{{1,2,3,4},{5,6,7,8},{9,10,11,12}})
-  dot_rhs_constant = s32[2,3] constant(s32[2,3]{{1,2,3},{4,5,6}})
+  gather_operand = s32[3,4] constant({{1,2,3,4},{5,6,7,8},{9,10,11,12}})
+  dot_rhs_constant = s32[2,3] constant({{1,2,3},{4,5,6}})
   indices = s32[2] parameter(0)
   dot_lhs = s32[3,2] gather(gather_operand, indices),
       offset_dims={0},
diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc
index 2297edcbe1d167f0752423f76b795b3592e85c47..3ea0b81d0d0c1e3edaf8fc2221e0c55a8086e110 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/instruction_fusion.cc
@@ -23,6 +23,7 @@ limitations under the License.
 
 #include "absl/algorithm/container.h"
 #include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
 #include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/service/fusion_queue.h"
@@ -457,8 +458,13 @@ StatusOr<bool> InstructionFusion::Run(HloModule* module) {
     computation_ = computation;
     reachability_ = HloReachabilityMap::Build(computation_);
 
-    HloInstructionSet do_not_duplicate =
-        ComputeGloballyUnfusible(computation_->MakeInstructionPostOrder());
+    HloInstructionSet do_not_duplicate;
+    // If we allow duplications, we need to compute which instructions we do not
+    // want to duplicate based on a global analysis of the graph.
+    if (may_duplicate_) {
+      do_not_duplicate =
+          ComputeGloballyUnfusible(computation_->MakeInstructionPostOrder());
+    }
     auto fusion_queue = GetFusionQueue(computation_);
 
     // Instruction fusion effectively fuses edges in the computation graph
@@ -565,19 +571,42 @@ HloInstruction* InstructionFusion::FuseIntoMultiOutput(
 
 bool InstructionFusion::MultiOutputFusionCreatesCycle(
     HloInstruction* producer, HloInstruction* consumer) {
-  auto is_reachable = [&](const HloInstruction* a, const HloInstruction* b) {
-    // A consumer operand may have been multii-output fused into a parallel
-    // consumer and thus be missing  from the oridinal reachability map.
-    if (!reachability_->IsPresent(a) || !reachability_->IsPresent(b)) {
-      reachability_ = HloReachabilityMap::Build(consumer->parent());
+  absl::flat_hash_set<int> operands;
+  for (const HloInstruction* operand : consumer->operands()) {
+    if (operand == producer) {
+      continue;
+    }
+
+    // If the reachability map already contains the producer and the operand of
+    // the consumer, and the producer can reach the operand, then we know for
+    // sure MultiOutputFusion would create a cycle. If not, we need to do a DFS
+    // traversal of the computation to verify that this multioutput fusion would
+    // not create a cycle.
+    if (reachability_->IsPresent(producer) &&
+        reachability_->IsPresent(operand) &&
+        reachability_->IsReachable(producer, operand)) {
+      return true;
+    }
+    operands.insert(operand->unique_id());
+  }
+
+  // Do a DFS on the producer to see if any of the other consumer operands are
+  // reachable in the current state of the graph.
+  std::vector<HloInstruction*> worklist = producer->users();
+  absl::flat_hash_set<int> visits;
+  while (!worklist.empty()) {
+    const HloInstruction* user = worklist.back();
+    worklist.pop_back();
+    if (operands.count(user->unique_id()) != 0) {
+      return true;
     }
-    return reachability_->IsReachable(a, b);
-  };
-  return absl::c_any_of(consumer->operands(),
-                        [&](const HloInstruction* consumer_operand) {
-                          return consumer_operand != producer &&
-                                 is_reachable(producer, consumer_operand);
-                        });
+    if (visits.count(user->unique_id()) == 0) {
+      visits.insert(user->unique_id());
+      worklist.insert(worklist.end(), user->users().begin(),
+                      user->users().end());
+    }
+  }
+  return false;
 }
 
 bool InstructionFusion::ShouldFuse(HloInstruction* consumer,
diff --git a/tensorflow/compiler/xla/service/instruction_fusion_test.cc b/tensorflow/compiler/xla/service/instruction_fusion_test.cc
index 6b483126499fe1e635a7d13cf597ec5d089c5b24..611cfd404d7622f561f0acc86fc9b05e16eea22e 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/instruction_fusion_test.cc
@@ -259,8 +259,8 @@ TEST_F(InstructionFusionTest, AvoidDuplicationIfNotAllFusibleRecursively) {
     add = f32[4,3]{1,0} add(p0, p0)
     abs1 = f32[4,3]{1,0} abs(add)
     log = f32[4,3]{1,0} log(abs1)
-    token = token[] after-all()
-    send = f32[4,3]{1,0} send(log, token), channel_id=0
+    token0 = token[] after-all()
+    send = f32[4,3]{1,0} send(log, token0), channel_id=0
     abs2 = f32[4,3]{1,0} abs(log)
     ROOT root = f32[4,3]{1,0} subtract(abs2, add)
   })")
@@ -290,8 +290,8 @@ TEST_F(InstructionFusionTest, AvoidDuplicationIfNotAllFusibleRecursively) {
     p0 = f32[4,3]{1,0} parameter(0)
     add1 = f32[4,3]{1,0} add(p0, p0)
     log = f32[4,3]{1,0} log(p0)
-    token = token[] after-all()
-    send = f32[4,3]{1,0} send(log, token), channel_id=0
+    token0 = token[] after-all()
+    send = f32[4,3]{1,0} send(log, token0), channel_id=0
     add2 = f32[4,3]{1,0} add(log, add1)
     ROOT root = f32[4,3]{1,0} subtract(add1, add2)
   })")
@@ -324,8 +324,8 @@ TEST_F(InstructionFusionTest, AvoidDuplicationIfNotAllFusibleRecursively) {
     add1 = f32[4,3]{1,0} add(p0, p0)
     add2 = f32[4,3]{1,0} add(add1, add1)
     log = f32[4,3]{1,0} log(add2)
-    token = token[] after-all()
-    send = f32[4,3]{1,0} send(log, token), channel_id=0
+    token0 = token[] after-all()
+    send = f32[4,3]{1,0} send(log, token0), channel_id=0
     sub1 = f32[4,3]{1,0} subtract(log, add2)
     sub2 = f32[4,3]{1,0} subtract(add2, add1)
     ROOT root = (f32[4,3]{1,0}, f32[4,3]{1,0}) tuple(sub1, sub2)
@@ -394,6 +394,56 @@ TEST_F(InstructionFusionTest, AllowEffectiveUnaryDuplication) {
           .ValueOrDie());
 }
 
+TEST_F(InstructionFusionTest, FuseDiamondGraphsNoDuplication) {
+  auto module = ParseHloString(R"(
+  HloModule test_module
+  ENTRY Test {
+    p0 = f32[100] parameter(0)
+    p1 = f32[100] parameter(1)
+    add = f32[100] add(p0, p1)
+    slice1 = f32[99] slice(add), slice={[0:99:1]}
+    slice2 = f32[99] slice(add), slice={[1:100:1]}
+    ROOT add2 = f32[99] add(slice1, slice2)
+  })")
+                    .ValueOrDie();
+  EXPECT_TRUE(
+      InstructionFusion(InstructionFusion::IsExpensive, /*may_duplicate=*/false)
+          .Run(module.get())
+          .ValueOrDie())
+      << module->ToString();
+
+  HloInstruction* root = module->entry_computation()->root_instruction();
+  // 'add' would originally need to be duplicated if fused. However after its
+  // two users 'slice1' and 'slice2' are fused into 'add2', 'add' has only one
+  // user and can now be also fused.
+  EXPECT_THAT(root, op::Fusion(op::Parameter(), op::Parameter()));
+}
+
+TEST_F(InstructionFusionTest, FuseDiamondGraphsAllowDuplication) {
+  auto module = ParseHloString(R"(
+  HloModule test_module
+  ENTRY Test {
+    p0 = f32[100] parameter(0)
+    p1 = f32[100] parameter(1)
+    add = f32[100] add(p0, p1)
+    slice1 = f32[99] slice(add), slice={[0:99:1]}
+    slice2 = f32[99] slice(add), slice={[1:100:1]}
+    ROOT add2 = f32[99] add(slice1, slice2)
+  })")
+                    .ValueOrDie();
+  EXPECT_TRUE(
+      InstructionFusion(InstructionFusion::IsExpensive, /*may_duplicate=*/true)
+          .Run(module.get())
+          .ValueOrDie())
+      << module->ToString();
+
+  HloInstruction* root = module->entry_computation()->root_instruction();
+  // 'add' would originally need to be duplicated if fused. However after its
+  // two users 'slice1' and 'slice2' are fused into 'add2', 'add' has only one
+  // user and can now be also fused.
+  EXPECT_THAT(root, op::Fusion(op::Parameter(), op::Parameter()));
+}
+
 TEST_F(InstructionFusionTest,
        WideningConvertsAreAlwaysDuplicableIntoConsumers) {
   auto module = ParseHloString(R"(
diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.cc b/tensorflow/compiler/xla/service/interpreter/compiler.cc
index 3a5177c418e3af8253df228a51f2fc0901d10041..d37ae94bf6c4c697bbf30390c02a5099271e00a4 100644
--- a/tensorflow/compiler/xla/service/interpreter/compiler.cc
+++ b/tensorflow/compiler/xla/service/interpreter/compiler.cc
@@ -76,9 +76,12 @@ StatusOr<std::unique_ptr<Executable>> InterpreterCompiler::RunBackend(
   // need to compile anything
 
   // Create executable from only the Hlo module.
+  auto evaluator = absl::make_unique<HloEvaluator>();
+  evaluator->set_use_fast_path(
+      hlo_module->config().debug_options().xla_hlo_evaluator_use_fast_path());
   std::unique_ptr<Executable> executable =
-      absl::make_unique<InterpreterExecutable>(
-          std::move(hlo_module), absl::make_unique<HloEvaluator>());
+      absl::make_unique<InterpreterExecutable>(std::move(hlo_module),
+                                               std::move(evaluator));
 
   return std::move(executable);
 }
diff --git a/tensorflow/compiler/xla/service/interpreter/executable.cc b/tensorflow/compiler/xla/service/interpreter/executable.cc
index 7635fbfed6f6a51fc9d203251d9bebf43cc63fd9..de9204011ce5ba8a9fc2871c6bd7120b6ed371b5 100644
--- a/tensorflow/compiler/xla/service/interpreter/executable.cc
+++ b/tensorflow/compiler/xla/service/interpreter/executable.cc
@@ -85,6 +85,7 @@ StatusOr<ScopedShapedBuffer> InterpreterExecutable::ExecuteOnStream(
   Literal result_literal;
   {
     tensorflow::mutex_lock lock(evaluator_lock_);
+    evaluator_->ResetVisitStates();
     TF_ASSIGN_OR_RETURN(result_literal, evaluator_->Evaluate<Literal>(
                                             *computation, arg_literals));
   }
diff --git a/tensorflow/compiler/xla/service/layout_assignment_test.cc b/tensorflow/compiler/xla/service/layout_assignment_test.cc
index 311bd7890545b5b2cbec920d2d12ddd482d0d53c..9fe8c3accbf283f3b3eebbefbac8739c37df16bc 100644
--- a/tensorflow/compiler/xla/service/layout_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment_test.cc
@@ -27,7 +27,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/computation_layout.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
-#include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_parser.h"
@@ -848,12 +847,12 @@ TEST_F(LayoutAssignmentTest, ChannelLayoutMismatch) {
     ENTRY entry_computation {
       param = (f32[2,2]) parameter(0)
       gte = f32[2,2] get-tuple-element(param), index=0
-      token = token[] after-all()
-      recv = (f32[2,2], u32[], token[]) recv(token), channel_id=1, sharding={maximal device=1}
+      token0 = token[] after-all()
+      recv = (f32[2,2], u32[], token[]) recv(token0), channel_id=1, sharding={maximal device=1}
       recv-done = (f32[2,2], token[]) recv-done(recv), channel_id=1,
         sharding={maximal device=1}
       ROOT root = f32[2,2] get-tuple-element(recv-done), index=0
-      send = (f32[2,2], u32[], token[]) send(gte, token), channel_id=1,
+      send = (f32[2,2], u32[], token[]) send(gte, token0), channel_id=1,
         sharding={maximal device=0}
       send-done = token[] send-done(send), channel_id=1, sharding={maximal device=0}
     }
@@ -898,7 +897,7 @@ TEST_F(LayoutAssignmentTest, AllReduceLayoutMissmatch) {
       ar.0 = f32[2,2] cross-replica-sum(gte),
         all_reduce_id=1, replica_groups={{0}}, to_apply=add,
         sharding={maximal device=0}
-      const = f32[2,2] constant(f32[2,2]{{0,1},{2,3}})
+      const = f32[2,2] constant({{0,1},{2,3}})
       ROOT ar.1 = f32[2,2] cross-replica-sum(const),
         all_reduce_id=1, replica_groups={{0}}, to_apply=add,
         sharding={maximal device=1}
diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc
index bd0139f85b6a5c5dc23dad962263038451921e65..5eeb29c478a371dae83251771f2dc4844672d3e9 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc
@@ -18,28 +18,29 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
 
 namespace xla {
-Status KernelSupportLibrary::For(
+Status KernelSupportLibrary::ForWithStatus(
     absl::string_view name, llvm::Value* start, llvm::Value* end,
     llvm::Value* step,
     const std::function<Status(llvm::Value*, bool)>& for_body_generator) {
-  return If(b_->CreateICmpSLT(start, end), [&]() -> Status {
+  return IfWithStatus(b_->CreateICmpSLT(start, end), [&]() -> Status {
     TF_RETURN_IF_ERROR(for_body_generator(start, /*is_first_iteration=*/true));
-    return For(name, b_->CreateAdd(start, step), end, step,
-               [&](llvm::Value* iv) { return for_body_generator(iv, false); });
+    return ForWithStatus(
+        name, b_->CreateAdd(start, step), end, step,
+        [&](llvm::Value* iv) { return for_body_generator(iv, false); });
   });
 }
 
-Status KernelSupportLibrary::For(
+Status KernelSupportLibrary::ForWithStatus(
     absl::string_view name, llvm::Value* start, llvm::Value* end,
     llvm::Value* step, bool peel_first_iteration,
     const std::function<Status(llvm::Value*, llvm::Value*)>&
         for_body_generator) {
   if (peel_first_iteration) {
-    return For(name, start, end, step, true,
-               [&](llvm::Value* indvar, bool is_first_iteration) -> Status {
-                 return for_body_generator(indvar,
-                                           b_->getInt1(is_first_iteration));
-               });
+    return ForWithStatus(
+        name, start, end, step, true,
+        [&](llvm::Value* indvar, bool is_first_iteration) -> Status {
+          return for_body_generator(indvar, b_->getInt1(is_first_iteration));
+        });
   } else {
     std::unique_ptr<llvm_ir::ForLoop> loop = llvm_ir::ForLoop::EmitForLoop(
         name, start, end, step, b_,
@@ -55,7 +56,7 @@ Status KernelSupportLibrary::For(
   }
 }
 
-Status KernelSupportLibrary::If(
+Status KernelSupportLibrary::IfWithStatus(
     absl::string_view name, llvm::Value* condition,
     const std::function<Status()>& true_block_generator,
     const std::function<Status()>& false_block_generator) {
diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h
index 43fec311f150d6054f6ad24f99db332f90ff94a3..612b839cfa15711061e1ae53358a72d5220e1801 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h
@@ -48,41 +48,42 @@ class KernelSupportLibrary {
   //     for (i64 i = `start` + `step`; i s< `end`; i += `step`)
   //       `for_body_generator(/*ind_var=*/,i, /*is_first_iteration=*/false)`;
   //   }
-  Status For(
+  Status ForWithStatus(
       absl::string_view name, llvm::Value* start, llvm::Value* end,
       llvm::Value* step,
       const std::function<Status(llvm::Value* ind_var,
                                  bool is_first_iteration)>& for_body_generator);
 
-  void ForReturnVoid(
+  void For(
       absl::string_view name, llvm::Value* start, llvm::Value* end,
       llvm::Value* step,
       const std::function<void(llvm::Value* ind_var, bool is_first_iteration)>&
           for_body_generator) {
     CHECK_EQ(Status::OK(),
-             For(name, start, end, step,
+             ForWithStatus(
+                 name, start, end, step,
                  [&](llvm::Value* ind_var, bool is_first_iteration) -> Status {
                    for_body_generator(ind_var, is_first_iteration);
                    return Status::OK();
                  }));
   }
 
-  Status For(absl::string_view name, int64 start, int64 end, int64 step,
-             const std::function<Status(llvm::Value* ind_var,
-                                        bool is_first_iteration)>&
-                 for_body_generator) {
-    return For(name, /*start=*/b_->getInt64(start),
-               /*end=*/b_->getInt64(end),
-               /*step=*/b_->getInt64(step), for_body_generator);
+  Status ForWithStatus(
+      absl::string_view name, int64 start, int64 end, int64 step,
+      const std::function<Status(
+          llvm::Value* ind_var, bool is_first_iteration)>& for_body_generator) {
+    return ForWithStatus(name, /*start=*/b_->getInt64(start),
+                         /*end=*/b_->getInt64(end),
+                         /*step=*/b_->getInt64(step), for_body_generator);
   }
 
-  void ForReturnVoid(
+  void For(
       absl::string_view name, int64 start, int64 end, int64 step,
       const std::function<void(llvm::Value* ind_var, bool is_first_iteration)>&
           for_body_generator) {
-    ForReturnVoid(name, /*start=*/b_->getInt64(start),
-                  /*end=*/b_->getInt64(end),
-                  /*step=*/b_->getInt64(step), for_body_generator);
+    For(name, /*start=*/b_->getInt64(start),
+        /*end=*/b_->getInt64(end),
+        /*step=*/b_->getInt64(step), for_body_generator);
   }
 
   // Generates the following control flow structure if `peel_first_iteration` is
@@ -99,19 +100,19 @@ class KernelSupportLibrary {
   //   for (i64 i = `start`; i s< `end`; i += `step`)
   //     `for_body_generator(/*ind_var=*/,i,
   //                         /*is_first_iteration=*/,(i != `start`))`;
-  Status For(absl::string_view name, llvm::Value* start, llvm::Value* end,
-             llvm::Value* step, bool peel_first_iteration,
-             const std::function<Status(llvm::Value* ind_var,
-                                        llvm::Value* is_first_iteration)>&
-                 for_body_generator);
+  Status ForWithStatus(
+      absl::string_view name, llvm::Value* start, llvm::Value* end,
+      llvm::Value* step, bool peel_first_iteration,
+      const std::function<Status(llvm::Value* ind_var,
+                                 llvm::Value* is_first_iteration)>&
+          for_body_generator);
 
-  void ForReturnVoid(absl::string_view name, llvm::Value* start,
-                     llvm::Value* end, llvm::Value* step,
-                     bool peel_first_iteration,
-                     const std::function<void(llvm::Value* ind_var,
-                                              llvm::Value* is_first_iteration)>&
-                         for_body_generator) {
-    TF_CHECK_OK(For(
+  void For(absl::string_view name, llvm::Value* start, llvm::Value* end,
+           llvm::Value* step, bool peel_first_iteration,
+           const std::function<void(llvm::Value* ind_var,
+                                    llvm::Value* is_first_iteration)>&
+               for_body_generator) {
+    TF_CHECK_OK(ForWithStatus(
         name, start, end, step, peel_first_iteration,
         [&](llvm::Value* ind_var, llvm::Value* is_first_iteration) -> Status {
           for_body_generator(ind_var, is_first_iteration);
@@ -119,80 +120,81 @@ class KernelSupportLibrary {
         }));
   }
 
-  Status For(absl::string_view name, llvm::Value* start, llvm::Value* end,
-             int64 step, bool peel_first_iteration,
-             const std::function<Status(llvm::Value* ind_var,
-                                        llvm::Value* is_first_iteration)>&
-                 for_body_generator) {
-    return For(name, /*start=*/start, /*end=*/end,
-               /*step=*/llvm::ConstantInt::get(start->getType(), step),
-               peel_first_iteration, for_body_generator);
+  Status ForWithStatus(
+      absl::string_view name, llvm::Value* start, llvm::Value* end, int64 step,
+      bool peel_first_iteration,
+      const std::function<Status(llvm::Value* ind_var,
+                                 llvm::Value* is_first_iteration)>&
+          for_body_generator) {
+    return ForWithStatus(
+        name, /*start=*/start, /*end=*/end,
+        /*step=*/llvm::ConstantInt::get(start->getType(), step),
+        peel_first_iteration, for_body_generator);
   }
 
-  void ForReturnVoid(absl::string_view name, llvm::Value* start,
-                     llvm::Value* end, int64 step, bool peel_first_iteration,
-                     const std::function<void(llvm::Value* ind_var,
-                                              llvm::Value* is_first_iteration)>&
-                         for_body_generator) {
-    ForReturnVoid(name, /*start=*/start, /*end=*/end,
-                  /*step=*/llvm::ConstantInt::get(start->getType(), step),
-                  peel_first_iteration, for_body_generator);
+  void For(absl::string_view name, llvm::Value* start, llvm::Value* end,
+           int64 step, bool peel_first_iteration,
+           const std::function<void(llvm::Value* ind_var,
+                                    llvm::Value* is_first_iteration)>&
+               for_body_generator) {
+    For(name, /*start=*/start, /*end=*/end,
+        /*step=*/llvm::ConstantInt::get(start->getType(), step),
+        peel_first_iteration, for_body_generator);
   }
 
-  Status For(
+  Status ForWithStatus(
       absl::string_view name, llvm::Value* start, llvm::Value* end,
       llvm::Value* step,
       const std::function<Status(llvm::Value* ind_var)>& for_body_generator) {
-    return For(name, start, end, step,
-               /*peel_first_iteration=*/false,
-               [&](llvm::Value* indvar, llvm::Value*) -> Status {
-                 return for_body_generator(indvar);
-               });
+    return ForWithStatus(name, start, end, step,
+                         /*peel_first_iteration=*/false,
+                         [&](llvm::Value* indvar, llvm::Value*) -> Status {
+                           return for_body_generator(indvar);
+                         });
   }
 
-  void ForReturnVoid(
+  void For(
       absl::string_view name, llvm::Value* start, llvm::Value* end,
       llvm::Value* step,
       const std::function<void(llvm::Value* ind_var)>& for_body_generator) {
-    ForReturnVoid(name, start, end, step,
-                  /*peel_first_iteration=*/false,
-                  [&](llvm::Value* indvar, llvm::Value*) {
-                    return for_body_generator(indvar);
-                  });
+    For(name, start, end, step,
+        /*peel_first_iteration=*/false, [&](llvm::Value* indvar, llvm::Value*) {
+          return for_body_generator(indvar);
+        });
   }
 
-  Status For(
+  Status ForWithStatus(
       absl::string_view name, llvm::Value* start, llvm::Value* end, int64 step,
       const std::function<Status(llvm::Value* ind_var)>& for_body_generator) {
-    return For(name, start, end, llvm::ConstantInt::get(start->getType(), step),
-               /*peel_first_iteration=*/false,
-               [&](llvm::Value* indvar, llvm::Value*) -> Status {
-                 return for_body_generator(indvar);
-               });
+    return ForWithStatus(name, start, end,
+                         llvm::ConstantInt::get(start->getType(), step),
+                         /*peel_first_iteration=*/false,
+                         [&](llvm::Value* indvar, llvm::Value*) -> Status {
+                           return for_body_generator(indvar);
+                         });
   }
 
-  void ForReturnVoid(
+  void For(
       absl::string_view name, llvm::Value* start, llvm::Value* end, int64 step,
       const std::function<void(llvm::Value* ind_var)>& for_body_generator) {
-    ForReturnVoid(name, start, end,
-                  llvm::ConstantInt::get(start->getType(), step),
-                  for_body_generator);
+    For(name, start, end, llvm::ConstantInt::get(start->getType(), step),
+        for_body_generator);
   }
 
-  Status For(
+  Status ForWithStatus(
       absl::string_view name, int64 start, int64 end, int64 step,
       const std::function<Status(llvm::Value* ind_var)>& for_body_generator) {
-    return For(name, /*start=*/b_->getInt64(start),
-               /*end=*/b_->getInt64(end),
-               /*step=*/b_->getInt64(step), for_body_generator);
+    return ForWithStatus(name, /*start=*/b_->getInt64(start),
+                         /*end=*/b_->getInt64(end),
+                         /*step=*/b_->getInt64(step), for_body_generator);
   }
 
-  void ForReturnVoid(
+  void For(
       absl::string_view name, int64 start, int64 end, int64 step,
       const std::function<void(llvm::Value* ind_var)>& for_body_generator) {
-    ForReturnVoid(name, /*start=*/b_->getInt64(start),
-                  /*end=*/b_->getInt64(end),
-                  /*step=*/b_->getInt64(step), for_body_generator);
+    For(name, /*start=*/b_->getInt64(start),
+        /*end=*/b_->getInt64(end),
+        /*step=*/b_->getInt64(step), for_body_generator);
   }
 
   // Generates the following control flow structure:
@@ -201,38 +203,43 @@ class KernelSupportLibrary {
   //     `true_block_generator()`;
   //   else
   //      `false_block_generator()`;
-  Status If(absl::string_view name, llvm::Value* condition,
-            const std::function<Status()>& true_block_generator,
-            const std::function<Status()>& false_block_generator =
-                []() -> Status { return Status::OK(); });
+  Status IfWithStatus(
+      absl::string_view name, llvm::Value* condition,
+      const std::function<Status()>& true_block_generator,
+      const std::function<Status()>& false_block_generator = []() -> Status {
+        return Status::OK();
+      });
 
-  Status If(llvm::Value* condition,
-            const std::function<Status()>& true_block_generator,
-            const std::function<Status()>& false_block_generator =
-                []() -> Status { return Status::OK(); }) {
-    return If("", condition, true_block_generator, false_block_generator);
+  Status IfWithStatus(
+      llvm::Value* condition,
+      const std::function<Status()>& true_block_generator,
+      const std::function<Status()>& false_block_generator = []() -> Status {
+        return Status::OK();
+      }) {
+    return IfWithStatus("", condition, true_block_generator,
+                        false_block_generator);
   }
 
-  void IfReturnVoid(llvm::Value* condition,
-                    const std::function<void()>& true_block_generator,
-                    const std::function<void()>& false_block_generator = []() {
-                    }) {
-    IfReturnVoid("", condition, true_block_generator, false_block_generator);
+  void If(
+      llvm::Value* condition, const std::function<void()>& true_block_generator,
+      const std::function<void()>& false_block_generator = []() {}) {
+    If("", condition, true_block_generator, false_block_generator);
   }
 
-  void IfReturnVoid(absl::string_view name, llvm::Value* condition,
-                    const std::function<void()>& true_block_generator,
-                    const std::function<void()>& false_block_generator = []() {
-                    }) {
-    TF_CHECK_OK(If(name, condition,
-                   [&]() {
-                     true_block_generator();
-                     return Status::OK();
-                   },
-                   [&]() {
-                     false_block_generator();
-                     return Status::OK();
-                   }));
+  void If(
+      absl::string_view name, llvm::Value* condition,
+      const std::function<void()>& true_block_generator,
+      const std::function<void()>& false_block_generator = []() {}) {
+    TF_CHECK_OK(IfWithStatus(
+        name, condition,
+        [&]() {
+          true_block_generator();
+          return Status::OK();
+        },
+        [&]() {
+          false_block_generator();
+          return Status::OK();
+        }));
   }
 
   using ArgumentVector = absl::Span<llvm::Value* const>;
diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc
index 1aa85eb8d2d206bf0537deb659e779b24fffbb0a..cebbc4290163d4e98003cd7b5df6ec906509a446 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc
@@ -170,14 +170,16 @@ IrArray::Index KernelMappingScheme::EmitBlockIndex(llvm::Type* index_ty) {
 
 IrArray::Index KernelMappingScheme::GetTileIndexForBlockOrigin(
     const IrArray::Index& block_index) {
-  IrArray::Index tile_index = block_index;
+  DCHECK_EQ(block_index.size(), block_sizes_.size());
+  std::vector<llvm::Value*> multidim;
+  multidim.reserve(block_sizes_.size());
   for (int i = 0; i < block_sizes_.size(); ++i) {
-    tile_index[i] = b_->CreateMul(
+    multidim.push_back(b_->CreateMul(
         block_index[i],
         llvm::ConstantInt::get(block_index[i]->getType(), block_sizes_[i]),
-        "block_origin." + std::to_string(i));
+        "block_origin." + std::to_string(i)));
   }
-  return tile_index;
+  return IrArray::Index(multidim, block_index[0]->getType());
 }
 
 IrArray::Index KernelMappingScheme::GetElementIndexForTileOrigin(
@@ -217,14 +219,14 @@ KernelMappingScheme::EmitThreadYXCoordinate(llvm::Type* index_ty) {
   // defined by (num_thread_y, num_thread_x) from thread_id.
   llvm::CallInst* thread_id_raw = llvm_ir::EmitCallToIntrinsic(
       llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x, {}, {}, b_);
-  llvm_ir::AddRangeMetadata(0, GetThreadsPerTile(), thread_id_raw);
+  llvm_ir::AddRangeMetadata(0, GetThreadsPerBlock(), thread_id_raw);
   llvm::Value* thread_id_int =
       b_->CreateIntCast(thread_id_raw, index_ty,
                         /*isSigned=*/true, "thread.id.x");
   llvm::Value* num_thread_x =
       llvm::ConstantInt::get(index_ty, GetNumberOfThreadsForDimensionX());
-  llvm::Value* x = b_->CreateURem(thread_id_int, num_thread_x);
-  llvm::Value* y = b_->CreateUDiv(thread_id_int, num_thread_x);
+  llvm::Value* x = b_->CreateURem(thread_id_int, num_thread_x, "thread.x");
+  llvm::Value* y = b_->CreateUDiv(thread_id_int, num_thread_x, "thread.y");
   return std::make_tuple(y, x);
 }
 
diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h
index 7277aeac8ad2086a2f6419b1fdb60c4872841adc..fb633b12e60d1a9f3103fb2919ad2c3f3f14de20 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h
@@ -142,7 +142,7 @@ class KernelMappingScheme {
   int64 GetNumberOfThreadsForDimensionX() const { return num_threads_x_; }
   int64 GetNumberOfThreadsForDimensionY() const { return num_threads_y_; }
 
-  int64 GetThreadsPerTile() const {
+  int64 GetThreadsPerBlock() const {
     return GetNumberOfThreadsForDimensionX() *
            GetNumberOfThreadsForDimensionY();
   }
diff --git a/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc b/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc
index e22c2173c271fc9571be1ddb0759d2b31562dc98..6a9406bfebafcc02dc2e144b62284a9e83c3edeb 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc
@@ -108,7 +108,7 @@ void EmitCompareLoopBody(
 
   // if (is_smaller_index && index_is_inbounds)
   KernelSupportLibrary ksl(b);
-  ksl.IfReturnVoid("smaller_comparison_index", do_comparison, [&]() {
+  ksl.If("smaller_comparison_index", do_comparison, [&]() {
     auto key1 = read_element(0, current_keys_index);
     auto key2 = read_element(0, compare_keys_index);
     auto compare_key1 = key1;
@@ -155,7 +155,7 @@ void EmitCompareLoopBody(
       is_smaller_than = b->CreateOr(
           is_smaller_than, b->CreateAnd(keys_equal, index_is_smaller_than));
     }
-    ksl.IfReturnVoid("is_smaller_than", is_smaller_than, [&]() {
+    ksl.If("is_smaller_than", is_smaller_than, [&]() {
       // Swap key1 with key2.
       write_element(0, current_keys_index, key2);
       write_element(0, compare_keys_index, key1);
@@ -192,7 +192,7 @@ void EmitTiledCompareLoop(
             b->CreateShl(tiled_keys_index[dimension_to_sort], value_one);
         // We want to copy two adjacent elements. We first check whether the
         // first index position is within bounds.
-        ksl.IfReturnVoid(
+        ksl.If(
             "smaller_keys_index",
             b->CreateICmpSLT(current_keys_index,
                              tiled_keys_index.GetConstantWithIndexType(
@@ -203,15 +203,14 @@ void EmitTiledCompareLoop(
               // Increment to go the next index position.
               current_keys_index = b->CreateAdd(current_keys_index, value_one);
               // Here we check whether the next index position is within bounds.
-              ksl.IfReturnVoid(
-                  "inner_smaller_keys_index",
-                  b->CreateICmpSLT(current_keys_index,
-                                   tiled_keys_index.GetConstantWithIndexType(
-                                       dimension_to_sort_bound)),
-                  [&]() {
-                    cache_index = b->CreateAdd(cache_index, value_one);
-                    read_or_write(cache_index, current_keys_index);
-                  });
+              ksl.If("inner_smaller_keys_index",
+                     b->CreateICmpSLT(current_keys_index,
+                                      tiled_keys_index.GetConstantWithIndexType(
+                                          dimension_to_sort_bound)),
+                     [&]() {
+                       cache_index = b->CreateAdd(cache_index, value_one);
+                       read_or_write(cache_index, current_keys_index);
+                     });
             });
       };
 
@@ -253,7 +252,7 @@ void EmitTiledCompareLoop(
     if (dimension_to_sort_bound % tile_size) {
       // Otherwise we need a bounds check for the last tile. The last tile has
       // size 'dimension_to_sort_bound' % 'tile_size'.
-      ksl.IfReturnVoid(
+      ksl.If(
           "is_last_tile",
           b->CreateICmpUGE(
               b->CreateMul(tiled_keys_index[dimension_to_sort],
diff --git a/tensorflow/compiler/xla/service/name_uniquer.cc b/tensorflow/compiler/xla/service/name_uniquer.cc
index ac2f79674feceff436c0e9c65338967f498e4473..daa718879ddd45afb02725b557380b2f49fe833e 100644
--- a/tensorflow/compiler/xla/service/name_uniquer.cc
+++ b/tensorflow/compiler/xla/service/name_uniquer.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include "absl/strings/numbers.h"
 #include "absl/strings/str_cat.h"
+#include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
@@ -42,6 +43,7 @@ NameUniquer::NameUniquer(const string& separator) {
   if (name.empty()) {
     return "";
   }
+
   string result = name;
   char c = static_cast<unsigned char>(result[0]);
   if (!isalpha(c) && c != '_') {
@@ -52,6 +54,13 @@ NameUniquer::NameUniquer(const string& separator) {
       result[i] = '_';
     }
   }
+
+  // HLO primitive type names (with the exception of 'tuple') are keywords in
+  // the HLO text representation and cannot be names, so append an underscore if
+  // the name is a primitive type.
+  if (primitive_util::IsPrimitiveTypeName(result) && result != "tuple") {
+    result += "_";
+  }
   return result;
 }
 
diff --git a/tensorflow/compiler/xla/service/name_uniquer_test.cc b/tensorflow/compiler/xla/service/name_uniquer_test.cc
index 3e2592c6ac626143f1421e545a31d9be91e376bc..d0d04147e0c29c66cba447550c0a9c703f35573a 100644
--- a/tensorflow/compiler/xla/service/name_uniquer_test.cc
+++ b/tensorflow/compiler/xla/service/name_uniquer_test.cc
@@ -104,5 +104,21 @@ TEST_F(NameUniquerTest, KeepNamesInRandomOrder) {
   EXPECT_EQ("foo.3", uniquer.GetUniqueName("foo.3"));
 }
 
+TEST_F(NameUniquerTest, AvoidKeywords) {
+  NameUniquer uniquer(".");
+
+  EXPECT_EQ("f32_", uniquer.GetUniqueName("f32"));
+  EXPECT_EQ("s64_", uniquer.GetUniqueName("s64"));
+  EXPECT_EQ("pred_", uniquer.GetUniqueName("pred"));
+
+  // Though a primitive type, "tuple" is not a keyword.
+  EXPECT_EQ("tuple", uniquer.GetUniqueName("tuple"));
+
+  // Keywords are not capitalized.
+  EXPECT_EQ("F32", uniquer.GetUniqueName("F32"));
+  EXPECT_EQ("S32", uniquer.GetUniqueName("S32"));
+  EXPECT_EQ("Pred", uniquer.GetUniqueName("Pred"));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/pattern_matcher.h b/tensorflow/compiler/xla/service/pattern_matcher.h
index fb1645d9b2ebeae77190a950ebd023979c567016..81db3bb643a989cafb6c6a8bcbd35e218fdcaf44 100644
--- a/tensorflow/compiler/xla/service/pattern_matcher.h
+++ b/tensorflow/compiler/xla/service/pattern_matcher.h
@@ -64,6 +64,9 @@ namespace xla {
 //       e.g. IsConstantScalar() or IsConstantScalar(42).
 //     - WithFusionKind
 //     - WithTupleIndex: get-tuple-element operations with the given tuple index
+//     - WithOneUse: Instruction is used as an operand exactly once.
+//     - WithOneUser: Instruction is used by exactly one other instruction, but
+//       is possibly used more than once as an operand (e.g. multiply(x,x)).
 //
 //   Shape():
 //     - EqualTo
@@ -1133,6 +1136,13 @@ inline const HloInstruction* HloOperand(const HloInstruction* instr,
   return instr->operand(idx);
 }
 
+// Pretty-printer for HloInstruction.  Sort of like ToShortString, but with
+// fewer %s and more shapes.
+inline string InstToString(const HloInstruction* inst) {
+  return inst->ToString(
+      HloPrintOptions().set_print_metadata(false).set_print_percent(false));
+}
+
 template <typename HloInstructionType, typename Impl>
 class HloInstructionPattern;
 
@@ -1187,14 +1197,14 @@ class HloInstructionIsImpl {
   bool Match(const ::xla::HloInstruction* inst, MatchOption option) const {
     if (inst != inst_) {
       EXPLAIN << "HloInstruction " << inst << " is not " << inst_ << " ("
-              << inst_->ToShortString() << ")";
+              << InstToString(inst_) << ")";
       return false;
     }
     return true;
   }
 
   void DescribeTo(std::ostream* os, int64 indent = 0) const {
-    *os << "which is " << inst_ << " (" << inst_->ToShortString() << ")";
+    *os << "which is " << inst_ << " (" << InstToString(inst_) << ")";
   }
 
  private:
@@ -1603,6 +1613,64 @@ class HloInstructionPatternParameterNumImpl {
   int64 parameter_num_;
 };
 
+// Superclass that contains common code used by Op::WithOneUse() and
+// Op::WithOneUser().
+class HloInstructionPatternOneUseOrUserImpl {
+ protected:
+  bool MatchOneUser(const HloInstruction* inst, MatchOption option) const {
+    if (inst->user_count() != 1) {
+      EXPLAIN << "HloInstruction has " << inst->user_count()
+              << " users, but expected exactly one.";
+      if (inst->user_count() > 1) {
+        EXPLAIN << "\nAll users:";
+        for (const HloInstruction* user : inst->users()) {
+          EXPLAIN << "\n - " << InstToString(user);
+        }
+      }
+      return false;
+    }
+    return true;
+  }
+};
+
+class HloInstructionPatternOneUseImpl
+    : public HloInstructionPatternOneUseOrUserImpl {
+ public:
+  bool Match(const HloInstruction* inst, MatchOption option) const {
+    if (!MatchOneUser(inst, option)) {
+      return false;
+    }
+
+    int64 use_count = absl::c_count_if(
+        inst->users()[0]->operands(),
+        [&](const HloInstruction* operand) { return operand == inst; });
+    if (use_count != 1) {
+      EXPLAIN << "HloInstruction is used " << use_count
+              << " times by its user, but is expected to be used just once: "
+              << InstToString(inst->users()[0]);
+      return false;
+    }
+    return true;
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "which has exactly one use";
+  }
+};
+
+class HloInstructionPatternOneUserImpl
+    : public HloInstructionPatternOneUseOrUserImpl {
+ public:
+  bool Match(const HloInstruction* inst, MatchOption option) const {
+    return MatchOneUser(inst, option);
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "which has exactly one user (but possibly is used multiple times by "
+           "that instruction)";
+  }
+};
+
 // Matches a constant scalar or effective scalar, optionally with a given value.
 template <typename ScalarTy>
 class HloConstantScalarImpl {
@@ -1669,7 +1737,8 @@ class HloConstantScalarImpl {
               literal_r0_as_val_ty_or.ValueOrDie() == val_literal &&
               literal_r0 == val_as_literal_ty;
     if (!rv) {
-      EXPLAIN << "HloInstruction's constant value " << literal_r0.ToString()
+      EXPLAIN << "HloInstruction's constant value "
+              << literal_r0.ToStringWithoutShape()
               << " did not match expected value " << *val_;
     }
     return rv;
@@ -1706,10 +1775,7 @@ class HloInstructionPattern {
       return true;
     }
     if (inst != nullptr) {
-      EXPLAIN << "\nin "
-              << inst->ToString(HloPrintOptions()
-                                    .set_print_metadata(false)
-                                    .set_print_percent(false));
+      EXPLAIN << "\nin " << InstToString(inst);
     }
     return false;
   }
@@ -1722,10 +1788,7 @@ class HloInstructionPattern {
       }
       return true;
     }
-    EXPLAIN << "\nin "
-            << inst->ToString(HloPrintOptions()
-                                  .set_print_metadata(false)
-                                  .set_print_percent(false));
+    EXPLAIN << "\nin " << InstToString(inst);
     return false;
   }
 
@@ -1877,6 +1940,22 @@ class HloInstructionPattern {
     return AppendImpl(HloInstructionPatternParameterNumImpl(parameter_num));
   }
 
+  // Modifies the pattern to match if the instruction is used exactly once.
+  // Does not match if the instruction is used twice by the same user (e.g.
+  // multiply(x,x)).
+  constexpr auto WithOneUse() const
+      -> decltype(this->AppendImpl(HloInstructionPatternOneUseImpl())) {
+    return AppendImpl(HloInstructionPatternOneUseImpl());
+  }
+
+  // Modifies the pattern to match if the instruction is used by exactly one
+  // other instruction.  Will match if the instruction is used twice, so long as
+  // it's by the same user (e.g.  multiply(x,x)).
+  constexpr auto WithOneUser() const
+      -> decltype(this->AppendImpl(HloInstructionPatternOneUserImpl())) {
+    return AppendImpl(HloInstructionPatternOneUserImpl());
+  }
+
   void DescribeTo(std::ostream* os, int64 indent = 0) const {
     impl_.DescribeTo(os, indent);
   }
@@ -1922,6 +2001,7 @@ Op(::xla::HloInstruction** matched_inst) {
 XLA_NULLOP_PATTERN(Constant)
 XLA_NULLOP_PATTERN(Parameter)
 XLA_NULLOP_PATTERN(Iota)
+XLA_NULLOP_PATTERN(Rng)
 #undef XLA_NULLOP_PATTERN
 
 // Helpers for unary instructions.
@@ -2028,10 +2108,10 @@ XLA_UNOP_PATTERN(Transpose)
   }                                                                         \
   template <typename Lhs, typename Rhs>                                     \
   inline auto NAME##AnyOrder(Lhs&& lhs, Rhs&& rhs)                          \
-      ->decltype(NAME##AnyOrder<HloInstruction>(                            \
+      ->decltype(NAME##AnyOrder<const HloInstruction>(                      \
           nullptr, std::forward<Lhs>(lhs), std::forward<Rhs>(rhs))) {       \
-    return NAME##AnyOrder<HloInstruction>(nullptr, std::forward<Lhs>(lhs),  \
-                                          std::forward<Rhs>(rhs));          \
+    return NAME##AnyOrder<const HloInstruction>(                            \
+        nullptr, std::forward<Lhs>(lhs), std::forward<Rhs>(rhs));           \
   }
 XLA_COMMUTATIVE_BINOP_PATTERN(Add)
 XLA_BINOP_PATTERN(Atan2)
@@ -2053,6 +2133,7 @@ XLA_COMMUTATIVE_BINOP_PATTERN(Ne)
 XLA_BINOP_PATTERN(Outfeed)
 XLA_BINOP_PATTERN(Pad)
 XLA_BINOP_PATTERN(Power)
+XLA_BINOP_PATTERN(ReduceWindow)
 XLA_BINOP_PATTERN(Remainder)
 XLA_BINOP_PATTERN(Send)
 XLA_BINOP_PATTERN(Subtract)
@@ -2099,6 +2180,7 @@ XLA_BINOP_PATTERN(ShiftRightLogical)
         .WithOperand(2, std::forward<Arg2>(arg2));                     \
   }
 XLA_TERNOP_PATTERN(Clamp);
+XLA_TERNOP_PATTERN(Scatter);
 XLA_TERNOP_PATTERN(Select);
 #undef XLA_TERNOP_PATTERN
 
@@ -2151,8 +2233,10 @@ inline auto WithOperands(Matcher&& m, int64 operand_num, FirstArg&& first_arg,
 
 // We could implement all ops as "variadic" ops, but it would make the
 // already-bad compile errors even worse.
+XLA_VARIADIC_OP_PATTERN(AfterAll);
 XLA_VARIADIC_OP_PATTERN(Concatenate);
 XLA_VARIADIC_OP_PATTERN(CustomCall);
+XLA_VARIADIC_OP_PATTERN(Map)
 XLA_VARIADIC_OP_PATTERN(Reduce);
 XLA_VARIADIC_OP_PATTERN(Tuple);
 
diff --git a/tensorflow/compiler/xla/service/pattern_matcher_test.cc b/tensorflow/compiler/xla/service/pattern_matcher_test.cc
index 13886fa6f5b7b55283e6e420734a22312987d8a6..5c3c009a68bffbda8642fceedfb724879fbf1530 100644
--- a/tensorflow/compiler/xla/service/pattern_matcher_test.cc
+++ b/tensorflow/compiler/xla/service/pattern_matcher_test.cc
@@ -242,8 +242,8 @@ TEST(PatternMatcherTest, ConstantScalar) {
     HloModule test_module
     ENTRY test {
       a = s32[] constant(1)
-      b = s32[1,1] constant(s32[1,1]{{2}})
-      c = s32[1,2] constant(s32[1,2]{{2,2}})
+      b = s32[1,1] constant({{2}})
+      c = s32[1,2] constant({{2,2}})
       d = f32[] constant(1)
       e = f32[] constant(1.25)
       ROOT tuple = (s32[], s32[1,1], s32[1,2], f32[], f32[]) tuple(a,b,c,d,e)
@@ -767,10 +767,11 @@ TEST(PatternMatcherTest, HloInstructionDescribeToAndExplain) {
       "in c = f64[] constant(2.25)");
   EXPECT_DESC_AND_EXPLANATION(
       constant, m::Op().Is(iota.get()),
-      absl::StrCat("an HloInstruction which is 0x", absl::Hex(iota.get()), " (",
-                   iota->ToShortString(), ")"),
+      absl::StrCat("an HloInstruction which is 0x", absl::Hex(iota.get()),
+                   " (i = s32[42]{0} iota(), iota_dimension=0)"),
       absl::StrCat("HloInstruction 0x", absl::Hex(constant.get()), " is not 0x",
-                   absl::Hex(iota.get()), " (", iota->ToShortString(), ")\n",
+                   absl::Hex(iota.get()),
+                   " (i = s32[42]{0} iota(), iota_dimension=0)\n"
                    "in c = s32[] constant(0)"));
 }
 
@@ -875,5 +876,60 @@ TEST(PatternMatcherTest, Parameter) {
             "in p0 = f32[] parameter(0)");
 }
 
+TEST(PatternMatcherTest, OneUseAndOneUser) {
+  auto param =
+      HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {}), "p0");
+
+  EXPECT_FALSE(Match(param.get(), m::Op().WithOneUse()));
+  EXPECT_DESC_AND_EXPLANATION(
+      param, m::Op().WithOneUse(),
+      "an HloInstruction which has exactly one use",
+      "HloInstruction has 0 users, but expected exactly one.\n"
+      "in p0 = f32[] parameter(0)");
+
+  EXPECT_FALSE(Match(param.get(), m::Op().WithOneUser()));
+  EXPECT_DESC_AND_EXPLANATION(
+      param, m::Op().WithOneUser(),
+      "an HloInstruction which has exactly one user (but possibly is used "
+      "multiple times by that instruction)",
+      "HloInstruction has 0 users, but expected exactly one.\n"
+      "in p0 = f32[] parameter(0)");
+
+  {
+    auto reshape =
+        SetName("r", HloInstruction::CreateReshape(
+                         ShapeUtil::MakeShape(F32, {1}), param.get()));
+    EXPECT_TRUE(Match(param.get(), m::Op().WithOneUse()));
+    EXPECT_TRUE(Match(param.get(), m::Op().WithOneUser()));
+
+    auto reshape1 =
+        SetName("r1", HloInstruction::CreateReshape(
+                          ShapeUtil::MakeShape(F32, {1}), param.get()));
+    EXPECT_FALSE(Match(param.get(), m::Op().WithOneUse()));
+    EXPECT_FALSE(Match(param.get(), m::Op().WithOneUser()));
+
+    const char* kMultipleUserExplanation =
+        "HloInstruction has 2 users, but expected exactly one.\n"
+        "All users:\n"
+        " - r = f32[1]{0} reshape(f32[] p0)\n"
+        " - r1 = f32[1]{0} reshape(f32[] p0)\n"
+        "in p0 = f32[] parameter(0)";
+    EXPECT_EQ(Explanation(param.get(), m::Op().WithOneUse()),
+              kMultipleUserExplanation);
+    EXPECT_EQ(Explanation(param.get(), m::Op().WithOneUser()),
+              kMultipleUserExplanation);
+  }
+
+  auto add = SetName("add", HloInstruction::CreateBinary(
+                                ShapeUtil::MakeShape(F32, {}), HloOpcode::kAdd,
+                                param.get(), param.get()));
+  EXPECT_TRUE(Match(param.get(), m::Op().WithOneUser()));
+  EXPECT_FALSE(Match(param.get(), m::Op().WithOneUse()));
+  EXPECT_EQ(Explanation(param.get(), m::Op().WithOneUse()),
+            "HloInstruction is used 2 times by its user, but is expected to be "
+            "used just once: add = f32[] add(f32[] p0, f32[] p0)\n"
+            "in p0 = f32[] parameter(0)");
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index 3b336d5c9db80ff2ca8d0e45396dca66a29a0494..ae5bd93e7c56117cc78ecc729d370250787efac6 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -746,9 +746,9 @@ Status Service::GetDeviceHandles(const GetDeviceHandlesRequest* arg,
   }
   if (available_device_count < arg->device_count() * replica_count) {
     return ResourceExhausted(
-        "Requested device count (%d) exceeds the number of available devices "
-        "on the target (%d)",
-        arg->device_count(), available_device_count);
+        "Requested logical device count (%d) with replica count (%d) exceeds "
+        "the number of available physical devices on the target (%d)",
+        arg->device_count(), replica_count, available_device_count);
   }
 
   for (int64 i = 0; i < arg->device_count(); ++i) {
@@ -1078,9 +1078,11 @@ Status Service::ComputeConstantGraph(const ComputeConstantGraphRequest* arg,
 
   ProgramShape program_shape(arg->computation().host_program_shape());
   TF_DCHECK_OK(ShapeUtil::ValidateShape(program_shape.result()));
+  absl::optional<Layout> output_layout;
   if (arg->has_output_layout()) {
+    output_layout = Layout::CreateFromProto(arg->output_layout());
     TF_RETURN_IF_ERROR(LayoutUtil::ValidateLayoutForShape(
-        arg->output_layout(), program_shape.result()));
+        *output_layout, program_shape.result()));
   }
 
   HloModuleConfig config(program_shape);
@@ -1096,8 +1098,8 @@ Status Service::ComputeConstantGraph(const ComputeConstantGraphRequest* arg,
   // relayout here.
   //
   // TODO(b/77824332): Make HloEvaluator take care of the re-layout.
-  if (arg->has_output_layout()) {
-    result_literal = result_literal.Relayout(arg->output_layout());
+  if (output_layout.has_value()) {
+    result_literal = result_literal.Relayout(*output_layout);
   }
   *result->mutable_literal() = result_literal.ToProto();
 
diff --git a/tensorflow/compiler/xla/service/transpose_folding_test.cc b/tensorflow/compiler/xla/service/transpose_folding_test.cc
index 17cdaa74fc328d156292f5af828d4222a9a01f1f..3ca53edc8171a134f2bfb9a36beacfd2d2e0d425 100644
--- a/tensorflow/compiler/xla/service/transpose_folding_test.cc
+++ b/tensorflow/compiler/xla/service/transpose_folding_test.cc
@@ -139,9 +139,9 @@ TEST_F(TransposeFoldingTest, FoldDotTransposeConstant) {
 HloModule FoldDotTransposeConstant
 
 ENTRY entry_computation {
-  constant = f32[2,1]{1,0} constant(f32[2,1] { { 1 }, { 2 } })
+  constant = f32[2,1]{1,0} constant({ { 1 }, { 2 } })
   transpose = f32[1,2]{1,0} transpose(constant), dimensions={1,0}
-  constant.1 = f32[3,2]{1,0} constant(f32[3,2] { { 1, 2 }, { 3, 4 }, { 5, 6 } })
+  constant.1 = f32[3,2]{1,0} constant({ { 1, 2 }, { 3, 4 }, { 5, 6 } })
   transpose.1 = f32[2,3]{1,0} transpose(constant.1), dimensions={1,0}
   ROOT dot = f32[1,3]{1,0} dot(transpose, transpose.1), lhs_contracting_dims={1}, rhs_contracting_dims={0}
 }
diff --git a/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc b/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc
index 75d406435b6f58faecc86b82c33e9e2dd6bccbea..3bcf5c38309a86e9e3cab3268f3f065005f7a923 100644
--- a/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc
+++ b/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc
@@ -129,7 +129,7 @@ condition {
 
 ENTRY entry {
   const_0 = f32[2] constant({1, 2})
-  const_1 = (f32[2], f32[2]) constant((f32[2], f32[2]) ({2, 1},{3,1}))
+  const_1 = (f32[2], f32[2]) constant(({2, 1},{3,1}))
   while_init = (f32[2],(f32[2],f32[2])) tuple(const_0, const_1)
   ROOT while = (f32[2],(f32[2],f32[2])) while(while_init), condition=condition, body=body
 }
@@ -206,8 +206,8 @@ body {
   p_body.0 = f32[2] get-tuple-element((f32[2],f32[2]) p_body), index=0
   p_body.1 = f32[2] get-tuple-element((f32[2],f32[2]) p_body), index=1
 
-  token = token[] after-all()
-  outfeed = token[] outfeed(p_body.0, token)
+  token0 = token[] after-all()
+  outfeed = token[] outfeed(p_body.0, token0)
   ROOT root = (f32[2],f32[2],f32[2]) tuple(p_body.0, p_body.1, p_body.1)
 }
 
@@ -305,7 +305,7 @@ condition {
 
 ENTRY entry {
   const_0 = f32[] constant(0)
-  const_1 = (f32[], f32[]) constant((f32[], f32[]) (1, 10))
+  const_1 = (f32[], f32[]) constant((1, 10))
   while_init = (f32[],(f32[],f32[])) tuple(const_0, const_1)
   ROOT while = (f32[],(f32[],f32[])) while(while_init), condition=condition, body=body
 }
diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc b/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc
index 4950e8269e9cf0723d717bd1734518d104c0c9f2..3713989ca2f64ee1d94c9f77255017909d957da2 100644
--- a/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_dce.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/service/tuple_simplifier.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
@@ -554,8 +555,7 @@ TEST_F(WhileLoopSimplifierTest, FlattenNestedTuple) {
 
   HloInstruction* new_while = FindFirstWhile(m.get());
   Shape flat_tuple =
-      ShapeUtil::ParseShapeString("(s32[1], s32[2], s32[3], s32[4])")
-          .ValueOrDie();
+      ParseShape("(s32[1], s32[2], s32[3], s32[4])").ValueOrDie();
   SCOPED_TRACE(m->ToString());
   EXPECT_TRUE(ShapeUtil::Equal(new_while->shape(), flat_tuple));
   EXPECT_TRUE(ShapeUtil::Equal(
@@ -567,8 +567,7 @@ TEST_F(WhileLoopSimplifierTest, FlattenNestedTuple) {
       flat_tuple));
   EXPECT_TRUE(ShapeUtil::Equal(
       m->entry_computation()->root_instruction()->shape(),
-      ShapeUtil::ParseShapeString("((s32[1]), (s32[2], s32[3], (s32[4])))")
-          .ValueOrDie()));
+      ParseShape("((s32[1]), (s32[2], s32[3], (s32[4])))").ValueOrDie()));
 }
 
 // Edge-case: All elements of the loop carry are constants which can be removed,
@@ -641,8 +640,7 @@ TEST_F(WhileLoopSimplifierTest, RemoveConstantFromLoopCarry) {
   EXPECT_TRUE(TupleSimplifier().Run(m.get()).ok());
 
   HloInstruction* new_while = FindFirstWhile(m.get());
-  Shape new_while_shape =
-      ShapeUtil::ParseShapeString("(s32[1], s32[3])").ValueOrDie();
+  Shape new_while_shape = ParseShape("(s32[1], s32[3])").ValueOrDie();
   EXPECT_TRUE(ShapeUtil::Equal(new_while->shape(), new_while_shape));
   EXPECT_TRUE(ShapeUtil::Equal(
       new_while->while_body()->root_instruction()->shape(), new_while_shape));
@@ -652,9 +650,9 @@ TEST_F(WhileLoopSimplifierTest, RemoveConstantFromLoopCarry) {
   EXPECT_TRUE(ShapeUtil::Equal(
       new_while->while_condition()->parameter_instruction(0)->shape(),
       new_while_shape));
-  EXPECT_TRUE(ShapeUtil::Equal(
-      m->entry_computation()->root_instruction()->shape(),
-      ShapeUtil::ParseShapeString("(s32[1], s32[2], s32[3])").ValueOrDie()));
+  EXPECT_TRUE(
+      ShapeUtil::Equal(m->entry_computation()->root_instruction()->shape(),
+                       ParseShape("(s32[1], s32[2], s32[3])").ValueOrDie()));
   EXPECT_THAT(m->entry_computation()->root_instruction(),
               op::Tuple(_, op::Constant(), _));
 }
@@ -712,7 +710,7 @@ TEST_F(WhileLoopSimplifierTest, MergeInductionVariables_Simple) {
   // We should have added a new loop counter for s32[] to the end of the tuple.
   SCOPED_TRACE(m->ToString());
   Shape new_while_shape =
-      ShapeUtil::ParseShapeString("(s32[], s32[], s32[], s32[])").ValueOrDie();
+      ParseShape("(s32[], s32[], s32[], s32[])").ValueOrDie();
   EXPECT_TRUE(ShapeUtil::Equal(new_while->shape(), new_while_shape));
   EXPECT_TRUE(ShapeUtil::Equal(
       new_while->while_body()->root_instruction()->shape(), new_while_shape));
diff --git a/tensorflow/compiler/xla/service/while_util_test.cc b/tensorflow/compiler/xla/service/while_util_test.cc
index 5e6941933330fde29bc9c779aae4bb3c36914660..d92b9870f373564ae8fd904c8bf9f0d1afbff9c4 100644
--- a/tensorflow/compiler/xla/service/while_util_test.cc
+++ b/tensorflow/compiler/xla/service/while_util_test.cc
@@ -180,8 +180,8 @@ body {
 
 cond {
   param.c = (s32[], s32[]) parameter(0)
-  token = token[] after-all()
-  infeed = (pred[], token[]) infeed(token)
+  token0 = token[] after-all()
+  infeed = (pred[], token[]) infeed(token0)
   ROOT condition = pred[] get-tuple-element(infeed), index=0
 }
 
diff --git a/tensorflow/compiler/xla/shape.cc b/tensorflow/compiler/xla/shape.cc
index 746ab9e9977b1b10cdb0cb57197027d65bd50f55..b206345db2ac2940b1f139c82fa03a93538b5ccd 100644
--- a/tensorflow/compiler/xla/shape.cc
+++ b/tensorflow/compiler/xla/shape.cc
@@ -32,7 +32,7 @@ Shape::Shape(const ShapeProto& shape_proto) {
     *add_tuple_shapes() = Shape(element_shape);
   }
   if (shape_proto.has_layout()) {
-    *mutable_layout() = shape_proto.layout();
+    *mutable_layout() = Layout::CreateFromProto(shape_proto.layout());
   }
 }
 
@@ -48,7 +48,7 @@ ShapeProto Shape::ToProto() const {
     *proto.add_tuple_shapes() = shape.ToProto();
   }
   if (has_layout()) {
-    *proto.mutable_layout() = layout();
+    *proto.mutable_layout() = layout().ToProto();
   }
   return proto;
 }
diff --git a/tensorflow/compiler/xla/shape.h b/tensorflow/compiler/xla/shape.h
index 7f6b14ab4286c696dce64d2250a3fe8a57e4865b..7643f64d8a5f0450be1cddad35cf7422afb89048 100644
--- a/tensorflow/compiler/xla/shape.h
+++ b/tensorflow/compiler/xla/shape.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/types/optional.h"
+#include "tensorflow/compiler/xla/layout.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/types.h"
@@ -76,21 +77,10 @@ class Shape {
   std::vector<Shape>* mutable_tuple_shapes() { return &tuple_shapes_; }
 
   // Methods for accessing the layout field.
-  bool has_layout() const { return layout_.has_value(); }
-  const Layout& layout() const {
-    if (layout_.has_value()) {
-      return *layout_;
-    } else {
-      return Layout::default_instance();
-    }
-  }
-  Layout* mutable_layout() {
-    if (!layout_.has_value()) {
-      layout_ = Layout();
-    }
-    return &layout_.value();
-  }
-  void clear_layout() { layout_.reset(); }
+  bool has_layout() const { return layout_.format() != INVALID_FORMAT; }
+  const Layout& layout() const { return layout_; }
+  Layout* mutable_layout() { return &layout_; }
+  void clear_layout() { layout_.Clear(); }
 
   void Swap(Shape* other) {
     using std::swap;
@@ -101,7 +91,7 @@ class Shape {
     element_type_ = PRIMITIVE_TYPE_INVALID;
     dimensions_.clear();
     tuple_shapes_.clear();
-    layout_.reset();
+    clear_layout();
   }
 
   string SerializeAsString() const { return ToProto().SerializeAsString(); }
@@ -118,8 +108,8 @@ class Shape {
   // The tuple element subshapes. This is nonempty only for tuple shapes.
   std::vector<Shape> tuple_shapes_;
 
-  // The array layout of the shape. This is present only for array shapes.
-  absl::optional<Layout> layout_;
+  // The layout of the shape. Only relevant for arrays.
+  Layout layout_;
 };
 
 // Shape of the parameters and output of an XLA computation. This is analogous
diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc
index b95fabf488291b0a7f393cb9f7f4a5dc9eb7c7eb..be7d71ada009535a5c08aa3d3d062fa451cfeef3 100644
--- a/tensorflow/compiler/xla/shape_util.cc
+++ b/tensorflow/compiler/xla/shape_util.cc
@@ -164,9 +164,9 @@ StatusOr<Shape> MakeShapeWithLayoutInternal(
   TF_ASSIGN_OR_RETURN(Shape shape,
                       ShapeUtil::MakeValidatedShape(element_type, dimensions));
   auto min2maj = shape.mutable_layout()->mutable_minor_to_major();
-  min2maj->Clear();
+  min2maj->clear();
   for (int64 value : minor_to_major) {
-    min2maj->Add(value);
+    min2maj->push_back(value);
   }
   if (!shape.has_layout()) {
     return InvalidArgument("Shape has no layout.");
@@ -234,7 +234,7 @@ StatusOr<Shape> MakeShapeWithLayoutInternal(
 
 /* static */ StatusOr<Shape> ShapeUtil::MakeValidatedShape(
     PrimitiveType element_type, absl::Span<const int64> dimensions) {
-  CHECK(IsArrayPrimitiveType(element_type));
+  CHECK(IsArrayPrimitiveType(element_type)) << element_type;
   Shape result;
   TF_RETURN_IF_ERROR(PopulateShape(element_type, dimensions, &result));
   return result;
@@ -480,54 +480,6 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
   return IsScalar(shape) && shape.element_type() == element_type;
 }
 
-namespace {
-
-// Class to memoize the computation of
-//   absl::AsciiStrToLower(PrimitiveType_Name(p))
-// for all PrimitiveType values "p"
-class PrimitiveTypeNameGenerator {
- public:
-  PrimitiveTypeNameGenerator() {
-    for (int i = 0; i < PrimitiveType_ARRAYSIZE; i++) {
-      if (PrimitiveType_IsValid(i)) {
-        lowercase_name_[i] = absl::AsciiStrToLower(
-            PrimitiveType_Name(static_cast<PrimitiveType>(i)));
-      }
-    }
-  }
-  const string& LowercaseName(PrimitiveType t) {
-    return lowercase_name_[static_cast<int>(t)];
-  }
-
- private:
-  string lowercase_name_[PrimitiveType_ARRAYSIZE];
-};
-
-const string& LowercasePrimitiveTypeName(PrimitiveType s) {
-  static PrimitiveTypeNameGenerator* gen = new PrimitiveTypeNameGenerator();
-  return gen->LowercaseName(s);
-}
-
-StatusOr<PrimitiveType> StringToPrimitiveType(const string& name) {
-  static std::unordered_map<string, PrimitiveType>* name_to_type = [] {
-    static auto* map = new std::unordered_map<string, PrimitiveType>;
-    for (int i = 0; i < PrimitiveType_ARRAYSIZE; i++) {
-      if (PrimitiveType_IsValid(i)) {
-        auto value = static_cast<PrimitiveType>(i);
-        (*map)[LowercasePrimitiveTypeName(value)] = value;
-      }
-    }
-    return map;
-  }();
-  auto found = name_to_type->find(name);
-  if (found == name_to_type->end()) {
-    return InvalidArgument("Invalid element type string: \"%s\".", name);
-  }
-  return found->second;
-}
-
-}  // namespace
-
 /* static */ string ShapeUtil::HumanString(const Shape& shape) {
   if (IsTuple(shape)) {
     string text = "(";
@@ -539,8 +491,9 @@ StatusOr<PrimitiveType> StringToPrimitiveType(const string& name) {
     text += ")";
     return text;
   }
-  return StrCat(LowercasePrimitiveTypeName(shape.element_type()), "[",
-                absl::StrJoin(shape.dimensions(), ","), "]");
+  return StrCat(
+      primitive_util::LowercasePrimitiveTypeName(shape.element_type()), "[",
+      absl::StrJoin(shape.dimensions(), ","), "]");
 }
 
 /* static */ string ShapeUtil::HumanStringWithLayout(const Shape& shape) {
@@ -554,7 +507,8 @@ StatusOr<PrimitiveType> StringToPrimitiveType(const string& name) {
     text += ")";
     return text;
   }
-  string result = StrCat(LowercasePrimitiveTypeName(shape.element_type()), "[");
+  string result = StrCat(
+      primitive_util::LowercasePrimitiveTypeName(shape.element_type()), "[");
   for (int i = 0; i < shape.dimensions().size(); i++) {
     StrAppend(&result, (i > 0) ? "," : "", shape.dimensions(i));
   }
@@ -580,116 +534,6 @@ StatusOr<PrimitiveType> StringToPrimitiveType(const string& name) {
                 HumanString(program_shape.result()));
 }
 
-namespace {
-// Parses shapes with simple recursive descent structure -- consumes from the
-// front of s and passes that view recursively as required.
-StatusOr<Shape> ParseShapeStringInternal(absl::string_view* s) {
-  *s = StripLeadingAsciiWhitespace(*s);
-
-  if (absl::ConsumePrefix(s, "(")) {  // Tuple.
-    std::vector<Shape> shapes;
-    bool must_end = false;
-    while (true) {
-      if (absl::ConsumePrefix(s, ")")) {
-        break;
-      } else if (must_end) {
-        return InvalidArgument("Expected end of tuple; got: \"%s\"", *s);
-      }
-      shapes.emplace_back();
-      TF_ASSIGN_OR_RETURN(shapes.back(), ParseShapeStringInternal(s));
-      *s = StripLeadingAsciiWhitespace(*s);
-      must_end = !absl::ConsumePrefix(s, ",");
-    }
-    return ShapeUtil::MakeTupleShape(shapes);
-  }
-
-  string element_type_string;
-  string dimensions_string;
-  string format_string;
-  string layout_string;
-  // absl::string_view is not compatible with internal RE2 StringPiece, so
-  // we convert in to the RE2-consumable type and then consume the corresponding
-  // amount from our string_view type.
-  static LazyRE2 shape_pattern = {
-      "^(\\w*\\d*)\\[([\\d,\\s]*)\\](?:\\s*(dense|sparse)?\\s*{([\\d,\\s]+)})"
-      "?"};
-  tensorflow::RegexpStringPiece s_consumable(s->data(), s->size());
-  if (RE2::Consume(&s_consumable, *shape_pattern, &element_type_string,
-                   &dimensions_string, &format_string, &layout_string)) {
-    size_t consumed = s->size() - s_consumable.size();
-    s->remove_prefix(consumed);
-    auto string_to_int64 = [&s](absl::string_view input) -> StatusOr<int64> {
-      int64 element;
-      if (!absl::SimpleAtoi(input, &element)) {
-        return InvalidArgument(
-            "Invalid s64 value in parsed shape string: \"%s\" in \"%s\"", input,
-            *s);
-      }
-      return element;
-    };
-
-    auto comma_list_to_int64s =
-        [string_to_int64](const string& input) -> StatusOr<std::vector<int64>> {
-      std::vector<int64> results;
-      for (const auto& piece : absl::StrSplit(input, ',', absl::SkipEmpty())) {
-        TF_ASSIGN_OR_RETURN(int64 element, string_to_int64(piece));
-        results.push_back(element);
-      }
-      return results;
-    };
-
-    // Extract the dimensions.
-    TF_ASSIGN_OR_RETURN(std::vector<int64> dimensions,
-                        comma_list_to_int64s(dimensions_string));
-
-    // Extract the primitive element type.
-    TF_ASSIGN_OR_RETURN(const PrimitiveType primitive_type,
-                        StringToPrimitiveType(element_type_string));
-    if (primitive_type == PRIMITIVE_TYPE_INVALID || primitive_type == TUPLE) {
-      return InvalidArgument("Invalid element type string: \"%s\".",
-                             element_type_string);
-    }
-
-    Shape result;
-    if (primitive_type == OPAQUE) {
-      result = ShapeUtil::MakeOpaqueShape();
-    } else if (primitive_type == TOKEN) {
-      result = ShapeUtil::MakeTokenShape();
-    } else if (format_string.empty() && layout_string.empty()) {
-      // Create a shape without a layout set.
-      TF_ASSIGN_OR_RETURN(
-          result, ShapeUtil::MakeValidatedShape(primitive_type, dimensions));
-    } else if (format_string == "sparse") {
-      TF_ASSIGN_OR_RETURN(int64 max_elements, string_to_int64(layout_string));
-      result = ShapeUtil::MakeShapeWithSparseLayout(primitive_type, dimensions,
-                                                    max_elements);
-    } else if (format_string.empty() || format_string == "dense") {
-      // Extract the layout minor-to-major and set it.
-      TF_ASSIGN_OR_RETURN(std::vector<int64> min2maj,
-                          comma_list_to_int64s(layout_string));
-      TF_ASSIGN_OR_RETURN(result, MakeShapeWithLayoutInternal(
-                                      primitive_type, dimensions, min2maj));
-    } else {
-      // This should not be reached.
-      LOG(FATAL) << "Unhandled condition when parsing shape; format: \""
-                 << format_string << "\", layout: \"" << layout_string << "\"";
-    }
-    TF_RETURN_IF_ERROR(ShapeUtil::ValidateShape(result));
-    return std::move(result);
-  }
-
-  return InvalidArgument("Invalid shape string to parse: \"%s\"", *s);
-}
-}  // namespace
-
-/* static */ StatusOr<Shape> ShapeUtil::ParseShapeString(absl::string_view s) {
-  TF_ASSIGN_OR_RETURN(Shape shape, ParseShapeStringInternal(&s));
-  if (!s.empty()) {
-    return InvalidArgument("Invalid shape string to parse: \"%s\"", s);
-  }
-  return shape;
-}
-
 /* static */ bool ShapeUtil::SameDimensions(const Shape& lhs,
                                             const Shape& rhs) {
   CHECK(ShapeUtil::IsArray(lhs));
@@ -867,13 +711,13 @@ StatusOr<Shape> ParseShapeStringInternal(absl::string_view* s) {
     if (shape.dimensions_size() != 0) {
       return InvalidArgument(
           "shape has %s element type, but has dimensions field: %s",
-          LowercasePrimitiveTypeName(shape.element_type()),
+          primitive_util::LowercasePrimitiveTypeName(shape.element_type()),
           shape.ShortDebugString());
     }
     if (shape.has_layout()) {
       return InvalidArgument(
           "shape has %s element type, but has layout field: %s",
-          LowercasePrimitiveTypeName(shape.element_type()),
+          primitive_util::LowercasePrimitiveTypeName(shape.element_type()),
           shape.ShortDebugString());
     }
     return Status::OK();
@@ -1067,6 +911,11 @@ bool ShapeUtil::IsLeafIndex(const Shape& shape, const ShapeIndex& index) {
   return absl::c_linear_search(shape.dimensions(), 1);
 }
 
+/* static */ Shape ShapeUtil::DropDegenerateDimensions(const Shape& shape) {
+  return FilterDimensions(
+      [&](int64 dim) -> bool { return shape.dimensions()[dim] != 1; }, shape);
+}
+
 namespace {
 
 // Helper for ForEachSubshape which visits the subshapes of the given shape in
@@ -1168,7 +1017,7 @@ Status ForEachMutableSubshapeHelper(
   // Let the argument `permutation` be P.  This is a permutation over `shape`'s
   // dimensions, so our return value will be a shape with dims P.I = P.  Our
   // goal is to construct a layout permutation L* that we can apply to P such
-  // that that the physical dimension ordering of the returned shape is the same
+  // that the physical dimension ordering of the returned shape is the same
   // as that of the original shape, namely L'.
   //
   // Our returned shape has dims P and layout L*, so its in-memory layout is
@@ -1618,10 +1467,10 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
   if (LayoutUtil::HasLayout(shape)) {
     Layout* layout = shape.mutable_layout();
     layout->set_format(DENSE);
-    for (size_t i = 0; i < layout->minor_to_major().size();) {
+    for (int64 i = 0; i < layout->minor_to_major().size();) {
       if (layout->minor_to_major(i) == dim_to_delete) {
         layout->mutable_minor_to_major()->erase(
-            layout->minor_to_major().begin() + i);
+            layout->mutable_minor_to_major()->begin() + i);
         continue;
       }
       if (layout->minor_to_major(i) > dim_to_delete) {
diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h
index 84a27f662a57ba274562e2e9be57b7e971c9b477..6b7a9cd34f25f2088bdb8d2c7f0412e5d8519d23 100644
--- a/tensorflow/compiler/xla/shape_util.h
+++ b/tensorflow/compiler/xla/shape_util.h
@@ -241,10 +241,6 @@ class ShapeUtil {
   // (param_name: f32[42x12], ...) -> f32[24x42]
   static string HumanString(const ProgramShape& program_shape);
 
-  // Parses a ShapeUtil::HumanString-format shape string back into a shape
-  // object.
-  static StatusOr<Shape> ParseShapeString(absl::string_view s);
-
   // Returns whether the LHS and RHS shapes have the same dimensions; note: does
   // not check element type.
   // Precondition: IsArray(lhs) && IsArray(rhs)
@@ -551,6 +547,9 @@ class ShapeUtil {
   // (dimensions with bound 1).
   static bool HasDegenerateDimensions(const Shape& shape);
 
+  // Drops any degenerate dimensions (i.e. dimensions of size 1)
+  static Shape DropDegenerateDimensions(const Shape& shape);
+
   // Permutes the dimensions by the given permutation, so
   // return_value.dimensions[permutation[i]] = argument.dimensions[i].
   //
diff --git a/tensorflow/compiler/xla/shape_util_test.cc b/tensorflow/compiler/xla/shape_util_test.cc
index 60bdbe302045e6f3b4bae500c50bc68fb217525d..0a3081f5161f80ac97e864ba08d186df4fbdb55d 100644
--- a/tensorflow/compiler/xla/shape_util_test.cc
+++ b/tensorflow/compiler/xla/shape_util_test.cc
@@ -82,102 +82,6 @@ TEST(ShapeUtilTest, Rank4DimensionIndexing) {
   ASSERT_EQ(3, shape.dimensions(0));
 }
 
-TEST(ShapeUtilTest, ParseShapeStringR2F32) {
-  string shape_string = "f32[123,456]";
-  TF_ASSERT_OK_AND_ASSIGN(Shape actual,
-                          ShapeUtil::ParseShapeString(shape_string));
-  Shape expected = ShapeUtil::MakeShape(F32, {123, 456});
-  ASSERT_TRUE(ShapeUtil::Equal(expected, actual))
-      << "expected: " << ShapeUtil::HumanString(expected)
-      << "actual:   " << ShapeUtil::HumanString(actual);
-}
-
-TEST(ShapeUtilTest, ParseShapeStringTupleOfArrays) {
-  string shape_string = "(f32[1572864],s8[5120,1024])";
-  TF_ASSERT_OK_AND_ASSIGN(Shape actual,
-                          ShapeUtil::ParseShapeString(shape_string));
-  Shape expected =
-      ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {1572864}),
-                                 ShapeUtil::MakeShape(S8, {5120, 1024})});
-  ASSERT_TRUE(ShapeUtil::Equal(expected, actual))
-      << "expected: " << ShapeUtil::HumanString(expected)
-      << "actual:   " << ShapeUtil::HumanString(actual);
-}
-
-TEST(ShapeUtilTest, ParseShapeStringNestedTuple) {
-  string shape_string = "(f32[1],(f32[2], token[]), opaque[], f32[3])";
-  TF_ASSERT_OK_AND_ASSIGN(Shape actual,
-                          ShapeUtil::ParseShapeString(shape_string));
-  Shape expected = ShapeUtil::MakeTupleShape({
-      ShapeUtil::MakeShape(F32, {1}),
-      ShapeUtil::MakeTupleShape(
-          {ShapeUtil::MakeShape(F32, {2}), ShapeUtil::MakeTokenShape()}),
-      ShapeUtil::MakeOpaqueShape(),
-      ShapeUtil::MakeShape(F32, {3}),
-  });
-  ASSERT_TRUE(ShapeUtil::Equal(expected, actual))
-      << "expected: " << ShapeUtil::HumanString(expected)
-      << "actual:   " << ShapeUtil::HumanString(actual);
-}
-
-TEST(ShapeUtilTest, ParseShapeStringWithLayout) {
-  string shape_string = "f32[123,456]{0,1}";
-  TF_ASSERT_OK_AND_ASSIGN(Shape actual,
-                          ShapeUtil::ParseShapeString(shape_string));
-  Shape expected = ShapeUtil::MakeShapeWithLayout(F32, {123, 456}, {0, 1});
-  ASSERT_TRUE(ShapeUtil::Equal(expected, actual))
-      << "expected: " << ShapeUtil::HumanString(expected)
-      << "actual:   " << ShapeUtil::HumanString(actual);
-}
-
-TEST(ShapeUtilTest, ParseShapeStringWithExplicitDenseLayout) {
-  string shape_string = "f32[123,456]dense{0,1}";
-  TF_ASSERT_OK_AND_ASSIGN(Shape actual,
-                          ShapeUtil::ParseShapeString(shape_string));
-  Shape expected = ShapeUtil::MakeShapeWithLayout(F32, {123, 456}, {0, 1});
-  ASSERT_TRUE(ShapeUtil::Equal(expected, actual))
-      << "expected: " << ShapeUtil::HumanString(expected)
-      << "actual:   " << ShapeUtil::HumanString(actual);
-}
-
-TEST(ShapeUtilTest, ParseShapeStringWithSparseLayout) {
-  string shape_string = "f32[123,456]sparse{10}";
-  TF_ASSERT_OK_AND_ASSIGN(Shape actual,
-                          ShapeUtil::ParseShapeString(shape_string));
-  Shape expected = ShapeUtil::MakeShapeWithSparseLayout(F32, {123, 456}, 10);
-  ASSERT_TRUE(ShapeUtil::Equal(expected, actual))
-      << "expected: " << ShapeUtil::HumanString(expected)
-      << "actual: " << ShapeUtil::HumanString(actual);
-}
-
-TEST(ShapeUtilTest, ParseOpaqueType) {
-  TF_ASSERT_OK_AND_ASSIGN(Shape actual,
-                          ShapeUtil::ParseShapeString("opaque[]"));
-  Shape expected = ShapeUtil::MakeOpaqueShape();
-  ASSERT_TRUE(ShapeUtil::Equal(expected, actual))
-      << "expected: " << ShapeUtil::HumanString(expected)
-      << "actual:   " << ShapeUtil::HumanString(actual);
-}
-
-TEST(ShapeUtilTest, ParseTokenType) {
-  TF_ASSERT_OK_AND_ASSIGN(Shape actual, ShapeUtil::ParseShapeString("token[]"));
-  Shape expected = ShapeUtil::MakeTokenShape();
-  ASSERT_TRUE(ShapeUtil::Equal(expected, actual))
-      << "expected: " << ShapeUtil::HumanString(expected)
-      << "actual:   " << ShapeUtil::HumanString(actual);
-}
-
-TEST(ShapeUtilTest, ParseInvalidShapeString) {
-  string shape_strings[] = {
-      "f32[123,456]foobar{0,1}", "f32[123,456]sparse{0,1}", "f32[123,456]{foo}",
-      "f32[123,456]dense{foo}",  "f32[123,456]sparse{foo}",
-  };
-  for (const string& shape_string : shape_strings) {
-    StatusOr<Shape> result = ShapeUtil::ParseShapeString(shape_string);
-    ASSERT_FALSE(result.ok()) << "shape: " << shape_string;
-  }
-}
-
 TEST(ShapeUtilTest, CompatibleIdenticalShapes) {
   Shape shape1 = ShapeUtil::MakeShape(F32, {3, 2});
   Shape shape2 = ShapeUtil::MakeShape(F32, {3, 2});
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index 2c18e2fd10105b6f0c146cad1842c7723699c8d9..0300b64ed59a3d4d8b0cd161109c97cabfdc6734 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -1,6 +1,13 @@
 # Description:
 #   Base testing infrastructure for XLA.
 
+load("//tensorflow/compiler/xla/tests:build_defs.bzl", "generate_backend_suites", "generate_backend_test_macros", "xla_test", "xla_test_library")
+load(
+    "//tensorflow/core:platform/default/build_config_root.bzl",
+    "tf_cuda_tests_tags",
+)
+load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test")
+
 licenses(["notice"])  # Apache 2.0
 
 package(
@@ -23,17 +30,6 @@ filegroup(
     ]),
 )
 
-load("//tensorflow/compiler/xla/tests:build_defs.bzl", "xla_test")
-load("//tensorflow/compiler/xla/tests:build_defs.bzl", "xla_test_library")
-load("//tensorflow/compiler/xla/tests:build_defs.bzl", "generate_backend_suites")
-load("//tensorflow/compiler/xla/tests:build_defs.bzl", "generate_backend_test_macros")
-load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
-load("//tensorflow:tensorflow.bzl", "tf_cc_test")
-load(
-    "//tensorflow/core:platform/default/build_config_root.bzl",
-    "tf_cuda_tests_tags",
-)
-
 # Generate test_suites for all backends, named "${backend}_tests".
 generate_backend_suites()
 
@@ -303,10 +299,31 @@ xla_test(
     name = "conv_depthwise_test",
     timeout = "long",
     srcs = ["conv_depthwise_test.cc"],
+    shard_count = 50,
+    deps = [
+        "//tensorflow/compiler/xla:execution_options_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla/client:xla_computation",
+        "//tensorflow/compiler/xla/service:bfloat16_normalization",
+        "//tensorflow/compiler/xla/service:despecializer",
+        "//tensorflow/compiler/xla/service:hlo_parser",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "@com_google_absl//absl/types:optional",
+    ],
+)
+
+xla_test(
+    name = "grouped_convolution_test",
+    timeout = "long",
+    srcs = ["grouped_convolution_test.cc"],
     blacklisted_backends = [
         # disabled because of a break b/119590850.
-        "cpu",
         "gpu",
+        # disabled because it times out.
+        "cpu",
     ],
     shard_count = 50,
     deps = [
@@ -1327,6 +1344,7 @@ xla_test(
 xla_test(
     name = "custom_call_test",
     srcs = ["custom_call_test.cc"],
+    backends = ["cpu"],
     deps = [
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:literal_util",
diff --git a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
index 0615f9425c1289d666641f4d581946b44b4895ce..915b456b52215f8d6a9eb6c5b933f3502f1d3d2c 100644
--- a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
@@ -329,13 +329,13 @@ TEST_P(ArrayElementwiseOpTestParamCount, AddManyValues) {
   Literal b_literal = LiteralUtil::CreateR1<float>({b_values});
   std::unique_ptr<GlobalData> b_data =
       client_->TransferToServer(b_literal).ConsumeValueOrDie();
-  auto b_constant = Parameter(&builder, 1, a_literal.shape(), "b_param");
-  auto b_param = ConstantR1<float>(&builder, b_values);
+  auto b_param = Parameter(&builder, 1, a_literal.shape(), "b_param");
+  auto b_constant = ConstantR1<float>(&builder, b_values);
 
-  auto sum1 = Add(a_constant, b_constant);
-  auto sum2 = Add(a_constant, b_param);
-  auto sum3 = Add(a_param, b_constant);
-  auto sum4 = Add(a_param, b_param);
+  auto sum1 = Add(a_constant, b_param);
+  auto sum2 = Add(a_constant, b_constant);
+  auto sum3 = Add(a_param, b_param);
+  auto sum4 = Add(a_param, b_constant);
 
   auto sum = Add(sum1, sum2);
   sum = Add(sum, sum3);
@@ -350,6 +350,44 @@ TEST_P(ArrayElementwiseOpTestParamCount, AddManyValues) {
                              error_spec_);
 }
 
+// TODO(b/119692968): This test runs OOM on the GPU and CPU backend.
+XLA_TEST_F(ArrayElementwiseOpTest,
+           DISABLED_ON_GPU(DISABLED_ON_CPU(DeeplyNestedAddWithSlices))) {
+  XlaBuilder builder(TestName());
+  std::vector<float> values(30, 0.0);
+  auto a_literal = LiteralUtil::CreateR1<float>(values);
+  auto a = Parameter(&builder, 0, a_literal.shape(), "x");
+  auto b_literal = LiteralUtil::CreateR1<float>(values);
+  auto b = Parameter(&builder, 1, b_literal.shape(), "x");
+
+  // Construct a sequence of diamond-shaped gadgets like this:
+  //
+  //      add
+  //    /    \
+  //  slice  slice
+  //     \   /
+  //      add
+  //
+  // Each 'left' slice removes the last element, each 'right' slice removes the
+  // first element. In this way, we index into the add with different
+  // multi-dimensional index arrays, which defeats the caching we use to avoid
+  // exponential compile time.
+  std::function<XlaOp(int64)> generate_recursive =
+      [&](int64 slice_size) -> XlaOp {
+    if (slice_size == values.size()) {
+      return Add(a, b);
+    }
+    XlaOp param = generate_recursive(slice_size + 1);
+    auto slice1 = Slice(param, {0}, {slice_size}, {1});
+    auto slice2 = Slice(param, {1}, {slice_size + 1}, {1});
+    return Add(slice1, slice2);
+  };
+  generate_recursive(1);
+  auto a_data = client_->TransferToServer(a_literal).ConsumeValueOrDie();
+  auto b_data = client_->TransferToServer(b_literal).ConsumeValueOrDie();
+  ComputeAndCompareR1<float>(&builder, {0.0}, {a_data.get(), b_data.get()});
+}
+
 XLA_TEST_F(ArrayElementwiseOpTest, SubTwoConstantF32s) {
   XlaBuilder builder(TestName());
   auto a = ConstantR1<float>(&builder, {-2.5f, 3.14f, 2.25f, -10.0f, 6.0f});
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.cc b/tensorflow/compiler/xla/tests/client_library_test_base.cc
index 12c029983336cc9aed0fde4ce6881c9a00a9869e..697236dc6236738df08205fa3631a2919dd361c5 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.cc
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.cc
@@ -74,6 +74,9 @@ ClientLibraryTestBase::ClientLibraryTestBase(
   // default.
   execution_options_.mutable_debug_options()->add_xla_disable_hlo_passes(
       "constant_folding");
+
+  execution_options_.mutable_debug_options()
+      ->set_xla_hlo_evaluator_use_fast_path(true);
 }
 
 ClientLibraryTestBase::ClientLibraryTestBase(se::Platform* platform)
@@ -88,6 +91,9 @@ ClientLibraryTestBase::ClientLibraryTestBase(se::Platform* platform)
 
   execution_options_.mutable_debug_options()->add_xla_disable_hlo_passes(
       "constant_folding");
+
+  execution_options_.mutable_debug_options()
+      ->set_xla_hlo_evaluator_use_fast_path(true);
 }
 
 string ClientLibraryTestBase::TestName() const {
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.h b/tensorflow/compiler/xla/tests/client_library_test_base.h
index 34148e5886d3806b19fc5bee90806c5678df345e..65a23dd883594b9bf9c37494a37e9be39b197788 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.h
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.h
@@ -76,7 +76,7 @@ class ClientLibraryTestBase : public ::testing::Test {
   void SetFastMathDisabled(bool disabled) {
     auto* opts = execution_options_.mutable_debug_options();
     opts->set_xla_cpu_enable_fast_math(!disabled);
-    opts->set_xla_gpu_enable_fast_math(!disabled);
+    opts->set_xla_gpu_enable_fast_min_max(!disabled);
   }
 
   void SetSeed(uint64 seed) { execution_options_.set_seed(seed); }
diff --git a/tensorflow/compiler/xla/tests/conv_depthwise_test.cc b/tensorflow/compiler/xla/tests/conv_depthwise_test.cc
index 60ce576ceb20b89b59e72d821e63b0ccdee51b0b..627a17a0ca114085240dbaf28211bb3511cf0cab 100644
--- a/tensorflow/compiler/xla/tests/conv_depthwise_test.cc
+++ b/tensorflow/compiler/xla/tests/conv_depthwise_test.cc
@@ -50,9 +50,9 @@ class DepthwiseConvolution2DTest
 static std::vector<DepthwiseConvolution2DSpec> GetConv2DTestCases() {
   std::vector<DepthwiseConvolution2DSpec> config_set;
   std::vector<std::vector<int64>> config_options = {
-      {128, 6, 3, 64},  {256, 5, 3, 256},  {256, 5, 2, 144}, {144, 5, 3, 64},
-      {144, 5, 2, 256}, {8, 48, 17, 8},    {128, 20, 6, 64}, {128, 1, 2, 144},
-      {256, 1, 2, 64},  {64, 14, 12, 172}, {16, 9, 4, 16}};
+      {128, 6, 3, 64},  {256, 5, 3, 256}, {256, 5, 2, 144}, {144, 5, 3, 64},
+      {144, 5, 2, 256}, {8, 48, 17, 8},   {128, 20, 6, 64}, {64, 14, 12, 172},
+      {16, 9, 4, 16},   {128, 1, 2, 144}, {256, 1, 2, 64}};
 
   for (auto option : config_options) {
     int64 feature = option[0];
@@ -136,7 +136,7 @@ string BuildHloTextDepthwiseConvolution2D(
   if (spec.activation_dims[1] == 1 && spec.kernel_dims[1] == 2) {
     return absl::StrFormat(
         R"(
-    HloModule TensorFlowDepthwiseConv, is_scheduled=true
+    HloModule TensorFlowDepthwiseConv
 
     ENTRY main {
       activation = %s[%s]{%s} parameter(0)
@@ -161,7 +161,7 @@ string BuildHloTextDepthwiseConvolution2D(
   } else if (spec.stride == -1) {
     return absl::StrFormat(
         R"(
-      HloModule TensorFlowDepthwiseConv, is_scheduled=true
+      HloModule TensorFlowDepthwiseConv
 
       ENTRY main {
         activation = %s[%s]{%s} parameter(0)
@@ -185,7 +185,7 @@ string BuildHloTextDepthwiseConvolution2D(
   } else {
     return absl::StrFormat(
         R"(
-    HloModule TensorFlowDepthwiseConv, is_scheduled=true
+    HloModule TensorFlowDepthwiseConv
 
     ENTRY main {
       activation = %s[%s]{%s} parameter(0)
@@ -215,13 +215,13 @@ XLA_TEST_P(DepthwiseConvolution2DTest, DoIt) {
   const string hlo_text =
       BuildHloTextDepthwiseConvolution2D(spec, use_bfloat16);
 
-  EXPECT_TRUE(RunAndCompareNoHloPasses(
-      hlo_text, ErrorSpec{0.01, 0.01}, [](HloModule* module) -> Status {
-        BFloat16MixedPrecisionRemoval remover;
-        TF_RETURN_IF_ERROR(remover.Run(module).status());
-        Despecializer despecializer;
-        return despecializer.Run(module).status();
-      }));
+  EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{0.01, 0.01},
+                            [](HloModule* module) -> Status {
+                              BFloat16MixedPrecisionRemoval remover;
+                              TF_RETURN_IF_ERROR(remover.Run(module).status());
+                              Despecializer despecializer;
+                              return despecializer.Run(module).status();
+                            }));
 }
 
 INSTANTIATE_TEST_CASE_P(
diff --git a/tensorflow/compiler/xla/tests/copy_test.cc b/tensorflow/compiler/xla/tests/copy_test.cc
index 3622f2c1e84639baed13059b21b20609d1347da6..df005a67097bb8aaf070c57d1c51acd1909fee12 100644
--- a/tensorflow/compiler/xla/tests/copy_test.cc
+++ b/tensorflow/compiler/xla/tests/copy_test.cc
@@ -133,7 +133,9 @@ XLA_TEST_F(CopyOpTest, CopyConstantR2DifferentLayouts) {
   // Reverse the minor-to-major order of the literal.
   Layout* literal_layout = literal.mutable_shape_do_not_use()->mutable_layout();
   ASSERT_EQ(2, literal_layout->minor_to_major_size());
-  literal_layout->mutable_minor_to_major()->SwapElements(0, 1);
+  // Swap the first and second elements.
+  *literal_layout->mutable_minor_to_major() = {
+      literal_layout->minor_to_major(1), literal_layout->minor_to_major(0)};
 
   HloInstruction* constant = builder.AddInstruction(
       HloInstruction::CreateConstant(std::move(literal)));
diff --git a/tensorflow/compiler/xla/tests/custom_call_test.cc b/tensorflow/compiler/xla/tests/custom_call_test.cc
index 738b6442354b01364278e3e3c713aa2cdb5cf47d..cad43d1b5547d74701760fa623e50466fc15c263 100644
--- a/tensorflow/compiler/xla/tests/custom_call_test.cc
+++ b/tensorflow/compiler/xla/tests/custom_call_test.cc
@@ -54,11 +54,20 @@ void Add1ToValues(float* out, float** in) {
   out[2] = array[2] + 1;
   out[3] = array[3] + 1;
 }
+
+void F32TupleSwap(float** out, float** in) {
+  TF_ANNOTATE_MEMORY_IS_INITIALIZED(in[0], sizeof(float));
+  TF_ANNOTATE_MEMORY_IS_INITIALIZED(in[1], sizeof(float));
+  *out[0] = *in[1];
+  *out[1] = *in[0];
+}
+
 }  // namespace
 
 REGISTER_CUSTOM_CALL_TARGET(R0F32Add2);
 REGISTER_CUSTOM_CALL_TARGET(R2F32ReduceSum);
 REGISTER_CUSTOM_CALL_TARGET(Add1ToValues);
+REGISTER_CUSTOM_CALL_TARGET(F32TupleSwap);
 
 namespace xla {
 namespace {
@@ -69,7 +78,7 @@ class CustomCallTest : public HloTestBase {
   Shape r2f32_ = ShapeUtil::MakeShape(F32, {2, 2});
 };
 
-XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(CustomCallR0F32Add2)) {
+XLA_TEST_F(CustomCallTest, CustomCallR0F32Add2) {
   auto module = CreateNewUnverifiedModule();
   auto builder = HloComputation::Builder(TestName());
 
@@ -84,7 +93,7 @@ XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(CustomCallR0F32Add2)) {
   LiteralTestUtil::ExpectR0Near<float>(44.0f, result, error_spec_);
 }
 
-XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(CustomCallR2F32Reduce)) {
+XLA_TEST_F(CustomCallTest, CustomCallR2F32Reduce) {
   auto module = CreateNewUnverifiedModule();
   auto builder = HloComputation::Builder(TestName());
 
@@ -105,7 +114,7 @@ XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(CustomCallR2F32Reduce)) {
   LiteralTestUtil::ExpectR0Near<float>(10.0f, result, error_spec_);
 }
 
-XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(UsedInOtherComputations)) {
+XLA_TEST_F(CustomCallTest, UsedInOtherComputations) {
   auto module = CreateNewUnverifiedModule();
   auto b = HloComputation::Builder(TestName());
 
@@ -129,7 +138,7 @@ XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(UsedInOtherComputations)) {
       Array3D<float>{{{2, 3}, {4, 5}}, {{3, 4}, {5, 6}}}, result);
 }
 
-XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(InputAndOutputLayoutDiffer)) {
+XLA_TEST_F(CustomCallTest, InputAndOutputLayoutDiffer) {
   auto module = CreateNewUnverifiedModule();
   auto b = HloComputation::Builder(TestName());
 
@@ -151,7 +160,7 @@ XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(InputAndOutputLayoutDiffer)) {
   LiteralTestUtil::ExpectR2Equal<float>({{2.f, 4.f}, {3.f, 5.f}}, result);
 }
 
-XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(LayoutConstrained)) {
+XLA_TEST_F(CustomCallTest, LayoutConstrained) {
   // The argument and result of the computation are set to different layouts,
   // but the custom call is layout constrained to a fixed operand and result
   // layout, so the correct result should be produced.
@@ -176,6 +185,26 @@ XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(LayoutConstrained)) {
   LiteralTestUtil::ExpectR2Equal<float>({{2.f, 3.f}, {4.f, 5.f}}, result);
 }
 
+XLA_TEST_F(CustomCallTest, TupleOutput) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p0 = f32[] parameter(0)
+      p1 = f32[] parameter(1)
+      ROOT %custom-call = (f32[], f32[]) custom-call(f32[] %p0, f32[] %p1), custom_call_target="F32TupleSwap", operand_layout_constraints={f32[], f32[]}
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(kModuleStr));
+
+  Literal arg0 = LiteralUtil::CreateR0<float>(7.f);
+  Literal arg1 = LiteralUtil::CreateR0<float>(42.f);
+
+  Literal expected = LiteralUtil::MakeTuple({&arg1, &arg0});
+  Literal result = ExecuteAndTransfer(std::move(module), {&arg0, &arg1});
+  EXPECT_EQ(result, expected);
+}
+
 class CustomCallClientAPITest : public ClientLibraryTestBase {};
 
 // When using the client API, CustomCall targets can't begin with '$' -- these
diff --git a/tensorflow/compiler/xla/tests/dot_operation_test.cc b/tensorflow/compiler/xla/tests/dot_operation_test.cc
index 25091b8d5d5498edf3ce86efe225cd0e2fd8ff6b..c5d8b663f4abe77e05ec213d2e4e075c260a8655 100644
--- a/tensorflow/compiler/xla/tests/dot_operation_test.cc
+++ b/tensorflow/compiler/xla/tests/dot_operation_test.cc
@@ -30,7 +30,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
 
 namespace xla {
 namespace {
diff --git a/tensorflow/compiler/xla/tests/grouped_convolution_test.cc b/tensorflow/compiler/xla/tests/grouped_convolution_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8f7049910e70c4e591636a47c1b6ba72cf2c234f
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/grouped_convolution_test.cc
@@ -0,0 +1,245 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "absl/types/optional.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
+#include "tensorflow/compiler/xla/execution_options_util.h"
+#include "tensorflow/compiler/xla/service/bfloat16_normalization.h"
+#include "tensorflow/compiler/xla/service/despecializer.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+
+namespace xla {
+namespace {
+
+string GetFloatDataType(bool use_bfloat16) {
+  return use_bfloat16 ? "bf16" : "f32";
+}
+
+struct GroupedConvolution2DSpec {
+  int64 input_feature, output_feature, window, stride, pad, lhs_dilate;
+  int64 group_size, group_count;
+  std::vector<int64> activation_dims;
+  std::vector<int64> activation_layout;
+  std::vector<int64> kernel_dims;
+  std::vector<int64> kernel_layout;
+  std::vector<int64> output_dims;
+  std::vector<int64> output_layout;
+};
+
+class GroupedConvolution2DTest
+    : public HloTestBase,
+      public ::testing::WithParamInterface<
+          ::testing::tuple<GroupedConvolution2DSpec, bool>> {};
+
+static std::vector<GroupedConvolution2DSpec> GetConv2DTestCases() {
+  std::vector<GroupedConvolution2DSpec> config_set;
+  // Add to this set if you want a new test configuration.
+  // Rule : the penultimate number must be divisible by the last number.
+  std::vector<std::vector<int64>> config_options = {{8, 2, 2, 1, 1024, 128},
+                                                    {512, 3, 3, 144, 1024, 16},
+                                                    {256, 3, 3, 129, 512, 64},
+                                                    {64, 1, 2, 127, 32, 8},
+                                                    {256, 3, 3, 256, 1024, 4}};
+
+  for (auto option : config_options) {
+    int64 output_feature = option[0];
+    int64 activation_size = option[1];
+    int64 kernel_size = option[2];
+    int64 batch = option[3];
+    int64 input_feature = option[4];
+    int64 group_size = option[5];
+
+    std::vector<int64> kernel_layout = {3, 2, 1, 0};
+    GroupedConvolution2DSpec config;
+    config.group_size = group_size;
+    config.group_count = input_feature / group_size;
+    config.output_feature = output_feature;
+    config.window = kernel_size;
+
+    config.activation_dims = {batch, activation_size, activation_size,
+                              input_feature};
+    config.activation_layout = {3, 0, 2, 1};
+
+    config.kernel_dims = {kernel_size, kernel_size, group_size, output_feature};
+    config.kernel_layout = {3, 2, 1, 0};
+
+    if (activation_size == 1 && kernel_size == 2) {
+      // Test for outer dim.
+      config.output_dims = {batch, activation_size + kernel_size - 1,
+                            activation_size + kernel_size, output_feature};
+    } else if (output_feature == 256) {
+      // Restrict dilation-based tests only to one feature configuration.
+      config.stride = activation_size - 1;
+      config.pad = 0;
+      config.lhs_dilate = output_feature / 32;
+      config.output_dims = {batch, output_feature / 32,
+                            activation_size - kernel_size + 1, output_feature};
+    } else {
+      config.stride = config.pad = config.lhs_dilate = -1;
+      config.output_dims = {batch, activation_size - kernel_size + 1,
+                            activation_size - kernel_size + 1, output_feature};
+    }
+
+    // Try this layout for all kernel shapes.
+    config.output_layout = {3, 0, 2, 1};
+    config_set.push_back(config);
+
+    // Try other layouts only for certain kernel shapes.
+    if (kernel_size % 2 == 0) {
+      config.activation_layout = {0, 3, 2, 1};
+      config_set.push_back(config);
+
+      config.output_layout = {0, 3, 2, 1};
+      config_set.push_back(config);
+
+      config.activation_layout = {3, 0, 2, 1};
+      config_set.push_back(config);
+    }
+  }
+
+  return config_set;
+}
+
+string GroupedConvolution2DTestDataToString(
+    const ::testing::TestParamInfo<
+        ::testing::tuple<GroupedConvolution2DSpec, bool>>& data) {
+  const auto& spec = ::testing::get<0>(data.param);
+  const string data_type = GetFloatDataType(::testing::get<1>(data.param));
+  string str = absl::StrCat(
+      "activation_dims_", absl::StrJoin(spec.activation_dims, "x"),
+      "_activation_layout_", absl::StrJoin(spec.activation_layout, "_"),
+      "_kernel_dims_", absl::StrJoin(spec.kernel_dims, "x"), "_kernel_layout_",
+      absl::StrJoin(spec.kernel_layout, "_"), "_output_dims_",
+      absl::StrJoin(spec.output_dims, "x"), "_output_layout_",
+      absl::StrJoin(spec.output_layout, "_"), data_type);
+  // -1 indicates non-existence.
+  if (spec.stride != -1) {
+    absl::StrAppend(&str, "_lhs_dilation_", spec.lhs_dilate, "x1");
+  }
+
+  // Test names are not allowed to contain the '-' character.
+  absl::c_replace(str, '-', 'n');
+  return str;
+}
+
+string BuildHloTextGroupedConvolution2D(const GroupedConvolution2DSpec& spec,
+                                        bool use_bfloat16) {
+  const string data_type = GetFloatDataType(use_bfloat16);
+  if (spec.activation_dims[1] == 1 && spec.kernel_dims[1] == 2) {
+    // Check for outer dim.
+    return absl::StrFormat(
+        R"(
+    HloModule TensorFlowDepthwiseConv
+
+    ENTRY main {
+      activation = %s[%s]{%s} parameter(0)
+      kernel = %s[%s]{%s} parameter(1)
+      ROOT conv = %s[%s]{%s} convolution(%s[%s]{%s} activation, %s[%s]{%s} kernel),
+          window={size=%dx%d  pad=1_1x%d_%d rhs_dilate=1x%d}, dim_labels=b01f_01io->b01f,
+          feature_group_count=%d
+    }
+    )",
+        data_type, absl::StrJoin(spec.activation_dims, ","),
+        absl::StrJoin(spec.activation_layout, ","), data_type,
+        absl::StrJoin(spec.kernel_dims, ","),
+        absl::StrJoin(spec.kernel_layout, ","), data_type,
+        absl::StrJoin(spec.output_dims, ","),
+        absl::StrJoin(spec.output_layout, ","), data_type,
+        absl::StrJoin(spec.activation_dims, ","),
+        absl::StrJoin(spec.activation_layout, ","), data_type,
+        absl::StrJoin(spec.kernel_dims, ","),
+        absl::StrJoin(spec.kernel_layout, ","), spec.window, spec.window,
+        spec.window, spec.window, spec.window, spec.group_count);
+
+  } else if (spec.stride == -1) {
+    // Check for basic, non-dilated cases.
+    return absl::StrFormat(
+        R"(
+      HloModule TensorFlowDepthwiseConv
+
+      ENTRY main {
+        activation = %s[%s]{%s} parameter(0)
+        kernel = %s[%s]{%s} parameter(1)
+        ROOT conv = %s[%s]{%s} convolution(%s[%s]{%s} activation, %s[%s]{%s} kernel),
+            window={size=%dx%d}, dim_labels=b01f_01io->b01f,
+            feature_group_count=%d
+      }
+      )",
+        data_type, absl::StrJoin(spec.activation_dims, ","),
+        absl::StrJoin(spec.activation_layout, ","), data_type,
+        absl::StrJoin(spec.kernel_dims, ","),
+        absl::StrJoin(spec.kernel_layout, ","), data_type,
+        absl::StrJoin(spec.output_dims, ","),
+        absl::StrJoin(spec.output_layout, ","), data_type,
+        absl::StrJoin(spec.activation_dims, ","),
+        absl::StrJoin(spec.activation_layout, ","), data_type,
+        absl::StrJoin(spec.kernel_dims, ","),
+        absl::StrJoin(spec.kernel_layout, ","), spec.window, spec.window,
+        spec.group_count);
+  } else {
+    // Check for base dilations.
+    return absl::StrFormat(
+        R"(
+    HloModule TensorFlowDepthwiseConv
+
+    ENTRY main {
+      activation = %s[%s]{%s} parameter(0)
+      kernel = %s[%s]{%s} parameter(1)
+      ROOT conv = %s[%s]{%s} convolution(%s[%s]{%s} activation, %s[%s]{%s} kernel),
+          window={size=%dx%d stride=%dx1 pad=%d_%dx0_0 lhs_dilate=%dx1}, 
+          dim_labels=b01f_01io->b01f, feature_group_count=%d
+    }
+    )",
+        data_type, absl::StrJoin(spec.activation_dims, ","),
+        absl::StrJoin(spec.activation_layout, ","), data_type,
+        absl::StrJoin(spec.kernel_dims, ","),
+        absl::StrJoin(spec.kernel_layout, ","), data_type,
+        absl::StrJoin(spec.output_dims, ","),
+        absl::StrJoin(spec.output_layout, ","), data_type,
+        absl::StrJoin(spec.activation_dims, ","),
+        absl::StrJoin(spec.activation_layout, ","), data_type,
+        absl::StrJoin(spec.kernel_dims, ","),
+        absl::StrJoin(spec.kernel_layout, ","), spec.window, spec.window,
+        spec.stride, 0, 0, spec.lhs_dilate, spec.group_count);
+  }
+}
+
+XLA_TEST_P(GroupedConvolution2DTest, DoIt) {
+  const GroupedConvolution2DSpec& spec = ::testing::get<0>(GetParam());
+  bool use_bfloat16 = ::testing::get<1>(GetParam());
+  const string hlo_text = BuildHloTextGroupedConvolution2D(spec, use_bfloat16);
+
+  EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{0.01, 0.01},
+                            [](HloModule* module) -> Status {
+                              BFloat16MixedPrecisionRemoval remover;
+                              TF_RETURN_IF_ERROR(remover.Run(module).status());
+                              Despecializer despecializer;
+                              return despecializer.Run(module).status();
+                            }));
+}
+
+INSTANTIATE_TEST_CASE_P(
+    GroupedConvolution2DTestWithRandomIndices, GroupedConvolution2DTest,
+    ::testing::Combine(::testing::ValuesIn(GetConv2DTestCases()),
+                       ::testing::Bool()),
+    GroupedConvolution2DTestDataToString);
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc
index 989a7c705a8254f99e5cc0e97dfde5942f146964..d57846e19bb80c5b9c87d50596da2915f9aef317 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.cc
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc
@@ -181,6 +181,7 @@ DebugOptions HloTestBase::GetDebugOptionsForTest() {
   // TODO(b/38354253): Change tests to use Parameters instead of Constants.
   debug_options.add_xla_disable_hlo_passes("constant_folding");
   debug_options.set_xla_gpu_max_kernel_unroll_factor(1);
+  debug_options.set_xla_hlo_evaluator_use_fast_path(true);
   return debug_options;
 }
 
diff --git a/tensorflow/compiler/xla/tests/iota_test.cc b/tensorflow/compiler/xla/tests/iota_test.cc
index 65205f53ddc582ae477d67705f161fef1e31b857..37b2c635eebe57590e1ba73c62f015ccf399b548 100644
--- a/tensorflow/compiler/xla/tests/iota_test.cc
+++ b/tensorflow/compiler/xla/tests/iota_test.cc
@@ -80,7 +80,7 @@ TEST_P(IotaR2Test, DoIt) {
 }
 
 INSTANTIATE_TEST_CASE_P(IotaR2TestInstantiation, IotaR2Test,
-                        ::testing::Combine(::testing::Values(F32, S32),
+                        ::testing::Combine(::testing::Values(F32, S32, BF16),
                                            ::testing::Range(/*start=*/10,
                                                             /*end=*/1001,
                                                             /*step=*/10),
diff --git a/tensorflow/compiler/xla/tests/literal_test_util_test.cc b/tensorflow/compiler/xla/tests/literal_test_util_test.cc
index b6f9b8156b51144e4f74d285b1e4111d098f13c2..ea9b3037cf482e41238413179888f125822d161c 100644
--- a/tensorflow/compiler/xla/tests/literal_test_util_test.cc
+++ b/tensorflow/compiler/xla/tests/literal_test_util_test.cc
@@ -89,11 +89,11 @@ TEST(LiteralTestUtilTest, ExpectNearFailurePlacesResultsInTemporaryDirectory) {
     Literal literal =
         Literal::CreateFromProto(literal_proto).ConsumeValueOrDie();
     if (result.find("expected") != string::npos) {
-      EXPECT_EQ("2", literal.ToString());
+      EXPECT_EQ("f32[] 2", literal.ToString());
     } else if (result.find("actual") != string::npos) {
-      EXPECT_EQ("4", literal.ToString());
+      EXPECT_EQ("f32[] 4", literal.ToString());
     } else if (result.find("mismatches") != string::npos) {
-      EXPECT_EQ("true", literal.ToString());
+      EXPECT_EQ("pred[] true", literal.ToString());
     } else {
       FAIL() << "unknown file in temporary directory: " << result;
     }
@@ -105,9 +105,9 @@ TEST(LiteralTestUtilTest, NotEqualHasValuesInMessage) {
   auto actual = LiteralUtil::CreateR1<int32>({4, 5, 6});
   ::testing::AssertionResult result = LiteralTestUtil::Equal(expected, actual);
   EXPECT_THAT(result.message(),
-              ::testing::HasSubstr("Expected literal:\n{1, 2, 3}"));
+              ::testing::HasSubstr("Expected literal:\ns32[3] {1, 2, 3}"));
   EXPECT_THAT(result.message(),
-              ::testing::HasSubstr("Actual literal:\n{4, 5, 6}"));
+              ::testing::HasSubstr("Actual literal:\ns32[3] {4, 5, 6}"));
 }
 
 TEST(LiteralTestUtilTest, NearComparatorR1) {
diff --git a/tensorflow/compiler/xla/tests/test_utils_test.cc b/tensorflow/compiler/xla/tests/test_utils_test.cc
index e8f5d7a9a79ebddea3cb989dbe8eab90b630d5e7..448a66cfdd897b17cce1c87c050520a2f2eb0ea2 100644
--- a/tensorflow/compiler/xla/tests/test_utils_test.cc
+++ b/tensorflow/compiler/xla/tests/test_utils_test.cc
@@ -61,11 +61,11 @@ XLA_TEST_F(TestUtilsTest, Token) {
                     R"(HloModule outfeed_module
 
     ENTRY InfeedToOutfeed {
-      token = token[] parameter(0)
-      infeed = ((u32[3]{0}, pred[]), token[]) infeed(token)
+      token0 = token[] parameter(0)
+      infeed = ((u32[3]{0}, pred[]), token[]) infeed(token0)
       infeed.data = (u32[3]{0}, pred[]) get-tuple-element(infeed), index=0
-      outfeed = token[] outfeed(infeed.data, token)
-      ROOT infeed.1 = ((u32[3]{0}, pred[]), token[]) infeed(token)
+      outfeed = token[] outfeed(infeed.data, token0)
+      ROOT infeed.1 = ((u32[3]{0}, pred[]), token[]) infeed(token0)
       infeed.1.data = (u32[3]{0}, pred[]) get-tuple-element(infeed.1), index=0
       infeed.1.token = token[] get-tuple-element(infeed.1), index=1
       outfeed.1 = token[] outfeed(infeed.1.data, infeed.1.token)
diff --git a/tensorflow/compiler/xla/tests/token_hlo_test.cc b/tensorflow/compiler/xla/tests/token_hlo_test.cc
index 601c6b06938fef1f1ae809b33209ae59b24c70a2..b77cf38ed8e29973985406015c0a3936916ad5e6 100644
--- a/tensorflow/compiler/xla/tests/token_hlo_test.cc
+++ b/tensorflow/compiler/xla/tests/token_hlo_test.cc
@@ -214,8 +214,8 @@ ENTRY %AddDependency (p0: f32[], p1: f32[]) -> f32[] {
 
   %forty_two = f32[] constant(42.0)
   %add = f32[] add(f32[] %p0, f32[] %forty_two)
-  %token = token[] after-all(f32[] %add)
-  %p1_after_token = f32[] add-dependency(f32[] %p1, token[] %token)
+  %token0 = token[] after-all(f32[] %add)
+  %p1_after_token = f32[] add-dependency(f32[] %p1, token[] %token0)
   %neg = f32[] negate(f32[] %p1_after_token)
   ROOT %product = f32[] multiply(f32[] %add, f32[] %neg)
 }
@@ -236,8 +236,8 @@ HloModule AddDependencyOfConstant, is_scheduled=true
 ENTRY %AddDependency (p0: f32[]) -> f32[] {
   %p0 = f32[] parameter(0)
   %forty_two = f32[] constant(42.0)
-  %token = token[] after-all(f32[] %p0)
-  %forty_two_after_token = f32[] add-dependency(f32[] %forty_two, token[] %token)
+  %token0 = token[] after-all(f32[] %p0)
+  %forty_two_after_token = f32[] add-dependency(f32[] %forty_two, token[] %token0)
   ROOT %product = f32[] multiply(f32[] %p0, f32[] %forty_two_after_token)
 }
 )";
@@ -255,8 +255,8 @@ HloModule AddDependencyAsRoot, is_scheduled=true
 ENTRY %AddDependency (p: f32[3]) -> f32[3] {
   %p = f32[3] parameter(0)
   %neg = f32[3] negate(f32[3] %p)
-  %token = token[] after-all()
-  ROOT %add_dep = f32[3] add-dependency(f32[3] %neg, token[] %token)
+  %token0 = token[] after-all()
+  ROOT %add_dep = f32[3] add-dependency(f32[3] %neg, token[] %token0)
 }
 )";
   TF_ASSERT_OK_AND_ASSIGN(
@@ -274,9 +274,9 @@ ENTRY %TupleShapedAddDependency (p0: f32[3], p1: f32[3]) -> f32[3] {
   %p0 = f32[3] parameter(0)
   %p1 = f32[3] parameter(1)
   %forty_two = f32[] constant(42.0)
-  %token = token[] after-all()
-  %tuple = (f32[3], token[], f32[3], f32[]) tuple(f32[3] %p0, token[] %token, f32[3] %p1, f32[] %forty_two)
-  %add_dep = (f32[3], token[], f32[3], f32[]) add-dependency((f32[3], token[], f32[3], f32[]) %tuple, token[] %token)
+  %token0 = token[] after-all()
+  %tuple = (f32[3], token[], f32[3], f32[]) tuple(f32[3] %p0, token[] %token0, f32[3] %p1, f32[] %forty_two)
+  %add_dep = (f32[3], token[], f32[3], f32[]) add-dependency((f32[3], token[], f32[3], f32[]) %tuple, token[] %token0)
   %elem0 = f32[3] get-tuple-element((f32[3], token[], f32[3], f32[]) %add_dep), index=0
   %elem2 = f32[3] get-tuple-element((f32[3], token[], f32[3], f32[]) %add_dep), index=2
   ROOT %diff = f32[3] subtract(f32[3] %elem0, f32[3] %elem2)
diff --git a/tensorflow/compiler/xla/tests/tuple_test.cc b/tensorflow/compiler/xla/tests/tuple_test.cc
index 27ce243e9bd4afbdcc1fdc5b6873d4968086e459..9c586bdeb05afb7378e92caed1f3edc408e051bf 100644
--- a/tensorflow/compiler/xla/tests/tuple_test.cc
+++ b/tensorflow/compiler/xla/tests/tuple_test.cc
@@ -555,8 +555,8 @@ XLA_TEST_F(TupleHloTest,
       s = (f32[2],f32[2]) tuple-select(cond, tup0, tup1)
       gte = f32[2] get-tuple-element(s), index=0
       tuple = (f32[2]) tuple(gte)
-      token = token[] after-all()
-      ROOT outfeed = token[] outfeed(tuple, token)
+      token0 = token[] after-all()
+      ROOT outfeed = token[] outfeed(tuple, token0)
     }
   )";
   auto module =
diff --git a/tensorflow/compiler/xla/text_literal_reader.cc b/tensorflow/compiler/xla/text_literal_reader.cc
index cdde88c1359416d423685f330e9cbdf77948034f..c78ec522aa5f13556c6d4602267544694887f250 100644
--- a/tensorflow/compiler/xla/text_literal_reader.cc
+++ b/tensorflow/compiler/xla/text_literal_reader.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "absl/strings/strip.h"
 #include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -66,7 +67,7 @@ StatusOr<Literal> TextLiteralReader::ReadAllLines() {
   }
 
   absl::StripAsciiWhitespace(&shape_string);
-  TF_ASSIGN_OR_RETURN(Shape shape, ShapeUtil::ParseShapeString(shape_string));
+  TF_ASSIGN_OR_RETURN(Shape shape, ParseShape(shape_string));
   if (shape.element_type() != F32) {
     return Unimplemented(
         "unsupported element type for text literal reading: %s",
diff --git a/tensorflow/compiler/xla/tools/replay_computation.cc b/tensorflow/compiler/xla/tools/replay_computation.cc
index ff2c3399928c0e6339304323c4f93e212933a340..27a8dd13308b29da9a5013ac9f696613981d68bb 100644
--- a/tensorflow/compiler/xla/tools/replay_computation.cc
+++ b/tensorflow/compiler/xla/tools/replay_computation.cc
@@ -118,7 +118,12 @@ StatusOr<Literal> ReplayComputation(const HloSnapshot& module,
   std::vector<std::unique_ptr<GlobalData>> global_data_arguments;
   std::vector<const ShapedBuffer*> argument_ptrs;
   if (opts.use_fake_data) {
-    global_data_arguments = MakeFakeArgumentsOrDie(computation, client);
+    // Run fake computations with debug options ignoring XLA_FLAGS.  Users very
+    // likely want XLA_FLAGS only to apply to the "real" computation being run,
+    // not to the fake computations we use for generating arguments.
+    auto debug_opts = DefaultDebugOptionsIgnoringFlags();
+    global_data_arguments =
+        MakeFakeArgumentsOrDie(computation, client, &debug_opts);
     for (const auto& data : global_data_arguments) {
       argument_ptrs.push_back(
           client->GlobalDataToShapedBuffer(data->handle(), /*device_ordinal=*/0)
@@ -140,8 +145,7 @@ StatusOr<Literal> ReplayComputation(const HloSnapshot& module,
   bool provide_infeed = false;
   Shape infeed_shape;
   if (!opts.fake_infeed_shape.empty()) {
-    StatusOr<Shape> shape_status =
-        ShapeUtil::ParseShapeString(opts.fake_infeed_shape);
+    StatusOr<Shape> shape_status = ParseShape(opts.fake_infeed_shape);
     TF_CHECK_OK(shape_status.status());
     infeed_shape = std::move(shape_status).ValueOrDie();
     provide_infeed = true;
diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto
index f745fb850655edaba8c95ba0cd3af3cc765b99e6..0e8fa73f8170addfa5061b33f3d6882a13890bce 100644
--- a/tensorflow/compiler/xla/xla.proto
+++ b/tensorflow/compiler/xla/xla.proto
@@ -100,6 +100,14 @@ message DebugOptions {
   // names as specified by the HloPassInterface::name() method.
   repeated string xla_disable_hlo_passes = 30;
 
+  // Disables all HLO passes.  Notes that some passes are necessary for
+  // correctness and the invariants that must be satisfied by "fully optimized"
+  // HLO are different for different devices and may change over time.  The only
+  // "guarantee", such as it is, is that if you compile XLA and dump the
+  // optimized HLO for some graph, you should be able to run it again on the
+  // same device with the same build of XLA.
+  bool xla_disable_all_hlo_passes = 104;
+
   // Numerical optimization level for the XLA compiler backend; the specific
   // interpretation of this value is left to the backends.
   int32 xla_backend_optimization_level = 31;
@@ -193,7 +201,11 @@ message DebugOptions {
   //  - Assuming that operations never produce or consume NaN or +/- Inf.
   //  - Assuming that +0 and -0 are indistinguishable.
   bool xla_cpu_enable_fast_math = 99;
-  bool xla_gpu_enable_fast_math = 100;
+
+  // When true we lower the Minimum and Maximum hlos in the GPU backend such
+  // that Min(NotNaN, NaN) = Min(NaN, NotNaN) = NotNaN.  In other words, if flag
+  // this is true we don't propagate NaNs through Min and Max.
+  bool xla_gpu_enable_fast_min_max = 100;
 
   // Crashes the program when any kind of verification fails, instead of just
   // logging the failures. One example is cross checking of convolution results
@@ -209,6 +221,17 @@ message DebugOptions {
   // the host that run models in parallel across multiple devices.
   int32 xla_force_host_platform_device_count = 102;
 
+  // If set to true XLA:GPU invokes `ptxas` with -O0 (default is -O3).
+  bool xla_gpu_disable_ptxas_optimizations = 103;
+
+  // Dump HLO graphs as an HTML (DOT -> SVG inlined in HTML)
+  bool xla_hlo_dump_as_html = 105;
+
+  // Enable fast math with eigen in the HLO evaluator.
+  bool xla_hlo_evaluator_use_fast_path = 106;
+
+  // Next id: 107
+
   // Extra options to pass to the compilation backend (e.g. LLVM); specific
   // interpretation of these values is left to the backend.
   map<string, string> xla_backend_extra_options = 500;
@@ -382,7 +405,7 @@ message WaitForExecutionResponse {
 
 message ComputeConstantGraphRequest {
   HloModuleProto computation = 1;
-  Layout output_layout = 2;
+  LayoutProto output_layout = 2;
 }
 
 message ComputeConstantResponse {
diff --git a/tensorflow/compiler/xla/xla_data.proto b/tensorflow/compiler/xla/xla_data.proto
index 85ec83437a10d973687a7fb84285c2e2541a53c7..e9c86abe5094244988d3465ef7c949509deaec37 100644
--- a/tensorflow/compiler/xla/xla_data.proto
+++ b/tensorflow/compiler/xla/xla_data.proto
@@ -100,6 +100,8 @@ message PaddingConfig {
 
 // A format specifies the method used by a layout to store an array in memory.
 enum Format {
+  // TODO(b/120869032): Rename this to FORMAT_NONE or something else which
+  // better corresponds to its meaning.
   INVALID_FORMAT = 0;
   // The default layout, with exactly one storage location per element.
   DENSE = 1;
@@ -109,8 +111,9 @@ enum Format {
 }
 
 // Describes a tile used in tiling-based layout. Refer to
-// g3doc/layout_with_tiling.md for details about tiling-based layout.
-message Tile {
+// g3doc/third_party/tensorflow/compiler/xla/g3doc/layout_with_tiling.md for
+// details about tiling-based layout.
+message TileProto {
   // Number of elements in each dimension of the tile. It's ordered from the
   // most major dimension of the tile to the most minor dimension of the tile.
   // The dimensions correspond to a suffix of the dimensions of the shape being
@@ -128,7 +131,7 @@ message Tile {
 // See the XLA documentation for more information on shapes and layouts.
 //
 // LINT.IfChange
-message Layout {
+message LayoutProto {
   // The method used to store the data in memory. The format determines which of
   // the other fields are used by the layout.
   Format format = 4;
@@ -153,7 +156,7 @@ message Layout {
   //
   // TODO(b/119839262): implement tiling in each backend or add Unimplemented
   // error.
-  repeated Tile tiles = 6;
+  repeated TileProto tiles = 6;
 
   // Bit size of each element. If the size is bigger than what the element
   // type requires, the value is stored in the least significant
@@ -196,7 +199,7 @@ message ShapeProto {
   repeated ShapeProto tuple_shapes = 4;
 
   // The layout used to back this shape.
-  Layout layout = 5;
+  LayoutProto layout = 5;
 
   // Important: if any field is added, be sure to modify ShapeUtil::Equal(),
   // ShapeUtil::Compatible() and ShapeUtil::Hash() appropriately to account for
diff --git a/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc b/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc
index 8c6191ddc06ea7d85f5fd21a7d4058c669ffdeb2..751329eefc33f3372335c805233dafabbf42bf36 100644
--- a/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc
+++ b/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc
@@ -228,14 +228,35 @@ Status XRTExecuteOp::DoWork(OpKernelContext* context) {
   TF_RETURN_IF_ERROR(XRTTupleAllocation::CreateFromBuffer(
       shaped_buffer, device_ref.backend(), device_ref.device_ordinal(),
       &output_tuple));
-
-  Tensor* output_tensor;
-  TF_RETURN_IF_ERROR(
-      context->allocate_output(0, TensorShape({}), &output_tensor));
-  int64 key;
-  TF_RETURN_IF_ERROR(output_tuple->Intern(rm, &key));
-  output_tensor->scalar<int64>()() = key;
-
+  if (config_proto.return_exploded_tuple() &&
+      xla::ShapeUtil::IsTuple(output_tuple->on_device_shape())) {
+    int64 tuple_element_count =
+        xla::ShapeUtil::TupleElementCount(output_tuple->on_device_shape());
+    Tensor* output_tensor;
+    TF_RETURN_IF_ERROR(context->allocate_output(
+        0, TensorShape({tuple_element_count}), &output_tensor));
+
+    for (int64 i = 0; i < tuple_element_count; ++i) {
+      xla::ShapeIndex shape_index;
+      shape_index.push_back(i);
+
+      XRTTupleAllocation* suballocation;
+      TF_RETURN_IF_ERROR(XRTTupleAllocation::MakeSubBuffer(
+          output_tuple, shape_index, &suballocation,
+          /*alias_parent_allocation=*/false));
+      int64 key;
+      TF_RETURN_IF_ERROR(suballocation->Intern(rm, &key));
+      output_tensor->vec<int64>()(i) = key;
+    }
+    output_tuple->Unref();
+  } else {
+    Tensor* output_tensor;
+    TF_RETURN_IF_ERROR(
+        context->allocate_output(0, TensorShape({}), &output_tensor));
+    int64 key;
+    TF_RETURN_IF_ERROR(output_tuple->Intern(rm, &key));
+    output_tensor->scalar<int64>()() = key;
+  }
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/xrt/kernels/xrt_state_ops.cc b/tensorflow/compiler/xrt/kernels/xrt_state_ops.cc
index ffea592491d43788b876a51866dc8a6611e8c734..1a5bfac337baf773b84b92af5f88ef7a4c8ba81f 100644
--- a/tensorflow/compiler/xrt/kernels/xrt_state_ops.cc
+++ b/tensorflow/compiler/xrt/kernels/xrt_state_ops.cc
@@ -87,6 +87,19 @@ REGISTER_KERNEL_BUILDER(Name("XRTReadLiteral")
                             .HostMemory("literal"),
                         XRTReadLiteralOp<false, XRTGenericDeviceAccessor>);
 
+REGISTER_KERNEL_BUILDER(Name("XRTWriteLiteral")
+                            .Device(DEVICE_XLA_GPU)
+                            .HostMemory("handle")
+                            .HostMemory("literal")
+                            .HostMemory("output_handle"),
+                        XRTWriteLiteralOp<XRTGenericDeviceAccessor>);
+REGISTER_KERNEL_BUILDER(Name("XRTWriteLiteral")
+                            .Device(DEVICE_XLA_CPU)
+                            .HostMemory("handle")
+                            .HostMemory("literal")
+                            .HostMemory("output_handle"),
+                        XRTWriteLiteralOp<XRTGenericDeviceAccessor>);
+
 REGISTER_KERNEL_BUILDER(Name("XRTReadLiteralAndRelease")
                             .Device(DEVICE_XLA_GPU)
                             .HostMemory("handle")
@@ -107,4 +120,9 @@ REGISTER_KERNEL_BUILDER(Name("XRTReleaseAllocationHandle")
                             .HostMemory("handle"),
                         XRTReleaseAllocationOp<XRTGenericDeviceAccessor>);
 
+REGISTER_KERNEL_BUILDER(Name("XRTReleaseAllAllocations").Device(DEVICE_XLA_GPU),
+                        XRTReleaseAllAllocationsOp<XRTGenericDeviceAccessor>);
+REGISTER_KERNEL_BUILDER(Name("XRTReleaseAllAllocations").Device(DEVICE_XLA_CPU),
+                        XRTReleaseAllAllocationsOp<XRTGenericDeviceAccessor>);
+
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/xrt/kernels/xrt_state_ops.h b/tensorflow/compiler/xrt/kernels/xrt_state_ops.h
index 54b06558adcd8ef1f8f1bee52d210d558801afea..e3b292e7907bfb82f1efc8ed0f27462c682848ce 100644
--- a/tensorflow/compiler/xrt/kernels/xrt_state_ops.h
+++ b/tensorflow/compiler/xrt/kernels/xrt_state_ops.h
@@ -393,6 +393,56 @@ class XRTReadLiteralOp : public OpKernel {
   }
 };
 
+// Op that writes a new literal value into device-resident memory.
+template <class DeviceAccessor>
+class XRTWriteLiteralOp : public OpKernel {
+ public:
+  explicit XRTWriteLiteralOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+  ~XRTWriteLiteralOp() override = default;
+  XRTWriteLiteralOp(const XRTWriteLiteralOp&) = delete;
+  XRTWriteLiteralOp& operator=(const XRTWriteLiteralOp&) = delete;
+
+  void Compute(OpKernelContext* ctx) override {
+    VLOG(1) << "XRTWriteLiteralOp::Compute";
+
+    const Tensor& handle_tensor = ctx->input(0);
+    OP_REQUIRES(
+        ctx, TensorShapeUtils::IsScalar(handle_tensor.shape()),
+        errors::Internal("computation input should be an int64 scalar"));
+    int64 allocation_handle = handle_tensor.scalar<int64>()();
+
+    const Tensor& literal_info = ctx->input(1);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(literal_info.shape()),
+                errors::Internal("literal input should be a string scalar"));
+    xla::LiteralProto literal_proto;
+    OP_REQUIRES(ctx,
+                literal_proto.ParseFromString(literal_info.scalar<string>()()),
+                errors::InvalidArgument(
+                    "Unable to parse allocation input to LiteralProto"));
+    xla::Literal literal;
+    OP_REQUIRES_OK(ctx, XRTStateHelpers::MakeLiteral(literal_proto, &literal));
+
+    ResourceMgr* rm;
+    OP_REQUIRES_OK(ctx, DeviceAccessor::GetResourceManager(ctx, &rm));
+
+    XRTTupleAllocation* allocation;
+    OP_REQUIRES_OK(
+        ctx, XRTTupleAllocation::Lookup(rm, allocation_handle, &allocation));
+    core::ScopedUnref allocation_unref(allocation);
+    // We are guaranteed that the underlying device object won't be deleted out
+    // from under us, while the ScopedRef is live.
+    typename DeviceAccessor::ScopedRef device_ref;
+    OP_REQUIRES_OK(ctx, DeviceAccessor::InitScopedRef(
+                            ctx, allocation->device_ordinal(), &device_ref));
+    OP_REQUIRES_OK(ctx,
+                   allocation->WriteLiteral(device_ref.backend(), literal));
+
+    Tensor output(DT_INT64, TensorShape({}));
+    output.scalar<int64>()() = allocation_handle;
+    ctx->set_output(0, output);
+  }
+};
+
 // Op that discards a handle to device memory.
 template <class DeviceAccessor>
 class XRTReleaseAllocationOp : public OpKernel {
@@ -419,6 +469,26 @@ class XRTReleaseAllocationOp : public OpKernel {
   }
 };
 
+// Op that discards a handle to device memory.
+template <class DeviceAccessor>
+class XRTReleaseAllAllocationsOp : public OpKernel {
+ public:
+  explicit XRTReleaseAllAllocationsOp(OpKernelConstruction* ctx)
+      : OpKernel(ctx) {}
+  ~XRTReleaseAllAllocationsOp() override = default;
+  XRTReleaseAllAllocationsOp(const XRTReleaseAllAllocationsOp&) = delete;
+  XRTReleaseAllAllocationsOp& operator=(const XRTReleaseAllAllocationsOp&) =
+      delete;
+
+  void Compute(OpKernelContext* ctx) override {
+    VLOG(1) << "XRTReleaseAllAllocationsOp::Compute";
+
+    ResourceMgr* rm;
+    OP_REQUIRES_OK(ctx, DeviceAccessor::GetResourceManager(ctx, &rm));
+    OP_REQUIRES_OK(ctx, XRTTupleAllocation::ReleaseAllAllocations(rm));
+  }
+};
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_COMPILER_XRT_KERNELS_XRT_STATE_OPS_H_
diff --git a/tensorflow/compiler/xrt/ops/xrt_state_ops.cc b/tensorflow/compiler/xrt/ops/xrt_state_ops.cc
index 07d025ce343f229097b557d33ad41bf9612b0696..fe6bee0dacf5dc2050613fc9ad34d3235b5a7b63 100644
--- a/tensorflow/compiler/xrt/ops/xrt_state_ops.cc
+++ b/tensorflow/compiler/xrt/ops/xrt_state_ops.cc
@@ -95,6 +95,20 @@ Copies an allocated tuple from device memory and returns it as a literal.
 'literal' is a serialized xla::LiteralProto proto.
 )");
 
+REGISTER_OP("XRTWriteLiteral")
+    .Input("handle: int64")
+    .Input("literal: string")
+    .Output("output_handle: int64")
+    .SetShapeFn(tensorflow::shape_inference::ScalarShape)
+    .Doc(
+        R"(
+Copies the input literal into the device memory pointed to by handle.
+Returns the handle itself.
+
+'handle' is the id returned from the Op that produced the on-device allocation.
+'literal' is a serialized xla::LiteralProto proto to be written to device memory.
+)");
+
 REGISTER_OP("XRTReadLiteralAndRelease")
     .Input("handle: int64")
     .Output("literal: string")
@@ -119,4 +133,11 @@ used.
 'handle' is the id returned from the Op that produced the on-device allocation.
 )");
 
+REGISTER_OP("XRTReleaseAllAllocations")
+    .SetShapeFn(tensorflow::shape_inference::NoOutputs)
+    .Doc(
+        R"(
+Discards all the XRT allocations. All the client held handles will be invalid.
+)");
+
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/xrt/tests/raw_api_test.cc b/tensorflow/compiler/xrt/tests/raw_api_test.cc
index b9262c1843a7ae48af49acbef5ba4ef58ec0f050..730a2271677c91afecaf252f4a3d1a989a1ccfba 100644
--- a/tensorflow/compiler/xrt/tests/raw_api_test.cc
+++ b/tensorflow/compiler/xrt/tests/raw_api_test.cc
@@ -102,7 +102,7 @@ bool CompareLiteralProtos(const xla::LiteralProto& a,
   auto l_b = xla::Literal::CreateFromProto(b).ValueOrDie();
   bool equal = l_a == l_b;
   if (!equal) {
-    LOG(INFO) << "LiteralProtos don't match " << a.DebugString()
+    LOG(INFO) << "LiteralProtos don't match: " << a.DebugString()
               << " != " << b.DebugString();
   }
   return equal;
@@ -175,6 +175,18 @@ xla::XlaComputation AddAndTuple() {
   return builder.Build().ValueOrDie();
 }
 
+xla::XlaComputation AddAndSubTuple() {
+  xla::XlaBuilder builder("AddAndSubTuple");
+  auto p0 = xla::Parameter(&builder, 0, xla::ShapeUtil::MakeShape(xla::F32, {}),
+                           "P0");
+  auto p1 = xla::Parameter(&builder, 1, xla::ShapeUtil::MakeShape(xla::F32, {}),
+                           "P1");
+  auto sum = xla::Add(p0, p1);
+  auto sub = xla::Sub(p0, p1);
+  xla::Tuple(&builder, {sum, sub});
+  return builder.Build().ValueOrDie();
+}
+
 void StoreComputationSnapshot(const xla::XlaComputation& computation,
                               xla::HloSnapshot* dst) {
   auto snapshot = computation.Snapshot().ValueOrDie();
@@ -203,6 +215,87 @@ xla::ProgramShape XlaCompiledProgramShape(
       ->ComputeProgramShape();
 }
 
+TEST(RawApiTest, AllocAndRewrite) {
+  xrt::XLAAllocation alloc;
+  alloc.set_device_ordinal(0);
+  *alloc.mutable_value() =
+      xla::LiteralUtil::CreateR2({{4, 5}, {6, 7}}).ToProto();
+
+  Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag());
+  auto value =
+      ops::Const(root.WithDevice("/device:CPU:0"), alloc.SerializeAsString());
+  auto handle = ops::XRTAllocate(root, value);
+  auto read_back = ops::XRTReadLiteral(root, handle);
+  TF_ASSERT_OK(root.status());
+
+  tensorflow::ClientSession session(root);
+  std::vector<tensorflow::Tensor> outputs;
+  TF_EXPECT_OK(session.Run({read_back, handle}, &outputs));
+  EXPECT_EQ(outputs.size(), 2);
+
+  int64 allocation_handle = outputs[1].scalar<int64>()();
+  xla::LiteralProto response;
+  EXPECT_TRUE(response.ParseFromString(outputs[0].scalar<string>()()));
+  EXPECT_TRUE(CompareLiteralProtos(alloc.value(), response));
+  outputs.clear();
+
+  xla::LiteralProto new_literal =
+      xla::LiteralUtil::CreateR2({{9, 2}, {4, 1}}).ToProto();
+  auto new_value = ops::Const(root.WithDevice("/device:CPU:0"),
+                              new_literal.SerializeAsString());
+  auto write_op =
+      ops::XRTWriteLiteral(root, Input(allocation_handle), new_value);
+  TF_ASSERT_OK(root.status());
+  TF_EXPECT_OK(session.Run({write_op}, &outputs));
+  EXPECT_EQ(outputs.size(), 1);
+  EXPECT_EQ(allocation_handle, outputs[0].scalar<int64>()());
+  outputs.clear();
+
+  auto read_after_write = ops::XRTReadLiteral(root, Input(allocation_handle));
+  TF_EXPECT_OK(session.Run({read_after_write}, &outputs));
+  EXPECT_EQ(outputs.size(), 1);
+
+  xla::LiteralProto new_response;
+  EXPECT_TRUE(new_response.ParseFromString(outputs[0].scalar<string>()()));
+  EXPECT_TRUE(CompareLiteralProtos(new_literal, new_response));
+
+  auto release =
+      ops::XRTReleaseAllocationHandle(root, Input(allocation_handle));
+  TF_EXPECT_OK(session.Run(tensorflow::ClientSession::FeedType(), {}, {release},
+                           &outputs));
+}
+
+TEST(RawApiTest, AllocAndClearAll) {
+  xrt::XLAAllocation alloc;
+  alloc.set_device_ordinal(0);
+  *alloc.mutable_value() =
+      xla::LiteralUtil::CreateR2({{4, 5}, {6, 7}}).ToProto();
+
+  Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag());
+  auto value =
+      ops::Const(root.WithDevice("/device:CPU:0"), alloc.SerializeAsString());
+  auto handle = ops::XRTAllocate(root, value);
+  TF_ASSERT_OK(root.status());
+
+  tensorflow::ClientSession session(root);
+  std::vector<tensorflow::Tensor> outputs;
+  TF_EXPECT_OK(session.Run({handle}, &outputs));
+  EXPECT_EQ(outputs.size(), 1);
+
+  int64 allocation_handle = outputs[0].scalar<int64>()();
+
+  auto clear_all = ops::XRTReleaseAllAllocations(root);
+
+  outputs.clear();
+  TF_EXPECT_OK(session.Run(tensorflow::ClientSession::FeedType(), {},
+                           {clear_all}, &outputs));
+  EXPECT_EQ(outputs.size(), 0);
+
+  auto read_after_clear = ops::XRTReadLiteral(root, Input(allocation_handle));
+  EXPECT_EQ(session.Run({read_after_clear}, &outputs).code(),
+            tensorflow::error::Code::NOT_FOUND);
+}
+
 TEST(RawApiTest, ReadAndWriteState) {
   xrt::XLAAllocation alloc;
   alloc.set_device_ordinal(0);
@@ -681,6 +774,70 @@ TEST(RawApiTest, CompileAndExecuteReturnTuple) {
   EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response));
 }
 
+TEST(RawApiTest, CompileAndExecuteReturnExplodedTuple) {
+  xrt::XLAAllocation p0;
+  p0.set_device_ordinal(0);
+  *p0.mutable_value() = xla::LiteralUtil::CreateR0<float>(12.0f).ToProto();
+
+  xrt::XLAAllocation p1;
+  p1.set_device_ordinal(0);
+  *p1.mutable_value() = xla::LiteralUtil::CreateR0<float>(3.0f).ToProto();
+
+  xrt::XLAComputation c;
+  auto config = c.mutable_config();
+  auto shapes = config->mutable_program_shape();
+  *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::F32, {}).ToProto();
+  *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::F32, {}).ToProto();
+  *shapes->mutable_result() =
+      xla::ShapeUtil::MakeTupleShape({xla::ShapeUtil::MakeShape(xla::F32, {}),
+                                      xla::ShapeUtil::MakeShape(xla::F32, {})})
+          .ToProto();
+  StoreComputationSnapshot(AddAndSubTuple(), c.mutable_hlo_snapshot());
+
+  xrt::XRTExecutionConfig e;
+  e.set_release_input_handles(true);
+  e.set_release_compilation_handle(true);
+  e.set_return_exploded_tuple(true);
+
+  Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag());
+  auto e_config =
+      ops::Const(root.WithDevice("/device:CPU:0"), e.SerializeAsString());
+  auto computation =
+      ops::Const(root.WithDevice("/device:CPU:0"), c.SerializeAsString());
+  auto c_handle = ops::XRTCompile(root, computation);
+  auto p0_value =
+      ops::Const(root.WithDevice("/device:CPU:0"), p0.SerializeAsString());
+  auto p0_handle = ops::XRTAllocate(root, p0_value);
+  auto p1_value =
+      ops::Const(root.WithDevice("/device:CPU:0"), p1.SerializeAsString());
+  auto p1_handle = ops::XRTAllocate(root, p1_value);
+  auto result = ops::XRTExecute(root, c_handle.handle, e_config,
+                                {Output(p0_handle), Output(p1_handle)});
+  TF_ASSERT_OK(root.status());
+
+  ClientSession session(root);
+  std::vector<Tensor> outputs;
+  TF_EXPECT_OK(session.Run({result}, &outputs));
+  EXPECT_EQ(outputs.size(), 1);
+
+  auto handles_vec = outputs.front().vec<int64>();
+  EXPECT_EQ(handles_vec.size(), 2);
+
+  const float kResults[2] = {15.0f, 9.0f};
+  for (int64 i = 0; i < handles_vec.size(); ++i) {
+    auto read_back = ops::XRTReadLiteralAndRelease(root, Input(handles_vec(i)));
+    std::vector<Tensor> voutputs;
+    TF_EXPECT_OK(session.Run({read_back}, &voutputs));
+    EXPECT_EQ(voutputs.size(), 1);
+
+    xla::LiteralProto response;
+    EXPECT_TRUE(response.ParseFromString(voutputs[0].scalar<string>()()));
+
+    auto expected = xla::LiteralUtil::CreateR0<float>(kResults[i]);
+    EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response));
+  }
+}
+
 TEST(RawApiTest, LeakCompilationReference) {
   xrt::XLAComputation c;
   auto config = c.mutable_config();
diff --git a/tensorflow/compiler/xrt/xrt.proto b/tensorflow/compiler/xrt/xrt.proto
index e149f2f43593ea412ef279b2c99dabac285cdac4..378bb9246f27b8106310d565435404d7ac260a87 100644
--- a/tensorflow/compiler/xrt/xrt.proto
+++ b/tensorflow/compiler/xrt/xrt.proto
@@ -101,4 +101,8 @@ message XRTExecutionConfig {
   bool release_input_handles = 5;
   // If true, release the handle to the computation after running.
   bool release_compilation_handle = 6;
+  // If set to true, and the result shape is a tuple, then instead of returning
+  // a single tuple allocation the execution will return a vector of
+  // allocations, one for each of the first-level elements of the result tuple.
+  bool return_exploded_tuple = 7;
 }
diff --git a/tensorflow/compiler/xrt/xrt_state.cc b/tensorflow/compiler/xrt/xrt_state.cc
index 3a99820d7aa9e9546cc95385fd98c05f28988e9e..343460ff107fa81be127950837f786fe4eeadf26 100644
--- a/tensorflow/compiler/xrt/xrt_state.cc
+++ b/tensorflow/compiler/xrt/xrt_state.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/compiler/xrt/xrt_state.h"
 
 #include <stdint.h>
+#include <map>
 #include <memory>
 #include <string>
 #include <utility>
@@ -34,6 +35,7 @@ limitations under the License.
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/stream_executor/stream_executor.h"
 
@@ -41,6 +43,34 @@ namespace tensorflow {
 
 namespace {
 
+class BufferAllocStats {
+ public:
+  struct Stats {
+    int64 count = 0;
+    int64 size = 0;
+  };
+
+  Stats ReportAlloc(int64 device, int64 msize) {
+    mutex_lock lock(lock_);
+    Stats* device_stats = &stats_[device];
+    device_stats->count += 1;
+    device_stats->size += msize;
+    return *device_stats;
+  }
+
+  Stats ReportFree(int64 device, int64 msize) {
+    mutex_lock lock(lock_);
+    Stats* device_stats = &stats_[device];
+    device_stats->count -= 1;
+    device_stats->size -= msize;
+    return *device_stats;
+  }
+
+ private:
+  mutable mutex lock_;
+  std::map<int64, Stats> stats_;
+};
+
 const char* kTupleContainer = "tuples";
 
 int64 get_uid() {
@@ -48,6 +78,11 @@ int64 get_uid() {
   return static_cast<int64>(unsigned_rand);
 }
 
+BufferAllocStats* GetAllocStats() {
+  static BufferAllocStats* stats = new BufferAllocStats();
+  return stats;
+}
+
 Status AllocateScopedShapedBuffer(
     xla::Backend* backend, int device_ordinal, const xla::Shape& shape,
     std::unique_ptr<xla::ScopedShapedBuffer>* buffer) {
@@ -100,9 +135,19 @@ XRTBufferAllocation::XRTBufferAllocation(const se::DeviceMemoryBase& allocation,
                                          xla::DeviceMemoryAllocator* allocator)
     : allocation_(allocation),
       device_ordinal_(device_ordinal),
-      allocator_(allocator) {}
+      allocator_(allocator) {
+  if (VLOG_IS_ON(2)) {
+    auto stats =
+        GetAllocStats()->ReportAlloc(device_ordinal_, allocation_.size());
+    LOG(INFO) << "XRT Allocation Stats: device=" << device_ordinal_
+              << " count=" << stats.count << " size=" << stats.size;
+  }
+}
 
 XRTBufferAllocation::~XRTBufferAllocation() {
+  if (VLOG_IS_ON(2)) {
+    GetAllocStats()->ReportFree(device_ordinal_, allocation_.size());
+  }
   // Deallocate explicitly allows allocation_ to be null.
   Status s = allocator_->Deallocate(device_ordinal_, allocation_);
   // Nothing to do but check fail here if memory datastructures are corrupted.
@@ -183,6 +228,20 @@ Status XRTTupleAllocation::ToLiteral(xla::Backend* backend, int device_ordinal,
   return Status::OK();
 }
 
+Status XRTTupleAllocation::WriteLiteral(xla::Backend* backend,
+                                        const xla::Literal& literal) {
+  if (!xla::ShapeUtil::Equal(literal.shape(), on_host_shape())) {
+    return errors::InvalidArgument(
+        "New literal shape not matching the existing one: literal=",
+        xla::ShapeUtil::HumanStringWithLayout(literal.shape()),
+        " device=", xla::ShapeUtil::HumanStringWithLayout(on_host_shape()));
+  }
+  auto transfer_manager = backend->transfer_manager();
+  TF_ASSIGN_OR_RETURN(auto stream, backend->BorrowStream(device_ordinal()));
+  return transfer_manager->TransferLiteralToDevice(stream.get(), literal,
+                                                   ToShapedBuffer());
+}
+
 void XRTTupleAllocation::DiscardAllocation(
     const xla::ShapeIndex& buffer_index) {
   buffers_.element(buffer_index)->DiscardAllocation();
@@ -213,6 +272,11 @@ const se::DeviceMemoryBase& XRTTupleAllocation::root_allocation() {
   return rm->Delete<XRTTupleAllocation>(kTupleContainer, key_string);
 }
 
+/* static */ Status XRTTupleAllocation::ReleaseAllAllocations(ResourceMgr* rm) {
+  VLOG(1) << "Releasing all XRT held device memory";
+  return rm->Cleanup(kTupleContainer);
+}
+
 // Helper typedef to make ShapeTree ForEach helper lambda signatures more
 // readable. They need a type of const T& where in this case T is the
 // following pointer.
diff --git a/tensorflow/compiler/xrt/xrt_state.h b/tensorflow/compiler/xrt/xrt_state.h
index 73b5584e38f781343fe6793af7ad28232fbfc184..3e3d5024124e13b87eed6f79596d50cd64325914 100644
--- a/tensorflow/compiler/xrt/xrt_state.h
+++ b/tensorflow/compiler/xrt/xrt_state.h
@@ -129,6 +129,10 @@ class XRTTupleAllocation : public ResourceBase {
   // Deletes the reference in the rm to an allocation interned under key.
   static Status DeleteFromResourceManager(ResourceMgr* rm, int64 key);
 
+  // Releases all the device memory allocated by XRT within the resource
+  // manager.
+  static Status ReleaseAllAllocations(ResourceMgr* rm);
+
   // Adds the allocation to a ResourceMgr and returns the key that will be used
   // to retrieve it. Transfers a reference on *this to rm.
   Status Intern(ResourceMgr* rm, int64* key);
@@ -137,6 +141,9 @@ class XRTTupleAllocation : public ResourceBase {
   Status ToLiteral(xla::Backend* backend, int device_ordinal,
                    xla::Literal* literal);
 
+  // Write a new literal value to the allocation.
+  Status WriteLiteral(xla::Backend* backend, const xla::Literal& literal);
+
   // True if none of the buffers in the allocation are aliased by any other live
   // handle.
   bool IsExclusiveOwner();
diff --git a/tensorflow/contrib/android/cmake/build.gradle b/tensorflow/contrib/android/cmake/build.gradle
index 17a57b99fd6c9efc09bda0ce1249b1f51bd5af5c..ddec08894f34f96b080610f1d27a6a436f7ffa91 100644
--- a/tensorflow/contrib/android/cmake/build.gradle
+++ b/tensorflow/contrib/android/cmake/build.gradle
@@ -22,8 +22,8 @@ android {
         }
         externalNativeBuild {
             cmake {
-                arguments '-DANDROID_TOOLCHAIN=gcc',
-                          '-DANDROID_STL=gnustl_static'
+                arguments '-DANDROID_TOOLCHAIN=clang',
+                          '-DANDROID_STL=c++_static'
             }
         }
     }
@@ -70,7 +70,7 @@ if (ndkDir == null || ndkDir == "") {
     ndkDir = System.getenv('ANDROID_NDK_HOME')
 }
 
-if(! Os.isFamily(Os.FAMILY_WINDOWS)) {
+if (!Os.isFamily(Os.FAMILY_WINDOWS)) {
     // This script is for non-Windows OS. For Windows OS, MANUALLY build
     // (or copy the built) libs/headers to the
     //    ${TENSORFLOW_ROOT_DIR}/tensorflow/contrib/makefile/gen
diff --git a/tensorflow/contrib/bigtable/README.md b/tensorflow/contrib/bigtable/README.md
index 2c44abed5e1955cc666273e97e6b2378766f13d2..79052bee35c7895cb4048b10c1f73acb036d1587 100644
--- a/tensorflow/contrib/bigtable/README.md
+++ b/tensorflow/contrib/bigtable/README.md
@@ -51,25 +51,18 @@ BIGTABLE_TABLE_NAME = '<FILL_ME_IN>'
 PREFIX = 'train-'
 
 def main():
+  tf.enable_eager_execution()
+
   client = tf.contrib.cloud.BigtableClient(GCP_PROJECT_ID, BIGTABLE_INSTANCE_ID)
   table = client.table(BIGTABLE_TABLE_NAME)
   dataset = table.keys_by_prefix_dataset(PREFIX)
-  iterator = dataset.make_initializable_iterator()
-  get_next_op = iterator.get_next()
 
-  with tf.Session() as sess:
-    print('Initializing the iterator.')
-    sess.run(iterator.initializer)
-    print('Retrieving rows:')
-    row_index = 0
-    while True:
-      try:
-        row_key = sess.run(get_next_op)
-        print('Row key %d: %s' % (row_index, row_key))
-        row_index += 1
-      except tf.errors.OutOfRangeError:
-        print('Finished reading data!')
-        break
+  print('Retrieving rows:')
+  row_index = 0
+  for row_key in dataset:
+    print('Row key %d: %s' % (row_index, row_key))
+    row_index += 1
+  print('Finished reading data!')
 
 if __name__ == '__main__':
   main()
diff --git a/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.cc b/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.cc
index e95dc577184f7e81d942755b41065f52131ce9f6..3fe71a2ea730cc9b60b2e2088a0d80a08b38d1a9 100644
--- a/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.cc
+++ b/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.cc
@@ -399,6 +399,17 @@ BigtableTestClient::AsyncMutateRows(
   return nullptr;
 }
 
+std::unique_ptr<grpc::ClientAsyncResponseReaderInterface<
+    google::bigtable::v2::CheckAndMutateRowResponse>>
+BigtableTestClient::AsyncCheckAndMutateRow(
+    grpc::ClientContext* context,
+    const google::bigtable::v2::CheckAndMutateRowRequest& request,
+    grpc::CompletionQueue* cq) {
+  LOG(WARNING) << "Call to InMemoryDataClient::" << __func__
+               << "(); this will likely cause a crash!";
+  return nullptr;
+}
+
 std::shared_ptr<grpc::Channel> BigtableTestClient::Channel() {
   LOG(WARNING) << "Call to InMemoryDataClient::Channel(); this will likely "
                   "cause a crash!";
diff --git a/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.h b/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.h
index c4a1f06bc504c3565c7bb09b42e48e7fbddb9cc6..85705904573e9e7710912e3f4ff30dd8fed5bf85 100644
--- a/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.h
+++ b/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.h
@@ -80,6 +80,13 @@ class BigtableTestClient : public ::google::cloud::bigtable::DataClient {
                   const ::google::bigtable::v2::MutateRowsRequest& request,
                   ::grpc::CompletionQueue* cq, void* tag) override;
 
+  std::unique_ptr<grpc::ClientAsyncResponseReaderInterface<
+      google::bigtable::v2::CheckAndMutateRowResponse>>
+  AsyncCheckAndMutateRow(
+      grpc::ClientContext* context,
+      const google::bigtable::v2::CheckAndMutateRowRequest& request,
+      grpc::CompletionQueue* cq) override;
+
   std::shared_ptr<grpc::Channel> Channel() override;
 
  private:
diff --git a/tensorflow/contrib/bigtable/python/kernel_tests/bigtable_ops_test.py b/tensorflow/contrib/bigtable/python/kernel_tests/bigtable_ops_test.py
index 316da9ebe152ef52c7e7f846cf8c3eb1555ee8a6..197f5578eb010bee5a3aad7c05446393193f99e2 100644
--- a/tensorflow/contrib/bigtable/python/kernel_tests/bigtable_ops_test.py
+++ b/tensorflow/contrib/bigtable/python/kernel_tests/bigtable_ops_test.py
@@ -57,7 +57,7 @@ class BigtableOpsTest(test.TestCase):
     sess.run(write_op)
 
   def runReadKeyTest(self, read_ds):
-    itr = read_ds.make_initializable_iterator()
+    itr = dataset_ops.make_initializable_iterator(read_ds)
     n = itr.get_next()
     expected = list(self.COMMON_ROW_KEYS)
     expected.reverse()
@@ -78,7 +78,7 @@ class BigtableOpsTest(test.TestCase):
     self.runReadKeyTest(self._table.keys_by_range_dataset("r1", "r4"))
 
   def runScanTest(self, read_ds):
-    itr = read_ds.make_initializable_iterator()
+    itr = dataset_ops.make_initializable_iterator(read_ds)
     n = itr.get_next()
     expected_keys = list(self.COMMON_ROW_KEYS)
     expected_keys.reverse()
@@ -120,7 +120,7 @@ class BigtableOpsTest(test.TestCase):
   def testLookup(self):
     ds = self._table.keys_by_prefix_dataset("r")
     ds = ds.apply(self._table.lookup_columns(cf1="c1"))
-    itr = ds.make_initializable_iterator()
+    itr = dataset_ops.make_initializable_iterator(ds)
     n = itr.get_next()
     expected_keys = list(self.COMMON_ROW_KEYS)
     expected_values = list(self.COMMON_VALUES)
@@ -141,7 +141,7 @@ class BigtableOpsTest(test.TestCase):
 
   def testSampleKeys(self):
     ds = self._table.sample_keys()
-    itr = ds.make_initializable_iterator()
+    itr = dataset_ops.make_initializable_iterator(ds)
     n = itr.get_next()
     expected_key = self.COMMON_ROW_KEYS[0]
     with self.cached_session() as sess:
@@ -161,7 +161,7 @@ class BigtableOpsTest(test.TestCase):
         sess.run(n)
 
   def runSampleKeyPairsTest(self, ds, expected_key_pairs):
-    itr = ds.make_initializable_iterator()
+    itr = dataset_ops.make_initializable_iterator(ds)
     n = itr.get_next()
     with self.cached_session() as sess:
       self._writeCommonValues(sess)
@@ -218,7 +218,7 @@ class BigtableOpsTest(test.TestCase):
   def testSampleKeyPairsPrefixAndStartKey(self):
     ds = bigtable_api._BigtableSampleKeyPairsDataset(
         self._table, prefix="r", start="r1", end="")
-    itr = ds.make_initializable_iterator()
+    itr = dataset_ops.make_initializable_iterator(ds)
     with self.cached_session() as sess:
       with self.assertRaises(errors.InvalidArgumentError):
         sess.run(itr.initializer)
@@ -226,14 +226,14 @@ class BigtableOpsTest(test.TestCase):
   def testSampleKeyPairsPrefixAndEndKey(self):
     ds = bigtable_api._BigtableSampleKeyPairsDataset(
         self._table, prefix="r", start="", end="r3")
-    itr = ds.make_initializable_iterator()
+    itr = dataset_ops.make_initializable_iterator(ds)
     with self.cached_session() as sess:
       with self.assertRaises(errors.InvalidArgumentError):
         sess.run(itr.initializer)
 
   def testParallelScanPrefix(self):
     ds = self._table.parallel_scan_prefix(prefix="r", cf1="c1")
-    itr = ds.make_initializable_iterator()
+    itr = dataset_ops.make_initializable_iterator(ds)
     n = itr.get_next()
     with self.cached_session() as sess:
       self._writeCommonValues(sess)
@@ -251,7 +251,7 @@ class BigtableOpsTest(test.TestCase):
 
   def testParallelScanRange(self):
     ds = self._table.parallel_scan_range(start="r1", end="r4", cf1="c1")
-    itr = ds.make_initializable_iterator()
+    itr = dataset_ops.make_initializable_iterator(ds)
     n = itr.get_next()
     with self.cached_session() as sess:
       self._writeCommonValues(sess)
diff --git a/tensorflow/contrib/bigtable/python/ops/bigtable_api.py b/tensorflow/contrib/bigtable/python/ops/bigtable_api.py
index 7c87b0daeb09950cc44c51f49c16534d413f0376..b6cdc7aab0320fe5f457288ada03a46e18a694cc 100644
--- a/tensorflow/contrib/bigtable/python/ops/bigtable_api.py
+++ b/tensorflow/contrib/bigtable/python/ops/bigtable_api.py
@@ -35,8 +35,8 @@ from tensorflow.contrib.util import loader
 from tensorflow.python.data.experimental.ops import interleave_ops
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import structure
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.platform import resource_loader
 
@@ -111,8 +111,7 @@ class BigtableClient(object):
 
 
 class BigtableTable(object):
-  """BigtableTable is the entrypoint for reading and writing data in Cloud
-  Bigtable.
+  """Entry point for reading and writing data in Cloud Bigtable.
 
   This BigtableTable class is the Python representation of the Cloud Bigtable
   table within TensorFlow. Methods on this class allow data to be read from and
@@ -222,7 +221,7 @@ class BigtableTable(object):
       A `tf.data.Dataset`. containing `tf.string` Tensors corresponding to all
       of the row keys matching that prefix.
     """
-    return _BigtablePrefixKeyDataset(self, prefix)
+    return dataset_ops.DatasetV1Adapter(_BigtablePrefixKeyDataset(self, prefix))
 
   def sample_keys(self):
     """Retrieves a sampling of row keys from the Bigtable table.
@@ -234,7 +233,7 @@ class BigtableTable(object):
     Returns:
       A `tf.data.Dataset` returning string row keys.
     """
-    return _BigtableSampleKeysDataset(self)
+    return dataset_ops.DatasetV1Adapter(_BigtableSampleKeysDataset(self))
 
   def scan_prefix(self, prefix, probability=None, columns=None, **kwargs):
     """Retrieves row (including values) from the Bigtable service.
@@ -279,7 +278,8 @@ class BigtableTable(object):
     """
     probability = _normalize_probability(probability)
     normalized = _normalize_columns(columns, kwargs)
-    return _BigtableScanDataset(self, prefix, "", "", normalized, probability)
+    return dataset_ops.DatasetV1Adapter(
+        _BigtableScanDataset(self, prefix, "", "", normalized, probability))
 
   def scan_range(self, start, end, probability=None, columns=None, **kwargs):
     """Retrieves rows (including values) from the Bigtable service.
@@ -324,7 +324,8 @@ class BigtableTable(object):
     """
     probability = _normalize_probability(probability)
     normalized = _normalize_columns(columns, kwargs)
-    return _BigtableScanDataset(self, "", start, end, normalized, probability)
+    return dataset_ops.DatasetV1Adapter(
+        _BigtableScanDataset(self, "", start, end, normalized, probability))
 
   def parallel_scan_prefix(self,
                            prefix,
@@ -380,7 +381,8 @@ class BigtableTable(object):
     """
     probability = _normalize_probability(probability)
     normalized = _normalize_columns(columns, kwargs)
-    ds = _BigtableSampleKeyPairsDataset(self, prefix, "", "")
+    ds = dataset_ops.DatasetV1Adapter(
+        _BigtableSampleKeyPairsDataset(self, prefix, "", ""))
     return self._make_parallel_scan_dataset(ds, num_parallel_scans, probability,
                                             normalized)
 
@@ -442,7 +444,8 @@ class BigtableTable(object):
     """
     probability = _normalize_probability(probability)
     normalized = _normalize_columns(columns, kwargs)
-    ds = _BigtableSampleKeyPairsDataset(self, "", start, end)
+    ds = dataset_ops.DatasetV1Adapter(
+        _BigtableSampleKeyPairsDataset(self, "", start, end))
     return self._make_parallel_scan_dataset(ds, num_parallel_scans, probability,
                                             normalized)
 
@@ -589,16 +592,8 @@ class _BigtableKeyDataset(dataset_ops.DatasetSource):
     self._table = table
 
   @property
-  def output_classes(self):
-    return ops.Tensor
-
-  @property
-  def output_shapes(self):
-    return tensor_shape.TensorShape([])
-
-  @property
-  def output_types(self):
-    return dtypes.string
+  def _element_structure(self):
+    return structure.TensorStructure(dtypes.string, [])
 
 
 class _BigtablePrefixKeyDataset(_BigtableKeyDataset):
@@ -658,16 +653,9 @@ class _BigtableLookupDataset(dataset_ops.DatasetSource):
     self._columns = [i[1] for i in normalized]
 
   @property
-  def output_classes(self):
-    return tuple([ops.Tensor] * self._num_outputs)
-
-  @property
-  def output_shapes(self):
-    return tuple([tensor_shape.TensorShape([])] * self._num_outputs)
-
-  @property
-  def output_types(self):
-    return tuple([dtypes.string] * self._num_outputs)
+  def _element_structure(self):
+    return structure.NestedStructure(tuple(
+        [structure.TensorStructure(dtypes.string, [])] * self._num_outputs))
 
   def _as_variant_tensor(self):
     # pylint: disable=protected-access
@@ -693,16 +681,9 @@ class _BigtableScanDataset(dataset_ops.DatasetSource):
     self._num_outputs = len(normalized) + 1  # 1 for row key
 
   @property
-  def output_classes(self):
-    return tuple([ops.Tensor] * self._num_outputs)
-
-  @property
-  def output_shapes(self):
-    return tuple([tensor_shape.TensorShape([])] * self._num_outputs)
-
-  @property
-  def output_types(self):
-    return tuple([dtypes.string] * self._num_outputs)
+  def _element_structure(self):
+    return structure.NestedStructure(tuple(
+        [structure.TensorStructure(dtypes.string, [])] * self._num_outputs))
 
   def _as_variant_tensor(self):
     return gen_bigtable_ops.bigtable_scan_dataset(
@@ -726,16 +707,10 @@ class _BigtableSampleKeyPairsDataset(dataset_ops.DatasetSource):
     self._end = end
 
   @property
-  def output_classes(self):
-    return (ops.Tensor, ops.Tensor)
-
-  @property
-  def output_shapes(self):
-    return (tensor_shape.TensorShape([]), tensor_shape.TensorShape([]))
-
-  @property
-  def output_types(self):
-    return (dtypes.string, dtypes.string)
+  def _element_structure(self):
+    return structure.NestedStructure(
+        (structure.TensorStructure(dtypes.string, []),
+         structure.TensorStructure(dtypes.string, [])))
 
   def _as_variant_tensor(self):
     # pylint: disable=protected-access
diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
index 9fdc2fc0c2c7b85502f7a3f9ae7c85cf05d5916c..a5951fb7377d48748f5eb578c034176517df7749 100644
--- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
+++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
@@ -614,13 +614,19 @@ class GradientBoostedDecisionTreeModel(object):
           predictions_dict[NUM_TREES_ATTEMPTED] % self._logits_dimension)
     return constant_op.constant(-1, dtype=dtypes.int32)
 
-  def update_stats(self, loss, predictions_dict):
+  def update_stats(self, loss, predictions_dict, gradients=None, hessians=None):
     """Update the accumulators with stats from this batch.
 
     Args:
       loss: A scalar tensor representing average loss of examples.
       predictions_dict: Dictionary of Rank 2 `Tensor` representing information
           about predictions per example.
+      gradients: A tensor with the gradients with the respect to logits from
+        predictions_dict. If not provided, tensorflow will do
+        autodifferentiation.
+      hessians: A tensor with the hessians with the respect to logits from
+        predictions_dict. If not provided, tensorflow will do
+        autodifferentiation.
 
     Returns:
       Three values:
@@ -642,13 +648,14 @@ class GradientBoostedDecisionTreeModel(object):
     predictions = predictions_dict[PREDICTIONS]
     partition_ids = predictions_dict[PARTITION_IDS]
     ensemble_stamp = predictions_dict[ENSEMBLE_STAMP]
-    gradients = gradients_impl.gradients(
-        loss,
-        predictions,
-        name="Gradients",
-        colocate_gradients_with_ops=False,
-        gate_gradients=0,
-        aggregation_method=None)[0]
+    if gradients is None:
+      gradients = gradients_impl.gradients(
+          loss,
+          predictions,
+          name="Gradients",
+          colocate_gradients_with_ops=False,
+          gate_gradients=0,
+          aggregation_method=None)[0]
     strategy = self._learner_config.multi_class_strategy
 
     class_id = self._get_class_id(predictions_dict)
@@ -657,17 +664,20 @@ class GradientBoostedDecisionTreeModel(object):
       # We build one vs rest trees.
       if self._logits_dimension == 1:
         # We have only 1 score, gradients is of shape [batch, 1].
-        hessians = gradients_impl.gradients(
-            gradients,
-            predictions,
-            name="Hessian",
-            colocate_gradients_with_ops=False,
-            gate_gradients=0,
-            aggregation_method=None)[0]
+        if hessians is None:
+          hessians = gradients_impl.gradients(
+              gradients,
+              predictions,
+              name="Hessian",
+              colocate_gradients_with_ops=False,
+              gate_gradients=0,
+              aggregation_method=None)[0]
 
         squeezed_gradients = array_ops.squeeze(gradients, axis=[1])
         squeezed_hessians = array_ops.squeeze(hessians, axis=[1])
       else:
+        if hessians is not None:
+          raise ValueError("Providing hessians is not yet supported here.")
         hessian_list = self._diagonal_hessian(gradients, predictions)
         # Assemble hessian list into a tensor.
         hessians = array_ops.stack(hessian_list, axis=1)
@@ -678,6 +688,8 @@ class GradientBoostedDecisionTreeModel(object):
         squeezed_hessians = array_ops.squeeze(
             _get_column_by_index(hessians, class_id))
     else:
+      if hessians is not None:
+        raise ValueError("Providing hessians is not yet supported here.")
       # Other multiclass strategies.
       if strategy == learner_pb2.LearnerConfig.FULL_HESSIAN:
         hessian_list = self._full_hessian(gradients, predictions)
@@ -835,9 +847,9 @@ class GradientBoostedDecisionTreeModel(object):
     stats_update_ops.append(
         control_flow_ops.cond(
             continue_centering,
-            self._make_update_bias_stats_fn(
-                ensemble_stamp, predictions, gradients,
-                bias_stats_accumulator), control_flow_ops.no_op))
+            self._make_update_bias_stats_fn(ensemble_stamp, predictions,
+                                            gradients, bias_stats_accumulator,
+                                            hessians), control_flow_ops.no_op))
 
     # Update handler stats.
     handler_reads = collections.OrderedDict()
@@ -1162,7 +1174,8 @@ class GradientBoostedDecisionTreeModel(object):
   def get_max_tree_depth(self):
     return self._max_tree_depth
 
-  def train(self, loss, predictions_dict, labels):
+  def train(self, loss, predictions_dict, labels, gradients=None,
+            hessians=None):
     """Updates the accumalator stats and grows the ensemble.
 
     Args:
@@ -1171,6 +1184,12 @@ class GradientBoostedDecisionTreeModel(object):
           about predictions per example.
       labels: Rank 2 `Tensor` representing labels per example. Has no effect
           on the training and is only kept for backward compatibility.
+      gradients: A tensor with the gradients with the respect to logits from
+        predictions_dict. If not provided, tensorflow will do
+        autodifferentiation.
+      hessians: A tensor with the hessians with the respect to logits from
+        predictions_dict. If not provided, tensorflow will do
+        autodifferentiation.
 
     Returns:
       An op that adds a new tree to the ensemble.
@@ -1179,7 +1198,8 @@ class GradientBoostedDecisionTreeModel(object):
       ValueError: if inputs are not valid.
     """
     del labels  # unused; kept for backward compatibility.
-    update_op, _, training_state = self.update_stats(loss, predictions_dict)
+    update_op, _, training_state = self.update_stats(loss, predictions_dict,
+                                                     gradients, hessians)
     with ops.control_dependencies(update_op):
       return self.increment_step_counter_and_maybe_update_ensemble(
           predictions_dict, training_state)
@@ -1271,21 +1291,28 @@ class GradientBoostedDecisionTreeModel(object):
         ps_ops=ps_ops,
         ps_strategy=ps_strategy)
 
-  def _make_update_bias_stats_fn(self, ensemble_stamp, predictions, gradients,
-                                 bias_stats_accumulator):
+  def _make_update_bias_stats_fn(self,
+                                 ensemble_stamp,
+                                 predictions,
+                                 gradients,
+                                 bias_stats_accumulator,
+                                 hessians=None):
     """A method to create the function which updates the bias stats."""
 
     def _update_bias_stats():
       """A method to update the bias stats."""
       # Get reduced gradients and hessians.
       grads_sum = math_ops.reduce_sum(gradients, 0)
-      hess = gradients_impl.gradients(
-          grads_sum,
-          predictions,
-          name="Hessians",
-          colocate_gradients_with_ops=False,
-          gate_gradients=0,
-          aggregation_method=None)[0]
+      if hessians is not None:
+        hess = hessians
+      else:
+        hess = gradients_impl.gradients(
+            grads_sum,
+            predictions,
+            name="Hessians",
+            colocate_gradients_with_ops=False,
+            gate_gradients=0,
+            aggregation_method=None)[0]
       hess_sum = math_ops.reduce_sum(hess, 0)
 
       # Accumulate gradients and hessians.
diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt
index a63366e1361effe20787c197eddd66b5c0c96410..2ad9ae42a16f690d38b8e2652e853012ec1dd267 100644
--- a/tensorflow/contrib/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/CMakeLists.txt
@@ -3,16 +3,16 @@ cmake_minimum_required(VERSION 3.5)
 
 if(WIN32)
 	if(${CMAKE_VERSION} VERSION_LESS "3.8")
-		message(WARNING "Your current cmake version is ${CMAKE_VERSION} which does not support setting the toolset architecture to x64. This may cause \"compiler out of heap space\" errors when building. Consider upgrading your cmake to > 3.8 and using the flag -Thost=x64 when running cmake.")
+		message(WARNING "Your current cmake version is ${CMAKE_VERSION} which does not support setting the toolset architecture to x64. This may cause \"compiler out of heap space\" errors when building. Consider upgrading your cmake to > 3.8 and using the flag -Thost=x64 when running cmake. Ignore this if you are on CMake GUI.")
 	else()
 		if(NOT CMAKE_VS_PLATFORM_TOOLSET_HOST_ARCHITECTURE OR NOT "${CMAKE_VS_PLATFORM_TOOLSET_HOST_ARCHITECTURE}" STREQUAL "x64")
-			message(WARNING "Your current cmake generator is set to use 32 bit toolset architecture. This may cause \"compiler out of heap space\" errors when building. Consider using the flag -Thost=x64 when running cmake.")
+			message(WARNING "Your current cmake generator is set to use 32 bit toolset architecture. This may cause \"compiler out of heap space\" errors when building. Consider using the flag -Thost=x64 when running cmake. Ignore this if you are on CMake GUI.")
 		endif()
 	endif()
 endif()
 
 # Project
-project(tensorflow C CXX)
+project(tensorflow VERSION 1.12.0 LANGUAGES C CXX)
 
 # Set C++14 as standard for the whole project
 set(CMAKE_CXX_STANDARD 14)
@@ -52,11 +52,17 @@ option(tensorflow_OPTIMIZE_FOR_NATIVE_ARCH "Enable compiler optimizations for th
 option(tensorflow_ENABLE_SNAPPY_SUPPORT "Enable SNAPPY compression support" ON)
 option(tensorflow_DISABLE_EIGEN_FORCEINLINE "Disable forceinline, to speed up build on windows." OFF)
 
+if (WIN32)
+SET(tensorflow_WIN_CPU_SIMD_OPTIONS "/arch:AVX" CACHE STRING "Enables CPU SIMD instructions")
+SET_PROPERTY(CACHE tensorflow_WIN_CPU_SIMD_OPTIONS PROPERTY STRINGS /arch:AVX) 
+endif()
+
 # SIMD, MKL and MKLDNN options
 option(tensorflow_WIN_CPU_SIMD_OPTIONS "Enables CPU SIMD instructions" OFF)
 option(tensorflow_ENABLE_MKL_SUPPORT "Enable Intel MKL support" OFF)
 option(tensorflow_ENABLE_MKLDNN_SUPPORT "Enable Intel MKLDNN support, requires MKL enabled" OFF)
 
+
 # GPU, CUDA and cuDNN options
 option(tensorflow_ENABLE_GPU "Enable GPU support" OFF)
 
@@ -79,6 +85,11 @@ if (NOT WIN32)
     # option's default value is OFF. Fill it with real default values
     set(tensorflow_CUDNN_INCLUDE /usr/include)
   endif (NOT tensorflow_CUDNN_INCLUDE)
+  option(tensorflow_NCCL_INCLUDE "nccl.h header install path" /usr/include/)
+  if (NOT tensorflow_NCCL_INCLUDE)
+    # option's default value is OFF. Fill it with real default values
+    set(tensorflow_NCCL_INCLUDE /usr/include)
+  endif (NOT tensorflow_NCCL_INCLUDE)
   option(tensorflow_PATH_CUDNN_LIB "Override PATH_CUDA_LIB for cudnn" ${tensorflow_PATH_CUDA_LIB})
   if (NOT tensorflow_PATH_CUDNN_LIB)
     # option's default value is OFF. Fill it with real default values
@@ -193,6 +204,7 @@ if(WIN32)
   set(CMAKE_SUPPRESS_REGENERATION ON)
 endif()
 
+
 if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions -std=c++11")
 endif()
@@ -281,6 +293,14 @@ else (systemlib_ZLIB)
     ${zlib_STATIC_LIBRARIES})
 endif (systemlib_ZLIB)
 
+if (systemlib_ABSEIL_CPP)
+  set(tensorflow_EXTERNAL_LIBRARIES ${tensorflow_EXTERNAL_LIBRARIES}
+      ${abseil_cpp_LIBRARIES})
+else (systemlib_ABSEIL_CPP)
+  set(tensorflow_EXTERNAL_LIBRARIES ${tensorflow_EXTERNAL_LIBRARIES}
+    ${abseil_cpp_STATIC_LIBRARIES})
+endif (systemlib_ABSEIL_CPP)
+
 set(tensorflow_EXTERNAL_DEPENDENCIES
     zlib_copy_headers_to_destination
     gif_copy_headers_to_destination
@@ -378,8 +398,8 @@ if (tensorflow_ENABLE_GPU)
     list(APPEND CMAKE_LIBRARY_PATH "${tensorflow_CUDA_LIBRARY_PATH}/stubs")
   endif (NOT WIN32)
 
-  # minimum 9.1 in cuda version
-  find_package(CUDA 9.1 REQUIRED)
+  # minimum 9.0 in cuda version
+  find_package(CUDA 9.0 REQUIRED)
   if(NOT CUDA_FOUND)
     message(FATAL_ERROR "CUDA not found.")
   endif()
@@ -394,6 +414,7 @@ if (tensorflow_ENABLE_GPU)
   set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};--include-path ${PROJECT_BINARY_DIR}/$\{build_configuration\};--expt-relaxed-constexpr)
   set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-ftz=true)  # Flush denormals to zero
   set(CUDA_INCLUDE ${CUDA_TOOLKIT_TARGET_DIR} ${CUDA_TOOLKIT_TARGET_DIR}/extras/CUPTI/include)
+
   include_directories(${CUDA_INCLUDE})
   if (WIN32)
     add_definitions(-DGOOGLE_CUDA=1 -DTF_EXTRA_CUDA_CAPABILITIES=3.7,5.2,6.0,6.1,7.0)
@@ -546,14 +567,20 @@ if (tensorflow_ENABLE_GPU)
       cudnn_version_number=${tensorflow_CUDNN_VERSION})
   endif(WIN32)
 else(tensorflow_ENABLE_GPU)
-  set(tensorflow_BUILD_INFO_FLAGS --build_config cpu --key_value
-    msvcp_dll_name=msvcp140.dll)
+  if(WIN32)
+    set(tensorflow_BUILD_INFO_FLAGS --build_config cpu --key_value
+      msvcp_dll_name=msvcp140.dll)
+  else()
+    set(tensorflow_BUILD_INFO_FLAGS --build_config cpu)
+  endif()
 endif(tensorflow_ENABLE_GPU)
 
-# Find python executable
-include(FindPythonInterp)
-if(NOT ${PYTHONINTERP_FOUND})
-    message(FATAL_ERROR "CMake was unable to find a python interpreter.")
+if(tensorflow_BUILD_PYTHON_BINDINGS)
+  # Find python executable
+  include(FindPythonInterp)
+  if(NOT ${PYTHONINTERP_FOUND})
+      message(FATAL_ERROR "CMake was unable to find a python interpreter.")
+  endif()
 endif()
 
 # Let's get to work!
@@ -574,6 +601,7 @@ include(tf_cc_ops.cmake)
 include(tf_c.cmake)
 include(tf_grappler.cmake)
 include(tf_core_profiler.cmake)
+include(tf_core_eager_runtime.cmake)
 if(tensorflow_BUILD_CC_EXAMPLE)
   include(tf_tutorials.cmake)
   include(tf_label_image_example.cmake)
@@ -587,4 +615,4 @@ if(tensorflow_BUILD_SHARED_LIB)
 endif()
 if(tensorflow_BUILD_CC_TESTS OR tensorflow_BUILD_PYTHON_TESTS)
   include(tf_tests.cmake)
-endif()
+endif()
\ No newline at end of file
diff --git a/tensorflow/contrib/cmake/README.md b/tensorflow/contrib/cmake/README.md
index 84c679162c3ed8ffc9babcd3af583b26fb62c2d6..df8b48dfc46124d3b9454d92ffb70dbcf1bc4217 100644
--- a/tensorflow/contrib/cmake/README.md
+++ b/tensorflow/contrib/cmake/README.md
@@ -5,10 +5,10 @@ CMAKE build is deprecated for TensorFlow. Please use `bazel` to build TF for all
 platforms. For details, see the
 [TensorFlow install guide](https://www.tensorflow.org/install/).
 
-This directory contains CMake files for building TensorFlow on Microsoft
-Windows. [CMake](https://cmake.org) is a cross-platform tool that can
-generate build scripts for multiple build systems, including Microsoft
-Visual Studio.
+This directory contains CMake files for building TensorFlow on Microsoft Windows
+and Linux. [CMake](https://cmake.org) is a cross-platform tool that can generate
+build scripts for multiple build systems, including Microsoft Visual Studio and
+GCC. "The method has not been tested on Mac OS X.
 
 **N.B.** We provide Linux build instructions primarily for the purpose of
 testing the build. We recommend using the standard Bazel-based build on
@@ -17,12 +17,17 @@ Linux.
 Current Status
 --------------
 
-CMake can be used to build TensorFlow on Windows. See the [getting started documentation](https://www.tensorflow.org/install/source_windows)
-for instructions on how to install a pre-built TensorFlow package on Windows.
+CMake can be used to build TensorFlow on all platforms. See the
+[getting started documentation](https://www.tensorflow.org/install/install_windows)
+for instructions on how to install a pre-built TensorFlow package on Windows and
+Linux. The procedure in MacOS is similar to the Linux build.
 
 ### Current known limitations
-* It is not possible to load a custom Op library.
-* GCS file system is not supported.
+
+*   It is not possible to load a custom Op library.
+*   GCS file system is not supported.
+*   Debug build is not available since Python for Windows is no longer
+    distributed with a debug library.
 
 ## Building with CMake
 
@@ -32,70 +37,88 @@ bindings.
 
 ### Prerequisites
 
-* CMake version 3.5 or later.
+*   CMake version 3.5 or later.
+
+*   [Git](https://git-scm.com)
+
+*   [SWIG](http://www.swig.org/download.html)
+
+*   [Perl](https://www.perl.org/get.html) (optional, for SSL support build)
+
+*   [Go](https://golang.org/) (optional, for SSL support build)
+
+*   [NASM](http://www.nasm.us/)/[YASM](http://yasm.tortall.net/) (optional, for
+    SSL support build)
+
+*   Additional pre-requisites for Microsoft Windows:
+
+    -   Visual Studio 2015 (latest version of MSVC 2017 is not supported by CUDA
+        yet, try it on your own risk)
 
-* [Git](https://git-scm.com)
+    -   Python 3.5
 
-* [SWIG](http://www.swig.org/download.html)
+*   Additional prerequisites for Linux:
 
-* Additional prerequisites for Microsoft Windows:
-  - Visual Studio 2015
-  - Python 3.5
+    -   Python 2.7 or later
+    -   [Docker](https://www.docker.com/) (for automated testing)
 
-* Additional prerequisites for Linux:
-  - Python 2.7 or later
-  - [Docker](https://www.docker.com/) (for automated testing)
+*   Python dependencies:
 
-* Python dependencies:
-  - wheel
-  - NumPy 1.11.0 or later
+    -   wheel
+    -   NumPy 1.11.0 or later
 
 ### Known-good configurations
 
-* Microsoft Windows 10
-  - Microsoft Visual Studio Enterprise 2015 with Visual C++ 2015
-  - [Anaconda 4.1.1 (Python 3.5 64-bit)](https://www.anaconda.com/download/)
-  - [Git for Windows version 2.9.2.windows.1](https://git-scm.com/download/win)
-  - [swigwin-3.0.10](http://www.swig.org/download.html)
-  - [NVidia CUDA Toolkit 8.0](https://developer.nvidia.com/cuda-downloads)
-  - [NVidia CUDNN 5.1](https://developer.nvidia.com/cudnn)
-  - [CMake 3.6](https://cmake.org/files/v3.6/cmake-3.6.3-win64-x64.msi)
+*   Microsoft Windows 10
 
-* Ubuntu 14.04
-  - Makefile generator
-  - Docker 1.9.1 (for automated testing)
+    -   Microsoft Visual Studio Enterprise/ Community 2015 with Visual C++ 2015
+    -   [Anaconda 4.1.1 (Python 3.5 64-bit)](https://www.anaconda.com/download/)
+    -   [Git for Windows version 2.9.2.windows.1](https://git-scm.com/download/win)
+    -   [swigwin-3.0.10](http://www.swig.org/download.html)
+    -   [NVidia CUDA Toolkit 9.0](https://developer.nvidia.com/cuda-downloads)
+    -   [NVidia CUDNN 7](https://developer.nvidia.com/cudnn)
+    -   [CMake 3.6](https://cmake.org/files/v3.6/cmake-3.6.3-win64-x64.msi)
+
+*   Ubuntu 14.04
+
+    -   Makefile generator
+    -   Docker 1.9.1 (for automated testing)
 
 ### Current known limitations
-  - The Python package supports **Python 3.5 only**, because that is the only
-    version for which standard Python binaries exist and those binaries are
-    compatible with the TensorFlow runtime. (On Windows, the standard Python
+
+-   The Python package supports **Python 3.5/3.6 only**, because these are the
+    only versions for which standard Python binaries exist and those binaries
+    are compatible with the TensorFlow runtime. (On Windows, the standard Python
     binaries for versions earlier than 3.5 were compiled with older compilers
     that do not have all of the features (e.g. C++11 support) needed to compile
-    TensorFlow. We welcome patches for making TensorFlow work with Python 2.7
-    on Windows, but have not yet committed to supporting that configuration.)
-
-  - The following Python APIs are not currently implemented:
-    * Loading custom op libraries via `tf.load_op_library()`. In order to use your
-      custom op, please put the source code under the tensorflow/core/user_ops
-      directory, and a shape function is required (not optional) for each op.
-    * Path manipulation functions (such as `tf.gfile.ListDirectory()`) are not
-      functional.
-
-  - The `tf.contrib` libraries are not currently included in the PIP package.
-
-  - The following operations are not currently implemented:
-    * `DepthwiseConv2dNative`
-    * `Digamma`
-    * `Erf`
-    * `Erfc`
-    * `Igamma`
-    * `Igammac`
-    * `ImmutableConst`
-    * `Lgamma`
-    * `Polygamma`
-    * `Zeta`
-
-  - Google Cloud Storage support is not currently implemented. The GCS library
+    TensorFlow. We welcome patches for making TensorFlow work with Python 2.7 on
+    Windows, but have not yet committed to supporting that configuration.)
+
+-   The following Python APIs are not currently implemented:
+
+    *   Loading custom op libraries via `tf.load_op_library()`. In order to use
+        your custom op, please put the source code under the
+        tensorflow/core/user_ops directory, and a shape function is required
+        (not optional) for each op.
+    *   Path manipulation functions (such as `tf.gfile.ListDirectory()`) are not
+        functional.
+
+-   The `tf.contrib` libraries are not currently included in the PIP package.
+
+-   The following operations are not currently implemented:
+
+    *   `DepthwiseConv2dNative`
+    *   `Digamma`
+    *   `Erf`
+    *   `Erfc`
+    *   `Igamma`
+    *   `Igammac`
+    *   `ImmutableConst`
+    *   `Lgamma`
+    *   `Polygamma`
+    *   `Zeta`
+
+-   Google Cloud Storage support is not currently implemented. The GCS library
     currently depends on `libcurl` and `boringssl`, and the Windows version
     could use standard Windows APIs for making HTTP requests and cryptography
     (for OAuth). Contributions are welcome for this feature.
@@ -104,9 +127,211 @@ We are actively working on improving CMake and Windows support, and addressing
 these limitations. We would appreciate pull requests that implement missing
 ops or APIs.
 
+# CMake GUI build (all platforms)
+
+Install from CMake GUI would be a convenient way to generate C++ build projects.
+The software supports Windows, MacOS and Linux, while the posix platform
+provides an extra ccmake binary to run command line GUI. Both working principal
+of cmake, ccmake and cmake-gui are the same, the only difference is by providing
+suitable interface for project configuration and dependency setting.
+
+1.  Pre-buid checklist: The following binary/libraries should be setted in
+    system path, otherwise you need to set manualy via cmake.
+    *   Compiler (GCC for Linux, MSVC for Windows)
+    *   Make sure compiler directory has been set to system path
+    *   CUDA 9.0 (GPU build)
+    *   CUDNN (GPU build)
+    *   NCCL (GPU build on Linux)
+    *   SWIG (python binding)
+    *   Perl (required if you need ssl support, optional)
+    *   Go (required if you need ssl support, optional)
+    *   NASM/YASM (required by grpc for ssl support, optional)
+2.  Start CMake GUI
+3.  Click on `Browse Source` and direct to the the folder
+    `<tensorflow-source>/tensorflow/contrib/cmake`
+4.  Click on `Browse Build` and spectify a location that you want tensorflow to
+    be build
+5.  Click on `Configure`, a new window will be prompted out, specify the
+    generator mode for the project generation. For Windows, choose `Visual
+    Studio <version> <year> Win64`, for Linux, choose `Unix Makefiles`, then
+    press `Finish`. Wait for a moment, the default project dependecy would
+    automatically generate.
+6.  There are a few options that you can customize your own build. **The setting
+    here is crucial for a sucessful build, please check all items carefully.**
+
+    *   `tensorflow_BUILD_ALL_KERNELS` should alway be `on`
+    *   `tensorflow_BUILD_CC_EXAMPLE` is default to be `on`. This can help you
+        to test build (optional)
+    *   `tensorflow_BUILD_CONTRIB_KERNELS` is default to be `on`, but it won't
+        affect tensorflow function, turn it to `off` if you want a slim build.
+        (optional)
+    *   `tensorflow_BUILD_PYTHON_BINDING` is default to be `on`. Set to `off` if
+        you don't need python interaface. If SWIG is not in system path, you
+        need set it manually. (optional)
+    *   `tensorflow_BUILD_SHARED_LIB` is default to be `off`. Set to `on` if you
+        want the c++ interface. (optional)
+    *   `tensorflow_ENABLE_GPU` is default to be `off`. Set to `on` if you want
+        GPU support. It will search CUDA and CUDNN dependecies if you have set
+        them to system path, otherwise CMake would prompt error and request you
+        to set it manually. (optional)
+    *   `tensorflow_ENABLE_GRPC_SUPPORT` is default to be `on`. For Linux build,
+        this option must always be `on`. This need to be `on` for a gpu build.
+        Reminded that Perl, Go and NASM/YASM are required for this option if you
+        want to build grpc with offical SSL support.
+    *   `tensorflow_ENABLE_POSITION_INDEPENDENT_CODE` should always be `on`
+    *   `tensorflow_ENABLE_SNAPPY_SUPPORT` should always be `on`
+    *   `tensorflow_OPTIMIZE_FOR_NATIVE_ARCH` should always be `on`
+    *   `CMAKE_INSTALL_PREFIX` is the location where the final package will be
+        installed. You may change it to your own preferred path (optional)
+
+7.  After changing the configuration in step 5, press `Configure` again
+
+8.  If not error is found, press `Generate`
+
+#### Windows
+
+1.  Open `tensorflow.sln` in the build folder (Windows). Change build type from
+    `Debug` to `Release`. Choose `Build`->`Build Solution`. This may take more
+    than hours of compilation. If everything is alright, the output window would
+    show no error.
+
+    ##### Python
+
+    In solution explorer, right click on `tf_python_build_pip_package` ->
+    `build`. It will generate the wheel file in
+    `<tensorflow-build>/tf_python/dist`. Install with following command:
+
+    `pip install --upgrade tensorflow-<config>.whl`
+
+    ***The wheel name varies depends on you config. Change to your own wheel
+    filename.***
+
+    Reminded that some pip installation requires administrator right command
+    prompt.
+
+    ##### C++
+
+    You can directly use the build folder tree for C++ interface with cmake. If
+    you want to do installation for api releasing, right click on `Install` ->
+    `build`. The headers and library will be installed in the directory specify
+    by `CMAKE_INSTALL_PREFIX` during configuration.
+
+1.  For smaller RAM computer, it is noticed that out of heap space error
+    appears. Change to command prompt build is an alternative to do step 1.
+
+    Open `VS2015 x64 Native Tools Command Prompt`. You can open it by press
+    `Start`, then type the binary name. Use `VS2017 x64 Native Tools Command
+    Prompt` if you are using MSVC 2017.
+
+    ##### Python
+
+    Directly build python wheel package by following command:
+
+    `MSBuild /p:Configuration=Release
+    <path-to-tf_python_build_pip_package.vcxproj>`
+
+    Remember to change `<path-to-tf_python_build_pip_package.vcxproj>` to the
+    actual path of the file, it can be found at the root of build directory
+
+    Install the wheel file generated as instructed by step 1.
+
+    ##### C++ interface
+
+    Build from VS native toolchain with following command: `MSBuild
+    /p:Configuration=Release <path-to-ALL_BUILD.vcxproj>`
+
+    Headers are discretely located in the build folders. Tensorflow library can
+    be found at `<path-to-build>/Release`, namely `tensorflow.dll` and
+    `tensorflow.lib`.
+
+    *   Build to install for api release (optional): `MSBuild
+        /p:Configuration=Release <path-to-INSTALL.vcxproj>`
+
+    Remember to change `<path-to-ALL_BUILD.vcxproj>` and
+    `<path-to-INSTALL.vcxproj>` to the actual path of the file, it can be found
+    at the root of build directory.
+
+#### Linux/MacOS (command line GNU build)
+
+1.  Open the terminal, change working directory to the one specified in step 3.
+
+2.  Type the following command:
+
+    `make -sj<number-of-threads> all`
+
+    ##### Python
+
+    **Important Note** CMake generated python wheel for Linux/MacOs is currently
+    under development. Please use bazel build.
+
+    Follow code is an expected Linux/MacOS python package build after
+    development work is completed.
+
+    ```
+    make -sj<number-of-threads> tf_python_build_pip_package
+    cd tf_python
+    pip install --upgrade tensorflow-<config>.whl
+    ```
+
+    ##### C++ interface
+
+    `make -sj<number-of-threads> install`
+
+    Where `<number-of-threads>` is the threads used for the compilation, change
+    to any integer less or equal to your computer's maxiumum thread number.
+
+    Headers are discretely located in the build folders. Tensorflow library can
+    be found at `<path-to-build>`, namely `tensorflow.so` (Linux) or
+    `tensorflow.dylib` (MacOS).
+
+#### Start a Tensorflow C++ project with CMake
+
+Here we assume that you have basic knowledge on gathering dependency with
+`CMakeLists.txt`. Here we introduce how the C++ api works with
+[official hello world tutorial](https://www.tensorflow.org/api_guides/cc/guide).
+
+1.  Create a new working directory and create a new text file named
+    `CMakeLists.txt` and the c++ file `main.cxx`
+2.  Fill in the `main.cxx` with the code provided in
+    [official c++ api basic](https://www.tensorflow.org/api_guides/cc/guide).
+3.  Fill in the `CMakeLists.txt` with following code: ``` cmake
+    cmake_minimum_required (VERSION 2.6) project (tf_hello)
+
+    # Tensorflow
+
+    find_package(Tensorflow REQUIRED)
+    include_directories(${TENSORFLOW_INCLUDE_DIRS})
+
+    # compiler setting required by tensorflow, to be tested on all compilers
+
+    # currently only tested on MSVC and GCC
+
+    if (${CMAKE_CXX_COMPILER_ID} STREQUAL MSVC) add_definitions(-DCOMPILER_MSVC)
+    elseif (${CMAKE_CXX_COMPILER_ID} STREQUAL GNU) if
+    (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS "3")
+    add_definitions(-DCOMPILER_GCC3) else() add_definitions(-D__GNUC__) endif()
+    else() message(ERROR " compiler ${CMAKE_CXX_COMPILER_ID} not supported by
+    this CMakeList.txt, under development") endif()
+
+    add_executable(tf_hello main.cxx) target_link_libraries(tf_hello
+    ${TENSORFLOW_LIBRARIES}) ```
+
+4.  Configure the folder with cmake-gui, an error should be prompted out,
+    requesting you to locate the folder containing `TensorflowConfig.cmake`.
+    This file can be found at `<tensorflow-build>` or `<tensorflow-intall>` (for
+    those have build install in previous steps).
+
+5.  Configure again, generate the project.
+
+6.  Compile the project with `Release` config (Windows). For Linux users, just
+    compile the project.
+
+7.  Copy the `tensorflow.dll`(Windows)/`tensorflow.so`(Linux) from build
+    directory to the build folder containing `tf_hello` binary.
+
+8.  Run `tf_hello` binary
 
-Step-by-step Windows build
-==========================
+# Step-by-step Windows build (command prompt)
 
 1.  Install the prerequisites detailed above, and set up your environment.
 
diff --git a/tensorflow/contrib/cmake/TensorflowConfig.cmake.in b/tensorflow/contrib/cmake/TensorflowConfig.cmake.in
new file mode 100644
index 0000000000000000000000000000000000000000..cc04db6e952f53b8bb5416dde60b8173e60bf60e
--- /dev/null
+++ b/tensorflow/contrib/cmake/TensorflowConfig.cmake.in
@@ -0,0 +1,16 @@
+# - Config file for the Tensorflow package
+# It defines the following variables
+#  TENSORFLOW_INCLUDE_DIRS - include directories for FooBar
+#  TENSORFLOW_LIBRARIES    - libraries to link against
+ 
+# Compute paths
+get_filename_component(TENSORFLOW_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH)
+set(TENSORFLOW_INCLUDE_DIRS "@CONF_INCLUDE_DIRS@")
+ 
+# Our library dependencies (contains definitions for IMPORTED targets)
+if(NOT TENSORFLOW_BINARY_DIR)
+  include("${TENSORFLOW_CMAKE_DIR}/TensorflowTargets.cmake")
+endif()
+ 
+# These are IMPORTED targets created by TensorflowTargets.cmake
+set(TENSORFLOW_LIBRARIES tensorflow)
\ No newline at end of file
diff --git a/tensorflow/contrib/cmake/TensorflowConfigVersion.cmake.in b/tensorflow/contrib/cmake/TensorflowConfigVersion.cmake.in
new file mode 100644
index 0000000000000000000000000000000000000000..2a9609ddb9c4ca864651818bdfae0f8fe290de31
--- /dev/null
+++ b/tensorflow/contrib/cmake/TensorflowConfigVersion.cmake.in
@@ -0,0 +1,11 @@
+set(PACKAGE_VERSION "@TENSORFLOW_VERSION@")
+ 
+# Check whether the requested PACKAGE_FIND_VERSION is compatible
+if("${PACKAGE_VERSION}" VERSION_LESS "${PACKAGE_FIND_VERSION}")
+  set(PACKAGE_VERSION_COMPATIBLE FALSE)
+else()
+  set(PACKAGE_VERSION_COMPATIBLE TRUE)
+  if ("${PACKAGE_VERSION}" VERSION_EQUAL "${PACKAGE_FIND_VERSION}")
+    set(PACKAGE_VERSION_EXACT TRUE)
+  endif()
+endif()
\ No newline at end of file
diff --git a/tensorflow/contrib/cmake/external/abseil_cpp.cmake b/tensorflow/contrib/cmake/external/abseil_cpp.cmake
index 4546dbdecc0dbc36f17cc727345e0762718b5165..46a193971c5084523d432065f265fa7a9909f595 100644
--- a/tensorflow/contrib/cmake/external/abseil_cpp.cmake
+++ b/tensorflow/contrib/cmake/external/abseil_cpp.cmake
@@ -31,27 +31,24 @@ if (systemlib_ABSEIL_CPP)
   message(STATUS "  abseil_cpp includes: ${ABSEIL_CPP_INCLUDE_DIR}")
   message(STATUS "  abseil_cpp libraries: ${ABSEIL_CPP_LIBRARIES}")
 
-  add_custom_target(abseil_cpp_build)
-  list(APPEND tensorflow_EXTERNAL_DEPENDENCIES abseil_cpp_build)
+  add_custom_target(abseil_cpp)
+  list(APPEND tensorflow_EXTERNAL_DEPENDENCIES abseil_cpp)
 
 else (systemlib_ABSEIL_CPP)
 
   include (ExternalProject)
 
-  set(abseil_cpp_INCLUDE_DIR ${CMAKE_BINARY_DIR}/abseil_cpp/src/abseil_cpp_build)
+  set(abseil_cpp_INCLUDE_DIR ${CMAKE_BINARY_DIR}/abseil_cpp/src/abseil_cpp)
   set(abseil_cpp_URL https://github.com/abseil/abseil-cpp/archive/e01d95528ea2137a4a27a88d1f57c6cb260aafed.tar.gz)
   set(abseil_cpp_HASH SHA256=84043ed402d2a2a6ba4cdddb7e85118b1158fd81fe4ac3a14adc343d054c1e2e)
-  set(abseil_cpp_BUILD ${CMAKE_BINARY_DIR}/abseil_cpp/src/abseil_cpp_build)
+  set(abseil_cpp_BUILD ${CMAKE_BINARY_DIR}/abseil_cpp/src/abseil_cpp-build)
 
   if(WIN32)
     if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
       set(abseil_cpp_STATIC_LIBRARIES
           ${abseil_cpp_BUILD}/absl/base/Release/absl_base.lib
-          ${abseil_cpp_BUILD}/absl/base/Release/absl_spinlock_wait.lib
           ${abseil_cpp_BUILD}/absl/base/Release/absl_dynamic_annotations.lib
-          ${abseil_cpp_BUILD}/absl/base/Release/absl_malloc_internal.lib
-          ${abseil_cpp_BUILD}/absl/base/Release/absl_throw_delegate.lib
-          ${abseil_cpp_BUILD}/absl/numeric/Release/absl_int128.lib
+          ${abseil_cpp_BUILD}/absl/base/Release/absl_internal_malloc_internal.lib
           ${abseil_cpp_BUILD}/absl/strings/Release/absl_strings.lib
           ${abseil_cpp_BUILD}/absl/strings/Release/str_format_internal.lib
           ${abseil_cpp_BUILD}/absl/types/Release/absl_bad_optional_access.lib)
@@ -80,15 +77,12 @@ else (systemlib_ABSEIL_CPP)
         ${abseil_cpp_BUILD}/absl/types/libabsl_bad_optional_access.a)
   endif()
 
-  ExternalProject_Add(abseil_cpp_build
+  ExternalProject_Add(abseil_cpp
       PREFIX abseil_cpp
       URL ${abseil_cpp_URL}
       URL_HASH ${abseil_cpp_HASH}
       DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
-      BUILD_IN_SOURCE 1
       BUILD_BYPRODUCTS ${abseil_cpp_STATIC_LIBRARIES}
-      BUILD_COMMAND ${CMAKE_COMMAND} --build . --config Release
-      COMMAND ${CMAKE_COMMAND} --build . --config Release
       INSTALL_COMMAND ""
       CMAKE_CACHE_ARGS
           -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=${tensorflow_ENABLE_POSITION_INDEPENDENT_CODE}
@@ -99,6 +93,6 @@ else (systemlib_ABSEIL_CPP)
   include_directories(${abseil_cpp_INCLUDE_DIR})
   list(APPEND tensorflow_EXTERNAL_LIBRARIES ${abseil_cpp_STATIC_LIBRARIES})
 
-  list(APPEND tensorflow_EXTERNAL_DEPENDENCIES abseil_cpp_build)
+  list(APPEND tensorflow_EXTERNAL_DEPENDENCIES abseil_cpp)
 
-endif (systemlib_ABSEIL_CPP)
+endif (systemlib_ABSEIL_CPP)
\ No newline at end of file
diff --git a/tensorflow/contrib/cmake/external/grpc.cmake b/tensorflow/contrib/cmake/external/grpc.cmake
index b1e64aa55c80ad59cfdc0f4767c0282b4f73367f..e570c09ecb5e64130ed6f3375a51d74850cc3989 100644
--- a/tensorflow/contrib/cmake/external/grpc.cmake
+++ b/tensorflow/contrib/cmake/external/grpc.cmake
@@ -17,7 +17,7 @@ include (ExternalProject)
 set(GRPC_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/include)
 set(GRPC_URL https://github.com/grpc/grpc.git)
 set(GRPC_BUILD ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc)
-set(GRPC_TAG d184fa229d75d336aedea0041bd59cb93e7e267f)
+set(GRPC_TAG 69b6c047bc767b4d80e7af4d00ccb7c45b683dae)
 
 if(WIN32)
   # We use unsecure gRPC because boringssl does not build on windows
@@ -26,9 +26,9 @@ if(WIN32)
   set(grpc_SSL_PROVIDER NONE)
   if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
     set(grpc_STATIC_LIBRARIES
-        ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/Release/grpc++_unsecure.lib
-        ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/Release/grpc_unsecure.lib
-        ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/Release/gpr.lib)
+        ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/$(Configuration)/grpc++_unsecure.lib
+        ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/$(Configuration)/grpc_unsecure.lib
+        ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/$(Configuration)/gpr.lib)
   else()
     set(grpc_STATIC_LIBRARIES
         ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/grpc++_unsecure.lib
@@ -43,8 +43,9 @@ else()
       ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgrpc++.a
       ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgrpc.a
       ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libaddress_sorting.a
+      ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgpr.a
       ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/third_party/cares/cares/lib/libcares.a
-      ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgpr.a)
+      ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/third_party/zlib/libz.a)
 endif()
 
 add_definitions(-DGRPC_ARES=0)
@@ -66,7 +67,7 @@ ExternalProject_Add(grpc
         -DPROTOBUF_INCLUDE_DIRS:STRING=${PROTOBUF_INCLUDE_DIRS}
         -DPROTOBUF_LIBRARIES:STRING=${protobuf_STATIC_LIBRARIES}
         -DZLIB_ROOT:STRING=${ZLIB_INSTALL}
-	-DgRPC_SSL_PROVIDER:STRING=${grpc_SSL_PROVIDER}
+        -DgRPC_SSL_PROVIDER:STRING=${grpc_SSL_PROVIDER}
 )
 
 # grpc/src/core/ext/census/tracing.c depends on the existence of openssl/rand.h.
diff --git a/tensorflow/contrib/cmake/external/png.cmake b/tensorflow/contrib/cmake/external/png.cmake
index 1a147e9c8e5a9fee17a81e37c9babe3c9ec0290b..32e6d78e508e25f76bd263e9d52b6574ca315f6c 100644
--- a/tensorflow/contrib/cmake/external/png.cmake
+++ b/tensorflow/contrib/cmake/external/png.cmake
@@ -59,6 +59,7 @@ ExternalProject_Add(png
         -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
         -DCMAKE_INSTALL_PREFIX:STRING=${png_INSTALL}
 	-DZLIB_ROOT:STRING=${ZLIB_INSTALL}
+  -DPNG_TESTS:BOOL=OFF
 )
 
 ## put png includes in the directory where they are expected
diff --git a/tensorflow/contrib/cmake/external/protobuf.cmake b/tensorflow/contrib/cmake/external/protobuf.cmake
index 56a57a2340ddc7f923c611c222a0399e279ad58a..773c37b309b1dff4ed28d24cd7d6140a63ec5bc6 100644
--- a/tensorflow/contrib/cmake/external/protobuf.cmake
+++ b/tensorflow/contrib/cmake/external/protobuf.cmake
@@ -16,7 +16,18 @@ include (ExternalProject)
 
 set(PROTOBUF_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/src)
 set(PROTOBUF_URL https://github.com/google/protobuf.git)
-set(PROTOBUF_TAG v3.6.1)
+
+# enable choose protobuf versions
+SET(PROTOBUF_VERSION "3.6.1" CACHE STRING "Protobuf version")
+SET_PROPERTY(CACHE PROTOBUF_VERSION PROPERTY STRINGS "3.4.0" "3.5.0" "3.6.1") 
+
+if(${PROTOBUF_VERSION} STREQUAL "3.5.1")
+    set(PROTOBUF_TAG v3.6.1)
+elseif(${PROTOBUF_VERSION} STREQUAL "3.5.0")
+    set(PROTOBUF_TAG 2761122b810fe8861004ae785cc3ab39f384d342)
+elseif(${PROTOBUF_VERSION} STREQUAL "3.4.0")
+    set(PROTOBUF_TAG b04e5cba356212e4e8c66c61bbe0c3a20537c5b9)
+endif()
 
 if(WIN32)
   if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
diff --git a/tensorflow/contrib/cmake/modules/FindAbseilCpp.cmake b/tensorflow/contrib/cmake/modules/FindAbseilCpp.cmake
index d4f8bb1bec9ae8eff58dfe78168d8e71319c85e1..944ae3997a9489c13f65f93d9a7e61c21dd975c1 100644
--- a/tensorflow/contrib/cmake/modules/FindAbseilCpp.cmake
+++ b/tensorflow/contrib/cmake/modules/FindAbseilCpp.cmake
@@ -24,10 +24,10 @@ if(EXISTS "${ABSEIL_CPP_INCLUDE_DIR}" AND NOT "${ABSEIL_CPP_INCLUDE_DIR}" STREQU
     # search all libraries if no COMPONENTS was requested
     set(AbseilCpp_FIND_COMPONENTS
         "absl_algorithm;absl_any;absl_bad_any_cast"
-        "absl_bad_optional_access;absl_base absl_container;absl_debugging"
+        "absl_bad_optional_access;absl_base;absl_container;absl_debugging"
         "absl_dynamic_annotations;absl_examine_stack;absl_failure_signal_handler"
-        "absl_int128;absl_leak_check;absl_malloc_internal;absl_memory;absl_meta"
-        "absl_numeric;absl_optional;absl_span;absl_spinlock_wait;absl_stack_consumption"
+        "absl_int128;absl_leak_check;absl_internal_malloc_internal;absl_memory;absl_meta"
+        "absl_numeric;absl_optional;absl_span;absl_internal_spinlock_wait;absl_stack_consumption"
         "absl_stacktrace;absl_str_format;absl_strings;absl_symbolize;absl_synchronization"
         "absl_throw_delegate;absl_time;absl_utility;str_format_extension_internal"
         "str_format_internal;test_instance_tracker_lib")
diff --git a/tensorflow/contrib/cmake/tf_c.cmake b/tensorflow/contrib/cmake/tf_c.cmake
index 7a30eb94f54b18a2a517615a315e23e09e1170d0..a04142bd249ed5e16beba11057d0efc1e191e31b 100644
--- a/tensorflow/contrib/cmake/tf_c.cmake
+++ b/tensorflow/contrib/cmake/tf_c.cmake
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+
 ########################################################
 # tf_c_framework library
 ########################################################
diff --git a/tensorflow/contrib/cmake/tf_cc_ops.cmake b/tensorflow/contrib/cmake/tf_cc_ops.cmake
index 6c90cf398c69c8c1b22ea75e0c407f258e2535f9..6514ae50a4a35b35ba100af6997079294c22f9b8 100644
--- a/tensorflow/contrib/cmake/tf_cc_ops.cmake
+++ b/tensorflow/contrib/cmake/tf_cc_ops.cmake
@@ -149,11 +149,7 @@ add_library(tf_cc OBJECT ${tf_cc_srcs})
 add_dependencies(tf_cc tf_cc_framework tf_cc_ops)
 
 if (WIN32)
-  if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
-    set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/pywrap_tensorflow_internal.lib")
-  else()
-    set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow_internal.lib")
-  endif()
+  set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/$(Configuration)/pywrap_tensorflow_internal.lib")
 else (WIN32)
   set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal${CMAKE_SHARED_LIBRARY_SUFFIX}")
 endif (WIN32)
diff --git a/tensorflow/contrib/cmake/tf_core_cpu.cmake b/tensorflow/contrib/cmake/tf_core_cpu.cmake
index a54cbff33b66d63d7229fa2f50b8a4ca962111ed..d8884d464fb5974d77506561a9ed36110a3804c0 100644
--- a/tensorflow/contrib/cmake/tf_core_cpu.cmake
+++ b/tensorflow/contrib/cmake/tf_core_cpu.cmake
@@ -39,6 +39,8 @@ file(GLOB_RECURSE tf_core_cpu_exclude_srcs
     "${tensorflow_source_dir}/tensorflow/core/*test*.h"
     "${tensorflow_source_dir}/tensorflow/core/*test*.cc"
     "${tensorflow_source_dir}/tensorflow/core/*main.cc"
+    "${tensorflow_source_dir}/tensorflow/core/common_runtime/eager/*.cc"
+    "${tensorflow_source_dir}/tensorflow/core/common_runtime/eager/*.h"
     "${tensorflow_source_dir}/tensorflow/core/common_runtime/gpu/*.cc"
     "${tensorflow_source_dir}/tensorflow/core/common_runtime/gpu_device_factory.cc"
     "${tensorflow_source_dir}/tensorflow/core/common_runtime/direct_session.cc"
diff --git a/tensorflow/contrib/cmake/tf_core_eager_runtime.cmake b/tensorflow/contrib/cmake/tf_core_eager_runtime.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..78e4c0d3035cdaefa1d0950f4270d60152c805af
--- /dev/null
+++ b/tensorflow/contrib/cmake/tf_core_eager_runtime.cmake
@@ -0,0 +1,57 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+########################################################
+# tf_core_eager_runtime library
+########################################################
+file(GLOB_RECURSE tf_core_eager_runtime_srcs
+    "${tensorflow_source_dir}/tensorflow/core/common_runtime/eager/*.cc"
+    "${tensorflow_source_dir}/tensorflow/core/common_runtime/eager/*.h"
+)
+
+file(GLOB_RECURSE tf_core_eager_runtime_exclude_srcs
+    "${tensorflow_source_dir}/tensorflow/core/common_runtime/eager/*test*.h"
+    "${tensorflow_source_dir}/tensorflow/core/common_runtime/eager/*test*.cc"
+)
+
+list(REMOVE_ITEM tf_core_eager_runtime_srcs ${tf_core_eager_runtime_exclude_srcs})
+
+add_library(tf_core_eager_runtime OBJECT ${tf_core_eager_runtime_srcs})
+add_dependencies(
+	tf_core_eager_runtime 
+	tf_c 
+	tf_core_lib)
+
+
+file(GLOB_RECURSE tf_c_eager_srcs
+    "${tensorflow_source_dir}/tensorflow/c/eager/*.cc"
+    "${tensorflow_source_dir}/tensorflow/c/eager/*.h"
+)
+
+file(GLOB_RECURSE tf_c_eager_exlclude_srcs
+    "${tensorflow_source_dir}/tensorflow/c/eager/*test*.h"
+    "${tensorflow_source_dir}/tensorflow/c/eager/*test*.cc"
+)
+
+list(REMOVE_ITEM tf_c_eager_srcs ${tf_c_eager_exlclude_srcs})
+
+add_library(tf_c_eager OBJECT ${tf_c_eager_srcs})
+add_dependencies(
+  tf_c_eager
+  tf_core_eager_runtime
+  tf_c
+  tf_cc_framework
+  tf_cc_while_loop
+  tf_core_lib
+  tf_protos_cc)
\ No newline at end of file
diff --git a/tensorflow/contrib/cmake/tf_core_framework.cmake b/tensorflow/contrib/cmake/tf_core_framework.cmake
index 7e806685b8448cbd629985cdc00ed1193857abe6..d7b2a1339e047aba0a9424a53a63726805e89721 100644
--- a/tensorflow/contrib/cmake/tf_core_framework.cmake
+++ b/tensorflow/contrib/cmake/tf_core_framework.cmake
@@ -140,16 +140,19 @@ set(tf_proto_text_srcs
     "tensorflow/core/example/example.proto"
     "tensorflow/core/example/feature.proto"
     "tensorflow/core/framework/allocation_description.proto"
+    "tensorflow/core/framework/api_def.proto"
     "tensorflow/core/framework/attr_value.proto"
     "tensorflow/core/framework/cost_graph.proto"
     "tensorflow/core/framework/device_attributes.proto"
     "tensorflow/core/framework/function.proto"
     "tensorflow/core/framework/graph.proto"
     "tensorflow/core/framework/graph_transfer_info.proto"
+    "tensorflow/core/framework/iterator.proto"
     "tensorflow/core/framework/kernel_def.proto"
     "tensorflow/core/framework/log_memory.proto"
     "tensorflow/core/framework/node_def.proto"
     "tensorflow/core/framework/op_def.proto"
+    "tensorflow/core/framework/reader_base.proto"
     "tensorflow/core/framework/remote_fused_graph_execute_info.proto"
     "tensorflow/core/framework/resource_handle.proto"
     "tensorflow/core/framework/step_stats.proto"
@@ -159,6 +162,7 @@ set(tf_proto_text_srcs
     "tensorflow/core/framework/tensor_shape.proto"
     "tensorflow/core/framework/tensor_slice.proto"
     "tensorflow/core/framework/types.proto"
+    "tensorflow/core/framework/variable.proto"
     "tensorflow/core/framework/versions.proto"
     "tensorflow/core/lib/core/error_codes.proto"
     "tensorflow/core/protobuf/cluster.proto"
@@ -204,10 +208,10 @@ file(GLOB tf_core_platform_srcs
     "${tensorflow_source_dir}/tensorflow/core/framework/resource_handle.h"
     "${tensorflow_source_dir}/tensorflow/core/framework/resource_handle.cc")
 if (NOT tensorflow_ENABLE_GPU)
-  file(GLOB tf_core_platform_gpu_srcs
+  file(GLOB tf_core_platform_gpu_srcs_exclude
       "${tensorflow_source_dir}/tensorflow/core/platform/cuda_libdevice_path.*"
       "${tensorflow_source_dir}/tensorflow/core/platform/default/cuda_libdevice_path.*")
-  list(REMOVE_ITEM tf_core_platform_srcs ${tf_core_platform_gpu_srcs})
+  list(REMOVE_ITEM tf_core_platform_srcs ${tf_core_platform_gpu_srcs_exclude})
 else()
   file(GLOB tf_core_platform_srcs_exclude
       "${tensorflow_source_dir}/tensorflow/core/platform/default/device_tracer.cc")
diff --git a/tensorflow/contrib/cmake/tf_core_ops.cmake b/tensorflow/contrib/cmake/tf_core_ops.cmake
index 9cfa8b90749280b6aa815cc210941c75bd5e16c5..310eed4ecbfdd30a3b3bdd4728c030fe70930797 100644
--- a/tensorflow/contrib/cmake/tf_core_ops.cmake
+++ b/tensorflow/contrib/cmake/tf_core_ops.cmake
@@ -13,13 +13,14 @@
 # limitations under the License.
 # ==============================================================================
 set(tf_op_lib_names
-    "audio_ops"
     "array_ops"
+    "audio_ops"
     "batch_ops"
     "bitwise_ops"
     "boosted_trees_ops"
     "candidate_sampling_ops"
     "checkpoint_ops"
+    "collective_ops"
     "control_flow_ops"
     "ctc_ops"
     "cudnn_rnn_ops"
@@ -27,13 +28,14 @@ set(tf_op_lib_names
     "dataset_ops"
     "decode_proto_ops"
     "encode_proto_ops"
+    "function_ops"
     "functional_ops"
     "image_ops"
     "io_ops"
     "linalg_ops"
     "list_ops"
-    "lookup_ops"
     "logging_ops"
+    "lookup_ops"
     "manip_ops"
     "math_ops"
     "nn_ops"
@@ -43,10 +45,11 @@ set(tf_op_lib_names
     "remote_fused_graph_ops"
     "resource_variable_ops"
     "rpc_ops"
+    "scoped_allocator_ops"
     "script_ops"
     "sdca_ops"
-    "set_ops"
     "sendrecv_ops"
+    "set_ops"
     "sparse_ops"
     "spectral_ops"
     "state_ops"
@@ -54,6 +57,7 @@ set(tf_op_lib_names
     "string_ops"
     "summary_ops"
     "training_ops"
+    "word2vec_ops"
 )
 
 foreach(tf_op_lib_name ${tf_op_lib_names})
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index df7b854afcca1a0bed660624152f465d4bf3b25f..8faccf8d55902e6701ebb4ce534b84705304fd5f 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -313,15 +313,14 @@ function(GENERATE_PYTHON_OP_LIB tf_python_op_lib_name)
         ${GENERATE_PYTHON_OP_LIB_DESTINATION} PARENT_SCOPE)
 endfunction()
 
-GENERATE_PYTHON_OP_LIB("audio_ops")
 GENERATE_PYTHON_OP_LIB("array_ops")
+GENERATE_PYTHON_OP_LIB("audio_ops")
 GENERATE_PYTHON_OP_LIB("batch_ops")
 GENERATE_PYTHON_OP_LIB("bitwise_ops")
 GENERATE_PYTHON_OP_LIB("boosted_trees_ops")
-GENERATE_PYTHON_OP_LIB("math_ops")
-GENERATE_PYTHON_OP_LIB("functional_ops")
 GENERATE_PYTHON_OP_LIB("candidate_sampling_ops")
 GENERATE_PYTHON_OP_LIB("checkpoint_ops")
+GENERATE_PYTHON_OP_LIB("collective_ops")
 GENERATE_PYTHON_OP_LIB("control_flow_ops"
   ADDITIONAL_LIBRARIES $<TARGET_OBJECTS:tf_no_op>)
 GENERATE_PYTHON_OP_LIB("ctc_ops")
@@ -332,14 +331,18 @@ GENERATE_PYTHON_OP_LIB("decode_proto_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/proto/python/ops/gen_decode_proto_op.py)
 GENERATE_PYTHON_OP_LIB("encode_proto_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/proto/python/ops/gen_encode_proto_op.py)
+GENERATE_PYTHON_OP_LIB("function_ops")
+GENERATE_PYTHON_OP_LIB("functional_ops")
 GENERATE_PYTHON_OP_LIB("image_ops")
 GENERATE_PYTHON_OP_LIB("io_ops")
 GENERATE_PYTHON_OP_LIB("linalg_ops")
 GENERATE_PYTHON_OP_LIB("list_ops")
 GENERATE_PYTHON_OP_LIB("logging_ops")
 GENERATE_PYTHON_OP_LIB("lookup_ops")
-GENERATE_PYTHON_OP_LIB("nn_ops")
 GENERATE_PYTHON_OP_LIB("manip_ops")
+GENERATE_PYTHON_OP_LIB("math_ops")
+GENERATE_PYTHON_OP_LIB("nn_ops")
+GENERATE_PYTHON_OP_LIB("no_op")
 GENERATE_PYTHON_OP_LIB("parsing_ops")
 GENERATE_PYTHON_OP_LIB("random_ops")
 GENERATE_PYTHON_OP_LIB("remote_fused_graph_ops"
@@ -347,17 +350,21 @@ GENERATE_PYTHON_OP_LIB("remote_fused_graph_ops"
 GENERATE_PYTHON_OP_LIB("resource_variable_ops")
 GENERATE_PYTHON_OP_LIB("rpc_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/rpc/python/ops/gen_rpc_op.py)
+GENERATE_PYTHON_OP_LIB("scoped_allocator_ops")
 GENERATE_PYTHON_OP_LIB("script_ops")
 GENERATE_PYTHON_OP_LIB("sdca_ops")
+GENERATE_PYTHON_OP_LIB("sendrecv_ops")
 GENERATE_PYTHON_OP_LIB("set_ops")
-GENERATE_PYTHON_OP_LIB("state_ops")
 GENERATE_PYTHON_OP_LIB("sparse_ops")
 GENERATE_PYTHON_OP_LIB("spectral_ops")
+GENERATE_PYTHON_OP_LIB("state_ops")
+GENERATE_PYTHON_OP_LIB("stateless_random_ops")
 GENERATE_PYTHON_OP_LIB("string_ops")
 GENERATE_PYTHON_OP_LIB("summary_ops")
 GENERATE_PYTHON_OP_LIB("user_ops")
 GENERATE_PYTHON_OP_LIB("training_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/training/gen_training_ops.py)
+GENERATE_PYTHON_OP_LIB("word2vec_ops")
 
 GENERATE_PYTHON_OP_LIB("contrib_boosted_trees_model_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/boosted_trees/python/ops/gen_model_ops.py)
@@ -391,11 +398,8 @@ GENERATE_PYTHON_OP_LIB("contrib_layers_sparse_feature_cross_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/layers/ops/gen_sparse_feature_cross_op.py)
 GENERATE_PYTHON_OP_LIB("contrib_memory_stats_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/memory_stats/ops/gen_memory_stats_ops.py)
-GENERATE_PYTHON_OP_LIB("contrib_nccl_ops"
-  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/nccl/ops/gen_nccl_ops.py)
 GENERATE_PYTHON_OP_LIB("contrib_periodic_resample_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/periodic_resample/python/ops/gen_periodic_resample_op.py)
-
 GENERATE_PYTHON_OP_LIB("contrib_nearest_neighbor_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/nearest_neighbor/ops/gen_nearest_neighbor_ops.py)
 GENERATE_PYTHON_OP_LIB("contrib_resampler_ops"
@@ -420,8 +424,6 @@ GENERATE_PYTHON_OP_LIB("contrib_bigquery_reader_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/cloud/python/ops/gen_bigquery_reader_ops.py)
 GENERATE_PYTHON_OP_LIB("contrib_gcs_config_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/cloud/python/ops/gen_gcs_config_ops.py)
-GENERATE_PYTHON_OP_LIB("stateless_random_ops"
-  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/stateless/gen_stateless_random_ops.py)
 GENERATE_PYTHON_OP_LIB("debug_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/debug/ops/gen_debug_ops.py)
 
@@ -524,11 +526,13 @@ if(WIN32)
     add_library(pywrap_tensorflow_internal_static STATIC
         ${pywrap_tensorflow_internal_src}
         $<TARGET_OBJECTS:tf_c>
+        $<TARGET_OBJECTS:tf_c_eager>
         $<TARGET_OBJECTS:tf_c_python_api>
         $<TARGET_OBJECTS:tf_core_lib>
         $<TARGET_OBJECTS:tf_core_cpu>
         $<TARGET_OBJECTS:tf_core_framework>
         $<TARGET_OBJECTS:tf_core_profiler>
+        $<TARGET_OBJECTS:tf_core_eager_runtime>
         $<TARGET_OBJECTS:tf_cc>
         $<TARGET_OBJECTS:tf_cc_ops>
         $<TARGET_OBJECTS:tf_cc_while_loop>
@@ -581,11 +585,13 @@ endif(WIN32)
 add_library(pywrap_tensorflow_internal SHARED
     ${pywrap_tensorflow_internal_src}
     $<TARGET_OBJECTS:tf_c>
+    $<TARGET_OBJECTS:tf_c_eager>
     $<TARGET_OBJECTS:tf_c_python_api>
     $<TARGET_OBJECTS:tf_core_lib>
     $<TARGET_OBJECTS:tf_core_cpu>
     $<TARGET_OBJECTS:tf_core_framework>
     $<TARGET_OBJECTS:tf_core_profiler>
+    $<TARGET_OBJECTS:tf_core_eager_runtime>
     $<TARGET_OBJECTS:tf_cc>
     $<TARGET_OBJECTS:tf_cc_ops>
     $<TARGET_OBJECTS:tf_cc_while_loop>
@@ -615,13 +621,28 @@ target_include_directories(pywrap_tensorflow_internal PUBLIC
     ${NUMPY_INCLUDE_DIR}
 )
 
-target_link_libraries(pywrap_tensorflow_internal PRIVATE
+if(CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5.0)
+	# There is a bug in GCC 5 resulting in undefined reference to a __cpu_model function when
+	# linking to the tensorflow library. Adding the following libraries fixes it.
+	# See issue on github: https://github.com/tensorflow/tensorflow/issues/9593
+	target_link_libraries(pywrap_tensorflow_internal PRIVATE
     ${tf_core_gpu_kernels_lib}
     ${tensorflow_EXTERNAL_LIBRARIES}
     tf_protos_cc
     tf_python_protos_cc
     ${PYTHON_LIBRARIES}
+    gcc_s
+    gcc
 )
+else()
+	target_link_libraries(pywrap_tensorflow_internal PRIVATE
+    ${tf_core_gpu_kernels_lib}
+    ${tensorflow_EXTERNAL_LIBRARIES}
+    tf_protos_cc
+    tf_python_protos_cc
+    ${PYTHON_LIBRARIES}
+)
+endif()
 
 if(WIN32)
 
@@ -806,10 +827,10 @@ add_dependencies(tf_python_api tf_python_ops)
 ########################################################
 
 # Parse tensorflow/python/tools/api/generator/BUILD to get list of generated files.
-FILE(READ ${tensorflow_source_dir}/tensorflow/python/tools/api/generator/api_gen.bzl api_generator_BUILD_text)
-STRING(REGEX MATCH "# BEGIN GENERATED ESTIMATOR FILES.*# END GENERATED ESTIMATOR FILES" api_init_files_text ${api_generator_BUILD_text})
-string(REPLACE "# BEGIN GENERATED ESTIMATOR FILES" "" api_init_files_text ${api_init_files_text})
-string(REPLACE "# END GENERATED ESTIMATOR FILES" "" api_init_files_text ${api_init_files_text})
+FILE(READ ${tensorflow_source_dir}/tensorflow/python/tools/api/generator/api_init_files.bzl api_generator_BUILD_text)
+STRING(REGEX MATCH "# BEGIN GENERATED FILES.*# END GENERATED FILES" api_init_files_text ${api_generator_BUILD_text})
+string(REPLACE "# BEGIN GENERATED FILES" "" api_init_files_text ${api_init_files_text})
+string(REPLACE "# END GENERATED FILES" "" api_init_files_text ${api_init_files_text})
 string(REPLACE "," ";" api_init_files_list ${api_init_files_text})
 
 set(api_init_files "")
diff --git a/tensorflow/contrib/cmake/tf_shared_lib.cmake b/tensorflow/contrib/cmake/tf_shared_lib.cmake
index fdf522f1fd90ffc64acbe82381ef57a389645d61..62005dd113bfb80fbdf23afb6d4aa5f90a1e32de 100644
--- a/tensorflow/contrib/cmake/tf_shared_lib.cmake
+++ b/tensorflow/contrib/cmake/tf_shared_lib.cmake
@@ -23,6 +23,8 @@ if(WIN32)
   # we need.
   #
   add_library(tensorflow_static STATIC
+      $<TARGET_OBJECTS:tf_c_eager>
+      $<TARGET_OBJECTS:tf_core_eager_runtime>
       $<TARGET_OBJECTS:tf_c>
       $<TARGET_OBJECTS:tf_cc>
       $<TARGET_OBJECTS:tf_cc_framework>
@@ -65,6 +67,8 @@ endif(WIN32)
 # tensorflow is a shared library containing all of the
 # TensorFlow runtime and the standard ops and kernels.
 add_library(tensorflow SHARED
+    $<TARGET_OBJECTS:tf_c_eager>
+    $<TARGET_OBJECTS:tf_core_eager_runtime>
     $<TARGET_OBJECTS:tf_c>
     $<TARGET_OBJECTS:tf_cc>
     $<TARGET_OBJECTS:tf_cc_framework>
@@ -96,6 +100,27 @@ if(CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5.0)
     target_link_libraries(tensorflow PRIVATE gcc_s gcc)
 endif()
 
+# Offer the user the choice of overriding the installation directories
+set(INSTALL_LIB_DIR lib CACHE PATH "Installation directory for libraries")
+set(INSTALL_BIN_DIR bin CACHE PATH "Installation directory for executables")
+set(INSTALL_INCLUDE_DIR include CACHE PATH
+  "Installation directory for header files")
+if(WIN32 AND NOT CYGWIN)
+  set(DEF_INSTALL_CMAKE_DIR cmake)
+else()
+  set(DEF_INSTALL_CMAKE_DIR lib/cmake)
+endif()
+set(INSTALL_CMAKE_DIR ${DEF_INSTALL_CMAKE_DIR} CACHE PATH
+  "Installation directory for CMake files")
+
+# Make relative paths absolute (needed later on)
+foreach(p LIB BIN INCLUDE CMAKE)
+  set(var INSTALL_${p}_DIR)
+  if(NOT IS_ABSOLUTE "${${var}}")
+    set(${var} "${CMAKE_INSTALL_PREFIX}/${${var}}")
+  endif()
+endforeach()
+
 if(WIN32)
   add_dependencies(tensorflow tensorflow_static)
 endif(WIN32)
@@ -103,14 +128,57 @@ endif(WIN32)
 target_include_directories(tensorflow PUBLIC 
     $<INSTALL_INTERFACE:include/>)
 
-install(TARGETS tensorflow EXPORT tensorflow_export
-        RUNTIME DESTINATION bin
-        LIBRARY DESTINATION lib
-        ARCHIVE DESTINATION lib)
+# Add all targets to build-tree export set
+export(TARGETS tensorflow
+  FILE ${PROJECT_BINARY_DIR}/TensorflowTargets.cmake)
+
+# Export the package for use from the build-tree
+export(PACKAGE Tensorflow)
+
+# Create the TensorflowConfig.cmake and TensorflowConfigVersion files
+file(RELATIVE_PATH REL_INCLUDE_DIR "${INSTALL_CMAKE_DIR}"
+   "${INSTALL_INCLUDE_DIR}")
+# for the build tree
+set(CONF_INCLUDE_DIRS "${tensorflow_source_dir}" 
+                      "${PROJECT_BINARY_DIR}"
+                      "${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/src"
+                      "${CMAKE_CURRENT_BINARY_DIR}/nsync/install/include" # Please if there is a better directory
+                      "${CMAKE_CURRENT_BINARY_DIR}/eigen/src/eigen/Eigen/"
+                      "${CMAKE_CURRENT_BINARY_DIR}/external/eigen_archive/"
+                      "${tensorflow_source_dir}/third_party/eigen3/"
+                      "${CMAKE_CURRENT_BINARY_DIR}/eigen/src/eigen/unsupported/Eigen/")
+configure_file(TensorflowConfig.cmake.in
+  "${PROJECT_BINARY_DIR}/TensorflowConfig.cmake" @ONLY)
+# for the install tree, yet to be complete
+set(CONF_INCLUDE_DIRS "\${TENSORFLOW_CMAKE_DIR}/${REL_INCLUDE_DIR}")
+configure_file(TensorflowConfig.cmake.in
+  "${PROJECT_BINARY_DIR}/${CMAKE_FILES_DIRECTORY}/TensorflowConfig.cmake" @ONLY)
+# for both
+configure_file(TensorflowConfigVersion.cmake.in
+  "${PROJECT_BINARY_DIR}/TensorflowConfigVersion.cmake" @ONLY)
+
+# install(TARGETS tensorflow EXPORT tensorflow_export
+#         RUNTIME DESTINATION ${INSTALL_BIN_DIR}
+#         LIBRARY DESTINATION ${INSTALL_LIB_DIR}
+#         ARCHIVE DESTINATION ${INSTALL_LIB_DIR})
+
+# install(EXPORT tensorflow_export
+#         FILE TensorflowConfig.cmake
+#         DESTINATION ${INSTALL_CMAKE_DIR})
         
-install(EXPORT tensorflow_export
-        FILE TensorflowConfig.cmake
-        DESTINATION lib/cmake)
+install(FILES
+  "${PROJECT_BINARY_DIR}/${CMAKE_FILES_DIRECTORY}/TensorflowConfig.cmake"
+  "${PROJECT_BINARY_DIR}/TensorflowConfigVersion.cmake"
+  DESTINATION "${INSTALL_CMAKE_DIR}" COMPONENT dev)
+
+# install the export set for use with the install-tree
+install(EXPORT TensorflowTargets 
+  DESTINATION ${INSTALL_CMAKE_DIR})
+
+install(TARGETS tensorflow EXPORT TensorflowTargets
+        RUNTIME DESTINATION ${INSTALL_BIN_DIR}
+        LIBRARY DESTINATION ${INSTALL_LIB_DIR}
+        ARCHIVE DESTINATION ${INSTALL_LIB_DIR})
 
 # install necessary headers
 # tensorflow headers
@@ -145,6 +213,10 @@ install(DIRECTORY ${tensorflow_source_dir}/third_party/eigen3/
 # unsupported Eigen directory
 install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/eigen/src/eigen/unsupported/Eigen/
         DESTINATION include/unsupported/Eigen)
+# absl directory
+install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/abseil_cpp/src/abseil_cpp/absl/
+        DESTINATION include/absl
+        FILES_MATCHING PATTERN "*.h")
 # mkl
 if (tensorflow_ENABLE_MKL_SUPPORT)
     install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/mkl/src/mkl/include/
diff --git a/tensorflow/contrib/crf/python/ops/crf.py b/tensorflow/contrib/crf/python/ops/crf.py
index 656633f0bf21a4d46cb85547241ef0fd42807ed6..40e159b8fcbd1864284e208cb15d9ed96119f840 100644
--- a/tensorflow/contrib/crf/python/ops/crf.py
+++ b/tensorflow/contrib/crf/python/ops/crf.py
@@ -38,12 +38,12 @@ tf_unary_scores, tf_sequence_lengths, tf_transition_params, _ = session.run(
     [unary_scores, sequence_lengths, transition_params, train_op])
 for tf_unary_scores_, tf_sequence_length_ in zip(tf_unary_scores,
                                                  tf_sequence_lengths):
-# Remove padding.
-tf_unary_scores_ = tf_unary_scores_[:tf_sequence_length_]
+    # Remove padding.
+    tf_unary_scores_ = tf_unary_scores_[:tf_sequence_length_]
 
-# Compute the highest score and its tag sequence.
-tf_viterbi_sequence, tf_viterbi_score = tf.contrib.crf.viterbi_decode(
-    tf_unary_scores_, tf_transition_params)
+    # Compute the highest score and its tag sequence.
+    tf_viterbi_sequence, tf_viterbi_score = tf.contrib.crf.viterbi_decode(
+        tf_unary_scores_, tf_transition_params)
 """
 
 from __future__ import absolute_import
diff --git a/tensorflow/contrib/data/python/kernel_tests/assert_element_shape_test.py b/tensorflow/contrib/data/python/kernel_tests/assert_element_shape_test.py
index 0456463a1928cf226010670b90a5d574579e0411..6c5f8c6b00975b3fba041271309a93cecd9f5057 100644
--- a/tensorflow/contrib/data/python/kernel_tests/assert_element_shape_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/assert_element_shape_test.py
@@ -46,7 +46,7 @@ class AssertElementShapeTest(test_base.DatasetTestBase):
     result = dataset.apply(batching.assert_element_shape(expected_shapes))
     self.assertEqual(expected_shapes, result.output_shapes)
 
-    iterator = result.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(result)
     init_op = iterator.initializer
     get_next = iterator.get_next()
     with self.cached_session() as sess:
@@ -88,7 +88,7 @@ class AssertElementShapeTest(test_base.DatasetTestBase):
     result = dataset.apply(batching.assert_element_shape(expected_shapes))
     self.assertEqual(expected_shapes, result.output_shapes)
 
-    iterator = result.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(result)
     init_op = iterator.initializer
     get_next = iterator.get_next()
     with self.cached_session() as sess:
@@ -115,9 +115,8 @@ class AssertElementShapeTest(test_base.DatasetTestBase):
 
     wrong_shapes = (tensor_shape.TensorShape(2),
                     tensor_shape.TensorShape((3, 10)))
-    iterator = (
-        dataset.apply(batching.assert_element_shape(wrong_shapes))
-        .make_initializable_iterator())
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset.apply(batching.assert_element_shape(wrong_shapes)))
     init_op = iterator.initializer
     get_next = iterator.get_next()
     with self.cached_session() as sess:
@@ -142,7 +141,7 @@ class AssertElementShapeTest(test_base.DatasetTestBase):
                      tensor_shape.TensorShape((3, 4)))
     self.assertEqual(actual_shapes, result.output_shapes)
 
-    iterator = result.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(result)
     init_op = iterator.initializer
     get_next = iterator.get_next()
     with self.cached_session() as sess:
@@ -184,7 +183,7 @@ class AssertElementShapeTest(test_base.DatasetTestBase):
     result = dataset.apply(batching.assert_element_shape(expected_shapes))
     self.assertEqual(expected_shapes, result.output_shapes)
 
-    iterator = result.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(result)
     init_op = iterator.initializer
     get_next = iterator.get_next()
     with self.cached_session() as sess:
@@ -211,9 +210,8 @@ class AssertElementShapeTest(test_base.DatasetTestBase):
 
     wrong_shapes = (tensor_shape.TensorShape(2),
                     tensor_shape.TensorShape((None, 10)))
-    iterator = (
-        dataset.apply(batching.assert_element_shape(wrong_shapes))
-        .make_initializable_iterator())
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset.apply(batching.assert_element_shape(wrong_shapes)))
     init_op = iterator.initializer
     get_next = iterator.get_next()
     with self.cached_session() as sess:
diff --git a/tensorflow/contrib/data/python/kernel_tests/lmdb_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/lmdb_dataset_op_test.py
index d2a72272db159755ac2d741bcdbce9ec646d928e..b9840b1ff1a3df5a05db0e64f436637220f49f80 100644
--- a/tensorflow/contrib/data/python/kernel_tests/lmdb_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/lmdb_dataset_op_test.py
@@ -23,6 +23,7 @@ import shutil
 
 from tensorflow.contrib.data.python.ops import readers
 from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -48,7 +49,7 @@ class LMDBDatasetTest(test_base.DatasetTestBase):
     num_repeats = 2
 
     dataset = readers.LMDBDataset(filenames).repeat(num_repeats)
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
diff --git a/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py
index c5a786232252432481566e3cde23e9310df172cc..2527706709fae8e459aca3489324d4db3c784be6 100644
--- a/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py
@@ -63,13 +63,13 @@ class SlideDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     # The pipeline is TensorSliceDataset -> MapDataset(square_3) ->
     # RepeatDataset(count) ->
     # _SlideDataset(window_size, window_shift, window_stride).
-    iterator = (
+    iterator = dataset_ops.make_initializable_iterator(
         dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
         .repeat(count).apply(
             sliding.sliding_window_batch(
                 window_size=window_size_t,
                 window_shift=window_shift_t,
-                window_stride=window_stride_t)).make_initializable_iterator())
+                window_stride=window_stride_t)))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -127,13 +127,13 @@ class SlideDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
 
     # The pipeline is TensorSliceDataset -> MapDataset(square_3) ->
     # RepeatDataset(count) -> _SlideDataset(window_size, stride, window_stride).
-    iterator = (
+    iterator = dataset_ops.make_initializable_iterator(
         dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
         .repeat(count).apply(
             sliding.sliding_window_batch(
                 window_size=window_size_t,
                 stride=stride_t,
-                window_stride=window_stride_t)).make_initializable_iterator())
+                window_stride=window_stride_t)))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -173,12 +173,12 @@ class SlideDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     window_shift_t = array_ops.placeholder(dtypes.int64, shape=[])
     window_stride_t = array_ops.placeholder(dtypes.int64, shape=[])
 
-    iterator = (
+    iterator = dataset_ops.make_initializable_iterator(
         dataset_ops.Dataset.range(10).map(lambda x: x).repeat(count_t).apply(
             sliding.sliding_window_batch(
                 window_size=window_size_t,
                 window_shift=window_shift_t,
-                window_stride=window_stride_t)).make_initializable_iterator())
+                window_stride=window_stride_t)))
     init_op = iterator.initializer
 
     with self.cached_session() as sess:
@@ -204,9 +204,9 @@ class SlideDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       return sparse_tensor.SparseTensorValue(
           indices=[[0]], values=(i * [1]), dense_shape=[1])
 
-    iterator = dataset_ops.Dataset.range(10).map(_sparse).apply(
-        sliding.sliding_window_batch(
-            window_size=5, window_shift=3)).make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.range(10).map(_sparse).apply(
+            sliding.sliding_window_batch(window_size=5, window_shift=3)))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -233,9 +233,9 @@ class SlideDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
           values=array_ops.fill([math_ops.to_int32(i)], i),
           dense_shape=[i])
 
-    iterator = dataset_ops.Dataset.range(10).map(_sparse).apply(
-        sliding.sliding_window_batch(
-            window_size=5, window_shift=3)).make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.range(10).map(_sparse).apply(
+            sliding.sliding_window_batch(window_size=5, window_shift=3)))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -265,11 +265,10 @@ class SlideDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       return sparse_tensor.SparseTensorValue(
           indices=[[0]], values=(i * [1]), dense_shape=[1])
 
-    iterator = (
+    iterator = dataset_ops.make_initializable_iterator(
         dataset_ops.Dataset.range(10).map(_sparse).apply(
             sliding.sliding_window_batch(window_size=4, window_shift=2)).apply(
-                sliding.sliding_window_batch(window_size=3, window_shift=1))
-        .make_initializable_iterator())
+                sliding.sliding_window_batch(window_size=3, window_shift=1)))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -305,11 +304,10 @@ class SlideDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       yield [4.0, 5.0, 6.0]
       yield [7.0, 8.0, 9.0, 10.0]
 
-    iterator = (
+    iterator = dataset_ops.make_initializable_iterator(
         dataset_ops.Dataset.from_generator(
             generator, dtypes.float32, output_shapes=[None]).apply(
-                sliding.sliding_window_batch(window_size=3, window_shift=1))
-        .make_initializable_iterator())
+                sliding.sliding_window_batch(window_size=3, window_shift=1)))
     next_element = iterator.get_next()
 
     with self.cached_session() as sess:
diff --git a/tensorflow/contrib/data/python/ops/BUILD b/tensorflow/contrib/data/python/ops/BUILD
index 34dc2379d0cb38f8f6962fa42efe21b793bc8d65..0fb406f1167053a128646c5c692986b0ce016f1e 100644
--- a/tensorflow/contrib/data/python/ops/BUILD
+++ b/tensorflow/contrib/data/python/ops/BUILD
@@ -188,8 +188,7 @@ py_library(
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:function",
         "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/util:nest",
-        "//tensorflow/python/data/util:sparse",
+        "//tensorflow/python/data/util:structure",
     ],
 )
 
diff --git a/tensorflow/contrib/data/python/ops/readers.py b/tensorflow/contrib/data/python/ops/readers.py
index 4601376dff47e161962e92678883039c4b88bab7..c0152156a1ba70297adb7054622b15ca04f859cd 100644
--- a/tensorflow/contrib/data/python/ops/readers.py
+++ b/tensorflow/contrib/data/python/ops/readers.py
@@ -21,10 +21,9 @@ from tensorflow.python.data.experimental.ops import optimization
 from tensorflow.python.data.experimental.ops import readers
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import readers as core_readers
-from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import structure
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import gen_experimental_dataset_ops
 from tensorflow.python.util import deprecation
 
@@ -355,7 +354,7 @@ def read_batch_features(file_pattern,
       shuffle=randomize_input,
       num_epochs=num_epochs,
       shuffle_buffer_size=capacity)
-  iterator = dataset.make_one_shot_iterator()
+  iterator = dataset_ops.make_one_shot_iterator(dataset)
   outputs = iterator.get_next()
   return outputs
 
@@ -379,15 +378,13 @@ class LMDBDataset(dataset_ops.DatasetSource):
     (key value) pairs sequentially.
     For example:
     ```python
+    tf.enable_eager_execution()
+
     dataset = tf.contrib.lmdb.LMDBDataset("/foo/bar.mdb")
-    iterator = dataset.make_one_shot_iterator()
-    next_element = iterator.get_next()
+
     # Prints the (key, value) pairs inside a lmdb file.
-    while True:
-      try:
-        print(sess.run(next_element))
-      except tf.errors.OutOfRangeError:
-        break
+    for key, value in dataset:
+      print(key, value)
     ```
     Args:
       filenames: A `tf.string` tensor containing one or more filenames.
@@ -398,18 +395,10 @@ class LMDBDataset(dataset_ops.DatasetSource):
 
   def _as_variant_tensor(self):
     return gen_experimental_dataset_ops.experimental_lmdb_dataset(
-        self._filenames,
-        output_types=nest.flatten(self.output_types),
-        output_shapes=nest.flatten(self.output_shapes))
-
-  @property
-  def output_classes(self):
-    return ops.Tensor, ops.Tensor
-
-  @property
-  def output_shapes(self):
-    return (tensor_shape.TensorShape([]), tensor_shape.TensorShape([]))
+        self._filenames, **dataset_ops.flat_structure(self))
 
   @property
-  def output_types(self):
-    return dtypes.string, dtypes.string
+  def _element_structure(self):
+    return structure.NestedStructure(
+        (structure.TensorStructure(dtypes.string, []),
+         structure.TensorStructure(dtypes.string, [])))
diff --git a/tensorflow/contrib/data/python/ops/sliding.py b/tensorflow/contrib/data/python/ops/sliding.py
index bcc383587c54bd89502313f9328bc06c49046a87..5c6ee6bfdc7167d14b292f8f763adafca4e3a72c 100644
--- a/tensorflow/contrib/data/python/ops/sliding.py
+++ b/tensorflow/contrib/data/python/ops/sliding.py
@@ -18,11 +18,10 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import structure
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
 from tensorflow.python.util import deprecation
 
 
@@ -40,8 +39,13 @@ class _SlideDataset(dataset_ops.UnaryDataset):
     self._window_shift = ops.convert_to_tensor(
         window_shift, dtype=dtypes.int64, name="window_shift")
 
+    input_structure = structure.convert_legacy_structure(
+        input_dataset.output_types, input_dataset.output_shapes,
+        input_dataset.output_classes)
+    self._structure = input_structure._batch(None)  # pylint: disable=protected-access
+
   def _as_variant_tensor(self):
-    return gen_dataset_ops.slide_dataset(
+    return ged_ops.experimental_sliding_window_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         window_size=self._window_size,
         window_shift=self._window_shift,
@@ -49,20 +53,8 @@ class _SlideDataset(dataset_ops.UnaryDataset):
         **dataset_ops.flat_structure(self))
 
   @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    input_shapes = self._input_dataset.output_shapes
-    return nest.pack_sequence_as(input_shapes, [
-        tensor_shape.vector(None).concatenate(s)
-        for s in nest.flatten(self._input_dataset.output_shapes)
-    ])
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
+  def _element_structure(self):
+    return self._structure
 
 
 @deprecation.deprecated_args(
diff --git a/tensorflow/contrib/distribute/python/BUILD b/tensorflow/contrib/distribute/python/BUILD
index 249258def3c4e52604b63764d8a7b5f238b45daa..4c9c35da5a36aa8149d15c8d1c25e4dfaa6a07c1 100644
--- a/tensorflow/contrib/distribute/python/BUILD
+++ b/tensorflow/contrib/distribute/python/BUILD
@@ -591,6 +591,7 @@ py_library(
         "//tensorflow/contrib/distribute/python:tpu_strategy",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:training",
+        "//tensorflow/python/eager:test",
         "//tensorflow/python/estimator:estimator_py",
         "//tensorflow/python/keras",
         "//third_party/py/numpy",
diff --git a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py
index 6e9f9facd0a209146d1ad8d101f0b8c41d77752a..346513dc586f208315fd777dc7ddfa500c82f0d7 100644
--- a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py
+++ b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py
@@ -67,30 +67,31 @@ class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended):
   def __init__(self, container_strategy, num_gpus_per_worker):
     distribute_lib.DistributionStrategyExtended.__init__(
         self, container_strategy)
+    self._cross_device_ops = None
     self._num_gpus_per_worker = num_gpus_per_worker
-    self._initialize_local_worker(container_strategy, num_gpus_per_worker)
+    self._initialize_local_worker(num_gpus_per_worker)
+    assert isinstance(self._get_cross_device_ops(),
+                      cross_device_ops_lib.CollectiveAllReduce)
 
-  def _initialize_local_worker(self, container_strategy, num_gpus_per_worker):
+  def _initialize_local_worker(self, num_gpus_per_worker):
     """Initializes the object for local training."""
     self._is_chief = True
     self._num_workers = 1
 
     if num_gpus_per_worker:
-      local_devices = [
+      local_devices = tuple(
           "/device:GPU:%d" % i for i in range(num_gpus_per_worker)
-      ]
+      )
     else:
-      local_devices = ["/device:CPU:0"]
+      local_devices = ("/device:CPU:0",)
     self._worker_device = device_util.canonicalize("/device:CPU:0")
 
     self._collective_keys = cross_device_utils.CollectiveKeys()
-    super(CollectiveAllReduceExtended, self).__init__(
-        container_strategy,
-        devices=local_devices,
-        cross_device_ops=cross_device_ops_lib.CollectiveAllReduce(
-            num_workers=1,
-            num_gpus_per_worker=num_gpus_per_worker,
-            collective_keys=self._collective_keys))
+    self._initialize_local(local_devices)
+    self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
+        num_workers=self._num_workers,
+        num_gpus_per_worker=num_gpus_per_worker,
+        collective_keys=self._collective_keys)
 
     self._cluster_spec = None
     self._task_type = None
@@ -99,13 +100,13 @@ class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended):
     logging.info("CollectiveAllReduceStrategy with local_devices = %r",
                  local_devices)
 
-  def _initialize_multi_worker(self, container_strategy, num_gpus_per_worker,
-                               cluster_spec, task_type, task_id):
+  def _initialize_multi_worker(self, num_gpus_per_worker, cluster_spec,
+                               task_type, task_id):
     """Initializes the object for multi-worker training."""
     if task_type is None or task_id is None:
       raise ValueError("When `cluster_spec` is given, you must also specify "
                        "`task_type` and `task_id`")
-    if task_type not in ["chief", "worker"]:
+    if task_type not in ("chief", "worker"):
       raise ValueError(
           "Unrecognized task_type: %r, valid task types are: \"chief\", "
           "\"worker\"." % task_type)
@@ -120,21 +121,19 @@ class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended):
 
     self._worker_device = "/job:%s/task:%d" % (task_type, task_id)
     if num_gpus_per_worker:
-      local_devices = [
+      local_devices = tuple(
           "%s/device:GPU:%d" % (self._worker_device, i)
           for i in range(num_gpus_per_worker)
-      ]
+      )
     else:
-      local_devices = [self._worker_device]
+      local_devices = (self._worker_device,)
 
     self._collective_keys = cross_device_utils.CollectiveKeys()
-    super(CollectiveAllReduceExtended, self).__init__(
-        container_strategy,
-        devices=local_devices,
-        cross_device_ops=cross_device_ops_lib.CollectiveAllReduce(
-            num_workers=self._num_workers,
-            num_gpus_per_worker=num_gpus_per_worker,
-            collective_keys=self._collective_keys))
+    self._initialize_local(local_devices)
+    self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
+        num_workers=self._num_workers,
+        num_gpus_per_worker=num_gpus_per_worker,
+        collective_keys=self._collective_keys)
 
     # Add a default device so that ops without specified devices will not end up
     # on other workers.
@@ -268,9 +267,10 @@ class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended):
       # If a `cluster_spec` is already passed in, do nothing here.
       # TODO(yuefengz): check `cluster_spec` is the same if this object has
       # already been initialized with a `cluster_spec`.
-      self._initialize_multi_worker(
-          self._container_strategy(), self._num_gpus_per_worker, cluster_spec,
-          task_type, task_id)
+      self._initialize_multi_worker(self._num_gpus_per_worker, cluster_spec,
+                                    task_type, task_id)
+      assert isinstance(self._get_cross_device_ops(),
+                        cross_device_ops_lib.CollectiveAllReduce)
 
     if session_config:
       session_config.CopyFrom(self._update_config_proto(session_config))
diff --git a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py
index eba3585a55375ee1db561a459e079256c53a85cc..6d7cd14ed5ad8a283e3d0d3405efc58fe670f9cd 100644
--- a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py
+++ b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py
@@ -82,7 +82,8 @@ class CollectiveAllReduceStrategyTestBase(
         instance_key_with_id_start=num_gpus * 10000 +
         CollectiveAllReduceStrategyTestBase.collective_key_base)
     distribution.extended._collective_keys = collective_keys
-    distribution.extended._cross_device_ops._collective_keys = collective_keys
+    distribution.extended._cross_device_ops._collective_keys = (
+        collective_keys)
     if task_type and task_id is not None:
       return distribution, 'grpc://' + self._cluster_spec[task_type][
           task_id], session_config
diff --git a/tensorflow/contrib/distribute/python/combinations.py b/tensorflow/contrib/distribute/python/combinations.py
index c5ce29a43632918be555db865891fdbb5d22e941..365ce5cdec79f1914f0c9ccdf59a7dc59e6f819e 100644
--- a/tensorflow/contrib/distribute/python/combinations.py
+++ b/tensorflow/contrib/distribute/python/combinations.py
@@ -192,7 +192,7 @@ def _augment_with_special_arguments(test_method):
         kwargs_to_pass[arg] = kwargs[arg]
 
     if mode == "eager":
-      with ops.Graph().as_default(), context.eager_mode():
+      with context.eager_mode():
         if distribution:
           kwargs_to_pass["distribution"] = distribution.strategy
         test_method(**kwargs_to_pass)
diff --git a/tensorflow/contrib/distribute/python/cross_device_ops_test.py b/tensorflow/contrib/distribute/python/cross_device_ops_test.py
index 3602cc92094ff607187f19e9e1c0ebde45aa6787..d6e9521c1c1115ffdbdcf375ad4017bacb962832 100644
--- a/tensorflow/contrib/distribute/python/cross_device_ops_test.py
+++ b/tensorflow/contrib/distribute/python/cross_device_ops_test.py
@@ -392,18 +392,16 @@ class MultiWorkerCrossDeviceOpsTest(multi_worker_test_base.MultiWorkerTestBase,
           # pylint: disable=g-long-lambda
           combinations.NamedDistribution(
               "CoreMirroredCPU",
-              lambda: mirrored_strategy.CoreMirroredStrategy(
-                  num_gpus_per_worker=0),
+              lambda: mirrored_strategy.CoreMirroredStrategy(["/device:CPU:0"]),
               required_gpus=0),
           combinations.NamedDistribution(
               "CoreMirrored1GPU",
-              lambda: mirrored_strategy.CoreMirroredStrategy(
-                  num_gpus_per_worker=1),
+              lambda: mirrored_strategy.CoreMirroredStrategy(["/device:GPU:0"]),
               required_gpus=1),
           combinations.NamedDistribution(
               "CoreMirrored2GPUs",
               lambda: mirrored_strategy.CoreMirroredStrategy(
-                  num_gpus_per_worker=2),
+                  ["/device:GPU:0", "/device:GPU:1"]),
               required_gpus=2),
       ],
       mode=["graph"])
diff --git a/tensorflow/contrib/distribute/python/estimator_training_test.py b/tensorflow/contrib/distribute/python/estimator_training_test.py
index 0f35657a8099523b6ba5b8f0a1a2f289c06b531a..3f55a8a1c8b88d1b8e4031547fa3fbe519983630 100644
--- a/tensorflow/contrib/distribute/python/estimator_training_test.py
+++ b/tensorflow/contrib/distribute/python/estimator_training_test.py
@@ -24,7 +24,6 @@ import json
 import os
 import sys
 import tempfile
-import threading
 from absl.testing import parameterized
 import numpy as np
 
@@ -70,57 +69,19 @@ PS = dc._TaskType.PS
 original_run_std_server = dc._run_std_server
 
 
-class MockOsEnv(dict):
-
-  def __init__(self, *args):
-    self._thread_local = threading.local()
-    super(MockOsEnv, self).__init__(*args)
-
-  def get(self, key, default):
-    if not hasattr(self._thread_local, "dict"):
-      self._thread_local.dict = dict()
-    if key == "TF_CONFIG":
-      return dict.get(self._thread_local.dict, key, default)
-    else:
-      return dict.get(self, key, default)
-
-  def __getitem__(self, key):
-    if not hasattr(self._thread_local, "dict"):
-      self._thread_local.dict = dict()
-    if key == "TF_CONFIG":
-      return dict.__getitem__(self._thread_local.dict, key)
-    else:
-      return dict.__getitem__(self, key)
-
-  def __setitem__(self, key, val):
-    if not hasattr(self._thread_local, "dict"):
-      self._thread_local.dict = dict()
-    if key == "TF_CONFIG":
-      return dict.__setitem__(self._thread_local.dict, key, val)
-    else:
-      return dict.__setitem__(self, key, val)
-
-
-class DistributeCoordinatorIntegrationTest(test.TestCase,
-                                           parameterized.TestCase):
+class DistributeCoordinatorIntegrationTest(
+    multi_worker_test_base.IndependentWorkerTestBase, parameterized.TestCase):
 
   @classmethod
   def setUpClass(cls):
     """Create a local cluster with 2 workers."""
+    super(DistributeCoordinatorIntegrationTest, cls).setUpClass()
     cls._cluster_spec = multi_worker_test_base.create_in_process_cluster(
         num_workers=3, num_ps=2, has_eval=True)
 
   def setUp(self):
     self._model_dir = tempfile.mkdtemp()
-    self._mock_os_env = MockOsEnv()
-    self._mock_context = test.mock.patch.object(os, "environ",
-                                                self._mock_os_env)
     super(DistributeCoordinatorIntegrationTest, self).setUp()
-    self._mock_context.__enter__()
-
-  def tearDown(self):
-    self._mock_context.__exit__(None, None, None)
-    super(DistributeCoordinatorIntegrationTest, self).tearDown()
 
   def dataset_input_fn(self, x, y, batch_size, shuffle):
 
@@ -143,8 +104,8 @@ class DistributeCoordinatorIntegrationTest(test.TestCase,
   def _extract_loss_and_global_step(self, event_folder):
     """Returns the loss and global step in last event."""
     event_paths = glob.glob(os.path.join(event_folder, "events*"))
-    self.assertGreater(len(event_paths), 0,
-                       msg="Event file not found in dir %s" % event_folder)
+    self.assertNotEmpty(
+        event_paths, msg="Event file not found in dir %s" % event_folder)
 
     loss = None
     global_step_count = None
@@ -287,6 +248,12 @@ class DistributeCoordinatorIntegrationTest(test.TestCase,
     ])
     self.assertAllEqual((BATCH_SIZE, LABEL_DIMENSION), predicted_proba.shape)
 
+  def _get_strategy_object(self, strategy_cls):
+    if strategy_cls == mirrored_strategy.CoreMirroredStrategy:
+      return strategy_cls(mirrored_strategy.all_local_devices())
+    else:
+      return strategy_cls(num_gpus_per_worker=context.num_gpus())
+
   @combinations.generate(
       combinations.combine(
           mode=["graph"],
@@ -305,12 +272,10 @@ class DistributeCoordinatorIntegrationTest(test.TestCase,
           required_gpus=[0, 1]))
   def test_complete_flow_standalone_client(self, train_distribute_cls,
                                            eval_distribute_cls):
-    train_distribute = train_distribute_cls(
-        num_gpus_per_worker=context.num_gpus())
+    train_distribute = self._get_strategy_object(train_distribute_cls)
 
     if eval_distribute_cls:
-      eval_distribute = eval_distribute_cls(
-          num_gpus_per_worker=context.num_gpus())
+      eval_distribute = self._get_strategy_object(eval_distribute_cls)
     else:
       eval_distribute = None
 
@@ -337,12 +302,10 @@ class DistributeCoordinatorIntegrationTest(test.TestCase,
           required_gpus=[0, 1]))
   def test_estimator_standalone_client(self, train_distribute_cls,
                                        eval_distribute_cls):
-    train_distribute = train_distribute_cls(
-        num_gpus_per_worker=context.num_gpus())
+    train_distribute = self._get_strategy_object(train_distribute_cls)
 
     if eval_distribute_cls:
-      eval_distribute = eval_distribute_cls(
-          num_gpus_per_worker=context.num_gpus())
+      eval_distribute = self._get_strategy_object(eval_distribute_cls)
     else:
       eval_distribute = None
 
@@ -362,47 +325,15 @@ class DistributeCoordinatorIntegrationTest(test.TestCase,
     self._barrier.wait()
     return ret
 
-  def _task_thread(self, train_distribute, eval_distribute, tf_config):
-    os.environ["TF_CONFIG"] = json.dumps(tf_config)
+  def _independent_worker_fn(
+      self,
+      train_distribute,
+      eval_distribute,
+  ):
     with test.mock.patch.object(dc, "_run_std_server",
                                 self._mock_run_std_server):
       self._complete_flow(train_distribute, eval_distribute)
 
-  def _run_task_in_thread(self, cluster_spec, task_type, task_id,
-                          train_distribute, eval_distribute):
-    if task_type:
-      tf_config = {
-          "cluster": cluster_spec,
-          "task": {
-              "type": task_type,
-              "index": task_id
-          }
-      }
-    else:
-      tf_config = {
-          "cluster": cluster_spec,
-          "task": {
-              "type": task_type,
-              "index": task_id
-          }
-      }
-    t = threading.Thread(
-        target=self._task_thread,
-        args=(train_distribute, eval_distribute, tf_config))
-    t.start()
-    return t
-
-  def _run_multiple_tasks_in_threads(self, cluster_spec, train_distribute,
-                                     eval_distribute):
-    threads = {}
-    for task_type in cluster_spec.keys():
-      threads[task_type] = []
-      for task_id in range(len(cluster_spec[task_type])):
-        t = self._run_task_in_thread(cluster_spec, task_type, task_id,
-                                     train_distribute, eval_distribute)
-        threads[task_type].append(t)
-    return threads
-
   @combinations.generate(
       combinations.combine(
           mode=["graph"],
@@ -418,16 +349,14 @@ class DistributeCoordinatorIntegrationTest(test.TestCase,
           required_gpus=[0, 1]))
   def test_complete_flow_indepedent_worker_between_graph(
       self, train_distribute_cls, eval_distribute_cls):
-    train_distribute = train_distribute_cls(
-        num_gpus_per_worker=context.num_gpus())
-
     if (context.num_gpus() < 2 and eval_distribute_cls ==
         collective_all_reduce_strategy.CollectiveAllReduceStrategy):
       self.skipTest("`CollectiveAllReduceStrategy` needs at least two towers.")
 
+    train_distribute = self._get_strategy_object(train_distribute_cls)
+
     if eval_distribute_cls:
-      eval_distribute = eval_distribute_cls(
-          num_gpus_per_worker=context.num_gpus())
+      eval_distribute = self._get_strategy_object(eval_distribute_cls)
     else:
       eval_distribute = None
 
@@ -443,13 +372,16 @@ class DistributeCoordinatorIntegrationTest(test.TestCase,
       # 3 workers and 1 evaluator.
       self._barrier = dc._Barrier(4)
 
-    threads = self._run_multiple_tasks_in_threads(
-        cluster_spec, train_distribute, eval_distribute)
+    threads = self.run_multiple_tasks_in_threads(self._independent_worker_fn,
+                                                 cluster_spec, train_distribute,
+                                                 eval_distribute)
+    threads_to_join = []
     for task_type, ts in threads.items():
       if task_type == PS:
         continue
       for t in ts:
-        t.join()
+        threads_to_join.append(t)
+    self.join_independent_workers(threads_to_join)
 
     estimator = self._get_estimator(train_distribute, eval_distribute)
     self._inspect_train_and_eval_events(estimator)
@@ -469,12 +401,10 @@ class DistributeCoordinatorIntegrationTest(test.TestCase,
           required_gpus=[0, 1]))
   def test_complete_flow_indepedent_worker_in_graph(self, train_distribute_cls,
                                                     eval_distribute_cls):
-    train_distribute = train_distribute_cls(
-        num_gpus_per_worker=context.num_gpus())
+    train_distribute = self._get_strategy_object(train_distribute_cls)
 
     if eval_distribute_cls:
-      eval_distribute = eval_distribute_cls(
-          num_gpus_per_worker=context.num_gpus())
+      eval_distribute = self._get_strategy_object(eval_distribute_cls)
     else:
       eval_distribute = None
 
@@ -482,10 +412,10 @@ class DistributeCoordinatorIntegrationTest(test.TestCase,
         num_workers=3, num_ps=0, has_eval=True)
     # 3 workers and 1 evaluator.
     self._barrier = dc._Barrier(4)
-    threads = self._run_multiple_tasks_in_threads(
-        cluster_spec, train_distribute, eval_distribute)
-    threads[WORKER][0].join()
-    threads[EVALUATOR][0].join()
+    threads = self.run_multiple_tasks_in_threads(self._independent_worker_fn,
+                                                 cluster_spec, train_distribute,
+                                                 eval_distribute)
+    self.join_independent_workers([threads[WORKER][0], threads[EVALUATOR][0]])
 
     estimator = self._get_estimator(train_distribute, eval_distribute)
     self._inspect_train_and_eval_events(estimator)
@@ -522,7 +452,7 @@ class RunConfigTest(test.TestCase):
       run_config_lib.RunConfig(
           experimental_distribute=DistributeConfig(
               train_distribute=mirrored_strategy.CoreMirroredStrategy(
-                  num_gpus_per_worker=2)))
+                  ["/device:GPU:0", "/device:GPU:1"])))
 
   def test_should_run_distribute_coordinator(self):
     """Tests that should_run_distribute_coordinator return a correct value."""
@@ -546,11 +476,11 @@ class RunConfigTest(test.TestCase):
       config_with_train_distribute = run_config_lib.RunConfig(
           experimental_distribute=DistributeConfig(
               train_distribute=mirrored_strategy.CoreMirroredStrategy(
-                  num_gpus_per_worker=2)))
+                  ["/device:GPU:0", "/device:GPU:1"])))
       config_with_eval_distribute = run_config_lib.RunConfig(
           experimental_distribute=DistributeConfig(
               eval_distribute=mirrored_strategy.CoreMirroredStrategy(
-                  num_gpus_per_worker=2)))
+                  ["/device:GPU:0", "/device:GPU:1"])))
     self.assertTrue(
         dc_training.should_run_distribute_coordinator(
             config_with_train_distribute))
@@ -564,7 +494,7 @@ class RunConfigTest(test.TestCase):
       config = run_config_lib.RunConfig(
           experimental_distribute=DistributeConfig(
               train_distribute=mirrored_strategy.CoreMirroredStrategy(
-                  num_gpus_per_worker=2)))
+                  ["/device:GPU:0", "/device:GPU:1"])))
     self.assertFalse(dc_training.should_run_distribute_coordinator(config))
 
   def test_init_run_config_duplicate_distribute(self):
diff --git a/tensorflow/contrib/distribute/python/examples/keras_mnist.py b/tensorflow/contrib/distribute/python/examples/keras_mnist.py
index 8b6487252df54dc18cc0763fb1c58a190faad88a..60fda996642464135fe1fb8c314bcf7f04d19362 100644
--- a/tensorflow/contrib/distribute/python/examples/keras_mnist.py
+++ b/tensorflow/contrib/distribute/python/examples/keras_mnist.py
@@ -20,6 +20,10 @@ from __future__ import print_function
 import tensorflow as tf
 
 
+from tensorflow.python.distribute import mirrored_strategy
+from tensorflow.python.keras.optimizer_v2 import rmsprop
+
+
 NUM_CLASSES = 10
 
 
@@ -109,10 +113,10 @@ def main(_):
 
   # Instantiate the MirroredStrategy object. If we don't specify `num_gpus` or
   # the `devices` argument then all the GPUs available on the machine are used.
-  strategy = tf.contrib.distribute.MirroredStrategy(['/gpu:0', '/cpu:0'])
+  # TODO(priyag): Use `tf.distribute.MirroredStrategy` once available.
+  strategy = mirrored_strategy.MirroredStrategy(['/gpu:0', '/cpu:0'])
 
-  # TODO(priyag): Use RMSPropOptimizer when it works with eager mode.
-  optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
+  optimizer = rmsprop.RMSProp(learning_rate=0.001)
 
   # Compile the model by passing the distribution strategy object to the
   # `distribute` argument. `fit`, `evaluate` and `predict` will be distributed
diff --git a/tensorflow/contrib/distribute/python/keras_optimizer_v2_test.py b/tensorflow/contrib/distribute/python/keras_optimizer_v2_test.py
index 6dfd85bcc4f3784e2744fd876a7190cc9581d96a..8c596549c4e20754675f69861d4c7f14f7c3c126 100644
--- a/tensorflow/contrib/distribute/python/keras_optimizer_v2_test.py
+++ b/tensorflow/contrib/distribute/python/keras_optimizer_v2_test.py
@@ -18,24 +18,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import shutil
-import tempfile
 from absl.testing import parameterized
 import numpy as np
-import six
 
 from tensorflow.contrib.distribute.python import combinations
-from tensorflow.core.protobuf import config_pb2
 from tensorflow.python import keras
-from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.distribute import distribution_strategy_context as ds_context
-from tensorflow.python.estimator import run_config
-from tensorflow.python.estimator import training
-from tensorflow.python.estimator.canned import dnn_linear_combined
-from tensorflow.python.estimator.canned import prediction_keys
-from tensorflow.python.estimator.export import export
-from tensorflow.python.estimator.inputs import numpy_io
-from tensorflow.python.feature_column import feature_column_lib as feature_column
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -44,103 +32,7 @@ from tensorflow.python.keras.optimizer_v2 import gradient_descent
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
-from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
-from tensorflow.python.summary.writer import writer_cache
-
-
-class KerasOptimizerV2IntegrationTest(test.TestCase, parameterized.TestCase):
-
-  def setUp(self):
-    self._model_dir = tempfile.mkdtemp()
-
-  def dataset_input_fn(self, x, y, batch_size):
-
-    def input_fn():
-      dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
-      dataset = dataset.repeat(1).batch(batch_size)
-      return dataset
-
-    return input_fn
-
-  @combinations.generate(
-      combinations.combine(
-          mode=['graph'],
-          distribution=[
-              combinations.one_device_strategy,
-              combinations.mirrored_strategy_with_gpu_and_cpu,
-              combinations.mirrored_strategy_with_two_gpus,
-              combinations.core_mirrored_strategy_with_gpu_and_cpu,
-              combinations.core_mirrored_strategy_with_two_gpus
-          ],
-          use_train_and_evaluate=[True, False]))
-  def test_complete_flow_with_mode(self, distribution, use_train_and_evaluate):
-    label_dimension = 2
-    input_dimension = label_dimension
-    batch_size = 10
-    data = np.linspace(0., 2., batch_size * label_dimension, dtype=np.float32)
-    data = data.reshape(batch_size, label_dimension)
-    train_input_fn = self.dataset_input_fn(
-        x={'x': data},
-        y=data,
-        batch_size=batch_size // distribution.num_replicas_in_sync)
-    eval_input_fn = self.dataset_input_fn(
-        x={'x': data},
-        y=data,
-        batch_size=batch_size // distribution.num_replicas_in_sync)
-    predict_input_fn = numpy_io.numpy_input_fn(
-        x={'x': data}, batch_size=batch_size, shuffle=False)
-
-    linear_feature_columns = [
-        feature_column.numeric_column('x', shape=(input_dimension,))
-    ]
-    dnn_feature_columns = [
-        feature_column.numeric_column('x', shape=(input_dimension,))
-    ]
-    feature_columns = linear_feature_columns + dnn_feature_columns
-    session_config = config_pb2.ConfigProto(
-        log_device_placement=True, allow_soft_placement=True)
-    estimator = dnn_linear_combined.DNNLinearCombinedRegressor(
-        linear_feature_columns=linear_feature_columns,
-        dnn_hidden_units=(2, 2),
-        dnn_feature_columns=dnn_feature_columns,
-        label_dimension=label_dimension,
-        model_dir=self._model_dir,
-        dnn_optimizer=adam.Adam(0.001),
-        linear_optimizer=adam.Adam(0.001),
-        config=run_config.RunConfig(
-            train_distribute=distribution,
-            eval_distribute=distribution,
-            session_config=session_config))
-
-    num_steps = 2
-    if use_train_and_evaluate:
-      scores, _ = training.train_and_evaluate(
-          estimator, training.TrainSpec(train_input_fn, max_steps=num_steps),
-          training.EvalSpec(eval_input_fn))
-    else:
-      estimator.train(train_input_fn, steps=num_steps)
-      scores = estimator.evaluate(eval_input_fn)
-
-    self.assertIn('loss', six.iterkeys(scores))
-
-    predictions = np.array([
-        x[prediction_keys.PredictionKeys.PREDICTIONS]
-        for x in estimator.predict(predict_input_fn)
-    ])
-    self.assertAllEqual((batch_size, label_dimension), predictions.shape)
-
-    feature_spec = feature_column.make_parse_example_spec(feature_columns)
-    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
-        feature_spec)
-    export_dir = estimator.export_savedmodel(tempfile.mkdtemp(),
-                                             serving_input_receiver_fn)
-    self.assertTrue(gfile.Exists(export_dir))
-
-  def tearDown(self):
-    if self._model_dir:
-      writer_cache.FileWriterCache.clear()
-      shutil.rmtree(self._model_dir)
 
 
 def get_model():
@@ -162,7 +54,9 @@ class MirroredStrategyOptimizerV2Test(test.TestCase, parameterized.TestCase):
       var = variables.Variable(
           2.0, name='var', aggregation=variable_scope.VariableAggregation.SUM)
       # grad for cpu is 1, grad for gpu is 2, avg grad is 1.5.
-      loss = math_ops.cast(_replica_id() + 1, dtype=dtypes.float32) * var
+      def loss():
+        return math_ops.cast(_replica_id() + 1, dtype=dtypes.float32) * var
+
       optimizer = adam.Adam(learning_rate=0.01, beta_1=0.2, beta_2=0.2)
       train_op = optimizer.minimize(loss, var_list=[var])
       m = optimizer.get_slot(var, 'm')
diff --git a/tensorflow/contrib/distribute/python/keras_test.py b/tensorflow/contrib/distribute/python/keras_test.py
index 1d002819745f1959b535ffa534be8f1a6b93b31d..c53e76f922372d8c7937e05fde61772d0b064674 100644
--- a/tensorflow/contrib/distribute/python/keras_test.py
+++ b/tensorflow/contrib/distribute/python/keras_test.py
@@ -165,7 +165,9 @@ def get_multi_inputs_multi_outputs_data():
   return (train_data, test_data)
 
 
-def batch_wrapper(dataset, batch_size, distribution):
+def batch_wrapper(dataset, batch_size, distribution, repeat=None):
+  if repeat:
+    dataset = dataset.repeat(repeat)
   # TPUs currently require fully defined input shapes, drop_remainder ensures
   # the input will have fully defined shapes.
   if isinstance(distribution, tpu_strategy.TPUStrategy):
@@ -212,9 +214,11 @@ def multi_input_output_model():
   return model
 
 
-def get_correctness_test_inputs(use_numpy, with_distribution,
+def get_correctness_test_inputs(use_numpy, use_validation_data,
+                                with_distribution,
                                 x_train, y_train, x_predict):
   """Generates the inputs for correctness check when enable Keras with DS."""
+  training_epochs = 2
   global_batch_size = 64
   batch_size = global_batch_size
   # TODO(b/118776054): Use global batch size for Keras/DS support.
@@ -230,14 +234,19 @@ def get_correctness_test_inputs(use_numpy, with_distribution,
         'batch_size': batch_size,
         'x': x_train,
         'y': y_train,
-        'epochs': 1,
+        'epochs': training_epochs,
         'shuffle': False,
     }
-    eval_inputs = {
-        'batch_size': batch_size,
-        'x': x_train,
-        'y': y_train,
-    }
+
+    if use_validation_data:
+      eval_inputs = None
+      training_inputs['validation_data'] = (x_train, y_train)
+    else:
+      eval_inputs = {
+          'batch_size': batch_size,
+          'x': x_train,
+          'y': y_train,
+      }
     predict_inputs = {
         'x': np.array(x_predict, dtype=np.float32),
     }
@@ -246,22 +255,32 @@ def get_correctness_test_inputs(use_numpy, with_distribution,
     # keras.fit/evaluate/predict. The batch size is part of the dataset.
     train_dataset = dataset_ops.Dataset.from_tensor_slices(
         (x_train, y_train))
-    x = batch_wrapper(train_dataset, batch_size, with_distribution)
+    x = batch_wrapper(
+        train_dataset, batch_size, with_distribution, repeat=training_epochs)
 
     training_inputs = {
         'batch_size': None,
         'x': x,
         'y': None,
-        'epochs': 1,
+        'epochs': training_epochs,
         'shuffle': False,
         'steps_per_epoch': len(x_train) // global_batch_size,
     }
-    eval_inputs = {
-        'batch_size': None,
-        'x': x,
-        'y': None,
-        'steps': 20,
-    }
+    if use_validation_data:
+      eval_inputs = None  # Remove the eval_inputs
+      eval_dataset = dataset_ops.Dataset.from_tensor_slices(
+          (x_train, y_train))
+      x = batch_wrapper(eval_dataset, batch_size, with_distribution)
+      training_inputs['validation_data'] = x
+      training_inputs['validation_steps'] = 5
+    else:
+      eval_inputs = {
+          'batch_size': None,
+          'x': x,
+          'y': None,
+          'steps': 20,
+      }
+
     predict_batch_size = len(x_predict)
     if use_per_core_batch_size:
       predict_batch_size //= with_distribution.num_replicas_in_sync
@@ -276,47 +295,66 @@ def get_correctness_test_inputs(use_numpy, with_distribution,
   return training_inputs, eval_inputs, predict_inputs
 
 
-strategies = [combinations.default_strategy,
-              combinations.one_device_strategy,
-              combinations.mirrored_strategy_with_gpu_and_cpu,
-              combinations.mirrored_strategy_with_two_gpus,
-              combinations.core_mirrored_strategy_with_gpu_and_cpu,
-              combinations.core_mirrored_strategy_with_two_gpus,
-              combinations.tpu_strategy,  # steps_per_run=2
-              combinations.tpu_strategy_one_step]
+strategies_minus_tpu = [
+    combinations.default_strategy,
+    combinations.one_device_strategy,
+    combinations.mirrored_strategy_with_gpu_and_cpu,
+    combinations.mirrored_strategy_with_two_gpus,
+    combinations.core_mirrored_strategy_with_gpu_and_cpu,
+    combinations.core_mirrored_strategy_with_two_gpus]
+
+tpu_strategies = [
+    combinations.tpu_strategy,  # steps_per_run=2
+    combinations.tpu_strategy_one_step]
 
 
 def strategy_minus_tpu_combinations():
   return combinations.combine(
-      distribution=[combinations.default_strategy,
-                    combinations.one_device_strategy,
-                    combinations.mirrored_strategy_with_gpu_and_cpu,
-                    combinations.mirrored_strategy_with_two_gpus,
-                    combinations.core_mirrored_strategy_with_gpu_and_cpu,
-                    combinations.core_mirrored_strategy_with_two_gpus],
-      mode=['graph'])
+      distribution=strategies_minus_tpu,
+      mode=['graph', 'eager'])
 
 
-def strategy_combinations():
+def tpu_strategy_combinations():
   return combinations.combine(
-      distribution=strategies,
+      distribution=tpu_strategies,
       mode=['graph'])
 
 
-def strategy_and_optimizer_combinations():
-  return combinations.combine(
-      distribution=strategies,
-      optimizer=[combinations.adagrad_optimizer_v1_fn,
-                 combinations.adam_optimizer_v1_fn,
-                 combinations.gradient_descent_optimizer_v1_fn,
-                 combinations.rmsprop_optimizer_v1_fn],
-      mode=['graph'])
+def all_strategy_combinations():
+  return strategy_minus_tpu_combinations() + tpu_strategy_combinations()
 
 
-def strategy_and_inputs():
+# TODO(priyag): Add v2 optimizers here.
+def strategy_and_optimizer_combinations():
+  return combinations.times(
+      all_strategy_combinations(),
+      combinations.combine(
+          optimizer=[combinations.adagrad_optimizer_v1_fn,
+                     combinations.adam_optimizer_v1_fn,
+                     combinations.gradient_descent_optimizer_v1_fn,
+                     combinations.rmsprop_optimizer_v1_fn]))
+
+
+def strategy_and_input_combinations():
+  return (
+      combinations.times(
+          combinations.combine(distribution=strategies_minus_tpu),
+          combinations.combine(mode=['graph'],
+                               use_numpy=[True, False],
+                               use_validation_data=[True, False])
+          + combinations.combine(mode=['eager'],
+                                 use_numpy=[False],
+                                 use_validation_data=[False])) +
+      combinations.times(
+          combinations.combine(distribution=tpu_strategies),
+          combinations.combine(mode=['graph'],
+                               use_numpy=[True, False],
+                               use_validation_data=[True, False])))
+
+
+def strategy_for_numpy_input_combinations():
   return combinations.combine(
-      distribution=strategies,
-      use_numpy=[True, False],
+      distribution=strategies_minus_tpu + tpu_strategies,
       mode=['graph'])
 
 
@@ -337,7 +375,9 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase,
 
   @combinations.generate(combinations.combine(
       distribution=[
+          combinations.mirrored_strategy_with_gpu_and_cpu,
           combinations.mirrored_strategy_with_two_gpus,
+          combinations.core_mirrored_strategy_with_gpu_and_cpu,
           combinations.core_mirrored_strategy_with_two_gpus],
       mode=['graph']))
   def test_train_functional_with_distribution_strategy(self, distribution):
@@ -365,7 +405,9 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase,
 
   @combinations.generate(combinations.combine(
       distribution=[
+          combinations.mirrored_strategy_with_gpu_and_cpu,
           combinations.mirrored_strategy_with_two_gpus,
+          combinations.core_mirrored_strategy_with_gpu_and_cpu,
           combinations.core_mirrored_strategy_with_two_gpus],
       mode=['graph']))
   def test_train_sequential_with_distribution_strategy(self, distribution):
@@ -392,8 +434,8 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase,
 
   @combinations.generate(combinations.combine(
       distribution=[
-          combinations.mirrored_strategy_with_two_gpus,
-          combinations.core_mirrored_strategy_with_two_gpus],
+          combinations.mirrored_strategy_with_gpu_and_cpu,
+          combinations.core_mirrored_strategy_with_gpu_and_cpu],
       mode=['graph']))
   def test_multi_inputs_multi_outputs_with_input_fn_as_dict(self, distribution):
     train_data, test_data = get_multi_inputs_multi_outputs_data()
@@ -444,8 +486,8 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase,
 
   @combinations.generate(combinations.combine(
       distribution=[
-          combinations.mirrored_strategy_with_two_gpus,
-          combinations.core_mirrored_strategy_with_two_gpus],
+          combinations.mirrored_strategy_with_gpu_and_cpu,
+          combinations.core_mirrored_strategy_with_gpu_and_cpu],
       mode=['graph']))
   def test_keras_optimizer_with_distribution_strategy(self, distribution):
     keras_model = simple_sequential_model()
@@ -471,7 +513,7 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase,
 class TestDistributionStrategyWithNumpyArrays(test.TestCase,
                                               parameterized.TestCase):
 
-  @combinations.generate(strategy_combinations())
+  @combinations.generate(strategy_for_numpy_input_combinations())
   def test_creating_var_with_numpy_arrays(self, distribution):
     with self.cached_session():
       x = np.asarray(np.random.random((64, 3)), dtype=np.float32)
@@ -480,7 +522,7 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase,
       # Verify that the numpy value is copied to the variable.
       self.assertAllEqual(x, val)
 
-  @combinations.generate(strategy_combinations())
+  @combinations.generate(strategy_for_numpy_input_combinations())
   def test_calculating_input_params_no_steps_no_batch_size(self, distribution):
     # Calculate the per_replica_batch_size scaling factor for strategies
     # that use per_core_batch_size
@@ -511,7 +553,7 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase,
         distributed_training_utils.get_input_params(
             distribution, input_63_samples, steps=None, batch_size=None)
 
-  @combinations.generate(strategy_combinations())
+  @combinations.generate(strategy_for_numpy_input_combinations())
   def test_calculating_input_params_with_steps_no_batch_size(self,
                                                              distribution):
     # Calculate the per_replica_batch_size scaling factor for strategies
@@ -557,7 +599,7 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase,
           distributed_training_utils.get_input_params(
               distribution, input_63_samples, steps=1, batch_size=None)
 
-  @combinations.generate(strategy_combinations())
+  @combinations.generate(strategy_for_numpy_input_combinations())
   def test_calculating_input_params_no_steps_with_batch_size(self,
                                                              distribution):
     # Calculate the per_replica_batch_size scaling factor for strategies
@@ -591,7 +633,7 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase,
         distributed_training_utils.get_input_params(
             distribution, input_64_samples, steps=None, batch_size=3)
 
-  @combinations.generate(strategy_combinations())
+  @combinations.generate(strategy_for_numpy_input_combinations())
   def test_calculating_input_params_with_steps_with_batch_size(self,
                                                                distribution):
     with self.cached_session():
@@ -608,7 +650,7 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase,
         distributed_training_utils.get_input_params(
             distribution, input_64_samples, steps=10, batch_size=13)
 
-  @combinations.generate(strategy_combinations())
+  @combinations.generate(strategy_for_numpy_input_combinations())
   def test_calling_model_with_numpy_arrays(self, distribution):
     with self.cached_session():
       model = get_model()
@@ -639,7 +681,7 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase,
       # with batch_size
       model.predict(inputs, batch_size=8)
 
-  @combinations.generate(strategy_combinations())
+  @combinations.generate(strategy_for_numpy_input_combinations())
   def test_calling_model_with_nested_numpy_arrays(self, distribution):
     with self.cached_session():
       model = multi_input_output_model()
@@ -673,7 +715,8 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase,
       # with batch_size
       model.predict(inputs, batch_size=8)
 
-  @combinations.generate(strategy_minus_tpu_combinations())
+  @combinations.generate(combinations.combine(
+      distribution=strategies_minus_tpu, mode=['graph']))
   def test_numpy_with_sample_weights(self, distribution):
     model = get_model()
     optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001)
@@ -687,7 +730,7 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase,
     model.fit(inputs, targets, sample_weight=sample_weights, epochs=1,
               steps_per_epoch=2, verbose=1)
 
-  @combinations.generate(strategy_combinations())
+  @combinations.generate(strategy_for_numpy_input_combinations())
   def test_flatten_predict_outputs(self, distribution):
     with self.cached_session():
       model = multi_input_output_model()
@@ -715,7 +758,7 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase,
 class TestDistributionStrategyWithDatasets(test.TestCase,
                                            parameterized.TestCase):
 
-  @combinations.generate(strategy_combinations())
+  @combinations.generate(all_strategy_combinations())
   def test_calling_model_on_same_dataset(self, distribution):
     with self.cached_session():
       model = get_model()
@@ -734,7 +777,7 @@ class TestDistributionStrategyWithDatasets(test.TestCase,
                 validation_data=dataset, validation_steps=2)
       model.predict(get_predict_dataset(distribution), steps=2)
 
-  @combinations.generate(strategy_combinations())
+  @combinations.generate(all_strategy_combinations())
   def test_model_interleaved_eval_same_as_direct_eval(self, distribution):
     with self.cached_session():
       user_controlled_model = get_model()
@@ -782,7 +825,7 @@ class TestDistributionStrategyWithDatasets(test.TestCase,
       distribution=[
           combinations.mirrored_strategy_with_gpu_and_cpu,
           combinations.core_mirrored_strategy_with_gpu_and_cpu],
-      mode=['graph']))
+      mode=['graph', 'eager']))
   def test_fit_with_tuple_and_dict_dataset_inputs(self, distribution):
     with self.cached_session():
       model = multi_input_output_model()
@@ -814,7 +857,7 @@ class TestDistributionStrategyWithDatasets(test.TestCase,
 
       model.fit(dataset_dict, epochs=1, steps_per_epoch=2, verbose=1)
 
-  @combinations.generate(strategy_combinations())
+  @combinations.generate(all_strategy_combinations())
   def test_fit_eval_and_predict_methods_on_dataset(self, distribution):
     with self.cached_session():
       model = get_model()
@@ -865,10 +908,12 @@ class TestDistributionStrategyWithDatasets(test.TestCase,
 
   @combinations.generate(combinations.combine(
       distribution=[
-          combinations.mirrored_strategy_with_two_gpus,
-          combinations.core_mirrored_strategy_with_two_gpus],
-      mode=['graph']))
-  def test_dataset_wrong_input_shape(self, distribution):
+          combinations.mirrored_strategy_with_gpu_and_cpu,
+          combinations.core_mirrored_strategy_with_gpu_and_cpu],
+      mode=['graph', 'eager']))
+  # TODO(b/120943676, b/120957836): Re-enable once the validation code is
+  # restored.
+  def DISABLED_test_dataset_wrong_input_shape(self, distribution):
     with self.cached_session():
       model = get_model()
 
@@ -888,9 +933,11 @@ class TestDistributionStrategyWithDatasets(test.TestCase,
         model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0)
 
   @combinations.generate(combinations.combine(
-      distribution=[combinations.mirrored_strategy_with_two_gpus],
-      mode=['graph']))
-  def test_dataset_no_batch_input_validation(self, distribution):
+      distribution=[combinations.mirrored_strategy_with_gpu_and_cpu],
+      mode=['graph', 'eager']))
+  # TODO(b/120943676, b/120957836): Re-enable once the validation code is
+  # restored.
+  def DISABLED_test_dataset_no_batch_input_validation(self, distribution):
     with self.cached_session():
       model = get_model()
 
@@ -928,9 +975,11 @@ class TestDistributionStrategyWithDatasets(test.TestCase,
 
   @combinations.generate(combinations.combine(
       distribution=[
+          combinations.mirrored_strategy_with_gpu_and_cpu,
           combinations.mirrored_strategy_with_two_gpus,
+          combinations.core_mirrored_strategy_with_gpu_and_cpu,
           combinations.core_mirrored_strategy_with_two_gpus],
-      mode=['graph']))
+      mode=['graph', 'eager']))
   def test_learning_phase_value(self, distribution):
     # TODO(anjalisridhar): Modify this test to use Lambdas since we can compare
     # meaningful values. Currently we don't pass the learning phase if the
@@ -1002,7 +1051,7 @@ class TestDistributionStrategyErrorCases(test.TestCase, parameterized.TestCase):
       distribution=[
           combinations.mirrored_strategy_with_gpu_and_cpu,
           combinations.core_mirrored_strategy_with_gpu_and_cpu],
-      mode=['graph']))
+      mode=['graph', 'eager']))
   def test_validating_dataset_input_tensors_with_shape_mismatch(self,
                                                                 distribution):
     with self.cached_session():
@@ -1025,7 +1074,7 @@ class TestDistributionStrategyErrorCases(test.TestCase, parameterized.TestCase):
       distribution=[
           combinations.mirrored_strategy_with_gpu_and_cpu,
           combinations.core_mirrored_strategy_with_gpu_and_cpu],
-      mode=['graph']))
+      mode=['graph', 'eager']))
   def test_validating_dataset_input_tensors_with_dtype_mismatch(self,
                                                                 distribution):
     with self.cached_session():
@@ -1046,9 +1095,9 @@ class TestDistributionStrategyErrorCases(test.TestCase, parameterized.TestCase):
 
   @combinations.generate(combinations.combine(
       distribution=[
-          combinations.mirrored_strategy_with_two_gpus,
-          combinations.core_mirrored_strategy_with_two_gpus],
-      mode=['graph']))
+          combinations.mirrored_strategy_with_gpu_and_cpu,
+          combinations.core_mirrored_strategy_with_gpu_and_cpu],
+      mode=['graph', 'eager']))
   def test_unsupported_features(self, distribution):
     with self.cached_session():
       model = get_model()
@@ -1095,9 +1144,9 @@ class TestDistributionStrategyErrorCases(test.TestCase, parameterized.TestCase):
 
   @combinations.generate(combinations.combine(
       distribution=[
-          combinations.mirrored_strategy_with_two_gpus,
-          combinations.core_mirrored_strategy_with_two_gpus],
-      mode=['graph']))
+          combinations.mirrored_strategy_with_gpu_and_cpu,
+          combinations.core_mirrored_strategy_with_gpu_and_cpu],
+      mode=['graph', 'eager']))
   def test_calling_with_unsupported_predefined_callbacks(self, distribution):
     with self.cached_session():
       model = get_model()
@@ -1122,12 +1171,6 @@ class TestDistributionStrategyErrorCases(test.TestCase, parameterized.TestCase):
                                    'using'):
         model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
                   callbacks=[keras.callbacks.ReduceLROnPlateau()])
-      with self.assertRaisesRegexp(ValueError,
-                                   'histogram_freq in the TensorBoard callback '
-                                   'is not supported when using '
-                                   'DistributionStrategy.'):
-        model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
-                  callbacks=[keras.callbacks.TensorBoard(histogram_freq=10)])
 
 
 class TestDistributionStrategyWithLossMasking(test.TestCase,
@@ -1137,9 +1180,9 @@ class TestDistributionStrategyWithLossMasking(test.TestCase,
   # work for TPU due to some invalid datatype.
   @combinations.generate(combinations.combine(
       distribution=[
-          combinations.mirrored_strategy_with_two_gpus,
-          combinations.core_mirrored_strategy_with_two_gpus],
-      mode=['graph']))
+          combinations.mirrored_strategy_with_gpu_and_cpu,
+          combinations.core_mirrored_strategy_with_gpu_and_cpu],
+      mode=['graph', 'eager']))
   def test_masking(self, distribution):
     with self.cached_session():
       np.random.seed(1337)
@@ -1163,7 +1206,7 @@ class TestDistributionStrategyWithLossMasking(test.TestCase,
 class TestDistributionStrategyWithNormalizationLayer(
     test.TestCase, parameterized.TestCase):
 
-  @combinations.generate(strategy_combinations())
+  @combinations.generate(all_strategy_combinations())
   def test_batchnorm_correctness(self, distribution):
     with self.cached_session():
       model = keras.models.Sequential()
@@ -1195,7 +1238,7 @@ class TestDistributionStrategyWithNormalizationLayer(
 class TestDistributionStrategyCorrectness(test.TestCase,
                                           parameterized.TestCase):
 
-  @combinations.generate(strategy_combinations())
+  @combinations.generate(all_strategy_combinations())
   def test_metric_correctness(self, distribution):
     with self.cached_session():
       keras.backend.set_image_data_format('channels_last')
@@ -1224,18 +1267,57 @@ class TestDistributionStrategyCorrectness(test.TestCase,
       train_dataset = dataset_ops.Dataset.from_tensor_slices((x_train, y_train))
       train_dataset = batch_wrapper(train_dataset, batch_size, distribution)
 
-      history = model.fit(x=train_dataset, epochs=1, steps_per_epoch=10)
-      self.assertEqual(history.history['binary_accuracy'], [1.0])
+      history = model.fit(x=train_dataset, epochs=2, steps_per_epoch=10)
+      self.assertEqual(history.history['binary_accuracy'], [1.0, 1.0])
 
-  @combinations.generate(strategy_and_inputs())
-  def test_correctness(self, distribution, use_numpy):
+  @combinations.generate(all_strategy_combinations())
+  def test_eval_metrics_correctness(self, distribution):
     with self.cached_session():
-      tolerance = 1e-5
+      model = keras.Sequential()
+      model.add(
+          keras.layers.Dense(
+              3, activation='relu', input_dim=4, kernel_initializer='ones'))
+      model.add(
+          keras.layers.Dense(
+              1, activation='sigmoid', kernel_initializer='ones'))
+      model.compile(
+          loss='mae',
+          metrics=['accuracy', keras.metrics.BinaryAccuracy()],
+          optimizer=gradient_descent.GradientDescentOptimizer(0.001),
+          distribute=distribution)
+
+      # verify correctness of stateful and stateless metrics.
+      x = np.ones((100, 4)).astype('float32')
+      y = np.ones((100, 1)).astype('float32')
+      dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).repeat()
+      dataset = batch_wrapper(dataset, 4, distribution)
+      outs = model.evaluate(dataset, steps=10)
+      self.assertEqual(outs[1], 1.)
+      self.assertEqual(outs[2], 1.)
+
+      y = np.zeros((100, 1)).astype('float32')
+      dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).repeat()
+      dataset = batch_wrapper(dataset, 4, distribution)
+      outs = model.evaluate(dataset, steps=10)
+      self.assertEqual(outs[1], 0.)
+      self.assertEqual(outs[2], 0.)
+
+  @combinations.generate(strategy_and_input_combinations())
+  def test_correctness(self, distribution, use_numpy, use_validation_data):
+
+    with self.cached_session():
+      default_tolerance = 1e-5
+      tol_table = {}
 
       if isinstance(distribution, (mirrored_strategy.MirroredStrategy,
                                    mirrored_strategy.CoreMirroredStrategy)):
-        # TODO(b/119257215): use the default one once the flakyness is fixed.
-        tolerance = 1e-4
+        # TODO(b/119257215): Weights are not exactly the same, so use larger
+        # tolerance for now. Predict should be related to weights.
+        tol_table = {
+            'weights_1': 1e-4,
+            'weights_2': 1e-4,
+            'predict_result_1': 1e-4,
+        }
 
       keras.backend.set_image_data_format('channels_last')
       np.random.seed(_RANDOM_SEED)
@@ -1256,54 +1338,75 @@ class TestDistributionStrategyCorrectness(test.TestCase,
       # This is used to initialize the model for both the distribution and
       # non-distribution run. In addition, we add few non-linear layers to make
       # it non-trivial.
-      model = keras.Sequential()
-      model.add(keras.layers.Dense(10, activation='relu', input_shape=(1,)))
-      model.add(keras.layers.Dense(10, activation='relu'))
-      model.add(keras.layers.Dense(10, activation='relu'))
-      model.add(keras.layers.Dense(1))
+      def _create_model():
+        model = keras.Sequential()
+        model.add(keras.layers.Dense(10, activation='relu', input_shape=(1,)))
+        model.add(keras.layers.Dense(10, activation='relu'))
+        model.add(keras.layers.Dense(10, activation='relu'))
+        model.add(keras.layers.Dense(1))
+        return model
+
+      model = _create_model()
       initial_weights = model.get_weights()
+      del model  # avoid accident usage.
 
       def fit_eval_and_predict(with_distribution=None):
+        model = _create_model()
         # We have initialized the model to the same weight for the distribution
         # and non-distribution run.
         model.set_weights(initial_weights)
-        # TODO(b/120245072): Also use gradient_descent_keras.SGD for
-        # TPUStrategy.
-        # pylint: disable=line-too-long
-        if with_distribution and with_distribution.__class__.__name__ == 'TPUStrategy':
-        # pylint: enable=line-too-long
-          optimizer = gradient_descent.GradientDescentOptimizer(0.5)
-        else:
-          optimizer = gradient_descent_keras.SGD(0.5)
         model.compile(
             loss=keras.losses.mean_squared_error,
-            optimizer=optimizer,
+            optimizer=gradient_descent_keras.SGD(0.5),
+            metrics=['mse'],
             distribute=with_distribution)
 
         training_inputs, eval_inputs, predict_inputs = (
-            get_correctness_test_inputs(use_numpy, with_distribution,
+            get_correctness_test_inputs(use_numpy, use_validation_data,
+                                        with_distribution,
                                         x_train, y_train, x_predict))
 
-        model.fit(**training_inputs)
-        eval_result = model.evaluate(**eval_inputs)
-        weights = model.get_weights()
-        predict_result = model.predict(**predict_inputs)
-
-        return weights, eval_result, predict_result
-
-      wts_with_ds, eval_with_ds, predict_with_ds = fit_eval_and_predict(
-          with_distribution=distribution)
-      wts_without_ds, eval_without_ds, predict_without_ds = (
-          fit_eval_and_predict(with_distribution=None))
-
-      # Verify that the weights, eval results, predict outputs  are the same
-      # within some limits of tolerance.
-      self.assertAllClose(
-          wts_with_ds, wts_without_ds, atol=tolerance, rtol=tolerance)
-      self.assertAllClose(
-          eval_with_ds, eval_without_ds, atol=tolerance, rtol=tolerance)
-      self.assertAllClose(
-          predict_with_ds, predict_without_ds, atol=tolerance, rtol=tolerance)
+        result = {}
+        result['training_history_1'] = model.fit(**training_inputs).history
+
+        if eval_inputs is not None:
+          result['eval_result_1'] = model.evaluate(**eval_inputs)
+
+        result['weights_1'] = model.get_weights()
+        result['predict_result_1'] = model.predict(**predict_inputs)
+
+        # Train and eval again to mimic user's flow.
+
+        result['training_history_2'] = model.fit(**training_inputs).history
+
+        if eval_inputs is not None:
+          result['eval_result_2'] = model.evaluate(**eval_inputs)
+
+        result['weights_2'] = model.get_weights()
+
+        return result
+
+      results_with_ds = fit_eval_and_predict(with_distribution=distribution)
+      results_without_ds = fit_eval_and_predict(with_distribution=None)
+
+      # Verify that the weights, training history, eval results, predict outputs
+      # are the same within some limits of tolerance.
+      for key in results_with_ds:
+        if (key.startswith('training_history') and
+            isinstance(distribution, tpu_strategy.TPUStrategy) and
+            distribution.extended.steps_per_run > 1):
+          # TODO(b/119894254): Enable this test for all cases once the
+          # underlying bug is fixed.
+          continue
+
+        tolerance = tol_table.get(key, default_tolerance)
+
+        self.assertAllClose(
+            results_with_ds[key],
+            results_without_ds[key],
+            atol=tolerance,
+            rtol=tolerance,
+            msg='Fail to assert {}.'.format(key))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/distribute/python/minimize_loss_test.py b/tensorflow/contrib/distribute/python/minimize_loss_test.py
index dcc9df4cda51b87e95fb166a726170a8817715fc..f09483cb56b66fd4720ee71085203c14f1ccadc3 100644
--- a/tensorflow/contrib/distribute/python/minimize_loss_test.py
+++ b/tensorflow/contrib/distribute/python/minimize_loss_test.py
@@ -232,7 +232,7 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
         fetches = distribution.unwrap(
             distribution.call_for_each_replica(model_fn, args=inputs))
         if update_ops_in_cross_replica_mode:
-          fetches += ops.get_collection(ops.GraphKeys.UPDATE_OPS)
+          fetches += tuple(ops.get_collection(ops.GraphKeys.UPDATE_OPS))
         return control_flow_ops.group(fetches)
 
       iterator = self._get_iterator(distribution.distribute_dataset(dataset_fn))
@@ -443,7 +443,7 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
             step_fn, iterator, iterations=2,
             initial_loop_values=initial_loop_values)
 
-        self.assertEqual({key1: [value1]}, ctx.non_tensor_outputs)
+        self.assertEqual({key1: (value1,)}, ctx.non_tensor_outputs)
         self._verify_loss_output(
             initial_loss(),
             loss_output=ctx.last_step_outputs["replica_loss_reduced"],
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy.py b/tensorflow/contrib/distribute/python/mirrored_strategy.py
index 4a594f056e96a2a48563d9902bdeed8458b847e4..24399db6522c325722b95399fd002eed9fd955f2 100644
--- a/tensorflow/contrib/distribute/python/mirrored_strategy.py
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy.py
@@ -28,8 +28,8 @@ from tensorflow.python.distribute import values
 
 # pylint: disable=protected-access,invalid-name
 _call_for_each_replica = mirrored_strategy._call_for_each_replica
-_reduce_non_distributed_value = mirrored_strategy._reduce_non_distributed_value
 _create_mirrored_variable = mirrored_strategy._create_mirrored_variable
+all_local_devices = mirrored_strategy.all_local_devices
 CoreMirroredStrategy = mirrored_strategy.MirroredStrategy
 CoreMirroredExtended = mirrored_strategy.MirroredExtended
 # pylint: enable=protected-access,invalid-name
@@ -115,8 +115,13 @@ class MirroredExtended(CoreMirroredExtended):
                num_gpus_per_worker=None,
                cross_device_ops=None,
                auto_shard_dataset=False):
-    super(MirroredExtended, self).__init__(
-        container_strategy, devices, num_gpus_per_worker, cross_device_ops)
+    if devices is None:
+      devices = mirrored_strategy.all_local_devices(num_gpus_per_worker)
+    elif num_gpus_per_worker is not None:
+      raise ValueError(
+          "Must only specify one of `devices` and `num_gpus_per_worker`.")
+    super(MirroredExtended, self).__init__(container_strategy, devices,
+                                           cross_device_ops)
     self._auto_shard_dataset = auto_shard_dataset
 
   def _make_dataset_iterator(self, dataset):
@@ -131,22 +136,22 @@ class MirroredExtended(CoreMirroredExtended):
     Returns:
       An `InputIterator` which returns inputs for each step of the computation.
     """
-    if self._cluster_spec:
-      worker_device_pairs = self._worker_devices
-    else:
+    if self._local_mode:
       worker = device_util.canonicalize("/device:CPU:0")
       worker_device_pairs = [(worker, self._devices)]
+    else:
+      worker_device_pairs = self._worker_devices
     return values.DatasetIterator(dataset, worker_device_pairs)
 
   def _distribute_dataset(self, dataset_fn):
-    if self._cluster_spec:
+    if self._local_mode:
+      return values.PerReplicaDataset(
+          self._call_dataset_fn(dataset_fn), self._devices)
+    else:
       return values.MultiWorkerDataset(
           functools.partial(self._call_dataset_fn, dataset_fn),
           self._worker_devices,
           auto_shard=self._auto_shard_dataset)
-    else:
-      return values.PerReplicaDataset(
-          self._call_dataset_fn(dataset_fn), self._devices)
 
   # TODO(priyag): Delete this once all strategies use global batch size.
   @property
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
index fee37daa424b8ada9f18b2046599a62647d8c33d..337a86b3421fdb90c98cd5097dd880fdbe5871b9 100644
--- a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
@@ -38,6 +38,7 @@ from tensorflow.python.eager import context
 from tensorflow.python.eager import function
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import func_graph
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.keras.engine import training as keras_training
@@ -179,9 +180,37 @@ class MirroredStrategyVariableCreatorStackTest(
         variable_scope.variable_creator_scope(main_thread_creator):
       result = distribution.extended.call_for_each_replica(model_fn)
       result = distribution.unwrap(result)
-      expected = ["main_thread:thread_0", "main_thread:thread_1"]
+      expected = ("main_thread:thread_0", "main_thread:thread_1")
       self.assertEqual(expected, result)
 
+@combinations.generate(combinations.combine(
+    distribution=[
+        combinations.mirrored_strategy_with_gpu_and_cpu,
+        combinations.core_mirrored_strategy_with_gpu_and_cpu],
+    mode=["graph", "eager"]))
+class MirroredStrategyCallForEachReplicaTest(test.TestCase):
+
+  def testExecutingEagerlyOutsideFunction(self, distribution):
+    """Verify we preserve the value of executing_eagerly_outside_functions()."""
+    def model_fn():
+      return ops.executing_eagerly_outside_functions()
+
+    originally = ops.executing_eagerly_outside_functions()
+    with distribution.scope():
+      in_scope = ops.executing_eagerly_outside_functions()
+      in_model_fn = distribution.extended.call_for_each_replica(model_fn)
+      unwrapped = distribution.unwrap(in_model_fn)
+      self.assertEqual(in_scope, unwrapped[0])
+      self.assertEqual(in_scope, originally)
+
+    # Verify this all again, but this time in a FuncGraph.
+    with func_graph.FuncGraph("fg").as_default(), distribution.scope():
+      in_scope = ops.executing_eagerly_outside_functions()
+      in_model_fn = distribution.extended.call_for_each_replica(model_fn)
+      unwrapped = distribution.unwrap(in_model_fn)
+      self.assertEqual(in_scope, unwrapped[0])
+      self.assertEqual(in_scope, originally)
+
 
 @combinations.generate(combinations.combine(
     distribution=[
@@ -190,6 +219,27 @@ class MirroredStrategyVariableCreatorStackTest(
     mode=["graph", "eager"]))
 class MirroredStrategyVariableCreationTest(test.TestCase):
 
+  # TODO(priyag): Modify more tests to use this helper and check more
+  # properties.
+  def _test_mv_properties(self, var, name):
+    self.assertIsInstance(var, values.MirroredVariable)
+    self.assertEqual(name, var.name)
+    for d in var.devices:
+      self.assertEqual(d, var.get(d).device)
+
+  def testVariableInFuncGraph(self, distribution):
+    def model_fn():
+      v = variable_scope.variable(2.0, name="bar")
+      ds_context.get_replica_context().merge_call(lambda _: _)
+      return v
+
+    with func_graph.FuncGraph("fg").as_default(), distribution.scope():
+      v1 = variable_scope.variable(1.0, name="foo")
+      v2 = distribution.extended.call_for_each_replica(model_fn)
+
+    self._test_mv_properties(v1, "foo:0")
+    self._test_mv_properties(v2, "bar:0")
+
   def testSingleVariable(self, distribution):
     def model_fn():
       # This variable should be created only once across the threads because of
@@ -201,8 +251,7 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
 
     with distribution.scope():
       result = distribution.extended.call_for_each_replica(model_fn)
-      self.assertIsInstance(result, values.MirroredVariable)
-      self.assertEqual("foo:0", result.name)
+      self._test_mv_properties(result, "foo:0")
 
   def testUnnamedVariable(self, distribution):
     def model_fn():
@@ -212,9 +261,7 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
 
     with distribution.scope():
       result = distribution.extended.call_for_each_replica(model_fn)
-      self.assertIsInstance(result, values.MirroredVariable)
-      # Default name of "Variable" will be used.
-      self.assertEqual("Variable:0", result.name)
+      self._test_mv_properties(result, "Variable:0")
 
   def testMultipleVariables(self, distribution):
     def model_fn():
@@ -227,8 +274,7 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
     with distribution.scope():
       result = distribution.extended.call_for_each_replica(model_fn)
       for i, v in enumerate(result):
-        self.assertIsInstance(v, values.MirroredVariable)
-        self.assertEqual("foo" + str(i) + ":0", v.name)
+        self._test_mv_properties(v, "foo" + str(i) + ":0")
 
   def testMultipleVariablesWithSameCanonicalName(self, distribution):
     def model_fn():
@@ -757,21 +803,23 @@ class MirroredStrategyNameScopeTest(test.TestCase):
       self.assertEqual("c/replica_1:0", c1.name)
 
 
-@combinations.generate(combinations.combine(
-    distribution=[
-        combinations.NamedDistribution(
-            "Mirrored3Devices",
-            # pylint: disable=g-long-lambda
-            lambda: mirrored_strategy.MirroredStrategy(
-                ["/device:GPU:0", "/device:GPU:1", "/device:CPU:0"]),
-            required_gpus=2),
-        combinations.NamedDistribution(
-            "CoreMirrored3Devices",
-            # pylint: disable=g-long-lambda
-            lambda: mirrored_strategy.CoreMirroredStrategy(
-                ["/device:GPU:0", "/device:GPU:1", "/device:CPU:0"]),
-            required_gpus=2)],
-    mode=["graph", "eager"]))
+@combinations.generate(
+    combinations.combine(
+        distribution=[
+            combinations.NamedDistribution(
+                "Mirrored3Devices",
+                # pylint: disable=g-long-lambda
+                lambda: mirrored_strategy.MirroredStrategy(
+                    ["/device:GPU:0", "/device:GPU:1", "/device:CPU:0"]),
+                required_gpus=2),
+            combinations.NamedDistribution(
+                "CoreMirrored3Devices",
+                # pylint: disable=g-long-lambda
+                lambda: mirrored_strategy.CoreMirroredStrategy(
+                    ["/device:GPU:0", "/device:GPU:1", "/device:CPU:0"]),
+                required_gpus=2)
+        ],
+        mode=["graph", "eager"]))
 class MirroredThreeDeviceDistributionTest(
     strategy_test_lib.DistributionTestBase,
     parameterized.TestCase):
@@ -1283,14 +1331,14 @@ class MirroredStrategyDefunTest(test.TestCase):
             combinations.NamedDistribution(
                 "Mirrored",
                 # pylint: disable=g-long-lambda
-                lambda: mirrored_strategy.CoreMirroredStrategy(
-                    num_gpus_per_worker=context.num_gpus()),
+                lambda: mirrored_strategy.MirroredStrategy(num_gpus_per_worker=
+                                                           context.num_gpus()),
                 required_gpus=1),
             combinations.NamedDistribution(
                 "CoreMirrored",
                 # pylint: disable=g-long-lambda
                 lambda: mirrored_strategy.CoreMirroredStrategy(
-                    num_gpus_per_worker=context.num_gpus()),
+                    mirrored_strategy.all_local_devices()),
                 required_gpus=1)
         ],
         mode=["graph"]))
@@ -1374,7 +1422,7 @@ class MultiWorkerMirroredStrategyTestWithChief(
 
   def testMinimizeLossGraphCoreMirroredStrategy(self):
     strategy = mirrored_strategy.CoreMirroredStrategy(
-        num_gpus_per_worker=context.num_gpus())
+        mirrored_strategy.all_local_devices())
     strategy.configure(cluster_spec=self._cluster_spec)
     self._test_minimize_loss_graph(strategy, learning_rate=0.05)
 
diff --git a/tensorflow/contrib/distribute/python/moving_averages_test.py b/tensorflow/contrib/distribute/python/moving_averages_test.py
index c492d8bafc9024ed059f05b92e5466f3702726b9..8f13e9153ea7a951dd722c4549882c97e79b57fe 100644
--- a/tensorflow/contrib/distribute/python/moving_averages_test.py
+++ b/tensorflow/contrib/distribute/python/moving_averages_test.py
@@ -139,6 +139,27 @@ class AssignMovingAveragesTest(test.TestCase, parameterized.TestCase):
            (2.0 * 0.25 + 0.0) / (1.0 * 0.25 + 1.0)],
           var.eval())
 
+  @combinations.generate(all_combinations)
+  def testAssignVariable(self, distribution):
+
+    def replica_fn():
+      var = variables.Variable([10.0, 11.0])
+      # Here we expect to check the case when input value are variable.
+      val = variables.Variable([1., 2.])
+      decay = 0.25
+      assign = moving_averages.assign_moving_average(
+          var, val, decay, zero_debias=False)
+      return var, assign
+
+    with distribution.scope(), self.cached_session() as sess:
+      var, assign = distribution.call_for_each_replica(replica_fn)
+      variables.global_variables_initializer().run()
+      self.assertAllClose([10.0, 11.0], var.eval())
+      sess.run(distribution.unwrap(assign))
+      self.assertAllClose(
+          [10 * 0.25 + 1. * (1 - 0.25), 11 * 0.25 + 2. * (1 - 0.25)],
+          var.eval())
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distribute/python/multi_worker_test_base.py b/tensorflow/contrib/distribute/python/multi_worker_test_base.py
index 8eec3dc0f6ec0676353c7434d203e017b9aab80d..b05aac431f65b4281d9ed9c2fa95c210d55f4008 100644
--- a/tensorflow/contrib/distribute/python/multi_worker_test_base.py
+++ b/tensorflow/contrib/distribute/python/multi_worker_test_base.py
@@ -18,8 +18,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import contextlib
 import copy
+import json
+import os
 import threading
 import numpy as np
 
@@ -37,6 +40,7 @@ from tensorflow.python.client import session
 from tensorflow.python.estimator import run_config
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import coordinator
 from tensorflow.python.training import server_lib
 
 ASSIGNED_PORTS = set()
@@ -271,7 +275,6 @@ class MultiWorkerTestBase(test.TestCase):
 
     return config
 
-
   def _run_client(self, client_fn, task_type, task_id, num_gpus, *args,
                   **kwargs):
     result = client_fn(task_type, task_id, num_gpus, *args, **kwargs)
@@ -303,3 +306,106 @@ class MultiWorkerTestBase(test.TestCase):
     for t in threads:
       t.join()
     self.assertEqual(self._result, len(threads))
+
+
+class MockOsEnv(collections.Mapping):
+  """A class that allows per-thread TF_CONFIG."""
+
+  def __init__(self, *args):
+    self._dict = dict()
+    self._thread_local = threading.local()
+    super(MockOsEnv, self).__init__(*args)
+
+  def get(self, key, default=None):
+    if not hasattr(self._thread_local, 'dict'):
+      self._thread_local.dict = dict()
+    if key == 'TF_CONFIG':
+      return dict.get(self._thread_local.dict, key, default)
+    else:
+      return dict.get(self._dict, key, default)
+
+  def __getitem__(self, key):
+    if not hasattr(self._thread_local, 'dict'):
+      self._thread_local.dict = dict()
+    if key == 'TF_CONFIG':
+      return dict.__getitem__(self._thread_local.dict, key)
+    else:
+      return dict.__getitem__(self._dict, key)
+
+  def __setitem__(self, key, val):
+    if not hasattr(self._thread_local, 'dict'):
+      self._thread_local.dict = dict()
+    if key == 'TF_CONFIG':
+      return dict.__setitem__(self._thread_local.dict, key, val)
+    else:
+      return dict.__setitem__(self._dict, key, val)
+
+  def __iter__(self):
+    if not hasattr(self._thread_local, 'dict'):
+      self._thread_local.dict = dict()
+    for x in self._thread_local.dict.items():
+      yield x
+    for x in self._dict.items():
+      yield x
+
+  def __len__(self):
+    if not hasattr(self._thread_local, 'dict'):
+      self._thread_local.dict = dict()
+    return self._thread_local.dict.__len__() + self._dict.__len__()
+
+
+class IndependentWorkerTestBase(test.TestCase):
+  """Testing infra for independent workers."""
+
+  def setUp(self):
+    self._mock_os_env = MockOsEnv()
+    self._mock_context = test.mock.patch.object(os, 'environ',
+                                                self._mock_os_env)
+    self._coord = coordinator.Coordinator()
+    super(IndependentWorkerTestBase, self).setUp()
+    self._mock_context.__enter__()
+
+  def tearDown(self):
+    self._mock_context.__exit__(None, None, None)
+    super(IndependentWorkerTestBase, self).tearDown()
+
+  def _task_thread(self, task_fn, tf_config, *args, **kwargs):
+    with self._coord.stop_on_exception():
+      os.environ['TF_CONFIG'] = json.dumps(tf_config)
+      task_fn(*args, **kwargs)
+
+  def _run_task_in_thread(self, task_fn, cluster_spec, task_type, task_id,
+                          *args, **kwargs):
+    if task_type:
+      tf_config = {
+          'cluster': cluster_spec,
+          'task': {
+              'type': task_type,
+              'index': task_id
+          }
+      }
+    else:
+      tf_config = {
+          'cluster': cluster_spec,
+      }
+    t = threading.Thread(
+        target=self._task_thread,
+        args=(task_fn, tf_config) + args,
+        kwargs=kwargs)
+    t.start()
+    return t
+
+  def run_multiple_tasks_in_threads(self, task_fn, cluster_spec, *args,
+                                    **kwargs):
+    # The task_fn should create std_server by itself.
+    threads = {}
+    for task_type in cluster_spec.keys():
+      threads[task_type] = []
+      for task_id in range(len(cluster_spec[task_type])):
+        t = self._run_task_in_thread(task_fn, cluster_spec, task_type, task_id,
+                                     *args, **kwargs)
+        threads[task_type].append(t)
+    return threads
+
+  def join_independent_workers(self, worker_threads):
+    self._coord.join(worker_threads)
diff --git a/tensorflow/contrib/distribute/python/one_device_strategy.py b/tensorflow/contrib/distribute/python/one_device_strategy.py
index e322b6acb84c166a885c9aaa3002f331903a5063..fdbfba4e04358451a46b23ef250dc7c534c855a0 100644
--- a/tensorflow/contrib/distribute/python/one_device_strategy.py
+++ b/tensorflow/contrib/distribute/python/one_device_strategy.py
@@ -60,7 +60,7 @@ class OneDeviceExtended(distribute_lib.DistributionStrategyExtended):
     if isinstance(colocate_with, six.string_types):
       with ops.device(colocate_with):
         return next_creator(*args, **kwargs)
-    if (isinstance(colocate_with, list) and len(colocate_with) == 1 and
+    if (isinstance(colocate_with, (list, tuple)) and len(colocate_with) == 1 and
         isinstance(colocate_with[0], six.string_types)):
       with ops.device(colocate_with[0]):
         return next_creator(*args, **kwargs)
@@ -166,7 +166,7 @@ class OneDeviceExtended(distribute_lib.DistributionStrategyExtended):
     return array_ops.identity(replica_local_var)
 
   def _unwrap(self, value):
-    return [value]
+    return (value,)
 
   def value_container(self, value):
     return value
@@ -177,15 +177,15 @@ class OneDeviceExtended(distribute_lib.DistributionStrategyExtended):
 
   @property
   def worker_devices(self):
-    return [self._device]
+    return (self._device,)
 
   @property
   def parameter_devices(self):
-    return [self._device]
+    return (self._device,)
 
   def non_slot_devices(self, var_list):
     del var_list
-    return [self._device]
+    return (self._device,)
 
   @property
   def experimental_should_init(self):
@@ -216,4 +216,4 @@ class _OneDeviceReplicaContext(distribute_lib.ReplicaContext):
 
   @property
   def devices(self):
-    return [self._distribution_strategy.extended.worker_devices[0]]
+    return self._distribution_strategy.extended.worker_devices
diff --git a/tensorflow/contrib/distribute/python/parameter_server_strategy.py b/tensorflow/contrib/distribute/python/parameter_server_strategy.py
index eaeb4d703015fc0762359b24dc23888c01e69111..ca51b07be6601dd615e24137e51c4b34793fdbc0 100644
--- a/tensorflow/contrib/distribute/python/parameter_server_strategy.py
+++ b/tensorflow/contrib/distribute/python/parameter_server_strategy.py
@@ -145,14 +145,14 @@ class ParameterServerExtended(distribute_lib.DistributionStrategyExtended):
     # replica. When there are GPUs, replicate operations on these GPUs.
     # Otherwise, place operations on CPU.
     if num_gpus_per_worker > 0:
-      self._compute_devices = [
+      self._compute_devices = tuple(
           "%s/device:GPU:%d" % (self._worker_device, i)
           for i in range(num_gpus_per_worker)
-      ]
+      )
     else:
-      self._compute_devices = [self._worker_device]
+      self._compute_devices = (self._worker_device,)
 
-    self._compute_devices = list(
+    self._compute_devices = tuple(
         map(device_util.resolve, self._compute_devices))
     self._canonical_compute_device_set = set(self._compute_devices)
 
@@ -176,8 +176,8 @@ class ParameterServerExtended(distribute_lib.DistributionStrategyExtended):
     # The `_parameter_devices` is needed for the `parameter_devices` property
     # and is a list of all variable devices. Here parameter devices are all
     # tasks of the "ps" job.
-    self._parameter_devices = map("/job:ps/task:{}".format,
-                                  range(num_ps_replicas))
+    self._parameter_devices = tuple(map("/job:ps/task:{}".format,
+                                        range(num_ps_replicas)))
 
     # Add a default device so that ops without specified devices will not end up
     # on other workers.
@@ -204,24 +204,24 @@ class ParameterServerExtended(distribute_lib.DistributionStrategyExtended):
     # replica. When there are GPUs, replicate operations on these GPUs.
     # Otherwise, place operations on CPU.
     if num_gpus_per_worker > 0:
-      self._compute_devices = list(
+      self._compute_devices = tuple(
           map("/device:GPU:{}".format, range(num_gpus_per_worker)))
     else:
-      self._compute_devices = [_LOCAL_CPU]
+      self._compute_devices = (_LOCAL_CPU,)
 
-    self._compute_devices = list(
+    self._compute_devices = tuple(
         map(device_util.resolve, self._compute_devices))
     self._canonical_compute_device_set = set(self._compute_devices)
 
     # If there is only one GPU, put everything on that GPU. Otherwise, place
     # variables on CPU.
     if num_gpus_per_worker == 1:
-      assert len(list(self._compute_devices)) == 1
+      assert len(self._compute_devices) == 1
       self._variable_device = _LOCAL_GPU_0
-      self._parameter_devices = [_LOCAL_GPU_0]
+      self._parameter_devices = (_LOCAL_GPU_0,)
     else:
       self._variable_device = _LOCAL_CPU
-      self._parameter_devices = [_LOCAL_CPU]
+      self._parameter_devices = (_LOCAL_CPU,)
 
     self._is_chief = True
     self._cluster_spec = None
@@ -356,7 +356,7 @@ class ParameterServerExtended(distribute_lib.DistributionStrategyExtended):
     self._verify_destinations_not_different_worker(destinations)
     if not isinstance(value, values.DistributedValues):
       # pylint: disable=protected-access
-      return mirrored_strategy._reduce_non_distributed_value(
+      return cross_device_ops_lib.reduce_non_distributed_value(
           self, reduce_op, value, destinations)
     return self._cross_device_ops.reduce(
         reduce_op, value, destinations=destinations)
@@ -417,9 +417,9 @@ class ParameterServerExtended(distribute_lib.DistributionStrategyExtended):
     if isinstance(val, values.DistributedValues):
       # Return in a deterministic order.
       if set(val.devices) == self._canonical_compute_device_set:
-        return [val.get(device=d) for d in self._compute_devices]
-      return [val.get(device=d) for d in sorted(val.devices)]
-    return [val]
+        return tuple(val.get(device=d) for d in self._compute_devices)
+      return tuple(val.get(device=d) for d in sorted(val.devices))
+    return (val,)
 
   def value_container(self, val):
     if (hasattr(val, "_aggregating_container") and
@@ -497,12 +497,11 @@ class ParameterServerExtended(distribute_lib.DistributionStrategyExtended):
 
   @property
   def worker_devices(self):
-    # Make a copy to prevent users from accidentally mutating our copy.
-    return list(self._compute_devices)
+    return self._compute_devices
 
   @property
   def parameter_devices(self):
-    return list(self._parameter_devices)
+    return self._parameter_devices
 
   def non_slot_devices(self, var_list):
     return min(var_list, key=lambda x: x.name)
diff --git a/tensorflow/contrib/distribute/python/strategy_test_lib.py b/tensorflow/contrib/distribute/python/strategy_test_lib.py
index d50b142c5e9ad36522b11a77219140a7b40d9bf6..d441b5af5f6aa41efde2c75d09d9589516c54992 100644
--- a/tensorflow/contrib/distribute/python/strategy_test_lib.py
+++ b/tensorflow/contrib/distribute/python/strategy_test_lib.py
@@ -290,4 +290,4 @@ class DistributionTestBase(test.TestCase):
       self.evaluate(strategy.group(train_ops))
       global_step_tensors = strategy.unwrap(value)
       global_step_values = self.evaluate(global_step_tensors)
-      self.assertEqual([1] * len(global_step_tensors), global_step_values)
+      self.assertEqual((1,) * len(global_step_tensors), global_step_values)
diff --git a/tensorflow/contrib/distribute/python/tpu_strategy.py b/tensorflow/contrib/distribute/python/tpu_strategy.py
index 39ed8f7cf10371c0e8dd70e2bdf53f13e8ce8383..7ea245eb6eb9738bc95e8ac54c1c43de0ddcef7c 100644
--- a/tensorflow/contrib/distribute/python/tpu_strategy.py
+++ b/tensorflow/contrib/distribute/python/tpu_strategy.py
@@ -28,6 +28,8 @@ from tensorflow.contrib.tpu.python.ops import tpu_ops
 from tensorflow.contrib.tpu.python.tpu import tpu
 from tensorflow.contrib.tpu.python.tpu import tpu_system_metadata as tpu_system_metadata_lib
 from tensorflow.contrib.tpu.python.tpu import training_loop
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.client import session as session_lib
 from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib
 from tensorflow.python.distribute import device_util
 from tensorflow.python.distribute import distribute_lib
@@ -43,12 +45,10 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variable_scope as vs
+from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import nest
 
 
-_TPU_INITIALIZE_SYSTEM_COLLECTION = "TPU_STRATEGY_INITIALIZE"
-
-
 def get_tpu_system_metadata(tpu_cluster_resolver):
   """Retrieves TPU system metadata given a TPUClusterResolver."""
   master = tpu_cluster_resolver.master()
@@ -145,6 +145,9 @@ class TPUStrategy(distribute_lib.DistributionStrategy):
 class TPUExtended(distribute_lib.DistributionStrategyExtended):
   """Implementation of TPUStrategy."""
 
+  # Track what TPU devices have been initialized.
+  _initialized_devices = []
+
   def __init__(self, container_strategy, tpu_cluster_resolver, steps_per_run,
                num_cores=None):
     super(TPUExtended, self).__init__(container_strategy)
@@ -159,16 +162,41 @@ class TPUExtended(distribute_lib.DistributionStrategyExtended):
                   if "device:TPU:" in d.name}
     self._device_index = values.PerReplica(device_map)
     self._host_device = self.get_host_cpu_device(0)
-    self._tpu_devices = sorted(device_map.keys())
+    self._tpu_devices = tuple(sorted(device_map.keys()))
     # Only create variables for the number of replicas we're running.
     self._tpu_devices = self._tpu_devices[:self._num_replicas_in_sync]
 
     # TODO(sourabhbajaj): Remove this once performance of running one step
     # at a time is comparable to multiple steps.
     self.steps_per_run = steps_per_run
-
     self._require_static_shapes = True
 
+    # Initialize the TPU devices.
+    self._initialize_tpu()
+
+  def _initialize_tpu(self):
+    """Initialize the TPU devices in a separate session and graph.
+
+    We keep track of all the TPU devices that we're initialized as we should
+    only be running TPU initialize once for the entire process.
+    """
+    master = self._tpu_cluster_resolver.master()
+    # Verify TPU has not already been initialized in this process.
+    if master in TPUExtended._initialized_devices:
+      logging.info("TPU master %s has already been initialized." % master)
+      return
+
+    logging.info("Initializing the TPU system.")
+    session_config = config_pb2.ConfigProto(allow_soft_placement=True)
+    self._configure(session_config)
+    with ops.Graph().as_default():
+      with session_lib.Session(config=session_config, target=master) as sess:
+        sess.run([tpu.initialize_system()])
+    logging.info("Finized initializing TPU system.")
+
+    # Update Strategy state to make sure we can track device initialization.
+    TPUExtended._initialized_devices.append(master)
+
   def _get_enqueue_op_per_host(self, host_id, multi_worker_iterator,
                                input_shapes, iterations):
     """Create an enqueue op for a single host identified using host_id.
@@ -380,22 +408,14 @@ class TPUExtended(distribute_lib.DistributionStrategyExtended):
       # TODO(priyag): Add appopriate call here when eager is supported for TPUs.
       raise NotImplementedError("Eager mode not supported in TPUStrategy.")
     else:
-      # TODO(jhseu): We need this hack because DistributionStrategies must be
-      # pickleable for copy.deepcopy(). Remove when initialize_system goes away.
-      graph = ops.get_default_graph()
-      tpu_init = graph.get_collection(_TPU_INITIALIZE_SYSTEM_COLLECTION)
-      if tpu_init:
-        return tpu_init
-      graph.add_to_collection(_TPU_INITIALIZE_SYSTEM_COLLECTION,
-                              tpu.initialize_system())
-      return graph.get_collection(_TPU_INITIALIZE_SYSTEM_COLLECTION)
+      return []
 
   def _finalize(self):
     if context.executing_eagerly():
       # TODO(priyag): Add appopriate call here when eager is supported for TPUs.
       raise NotImplementedError("Eager mode not supported in TPUStrategy.")
     else:
-      return [tpu.shutdown_system()]
+      return []
 
   def _get_devices_from(self, colocate_with=None):
     # TODO(jhseu): Change this when we support model parallelism.
@@ -445,6 +465,14 @@ class TPUExtended(distribute_lib.DistributionStrategyExtended):
             "Currently only support sum & mean in TPUStrategy.")
       return tpu_ops.cross_replica_sum(value)
 
+    if not isinstance(value, values.DistributedValues):
+      # This function handles reducing values that are not PerReplica or
+      # Mirrored values. For example, the same value could be present on all
+      # replicas in which case `value` would be a single value or value could
+      # be 0.
+      return cross_device_ops_lib.reduce_non_distributed_value(
+          self, reduce_op, value, destinations)
+
     # Validate that the destination is same as the host device
     # Note we don't do this when in replicate context as the reduction is
     # performed on the TPU device itself.
@@ -487,13 +515,13 @@ class TPUExtended(distribute_lib.DistributionStrategyExtended):
   def _unwrap(self, val):
     if isinstance(val, values.DistributedValues):
       # Return in a deterministic order.
-      return [val.get(device=d) for d in sorted(val.devices)]
+      return tuple(val.get(device=d) for d in sorted(val.devices))
     elif isinstance(val, list):
       # TODO(josh11b): We need to remove this case; per device values should
       # be represented using a PerReplica wrapper instead of a list with
       # one entry per device.
-      return val
-    return [val]
+      return tuple(val)
+    return (val,)
 
   def value_container(self, value):
     return value
@@ -599,4 +627,4 @@ class _TPUReplicaContext(distribute_lib.ReplicaContext):
     distribute_lib.require_replica_context(self)
     ds = self._distribution_strategy
     replica_id = tensor_util.constant_value(self._replica_id_in_sync_group)
-    return [ds.extended.worker_devices[replica_id]]
+    return (ds.extended.worker_devices[replica_id],)
diff --git a/tensorflow/contrib/eager/python/evaluator.py b/tensorflow/contrib/eager/python/evaluator.py
index 7949a3f6da293abdd85512209242bae76ab4d816..51443d24829bdc31a41813e0ff50ad7102422112 100644
--- a/tensorflow/contrib/eager/python/evaluator.py
+++ b/tensorflow/contrib/eager/python/evaluator.py
@@ -22,6 +22,7 @@ import six
 
 from tensorflow.contrib.eager.python import datasets
 from tensorflow.contrib.eager.python import metrics
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import context
 from tensorflow.python.eager import function
 from tensorflow.python.framework import errors_impl
@@ -164,8 +165,8 @@ class Evaluator(object):
         self.__call__(example, *args, **kwargs)
       return self.all_metric_results(summary_logdir)
     # Graph construction
-    call_op = self.__call__(dataset.make_one_shot_iterator().get_next(), *args,
-                            **kwargs)
+    call_op = self.__call__(
+        dataset_ops.make_one_shot_iterator(dataset).get_next(), *args, **kwargs)
     init_op = self.init_variables()
     results_op = self.all_metric_results(summary_logdir)
     return (init_op, call_op, results_op)
diff --git a/tensorflow/contrib/eager/python/examples/densenet/BUILD b/tensorflow/contrib/eager/python/examples/densenet/BUILD
index 2dc196f550a10367066730f6f042c4ed69533ec3..e2154fcc5fcf774dcd52285d9442dfd5073a4992 100644
--- a/tensorflow/contrib/eager/python/examples/densenet/BUILD
+++ b/tensorflow/contrib/eager/python/examples/densenet/BUILD
@@ -3,6 +3,7 @@ licenses(["notice"])  # Apache 2.0
 package(default_visibility = ["//tensorflow:internal"])
 
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
+load("//tensorflow:tensorflow.bzl", "py_binary")
 
 py_binary(
     name = "densenet",
diff --git a/tensorflow/contrib/eager/python/examples/densenet/densenet_graph_test.py b/tensorflow/contrib/eager/python/examples/densenet/densenet_graph_test.py
index 4b3cb624bc947a1d1956eff6accb6d4da3bf3b87..24f6b007b526b29157011f3b1e9abdbd50bacc8e 100644
--- a/tensorflow/contrib/eager/python/examples/densenet/densenet_graph_test.py
+++ b/tensorflow/contrib/eager/python/examples/densenet/densenet_graph_test.py
@@ -119,7 +119,8 @@ class DensenetBenchmark(tf.test.Benchmark):
       with tf.Graph().as_default():
         np_images, np_labels = random_batch(batch_size)
         dataset = tf.data.Dataset.from_tensors((np_images, np_labels)).repeat()
-        (images, labels) = dataset.make_one_shot_iterator().get_next()
+        (images, labels) = tf.compat.v1.data.make_one_shot_iterator(
+            dataset).get_next()
 
         model = densenet.DenseNet(self.depth, self.growth_rate, self.num_blocks,
                                   self.output_classes,
diff --git a/tensorflow/contrib/eager/python/examples/gan/mnist_graph_test.py b/tensorflow/contrib/eager/python/examples/gan/mnist_graph_test.py
index 12b39b0cde49d4c017acfa74572c725036c54eff..e73841fbf724e05eaa3be90cc8650f795d3e1ccf 100644
--- a/tensorflow/contrib/eager/python/examples/gan/mnist_graph_test.py
+++ b/tensorflow/contrib/eager/python/examples/gan/mnist_graph_test.py
@@ -42,7 +42,8 @@ class MnistGraphGanBenchmark(tf.test.Benchmark):
     # Generate some random data.
     images_data = np.random.randn(batch_size, 784).astype(np.float32)
     dataset = tf.data.Dataset.from_tensors(images_data)
-    images = dataset.repeat().make_one_shot_iterator().get_next()
+    images = tf.compat.v1.data.make_one_shot_iterator(
+        dataset.repeat()).get_next()
 
     # Create the models and optimizers
     generator = mnist.Generator(data_format())
diff --git a/tensorflow/contrib/eager/python/examples/generative_examples/cvae.ipynb b/tensorflow/contrib/eager/python/examples/generative_examples/cvae.ipynb
index ca27a85a229d41a85fa26ecdc982da478fe9e202..1a08cc0fd06516be4af5c2b0b46a3ffcf9101e95 100644
--- a/tensorflow/contrib/eager/python/examples/generative_examples/cvae.ipynb
+++ b/tensorflow/contrib/eager/python/examples/generative_examples/cvae.ipynb
@@ -470,7 +470,7 @@
         "\n",
         "  if epoch % 1 == 0:\n",
         "    loss = tfe.metrics.Mean()\n",
-        "    for test_x in test_dataset.make_one_shot_iterator():\n",
+        "    for test_x in test_dataset:\n",
         "      loss(compute_loss(model, test_x))\n",
         "    elbo = -loss.result()\n",
         "    display.clear_output(wait=False)\n",
diff --git a/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb b/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb
index 3acecd283cda83992bab0c37cf0b8037ed2cf27a..12c5eff2b4aa901bdab52bf545e95b1e4dce7468 100644
--- a/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb
+++ b/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb
@@ -1,1184 +1,1174 @@
 {
-  "nbformat": 4,
-  "nbformat_minor": 0,
-  "metadata": {
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "K2s1A9eLRPEj"
+   },
+   "source": [
+    "##### Copyright 2018 The TensorFlow Authors.\n",
+    "\n",
+    "Licensed under the Apache License, Version 2.0 (the \"License\").\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "Cffg2i257iMS"
+   },
+   "source": [
+    "# Image Captioning with Attention\n",
+    "\n",
+    "<table class=\"tfo-notebook-buttons\" align=\"left\"><td>\n",
+    "<a target=\"_blank\"  href=\"https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb\">\n",
+    "    <img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>  \n",
+    "</td><td>\n",
+    "<a target=\"_blank\"  href=\"https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb\"><img width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a></td></table>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "QASbY_HGo4Lq"
+   },
+   "source": [
+    "Image captioning is the task of generating a caption for an image. Given an image like this:\n",
+    "\n",
+    "![Man Surfing](https://tensorflow.org/images/surf.jpg) \n",
+    "\n",
+    "[Image Source](https://commons.wikimedia.org/wiki/Surfing#/media/File:Surfing_in_Hawaii.jpg), License: Public Domain\n",
+    "\n",
+    "Our goal is to generate a caption, such as \"a surfer riding on a wave\". Here, we'll use an attention-based model. This enables us to see which parts of the image the model focuses on as it generates a caption.\n",
+    "\n",
+    "![Prediction](https://tensorflow.org/images/imcap_prediction.png)\n",
+    "\n",
+    "This model architecture below is similar to [Show, Attend and Tell: Neural Image Caption Generation with Visual Attention](https://arxiv.org/abs/1502.03044). \n",
+    "\n",
+    "The code uses [tf.keras](https://www.tensorflow.org/programmers_guide/keras) and [eager execution](https://www.tensorflow.org/programmers_guide/eager), which you can learn more about in the linked guides.\n",
+    "\n",
+    "This notebook is an end-to-end example. If you run it, it will download the  [MS-COCO](http://cocodataset.org/#home) dataset, preprocess and cache a subset of the images using Inception V3, train an encoder-decoder model, and use it to generate captions on new images.\n",
+    "\n",
+    "The code requires TensorFlow version >=1.9. If you're running this in [Colab]()\n",
+    "\n",
+    "In this example, we're training on a relatively small amount of data as an example. On a single P100 GPU, this example will take about ~2 hours to train. We train on the first 30,000 captions (corresponding to about ~20,000 images depending on shuffling, as there are multiple captions per image in the dataset)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
     "colab": {
-      "name": "image_captioning_with_attention.ipynb",
-      "version": "0.3.2",
-      "views": {},
-      "default_view": {},
-      "provenance": [
-        {
-          "file_id": "1HI8OK2sMjcx9CTWVn0122QAHOuXaOaMg",
-          "timestamp": 1530222436922
-        }
-      ],
-      "private_outputs": true,
-      "collapsed_sections": [],
-      "toc_visible": true
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    "kernelspec": {
-      "display_name": "Python 3",
-      "language": "python",
-      "name": "python3"
-    },
-    "accelerator": "GPU"
+    "colab_type": "code",
+    "id": "U8l4RJ0XRPEm"
+   },
+   "outputs": [],
+   "source": [
+    "# Import TensorFlow and enable eager execution\n",
+    "# This code requires TensorFlow version >=1.9\n",
+    "import tensorflow as tf\n",
+    "tf.enable_eager_execution()\n",
+    "\n",
+    "# We'll generate plots of attention in order to see which parts of an image\n",
+    "# our model focuses on during captioning\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "# Scikit-learn includes many helpful utilities\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.utils import shuffle\n",
+    "\n",
+    "import re\n",
+    "import numpy as np\n",
+    "import os\n",
+    "import time\n",
+    "import json\n",
+    "from glob import glob\n",
+    "from PIL import Image\n",
+    "import pickle"
+   ]
   },
-  "cells": [
-    {
-      "metadata": {
-        "id": "K2s1A9eLRPEj",
-        "colab_type": "text"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "##### Copyright 2018 The TensorFlow Authors.\n",
-        "\n",
-        "Licensed under the Apache License, Version 2.0 (the \"License\").\n"
-      ]
-    },
-    {
-      "metadata": {
-        "id": "Cffg2i257iMS",
-        "colab_type": "text"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "# Image Captioning with Attention\n",
-        "\n",
-        "<table class=\"tfo-notebook-buttons\" align=\"left\"><td>\n",
-        "<a target=\"_blank\"  href=\"https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb\">\n",
-        "    <img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>  \n",
-        "</td><td>\n",
-        "<a target=\"_blank\"  href=\"https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb\"><img width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a></td></table>"
-      ]
-    },
-    {
-      "metadata": {
-        "id": "QASbY_HGo4Lq",
-        "colab_type": "text"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "Image captioning is the task of generating a caption for an image. Given an image like this:\n",
-        "\n",
-        "![Man Surfing](https://tensorflow.org/images/surf.jpg) \n",
-        "\n",
-        "[Image Source](https://commons.wikimedia.org/wiki/Surfing#/media/File:Surfing_in_Hawaii.jpg), License: Public Domain\n",
-        "\n",
-        "Our goal is to generate a caption, such as \"a surfer riding on a wave\". Here, we'll use an attention-based model. This enables us to see which parts of the image the model focuses on as it generates a caption.\n",
-        "\n",
-        "![Prediction](https://tensorflow.org/images/imcap_prediction.png)\n",
-        "\n",
-        "This model architecture below is similar to [Show, Attend and Tell: Neural Image Caption Generation with Visual Attention](https://arxiv.org/abs/1502.03044). \n",
-        "\n",
-        "The code uses [tf.keras](https://www.tensorflow.org/programmers_guide/keras) and [eager execution](https://www.tensorflow.org/programmers_guide/eager), which you can learn more about in the linked guides.\n",
-        "\n",
-        "This notebook is an end-to-end example. If you run it, it will download the  [MS-COCO](http://cocodataset.org/#home) dataset, preprocess and cache a subset of the images using Inception V3, train an encoder-decoder model, and use it to generate captions on new images.\n",
-        "\n",
-        "The code requires TensorFlow version >=1.9. If you're running this in [Colab]()\n",
-        "\n",
-        "In this example, we're training on a relatively small amount of data as an example. On a single P100 GPU, this example will take about ~2 hours to train. We train on the first 30,000 captions (corresponding to about ~20,000 images depending on shuffling, as there are multiple captions per image in the dataset)\n"
-      ]
-    },
-    {
-      "metadata": {
-        "id": "U8l4RJ0XRPEm",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "# Import TensorFlow and enable eager execution\n",
-        "# This code requires TensorFlow version >=1.9\n",
-        "import tensorflow as tf\n",
-        "tf.enable_eager_execution()\n",
-        "\n",
-        "# We'll generate plots of attention in order to see which parts of an image\n",
-        "# our model focuses on during captioning\n",
-        "import matplotlib.pyplot as plt\n",
-        "\n",
-        "# Scikit-learn includes many helpful utilities\n",
-        "from sklearn.model_selection import train_test_split\n",
-        "from sklearn.utils import shuffle\n",
-        "\n",
-        "import re\n",
-        "import numpy as np\n",
-        "import os\n",
-        "import time\n",
-        "import json\n",
-        "from glob import glob\n",
-        "from PIL import Image\n",
-        "import pickle"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "id": "b6qbGw8MRPE5",
-        "colab_type": "text"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Download and prepare the MS-COCO dataset\n",
-        "\n",
-        "We will use the [MS-COCO dataset](http://cocodataset.org/#home) to train our model. This dataset contains >82,000 images, each of which has been annotated with at least 5 different captions. The code below will download and extract the dataset automatically.  \n",
-        "\n",
-        "**Caution: large download ahead**. We'll use the training set, it's a 13GB file."
-      ]
-    },
-    {
-      "metadata": {
-        "id": "krQuPYTtRPE7",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "annotation_zip = tf.keras.utils.get_file('captions.zip', \n",
-        "                                          cache_subdir=os.path.abspath('.'),\n",
-        "                                          origin = 'http://images.cocodataset.org/annotations/annotations_trainval2014.zip',\n",
-        "                                          extract = True)\n",
-        "annotation_file = os.path.dirname(annotation_zip)+'/annotations/captions_train2014.json'\n",
-        "\n",
-        "name_of_zip = 'train2014.zip'\n",
-        "if not os.path.exists(os.path.abspath('.') + '/' + name_of_zip):\n",
-        "  image_zip = tf.keras.utils.get_file(name_of_zip, \n",
-        "                                      cache_subdir=os.path.abspath('.'),\n",
-        "                                      origin = 'http://images.cocodataset.org/zips/train2014.zip',\n",
-        "                                      extract = True)\n",
-        "  PATH = os.path.dirname(image_zip)+'/train2014/'\n",
-        "else:\n",
-        "  PATH = os.path.abspath('.')+'/train2014/'"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "id": "aANEzb5WwSzg",
-        "colab_type": "text"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Optionally, limit the size of the training set for faster training\n",
-        "For this example, we'll select a subset of 30,000 captions and use these and the corresponding images to train our model. As always, captioning quality will improve if you choose to use more data."
-      ]
-    },
-    {
-      "metadata": {
-        "id": "4G3b8x8_RPFD",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "# read the json file\n",
-        "with open(annotation_file, 'r') as f:\n",
-        "    annotations = json.load(f)\n",
-        "\n",
-        "# storing the captions and the image name in vectors\n",
-        "all_captions = []\n",
-        "all_img_name_vector = []\n",
-        "\n",
-        "for annot in annotations['annotations']:\n",
-        "    caption = '<start> ' + annot['caption'] + ' <end>'\n",
-        "    image_id = annot['image_id']\n",
-        "    full_coco_image_path = PATH + 'COCO_train2014_' + '%012d.jpg' % (image_id)\n",
-        "    \n",
-        "    all_img_name_vector.append(full_coco_image_path)\n",
-        "    all_captions.append(caption)\n",
-        "\n",
-        "# shuffling the captions and image_names together\n",
-        "# setting a random state\n",
-        "train_captions, img_name_vector = shuffle(all_captions,\n",
-        "                                          all_img_name_vector,\n",
-        "                                          random_state=1)\n",
-        "\n",
-        "# selecting the first 30000 captions from the shuffled set\n",
-        "num_examples = 30000\n",
-        "train_captions = train_captions[:num_examples]\n",
-        "img_name_vector = img_name_vector[:num_examples]"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "id": "mPBMgK34RPFL",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "len(train_captions), len(all_captions)"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "id": "8cSW4u-ORPFQ",
-        "colab_type": "text"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Preprocess the images using InceptionV3\n",
-        "Next, we will use InceptionV3 (pretrained on Imagenet) to classify each image. We will extract features from the last convolutional layer. \n",
-        "\n",
-        "First, we will need to convert the images into the format inceptionV3 expects by:\n",
-        "* Resizing the image to (299, 299)\n",
-        "* Using the [preprocess_input](https://www.tensorflow.org/api_docs/python/tf/keras/applications/inception_v3/preprocess_input) method to place the pixels in the range of -1 to 1 (to match the format of the images used to train InceptionV3)."
-      ]
-    },
-    {
-      "metadata": {
-        "id": "zXR0217aRPFR",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "def load_image(image_path):\n",
-        "    img = tf.read_file(image_path)\n",
-        "    img = tf.image.decode_jpeg(img, channels=3)\n",
-        "    img = tf.image.resize_images(img, (299, 299))\n",
-        "    img = tf.keras.applications.inception_v3.preprocess_input(img)\n",
-        "    return img, image_path"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "id": "MDvIu4sXRPFV",
-        "colab_type": "text"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Initialize InceptionV3 and load the pretrained Imagenet weights\n",
-        "\n",
-        "To do so, we'll create a tf.keras model where the output layer is the last convolutional layer in the InceptionV3 architecture. \n",
-        "* Each image is forwarded through the network and the vector that we get at the end is stored in a dictionary (image_name --> feature_vector). \n",
-        "* We use the last convolutional layer because we are using attention in this example. The shape of the output of this layer is ```8x8x2048```. \n",
-        "* We avoid doing this during training so it does not become a bottleneck. \n",
-        "* After all the images are passed through the network, we pickle the dictionary and save it to disk."
-      ]
-    },
-    {
-      "metadata": {
-        "id": "RD3vW4SsRPFW",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "image_model = tf.keras.applications.InceptionV3(include_top=False, \n",
-        "                                                weights='imagenet')\n",
-        "new_input = image_model.input\n",
-        "hidden_layer = image_model.layers[-1].output\n",
-        "\n",
-        "image_features_extract_model = tf.keras.Model(new_input, hidden_layer)"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "id": "rERqlR3WRPGO",
-        "colab_type": "text"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Caching the features extracted from InceptionV3\n",
-        "\n",
-        "We will pre-process each image with InceptionV3 and cache the output to disk. Caching the output in RAM would be faster but memory intensive, requiring 8 \\* 8 \\* 2048 floats per image. At the time of writing, this would exceed the memory limitations of Colab (although these may change, an instance appears to have about 12GB of memory currently). \n",
-        "\n",
-        "Performance could be improved with a more sophisticated caching strategy (e.g., by sharding the images to reduce random access disk I/O) at the cost of more code.\n",
-        "\n",
-        "This will take about 10 minutes to run in Colab with a GPU. If you'd like to see a progress bar, you could: install [tqdm](https://github.com/tqdm/tqdm) (```!pip install tqdm```), then change this line: \n",
-        "\n",
-        "```for img, path in image_dataset:``` \n",
-        "\n",
-        "to:\n",
-        "\n",
-        "```for img, path in tqdm(image_dataset):```."
-      ]
-    },
-    {
-      "metadata": {
-        "id": "Dx_fvbVgRPGQ",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "# getting the unique images\n",
-        "encode_train = sorted(set(img_name_vector))\n",
-        "\n",
-        "# feel free to change the batch_size according to your system configuration\n",
-        "image_dataset = tf.data.Dataset.from_tensor_slices(\n",
-        "                                encode_train).map(load_image).batch(16)\n",
-        "\n",
-        "for img, path in image_dataset:\n",
-        "  batch_features = image_features_extract_model(img)\n",
-        "  batch_features = tf.reshape(batch_features, \n",
-        "                              (batch_features.shape[0], -1, batch_features.shape[3]))\n",
-        "\n",
-        "  for bf, p in zip(batch_features, path):\n",
-        "    path_of_feature = p.numpy().decode(\"utf-8\")\n",
-        "    np.save(path_of_feature, bf.numpy())"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "id": "nyqH3zFwRPFi",
-        "colab_type": "text"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Preprocess and tokenize the captions\n",
-        "\n",
-        "* First, we'll tokenize the captions (e.g., by splitting on spaces). This will give us a  vocabulary of all the unique words in the data (e.g., \"surfing\", \"football\", etc).\n",
-        "* Next, we'll limit the vocabulary size to the top 5,000 words to save memory. We'll replace all other words with the token \"UNK\" (for unknown).\n",
-        "* Finally, we create a word --> index mapping and vice-versa.\n",
-        "* We will then pad all sequences to the be same length as the longest one. "
-      ]
-    },
-    {
-      "metadata": {
-        "id": "HZfK8RhQRPFj",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "# This will find the maximum length of any caption in our dataset\n",
-        "def calc_max_length(tensor):\n",
-        "    return max(len(t) for t in tensor)"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "id": "oJGE34aiRPFo",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "# The steps above is a general process of dealing with text processing\n",
-        "\n",
-        "# choosing the top 5000 words from the vocabulary\n",
-        "top_k = 5000\n",
-        "tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=top_k, \n",
-        "                                                  oov_token=\"<unk>\", \n",
-        "                                                  filters='!\"#$%&()*+.,-/:;=?@[\\]^_`{|}~ ')\n",
-        "tokenizer.fit_on_texts(train_captions)\n",
-        "train_seqs = tokenizer.texts_to_sequences(train_captions)"
-      ],
-      "execution_count": 0,
-      "outputs": []
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "b6qbGw8MRPE5"
+   },
+   "source": [
+    "## Download and prepare the MS-COCO dataset\n",
+    "\n",
+    "We will use the [MS-COCO dataset](http://cocodataset.org/#home) to train our model. This dataset contains >82,000 images, each of which has been annotated with at least 5 different captions. The code below will download and extract the dataset automatically.  \n",
+    "\n",
+    "**Caution: large download ahead**. We'll use the training set, it's a 13GB file."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "8Q44tNQVRPFt",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "tokenizer.word_index = {key:value for key, value in tokenizer.word_index.items() if value <= top_k}\n",
-        "# putting <unk> token in the word2idx dictionary\n",
-        "tokenizer.word_index[tokenizer.oov_token] = top_k + 1\n",
-        "tokenizer.word_index['<pad>'] = 0"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "krQuPYTtRPE7"
+   },
+   "outputs": [],
+   "source": [
+    "annotation_zip = tf.keras.utils.get_file('captions.zip', \n",
+    "                                          cache_subdir=os.path.abspath('.'),\n",
+    "                                          origin = 'http://images.cocodataset.org/annotations/annotations_trainval2014.zip',\n",
+    "                                          extract = True)\n",
+    "annotation_file = os.path.dirname(annotation_zip)+'/annotations/captions_train2014.json'\n",
+    "\n",
+    "name_of_zip = 'train2014.zip'\n",
+    "if not os.path.exists(os.path.abspath('.') + '/' + name_of_zip):\n",
+    "  image_zip = tf.keras.utils.get_file(name_of_zip, \n",
+    "                                      cache_subdir=os.path.abspath('.'),\n",
+    "                                      origin = 'http://images.cocodataset.org/zips/train2014.zip',\n",
+    "                                      extract = True)\n",
+    "  PATH = os.path.dirname(image_zip)+'/train2014/'\n",
+    "else:\n",
+    "  PATH = os.path.abspath('.')+'/train2014/'"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "aANEzb5WwSzg"
+   },
+   "source": [
+    "## Optionally, limit the size of the training set for faster training\n",
+    "For this example, we'll select a subset of 30,000 captions and use these and the corresponding images to train our model. As always, captioning quality will improve if you choose to use more data."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "0fpJb5ojRPFv",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "# creating the tokenized vectors\n",
-        "train_seqs = tokenizer.texts_to_sequences(train_captions)"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "4G3b8x8_RPFD"
+   },
+   "outputs": [],
+   "source": [
+    "# read the json file\n",
+    "with open(annotation_file, 'r') as f:\n",
+    "    annotations = json.load(f)\n",
+    "\n",
+    "# storing the captions and the image name in vectors\n",
+    "all_captions = []\n",
+    "all_img_name_vector = []\n",
+    "\n",
+    "for annot in annotations['annotations']:\n",
+    "    caption = '<start> ' + annot['caption'] + ' <end>'\n",
+    "    image_id = annot['image_id']\n",
+    "    full_coco_image_path = PATH + 'COCO_train2014_' + '%012d.jpg' % (image_id)\n",
+    "    \n",
+    "    all_img_name_vector.append(full_coco_image_path)\n",
+    "    all_captions.append(caption)\n",
+    "\n",
+    "# shuffling the captions and image_names together\n",
+    "# setting a random state\n",
+    "train_captions, img_name_vector = shuffle(all_captions,\n",
+    "                                          all_img_name_vector,\n",
+    "                                          random_state=1)\n",
+    "\n",
+    "# selecting the first 30000 captions from the shuffled set\n",
+    "num_examples = 30000\n",
+    "train_captions = train_captions[:num_examples]\n",
+    "img_name_vector = img_name_vector[:num_examples]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "olQArbgbRPF1",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "# creating a reverse mapping (index -> word)\n",
-        "index_word = {value:key for key, value in tokenizer.word_index.items()}"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "mPBMgK34RPFL"
+   },
+   "outputs": [],
+   "source": [
+    "len(train_captions), len(all_captions)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "8cSW4u-ORPFQ"
+   },
+   "source": [
+    "## Preprocess the images using InceptionV3\n",
+    "Next, we will use InceptionV3 (pretrained on Imagenet) to classify each image. We will extract features from the last convolutional layer. \n",
+    "\n",
+    "First, we will need to convert the images into the format inceptionV3 expects by:\n",
+    "* Resizing the image to (299, 299)\n",
+    "* Using the [preprocess_input](https://www.tensorflow.org/api_docs/python/tf/keras/applications/inception_v3/preprocess_input) method to place the pixels in the range of -1 to 1 (to match the format of the images used to train InceptionV3)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "AidglIZVRPF4",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "# padding each vector to the max_length of the captions\n",
-        "# if the max_length parameter is not provided, pad_sequences calculates that automatically\n",
-        "cap_vector = tf.keras.preprocessing.sequence.pad_sequences(train_seqs, padding='post')"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "zXR0217aRPFR"
+   },
+   "outputs": [],
+   "source": [
+    "def load_image(image_path):\n",
+    "    img = tf.read_file(image_path)\n",
+    "    img = tf.image.decode_jpeg(img, channels=3)\n",
+    "    img = tf.image.resize_images(img, (299, 299))\n",
+    "    img = tf.keras.applications.inception_v3.preprocess_input(img)\n",
+    "    return img, image_path"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "MDvIu4sXRPFV"
+   },
+   "source": [
+    "## Initialize InceptionV3 and load the pretrained Imagenet weights\n",
+    "\n",
+    "To do so, we'll create a tf.keras model where the output layer is the last convolutional layer in the InceptionV3 architecture. \n",
+    "* Each image is forwarded through the network and the vector that we get at the end is stored in a dictionary (image_name --> feature_vector). \n",
+    "* We use the last convolutional layer because we are using attention in this example. The shape of the output of this layer is ```8x8x2048```. \n",
+    "* We avoid doing this during training so it does not become a bottleneck. \n",
+    "* After all the images are passed through the network, we pickle the dictionary and save it to disk."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "gL0wkttkRPGA",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "# calculating the max_length \n",
-        "# used to store the attention weights\n",
-        "max_length = calc_max_length(train_seqs)"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "RD3vW4SsRPFW"
+   },
+   "outputs": [],
+   "source": [
+    "image_model = tf.keras.applications.InceptionV3(include_top=False, \n",
+    "                                                weights='imagenet')\n",
+    "new_input = image_model.input\n",
+    "hidden_layer = image_model.layers[-1].output\n",
+    "\n",
+    "image_features_extract_model = tf.keras.Model(new_input, hidden_layer)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "rERqlR3WRPGO"
+   },
+   "source": [
+    "## Caching the features extracted from InceptionV3\n",
+    "\n",
+    "We will pre-process each image with InceptionV3 and cache the output to disk. Caching the output in RAM would be faster but memory intensive, requiring 8 \\* 8 \\* 2048 floats per image. At the time of writing, this would exceed the memory limitations of Colab (although these may change, an instance appears to have about 12GB of memory currently). \n",
+    "\n",
+    "Performance could be improved with a more sophisticated caching strategy (e.g., by sharding the images to reduce random access disk I/O) at the cost of more code.\n",
+    "\n",
+    "This will take about 10 minutes to run in Colab with a GPU. If you'd like to see a progress bar, you could: install [tqdm](https://github.com/tqdm/tqdm) (```!pip install tqdm```), then change this line: \n",
+    "\n",
+    "```for img, path in image_dataset:``` \n",
+    "\n",
+    "to:\n",
+    "\n",
+    "```for img, path in tqdm(image_dataset):```."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "M3CD75nDpvTI",
-        "colab_type": "text"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Split the data into training and testing"
-      ]
+    "colab_type": "code",
+    "id": "Dx_fvbVgRPGQ"
+   },
+   "outputs": [],
+   "source": [
+    "# getting the unique images\n",
+    "encode_train = sorted(set(img_name_vector))\n",
+    "\n",
+    "# feel free to change the batch_size according to your system configuration\n",
+    "image_dataset = tf.data.Dataset.from_tensor_slices(\n",
+    "                                encode_train).map(load_image).batch(16)\n",
+    "\n",
+    "for img, path in image_dataset:\n",
+    "  batch_features = image_features_extract_model(img)\n",
+    "  batch_features = tf.reshape(batch_features, \n",
+    "                              (batch_features.shape[0], -1, batch_features.shape[3]))\n",
+    "\n",
+    "  for bf, p in zip(batch_features, path):\n",
+    "    path_of_feature = p.numpy().decode(\"utf-8\")\n",
+    "    np.save(path_of_feature, bf.numpy())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "nyqH3zFwRPFi"
+   },
+   "source": [
+    "## Preprocess and tokenize the captions\n",
+    "\n",
+    "* First, we'll tokenize the captions (e.g., by splitting on spaces). This will give us a  vocabulary of all the unique words in the data (e.g., \"surfing\", \"football\", etc).\n",
+    "* Next, we'll limit the vocabulary size to the top 5,000 words to save memory. We'll replace all other words with the token \"UNK\" (for unknown).\n",
+    "* Finally, we create a word --> index mapping and vice-versa.\n",
+    "* We will then pad all sequences to the be same length as the longest one. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "iS7DDMszRPGF",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "# Create training and validation sets using 80-20 split\n",
-        "img_name_train, img_name_val, cap_train, cap_val = train_test_split(img_name_vector, \n",
-        "                                                                    cap_vector, \n",
-        "                                                                    test_size=0.2, \n",
-        "                                                                    random_state=0)"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "HZfK8RhQRPFj"
+   },
+   "outputs": [],
+   "source": [
+    "# This will find the maximum length of any caption in our dataset\n",
+    "def calc_max_length(tensor):\n",
+    "    return max(len(t) for t in tensor)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "XmViPkRFRPGH",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "len(img_name_train), len(cap_train), len(img_name_val), len(cap_val)"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "oJGE34aiRPFo"
+   },
+   "outputs": [],
+   "source": [
+    "# The steps above is a general process of dealing with text processing\n",
+    "\n",
+    "# choosing the top 5000 words from the vocabulary\n",
+    "top_k = 5000\n",
+    "tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=top_k, \n",
+    "                                                  oov_token=\"<unk>\", \n",
+    "                                                  filters='!\"#$%&()*+.,-/:;=?@[\\]^_`{|}~ ')\n",
+    "tokenizer.fit_on_texts(train_captions)\n",
+    "train_seqs = tokenizer.texts_to_sequences(train_captions)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "uEWM9xrYcg45",
-        "colab_type": "text"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Our images and captions are ready! Next, let's create a tf.data dataset to use for training our model.\n",
-        "\n"
-      ]
+    "colab_type": "code",
+    "id": "8Q44tNQVRPFt"
+   },
+   "outputs": [],
+   "source": [
+    "tokenizer.word_index['<pad>'] = 0"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "Q3TnZ1ToRPGV",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "# feel free to change these parameters according to your system's configuration\n",
-        "\n",
-        "BATCH_SIZE = 64\n",
-        "BUFFER_SIZE = 1000\n",
-        "embedding_dim = 256\n",
-        "units = 512\n",
-        "vocab_size = len(tokenizer.word_index)\n",
-        "# shape of the vector extracted from InceptionV3 is (64, 2048)\n",
-        "# these two variables represent that\n",
-        "features_shape = 2048\n",
-        "attention_features_shape = 64"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "0fpJb5ojRPFv"
+   },
+   "outputs": [],
+   "source": [
+    "# creating the tokenized vectors\n",
+    "train_seqs = tokenizer.texts_to_sequences(train_captions)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "SmZS2N0bXG3T",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "# loading the numpy files \n",
-        "def map_func(img_name, cap):\n",
-        "    img_tensor = np.load(img_name.decode('utf-8')+'.npy')\n",
-        "    return img_tensor, cap"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "AidglIZVRPF4"
+   },
+   "outputs": [],
+   "source": [
+    "# padding each vector to the max_length of the captions\n",
+    "# if the max_length parameter is not provided, pad_sequences calculates that automatically\n",
+    "cap_vector = tf.keras.preprocessing.sequence.pad_sequences(train_seqs, padding='post')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "FDF_Nm3tRPGZ",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "dataset = tf.data.Dataset.from_tensor_slices((img_name_train, cap_train))\n",
-        "\n",
-        "# using map to load the numpy files in parallel\n",
-        "# NOTE: Be sure to set num_parallel_calls to the number of CPU cores you have\n",
-        "# https://www.tensorflow.org/api_docs/python/tf/py_func\n",
-        "dataset = dataset.map(lambda item1, item2: tf.py_func(\n",
-        "          map_func, [item1, item2], [tf.float32, tf.int32]), num_parallel_calls=8)\n",
-        "\n",
-        "# shuffling and batching\n",
-        "dataset = dataset.shuffle(BUFFER_SIZE)\n",
-        "# https://www.tensorflow.org/api_docs/python/tf/contrib/data/batch_and_drop_remainder\n",
-        "dataset = dataset.batch(BATCH_SIZE)\n",
-        "dataset = dataset.prefetch(1)"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "gL0wkttkRPGA"
+   },
+   "outputs": [],
+   "source": [
+    "# calculating the max_length \n",
+    "# used to store the attention weights\n",
+    "max_length = calc_max_length(train_seqs)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "M3CD75nDpvTI"
+   },
+   "source": [
+    "## Split the data into training and testing"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "nrvoDphgRPGd",
-        "colab_type": "text"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Model\n",
-        "\n",
-        "Fun fact, the decoder below is identical to the one in the example for [Neural Machine Translation with Attention]( https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb).\n",
-        "\n",
-        "The model architecture is inspired by the [Show, Attend and Tell](https://arxiv.org/pdf/1502.03044.pdf) paper.\n",
-        "\n",
-        "* In this example, we extract the features from the lower convolutional layer of InceptionV3 giving us a vector of shape (8, 8, 2048). \n",
-        "* We squash that to a shape of (64, 2048).\n",
-        "* This vector is then passed through the CNN Encoder(which consists of a single Fully connected layer).\n",
-        "* The RNN(here GRU) attends over the image to predict the next word."
-      ]
+    "colab_type": "code",
+    "id": "iS7DDMszRPGF"
+   },
+   "outputs": [],
+   "source": [
+    "# Create training and validation sets using 80-20 split\n",
+    "img_name_train, img_name_val, cap_train, cap_val = train_test_split(img_name_vector, \n",
+    "                                                                    cap_vector, \n",
+    "                                                                    test_size=0.2, \n",
+    "                                                                    random_state=0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "AAppCGLKRPGd",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "def gru(units):\n",
-        "  # If you have a GPU, we recommend using the CuDNNGRU layer (it provides a \n",
-        "  # significant speedup).\n",
-        "  if tf.test.is_gpu_available():\n",
-        "    return tf.keras.layers.CuDNNGRU(units, \n",
-        "                                    return_sequences=True, \n",
-        "                                    return_state=True, \n",
-        "                                    recurrent_initializer='glorot_uniform')\n",
-        "  else:\n",
-        "    return tf.keras.layers.GRU(units, \n",
-        "                               return_sequences=True, \n",
-        "                               return_state=True, \n",
-        "                               recurrent_activation='sigmoid', \n",
-        "                               recurrent_initializer='glorot_uniform')"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "XmViPkRFRPGH"
+   },
+   "outputs": [],
+   "source": [
+    "len(img_name_train), len(cap_train), len(img_name_val), len(cap_val)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "uEWM9xrYcg45"
+   },
+   "source": [
+    "## Our images and captions are ready! Next, let's create a tf.data dataset to use for training our model.\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "ja2LFTMSdeV3",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "class BahdanauAttention(tf.keras.Model):\n",
-        "  def __init__(self, units):\n",
-        "    super(BahdanauAttention, self).__init__()\n",
-        "    self.W1 = tf.keras.layers.Dense(units)\n",
-        "    self.W2 = tf.keras.layers.Dense(units)\n",
-        "    self.V = tf.keras.layers.Dense(1)\n",
-        "  \n",
-        "  def call(self, features, hidden):\n",
-        "    # features(CNN_encoder output) shape == (batch_size, 64, embedding_dim)\n",
-        "    \n",
-        "    # hidden shape == (batch_size, hidden_size)\n",
-        "    # hidden_with_time_axis shape == (batch_size, 1, hidden_size)\n",
-        "    hidden_with_time_axis = tf.expand_dims(hidden, 1)\n",
-        "    \n",
-        "    # score shape == (batch_size, 64, hidden_size)\n",
-        "    score = tf.nn.tanh(self.W1(features) + self.W2(hidden_with_time_axis))\n",
-        "    \n",
-        "    # attention_weights shape == (batch_size, 64, 1)\n",
-        "    # we get 1 at the last axis because we are applying score to self.V\n",
-        "    attention_weights = tf.nn.softmax(self.V(score), axis=1)\n",
-        "    \n",
-        "    # context_vector shape after sum == (batch_size, hidden_size)\n",
-        "    context_vector = attention_weights * features\n",
-        "    context_vector = tf.reduce_sum(context_vector, axis=1)\n",
-        "    \n",
-        "    return context_vector, attention_weights"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "Q3TnZ1ToRPGV"
+   },
+   "outputs": [],
+   "source": [
+    "# feel free to change these parameters according to your system's configuration\n",
+    "\n",
+    "BATCH_SIZE = 64\n",
+    "BUFFER_SIZE = 1000\n",
+    "embedding_dim = 256\n",
+    "units = 512\n",
+    "vocab_size = len(tokenizer.word_index)\n",
+    "# shape of the vector extracted from InceptionV3 is (64, 2048)\n",
+    "# these two variables represent that\n",
+    "features_shape = 2048\n",
+    "attention_features_shape = 64"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "AZ7R1RxHRPGf",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "class CNN_Encoder(tf.keras.Model):\n",
-        "    # Since we have already extracted the features and dumped it using pickle\n",
-        "    # This encoder passes those features through a Fully connected layer\n",
-        "    def __init__(self, embedding_dim):\n",
-        "        super(CNN_Encoder, self).__init__()\n",
-        "        # shape after fc == (batch_size, 64, embedding_dim)\n",
-        "        self.fc = tf.keras.layers.Dense(embedding_dim)\n",
-        "        \n",
-        "    def call(self, x):\n",
-        "        x = self.fc(x)\n",
-        "        x = tf.nn.relu(x)\n",
-        "        return x"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "SmZS2N0bXG3T"
+   },
+   "outputs": [],
+   "source": [
+    "# loading the numpy files \n",
+    "def map_func(img_name, cap):\n",
+    "    img_tensor = np.load(img_name.decode('utf-8')+'.npy')\n",
+    "    return img_tensor, cap"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "V9UbGQmERPGi",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "class RNN_Decoder(tf.keras.Model):\n",
-        "  def __init__(self, embedding_dim, units, vocab_size):\n",
-        "    super(RNN_Decoder, self).__init__()\n",
-        "    self.units = units\n",
-        "\n",
-        "    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)\n",
-        "    self.gru = gru(self.units)\n",
-        "    self.fc1 = tf.keras.layers.Dense(self.units)\n",
-        "    self.fc2 = tf.keras.layers.Dense(vocab_size)\n",
-        "    \n",
-        "    self.attention = BahdanauAttention(self.units)\n",
-        "        \n",
-        "  def call(self, x, features, hidden):\n",
-        "    # defining attention as a separate model\n",
-        "    context_vector, attention_weights = self.attention(features, hidden)\n",
-        "    \n",
-        "    # x shape after passing through embedding == (batch_size, 1, embedding_dim)\n",
-        "    x = self.embedding(x)\n",
-        "    \n",
-        "    # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)\n",
-        "    x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)\n",
-        "    \n",
-        "    # passing the concatenated vector to the GRU\n",
-        "    output, state = self.gru(x)\n",
-        "    \n",
-        "    # shape == (batch_size, max_length, hidden_size)\n",
-        "    x = self.fc1(output)\n",
-        "    \n",
-        "    # x shape == (batch_size * max_length, hidden_size)\n",
-        "    x = tf.reshape(x, (-1, x.shape[2]))\n",
-        "    \n",
-        "    # output shape == (batch_size * max_length, vocab)\n",
-        "    x = self.fc2(x)\n",
-        "\n",
-        "    return x, state, attention_weights\n",
-        "\n",
-        "  def reset_state(self, batch_size):\n",
-        "    return tf.zeros((batch_size, self.units))"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "FDF_Nm3tRPGZ"
+   },
+   "outputs": [],
+   "source": [
+    "dataset = tf.data.Dataset.from_tensor_slices((img_name_train, cap_train))\n",
+    "\n",
+    "# using map to load the numpy files in parallel\n",
+    "# NOTE: Be sure to set num_parallel_calls to the number of CPU cores you have\n",
+    "# https://www.tensorflow.org/api_docs/python/tf/py_func\n",
+    "dataset = dataset.map(lambda item1, item2: tf.py_func(\n",
+    "          map_func, [item1, item2], [tf.float32, tf.int32]), num_parallel_calls=8)\n",
+    "\n",
+    "# shuffling and batching\n",
+    "dataset = dataset.shuffle(BUFFER_SIZE)\n",
+    "# https://www.tensorflow.org/api_docs/python/tf/contrib/data/batch_and_drop_remainder\n",
+    "dataset = dataset.batch(BATCH_SIZE)\n",
+    "dataset = dataset.prefetch(1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "nrvoDphgRPGd"
+   },
+   "source": [
+    "## Model\n",
+    "\n",
+    "Fun fact, the decoder below is identical to the one in the example for [Neural Machine Translation with Attention]( https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb).\n",
+    "\n",
+    "The model architecture is inspired by the [Show, Attend and Tell](https://arxiv.org/pdf/1502.03044.pdf) paper.\n",
+    "\n",
+    "* In this example, we extract the features from the lower convolutional layer of InceptionV3 giving us a vector of shape (8, 8, 2048). \n",
+    "* We squash that to a shape of (64, 2048).\n",
+    "* This vector is then passed through the CNN Encoder(which consists of a single Fully connected layer).\n",
+    "* The RNN(here GRU) attends over the image to predict the next word."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "Qs_Sr03wRPGk",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "encoder = CNN_Encoder(embedding_dim)\n",
-        "decoder = RNN_Decoder(embedding_dim, units, vocab_size)"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "AAppCGLKRPGd"
+   },
+   "outputs": [],
+   "source": [
+    "def gru(units):\n",
+    "  # If you have a GPU, we recommend using the CuDNNGRU layer (it provides a \n",
+    "  # significant speedup).\n",
+    "  if tf.test.is_gpu_available():\n",
+    "    return tf.keras.layers.CuDNNGRU(units, \n",
+    "                                    return_sequences=True, \n",
+    "                                    return_state=True, \n",
+    "                                    recurrent_initializer='glorot_uniform')\n",
+    "  else:\n",
+    "    return tf.keras.layers.GRU(units, \n",
+    "                               return_sequences=True, \n",
+    "                               return_state=True, \n",
+    "                               recurrent_activation='sigmoid', \n",
+    "                               recurrent_initializer='glorot_uniform')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "-bYN7xA0RPGl",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "optimizer = tf.train.AdamOptimizer()\n",
-        "\n",
-        "# We are masking the loss calculated for padding\n",
-        "def loss_function(real, pred):\n",
-        "    mask = 1 - np.equal(real, 0)\n",
-        "    loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=real, logits=pred) * mask\n",
-        "    return tf.reduce_mean(loss_)"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "ja2LFTMSdeV3"
+   },
+   "outputs": [],
+   "source": [
+    "class BahdanauAttention(tf.keras.Model):\n",
+    "  def __init__(self, units):\n",
+    "    super(BahdanauAttention, self).__init__()\n",
+    "    self.W1 = tf.keras.layers.Dense(units)\n",
+    "    self.W2 = tf.keras.layers.Dense(units)\n",
+    "    self.V = tf.keras.layers.Dense(1)\n",
+    "  \n",
+    "  def call(self, features, hidden):\n",
+    "    # features(CNN_encoder output) shape == (batch_size, 64, embedding_dim)\n",
+    "    \n",
+    "    # hidden shape == (batch_size, hidden_size)\n",
+    "    # hidden_with_time_axis shape == (batch_size, 1, hidden_size)\n",
+    "    hidden_with_time_axis = tf.expand_dims(hidden, 1)\n",
+    "    \n",
+    "    # score shape == (batch_size, 64, hidden_size)\n",
+    "    score = tf.nn.tanh(self.W1(features) + self.W2(hidden_with_time_axis))\n",
+    "    \n",
+    "    # attention_weights shape == (batch_size, 64, 1)\n",
+    "    # we get 1 at the last axis because we are applying score to self.V\n",
+    "    attention_weights = tf.nn.softmax(self.V(score), axis=1)\n",
+    "    \n",
+    "    # context_vector shape after sum == (batch_size, hidden_size)\n",
+    "    context_vector = attention_weights * features\n",
+    "    context_vector = tf.reduce_sum(context_vector, axis=1)\n",
+    "    \n",
+    "    return context_vector, attention_weights"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "PHod7t72RPGn",
-        "colab_type": "text"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Training\n",
-        "\n",
-        "* We extract the features stored in the respective `.npy` files and then pass those features through the encoder.\n",
-        "* The encoder output, hidden state(initialized to 0) and the decoder input (which is the start token) is passed to the decoder.\n",
-        "* The decoder returns the predictions and the decoder hidden state.\n",
-        "* The decoder hidden state is then passed back into the model and the predictions are used to calculate the loss.\n",
-        "* Use teacher forcing to decide the next input to the decoder.\n",
-        "* Teacher forcing is the technique where the target word is passed as the next input to the decoder.\n",
-        "* The final step is to calculate the gradients and apply it to the optimizer and backpropagate.\n"
-      ]
+    "colab_type": "code",
+    "id": "AZ7R1RxHRPGf"
+   },
+   "outputs": [],
+   "source": [
+    "class CNN_Encoder(tf.keras.Model):\n",
+    "    # Since we have already extracted the features and dumped it using pickle\n",
+    "    # This encoder passes those features through a Fully connected layer\n",
+    "    def __init__(self, embedding_dim):\n",
+    "        super(CNN_Encoder, self).__init__()\n",
+    "        # shape after fc == (batch_size, 64, embedding_dim)\n",
+    "        self.fc = tf.keras.layers.Dense(embedding_dim)\n",
+    "        \n",
+    "    def call(self, x):\n",
+    "        x = self.fc(x)\n",
+    "        x = tf.nn.relu(x)\n",
+    "        return x"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "Vt4WZ5mhJE-E",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "# adding this in a separate cell because if you run the training cell \n",
-        "# many times, the loss_plot array will be reset\n",
-        "loss_plot = []"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "V9UbGQmERPGi"
+   },
+   "outputs": [],
+   "source": [
+    "class RNN_Decoder(tf.keras.Model):\n",
+    "  def __init__(self, embedding_dim, units, vocab_size):\n",
+    "    super(RNN_Decoder, self).__init__()\n",
+    "    self.units = units\n",
+    "\n",
+    "    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)\n",
+    "    self.gru = gru(self.units)\n",
+    "    self.fc1 = tf.keras.layers.Dense(self.units)\n",
+    "    self.fc2 = tf.keras.layers.Dense(vocab_size)\n",
+    "    \n",
+    "    self.attention = BahdanauAttention(self.units)\n",
+    "        \n",
+    "  def call(self, x, features, hidden):\n",
+    "    # defining attention as a separate model\n",
+    "    context_vector, attention_weights = self.attention(features, hidden)\n",
+    "    \n",
+    "    # x shape after passing through embedding == (batch_size, 1, embedding_dim)\n",
+    "    x = self.embedding(x)\n",
+    "    \n",
+    "    # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)\n",
+    "    x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)\n",
+    "    \n",
+    "    # passing the concatenated vector to the GRU\n",
+    "    output, state = self.gru(x)\n",
+    "    \n",
+    "    # shape == (batch_size, max_length, hidden_size)\n",
+    "    x = self.fc1(output)\n",
+    "    \n",
+    "    # x shape == (batch_size * max_length, hidden_size)\n",
+    "    x = tf.reshape(x, (-1, x.shape[2]))\n",
+    "    \n",
+    "    # output shape == (batch_size * max_length, vocab)\n",
+    "    x = self.fc2(x)\n",
+    "\n",
+    "    return x, state, attention_weights\n",
+    "\n",
+    "  def reset_state(self, batch_size):\n",
+    "    return tf.zeros((batch_size, self.units))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "UlA4VIQpRPGo",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "EPOCHS = 20\n",
-        "\n",
-        "for epoch in range(EPOCHS):\n",
-        "    start = time.time()\n",
-        "    total_loss = 0\n",
-        "    \n",
-        "    for (batch, (img_tensor, target)) in enumerate(dataset):\n",
-        "        loss = 0\n",
-        "        \n",
-        "        # initializing the hidden state for each batch\n",
-        "        # because the captions are not related from image to image\n",
-        "        hidden = decoder.reset_state(batch_size=target.shape[0])\n",
-        "\n",
-        "        dec_input = tf.expand_dims([tokenizer.word_index['<start>']] * BATCH_SIZE, 1)\n",
-        "        \n",
-        "        with tf.GradientTape() as tape:\n",
-        "            features = encoder(img_tensor)\n",
-        "            \n",
-        "            for i in range(1, target.shape[1]):\n",
-        "                # passing the features through the decoder\n",
-        "                predictions, hidden, _ = decoder(dec_input, features, hidden)\n",
-        "\n",
-        "                loss += loss_function(target[:, i], predictions)\n",
-        "                \n",
-        "                # using teacher forcing\n",
-        "                dec_input = tf.expand_dims(target[:, i], 1)\n",
-        "        \n",
-        "        total_loss += (loss / int(target.shape[1]))\n",
-        "        \n",
-        "        variables = encoder.variables + decoder.variables\n",
-        "        \n",
-        "        gradients = tape.gradient(loss, variables) \n",
-        "        \n",
-        "        optimizer.apply_gradients(zip(gradients, variables), tf.train.get_or_create_global_step())\n",
-        "        \n",
-        "        if batch % 100 == 0:\n",
-        "            print ('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, \n",
-        "                                                          batch, \n",
-        "                                                          loss.numpy() / int(target.shape[1])))\n",
-        "    # storing the epoch end loss value to plot later\n",
-        "    loss_plot.append(total_loss / len(cap_vector))\n",
-        "    \n",
-        "    print ('Epoch {} Loss {:.6f}'.format(epoch + 1, \n",
-        "                                         total_loss/len(cap_vector)))\n",
-        "    print ('Time taken for 1 epoch {} sec\\n'.format(time.time() - start))"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "Qs_Sr03wRPGk"
+   },
+   "outputs": [],
+   "source": [
+    "encoder = CNN_Encoder(embedding_dim)\n",
+    "decoder = RNN_Decoder(embedding_dim, units, vocab_size)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "1Wm83G-ZBPcC",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "plt.plot(loss_plot)\n",
-        "plt.xlabel('Epochs')\n",
-        "plt.ylabel('Loss')\n",
-        "plt.title('Loss Plot')\n",
-        "plt.show()"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "-bYN7xA0RPGl"
+   },
+   "outputs": [],
+   "source": [
+    "optimizer = tf.train.AdamOptimizer()\n",
+    "\n",
+    "# We are masking the loss calculated for padding\n",
+    "def loss_function(real, pred):\n",
+    "    mask = 1 - np.equal(real, 0)\n",
+    "    loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=real, logits=pred) * mask\n",
+    "    return tf.reduce_mean(loss_)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "PHod7t72RPGn"
+   },
+   "source": [
+    "## Training\n",
+    "\n",
+    "* We extract the features stored in the respective `.npy` files and then pass those features through the encoder.\n",
+    "* The encoder output, hidden state(initialized to 0) and the decoder input (which is the start token) is passed to the decoder.\n",
+    "* The decoder returns the predictions and the decoder hidden state.\n",
+    "* The decoder hidden state is then passed back into the model and the predictions are used to calculate the loss.\n",
+    "* Use teacher forcing to decide the next input to the decoder.\n",
+    "* Teacher forcing is the technique where the target word is passed as the next input to the decoder.\n",
+    "* The final step is to calculate the gradients and apply it to the optimizer and backpropagate.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "xGvOcLQKghXN",
-        "colab_type": "text"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Caption!\n",
-        "\n",
-        "* The evaluate function is similar to the training loop, except we don't use teacher forcing here. The input to the decoder at each time step is its previous predictions along with the hidden state and the encoder output.\n",
-        "* Stop predicting when the model predicts the end token.\n",
-        "* And store the attention weights for every time step."
-      ]
+    "colab_type": "code",
+    "id": "Vt4WZ5mhJE-E"
+   },
+   "outputs": [],
+   "source": [
+    "# adding this in a separate cell because if you run the training cell \n",
+    "# many times, the loss_plot array will be reset\n",
+    "loss_plot = []"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "RCWpDtyNRPGs",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "def evaluate(image):\n",
-        "    attention_plot = np.zeros((max_length, attention_features_shape))\n",
-        "\n",
-        "    hidden = decoder.reset_state(batch_size=1)\n",
-        "\n",
-        "    temp_input = tf.expand_dims(load_image(image)[0], 0)\n",
-        "    img_tensor_val = image_features_extract_model(temp_input)\n",
-        "    img_tensor_val = tf.reshape(img_tensor_val, (img_tensor_val.shape[0], -1, img_tensor_val.shape[3]))\n",
-        "\n",
-        "    features = encoder(img_tensor_val)\n",
-        "\n",
-        "    dec_input = tf.expand_dims([tokenizer.word_index['<start>']], 0)\n",
-        "    result = []\n",
-        "\n",
-        "    for i in range(max_length):\n",
-        "        predictions, hidden, attention_weights = decoder(dec_input, features, hidden)\n",
-        "\n",
-        "        attention_plot[i] = tf.reshape(attention_weights, (-1, )).numpy()\n",
-        "\n",
-        "        predicted_id = tf.argmax(predictions[0]).numpy()\n",
-        "        result.append(index_word[predicted_id])\n",
-        "\n",
-        "        if index_word[predicted_id] == '<end>':\n",
-        "            return result, attention_plot\n",
-        "\n",
-        "        dec_input = tf.expand_dims([predicted_id], 0)\n",
-        "\n",
-        "    attention_plot = attention_plot[:len(result), :]\n",
-        "    return result, attention_plot"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "UlA4VIQpRPGo"
+   },
+   "outputs": [],
+   "source": [
+    "EPOCHS = 20\n",
+    "\n",
+    "for epoch in range(EPOCHS):\n",
+    "    start = time.time()\n",
+    "    total_loss = 0\n",
+    "    \n",
+    "    for (batch, (img_tensor, target)) in enumerate(dataset):\n",
+    "        loss = 0\n",
+    "        \n",
+    "        # initializing the hidden state for each batch\n",
+    "        # because the captions are not related from image to image\n",
+    "        hidden = decoder.reset_state(batch_size=target.shape[0])\n",
+    "\n",
+    "        dec_input = tf.expand_dims([tokenizer.word_index['<start>']] * BATCH_SIZE, 1)\n",
+    "        \n",
+    "        with tf.GradientTape() as tape:\n",
+    "            features = encoder(img_tensor)\n",
+    "            \n",
+    "            for i in range(1, target.shape[1]):\n",
+    "                # passing the features through the decoder\n",
+    "                predictions, hidden, _ = decoder(dec_input, features, hidden)\n",
+    "\n",
+    "                loss += loss_function(target[:, i], predictions)\n",
+    "                \n",
+    "                # using teacher forcing\n",
+    "                dec_input = tf.expand_dims(target[:, i], 1)\n",
+    "        \n",
+    "        total_loss += (loss / int(target.shape[1]))\n",
+    "        \n",
+    "        variables = encoder.variables + decoder.variables\n",
+    "        \n",
+    "        gradients = tape.gradient(loss, variables) \n",
+    "        \n",
+    "        optimizer.apply_gradients(zip(gradients, variables), tf.train.get_or_create_global_step())\n",
+    "        \n",
+    "        if batch % 100 == 0:\n",
+    "            print ('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, \n",
+    "                                                          batch, \n",
+    "                                                          loss.numpy() / int(target.shape[1])))\n",
+    "    # storing the epoch end loss value to plot later\n",
+    "    loss_plot.append(total_loss / len(cap_vector))\n",
+    "    \n",
+    "    print ('Epoch {} Loss {:.6f}'.format(epoch + 1, \n",
+    "                                         total_loss/len(cap_vector)))\n",
+    "    print ('Time taken for 1 epoch {} sec\\n'.format(time.time() - start))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "fD_y7PD6RPGt",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "def plot_attention(image, result, attention_plot):\n",
-        "    temp_image = np.array(Image.open(image))\n",
-        "\n",
-        "    fig = plt.figure(figsize=(10, 10))\n",
-        "    \n",
-        "    len_result = len(result)\n",
-        "    for l in range(len_result):\n",
-        "        temp_att = np.resize(attention_plot[l], (8, 8))\n",
-        "        ax = fig.add_subplot(len_result//2, len_result//2, l+1)\n",
-        "        ax.set_title(result[l])\n",
-        "        img = ax.imshow(temp_image)\n",
-        "        ax.imshow(temp_att, cmap='gray', alpha=0.6, extent=img.get_extent())\n",
-        "\n",
-        "    plt.tight_layout()\n",
-        "    plt.show()"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "1Wm83G-ZBPcC"
+   },
+   "outputs": [],
+   "source": [
+    "plt.plot(loss_plot)\n",
+    "plt.xlabel('Epochs')\n",
+    "plt.ylabel('Loss')\n",
+    "plt.title('Loss Plot')\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "xGvOcLQKghXN"
+   },
+   "source": [
+    "## Caption!\n",
+    "\n",
+    "* The evaluate function is similar to the training loop, except we don't use teacher forcing here. The input to the decoder at each time step is its previous predictions along with the hidden state and the encoder output.\n",
+    "* Stop predicting when the model predicts the end token.\n",
+    "* And store the attention weights for every time step."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "io7ws3ReRPGv",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "# captions on the validation set\n",
-        "rid = np.random.randint(0, len(img_name_val))\n",
-        "image = img_name_val[rid]\n",
-        "real_caption = ' '.join([index_word[i] for i in cap_val[rid] if i not in [0]])\n",
-        "result, attention_plot = evaluate(image)\n",
-        "\n",
-        "print ('Real Caption:', real_caption)\n",
-        "print ('Prediction Caption:', ' '.join(result))\n",
-        "plot_attention(image, result, attention_plot)\n",
-        "# opening the image\n",
-        "Image.open(img_name_val[rid])"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "RCWpDtyNRPGs"
+   },
+   "outputs": [],
+   "source": [
+    "def evaluate(image):\n",
+    "    attention_plot = np.zeros((max_length, attention_features_shape))\n",
+    "\n",
+    "    hidden = decoder.reset_state(batch_size=1)\n",
+    "\n",
+    "    temp_input = tf.expand_dims(load_image(image)[0], 0)\n",
+    "    img_tensor_val = image_features_extract_model(temp_input)\n",
+    "    img_tensor_val = tf.reshape(img_tensor_val, (img_tensor_val.shape[0], -1, img_tensor_val.shape[3]))\n",
+    "\n",
+    "    features = encoder(img_tensor_val)\n",
+    "\n",
+    "    dec_input = tf.expand_dims([tokenizer.word_index['<start>']], 0)\n",
+    "    result = []\n",
+    "\n",
+    "    for i in range(max_length):\n",
+    "        predictions, hidden, attention_weights = decoder(dec_input, features, hidden)\n",
+    "\n",
+    "        attention_plot[i] = tf.reshape(attention_weights, (-1, )).numpy()\n",
+    "\n",
+    "        predicted_id = tf.argmax(predictions[0]).numpy()\n",
+    "        result.append(tokenizer.index_word[predicted_id])\n",
+    "\n",
+    "        if tokenizer.index_word[predicted_id] == '<end>':\n",
+    "            return result, attention_plot\n",
+    "\n",
+    "        dec_input = tf.expand_dims([predicted_id], 0)\n",
+    "\n",
+    "    attention_plot = attention_plot[:len(result), :]\n",
+    "    return result, attention_plot"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "Rprk3HEvZuxb",
-        "colab_type": "text"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Try it on your own images\n",
-        "For fun, below we've provided a method you can use to caption your own images with the model we've just trained. Keep in mind, it was trained on a relatively small amount of data, and your images may be different from the training data (so be prepared for weird results!)\n"
-      ]
+    "colab_type": "code",
+    "id": "fD_y7PD6RPGt"
+   },
+   "outputs": [],
+   "source": [
+    "def plot_attention(image, result, attention_plot):\n",
+    "    temp_image = np.array(Image.open(image))\n",
+    "\n",
+    "    fig = plt.figure(figsize=(10, 10))\n",
+    "    \n",
+    "    len_result = len(result)\n",
+    "    for l in range(len_result):\n",
+    "        temp_att = np.resize(attention_plot[l], (8, 8))\n",
+    "        ax = fig.add_subplot(len_result//2, len_result//2, l+1)\n",
+    "        ax.set_title(result[l])\n",
+    "        img = ax.imshow(temp_image)\n",
+    "        ax.imshow(temp_att, cmap='gray', alpha=0.6, extent=img.get_extent())\n",
+    "\n",
+    "    plt.tight_layout()\n",
+    "    plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "9Psd1quzaAWg",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "image_url = 'https://tensorflow.org/images/surf.jpg'\n",
-        "image_extension = image_url[-4:]\n",
-        "image_path = tf.keras.utils.get_file('image'+image_extension, \n",
-        "                                     origin=image_url)\n",
-        "\n",
-        "result, attention_plot = evaluate(image_path)\n",
-        "print ('Prediction Caption:', ' '.join(result))\n",
-        "plot_attention(image_path, result, attention_plot)\n",
-        "# opening the image\n",
-        "Image.open(image_path)"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "io7ws3ReRPGv"
+   },
+   "outputs": [],
+   "source": [
+    "# captions on the validation set\n",
+    "rid = np.random.randint(0, len(img_name_val))\n",
+    "image = img_name_val[rid]\n",
+    "real_caption = ' '.join([tokenizer.index_word[i] for i in cap_val[rid] if i not in [0]])\n",
+    "result, attention_plot = evaluate(image)\n",
+    "\n",
+    "print ('Real Caption:', real_caption)\n",
+    "print ('Prediction Caption:', ' '.join(result))\n",
+    "plot_attention(image, result, attention_plot)\n",
+    "# opening the image\n",
+    "Image.open(img_name_val[rid])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "Rprk3HEvZuxb"
+   },
+   "source": [
+    "## Try it on your own images\n",
+    "For fun, below we've provided a method you can use to caption your own images with the model we've just trained. Keep in mind, it was trained on a relatively small amount of data, and your images may be different from the training data (so be prepared for weird results!)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
+    "colab_type": "code",
+    "id": "9Psd1quzaAWg"
+   },
+   "outputs": [],
+   "source": [
+    "image_url = 'https://tensorflow.org/images/surf.jpg'\n",
+    "image_extension = image_url[-4:]\n",
+    "image_path = tf.keras.utils.get_file('image'+image_extension, \n",
+    "                                     origin=image_url)\n",
+    "\n",
+    "result, attention_plot = evaluate(image_path)\n",
+    "print ('Prediction Caption:', ' '.join(result))\n",
+    "plot_attention(image_path, result, attention_plot)\n",
+    "# opening the image\n",
+    "Image.open(image_path)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "VJZXyJco6uLO"
+   },
+   "source": [
+    "# Next steps\n",
+    "\n",
+    "Congrats! You've just trained an image captioning model with attention. Next, we recommend taking a look at this example [Neural Machine Translation with Attention]( https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb). It uses a similar architecture to translate between Spanish and English sentences. You can also experiment with training the code in this notebook on a different dataset."
+   ]
+  }
+ ],
+ "metadata": {
+  "accelerator": "GPU",
+  "colab": {
+   "collapsed_sections": [],
+   "default_view": {},
+   "name": "image_captioning_with_attention.ipynb",
+   "private_outputs": true,
+   "provenance": [
     {
-      "metadata": {
-        "id": "VJZXyJco6uLO",
-        "colab_type": "text"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "# Next steps\n",
-        "\n",
-        "Congrats! You've just trained an image captioning model with attention. Next, we recommend taking a look at this example [Neural Machine Translation with Attention]( https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb). It uses a similar architecture to translate between Spanish and English sentences. You can also experiment with training the code in this notebook on a different dataset."
-      ]
+     "file_id": "1HI8OK2sMjcx9CTWVn0122QAHOuXaOaMg",
+     "timestamp": 1530222436922
     }
-  ]
+   ],
+   "toc_visible": true,
+   "version": "0.3.2",
+   "views": {}
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
 }
diff --git a/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression_graph_test.py b/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression_graph_test.py
index 557ad42752144243ae3da61b955b31398cba846e..d412b25b368260b81256fd58034330b884261b2b 100644
--- a/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression_graph_test.py
+++ b/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression_graph_test.py
@@ -36,7 +36,7 @@ class GraphLinearRegressionBenchmark(tf.test.Benchmark):
         noise_level=0.01,
         batch_size=batch_size,
         num_batches=num_batches)
-    iterator = dataset.make_initializable_iterator()
+    iterator = tf.compat.v1.data.make_initializable_iterator(dataset)
     x, y = iterator.get_next()
 
     model = linear_regression.LinearModel()
diff --git a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_graph_test.py b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_graph_test.py
index f3bb978875e226f58d6a00e09154191673a97415..fb7975d8fe867711cff31d627788a2d62a520aa9 100644
--- a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_graph_test.py
+++ b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_graph_test.py
@@ -142,7 +142,8 @@ class ResNet50Benchmarks(tf.test.Benchmark):
       with tf.Graph().as_default():
         np_images, np_labels = random_batch(batch_size)
         dataset = tf.data.Dataset.from_tensors((np_images, np_labels)).repeat()
-        (images, labels) = dataset.make_one_shot_iterator().get_next()
+        images, labels = tf.compat.v1.data.make_one_shot_iterator(
+            dataset).get_next()
 
         model = resnet50.ResNet50(data_format())
         logits = model(images, training=True)
diff --git a/tensorflow/contrib/eager/python/examples/revnet/main.py b/tensorflow/contrib/eager/python/examples/revnet/main.py
index b702e91f92220c2a9003a1b82411131332012a9e..9585f3565f83af724b6336e466d3671443ba2361 100644
--- a/tensorflow/contrib/eager/python/examples/revnet/main.py
+++ b/tensorflow/contrib/eager/python/examples/revnet/main.py
@@ -72,14 +72,11 @@ def main(_):
     train_one_iter(model, x, y, optimizer, global_step=global_step)
 
     if global_step.numpy() % config.log_every == 0:
-      it_test = ds_test.make_one_shot_iterator()
-      acc_test, loss_test = evaluate(model, it_test)
+      acc_test, loss_test = evaluate(model, ds_test)
 
       if FLAGS.validate:
-        it_train = ds_train_one_shot.make_one_shot_iterator()
-        it_validation = ds_validation.make_one_shot_iterator()
-        acc_train, loss_train = evaluate(model, it_train)
-        acc_validation, loss_validation = evaluate(model, it_validation)
+        acc_train, loss_train = evaluate(model, ds_train_one_shot)
+        acc_validation, loss_validation = evaluate(model, ds_validation)
         print("Iter {}, "
               "training set accuracy {:.4f}, loss {:.4f}; "
               "validation set accuracy {:.4f}, loss {:.4f}; "
@@ -218,11 +215,11 @@ def train_one_iter(model, inputs, labels, optimizer, global_step=None):
   return logits, loss
 
 
-def evaluate(model, iterator):
+def evaluate(model, dataset):
   """Compute accuracy with the given dataset iterator."""
   mean_loss = tfe.metrics.Mean()
   accuracy = tfe.metrics.Accuracy()
-  for x, y in iterator:
+  for x, y in dataset:
     logits, _ = model(x, training=False)
     loss = model.compute_loss(logits=logits, labels=y)
     accuracy(
diff --git a/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb_graph_test.py b/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb_graph_test.py
index 63b5c4c54d13e9c2448ec1f572ca1389f2443bef..770484abed96e540cf75cc5368a1410c31a8d2d0 100644
--- a/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb_graph_test.py
+++ b/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb_graph_test.py
@@ -82,7 +82,7 @@ class PTBBenchmark(tf.test.Benchmark):
         tf.ones(
             [PTBBenchmark.SEQ_LEN, PTBBenchmark.BATCH_SIZE],
             dtype=tf.int64)).repeat(num_iters + num_warmup)
-    inputs = dataset.make_one_shot_iterator().get_next()
+    inputs = tf.compat.v1.data.make_one_shot_iterator(dataset).get_next()
 
     with tf.device(tf.test.gpu_device_name()):
       outputs = model(inputs, training=True)
@@ -124,7 +124,8 @@ class PTBBenchmark(tf.test.Benchmark):
             dtype=tf.int64)).repeat(num_iters + num_warmup)
     # inputs and labels have the same shape
     dataset = tf.data.Dataset.zip((dataset, dataset))
-    (inputs, labels) = dataset.make_one_shot_iterator().get_next()
+    (inputs, labels) = tf.compat.v1.data.make_one_shot_iterator(
+        dataset).get_next()
 
     with tf.device(tf.test.gpu_device_name()):
       optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0)
diff --git a/tensorflow/contrib/eager/python/saver.py b/tensorflow/contrib/eager/python/saver.py
index f9c716360c5755ee1902b576545d776725f9966f..1d0d6c6c14ce4a8e454206e0be9fea4724f09192 100644
--- a/tensorflow/contrib/eager/python/saver.py
+++ b/tensorflow/contrib/eager/python/saver.py
@@ -115,6 +115,11 @@ def restore_variables_on_create(save_path, map_func=None):
 
 class Saver(object):
   """A tf.train.Saver adapter for use when eager execution is enabled.
+
+  `Saver`'s name-based checkpointing strategy is fragile. Please switch to
+  `tf.train.Checkpoint` or `tf.keras.Model.save_weights`, which perform a more
+  robust object-based saving. These APIs will load checkpoints written by
+  `Saver`.
   """
 
   def __init__(self, var_list):
diff --git a/tensorflow/contrib/eager/python/tfe.py b/tensorflow/contrib/eager/python/tfe.py
index 33c988fd9065e7fbe7b9aeb85cad82eb3c119f76..8882a863c30d8b222c68d6952279c3744345883c 100644
--- a/tensorflow/contrib/eager/python/tfe.py
+++ b/tensorflow/contrib/eager/python/tfe.py
@@ -41,6 +41,8 @@ To use, at program startup, call `tf.enable_eager_execution()`.
 
 @@add_execution_callback
 @@clear_execution_callbacks
+@@errstate
+@@ExecutionCallback
 @@inf_callback
 @@inf_nan_callback
 @@nan_callback
@@ -119,6 +121,8 @@ from tensorflow.python.eager.context import set_server_def
 from tensorflow.python.eager.def_function import function
 from tensorflow.python.eager.execution_callbacks import add_execution_callback
 from tensorflow.python.eager.execution_callbacks import clear_execution_callbacks
+from tensorflow.python.eager.execution_callbacks import errstate
+from tensorflow.python.eager.execution_callbacks import ExecutionCallback
 from tensorflow.python.eager.execution_callbacks import inf_callback
 from tensorflow.python.eager.execution_callbacks import inf_nan_callback
 from tensorflow.python.eager.execution_callbacks import nan_callback
diff --git a/tensorflow/contrib/feature_column/BUILD b/tensorflow/contrib/feature_column/BUILD
index 1cd83bdb5de7c2f6dc91c980750b49aca1a7790b..4c1d1a29f20b5574b63cf87ecf62db95f92902cd 100644
--- a/tensorflow/contrib/feature_column/BUILD
+++ b/tensorflow/contrib/feature_column/BUILD
@@ -110,8 +110,8 @@ py_test(
         "//tensorflow/python:parsing_ops",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:training",
-        "//tensorflow/python/feature_column",
         "//tensorflow/python/feature_column:feature_column_py",
+        "//tensorflow/python/feature_column:feature_column_v2_test",
         "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
     ],
diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2.py
index 0d34ad161855476b6a4cd9a258521dbe122b4140..83b93ec332044f754f9dcde8d7c5c19b26e53a4a 100644
--- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2.py
+++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2.py
@@ -203,7 +203,8 @@ def sequence_categorical_column_with_identity(
   columns = [watches_embedding]
 
   features = tf.parse_example(..., features=make_parse_example_spec(columns))
-  input_layer, sequence_length = sequence_input_layer(features, columns)
+  sequence_feature_layer = SequenceFeatureLayer(columns)
+  input_layer, sequence_length = sequence_feature_layer(features)
 
   rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
   outputs, state = tf.nn.dynamic_rnn(
@@ -219,15 +220,17 @@ def sequence_categorical_column_with_identity(
       `[0, num_buckets)`, and will replace out-of-range inputs.
 
   Returns:
-    A `_SequenceCategoricalColumn`.
+    A `SequenceCategoricalColumn`.
 
   Raises:
     ValueError: if `num_buckets` is less than one.
     ValueError: if `default_value` is not in range `[0, num_buckets)`.
   """
-  return fc_old._SequenceCategoricalColumn(
-      fc_old._categorical_column_with_identity(
-          key=key, num_buckets=num_buckets, default_value=default_value))
+  return fc.SequenceCategoricalColumn(
+      fc.categorical_column_with_identity(
+          key=key,
+          num_buckets=num_buckets,
+          default_value=default_value))
 
 
 def sequence_categorical_column_with_hash_bucket(
@@ -247,7 +250,8 @@ def sequence_categorical_column_with_hash_bucket(
   columns = [tokens_embedding]
 
   features = tf.parse_example(..., features=make_parse_example_spec(columns))
-  input_layer, sequence_length = sequence_input_layer(features, columns)
+  sequence_feature_layer = SequenceFeatureLayer(columns)
+  input_layer, sequence_length = sequence_feature_layer(features)
 
   rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
   outputs, state = tf.nn.dynamic_rnn(
@@ -260,15 +264,17 @@ def sequence_categorical_column_with_hash_bucket(
     dtype: The type of features. Only string and integer types are supported.
 
   Returns:
-    A `_SequenceCategoricalColumn`.
+    A `SequenceCategoricalColumn`.
 
   Raises:
     ValueError: `hash_bucket_size` is not greater than 1.
     ValueError: `dtype` is neither string nor integer.
   """
-  return fc_old._SequenceCategoricalColumn(
-      fc_old._categorical_column_with_hash_bucket(
-          key=key, hash_bucket_size=hash_bucket_size, dtype=dtype))
+  return fc.SequenceCategoricalColumn(
+      fc.categorical_column_with_hash_bucket(
+          key=key,
+          hash_bucket_size=hash_bucket_size,
+          dtype=dtype))
 
 
 def sequence_categorical_column_with_vocabulary_file(
@@ -290,7 +296,8 @@ def sequence_categorical_column_with_vocabulary_file(
   columns = [states_embedding]
 
   features = tf.parse_example(..., features=make_parse_example_spec(columns))
-  input_layer, sequence_length = sequence_input_layer(features, columns)
+  sequence_feature_layer = SequenceFeatureLayer(columns)
+  input_layer, sequence_length = sequence_feature_layer(features)
 
   rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
   outputs, state = tf.nn.dynamic_rnn(
@@ -314,7 +321,7 @@ def sequence_categorical_column_with_vocabulary_file(
     dtype: The type of features. Only string and integer types are supported.
 
   Returns:
-    A `_SequenceCategoricalColumn`.
+    A `SequenceCategoricalColumn`.
 
   Raises:
     ValueError: `vocabulary_file` is missing or cannot be opened.
@@ -323,8 +330,8 @@ def sequence_categorical_column_with_vocabulary_file(
     ValueError: `num_oov_buckets` and `default_value` are both specified.
     ValueError: `dtype` is neither string nor integer.
   """
-  return fc_old._SequenceCategoricalColumn(
-      fc_old._categorical_column_with_vocabulary_file(
+  return fc.SequenceCategoricalColumn(
+      fc.categorical_column_with_vocabulary_file(
           key=key,
           vocabulary_file=vocabulary_file,
           vocabulary_size=vocabulary_size,
@@ -351,7 +358,8 @@ def sequence_categorical_column_with_vocabulary_list(
   columns = [colors_embedding]
 
   features = tf.parse_example(..., features=make_parse_example_spec(columns))
-  input_layer, sequence_length = sequence_input_layer(features, columns)
+  sequence_feature_layer = SequenceFeatureLayer(columns)
+  input_layer, sequence_length = sequence_feature_layer(features)
 
   rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
   outputs, state = tf.nn.dynamic_rnn(
@@ -375,7 +383,7 @@ def sequence_categorical_column_with_vocabulary_list(
       with `default_value`.
 
   Returns:
-    A `_SequenceCategoricalColumn`.
+    A `SequenceCategoricalColumn`.
 
   Raises:
     ValueError: if `vocabulary_list` is empty, or contains duplicate keys.
@@ -383,8 +391,8 @@ def sequence_categorical_column_with_vocabulary_list(
     ValueError: `num_oov_buckets` and `default_value` are both specified.
     ValueError: if `dtype` is not integer or string.
   """
-  return fc_old._SequenceCategoricalColumn(
-      fc_old._categorical_column_with_vocabulary_list(
+  return fc.SequenceCategoricalColumn(
+      fc.categorical_column_with_vocabulary_list(
           key=key,
           vocabulary_list=vocabulary_list,
           dtype=dtype,
diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2_test.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2_test.py
index ca4398a142065de0be7bee57cd7e54670bbae12e..be012a87690c24c6d9b7808790393e1aa6d01211 100644
--- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2_test.py
+++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2_test.py
@@ -26,7 +26,7 @@ from tensorflow.contrib.feature_column.python.feature_column import sequence_fea
 from tensorflow.contrib.feature_column.python.feature_column import sequence_feature_column_v2 as sfc
 from tensorflow.python.feature_column import feature_column as fc_old
 from tensorflow.python.feature_column import feature_column_lib as fc
-from tensorflow.python.feature_column.feature_column import _LazyBuilder
+from tensorflow.python.feature_column.feature_column_v2_test import _TestStateManager
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
@@ -131,7 +131,7 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
         feature_columns=[embedding_column_b, embedding_column_a])
 
     global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-    self.assertItemsEqual(
+    self.assertCountEqual(
         ('sequence_input_layer/aaa_embedding/embedding_weights:0',
          'sequence_input_layer/bbb_embedding/embedding_weights:0'),
         tuple([v.name for v in global_vars]))
@@ -223,7 +223,7 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
         feature_columns=shared_embedding_columns)
 
     global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-    self.assertItemsEqual(
+    self.assertCountEqual(
         ('sequence_input_layer/aaa_bbb_shared_embedding/embedding_weights:0',),
         tuple([v.name for v in global_vars]))
     with monitored_session.MonitoredSession() as sess:
@@ -670,6 +670,23 @@ def _assert_sparse_tensor_indices_shape(test_case, expected, actual):
   test_case.assertAllEqual(expected.dense_shape, actual.dense_shape)
 
 
+def _get_sequence_dense_tensor(column, features):
+  return column.get_sequence_dense_tensor(
+      fc.FeatureTransformationCache(features), None)
+
+
+def _get_sequence_dense_tensor_state(column, features):
+  state_manager = _TestStateManager()
+  column.create_state(state_manager)
+  return column.get_sequence_dense_tensor(
+      fc.FeatureTransformationCache(features), state_manager)
+
+
+def _get_sparse_tensors(column, features):
+  return column.get_sparse_tensors(
+      fc.FeatureTransformationCache(features), None)
+
+
 class SequenceCategoricalColumnWithIdentityTest(
     test.TestCase, parameterized.TestCase):
 
@@ -698,7 +715,7 @@ class SequenceCategoricalColumnWithIdentityTest(
     expected = sparse_tensor.SparseTensorValue(**expected_args)
     column = sfc.sequence_categorical_column_with_identity('aaa', num_buckets=9)
 
-    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+    id_weight_pair = _get_sparse_tensors(column, {'aaa': inputs})
 
     self.assertIsNone(id_weight_pair.weight_tensor)
     with monitored_session.MonitoredSession() as sess:
@@ -737,7 +754,7 @@ class SequenceCategoricalColumnWithHashBucketTest(
     column = sfc.sequence_categorical_column_with_hash_bucket(
         'aaa', hash_bucket_size=10)
 
-    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+    id_weight_pair = _get_sparse_tensors(column, {'aaa': inputs})
 
     self.assertIsNone(id_weight_pair.weight_tensor)
     with monitored_session.MonitoredSession() as sess:
@@ -790,7 +807,7 @@ class SequenceCategoricalColumnWithVocabularyFileTest(
         vocabulary_file=self._wire_vocabulary_file_name,
         vocabulary_size=self._wire_vocabulary_size)
 
-    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+    id_weight_pair = _get_sparse_tensors(column, {'aaa': inputs})
 
     self.assertIsNone(id_weight_pair.weight_tensor)
     with monitored_session.MonitoredSession() as sess:
@@ -814,8 +831,7 @@ class SequenceCategoricalColumnWithVocabularyFileTest(
     input_placeholder_shape[1] = None
     input_placeholder = array_ops.sparse_placeholder(
         dtypes.string, shape=input_placeholder_shape)
-    id_weight_pair = column._get_sparse_tensors(
-        _LazyBuilder({'aaa': input_placeholder}))
+    id_weight_pair = _get_sparse_tensors(column, {'aaa': input_placeholder})
 
     self.assertIsNone(id_weight_pair.weight_tensor)
     with monitored_session.MonitoredSession() as sess:
@@ -855,7 +871,7 @@ class SequenceCategoricalColumnWithVocabularyListTest(
         key='aaa',
         vocabulary_list=('omar', 'stringer', 'marlo'))
 
-    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+    id_weight_pair = _get_sparse_tensors(column, {'aaa': inputs})
 
     self.assertIsNone(id_weight_pair.weight_tensor)
     with monitored_session.MonitoredSession() as sess:
@@ -922,13 +938,12 @@ class SequenceEmbeddingColumnTest(
 
     categorical_column = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column = fc_old._embedding_column(
-        categorical_column,
-        dimension=embedding_dimension,
+    embedding_column = fc.embedding_column(
+        categorical_column, dimension=embedding_dimension,
         initializer=_initializer)
 
-    embedding_lookup, _ = embedding_column._get_sequence_dense_tensor(
-        _LazyBuilder({'aaa': inputs}))
+    embedding_lookup, _ = _get_sequence_dense_tensor_state(
+        embedding_column, {'aaa': inputs})
 
     global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
     self.assertItemsEqual(
@@ -961,10 +976,11 @@ class SequenceEmbeddingColumnTest(
 
     categorical_column = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column = fc_old._embedding_column(categorical_column, dimension=2)
+    embedding_column = fc.embedding_column(
+        categorical_column, dimension=2)
 
-    _, sequence_length = embedding_column._get_sequence_dense_tensor(
-        _LazyBuilder({'aaa': inputs}))
+    _, sequence_length = _get_sequence_dense_tensor_state(
+        embedding_column, {'aaa': inputs})
 
     with monitored_session.MonitoredSession() as sess:
       sequence_length = sess.run(sequence_length)
@@ -988,10 +1004,11 @@ class SequenceEmbeddingColumnTest(
 
     categorical_column = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column = fc_old._embedding_column(categorical_column, dimension=2)
+    embedding_column = fc.embedding_column(
+        categorical_column, dimension=2)
 
-    _, sequence_length = embedding_column._get_sequence_dense_tensor(
-        _LazyBuilder({'aaa': sparse_input}))
+    _, sequence_length = _get_sequence_dense_tensor_state(
+        embedding_column, {'aaa': sparse_input})
 
     with monitored_session.MonitoredSession() as sess:
       self.assertAllEqual(
@@ -1058,22 +1075,18 @@ class SequenceSharedEmbeddingColumnTest(test.TestCase):
         key='aaa', num_buckets=vocabulary_size)
     categorical_column_b = sfc.sequence_categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    shared_embedding_columns = fc.shared_embedding_columns(
+    shared_embedding_columns = fc.shared_embedding_columns_v2(
         [categorical_column_a, categorical_column_b],
         dimension=embedding_dimension,
         initializer=_initializer)
 
-    embedding_lookup_a = shared_embedding_columns[0]._get_sequence_dense_tensor(
-        _LazyBuilder({
-            'aaa': sparse_input_a
-        }))[0]
-    embedding_lookup_b = shared_embedding_columns[1]._get_sequence_dense_tensor(
-        _LazyBuilder({
-            'bbb': sparse_input_b
-        }))[0]
+    embedding_lookup_a = _get_sequence_dense_tensor(
+        shared_embedding_columns[0], {'aaa': sparse_input_a})[0]
+    embedding_lookup_b = _get_sequence_dense_tensor(
+        shared_embedding_columns[1], {'bbb': sparse_input_b})[0]
 
     global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-    self.assertItemsEqual(('embedding_weights:0',),
+    self.assertItemsEqual(('aaa_bbb_shared_embedding:0',),
                           tuple([v.name for v in global_vars]))
     with monitored_session.MonitoredSession() as sess:
       self.assertAllEqual(embedding_values, global_vars[0].eval(session=sess))
@@ -1104,17 +1117,13 @@ class SequenceSharedEmbeddingColumnTest(test.TestCase):
     expected_sequence_length_b = [2, 1]
     categorical_column_b = sfc.sequence_categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    shared_embedding_columns = fc.shared_embedding_columns(
+    shared_embedding_columns = fc.shared_embedding_columns_v2(
         [categorical_column_a, categorical_column_b], dimension=2)
 
-    sequence_length_a = shared_embedding_columns[0]._get_sequence_dense_tensor(
-        _LazyBuilder({
-            'aaa': sparse_input_a
-        }))[1]
-    sequence_length_b = shared_embedding_columns[1]._get_sequence_dense_tensor(
-        _LazyBuilder({
-            'bbb': sparse_input_b
-        }))[1]
+    sequence_length_a = _get_sequence_dense_tensor(
+        shared_embedding_columns[0], {'aaa': sparse_input_a})[1]
+    sequence_length_b = _get_sequence_dense_tensor(
+        shared_embedding_columns[1], {'bbb': sparse_input_b})[1]
 
     with monitored_session.MonitoredSession() as sess:
       sequence_length_a = sess.run(sequence_length_a)
@@ -1155,17 +1164,13 @@ class SequenceSharedEmbeddingColumnTest(test.TestCase):
     categorical_column_b = sfc.sequence_categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
 
-    shared_embedding_columns = fc.shared_embedding_columns(
+    shared_embedding_columns = fc.shared_embedding_columns_v2(
         [categorical_column_a, categorical_column_b], dimension=2)
 
-    sequence_length_a = shared_embedding_columns[0]._get_sequence_dense_tensor(
-        _LazyBuilder({
-            'aaa': sparse_input_a
-        }))[1]
-    sequence_length_b = shared_embedding_columns[1]._get_sequence_dense_tensor(
-        _LazyBuilder({
-            'bbb': sparse_input_b
-        }))[1]
+    sequence_length_a = _get_sequence_dense_tensor(
+        shared_embedding_columns[0], {'aaa': sparse_input_a})[1]
+    sequence_length_b = _get_sequence_dense_tensor(
+        shared_embedding_columns[1], {'bbb': sparse_input_b})[1]
 
     with monitored_session.MonitoredSession() as sess:
       self.assertAllEqual(
@@ -1221,10 +1226,10 @@ class SequenceIndicatorColumnTest(test.TestCase, parameterized.TestCase):
 
     categorical_column = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    indicator_column = fc_old._indicator_column(categorical_column)
+    indicator_column = fc.indicator_column(categorical_column)
 
-    indicator_tensor, _ = indicator_column._get_sequence_dense_tensor(
-        _LazyBuilder({'aaa': inputs}))
+    indicator_tensor, _ = _get_sequence_dense_tensor(
+        indicator_column, {'aaa': inputs})
 
     with monitored_session.MonitoredSession() as sess:
       self.assertAllEqual(expected, indicator_tensor.eval(session=sess))
@@ -1253,10 +1258,10 @@ class SequenceIndicatorColumnTest(test.TestCase, parameterized.TestCase):
 
     categorical_column = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    indicator_column = fc_old._indicator_column(categorical_column)
+    indicator_column = fc.indicator_column(categorical_column)
 
-    _, sequence_length = indicator_column._get_sequence_dense_tensor(
-        _LazyBuilder({'aaa': inputs}))
+    _, sequence_length = _get_sequence_dense_tensor(
+        indicator_column, {'aaa': inputs})
 
     with monitored_session.MonitoredSession() as sess:
       sequence_length = sess.run(sequence_length)
@@ -1282,19 +1287,14 @@ class SequenceIndicatorColumnTest(test.TestCase, parameterized.TestCase):
         key='aaa', num_buckets=vocabulary_size)
     indicator_column = fc.indicator_column(categorical_column)
 
-    _, sequence_length = indicator_column._get_sequence_dense_tensor(
-        _LazyBuilder({'aaa': sparse_input}))
+    _, sequence_length = _get_sequence_dense_tensor(
+        indicator_column, {'aaa': sparse_input})
 
     with monitored_session.MonitoredSession() as sess:
       self.assertAllEqual(
           expected_sequence_length, sequence_length.eval(session=sess))
 
 
-def _get_sequence_dense_tensor(column, features):
-  return column.get_sequence_dense_tensor(
-      fc.FeatureTransformationCache(features), None)
-
-
 class SequenceNumericColumnTest(test.TestCase, parameterized.TestCase):
 
   def test_defaults(self):
diff --git a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
index 93b1aaa85e88e00c1b12a388321a4d6fb10f1611..c541c71f996c7a1b36cf28ae9a1783f8dca0a72c 100644
--- a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
+++ b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
@@ -522,7 +522,7 @@ void LaunchFusedConv2DBiasActivationOp<GPUDevice, T, BiasType, ScaleType>::
   auto bias_ptr = AsDeviceMemory(bias.template flat<BiasType>().data(),
                                  bias.template flat<BiasType>().size());
 
-  static int64 ConvolveScratchSize = GetCudnnWorkspaceLimit(
+  static int64 ConvolveScratchSize = GetDnnWorkspaceLimit(
       // default value is in bytes despite the name of the environment variable
       "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32  // 4GB
   );
@@ -570,7 +570,7 @@ void LaunchFusedConv2DBiasActivationOp<GPUDevice, T, BiasType, ScaleType>::
     for (auto profile_algorithm : algorithms) {
       // TODO(zhengxq): profile each algorithm multiple times to better
       // accuracy.
-      CudnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
+      DnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
       dnn::ProfileResult profile_result;
       bool cudnn_launch_status =
           stream
@@ -609,7 +609,7 @@ void LaunchFusedConv2DBiasActivationOp<GPUDevice, T, BiasType, ScaleType>::
                                                       algorithm_config);
   }
 
-  CudnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
+  DnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
   bool cudnn_launch_status =
       stream
           ->ThenFusedConvolveWithAlgorithm(
diff --git a/tensorflow/contrib/gan/python/features/python/conditioning_utils_impl.py b/tensorflow/contrib/gan/python/features/python/conditioning_utils_impl.py
index e2594faf85bcf91cbe09f266e4d4211d20bdee17..364fa4eb461c62784803f0c309e3b7c5855df199 100644
--- a/tensorflow/contrib/gan/python/features/python/conditioning_utils_impl.py
+++ b/tensorflow/contrib/gan/python/features/python/conditioning_utils_impl.py
@@ -64,6 +64,9 @@ def condition_tensor(tensor, conditioning):
   """
   tensor.shape[1:].assert_is_fully_defined()
   num_features = tensor.shape[1:].num_elements()
+  if conditioning.shape.ndims < 2:
+    raise ValueError('conditioning must be at least 2D, but saw shape: %s'
+                     % conditioning.shape)
 
   mapped_conditioning = layers.linear(
       layers.flatten(conditioning), num_features)
diff --git a/tensorflow/contrib/gan/python/features/python/conditioning_utils_test.py b/tensorflow/contrib/gan/python/features/python/conditioning_utils_test.py
index 0aad769793761be69ee9d1e3416e44c7b3d8cea0..f5c7d53cf2c9aa08ba0074950983ef3ecd90168b 100644
--- a/tensorflow/contrib/gan/python/features/python/conditioning_utils_test.py
+++ b/tensorflow/contrib/gan/python/features/python/conditioning_utils_test.py
@@ -45,7 +45,7 @@ class ConditioningUtilsTest(test.TestCase):
           array_ops.placeholder(dtypes.float32, (5, None)),
           array_ops.placeholder(dtypes.float32, (5, 1)))
 
-    with self.assertRaisesRegexp(ValueError, 'expected min_ndim=2'):
+    with self.assertRaisesRegexp(ValueError, 'at least 2D'):
       conditioning_utils.condition_tensor(
           array_ops.placeholder(dtypes.float32, (5, 2)),
           array_ops.placeholder(dtypes.float32, (5)))
diff --git a/tensorflow/contrib/gdr/BUILD b/tensorflow/contrib/gdr/BUILD
index e534fdc17749974ebe713c2730682bea6d7a85e4..704be917b3680a1b5712f4f1dc5059b354db8610 100644
--- a/tensorflow/contrib/gdr/BUILD
+++ b/tensorflow/contrib/gdr/BUILD
@@ -37,7 +37,7 @@ tf_proto_library_cc(
     ],
 )
 
-tf_cuda_library(
+cc_library(
     name = "gdr_memory_manager",
     srcs = ["gdr_memory_manager.cc"],
     hdrs = ["gdr_memory_manager.h"],
@@ -58,7 +58,7 @@ tf_cuda_library(
     ],
 )
 
-tf_cuda_library(
+cc_library(
     name = "gdr_worker",
     srcs = ["gdr_worker.cc"],
     hdrs = ["gdr_worker.h"],
diff --git a/tensorflow/contrib/gdr/gdr.proto b/tensorflow/contrib/gdr/gdr.proto
index c0b89245b150bfa49cb527d25b6e1f324f353b25..bd438787c3374be6ead4f6233101fd1f548643ea 100644
--- a/tensorflow/contrib/gdr/gdr.proto
+++ b/tensorflow/contrib/gdr/gdr.proto
@@ -9,5 +9,4 @@ message RemoteMemoryRegion {
   uint64 addr = 3;
   uint32 rkey = 4;
   uint32 tensor_key = 5;
-  uint64 checksum = 6;
 }
diff --git a/tensorflow/contrib/gdr/gdr_memory_manager.cc b/tensorflow/contrib/gdr/gdr_memory_manager.cc
index 53587fcf3050f313c85485f77ce411cba7faccff..ce1875151597f926aeb6392e7fc8307312da123f 100644
--- a/tensorflow/contrib/gdr/gdr_memory_manager.cc
+++ b/tensorflow/contrib/gdr/gdr_memory_manager.cc
@@ -26,17 +26,14 @@ limitations under the License.
 #include <fcntl.h>
 #include <rdma/rdma_cma.h>
 #include <rdma/rdma_verbs.h>
-#include <sys/epoll.h>
 
 #include "tensorflow/contrib/gdr/gdr.pb.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
-#include "tensorflow/core/common_runtime/process_state.h"
-#if GOOGLE_CUDA
 #include "tensorflow/core/common_runtime/gpu/gpu_process_state.h"
-#include "tensorflow/core/common_runtime/gpu/gpu_util.h"
-#endif  // GOOGLE_CUDA
+#include "tensorflow/core/common_runtime/process_state.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/numa.h"
@@ -81,10 +78,6 @@ int TryToReadNumaNode(ibv_device* device) {
   int32 value;
   if (strings::safe_strto32(content, &value)) {
     if (value < 0) {
-      LOG(INFO) << "Successful NUMA node read from SysFS had negative value ("
-                << value
-                << "), but there must be at least one NUMA node"
-                   ", so returning NUMA node zero";
       return port::kNUMANoAffinity;
     }
     LOG(INFO) << "NUMA node for device: " << device->name << " is " << value;
@@ -114,7 +107,7 @@ class GdrMemoryManager : public RemoteMemoryManager {
  public:
   GdrMemoryManager(const string& host, const string& port);
 
-  virtual ~GdrMemoryManager();
+  virtual ~GdrMemoryManager() {}
 
   virtual Status Init() override;
 
@@ -140,7 +133,7 @@ class GdrMemoryManager : public RemoteMemoryManager {
     return ptr < reinterpret_cast<char*>(other->addr) + other->length;
   }
 
-  ibv_mr* FindMemoryRegion(void* addr, size_t length);
+  ibv_mr* FindMemoryRegion(const Tensor* tensor);
 
   void InsertMemoryRegion(void* addr, size_t length,
                           const std::string& allocator_name);
@@ -152,7 +145,6 @@ class GdrMemoryManager : public RemoteMemoryManager {
   const string port_;
   RdmaEndpointPtr listening_;
   std::atomic<bool> stopped_;
-  int epfd_;
   int numa_node_;
 
   // Server side endpoints
@@ -163,15 +155,19 @@ class GdrMemoryManager : public RemoteMemoryManager {
   std::atomic<TensorKey> next_key_;
 
   // Server side on-the-fly tensor buffers
-  mutex server_mu_;
-  std::map<TensorKey, const TensorBuffer*> tensor_buffers_
-      GUARDED_BY(server_mu_);
+  mutex buf_mu_;
+  std::map<TensorKey, const TensorBuffer*> tensor_buffers_ GUARDED_BY(buf_mu_);
 
   // Client side endpoints
   mutex client_mu_;
   std::map<std::pair<string, string>, RdmaEndpointPtr> clients_
       GUARDED_BY(client_mu_);
 
+  // Client side callbacks
+  mutex callback_mu_;
+  std::map<TensorKey, StatusCallback> tensor_callbacks_
+      GUARDED_BY(callback_mu_);
+
   // Managed memory regions
   mutex alloc_mu_;
   std::vector<MemoryRegionPtr> mrs_ GUARDED_BY(alloc_mu_);
@@ -184,16 +180,9 @@ GdrMemoryManager::GdrMemoryManager(const string& host, const string& port)
       port_(port),
       listening_(nullptr, EndpointDeleter),
       stopped_(true),
-      next_key_(0) {}
-
-GdrMemoryManager::~GdrMemoryManager() { close(epfd_); }
+      next_key_(static_cast<uint32_t>(random::New64())) {}
 
 Status GdrMemoryManager::Init() {
-  epfd_ = epoll_create1(0);
-  if (epfd_ == -1) {
-    return errors::Unavailable(strerror(errno), ": ", "epoll_create");
-  }
-
   rdma_addrinfo* addrinfo;
   rdma_addrinfo hints = {};
   hints.ai_port_space = RDMA_PS_TCP;
@@ -206,7 +195,7 @@ Status GdrMemoryManager::Init() {
 
   ibv_qp_init_attr init_attr = {};
   init_attr.qp_type = IBV_QPT_RC;
-  init_attr.cap.max_recv_wr = 32;
+  init_attr.cap.max_recv_wr = 1024;
   init_attr.cap.max_send_wr = 1;
   init_attr.cap.max_recv_sge = 1;
   init_attr.cap.max_send_sge = 1;
@@ -239,14 +228,6 @@ Status GdrMemoryManager::Init() {
                                "cannot set server to non-blocking mode");
   }
 
-  epoll_event event = {};
-  event.events = EPOLLIN | EPOLLPRI;
-  event.data.ptr = listening_.get();
-  if (epoll_ctl(epfd_, EPOLL_CTL_ADD, listening_->channel->fd, &event)) {
-    return errors::Unavailable(strerror(errno), ": ",
-                               "cannot add server to epoll");
-  }
-
   numa_node_ = TryToReadNumaNode(listening_->verbs->device);
 
   SubAllocator::Visitor alloc_visitor = [this](void* ptr, int numa_node,
@@ -265,121 +246,114 @@ Status GdrMemoryManager::Init() {
   ProcessState::singleton()->AddCPUFreeVisitor(free_visitor);
   LOG(INFO) << "Instrumenting CPU allocator(s)";
 
-#if GOOGLE_CUDA
   for (int numa_idx = 0; numa_idx < port::NUMANumNodes(); ++numa_idx) {
     GPUProcessState::singleton()->AddCUDAHostAllocVisitor(numa_idx,
                                                           alloc_visitor);
     GPUProcessState::singleton()->AddCUDAHostFreeVisitor(numa_idx,
                                                          free_visitor);
   }
+
   if (IsGDRAvailable()) {
     SubAllocator::Visitor cuda_alloc_visitor = [this](void* ptr, int gpu_id,
                                                       size_t num_bytes) {
       VLOG(2) << "Registering RDMA capable memory region on GPU " << gpu_id;
       InsertMemoryRegion(ptr, num_bytes, strings::StrCat("GPU:", gpu_id));
     };
-    for (int numa_idx = 0; numa_idx < port::NUMANumNodes(); ++numa_idx) {
-      GPUProcessState::singleton()->AddGPUAllocVisitor(numa_idx,
-                                                       cuda_alloc_visitor);
-    }
-    VLOG(1) << "Instrumenting GPU allocator(s) for all Numas";
+    GPUProcessState::singleton()->AddGPUAllocVisitor(numa_node_,
+                                                     cuda_alloc_visitor);
+    LOG(INFO) << "Instrumenting GPU allocator for NUMA " << numa_node_;
   }
-#endif  // GOOGLE_CUDA
+
   return Status::OK();
 }
 
 void GdrMemoryManager::Run() {
   stopped_ = false;
   while (!stopped_) {
-    epoll_event events[32];
-    int ret = epoll_wait(epfd_, events, 32, 1);
-    if (ret == -1) {
-      LOG(ERROR) << "epoll_wait: " << strerror(errno);
-      return;
-    }
-    for (int i = 0; i < ret; i++) {
-      rdma_cm_id* id = static_cast<rdma_cm_id*>(events[i].data.ptr);
-      if (id == listening_.get()) {
-        // Accept incoming connections
-        if (!rdma_get_request(listening_.get(), &id)) {
-          if (!rdma_accept(id, nullptr)) {
-            LOG(INFO) << "Accepted new RDMA connection";
-            if (ibv_req_notify_cq(id->recv_cq, 0)) {
-              LOG(ERROR) << strerror(errno) << ": ibv_req_notify_cq failed";
-              EndpointDeleter(id);
-              continue;
-            }
-            for (int i = 0; i < 32; i++) {
-              if (rdma_post_recvv(id, nullptr, nullptr, 0)) {
-                LOG(ERROR) << strerror(errno) << ": rdma_post_recvv failed";
-                EndpointDeleter(id);
-                continue;
-              }
-            }
-            int flags = fcntl(id->recv_cq_channel->fd, F_GETFL, 0);
-            if (fcntl(id->recv_cq_channel->fd, F_SETFL, flags | O_NONBLOCK)) {
-              LOG(ERROR) << strerror(errno)
-                         << ": cannot set server_client to non-blocking mode";
-              EndpointDeleter(id);
-              continue;
-            }
-            epoll_event event = {};
-            event.events = EPOLLIN | EPOLLPRI;
-            event.data.ptr = id;
-            if (epoll_ctl(epfd_, EPOLL_CTL_ADD, id->recv_cq_channel->fd,
-                          &event)) {
-              LOG(ERROR) << strerror(errno)
-                         << ": cannot add server client to epoll";
-              EndpointDeleter(id);
-              continue;
-            }
-            server_clients_.push_back({id, EndpointDeleter});
+    rdma_cm_id* id = nullptr;
+    // Accept incoming connections
+    if (!rdma_get_request(listening_.get(), &id)) {
+      if (!rdma_accept(id, nullptr)) {
+        LOG(INFO) << "Accepted new RDMA connection";
+        for (int i = 0; i < 1024; i++) {
+          if (rdma_post_recvv(id, nullptr, nullptr, 0)) {
+            LOG(ERROR) << strerror(errno) << ": rdma_post_recvv failed";
+            EndpointDeleter(id);
+            continue;
           }
         }
-      } else {
-        // Polling work completions
-        ibv_cq* cq;
-        void* context;
-        if (!ibv_get_cq_event(id->recv_cq_channel, &cq, &context)) {
-          ibv_ack_cq_events(id->recv_cq, 1);
-          if (ibv_req_notify_cq(id->recv_cq, 0)) {
-            LOG(ERROR) << strerror(errno) << ": ibv_req_notify_cq failed";
-            continue;
+        server_clients_.push_back({id, EndpointDeleter});
+      }
+    }
+    // Polling server side work completions
+    for (const auto& client : server_clients_) {
+      ibv_wc wc[32];
+      int ret = ibv_poll_cq(client->recv_cq, 32, wc);
+      if (ret < 0) {
+        LOG(ERROR) << "ibv_poll_cq failed";
+        continue;
+      }
+      for (int i = 0; i < ret; i++) {
+        if (wc[i].opcode != IBV_WC_RECV_RDMA_WITH_IMM) {
+          LOG(ERROR) << "Received unknown operation " << wc[i].opcode;
+        }
+        if (wc[i].status != 0) {
+          LOG(ERROR) << ibv_wc_status_str(wc[i].status);
+        }
+        TensorKey tensor_key = ntohl(wc[i].imm_data);
+
+        if (rdma_post_recvv(client.get(), nullptr, nullptr, 0)) {
+          perror("rdma_post_recvv");
+          LOG(ERROR) << "rdma_post_recvv failed";
+        }
+
+        mutex_lock l(buf_mu_);
+        auto iter = tensor_buffers_.find(tensor_key);
+        if (iter == std::end(tensor_buffers_)) {
+          LOG(ERROR) << "Cannot find tensor buffer for tensor key "
+                     << tensor_key;
+        } else {
+          const TensorBuffer* buffer = iter->second;
+          buffer->Unref();
+          tensor_buffers_.erase(iter);
+        }
+      }
+    }
+    // Polling client side work completions
+    if (client_mu_.try_lock()) {
+      for (const auto& client : clients_) {
+        ibv_wc wc[32];
+        int ret = ibv_poll_cq(client.second->send_cq, 32, wc);
+        for (int i = 0; i < ret; i++) {
+          Status s;
+          if (wc[i].status) {
+            s = errors::Unavailable(ibv_wc_status_str(wc[i].status));
+          } else {
+            s = Status::OK();
           }
-          ibv_wc wc[32];
-          int ret = ibv_poll_cq(id->recv_cq, 32, wc);
-          if (ret < 0) {
-            LOG(ERROR) << "ibv_poll_cq failed";
-            continue;
+          TensorKey key = wc[i].wr_id;
+
+          ibv_send_wr wr = {};
+          wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
+          wr.imm_data = htonl(key);
+          ibv_send_wr* bad_wr;
+          if (ibv_post_send(client.second->qp, &wr, &bad_wr)) {
+            LOG(ERROR) << strerror(errno)
+                       << ": ibv_post_send failed for tensor_key " << key;
           }
-          for (int i = 0; i < ret; i++) {
-            if (wc[i].opcode != IBV_WC_RECV_RDMA_WITH_IMM) {
-              LOG(ERROR) << "Received unknown operation " << wc[i].opcode;
-            }
-            if (wc[i].status != 0) {
-              LOG(ERROR) << ibv_wc_status_str(wc[i].status);
-            }
-            TensorKey tensor_key = ntohl(wc[i].imm_data);
-            {
-              mutex_lock l(server_mu_);
-              auto iter = tensor_buffers_.find(tensor_key);
-              if (iter == std::end(tensor_buffers_)) {
-                LOG(ERROR) << "Cannot find tensor buffer for tensor key "
-                           << tensor_key;
-              } else {
-                const TensorBuffer* buffer = iter->second;
-                buffer->Unref();
-                tensor_buffers_.erase(iter);
-              }
-            }
-            if (rdma_post_recvv(id, nullptr, nullptr, 0)) {
-              perror("rdma_post_recvv");
-              LOG(ERROR) << "rdma_post_recvv failed";
-              continue;
-            }
+
+          mutex_lock l(callback_mu_);
+          auto iter = tensor_callbacks_.find(key);
+          if (iter != std::end(tensor_callbacks_)) {
+            iter->second(s);
+            tensor_callbacks_.erase(iter);
+          } else {
+            LOG(WARNING) << "Cannot find client callback with tensor key "
+                         << key;
           }
         }
       }
+      client_mu_.unlock();
     }
   }
 }
@@ -390,116 +364,58 @@ void GdrMemoryManager::TransportOptionsFromTensor(
     ::google::protobuf::Any* mutable_transport_options, const Tensor& tensor,
     Device* device, DeviceContext* device_context, bool on_host,
     StatusCallback done) {
-  auto buffer = DMAHelper::buffer(&tensor);
-  void* addr = buffer->data();
-  size_t length = buffer->size();
-  if (length == 0) {
-    done(errors::Unavailable("Cannot register tensor buffer of size 0"));
-    return;
-  }
+  ibv_mr* mr = FindMemoryRegion(&tensor);
+  const TensorBuffer* buffer = DMAHelper::buffer(&tensor);
 
-  ibv_mr* mr = FindMemoryRegion(addr, length);
-
-#if GOOGLE_CUDA
-  if (device->tensorflow_gpu_device_info() && !on_host) {
-    Allocator* alloc = GPUProcessState::singleton()->GetCUDAHostAllocator(0);
-    Tensor* host_copy = new Tensor(alloc, tensor.dtype(), tensor.shape());
-    GPUUtil::CopyGPUTensorToCPU(
-        device, device_context, &tensor, host_copy,
-        [done, host_copy, mutable_transport_options, this](const Status& s) {
-          if (!s.ok()) {
-            done(s);
-            delete host_copy;
-            return;
-          }
-          auto buffer = DMAHelper::buffer(host_copy);
-          void* addr = buffer->data();
-          size_t length = buffer->size();
-          ibv_mr* mr = FindMemoryRegion(addr, length);
-
-          if (mr == nullptr) {
-            done(errors::Unavailable("Cannot find pinned memory region"));
-            delete host_copy;
-            return;
-          }
-
-          buffer->Ref();
-          TensorKey tensor_key = next_key_++;
-          {
-            mutex_lock l(server_mu_);
-            tensor_buffers_.insert(std::make_pair(tensor_key, buffer));
-          }
-
-          uint64_t checksum = 0;
-          if (VLOG_IS_ON(2)) {
-            checksum = GPUUtil::Checksum(*host_copy);
-          }
-
-          RemoteMemoryRegion remote_mr;
-          remote_mr.set_host(host_);
-          remote_mr.set_port(port_);
-          remote_mr.set_addr(reinterpret_cast<uint64_t>(addr));
-          remote_mr.set_rkey(mr->rkey);
-          remote_mr.set_tensor_key(tensor_key);
-          remote_mr.set_checksum(checksum);
-          mutable_transport_options->PackFrom(remote_mr);
-
-          done(Status::OK());
-          delete host_copy;
-        });
-    return;
-  }
-#endif
+  Tensor* copy = nullptr;
 
   if (mr == nullptr) {
-    Allocator* alloc = ProcessState::singleton()->GetCPUAllocator(numa_node_);
-    Tensor host_copy(alloc, tensor.dtype(), tensor.shape());
-
-    std::memcpy(DMAHelper::buffer(&host_copy)->data(), buffer->data(), length);
-    VLOG(2) << "Copying " << length << " bytes unpinned tensor buffer";
-
-    buffer = DMAHelper::buffer(&host_copy);
-    addr = buffer->data();
-    length = buffer->size();
-
-    mr = FindMemoryRegion(addr, length);
+    AllocatorAttributes alloc_attrs;
+    alloc_attrs.set_gpu_compatible(true);
+    alloc_attrs.set_nic_compatible(true);
+    alloc_attrs.set_on_host(true);
+    Allocator* alloc = device->GetAllocator(alloc_attrs);
+    copy = new Tensor(alloc, tensor.dtype(), tensor.shape());
+
+    mr = FindMemoryRegion(copy);
+    buffer = DMAHelper::buffer(copy);
     if (mr == nullptr) {
       done(errors::Unavailable("Cannot find pinned memory region"));
+      delete copy;
       return;
     }
-
-    buffer->Ref();
-  } else {
-    buffer->Ref();
   }
 
   TensorKey tensor_key = next_key_++;
+  buffer->Ref();
   {
-    mutex_lock l(server_mu_);
+    mutex_lock l(buf_mu_);
     tensor_buffers_.insert(std::make_pair(tensor_key, buffer));
   }
 
-  uint64_t checksum = 0;
-  if (VLOG_IS_ON(2)) {
-#ifdef GOOGLE_CUDA
-    if (device->tensorflow_gpu_device_info() && !on_host) {
-      checksum = GPUUtil::Checksum(device, device_context, tensor);
-    } else {
-      checksum = GPUUtil::Checksum(tensor);
-    }
-#endif
-  }
-
   RemoteMemoryRegion remote_mr;
   remote_mr.set_host(host_);
   remote_mr.set_port(port_);
-  remote_mr.set_addr(reinterpret_cast<uint64_t>(addr));
+  remote_mr.set_addr(reinterpret_cast<uint64_t>(buffer->data()));
   remote_mr.set_rkey(mr->rkey);
   remote_mr.set_tensor_key(tensor_key);
-  remote_mr.set_checksum(checksum);
   mutable_transport_options->PackFrom(remote_mr);
 
-  done(Status::OK());
+  if (copy && device->tensorflow_gpu_device_info() && !on_host) {
+    device_context->CopyDeviceTensorToCPU(&tensor, "" /* tensor_name */, device,
+                                          copy, [done, copy](const Status& s) {
+                                            done(s);
+                                            delete copy;
+                                          });
+    return;
+  } else if (copy) {
+    std::memcpy(buffer->data(), DMAHelper::buffer(&tensor)->data(),
+                buffer->size());
+    done(Status::OK());
+    delete copy;  // OK to delete; we have reffed the underlying TensorBuffer
+  } else {
+    done(Status::OK());
+  }
 }
 
 void GdrMemoryManager::TensorFromTransportOptions(
@@ -512,42 +428,10 @@ void GdrMemoryManager::TensorFromTransportOptions(
     return;
   }
 
-  auto buffer = DMAHelper::buffer(tensor);
-  void* addr = buffer->data();
-  size_t length = buffer->size();
-  ibv_mr* mr = FindMemoryRegion(addr, length);
-
-  Tensor host_copy;
-#if GOOGLE_CUDA
-  if (mr == nullptr && !on_host) {
-    Allocator* alloc =
-        GPUProcessState::singleton()->GetCUDAHostAllocator(numa_node_);
-    host_copy = Tensor(alloc, tensor->dtype(), tensor->shape());
-    buffer = DMAHelper::buffer(&host_copy);
-    addr = buffer->data();
-    length = buffer->size();
-    mr = FindMemoryRegion(addr, length);
-  }
-#endif  // GOOGLE_CUDA
-
-  if (mr == nullptr) {
-    Allocator* alloc = ProcessState::singleton()->GetCPUAllocator(numa_node_);
-    host_copy = Tensor(alloc, tensor->dtype(), tensor->shape());
-
-    buffer = DMAHelper::buffer(&host_copy);
-    addr = buffer->data();
-    length = buffer->size();
-
-    mr = FindMemoryRegion(addr, length);
-    if (mr == nullptr) {
-      done(errors::Unavailable("Cannot find pinned memory region"));
-      return;
-    }
-  }
-
-  decltype(clients_)::iterator iter;
-  bool success;
+  rdma_cm_id* id = nullptr;
   {
+    decltype(clients_)::iterator iter;
+    bool success;
     mutex_lock l(client_mu_);
     std::tie(iter, success) = clients_.insert(
         std::make_pair(std::make_pair(remote_mr.host(), remote_mr.port()),
@@ -560,93 +444,94 @@ void GdrMemoryManager::TensorFromTransportOptions(
         return;
       }
     }
-  }
-  rdma_cm_id* id = iter->second.get();
-
-  uint64_t start = Env::Default()->NowMicros();
-
-  if (rdma_post_read(id, nullptr, buffer->data(), buffer->size(), mr, 0,
-                     remote_mr.addr(), remote_mr.rkey())) {
-    done(errors::Unavailable(strerror(errno), ": ", "rdma_post_read failed"));
-    return;
+    id = iter->second.get();
   }
 
-  ibv_send_wr wr = {};
-  wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
-  wr.imm_data = htonl(remote_mr.tensor_key());
-  wr.send_flags = IBV_SEND_SIGNALED;
-  ibv_send_wr* bad_wr;
-  if (ibv_post_send(id->qp, &wr, &bad_wr)) {
-    done(errors::Unavailable(strerror(errno), ": ", "ibv_post_send failed"));
-    return;
-  }
+  ibv_mr* mr = FindMemoryRegion(tensor);
+  const TensorBuffer* buffer = DMAHelper::buffer(tensor);
 
-  ibv_wc wc = {};
-  int ret;
-  while ((ret = ibv_poll_cq(id->send_cq, 1, &wc)) == 0)
-    ;
-  if (ret < 0 || wc.status) {
-    done(errors::Unavailable(ibv_wc_status_str(wc.status)));
-    return;
-  }
+  const Tensor* copy = nullptr;
 
-#if GOOGLE_CUDA
-  if (device->tensorflow_gpu_device_info() && !on_host &&
-      host_copy.NumElements() > 0) {
-    uint64_t checksum = 0;
-    if (VLOG_IS_ON(2)) {
-      checksum = GPUUtil::Checksum(host_copy);
-      CHECK(checksum == remote_mr.checksum())
-          << "Checksum mismatch: " << checksum << "!=" << remote_mr.checksum();
+  if (mr == nullptr) {
+    AllocatorAttributes alloc_attrs;
+    alloc_attrs.set_gpu_compatible(true);
+    alloc_attrs.set_nic_compatible(true);
+    alloc_attrs.set_on_host(true);
+    Allocator* alloc = device->GetAllocator(alloc_attrs);
+    copy = new Tensor(alloc, tensor->dtype(), tensor->shape());
+
+    mr = FindMemoryRegion(copy);
+    buffer = DMAHelper::buffer(copy);
+    if (mr == nullptr) {
+      done(errors::Unavailable("Cannot find pinned memory region"));
+      delete copy;
+      return;
     }
-    Tensor* ref = new Tensor;
-    std::swap(host_copy, *ref);
-    GPUUtil::CopyCPUTensorToGPU(
-        ref, device_context, device, tensor,
-        [ref, done, buffer, remote_mr, start](const Status& s) {
-          if (!s.ok()) {
-            done(s);
-            delete ref;
-            return;
-          }
-          uint64_t end = Env::Default()->NowMicros();
-
-          VLOG(2) << "RDMA from remote memory region " << remote_mr.rkey()
-                  << " of size " << buffer->size() << " with tensor key "
-                  << remote_mr.tensor_key() << " took " << (end - start)
-                  << " micros";
-          done(Status::OK());
-          delete ref;
-        });
-    return;
   }
-#endif  // GOOGLE_CUDA
 
-  if ((on_host || !device->tensorflow_gpu_device_info()) &&
-      host_copy.NumElements() > 0) {
-    std::memcpy(DMAHelper::buffer(tensor)->data(), addr, length);
-    VLOG(2) << "Copying " << length << " bytes unpinned tensor buffer";
-  }
+  uint64_t start = Env::Default()->NowMicros();
 
-  uint64_t end = Env::Default()->NowMicros();
+  TensorKey tensor_key = remote_mr.tensor_key();
 
-  VLOG(2) << "RDMA from remote memory region " << remote_mr.rkey()
-          << " of size " << buffer->size() << " with tensor key "
-          << remote_mr.tensor_key() << " took " << (end - start) << " micros";
+  StatusCallback callback = [done, copy, device, device_context, on_host,
+                             tensor, start, tensor_key](const Status& s) {
+    if (!s.ok()) {
+      done(s);
+      if (copy) {
+        delete copy;
+      }
+      return;
+    }
 
-  uint64_t checksum = 0;
-  if (VLOG_IS_ON(2)) {
-#ifdef GOOGLE_CUDA
-    if (device->tensorflow_gpu_device_info() && !on_host) {
-      checksum = GPUUtil::Checksum(device, device_context, *tensor);
+    VLOG(2) << "RDMA of tensor " << tensor_key << " of size "
+            << DMAHelper::buffer(tensor)->size() << " took "
+            << (Env::Default()->NowMicros() - start) << " micros";
+
+    if (copy && device->tensorflow_gpu_device_info() && !on_host) {
+      device_context->CopyCPUTensorToDevice(copy, device, tensor,
+                                            [done, copy](const Status& s) {
+                                              done(s);
+                                              delete copy;
+                                            });
+    } else if (copy) {
+      std::memcpy(DMAHelper::buffer(tensor)->data(),
+                  DMAHelper::buffer(copy)->data(),
+                  DMAHelper::buffer(copy)->size());
+      done(s);
+      delete copy;
     } else {
-      checksum = GPUUtil::Checksum(*tensor);
+      done(s);
+    }
+  };
+
+  {
+    mutex_lock l(callback_mu_);
+    if (tensor_callbacks_.find(tensor_key) == std::end(tensor_callbacks_)) {
+      tensor_callbacks_.insert(std::make_pair(tensor_key, std::move(callback)));
+    } else {
+      done(errors::Unavailable("Received duplicated tensor key"));
+      if (copy) {
+        delete copy;
+      }
+      return;
+    }
+  }
+
+  if (rdma_post_read(id, reinterpret_cast<void*>(tensor_key), buffer->data(),
+                     buffer->size(), mr, IBV_SEND_SIGNALED, remote_mr.addr(),
+                     remote_mr.rkey())) {
+    done(errors::Unavailable(strerror(errno), ": ", "rdma_post_read failed"));
+    {
+      mutex_lock l(callback_mu_);
+      auto iter = tensor_callbacks_.find(tensor_key);
+      if (iter != std::end(tensor_callbacks_)) {
+        tensor_callbacks_.erase(iter);
+      }
+    }
+    if (copy) {
+      delete copy;
     }
-    CHECK(checksum == remote_mr.checksum())
-        << "Checksum mismatch: " << checksum << "!=" << remote_mr.checksum();
-#endif
   }
-  done(Status::OK());
 }
 
 Status GdrMemoryManager::CreateEndpoint(const string& host, const string& port,
@@ -663,7 +548,7 @@ Status GdrMemoryManager::CreateEndpoint(const string& host, const string& port,
   ibv_qp_init_attr init_attr = {};
   init_attr.qp_type = IBV_QPT_RC;
   init_attr.cap.max_recv_wr = 1;
-  init_attr.cap.max_send_wr = 32;
+  init_attr.cap.max_send_wr = 1024;
   init_attr.cap.max_recv_sge = 1;
   init_attr.cap.max_send_sge = 1;
 
@@ -687,8 +572,8 @@ Status GdrMemoryManager::CreateEndpoint(const string& host, const string& port,
   return Status::OK();
 }
 
-ibv_mr* GdrMemoryManager::FindMemoryRegion(void* addr, size_t length) {
-  if (length == 0) return nullptr;
+ibv_mr* GdrMemoryManager::FindMemoryRegion(const Tensor* tensor) {
+  const void* addr = DMAHelper::buffer(tensor)->data();
   mutex_lock l(alloc_mu_);
   auto iter = std::upper_bound(mrs_.begin(), mrs_.end(), addr, &Comparator);
   if (iter == std::end(mrs_) || iter->get()->addr > addr) {
diff --git a/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc b/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc
index fbccbead03fc0d641db40ede661bf3677d44c45d..5f8c300155770ed03ad12a9fa5ac74456edaf024 100644
--- a/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc
+++ b/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc
@@ -58,11 +58,9 @@ class GdrRecvTensorCall : public BaseRecvTensorCall {
     resp_.InitAlloc(dst_device_, recv_args_.alloc_attrs);
     StatusCallback cb = [this, recv_done](const Status& s) {
       bool dma_ok = resp_.metadata().has_transport_options();
-      if (s.ok() && tensor().TotalBytes() > 0 && (!is_dead()) && dma_ok) {
+      if (s.ok() && tensor().TotalBytes() > 1024 && (!is_dead()) && dma_ok) {
         auto transport_options = resp_.metadata().transport_options();
-        const bool on_host =
-            (dst_device_->tensorflow_gpu_device_info() == nullptr) ||
-            recv_args_.alloc_attrs.on_host();
+        const bool on_host = recv_args_.alloc_attrs.on_host();
         remote_memory_manager_->TensorFromTransportOptions(
             const_cast<Tensor*>(&tensor()), transport_options, dst_device_,
             recv_args_.device_context, on_host,
@@ -70,9 +68,6 @@ class GdrRecvTensorCall : public BaseRecvTensorCall {
               if (!s.ok()) {
                 mutex_lock l(mu_);
                 status_.Update(s);
-                LOG(ERROR) << "Cannot find pinned memory region from allocator "
-                           << dst_device_->GetAllocator(recv_args_.alloc_attrs)
-                                  ->Name();
               }
               recv_done();
             });
diff --git a/tensorflow/contrib/gdr/gdr_server_lib.cc b/tensorflow/contrib/gdr/gdr_server_lib.cc
index b3f48ec1dd9c75055f4e1ea76eb203b6ccf94718..dc0d5d548b80d36409778ef34e63171441f10142 100644
--- a/tensorflow/contrib/gdr/gdr_server_lib.cc
+++ b/tensorflow/contrib/gdr/gdr_server_lib.cc
@@ -74,9 +74,8 @@ Status GdrServer::Start() {
 }
 
 Status GdrServer::Stop() {
-  TF_RETURN_IF_ERROR(GrpcServer::Stop());
   remote_memory_manager_->Stop();
-  return Status::OK();
+  return GrpcServer::Stop();
 }
 
 Status GdrServer::Join() {
diff --git a/tensorflow/contrib/gdr/gdr_worker.cc b/tensorflow/contrib/gdr/gdr_worker.cc
index 867cb83f42034c8e9061e333ea671457745f92c3..016e5ea27b397830c69b6e1761b5994ebcfa9c3d 100644
--- a/tensorflow/contrib/gdr/gdr_worker.cc
+++ b/tensorflow/contrib/gdr/gdr_worker.cc
@@ -18,9 +18,6 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
-#if GOOGLE_CUDA
-#include "tensorflow/core/common_runtime/gpu/gpu_util.h"
-#endif  // GOOGLE_CUDA
 #include "tensorflow/core/common_runtime/process_util.h"
 #include "tensorflow/core/common_runtime/step_stats_collector.h"
 #include "tensorflow/core/distributed_runtime/graph_mgr.h"
@@ -78,7 +75,7 @@ void GdrWorker::GrpcRecvTensorAsync(CallOptions* opts,
   const bool dma_ok = request->dma_ok();
   env_->rendezvous_mgr->RecvLocalAsync(
       step_id, parsed,
-      [this, opts, response, done, src_dev, dma_ok](
+      [this, opts, response, done, src_dev, request, dma_ok](
           const Status& status, const Rendezvous::Args& send_args,
           const Rendezvous::Args&, const Tensor& val, const bool is_dead) {
         opts->ClearCancelCallback();
@@ -89,10 +86,8 @@ void GdrWorker::GrpcRecvTensorAsync(CallOptions* opts,
           // 3) the tensor has the on_host allocation attribute,
           // i.e. it's in CPU RAM *independent of its assigned
           // device type*.
-          const bool on_host =
-              (src_dev->tensorflow_gpu_device_info() == nullptr) ||
-              send_args.alloc_attrs.on_host();
-          if (val.TotalBytes() > 0 && (!is_dead) &&
+          const bool on_host = send_args.alloc_attrs.on_host();
+          if (val.TotalBytes() > 1024 && (!is_dead) &&
               DMAHelper::CanUseDMA(&val) && dma_ok) {
             // DMA cases.
             RecvTensorResponse* proto = new RecvTensorResponse;
@@ -117,8 +112,7 @@ void GdrWorker::GrpcRecvTensorAsync(CallOptions* opts,
           } else {
             // Non-DMA cases.
             if (src_dev->tensorflow_gpu_device_info() && (!on_host)) {
-#if GOOGLE_CUDA
-              const DeviceContext* send_dev_context = send_args.device_context;
+              DeviceContext* send_dev_context = send_args.device_context;
               AllocatorAttributes alloc_attrs;
               alloc_attrs.set_gpu_compatible(true);
               alloc_attrs.set_on_host(true);
@@ -127,7 +121,8 @@ void GdrWorker::GrpcRecvTensorAsync(CallOptions* opts,
               CHECK(send_dev_context)
                   << "send dev name: " << src_dev->name()
                   << " gpu_info: " << src_dev->tensorflow_gpu_device_info();
-              // "val" is on a GPU. Uses GPUUtil to fill the response proto.
+              // "val" is on an accelerator device. Uses the device_context to
+              // fill the copy on host.
               StatusCallback copy_ready = [response, done, copy,
                                            is_dead](const Status& s) {
                 // The value is now ready to be returned on the wire.
@@ -136,11 +131,8 @@ void GdrWorker::GrpcRecvTensorAsync(CallOptions* opts,
                 delete copy;
               };
 
-              GPUUtil::CopyGPUTensorToCPU(src_dev, send_dev_context, &val, copy,
-                                          copy_ready);
-#else
-              done(errors::Internal("No GPU device in process"));
-#endif  // GOOGLE_CUDA
+              send_dev_context->CopyDeviceTensorToCPU(
+                  &val, request->rendezvous_key(), src_dev, copy, copy_ready);
             } else {
               grpc::EncodeTensorToByteBuffer(is_dead, val, response);
               done(Status::OK());
diff --git a/tensorflow/contrib/hadoop/python/kernel_tests/hadoop_test.py b/tensorflow/contrib/hadoop/python/kernel_tests/hadoop_test.py
index f7f1189bb93c611719186a697c40f371644f63a2..bc941ae9f23eaa5c46fcca95b9aba0ac0d87960a 100644
--- a/tensorflow/contrib/hadoop/python/kernel_tests/hadoop_test.py
+++ b/tensorflow/contrib/hadoop/python/kernel_tests/hadoop_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import os
 
 from tensorflow.contrib.hadoop.python.ops import hadoop_dataset_ops
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -47,7 +48,7 @@ class SequenceFileDatasetTest(test.TestCase):
 
     dataset = hadoop_dataset_ops.SequenceFileDataset(filenames).repeat(
         num_repeats)
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
diff --git a/tensorflow/contrib/hadoop/python/ops/hadoop_dataset_ops.py b/tensorflow/contrib/hadoop/python/ops/hadoop_dataset_ops.py
index bf398b838dfaaff6fdaf33a6cd7086ef13e43a3e..77813519c136665a2fea30d4387f5e7a9776b20b 100644
--- a/tensorflow/contrib/hadoop/python/ops/hadoop_dataset_ops.py
+++ b/tensorflow/contrib/hadoop/python/ops/hadoop_dataset_ops.py
@@ -20,15 +20,19 @@ from __future__ import print_function
 from tensorflow.contrib.hadoop.python.ops import gen_dataset_ops
 from tensorflow.contrib.hadoop.python.ops import hadoop_op_loader  # pylint: disable=unused-import
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import structure
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
+from tensorflow.python.util import deprecation
 
 
 class SequenceFileDataset(dataset_ops.DatasetSource):
   """A Sequence File Dataset that reads the sequence file."""
 
+  @deprecation.deprecated(
+      None,
+      "tf.contrib.hadoop will be removed in 2.0, the support for Apache Hadoop "
+      "will continue to be provided through the tensorflow/io GitHub project.")
   def __init__(self, filenames):
     """Create a `SequenceFileDataset`.
 
@@ -40,15 +44,12 @@ class SequenceFileDataset(dataset_ops.DatasetSource):
     For example:
 
     ```python
+    tf.enable_eager_execution()
+
     dataset = tf.contrib.hadoop.SequenceFileDataset("/foo/bar.seq")
-    iterator = dataset.make_one_shot_iterator()
-    next_element = iterator.get_next()
     # Prints the (key, value) pairs inside a hadoop sequence file.
-    while True:
-      try:
-        print(sess.run(next_element))
-      except tf.errors.OutOfRangeError:
-        break
+    for key, value in dataset:
+      print(key, value)
     ```
 
     Args:
@@ -60,16 +61,10 @@ class SequenceFileDataset(dataset_ops.DatasetSource):
 
   def _as_variant_tensor(self):
     return gen_dataset_ops.sequence_file_dataset(
-        self._filenames, nest.flatten(self.output_types))
-
-  @property
-  def output_classes(self):
-    return ops.Tensor, ops.Tensor
-
-  @property
-  def output_shapes(self):
-    return (tensor_shape.TensorShape([]), tensor_shape.TensorShape([]))
+        self._filenames, self._element_structure._flat_types)  # pylint: disable=protected-access
 
   @property
-  def output_types(self):
-    return dtypes.string, dtypes.string
+  def _element_structure(self):
+    return structure.NestedStructure(
+        (structure.TensorStructure(dtypes.string, []),
+         structure.TensorStructure(dtypes.string, [])))
diff --git a/tensorflow/contrib/ignite/README.md b/tensorflow/contrib/ignite/README.md
index c7db0b77e25668fb8a42d204776044420f403e44..5a8c650fb927be0c835aaceffc516c048195c7bf 100644
--- a/tensorflow/contrib/ignite/README.md
+++ b/tensorflow/contrib/ignite/README.md
@@ -54,14 +54,12 @@ jdbc:ignite:thin://localhost/> INSERT INTO KITTEN_CACHE VALUES (3, 'LITTLE BALL
 ```python
 >>> import tensorflow as tf
 >>> from tensorflow.contrib.ignite import IgniteDataset
->>> 
+>>> tf.enable_eager_execution()
+>>>
 >>> dataset = IgniteDataset(cache_name="SQL_PUBLIC_KITTEN_CACHE")
->>> iterator = dataset.make_one_shot_iterator()
->>> next_obj = iterator.get_next()
 >>>
->>> with tf.Session() as sess:
->>>   for _ in range(3):
->>>     print(sess.run(next_obj))
+>>> for element in dataset:
+>>>   print(element)
 
 {'key': 1, 'val': {'NAME': b'WARM KITTY'}}
 {'key': 2, 'val': {'NAME': b'SOFT KITTY'}}
@@ -74,23 +72,22 @@ jdbc:ignite:thin://localhost/> INSERT INTO KITTEN_CACHE VALUES (3, 'LITTLE BALL
 ```python
 >>> import tensorflow as tf
 >>> from tensorflow.contrib.ignite import IgniteDataset
->>> 
+>>> tf.enable_eager_execution()
+>>>
 >>> dataset = IgniteDataset(cache_name="IMAGES")
->>> iterator = dataset.make_one_shot_iterator()
->>> next_obj = iterator.get_next()
 >>>
->>> with tf.Session() as sess:
->>>   print(sess.run(next_obj))
+>>> for element in dataset.take(1):
+>>>   print(element)
 
 {
-    'key': 'kitten.png', 
+    'key': 'kitten.png',
     'val': {
         'metadata': {
             'file_name': b'kitten.png',
             'label': b'little ball of fur',
-            width: 800, 
+            width: 800,
             height: 600
-        }, 
+        },
         'pixels': [0, 0, 0, 0, ..., 0]
     }
 }
@@ -100,13 +97,11 @@ jdbc:ignite:thin://localhost/> INSERT INTO KITTEN_CACHE VALUES (3, 'LITTLE BALL
 ```python
 >>> import tensorflow as tf
 >>> from tensorflow.contrib.ignite import IgniteDataset
->>> 
+>>>
 >>> dataset = IgniteDataset(cache_name="IMAGES").map(lambda obj: obj['val']['pixels'])
->>> iterator = dataset.make_one_shot_iterator()
->>> next_obj = iterator.get_next()
 >>>
->>> with tf.Session() as sess:
->>>   print(sess.run(next_obj))
+>>> for element in dataset:
+>>>   print(element)
 
 [0, 0, 0, 0, ..., 0]
 ```
@@ -126,18 +121,18 @@ Ignite Dataset allows using these two aspects of distributed neural network trai
 ```python
 >>> import tensorflow as tf
 >>> from tensorflow.contrib.ignite import IgniteDataset
->>> 
+>>>
 >>> dataset = IgniteDataset("IMAGES")
 >>>
 >>> # Compute gradients locally on every worker node.
->>> gradients = []    
+>>> gradients = []
 >>> for i in range(5):
 >>>     with tf.device("/job:WORKER/task:%d" % i):
->>>         device_iterator = dataset.make_one_shot_iterator()
+>>>         device_iterator = tf.compat.v1.data.make_one_shot_iterator(dataset)
 >>>         device_next_obj = device_iterator.get_next()
 >>>         gradient = compute_gradient(device_next_obj)
->>>         gradients.append(gradient)        
->>>        
+>>>         gradients.append(gradient)
+>>>
 >>> # Aggregate them on master node.
 >>> result_gradient = tf.reduce_sum(gradients)
 >>>
@@ -145,7 +140,7 @@ Ignite Dataset allows using these two aspects of distributed neural network trai
 >>>     print(sess.run(result_gradient))
 ```
 
-High-level TensorFlow API for [distributed training](https://www.tensorflow.org/api_docs/python/tf/contrib/distribute/DistributionStrategy) is supported as well. 
+High-level TensorFlow API for [distributed training](https://www.tensorflow.org/api_docs/python/tf/contrib/distribute/DistributionStrategy) is supported as well.
 
 ### Distributed File System
 
diff --git a/tensorflow/contrib/ignite/python/ops/ignite_dataset_ops.py b/tensorflow/contrib/ignite/python/ops/ignite_dataset_ops.py
index 936b29a4f50794380d48efed99e267c6b4c44dc6..66e654ca636a5a051c6f9cd35bf9001dfbcbf7f4 100644
--- a/tensorflow/contrib/ignite/python/ops/ignite_dataset_ops.py
+++ b/tensorflow/contrib/ignite/python/ops/ignite_dataset_ops.py
@@ -27,17 +27,16 @@ import six
 from tensorflow.contrib.ignite.python.ops import gen_dataset_ops
 from tensorflow.contrib.ignite.python.ops import ignite_op_loader  # pylint: disable=unused-import
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import structure
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.util import deprecation
 
 
 @six.add_metaclass(abc.ABCMeta)
 class Readable(object):
-  """Readable abstract class that exposes methods to do reading-related
-
-     operations.
-  """
+  """Abstract class that exposes methods to do reading-related operations."""
 
   @abc.abstractmethod
   def __init__(self):
@@ -227,10 +226,7 @@ types = {
 
 
 class TypeTreeNode(object):
-  """TypeTreeNode class exposes methods to format object tree structure
-
-     data.
-  """
+  """TypeTreeNode class exposes methods to format object tree structure data."""
 
   def __init__(self, name, type_id, fields=None, permutation=None):
     """Constructs a new instance of TypeTreeNode.
@@ -692,18 +688,22 @@ class IgniteClient(TcpClient):
 
 
 class IgniteDataset(dataset_ops.DatasetSource):
-  """Apache Ignite is a memory-centric distributed database, caching, and
-
-     processing platform for transactional, analytical, and streaming workloads,
-     delivering in-memory speeds at petabyte scale. This contrib package
-     contains an integration between Apache Ignite and TensorFlow. The
-     integration is based on tf.data from TensorFlow side and Binary Client
-     Protocol from Apache Ignite side. It allows to use Apache Ignite as a
-     datasource for neural network training, inference and all other
+  """Apache Ignite is a memory-centric distributed database.
+
+     It acts as a caching and processing platform for transactional, analytical,
+     and streaming workloads, delivering in-memory speeds at petabyte scale.
+     This contrib package contains an integration between Apache Ignite and
+     TensorFlow. The integration is based on tf.data from TensorFlow side and
+     Binary Client Protocol from Apache Ignite side. It allows to use Apache
+     Ignite as a datasource for neural network training, inference and all other
      computations supported by TensorFlow. Ignite Dataset is based on Apache
      Ignite Binary Client Protocol.
   """
 
+  @deprecation.deprecated(
+      None,
+      "tf.contrib.ignite will be removed in 2.0, the support for Apache Ignite "
+      "will continue to be provided through the tensorflow/io GitHub project.")
   def __init__(self,
                cache_name,
                host="localhost",
@@ -756,6 +756,9 @@ class IgniteDataset(dataset_ops.DatasetSource):
         self.cache_type.to_permutation(),
         dtype=dtypes.int32,
         name="permutation")
+    self._structure = structure.convert_legacy_structure(
+        self.cache_type.to_output_types(), self.cache_type.to_output_shapes(),
+        self.cache_type.to_output_classes())
 
   def _as_variant_tensor(self):
     return gen_dataset_ops.ignite_dataset(self.cache_name, self.host, self.port,
@@ -763,13 +766,5 @@ class IgniteDataset(dataset_ops.DatasetSource):
                                           self.schema, self.permutation)
 
   @property
-  def output_classes(self):
-    return self.cache_type.to_output_classes()
-
-  @property
-  def output_shapes(self):
-    return self.cache_type.to_output_shapes()
-
-  @property
-  def output_types(self):
-    return self.cache_type.to_output_types()
+  def _element_structure(self):
+    return self._structure
diff --git a/tensorflow/contrib/ignite/python/tests/ignite_dataset_test.py b/tensorflow/contrib/ignite/python/tests/ignite_dataset_test.py
index ef29b5f14a4b2fea2400ec4d56a7ad2cf44cf2cb..ff5d4c458c859fd8e5e3ae65ee41a454d55d6538 100644
--- a/tensorflow/contrib/ignite/python/tests/ignite_dataset_test.py
+++ b/tensorflow/contrib/ignite/python/tests/ignite_dataset_test.py
@@ -21,6 +21,7 @@ import os
 
 from tensorflow.contrib.ignite import IgniteDataset
 from tensorflow.python.client import session
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.platform import test
@@ -65,7 +66,7 @@ class IgniteDatasetTest(test.TestCase):
     self.assertEqual(dtypes.string, dataset.output_types["val"]["NAME"])
     self.assertEqual(dtypes.int64, dataset.output_types["val"]["VAL"])
 
-    it = dataset.make_one_shot_iterator()
+    it = dataset_ops.make_one_shot_iterator(dataset)
     ne = it.get_next()
 
     with session.Session() as sess:
diff --git a/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py b/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py
index 4997c31a7fc7f4243d03b22fc9c01fb13a2a25a4..ba5cdfebf92c07e496ed588848d5859ff6a5bff2 100644
--- a/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py
+++ b/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py
@@ -281,6 +281,13 @@ class ImageOpsTest(test_util.TensorFlowTestCase):
             value.eval(),
             np.array([[4, 4], [4, 4]]).astype(dtype.as_numpy_dtype()))
 
+  @test_util.run_in_graph_and_eager_modes
+  def test_transform_eager(self):
+    image = constant_op.constant([[1., 2.], [3., 4.]])
+    value = image_ops.transform(image, [1] * 8)
+    with self.test_session(use_gpu=True):
+      self.assertAllEqual(self.evaluate(value), np.array([[4, 4], [4, 4]]))
+
 
 class BipartiteMatchTest(test_util.TensorFlowTestCase):
 
diff --git a/tensorflow/contrib/image/python/ops/image_ops.py b/tensorflow/contrib/image/python/ops/image_ops.py
index d4fb99a017faebe30384d739f22f4ff5fa986bc4..b25a6f7b5742917a032946fe03a0dab20e7dc1ad 100644
--- a/tensorflow/contrib/image/python/ops/image_ops.py
+++ b/tensorflow/contrib/image/python/ops/image_ops.py
@@ -17,6 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.eager import context
 from tensorflow.contrib.image.ops import gen_image_ops
 from tensorflow.contrib.util import loader
 from tensorflow.python.framework import common_shapes
@@ -271,8 +272,11 @@ def transform(images,
       raise TypeError("Images should have rank between 2 and 4.")
 
     if output_shape is None:
-      output_shape = tensor_util.constant_value(
-          array_ops.shape(images)[1:3]) or array_ops.shape(images)[1:3]
+      output_shape = array_ops.shape(images)[1:3]
+      if not context.executing_eagerly():
+        output_shape_value = tensor_util.constant_value(output_shape)
+        if output_shape_value is not None:
+          output_shape = output_shape_value
 
     output_shape = ops.convert_to_tensor(
         output_shape, dtypes.int32, name="output_shape")
diff --git a/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py b/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py
index 7129f09e8b42e48a9c768fd4a66cde3d4da9d31d..b399e1b6c2ac47db205b5d8bbc81875ef5c08a31 100644
--- a/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py
+++ b/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py
@@ -20,15 +20,20 @@ from __future__ import print_function
 from tensorflow.contrib.kafka.python.ops import gen_dataset_ops
 from tensorflow.contrib.kafka.python.ops import kafka_op_loader  # pylint: disable=unused-import
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import structure
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
+from tensorflow.python.util import deprecation
 
 
 class KafkaDataset(dataset_ops.DatasetSource):
   """A Kafka Dataset that consumes the message.
   """
 
+  @deprecation.deprecated(
+      None,
+      "tf.contrib.kafka will be removed in 2.0, the support for Apache Kafka "
+      "will continue to be provided through the tensorflow/io GitHub project.")
   def __init__(self,
                topics,
                servers="localhost",
@@ -63,13 +68,5 @@ class KafkaDataset(dataset_ops.DatasetSource):
                                          self._group, self._eof, self._timeout)
 
   @property
-  def output_classes(self):
-    return ops.Tensor
-
-  @property
-  def output_shapes(self):
-    return tensor_shape.scalar()
-
-  @property
-  def output_types(self):
-    return dtypes.string
+  def _element_structure(self):
+    return structure.TensorStructure(dtypes.string, [])
diff --git a/tensorflow/contrib/kinesis/python/ops/kinesis_dataset_ops.py b/tensorflow/contrib/kinesis/python/ops/kinesis_dataset_ops.py
index 75806dbbeb1819bb0a6965bbc384e02df9895210..2b1d478a9b0fd12ca25c72da6872acccfd7285fc 100644
--- a/tensorflow/contrib/kinesis/python/ops/kinesis_dataset_ops.py
+++ b/tensorflow/contrib/kinesis/python/ops/kinesis_dataset_ops.py
@@ -20,9 +20,10 @@ from __future__ import print_function
 from tensorflow.contrib.kinesis.python.ops import gen_dataset_ops
 from tensorflow.contrib.kinesis.python.ops import kinesis_op_loader  # pylint: disable=unused-import
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import structure
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
+from tensorflow.python.util import deprecation
 
 
 class KinesisDataset(dataset_ops.DatasetSource):
@@ -34,15 +35,12 @@ class KinesisDataset(dataset_ops.DatasetSource):
 
   For example, we can construct and use the KinesisDataset as follows:
   ```python
+  tf.enable_eager_execution()
+
   dataset = tf.contrib.kinesis.KinesisDataset(
       "kinesis_stream_name", read_indefinitely=False)
-  next = dataset.make_one_shot_iterator().get_next()
-  with tf.Session() as sess:
-    while True:
-      try:
-        print(sess.run(nxt))
-      except tf.errors.OutOfRangeError:
-        break
+  for element in dataset:
+    print(element)
   ```
 
   Since Kinesis is a data streaming service, data may not be available
@@ -53,6 +51,10 @@ class KinesisDataset(dataset_ops.DatasetSource):
   is returned immediately instead.
   """
 
+  @deprecation.deprecated(
+      None,
+      "tf.contrib.kinesis will be removed in 2.0, the support for Kinesis "
+      "will continue to be provided through the tensorflow/io GitHub project.")
   def __init__(self,
                stream,
                shard="",
@@ -84,13 +86,5 @@ class KinesisDataset(dataset_ops.DatasetSource):
         self._stream, self._shard, self._read_indefinitely, self._interval)
 
   @property
-  def output_classes(self):
-    return ops.Tensor
-
-  @property
-  def output_shapes(self):
-    return tensor_shape.scalar()
-
-  @property
-  def output_types(self):
-    return dtypes.string
+  def _element_structure(self):
+    return structure.TensorStructure(dtypes.string, [])
diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py
index 0a4d2c6d4cb5cad7da93cea89478bc0fca2ac4d6..d791418c9d0f887058ceb535092fa8122da1aa75 100644
--- a/tensorflow/contrib/layers/python/layers/layers_test.py
+++ b/tensorflow/contrib/layers/python/layers/layers_test.py
@@ -1459,13 +1459,6 @@ class DropoutTest(test.TestCase):
 
 class FlattenTest(test.TestCase):
 
-  def testInvalidRank(self):
-    with ops.Graph().as_default() as g, self.session(g):
-      inputs = array_ops.placeholder(dtype=dtypes.float32)
-      inputs.set_shape(tensor_shape.TensorShape((5,)))
-      with self.assertRaisesRegexp(ValueError, 'incompatible with the layer'):
-        _layers.flatten(inputs)
-
   def testUnknownLastDim(self):
     with ops.Graph().as_default() as g, self.session(g):
       inputs = array_ops.placeholder(dtype=dtypes.float32)
@@ -1502,6 +1495,12 @@ class FlattenTest(test.TestCase):
                        images.get_shape().num_elements())
       self.assertEqual(output.get_shape()[0], images.get_shape()[0])
 
+  def testFlatten0D(self):
+    with self.cached_session():
+      scalars = random_ops.random_uniform((5,), seed=1, name='scalars')
+      output = _layers.flatten(scalars)
+      self.assertEqual(output.shape, (5, 1))
+
   def testFlattenBatchSize(self):
     height, width = 3, 3
     with self.cached_session() as sess:
diff --git a/tensorflow/contrib/learn/BUILD b/tensorflow/contrib/learn/BUILD
index 238504f6d60aeb1a7ff25deab4a86881285e8c03..14065fcee51c014a1af227504eaaca1fa39941e1 100644
--- a/tensorflow/contrib/learn/BUILD
+++ b/tensorflow/contrib/learn/BUILD
@@ -274,6 +274,7 @@ py_test(
     name = "estimator_test",
     size = "medium",
     srcs = ["python/learn/estimators/estimator_test.py"],
+    shard_count = 2,
     srcs_version = "PY2AND3",
     tags = [
         "manual",
diff --git a/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py b/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py
index 8466dc36d13e223aed4f1dfe8e39a6f91c99fa55..d49834dc860a8b4341ddd3720fde52281f7474f7 100644
--- a/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py
+++ b/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py
@@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for SdcaModel."""
+"""Tests for SdcaModel (deprecated).
+
+This module and all its submodules are deprecated. To UPDATE or USE linear
+optimizers, please check its latest version in core:
+tensorflow_estimator/python/estimator/canned/linear_optimizer/.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py b/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py
index f3f1dcd98db5ae24af154d1f0851a0688d2bc611..c056a12fa5307a7e9ac4cf30e1386ddfd5cd7d75 100644
--- a/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py
+++ b/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py
@@ -12,7 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Proximal stochastic dual coordinate ascent optimizer for linear models."""
+# pylint: disable=line-too-long
+"""Proximal stochastic dual coordinate ascent optimizer for linear models (deprecated).
+
+This module and all its submodules are deprecated. To UPDATE or USE linear
+optimizers, please check its latest version in core:
+tensorflow_estimator/python/estimator/canned/linear_optimizer/.
+"""
+# pylint: enable=line-too-long
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -40,6 +47,7 @@ from tensorflow.python.ops import variables as var_ops
 from tensorflow.python.ops.nn import log_poisson_loss
 from tensorflow.python.ops.nn import sigmoid_cross_entropy_with_logits
 from tensorflow.python.summary import summary
+from tensorflow.python.util import deprecation
 
 __all__ = ['SdcaModel']
 
@@ -48,7 +56,7 @@ __all__ = ['SdcaModel']
 class SdcaModel(object):
   """Stochastic dual coordinate ascent solver for linear models.
 
-    Loss functions supported:
+  Loss functions supported:
 
      * Binary logistic loss
      * Squared loss
@@ -109,6 +117,10 @@ class SdcaModel(object):
     ```
   """
 
+  @deprecation.deprecated(
+      None, 'This class is deprecated. To UPDATE or USE linear optimizers, '
+      'please check its latest version in core: '
+      'tensorflow_estimator/python/estimator/canned/linear_optimizer/.')
   def __init__(self, examples, variables, options):
     """Create a new sdca optimizer."""
 
diff --git a/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable.py b/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable.py
index a001555e8f257c88a52fdb40d4181f5cd9c92e84..a28394964a12013c43d85701b5a0ab5c559afd62 100644
--- a/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable.py
+++ b/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable.py
@@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Sharded mutable dense hash table."""
+"""Sharded mutable dense hash table (deprecated).
+
+This module and all its submodules are deprecated. To UPDATE or USE linear
+optimizers, please check its latest version in core:
+tensorflow_estimator/python/estimator/canned/linear_optimizer/.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -28,6 +33,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.util import deprecation
 
 
 # TODO(rohanj): This should subclass Checkpointable and implement
@@ -45,6 +51,10 @@ class ShardedMutableDenseHashTable(object):
 
   # TODO(andreasst): consider moving this to lookup module
 
+  @deprecation.deprecated(
+      None, 'This class is deprecated. To UPDATE or USE linear optimizers, '
+      'please check its latest version in core: '
+      'tensorflow_estimator/python/estimator/canned/linear_optimizer/.')
   def __init__(self,
                key_dtype,
                value_dtype,
diff --git a/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable_test.py b/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable_test.py
index 2b56d0fa3a8b8564b7c73a62bd99cc900d6f5c54..2d1457f9e4cc576da696be191e718814dd9ff4e5 100644
--- a/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable_test.py
+++ b/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable_test.py
@@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for sharded_mutable_dense_hashtable.py."""
+"""Tests for sharded_mutable_dense_hashtable.py (deprecated).
+
+This module and all its submodules are deprecated. To UPDATE or USE linear
+optimizers, please check its latest version in core:
+tensorflow_estimator/python/estimator/canned/linear_optimizer/.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/linear_optimizer/python/ops/sparse_feature_column.py b/tensorflow/contrib/linear_optimizer/python/ops/sparse_feature_column.py
index 003795233ff2b28e33fc10388ef25efb63c43bb0..64730f8eed1ff9bfcd4a980dceb28abb98e39f73 100644
--- a/tensorflow/contrib/linear_optimizer/python/ops/sparse_feature_column.py
+++ b/tensorflow/contrib/linear_optimizer/python/ops/sparse_feature_column.py
@@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Sparse feature column."""
+"""Sparse feature column (deprecated).
+
+This module and all its submodules are deprecated. To UPDATE or USE linear
+optimizers, please check its latest version in core:
+tensorflow_estimator/python/estimator/canned/linear_optimizer/.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -21,6 +26,7 @@ from __future__ import print_function
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework.ops import internal_convert_to_tensor
 from tensorflow.python.framework.ops import name_scope
+from tensorflow.python.util import deprecation
 
 
 class SparseFeatureColumn(object):
@@ -68,6 +74,10 @@ class SparseFeatureColumn(object):
   @@feature_values
   """
 
+  @deprecation.deprecated(
+      None, 'This class is deprecated. To UPDATE or USE linear optimizers, '
+      'please check its latest version in core: '
+      'tensorflow_estimator/python/estimator/canned/linear_optimizer/.')
   def __init__(self, example_indices, feature_indices, feature_values):
     """Creates a `SparseFeatureColumn` representation.
 
diff --git a/tensorflow/contrib/linear_optimizer/python/ops/sparse_feature_column_test.py b/tensorflow/contrib/linear_optimizer/python/ops/sparse_feature_column_test.py
index 51c4f68543da2f563481cc2d35b556796616cf9d..0ae780e1a100c7dadde7196803f2ae0d4bcb2334 100644
--- a/tensorflow/contrib/linear_optimizer/python/ops/sparse_feature_column_test.py
+++ b/tensorflow/contrib/linear_optimizer/python/ops/sparse_feature_column_test.py
@@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for sparse_feature_column.py."""
+"""Tests for sparse_feature_column.py (deprecated).
+
+This module and all its submodules are deprecated. To UPDATE or USE linear
+optimizers, please check its latest version in core:
+tensorflow_estimator/python/estimator/canned/linear_optimizer/.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/lookup/lookup_ops.py b/tensorflow/contrib/lookup/lookup_ops.py
index e52fb5ab1431e086f99b4033a6216636a83bad79..229a72a780d5ccce8263444ffeae7700f6ac8613 100644
--- a/tensorflow/contrib/lookup/lookup_ops.py
+++ b/tensorflow/contrib/lookup/lookup_ops.py
@@ -91,7 +91,7 @@ def index_table_from_tensor(mapping,
   The bucket ID range is `[mapping size, mapping size + num_oov_buckets - 1]`.
 
   The underlying table must be initialized by calling
-  `tf.tables_initializer.run()` or `table.initializer.run()` once.
+  `session.run(tf.tables_initializer)` or `session.run(table.init)` once.
 
   Elements in `mapping` cannot have duplicates, otherwise when executing the
   table initializer op, it will throw a `FailedPreconditionError`.
@@ -158,7 +158,7 @@ def string_to_index(tensor, mapping, default_value=-1, name=None):
   will throw a FailedPreconditionError.
 
   The underlying table must be initialized by calling
-  `tf.tables_initializer.run()` once.
+  `session.run(tf.tables_initializer)` once.
 
   For example:
 
@@ -202,7 +202,7 @@ def index_to_string_table_from_tensor(mapping, default_value="UNK", name=None):
   (an out-of-vocabulary entry) is assigned the `default_value`
 
   The underlying table must be initialized by calling
-  `tf.tables_initializer.run()` or `table.initializer.run()` once.
+  `session.run(tf.tables_initializer)` or `session.run(table.init)` once.
 
   Elements in `mapping` cannot have duplicates, otherwise when executing the
   table initializer op, it will throw a `FailedPreconditionError`.
@@ -257,7 +257,7 @@ def index_to_string(tensor, mapping, default_value="UNK", name=None):
   (an out-of-vocabulary entry) is assigned the `default_value`
 
   The underlying table must be initialized by calling
-  `tf.tables_initializer.run()` once.
+  `session.run(tf.tables_initializer)` once.
 
   For example:
 
diff --git a/tensorflow/contrib/lookup/lookup_ops_test.py b/tensorflow/contrib/lookup/lookup_ops_test.py
index 5e99ef460518fa761b12533e5dc07dc252f1d582..9b2c2dd87cc8a92fbb6b45504939be3788b60839 100644
--- a/tensorflow/contrib/lookup/lookup_ops_test.py
+++ b/tensorflow/contrib/lookup/lookup_ops_test.py
@@ -25,6 +25,7 @@ import six
 from tensorflow.contrib import lookup
 from tensorflow.python.client import session
 from tensorflow.python.data.experimental.ops import counter
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -2737,7 +2738,7 @@ class MutableHashTableBenchmark(test.Benchmark):
 
   def benchmark_many_repeated_scalar_insert_scalar(self):
     table = self._create_table()
-    c = counter.Counter().make_one_shot_iterator().get_next()
+    c = dataset_ops.make_one_shot_iterator(counter.Counter()).get_next()
     value = variables.Variable(1.0)
     insert = table.insert(c, value)
     size = table.size()
@@ -2758,7 +2759,7 @@ class MutableHashTableBenchmark(test.Benchmark):
 
   def benchmark_many_repeated_batch_32_insert_scalar(self):
     table = self._create_table()
-    c = counter.Counter().make_one_shot_iterator().get_next()
+    c = dataset_ops.make_one_shot_iterator(counter.Counter()).get_next()
     value = variables.Variable([1.0] * 32)
     insert = table.insert(32 * c + list(range(32)), value)
     size = table.size()
diff --git a/tensorflow/contrib/makefile/download_dependencies.sh b/tensorflow/contrib/makefile/download_dependencies.sh
index b396c527673902d61072dc9cf7d2766476be8369..2a5232b476712a96f84be0f4725beb78bc138297 100755
--- a/tensorflow/contrib/makefile/download_dependencies.sh
+++ b/tensorflow/contrib/makefile/download_dependencies.sh
@@ -30,11 +30,13 @@ EIGEN_URL="$(grep -o 'http.*bitbucket.org/eigen/eigen/get/.*tar\.gz' "${BZL_FILE
 GEMMLOWP_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)"
 GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.gz"
 NSYNC_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/nsync/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)"
-# Note: The Protobuf source in `tensorflow/workspace.bzl` in TensorFlow
-# 1.10 branch does not work. `make distclean` fails and blocks the build
-# process. For now we're hardcoding to the version which is used by
-# TensorFlow 1.9.
-PROTOBUF_URL="https://mirror.bazel.build/github.com/google/protobuf/archive/v3.6.0.tar.gz"
+
+# Note: The protobuf repo needs to be cloned due to its submodules.
+# These variables contain the GitHub repo and the sha, from `tensorflow/workspace.bzl`,
+# from which to clone it from and checkout to.
+readonly PROTOBUF_REPO="https://github.com/protocolbuffers/protobuf.git"
+readonly PROTOBUF_TAG="$(grep -o 'https://github.com/protocolbuffers/protobuf/archive/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1 | awk '{print substr($0, index($0, "archive") + 8, index($0, "tar") - index($0, "archive") - 9) }')"
+
 # TODO (yongtang): Replace the following with 'https://mirror.bazel.build/github.com/google/re2/.*tar\.gz' once
 # the archive has been propagated in mirror.bazel.build.
 RE2_URL="$(grep -o 'https://github.com/google/re2/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)"
@@ -91,11 +93,34 @@ download_and_extract() {
   find "${dir}" -type f -name '*BUILD' -delete
 }
 
+function clone_repository() {
+  local repo_url="${1}"
+  local destination_directory="${2}"
+  local commit_sha="${3}"
+
+  if [[ -d "${destination_directory}" ]]; then
+    rm -rf "${destination_directory}"
+  fi
+
+  git clone "${repo_url}" "${destination_directory}"
+
+  pushd "$(pwd)" 1>/dev/null
+
+  cd "${destination_directory}"
+
+  if [[ -n "${commit_sha}" ]]; then
+    git checkout "${PROTOBUF_TAG}"
+  fi
+
+  git submodule update --init
+
+  popd 1>/dev/null
+}
+
 download_and_extract "${EIGEN_URL}" "${DOWNLOADS_DIR}/eigen"
 download_and_extract "${GEMMLOWP_URL}" "${DOWNLOADS_DIR}/gemmlowp"
 download_and_extract "${GOOGLETEST_URL}" "${DOWNLOADS_DIR}/googletest"
 download_and_extract "${NSYNC_URL}" "${DOWNLOADS_DIR}/nsync"
-download_and_extract "${PROTOBUF_URL}" "${DOWNLOADS_DIR}/protobuf"
 download_and_extract "${RE2_URL}" "${DOWNLOADS_DIR}/re2"
 download_and_extract "${FFT2D_URL}" "${DOWNLOADS_DIR}/fft2d"
 download_and_extract "${DOUBLE_CONVERSION_URL}" "${DOWNLOADS_DIR}/double_conversion"
@@ -106,6 +131,8 @@ download_and_extract "${CUB_URL}" "${DOWNLOADS_DIR}/cub/external/cub_archive"
 download_and_extract "${FARMHASH_URL}" "${DOWNLOADS_DIR}/farmhash"
 download_and_extract "${FLATBUFFERS_URL}" "${DOWNLOADS_DIR}/flatbuffers"
 
+clone_repository "${PROTOBUF_REPO}" "${DOWNLOADS_DIR}/protobuf" "${PROTOBUF_TAG}"
+
 replace_by_sed 's#static uint32x4_t p4ui_CONJ_XOR = vld1q_u32( conj_XOR_DATA );#static uint32x4_t p4ui_CONJ_XOR; // = vld1q_u32( conj_XOR_DATA ); - Removed by script#' \
   "${DOWNLOADS_DIR}/eigen/Eigen/src/Core/arch/NEON/Complex.h"
 replace_by_sed 's#static uint32x2_t p2ui_CONJ_XOR = vld1_u32( conj_XOR_DATA );#static uint32x2_t p2ui_CONJ_XOR;// = vld1_u32( conj_XOR_DATA ); - Removed by scripts#' \
diff --git a/tensorflow/contrib/metrics/python/metrics/classification.py b/tensorflow/contrib/metrics/python/metrics/classification.py
index 062deb74b165329d8e72efa73b9d81f4174f8831..9aabc4bec3053871e3ff6cd3a88fd76d293f48cc 100644
--- a/tensorflow/contrib/metrics/python/metrics/classification.py
+++ b/tensorflow/contrib/metrics/python/metrics/classification.py
@@ -18,13 +18,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import metrics_impl
 from tensorflow.python.ops import variable_scope
-from tensorflow.python.training import distribution_strategy_context
 
 # TODO(nsilberman): move into metrics/python/ops/
 
diff --git a/tensorflow/contrib/metrics/python/metrics/classification_test.py b/tensorflow/contrib/metrics/python/metrics/classification_test.py
index d6a670f97b32a29129cb9ea0cd71c5a2b7597a47..e789d2cb9dfbac7b1e145be48b3f707af3fd4e18 100644
--- a/tensorflow/contrib/metrics/python/metrics/classification_test.py
+++ b/tensorflow/contrib/metrics/python/metrics/classification_test.py
@@ -291,12 +291,11 @@ class F1ScoreTest(test.TestCase):
 
     labels = labels.astype(np.float32)
     predictions = predictions.astype(np.float32)
-    tf_predictions, tf_labels = (dataset_ops.Dataset
-                                 .from_tensor_slices((predictions, labels))
-                                 .repeat()
-                                 .batch(batch_size)
-                                 .make_one_shot_iterator()
-                                 .get_next())
+    tf_predictions, tf_labels = dataset_ops.make_one_shot_iterator(
+        dataset_ops.Dataset
+        .from_tensor_slices((predictions, labels))
+        .repeat()
+        .batch(batch_size)).get_next()
     f1, f1_op = classification.f1_score(tf_labels, tf_predictions,
                                         num_thresholds=3)
 
diff --git a/tensorflow/contrib/mixed_precision/python/loss_scale_manager_test.py b/tensorflow/contrib/mixed_precision/python/loss_scale_manager_test.py
index 1b0383d24c0c472b4875d15c3650e37dfd2439e1..c922d0cd11fda3c51a51ceccf69798df7ce75f26 100644
--- a/tensorflow/contrib/mixed_precision/python/loss_scale_manager_test.py
+++ b/tensorflow/contrib/mixed_precision/python/loss_scale_manager_test.py
@@ -29,7 +29,7 @@ from tensorflow.python.platform import test
 
 def _GetExampleIter(inputs):
   dataset = dataset_ops.Dataset.from_tensor_slices(inputs)
-  return dataset.make_one_shot_iterator()
+  return dataset_ops.make_one_shot_iterator(dataset)
 
 
 class FixedLossScaleManagerTest(test.TestCase):
diff --git a/tensorflow/contrib/mixed_precision/python/loss_scale_optimizer_test.py b/tensorflow/contrib/mixed_precision/python/loss_scale_optimizer_test.py
index 9009df0eefec13146090ba5fc2096e71ba6eb89d..33f9a43e803ea845a25bba284e41e5a0e6228dad 100644
--- a/tensorflow/contrib/mixed_precision/python/loss_scale_optimizer_test.py
+++ b/tensorflow/contrib/mixed_precision/python/loss_scale_optimizer_test.py
@@ -132,7 +132,7 @@ class LossScaleOptimizerTest(test.TestCase):
 
     x = variable_scope.get_variable("x", initializer=1., dtype=dtypes.float32)
     dataset = dataset_ops.Dataset.from_tensor_slices([np.nan, np.inf, 0.1])
-    itr = dataset.make_one_shot_iterator()
+    itr = dataset_ops.make_one_shot_iterator(dataset)
 
     lr = 1
     opt = gd.GradientDescentOptimizer(lr)
@@ -182,7 +182,7 @@ class LossScaleOptimizerTest(test.TestCase):
 
     x = variable_scope.get_variable("x", initializer=1., dtype=dtypes.float32)
     dataset = dataset_ops.Dataset.from_tensor_slices([np.nan, np.inf, 0.1])
-    itr = dataset.make_one_shot_iterator()
+    itr = dataset_ops.make_one_shot_iterator(dataset)
 
     lr = 1
     init_loss_scale = 8
diff --git a/tensorflow/contrib/model_pruning/python/pruning.py b/tensorflow/contrib/model_pruning/python/pruning.py
index f6b4373edd0544555dd16a373802d2feb5d674b1..43ea66ac5a178f6ffe87df99ddced3d0442111c1 100644
--- a/tensorflow/contrib/model_pruning/python/pruning.py
+++ b/tensorflow/contrib/model_pruning/python/pruning.py
@@ -214,7 +214,7 @@ def get_pruning_hparams():
       target_sparsity=0.5,
       sparsity_function_begin_step=0,
       sparsity_function_end_step=100,
-      sparsity_function_exponent=3,
+      sparsity_function_exponent=3.0,
       use_tpu=False)
 
 
diff --git a/tensorflow/contrib/opt/BUILD b/tensorflow/contrib/opt/BUILD
index f4ac70eb1a720c2acc3ef942f269228156749cba..0446e823d95f8ecbed6a0c34a83ade009e68448b 100644
--- a/tensorflow/contrib/opt/BUILD
+++ b/tensorflow/contrib/opt/BUILD
@@ -14,6 +14,7 @@ py_library(
     name = "opt_py",
     srcs = [
         "__init__.py",
+        "python/training/adam_gs_optimizer.py",
         "python/training/adamax.py",
         "python/training/addsign.py",
         "python/training/agn_optimizer.py",
@@ -22,6 +23,7 @@ py_library(
         "python/training/external_optimizer.py",
         "python/training/ggt.py",
         "python/training/lars_optimizer.py",
+        "python/training/lazy_adam_gs_optimizer.py",
         "python/training/lazy_adam_optimizer.py",
         "python/training/matrix_functions.py",
         "python/training/model_average_optimizer.py",
@@ -60,6 +62,21 @@ py_library(
     ],
 )
 
+py_test(
+    name = "adam_gs_optimizer_test",
+    srcs = ["python/training/adam_gs_optimizer_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":opt_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:training",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_test(
     name = "adamax_test",
     srcs = ["python/training/adamax_test.py"],
@@ -148,6 +165,25 @@ py_test(
     ],
 )
 
+py_test(
+    name = "lazy_adam_gs_optimizer_test",
+    srcs = ["python/training/lazy_adam_gs_optimizer_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":opt_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:variables",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
 py_test(
     name = "lazy_adam_optimizer_test",
     srcs = ["python/training/lazy_adam_optimizer_test.py"],
diff --git a/tensorflow/contrib/opt/__init__.py b/tensorflow/contrib/opt/__init__.py
index c7ea68efa9a13a471bba3f41d0600855793b20a2..e8fc52342ceabb47da97ca0f3c8a01e419a221a1 100644
--- a/tensorflow/contrib/opt/__init__.py
+++ b/tensorflow/contrib/opt/__init__.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 # pylint: disable=wildcard-import
+from tensorflow.contrib.opt.python.training.adam_gs_optimizer import *
 from tensorflow.contrib.opt.python.training.adamax import *
 from tensorflow.contrib.opt.python.training.addsign import *
 from tensorflow.contrib.opt.python.training.agn_optimizer import *
@@ -28,6 +29,7 @@ from tensorflow.contrib.opt.python.training.external_optimizer import *
 from tensorflow.contrib.opt.python.training.lars_optimizer import *
 from tensorflow.contrib.opt.python.training.ggt import *
 from tensorflow.contrib.opt.python.training.lazy_adam_optimizer import *
+from tensorflow.contrib.opt.python.training.lazy_adam_gs_optimizer import *
 from tensorflow.contrib.opt.python.training.model_average_optimizer import *
 from tensorflow.contrib.opt.python.training.moving_average_optimizer import *
 from tensorflow.contrib.opt.python.training.multitask_optimizer_wrapper import *
@@ -44,12 +46,14 @@ from tensorflow.python.util.all_util import remove_undocumented
 
 _allowed_symbols = [
     'AdaMaxOptimizer',
+    'AdamGSOptimizer',
     'PowerSignOptimizer',
     'AddSignOptimizer',
     'DelayCompensatedGradientDescentOptimizer',
     'DropStaleGradientOptimizer',
     'ExternalOptimizerInterface',
     'LARSOptimizer',
+    'LazyAdamGSOptimizer',
     'LazyAdamOptimizer',
     'NadamOptimizer',
     'MovingAverageOptimizer',
diff --git a/tensorflow/contrib/opt/python/training/adam_gs_optimizer.py b/tensorflow/contrib/opt/python/training/adam_gs_optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..3fb649ea82e79b3bc78a2da6d5c3e9a071adec6d
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/adam_gs_optimizer.py
@@ -0,0 +1,217 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Adam rewrite to use global step for computing beta1 & beta2 accumulation."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.training import optimizer
+from tensorflow.python.training import training_ops
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export("train.AdamOptimizer")
+class AdamGSOptimizer(optimizer.Optimizer):
+  """Optimizer that implements the Adam algorithm.
+
+  See [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
+  ([pdf](http://arxiv.org/pdf/1412.6980.pdf)).
+  """
+
+  def __init__(self, global_step=0, learning_rate=0.001,
+               beta1=0.9, beta2=0.999, epsilon=1e-8,
+               use_locking=False, name="Adam"):
+    """Construct a new Adam optimizer.
+
+    Branched from tf.train.AdamOptimizer. The only difference is to pass
+    global step for computing beta1 and beta2 accumulators, instead of having
+    optimizer keep its own independent beta1 and beta2 accumulators as non-slot
+    variables.
+
+    Initialization:
+
+    $$m_0 := 0 \text{(Initialize initial 1st moment vector)}$$
+    $$v_0 := 0 \text{(Initialize initial 2nd moment vector)}$$
+    $$t := 0 \text{(Initialize timestep)}$$
+
+    The update rule for `variable` with gradient `g` uses an optimization
+    described at the end of section2 of the paper:
+
+    $$t := t + 1$$
+    $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
+
+    $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
+    $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+    $$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
+
+    The default value of 1e-8 for epsilon might not be a good default in
+    general. For example, when training an Inception network on ImageNet a
+    current good choice is 1.0 or 0.1. Note that since AdamOptimizer uses the
+    formulation just before Section 2.1 of the Kingma and Ba paper rather than
+    the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon
+    hat" in the paper.
+
+    The sparse implementation of this algorithm (used when the gradient is an
+    IndexedSlices object, typically because of `tf.gather` or an embedding
+    lookup in the forward pass) does apply momentum to variable slices even if
+    they were not used in the forward pass (meaning they have a gradient equal
+    to zero). Momentum decay (beta1) is also applied to the entire momentum
+    accumulator. This means that the sparse behavior is equivalent to the dense
+    behavior (in contrast to some momentum implementations which ignore momentum
+    unless a variable slice was actually used).
+
+    Args:
+      global_step: tensorflow variable indicating the step.
+      learning_rate: A Tensor or a floating point value.  The learning rate.
+      beta1: A float value or a constant float tensor.
+        The exponential decay rate for the 1st moment estimates.
+      beta2: A float value or a constant float tensor.
+        The exponential decay rate for the 2nd moment estimates.
+      epsilon: A small constant for numerical stability. This epsilon is
+        "epsilon hat" in the Kingma and Ba paper (in the formula just before
+        Section 2.1), not the epsilon in Algorithm 1 of the paper.
+      use_locking: If True use locks for update operations.
+      name: Optional name for the operations created when applying gradients.
+        Defaults to "Adam".
+
+    @compatibility(eager)
+    When eager execution is enabled, `learning_rate`, `beta1`, `beta2`, and
+    `epsilon` can each be a callable that takes no arguments and returns the
+    actual value to use. This can be useful for changing these values across
+    different invocations of optimizer functions.
+    @end_compatibility
+    """
+    super(AdamGSOptimizer, self).__init__(use_locking, name)
+    self._lr = learning_rate
+    self._beta1 = beta1
+    self._beta2 = beta2
+    self._epsilon = epsilon
+    self._global_step = global_step
+    self._global_step_on_worker = None
+
+    # Tensor versions of the constructor arguments, created in _prepare().
+    self._lr_t = None
+    self._beta1_t = None
+    self._beta2_t = None
+    self._epsilon_t = None
+
+    # Created in SparseApply if needed.
+    self._updated_lr = None
+
+  def _get_beta_accumulators(self):
+    return (math_ops.pow(self._beta1_t, self._global_step_on_worker),
+            math_ops.pow(self._beta2_t, self._global_step_on_worker))
+
+  def _create_slots(self, var_list):
+    # Create slots for the first and second moments.
+    for v in var_list:
+      self._zeros_slot(v, "m", self._name)
+      self._zeros_slot(v, "v", self._name)
+
+  def _prepare(self):
+    lr = self._call_if_callable(self._lr)
+    beta1 = self._call_if_callable(self._beta1)
+    beta2 = self._call_if_callable(self._beta2)
+    epsilon = self._call_if_callable(self._epsilon)
+
+    self._lr_t = ops.convert_to_tensor(lr, name="learning_rate")
+    self._beta1_t = ops.convert_to_tensor(beta1, name="beta1")
+    self._beta2_t = ops.convert_to_tensor(beta2, name="beta2")
+    self._epsilon_t = ops.convert_to_tensor(epsilon, name="epsilon")
+
+    # Performance optimization so that worker creates a copy of the global step
+    # to avoid overloading the parameter server holding the global step.
+    self._global_step_on_worker = math_ops.cast(
+        array_ops.identity(self._global_step) + 1, dtypes.float32)
+
+  def _apply_dense(self, grad, var):
+    m = self.get_slot(var, "m")
+    v = self.get_slot(var, "v")
+    beta1_power, beta2_power = self._get_beta_accumulators()
+    return training_ops.apply_adam(
+        var, m, v,
+        math_ops.cast(beta1_power, var.dtype.base_dtype),
+        math_ops.cast(beta2_power, var.dtype.base_dtype),
+        math_ops.cast(self._lr_t, var.dtype.base_dtype),
+        math_ops.cast(self._beta1_t, var.dtype.base_dtype),
+        math_ops.cast(self._beta2_t, var.dtype.base_dtype),
+        math_ops.cast(self._epsilon_t, var.dtype.base_dtype),
+        grad, use_locking=self._use_locking).op
+
+  def _resource_apply_dense(self, grad, var):
+    m = self.get_slot(var, "m")
+    v = self.get_slot(var, "v")
+    beta1_power, beta2_power = self._get_beta_accumulators()
+    return training_ops.resource_apply_adam(
+        var.handle, m.handle, v.handle,
+        math_ops.cast(beta1_power, grad.dtype.base_dtype),
+        math_ops.cast(beta2_power, grad.dtype.base_dtype),
+        math_ops.cast(self._lr_t, grad.dtype.base_dtype),
+        math_ops.cast(self._beta1_t, grad.dtype.base_dtype),
+        math_ops.cast(self._beta2_t, grad.dtype.base_dtype),
+        math_ops.cast(self._epsilon_t, grad.dtype.base_dtype),
+        grad, use_locking=self._use_locking)
+
+  def _apply_sparse_shared(self, grad, var, indices, scatter_add):
+    beta1_power, beta2_power = self._get_beta_accumulators()
+    beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
+    beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
+    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
+    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
+    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
+    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
+    lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))
+    # m_t = beta1 * m + (1 - beta1) * g_t
+    m = self.get_slot(var, "m")
+    m_scaled_g_values = grad * (1 - beta1_t)
+    m_t = state_ops.assign(m, m * beta1_t,
+                           use_locking=self._use_locking)
+    with ops.control_dependencies([m_t]):
+      m_t = scatter_add(m, indices, m_scaled_g_values)
+    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
+    v = self.get_slot(var, "v")
+    v_scaled_g_values = (grad * grad) * (1 - beta2_t)
+    v_t = state_ops.assign(v, v * beta2_t, use_locking=self._use_locking)
+    with ops.control_dependencies([v_t]):
+      v_t = scatter_add(v, indices, v_scaled_g_values)
+    v_sqrt = math_ops.sqrt(v_t)
+    var_update = state_ops.assign_sub(var,
+                                      lr * m_t / (v_sqrt + epsilon_t),
+                                      use_locking=self._use_locking)
+    return control_flow_ops.group(*[var_update, m_t, v_t])
+
+  def _apply_sparse(self, grad, var):
+    return self._apply_sparse_shared(
+        grad.values, var, grad.indices,
+        lambda x, i, v: state_ops.scatter_add(  # pylint: disable=g-long-lambda
+            x, i, v, use_locking=self._use_locking))
+
+  def _resource_scatter_add(self, x, i, v):
+    with ops.control_dependencies(
+        [resource_variable_ops.resource_scatter_add(
+            x.handle, i, v)]):
+      return x.value()
+
+  def _resource_apply_sparse(self, grad, var, indices):
+    return self._apply_sparse_shared(
+        grad, var, indices, self._resource_scatter_add)
diff --git a/tensorflow/contrib/opt/python/training/adam_gs_optimizer_test.py b/tensorflow/contrib/opt/python/training/adam_gs_optimizer_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c68c965aef3729bebe7d0e0dd707c344321d9e3f
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/adam_gs_optimizer_test.py
@@ -0,0 +1,382 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Tests for AdamGS."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.opt.python.training import adam_gs_optimizer
+from tensorflow.python.client import session
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+def adam_update_numpy(param,
+                      g_t,
+                      t,
+                      m,
+                      v,
+                      alpha=0.001,
+                      beta1=0.9,
+                      beta2=0.999,
+                      epsilon=1e-8):
+  alpha_t = alpha * np.sqrt(1 - beta2**t) / (1 - beta1**t)
+
+  m_t = beta1 * m + (1 - beta1) * g_t
+  v_t = beta2 * v + (1 - beta2) * g_t * g_t
+
+  param_t = param - alpha_t * m_t / (np.sqrt(v_t) + epsilon)
+  return param_t, m_t, v_t
+
+
+class AdamGSOptimizerTest(test.TestCase):
+
+  def doTestSparse(self, use_resource=False):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        if use_resource:
+          global_step = resource_variable_ops.ResourceVariable(
+              array_ops.zeros([], dtypes.int64))
+          var0 = resource_variable_ops.ResourceVariable(var0_np)
+          var1 = resource_variable_ops.ResourceVariable(var1_np)
+        else:
+          global_step = variables.Variable(array_ops.zeros([], dtypes.int64))
+          var0 = variables.Variable(var0_np)
+          var1 = variables.Variable(var1_np)
+        grads0_np_indices = np.array([0, 1], dtype=np.int32)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(grads0_np),
+            constant_op.constant(grads0_np_indices), constant_op.constant([2]))
+        grads1_np_indices = np.array([0, 1], dtype=np.int32)
+        grads1 = ops.IndexedSlices(
+            constant_op.constant(grads1_np),
+            constant_op.constant(grads1_np_indices), constant_op.constant([2]))
+        opt = adam_gs_optimizer.AdamGSOptimizer(global_step=global_step)
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]),
+                                     global_step=global_step)
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+
+        # Run 3 steps of Adam
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, self.evaluate(beta1_power))
+          self.assertAllCloseAccordingToType(0.999**t,
+                                             self.evaluate(beta2_power))
+          update.run()
+
+          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+  def testSparse(self):
+    self.doTestSparse(use_resource=False)
+
+  def testResourceSparse(self):
+    self.doTestSparse(use_resource=True)
+
+  def testSparseDevicePlacement(self):
+    for index_dtype in [dtypes.int32, dtypes.int64]:
+      with self.cached_session(force_gpu=test.is_gpu_available()):
+        # If a GPU is available, tests that all optimizer ops can be placed on
+        # it (i.e. they have GPU kernels).
+        var = variables.Variable([[1.0], [2.0]])
+        indices = constant_op.constant([0, 1], dtype=index_dtype)
+        gathered_sum = math_ops.reduce_sum(array_ops.gather(var, indices))
+        optimizer = adam_gs_optimizer.AdamGSOptimizer(3.0)
+        minimize_op = optimizer.minimize(gathered_sum)
+        variables.global_variables_initializer().run()
+        minimize_op.run()
+
+  def testSparseRepeatedIndices(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        repeated_index_global_step = variables.Variable(
+            array_ops.zeros([], dtypes.int64))
+        aggregated_global_step = variables.Variable(
+            array_ops.zeros([], dtypes.int64))
+        repeated_index_update_var = variables.Variable(
+            [[1.0], [2.0]], dtype=dtype)
+        aggregated_update_var = variables.Variable(
+            [[1.0], [2.0]], dtype=dtype)
+        grad_repeated_index = ops.IndexedSlices(
+            constant_op.constant(
+                [0.1, 0.1], shape=[2, 1], dtype=dtype),
+            constant_op.constant([1, 1]),
+            constant_op.constant([2, 1]))
+        grad_aggregated = ops.IndexedSlices(
+            constant_op.constant(
+                [0.2], shape=[1, 1], dtype=dtype),
+            constant_op.constant([1]),
+            constant_op.constant([2, 1]))
+        repeated_update = adam_gs_optimizer.AdamGSOptimizer(
+            global_step=repeated_index_global_step).apply_gradients(
+                [(grad_repeated_index, repeated_index_update_var)],
+                global_step=repeated_index_global_step)
+        aggregated_update = adam_gs_optimizer.AdamGSOptimizer(
+            global_step=aggregated_global_step).apply_gradients(
+                [(grad_aggregated, aggregated_update_var)],
+                global_step=aggregated_global_step)
+        variables.global_variables_initializer().run()
+        self.assertAllClose(aggregated_update_var.eval(),
+                            self.evaluate(repeated_index_update_var))
+        for _ in range(3):
+          repeated_update.run()
+          aggregated_update.run()
+          self.assertAllClose(aggregated_update_var.eval(),
+                              self.evaluate(repeated_index_update_var))
+
+  def doTestBasic(self, use_resource=False, use_callable_params=False):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      with self.session(graph=ops.Graph()):
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        if use_resource:
+          global_step = resource_variable_ops.ResourceVariable(
+              array_ops.zeros([], dtypes.int64), name="global_step_%d" % i)
+          var0 = resource_variable_ops.ResourceVariable(
+              var0_np, name="var0_%d" % i)
+          var1 = resource_variable_ops.ResourceVariable(
+              var1_np, name="var1_%d" % i)
+        else:
+          global_step = variables.Variable(array_ops.zeros([], dtypes.int64))
+          var0 = variables.Variable(var0_np)
+          var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+
+        learning_rate = lambda: 0.001
+        beta1 = lambda: 0.9
+        beta2 = lambda: 0.999
+        epsilon = lambda: 1e-8
+        if not use_callable_params:
+          learning_rate = learning_rate()
+          beta1 = beta1()
+          beta2 = beta2()
+          epsilon = epsilon()
+
+        opt = adam_gs_optimizer.AdamGSOptimizer(global_step=global_step,
+                                                learning_rate=learning_rate)
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]),
+                                     global_step=global_step)
+        opt_variables = opt.variables()
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+        self.assertTrue(beta1_power is not None)
+        self.assertTrue(beta2_power is not None)
+        self.assertNotIn(beta1_power, opt_variables)
+        self.assertNotIn(beta2_power, opt_variables)
+
+        if not context.executing_eagerly():
+          with ops.Graph().as_default():
+            # Shouldn't return non-slot variables from other graphs.
+            self.assertEqual(0, len(opt.variables()))
+          self.evaluate(variables.global_variables_initializer())
+          # Fetch params to validate initial values
+          self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+          self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+        # Run 3 steps of Adam
+        for t in range(1, 4):
+          if not context.executing_eagerly():
+            self.evaluate(update)
+            self.assertAllCloseAccordingToType(
+                0.9**(t + 1), self.evaluate(beta1_power))
+            self.assertAllCloseAccordingToType(
+                0.999**(t + 1), self.evaluate(beta2_power))
+          else:
+            if t > 1:
+              opt.apply_gradients(zip([grads0, grads1], [var0, var1]),
+                                  global_step=global_step)
+              beta1_power, beta2_power = opt._get_beta_accumulators()
+              self.assertAllCloseAccordingToType(
+                  0.9**t, self.evaluate(beta1_power))
+              self.assertAllCloseAccordingToType(
+                  0.999**t, self.evaluate(beta2_power))
+
+          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+          if use_resource:
+            self.assertEqual("var0_%d/Adam:0" % (i,),
+                             opt.get_slot(var=var0, name="m").name)
+
+  def testBasic(self):
+    with self.cached_session():
+      self.doTestBasic(use_resource=False)
+
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
+  def testResourceBasic(self):
+    self.doTestBasic(use_resource=True)
+
+  def testBasicCallableParams(self):
+    with context.eager_mode():
+      self.doTestBasic(use_resource=True, use_callable_params=True)
+
+  def testTensorLearningRate(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        global_step = variables.Variable(array_ops.zeros([], dtypes.int64))
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = variables.Variable(var0_np)
+        var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+        opt = adam_gs_optimizer.AdamGSOptimizer(
+            global_step=global_step, learning_rate=constant_op.constant(0.001))
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]),
+                                     global_step=global_step)
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+
+        # Run 3 steps of Adam
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, self.evaluate(beta1_power))
+          self.assertAllCloseAccordingToType(0.999**t,
+                                             self.evaluate(beta2_power))
+          update.run()
+
+          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+  def testSharing(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        global_step = variables.Variable(array_ops.zeros([], dtypes.int64))
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = variables.Variable(var0_np)
+        var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+        opt = adam_gs_optimizer.AdamGSOptimizer(global_step=global_step)
+        update1 = opt.apply_gradients(zip([grads0, grads1], [var0, var1]),
+                                      global_step=global_step)
+        update2 = opt.apply_gradients(zip([grads0, grads1], [var0, var1]),
+                                      global_step=global_step)
+        variables.global_variables_initializer().run()
+
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+        # Run 3 steps of intertwined Adam1 and Adam2.
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, self.evaluate(beta1_power))
+          self.assertAllCloseAccordingToType(0.999**t,
+                                             self.evaluate(beta2_power))
+          if t % 2 == 0:
+            update1.run()
+          else:
+            update2.run()
+
+          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+  def testTwoSessions(self):
+    optimizer = adam_gs_optimizer.AdamGSOptimizer()
+
+    with context.eager_mode():
+      var0 = variables.Variable(np.array([1.0, 2.0]), name="v0")
+      grads0 = constant_op.constant(np.array([0.1, 0.1]))
+      optimizer.apply_gradients([(grads0, var0)])
+
+    g = ops.Graph()
+    with g.as_default():
+      with session.Session():
+        var0 = variables.Variable(np.array([1.0, 2.0]), name="v0")
+        grads0 = constant_op.constant(np.array([0.1, 0.1]))
+        optimizer.apply_gradients([(grads0, var0)])
+
+    gg = ops.Graph()
+    with gg.as_default():
+      with session.Session():
+        var0 = variables.Variable(np.array([1.0, 2.0]), name="v0")
+        grads0 = constant_op.constant(np.array([0.1, 0.1]))
+
+        # If the optimizer saves any state not keyed by graph the following line
+        # fails.
+        optimizer.apply_gradients([(grads0, var0)])
+
+  def testSlotsUniqueEager(self):
+    with context.eager_mode():
+      v1 = resource_variable_ops.ResourceVariable(1.)
+      v2 = resource_variable_ops.ResourceVariable(1.)
+      opt = adam_gs_optimizer.AdamGSOptimizer(1.)
+      opt.minimize(lambda: v1 + v2)
+      # There should be two unique slot variables for v1 and v2 respectively.
+      self.assertEqual(4, len(set(opt.variables())))
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/opt/python/training/elastic_average_optimizer.py b/tensorflow/contrib/opt/python/training/elastic_average_optimizer.py
index 6c203e5519e6a66d20e2509eca3c74eb66bf32c7..fa1a7aaff0aa59a6a64b1f0bf836a273926d785d 100644
--- a/tensorflow/contrib/opt/python/training/elastic_average_optimizer.py
+++ b/tensorflow/contrib/opt/python/training/elastic_average_optimizer.py
@@ -30,6 +30,7 @@ from tensorflow.python.ops import variables
 from tensorflow.python.training import optimizer
 from tensorflow.python.training import saver
 from tensorflow.python.training import session_run_hook
+from tensorflow.python.training.saving import saveable_object_util
 
 LOCAL_VARIABLE_NAME = 'local_center_variable'
 GLOBAL_VARIABLE_NAME = 'global_center_variable'
@@ -424,7 +425,7 @@ class ElasticAverageOptimizer(optimizer.Optimizer):
     if var_list is None:
       var_list = variables.trainable_variables()
     if not isinstance(var_list, dict):
-      var_list = saver.BaseSaverBuilder.OpListToDict(var_list)
+      var_list = saveable_object_util.op_list_to_dict(var_list)
 
     swapped_var_list = {}
     for key, var in var_list.items():
@@ -464,4 +465,4 @@ class _ElasticAverageOptimizerHook(session_run_hook.SessionRunHook):
 
   def after_create_session(self, session, coord):
     """Run initialization ops"""
-    session.run(self._variable_init_op)
\ No newline at end of file
+    session.run(self._variable_init_op)
diff --git a/tensorflow/contrib/opt/python/training/lazy_adam_gs_optimizer.py b/tensorflow/contrib/opt/python/training/lazy_adam_gs_optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..8827007e4d7f6722398a8e36bd626377842d92ef
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/lazy_adam_gs_optimizer.py
@@ -0,0 +1,114 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""LazyAdam rewrite to use global step for computing beta1 & beta2 accumulation.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.opt.python.training import adam_gs_optimizer
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import state_ops
+
+
+class LazyAdamGSOptimizer(adam_gs_optimizer.AdamGSOptimizer):
+  """Variant of the Adam optimizer that handles sparse updates more efficiently.
+
+  Branched from tf.contrib.opt.LazyAdamGSOptimizer. The only difference is to
+  pass global step for computing beta1 and beta2 accumulators, instead of having
+  optimizer keep its own independent beta1 and beta2 accumulators as non-slot
+  variables.
+
+  The original Adam algorithm maintains two moving-average accumulators for
+  each trainable variable; the accumulators are updated at every step.
+  This class provides lazier handling of gradient updates for sparse variables.
+  It only updates moving-average accumulators for sparse variable indices that
+  appear in the current batch, rather than updating the accumulators for all
+  indices. Compared with the original Adam optimizer, it can provide large
+  improvements in model training throughput for some applications. However, it
+  provides slightly different semantics than the original Adam algorithm, and
+  may lead to different empirical results.
+  """
+
+  def _apply_sparse(self, grad, var):
+    beta1_power, beta2_power = self._get_beta_accumulators()
+    beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
+    beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
+    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
+    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
+    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
+    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
+    lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))
+
+    # \\(m := beta1 * m + (1 - beta1) * g_t\\)
+    m = self.get_slot(var, "m")
+    m_t = state_ops.scatter_update(m, grad.indices,
+                                   beta1_t * array_ops.gather(m, grad.indices) +
+                                   (1 - beta1_t) * grad.values,
+                                   use_locking=self._use_locking)
+
+    # \\(v := beta2 * v + (1 - beta2) * (g_t * g_t)\\)
+    v = self.get_slot(var, "v")
+    v_t = state_ops.scatter_update(v, grad.indices,
+                                   beta2_t * array_ops.gather(v, grad.indices) +
+                                   (1 - beta2_t) * math_ops.square(grad.values),
+                                   use_locking=self._use_locking)
+
+    # \\(variable -= learning_rate * m_t / (epsilon_t + sqrt(v_t))\\)
+    m_t_slice = array_ops.gather(m_t, grad.indices)
+    v_t_slice = array_ops.gather(v_t, grad.indices)
+    denominator_slice = math_ops.sqrt(v_t_slice) + epsilon_t
+    var_update = state_ops.scatter_sub(var, grad.indices,
+                                       lr * m_t_slice / denominator_slice,
+                                       use_locking=self._use_locking)
+    return control_flow_ops.group(var_update, m_t, v_t)
+
+  def _resource_apply_sparse(self, grad, var, indices):
+    beta1_power, beta2_power = self._get_beta_accumulators()
+    beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
+    beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
+    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
+    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
+    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
+    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
+    lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))
+
+    # \\(m := beta1 * m + (1 - beta1) * g_t\\)
+    m = self.get_slot(var, "m")
+    m_t_slice = beta1_t * array_ops.gather(m, indices) + (1 - beta1_t) * grad
+    m_update_op = resource_variable_ops.resource_scatter_update(m.handle,
+                                                                indices,
+                                                                m_t_slice)
+
+    # \\(v := beta2 * v + (1 - beta2) * (g_t * g_t)\\)
+    v = self.get_slot(var, "v")
+    v_t_slice = (beta2_t * array_ops.gather(v, indices) +
+                 (1 - beta2_t) * math_ops.square(grad))
+    v_update_op = resource_variable_ops.resource_scatter_update(v.handle,
+                                                                indices,
+                                                                v_t_slice)
+
+    # \\(variable -= learning_rate * m_t / (epsilon_t + sqrt(v_t))\\)
+    var_slice = lr * m_t_slice / (math_ops.sqrt(v_t_slice) + epsilon_t)
+    var_update_op = resource_variable_ops.resource_scatter_sub(var.handle,
+                                                               indices,
+                                                               var_slice)
+
+    return control_flow_ops.group(var_update_op, m_update_op, v_update_op)
diff --git a/tensorflow/contrib/opt/python/training/lazy_adam_gs_optimizer_test.py b/tensorflow/contrib/opt/python/training/lazy_adam_gs_optimizer_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..bdc9a02a546c8399172d0c5b58941b4d80179955
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/lazy_adam_gs_optimizer_test.py
@@ -0,0 +1,402 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Tests for LazyAdamGSOptimizer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.contrib.opt.python.training import lazy_adam_gs_optimizer
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+def adam_update_numpy(param,
+                      g_t,
+                      t,
+                      m,
+                      v,
+                      alpha=0.001,
+                      beta1=0.9,
+                      beta2=0.999,
+                      epsilon=1e-8):
+  alpha_t = alpha * np.sqrt(1 - beta2**t) / (1 - beta1**t)
+
+  m_t = beta1 * m + (1 - beta1) * g_t
+  v_t = beta2 * v + (1 - beta2) * g_t * g_t
+
+  param_t = param - alpha_t * m_t / (np.sqrt(v_t) + epsilon)
+  return param_t, m_t, v_t
+
+
+class LazyAdamGSOptimizerTest(test.TestCase, parameterized.TestCase):
+
+  @parameterized.parameters([False, True])
+  def testSparse(self, use_resource):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        if use_resource:
+          global_step = resource_variable_ops.ResourceVariable(
+              array_ops.zeros([], dtypes.int64))
+          var0 = resource_variable_ops.ResourceVariable(var0_np)
+          var1 = resource_variable_ops.ResourceVariable(var1_np)
+        else:
+          global_step = variables.Variable(array_ops.zeros([], dtypes.int64))
+          var0 = variables.Variable(var0_np)
+          var1 = variables.Variable(var1_np)
+
+        grads0_np_indices = np.array([0, 1], dtype=np.int32)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(grads0_np),
+            constant_op.constant(grads0_np_indices), constant_op.constant([2]))
+        grads1_np_indices = np.array([0, 1], dtype=np.int32)
+        grads1 = ops.IndexedSlices(
+            constant_op.constant(grads1_np),
+            constant_op.constant(grads1_np_indices), constant_op.constant([2]))
+        opt = lazy_adam_gs_optimizer.LazyAdamGSOptimizer(
+            global_step=global_step)
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]),
+                                     global_step=global_step)
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+
+        # Run 3 steps of Adam
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
+          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
+          update.run()
+
+          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  @parameterized.parameters([False, True])
+  def testSparseDevicePlacement(self, use_resource):
+    for index_dtype in [dtypes.int32, dtypes.int64]:
+      with self.cached_session(force_gpu=test.is_gpu_available()):
+        # If a GPU is available, tests that all optimizer ops can be placed on
+        # it (i.e. they have GPU kernels).
+        if use_resource:
+          global_step = resource_variable_ops.ResourceVariable(
+              array_ops.zeros([], dtypes.int64))
+          var = resource_variable_ops.ResourceVariable([[1.0], [2.0]])
+        else:
+          global_step = variables.Variable(array_ops.zeros([], dtypes.int64))
+          var = variables.Variable([[1.0], [2.0]])
+
+        indices = constant_op.constant([0, 1], dtype=index_dtype)
+        gathered_sum = math_ops.reduce_sum(array_ops.gather(var, indices))
+        optimizer = lazy_adam_gs_optimizer.LazyAdamGSOptimizer(
+            global_step=global_step, learning_rate=3.0)
+        minimize_op = optimizer.minimize(gathered_sum, global_step=global_step)
+        variables.global_variables_initializer().run()
+        minimize_op.run()
+
+  @parameterized.parameters([False, True])
+  def testSparseRepeatedIndices(self, use_resource):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        if use_resource:
+          repeated_index_global_step = resource_variable_ops.ResourceVariable(
+              array_ops.zeros([], dtypes.int64))
+          aggregated_global_step = resource_variable_ops.ResourceVariable(
+              array_ops.zeros([], dtypes.int64))
+          repeated_index_update_var = resource_variable_ops.ResourceVariable(
+              [[1.0], [2.0]], dtype=dtype)
+          aggregated_update_var = resource_variable_ops.ResourceVariable(
+              [[1.0], [2.0]], dtype=dtype)
+        else:
+          repeated_index_global_step = variables.Variable(
+              array_ops.zeros([], dtypes.int64))
+          aggregated_global_step = variables.Variable(
+              array_ops.zeros([], dtypes.int64))
+          repeated_index_update_var = variables.Variable(
+              [[1.0], [2.0]], dtype=dtype)
+          aggregated_update_var = variables.Variable(
+              [[1.0], [2.0]], dtype=dtype)
+
+        grad_repeated_index = ops.IndexedSlices(
+            constant_op.constant(
+                [0.1, 0.1], shape=[2, 1], dtype=dtype),
+            constant_op.constant([1, 1]),
+            constant_op.constant([2, 1]))
+        grad_aggregated = ops.IndexedSlices(
+            constant_op.constant(
+                [0.2], shape=[1, 1], dtype=dtype),
+            constant_op.constant([1]),
+            constant_op.constant([2, 1]))
+        repeated_update_opt = lazy_adam_gs_optimizer.LazyAdamGSOptimizer(
+            global_step=repeated_index_global_step)
+        repeated_update = repeated_update_opt.apply_gradients(
+            [(grad_repeated_index, repeated_index_update_var)],
+            global_step=repeated_index_global_step)
+        aggregated_update_opt = lazy_adam_gs_optimizer.LazyAdamGSOptimizer(
+            global_step=aggregated_global_step)
+        aggregated_update = aggregated_update_opt.apply_gradients(
+            [(grad_aggregated, aggregated_update_var)],
+            global_step=aggregated_global_step)
+        variables.global_variables_initializer().run()
+        self.assertAllClose(aggregated_update_var.eval(),
+                            repeated_index_update_var.eval())
+        for _ in range(3):
+          repeated_update.run()
+          aggregated_update.run()
+          self.assertAllClose(aggregated_update_var.eval(),
+                              repeated_index_update_var.eval())
+
+  def doTestBasic(self, use_resource=False, use_callable_params=False):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      with self.session(graph=ops.Graph()):
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        if use_resource:
+          global_step = resource_variable_ops.ResourceVariable(
+              array_ops.zeros([], dtypes.int64), name="global_step_%d" % i)
+          var0 = resource_variable_ops.ResourceVariable(
+              var0_np, name="var0_%d" % i)
+          var1 = resource_variable_ops.ResourceVariable(
+              var1_np, name="var1_%d" % i)
+        else:
+          global_step = variables.Variable(array_ops.zeros([], dtypes.int64))
+          var0 = variables.Variable(var0_np)
+          var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+
+        learning_rate = lambda: 0.001
+        beta1 = lambda: 0.9
+        beta2 = lambda: 0.999
+        epsilon = lambda: 1e-8
+        if not use_callable_params:
+          learning_rate = learning_rate()
+          beta1 = beta1()
+          beta2 = beta2()
+          epsilon = epsilon()
+
+        opt = lazy_adam_gs_optimizer.LazyAdamGSOptimizer(
+            global_step=global_step, learning_rate=learning_rate)
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]),
+                                     global_step=global_step)
+        opt_variables = opt.variables()
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+        self.assertIsNotNone(beta1_power)
+        self.assertIsNotNone(beta2_power is not None)
+        self.assertNotIn(beta1_power, opt_variables)
+        self.assertNotIn(beta2_power, opt_variables)
+
+        if not context.executing_eagerly():
+          with ops.Graph().as_default():
+            # Shouldn't return non-slot variables from other graphs.
+            self.assertEqual(0, len(opt.variables()))
+          self.evaluate(variables.global_variables_initializer())
+          # Fetch params to validate initial values
+          self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+          self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+        # Run 3 steps of Adam
+        for t in range(1, 4):
+          if not context.executing_eagerly():
+            self.evaluate(update)
+            self.assertAllCloseAccordingToType(
+                0.9**(t + 1), self.evaluate(beta1_power))
+            self.assertAllCloseAccordingToType(
+                0.999**(t + 1), self.evaluate(beta2_power))
+          else:
+            if t > 1:
+              opt.apply_gradients(zip([grads0, grads1], [var0, var1]),
+                                  global_step=global_step)
+              beta1_power, beta2_power = opt._get_beta_accumulators()
+              self.assertAllCloseAccordingToType(
+                  0.9**t, self.evaluate(beta1_power))
+              self.assertAllCloseAccordingToType(
+                  0.999**t, self.evaluate(beta2_power))
+
+          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+          if use_resource:
+            self.assertEqual("var0_%d/Adam:0" % (i,),
+                             opt.get_slot(var=var0, name="m").name)
+
+  def testBasic(self):
+    with self.cached_session():
+      self.doTestBasic(use_resource=False)
+
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
+  def testResourceBasic(self):
+    self.doTestBasic(use_resource=True)
+
+  def testBasicCallableParams(self):
+    with context.eager_mode():
+      self.doTestBasic(use_resource=True, use_callable_params=True)
+
+  def testTensorLearningRate(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        global_step = variables.Variable(array_ops.zeros([], dtypes.int64))
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = variables.Variable(var0_np)
+        var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+        opt = lazy_adam_gs_optimizer.LazyAdamGSOptimizer(
+            global_step=global_step, learning_rate=constant_op.constant(0.001))
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]),
+                                     global_step=global_step)
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+
+        # Run 3 steps of Adam
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
+          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
+          update.run()
+
+          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testSharing(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        global_step = variables.Variable(array_ops.zeros([], dtypes.int64))
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = variables.Variable(var0_np)
+        var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+        opt = lazy_adam_gs_optimizer.LazyAdamGSOptimizer(
+            global_step=global_step)
+        update1 = opt.apply_gradients(zip([grads0, grads1], [var0, var1]),
+                                      global_step=global_step)
+        update2 = opt.apply_gradients(zip([grads0, grads1], [var0, var1]),
+                                      global_step=global_step)
+        variables.global_variables_initializer().run()
+
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        # Run 3 steps of intertwined Adam1 and Adam2.
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
+          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
+          if t % 2 == 0:
+            update1.run()
+          else:
+            update2.run()
+
+          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testTwoSessions(self):
+    optimizer = lazy_adam_gs_optimizer.LazyAdamGSOptimizer()
+
+    with context.eager_mode():
+      var0 = variables.Variable(np.array([1.0, 2.0]), name="v0")
+      grads0 = constant_op.constant(np.array([0.1, 0.1]))
+      optimizer.apply_gradients([(grads0, var0)])
+
+    g = ops.Graph()
+    with g.as_default():
+      with self.session(graph=g):
+        var0 = variables.Variable(np.array([1.0, 2.0]), name="v0")
+        grads0 = constant_op.constant(np.array([0.1, 0.1]))
+        optimizer.apply_gradients([(grads0, var0)])
+
+    gg = ops.Graph()
+    with gg.as_default():
+      with self.session(graph=gg):
+        var0 = variables.Variable(np.array([1.0, 2.0]), name="v0")
+        grads0 = constant_op.constant(np.array([0.1, 0.1]))
+
+        # If the optimizer saves any state not keyed by graph the following line
+        # fails.
+        optimizer.apply_gradients([(grads0, var0)])
+
+  def testSlotsUniqueEager(self):
+    with context.eager_mode():
+      v1 = resource_variable_ops.ResourceVariable(1.)
+      v2 = resource_variable_ops.ResourceVariable(1.)
+      opt = lazy_adam_gs_optimizer.LazyAdamGSOptimizer(1.)
+      opt.minimize(lambda: v1 + v2)
+      # There should be two non-slot variables, and two unique slot variables
+      # for v1 and v2 respectively.
+      self.assertLen(set(opt.variables()), 4)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/opt/python/training/moving_average_optimizer.py b/tensorflow/contrib/opt/python/training/moving_average_optimizer.py
index b7fd2d2fb9db3eed15eb1cc2934199939790b1c0..bf3e5c51f78cc3ca3c7c77009c9cf428c4988953 100644
--- a/tensorflow/contrib/opt/python/training/moving_average_optimizer.py
+++ b/tensorflow/contrib/opt/python/training/moving_average_optimizer.py
@@ -26,6 +26,7 @@ from tensorflow.python.ops import variables
 from tensorflow.python.training import moving_averages
 from tensorflow.python.training import optimizer
 from tensorflow.python.training import saver
+from tensorflow.python.training.saving import saveable_object_util
 
 
 class MovingAverageOptimizer(optimizer.Optimizer):
@@ -165,7 +166,7 @@ class MovingAverageOptimizer(optimizer.Optimizer):
     if var_list is None:
       var_list = variables.global_variables()
     if not isinstance(var_list, dict):
-      var_list = saver.BaseSaverBuilder.OpListToDict(var_list)
+      var_list = saveable_object_util.op_list_to_dict(var_list)
 
     v_name_to_tensor = {}
     for k, tensor_or_list in six.iteritems(var_list):
diff --git a/tensorflow/contrib/opt/python/training/weight_decay_optimizers.py b/tensorflow/contrib/opt/python/training/weight_decay_optimizers.py
index 200b0d200826a6212a236680327f4daf7d07831f..8b8065c678e11e8fc237e71cf1d392ced5c22ada 100644
--- a/tensorflow/contrib/opt/python/training/weight_decay_optimizers.py
+++ b/tensorflow/contrib/opt/python/training/weight_decay_optimizers.py
@@ -59,6 +59,23 @@ class DecoupledWeightDecayExtension(object):
   Note that this extension decays weights BEFORE applying the update based
   on the gradient, i.e. this extension only has the desired behaviour for
   optimizers which do not depend on the value of'var' in the update step!
+  
+  Note: when applying a decay to the learning rate, be sure to manually apply
+  the decay to the `weight_decay` as well. For example:
+
+  ```python
+    schedule = tf.train.piecewise_constant(tf.train.get_global_step(), 
+                                           [10000, 15000], [1e-0, 1e-1, 1e-2])
+    lr = 1e-1 * schedule()
+    wd = lambda: 1e-4 * schedule()
+
+    # ...
+
+    optimizer = tf.contrib.opt.MomentumWOptimizer(learning_rate=lr,
+                                                  weight_decay=wd,
+                                                  momentum=0.9,
+                                                  use_nesterov=True)
+  ```
   """
 
   def __init__(self, weight_decay, **kwargs):
diff --git a/tensorflow/contrib/optimizer_v2/optimizer_v2.py b/tensorflow/contrib/optimizer_v2/optimizer_v2.py
index 73a556f0b299614b098ceef0fb9d32f148227b03..7fb23abc38d9dc101204ed83808aebe5a8ef1e78 100644
--- a/tensorflow/contrib/optimizer_v2/optimizer_v2.py
+++ b/tensorflow/contrib/optimizer_v2/optimizer_v2.py
@@ -25,6 +25,7 @@ import abc
 import six
 
 from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import distribution_strategy_context as distribute_ctx
 from tensorflow.python.distribute import reduce_util as ds_reduce_util
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
@@ -36,7 +37,6 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
-from tensorflow.python.training import distribution_strategy_context as distribute_ctx
 from tensorflow.python.training import optimizer as optimizer_v1
 from tensorflow.python.training import slot_creator
 from tensorflow.python.training.checkpointable import base as checkpointable
@@ -997,10 +997,10 @@ class OptimizerV2(optimizer_v1.Optimizer):
       with ops.control_dependencies([update_ops]):
         finish_updates = distribution.extended.update_non_slot(
             non_slot_devices, finish, group=False)
-      # We said grouped=False, which means finish_updates is always a list.
-      # It will be [None] when finish() returns None.
-      if finish_updates == [None]:
-        finish_updates = [update_ops]
+      # We said group=False, which means finish_updates is always a tuple.
+      # It will be (None,) when finish() returns None.
+      if finish_updates == (None,):
+        finish_updates = (update_ops,)
 
       # Update `global_step` (if any).
       if global_step is None:
diff --git a/tensorflow/contrib/predictor/BUILD b/tensorflow/contrib/predictor/BUILD
index d50b52b8ff1ce8188ab52c6968d716378efd9daa..53a3bc63e1d770b451846c45370fdee9ffa72d70 100644
--- a/tensorflow/contrib/predictor/BUILD
+++ b/tensorflow/contrib/predictor/BUILD
@@ -42,6 +42,7 @@ py_library(
     name = "saved_model_predictor",
     srcs = ["saved_model_predictor.py"],
     srcs_version = "PY2AND3",
+    visibility = ["//learning/brain/contrib/learn/tpu:__subpackages__"],
     deps = [
         ":base_predictor",
         "//tensorflow/contrib/saved_model:saved_model_py",
diff --git a/tensorflow/contrib/quantize/python/quantize.py b/tensorflow/contrib/quantize/python/quantize.py
index 21d1b1213090273b5abd8e012f8711db98c94347..7c973fe597181b822e617db1f85a08f1b678e26f 100644
--- a/tensorflow/contrib/quantize/python/quantize.py
+++ b/tensorflow/contrib/quantize/python/quantize.py
@@ -685,7 +685,7 @@ def _InsertQuantOp(context,
       [1; 2^bits - 1] or wide range [0; 2^bits - 1].
     producer_scope: The restriction of producer scope. If not None, the new op
       will be inserted only when the producer is in this scope.
-    consumer_scope: The restriction of producer scope. If not None, the new op
+    consumer_scope: The restriction of consumer scope. If not None, the new op
       will be inserted only when all the consumers are in this scope.
   Raises:
     ValueError: When producer operation is not directly connected to the
diff --git a/tensorflow/contrib/rate/BUILD b/tensorflow/contrib/rate/BUILD
index c461a7145e27c4238161cec989448be807acd543..76db9aecf615d0a94f65cd7ea799db245828db1c 100644
--- a/tensorflow/contrib/rate/BUILD
+++ b/tensorflow/contrib/rate/BUILD
@@ -34,6 +34,11 @@ py_test(
     name = "rate_test",
     size = "small",
     srcs = ["rate_test.py"],
+    tags = [
+        "manual",  # TODO(b/120555555)
+        "no_oss",  # TODO(b/120555555)
+        "notap",  # TODO(b/120555555)
+    ],
     deps = [
         ":rate",
         "//tensorflow/python:array_ops",
diff --git a/tensorflow/contrib/receptive_field/README.md b/tensorflow/contrib/receptive_field/README.md
index 79b015a9163f5727caa40b54579c71e57621c92f..d1c41e4c0a11028765c9fc0dc345cb29453baa31 100644
--- a/tensorflow/contrib/receptive_field/README.md
+++ b/tensorflow/contrib/receptive_field/README.md
@@ -185,5 +185,4 @@ Effective padding (vertical) = 1482
 
 ## Authors
 
-Andr&eacute; Araujo (github id: andrefaraujo) and Mark Sandler (github id:
-marksandler)
+Andr&eacute; Araujo (@andrefaraujo) and Mark Sandler (@marksandler)
diff --git a/tensorflow/contrib/receptive_field/python/util/examples/compute_rf.py b/tensorflow/contrib/receptive_field/python/util/examples/compute_rf.py
index d6fdd12bbe37fb0e0cb12f1d0adc3fce29b19e8a..72f98ccc32e945b48b5f1b570bcca323a5b5f48a 100644
--- a/tensorflow/contrib/receptive_field/python/util/examples/compute_rf.py
+++ b/tensorflow/contrib/receptive_field/python/util/examples/compute_rf.py
@@ -12,10 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Computes Receptive Field (RF) information given a graph protobuf.
-
-For an example of usage, see accompanying file compute_rf.sh
-"""
+"""Computes Receptive Field (RF) information given a graph protobuf."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/receptive_field/python/util/examples/rf_benchmark.py b/tensorflow/contrib/receptive_field/python/util/examples/rf_benchmark.py
index a298b4d49038468299b58140758c69675368e855..325929a5937ac60a6134fae064e7633a4c57473d 100644
--- a/tensorflow/contrib/receptive_field/python/util/examples/rf_benchmark.py
+++ b/tensorflow/contrib/receptive_field/python/util/examples/rf_benchmark.py
@@ -16,8 +16,6 @@
 
 The receptive field (and related parameters) for the different models are
 printed to stdout, and may also optionally be written to a CSV file.
-
-For an example of usage, see rf_benchmark.sh
 """
 
 from __future__ import absolute_import
@@ -262,11 +260,11 @@ def _model_rf(graphdef,
       information will be computed.
     model_type: Type of model to be used, used only for printing purposes.
     csv_writer: A CSV writer for RF parameters, which is used if it is not None.
-    input_resolution: Input resolution to use when computing RF
-      parameters. This is important for the case where padding can only be
-      defined if the input resolution is known, which may happen if using SAME
-      padding. This is assumed the resolution for both height and width. If
-      None, we consider the resolution is unknown.
+    input_resolution: Input resolution to use when computing RF parameters. This
+      is important for the case where padding can only be defined if the input
+      resolution is known, which may happen if using SAME padding. This is
+      assumed the resolution for both height and width. If None, we consider the
+      resolution is unknown.
   """
   for desired_end_point_key in desired_end_point_keys:
     print('- %s:' % desired_end_point_key)
@@ -283,10 +281,10 @@ def _model_rf(graphdef,
       if (receptive_field_x == receptive_field_y) and (
           effective_stride_x == effective_stride_y) and (
               effective_padding_x == effective_padding_y):
-        print('Receptive field size = %5s, effective stride = %5s, effective '
-              'padding = %5s' % (str(receptive_field_x),
-                                 str(effective_stride_x),
-                                 str(effective_padding_x)))
+        print(
+            'Receptive field size = %5s, effective stride = %5s, effective '
+            'padding = %5s' % (str(receptive_field_x), str(effective_stride_x),
+                               str(effective_padding_x)))
       else:
         print('Receptive field size: horizontal = %5s, vertical = %5s. '
               'Effective stride: horizontal = %5s, vertical = %5s. Effective '
@@ -362,9 +360,8 @@ def _process_model_rf(model_type='resnet_v1_50',
       defined if the input resolution is known, which may happen if using SAME
       padding. The entries in the list are assumed the resolution for both
       height and width. If one of the elements in the list is None, we consider
-      it to mean that the resolution is unknown. If the list itself is None,
-      we use the default list [None, 224, 321].
-
+      it to mean that the resolution is unknown. If the list itself is None, we
+      use the default list [None, 224, 321].
   """
   # Process default value for this list.
   if input_resolutions is None:
@@ -477,8 +474,8 @@ def _mobilenet_v1_rf(csv_writer=None):
     csv_writer: A CSV writer for RF parameters, which is used if it is not None.
   """
   for model_type in _SUPPORTED_MOBILENETV1_VARIANTS:
-    with slim.arg_scope(
-        [slim.batch_norm, slim.dropout], is_training=False) as arg_sc:
+    with slim.arg_scope([slim.batch_norm, slim.dropout],
+                        is_training=False) as arg_sc:
       _process_model_rf(model_type, csv_writer, arg_sc)
 
 
diff --git a/tensorflow/contrib/receptive_field/python/util/receptive_field.py b/tensorflow/contrib/receptive_field/python/util/receptive_field.py
index b9bd2f09761ab10a62d37e8e2580b93b9b8a4453..9127c772c75279d9c8eacc5a17680beba9247d01 100644
--- a/tensorflow/contrib/receptive_field/python/util/receptive_field.py
+++ b/tensorflow/contrib/receptive_field/python/util/receptive_field.py
@@ -12,12 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Functions to compute receptive field of a fully-convolutional network.
-
-Please refer to the following g3doc for detailed explanation on how this
-computation is performed, and why it is important:
-g3doc/photos/vision/features/delf/g3doc/rf_computation.md
-"""
+"""Functions to compute receptive field of a fully-convolutional network."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -96,8 +91,8 @@ class ReceptiveField(object):
     Args:
       y: An array of feature coordinates with shape `(..., d)`, where `d` is the
         number of dimensions of the coordinates.
-      axis: The dimensions for which to compute the input center coordinates.
-        If `None` (the default), compute the input center coordinates for all
+      axis: The dimensions for which to compute the input center coordinates. If
+        `None` (the default), compute the input center coordinates for all
         dimensions.
 
     Returns:
@@ -127,8 +122,8 @@ class ReceptiveField(object):
     Args:
       x: An array of input center coordinates with shape `(..., d)`, where `d`
         is the number of dimensions of the coordinates.
-      axis: The dimensions for which to compute the feature coordinates.
-        If `None` (the default), compute the feature coordinates for all
+      axis: The dimensions for which to compute the feature coordinates. If
+        `None` (the default), compute the feature coordinates for all
         dimensions.
 
     Returns:
@@ -274,14 +269,15 @@ def compute_receptive_field_from_graph_def(graph_def,
         continue
 
       # Get params for this layer.
-      (kernel_size_x, kernel_size_y, stride_x, stride_y, padding_x,
-       padding_y, _, _) = parse_layer_parameters.get_layer_params(
+      (kernel_size_x, kernel_size_y, stride_x, stride_y, padding_x, padding_y,
+       _, _) = parse_layer_parameters.get_layer_params(
            node, name_to_node, node_info[node.name].input_size)
-      logging.vlog(3, "kernel_size_x = %s, kernel_size_y = %s, "
-                   "stride_x = %s, stride_y = %s, "
-                   "padding_x = %s, padding_y = %s, input size = %s" %
-                   (kernel_size_x, kernel_size_y, stride_x, stride_y, padding_x,
-                    padding_y, node_info[node.name].input_size))
+      logging.vlog(
+          3, "kernel_size_x = %s, kernel_size_y = %s, "
+          "stride_x = %s, stride_y = %s, "
+          "padding_x = %s, padding_y = %s, input size = %s" %
+          (kernel_size_x, kernel_size_y, stride_x, stride_y, padding_x,
+           padding_y, node_info[node.name].input_size))
       if padding_x is None or padding_y is None:
         undefined_padding = True
 
@@ -352,15 +348,15 @@ def compute_receptive_field_from_graph_def(graph_def,
               raise ValueError(
                   "Graph is not aligned since effective stride from different "
                   "paths is different in vertical direction")
-            if (rf_sizes_x[inp_name] - 1
-               ) / 2 - effective_paddings_x[inp_name] != (
-                   rf_size_input_x - 1) / 2 - effective_padding_input_x:
+            if (rf_sizes_x[inp_name] -
+                1) / 2 - effective_paddings_x[inp_name] != (
+                    rf_size_input_x - 1) / 2 - effective_padding_input_x:
               raise ValueError(
                   "Graph is not aligned since center shift from different "
                   "paths is different in horizontal direction")
-            if (rf_sizes_y[inp_name] - 1
-               ) / 2 - effective_paddings_y[inp_name] != (
-                   rf_size_input_y - 1) / 2 - effective_padding_input_y:
+            if (rf_sizes_y[inp_name] -
+                1) / 2 - effective_paddings_y[inp_name] != (
+                    rf_size_input_y - 1) / 2 - effective_padding_input_y:
               raise ValueError(
                   "Graph is not aligned since center shift from different "
                   "paths is different in vertical direction")
diff --git a/tensorflow/contrib/resampler/xla/resampler_ops_xla_test.py b/tensorflow/contrib/resampler/xla/resampler_ops_xla_test.py
index d8ca0eab276b39f025d018edebb78eed7a8433bb..cec4c3c23305034d167a248a637425507750064e 100644
--- a/tensorflow/contrib/resampler/xla/resampler_ops_xla_test.py
+++ b/tensorflow/contrib/resampler/xla/resampler_ops_xla_test.py
@@ -164,6 +164,15 @@ class ResamplerOpsTest(xla_test.XLATestCase):
       expected = [[[0.0], [27.62]]]
       self._assertForwardOpMatchesExpected(input_np, warp_np, expected)
 
+      expected_grad_data = [[[[0.12], [0.27999997]], [[0.18000001],
+                                                      [0.42000002]]]]
+      expected_grad_warp = [[[0., 0.], [22.60000038, 35.20000076]]]
+
+      grad_output = np.ones([1, 2, 1], dtype=dtype)
+      self._assertBackwardOpMatchesExpected(input_np, warp_np, grad_output,
+                                            expected_grad_data,
+                                            expected_grad_warp)
+
     # One of (x, y) is less than 0.
     for dtype in self.float_types:
       input_shape = [1, 2, 2, 1]
@@ -171,11 +180,21 @@ class ResamplerOpsTest(xla_test.XLATestCase):
       input_np = np.array(input_data, dtype=dtype).reshape(input_shape)
 
       warp_shape = [1, 2, 2]
+      # -1 is out of bound for grad_warp.
       warp_data = [-1, 0.1, 0.7, 0.6]
       warp_np = np.array(warp_data, dtype=dtype).reshape(warp_shape)
       expected = [[[0.0], [27.62]]]
       self._assertForwardOpMatchesExpected(input_np, warp_np, expected)
 
+      expected_grad_data = [[[[0.12], [0.27999997]], [[0.18000001],
+                                                      [0.42000002]]]]
+      expected_grad_warp = [[[0., 0.], [22.60000038, 35.20000076]]]
+
+      grad_output = np.ones([1, 2, 1], dtype=dtype)
+      self._assertBackwardOpMatchesExpected(input_np, warp_np, grad_output,
+                                            expected_grad_data,
+                                            expected_grad_warp)
+
     # Both of (x, y) are greater than image size.
     for dtype in self.float_types:
       input_shape = [1, 2, 2, 1]
@@ -183,11 +202,20 @@ class ResamplerOpsTest(xla_test.XLATestCase):
       input_np = np.array(input_data, dtype=dtype).reshape(input_shape)
 
       warp_shape = [1, 2, 2]
+      # -0.1 is *inbound* for grad_warp and grad_data, 2.1 is out of bound.
       warp_data = [-0.1, 0.1, 1.2, 2.1]
       warp_np = np.array(warp_data, dtype=dtype).reshape(warp_shape)
       expected = [[[0.0], [0.0]]]
       self._assertForwardOpMatchesExpected(input_np, warp_np, expected)
 
+      expected_grad_data = [[[[0.81], [0.0]], [[0.09], [0.0]]]]
+      expected_grad_warp = [[[10.30, 2.7], [0.0, 0.0]]]
+
+      grad_output = np.ones([1, 2, 1], dtype=dtype)
+      self._assertBackwardOpMatchesExpected(input_np, warp_np, grad_output,
+                                            expected_grad_data,
+                                            expected_grad_warp)
+
     # One of (x, y) is greater than image size.
     for dtype in self.float_types:
       input_shape = [1, 2, 2, 1]
@@ -200,6 +228,14 @@ class ResamplerOpsTest(xla_test.XLATestCase):
       expected = [[[0.0], [0.0]]]
       self._assertForwardOpMatchesExpected(input_np, warp_np, expected)
 
+      expected_grad_data = [[[[0.81], [0.81]], [[0.0], [0.08]]]]
+      expected_grad_warp = [[[-4.5, 9.5], [-9.9, 39.20]]]
+
+      grad_output = np.ones([1, 2, 1], dtype=dtype)
+      self._assertBackwardOpMatchesExpected(input_np, warp_np, grad_output,
+                                            expected_grad_data,
+                                            expected_grad_warp)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model.py b/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model.py
index ffba514bb96f5ce8d963cb0a0482738eafe88355..2a4b6eae367fe617e9a19d80f16eb3fda9ade1c0 100644
--- a/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model.py
+++ b/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model.py
@@ -22,53 +22,57 @@ import os
 import six
 
 from tensorflow.python.client import session
-from tensorflow.python.estimator import keras as estimator_keras_util
-from tensorflow.python.estimator import model_fn as model_fn_lib
-from tensorflow.python.estimator.export import export as export_helpers
 from tensorflow.python.framework import ops
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import models as models_lib
 from tensorflow.python.keras import optimizers
 from tensorflow.python.keras.engine import sequential
+from tensorflow.python.keras.engine import training_utils
 from tensorflow.python.keras.metrics import Metric
 from tensorflow.python.keras.models import model_from_json
 from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import variables
-from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.saved_model import builder as saved_model_builder
 from tensorflow.python.saved_model import constants
+from tensorflow.python.saved_model import save as save_lib
 from tensorflow.python.saved_model import utils_impl as saved_model_utils
 from tensorflow.python.training import saver as saver_lib
 from tensorflow.python.training.checkpointable import util as checkpointable_utils
 from tensorflow.python.util import compat
+from tensorflow.python.util import nest
+from tensorflow_estimator.python.estimator import keras as estimator_keras_util
+from tensorflow_estimator.python.estimator import model_fn as model_fn_lib
+from tensorflow_estimator.python.estimator.export import export as export_helpers
 
 
 def save_keras_model(
-    model, saved_model_path, custom_objects=None, as_text=None):
-  """Save a `tf.keras.Model` into Tensorflow SavedModel format.
+    model, saved_model_path, custom_objects=None, as_text=None,
+    input_signature=None, serving_only=False):
+  """Saves a `tf.keras.Model` into Tensorflow SavedModel format.
 
   `save_model` generates new files/folders under the `saved_model_path` folder:
-  1) an asset folder containing the json string of the model's
-     configuration (topology).
-  2) a checkpoint containing the model weights.
-  3) a saved_model.pb file containing the model's MetaGraphs. The prediction
+  1) a checkpoint containing the model weights.
+  2) a saved_model.pb file containing the model's MetaGraphs. The prediction
      graph is always exported. The evaluaton and training graphs are exported
      if the following conditions are met:
      - Evaluation: model loss is defined.
      - Training: model is compiled with an optimizer defined under `tf.train`.
        This is because `tf.keras.optimizers.Optimizer` instances cannot be
        saved to checkpoints.
-
-  Model Requirements:
-  - Model must be a sequential model or functional model. Subclassed models can
-    not be saved via this function, unless you provide an implementation for
-    get_config() and from_config().
-  - All variables must be saveable by the model. In general, this condition is
-    met through the use of layers defined in the keras library. However,
-    there is currently a bug with variables created in Lambda layer functions
-    not being saved correctly (see
-    https://github.com/keras-team/keras/issues/9740).
+  3) Model's json configuration, if model.get_config() has been implemented.
+     This file can be used to reload the model using
+     tf.keras.models.model_from_json(). Note that if any custom objects were
+     used, they should be passed to the `custom_object` argument when loading
+     the model.
+
+  Model limitations:
+  - Sequential and functional models can always be saved.
+  - Subclassed models can only be saved when `serving_only=True`. This is due to
+    the current implementation copying the model in order to export the training
+    and evaluation graphs. Because the topology of subclassed models cannot be
+    determined, the subclassed models cannot be cloned. Subclassed models will
+    be entirely exportable in the future.
 
   Note that each mode is exported in separate graphs, so different modes do not
   share variables. To use the train graph with evaluation or prediction graphs,
@@ -94,38 +98,88 @@ def save_keras_model(
   ```
 
   Args:
-    model: A `tf.keras.Model` to be saved.
+    model: A `tf.keras.Model` to be saved. If the model is subclassed, the flag
+      `serving_only` must be set to True.
     saved_model_path: a string specifying the path to the SavedModel directory.
       The SavedModel will be saved to a timestamped folder created within this
       directory.
     custom_objects: Optional dictionary mapping string names to custom classes
       or functions (e.g. custom loss functions).
-    as_text: whether to write the `SavedModel` proto in text format.
+    as_text: whether to write the `SavedModel` proto in text format. Currently
+      unavailable in serving-only mode.
+    input_signature: A possibly nested sequence of `tf.TensorSpec` objects, used
+      to specify the expected model inputs. `input_signature`'s nested structure
+      should match the expected nested structure of the inputs to the model. If
+      this is not set, this function will attempt to infer the input shapes and
+      dtypes from the model. Note that if the model is subclassed, the tensor
+      inputs to the call function should be nested in the first argument (this
+      is a general requirement for using subclassed models with Keras functions
+      .fit(), .predict(), etc.).
+    serving_only: Export only the outputs produced from calling the model in
+      predict mode. The losses, optimizer, and other training configurations are
+      not saved. If the SavedModel will only be used for serving (rather than
+      retraining), or if the model is subclassed, this can be set to True.
 
   Returns:
     String path to the SavedModel folder, a subdirectory of `saved_model_path`.
 
   Raises:
-    NotImplementedError: If the model is a subclassed model.
-    ValueError: If a Sequential model does not have input shapes defined by the
-      user, and is not built.
+    NotImplementedError: If the model is a subclassed model, and serving_only is
+      False.
+    ValueError: If the input signature cannot be inferred from the model.
   """
+  export_dir = export_helpers.get_timestamped_export_dir(saved_model_path)
+
+  if serving_only:
+    save_lib.save(
+        model, export_dir,
+        signatures=training_utils.trace_model_call(model, input_signature))
+  else:
+    _save_v1_format(model, export_dir, custom_objects, as_text, input_signature)
+
+  try:
+    _export_model_json(model, export_dir)
+  except NotImplementedError:
+    logging.warning('Skipped saving model JSON, subclassed model does not have '
+                    'get_config() defined.')
+
+  return export_dir
+
+
+def _export_model_json(model, saved_model_path):
+  """Saves model configuration as a json string under assets folder."""
+  model_json = model.to_json()
+  model_json_filepath = os.path.join(
+      saved_model_utils.get_or_create_assets_dir(saved_model_path),
+      compat.as_text(constants.SAVED_MODEL_FILENAME_JSON))
+  file_io.write_string_to_file(model_json_filepath, model_json)
+
+
+def _export_model_variables(model, saved_model_path):
+  """Saves model weights in checkpoint format under variables folder."""
+  saved_model_utils.get_or_create_variables_dir(saved_model_path)
+  checkpoint_prefix = saved_model_utils.get_variables_path(saved_model_path)
+  model.save_weights(checkpoint_prefix, save_format='tf', overwrite=True)
+  return checkpoint_prefix
+
+
+def _save_v1_format(model, path, custom_objects, as_text, input_signature):
+  """Exports model to v1 SavedModel format."""
   if not model._is_graph_network:
     if isinstance(model, sequential.Sequential):
       # If input shape is not directly set in the model, the exported model
-      # will assume that the inputs have the same shape as the shape the model
-      # was built model with.
-      if not model.built:
+      # will infer the expected shapes of the input from the model.
+      if not model.built and input_signature is None:
         raise ValueError(
-            'Sequential model must be built before it can be exported.')
+            'Sequential model\'s input shape is unknown. Please build the '
+            'model, or use the input_signature argument to specify the '
+            'model inputs.')
     else:
       raise NotImplementedError(
-          'Exporting subclassed models is not yet supported.')
+          'Subclassed models can only be exported for serving. Please set '
+          'argument serving_only=True.')
 
-  export_dir = export_helpers.get_timestamped_export_dir(saved_model_path)
-  temp_export_dir = export_helpers.get_temp_export_dir(export_dir)
-
-  builder = saved_model_builder._SavedModelBuilder(temp_export_dir)
+  builder = saved_model_builder._SavedModelBuilder(path)
 
   # Manually save variables to export them in an object-based checkpoint. This
   # skips the `builder.add_meta_graph_and_variables()` step, which saves a
@@ -133,7 +187,7 @@ def save_keras_model(
   # TODO(b/113134168): Add fn to Builder to save with object-based saver.
   # TODO(b/113178242): This should only export the model json structure. Only
   # one save is needed once the weights can be copied from the model to clone.
-  checkpoint_path = _export_model_json_and_variables(model, temp_export_dir)
+  checkpoint_path = _export_model_variables(model, path)
 
   # Export each mode. Use ModeKeys enums defined for `Estimator` to ensure that
   # Keras models and `Estimator`s are exported with the same format.
@@ -143,10 +197,12 @@ def save_keras_model(
   export_args = {'builder': builder,
                  'model': model,
                  'custom_objects': custom_objects,
-                 'checkpoint_path': checkpoint_path}
+                 'checkpoint_path': checkpoint_path,
+                 'input_signature': input_signature}
 
   has_saved_vars = False
   if model.optimizer:
+    # TODO(kathywu): Verify this works with v2 optimizer.
     if isinstance(model.optimizer, optimizers.TFOptimizer):
       _export_mode(model_fn_lib.ModeKeys.TRAIN, has_saved_vars, **export_args)
       has_saved_vars = True
@@ -161,34 +217,20 @@ def save_keras_model(
 
   builder.save(as_text)
 
-  gfile.Rename(temp_export_dir, export_dir)
-  return export_dir
-
-
-def _export_model_json_and_variables(model, saved_model_path):
-  """Save model variables and json structure into SavedModel subdirectories."""
-  # Save model configuration as a json string under assets folder.
-  model_json = model.to_json()
-  model_json_filepath = os.path.join(
-      saved_model_utils.get_or_create_assets_dir(saved_model_path),
-      compat.as_text(constants.SAVED_MODEL_FILENAME_JSON))
-  file_io.write_string_to_file(model_json_filepath, model_json)
-
-  # Save model weights in checkpoint format under variables folder.
-  saved_model_utils.get_or_create_variables_dir(saved_model_path)
-  checkpoint_prefix = saved_model_utils.get_variables_path(saved_model_path)
-  model.save_weights(checkpoint_prefix, save_format='tf', overwrite=True)
-  return checkpoint_prefix
-
 
 def _get_var_list(model):
-  """Return list of all checkpointed saveable objects in the model."""
+  """Returns list of all checkpointed saveable objects in the model."""
   return checkpointable_utils.named_saveables(model)
 
 
+def create_placeholder(spec):
+  return K.placeholder(shape=spec.shape, dtype=spec.dtype, name=spec.name)
+
+
 def _export_mode(
-    mode, has_saved_vars, builder, model, custom_objects, checkpoint_path):
-  """Export a model, and optionally save new vars from the clone model.
+    mode, has_saved_vars, builder, model, custom_objects, checkpoint_path,
+    input_signature):
+  """Exports a model, and optionally saves new vars from the clone model.
 
   Args:
     mode: A `tf.estimator.ModeKeys` string.
@@ -199,6 +241,8 @@ def _export_mode(
     custom_objects: A dictionary mapping string names to custom classes
       or functions.
     checkpoint_path: String path to checkpoint.
+    input_signature: Nested TensorSpec containing the expected inputs. Can be
+      `None`, in which case the signature will be inferred from the model.
 
   Raises:
     ValueError: If the train/eval mode is being exported, but the model does
@@ -214,10 +258,16 @@ def _export_mode(
 
     K.set_learning_phase(mode == model_fn_lib.ModeKeys.TRAIN)
 
+    if input_signature is None:
+      input_tensors = None
+    else:
+      input_tensors = nest.map_structure(create_placeholder, input_signature)
+
     # Clone the model into blank graph. This will create placeholders for inputs
     # and targets.
     clone = models_lib.clone_and_build_model(
-        model, custom_objects=custom_objects, compile_clone=compile_clone)
+        model, input_tensors=input_tensors, custom_objects=custom_objects,
+        compile_clone=compile_clone)
 
     # Make sure that iterations variable is added to the global step collection,
     # to ensure that, when the SavedModel graph is loaded, the iterations
@@ -271,7 +321,7 @@ def _export_mode(
 
 
 def _create_signature_def_map(model, mode):
-  """Create a SignatureDef map from a Keras model."""
+  """Creates a SignatureDef map from a Keras model."""
   inputs_dict = {name: x for name, x in zip(model.input_names, model.inputs)}
   if model.optimizer:
     targets_dict = {x.name.split(':')[0]: x
@@ -309,14 +359,14 @@ def _create_signature_def_map(model, mode):
 
 
 def _assert_same_non_optimizer_objects(model, model_graph, clone, clone_graph):  # pylint: disable=unused-argument
-  """Assert model and clone contain the same checkpointable objects."""
+  """Asserts model and clone contain the same checkpointable objects."""
 
   # TODO(fchollet, kathywu): make sure this works in eager mode.
   return True
 
 
 def load_keras_model(saved_model_path):
-  """Load a keras.Model from SavedModel.
+  """Loads a keras.Model from SavedModel.
 
   load_model reinstantiates model state by:
   1) loading model topology from json (this will eventually come
diff --git a/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model_test.py b/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model_test.py
index 93d73e1b484ed810fb347b13e95022dfca3584c2..fbf8138493362d4a3c8a75e1ee1bb2fbe8096499 100644
--- a/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model_test.py
+++ b/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model_test.py
@@ -29,7 +29,9 @@ from tensorflow.python import keras
 from tensorflow.python.client import session
 from tensorflow.python.eager import context
 from tensorflow.python.estimator import model_fn as model_fn_lib
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_spec
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras.engine import training
 from tensorflow.python.keras.utils import tf_utils
@@ -215,7 +217,7 @@ class LayerWithLearningPhase(keras.engine.base_layer.Layer):
     return input_shape
 
 
-def functional_model(uses_learning_phase):
+def functional_model(uses_learning_phase=True):
   inputs = keras.layers.Input(shape=(3,))
   x = keras.layers.Dense(2)(inputs)
   x = keras.layers.Dense(3)(x)
@@ -224,7 +226,7 @@ def functional_model(uses_learning_phase):
   return keras.models.Model(inputs, x)
 
 
-def sequential_model(uses_learning_phase):
+def sequential_model(uses_learning_phase=True):
   model = keras.models.Sequential()
   model.add(keras.layers.Dense(2, input_shape=(3,)))
   model.add(keras.layers.Dense(3))
@@ -233,7 +235,7 @@ def sequential_model(uses_learning_phase):
   return model
 
 
-def sequential_model_without_input_shape(uses_learning_phase):
+def sequential_model_without_input_shape(uses_learning_phase=True):
   model = keras.models.Sequential()
   model.add(keras.layers.Dense(2))
   model.add(keras.layers.Dense(3))
@@ -242,10 +244,30 @@ def sequential_model_without_input_shape(uses_learning_phase):
   return model
 
 
+class Subclassed(keras.models.Model):
+
+  def __init__(self):
+    super(Subclassed, self).__init__()
+    self.dense1 = keras.layers.Dense(2)
+    self.dense2 = keras.layers.Dense(3)
+
+  def call(self, inputs):
+    x = self.dense1(inputs)
+    x = self.dense2(x)
+    return x
+
+
+def subclassed_model():
+  return Subclassed()
+
+
 def load_model(sess, path, mode):
   tags = model_fn_lib.EXPORT_TAG_MAP[mode]
-  sig_def_key = (signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
-                 if mode == model_fn_lib.ModeKeys.PREDICT else mode)
+  if mode == model_fn_lib.ModeKeys.PREDICT:
+    sig_def_key = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
+  else:
+    sig_def_key = mode
+
   meta_graph_def = loader_impl.load(sess, tags, path)
   inputs = {
       k: sess.graph.get_tensor_by_name(v.name)
@@ -463,13 +485,54 @@ class TestModelSavedModelExport(test.TestCase, parameterized.TestCase):
       clone.compile(loss='mse', optimizer=keras.optimizers.RMSprop(lr=0.0001))
       clone.train_on_batch(input_arr, target_arr)
 
-  def testSaveSeqModelWithoutInputShapesRaisesError(self):
-    """A Sequential model that hasn't been built should raise an error."""
+  def testSaveSequentialModelWithoutInputShapes(self):
     model = sequential_model_without_input_shape(True)
-    with self.assertRaisesRegexp(
-        ValueError, 'must be built'):
+    # A Sequential model that hasn't been built should raise an error.
+    with self.assertRaisesRegexp(ValueError, 'Please build the model'):
       keras_saved_model.save_keras_model(model, '')
 
+    saved_model_path = self._save_model_dir()
+    output_path = keras_saved_model.save_keras_model(
+        model, saved_model_path,
+        input_signature=tensor_spec.TensorSpec(shape=(10, 11, 12, 13, 14),
+                                               dtype=dtypes.float32,
+                                               name='spec_input'))
+
+    with session.Session(graph=ops.Graph()) as sess:
+      inputs, outputs, _ = load_model(sess, output_path,
+                                      model_fn_lib.ModeKeys.PREDICT)
+      self.assertEqual(5, inputs[next(iter(inputs.keys()))].shape.ndims)
+      self.assertEqual(5, outputs[next(iter(outputs.keys()))].shape.ndims)
+      self.assertEqual(3, outputs[next(iter(outputs.keys()))].shape[-1])
+
+  @test_util.run_v2_only
+  @parameterized.parameters(
+      {
+          'model_builder': sequential_model_without_input_shape,
+          'input_signature': [tensor_spec.TensorSpec(shape=[None, 3],
+                                                     dtype=dtypes.float32)]},
+      {
+          'model_builder': subclassed_model,
+          'input_signature': [tensor_spec.TensorSpec(shape=[None, 3],
+                                                     dtype=dtypes.float32)]})
+  def testServingOnly(self, model_builder, input_signature):
+    saved_model_path = self._save_model_dir()
+    input_arr = np.random.random((5, 3)).astype(np.float32)
+    model = model_builder()
+    ref_predict = model.predict(input_arr)
+
+    output_path = keras_saved_model.save_keras_model(
+        model, saved_model_path, serving_only=True,
+        input_signature=input_signature)
+
+    # Load predict graph, and test predictions
+    with session.Session(graph=ops.Graph()) as sess:
+      inputs, outputs, _ = load_model(sess, output_path,
+                                      model_fn_lib.ModeKeys.PREDICT)
+      predictions = sess.run(outputs[next(iter(outputs.keys()))],
+                             {inputs[next(iter(inputs.keys()))]: input_arr})
+      self.assertAllClose(ref_predict, predictions, atol=1e-05)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
index 922f21b98b35dfff19c8c605a25e89c5d2da8d98..d815f81f847ad79ddcc6c6ecf5c050598e185d8d 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
@@ -35,6 +35,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import rnn
 from tensorflow.python.ops import rnn_cell
 from tensorflow.python.ops import variables
 from tensorflow.python.ops import variable_scope as vs
@@ -992,5 +993,67 @@ class AttentionWrapperTest(test.TestCase):
         expected_final_alignment_history=expected_final_alignment_history,
         name='testMultiAttention')
 
+  def testCustomizedAttention(self):
+    batch_size = 2
+    max_time = 3
+    num_units = 2
+    memory = constant_op.constant([[[1., 1.], [2., 2.], [3., 3.]],
+                                   [[4., 4.], [5., 5.], [6., 6.]]])
+    memory_sequence_length = constant_op.constant([3, 2])
+    attention_mechanism = wrapper.BahdanauAttention(num_units, memory,
+                                                    memory_sequence_length)
+
+    # Sets all returned values to be all ones.
+    def _customized_attention(unused_attention_mechanism, unused_cell_output,
+                              unused_attention_state, unused_attention_layer):
+      """Customized attention.
+
+      Returns:
+        attention: `Tensor` of shape [batch_size, num_units], attention output.
+        alignments: `Tensor` of shape [batch_size, max_time], sigma value for
+          each input memory (prob. function of input keys).
+        next_attention_state: A `Tensor` representing the next state for the
+          attention.
+      """
+      attention = array_ops.ones([batch_size, num_units])
+      alignments = array_ops.ones([batch_size, max_time])
+      next_attention_state = alignments
+      return attention, alignments, next_attention_state
+
+    attention_cell = wrapper.AttentionWrapper(
+        rnn_cell.LSTMCell(2),
+        attention_mechanism,
+        attention_layer_size=None,  # don't use attention layer.
+        output_attention=False,
+        alignment_history=(),
+        attention_fn=_customized_attention,
+        name='attention')
+    self.assertEqual(num_units, attention_cell.output_size)
+
+    initial_state = attention_cell.zero_state(
+        batch_size=2, dtype=dtypes.float32)
+    source_input_emb = array_ops.ones([2, 3, 2])
+    source_input_length = constant_op.constant([3, 2])
+
+    # 'state' is a tuple of
+    # (cell_state, h, attention, alignments, alignment_history, attention_state)
+    output, state = rnn.dynamic_rnn(
+        attention_cell,
+        inputs=source_input_emb,
+        sequence_length=source_input_length,
+        initial_state=initial_state,
+        dtype=dtypes.float32)
+
+    with self.session() as sess:
+      sess.run(variables.global_variables_initializer())
+      output_value, state_value = sess.run([output, state], feed_dict={})
+      self.assertAllEqual(np.array([2, 3, 2]), output_value.shape)
+      self.assertAllClose(np.array([[1., 1.], [1., 1.]]), state_value.attention)
+      self.assertAllClose(
+          np.array([[1., 1., 1.], [1., 1., 1.]]), state_value.alignments)
+      self.assertAllClose(
+          np.array([[1., 1., 1.], [1., 1., 1.]]), state_value.attention_state)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
index 77e9f848b137911b53e1b4df5dd740fe38af55bb..60ec3efffe771a3a6d6f36ed4b51a34ef9509612 100644
--- a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
@@ -1088,7 +1088,8 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
                output_attention=True,
                initial_cell_state=None,
                name=None,
-               attention_layer=None):
+               attention_layer=None,
+               attention_fn=None):
     """Construct the `AttentionWrapper`.
 
     **NOTE** If you are using the `BeamSearchDecoder` with a cell wrapped in
@@ -1132,7 +1133,9 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
         feed the context and cell output into the attention layer to generate
         attention at each time step. If attention_mechanism is a list,
         attention_layer_size must be a list of the same length. If
-        attention_layer is set, this must be None.
+        attention_layer is set, this must be None. If attention_fn is set,
+        it must guaranteed that the outputs of attention_fn also meet the
+        above requirements.
       alignment_history: Python boolean, whether to store alignment history
         from all time steps in the final output state (currently stored as a
         time major `TensorArray` on which you must call `stack()`).
@@ -1158,6 +1161,12 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
         the context as attention at each time step. If attention_mechanism is a
         list, attention_layer must be a list of the same length. If
         attention_layers_size is set, this must be None.
+      attention_fn: An optional callable function that allows users to provide
+        their own customized attention function, which takes input
+        (attention_mechanism, cell_output, attention_state, attention_layer) and
+        outputs (attention, alignments, next_attention_state). If provided,
+        the attention_layer_size should be the size of the outputs of
+        attention_fn.
 
     Raises:
       TypeError: `attention_layer_size` is not None and (`attention_mechanism`
@@ -1240,6 +1249,10 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
           tensor_shape.dimension_value(attention_mechanism.values.shape[-1])
           for attention_mechanism in attention_mechanisms)
 
+    if attention_fn is None:
+      attention_fn = _compute_attention
+    self._attention_fn = attention_fn
+
     self._cell = cell
     self._attention_mechanisms = attention_mechanisms
     self._cell_input_fn = cell_input_fn
@@ -1443,7 +1456,7 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
     all_attention_states = []
     maybe_all_histories = []
     for i, attention_mechanism in enumerate(self._attention_mechanisms):
-      attention, alignments, next_attention_state = _compute_attention(
+      attention, alignments, next_attention_state = self._attention_fn(
           attention_mechanism, cell_output, previous_attention_state[i],
           self._attention_layers[i] if self._attention_layers else None)
       alignment_history = previous_alignment_history[i].write(
diff --git a/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py b/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
index ab36848f13ab3078cd232c18f140188e12db703b..8f8f057702951094758b277ce060955f3dc6e99d 100644
--- a/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
+++ b/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
@@ -921,6 +921,7 @@ def _get_scores(log_probs, sequence_lengths, length_penalty_weight,
   """
   length_penalty_ = _length_penalty(
       sequence_lengths=sequence_lengths, penalty_factor=length_penalty_weight)
+  length_penalty_ = math_ops.cast(length_penalty_, dtype=log_probs.dtype)
   scores = log_probs / length_penalty_
 
   coverage_penalty_weight = ops.convert_to_tensor(
diff --git a/tensorflow/contrib/solvers/python/kernel_tests/lanczos_test.py b/tensorflow/contrib/solvers/python/kernel_tests/lanczos_test.py
index 8fcd7aeef6a6964902666a4f3c17e05b0c7b52ee..f31bdbd399c9de4f2f5d557b75b1ece6d64a765e 100644
--- a/tensorflow/contrib/solvers/python/kernel_tests/lanczos_test.py
+++ b/tensorflow/contrib/solvers/python/kernel_tests/lanczos_test.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python import tf2
 from tensorflow.contrib.solvers.python.ops import lanczos
 from tensorflow.contrib.solvers.python.ops import util
 from tensorflow.python.framework import constant_op
@@ -80,7 +81,8 @@ if __name__ == "__main__":
     for shape in [[4, 4], [7, 4], [5, 8]]:
       for orthogonalize in True, False:
         for steps in range(1, min(shape) + 1):
-          for use_static_shape in True, False:
+          # TF2 does not support placeholders so we skip it
+          for use_static_shape in set([True, tf2.enabled()]):
             arg_string = "%s_%s_%s_%s_staticshape_%s" % (
                 dtype.__name__, "_".join(map(str, shape)), orthogonalize, steps,
                 use_static_shape)
diff --git a/tensorflow/contrib/solvers/python/kernel_tests/least_squares_test.py b/tensorflow/contrib/solvers/python/kernel_tests/least_squares_test.py
index 2a9100903aae5689919a6b25fcb18ff192f250b3..841a41a2339824ab8ca15f4bdd74be697cd6fe9f 100644
--- a/tensorflow/contrib/solvers/python/kernel_tests/least_squares_test.py
+++ b/tensorflow/contrib/solvers/python/kernel_tests/least_squares_test.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python import tf2
 from tensorflow.contrib.solvers.python.ops import least_squares
 from tensorflow.contrib.solvers.python.ops import util
 from tensorflow.python.framework import constant_op
@@ -76,7 +77,8 @@ def _get_least_squares_tests(dtype_, use_static_shape_, shape_):
 if __name__ == "__main__":
   for dtype in np.float32, np.float64:
     for shape in [[4, 4], [8, 5], [3, 7]]:
-      for use_static_shape in True, False:
+      # TF2 does not support placeholders under eager so we skip it
+      for use_static_shape in set([True, tf2.enabled()]):
         arg_string = "%s_%s_staticshape_%s" % (dtype.__name__,
                                                "_".join(map(str, shape)),
                                                use_static_shape)
diff --git a/tensorflow/contrib/solvers/python/kernel_tests/linear_equations_test.py b/tensorflow/contrib/solvers/python/kernel_tests/linear_equations_test.py
index a0e6eb87bc06fb1303a7eb86fa6760458f20a9b9..10807f7a80617e56abeb6d13ce419a49a2269aac 100644
--- a/tensorflow/contrib/solvers/python/kernel_tests/linear_equations_test.py
+++ b/tensorflow/contrib/solvers/python/kernel_tests/linear_equations_test.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python import tf2
 from tensorflow.contrib.solvers.python.ops import linear_equations
 from tensorflow.contrib.solvers.python.ops import util
 from tensorflow.python.framework import constant_op
@@ -113,7 +114,8 @@ def _get_linear_equations_tests(dtype_, use_static_shape_, shape_):
 if __name__ == "__main__":
   for dtype in np.float32, np.float64:
     for size in 1, 4, 10:
-      for use_static_shape in True, False:
+      # TF2 does not support placeholders under eager so we skip it
+      for use_static_shape in set([True, tf2.enabled()]):
         shape = [size, size]
         arg_string = "%s_%s_staticshape_%s" % (dtype.__name__, size,
                                                use_static_shape)
diff --git a/tensorflow/contrib/tensorrt/README.md b/tensorflow/contrib/tensorrt/README.md
index caf8b6db0dc0a220d593f9c0afc9464ca51a1e05..a9c2ad78a3db409e6e8669c48c4df37c8db19c4b 100644
--- a/tensorflow/contrib/tensorrt/README.md
+++ b/tensorflow/contrib/tensorrt/README.md
@@ -1,8 +1,46 @@
-# Using TensorRT in TensorFlow
+# Using TensorRT in TensorFlow (TF-TRT)
 
-This module provides necessary bindings and introduces TRT_engine_op operator
-that wraps a subgraph in TensorRT. This is still a work in progress but should
-be useable with most common graphs.
+This module provides necessary bindings and introduces `TRTEngineOp` operator
+that wraps a subgraph in TensorRT. This module is under active development.
+
+## Installing TF-TRT
+
+Currently TensorFlow nightly builds include TF-TRT by default, which means you
+don't need to install TF-TRT separately. You can pull the latest TF containers
+from docker hub or install the latest TF pip package to get access to the latest
+TF-TRT.
+
+If you want to use TF-TRT on NVIDIA Jetson platform, you can find the download
+links for the relevant TensorFlow pip packages here:
+https://docs.nvidia.com/deeplearning/dgx/index.html#installing-frameworks-for-jetson
+
+## Installing TensorRT
+
+In order to make use of TF-TRT, you will need a local installation of TensorRT.
+Installation instructions for compatibility with TensorFlow are provided on the
+[TensorFlow GPU support](https://www.tensorflow.org/install/gpu) guide.
+
+## Examples
+
+You can find example scripts for running inference on deep learning models in
+this repository: https://github.com/tensorflow/tensorrt
+
+We have used these examples to verify the accuracy and performance of TF-TRT.
+For more information see
+[Verified Models](https://docs.nvidia.com/deeplearning/dgx/integrate-tf-trt/index.html#verified-models).
+
+## Documentation
+
+[TF-TRT documentation](https://docs.nvidia.com/deeplearning/dgx/integrate-tf-trt/index.html)
+gives an overview of the supported functionalities, provides tutorials and
+verified models, explains best practices with troubleshooting guides.
+
+## Tests
+
+TF-TRT includes both Python tests and C++ unit tests. Most of Python tests are
+located in the test directory and they can be executed using `bazel test` or
+directly with the Python command. Most of the C++ unit tests are used to test
+the conversion functions that convert each TF op to a number of TensorRT layers.
 
 ## Compilation
 
@@ -18,12 +56,3 @@ bazel build --config=cuda --config=opt //tensorflow/tools/pip_package:build_pip_
 bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/
 ```
 
-After the installation of tensorflow package, TensorRT transformation will be
-available. An example use can be found in test/test_tftrt.py script
-
-## Installing TensorRT 3.0.4
-
-In order to make use of TensorRT integration, you will need a local installation
-of TensorRT 3.0.4 from the [NVIDIA Developer website](https://developer.nvidia.com/tensorrt).
-Installation instructions for compatibility with TensorFlow are provided on the
-[TensorFlow GPU support](https://www.tensorflow.org/install/gpu) guide.
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
index ecbd4ea802083cd742b496a65a13b72eb9eda9d9..746514b930c6c4c602c727a51313a8c5da271fa6 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
@@ -59,7 +59,7 @@ limitations under the License.
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
 #include "cuda/include/cuda_runtime_api.h"
-#include "tensorrt/NvInfer.h"
+#include "tensorrt/include/NvInfer.h"
 namespace tensorflow {
 namespace tensorrt {
 namespace convert {
@@ -89,49 +89,52 @@ Status TrtCandidateSelector::IsTensorRTCandidate(const tensorflow::Node* node) {
   // TODO(laigd): move this set to TrtNodeValidator where it should belong.
   // LINT.IfChange
   static const std::set<string> candidate_ops = {
-      "Identity",
-      "Snapshot",
-      "Const",
-      "Conv2D",
-      "MaxPool",
-      "BiasAdd",
-      "Relu",
-      "Sigmoid",
-      "Tanh",
+      "Abs",
       "Add",
-      "Mul",
-      "Sub",
-      "Rsqrt",
-      "Pad",
-      "Mean",
       "AvgPool",
+      "BatchMatMul",
+      "BiasAdd",
       "ConcatV2",
+      "Const",
+      "Conv2D",
       "DepthwiseConv2dNative",
-      "FusedBatchNorm",
-      "FusedBatchNormV2",
       "Div",
-      "RealDiv",
-      "Rsqrt",
-      "Reciprocal",
       "Exp",
+      "ExpandDims",
+      "FusedBatchNorm",
+      "FusedBatchNormV2",
+      "Identity",
       "Log",
-      "Sqrt",
-      "Abs",
-      "Neg",
-      "Transpose",
-      "Reshape",
       "MatMul",
-      "BatchMatMul",
-      "Softmax",
-      "Minimum",
-      "Maximum",
-      "TopKV2",
-      "Sum",
-      "Prod",
       "Max",
+      "MaxPool",
+      "Maximum",
+      "Mean",
       "Min",
+      "Minimum",
+      "Mul",
+      "Neg",
+      "Pad",
+      "Prod",
+      "RealDiv",
+      "Reciprocal",
+      "Relu",
       "Relu6",
+      "Reshape",
+      "Rsqrt",
+      "Rsqrt",
+      "Sigmoid",
+      "Snapshot",
+      "Softmax",
+      "Sqrt",
       "Square",
+      "Squeeze",
+      "StridedSlice",
+      "Sub",
+      "Sum",
+      "Tanh",
+      "TopKV2",
+      "Transpose",
   };
   bool is_supported_op_type =
       (candidate_ops.count(node->type_string()) ||
@@ -320,6 +323,13 @@ tensorflow::Status ConvertGraphDefToTensorRT(
   return Status::OK();
 }
 
+struct EdgePtrCompare {
+  bool operator()(const tensorflow::Edge* lhs,
+                  const tensorflow::Edge* rhs) const {
+    return lhs->id() < rhs->id();
+  }
+};
+
 // Function to get subsegment information structure.
 tensorflow::Status GetEngineInfo(
     const tensorflow::Graph* g,
@@ -358,8 +368,12 @@ tensorflow::Status GetEngineInfo(
     }
     const int node_id = node->id();
     subgraph_node_ids.push_back(node_id);
-    // Create input connections.
-    for (const auto edge : node->in_edges()) {
+    // Create input connections. Sort edges first to make determnistic since
+    // in_edges is a set of pointers.
+    std::vector<const tensorflow::Edge*> in_edges(node->in_edges().begin(),
+                                                  node->in_edges().end());
+    std::sort(in_edges.begin(), in_edges.end(), EdgePtrCompare());
+    for (const auto edge : in_edges) {
       auto input_node = edge->src();
       if (input_node->IsSource() || segment_nodes.count(input_node->name())) {
         continue;
@@ -407,8 +421,12 @@ tensorflow::Status GetEngineInfo(
             node_id, edge->dst_input(), /*input_edge=*/true, port);
       }
     }
-    // Create output connections.
-    for (const auto edge : node->out_edges()) {
+    // Create output connections. Sort edges first to make determnistic since
+    // out_edges is a set of pointers.
+    std::vector<const tensorflow::Edge*> out_edges(node->out_edges().begin(),
+                                                   node->out_edges().end());
+    std::sort(out_edges.begin(), out_edges.end(), EdgePtrCompare());
+    for (const auto edge : out_edges) {
       auto output_node = edge->dst();
       if (output_node->IsSink() || segment_nodes.count(output_node->name())) {
         continue;
@@ -585,6 +603,14 @@ tensorflow::Status CreateTRTNode(const std::vector<EngineInfo>& infos, int pos,
       }
     }
   }
+  // We don't support segments with no inputs. Fall back to native TF here to
+  // avoid crash later. Constant folding should've folded the ops that make up
+  // these segments.
+  if (inputs.empty()) {
+    return tensorflow::errors::Internal(
+        "Segment has no inputs (possible "
+        "constfold failure)");
+  }
 
   const bool calibrate_int8 =
       (info.precision_mode == INT8MODE && info.use_calibration);
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
index 938cadc7c44270cc48eba73fe33f6559193ac4b3..adf8831b960172fc29b5d631e5b0533318d4764d 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -48,7 +48,7 @@ limitations under the License.
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
-#include "tensorrt/NvInfer.h"
+#include "tensorrt/include/NvInfer.h"
 
 // Check if the types are equal. Cast to int first so that failure log message
 // would work!
@@ -120,6 +120,15 @@ inline nvinfer1::Dims TensorShapeToTrtDims(const TensorShapeType& shape,
   return trt_dims;
 }
 
+Status TensorShapeArrayToTrtDims(const std::vector<int>& shape,
+                                 nvinfer1::Dims* out,
+                                 bool ignore_first_dim = false) {
+  PartialTensorShape tensor_shape;
+  TF_RETURN_IF_ERROR(TensorShapeUtils::MakeShape(shape, &tensor_shape));
+  *out = TensorShapeToTrtDims(tensor_shape, ignore_first_dim);
+  return tensorflow::Status::OK();
+}
+
 void GetOutputProperties(const grappler::GraphProperties& graph_properties,
                          const Node* node, const int out_port,
                          PartialTensorShape* shape,
@@ -623,6 +632,11 @@ bool TFAttrs::get<bool>(const string& key) const {
   return this->at(key)->b();
 }
 
+template <>
+int TFAttrs::get<int>(const string& key) const {
+  return this->at(key)->i();
+}
+
 // TODO(jie): reorder4 & reorder2 should be merged?
 // TODO(aaroey): fix the order of parameters.
 template <typename T>
@@ -1524,6 +1538,24 @@ enum class ConvolutionType { DEFAULT, DEPTHWISE_CONV };
 tensorflow::Status ConvertConv2DHelper(OpConverterParams* params, int group) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
+  if (inputs.at(0).is_weights()) {
+    return tensorflow::errors::Unimplemented(
+        node_def.op(), " is only implemented for tensors, not weights, at ",
+        node_def.name());
+  }
+  if (inputs.at(1).is_tensor()) {
+    return tensorflow::errors::Unimplemented("Kernel for ", node_def.op(),
+                                             " must be constant weights, at ",
+                                             node_def.name());
+  }
+  TRT_ShapedWeights weights_rsck = inputs.at(1).weights();
+  VLOG(2) << "weight shape: " << weights_rsck.DebugString();
+  if (weights_rsck.shape_.nbDims != 4) {
+    return tensorflow::errors::Internal(
+        "Conv2D expects kernel of dimension 4, at: " + node_def.name());
+  }
+  if (params->validation_only) return tensorflow::Status::OK();
+
   const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
   TFAttrs attrs(node_def);
 
@@ -1545,12 +1577,6 @@ tensorflow::Status ConvertConv2DHelper(OpConverterParams* params, int group) {
   if (num_groups == 0) num_groups = tensor_dim.d[0];  // depthwise convolution
   VLOG(2) << "groups count: " << num_groups;
 
-  TRT_ShapedWeights weights_rsck = inputs.at(1).weights();
-  VLOG(2) << "weight shape: " << weights_rsck.DebugString();
-  if (weights_rsck.shape_.nbDims != 4) {
-    return tensorflow::errors::Internal(
-        "Conv2D expects kernel of dimension 4, at: " + node_def.name());
-  }
   if (params->converter->precision_mode() == FP16MODE) {
     weights_rsck =
         ConvertFP32ToFP16(params->weight_store, inputs.at(1).weights());
@@ -1637,7 +1663,7 @@ tensorflow::Status ConvertConv2DHelper(OpConverterParams* params,
     case ConvolutionType::DEPTHWISE_CONV:
       return ConvertConv2DHelper(params, 0);
   }
-  return tensorflow::errors::Unimplemented("unsupported convolution type at, " +
+  return tensorflow::errors::Unimplemented("Unsupported convolution type, at ",
                                            params->node_def.name());
 }
 
@@ -1880,6 +1906,372 @@ tensorflow::Status ConvertReshape(OpConverterParams* params) {
   return tensorflow::Status::OK();
 }
 
+tensorflow::Status ConvertExpandDims(OpConverterParams* params) {
+  const auto& inputs = params->inputs;
+  const auto& node_def = params->node_def;
+  if (inputs.size() != 2) {
+    return tensorflow::errors::InvalidArgument(
+        "Two inputs expected for ExpandDims, at ", node_def.name());
+  }
+  if (inputs.at(0).is_weights()) {
+    return tensorflow::errors::Unimplemented(
+        "ExpandDims expects tensor for input, at ", node_def.name());
+  }
+  if (!inputs.at(1).is_weights()) {
+    return tensorflow::errors::InvalidArgument(
+        "ExpandDims expects weights for axis, at ", node_def.name());
+  }
+  // Get input shape as vector.
+  TRT_TensorOrWeights input_tensor = inputs.at(0);
+  const nvinfer1::Dims dims = input_tensor.GetTrtDims();
+  std::vector<int> input_dims(dims.d, dims.d + dims.nbDims);
+  // Add batch dim back.
+  input_dims.insert(input_dims.begin(), -1);
+  const int input_rank = input_dims.size();
+  // Get axis to expand on.
+  TRT_ShapedWeights weights = inputs.at(1).weights();
+  if (weights.count() != 1) {
+    return tensorflow::errors::InvalidArgument(
+        "ExpandDims axis must be a scalar, at ", node_def.name());
+  }
+  const int* weights_ptr =
+      static_cast<int*>(const_cast<void*>(weights.GetValues()));
+  int axis = weights_ptr[0];
+  // Make sure axis is valid.
+  if ((axis < (-input_rank - 1)) || (axis > input_rank)) {
+    return tensorflow::errors::InvalidArgument(
+        "Axis for ExpandDims is invalid, must be in the range "
+        "[-rank(input) - 1, rank(input)], at ",
+        node_def.name());
+  }
+  // Convert negative axis to corresponding positive axis.
+  if (axis < 0) axis += input_rank + 1;
+  if (axis == 0) {
+    return tensorflow::errors::Unimplemented(
+        "Modifying batch dimension is not supported for ExpandDims, at ",
+        node_def.name());
+  }
+  if (params->validation_only) return Status::OK();
+
+  // ExpandDims: Insert new dim of size 1.
+  input_dims.insert(input_dims.begin() + axis, 1);
+  // Reshape tensor.
+  nvinfer1::Dims new_dims;
+  TF_RETURN_IF_ERROR(TensorShapeArrayToTrtDims(input_dims, &new_dims,
+                                               /*ignore_first_dim=*/true));
+  const nvinfer1::ITensor* output_tensor = nullptr;
+  TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape(
+      input_tensor, new_dims, &output_tensor));
+  params->outputs->push_back(
+      TRT_TensorOrWeights(const_cast<nvinfer1::ITensor*>(output_tensor)));
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status ConvertSqueeze(OpConverterParams* params) {
+  const auto& inputs = params->inputs;
+  const auto& node_def = params->node_def;
+  if (inputs.size() != 1) {
+    return tensorflow::errors::InvalidArgument(
+        "One input expected for Squeeze, at ", node_def.name());
+  }
+  if (inputs.at(0).is_weights()) {
+    return tensorflow::errors::Unimplemented(
+        "Squeeze expects tensor for input, at ", node_def.name());
+  }
+  // Get input shape.
+  TRT_TensorOrWeights input_tensor = inputs.at(0);
+  const nvinfer1::Dims dims = input_tensor.GetTrtDims();
+  std::vector<int> input_dims(dims.d, dims.d + dims.nbDims);
+  // Add batch dim back.
+  input_dims.insert(input_dims.begin(), -1);
+  const int input_rank = input_dims.size();
+  // Mark axes to remove by setting them to 0.
+  TFAttrs attrs(node_def);
+  auto squeeze_dims = attrs.get<std::vector<int>>("squeeze_dims");
+  if (squeeze_dims.size() == 0) {
+    return tensorflow::errors::Unimplemented(
+        "Squeeze is only implemented for explicit dims, at ", node_def.name());
+  }
+  for (int axis : squeeze_dims) {
+    // Make sure axis is valid.
+    if ((axis < -input_rank) || (axis >= input_rank)) {
+      return tensorflow::errors::InvalidArgument(
+          "Axis for Squeeze is invalid, must be in the range "
+          "[-rank(input), rank(input)), at ",
+          node_def.name());
+    }
+    // Convert negative axis to corresponding positive axis.
+    if (axis < 0) axis += input_rank;
+    // Don't squeeze batch dim.
+    if (axis == 0) {
+      return tensorflow::errors::Unimplemented(
+          "Cannot squeeze batch dimension, at ", node_def.name());
+    }
+    // Make sure target dimension is size 1.
+    if (input_dims[axis] != 1) {
+      return tensorflow::errors::InvalidArgument(
+          "Cannot squeeze a dimension which isn't size 1, at ",
+          node_def.name());
+    }
+    // Mark dim for removal by setting to 0.
+    input_dims[axis] = 0;
+  }
+  if (params->validation_only) return Status::OK();
+
+  // Remove all dims which are equal to 0.
+  input_dims.erase(std::remove(input_dims.begin(), input_dims.end(), 0),
+                   input_dims.end());
+  // Reshape tensor.
+  nvinfer1::Dims new_dims;
+  TF_RETURN_IF_ERROR(TensorShapeArrayToTrtDims(input_dims, &new_dims,
+                                               /*ignore_first_dim=*/true));
+  const nvinfer1::ITensor* output_tensor = nullptr;
+  TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape(
+      input_tensor, new_dims, &output_tensor));
+  params->outputs->push_back(
+      TRT_TensorOrWeights(const_cast<nvinfer1::ITensor*>(output_tensor)));
+  return tensorflow::Status::OK();
+}
+
+// Gets the bounds (start or end) from the weights of a StridedSlice op.
+tensorflow::Status GetStridedSliceBound(const std::vector<int>& input_dims,
+                                        const TRT_ShapedWeights& bound_weights,
+                                        int mask, bool begin, string node_name,
+                                        std::vector<int>* output_bound) {
+  const string bound_name = (begin) ? "begin" : "end";
+  const int* weights_ptr = static_cast<int*>(bound_weights.GetValues());
+  *output_bound =
+      std::vector<int>(weights_ptr, weights_ptr + bound_weights.count());
+  if (output_bound->size() != input_dims.size()) {
+    return tensorflow::errors::InvalidArgument(
+        "StridedSlice \"", bound_name, "\" specified ",
+        std::to_string(output_bound->size()), " dimensions, but input rank is ",
+        std::to_string(input_dims.size()), ", at ", node_name);
+  }
+  for (int i = 0; i < output_bound->size(); i++) {
+    if ((1 << i) & mask) {
+      // Apply mask.
+      (*output_bound)[i] = (begin) ? 0 : input_dims[i];
+      // Masked bound will always result in a valid, non-negative bound, so we
+      // don't need the following checks. For the common case of using masks on
+      // a undefined batch dim (-1), we specifically don't want to do the
+      // following checks because they will erroneously detect an out of range
+      // bound or try to correct the negative value.
+      continue;
+    }
+    // Make sure bound is valid.
+    if (((*output_bound)[i] < -input_dims[i]) ||
+        ((*output_bound)[i] > input_dims[i])) {
+      return tensorflow::errors::InvalidArgument(
+          bound_name, " value of ", std::to_string((*output_bound)[i]),
+          " for StridedSlice is invalid, must be in the range "
+          "[-dim_size(i), dim_size(i)], at ",
+          node_name);
+    }
+    // Convert negative values to their positive equivalent.
+    if ((*output_bound)[i] < 0) {
+      (*output_bound)[i] += input_dims[i];
+    }
+  }
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status ConvertStridedSlice(OpConverterParams* params) {
+  const auto& inputs = params->inputs;
+  const auto& node_def = params->node_def;
+  if (inputs.size() != 4) {
+    return tensorflow::errors::InvalidArgument(
+        "StridedSlice expects 4 inputs, at ", node_def.name());
+  }
+  if (!inputs.at(1).is_weights() || !inputs.at(2).is_weights() ||
+      !inputs.at(3).is_weights()) {
+    return tensorflow::errors::InvalidArgument(
+        "StridedSlice expects weights for begin, end, and strides, at ",
+        node_def.name());
+  }
+  if (!inputs.at(0).is_tensor()) {
+    return tensorflow::errors::Unimplemented(
+        "StridedSlice is only implemented for tensors, at ", node_def.name());
+  }
+  // Get input dims.
+  nvinfer1::Dims dims = inputs.at(0).GetTrtDims();
+  std::vector<int> input_dims(dims.d, dims.d + dims.nbDims);
+  if (inputs.at(0).is_tensor()) {
+    // Temporarily add batch dimension so that indexes line up properly.
+    input_dims.insert(input_dims.begin(), inputs.at(0).batch_size());
+  }
+  if (input_dims.size() > 4) {
+    return tensorflow::errors::Unimplemented(
+        "StridedSlice is not implemented for tensors with rank > 4, at ",
+        node_def.name());
+  }
+  TFAttrs attrs(node_def);
+  // Get begin and end bounds per axis.
+  std::vector<int> begin, end;
+  TF_RETURN_IF_ERROR(GetStridedSliceBound(input_dims, inputs.at(1).weights(),
+                                          attrs.get<int>("begin_mask"), true,
+                                          node_def.name(), &begin));
+  TF_RETURN_IF_ERROR(GetStridedSliceBound(input_dims, inputs.at(2).weights(),
+                                          attrs.get<int>("end_mask"), false,
+                                          node_def.name(), &end));
+  // Get strides per axis (must all be 1).
+  TRT_ShapedWeights stride_weights = inputs.at(3).weights();
+  const int* stride_weights_ptr = static_cast<int*>(stride_weights.GetValues());
+  std::vector<int> strides(stride_weights_ptr,
+                           stride_weights_ptr + stride_weights.count());
+  for (int x : strides) {
+    if (x != 1) {
+      return tensorflow::errors::Unimplemented(
+          "StridedSlice is only implemented for stride of 1, at ",
+          node_def.name());
+    }
+  }
+  // Unsupported mask options.
+  for (const string& attr :
+       {"ellipsis_mask", "new_axis_mask", "shrink_axis_mask"}) {
+    int attr_val = attrs.get<int>(attr);
+    if (attr_val != 0) {
+      return tensorflow::errors::Unimplemented(
+          attr, " is not supported for StridedSlice, at ", node_def.name());
+    }
+  }
+
+  nvinfer1::ITensor* tensor =
+      const_cast<nvinfer1::ITensor*>(inputs.at(0).tensor());
+  // Reshape if necessary to 4-D, since IPaddingLayer requires a 4-D input.
+  const bool need_reshape = (input_dims.size() != 4);
+  int reshape_dims_added = 0;
+  nvinfer1::Dims reshape_dims;
+  if (need_reshape) {
+    // Add new dims after batch dim until tensor is 4D.
+    while (input_dims.size() < 4) {
+      input_dims.insert(input_dims.begin() + 1, 1);
+      begin.insert(begin.begin() + 1, 0);
+      end.insert(end.begin() + 1, 1);
+      reshape_dims_added++;
+    }
+    TF_RETURN_IF_ERROR(TensorShapeArrayToTrtDims(input_dims, &reshape_dims,
+                                                 /*ignore_first_dim=*/true));
+  }
+  // Find dimensions which need to be sliced.
+  std::vector<int> pad_dims;
+  for (int i = 0; i < input_dims.size(); i++) {
+    if ((begin[i] != 0) || (end[i] != input_dims[i])) {
+      if (i == 0) {
+        return tensorflow::errors::Unimplemented(
+            "StridedSlice can't modify batch dim, at ", node_def.name());
+      } else if ((end[i] - begin[i]) < 0) {
+        return tensorflow::errors::InvalidArgument(
+            "New size of sliced dimension is negative, at ", node_def.name());
+      }
+      pad_dims.push_back(i);
+    }
+  }
+  if (pad_dims.size() == 0) {
+    // No dimensions are changed. We could create a padding layer anyway with
+    // values of 0.
+    if (params->validation_only) return Status::OK();
+    params->outputs->push_back(inputs.at(0));
+    return tensorflow::Status::OK();
+  } else if (pad_dims.size() == 1) {
+    // Only one dim is modified but we have to have 2, mark a second dim which
+    // will have padding of 0. The dim we add is chosen to avoid an unecessary
+    // transpose.
+    if (pad_dims[0] != 2) {
+      pad_dims.push_back(2);
+    } else {
+      pad_dims.push_back(3);
+    }
+  } else if (pad_dims.size() > 2) {
+    return tensorflow::errors::Unimplemented(
+        "StridedSlice can only modify 2 dimensions, at ", node_def.name());
+  }
+  std::sort(pad_dims.begin(), pad_dims.end());
+  // Convert to pre/post padding values. Since TRT does not have a StridedSlice
+  // or Slice layer, we instead create an IPaddingLayer with negative padding.
+  nvinfer1::DimsHW pre_padding, post_padding;
+  for (int i = 0; i < pad_dims.size(); i++) {
+    const int axis = pad_dims[i];
+    pre_padding.d[i] = -begin[axis];
+    post_padding.d[i] = end[axis] - input_dims[axis];
+  }
+
+  // IPaddingLayer will always apply the padding to dims 2,3 (input format is
+  // NCHW).
+  const bool need_transpose = !(pad_dims[0] == 2 && pad_dims[1] == 3);
+  std::vector<int> transpose_order(input_dims.size());
+  std::vector<int> inv_transpose_order(input_dims.size());
+  if (need_transpose) {
+    if (pad_dims[0] == 1 && pad_dims[1] == 3) {
+      transpose_order = {0, 2, 1, 3};
+      inv_transpose_order = {0, 2, 1, 3};
+    } else if (pad_dims[0] == 1 && pad_dims[1] == 2) {
+      transpose_order = {0, 3, 1, 2};
+      inv_transpose_order = {0, 2, 3, 1};
+    }
+  }
+  if (params->validation_only) return Status::OK();
+
+  // Start conversion.
+  if (need_reshape) {
+    const nvinfer1::ITensor* output_tensor = nullptr;
+    TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape(
+        inputs.at(0), reshape_dims, &output_tensor));
+    tensor = const_cast<nvinfer1::ITensor*>(output_tensor);
+  }
+  if (need_transpose) {
+    const nvinfer1::ITensor* output_tensor = nullptr;
+    TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
+        tensor, transpose_order, &output_tensor));
+    tensor = const_cast<nvinfer1::ITensor*>(output_tensor);
+  }
+
+  // Add padding layer
+  nvinfer1::IPaddingLayer* layer = params->converter->network()->addPadding(
+      *const_cast<nvinfer1::ITensor*>(tensor), pre_padding, post_padding);
+  TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
+  params->converter->MarkQuantizationRangesAsInferrable(tensor,
+                                                        layer->getOutput(0));
+  tensor = layer->getOutput(0);
+
+  // Restore transpose
+  if (need_transpose) {
+    const nvinfer1::ITensor* output_tensor = nullptr;
+    TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
+        tensor, inv_transpose_order, &output_tensor));
+    tensor = const_cast<nvinfer1::ITensor*>(output_tensor);
+  }
+  // Restore reshape
+  if (need_reshape) {
+    // Calculate output dimensions
+    for (int i = 0; i < pad_dims.size(); i++) {
+      const int axis = pad_dims[i];
+      input_dims[axis] = end[axis] - begin[axis];
+    }
+    // Remove added 1 dimensions
+    for (int i = 0; i < reshape_dims_added; i++) {
+      int value = input_dims[1];
+      if (value != 1) {
+        return tensorflow::errors::Internal(
+            "StridedSlice error when reshaping, at ", node_def.name());
+      }
+      input_dims.erase(input_dims.begin() + 1);
+    }
+
+    nvinfer1::Dims new_dims;
+    TF_RETURN_IF_ERROR(TensorShapeArrayToTrtDims(input_dims, &new_dims,
+                                                 /*ignore_first_dim=*/true));
+    const nvinfer1::ITensor* output_tensor = nullptr;
+    TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape(
+        TRT_TensorOrWeights(tensor), new_dims, &output_tensor));
+    tensor = const_cast<nvinfer1::ITensor*>(output_tensor);
+  }
+
+  params->outputs->push_back(
+      TRT_TensorOrWeights(const_cast<nvinfer1::ITensor*>(tensor)));
+  return tensorflow::Status::OK();
+}
+
 tensorflow::Status ConvertConv2D(OpConverterParams* params) {
   return ConvertConv2DHelper(params, ConvolutionType::DEFAULT);
 }
@@ -1891,9 +2283,29 @@ tensorflow::Status ConvertConv2DDepthwise(OpConverterParams* params) {
 tensorflow::Status ConvertPool(OpConverterParams* params) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
-  const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
+  if (inputs.at(0).is_weights()) {
+    return tensorflow::errors::Unimplemented(
+        node_def.op(), " is only implemented for tensors, not weights, at ",
+        node_def.name());
+  }
+  nvinfer1::PoolingType type;
+  if (node_def.op() == "MaxPool") {
+    type = nvinfer1::PoolingType::kMAX;
+  } else if (node_def.op() == "AvgPool") {
+    type = nvinfer1::PoolingType::kAVERAGE;
+  } else {
+    return tensorflow::errors::Unimplemented(
+        "Unsupported pooling type: ", node_def.op(), ", at ", node_def.name());
+  }
   TFAttrs attrs(node_def);
+  const string padding_type = attrs.get<string>("padding");
+  if ((padding_type != "SAME") && (padding_type != "VALID")) {
+    return tensorflow::errors::Unimplemented(
+        "Unsupported padding type: ", padding_type, ", at ", node_def.name());
+  }
+  if (params->validation_only) return Status::OK();
 
+  const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
   int h_index = 2;
   int w_index = 3;
   const auto data_format = attrs.get<string>("data_format");
@@ -1904,16 +2316,6 @@ tensorflow::Status ConvertPool(OpConverterParams* params) {
         const_cast<nvinfer1::ITensor*>(tensor), {0, 3, 1, 2}, &tensor));
   }
 
-  nvinfer1::PoolingType type;
-  if (node_def.op() == "MaxPool") {
-    type = nvinfer1::PoolingType::kMAX;
-  } else if (node_def.op() == "AvgPool") {
-    type = nvinfer1::PoolingType::kAVERAGE;
-  } else {
-    return tensorflow::errors::Unimplemented("Unsupported pool type: ",
-                                             node_def.op());
-  }
-
   const auto tf_stride = attrs.get<std::vector<int>>("strides");
   const nvinfer1::DimsHW stride(tf_stride[h_index], tf_stride[w_index]);
 
@@ -1922,7 +2324,6 @@ tensorflow::Status ConvertPool(OpConverterParams* params) {
 
   auto tensor_dim = tensor->getDimensions();
   std::vector<std::pair<int, int>> padding;
-  const string padding_type = attrs.get<string>("padding");
   if (padding_type == "SAME") {
     // This is NCHW tensor with no batch dimension.
     //  1 -> h
@@ -1932,9 +2333,6 @@ tensorflow::Status ConvertPool(OpConverterParams* params) {
         {static_cast<int>(tensor_dim.d[1]), static_cast<int>(tensor_dim.d[2])});
   } else if (padding_type == "VALID") {
     padding = {{0, 0}, {0, 0}};
-  } else {
-    return tensorflow::errors::Unimplemented("Unsupported padding type: ",
-                                             padding_type);
   }
 
   if (padding[0].first != padding[0].second ||
@@ -2701,6 +3099,7 @@ tensorflow::Status ConvertPad(OpConverterParams* params) {
     return tensorflow::errors::Unimplemented(
         "Padding layer does not support padding on dimension 1 and 3 yet");
   }
+  if (params->validation_only) return Status::OK();
 
   bool legit_pad = true;
   nvinfer1::DimsHW pre_padding(0, 0);
@@ -2804,6 +3203,7 @@ tensorflow::Status ConvertConcat(OpConverterParams* params) {
 
     inputs_vec.push_back(tensor_i);
   }
+  if (params->validation_only) return tensorflow::Status::OK();
 
   // nvinfer1::ITensor const* tensor = inputs.at(0).tensor();
   nvinfer1::IConcatenationLayer* layer =
@@ -2825,12 +3225,35 @@ tensorflow::Status ConvertFusedBatchNorm(OpConverterParams* params) {
   auto data_format = attrs.get<string>("data_format");
   if (data_format != "NCHW") {
     return tensorflow::errors::Unimplemented(
-        "only data_format=NCHW is supported, at " + node_def.name());
+        node_def.op(), " only supports data_format=NCHW, at ", node_def.name());
   }
   bool is_training = attrs.get<bool>("is_training");
   if (is_training) {
+    // Trying to use batchnorm in training mode is a very common problem.
+    // Because the error message will only be printed in VLOG(1) by the
+    // segmenter, we issue a special warning so that users will actually see it.
+    LOG(WARNING) << node_def.op() << " only supports is_training=false. If you "
+                 << "are using Keras, please call "
+                 << "keras.backend.set_learning_phase(0) before constructing "
+                 << "your model. At " << node_def.name();
     return tensorflow::errors::Unimplemented(
-        "only is_training=false is supported, at " + node_def.name());
+        node_def.op(), " only supports is_training=false, at ",
+        node_def.name());
+  }
+  if (inputs.at(0).is_weights()) {
+    return tensorflow::errors::Unimplemented(
+        node_def.op(),
+        " is only implemented for tensor inputs, not weights, at ",
+        node_def.name());
+  }
+  for (int i = 1; i < 5; i++) {
+    if (inputs.at(i).is_tensor()) {
+      return tensorflow::errors::Unimplemented(
+          node_def.op(),
+          " must have constant inputs for scale, offset, mean and variance, "
+          "at ",
+          node_def.name());
+    }
   }
   nvinfer1::ITensor const* tensor = inputs.at(0).tensor();
 
@@ -2845,7 +3268,7 @@ tensorflow::Status ConvertFusedBatchNorm(OpConverterParams* params) {
   for (int i = 1; i < 5; i++) {
     if (inputs.at(i).weights().type_ != parameter_type) {
       return tensorflow::errors::Unimplemented(
-          "Inconsistent parameter type for batchnormis not supported, at: " +
+          "Inconsistent parameter type for batchnorm is not supported, at: " +
           node_def.name());
     }
   }
@@ -2865,6 +3288,8 @@ tensorflow::Status ConvertFusedBatchNorm(OpConverterParams* params) {
           "Inconsistent batchnorm parameter count, at: " + node_def.name());
     }
   }
+  if (params->validation_only) return Status::OK();
+
   //  We could technically have two weights with different shape.
   //  that requires two addScale op, arguably less performant
   TRT_ShapedWeights combined_scale_weights =
@@ -3150,12 +3575,19 @@ static void RegisterValidatableOpConverters(
     std::unordered_map<string, OpConverter>* registration) {
   // TODO(laigd): support all op types.
   (*registration)["BiasAdd"] = ConvertBiasAdd;
+  (*registration)["ConcatV2"] = ConvertConcat;
   (*registration)["Const"] = ConvertConst;
-  (*registration)["Transpose"] = ConvertTranspose;
-  (*registration)["Reshape"] = ConvertReshape;
+  (*registration)["Conv2D"] = ConvertConv2D;
+  (*registration)["DepthwiseConv2dNative"] = ConvertConv2DDepthwise;
+  (*registration)["ExpandDims"] = ConvertExpandDims;
   (*registration)["MatMul"] = ConvertMatMul;
+  (*registration)["Pad"] = ConvertPad;
   (*registration)["Relu6"] = ConvertRelu6;
+  (*registration)["Reshape"] = ConvertReshape;
   (*registration)["Square"] = ConvertSquare;
+  (*registration)["Squeeze"] = ConvertSqueeze;
+  (*registration)["StridedSlice"] = ConvertStridedSlice;
+  (*registration)["Transpose"] = ConvertTranspose;
 
   for (auto quantization_op_type :
        {"QuantizeAndDequantizeV2", "QuantizeAndDequantizeV3",
@@ -3169,6 +3601,12 @@ static void RegisterValidatableOpConverters(
   for (auto activation_op_type : {"Relu", "Sigmoid", "Tanh"}) {
     (*registration)[activation_op_type] = ConvertActivation;
   }
+  for (auto pool_op_type : {"AvgPool", "MaxPool"}) {
+    (*registration)[pool_op_type] = ConvertPool;
+  }
+  for (auto normalization_op_type : {"FusedBatchNorm", "FusedBatchNormV2"}) {
+    (*registration)[normalization_op_type] = ConvertFusedBatchNorm;
+  }
 }
 
 void TrtNodeValidator::RegisterOpValidators() {
@@ -3177,21 +3615,10 @@ void TrtNodeValidator::RegisterOpValidators() {
 
 void Converter::RegisterOpConverters() {
   RegisterValidatableOpConverters(&op_registry_);
-
-  op_registry_["Conv2D"] = ConvertConv2D;
-  op_registry_["DepthwiseConv2dNative"] = ConvertConv2DDepthwise;
-  op_registry_["MaxPool"] = ConvertPool;
-  op_registry_["AvgPool"] = ConvertPool;
   // TODO(ben,jie): this is a temp hack.
   op_registry_["Identity"] = ConvertIdentity;  // Identity should be removed
   op_registry_["Snapshot"] = ConvertIdentity;  // Snapshot should be removed
 
-  op_registry_["Pad"] = ConvertPad;
-
-  op_registry_["ConcatV2"] = ConvertConcat;
-  op_registry_["FusedBatchNorm"] = ConvertFusedBatchNorm;
-  op_registry_["FusedBatchNormV2"] = ConvertFusedBatchNorm;
-
   op_registry_["Rsqrt"] = ConvertUnary;
   op_registry_["Reciprocal"] = ConvertUnary;
   op_registry_["Exp"] = ConvertUnary;
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.h b/tensorflow/contrib/tensorrt/convert/convert_nodes.h
index daa311119282221a5eccf4f166f67b479d0d3776..54e19b73957bccdae2b23bd3556de9ad00b864e5 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.h
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.h
@@ -34,7 +34,7 @@ limitations under the License.
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
-#include "tensorrt/NvInfer.h"
+#include "tensorrt/include/NvInfer.h"
 
 namespace tensorflow {
 namespace tensorrt {
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes_test.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes_test.cc
index 4790622e83ee1f77be2754a3655e6f8881609d26..a2ddfbffa5b0d8c421bcfe054097a9e42b79fe8f 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes_test.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes_test.cc
@@ -44,7 +44,7 @@ limitations under the License.
 #if GOOGLE_TENSORRT
 #include "cuda/include/cuda.h"
 #include "cuda/include/cuda_runtime_api.h"
-#include "tensorrt/NvInfer.h"
+#include "tensorrt/include/NvInfer.h"
 
 namespace tensorflow {
 namespace tensorrt {
@@ -2113,6 +2113,512 @@ TEST_F(OpConverterTest, ConvertActivation) {
   }
 }
 
+TEST_F(OpConverterTest, ConvertExpandDims) {
+  {
+    // Input list is empty, should fail.
+    NodeDef node_def = MakeNodeDef("my_expanddims", "ExpandDims", {});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "Two inputs expected for ExpandDims, at my_expanddims");
+  }
+
+  // Get the NodeDef for ExpandDims.
+  Scope s = Scope::NewRootScope();
+  auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
+  auto weights = ops::Placeholder(s.WithOpName("weights"), DT_INT32);
+  auto expanddims =
+      ops::ExpandDims(s.WithOpName("my_expanddims"), input, weights);
+  const NodeDef& node_def = expanddims.operation.node()->def();
+  {
+    // Input is weights, should fail.
+    Reset();
+    AddTestWeights<int32>("input", {1, 2, 3}, {1, 2, 3, 4, 5, 6});
+    AddTestWeights<int32>("weights", {1}, {1});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "ExpandDims expects tensor for input, at my_expanddims");
+  }
+  {
+    // Axis is a tensor, should fail.
+    Reset();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestTensor("weights", {3});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "ExpandDims expects weights for axis, at my_expanddims");
+  }
+  {
+    // Add dim at batch dimension, should fail.
+    Reset();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<int32>("weights", {1}, {0});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "Modifying batch dimension is not supported for ExpandDims, at "
+        "my_expanddims");
+  }
+  {
+    // Add dim at batch dimension via negative axis, should fail.
+    Reset();
+    AddTestTensor("input", {1, 2, 3});
+    // Input is rank 4 (batch dim included)
+    AddTestWeights<int32>("weights", {1}, {-5});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "Modifying batch dimension is not supported for ExpandDims, at "
+        "my_expanddims");
+  }
+  {
+    // Axis > rank(input), should fail.
+    Reset();
+    AddTestTensor("input", {1, 2, 3});
+    // Input is rank 4 (batch dim included)
+    AddTestWeights<int32>("weights", {1}, {5});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "Axis for ExpandDims is invalid, must be in the range "
+        "[-rank(input) - 1, rank(input)], at my_expanddims");
+  }
+  {
+    // Axis < -rank(input)-1, should fail.
+    Reset();
+    AddTestTensor("input", {1, 2, 3});
+    // Input is rank 4 (batch dim included)
+    AddTestWeights<int32>("weights", {1}, {-6});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "Axis for ExpandDims is invalid, must be in the range "
+        "[-rank(input) - 1, rank(input)], at my_expanddims");
+  }
+
+  struct TestParams {
+    TestParams(const std::vector<int>& input_dims, int axis,
+               const std::vector<int>& expected_output_dims)
+        : input_dims(input_dims),
+          axis(axis),
+          expected_output_dims(expected_output_dims) {}
+    std::vector<int> input_dims;
+    int axis;
+    std::vector<int> expected_output_dims;
+  };
+
+  // Ok.
+  const int kExpandDimsOKCases = 8;
+  TestParams ok_params[kExpandDimsOKCases] = {
+      TestParams{{2, 3}, 1, {1, 2, 3}}, TestParams{{2, 3}, -3, {1, 2, 3}},
+      TestParams{{2, 3}, 3, {2, 3, 1}}, TestParams{{2, 3}, -1, {2, 3, 1}},
+      TestParams{{2, 3}, 2, {2, 1, 3}}, TestParams{{2, 3}, -2, {2, 1, 3}},
+      TestParams{{6}, 1, {1, 6}},       TestParams{{6}, -1, {6, 1}},
+  };
+  for (int i = 0; i < kExpandDimsOKCases; ++i) {
+    Reset();
+    AddTestTensor("input", ok_params[i].input_dims);
+    AddTestWeights<int32>("weights", {1}, {ok_params[i].axis});
+    RunValidationAndConversion(node_def);
+    TRT_TensorOrWeights output;
+    TF_EXPECT_OK(GetTensorOrWeights("my_expanddims", &output));
+    EXPECT_TRUE(output.is_tensor());
+    ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims,
+                             output.tensor()->getDimensions());
+
+    std::vector<float> output_data(6);
+    BuildAndRun<float>({{"input", {1, 2, 3, 4, 5, 6}}}, "my_expanddims",
+                       &output_data);
+    EXPECT_THAT(output_data, ElementsAre(1, 2, 3, 4, 5, 6));
+  }
+}
+
+TEST_F(OpConverterTest, ConvertSqueeze) {
+  {
+    // Input list is empty, should fail.
+    NodeDef node_def = MakeNodeDef("my_squeeze", "Squeeze", {});
+    RunValidationAndConversion(node_def, error::INVALID_ARGUMENT,
+                               "One input expected for Squeeze, at my_squeeze");
+  }
+  {
+    // No attrs, should fail.
+    Reset();
+    Scope s = Scope::NewRootScope();
+    auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
+    auto squeeze = ops::Squeeze(s.WithOpName("my_squeeze"), input);
+    const NodeDef& node_def = squeeze.operation.node()->def();
+    AddTestTensor("input", {1, 2, 3});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "Squeeze is only implemented for explicit dims, at my_squeeze");
+  }
+
+  // Get the NodeDef for Squeeze.
+  auto get_squeeze_nodedef = [](std::vector<int> axis) -> NodeDef {
+    Scope s = Scope::NewRootScope();
+    auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
+    ops::Squeeze::Attrs squeeze_attrs;
+    squeeze_attrs.axis_ = gtl::ArraySlice<int>(axis);
+    auto squeeze =
+        ops::Squeeze(s.WithOpName("my_squeeze"), input, squeeze_attrs);
+    return squeeze.operation.node()->def();
+  };
+
+  {
+    // Input is weights, should fail.
+    Reset();
+    NodeDef node_def = get_squeeze_nodedef({0});
+    AddTestWeights<float>("input", {1, 2, 3}, {1, 2, 3, 4, 5, 6});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "Squeeze expects tensor for input, at my_squeeze");
+  }
+  {
+    // Squeeze batch dim, should fail.
+    Reset();
+    NodeDef node_def = get_squeeze_nodedef({0});
+    AddTestTensor("input", {1, 2, 3});
+    RunValidationAndConversion(node_def, error::UNIMPLEMENTED,
+                               "Cannot squeeze batch dimension, at my_squeeze");
+  }
+  {
+    // Squeeze batch dim via negative axis, should fail.
+    Reset();
+    NodeDef node_def = get_squeeze_nodedef({-4});
+    AddTestTensor("input", {1, 2, 3});
+    RunValidationAndConversion(node_def, error::UNIMPLEMENTED,
+                               "Cannot squeeze batch dimension, at my_squeeze");
+  }
+  {
+    // Squeeze >= rank(input), should fail.
+    Reset();
+    NodeDef node_def = get_squeeze_nodedef({4});
+    AddTestTensor("input", {1, 2, 3});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "Axis for Squeeze is invalid, must be in the range "
+        "[-rank(input), rank(input)), at my_squeeze");
+  }
+  {
+    // Squeeze < -rank(input), should fail.
+    Reset();
+    NodeDef node_def = get_squeeze_nodedef({-5});
+    AddTestTensor("input", {1, 2, 3});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "Axis for Squeeze is invalid, must be in the range "
+        "[-rank(input), rank(input)), at my_squeeze");
+  }
+
+  struct TestParams {
+    TestParams(const std::vector<int>& input_dims, const std::vector<int>& axis,
+               const std::vector<int>& expected_output_dims)
+        : input_dims(input_dims),
+          axis(axis),
+          expected_output_dims(expected_output_dims) {}
+    std::vector<int> input_dims;
+    std::vector<int> axis;
+    std::vector<int> expected_output_dims;
+  };
+
+  // Ok.
+  const int kSqueezeOKCases = 10;
+  TestParams ok_params[kSqueezeOKCases] = {
+      TestParams{{1, 2, 3}, {1}, {2, 3}},
+      TestParams{{1, 2, 3}, {-3}, {2, 3}},
+      TestParams{{2, 3, 1}, {3}, {2, 3}},
+      TestParams{{2, 3, 1}, {-1}, {2, 3}},
+      TestParams{{1, 2, 1, 3, 1}, {1, 3, 5}, {2, 3}},
+      TestParams{{1, 2, 1, 3, 1}, {3, 1, 5}, {2, 3}},
+      TestParams{{1, 2, 1, 3, 1}, {-1, -3, -5}, {2, 3}},
+      TestParams{{1, 2, 1, 3, 1}, {1, -3, 5}, {2, 3}},
+      TestParams{{1, 6}, {1}, {6}},
+      TestParams{{6, 1}, {2}, {6}},
+  };
+  for (int i = 0; i < kSqueezeOKCases; ++i) {
+    Reset();
+    NodeDef node_def = get_squeeze_nodedef(ok_params[i].axis);
+    AddTestTensor("input", ok_params[i].input_dims);
+    RunValidationAndConversion(node_def);
+    TRT_TensorOrWeights output;
+    TF_EXPECT_OK(GetTensorOrWeights("my_squeeze", &output));
+    EXPECT_TRUE(output.is_tensor());
+    ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims,
+                             output.tensor()->getDimensions());
+
+    std::vector<float> output_data(6);
+    BuildAndRun<float>({{"input", {1, 2, 3, 4, 5, 6}}}, "my_squeeze",
+                       &output_data);
+    EXPECT_THAT(output_data, ElementsAre(1, 2, 3, 4, 5, 6));
+  }
+}
+
+TEST_F(OpConverterTest, ConvertStridedSlice) {
+  {
+    // Input list is empty, should fail.
+    NodeDef node_def = MakeNodeDef("my_strided_slice", "StridedSlice", {});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "StridedSlice expects 4 inputs, at my_strided_slice");
+  }
+
+  // Get nodedef for StridedSlice layer.
+  auto get_strided_slice_nodedef =
+      [](int begin_mask = 0, int end_mask = 0, int ellipsis_mask = 0,
+         int new_axis_mask = 0, int shrink_axis_mask = 0) -> NodeDef {
+    Scope s = Scope::NewRootScope();
+    auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
+    auto begin = ops::Placeholder(s.WithOpName("begin"), DT_INT32);
+    auto end = ops::Placeholder(s.WithOpName("end"), DT_INT32);
+    auto strides = ops::Placeholder(s.WithOpName("strides"), DT_INT32);
+    ops::StridedSlice::Attrs attrs = ops::StridedSlice::Attrs()
+                                         .BeginMask(begin_mask)
+                                         .EndMask(end_mask)
+                                         .EllipsisMask(ellipsis_mask)
+                                         .NewAxisMask(new_axis_mask)
+                                         .ShrinkAxisMask(shrink_axis_mask);
+    auto strided_slice = ops::StridedSlice(s.WithOpName("my_strided_slice"),
+                                           input, begin, end, strides, attrs);
+    return strided_slice.operation.node()->def();
+  };
+
+  {
+    NodeDef node_def = get_strided_slice_nodedef();
+    AddTestWeights<int32>("input", {1, 2, 3}, {1, 2, 3, 4, 5, 6});
+    AddTestWeights<int32>("begin", {4}, {0, 0, 0, 0});
+    AddTestWeights<int32>("end", {4}, {1, 1, 2, 3});
+    AddTestWeights<int32>("strides", {4}, {1, 1, 1, 1});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "StridedSlice is only implemented for tensors, at my_strided_slice");
+  }
+  {
+    // Begin, end, strides are tensors, should fail.
+    Reset();
+    NodeDef node_def = get_strided_slice_nodedef();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestTensor("begin", {4});
+    AddTestTensor("end", {4});
+    AddTestTensor("strides", {4});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "StridedSlice expects weights for begin, end, and strides, at "
+        "my_strided_slice");
+  }
+  {
+    // Non-zero ellipsis_mask, should fail.
+    Reset();
+    NodeDef node_def = get_strided_slice_nodedef(
+        /*begin_mask=*/0, /*end_mask=*/0, /*ellipsis_mask=*/2,
+        /*new_axis_mask=*/0, /*shrink_axis_mask=*/0);
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<int32>("begin", {4}, {0, 0, 0, 0});
+    AddTestWeights<int32>("end", {4}, {1, 1, 2, 3});
+    AddTestWeights<int32>("strides", {4}, {1, 1, 1, 1});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "ellipsis_mask is not supported for StridedSlice, at "
+        "my_strided_slice");
+  }
+  {
+    // Modify batch dim, should fail.
+    Reset();
+    NodeDef node_def = get_strided_slice_nodedef();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<int32>("begin", {4}, {0, 0, 0, 0});
+    AddTestWeights<int32>("end", {4}, {0, 1, 2, 3});
+    AddTestWeights<int32>("strides", {4}, {1, 1, 1, 1});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "StridedSlice can't modify batch dim, at my_strided_slice");
+  }
+  {
+    // Stride is not 1, should fail.
+    Reset();
+    NodeDef node_def = get_strided_slice_nodedef();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<int32>("begin", {4}, {0, 0, 0, 0});
+    AddTestWeights<int32>("end", {4}, {1, 1, 2, 3});
+    AddTestWeights<int32>("strides", {4}, {1, 2, -1, 3});
+    RunValidationAndConversion(node_def, error::UNIMPLEMENTED,
+                               "StridedSlice is only implemented for stride of "
+                               "1, at my_strided_slice");
+  }
+  {
+    // Begin out of bounds, should fail.
+    Reset();
+    NodeDef node_def = get_strided_slice_nodedef();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<int32>("begin", {4}, {1, 2, 3, 4});
+    AddTestWeights<int32>("end", {4}, {0, 1, 2, 3});
+    AddTestWeights<int32>("strides", {4}, {1, 1, 1, 1});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "begin value of 2 for StridedSlice is invalid, must be in the range "
+        "[-dim_size(i), dim_size(i)], at my_strided_slice");
+  }
+  {
+    // End out of bounds, should fail.
+    Reset();
+    NodeDef node_def = get_strided_slice_nodedef();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<int32>("begin", {4}, {0, 0, 0, 0});
+    AddTestWeights<int32>("end", {4}, {1, 2, 3, 4});
+    AddTestWeights<int32>("strides", {4}, {1, 1, 1, 1});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "end value of 2 for StridedSlice is invalid, must be in the range "
+        "[-dim_size(i), dim_size(i)], at my_strided_slice");
+  }
+  {
+    // Size of sliced dim is negative, should fail.
+    Reset();
+    NodeDef node_def = get_strided_slice_nodedef();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<int32>("begin", {4}, {0, 0, 2, 0});
+    AddTestWeights<int32>("end", {4}, {1, 1, 0, 3});
+    AddTestWeights<int32>("strides", {4}, {1, 1, 1, 1});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "New size of sliced dimension is negative, at my_strided_slice");
+  }
+
+  struct TestParams {
+    TestParams(const std::vector<int>& input_dims,
+               const std::vector<int>& expected_output_dims,
+               const std::vector<int>& begin, const std::vector<int>& end,
+               const std::vector<int>& begin_mask,
+               const std::vector<int>& end_mask,
+               const std::vector<int>& expected_output)
+        : input_dims(input_dims),
+          expected_output_dims(expected_output_dims),
+          begin(begin),
+          end(end),
+          expected_output(expected_output) {
+      // Masks are provided in terms of vectors for readability. Convert them to
+      // binary here.
+      this->begin_mask = 0;
+      for (int i = 0; i < begin_mask.size(); i++) {
+        if (begin_mask[i]) this->begin_mask |= (1 << i);
+      }
+      this->end_mask = 0;
+      for (int i = 0; i < end_mask.size(); i++) {
+        if (end_mask[i]) this->end_mask |= (1 << i);
+      }
+    }
+
+    std::vector<int> input_dims;
+    std::vector<int> expected_output_dims;
+    std::vector<int> begin;
+    std::vector<int> end;
+    int begin_mask;
+    int end_mask;
+    std::vector<int> expected_output;
+  };
+
+  // Ok.
+  const int kStridedSliceOKCases = 18;
+  TestParams ok_params[kStridedSliceOKCases] = {
+      // 2D Crop.
+      TestParams{/*input_dims=*/{1, 2, 3}, /*expected_output_dims=*/{1, 1, 2},
+                 /*begin=*/{0, 0, 0, 0}, /*end=*/{0, 0, 1, 2},
+                 /*begin_mask=*/{0, 0, 0, 0}, /*end_mask=*/{1, 1, 0, 0},
+                 /*expected_output=*/{1, 2}},
+      TestParams{/*input_dims=*/{1, 2, 3}, /*expected_output_dims=*/{1, 1, 2},
+                 /*begin=*/{0, 0, 1, 1}, /*end=*/{0, 0, 0, 0},
+                 /*begin_mask=*/{0, 0, 0, 0}, /*end_mask=*/{1, 1, 1, 1},
+                 /*expected_output=*/{5, 6}},
+      TestParams{/*input_dims=*/{1, 2, 3}, /*expected_output_dims=*/{1, 1, 2},
+                 /*begin=*/{0, 0, 1, 1}, /*end=*/{0, 1, 2, 3},
+                 /*begin_mask=*/{0, 0, 0, 0}, /*end_mask=*/{1, 1, 0, 0},
+                 /*expected_output=*/{5, 6}},
+      // 2D Crop, with transpose.
+      TestParams{/*input_dims=*/{2, 3, 1}, /*expected_output_dims=*/{1, 2, 1},
+                 /*begin=*/{0, 0, 0, 0}, /*end=*/{0, 1, 2, 1},
+                 /*begin_mask=*/{0, 0, 0, 0}, /*end_mask=*/{1, 0, 0, 0},
+                 /*expected_output=*/{1, 2}},
+      TestParams{/*input_dims=*/{2, 3, 1}, /*expected_output_dims=*/{1, 2, 1},
+                 /*begin=*/{0, 1, 1, 0}, /*end=*/{0, 2, 3, 1},
+                 /*begin_mask=*/{0, 0, 0, 0}, /*end_mask=*/{1, 0, 0, 0},
+                 /*expected_output=*/{5, 6}},
+      TestParams{/*input_dims=*/{2, 1, 3}, /*expected_output_dims=*/{1, 1, 2},
+                 /*begin=*/{0, 0, 0, 0}, /*end=*/{0, 1, 1, 2},
+                 /*begin_mask=*/{0, 0, 0, 0}, /*end_mask=*/{1, 0, 0, 0},
+                 /*expected_output=*/{1, 2}},
+      TestParams{/*input_dims=*/{2, 1, 3}, /*expected_output_dims=*/{1, 1, 2},
+                 /*begin=*/{0, 1, 0, 1}, /*end=*/{0, 2, 1, 3},
+                 /*begin_mask=*/{0, 0, 0, 0}, /*end_mask=*/{1, 0, 0, 0},
+                 /*expected_output=*/{5, 6}},
+      // 2D Crop, with reshape.
+      TestParams{/*input_dims=*/{2, 3}, /*expected_output_dims=*/{1, 2},
+                 /*begin=*/{0, 0, 0}, /*end=*/{0, 1, 2},
+                 /*begin_mask=*/{0, 0, 0}, /*end_mask=*/{1, 0, 0},
+                 /*expected_output=*/{1, 2}},
+      TestParams{/*input_dims=*/{2, 3}, /*expected_output_dims=*/{1, 2},
+                 /*begin=*/{0, 1, 1}, /*end=*/{0, 0, 0},
+                 /*begin_mask=*/{0, 0, 0}, /*end_mask=*/{1, 1, 1},
+                 /*expected_output=*/{5, 6}},
+      // 1D Crop.
+      TestParams{/*input_dims=*/{1, 2, 3}, /*expected_output_dims=*/{1, 2, 2},
+                 /*begin=*/{0, 0, 0, 0}, /*end=*/{0, 0, 0, 2},
+                 /*begin_mask=*/{0, 0, 0, 0}, /*end_mask=*/{1, 1, 1, 0},
+                 /*expected_output=*/{1, 2, 4, 5}},
+      TestParams{/*input_dims=*/{1, 2, 3}, /*expected_output_dims=*/{1, 1, 3},
+                 /*begin=*/{0, 0, 1, 0}, /*end=*/{0, 0, 0, 0},
+                 /*begin_mask=*/{0, 0, 0, 0}, /*end_mask=*/{1, 1, 1, 1},
+                 /*expected_output=*/{4, 5, 6}},
+      // 1D Crop, with transpose.
+      TestParams{/*input_dims=*/{2, 3, 1}, /*expected_output_dims=*/{1, 3, 1},
+                 /*begin=*/{0, 0, 0, 0}, /*end=*/{0, 1, 0, 0},
+                 /*begin_mask=*/{0, 0, 0, 0}, /*end_mask=*/{1, 0, 1, 1},
+                 /*expected_output=*/{1, 2, 3}},
+      TestParams{/*input_dims=*/{2, 3, 1}, /*expected_output_dims=*/{1, 3, 1},
+                 /*begin=*/{0, 1, 0, 0}, /*end=*/{0, 0, 0, 0},
+                 /*begin_mask=*/{0, 0, 0, 0}, /*end_mask=*/{1, 1, 1, 1},
+                 /*expected_output=*/{4, 5, 6}},
+      // 1D Crop, with reshape.
+      TestParams{/*input_dims=*/{6}, /*expected_output_dims=*/{3},
+                 /*begin=*/{0, 0}, /*end=*/{0, 3},
+                 /*begin_mask=*/{0, 0}, /*end_mask=*/{1, 0},
+                 /*expected_output=*/{1, 2, 3}},
+      TestParams{/*input_dims=*/{1, 6}, /*expected_output_dims=*/{1, 3},
+                 /*begin=*/{0, 0, 2}, /*end=*/{0, 0, 5},
+                 /*begin_mask=*/{0, 0, 0}, /*end_mask=*/{1, 1, 0},
+                 /*expected_output=*/{3, 4, 5}},
+      TestParams{/*input_dims=*/{6, 1}, /*expected_output_dims=*/{3, 1},
+                 /*begin=*/{0, 2, 0}, /*end=*/{0, 5, 0},
+                 /*begin_mask=*/{0, 0, 0}, /*end_mask=*/{1, 0, 1},
+                 /*expected_output=*/{3, 4, 5}},
+      // Negative axis.
+      TestParams{/*input_dims=*/{6, 1}, /*expected_output_dims=*/{3, 1},
+                 /*begin=*/{0, -6, 0}, /*end=*/{0, -3, 0},
+                 /*begin_mask=*/{0, 0, 0}, /*end_mask=*/{1, 0, 1},
+                 /*expected_output=*/{1, 2, 3}},
+      TestParams{/*input_dims=*/{6, 1}, /*expected_output_dims=*/{5, 1},
+                 /*begin=*/{0, 0, 0}, /*end=*/{0, -1, 0},
+                 /*begin_mask=*/{0, 0, 0}, /*end_mask=*/{1, 0, 1},
+                 /*expected_output=*/{1, 2, 3, 4, 5}},
+  };
+
+  for (int i = 0; i < kStridedSliceOKCases; i++) {
+    Reset();
+    NodeDef node_def = get_strided_slice_nodedef(ok_params[i].begin_mask,
+                                                 ok_params[i].end_mask);
+    AddTestTensor("input", ok_params[i].input_dims);
+    AddTestWeights<int32>("begin",
+                          {static_cast<int>(ok_params[i].begin.size())},
+                          ok_params[i].begin);
+    AddTestWeights<int32>("end", {static_cast<int>(ok_params[i].end.size())},
+                          ok_params[i].end);
+    std::vector<int> strides(ok_params[i].input_dims.size(), 1);
+    AddTestWeights<int32>("strides", {static_cast<int>(strides.size())},
+                          strides);
+    RunValidationAndConversion(node_def);
+
+    TRT_TensorOrWeights output;
+    TF_EXPECT_OK(GetTensorOrWeights("my_strided_slice", &output));
+    std::vector<float> output_data(ok_params[i].expected_output.size());
+    BuildAndRun<float>({{"input", {1, 2, 3, 4, 5, 6}}}, "my_strided_slice",
+                       &output_data);
+    EXPECT_THAT(output_data, ElementsAreArray(ok_params[i].expected_output));
+  }
+}
+
 }  // namespace convert
 }  // namespace tensorrt
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc b/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc
index c1688d4db88a270dcd202989f89a677ed10576d9..d57f2300f8e6e6ce79c538133da6bc5cf5ead2f5 100644
--- a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc
+++ b/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc
@@ -226,8 +226,9 @@ tensorflow::Status TRTOptimizationPass::Optimize(
   tensorflow::tensorrt::convert::ConversionParams cp;
 
   if (use_calibration_ && precision_mode_ != INT8MODE) {
-    LOG(ERROR) << "Calibration with FP32 or FP16 is not implemented. "
-               << "Falling back to use_calibration = False.";
+    VLOG(1) << "Calibration with FP32 or FP16 is not implemented. "
+            << "Falling back to use_calibration = False."
+            << "Note that the default value of use_calibration is True.";
     use_calibration_ = false;
   }
 
diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.h b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.h
index f658e45569bbef73faa751634b0163f5687ad164..189e9c939b9ffd4450f7ba95fe1abdbbc049b430 100644
--- a/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.h
+++ b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.h
@@ -23,7 +23,7 @@ limitations under the License.
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
-#include "tensorrt/NvInfer.h"
+#include "tensorrt/include/NvInfer.h"
 
 namespace tensorflow {
 namespace tensorrt {
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h
index b801480a30552113d0e9572d173871d71b7cacd8..b545f497f32d5a1a6960b748467ca189b7debf6c 100644
--- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h
+++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h
@@ -31,7 +31,7 @@ limitations under the License.
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
 #include "cuda/include/cuda_runtime_api.h"
-#include "tensorrt/NvInfer.h"
+#include "tensorrt/include/NvInfer.h"
 
 namespace tensorflow {
 namespace tensorrt {
diff --git a/tensorflow/contrib/tensorrt/log/trt_logger.h b/tensorflow/contrib/tensorrt/log/trt_logger.h
index 58d9d05d01960d7c5222b3a8be881afdba2f79e6..96ccacb791e40143c5c4d9d691bb353702f9a28b 100644
--- a/tensorflow/contrib/tensorrt/log/trt_logger.h
+++ b/tensorflow/contrib/tensorrt/log/trt_logger.h
@@ -20,7 +20,7 @@ limitations under the License.
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
-#include "tensorrt/NvInfer.h"
+#include "tensorrt/include/NvInfer.h"
 
 namespace tensorflow {
 namespace tensorrt {
diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin.h b/tensorflow/contrib/tensorrt/plugin/trt_plugin.h
index 167e8197a70ecab0b068777d92948a92cabe6d2b..754920b60ca7439513a91ad0354833a2482b29c1 100644
--- a/tensorflow/contrib/tensorrt/plugin/trt_plugin.h
+++ b/tensorflow/contrib/tensorrt/plugin/trt_plugin.h
@@ -24,7 +24,7 @@ limitations under the License.
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
-#include "tensorrt/NvInfer.h"
+#include "tensorrt/include/NvInfer.h"
 
 namespace tensorflow {
 namespace tensorrt {
diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h b/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h
index 51393d2092c04323932273f6655d4579269e34aa..bbae9fb65c22cf69d2e7954436fd04dd16f7f6c8 100644
--- a/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h
+++ b/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h
@@ -27,7 +27,7 @@ limitations under the License.
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
-#include "tensorrt/NvInfer.h"
+#include "tensorrt/include/NvInfer.h"
 
 namespace tensorflow {
 namespace tensorrt {
diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory_test.cc b/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory_test.cc
index 2346cb9ba03ea033363fe5336e5fbab058a8ac6c..129bdcdbc2f8d9d5215f45f381bcadf35e4fa75e 100644
--- a/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory_test.cc
+++ b/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory_test.cc
@@ -23,7 +23,7 @@ limitations under the License.
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
-#include "tensorrt/NvInfer.h"
+#include "tensorrt/include/NvInfer.h"
 
 namespace tensorflow {
 namespace tensorrt {
diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.h b/tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.h
index 5ded702c4189f9cced2aa1ed3c33f7d1ccf7efd1..274ce42fec9283c643004d45fba461879fc5f2dc 100644
--- a/tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.h
+++ b/tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.h
@@ -23,7 +23,7 @@ limitations under the License.
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
-#include "tensorrt/NvInfer.h"
+#include "tensorrt/include/NvInfer.h"
 
 namespace tensorflow {
 namespace tensorrt {
diff --git a/tensorflow/contrib/tensorrt/resources/trt_allocator.h b/tensorflow/contrib/tensorrt/resources/trt_allocator.h
index b03fe7b8b59b01d7f5e947efffee1b7a7c45b86d..f857a9de055ee7668f0bf9bc97e030354505081b 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_allocator.h
+++ b/tensorflow/contrib/tensorrt/resources/trt_allocator.h
@@ -22,7 +22,7 @@ limitations under the License.
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
-#include "tensorrt/NvInfer.h"
+#include "tensorrt/include/NvInfer.h"
 #endif  // GOOGLE_TENSORRT
 #endif  // GOOGLE_CUDA
 
diff --git a/tensorflow/contrib/tensorrt/resources/trt_allocator_test.cc b/tensorflow/contrib/tensorrt/resources/trt_allocator_test.cc
index ad6b1d7d4c57d696d3dee3b479733e152e669211..beb1284208e4c10ffe1d36ef411cf08f11dbcb78 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_allocator_test.cc
+++ b/tensorflow/contrib/tensorrt/resources/trt_allocator_test.cc
@@ -48,11 +48,14 @@ TEST(TRTAllocatorTest, Align) {
         513ul, 700ul, 12345ul, 1ul << 32}) {
     for (uint64_t alignment = 1; alignment <= space * 4; alignment *= 2) {
       for (const uintptr_t ptr_val :
-           {1ul, alignment == 1 ? 1ul : alignment - 1, alignment, alignment + 1,
-            alignment + (alignment / 2)}) {
+           {static_cast<uint64_t>(1),
+            alignment == 1 ? static_cast<uint64_t>(1) : alignment - 1,
+            alignment, alignment + 1, alignment + (alignment / 2)}) {
         if (ptr_val % alignment == 0) {
           for (const uint64_t size :
-               {1ul, space == 1 ? 1ul : space - 1, space, space + 1}) {
+               {static_cast<uint64_t>(1),
+                space == 1 ? static_cast<uint64_t>(1) : space - 1, space,
+                space + 1}) {
             EXPECT_EQ(space >= size, RunTest(alignment, size, ptr_val, space));
           }
         } else {
@@ -62,8 +65,10 @@ TEST(TRTAllocatorTest, Align) {
             EXPECT_TRUE(
                 RunTest(alignment, space - diff, ptr_val + diff, space - diff));
             for (const uint64_t size :
-                 {1ul, space - diff > 1 ? space - diff - 1 : 1ul, space - diff,
-                  space - diff + 1, space - 1}) {
+                 {static_cast<uint64_t>(1),
+                  space - diff > 1 ? space - diff - 1
+                                   : static_cast<uint64_t>(1),
+                  space - diff, space - diff + 1, space - 1}) {
               EXPECT_EQ(space - diff >= size,
                         RunTest(alignment, size, ptr_val, space));
             }
diff --git a/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h b/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h
index e8f08ad9f48453da4ef51d94fcbc3d98f6e04b3b..65466c9741989fda5f82fc27d813d026f35fe386 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h
+++ b/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h
@@ -26,7 +26,7 @@ limitations under the License.
 #if GOOGLE_TENSORRT
 
 #include "cuda/include/cuda_runtime_api.h"
-#include "tensorrt/NvInfer.h"
+#include "tensorrt/include/NvInfer.h"
 
 namespace tensorflow {
 namespace tensorrt {
diff --git a/tensorflow/contrib/tensorrt/resources/trt_resources.h b/tensorflow/contrib/tensorrt/resources/trt_resources.h
index 0be5d44f7a36276bdaabe9a63337844d4011cf32..aac9e5c7bd725fc10bcaa04536ebc7be071b4d4c 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_resources.h
+++ b/tensorflow/contrib/tensorrt/resources/trt_resources.h
@@ -31,7 +31,7 @@ limitations under the License.
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
 
-#include "tensorrt/NvInfer.h"
+#include "tensorrt/include/NvInfer.h"
 
 namespace tensorflow {
 namespace tensorrt {
diff --git a/tensorflow/contrib/tensorrt/segment/segment.cc b/tensorflow/contrib/tensorrt/segment/segment.cc
index 6abc5226ccf96e472df77269bee6186726e5768d..084a96e0fa5c97edc58adf2590ed94e5ef0e4d85 100644
--- a/tensorflow/contrib/tensorrt/segment/segment.cc
+++ b/tensorflow/contrib/tensorrt/segment/segment.cc
@@ -225,6 +225,24 @@ SimpleGraph::~SimpleGraph() {
   for (auto x : edges_) delete x;
 }
 
+// Define comparison functions for std::set with pointer keys so that behavior
+// is deterministic. When using std::set with pointer key types, the items are
+// sorted by pointer address which is non-deterministic. This can cause issues
+// for INT8 mode because the graph is converted twice and non-determinism may
+// cause a mismatch between the calibration tables of the conversions.
+struct SimpleEdgePtrCompare {
+  bool operator()(const SimpleEdge* lhs, const SimpleEdge* rhs) const {
+    return lhs->id() < rhs->id();
+  }
+};
+
+struct NodePtrCompare {
+  bool operator()(const tensorflow::Node* lhs,
+                  const tensorflow::Node* rhs) const {
+    return lhs->name() < rhs->name();
+  }
+};
+
 namespace {
 
 // Copied from TF ReverseDFS, which only works for tensorflow::Graph.
@@ -476,7 +494,7 @@ tensorflow::Status SegmentGraph(
     // nodes. Iterate since combining two nodes may unblock other
     // combining.
     while (true) {
-      std::set<const SimpleEdge*> contract_edges;
+      std::set<const SimpleEdge*, SimpleEdgePtrCompare> contract_edges;
       for (const SimpleEdge* out_edge : node->out_edges()) {
         VLOG(3) << "... out node " << out_edge->dst()->name() << " ( "
                 << out_edge->dst()->id() << " <- " << node->id() << " )";
@@ -530,7 +548,7 @@ tensorflow::Status SegmentGraph(
 
   // A map from the segment identifier (currently the name of the root node of
   // the segment tree) to the segment nodes set.
-  std::map<string, std::set<const tensorflow::Node*>> sg_map;
+  std::map<string, std::set<const tensorflow::Node*, NodePtrCompare>> sg_map;
 
   // A map from the segment identifier (currently the name of the root node of
   // the segment tree) to the device names that the nodes in the segment are
@@ -566,7 +584,8 @@ tensorflow::Status SegmentGraph(
   // --------------------------------- Step 2 ---------------------------------
   // Remove ineligible input/output nodes.
   for (auto& itr : sg_map) {
-    std::set<const tensorflow::Node*>& segment_nodes = itr.second;
+    std::set<const tensorflow::Node*, NodePtrCompare>& segment_nodes =
+        itr.second;
     VLOG(1) << "Segment original size: " << segment_nodes.size();
     while (true) {
       std::deque<const tensorflow::Node*> in_nodes_que, out_nodes_que;
@@ -618,8 +637,9 @@ tensorflow::Status SegmentGraph(
                               bool is_input_nodes,
                               std::deque<const tensorflow::Node*>* que) {
         // Run a BFS on the queue to find all the input/output nodes.
-        std::set<const tensorflow::Node*> visited;
-        std::set<const tensorflow::Node*> logged(que->begin(), que->end());
+        std::set<const tensorflow::Node*, NodePtrCompare> visited;
+        std::set<const tensorflow::Node*, NodePtrCompare> logged(que->begin(),
+                                                                 que->end());
         while (!que->empty()) {
           auto node = que->front();
           que->pop_front();
@@ -653,7 +673,8 @@ tensorflow::Status SegmentGraph(
   // --------------------------------- Step 3 ---------------------------------
   // Convert the segments into the expected return format
   for (const auto& itr : sg_map) {
-    const std::set<const tensorflow::Node*>& segment_nodes = itr.second;
+    const std::set<const tensorflow::Node*, NodePtrCompare>& segment_nodes =
+        itr.second;
     if (VLOG_IS_ON(1)) {
       string s = "parent=" + itr.first + ":";
       for (auto node : segment_nodes) s += " " + node->name();
diff --git a/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc b/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc
index ad9703325f5b0e41e7cea28eafdf91e5a1681245..f30dba59ad55317d7ad7730e4dc66c9aba4e6a6b 100644
--- a/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc
+++ b/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc
@@ -23,7 +23,7 @@ limitations under the License.
 #if GOOGLE_TENSORRT
 #include "tensorflow/contrib/tensorrt/log/trt_logger.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorrt/NvInfer.h"
+#include "tensorrt/include/NvInfer.h"
 
 namespace tensorflow {
 namespace shape_inference {
diff --git a/tensorflow/contrib/tensorrt/tensorrt_test.cc b/tensorflow/contrib/tensorrt/tensorrt_test.cc
index 102a1d38919fde9cdb0890a0073c916a68b85601..769982c6456f76663e50fe3ec59651127e3720ac 100644
--- a/tensorflow/contrib/tensorrt/tensorrt_test.cc
+++ b/tensorflow/contrib/tensorrt/tensorrt_test.cc
@@ -22,7 +22,7 @@ limitations under the License.
 #if GOOGLE_TENSORRT
 #include "cuda/include/cuda.h"
 #include "cuda/include/cuda_runtime_api.h"
-#include "tensorrt/NvInfer.h"
+#include "tensorrt/include/NvInfer.h"
 
 namespace tensorflow {
 namespace {
diff --git a/tensorflow/contrib/tensorrt/test/rank_two_test.py b/tensorflow/contrib/tensorrt/test/rank_two_test.py
index 0cd733dca13462ac8f4478544005ae4000f711f1..563232fc12675d9e1b32b7ab461591af57beadb9 100644
--- a/tensorflow/contrib/tensorrt/test/rank_two_test.py
+++ b/tensorflow/contrib/tensorrt/test/rank_two_test.py
@@ -51,8 +51,10 @@ class RankTwoTest(trt_test.TfTrtIntegrationTestBase):
         c = constant_op.constant(3.0, name="c%d_3" % i)
         q = math_ops.add(q, c, name="add%d_3" % i)
         if i == 0:
+          axis = constant_op.constant(-1, dtype=dtypes.int32, name="axis")
           for j in range(2):
-            q = array_ops.expand_dims(q, -1, name="expand%d_%d" % (i, j))
+            q = array_ops.expand_dims(q, axis, name="expand%d_%d" % (i, j))
+          q = self.trt_incompatible_op(q)
         q = gen_math_ops.reciprocal(q, name="reciprocal%d" % i)
         outputs.append(q)
       # Combine both paths
@@ -70,7 +72,7 @@ class RankTwoTest(trt_test.TfTrtIntegrationTestBase):
     return {
         "TRTEngineOp_0": [
             "add0_1", "add0_2", "add0_3", "c0_1", "c0_2", "c0_3", "abs0_1",
-            "abs0_2"
+            "abs0_2", "expand0_0", "expand0_1", "axis"
         ],
         "TRTEngineOp_1": [
             "add", "add1_1", "add1_2", "add1_3", "c1_1", "c1_2", "c1_3",
diff --git a/tensorflow/contrib/tensorrt/test/unary_test.py b/tensorflow/contrib/tensorrt/test/unary_test.py
index 9fc50e05952abd335e196dce8fc8a81056d7007d..b6e5e32db1236684a06c2d44298b9a3d39667152 100644
--- a/tensorflow/contrib/tensorrt/test/unary_test.py
+++ b/tensorflow/contrib/tensorrt/test/unary_test.py
@@ -106,10 +106,7 @@ class UnaryTest(trt_test.TfTrtIntegrationTestBase):
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
-    return [
-        "TRTEngineOp_0", "TRTEngineOp_1", "TRTEngineOp_2", "TRTEngineOp_3",
-        "TRTEngineOp_4"
-    ]
+    return ["TRTEngineOp_0"]
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/tfprof/README.md b/tensorflow/contrib/tfprof/README.md
index b29d1acacf17b57549558be45c853566817c1729..f40e76f554e8815aac96344d8cb0b911bafdd712 100644
--- a/tensorflow/contrib/tfprof/README.md
+++ b/tensorflow/contrib/tfprof/README.md
@@ -1,7 +1,5 @@
 # tfprof: TensorFlow Profiler and Beyond
 
-<h1>Please use `tf.profiler.xxx` instead of `tf.contrib.tfprof.xxx`</h1>
-
 <h1>Full Document in
 <a href="https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/profiler/README.md">tensorflow/core/profiler/README.md</a><h1>
 
diff --git a/tensorflow/contrib/timeseries/python/timeseries/BUILD b/tensorflow/contrib/timeseries/python/timeseries/BUILD
index ae7db35b47b326272dd2c7bc76e18047cec59865..4b90b596b28efec83aa349782c4874d79b6817c7 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/BUILD
+++ b/tensorflow/contrib/timeseries/python/timeseries/BUILD
@@ -104,6 +104,7 @@ py_test(
     srcs = [
         "estimators_test.py",
     ],
+    shard_count = 3,
     srcs_version = "PY2AND3",
     tags = [
         "no_mac",
diff --git a/tensorflow/contrib/tpu/BUILD b/tensorflow/contrib/tpu/BUILD
index 05d2ebd2e8a3292a95df0e2f976df0e2871063f8..007aeaec15d6db7ea4581ab9825da2dbe8b37163 100644
--- a/tensorflow/contrib/tpu/BUILD
+++ b/tensorflow/contrib/tpu/BUILD
@@ -79,6 +79,7 @@ py_library(
         "//tensorflow/python:init_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform",
+        "//tensorflow/python:session",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:summary",
         "//tensorflow/python:summary_ops_v2",
@@ -101,6 +102,7 @@ tf_gen_op_libs(
         "replication_ops",
         "tpu_configuration_ops",
         "tpu_embedding_ops",
+        "tpu_ordinal_selector_op",
     ],
     deps = [
         "//tensorflow/contrib/tpu/proto:tpu_embedding_configuration_proto_cc",
@@ -152,6 +154,13 @@ tf_gen_op_wrapper_py(
     ],
 )
 
+tf_gen_op_wrapper_py(
+    name = "tpu_ordinal_selector_op",
+    deps = [
+        ":tpu_ordinal_selector_op_op_lib",
+    ],
+)
+
 py_library(
     name = "profiler",
     srcs = ["python/profiler/__init__.py"],
diff --git a/tensorflow/contrib/tpu/ops/tpu_ordinal_selector_op.cc b/tensorflow/contrib/tpu/ops/tpu_ordinal_selector_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..54e6b20f7f388b67a96ac8acfe814a4202b56a18
--- /dev/null
+++ b/tensorflow/contrib/tpu/ops/tpu_ordinal_selector_op.cc
@@ -0,0 +1,39 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+REGISTER_OP("TPUOrdinalSelector")
+    .Output("device_ordinals: int32")
+    .SetIsStateful()
+    .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
+      c->set_output(0,
+                    c->Vector(shape_inference::InferenceContext::kUnknownDim));
+      return Status::OK();
+    })
+    .Doc(R"doc(
+A TPU core selector Op.
+
+This Op produces a set of TPU cores (for warm-up) or a single TPU core
+(for regular inference) to execute the TPU program on. The output is
+consumed by TPUPartitionedCall.
+
+device_ordinals: A vector 1 or more TPU cores.
+)doc");
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py b/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py
index 63641e00c5dbf4b4e635ecfea8bef98c7d0b7075..a081c4354a779d37140338793e66844c3fcf7a12 100644
--- a/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py
+++ b/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py
@@ -90,12 +90,12 @@ def main(unused_argv=None):
   tf_version = tf.__version__
   print('TensorFlow version %s detected' % tf_version)
 
-  if FLAGS.service_addr is None and FLAGS.tpu is None:
+  if not FLAGS.service_addr and not FLAGS.tpu:
     sys.exit('You must specify either --service_addr or --tpu.')
 
   tpu_cluster_resolver = None
-  if FLAGS.service_addr is not None:
-    if FLAGS.tpu is not None:
+  if FLAGS.service_addr:
+    if FLAGS.tpu:
       tf.logging.warn('Both --service_addr and --tpu are set. Ignoring '
                       '--tpu and using --service_addr.')
     service_addr = FLAGS.service_addr
diff --git a/tensorflow/contrib/tpu/python/tpu/datasets.py b/tensorflow/contrib/tpu/python/tpu/datasets.py
index d61c824eab5337a7cd08cfa52a7e8f8b8d73b455..8d6245390fc3fa005c92d01bc9b64ddb47583582 100644
--- a/tensorflow/contrib/tpu/python/tpu/datasets.py
+++ b/tensorflow/contrib/tpu/python/tpu/datasets.py
@@ -156,7 +156,7 @@ def StreamingFilesDataset(files,
 
     source_dataset = source_dataset.prefetch(1)
 
-    source_iterator = source_dataset.make_one_shot_iterator()
+    source_iterator = dataset_ops.make_one_shot_iterator(source_dataset)
     source_handle = source_iterator.string_handle()
 
   @function.Defun(dtypes.string)
diff --git a/tensorflow/contrib/tpu/python/tpu/datasets_test.py b/tensorflow/contrib/tpu/python/tpu/datasets_test.py
index b58d05eac56f3586e183333f7c1a3867ee57456c..52d87b800401c3e584da9843916cfc7a767c082a 100644
--- a/tensorflow/contrib/tpu/python/tpu/datasets_test.py
+++ b/tensorflow/contrib/tpu/python/tpu/datasets_test.py
@@ -70,7 +70,7 @@ class DatasetsTest(test.TestCase):
     dataset = datasets.StreamingFilesDataset(
         os.path.join(self.get_temp_dir(), 'text_line.*.txt'), filetype='text')
 
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     self._sess.run(iterator.initializer)
     get_next = iterator.get_next()
 
@@ -94,7 +94,7 @@ class DatasetsTest(test.TestCase):
     dataset = datasets.StreamingFilesDataset(
         os.path.join(self.get_temp_dir(), 'tf_record*'), filetype='tfrecord')
 
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     self._sess.run(iterator.initializer)
     get_next = iterator.get_next()
 
@@ -121,7 +121,7 @@ class DatasetsTest(test.TestCase):
 
     dataset = datasets.StreamingFilesDataset(filenames, filetype='tfrecord')
 
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     self._sess.run(iterator.initializer)
     get_next = iterator.get_next()
 
@@ -154,7 +154,7 @@ class DatasetsTest(test.TestCase):
         os.path.join(self.get_temp_dir(), 'fixed_length*'),
         filetype=FixedLengthFile)
 
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     self._sess.run(iterator.initializer)
     get_next = iterator.get_next()
 
@@ -177,7 +177,7 @@ class DatasetsTest(test.TestCase):
     dataset = datasets.StreamingFilesDataset(
         dataset_ops.Dataset.range(10), filetype=gen_dataset)
 
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     self._sess.run(iterator.initializer)
     get_next = iterator.get_next()
 
diff --git a/tensorflow/contrib/tpu/python/tpu/keras_support.py b/tensorflow/contrib/tpu/python/tpu/keras_support.py
index cf3b2e68e940652220983c98e3a0acb68cf88d89..cf9672f8d867f4ad5cb0281abe710f6e3bcdf1f2 100644
--- a/tensorflow/contrib/tpu/python/tpu/keras_support.py
+++ b/tensorflow/contrib/tpu/python/tpu/keras_support.py
@@ -133,7 +133,7 @@ def _tpu_session_context():
 An error occurred connecting or initializing your TPU.
 
 The session has been reset. re-run keras_to_tpu_model to create a new session.
-""" + e)
+""" + str(e))
 
 
 def setup_tpu_session(cluster_resolver):
@@ -729,7 +729,7 @@ class TPUDatasetInfeedManager(TPUInfeedManager):
     dummy_x_shape[0] *= tpu_assignment.num_towers
     dummy_y_shape = dataset.output_shapes[1].as_list()
     dummy_y_shape[0] *= tpu_assignment.num_towers
-    self._iterator = dataset.make_initializable_iterator()
+    self._iterator = dataset_ops.make_initializable_iterator(dataset)
     K.get_session().run(self._iterator.initializer)
 
     self._get_next_ops = []
@@ -1676,14 +1676,10 @@ class KerasTPUModel(models.Model):
         callbacks,
         self,
         do_validation=do_validation,
-        val_inputs=val_inputs,
-        val_targets=val_targets,
-        val_sample_weights=val_sample_weights,
         batch_size=batch_size,
         epochs=epochs,
         steps_per_epoch=steps_per_epoch,
         samples=num_training_samples,
-        validation_steps=validation_steps,
         verbose=verbose,
         count_mode=count_mode)
 
@@ -2073,6 +2069,8 @@ class KerasTPUModel(models.Model):
       # tpu_model may not be compiled, e.g., loading weights and then predict.
       return
     for k, v in six.iteritems(cpu_optimizer_config):
+      if k == 'name':
+        continue
       opt_var = getattr(self._tpu_model.optimizer, k)
       if isinstance(opt_var, variables.Variable):
         logging.info('CPU -> TPU %s: %s {%s}', k, v, K.get_value(opt_var))
@@ -2101,6 +2099,8 @@ class KerasTPUModel(models.Model):
     self._cpu_model.set_weights(tpu_weights)
     for k, v in six.iteritems(tpu_optimizer_config):
       logging.info('TPU -> CPU %s: %s', k, v)
+      if k == 'name':
+        continue
       opt_var = getattr(self.cpu_optimizer, k)
       if isinstance(opt_var, variables.Variable):
         K.get_session().run(opt_var.assign(v))
diff --git a/tensorflow/contrib/tpu/python/tpu/keras_tpu_variables.py b/tensorflow/contrib/tpu/python/tpu/keras_tpu_variables.py
index 8b0b240dc7302c203a22349d583323327fc4480b..de425626c813784ef657d17eac0c7bb77599a155 100644
--- a/tensorflow/contrib/tpu/python/tpu/keras_tpu_variables.py
+++ b/tensorflow/contrib/tpu/python/tpu/keras_tpu_variables.py
@@ -69,6 +69,7 @@ class ReplicatedVariable(object):
   def __init__(self, name, variables):
     self._name = name
     self._primary_var = variables[0]
+    self._common_name = self._primary_var.name.split(":")[0]
     self._vars = variables
     self._cached_value = None
     self._dtype = variables[0].dtype
diff --git a/tensorflow/contrib/tpu/python/tpu/session_support.py b/tensorflow/contrib/tpu/python/tpu/session_support.py
index a95275487899c4770ef99b620a7671eec2bb81eb..3e463823c820a3ef8628324f77e1a9caf8d385d5 100644
--- a/tensorflow/contrib/tpu/python/tpu/session_support.py
+++ b/tensorflow/contrib/tpu/python/tpu/session_support.py
@@ -43,12 +43,19 @@ class CoordinatorShutdownException(Exception):
   pass
 
 
+def _clone_session(session, graph=None):
+  return session_lib.Session(
+      target=session.sess_str,
+      config=session._config,  # pylint: disable=protected-access
+      graph=graph if graph else session.graph)
+
+
 def _make_heartbeat_op(session, device, request_ph):
   """Return a heartbeat op or None if heartbeats are not supported by device."""
   try:
     # Test if we can connect in a isolated graph + session
     with ops.Graph().as_default():
-      with session_lib.Session(target=session.sess_str) as temp_session:
+      with _clone_session(session) as temp_session:
         with ops.device(device):
           heartbeat_op = tpu_ops.worker_heartbeat('')
           options = config_pb2.RunOptions(timeout_in_ms=5000)
@@ -220,6 +227,7 @@ class WatchdogManager(threading.Thread):
     self.ping_interval = ping_interval
     self.shutdown_timeout = shutdown_timeout
     self.daemon = True
+    self._config = session._config  # pylint: disable=protected-access
     self._target = session.sess_str
     self._running = False
     self._devices = devices
@@ -234,6 +242,7 @@ class WatchdogManager(threading.Thread):
     self._session = session_lib.Session(
         target=self._target,
         graph=self._graph,
+        config=self._config,
     )
 
     if self._devices is None:
@@ -334,8 +343,7 @@ class GracefulShutdownHook(session_run_hook.SessionRunHook):
 
     with self._graph.as_default():
       logging.info('Installing graceful shutdown hook.')
-      self._session = session_lib.Session(
-          target=training_session.sess_str, graph=self._graph)
+      self._session = _clone_session(training_session, self._graph)
       self._workers = WorkerHeartbeatManager.from_devices(
           self._session, all_worker_devices(self._session))
       self._heartbeat_supported = self._workers.num_workers() > 0
diff --git a/tensorflow/contrib/tpu/python/tpu/tensor_tracer.py b/tensorflow/contrib/tpu/python/tpu/tensor_tracer.py
index 70baea203cc6174bebc7d90646045efae5f2391d..a1494e3660bc09e3af45e81097151a35990810fb 100644
--- a/tensorflow/contrib/tpu/python/tpu/tensor_tracer.py
+++ b/tensorflow/contrib/tpu/python/tpu/tensor_tracer.py
@@ -21,44 +21,56 @@ from __future__ import print_function
 import os
 import os.path
 import re
+import sys
 
 from tensorflow.contrib.tpu.python.ops import tpu_ops
 from tensorflow.contrib.tpu.python.tpu import tpu
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_util
 from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import logging_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 
 _TRACER_LOG_PREFIX = ' [>>>TT>>>]'
 _DEVICE_TYPE_TPU = 'tpu'
 _DEVICE_TYPE_CPU = 'cpu'
-_GLOBAL_STEP_OP_NAME = 'GLOBAL-STEP'
 _TRACE_MODE_NAN_INF = 'nan-inf'
 _TRACE_MODE_PART_TENSOR = 'part-tensor'
 _TRACE_MODE_PART_TENSOR_SIZE = 3
 _TRACE_MODE_FULL_TENSOR = 'full-tensor'
-_RECORD_OUTSIDE_OP_RANGE = 'not-traced-outside-op-range'
-_RECORD_SHOULD_NOT_TRACE = 'not-traced-should-not-trace'
-_RECORD_FILTERED_OUT = 'not-traced-filtered-out'
-_RECORD_SCALAR = 'not-traced-scalar'
-_RECORD_DYNAMIC_SHAPE = 'not-traced-dynamic-shape'
-_RECORD_GET_TRACED = 'get-traced'
+_TRACE_MODE_NORM = 'norm'
+_TRACE_MODE_MAX_ABS = 'max-abs'
+_REASON_OUTSIDE_OP_RANGE = 'not-traced-outside-op-range'
+_REASON_UNSAFE_OP = 'not-traced-unsafe-op'
+_REASON_UNSAFE_SCALAR = 'not-traced-unsafe-scalar'
+_REASON_LESS_INTERESTING_OP = 'not-traced-less-interesting-op'
+_REASON_DEVICE_MISMATCH = 'not-traced-device-mismatch'
+_REASON_DYNAMIC_SHAPE = 'not-traced-dynamic-shape'
+_REASON_SCALAR_GET_TRACED = 'traced-scalar'
+_REASON_TENSOR_GET_TRACED = 'traced-tensor'
+_REASON_USER_INCLUDED = 'traced-user-included'
+_REASON_USER_EXCLUDED = 'not-traced-user-excluded'
+_REASON_NON_NUMERIC_TENSOR = 'not-traced-non-numeric-tensor'
 _MARKER_SECTION_BEGIN = '!!!!!!! section-begin:'
 _MARKER_SECTION_END = '!!!!!!! section-end:'
 _SECTION_NAME_CONFIG = 'configuration'
 _SECTION_NAME_REASON = 'reason'
 _SECTION_NAME_OP_LIST = 'op-list'
+_SECTION_NAME_TENSOR_LIST = 'tensor-list'
 _SECTION_NAME_GRAPH = 'graph'
 _FIELD_NAME_VERSION = 'version:'
 _FIELD_NAME_DEVICE = 'device:'
 _FIELD_NAME_TRACE_MODE = 'trace-mode:'
 _FIELD_NAME_NUM_REPLICAS = 'num-replicas:'
 _FIELD_NAME_NUM_OPS = 'number-of-ops:'
+_FIELD_NAME_NUM_TENSORS = 'number-of-tensors:'
 _FIELD_NAME_TOPOLOGICAL_SORT_SUCCEED = 'topological-sort-succeed:'
 _FLAGS_ENV_VAR = 'TENSOR_TRACER_FLAGS'
 _FLAG_SINGLE_QUOTE_PAT = re.compile(r"\s*--([^=]+)='([^']*)'")
@@ -66,13 +78,72 @@ _FLAG_DOUBLE_QUOTE_PAT = re.compile(r'\s*--([^=]+)="([^"]*)"')
 _FLAG_NO_QUOTE_PAT = re.compile(r'\s*--([^=]+)=(\S*)')
 _FLAG_NAME_ENABLE = 'enable'
 _FLAG_NAME_TRACE_MODE = 'trace_mode'
-_FLAG_NAME_INTERESTING_OPS = 'interesting_ops'
+_FLAG_NAME_INCLUDE_LESS_INTERESTING_OPS = 'include_less_interesting_ops'
+_FLAG_NAME_EXCLUDED_OPNAMES = 'excluded_opnames'
+_FLAG_NAME_EXCLUDED_OPTYPES = 'excluded_optypes'
+_FLAG_NAME_INCLUDED_OPNAMES = 'included_opnames'
+_FLAG_NAME_INCLUDED_OPTYPES = 'included_optypes'
 _FLAG_NAME_TRACE_FILE = 'trace_file_path'
+_FLAG_NAME_REPORT_FILE = 'report_file_path'
 _FLAG_NAME_USE_TEST_UNDECLARED_OUTPUTS_DIR = 'use_test_undeclared_outputs_dir'
 _FLAG_NAME_OP_RANGE = 'op_range'
 _OP_RANGE_PAT = re.compile(r'(\d+):(\d+)')
 _OUTPUT_STREAM_ESCAPE = 'file://'
 _TEST_UNDECLARED_OUTPUTS_DIR_ENV_VAR = 'TEST_UNDECLARED_OUTPUTS_DIR'
+_TENSOR_TRACER_COLLECTION = 'tensor_tracer_variables'
+_TENSOR_TRACER_CHECKPOINT = 'tensor_tracer_checkpoint'
+
+
+def tensor_checkpoint(tensor, checkpoint_name):
+  """Adds a checkpoint with the given checkpoint name for the given tensor.
+
+  The tensor will be added to the list of tensors that will be traced by the
+  tensor tracer.
+
+  Args:
+     tensor: the tensor object for which the tracing is requested.
+     checkpoint_name: a string name for the checkpoint. This name has to be a
+     unique name if used within model comparison. The tensors that have the same
+     checkpoint identifier is compared in model comparison.
+  Returns:
+    The provided tensor.
+  """
+
+  tensor.graph.get_collection(_TENSOR_TRACER_COLLECTION)
+  tensor.graph.add_to_collection(_TENSOR_TRACER_COLLECTION,
+                                 (tensor, checkpoint_name))
+  return tensor
+
+
+def keras_layer_checkpoint(layer, checkpoint_name):
+  """An interface for adding the tensor outputs of a keras layer.
+
+  Encapsulates tensor_checkpoint.
+
+  Args:
+     layer: A keras layer.
+     checkpoint_name: a string name for the checkpoint. This name has to be a
+     unique name if used within model comparison. The tensors that have the same
+     checkpoint identifier is compared in model comparison.
+
+  Returns:
+    The provided layer.
+  """
+  try:
+    outputs = layer.output
+    if tensor_util.is_tensor(outputs):
+      tensor_checkpoint(outputs, '%s' % (checkpoint_name))
+    else:
+      idx = 0
+      for output_tensor in outputs:
+        if tensor_util.is_tensor(outputs):
+          tensor_checkpoint(output_tensor, '%s_%d' % (checkpoint_name, idx))
+        idx += 1
+  except AttributeError:
+    pass
+  except RuntimeError:
+    pass
+  return layer
 
 
 class TensorTracer(object):
@@ -105,6 +176,34 @@ class TensorTracer(object):
     match = _FLAG_NO_QUOTE_PAT.match(flags, pos)
     return match
 
+  @staticmethod
+  def validate_flag_names():
+    """Validates if the TensorTrace flags passed are valid."""
+    valid_flag_names = [_FLAG_NAME_ENABLE, _FLAG_NAME_TRACE_MODE,
+                        _FLAG_NAME_EXCLUDED_OPNAMES,
+                        _FLAG_NAME_EXCLUDED_OPTYPES,
+                        _FLAG_NAME_INCLUDED_OPNAMES,
+                        _FLAG_NAME_INCLUDED_OPTYPES,
+                        _FLAG_NAME_TRACE_FILE, _FLAG_NAME_REPORT_FILE,
+                        _FLAG_NAME_USE_TEST_UNDECLARED_OUTPUTS_DIR,
+                        _FLAG_NAME_INCLUDE_LESS_INTERESTING_OPS,
+                        _FLAG_NAME_OP_RANGE]
+    tensor_tracer_flags = os.environ.get(_FLAGS_ENV_VAR)
+    if not tensor_tracer_flags:
+      return
+    pos = 0
+    while True:
+      match = TensorTracer._match_next_flag(tensor_tracer_flags, pos)
+      if not match:
+        break
+      flag_name = match.group(1)
+      if flag_name not in valid_flag_names:
+        raise ValueError(
+            'The flag name "%s" passed via the environment variable "%s" '
+            'is invalid. Valid flag names are:'
+            '\n%s'%(flag_name, _FLAGS_ENV_VAR, valid_flag_names))
+      pos = match.end()
+
   @staticmethod
   def print_flag_values():
     """Prints all TensorTracer flags passed via environment variables."""
@@ -146,6 +245,20 @@ class TensorTracer(object):
       pos = match.end()
     return ''
 
+  @staticmethod
+  def flag_value_to_re_list(flag_name):
+    """Converts list of strings to compiled RE."""
+
+    re_list = []
+    flag_value = TensorTracer.get_flag_value(flag_name)
+    if not flag_value:
+      return re_list
+    list_of_values = flag_value.split()
+    for v in list_of_values:
+      r = re.compile(v)
+      re_list.append(r)
+    return re_list
+
   @staticmethod
   def is_enabled():
     """Returns True if TensorTracer is enabled."""
@@ -186,29 +299,67 @@ class TensorTracer(object):
     """Checks if the given trace mode is valid."""
 
     valid_trace_modes = [_TRACE_MODE_NAN_INF, _TRACE_MODE_PART_TENSOR,
-                         _TRACE_MODE_FULL_TENSOR]
+                         _TRACE_MODE_FULL_TENSOR, _TRACE_MODE_NORM,
+                         _TRACE_MODE_MAX_ABS]
     if trace_mode not in valid_trace_modes:
       raise ValueError('Invalid trace mode "%s" given to the Tensor_Tracer.'
                        'Valid trace modes are: %s'%(trace_mode,
                                                     valid_trace_modes))
 
   @staticmethod
-  def should_trace(device_type, op):
-    """Returns True if the given Op should be traced."""
+  def unsafe_op(op):
+    """Returns True if this op is not safe to be traced."""
 
-    if device_type != _DEVICE_TYPE_TPU:
-      raise ValueError('Non TPU device type is not supported')
     if control_flow_util.IsInCond(op):
+      return True
+    # Reasons for not including following op types:
+    #    Assign: cause incorrect result with CPU tracing.
+    #    others: compilation problems.
+    if op.type in ['Assign', 'Pack', 'Shape', 'Reshape', 'ArgMin', 'ArgMax']:
+      return True
+    return False
+
+  @staticmethod
+  def device_mismatch(device_type, op):
+    if device_type == _DEVICE_TYPE_TPU:
+      # pylint: disable=protected-access
+      return tpu._TPU_REPLICATE_ATTR not in op.node_def.attr
+      # pylint: enable=protected-access
+    return False
+
+  @staticmethod
+  def unsafe_scalar_trace(op):
+    """Return true if scalar output tensor from Op is not safe to be traced."""
+
+    # Tracing the following causes cycle in the graph on TPU.
+    if op.type in ['LoopCond', 'Enter', 'Merge', 'Const',
+                   'Switch', 'Less', 'ReadVariableOp']:
+      return True
+    # Tracing the following will cause casting-issue
+    # with the norm tracing mode or other compilation issues on CPU.
+    if op.type in ['VarHandleOp', 'IteratorToStringHandle',
+                   'IteratorGetNext', 'OneShotIterator',
+                   'IteratorV2', 'MakeIterator',
+                   'BatchDatasetV2', 'MapDataset',
+                   'FixedLengthRecordDataset', 'TakeDataset', 'ZipDataset',
+                   'Placeholder', 'PlaceholderWithDefault', 'StridedSlice']:
+      return True
+    return False
+
+  @staticmethod
+  def less_interesting_op(op):
+    """Returns True if the given Op is not an interesting one to be traced."""
+
+    include_less_interesting = TensorTracer.get_flag_value(
+        _FLAG_NAME_INCLUDE_LESS_INTERESTING_OPS)
+    if include_less_interesting:
       return False
-    if op.type in ['Reshape', 'ArgMin', 'ArgMax']:
-      return False
-    # pylint: disable=protected-access
-    return tpu._TPU_REPLICATE_ATTR in op.node_def.attr
-    # pylint: enable=protected-access
+    return op.type in ['Const', 'Identity', 'Cast', 'Shape']
 
   @staticmethod
   def reason(op_idx, details):
-    """Returns why the Op at op_idx is traced or not."""
+    """Returns reason why the Op at op_idx is traced or not."""
+
     return '%d %s'%(op_idx, details)
 
   @staticmethod
@@ -274,6 +425,33 @@ class TensorTracer(object):
       assert len(unsorted_ops) == len(sorted_ops)
       return (True, sorted_ops)
 
+  @staticmethod
+  def _make_op_and_tensor_maps(op_list):
+    """Creates various maps and lists from op_list.
+
+    Args:
+       op_list: a list of Ops
+
+    Returns:
+       opname_idx_map: a map from Op's name to its index in op_list.
+       tensor_list: a list of output tensors of the Ops in op_list.
+       tensorname_idx_map: a map from output tensor name to its index
+                           in tensor_list.
+    """
+
+    opname_idx_map = {}
+    tensor_list = []
+    tensorname_idx_map = {}
+    for op_id, op in enumerate(op_list):
+      if op.name in opname_idx_map:
+        raise ValueError('Duplicated Op name: %s'%op.name)
+      opname_idx_map[op.name] = op_id
+      for output_tensor in op.outputs:
+        if output_tensor.name not in tensorname_idx_map:
+          tensor_list.append(output_tensor)
+          tensorname_idx_map[output_tensor.name] = len(tensor_list)-1
+    return (opname_idx_map, tensor_list, tensorname_idx_map)
+
   def __init__(self):
     """Initializes a TensorTracer.
 
@@ -281,16 +459,20 @@ class TensorTracer(object):
     """
     self._version = 'use-outside-compilation'
     self._device_type = None
+    TensorTracer.validate_flag_names()
     self._trace_mode = TensorTracer.get_flag_value(_FLAG_NAME_TRACE_MODE)
     if not self._trace_mode:
       self._trace_mode = _TRACE_MODE_NAN_INF
     TensorTracer.check_trace_mode(self._trace_mode)
     self._part_tensor_size = _TRACE_MODE_PART_TENSOR_SIZE
     self._instrument_records = {}
-    interesting_ops = TensorTracer.get_flag_value(_FLAG_NAME_INTERESTING_OPS)
-    self._selected_ops = interesting_ops.split()
     self._set_trace_file_path()
+    self._set_report_file()
     self._set_op_range()
+    self._set_excluded_opnames()
+    self._set_excluded_optypes()
+    self._set_included_opnames()
+    self._set_included_optypes()
     self._num_replicas = None
     self._replica_id = None
 
@@ -318,10 +500,7 @@ class TensorTracer(object):
     """Sets the path of the output trace file."""
 
     self._trace_file_path = TensorTracer.get_flag_value(_FLAG_NAME_TRACE_FILE)
-    if not self._trace_file_path:
-      raise ValueError('--%s is not set in the environment variable %s'
-                       %(_FLAG_NAME_TRACE_FILE, _FLAGS_ENV_VAR))
-    elif TensorTracer.use_test_undeclared_outputs_dir():
+    if self._trace_file_path and TensorTracer.use_test_undeclared_outputs_dir():
       if os.path.isabs(self._trace_file_path):
         raise ValueError('If use_test_undeclared_outputs_dir is set,'
                          'trace_file_path cannot be an absolute path (%s)'
@@ -330,6 +509,22 @@ class TensorTracer(object):
       self._trace_file_path = os.path.join(outputs_dir,
                                            self._trace_file_path)
 
+  def _set_report_file(self):
+    """Sets the path of the output report file."""
+
+    self._report_file_path = TensorTracer.get_flag_value(_FLAG_NAME_REPORT_FILE)
+    if not self._report_file_path:
+      self._report_file = None
+      return
+    try:
+      self._report_file = gfile.Open(self._report_file_path, 'w')
+    except IOError as e:
+      raise e
+
+  def _close_report_file(self):
+    if self._report_file:
+      self._report_file.close()
+
   def _set_op_range(self):
     """Sets the index range of the Ops that we will consider tracing."""
 
@@ -350,19 +545,48 @@ class TensorTracer(object):
       return False
     return self._op_range[1] < 0 or idx <= self._op_range[1]
 
-  def _write_report(self, content):
-    """Writes the given content to the report."""
+  def _set_excluded_opnames(self):
+    self._excluded_opname_re_list = TensorTracer.flag_value_to_re_list(
+        _FLAG_NAME_EXCLUDED_OPNAMES)
+
+  def _set_excluded_optypes(self):
+    self._excluded_optype_re_list = TensorTracer.flag_value_to_re_list(
+        _FLAG_NAME_EXCLUDED_OPTYPES)
+
+  def _set_included_opnames(self):
+    self._included_opname_re_list = TensorTracer.flag_value_to_re_list(
+        _FLAG_NAME_INCLUDED_OPNAMES)
+
+  def _set_included_optypes(self):
+    self._included_optype_re_list = TensorTracer.flag_value_to_re_list(
+        _FLAG_NAME_INCLUDED_OPTYPES)
+
+  def _is_user_included_op(self, op):
+    for opname_re in self._included_opname_re_list:
+      if opname_re.match(op.name):
+        return True
+    for optype_re in self._included_optype_re_list:
+      if optype_re.match(op.type):
+        return True
+    return False
 
-    logging.info('%s %s'%(_TRACER_LOG_PREFIX, content))
+  def _is_user_excluded_op(self, op):
+    for opname_re in self._excluded_opname_re_list:
+      if opname_re.match(op.name):
+        return True
+    for optype_re in self._excluded_optype_re_list:
+      if optype_re.match(op.type):
+        return True
+    return False
 
-  def _is_selected_op(self, op_name):
-    """Returns True if the Op with op_name is selected to be traced."""
+  def _write_report(self, content):
+    """Writes the given content to the report."""
 
-    if not self._selected_ops:
-      return True
-    if op_name in self._selected_ops:
-      return True
-    return False
+    line = '%s %s'%(_TRACER_LOG_PREFIX, content)
+    if self._report_file:
+      self._report_file.write(line)
+    else:
+      logging.info(line)
 
   def _write_config_section(self):
     """Writes the config section of the report."""
@@ -382,15 +606,42 @@ class TensorTracer(object):
       self._write_report('"%s" %s\n'%(key, self._instrument_records[key]))
     self._write_report('%s %s\n'%(_MARKER_SECTION_END, _SECTION_NAME_REASON))
 
-  def _write_op_list_section(self, op_list):
+  def _write_op_list_section(self, op_list, tensorname_idx_map):
     """Writes the Op-list section of the report."""
 
     self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN, _SECTION_NAME_OP_LIST))
     self._write_report('%s %d\n'%(_FIELD_NAME_NUM_OPS, len(op_list)))
     for i in range(0, len(op_list)):
-      self._write_report('%d "%s" %s\n'%(i, op_list[i].name, op_list[i].type))
+      op = op_list[i]
+      line = '%d "%s" %s'%(i, op.name, op.type)
+      for out_tensor in op.outputs:
+        if out_tensor.name not in tensorname_idx_map:
+          raise ValueError(
+              'out_tensor %s is not in tensorname_idx_map'%out_tensor.name)
+        line += ' %d'%tensorname_idx_map[out_tensor.name]
+      line += '\n'
+      self._write_report(line)
     self._write_report('%s %s\n'%(_MARKER_SECTION_END, _SECTION_NAME_OP_LIST))
 
+  def _write_tensor_list_section(self, tensor_list, opname_idx_map):
+    """Writes the tensor-list section of the report."""
+
+    self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN,
+                                  _SECTION_NAME_TENSOR_LIST))
+    self._write_report('%s %d\n'%(_FIELD_NAME_NUM_TENSORS, len(tensor_list)))
+    for i in range(0, len(tensor_list)):
+      tensor = tensor_list[i]
+      line = '%d "%s"'%(i, tensor.name)
+      for consumer_op in tensor.consumers():
+        if consumer_op.name not in opname_idx_map:
+          raise ValueError(
+              'consumer_op %s is not in opname_idx_map'%consumer_op.name)
+        line += ' %d'%opname_idx_map[consumer_op.name]
+      line += '\n'
+      self._write_report(line)
+    self._write_report('%s %s\n'%(_MARKER_SECTION_END,
+                                  _SECTION_NAME_TENSOR_LIST))
+
   def _write_graph_section(self, succeed, sorted_or_cycle):
     """Writes the graph section of the report."""
 
@@ -422,7 +673,7 @@ class TensorTracer(object):
       Args:
         op_name: the name of the Op that outputs the tensor to be printed.
         output_idx: which output of the Op it is (0 means the first output).
-        num_elements: number of elements to print.
+        num_elements: number of elements to print (-1 means print all).
         tensor: the tensor needs to be returned.
         output_tensor: the tensor needs to be printed.
 
@@ -430,10 +681,13 @@ class TensorTracer(object):
         The same tensor passed via the "tensor" argument.
       """
       msg = '"%s:%d" '%(op_name, output_idx)
-      output_stream = _OUTPUT_STREAM_ESCAPE + self._trace_file_path
+      if self._trace_file_path:
+        output_stream = _OUTPUT_STREAM_ESCAPE + self._trace_file_path
+      else:
+        output_stream = sys.stderr
       print_op = logging_ops.print_v2(msg, array_ops.shape(output_tensor),
                                       ' @', self._replica_id,
-                                      '\n', output_tensor,
+                                      '\n', output_tensor, '\n',
                                       summarize=num_elements,
                                       output_stream=output_stream)
       with ops.control_dependencies([print_op]):
@@ -442,7 +696,8 @@ class TensorTracer(object):
     def _detect_nan_inf(tensor):
       """Trace function for detecting any NaN/Inf in the tensor."""
 
-      if tensor.dtype.is_floating:
+      if tensor.dtype.__eq__(dtypes.bfloat16) or tensor.dtype.__eq__(
+          dtypes.float16):
         # Since host can't handle bf16, always convert tensor to f32.
         tensor = math_ops.cast(tensor, dtypes.float32)
         output_tensor = math_ops.reduce_any(
@@ -450,12 +705,19 @@ class TensorTracer(object):
                                     gen_math_ops.is_inf(tensor)))
       else:
         output_tensor = constant_op.constant(0)
-      return _print_tensor(op_name, output_idx, 1, tensor, output_tensor)
+      return _print_tensor(op_name, output_idx, -1, tensor, output_tensor)
 
-    def _show_global_step(tensor):
-      """Trace function for printing the global step count."""
+    def _show_norm(tensor):
+      tensor = math_ops.cast(tensor, dtypes.float64)
+      output_tensor = linalg_ops.norm(tensor)
+      return _print_tensor(op_name, output_idx, -1, tensor, output_tensor)
 
-      return _print_tensor(op_name, output_idx, 1, tensor, tensor)
+    def _show_max_abs(tensor):
+      output_tensor = math_ops.cast(math_ops.reduce_max(math_ops.abs(tensor)),
+                                    dtypes.float64)
+      zero = constant_op.constant(0, dtypes.float64)
+      output_tensor = gen_math_ops.maximum(zero, output_tensor)
+      return _print_tensor(op_name, output_idx, -1, tensor, output_tensor)
 
     def _show_part_tensor(tensor):
       """Trace function for printing part of the tensor."""
@@ -468,23 +730,139 @@ class TensorTracer(object):
 
       return _print_tensor(op_name, output_idx, -1, tensor, tensor)
 
-    if op_name == _GLOBAL_STEP_OP_NAME:
-      return _show_global_step
     if self._trace_mode == _TRACE_MODE_NAN_INF:
       return _detect_nan_inf
     if self._trace_mode == _TRACE_MODE_PART_TENSOR:
       return _show_part_tensor
     if self._trace_mode == _TRACE_MODE_FULL_TENSOR:
       return _show_full_tensor
+    if self._trace_mode == _TRACE_MODE_NORM:
+      return _show_norm
+    if self._trace_mode == _TRACE_MODE_MAX_ABS:
+      return _show_max_abs
 
     raise RuntimeError('Tensor trace fun for %s is not yet implemented'
                        %self._trace_mode)
 
+  def _skip_op(self, op_id, op, user_included, user_excluded):
+    """Returns True if we should not trace Op."""
+
+    if user_included:
+      self._instrument_records[op.name] = TensorTracer.reason(
+          op_id, _REASON_USER_INCLUDED)
+      return False
+    if user_excluded:
+      self._instrument_records[op.name] = TensorTracer.reason(
+          op_id, _REASON_USER_EXCLUDED)
+      return True
+    if not self._inside_op_range(op_id):
+      self._instrument_records[op.name] = TensorTracer.reason(
+          op_id, _REASON_OUTSIDE_OP_RANGE)
+      return True
+    if TensorTracer.unsafe_op(op):
+      self._instrument_records[op.name] = TensorTracer.reason(
+          op_id, _REASON_UNSAFE_OP)
+      return True
+    if TensorTracer.device_mismatch(self._device_type, op):
+      self._instrument_records[op.name] = TensorTracer.reason(
+          op_id, _REASON_DEVICE_MISMATCH)
+      return True
+    if TensorTracer.less_interesting_op(op):
+      self._instrument_records[op.name] = TensorTracer.reason(
+          op_id, _REASON_LESS_INTERESTING_OP)
+      return True
+    return False
+
+  def _skip_tensor(self, op_id, out_tensor, user_included,
+                   user_excluded):
+    """Returns True if we should not trace out_tensor."""
+
+    # Skips a tensor if the tensor has a non-numeric type.
+    #   Note: we cannot use check_ops.is_numeric_tensor(out_tensor)
+    #         because it also excludes tensors with dtypes, bool, and
+    #         float32_ref, which we actually want to trace.
+    non_numeric_tensor_types = set([dtypes.variant, dtypes.resource,
+                                    dtypes.string])
+    if out_tensor.dtype in non_numeric_tensor_types:
+      self._instrument_records[out_tensor.name] = TensorTracer.reason(
+          op_id, _REASON_NON_NUMERIC_TENSOR)
+      return True
+
+    if user_included:
+      self._instrument_records[out_tensor.name] = TensorTracer.reason(
+          op_id, _REASON_USER_INCLUDED)
+      return False
+    if user_excluded:
+      self._instrument_records[out_tensor.name] = TensorTracer.reason(
+          op_id, _REASON_USER_EXCLUDED)
+      return True
+    if not out_tensor.get_shape().is_fully_defined():
+      self._instrument_records[out_tensor.name] = TensorTracer.reason(
+          op_id, _REASON_DYNAMIC_SHAPE)
+      return True
+    rank = len(out_tensor.shape)
+    if rank < 1:
+      # scalar
+      if TensorTracer.unsafe_scalar_trace(out_tensor.op):
+        self._instrument_records[out_tensor.name] = TensorTracer.reason(
+            op_id, _REASON_UNSAFE_SCALAR)
+        return True
+      else:
+        self._instrument_records[out_tensor.name] = TensorTracer.reason(
+            op_id, _REASON_SCALAR_GET_TRACED)
+        return False
+    else:
+      # tensor
+      self._instrument_records[out_tensor.name] = TensorTracer.reason(
+          op_id, _REASON_TENSOR_GET_TRACED)
+      return False
+
+  def _pre_tracing(self, graph):
+    """Work needs to be done prior to TPU or CPU tracing."""
+
+    operations = graph.get_operations()
+    (opname_idx_map, tensor_list, tensorname_idx_map) = (
+        TensorTracer._make_op_and_tensor_maps(operations))
+    self._write_config_section()
+    self._write_op_list_section(operations, tensorname_idx_map)
+    self._write_tensor_list_section(tensor_list, opname_idx_map)
+    # Does the topological sort before adding any nodes to the graph.
+    (succeed, sorted_or_cycle) = TensorTracer.topological_sort(graph)
+    return (operations, succeed, sorted_or_cycle)
+
+  def _post_tracing(self, succeed, sorted_or_cycle):
+    """Work needs to be done after TPU or CPU tracing."""
+
+    self._write_reason_section()
+    self._write_graph_section(succeed, sorted_or_cycle)
+    self._close_report_file()
+
+  def _get_checkpoints(self, graph):
+    """Returns the list of Ops that produce the tensors traced with API.
+
+    Args:
+      graph: the graph of Ops.
+
+    Returns:
+      A set of operation names which should be traced.
+    """
+
+    self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN,
+                                  _TENSOR_TRACER_CHECKPOINT))
+    checkpoint_operations = set()
+    tensor_tracer_variables = graph.get_collection(_TENSOR_TRACER_COLLECTION)
+    for (tensor, checkpoint_name) in tensor_tracer_variables:
+      self._write_report('%s %s\n'%(tensor.name, checkpoint_name))
+      checkpoint_operations.add(tensor.op.name)
+    self._write_report('%s %s\n'%(_MARKER_SECTION_END,
+                                  _TENSOR_TRACER_CHECKPOINT))
+    return checkpoint_operations
+
   def trace_tpu(self, graph, result_tensor, num_replicas=None):
     """Traces the tensors generated by TPU Ops in a TF graph.
 
     Args:
-      graph: the graph of Ops.
+      graph: the graph of Ops executed on the TPU.
       result_tensor: a result tensor of evaluating the graph.
       num_replicas: number of replicas used on the TPU.
 
@@ -502,38 +880,22 @@ class TensorTracer(object):
     TensorTracer.check_device_type(self._device_type)
     result_tensor_copy = self._add_replica_id_to_graph(num_replicas,
                                                        result_tensor)
-    self._write_config_section()
+    (operations, succeed, sorted_or_cycle) = self._pre_tracing(graph)
     tracing_ops = []
-    operations = graph.get_operations()
-    self._write_op_list_section(operations)
-    # Does the topological sort before adding any nodes to the graph.
-    (succeed, sorted_or_cycle) = TensorTracer.topological_sort(graph)
+    checkpoint_operations = self._get_checkpoints(graph)
+
     for op_id, op in enumerate(operations):
-      if not self._inside_op_range(op_id):
-        self._instrument_records[op.name] = TensorTracer.reason(
-            op_id, _RECORD_OUTSIDE_OP_RANGE)
+      if checkpoint_operations and op.name not in checkpoint_operations:
         continue
-      if not TensorTracer.should_trace(self._device_type, op):
-        self._instrument_records[op.name] = TensorTracer.reason(
-            op_id, _RECORD_SHOULD_NOT_TRACE)
-        continue
-      if not self._is_selected_op(op.name):
-        self._instrument_records[op.name] = TensorTracer.reason(
-            op_id, _RECORD_FILTERED_OUT)
+      user_included = self._is_user_included_op(op)
+      user_excluded = self._is_user_excluded_op(op)
+      if self._skip_op(op_id, op, user_included, user_excluded):
         continue
       for i in range(len(op.outputs)):
         out_tensor = op.outputs[i]
-        if not out_tensor.get_shape().is_fully_defined():
-          self._instrument_records[out_tensor.name] = TensorTracer.reason(
-              op_id, _RECORD_DYNAMIC_SHAPE)
-          continue  # cannot trace tensors with dynamic shape.
-        rank = len(out_tensor.shape)
-        if rank < 1:
-          self._instrument_records[out_tensor.name] = TensorTracer.reason(
-              op_id, _RECORD_SCALAR)
-          continue  # cannot trace scalar.
-        self._instrument_records[out_tensor.name] = TensorTracer.reason(
-            op_id, _RECORD_GET_TRACED)
+        if self._skip_tensor(op_id, out_tensor, user_included,
+                             user_excluded):
+          continue
         consumers = out_tensor.consumers()
         trace_op = tpu.outside_compilation(
             self._make_tensor_trace_fun(op.name, i), out_tensor)
@@ -546,8 +908,45 @@ class TensorTracer(object):
           # if there is no consumer, we will add the control dependence later
           # when we add the control dependency to the output operations.
           tracing_ops.append(trace_op)
+    self._post_tracing(succeed, sorted_or_cycle)
+    return (result_tensor_copy, tracing_ops)
 
-    self._write_reason_section()
-    self._write_graph_section(succeed, sorted_or_cycle)
+  def trace_cpu(self, graph):
+    """Traces the tensors generated by CPU Ops in a TF graph.
 
-    return (result_tensor_copy, tracing_ops)
+    Args:
+      graph: the graph of Ops executed on the CPU.
+
+    Returns:
+      tracing_calls: a map from keys to trace calls.
+                     A key is constructed from an Op's name.
+                     A trace call consists of a function and a tensor (
+                     the function will be invoked with the tensor).
+    """
+
+    self._device_type = _DEVICE_TYPE_CPU
+    TensorTracer.check_device_type(self._device_type)
+    self._num_replicas = 1
+    self._replica_id = 0
+    (operations, succeed, sorted_or_cycle) = self._pre_tracing(graph)
+    tracing_calls = {}
+    checkpoint_operations = self._get_checkpoints(graph)
+
+    for op_id, op in enumerate(operations):
+      if checkpoint_operations and op.name not in checkpoint_operations:
+        continue
+      user_included = self._is_user_included_op(op)
+      user_excluded = self._is_user_excluded_op(op)
+      if self._skip_op(op_id, op, user_included, user_excluded):
+        continue
+      for i in range(len(op.outputs)):
+        out_tensor = op.outputs[i]
+        if self._skip_tensor(op_id, out_tensor, user_included,
+                             user_excluded):
+          continue
+        trace_fun = self._make_tensor_trace_fun(op.name, i)
+        trace_call = (trace_fun, [out_tensor])
+        trace_call_key = 'tensor_tracing_cpu-%s:%d'%(op.name, i)
+        tracing_calls[trace_call_key] = trace_call
+    self._post_tracing(succeed, sorted_or_cycle)
+    return tracing_calls
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu.py b/tensorflow/contrib/tpu/python/tpu/tpu.py
index def57da20d6018dcf27ccb7a9d04592f38ce2f7c..9266d81cf5fc035790062f0e307a5da0b01a9fc1 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu.py
@@ -646,6 +646,10 @@ def split_compile_and_replicate(computation,
           array_ops.identity(x, name="replicated_input_{}".format(i))
           for i, x in enumerate(computation_inputs)
       ]
+      for i in computation_inputs:
+        # pylint: disable=protected-access
+        i.op._set_attr("_tpu_input_identity", attr_value_pb2.AttrValue(b=True))
+        # pylint: enable=protected-access
 
       # If there is an infeed queue, adds the dequeued values to the
       # computation's inputs.
@@ -726,7 +730,11 @@ def split_compile_and_replicate(computation,
     new_output_tensors = []
     for t in output_tensors:
       with ops.device(t.device if t.device else core(0)):
-        new_output_tensors.append(array_ops.identity(t))
+        o = array_ops.identity(t)
+        # pylint: disable=protected-access
+        o.op._set_attr("_tpu_output_identity", attr_value_pb2.AttrValue(b=True))
+        # pylint: enable=protected-access
+        new_output_tensors.append(o)
     output_tensors = new_output_tensors
     context.ExitResult(output_tensors)
   finally:
@@ -777,15 +785,15 @@ def split_compile_and_replicate(computation,
       ]
 
 
-def shard(computation,
-          inputs=None,
-          num_shards=1,
-          input_shard_axes=None,
-          outputs_from_all_shards=True,
-          output_shard_axes=None,
-          infeed_queue=None,
-          device_assignment=None,
-          name=None):
+def split_compile_and_shard(computation,
+                            inputs=None,
+                            num_shards=1,
+                            input_shard_axes=None,
+                            outputs_from_all_shards=True,
+                            output_shard_axes=None,
+                            infeed_queue=None,
+                            device_assignment=None,
+                            name=None):
   """Shards `computation` for parallel execution.
 
   `inputs` must be a list of Tensors or None (equivalent to an empty list), each
@@ -839,7 +847,7 @@ def shard(computation,
       is equal to the number of cores in the TPU system.
     name: (Deprecated) Does nothing.
   Returns:
-    A list of output tensors.
+    A tuple of (compile op, [output tensors]).
   Raises:
     ValueError: If num_shards <= 0
     ValueError: If len(input_shard_axes) != len(inputs)
@@ -874,7 +882,7 @@ def shard(computation,
   else:
     transposed_inputs = [[]] * num_shards
 
-  outputs = replicate(
+  compile_op, outputs = split_compile_and_replicate(
       computation,
       transposed_inputs,
       infeed_queue=infeed_queue,
@@ -891,7 +899,7 @@ def shard(computation,
     # one so it can be used as a control dependency or fetch node.
     # TODO(b/36647078) remove disable when pylint bug is fixed.
     # pylint: disable=indexing-exception
-    return [outputs[0]]
+    return compile_op, [outputs[0]]
     # pylint: enable=indexing-exception
 
   # TODO(b/36647078) remove disable when pylint bug is fixed.
@@ -925,7 +933,87 @@ def shard(computation,
       # TODO(phawkins): use a smarter policy, e.g., round-robin across shards.
       results.append(x[0])
 
-  return results
+  return compile_op, results
+
+
+def shard(computation,
+          inputs=None,
+          num_shards=1,
+          input_shard_axes=None,
+          outputs_from_all_shards=True,
+          output_shard_axes=None,
+          infeed_queue=None,
+          device_assignment=None,
+          name=None):
+  """Shards `computation` for parallel execution.
+
+  `inputs` must be a list of Tensors or None (equivalent to an empty list), each
+  of which has a corresponding split axis (from `input_shard_axes`). Each input
+  is split into `num_shards` pieces along the corresponding axis, and
+  computation is applied to each shard in parallel.
+
+  Tensors are broadcast to all shards if they are lexically captured by
+  `computation`. e.g.,
+
+  x = tf.constant(7)
+  def computation():
+    return x + 3
+  ... = shard(computation, ...)
+
+  TODO(phawkins): consider adding support for broadcasting Tensors passed
+  as inputs.
+
+  If `outputs_from_all_shards` is true, the outputs from all shards of
+  `computation` are concatenated back together along their `output_shards_axes`.
+  Otherwise, each output is taken from an arbitrary shard.
+
+  Inputs and outputs of the computation must be at least rank-1 Tensors.
+
+  Args:
+    computation: A Python function that builds a computation to apply to each
+      shard of the input.
+    inputs: A list of input tensors or None (equivalent to an empty list). Each
+      input tensor has a corresponding shard axes, given by `input_shard_axes`,
+      which must have size divisible by `num_shards`.
+    num_shards: The number of shards.
+    input_shard_axes: A list of dimensions along which to shard `inputs`, or
+      `None`. `None` means "shard all inputs along dimension 0". If not `None`,
+      there must be one dimension per input.
+    outputs_from_all_shards: Boolean or list of boolean. For each output, if
+      `True`, outputs from all shards are concatenated along the corresponding
+      `output_shard_axes` entry. Otherwise, each output is taken
+      from an arbitrary shard. If the argument is a boolean, the argument's
+      value is used for each output.
+    output_shard_axes: A list of dimensions along which to concatenate the
+      outputs of `computation`, or `None`. `None` means "concatenate all outputs
+      along dimension 0". If not `None`, there must be one dimension per output.
+      Ignored if `outputs_from_all_shards` is False.
+    infeed_queue: If not `None`, the `InfeedQueue` to use to augment the inputs
+      of `computation`.
+    device_assignment: If not `None`, a `DeviceAssignment` describing the
+      mapping between logical cores in the computation with physical cores in
+      the TPU topology. Uses a default device assignment if `None`. The
+      `DeviceAssignment` may be omitted if each shard of the computation uses
+      only one core, and there is either only one shard, or the number of shards
+      is equal to the number of cores in the TPU system.
+    name: (Deprecated) Does nothing.
+  Returns:
+    A list of output tensors.
+  Raises:
+    ValueError: If num_shards <= 0
+    ValueError: If len(input_shard_axes) != len(inputs)
+    ValueError: If len(output_shard_axes) != len(outputs from `computation`)
+  """
+  return split_compile_and_shard(
+      computation,
+      inputs=inputs,
+      num_shards=num_shards,
+      input_shard_axes=input_shard_axes,
+      outputs_from_all_shards=outputs_from_all_shards,
+      output_shard_axes=output_shard_axes,
+      infeed_queue=infeed_queue,
+      device_assignment=device_assignment,
+      name=name)[1]
 
 
 def batch_parallel(computation,
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index 7171587ff7298982423a5046d85d1970a4d6b1cb..44a8f7ce0e5794ec95b5d0c25adca14b194a25d1 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -31,6 +31,7 @@ import six
 from six.moves import queue as Queue  # pylint: disable=redefined-builtin
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
+from tensorflow.contrib.tpu.proto import compilation_result_pb2 as tpu_compilation_result
 from tensorflow.contrib.tpu.python.tpu import tensor_tracer
 from tensorflow.contrib.tpu.python.ops import tpu_ops
 from tensorflow.contrib.tpu.python.tpu import error_handling
@@ -45,6 +46,7 @@ from tensorflow.contrib.training.python.training import hparam
 from tensorflow.core.framework import variable_pb2
 from tensorflow.core.framework.summary_pb2 import Summary
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.client import session as tf_session
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest as data_nest
 from tensorflow.python.estimator import estimator as estimator_lib
@@ -335,6 +337,16 @@ class TPUEstimatorSpec(model_fn_lib._TPUEstimatorSpec):  # pylint: disable=prote
     hooks = None
     if self.host_call is not None:
       hooks = [_OutfeedHostCallHook(host_call_ret['host_call'])]
+    if tensor_tracer.TensorTracer.is_enabled():
+      tt = tensor_tracer.TensorTracer()
+      tracing_calls = tt.trace_cpu(ops.get_default_graph())
+      tracing_call_ret = _OutfeedHostCall.create_cpu_hostcall(tracing_calls)
+      tracing_functions = tracing_call_ret.values()
+      if tracing_functions:
+        if hooks:
+          hooks.extend([_OutfeedHostCallHook(tracing_functions)])
+        else:
+          hooks = [_OutfeedHostCallHook(tracing_functions)]
     hooks = tuple(hooks or [])
     scaffold = self.scaffold_fn() if self.scaffold_fn else None
     return model_fn_lib.EstimatorSpec(
@@ -411,13 +423,17 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
                ctx,
                enqueue_ops,
                dequeue_ops,
+               tpu_compile_op,
                run_infeed_loop_on_coordinator=True,
-               rendezvous=None):
+               rendezvous=None,
+               master=None,
+               session_config=None):
     self._master_job = ctx.master_job
     self._enqueue_ops = enqueue_ops
     self._dequeue_ops = dequeue_ops
     self._rendezvous = rendezvous
-
+    self._master = master
+    self._session_config = session_config
     self._run_infeed_loop_on_coordinator = run_infeed_loop_on_coordinator
     self._initial_infeed_sleep_secs = (
         ctx.config.tpu_config.initial_infeed_sleep_secs)
@@ -425,15 +441,15 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
     self._feed_error = None
     self._finished = False
     self._should_initialize_tpu = True
+    self._tpu_compile_op = tpu_compile_op
 
   def begin(self):
     logging.info('TPU job name %s', self._master_job)
     self._iterations_per_loop_var = _create_or_get_iterations_per_loop()
+    self._init_ops = []
     if self._should_initialize_tpu:
-      self._init_ops = [tpu.initialize_system(job=self._master_job)]
       self._finalize_ops = [tpu.shutdown_system(job=self._master_job)]
     else:
-      self._init_ops = []
       self._finalize_ops = []
 
     summary_writer_init_ops = contrib_summary.summary_writer_initializer_op()
@@ -474,12 +490,31 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
   def _create_infeed_controller(self, name, target, args):
     return _OpQueueContext(name=name, target=target, args=args)
 
+  def _assertCompilationSucceeded(self, result, coord):
+    proto = tpu_compilation_result.CompilationResultProto()
+    proto.ParseFromString(result)
+    if proto.status_error_message:
+      logging.error('Compilation failed: {}'.format(proto.status_error_message))
+      coord.request_stop()
+    else:
+      logging.info('Compilation succeeded')
+
   def after_create_session(self, session, coord):
-    logging.info('Init TPU system')
-    start = time.time()
+    if self._should_initialize_tpu:
+      logging.info('Init TPU system')
+      start = time.time()
+      with ops.Graph().as_default():
+        with tf_session.Session(
+            self._master, config=self._session_config) as sess:
+          sess.run(tpu.initialize_system(job=self._master_job))
+      logging.info('Initialized TPU in %d seconds', time.time() - start)
+
     session.run(self._init_ops,
                 options=config_pb2.RunOptions(timeout_in_ms=5 * 60 * 1000))
-    logging.info('Initialized TPU in %d seconds', time.time() - start)
+
+    if os.environ.get('TPU_SPLIT_COMPILE_AND_EXECUTE', '') == '1':
+      logging.info('Compiling user program: this may take a while...')
+      self._assertCompilationSucceeded(session.run(self._tpu_compile_op), coord)
 
     self._infeed_controller = self._create_infeed_controller(
         name='InfeedController', target=self._run_infeed, args=(session,))
@@ -521,13 +556,17 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
 
 class TPUInfeedOutfeedSessionHookForPrediction(TPUInfeedOutfeedSessionHook):
 
-  def __init__(self, ctx, enqueue_ops, dequeue_ops, rendezvous=None):
+  def __init__(self, ctx, enqueue_ops, dequeue_ops, tpu_compile_op,
+               rendezvous=None, master=None, session_config=None):
     super(TPUInfeedOutfeedSessionHookForPrediction, self).__init__(
         ctx,
         enqueue_ops,
         dequeue_ops,
+        tpu_compile_op=tpu_compile_op,
         run_infeed_loop_on_coordinator=False,
-        rendezvous=rendezvous)
+        rendezvous=rendezvous,
+        master=master,
+        session_config=session_config)
 
   def _create_infeed_controller(self, name, target, args):
     return _OpSignalOnceQueueContext(name=name, target=target, args=args)
@@ -2241,7 +2280,7 @@ class TPUEstimator(estimator_lib.Estimator):
           (k, _export_output_to_tensors(v))
           for k, v in six.iteritems(estimator_spec.export_outputs))
       tensors = nest.flatten(tensors_dict)
-      tpu_tensors = [t for t in tensors if _is_tpu_tensor(t)]
+      tpu_tensors = [t for t in tensors if t is not None]
 
       # We cannot return anything other than `tpu_tensors` here so we capture
       # the rest for later use.
@@ -2255,18 +2294,10 @@ class TPUEstimator(estimator_lib.Estimator):
     # `tpu_tensors_on_cpu`.
     new_tensors = []
     for t in tensors:
-      if _is_tpu_tensor(t):
-        new_tensors.append(tpu_tensors_on_cpu.pop(0))
-      elif t is None:
+      if t is None:
         new_tensors.append(None)
       else:
-        # Only fetching `tpu_tensors_on_cpu` does not trigger
-        # TPU computation and blocks, so we add the control dependency here.
-        control_inputs = (
-            tpu_tensors_on_cpu if _is_iterable(tpu_tensors_on_cpu) else
-            (tpu_tensors_on_cpu,))
-        with ops.control_dependencies(control_inputs):
-          new_tensors.append(array_ops.identity(t))
+        new_tensors.append(tpu_tensors_on_cpu.pop(0))
 
     # Reconstruct `tensors_dict`.
     new_tensors_dict = nest.pack_sequence_as(tensors_dict, new_tensors)
@@ -2523,7 +2554,7 @@ class TPUEstimator(estimator_lib.Estimator):
             graph.add_to_collection(_TPU_ENQUEUE_OPS, enqueue_op)
 
         if mode == model_fn_lib.ModeKeys.TRAIN:
-          loss, host_call, scaffold, training_hooks = (
+          compile_op, loss, host_call, scaffold, training_hooks = (
               _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn))
           host_ops = host_call.create_tpu_hostcall()
           if host_ops is None:
@@ -2558,9 +2589,12 @@ class TPUEstimator(estimator_lib.Estimator):
                   ctx,
                   enqueue_ops,
                   host_ops,
+                  tpu_compile_op=compile_op,
                   run_infeed_loop_on_coordinator=(
                       run_infeed_loop_on_coordinator),
                   rendezvous=self._rendezvous[mode],
+                  master=self._config.master,
+                  session_config=self._session_config,
               ),
               InstallSignalHandlerHook()
           ])
@@ -2613,8 +2647,8 @@ class TPUEstimator(estimator_lib.Estimator):
               scaffold=scaffold)
 
         if mode == model_fn_lib.ModeKeys.EVAL:
-          total_loss, host_calls, scaffold, eval_hooks = _eval_on_tpu_system(
-              ctx, model_fn_wrapper, dequeue_fn)
+          compile_op, total_loss, host_calls, scaffold, eval_hooks = (
+              _eval_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn))
           iterations_per_loop_var = _create_or_get_iterations_per_loop()
           mean_loss = math_ops.div(
               total_loss,
@@ -2661,10 +2695,13 @@ class TPUEstimator(estimator_lib.Estimator):
                   ctx,
                   enqueue_ops,
                   eval_update_ops + host_ops,
+                  tpu_compile_op=compile_op,
                   run_infeed_loop_on_coordinator=(
                       run_infeed_loop_on_coordinator),
-                  rendezvous=self._rendezvous[mode]),
-          ] + input_hooks
+                  rendezvous=self._rendezvous[mode],
+                  master=self._config.evaluation_master,
+                  session_config=self._session_config,
+              )] + input_hooks
 
           if eval_hooks:
             hooks.extend(eval_hooks)
@@ -2679,7 +2716,7 @@ class TPUEstimator(estimator_lib.Estimator):
         # Predict
         assert mode == model_fn_lib.ModeKeys.PREDICT
 
-        (dummy_predict_op, host_calls,
+        (compile_op, dummy_predict_op, host_calls,
          scaffold, prediction_hooks) = _predict_on_tpu_system(
              ctx, model_fn_wrapper, dequeue_fn)
         with ops.control_dependencies([dummy_predict_op]):
@@ -2735,7 +2772,10 @@ class TPUEstimator(estimator_lib.Estimator):
         hooks = [
             _StoppingPredictHook(scalar_stopping_signal),
             TPUInfeedOutfeedSessionHookForPrediction(
-                ctx, enqueue_ops, host_ops, rendezvous=self._rendezvous[mode]),
+                ctx, enqueue_ops, host_ops, rendezvous=self._rendezvous[mode],
+                tpu_compile_op=compile_op,
+                master=self._config.master,
+                session_config=self._session_config),
         ] + input_hooks
 
         if prediction_hooks:
@@ -2750,17 +2790,6 @@ class TPUEstimator(estimator_lib.Estimator):
     return _model_fn
 
 
-def _is_tpu_tensor(tensor):
-  if not isinstance(tensor, ops.Tensor):
-    return False
-  try:
-    tensor.op.get_attr(tpu._OUTSIDE_COMPILATION_ATTR)  # pylint: disable=protected-access
-  except ValueError:
-    return True
-  else:
-    return False
-
-
 def _export_output_to_tensors(export_output):
   """Get a list of `Tensors` used in `export_output`.
 
@@ -2832,15 +2861,16 @@ def _eval_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
     return training_loop.repeat(iterations_per_loop_var, single_tpu_eval_step,
                                 [_ZERO_LOSS])
 
-  (loss,) = tpu.shard(
+  (compile_op, loss,) = tpu.split_compile_and_shard(
       multi_tpu_eval_steps_on_single_shard,
       inputs=[],
       num_shards=ctx.num_replicas,
       outputs_from_all_shards=False,
       device_assignment=ctx.device_assignment)
 
+  loss = loss[0]
   scaffold = _get_scaffold(captured_scaffold_fn)
-  return loss, host_calls, scaffold, captured_eval_hooks.get()
+  return compile_op, loss, host_calls, scaffold, captured_eval_hooks.get()
 
 
 def _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
@@ -2855,15 +2885,16 @@ def _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
     return training_loop.repeat(iterations_per_loop_var, single_tpu_train_step,
                                 [_INITIAL_LOSS])
 
-  (loss,) = tpu.shard(
+  (compile_op, loss,) = tpu.split_compile_and_shard(
       multi_tpu_train_steps_on_single_shard,
       inputs=[],
       num_shards=ctx.num_replicas,
       outputs_from_all_shards=False,
       device_assignment=ctx.device_assignment)
 
+  loss = loss[0]
   scaffold = _get_scaffold(captured_scaffold_fn)
-  return loss, host_call, scaffold, captured_training_hooks.get()
+  return compile_op, loss, host_call, scaffold, captured_training_hooks.get()
 
 
 def _predict_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
@@ -2883,15 +2914,17 @@ def _predict_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
         cond, single_tpu_predict_step, inputs=inputs, name=b'loop')
     return outputs
 
-  (dummy_predict_op,) = tpu.shard(
+  (compile_op, dummy_predict_op,) = tpu.split_compile_and_shard(
       multi_tpu_predict_steps_on_single_shard,
       inputs=[],
       num_shards=ctx.num_replicas,
       outputs_from_all_shards=False,
       device_assignment=ctx.device_assignment)
 
+  dummy_predict_op = dummy_predict_op[0]
   scaffold = _get_scaffold(captured_scaffold_fn)
-  return dummy_predict_op, host_calls, scaffold, captured_predict_hooks.get()
+  return (compile_op, dummy_predict_op, host_calls, scaffold,
+          captured_predict_hooks.get())
 
 
 def _wrap_computation_in_while_loop(device, op_fn):
@@ -3081,7 +3114,7 @@ class _Inputs(object):
 
     The initializer must be run before calling `features_and_labels`.
     """
-    self._iterator = self._dataset.make_initializable_iterator()
+    self._iterator = dataset_ops.make_initializable_iterator(self._dataset)
     return self._iterator.initializer
 
   def features_and_labels(self):
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator_signals_test.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator_signals_test.py
index 3786e52b949dfac8c1587d1ea3041b625f00183f..e3ea983abfd24d03c964fbc647b56262e15e0a96 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator_signals_test.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator_signals_test.py
@@ -21,8 +21,8 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.contrib.tpu.python.tpu import tpu_estimator
-from tensorflow.python import data as dataset_lib
 from tensorflow.python.client import session
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import test
@@ -34,10 +34,10 @@ def make_input_fn(num_samples):
 
   def input_fn(params):
     batch_size = params['batch_size']
-    da1 = dataset_lib.Dataset.from_tensor_slices(a)
-    da2 = dataset_lib.Dataset.from_tensor_slices(b)
+    da1 = dataset_ops.Dataset.from_tensor_slices(a)
+    da2 = dataset_ops.Dataset.from_tensor_slices(b)
 
-    dataset = dataset_lib.Dataset.zip((da1, da2))
+    dataset = dataset_ops.Dataset.zip((da1, da2))
     dataset = dataset.map(lambda fa, fb: {'a': fa, 'b': fb})
     dataset = dataset.batch(batch_size)
     return dataset
@@ -50,10 +50,10 @@ def make_input_fn_with_labels(num_samples):
 
   def input_fn(params):
     batch_size = params['batch_size']
-    da1 = dataset_lib.Dataset.from_tensor_slices(a)
-    da2 = dataset_lib.Dataset.from_tensor_slices(b)
+    da1 = dataset_ops.Dataset.from_tensor_slices(a)
+    da2 = dataset_ops.Dataset.from_tensor_slices(b)
 
-    dataset = dataset_lib.Dataset.zip((da1, da2))
+    dataset = dataset_ops.Dataset.zip((da1, da2))
     dataset = dataset.map(lambda fa, fb: ({'a': fa}, fb))
     dataset = dataset.batch(batch_size)
     return dataset
@@ -71,7 +71,7 @@ class TPUEstimatorStoppingSignalsTest(test.TestCase):
 
     with ops.Graph().as_default():
       dataset = input_fn(params)
-      features = dataset.make_one_shot_iterator().get_next()
+      features = dataset_ops.make_one_shot_iterator(dataset).get_next()
 
       # With tf.data.Dataset.batch, the batch is None, i.e., dynamic shape.
       self.assertIsNone(features['a'].shape.as_list()[0])
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py b/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py
index ec682e5829c4df536a043334b74200f0b6259df3..d66ecfcf4a56b8da1c2d2f518bebe4baa76b315e 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py
@@ -52,6 +52,7 @@ def _query_tpu_system_metadata(master_address, cluster_def=None,
   devices = []
   device_dict = collections.defaultdict(list)
 
+  # TODO(b/120564445): Replace with standard library for retries.
   retry_count = 1
   while True:
     logging.info('Querying Tensorflow master (%s) for TPU system metadata.',
diff --git a/tensorflow/contrib/tpu/tpu_estimator.md b/tensorflow/contrib/tpu/tpu_estimator.md
index b6514e19dc92fe4c7cdcdb6582a7c0ad5ad573d5..552febd80bd35b37a95cdaaf8d5923278311ac8e 100644
--- a/tensorflow/contrib/tpu/tpu_estimator.md
+++ b/tensorflow/contrib/tpu/tpu_estimator.md
@@ -89,12 +89,9 @@ handle training:
 
         dataset = tf.data.TFRecordDataset(
             filename, buffer_size=FLAGS.dataset_reader_buffer_size)
-        dataset = dataset.map(parser).cache().repeat().batch(batch_size)
-        images, labels = dataset.make_one_shot_iterator().get_next()
-        # set_shape to give inputs statically known shapes.
-        images.set_shape([batch_size, 28 * 28])
-        labels.set_shape([batch_size])
-        return images, labels
+        dataset = dataset.map(parser).cache().repeat().batch(
+            batch_size, drop_remainder=True)
+        return dataset
       return input_fn
 
 
diff --git a/tensorflow/contrib/training/BUILD b/tensorflow/contrib/training/BUILD
index 00295f57f60858db5234ce28cc643ea9eee44daa..f6427ae05a20f253edf030eff0f860361616042b 100644
--- a/tensorflow/contrib/training/BUILD
+++ b/tensorflow/contrib/training/BUILD
@@ -26,7 +26,6 @@ py_library(
         "python/training/resample.py",
         "python/training/sampling_ops.py",
         "python/training/sequence_queueing_state_saver.py",
-        "python/training/tensor_queue_dataset.py",
         "python/training/training.py",
         "python/training/tuner.py",
     ],
@@ -287,28 +286,6 @@ py_test(
     ],
 )
 
-py_test(
-    name = "tensor_queue_dataset_test",
-    size = "large",
-    srcs = ["python/training/tensor_queue_dataset_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],
-    deps = [
-        ":training_py",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:gradients",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:random_seed",
-        "//tensorflow/python:training",
-        "//tensorflow/python:variables",
-        "//tensorflow/python/data",
-        "//tensorflow/python/data/experimental/kernel_tests/serialization:dataset_serialization_test_base",
-        "//third_party/py/numpy",
-    ],
-)
-
 tf_proto_library(
     name = "protos_all",
     srcs = glob(["**/*.proto"]),
diff --git a/tensorflow/contrib/training/__init__.py b/tensorflow/contrib/training/__init__.py
index 3547e71184ec2b99163ea4247c01d24487811b47..87ce57ef060a0eb9383248255713421c14988416 100644
--- a/tensorflow/contrib/training/__init__.py
+++ b/tensorflow/contrib/training/__init__.py
@@ -59,8 +59,6 @@ from tensorflow.contrib.training.python.training.hparam import *
 from tensorflow.contrib.training.python.training.resample import *
 from tensorflow.contrib.training.python.training.sampling_ops import *
 from tensorflow.contrib.training.python.training.sequence_queueing_state_saver import *
-from tensorflow.contrib.training.python.training.tensor_queue_dataset import enqueue_in_queue_dataset
-from tensorflow.contrib.training.python.training.tensor_queue_dataset import prepend_from_queue_and_padded_batch_dataset
 from tensorflow.contrib.training.python.training.training import add_gradients_summaries
 from tensorflow.contrib.training.python.training.training import clip_gradient_norms
 from tensorflow.contrib.training.python.training.training import clip_gradient_norms_fn
@@ -79,7 +77,6 @@ _allowed_symbols = [
     'FeedingQueueRunner', 'get_or_create_eval_step', 'StopAfterNEvalsHook',
     'SummaryAtEndHook', 'wait_for_new_checkpoint', 'add_gradients_summaries',
     'clip_gradient_norms', 'clip_gradient_norms_fn', 'create_train_op',
-    'multiply_gradients', 'enqueue_in_queue_dataset',
-    'prepend_from_queue_and_padded_batch_dataset', 'train']
+    'multiply_gradients', 'train']
 
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/training/python/training/hparam.py b/tensorflow/contrib/training/python/training/hparam.py
index 3beb7bfe3048a8f0294f7e9149b5a07b5fcc7d17..bcc177601b95172b05d327247bd370c2f8b65d59 100644
--- a/tensorflow/contrib/training/python/training/hparam.py
+++ b/tensorflow/contrib/training/python/training/hparam.py
@@ -187,7 +187,7 @@ def _cast_to_type_if_compatible(name, param_type, value):
   return param_type(value)
 
 
-def parse_values(values, type_map):
+def parse_values(values, type_map, ignore_unknown=False):
   """Parses hyperparameter values from a string into a python map.
 
   `values` is a string containing comma-separated `name=value` pairs.
@@ -233,6 +233,9 @@ def parse_values(values, type_map):
       type T if either V has type T, or V is a list of elements of type T.
       Hence, for a multidimensional parameter 'x' taking float values,
       'x=[0.1,0.2]' will parse successfully if type_map['x'] = float.
+    ignore_unknown: Bool. Whether values that are missing a type in type_map
+      should be ignored. If set to True, a ValueError will not be raised for
+      unknown hyperparameter type.
 
   Returns:
     A python map mapping each name to either:
@@ -260,6 +263,8 @@ def parse_values(values, type_map):
     m_dict = m.groupdict()
     name = m_dict['name']
     if name not in type_map:
+      if ignore_unknown:
+        continue
       raise ValueError('Unknown hyperparameter type for %s' % name)
     type_ = type_map[name]
 
diff --git a/tensorflow/contrib/training/python/training/hparam_test.py b/tensorflow/contrib/training/python/training/hparam_test.py
index 660c97f25e8458c345c8914bcaf98f37d047e50e..a990e04711ce68bd928a508484f0d6f657dd2f8c 100644
--- a/tensorflow/contrib/training/python/training/hparam_test.py
+++ b/tensorflow/contrib/training/python/training/hparam_test.py
@@ -216,6 +216,14 @@ class HParamsTest(test.TestCase):
     self.assertTrue(isinstance(parse_dict['arr'], dict))
     self.assertDictEqual(parse_dict['arr'], {1: 10})
 
+  def testParseValuesWithIndexAssigment1_IgnoreUnknown(self):
+    """Assignment to an index position."""
+    parse_dict = hparam.parse_values(
+        'arr[1]=10,b=5', {'arr': int}, ignore_unknown=True)
+    self.assertEqual(len(parse_dict), 1)
+    self.assertTrue(isinstance(parse_dict['arr'], dict))
+    self.assertDictEqual(parse_dict['arr'], {1: 10})
+
   def testParseValuesWithIndexAssigment2(self):
     """Assignment to multiple index positions."""
     parse_dict = hparam.parse_values('arr[0]=10,arr[5]=20', {'arr': int})
@@ -223,6 +231,14 @@ class HParamsTest(test.TestCase):
     self.assertTrue(isinstance(parse_dict['arr'], dict))
     self.assertDictEqual(parse_dict['arr'], {0: 10, 5: 20})
 
+  def testParseValuesWithIndexAssigment2_IgnoreUnknown(self):
+    """Assignment to multiple index positions."""
+    parse_dict = hparam.parse_values(
+        'arr[0]=10,arr[5]=20,foo=bar', {'arr': int}, ignore_unknown=True)
+    self.assertEqual(len(parse_dict), 1)
+    self.assertTrue(isinstance(parse_dict['arr'], dict))
+    self.assertDictEqual(parse_dict['arr'], {0: 10, 5: 20})
+
   def testParseValuesWithIndexAssigment3(self):
     """Assignment to index positions in multiple names."""
     parse_dict = hparam.parse_values('arr[0]=10,arr[1]=20,L[5]=100,L[10]=200',
@@ -234,6 +250,17 @@ class HParamsTest(test.TestCase):
     self.assertTrue(isinstance(parse_dict['L'], dict))
     self.assertDictEqual(parse_dict['L'], {5: 100, 10: 200})
 
+  def testParseValuesWithIndexAssigment3_IgnoreUnknown(self):
+    """Assignment to index positions in multiple names."""
+    parse_dict = hparam.parse_values(
+        'arr[0]=10,C=5,arr[1]=20,B[0]=kkk,L[5]=100,L[10]=200',
+        {'arr': int, 'L': int}, ignore_unknown=True)
+    self.assertEqual(len(parse_dict), 2)
+    self.assertTrue(isinstance(parse_dict['arr'], dict))
+    self.assertDictEqual(parse_dict['arr'], {0: 10, 1: 20})
+    self.assertTrue(isinstance(parse_dict['L'], dict))
+    self.assertDictEqual(parse_dict['L'], {5: 100, 10: 200})
+
   def testParseValuesWithIndexAssigment4(self):
     """Assignment of index positions and scalars."""
     parse_dict = hparam.parse_values('x=10,arr[1]=20,y=30',
@@ -246,6 +273,17 @@ class HParamsTest(test.TestCase):
     self.assertEqual(parse_dict['x'], 10)
     self.assertEqual(parse_dict['y'], 30)
 
+  def testParseValuesWithIndexAssigment4_IgnoreUnknown(self):
+    """Assignment of index positions and scalars."""
+    parse_dict = hparam.parse_values(
+        'x=10,foo[0]=bar,arr[1]=20,zzz=78,y=30',
+        {'x': int, 'y': int, 'arr': int}, ignore_unknown=True)
+    self.assertEqual(len(parse_dict), 3)
+    self.assertTrue(isinstance(parse_dict['arr'], dict))
+    self.assertDictEqual(parse_dict['arr'], {1: 20})
+    self.assertEqual(parse_dict['x'], 10)
+    self.assertEqual(parse_dict['y'], 30)
+
   def testParseValuesWithIndexAssigment5(self):
     """Different variable types."""
     parse_dict = hparam.parse_values('a[0]=5,b[1]=true,c[2]=abc,d[3]=3.14', {
@@ -264,24 +302,55 @@ class HParamsTest(test.TestCase):
     self.assertTrue(isinstance(parse_dict['d'], dict))
     self.assertDictEqual(parse_dict['d'], {3: 3.14})
 
+  def testParseValuesWithIndexAssigment5_IgnoreUnknown(self):
+    """Different variable types."""
+    parse_dict = hparam.parse_values(
+        'a[0]=5,cc=4,b[1]=true,c[2]=abc,mm=2,d[3]=3.14',
+        {'a': int, 'b': bool, 'c': str, 'd': float},
+        ignore_unknown=True)
+    self.assertEqual(set(parse_dict.keys()), {'a', 'b', 'c', 'd'})
+    self.assertTrue(isinstance(parse_dict['a'], dict))
+    self.assertDictEqual(parse_dict['a'], {0: 5})
+    self.assertTrue(isinstance(parse_dict['b'], dict))
+    self.assertDictEqual(parse_dict['b'], {1: True})
+    self.assertTrue(isinstance(parse_dict['c'], dict))
+    self.assertDictEqual(parse_dict['c'], {2: 'abc'})
+    self.assertTrue(isinstance(parse_dict['d'], dict))
+    self.assertDictEqual(parse_dict['d'], {3: 3.14})
+
   def testParseValuesWithBadIndexAssigment1(self):
     """Reject assignment of list to variable type."""
     with self.assertRaisesRegexp(ValueError,
                                  r'Assignment of a list to a list index.'):
       hparam.parse_values('arr[1]=[1,2,3]', {'arr': int})
 
+  def testParseValuesWithBadIndexAssigment1_IgnoreUnknown(self):
+    """Reject assignment of list to variable type."""
+    with self.assertRaisesRegexp(ValueError,
+                                 r'Assignment of a list to a list index.'):
+      hparam.parse_values(
+          'arr[1]=[1,2,3],c=8', {'arr': int}, ignore_unknown=True)
+
   def testParseValuesWithBadIndexAssigment2(self):
     """Reject if type missing."""
     with self.assertRaisesRegexp(ValueError,
                                  r'Unknown hyperparameter type for arr'):
       hparam.parse_values('arr[1]=5', {})
 
+  def testParseValuesWithBadIndexAssigment2_IgnoreUnknown(self):
+    """Ignore missing type."""
+    hparam.parse_values('arr[1]=5', {}, ignore_unknown=True)
+
   def testParseValuesWithBadIndexAssigment3(self):
     """Reject type of the form name[index]."""
     with self.assertRaisesRegexp(ValueError,
                                  'Unknown hyperparameter type for arr'):
       hparam.parse_values('arr[1]=1', {'arr[1]': int})
 
+  def testParseValuesWithBadIndexAssigment3_IgnoreUnknown(self):
+    """Ignore type of the form name[index]."""
+    hparam.parse_values('arr[1]=1', {'arr[1]': int}, ignore_unknown=True)
+
   def testWithReusedVariables(self):
     with self.assertRaisesRegexp(ValueError,
                                  'Multiple assignments to variable \'x\''):
diff --git a/tensorflow/contrib/training/python/training/tensor_queue_dataset.py b/tensorflow/contrib/training/python/training/tensor_queue_dataset.py
deleted file mode 100644
index 8896a95327a4cb609a9a78412afa68b316a3131e..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/training/python/training/tensor_queue_dataset.py
+++ /dev/null
@@ -1,201 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Python wrappers for Datasets and Iterators."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.util import convert
-from tensorflow.python.data.util import nest
-from tensorflow.python.data.util import sparse
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import gen_dataset_ops
-from tensorflow.python.util import nest as tf_nest
-
-
-class _PrependFromQueueAndPaddedBatchDataset(dataset_ops.UnaryDataset):
-  """A `Dataset` that prepends a queue to another `Dataset`.
-
-  A vector of handles to the queue is returned as the first component of
-  the associated iterator.  This vector can be passed to
-  `enqueue_in_queue_dataset` to add new elements to the queue.
-  """
-
-  def __init__(self, input_dataset, batch_size, padded_shapes, padding_values):
-    """Initialize `PrependFromQueueAndPaddedBatchDataset`."""
-    super(_PrependFromQueueAndPaddedBatchDataset, self).__init__(input_dataset)
-    if sparse.any_sparse(input_dataset.output_classes):
-      raise TypeError(
-          "Batching of padded sparse tensors is not currently supported")
-    self._input_dataset = input_dataset
-    self._batch_size = ops.convert_to_tensor(
-        batch_size, dtype=dtypes.int64, name="batch_size")
-    if padded_shapes is None:
-      self._padded_shapes = nest.map_structure(
-          convert.partial_shape_to_tensor, input_dataset.output_shapes)
-    else:
-      self._padded_shapes = nest.map_structure_up_to(
-          input_dataset.output_shapes, convert.partial_shape_to_tensor,
-          padded_shapes)
-    # pylint: disable=protected-access
-    padding_values = (
-        padding_values if padding_values is not None else
-        dataset_ops._default_padding(input_dataset))
-    self._padding_values = nest.map_structure_up_to(
-        input_dataset.output_shapes, dataset_ops._padding_value_to_tensor,
-        padding_values, input_dataset.output_types)
-    # pylint: enable=protected-access
-
-  def _as_variant_tensor(self):
-    # pylint: disable=protected-access
-    return gen_dataset_ops.prepend_from_queue_and_padded_batch_dataset(
-        self._input_dataset._as_variant_tensor(),
-        batch_size=self._batch_size,
-        padded_shapes=[
-            ops.convert_to_tensor(s, dtype=dtypes.int64)
-            for s in nest.flatten(self._padded_shapes)
-        ],
-        padding_values=nest.flatten(self._padding_values),
-        output_shapes=nest.flatten(
-            sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
-    # pylint: enable=protected-access
-
-  @property
-  def output_classes(self):
-    return (ops.Tensor, self._input_dataset.output_classes)
-
-  def _as_batch_shape(self, shape_like):
-    return tensor_shape.vector(None).concatenate(
-        tensor_util.constant_value_as_shape(shape_like))
-
-  @property
-  def output_shapes(self):
-    # First output is a variant representing the Queue
-    return (tensor_shape.vector(None),
-            nest.map_structure(self._as_batch_shape, self._padded_shapes))
-
-  @property
-  def output_types(self):
-    # First output is a variant representing the Queue
-    return (dtypes.variant, self._input_dataset.output_types)
-
-
-def prepend_from_queue_and_padded_batch_dataset(batch_size,
-                                                padding_values=None,
-                                                padded_shapes=None):
-  """A transformation that prepends a queue to a `Dataset` and batches results.
-
-  A vector of handles to the queue is returned as the first component of the
-  associated iterator.  This vector can be passed to `enqueue_in_queue_dataset`
-  to add new elements to the queue.
-
-  Below is an example of how this dataset might be used to split incoming
-  variable-length sequences into "head" and "rest" parts, where "rest" parts
-  are re-enqueued back into the dataset.  A more realistic example would
-  perform some calculation on the "head" and modify some components of "rest"
-  with the result (before re-enqueueing).
-
-  ```python
-  dataset = tf.data.Dataset.from_tensor_slices([2*x for x in range(10)])
-  # Make a dataset of variable-length vectors and their lengths.
-  dataset = dataset.map(lambda count: (count, tf.ones((count,))))
-  # Emit a queue we can prepend to, and counts/values as padded batch.
-  dataset = dataset.apply(
-      tf.contrib.training.prepend_from_queue_and_padded_batch_dataset(
-        batch_size=10))
-  dataset = dataset.prefetch(1)
-
-  iterator = dataset.make_one_shot_iterator()
-  queue, (count, padded_value) = iterator.get_next()
-
-  # Split the padded_value into two pieces: head and rest
-  rest_indices = tf.squeeze(tf.where(count > 3), axis=1)
-  bound = tf.minimum(3, tf.reduce_max(count))
-  value_head = padded_value[:, :bound]
-  count_rest = tf.gather(count - 3, rest_indices)
-  value_rest = tf.gather(padded_value[:, bound:], rest_indices)
-  queue_rest = tf.gather(queue, rest_indices)
-  enqueue_rest_op = tf.contrib.training.enqueue_in_queue_dataset(
-    queue_rest, (count_rest, value_rest))
-  with tf.control_dependencies([enqueue_rest_op]):
-    calculation = fn(value_head)
-
-  while True:  # Will raise OutOfRange when finished with all pieces.
-    session.run(calculation)
-  ```
-
-  Args:
-    batch_size: `int64` scalar tensor.  The batch size to use when performing
-      padded batching.
-    padding_values: (optional) Nested tuple of scalar tensors.  If provided,
-      the structure and dtypes of padding_values should match that of
-      incoming dataset's `output_types`.
-    padded_shapes: (optional) Nested tuple of `int64` vector tensors.
-      If provided, the structure must match that of the incoming dataset's
-      `output_types`.  If not provided, the incoming dataset's `output_shapes`
-      is used.  Any unknown (`None` or `-1`) dimensions in the shapes are
-      treated as being unique per-batch: for each batch time, an unknown
-      dimension is replaced with the maximum given value of this dimension
-      across all tensors for the given component in the batch.
-
-  Returns:
-    A `Dataset` transformation function, which can be passed to
-    `tf.data.Dataset.apply`.
-  """
-
-  def _apply_fn(dataset):
-    return _PrependFromQueueAndPaddedBatchDataset(
-        dataset,
-        batch_size=batch_size,
-        padding_values=padding_values,
-        padded_shapes=padded_shapes)
-
-  return _apply_fn
-
-
-def enqueue_in_queue_dataset(queue, components):
-  """Enqueue components into queue from `PrependFromQueueAndPaddedBatchDataset`.
-
-  The components' dtypes and shapes must be compatible with the `output_shapes`
-  attribute of the `dataset` created by
-  `prepend_from_queue_and_padded_batch_dataset`.  This operation supports both
-  non-batched and batched modes.
-
-  For more details, see the example in the docstring for
-  `prepend_from_queue_and_padded_batch_dataset`.
-
-  Args:
-    queue: `variant` scalar or vector tensor.
-      The tensor emitted by the first component of the iterator associated with
-      `prepend_from_queue_and_padded_batch_dataset`.  If this is a scalar,
-      then the `components` input tensors should not have a prepended batch
-      dimension.
-    components: Nested tuple of tensors, each with a leading batch dimension
-      if `queue` is a vector.  The structure, dtypes, and shapes
-      (excluding batch dimension) must match the nested tuples
-      `dataset.output_types[1]` and `dataset.output_shapes[1]` (the non-queue
-      output types and shapes) of the `dataset` emitted by
-      the original `prepend_from_queue_and_padded_batch_dataset` call.
-
-  Returns:
-    An `Operation` that enqueues `components` into the dataset(s) associated
-    with entries of `queue`.
-  """
-  return gen_dataset_ops.enqueue_in_queue_dataset(
-      queue=queue, components=tf_nest.flatten(components))
diff --git a/tensorflow/contrib/training/python/training/tensor_queue_dataset_test.py b/tensorflow/contrib/training/python/training/tensor_queue_dataset_test.py
deleted file mode 100644
index c1657fec7bbe4a3227c3ea273b72176ac4066c50..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/training/python/training/tensor_queue_dataset_test.py
+++ /dev/null
@@ -1,355 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for TensorQueueDataset."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.contrib.training.python.training import tensor_queue_dataset as tqd
-from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import string_ops
-from tensorflow.python.platform import test
-
-
-class PrependFromQueueAndPaddedBatchDatasetTest(test.TestCase):
-
-  def testNoEnqueue(self):
-    dataset = dataset_ops.Dataset.from_tensor_slices([0, 1, 2])
-    dataset = dataset.apply(
-        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1))
-    self.assertEqual((dtypes.variant, dtypes.int32), dataset.output_types)
-    self.assertAllEqual(([None],) * 2,
-                        [x.as_list() for x in dataset.output_shapes])
-    iterator = dataset.make_one_shot_iterator()
-    _, value = iterator.get_next()
-    self.assertEqual([0], self.evaluate(value))
-    self.assertEqual([1], self.evaluate(value))
-    self.assertEqual([2], self.evaluate(value))
-    with self.assertRaisesOpError("End of sequence"):
-      self.evaluate(value)
-
-  def testBatchedNoEnqueue(self):
-    dataset = dataset_ops.Dataset.from_tensor_slices([0, 1, 2])
-    dataset = dataset.apply(
-        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=2))
-    iterator = dataset.make_one_shot_iterator()
-    _, value = iterator.get_next()
-    self.assertAllEqual([0, 1], self.evaluate(value))
-    self.assertAllEqual([2], self.evaluate(value))
-    with self.assertRaisesOpError("End of sequence"):
-      self.evaluate(value)
-
-  def testBatchedWithBiggerPaddingNoEnqueue(self):
-    dataset = dataset_ops.Dataset.from_tensor_slices([[0], [1], [2]])
-    dataset = dataset.apply(
-        tqd.prepend_from_queue_and_padded_batch_dataset(
-            batch_size=2, padded_shapes=[3]))
-    iterator = dataset.make_one_shot_iterator()
-    _, value = iterator.get_next()
-    self.assertAllEqual([[0, 0, 0], [1, 0, 0]], self.evaluate(value))
-    self.assertAllEqual([[2, 0, 0]], self.evaluate(value))
-    with self.assertRaisesOpError("End of sequence"):
-      self.evaluate(value)
-
-  def testBatchedWithBiggerPaddingOneEnqueue(self):
-    dataset = dataset_ops.Dataset.from_tensor_slices([[0], [1], [2]])
-    dataset = dataset.apply(
-        tqd.prepend_from_queue_and_padded_batch_dataset(
-            batch_size=1, padded_shapes=[3]))
-    iterator = dataset.make_one_shot_iterator()
-    queue_handle, value = iterator.get_next()
-    enqueue_negative = tqd.enqueue_in_queue_dataset(queue_handle, -value)
-    with self.cached_session() as sess:
-      self.assertAllEqual([[0, 0, 0]], sess.run(value))
-      value_1, _ = sess.run([value, enqueue_negative])
-      self.assertAllEqual([[1, 0, 0]], value_1)
-      value_2, _ = sess.run([value, enqueue_negative])
-      self.assertAllEqual([[-1, 0, 0]], value_2)
-      value_3 = sess.run(value)
-      self.assertAllEqual([[1, 0, 0]], value_3)
-      value_4, _ = sess.run([value, enqueue_negative])
-      self.assertAllEqual([[2, 0, 0]], value_4)
-      value_5 = sess.run(value)
-      self.assertAllEqual([[-2, 0, 0]], value_5)
-      with self.assertRaisesOpError("End of sequence"):
-        sess.run(value)
-
-  def testOneEnqueue(self):
-    dataset = dataset_ops.Dataset.from_tensor_slices([0, 1, 2])
-    dataset = dataset.apply(
-        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1))
-    iterator = dataset.make_one_shot_iterator()
-    queue_handle, value = iterator.get_next()
-    enqueue_negative = tqd.enqueue_in_queue_dataset(queue_handle, -value)
-    with self.cached_session() as sess:
-      self.assertEqual([0], sess.run(value))
-      value_1, _ = sess.run([value, enqueue_negative])
-      self.assertEqual([1], value_1)
-      value_2, _ = sess.run([value, enqueue_negative])
-      self.assertEqual([-1], value_2)
-      value_3 = sess.run(value)
-      self.assertEqual([1], value_3)
-      value_4, _ = sess.run([value, enqueue_negative])
-      self.assertEqual([2], value_4)
-      value_5 = sess.run(value)
-      self.assertEqual([-2], value_5)
-      with self.assertRaisesOpError("End of sequence"):
-        sess.run(value)
-
-  def testBatchedOneEnqueue(self):
-    dataset = dataset_ops.Dataset.from_tensor_slices([0, 1, 2])
-    dataset = dataset.apply(
-        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=2))
-    iterator = dataset.make_one_shot_iterator()
-    queue_handle, value = iterator.get_next()
-    enqueue_negative = tqd.enqueue_in_queue_dataset(queue_handle, -value)
-    enqueue_zeroth = tqd.enqueue_in_queue_dataset([queue_handle[0]],
-                                                  array_ops.expand_dims(
-                                                      value[0], axis=0))
-    with self.cached_session() as sess:
-      value_0, _ = sess.run([value, enqueue_negative])
-      self.assertAllEqual([0, 1], value_0)
-      value_1, _ = sess.run([value, enqueue_zeroth])
-      self.assertAllEqual([0, -1], value_1)
-      value_2, _ = sess.run([value, enqueue_negative])
-      self.assertAllEqual([0, 2], value_2)
-      self.assertAllEqual([0, -2], sess.run(value))
-      with self.assertRaisesOpError("End of sequence"):
-        sess.run(value)
-
-  def testManyEnqueue(self):
-    dataset = dataset_ops.Dataset.from_tensor_slices([0, 1])
-    dataset = dataset.apply(
-        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1))
-    iterator = dataset.make_one_shot_iterator()
-    queue_handle, value = iterator.get_next()
-    enqueue_many_more = [
-        tqd.enqueue_in_queue_dataset(queue_handle, value + 100 + i)
-        for i in range(1000)
-    ]
-    with self.cached_session() as sess:
-      value_0, _ = sess.run((value, enqueue_many_more))
-      self.assertEqual([0], value_0)
-      rest = []
-      for _ in range(1000):
-        rest.append(sess.run(value))
-      self.assertEquals([[100 + i] for i in range(1000)], sorted(rest))
-      # Going back to the original input.
-      value_1, _ = sess.run((value, enqueue_many_more))
-      self.assertEqual(1, value_1)
-      rest = []
-      for _ in range(1000):
-        rest.append(sess.run(value))
-      self.assertEquals([[100 + i + 1] for i in range(1000)], sorted(rest))
-      with self.assertRaisesOpError("End of sequence"):
-        sess.run(value)
-
-  def testEnqueueWithPrefetch(self):
-    dataset = dataset_ops.Dataset.from_tensor_slices([0])
-    dataset = dataset.apply(
-        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1))
-    # Prefetching will request additional values before they are
-    # available to the queue.
-    dataset = dataset.prefetch(buffer_size=3)
-    iterator = dataset.make_one_shot_iterator()
-    queue_handle, value = iterator.get_next()
-    enqueue = tqd.enqueue_in_queue_dataset(queue_handle, value + 1)
-    with self.cached_session() as sess:
-      i = 0
-      while i < 4:
-        received, _ = sess.run((value, enqueue))
-        if received.size > 0:
-          self.assertAllEqual([i], received)
-          i += 1
-      received_last = False
-      while True:
-        try:
-          received = sess.run(value)
-          if received.size > 0:
-            self.assertAllEqual([4], received)
-            received_last = True
-        except errors.OutOfRangeError:
-          break
-      self.assertTrue(received_last)
-
-  def testDatasetWithPaddedShapeSmallerThanInputFails(self):
-    dataset = dataset_ops.Dataset.from_tensor_slices([[0, 0, 0]]).repeat(None)
-    dataset = dataset.apply(
-        tqd.prepend_from_queue_and_padded_batch_dataset(
-            batch_size=1, padded_shapes=[2]))
-    iterator = dataset.make_one_shot_iterator()
-    _, value = iterator.get_next()
-    with self.cached_session() as sess:
-      with self.assertRaisesOpError(
-          r"Incompatible input shapes at component 0 between "
-          r"input dataset this dataset: \[3\] vs. \[2\]"):
-        sess.run(value)
-
-  def testEnqueueWithIncompatibleInputsFailsWithInformativeError(self):
-    dataset = dataset_ops.Dataset.from_tensor_slices([0]).repeat(None)
-    dataset = dataset.apply(
-        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1))
-    iterator = dataset.make_one_shot_iterator()
-    queue_handle, value = iterator.get_next()
-
-    enqueue_bad_structure = tqd.enqueue_in_queue_dataset(
-        queue_handle, (value, value))
-    enqueue_bad_dtype = tqd.enqueue_in_queue_dataset(queue_handle,
-                                                     np.array(
-                                                         [1.0],
-                                                         dtype=np.float32))
-    enqueue_bad_shape_no_batch_dim = tqd.enqueue_in_queue_dataset(
-        queue_handle, ([1],))
-    enqueue_bad_shape = tqd.enqueue_in_queue_dataset(queue_handle,
-                                                     np.array(
-                                                         [[1]], dtype=np.int32))
-
-    with self.cached_session() as sess:
-      with self.assertRaisesOpError(
-          "mismatched number of tensors.  Queue expects 1 tensors but "
-          "tried to insert 2"):
-        sess.run(enqueue_bad_structure)
-      with self.assertRaisesOpError(r"Expected component 0 to have batched "
-                                    r"shape \[1,...\], but saw shape: \[\]"):
-        sess.run(enqueue_bad_shape_no_batch_dim)
-      with self.assertRaisesOpError(
-          r"mismatched shapes at component 0.  Attempted to insert tensor "
-          r"with shape \[1\] but queue expected shape: \[\]"):
-        sess.run(enqueue_bad_shape)
-      with self.assertRaisesOpError(
-          r"mismatched dtypes at component 0.  Attempted to insert tensor "
-          r"of type float but queue expected type: int32"):
-        sess.run(enqueue_bad_dtype)
-
-  def testEnqueueWithPaddedBatchFailsWithInformativeError(self):
-    dataset = dataset_ops.Dataset.from_tensor_slices([0, 1, 2])
-    dataset = dataset.apply(
-        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1))
-    with self.assertRaisesRegexp(
-        TypeError, r"Unable to create padding for field of type 'variant'"):
-      dataset.padded_batch(batch_size=10, padded_shapes=[1])
-
-  def testOneEnqueueWithPadding(self):
-    dataset = dataset_ops.Dataset.from_tensor_slices([0, 2, 4, 6])
-    # Make a dataset of variable-length vectors and their lengths.
-    dataset = dataset.map(
-        lambda c: (c, c * array_ops.ones((c,), dtype=c.dtype)))
-    # Emit a queue we can prepend to, and counts/values as padded
-    # batch.
-    dataset = dataset.apply(
-        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=3))
-
-    iterator = dataset.make_one_shot_iterator()
-    queue, (count, padded_value) = iterator.get_next()
-
-    # Split the padded_value into two pieces: head and rest
-    rest_indices = array_ops.squeeze(array_ops.where(count > 2), axis=1)
-    bound = math_ops.minimum(2, math_ops.reduce_max(count))
-    value_head = padded_value[:, :bound]
-    count_rest = array_ops.gather(count - 2, rest_indices)
-    value_rest = array_ops.gather(padded_value, rest_indices)[:, bound:]
-    queue_rest = array_ops.gather(queue, rest_indices)
-    enqueue_rest_op = tqd.enqueue_in_queue_dataset(queue_rest,
-                                                   (count_rest, value_rest))
-    with ops.control_dependencies([enqueue_rest_op]):
-      calc = array_ops.identity(value_head)
-
-    with self.cached_session() as sess:
-      self.assertAllEqual([[0, 0], [2, 2], [4, 4]], sess.run(calc))
-      self.assertAllEqual([[4, 4], [6, 6]], sess.run(calc))
-      self.assertAllEqual([[6, 6]], sess.run(calc))
-      self.assertAllEqual([[6, 6]], sess.run(calc))
-      # Get some final batches due to prefetching.
-      for _ in range(3):
-        try:
-          self.assertAllEqual(
-              np.empty(shape=(0, 0), dtype=np.int32), sess.run(calc))
-        except errors.OutOfRangeError as e:
-          self.assertTrue(str(e).startswith("End of sequence"))
-
-  def testNonstandardPadding(self):
-    dataset = dataset_ops.Dataset.from_tensor_slices([0, 2, 4, 6])
-    # Make a dataset of variable-length vectors and their lengths.
-    dataset = dataset.map(
-        lambda c: (c, c * array_ops.ones((c,), dtype=c.dtype)))
-    # Emit a queue we can prepend to, and counts/values as padded
-    # batch.
-    dataset = dataset.apply(
-        tqd.prepend_from_queue_and_padded_batch_dataset(
-            batch_size=3, padding_values=(
-                0,
-                -1,
-            )))
-
-    iterator = dataset.make_one_shot_iterator()
-    _, (unused_count, padded_value) = iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.assertAllEqual([[-1, -1, -1, -1], [2, 2, -1, -1], [4, 4, 4, 4]],
-                          sess.run(padded_value))
-      self.assertAllEqual([[6] * 6], sess.run(padded_value))
-      with self.assertRaisesOpError("End of sequence"):
-        sess.run(padded_value)
-
-
-# TODO(ebrevdo): Figure out how to use run_core_tests to test state
-# saving of an iterator that's had some tensors enqueued into its queue.
-class PrependFromQueueAndPaddedBatchDatasetSerializationTest(
-    dataset_serialization_test_base.DatasetSerializationTestBase):
-
-  def testPrependFromQueueAndPaddedBatch(self):
-
-    def build_dataset(seq_lens):
-      return dataset_ops.Dataset.from_tensor_slices(seq_lens).map(
-          lambda x: array_ops.fill([x], x)).apply(
-              tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=4))
-
-    seq_lens1 = np.random.randint(1, 20, size=(32,)).astype(np.int32)
-    seq_lens2 = np.random.randint(21, 40, size=(32,)).astype(np.int32)
-    self.run_core_tests(lambda: build_dataset(seq_lens1),
-                        lambda: build_dataset(seq_lens2), 8)
-
-  def testPrependFromQueueAndPaddedBatchNonDefaultPadding(self):
-
-    def build_dataset(seq_lens):
-
-      def fill_tuple(x):
-        filled = array_ops.fill([x], x)
-        return (filled, string_ops.as_string(filled))
-
-      padded_shape = [-1]
-      return dataset_ops.Dataset.from_tensor_slices(seq_lens).map(
-          fill_tuple).apply(
-              tqd.prepend_from_queue_and_padded_batch_dataset(
-                  batch_size=4,
-                  padded_shapes=(padded_shape, padded_shape),
-                  padding_values=(-1, "<end>")))
-
-    seq_lens1 = np.random.randint(1, 20, size=(32,)).astype(np.int32)
-    seq_lens2 = np.random.randint(21, 40, size=(32,)).astype(np.int32)
-    self.run_core_tests(lambda: build_dataset(seq_lens1),
-                        lambda: build_dataset(seq_lens2), 8)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index c268605711fb73f37773ce7b4181bf17f2a3a4fa..8bf1480d33b2d2117fb5c7ddf046262cfeb8a8ab 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -49,7 +49,7 @@
 # filegroup ":android_proto_srcs" - Protos
 # filegroup ":android_srcs" - Core sources
 # cc_library ":android_tensorflow_lib" - Native library
-# cc_library ":android_tensorflow_lib_selective_registration" - Native library
+# cc_library ":android_tensorflow_lib_lite" - Native library, without ops,
 #   supporting SELECTIVE_REGISTRATION feature.
 # portable_proto_library ":android_proto_lib" (Google-internal)
 #
@@ -113,7 +113,6 @@ load(
     "tf_additional_device_tracer_test_flags",
     "tf_additional_gdr_lib_defines",
     "tf_additional_human_readable_json_deps",
-    "tf_additional_logger_deps",
     "tf_additional_lib_defines",
     "tf_additional_lib_deps",
     "tf_additional_lib_hdrs",
@@ -446,15 +445,31 @@ cc_library(
 )
 
 cc_library(
-    name = "logger",
-    srcs = tf_platform_srcs(["logger.cc"]),
-    hdrs = ["platform/logger.h"] + tf_platform_hdrs(["logger.h"]),
+    name = "logger_interface",
+    hdrs = ["platform/logger.h"],
     copts = tf_copts(),
     visibility = ["//visibility:public"],
     deps = [
-        ":lib",
-        ":lib_internal",
-    ] + tf_additional_logger_deps(),
+        ":lib_proto_parsing",
+        "@protobuf_archive//:protobuf",
+    ],
+)
+
+cc_library(
+    name = "default_logger",
+    srcs = ["platform/default/logger.cc"],
+    hdrs = ["platform/logger.h"],
+    deps = [
+        "//tensorflow/core:lib_proto_parsing",
+        "//tensorflow/core:logger_interface",
+    ],
+)
+
+cc_library(
+    name = "logger",
+    hdrs = ["platform/logger.h"],
+    visibility = ["//visibility:public"],
+    deps = ["//tensorflow/core/platform/default/build_config:logger"],
 )
 
 filegroup(
@@ -492,7 +507,10 @@ cc_library(
         ":platform_env_internal_hdrs",
     ],
     copts = tf_copts(),
-    visibility = ["//tensorflow/core:__subpackages__"],
+    visibility = [
+        "//tensorflow/c:__subpackages__",
+        "//tensorflow/core:__subpackages__",
+    ],
     deps = [
         ":error_codes_proto_cc",
         ":lib",
@@ -1608,6 +1626,9 @@ filegroup(
             "**/*main.cc",
             "debug/**/*",
             "framework/op_gen_*",
+            "framework/node_def_util.*",
+            "framework/op_kernel.*",
+            "framework/dataset.*",
             "lib/jpeg/**/*",
             "lib/png/**/*",
             "lib/gif/**/*",
@@ -1616,7 +1637,6 @@ filegroup(
             "util/reporter.*",
             "platform/**/cuda_libdevice_path.*",
             "platform/**/logger.cc",
-            "platform/**/logger.h",
             "platform/default/test_benchmark.*",
             "platform/cuda.h",
             "platform/google/**/*",
@@ -1651,6 +1671,9 @@ filegroup(
             "common_runtime/**/*.cc",
             "graph/**/*.h",
             "graph/**/*.cc",
+            "framework/node_def_util.*",
+            "framework/op_kernel.*",
+            "framework/dataset.*",
         ],
         exclude = [
             "**/*test.*",
@@ -1679,6 +1702,9 @@ filegroup(
 # operators, use :android_tensorflow_lib if you want full operator
 # support.
 #
+# If you just need TensorFlow types, e.g. Tensors, use
+# :android_tensorflow_lib_lite_no_runtime.
+#
 # Compiles to a trivial library on non-Android to prevent irrelevant
 # build errors. If not building this as part of an android_binary,
 # a command such as the following must be used:
@@ -1689,7 +1715,33 @@ filegroup(
 cc_library(
     name = "android_tensorflow_lib_lite",
     srcs = if_android(["//tensorflow/core:android_srcs"]),
-    copts = tf_copts(android_optimization_level_override = None),
+    copts = tf_copts(android_optimization_level_override = None) + [
+        "-DSUPPORT_SELECTIVE_REGISTRATION",
+    ],
+    linkopts = ["-lz"],
+    tags = [
+        "manual",
+        "notap",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":mobile_additional_lib_deps",
+        ":protos_all_cc_impl",
+        ":stats_calculator_portable",
+        "//third_party/eigen3",
+        "@double_conversion//:double-conversion",
+        "@nsync//:nsync_cpp",
+        "@protobuf_archive//:protobuf",
+    ],
+    alwayslink = 1,
+)
+
+cc_library(
+    name = "android_tensorflow_lib_lite_nortti",
+    srcs = if_android(["//tensorflow/core:android_srcs"]),
+    copts = tf_copts(android_optimization_level_override = None) + [
+        "-DSUPPORT_SELECTIVE_REGISTRATION",
+    ] + tf_opts_nortti_if_android(),
     linkopts = ["-lz"],
     tags = [
         "manual",
@@ -1797,58 +1849,6 @@ cc_library(
     alwayslink = 1,
 )
 
-# Android library for use with the SELECTIVE_REGISTRATION feature.
-# Does not contain operators. In contrast to android_tensorflow_lib_lite,
-# this links in framework support for all types, relying on selective
-# registration of ops to prune code size.
-cc_library(
-    name = "android_tensorflow_lib_selective_registration",
-    srcs = if_android(["//tensorflow/core:android_srcs"]),
-    copts = tf_copts(android_optimization_level_override = None) + [
-        "-DSUPPORT_SELECTIVE_REGISTRATION",
-    ],
-    linkopts = if_android(["-lz"]),
-    tags = [
-        "manual",
-        "notap",
-    ],
-    visibility = ["//visibility:public"],
-    deps = [
-        ":protos_all_cc_impl",
-        "//third_party/eigen3",
-        "@com_google_absl//absl/container:flat_hash_set",
-        "@double_conversion//:double-conversion",
-        "@nsync//:nsync_cpp",
-        "@protobuf_archive//:protobuf",
-    ],
-    alwayslink = 1,
-)
-
-# Android library for use with the SELECTIVE_REGISTRATION feature with
-# no proto_rtti.
-cc_library(
-    name = "android_tensorflow_lib_selective_registration_nortti",
-    srcs = if_android(["//tensorflow/core:android_srcs"]),
-    copts = tf_copts(android_optimization_level_override = None) + tf_opts_nortti_if_android() + [
-        "-DSUPPORT_SELECTIVE_REGISTRATION",
-    ],
-    linkopts = if_android(["-lz"]),
-    tags = [
-        "manual",
-        "notap",
-    ],
-    visibility = ["//visibility:public"],
-    deps = [
-        ":protos_all_cc_impl",
-        "//third_party/eigen3",
-        "@com_google_absl//absl/container:flat_hash_set",
-        "@double_conversion//:double-conversion",
-        "@nsync//:nsync_cpp",
-        "@protobuf_archive//:protobuf",
-    ],
-    alwayslink = 1,
-)
-
 filegroup(
     name = "android_op_registrations_and_gradients",
     srcs = glob(
@@ -2087,9 +2087,7 @@ tf_proto_library_cc(
     srcs = ["protobuf/master.proto"],
     cc_api_version = 2,
     protodeps = tf_additional_all_protos(),
-    visibility = [
-        "//tensorflow:internal",
-    ],
+    visibility = ["//tensorflow:internal"],
 )
 
 tf_proto_library_cc(
@@ -4060,20 +4058,6 @@ tf_cuda_cc_test(
     ],
 )
 
-tf_cc_test_gpu(
-    name = "cuda_libdevice_path_test",
-    size = "small",
-    srcs = ["platform/cuda_libdevice_path_test.cc"],
-    linkstatic = tf_kernel_tests_linkstatic(),
-    tags = tf_cuda_tests_tags(),
-    deps = [
-        ":cuda_libdevice_path",
-        ":lib",
-        ":test",
-        ":test_main",
-    ],
-)
-
 tf_cuda_only_cc_test(
     name = "util_cuda_kernel_helper_test",
     srcs = [
@@ -4929,7 +4913,7 @@ filegroup(
 
 cc_library(
     name = "cuda_libdevice_path",
-    srcs = ["platform/cuda_libdevice_path.cc"] + tf_additional_libdevice_srcs(),
+    srcs = tf_additional_libdevice_srcs(),
     hdrs = ["platform/cuda_libdevice_path.h"],
     copts = tf_copts(),
     data = tf_additional_libdevice_data(),
diff --git a/tensorflow/core/api_def/api_test.cc b/tensorflow/core/api_def/api_test.cc
index d38a8424eb13009fbf84d7511fb1325085d8b809..7405e2ace72d1c08cf87cc0040e617379e18149b 100644
--- a/tensorflow/core/api_def/api_test.cc
+++ b/tensorflow/core/api_def/api_test.cc
@@ -35,7 +35,6 @@ limitations under the License.
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
 
 namespace tensorflow {
 namespace {
diff --git a/tensorflow/core/api_def/base_api/api_def_BatchDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_BatchDataset.pbtxt
index 639d962874d083472e6df13550e107026fd2d0a1..32def912f83e420eab58a3071f573ae81139a298 100644
--- a/tensorflow/core/api_def/base_api/api_def_BatchDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BatchDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "BatchDataset"
+  visibility: HIDDEN
   in_arg {
     name: "batch_size"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_CacheDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_CacheDataset.pbtxt
index 6889b8ea148b57da847964c062bd52b1027b8d22..9f7088b90077544ca11fff08dae526140ca1aa6e 100644
--- a/tensorflow/core/api_def/base_api/api_def_CacheDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_CacheDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "CacheDataset"
+  visibility: HIDDEN
   in_arg {
     name: "filename"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_ConcatenateDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ConcatenateDataset.pbtxt
index 67281f9547ac6bb9df5b19e9f31da891454993bd..7997d8daaf91e47044f0729fb8a3c80d69d13acc 100644
--- a/tensorflow/core/api_def/base_api/api_def_ConcatenateDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ConcatenateDataset.pbtxt
@@ -1,4 +1,5 @@
 op {
   graph_op_name: "ConcatenateDataset"
+  visibility: HIDDEN
   summary: "Creates a dataset that concatenates `input_dataset` with `another_dataset`."
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_DatasetToSingleElement.pbtxt b/tensorflow/core/api_def/base_api/api_def_DatasetToSingleElement.pbtxt
index 2b9dffd883250fd5631444252e7b236116e2e822..27d7d6b98684e10853f2f73373a756f0006daa0e 100644
--- a/tensorflow/core/api_def/base_api/api_def_DatasetToSingleElement.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_DatasetToSingleElement.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "DatasetToSingleElement"
+  visibility: HIDDEN
   in_arg {
     name: "dataset"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_EnqueueInQueueDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_EnqueueInQueueDataset.pbtxt
deleted file mode 100644
index 9722f5ede30cb0b893171bfc36a0eb8c1ab3c7e2..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_EnqueueInQueueDataset.pbtxt
+++ /dev/null
@@ -1,3 +0,0 @@
-op {
-  graph_op_name: "EnqueueInQueueDataset"
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_BytesProducedStatsDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalBytesProducedStatsDataset.pbtxt
similarity index 56%
rename from tensorflow/core/api_def/base_api/api_def_BytesProducedStatsDataset.pbtxt
rename to tensorflow/core/api_def/base_api/api_def_ExperimentalBytesProducedStatsDataset.pbtxt
index 73df11b2f75f82fad174fb7e77eccbef35c2c7d1..dc296162ae83117d349147c2655756c59384c051 100644
--- a/tensorflow/core/api_def/base_api/api_def_BytesProducedStatsDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalBytesProducedStatsDataset.pbtxt
@@ -1,4 +1,5 @@
 op {
-  graph_op_name: "BytesProducedStatsDataset"
+  graph_op_name: "ExperimentalBytesProducedStatsDataset"
+  visibility: HIDDEN
   summary: "Records the bytes size of each element of `input_dataset` in a StatsAggregator."
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalDatasetCardinality.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalDatasetCardinality.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ac014bcc5e6ae48cdecd6acefca267da3f2fe4f1
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalDatasetCardinality.pbtxt
@@ -0,0 +1,21 @@
+op {
+  graph_op_name: "ExperimentalDatasetCardinality"
+  visibility: HIDDEN
+  in_arg {
+    name: "input_dataset"
+    description: <<END
+A variant tensor representing the dataset to return cardinality for.
+END
+  }
+  out_arg {
+    name: "cardinality"
+    description: <<END
+The cardinality of `input_dataset`. Named constants are used to represent
+infinite and unknown cardinality.
+END
+  }
+  summary: "Returns the cardinality of `input_dataset`."
+  description: <<END
+Returns the cardinality of `input_dataset`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_DatasetToTFRecord.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalDatasetToTFRecord.pbtxt
similarity index 91%
rename from tensorflow/core/api_def/base_api/api_def_DatasetToTFRecord.pbtxt
rename to tensorflow/core/api_def/base_api/api_def_ExperimentalDatasetToTFRecord.pbtxt
index e1b8a9abdd2bec0fda690f96d266569b2fb2fcab..085d20d7bf1882accfa3380465568774d1459afb 100644
--- a/tensorflow/core/api_def/base_api/api_def_DatasetToTFRecord.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalDatasetToTFRecord.pbtxt
@@ -1,5 +1,5 @@
 op {
-  graph_op_name: "DatasetToTFRecord"
+  graph_op_name: "ExperimentalDatasetToTFRecord"
   visibility: HIDDEN
   in_arg {
     name: "input_dataset"
diff --git a/tensorflow/core/api_def/base_api/api_def_DenseToSparseBatchDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalDenseToSparseBatchDataset.pbtxt
similarity index 89%
rename from tensorflow/core/api_def/base_api/api_def_DenseToSparseBatchDataset.pbtxt
rename to tensorflow/core/api_def/base_api/api_def_ExperimentalDenseToSparseBatchDataset.pbtxt
index e275cfdd3de5de36979967b1d85d1ae9cd0582a8..8ebd6d88a8b9ff9e0a855215a0167f043d083bad 100644
--- a/tensorflow/core/api_def/base_api/api_def_DenseToSparseBatchDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalDenseToSparseBatchDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
-  graph_op_name: "DenseToSparseBatchDataset"
+  graph_op_name: "ExperimentalDenseToSparseBatchDataset"
+  visibility: HIDDEN
   in_arg {
     name: "input_dataset"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_GroupByReducerDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalGroupByReducerDataset.pbtxt
similarity index 97%
rename from tensorflow/core/api_def/base_api/api_def_GroupByReducerDataset.pbtxt
rename to tensorflow/core/api_def/base_api/api_def_ExperimentalGroupByReducerDataset.pbtxt
index 067ad4018b09d4909325dbc152e30a0afcf29235..dd132802fac8cbbd06872cd50415d3a5d29abc38 100644
--- a/tensorflow/core/api_def/base_api/api_def_GroupByReducerDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalGroupByReducerDataset.pbtxt
@@ -1,5 +1,5 @@
 op {
-  graph_op_name: "GroupByReducerDataset"
+  graph_op_name: "ExperimentalGroupByReducerDataset"
   visibility: HIDDEN
   in_arg {
     name: "input_dataset"
diff --git a/tensorflow/core/api_def/base_api/api_def_GroupByWindowDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalGroupByWindowDataset.pbtxt
similarity index 82%
rename from tensorflow/core/api_def/base_api/api_def_GroupByWindowDataset.pbtxt
rename to tensorflow/core/api_def/base_api/api_def_ExperimentalGroupByWindowDataset.pbtxt
index ea6bcd469577d02e39afbeb2ba0c8b467e312ba9..6e4c12ed815d8119999852056a473b76e2d4ab90 100644
--- a/tensorflow/core/api_def/base_api/api_def_GroupByWindowDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalGroupByWindowDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
-  graph_op_name: "GroupByWindowDataset"
+  graph_op_name: "ExperimentalGroupByWindowDataset"
+  visibility: HIDDEN
   attr {
     name: "key_func"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_LatencyStatsDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalLatencyStatsDataset.pbtxt
similarity index 58%
rename from tensorflow/core/api_def/base_api/api_def_LatencyStatsDataset.pbtxt
rename to tensorflow/core/api_def/base_api/api_def_ExperimentalLatencyStatsDataset.pbtxt
index 78d946b0b47044855ff145e9492fdb3721ff0044..e7351b9d70a75285351534d474209339b6bcbce4 100644
--- a/tensorflow/core/api_def/base_api/api_def_LatencyStatsDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalLatencyStatsDataset.pbtxt
@@ -1,4 +1,5 @@
 op {
-  graph_op_name: "LatencyStatsDataset"
+  graph_op_name: "ExperimentalLatencyStatsDataset"
+  visibility: HIDDEN
   summary: "Records the latency of producing `input_dataset` elements in a StatsAggregator."
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_MapAndBatchDatasetV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalMapAndBatchDataset.pbtxt
similarity index 96%
rename from tensorflow/core/api_def/base_api/api_def_MapAndBatchDatasetV2.pbtxt
rename to tensorflow/core/api_def/base_api/api_def_ExperimentalMapAndBatchDataset.pbtxt
index 81ef92cae0c95c765a82c993f58f261509c47d71..bc4270670c5369d6d7440b50dae98f367453b3d9 100644
--- a/tensorflow/core/api_def/base_api/api_def_MapAndBatchDatasetV2.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalMapAndBatchDataset.pbtxt
@@ -1,5 +1,5 @@
 op {
-  graph_op_name: "MapAndBatchDatasetV2"
+  graph_op_name: "ExperimentalMapAndBatchDataset"
   visibility: HIDDEN
   in_arg {
     name: "input_dataset"
diff --git a/tensorflow/core/api_def/base_api/api_def_ParallelInterleaveDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalParallelInterleaveDataset.pbtxt
similarity index 90%
rename from tensorflow/core/api_def/base_api/api_def_ParallelInterleaveDataset.pbtxt
rename to tensorflow/core/api_def/base_api/api_def_ExperimentalParallelInterleaveDataset.pbtxt
index d6889b54a032bb20896dc7b03af5621f45d365d9..dd70e3328493825b268fc1a2f6e1c85207a426bf 100644
--- a/tensorflow/core/api_def/base_api/api_def_ParallelInterleaveDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalParallelInterleaveDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
-  graph_op_name: "ParallelInterleaveDataset"
+  graph_op_name: "ExperimentalParallelInterleaveDataset"
+  visibility: HIDDEN
   attr {
     name: "f"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_ParseExampleDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalParseExampleDataset.pbtxt
similarity index 96%
rename from tensorflow/core/api_def/base_api/api_def_ParseExampleDataset.pbtxt
rename to tensorflow/core/api_def/base_api/api_def_ExperimentalParseExampleDataset.pbtxt
index 3de2f18fc28b57171b478f43c64a88d72069a89f..2de13c5ceef4eced73f6e0984e70921926ece7f2 100644
--- a/tensorflow/core/api_def/base_api/api_def_ParseExampleDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalParseExampleDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
-  graph_op_name: "ParseExampleDataset"
+  graph_op_name: "ExperimentalParseExampleDataset"
+  visibility: HIDDEN
   in_arg {
     name: "dense_defaults"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_RandomDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalRandomDataset.pbtxt
similarity index 86%
rename from tensorflow/core/api_def/base_api/api_def_RandomDataset.pbtxt
rename to tensorflow/core/api_def/base_api/api_def_ExperimentalRandomDataset.pbtxt
index 0466b40f85eb118c94404e2f0d7670392bc7afdf..f5d7bc4adb79ac63aaf41f03063b26257ebee429 100644
--- a/tensorflow/core/api_def/base_api/api_def_RandomDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalRandomDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
-  graph_op_name: "RandomDataset"
+  graph_op_name: "ExperimentalRandomDataset"
+  visibility: HIDDEN
   in_arg {
     name: "seed"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_ScanDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalScanDataset.pbtxt
similarity index 61%
rename from tensorflow/core/api_def/base_api/api_def_ScanDataset.pbtxt
rename to tensorflow/core/api_def/base_api/api_def_ExperimentalScanDataset.pbtxt
index e83d4a9e967f959b19adc5fad38a7141f8936cc4..4742cf4d57ff471178f0d59d9fd8a99a1e6f2166 100644
--- a/tensorflow/core/api_def/base_api/api_def_ScanDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalScanDataset.pbtxt
@@ -1,4 +1,5 @@
 op {
-  graph_op_name: "ScanDataset"
+  graph_op_name: "ExperimentalScanDataset"
+  visibility: HIDDEN
   summary: "Creates a dataset successively reduces `f` over the elements of `input_dataset`."
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalSetStatsAggregatorDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalSetStatsAggregatorDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6e6b2f81b333899e3cdc2723edb537507f541a64
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalSetStatsAggregatorDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ExperimentalSetStatsAggregatorDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SlideDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalSlidingWindowDataset.pbtxt
similarity index 88%
rename from tensorflow/core/api_def/base_api/api_def_SlideDataset.pbtxt
rename to tensorflow/core/api_def/base_api/api_def_ExperimentalSlidingWindowDataset.pbtxt
index ddde3ee5b4ef1d82cc244563d4835e319a9dc50a..dc62750b66a996d1429fcd8477bcd57b7b488dda 100644
--- a/tensorflow/core/api_def/base_api/api_def_SlideDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalSlidingWindowDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
-  graph_op_name: "SlideDataset"
+  graph_op_name: "ExperimentalSlidingWindowDataset"
+  visibility: HIDDEN
   in_arg {
     name: "window_size"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_SqlDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalSqlDataset.pbtxt
similarity index 87%
rename from tensorflow/core/api_def/base_api/api_def_SqlDataset.pbtxt
rename to tensorflow/core/api_def/base_api/api_def_ExperimentalSqlDataset.pbtxt
index 7570d5da5662b8eab90e7dd00f8cb225a963d373..35cddbd061917e397aa7b10e7fee43033adfc2e2 100644
--- a/tensorflow/core/api_def/base_api/api_def_SqlDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalSqlDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
-  graph_op_name: "SqlDataset"
+  graph_op_name: "ExperimentalSqlDataset"
+  visibility: HIDDEN
   in_arg {
     name: "driver_name"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalStatsAggregatorHandle.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalStatsAggregatorHandle.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8a770d462d54230340ac278f755b997d7c9144a5
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalStatsAggregatorHandle.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "ExperimentalStatsAggregatorHandle"
+  visibility: HIDDEN
+  summary: "Creates a statistics manager resource."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_StatsAggregatorSummary.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalStatsAggregatorSummary.pbtxt
similarity index 56%
rename from tensorflow/core/api_def/base_api/api_def_StatsAggregatorSummary.pbtxt
rename to tensorflow/core/api_def/base_api/api_def_ExperimentalStatsAggregatorSummary.pbtxt
index bcaf9fea1af5123848b2d6267b3ef0f7279a7230..ffe010368918a2134fa70d3bc6d6fb30a7dbc2c5 100644
--- a/tensorflow/core/api_def/base_api/api_def_StatsAggregatorSummary.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalStatsAggregatorSummary.pbtxt
@@ -1,4 +1,5 @@
 op {
-  graph_op_name: "StatsAggregatorSummary"
+  graph_op_name: "ExperimentalStatsAggregatorSummary"
+  visibility: HIDDEN
   summary: "Produces a summary of any statistics recorded by the given statistics manager."
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_UnbatchDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalUnbatchDataset.pbtxt
similarity index 57%
rename from tensorflow/core/api_def/base_api/api_def_UnbatchDataset.pbtxt
rename to tensorflow/core/api_def/base_api/api_def_ExperimentalUnbatchDataset.pbtxt
index 324fadac0af5088e86e61beaaa27f2111cfd4b82..c89e1fd0bdd6ef594797233170b41cb86521c84f 100644
--- a/tensorflow/core/api_def/base_api/api_def_UnbatchDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalUnbatchDataset.pbtxt
@@ -1,4 +1,5 @@
 op {
-  graph_op_name: "UnbatchDataset"
+  graph_op_name: "ExperimentalUnbatchDataset"
+  visibility: HIDDEN
   summary: "A dataset that splits the elements of its input into multiple elements."
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_FilterDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_FilterDataset.pbtxt
index fd60c0f3785a22f456c63285bf59381e6a2a5d66..776529bc593b10915c6be8c4a3bdac6e6b131c32 100644
--- a/tensorflow/core/api_def/base_api/api_def_FilterDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_FilterDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "FilterDataset"
+  visibility: HIDDEN
   in_arg {
     name: "other_arguments"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_FixedLengthRecordDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_FixedLengthRecordDataset.pbtxt
index 651b84d0d660a0bfc0ef45dd841dfc51ee1e3340..3b142432582146fcc0534d36d1aa063b71f11338 100644
--- a/tensorflow/core/api_def/base_api/api_def_FixedLengthRecordDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_FixedLengthRecordDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "FixedLengthRecordDataset"
+  visibility: HIDDEN
   in_arg {
     name: "filenames"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_FixedLengthRecordDatasetV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_FixedLengthRecordDatasetV2.pbtxt
index ad82eddb587b40e8ab61dd55aa3dc277aefd03d5..def9f85e02d9d34412ed42d7774d77e8b6a328e0 100644
--- a/tensorflow/core/api_def/base_api/api_def_FixedLengthRecordDatasetV2.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_FixedLengthRecordDatasetV2.pbtxt
@@ -1,3 +1,4 @@
 op {
   graph_op_name: "FixedLengthRecordDatasetV2"
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_FlatMapDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_FlatMapDataset.pbtxt
index 1936119c50f5323e69465a79cda784afc68c3aca..1e20e853254ccb5086b3b52f473a4a823fefefe8 100644
--- a/tensorflow/core/api_def/base_api/api_def_FlatMapDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_FlatMapDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "FlatMapDataset"
+  visibility: HIDDEN
   attr {
     name: "f"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_GeneratorDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_GeneratorDataset.pbtxt
index 4f1cf3e6867a06df1f39774bc389fbe35a994ab4..06e9a6463e76dbf43caae878b62afcba55e6995d 100644
--- a/tensorflow/core/api_def/base_api/api_def_GeneratorDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_GeneratorDataset.pbtxt
@@ -1,4 +1,5 @@
 op {
   graph_op_name: "GeneratorDataset"
+  visibility: HIDDEN
   summary: "Creates a dataset that invokes a function to generate elements."
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_InterleaveDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_InterleaveDataset.pbtxt
index bec2828e2462227b962bc045d796484a10365452..597edf5fb2b2d1c1f9d5a97992ec074385407f47 100644
--- a/tensorflow/core/api_def/base_api/api_def_InterleaveDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_InterleaveDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "InterleaveDataset"
+  visibility: HIDDEN
   attr {
     name: "f"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_Lu.pbtxt b/tensorflow/core/api_def/base_api/api_def_Lu.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..35dbee8364ec596ee18cf8892361ee3112a7764a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Lu.pbtxt
@@ -0,0 +1,51 @@
+op {
+  graph_op_name: "Lu"
+  in_arg {
+    name: "input"
+    description: <<END
+A tensor of shape `[..., M, M]` whose inner-most 2 dimensions form matrices of
+size `[M, M]`.
+END
+  }
+  out_arg {
+    name: "lu"
+    description: <<END
+A tensor of shape `[..., M, M]` whose strictly lower triangular part denotes the
+lower triangular factor `L` with unit diagonal, and whose upper triangular part
+denotes the upper triangular factor `U`.
+END
+  }
+  out_arg {
+    name: "p"
+    description: <<END
+Permutation of the rows encoded as a list of indices in `0..M-1`. Shape is
+`[..., M]`.
+@compatibility(scipy)
+Similar to `scipy.linalg.lu`, except the triangular factors `L` and `U` are
+packed into a single tensor, the permutation is applied to `input` instead of
+the right hand side and the permutation `P` is returned as a list of indices
+instead of a permutation matrix.
+@end_compatibility
+END
+  }
+  summary: "Computes the LU decomposition of one or more square matrices."
+  description: <<END
+The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+form square matrices.
+
+The input has to be invertible.
+
+The output consists of two tensors LU and P containing the LU decomposition
+of all input submatrices `[..., :, :]`. LU encodes the lower triangular and
+upper triangular factors.
+
+For each input submatrix of shape `[M, M]`, L is a lower triangular matrix of
+shape `[M, M]` with unit diagonal whose entries correspond to the strictly lower
+triangular part of LU. U is a upper triangular matrix of shape `[M, M]` whose
+entries correspond to the upper triangular part, including the diagonal, of LU.
+
+P represents a permutation matrix encoded as a list of indices each between `0`
+and `M-1`, inclusive. If P_mat denotes the permutation matrix corresponding to
+P, then the L, U and P satisfies P_mat * input = L * U.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MapAndBatchDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_MapAndBatchDataset.pbtxt
deleted file mode 100644
index e230c51edfe9355b556812b0946b3a4879f160bc..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_MapAndBatchDataset.pbtxt
+++ /dev/null
@@ -1,53 +0,0 @@
-op {
-  graph_op_name: "MapAndBatchDataset"
-  visibility: HIDDEN
-  in_arg {
-    name: "input_dataset"
-    description: <<END
-A variant tensor representing the input dataset.
-END
-  }
-  in_arg {
-    name: "other_arguments"
-    description: <<END
-A list of tensors, typically values that were captured when building a closure
-for `f`.
-END
-  }
-  in_arg {
-    name: "batch_size"
-    description: <<END
-A scalar representing the number of elements to accumulate in a
-batch. It determines the number of concurrent invocations of `f` that process
-elements from `input_dataset` in parallel.
-END
-  }
-  in_arg {
-    name: "num_parallel_batches"
-    description: <<END
-A scalar representing the number of batches to create in parallel. Processing
-multiple batches in parallel benefits workloads prone to stragglers.
-END
-  }
-  in_arg {
-    name: "drop_remainder"
-    description: <<END
-A scalar representing whether the last batch should be dropped in case its size
-is smaller than desired.
-END
-  }
-  attr {
-    name: "f"
-    description: <<END
-A function to apply to the outputs of `input_dataset`.
-END
-  }
-  summary: "Creates a dataset that fuses mapping with batching."
-  description: <<END
-Creates a dataset that applies `f` to the outputs of `input_dataset` and then
-batches `batch_size` of them.
-
-Unlike a "MapDataset", which applies `f` sequentially, this dataset invokes up
-to `batch_size * num_parallel_batches` copies of `f` in parallel.
-END
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_MapDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_MapDataset.pbtxt
index 76d63ec2478e07d5af09754dc63994841119fa56..4f235f49461465931c6b863b2007c512511c873c 100644
--- a/tensorflow/core/api_def/base_api/api_def_MapDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_MapDataset.pbtxt
@@ -1,4 +1,5 @@
 op {
   graph_op_name: "MapDataset"
+  visibility: HIDDEN
   summary: "Creates a dataset that applies `f` to the outputs of `input_dataset`."
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_OneHot.pbtxt b/tensorflow/core/api_def/base_api/api_def_OneHot.pbtxt
index 807b8ae31015e4bcb73e54e98d879460f0d92f62..b325df1c8c2b231f03a1960babd2d915b1b0e72d 100644
--- a/tensorflow/core/api_def/base_api/api_def_OneHot.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_OneHot.pbtxt
@@ -66,7 +66,6 @@ Examples
 =========
 
 Suppose that
-
 ```
   indices = [0, 2, -1, 1]
   depth = 3
@@ -76,16 +75,15 @@ Suppose that
 ```
 
 Then output is `[4 x 3]`:
-
-    ```output =
-      [5.0 0.0 0.0]  // one_hot(0)
-      [0.0 0.0 5.0]  // one_hot(2)
-      [0.0 0.0 0.0]  // one_hot(-1)
-      [0.0 5.0 0.0]  // one_hot(1)
-    ```
+```
+output =
+  [5.0 0.0 0.0]  // one_hot(0)
+  [0.0 0.0 5.0]  // one_hot(2)
+  [0.0 0.0 0.0]  // one_hot(-1)
+  [0.0 5.0 0.0]  // one_hot(1)
+```
 
 Suppose that
-
 ```
   indices = [0, 2, -1, 1]
   depth = 3
@@ -95,19 +93,19 @@ Suppose that
 ```
 
 Then output is `[3 x 4]`:
+```
+output =
+  [0.0 3.0 3.0 3.0]
+  [3.0 3.0 3.0 0.0]
+  [3.0 3.0 3.0 3.0]
+  [3.0 0.0 3.0 3.0]
+//  ^                one_hot(0)
+//      ^            one_hot(2)
+//          ^        one_hot(-1)
+//              ^    one_hot(1)
+```
 
-    ```output =
-      [0.0 3.0 3.0 3.0]
-      [3.0 3.0 3.0 0.0]
-      [3.0 3.0 3.0 3.0]
-      [3.0 0.0 3.0 3.0]
-    //  ^                one_hot(0)
-    //      ^            one_hot(2)
-    //          ^        one_hot(-1)
-    //              ^    one_hot(1)
-    ```
 Suppose that
-
 ```
   indices = [[0, 2], [1, -1]]
   depth = 3
@@ -117,14 +115,15 @@ Suppose that
 ```
 
 Then output is `[2 x 2 x 3]`:
-
-    ```output =
-      [
-        [1.0, 0.0, 0.0]  // one_hot(0)
-        [0.0, 0.0, 1.0]  // one_hot(2)
-      ][
-        [0.0, 1.0, 0.0]  // one_hot(1)
-        [0.0, 0.0, 0.0]  // one_hot(-1)
-      ]```
+```
+output =
+  [
+    [1.0, 0.0, 0.0]  // one_hot(0)
+    [0.0, 0.0, 1.0]  // one_hot(2)
+  ][
+    [0.0, 1.0, 0.0]  // one_hot(1)
+    [0.0, 0.0, 0.0]  // one_hot(-1)
+  ]
+```
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_PaddedBatchDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_PaddedBatchDataset.pbtxt
index d243dfe8b67bc14e9c5e22d5e68e3faf5d4684a8..53f4d94ecc8810a38aaafac29438d8186636684a 100644
--- a/tensorflow/core/api_def/base_api/api_def_PaddedBatchDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_PaddedBatchDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "PaddedBatchDataset"
+  visibility: HIDDEN
   in_arg {
     name: "batch_size"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_ParallelMapDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ParallelMapDataset.pbtxt
index 313494dd738b02d09807ec78fc8e0802e719e116..5343605edd5859d2cafa656f3821a318e24d0b09 100644
--- a/tensorflow/core/api_def/base_api/api_def_ParallelMapDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ParallelMapDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "ParallelMapDataset"
+  visibility: HIDDEN
   in_arg {
     name: "num_parallel_calls"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_PrefetchDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_PrefetchDataset.pbtxt
index e158eedc6f0ef11de3c8979d65dd69d8bece1eb4..a71336a285542bc4bdf095fb2ac477ea975725c0 100644
--- a/tensorflow/core/api_def/base_api/api_def_PrefetchDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_PrefetchDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "PrefetchDataset"
+  visibility: HIDDEN
   in_arg {
     name: "buffer_size"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_PrependFromQueueAndPaddedBatchDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_PrependFromQueueAndPaddedBatchDataset.pbtxt
deleted file mode 100644
index d4549340fac6d59cc994050e65f5a0016f2d52ab..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_PrependFromQueueAndPaddedBatchDataset.pbtxt
+++ /dev/null
@@ -1,3 +0,0 @@
-op {
-  graph_op_name: "PrependFromQueueAndPaddedBatchDataset"
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_RaggedGather.pbtxt b/tensorflow/core/api_def/base_api/api_def_RaggedGather.pbtxt
index 240c987ddab4cd6ba04655891a258801716dc619..9c40332ea28421e0b6a8ab771f6d19fdaa75a63a 100644
--- a/tensorflow/core/api_def/base_api/api_def_RaggedGather.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_RaggedGather.pbtxt
@@ -11,8 +11,8 @@ END
   in_arg {
     name: "params_dense_values"
     description: <<END
-The `inner_values` for the `params` RaggedTensor. There was a terminology change
-at the python level from dense_values to inner_values, so dense_values is the
+The `flat_values` for the `params` RaggedTensor. There was a terminology change
+at the python level from dense_values to flat_values, so dense_values is the
 deprecated name.
 END
   }
@@ -32,7 +32,7 @@ END
   }
   out_arg {
     name: "output_dense_values"
-    description: "The `inner_values` for the returned RaggedTensor."
+    description: "The `flat_values` for the returned RaggedTensor."
   }
   attr {
     name: "PARAMS_RAGGED_RANK"
diff --git a/tensorflow/core/api_def/base_api/api_def_RaggedRange.pbtxt b/tensorflow/core/api_def/base_api/api_def_RaggedRange.pbtxt
index 927e839b72ab0c09318bf58734effe5aab2d7f5a..4a9b2af804483df8eafd3306fc4f68cb9de55f2b 100644
--- a/tensorflow/core/api_def/base_api/api_def_RaggedRange.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_RaggedRange.pbtxt
@@ -19,7 +19,7 @@ op {
   }
   out_arg{
     name: "rt_dense_values"
-    description: "The `inner_values` for the returned `RaggedTensor`."
+    description: "The `flat_values` for the returned `RaggedTensor`."
   }
   summary: <<END
 Returns a `RaggedTensor` containing the specified sequences of numbers.
diff --git a/tensorflow/core/api_def/base_api/api_def_RaggedTensorToSparse.pbtxt b/tensorflow/core/api_def/base_api/api_def_RaggedTensorToSparse.pbtxt
index 8c73ea644c8072a2a3d11f6489976ca34e02b55d..958c71185e4b9f2f876ca66f9cfaeabcbe2050cc 100644
--- a/tensorflow/core/api_def/base_api/api_def_RaggedTensorToSparse.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_RaggedTensorToSparse.pbtxt
@@ -7,7 +7,7 @@ op {
   }
   in_arg {
     name: "rt_dense_values"
-    description: "The `inner_values` for the `RaggedTensor`."
+    description: "The `flat_values` for the `RaggedTensor`."
   }
   out_arg {
     name: "sparse_indices"
diff --git a/tensorflow/core/api_def/base_api/api_def_RangeDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_RangeDataset.pbtxt
index a9e14b8a052e416dd78f1abdc25c9b024a778107..4ac5050040c22ff6ffc5d0bb7c69453cd9e12f5c 100644
--- a/tensorflow/core/api_def/base_api/api_def_RangeDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_RangeDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "RangeDataset"
+  visibility: HIDDEN
   in_arg {
     name: "start"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_RepeatDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_RepeatDataset.pbtxt
index fc6169cd32f1671000a9cb96209059d062c00db8..b2fcab15384d0cc7354699d15a25bdf8879fbac6 100644
--- a/tensorflow/core/api_def/base_api/api_def_RepeatDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_RepeatDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "RepeatDataset"
+  visibility: HIDDEN
   in_arg {
     name: "count"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdamWithAmsgrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdamWithAmsgrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8ee16ef1baa86f31dfa78bb75aeea81e4b983972
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdamWithAmsgrad.pbtxt
@@ -0,0 +1,85 @@
+op {
+  graph_op_name: "ResourceApplyAdamWithAmsgrad"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "m"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "v"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "vhat"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "beta1_power"
+    description: <<END
+Must be a scalar.
+END
+  }
+  in_arg {
+    name: "beta2_power"
+    description: <<END
+Must be a scalar.
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "beta1"
+    description: <<END
+Momentum factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "beta2"
+    description: <<END
+Momentum factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "epsilon"
+    description: <<END
+Ridge term. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If `True`, updating of the var, m, and v tensors will be protected
+by a lock; otherwise the behavior is undefined, but may exhibit less
+contention.
+END
+  }
+  summary: "Update \'*var\' according to the Adam algorithm."
+  description: <<END
+$$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
+$$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
+$$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+$$vhat_t := max{vhat_{t-1}, v_t}$$
+$$variable := variable - lr_t * m_t / (\sqrt{vhat_t} + \epsilon)$$
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceApplyKerasMomentum.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceApplyKerasMomentum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..830391a32baa48a358c5cd12d73bfc26b852fe6d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceApplyKerasMomentum.pbtxt
@@ -0,0 +1,56 @@
+op {
+  graph_op_name: "ResourceApplyKerasMomentum"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "accum"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  in_arg {
+    name: "momentum"
+    description: <<END
+Momentum. Must be a scalar.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If `True`, updating of the var and accum tensors will be protected
+by a lock; otherwise the behavior is undefined, but may exhibit less
+contention.
+END
+  }
+  attr {
+    name: "use_nesterov"
+    description: <<END
+If `True`, the tensor passed to compute grad will be
+var + momentum * accum, so in the end, the var you get is actually
+var + momentum * accum.
+END
+  }
+  summary: "Update \'*var\' according to the momentum scheme. Set use_nesterov = True if you"
+  description: <<END
+want to use Nesterov momentum.
+
+accum = accum * momentum - lr * grad
+var += accum
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceSparseApplyKerasMomentum.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceSparseApplyKerasMomentum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b10b1bc2a9bb7a28f9f96fdb0328ab23952f7e56
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceSparseApplyKerasMomentum.pbtxt
@@ -0,0 +1,64 @@
+op {
+  graph_op_name: "ResourceSparseApplyKerasMomentum"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "accum"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Learning rate. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A vector of indices into the first dimension of var and accum.
+END
+  }
+  in_arg {
+    name: "momentum"
+    description: <<END
+Momentum. Must be a scalar.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If `True`, updating of the var and accum tensors will be protected
+by a lock; otherwise the behavior is undefined, but may exhibit less
+contention.
+END
+  }
+  attr {
+    name: "use_nesterov"
+    description: <<END
+If `True`, the tensor passed to compute grad will be
+var + momentum * accum, so in the end, the var you get is actually
+var + momentum * accum.
+END
+  }
+  summary: "Update relevant entries in \'*var\' and \'*accum\' according to the momentum scheme."
+  description: <<END
+Set use_nesterov = True if you want to use Nesterov momentum.
+
+That is for rows we have grad for, we update var and accum as follows:
+
+accum = accum * momentum - lr * grad
+var += accum
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SetStatsAggregatorDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_SetStatsAggregatorDataset.pbtxt
deleted file mode 100644
index 77123e143b200fc079879bc0e891a771a7cb67e7..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_SetStatsAggregatorDataset.pbtxt
+++ /dev/null
@@ -1,3 +0,0 @@
-op {
-  graph_op_name: "SetStatsAggregatorDataset"
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_ShuffleAndRepeatDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ShuffleAndRepeatDataset.pbtxt
index fb425b24a4134366df1129df63dc0361537dd746..9ea1cc8babe8832d0553b942901c1c391f1b2709 100644
--- a/tensorflow/core/api_def/base_api/api_def_ShuffleAndRepeatDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ShuffleAndRepeatDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "ShuffleAndRepeatDataset"
+  visibility: HIDDEN
   in_arg {
     name: "buffer_size"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_ShuffleDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ShuffleDataset.pbtxt
index ea5c52c0ee3826076b855ca243f03cb940b8e0b2..c7f4836a3ad32011f4903973f9400362c795c841 100644
--- a/tensorflow/core/api_def/base_api/api_def_ShuffleDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ShuffleDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "ShuffleDataset"
+  visibility: HIDDEN
   in_arg {
     name: "buffer_size"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_SkipDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_SkipDataset.pbtxt
index 44e5bac79b8cdfb703d8679b66d79ab9e9e7509a..f830049d053b50257d343306c9726adcf10aabd7 100644
--- a/tensorflow/core/api_def/base_api/api_def_SkipDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SkipDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "SkipDataset"
+  visibility: HIDDEN
   in_arg {
     name: "count"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseTensorSliceDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseTensorSliceDataset.pbtxt
index ffb805834908103865e5fcb8d98fb080d60a44ab..4203eca73a5f954a3f407f2a5ad9b1193b044ec5 100644
--- a/tensorflow/core/api_def/base_api/api_def_SparseTensorSliceDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SparseTensorSliceDataset.pbtxt
@@ -1,4 +1,5 @@
 op {
   graph_op_name: "SparseTensorSliceDataset"
+  visibility: HIDDEN
   summary: "Creates a dataset that splits a SparseTensor into elements row-wise."
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_StatsAggregatorHandle.pbtxt b/tensorflow/core/api_def/base_api/api_def_StatsAggregatorHandle.pbtxt
deleted file mode 100644
index 9b30d64afe18a71fbbe73b397979796b8b844faa..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_StatsAggregatorHandle.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "StatsAggregatorHandle"
-  summary: "Creates a statistics manager resource."
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_TFRecordDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_TFRecordDataset.pbtxt
index 80f64cebb1bef262146afdadd5c37b0a30277db0..30e425794b358f9a99efae1c116d7b35753f6bff 100644
--- a/tensorflow/core/api_def/base_api/api_def_TFRecordDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_TFRecordDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "TFRecordDataset"
+  visibility: HIDDEN
   in_arg {
     name: "filenames"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_TakeDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_TakeDataset.pbtxt
index 8808dc6b1f0d0ae3a0e83f376eab245beaad2de1..eadcb6cd051bc306ba98d8a4318135e1fd7ccfb2 100644
--- a/tensorflow/core/api_def/base_api/api_def_TakeDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_TakeDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "TakeDataset"
+  visibility: HIDDEN
   in_arg {
     name: "count"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorDataset.pbtxt
index 050e174aacb12b415357437e7f989b09faf40621..c086d7420c27055d374b1924148c868cc9d6dfcc 100644
--- a/tensorflow/core/api_def/base_api/api_def_TensorDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_TensorDataset.pbtxt
@@ -1,4 +1,5 @@
 op {
   graph_op_name: "TensorDataset"
+  visibility: HIDDEN
   summary: "Creates a dataset that emits `components` as a tuple of tensors once."
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorListConcat.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorListConcat.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..909c09aa12bd715d4ec6b6d19a9cd6b4b72f804a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorListConcat.pbtxt
@@ -0,0 +1,12 @@
+op {
+  graph_op_name: "TensorListConcat"
+  summary: "Concats all tensors in the list along the 0th dimension."
+  description: <<END
+Requires that all tensors have the same shape except the first dimension.
+
+input_handle: The input list.
+tensor: The concated result.
+lengths: Output tensor containing sizes of the 0th dimension of tensors in the list, used for computing the gradient.
+
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorListSplit.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorListSplit.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..24156cb8c47fab5af34bff3be3975b7a7959e542
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorListSplit.pbtxt
@@ -0,0 +1,13 @@
+op {
+  graph_op_name: "TensorListSplit"
+  summary: "Splits a tensor into a list."
+  description: <<END
+list[i] corresponds to lengths[i] tensors from the input tensor.
+The tensor must have rank at least 1 and contain exactly sum(lengths) elements.
+
+tensor: The input tensor.
+element_shape: A shape compatible with that of elements in the tensor.
+lengths: Vector of sizes of the 0th dimension of tensors in the list.
+output_handle: The list.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorSliceDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorSliceDataset.pbtxt
index a26a98fd7f3a6564309efd28dff8c2bc93d7a67f..30cb803b26bf836a7b02cc3fb6875175046eab94 100644
--- a/tensorflow/core/api_def/base_api/api_def_TensorSliceDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_TensorSliceDataset.pbtxt
@@ -1,4 +1,5 @@
 op {
   graph_op_name: "TensorSliceDataset"
+  visibility: HIDDEN
   summary: "Creates a dataset that emits each dim-0 slice of `components` once."
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_TextLineDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_TextLineDataset.pbtxt
index 6b630509964ed56ecaf401b10a46c5e53cd46528..31ef3e3335e2812156fc3d1af2c5c1724fa52310 100644
--- a/tensorflow/core/api_def/base_api/api_def_TextLineDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_TextLineDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "TextLineDataset"
+  visibility: HIDDEN
   in_arg {
     name: "filenames"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_UnicodeDecode.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnicodeDecode.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9b3f69023f1167fc3964a82a1e425d619ecc5521
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_UnicodeDecode.pbtxt
@@ -0,0 +1,76 @@
+op {
+  graph_op_name: "UnicodeDecode"
+  in_arg {
+    name: "input"
+    description: <<END
+The text to be decoded. Can have any shape. Note that the output is flattened
+to a vector of char values.
+END
+  }
+  out_arg {
+    name: "row_splits"
+    description: <<END
+A 1D int32 tensor containing the row splits.
+END
+  }
+  out_arg {
+    name: "char_values"
+    description: <<END
+A 1D int32 Tensor containing the decoded codepoints.
+END
+  }
+  attr {
+    name: "input_encoding"
+    description: <<END
+Text encoding of the input strings. This is any of the encodings supported
+by ICU ucnv algorithmic converters. Examples: `"UTF-16", "US ASCII", "UTF-8"`.
+END
+  }
+  attr {
+    name: "errors"
+    description: <<END
+Error handling policy when there is invalid formatting found in the input.
+The value of 'strict' will cause the operation to produce a InvalidArgument
+error on any invalid input formatting. A value of 'replace' (the default) will
+cause the operation to replace any invalid formatting in the input with the
+`replacement_char` codepoint. A value of 'ignore' will cause the operation to
+skip any invalid formatting in the input and produce no corresponding output
+character.
+END
+  }
+  attr {
+    name: "replacement_char"
+    description: <<END
+The replacement character codepoint to be used in place of any invalid
+formatting in the input when `errors='replace'`. Any valid unicode codepoint may
+be used. The default value is the default unicode replacement character is
+0xFFFD or U+65533.)
+END
+  }
+  attr {
+    name: "replace_control_characters"
+    description: <<END
+Whether to replace the C0 control characters (00-1F) with the
+`replacement_char`. Default is false.
+END
+  }
+  summary: <<END
+Decodes each string in `input` into a sequence of Unicode code points.
+END
+  description: <<END
+The character codepoints for all strings are returned using a single vector
+`char_values`, with strings expanded to characters in row-major order.
+
+The `row_splits` tensor indicates where the codepoints for
+each input string begin and end within the `char_values` tensor.
+In particular, the values for the `i`th
+string (in row-major order) are stored in the slice
+`[row_splits[i]:row_splits[i+1]]`. Thus:
+
+* `char_values[row_splits[i]+j]` is the Unicode codepoint for the `j`th
+  character in the `i`th string (in row-major order).
+* `row_splits[i+1] - row_splits[i]` is the number of characters in the `i`th
+  string (in row-major order).
+END
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_UnwrapDatasetVariant.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnwrapDatasetVariant.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7b3f88a1dac378b5fd8a3347df90b987d21644a3
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_UnwrapDatasetVariant.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "UnwrapDatasetVariant"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_WrapDatasetVariant.pbtxt b/tensorflow/core/api_def/base_api/api_def_WrapDatasetVariant.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..40f5c7a0d212fb74e67ea6dde58bca191a153231
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_WrapDatasetVariant.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "WrapDatasetVariant"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ZipDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ZipDataset.pbtxt
index 7495693ccc50fede4a359d13aa710a1fd2fd9402..3c819963590f8f4ca05fd137ee70183c7d688aa2 100644
--- a/tensorflow/core/api_def/base_api/api_def_ZipDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ZipDataset.pbtxt
@@ -1,4 +1,5 @@
 op {
   graph_op_name: "ZipDataset"
+  visibility: HIDDEN
   summary: "Creates a dataset that zips together `input_datasets`."
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_BatchDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_BatchDataset.pbtxt
deleted file mode 100644
index 4289c1daf96583943b8dfad84aeca3351657bee4..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_BatchDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "BatchDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_BesselI0e.pbtxt b/tensorflow/core/api_def/python_api/api_def_BesselI0e.pbtxt
index 7965af4916e7b8f590bd22452459410075c37cf8..fdbe5282bc136fa7cb59e9e638e6f1952b3ed5ce 100644
--- a/tensorflow/core/api_def/python_api/api_def_BesselI0e.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_BesselI0e.pbtxt
@@ -1,4 +1,6 @@
 op {
   graph_op_name: "BesselI0e"
-  visibility: HIDDEN
+  endpoint {
+    name: "math.bessel_i0e"
+  }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_BesselI1e.pbtxt b/tensorflow/core/api_def/python_api/api_def_BesselI1e.pbtxt
index dffd296f6d8288356add56f8fbff01bfc4c9213a..3f08cd766d8cb0698c62fbb488ce71ea8018d9e2 100644
--- a/tensorflow/core/api_def/python_api/api_def_BesselI1e.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_BesselI1e.pbtxt
@@ -1,4 +1,6 @@
 op {
   graph_op_name: "BesselI1e"
-  visibility: HIDDEN
+  endpoint {
+    name: "math.bessel_i1e"
+  }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_BytesProducedStatsDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_BytesProducedStatsDataset.pbtxt
deleted file mode 100644
index fcf541f9036baaef1590f06da0d7471b0558b4c7..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_BytesProducedStatsDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "BytesProducedStatsDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_CacheDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_CacheDataset.pbtxt
deleted file mode 100644
index 2bbb4ff9e3b08d0dd11c7444e5d00feb514e81c0..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_CacheDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "CacheDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_ConcatenateDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_ConcatenateDataset.pbtxt
deleted file mode 100644
index c005a4da0f866c1d1106effabbaa22f1abecf422..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_ConcatenateDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "ConcatenateDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_DatasetToSingleElement.pbtxt b/tensorflow/core/api_def/python_api/api_def_DatasetToSingleElement.pbtxt
deleted file mode 100644
index e3d34cc15be752b466aa03f6805cd687698f74fa..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_DatasetToSingleElement.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "DatasetToSingleElement"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_DenseToSparseBatchDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_DenseToSparseBatchDataset.pbtxt
deleted file mode 100644
index 0a8e068afb744ce8b472111d19cf743d39ac44ef..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_DenseToSparseBatchDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "DenseToSparseBatchDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_EnqueueInQueueDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_EnqueueInQueueDataset.pbtxt
deleted file mode 100644
index 051cf14c0ec2b32779be8b9c297b93abd1bc1318..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_EnqueueInQueueDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "EnqueueInQueueDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_Erf.pbtxt b/tensorflow/core/api_def/python_api/api_def_Erf.pbtxt
index 391167254edb69725c778e6319bf8a9f6038589f..21ae77e9ed71cae895b5e3f62adb2607704b5858 100644
--- a/tensorflow/core/api_def/python_api/api_def_Erf.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Erf.pbtxt
@@ -1,4 +1,10 @@
 op {
   graph_op_name: "Erf"
-  visibility: HIDDEN
+  endpoint {
+    name: "math.erf"
+  }
+  endpoint {
+    name: "erf"
+    deprecation_version: 2
+  }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_FilterDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_FilterDataset.pbtxt
deleted file mode 100644
index 6f91b842181c769d0a2f921f1d7566c4d8522541..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_FilterDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "FilterDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_FixedLengthRecordDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_FixedLengthRecordDataset.pbtxt
deleted file mode 100644
index d0703471d38c94a8c37da6f0a65ebd165c23a820..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_FixedLengthRecordDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "FixedLengthRecordDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_FixedLengthRecordDatasetV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_FixedLengthRecordDatasetV2.pbtxt
deleted file mode 100644
index def9f85e02d9d34412ed42d7774d77e8b6a328e0..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_FixedLengthRecordDatasetV2.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "FixedLengthRecordDatasetV2"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_FlatMapDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_FlatMapDataset.pbtxt
deleted file mode 100644
index 9de61ac263cd82a0893aa2e27b9d7532490ca441..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_FlatMapDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "FlatMapDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_FloorDiv.pbtxt b/tensorflow/core/api_def/python_api/api_def_FloorDiv.pbtxt
index 26598ab1fb918e251d4c4da7b14810ebf4c44779..efd42b888d21fad6c369ae63182ed8846bf9f0b1 100644
--- a/tensorflow/core/api_def/python_api/api_def_FloorDiv.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_FloorDiv.pbtxt
@@ -1,4 +1,6 @@
 op {
   graph_op_name: "FloorDiv"
-  visibility: HIDDEN
+  endpoint {
+    name: "floor_div"
+  }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_FloorMod.pbtxt b/tensorflow/core/api_def/python_api/api_def_FloorMod.pbtxt
index ef562e93a0dee0a3f24716719cb24232302626dc..e5db6d49b29e46c9f19c43767c16a5e5296304e8 100644
--- a/tensorflow/core/api_def/python_api/api_def_FloorMod.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_FloorMod.pbtxt
@@ -1,4 +1,9 @@
 op {
   graph_op_name: "FloorMod"
-  visibility: HIDDEN
+  endpoint {
+    name: "floormod"
+  }
+  endpoint {
+    name: "mod"
+  }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_GeneratorDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_GeneratorDataset.pbtxt
deleted file mode 100644
index 9dcfa0f7d210012aa5c2d43349239a953ea3739e..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_GeneratorDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "GeneratorDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_GroupByWindowDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_GroupByWindowDataset.pbtxt
deleted file mode 100644
index 8d40208e613e6b7ee1522c2990afea1345cc5de1..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_GroupByWindowDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "GroupByWindowDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_InterleaveDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_InterleaveDataset.pbtxt
deleted file mode 100644
index ef1b06b19cc6a0c62f6e9f451aceed8aeabed553..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_InterleaveDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "InterleaveDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_LatencyStatsDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_LatencyStatsDataset.pbtxt
deleted file mode 100644
index 94bf6106ad8459767d31a345a17483b255dfc02b..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_LatencyStatsDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "LatencyStatsDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_Lu.pbtxt b/tensorflow/core/api_def/python_api/api_def_Lu.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c0b6b53da50e474c3bfe2065a607a19baf06bc80
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Lu.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Lu"
+  endpoint {
+    name: "linalg.lu"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_MapAndBatchDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_MapAndBatchDataset.pbtxt
deleted file mode 100644
index cffd2910fb404bc7f75e55e42b9ebba1635db134..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_MapAndBatchDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "MapAndBatchDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_MapDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_MapDataset.pbtxt
deleted file mode 100644
index 0b1d2f2c730ff8b8b928fcd97c4fe3bdc704e470..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_MapDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "MapDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_Neg.pbtxt b/tensorflow/core/api_def/python_api/api_def_Neg.pbtxt
index 0e2bb9b950d933f2e73272b403fba2c29110b3cb..ac166561ee9b1ab5fcee6fad776971172b0ee5ba 100644
--- a/tensorflow/core/api_def/python_api/api_def_Neg.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Neg.pbtxt
@@ -1,4 +1,9 @@
 op {
   graph_op_name: "Neg"
-  visibility: HIDDEN
+  endpoint {
+    name: "math.negative"
+  }
+  endpoint {
+    name: "negative"
+  }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_PaddedBatchDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_PaddedBatchDataset.pbtxt
deleted file mode 100644
index c6223b3132ed0d6878995d3c5e657275fac0cc4f..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_PaddedBatchDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "PaddedBatchDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_ParallelInterleaveDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_ParallelInterleaveDataset.pbtxt
deleted file mode 100644
index 93cd5719feb613cd3de2e422e23cc3d690bdef08..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_ParallelInterleaveDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "ParallelInterleaveDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_ParallelMapDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_ParallelMapDataset.pbtxt
deleted file mode 100644
index 09d200dd24c828af85d1505bb17086dbfa688ee8..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_ParallelMapDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "ParallelMapDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_ParseExampleDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_ParseExampleDataset.pbtxt
deleted file mode 100644
index 45826b6fdcc582ac7fd84d45b079b7f4994bc370..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_ParseExampleDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "ParseExampleDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_PrefetchDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_PrefetchDataset.pbtxt
deleted file mode 100644
index ec4e214eb5e082c8f732cbef9db69524c48d80a4..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_PrefetchDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "PrefetchDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_PrependFromQueueAndPaddedBatchDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_PrependFromQueueAndPaddedBatchDataset.pbtxt
deleted file mode 100644
index 228c4047d2e0b7ddfec1d8cd4fad478aa6c4c1a7..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_PrependFromQueueAndPaddedBatchDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "PrependFromQueueAndPaddedBatchDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_RandomDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_RandomDataset.pbtxt
deleted file mode 100644
index a5f6f8c6f1db344c480e2bd452362d977dc15000..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_RandomDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "RandomDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_RangeDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_RangeDataset.pbtxt
deleted file mode 100644
index 4cd8296b2233ac58c12e6573d2194f7d976d9137..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_RangeDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "RangeDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_RealDiv.pbtxt b/tensorflow/core/api_def/python_api/api_def_RealDiv.pbtxt
index bd87eef8240532c158b7604d8c5576e6d0b8b24b..f9e01eb56744cefddb41bad1a54d539ab3e0c548 100644
--- a/tensorflow/core/api_def/python_api/api_def_RealDiv.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_RealDiv.pbtxt
@@ -1,4 +1,6 @@
 op {
   graph_op_name: "RealDiv"
-  visibility: HIDDEN
+  endpoint {
+    name: "realdiv"
+  }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_RepeatDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_RepeatDataset.pbtxt
deleted file mode 100644
index be301da8386af0fbd98c9b02d2cfc0fe79178990..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_RepeatDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "RepeatDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceApplyAdamWithAmsgrad.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceApplyAdamWithAmsgrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1eef1b69b979bfeaaaaec81f47a6e62c8ecd8284
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceApplyAdamWithAmsgrad.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceApplyAdamWithAmsgrad"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceApplyKerasMomentum.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceApplyKerasMomentum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1c39242b3101449ed08c7b132502f7a9eea1228e
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceApplyKerasMomentum.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceApplyKerasMomentum"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceSparseApplyKerasMomentum.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceSparseApplyKerasMomentum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..180793521352a3d9ba3b75b709c3f9d2d37c8f93
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceSparseApplyKerasMomentum.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceSparseApplyKerasMomentum"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ScanDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_ScanDataset.pbtxt
deleted file mode 100644
index e71b655c22fbcbf1524433fc65a392e4d80c5c43..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_ScanDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "ScanDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_SetStatsAggregatorDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_SetStatsAggregatorDataset.pbtxt
deleted file mode 100644
index 3a8c1036ca34233b245a92110dc6e81ac348942d..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_SetStatsAggregatorDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "SetStatsAggregatorDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_ShuffleAndRepeatDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_ShuffleAndRepeatDataset.pbtxt
deleted file mode 100644
index 7b0d2994f0711f440fb6623aa2322c86bd3859f8..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_ShuffleAndRepeatDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "ShuffleAndRepeatDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_ShuffleDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_ShuffleDataset.pbtxt
deleted file mode 100644
index 8f0be9197adeb23b2d5047c5d69916df0e2c1eda..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_ShuffleDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "ShuffleDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_Sign.pbtxt b/tensorflow/core/api_def/python_api/api_def_Sign.pbtxt
index c2ee91dd12ed16ba27a9c4ae45b48194bc5a8b03..fb427cdb191d4976cf50d214e7f58695e7c41490 100644
--- a/tensorflow/core/api_def/python_api/api_def_Sign.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Sign.pbtxt
@@ -1,4 +1,9 @@
 op {
   graph_op_name: "Sign"
-  visibility: HIDDEN
+  endpoint {
+    name: "math.sign"
+  }
+  endpoint {
+    name: "sign"
+  }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_SkipDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_SkipDataset.pbtxt
deleted file mode 100644
index 96a551c5b6669a8d019e3c705507aba768ab9d21..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_SkipDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "SkipDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_SlideDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_SlideDataset.pbtxt
deleted file mode 100644
index 867116c5da718f66205132d70a93c39464096df6..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_SlideDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "SlideDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_SparseTensorSliceDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_SparseTensorSliceDataset.pbtxt
deleted file mode 100644
index 19c0c7f199dfd24d24a56c3766733f9e55957c12..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_SparseTensorSliceDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "SparseTensorSliceDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_SqlDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_SqlDataset.pbtxt
deleted file mode 100644
index 2ab4c3e441dd51f50a2796ef9d6fa0d21b727ffa..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_SqlDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "SqlDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_Sqrt.pbtxt b/tensorflow/core/api_def/python_api/api_def_Sqrt.pbtxt
index 59e2dfe8366813242337c9490d74ca317e525636..16a4d9a7bcc0058aa0baf46ed0b932d4c26a23e2 100644
--- a/tensorflow/core/api_def/python_api/api_def_Sqrt.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Sqrt.pbtxt
@@ -1,4 +1,9 @@
 op {
   graph_op_name: "Sqrt"
-  visibility: HIDDEN
+  endpoint {
+    name: "math.sqrt"
+  }
+  endpoint {
+    name: "sqrt"
+  }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Square.pbtxt b/tensorflow/core/api_def/python_api/api_def_Square.pbtxt
index 7b39ae25fa062b4271dcc2aee6523847c97b1e4d..0bd2f1bf41b80b1a21d50a9b9f437da33e36584c 100644
--- a/tensorflow/core/api_def/python_api/api_def_Square.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Square.pbtxt
@@ -1,4 +1,9 @@
 op {
   graph_op_name: "Square"
-  visibility: HIDDEN
+  endpoint {
+    name: "math.square"
+  }
+  endpoint {
+    name: "square"
+  }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_StatsAggregatorHandle.pbtxt b/tensorflow/core/api_def/python_api/api_def_StatsAggregatorHandle.pbtxt
deleted file mode 100644
index f7bed36602f40602313157c20677acbbf592d7be..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_StatsAggregatorHandle.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "StatsAggregatorHandle"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_StatsAggregatorSummary.pbtxt b/tensorflow/core/api_def/python_api/api_def_StatsAggregatorSummary.pbtxt
deleted file mode 100644
index 8b1bab2440f1934f1fd0194b76b7907fb0fb142d..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_StatsAggregatorSummary.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "StatsAggregatorSummary"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_TFRecordDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_TFRecordDataset.pbtxt
deleted file mode 100644
index 3c270ada3c219b03715e0cd651a4b56fe5ebc227..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_TFRecordDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "TFRecordDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_TakeDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_TakeDataset.pbtxt
deleted file mode 100644
index 711b335dc1926d32071637b3c986727c339736a3..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_TakeDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "TakeDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_Tanh.pbtxt b/tensorflow/core/api_def/python_api/api_def_Tanh.pbtxt
index c946e0a794a77fe6f40613824e6d614e9667ccf9..80d11d27853d89b17fc86fca4fc9219452cd1aca 100644
--- a/tensorflow/core/api_def/python_api/api_def_Tanh.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Tanh.pbtxt
@@ -1,4 +1,12 @@
 op {
   graph_op_name: "Tanh"
-  visibility: HIDDEN
+  endpoint {
+    name: "math.tanh"
+  }
+  endpoint {
+    name: "nn.tanh"
+  }
+  endpoint {
+    name: "tanh"
+  }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorDataset.pbtxt
deleted file mode 100644
index 5bc3920c56360f2348805db1db79ab2b630f379d..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_TensorDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "TensorDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorListConcat.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorListConcat.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c7b6fd106ce304f1e75913614c54f12a3efe5e38
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorListConcat.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorListConcat"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_defTensorListPushBackBatch.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorListPushBackBatch.pbtxt
similarity index 100%
rename from tensorflow/core/api_def/python_api/api_defTensorListPushBackBatch.pbtxt
rename to tensorflow/core/api_def/python_api/api_def_TensorListPushBackBatch.pbtxt
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorListSplit.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorListSplit.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..091297db07174a3925ed2a09b879d013580b606e
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorListSplit.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorListSplit"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorSliceDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorSliceDataset.pbtxt
deleted file mode 100644
index 89ad016483fa392a302915d588d32201237c717a..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_TensorSliceDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "TensorSliceDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_TextLineDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_TextLineDataset.pbtxt
deleted file mode 100644
index 08d785191b6a4bddce2ac43fd4c0188b4d74548e..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_TextLineDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "TextLineDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_TruncateDiv.pbtxt b/tensorflow/core/api_def/python_api/api_def_TruncateDiv.pbtxt
index 2a547f771cfb3d4f3d9496ea24196e1a8a1f1879..8e46c5e663a3fca40a6c2e4890a6ab9388645ad9 100644
--- a/tensorflow/core/api_def/python_api/api_def_TruncateDiv.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_TruncateDiv.pbtxt
@@ -1,4 +1,6 @@
 op {
   graph_op_name: "TruncateDiv"
-  visibility: HIDDEN
+  endpoint {
+    name: "truncatediv"
+  }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_TruncateMod.pbtxt b/tensorflow/core/api_def/python_api/api_def_TruncateMod.pbtxt
index 0731e8810e25cad2cca02522aba55d032b1765b2..97fb816a7ad395a4ad67d0296d87cf6264c76ac2 100644
--- a/tensorflow/core/api_def/python_api/api_def_TruncateMod.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_TruncateMod.pbtxt
@@ -1,4 +1,6 @@
 op {
   graph_op_name: "TruncateMod"
-  visibility: HIDDEN
+  endpoint {
+    name: "truncatemod"
+  }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_UnbatchDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_UnbatchDataset.pbtxt
deleted file mode 100644
index 1e5415749f0d3abad8f6f5c632a0bc59b11e8de2..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_UnbatchDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "UnbatchDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_ZipDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_ZipDataset.pbtxt
deleted file mode 100644
index dd1459521ff70fc4b3adce7fbb1251b45106b439..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_ZipDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "ZipDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/common_runtime/accumulate_n_optimizer.cc b/tensorflow/core/common_runtime/accumulate_n_optimizer.cc
index c4bc1a684cb3ffaa30cdaece041fc51c266a3782..1fc077af92c719bf2c5d87eded55275032891f5d 100644
--- a/tensorflow/core/common_runtime/accumulate_n_optimizer.cc
+++ b/tensorflow/core/common_runtime/accumulate_n_optimizer.cc
@@ -75,7 +75,8 @@ class AccumulateNV2RemovePass : public GraphOptimizationPass {
   Status rewriteNode(Node* n, Graph* g) {
     AttrSlice n_attrs = n->attrs();
     auto base_make_node = [n, &n_attrs](const string& op, const string& name) {
-      NodeBuilder node_builder(name, op);
+      NodeDebugInfo debug_info(*n);
+      NodeBuilder node_builder(name, op, OpRegistry::Global(), &debug_info);
 
       // The pieces of AccumulateNV2 should all be on the same node.
       node_builder.Device(n->requested_device());
diff --git a/tensorflow/core/common_runtime/eager/attr_builder.cc b/tensorflow/core/common_runtime/eager/attr_builder.cc
index aae3392d0e64319cdd539904b2271df1598921b3..a750f8cbba4de4abd33d6ec395b6b0a5fb76cc67 100644
--- a/tensorflow/core/common_runtime/eager/attr_builder.cc
+++ b/tensorflow/core/common_runtime/eager/attr_builder.cc
@@ -124,7 +124,7 @@ Status AttrTypeMapForOp(const char* op_name, const AttrTypeMap** out,
 #define DEFINE_SET_ATTR(value_type, value_field)                             \
   template <>                                                                \
   AttrBuilder& AttrBuilder::Set(StringPiece attr_name, value_type&& value) { \
-    value_field.push_back(std::make_pair(attr_name, value));                 \
+    value_field.push_back(std::make_pair(string(attr_name), value));         \
     return *this;                                                            \
   }
 
diff --git a/tensorflow/core/common_runtime/eager/attr_builder.h b/tensorflow/core/common_runtime/eager/attr_builder.h
index 41dd275a668d2694397ec415cf05ddca03b258dc..5e0172dfd328dbd4f16abdce879be1d1338e692c 100644
--- a/tensorflow/core/common_runtime/eager/attr_builder.h
+++ b/tensorflow/core/common_runtime/eager/attr_builder.h
@@ -99,7 +99,7 @@ class AttrBuilder {
   template <class T>
   AttrBuilder& Set(StringPiece attr_name, T&& value) {
     MayBeInitializeNodeDef();
-    SetInAttrValueMap(node_def_->mutable_attr(), attr_name, value);
+    SetInAttrValueMap(node_def_->mutable_attr(), string(attr_name), value);
     return *this;
   }
 
@@ -110,7 +110,7 @@ class AttrBuilder {
 
  private:
   template <class T>
-  using AttrVec = tensorflow::gtl::InlinedVector<std::pair<StringPiece, T>, 2>;
+  using AttrVec = tensorflow::gtl::InlinedVector<std::pair<string, T>, 2>;
 
   void MayBeInitializeNodeDef();
   // Fill `m` with the attr-value pairs set via AttrBuilder::Set() so far, as
@@ -122,7 +122,7 @@ class AttrBuilder {
   void FillAttrValueMap(AttrValueMap* m, bool include_those_in_node_def) const;
 
   template <class T>
-  void SetInAttrValueMap(AttrValueMap* m, StringPiece attr_name,
+  void SetInAttrValueMap(AttrValueMap* m, const string& attr_name,
                          T&& value) const {
     DCHECK(!node_def_finalized_)
         << "Calling SetInAttrValueMap after BuildNodeDef.";
@@ -131,12 +131,12 @@ class AttrBuilder {
     AttrValue attr_value;
     if (found == nullptr) {
       SetAttrValue(value, &attr_value);
-      m->insert(AttrValueMap::value_type(string(attr_name), attr_value));
+      m->insert(AttrValueMap::value_type(attr_name, attr_value));
     } else {
       // TODO(ashankar): Do what is done in
       // NodeDefBuilder::CheckInconsistency(attr_name, *found, attr_value);
       SetAttrValue(std::forward<T>(value), &attr_value);
-      (*m)[string(attr_name)] = attr_value;
+      (*m)[attr_name] = attr_value;
     }
   }
 
diff --git a/tensorflow/core/common_runtime/eager/context.cc b/tensorflow/core/common_runtime/eager/context.cc
index 583ae64edd16af7b86c4a2c9f708f0d3d0b8c843..1727c045604bd19e038857fa34780f34cbb05d44 100644
--- a/tensorflow/core/common_runtime/eager/context.cc
+++ b/tensorflow/core/common_runtime/eager/context.cc
@@ -15,6 +15,9 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/eager/context.h"
 
+#include "tensorflow/core/common_runtime/collective_executor_mgr.h"
+#include "tensorflow/core/common_runtime/collective_param_resolver_local.h"
+#include "tensorflow/core/common_runtime/device_resolver_local.h"
 #include "tensorflow/core/common_runtime/device_set.h"
 #include "tensorflow/core/common_runtime/process_util.h"
 #include "tensorflow/core/framework/resource_mgr.h"
@@ -71,6 +74,13 @@ EagerContext::EagerContext(const SessionOptions& opts,
   runner_ = [this](std::function<void()> closure) {
     this->thread_pool_->Schedule(std::move(closure));
   };
+
+  std::unique_ptr<DeviceResolverInterface> drl(
+      new DeviceResolverLocal(local_device_mgr()));
+  std::unique_ptr<ParamResolverInterface> cprl(new CollectiveParamResolverLocal(
+      local_device_mgr(), drl.get(), "/job:localhost/replica:0/task:0"));
+  collective_executor_mgr_.reset(new CollectiveExecutorMgr(
+      opts.config, local_device_mgr(), std::move(drl), std::move(cprl)));
 }
 
 void EagerContext::InitDeviceMapAndAsync() {
diff --git a/tensorflow/core/common_runtime/eager/context.h b/tensorflow/core/common_runtime/eager/context.h
index 51109f8f1ae67cf1a64e6c520dd063744cf8abce..cdef94789337550fdaa760638f098ba47af5dfdb 100644
--- a/tensorflow/core/common_runtime/eager/context.h
+++ b/tensorflow/core/common_runtime/eager/context.h
@@ -33,6 +33,7 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/eager/eager_client.h"
 #include "tensorflow/core/distributed_runtime/server_lib.h"
 #endif
+#include "tensorflow/core/framework/collective.h"
 #include "tensorflow/core/framework/log_memory.h"
 #include "tensorflow/core/framework/rendezvous.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
@@ -147,6 +148,11 @@ class EagerContext {
   bool LogMemory() { return log_memory_; }
 
   Rendezvous* GetRendezvous() { return rendezvous_; }
+  std::unique_ptr<CollectiveExecutor::Handle> GetCollectiveExecutorHandle() {
+    return std::unique_ptr<CollectiveExecutor::Handle>(
+        new CollectiveExecutor::Handle(
+            collective_executor_mgr_->FindOrCreate(0), true /*inherit_ref*/));
+  }
 
   const tensorflow::DeviceMgr* local_device_mgr() const {
     return (local_device_manager_ != nullptr) ? local_device_manager_.get()
@@ -273,6 +279,8 @@ class EagerContext {
 
   Env* const env_;
 
+  std::unique_ptr<CollectiveExecutorMgrInterface> collective_executor_mgr_;
+
 #ifndef __ANDROID__
   void CloseRemoteContexts();
 
diff --git a/tensorflow/core/common_runtime/eager/execute.cc b/tensorflow/core/common_runtime/eager/execute.cc
index 5bf7888fad5043ac9a02f0d9e2fc4362d6567661..783baa96c92f224e45404e5f6586011599f02292 100644
--- a/tensorflow/core/common_runtime/eager/execute.cc
+++ b/tensorflow/core/common_runtime/eager/execute.cc
@@ -263,7 +263,8 @@ Status EagerLocalExecute(EagerOperation* op,
     // Note that it is not ideal, but currently ok, to set this
     // attribute after computing the kernel cache key above.
     if (op->is_function() && device != nullptr &&
-        device->device_type() == "TPU") {
+        (device->device_type() == "TPU" || device->device_type() == "XLA_GPU" ||
+         device->device_type() == "XLA_CPU")) {
       op->MutableAttrs()->Set(kXlaCompileAttr, true);
     }
 
@@ -284,7 +285,8 @@ Status EagerLocalExecute(EagerOperation* op,
           "Unable to find a FunctionLibraryRuntime corresponding to device ",
           device->name());
     }
-    kernel = new KernelAndDevice(ctx->GetRendezvous(), ctx->LogMemory());
+    kernel = new KernelAndDevice(ctx->GetRendezvous(), ctx->LogMemory(),
+                                 ctx->GetCollectiveExecutorHandle());
     status = KernelAndDevice::Init(ndef, flr, ctx->runner(), kernel);
     if (!status.ok()) {
       delete kernel;
diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device.cc b/tensorflow/core/common_runtime/eager/kernel_and_device.cc
index 192d22dfd5a105a31ab19a33c29ddc83ecd04142..317e9a16074b37ef6ecaf1d7f8c1a2daa412f75e 100644
--- a/tensorflow/core/common_runtime/eager/kernel_and_device.cc
+++ b/tensorflow/core/common_runtime/eager/kernel_and_device.cc
@@ -84,6 +84,15 @@ Status KernelAndDevice::Run(ScopedStepContainer* step_container,
                              tensorflow::HOST_MEMORY);
   }
 
+  gtl::InlinedVector<DeviceContext*, 4> input_device_contexts;
+  for (int i = 0; i < inputs->size(); i++) {
+    DeviceContext* device_context = nullptr;
+    if (device_->tensorflow_gpu_device_info() != nullptr) {
+      device_context = device_->tensorflow_gpu_device_info()->default_context;
+    }
+    input_device_contexts.push_back(device_context);
+  }
+
   OpKernelContext::Params params;
   params.device = device_;
   params.frame_iter = FrameAndIter(0, 0);
@@ -110,6 +119,9 @@ Status KernelAndDevice::Run(ScopedStepContainer* step_container,
   }
 
   params.step_container = step_container;
+  params.collective_executor =
+      collective_executor_ ? collective_executor_->get() : nullptr;
+  params.input_device_contexts = &input_device_contexts;
 
   OpKernelContext context(&params);
 
diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device.h b/tensorflow/core/common_runtime/eager/kernel_and_device.h
index 52dac94ccca0cc987751400778c3c1c6e95272d6..ee430b7fc70e1f4e5256e9dd28f4240ce57de86a 100644
--- a/tensorflow/core/common_runtime/eager/kernel_and_device.h
+++ b/tensorflow/core/common_runtime/eager/kernel_and_device.h
@@ -23,6 +23,7 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/framework/cancellation.h"
+#include "tensorflow/core/framework/collective.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/types.h"
@@ -55,10 +56,16 @@ class KernelAndDevice {
                      KernelAndDevice* out);
 
   KernelAndDevice(tensorflow::Rendezvous* rendez, bool log_memory)
+      : KernelAndDevice(rendez, log_memory, nullptr) {}
+
+  KernelAndDevice(
+      tensorflow::Rendezvous* rendez, bool log_memory,
+      std::unique_ptr<CollectiveExecutor::Handle> collective_executor)
       : device_(nullptr),
         flr_(nullptr),
         rendez_(rendez),
-        log_memory_(log_memory) {}
+        log_memory_(log_memory),
+        collective_executor_(std::move(collective_executor)) {}
 
   // TODO(ashankar): Handle list-valued inputs.
   Status Run(std::vector<Tensor>* inputs, std::vector<Tensor>* outputs,
@@ -92,6 +99,7 @@ class KernelAndDevice {
   std::function<void(std::function<void()>)>* runner_;
   std::function<void(std::function<void()>)> default_runner_;
   const bool log_memory_;
+  const std::unique_ptr<CollectiveExecutor::Handle> collective_executor_;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.cc b/tensorflow/core/common_runtime/eager/tensor_handle.cc
index d8d6b7a63b6f7189d4db66846a2f48982a20e610..0acd1609361453a0901e346f3b9d76e6e3a7b872 100644
--- a/tensorflow/core/common_runtime/eager/tensor_handle.cc
+++ b/tensorflow/core/common_runtime/eager/tensor_handle.cc
@@ -184,10 +184,7 @@ Status TensorHandle::CopyToDevice(EagerContext* ctx, tensorflow::Device* dstd,
   bool is_same_device = (srcd == dstd) || (srcd->name() == dstd->name());
   const bool dst_cpu = dstd->tensorflow_gpu_device_info() == nullptr;
   const bool src_cpu = srcd->tensorflow_gpu_device_info() == nullptr;
-  // both_on_cpu can be true and yet is_same_device is false, if one of src/dst
-  // has device type XLA_CPU, and the other CPU.
-  const bool both_on_cpu = src_cpu && dst_cpu;
-  if (is_same_device || both_on_cpu) {
+  if (is_same_device) {
     *output = new tensorflow::TensorHandle(*src, dstd, dstd, ctx);
     return tensorflow::Status::OK();
   }
diff --git a/tensorflow/core/common_runtime/executor.h b/tensorflow/core/common_runtime/executor.h
index 34bf73972f57306eb9cfda08d8277f0bedfcafa9..02930168a4b053895827a54d065011bc9d657463 100644
--- a/tensorflow/core/common_runtime/executor.h
+++ b/tensorflow/core/common_runtime/executor.h
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/framework/session_state.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/notification.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
@@ -171,41 +172,40 @@ class ExecutorBarrier {
 
   mutable mutex mu_;
   int pending_ GUARDED_BY(mu_) = 0;
-  Status status_ GUARDED_BY(mu_);
+  StatusGroup status_group_ GUARDED_BY(mu_);
 
   void WhenDone(const Status& s) {
-    bool error = false;
     Rendezvous* error_rendez = nullptr;
     StatusCallback done = nullptr;
     Status status;
+
     {
       mutex_lock l(mu_);
-      // If we are the first error encountered, mark the status
-      // appropriately and later trigger an abort of the Rendezvous
-      // object by this thread only.
-      if (status_.ok() && !s.ok()) {
-        error = true;
+
+      // If we are the first error encountered, trigger an abort of the
+      // Rendezvous object by this thread only.
+      if (status_group_.ok() && !s.ok()) {
         error_rendez = rendez_;
         error_rendez->Ref();
-        status_ = s;
       }
 
+      status_group_.Update(s);
+
       // If this is the last call to WhenDone, call the final callback
       // below.
       if (--pending_ == 0) {
         CHECK(done_cb_ != nullptr);
         std::swap(done, done_cb_);
-      }
-
-      if (!status_.ok()) {
-        status = status_;
+        status = status_group_.as_status();
       }
     }
 
-    if (error) {
-      error_rendez->StartAbort(status);
+    if (error_rendez != nullptr) {
+      error_rendez->StartAbort(
+          errors::Aborted("Stopping remaining executors."));
       error_rendez->Unref();
     }
+
     if (done != nullptr) {
       delete this;
       done(status);
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
index 5152d97fdefed688ba05043072ff6df635471ed9..14b57cc337b3a089645bdfa5e90cc66af38616d8 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -601,7 +601,9 @@ Status BaseGPUDevice::MaybeCopyTensorToGPU(
         [to, copy](StatusCallback done_,
                    // Begin unbound arguments.
                    const Status& s) {
-          *to = std::move(*copy);
+          if (s.ok()) {
+            *to = std::move(*copy);
+          }
           delete copy;
           done_(s);
         },
diff --git a/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc b/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc
index d2adf699f524ef6771da6b0a41e7fc552d2bbdfa..fe3214755715a896b472835652be68c5ef65a6e9 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc
@@ -78,7 +78,8 @@ static std::atomic_int_fast64_t live_tensor_bytes(0);
 // A TensorBuffer that counts live memory usage for testing
 class TestTensorBuffer : public TensorBuffer {
  public:
-  explicit TestTensorBuffer(size_t bytes) : bytes_(bytes) {
+  explicit TestTensorBuffer(size_t bytes)
+      : TensorBuffer(nullptr), bytes_(bytes) {
     live_tensor_bytes += bytes_;
   }
   ~TestTensorBuffer() override { live_tensor_bytes -= bytes_; }
@@ -86,7 +87,6 @@ class TestTensorBuffer : public TensorBuffer {
   size_t size() const override { return bytes_; }
 
   // Not used in this test
-  void* data() const override { return nullptr; }
   TensorBuffer* root_buffer() override { return nullptr; }
   void FillAllocationDescription(AllocationDescription* arg) const override {}
 
diff --git a/tensorflow/core/common_runtime/gpu/gpu_process_state.cc b/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
index a9a19f0fe04d1535e442ea37e51aba26eab69dc8..8167cfb9d7dc6cd91a17323b3083d1823cbaa5e0 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
@@ -70,7 +70,10 @@ int GPUProcessState::BusIdForGPU(TfGpuId tf_gpu_id) {
   // Return the NUMA node associated with the GPU's StreamExecutor.
   se::StreamExecutor* se =
       GpuIdUtil::ExecutorForTfGpuId(tf_gpu_id).ValueOrDie();
-  return se->GetDeviceDescription().numa_node();
+  int numa_node = se->GetDeviceDescription().numa_node();
+  // bus_id must be non-negative.  If the numa_node is not known,
+  // use 0.
+  return numa_node >= 0 ? numa_node : 0;
 }
 
 Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options,
@@ -97,6 +100,7 @@ Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options,
     PlatformGpuId platform_gpu_id;
     TF_CHECK_OK(GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id));
     int bus_id = BusIdForGPU(tf_gpu_id);
+    DCHECK_GE(bus_id, 0);
     while (bus_id >= gpu_visitors_.size()) {
       gpu_visitors_.push_back({});
     }
@@ -249,6 +253,7 @@ void GPUProcessState::AddGPUAllocVisitor(int bus_id,
   CHECK(gpu_allocators_.empty())  // Crash OK
       << "AddGPUAllocVisitor must be called before "
          "first call to GetGPUAllocator.";
+  DCHECK_GE(bus_id, 0);
   while (bus_id >= static_cast<int64>(gpu_visitors_.size())) {
     gpu_visitors_.push_back(std::vector<SubAllocator::Visitor>());
   }
diff --git a/tensorflow/core/common_runtime/gpu/gpu_util_platform_specific.cc b/tensorflow/core/common_runtime/gpu/gpu_util_platform_specific.cc
index 4bc88ffc8c3950176ae05f32c774f2f2971a4e34..0ef39fb3d78044a8611b315afbdeb4975a3af15f 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_util_platform_specific.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_util_platform_specific.cc
@@ -37,6 +37,14 @@ void GPUDeviceContext::CopyDeviceTensorToCPU(const Tensor* device_tensor,
   GPUUtil::CopyGPUTensorToCPU(device, this, device_tensor, cpu_tensor, done);
 }
 
+void GPUDeviceContext::CopyTensorInSameDevice(const Tensor* input_tensor,
+                                              Device* device,
+                                              Tensor* output_tensor,
+                                              StatusCallback done) const {
+  GPUUtil::CopyGPUTensorToSameGPU(device, this, input_tensor, output_tensor,
+                                  done);
+}
+
 Status GPUDeviceContext::ThenExecute(Device* device, se::Stream* stream,
                                      std::function<void()> func) {
   const DeviceBase::GpuDeviceInfo* gpu_info =
diff --git a/tensorflow/core/common_runtime/gpu_device_context.h b/tensorflow/core/common_runtime/gpu_device_context.h
index 3603808152748009f29d1d01f0eeee0dd8b6ab0e..f5135267241db94a0afdd9845b09dbfdda242ecc 100644
--- a/tensorflow/core/common_runtime/gpu_device_context.h
+++ b/tensorflow/core/common_runtime/gpu_device_context.h
@@ -57,6 +57,10 @@ class GPUDeviceContext : public DeviceContext {
                              Device* device, Tensor* cpu_tensor,
                              StatusCallback done) override;
 
+  void CopyTensorInSameDevice(const Tensor* input_tensor, Device* device,
+                              Tensor* output_tensor,
+                              StatusCallback done) const override;
+
   void MaintainLifetimeOnStream(const Tensor* t,
                                 se::Stream* stream) const override {}
 
diff --git a/tensorflow/core/common_runtime/graph_execution_state.cc b/tensorflow/core/common_runtime/graph_execution_state.cc
index 9a56c671623b2304ea02fdad5d3a3a2304fb3876..04d658f0472e3ea07855f4bae6a89ad5199eb2f9 100644
--- a/tensorflow/core/common_runtime/graph_execution_state.cc
+++ b/tensorflow/core/common_runtime/graph_execution_state.cc
@@ -546,10 +546,6 @@ Status GraphExecutionState::InitBaseGraph(const BuildGraphOptions& options) {
   std::unique_ptr<Graph> new_graph(new Graph(OpRegistry::Global()));
   GraphConstructorOptions opts;
   TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(opts, *graph_def, new_graph.get()));
-  for (const Node* n : new_graph->nodes()) {
-    VLOG(2) << "Mapping " << n->name() << " to " << n->cost_id();
-    node_name_to_cost_id_map_[n->name()] = n->cost_id();
-  }
   if (session_options_ &&
       session_options_->config.graph_options().place_pruned_graph()) {
     // Rewrite the graph before placement.
@@ -578,6 +574,11 @@ Status GraphExecutionState::InitBaseGraph(const BuildGraphOptions& options) {
   TF_RETURN_IF_ERROR(OptimizationPassRegistry::Global()->RunGrouping(
       OptimizationPassRegistry::POST_PLACEMENT, optimization_options));
 
+  for (const Node* n : new_graph->nodes()) {
+    VLOG(2) << "Mapping " << n->name() << " to " << n->cost_id();
+    node_name_to_cost_id_map_[n->name()] = n->cost_id();
+  }
+
   SaveStatefulNodes(new_graph.get());
   graph_ = new_graph.release();
   return Status::OK();
@@ -595,6 +596,13 @@ Status GraphExecutionState::OptimizeGraph(
     grappler::GrapplerItem item;
     item.id = "tf_graph";
     graph_->ToGraphDef(&item.graph);
+
+    // It's ok to skip invalid device annotations in Grappler.
+    Status inferred_devices = item.InferDevicesFromGraph();
+    if (!inferred_devices.ok()) {
+      VLOG(3) << inferred_devices.error_message();
+    }
+
     // TODO(b/114748242): Add a unit test to test this bug fix.
     if (flib_def_) {
       *item.graph.mutable_library() = flib_def_->ToProto();
diff --git a/tensorflow/core/common_runtime/lower_if_op.cc b/tensorflow/core/common_runtime/lower_if_op.cc
index 9738006f5ca9eb821439a9ad507aec3db434946c..241c403087c814717d873fc3d4d4c2c4f71e50ae 100644
--- a/tensorflow/core/common_runtime/lower_if_op.cc
+++ b/tensorflow/core/common_runtime/lower_if_op.cc
@@ -89,6 +89,7 @@ class CondBuilder {
   const FunctionLibraryDefinition& flib_;
   string name_;
 
+  NodeDebugInfo debug_info_;
   NodeBuilder then_call_builder_;
   NodeBuilder else_call_builder_;
 };
@@ -100,8 +101,11 @@ CondBuilder::CondBuilder(Node* if_op, const string& then_fn_name,
       graph_(graph),
       flib_(flib),
       name_(if_op->name()),
-      then_call_builder_(NewName("then"), then_fn_name, graph->op_registry()),
-      else_call_builder_(NewName("else"), else_fn_name, graph->op_registry()) {
+      debug_info_(*if_op_),
+      then_call_builder_(NewName("then"), then_fn_name, graph->op_registry(),
+                         &debug_info_),
+      else_call_builder_(NewName("else"), else_fn_name, graph->op_registry(),
+                         &debug_info_) {
   TF_CHECK_OK(if_op_->input_tensor(0, &pred_));
   then_call_builder_.Device(if_op_->requested_device());
   else_call_builder_.Device(if_op_->requested_device());
@@ -111,23 +115,23 @@ Status CondBuilder::CreatePivotNodes() {
   // Construct the basic cond body (consisting of feeding in the predicate to
   // create pivot nodes).
   Node* switch_pred;
-  TF_RETURN_IF_ERROR(
-      NodeBuilder(NewName("switch_pred"), "Switch", graph_->op_registry())
-          .Input(NodeOut(pred_))
-          .Input(NodeOut(pred_))
-          .Device(if_op_->requested_device())
-          .Finalize(graph_, &switch_pred));
+  TF_RETURN_IF_ERROR(NodeBuilder(NewName("switch_pred"), "Switch",
+                                 graph_->op_registry(), &debug_info_)
+                         .Input(NodeOut(pred_))
+                         .Input(NodeOut(pred_))
+                         .Device(if_op_->requested_device())
+                         .Finalize(graph_, &switch_pred));
   control_predecessor_ = switch_pred;
-  TF_RETURN_IF_ERROR(
-      NodeBuilder(NewName("pivot_f"), "Identity", graph_->op_registry())
-          .Input(switch_pred, kElseBranch)
-          .Device(if_op_->requested_device())
-          .Finalize(graph_, &pivot_f_));
-  TF_RETURN_IF_ERROR(
-      NodeBuilder(NewName("pivot_t"), "Identity", graph_->op_registry())
-          .Input(switch_pred, kThenBranch)
-          .Device(if_op_->requested_device())
-          .Finalize(graph_, &pivot_t_));
+  TF_RETURN_IF_ERROR(NodeBuilder(NewName("pivot_f"), "Identity",
+                                 graph_->op_registry(), &debug_info_)
+                         .Input(switch_pred, kElseBranch)
+                         .Device(if_op_->requested_device())
+                         .Finalize(graph_, &pivot_f_));
+  TF_RETURN_IF_ERROR(NodeBuilder(NewName("pivot_t"), "Identity",
+                                 graph_->op_registry(), &debug_info_)
+                         .Input(switch_pred, kThenBranch)
+                         .Device(if_op_->requested_device())
+                         .Finalize(graph_, &pivot_t_));
   return Status::OK();
 }
 
@@ -137,12 +141,13 @@ string CondBuilder::NewName(const string& infix) {
 
 Status CondBuilder::AddInput(Node* src, int src_output) {
   Node* input;
-  TF_RETURN_IF_ERROR(
-      NodeBuilder(NewName(src->name()), "Switch", graph_->op_registry())
-          .Input(src, src_output)
-          .Input(pred_)
-          .Device(if_op_->requested_device())
-          .Finalize(graph_, &input));
+  NodeDebugInfo debug_info(*src);
+  TF_RETURN_IF_ERROR(NodeBuilder(NewName(src->name()), "Switch",
+                                 graph_->op_registry(), &debug_info)
+                         .Input(src, src_output)
+                         .Input(pred_)
+                         .Device(if_op_->requested_device())
+                         .Finalize(graph_, &input));
   then_call_builder_.Input(input, kThenBranch);
   else_call_builder_.Input(input, kElseBranch);
   return Status::OK();
@@ -178,7 +183,8 @@ Status CondBuilder::AddOutputs() {
   outputs_.resize(merges.size());
   for (int i = 0; i < then_call_node_->num_outputs(); ++i) {
     TF_RETURN_IF_ERROR(
-        NodeBuilder(graph_->NewName("merge"), "Merge", graph_->op_registry())
+        NodeBuilder(graph_->NewName("merge"), "Merge", graph_->op_registry(),
+                    &debug_info_)
             .Input({NodeOut(then_call_node_, i), NodeOut(else_call_node_, i)})
             .Device(if_op_->requested_device())
             .Finalize(graph_, &merges[i]));
diff --git a/tensorflow/core/common_runtime/lower_while_op.cc b/tensorflow/core/common_runtime/lower_while_op.cc
index 6f9921a7968b9cad4bc96b21600fdb026636bc2a..8b68c31a722474e4b73fa9e1d46ccafbc7b66ddd 100644
--- a/tensorflow/core/common_runtime/lower_while_op.cc
+++ b/tensorflow/core/common_runtime/lower_while_op.cc
@@ -133,6 +133,7 @@ class LowerWhileHelper {
   // Name of the `while_op_`.
   string name_;
 
+  NodeDebugInfo debug_info_;
   NodeBuilder cond_call_builder_;
   NodeBuilder body_call_builder_;
 
@@ -152,8 +153,11 @@ LowerWhileHelper::LowerWhileHelper(Node* while_op, const string& cond_fn_name,
       graph_(graph),
       flib_(flib),
       name_(while_op->name()),
-      cond_call_builder_(NewName("cond"), cond_fn_name, graph->op_registry()),
-      body_call_builder_(NewName("body"), body_fn_name, graph->op_registry()),
+      debug_info_(*while_op_),
+      cond_call_builder_(NewName("cond"), cond_fn_name, graph->op_registry(),
+                         &debug_info_),
+      body_call_builder_(NewName("body"), body_fn_name, graph->op_registry(),
+                         &debug_info_),
       num_loop_inputs_(while_op_->num_inputs()) {
   // We intentionally `resize` instead of `reserve` space in `enter_nodes_`
   // because we need to set it's elements out of order in `CreateEnterNodes`.
@@ -186,11 +190,11 @@ Status LowerWhileHelper::CreateEnterNodes() {
   TF_RETURN_IF_ERROR(while_op_->input_edges(&edges));
   for (const Edge* edge : edges) {
     Node* enter_node;
-    TF_RETURN_IF_ERROR(
-        NodeBuilder(NewName("enter"), "Enter", graph_->op_registry())
-            .Input(NodeOut(edge->src(), edge->src_output()))
-            .Attr("frame_name", name_)
-            .Finalize(graph_, &enter_node));
+    TF_RETURN_IF_ERROR(NodeBuilder(NewName("enter"), "Enter",
+                                   graph_->op_registry(), &debug_info_)
+                           .Input(NodeOut(edge->src(), edge->src_output()))
+                           .Attr("frame_name", name_)
+                           .Finalize(graph_, &enter_node));
     enter_nodes_[edge->dst_input()] = enter_node;
   }
   // Create a NoOp node that takes incoming control inputs of the original While
@@ -203,10 +207,10 @@ Status LowerWhileHelper::CreateEnterNodes() {
   }
   if (!control_inputs.empty()) {
     Node* incoming_control_node;
-    TF_RETURN_IF_ERROR(
-        NodeBuilder(NewName("LoopControlInputs"), "NoOp", graph_->op_registry())
-            .ControlInputs(control_inputs)
-            .Finalize(graph_, &incoming_control_node));
+    TF_RETURN_IF_ERROR(NodeBuilder(NewName("LoopControlInputs"), "NoOp",
+                                   graph_->op_registry(), &debug_info_)
+                           .ControlInputs(control_inputs)
+                           .Finalize(graph_, &incoming_control_node));
     for (Node* n : enter_nodes_) {
       graph_->AddControlEdge(incoming_control_node, n);
     }
@@ -218,7 +222,8 @@ Status LowerWhileHelper::CreateMergeNodes() {
   for (Node* enter_node : enter_nodes_) {
     Node* merge_node;
     TF_RETURN_IF_ERROR(
-        NodeBuilder(NewName("merge"), "Merge", graph_->op_registry())
+        NodeBuilder(NewName("merge"), "Merge", graph_->op_registry(),
+                    &debug_info_)
             .Input({NodeOut(enter_node, 0), NodeOut(enter_node, 0)})
             .Finalize(graph_, &merge_node));
     merge_nodes_.emplace_back(merge_node);
@@ -235,10 +240,10 @@ Status LowerWhileHelper::CreateCondFuncCallNode() {
   // are in the same frame as the rest of the function, otherwise
   // `BuildControlFlowInfo` throws an error.
   graph_->AddControlEdge(merge_nodes_[0], cond_call_node_);
-  TF_RETURN_IF_ERROR(
-      NodeBuilder(NewName("LoopCond"), "LoopCond", graph_->op_registry())
-          .Input(NodeOut(cond_call_node_, 0))
-          .Finalize(graph_, &loop_cond_node_));
+  TF_RETURN_IF_ERROR(NodeBuilder(NewName("LoopCond"), "LoopCond",
+                                 graph_->op_registry(), &debug_info_)
+                         .Input(NodeOut(cond_call_node_, 0))
+                         .Finalize(graph_, &loop_cond_node_));
   return Status::OK();
 }
 
@@ -255,11 +260,11 @@ Status LowerWhileHelper::CreateSwitchNodes() {
     if (IsRefType(merge_nodes_[i]->output_type(0))) {
       op_type = "RefSwitch";
     }
-    TF_RETURN_IF_ERROR(
-        NodeBuilder(NewName(op_name), op_type, graph_->op_registry())
-            .Input(NodeOut(merge_nodes_[i], 0))
-            .Input(NodeOut(loop_cond_node_, 0))
-            .Finalize(graph_, &switch_node));
+    TF_RETURN_IF_ERROR(NodeBuilder(NewName(op_name), op_type,
+                                   graph_->op_registry(), &debug_info_)
+                           .Input(NodeOut(merge_nodes_[i], 0))
+                           .Input(NodeOut(loop_cond_node_, 0))
+                           .Finalize(graph_, &switch_node));
     switch_nodes_.emplace_back(switch_node);
   }
   return Status::OK();
@@ -282,10 +287,10 @@ Status LowerWhileHelper::CreateBodyFuncCallNode() {
   if (IsRefType(switch_nodes_[0]->output_type(1))) {
     op_type = "RefIdentity";
   }
-  TF_RETURN_IF_ERROR(
-      NodeBuilder(NewName("loop_body_control"), op_type, graph_->op_registry())
-          .Input(NodeOut(switch_nodes_[0], 1))
-          .Finalize(graph_, &body_control_node_));
+  TF_RETURN_IF_ERROR(NodeBuilder(NewName("loop_body_control"), op_type,
+                                 graph_->op_registry(), &debug_info_)
+                         .Input(NodeOut(switch_nodes_[0], 1))
+                         .Finalize(graph_, &body_control_node_));
   graph_->AddControlEdge(body_control_node_, body_call_node_);
   return Status::OK();
 }
@@ -295,10 +300,10 @@ Status LowerWhileHelper::CreateExitNodes() {
   outputs.reserve(num_loop_inputs_);
   for (Node* switch_node : switch_nodes_) {
     Node* exit_node;
-    TF_RETURN_IF_ERROR(
-        NodeBuilder(NewName("exit"), "Exit", graph_->op_registry())
-            .Input(NodeOut(switch_node, 0))
-            .Finalize(graph_, &exit_node));
+    TF_RETURN_IF_ERROR(NodeBuilder(NewName("exit"), "Exit",
+                                   graph_->op_registry(), &debug_info_)
+                           .Input(NodeOut(switch_node, 0))
+                           .Finalize(graph_, &exit_node));
     exit_nodes_.emplace_back(exit_node);
     outputs.emplace_back(NodeOut(exit_node, 0));
   }
@@ -307,7 +312,7 @@ Status LowerWhileHelper::CreateExitNodes() {
   // original functional While op. This is used for
   // 1. Rewiring the control edges with the original while op as src.
   // 2. Fetching the output of the While node by name in calls to sess.run.
-  NodeBuilder ib(name_, "IdentityN");
+  NodeBuilder ib(name_, "IdentityN", OpRegistry::Global(), &debug_info_);
   ib.Input(outputs);
   TF_RETURN_IF_ERROR(ib.Finalize(graph_, &lowered_while_output_));
   return Status::OK();
@@ -317,7 +322,7 @@ Status LowerWhileHelper::CreateNextIterationNodes() {
   for (int i = 0; i < num_loop_inputs_; i++) {
     Node* next_iteration;
     TF_RETURN_IF_ERROR(NodeBuilder(NewName("next_iteration"), "NextIteration",
-                                   graph_->op_registry())
+                                   graph_->op_registry(), &debug_info_)
                            .Input(NodeOut(body_call_node_, i))
                            .Finalize(graph_, &next_iteration));
     next_iterations_nodes_.emplace_back(next_iteration);
diff --git a/tensorflow/core/common_runtime/parallel_concat_optimizer.cc b/tensorflow/core/common_runtime/parallel_concat_optimizer.cc
index 6af4ca4d961d96a46be67e3770434e380658f32a..ecb2670a74b9387f46ed21eb5bc40c87136b3254 100644
--- a/tensorflow/core/common_runtime/parallel_concat_optimizer.cc
+++ b/tensorflow/core/common_runtime/parallel_concat_optimizer.cc
@@ -52,7 +52,8 @@ class ParallelConcatRemovePass : public GraphOptimizationPass {
       AttrSlice n_attrs = n->attrs();
       auto base_make_node = [n, &n_attrs](const string& op,
                                           const string& name) {
-        NodeBuilder node_builder(name, op);
+        NodeDebugInfo debug_info(*n);
+        NodeBuilder node_builder(name, op, OpRegistry::Global(), &debug_info);
         node_builder.Device(n->requested_device());
         string colo;
         if (GetNodeAttr(n_attrs, "_class", &colo).ok()) {
diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc
index bc8ba6e47d5c66eab72eacd1f4d9a65a4b9cae6c..5a524eba7625f43116eea762c0e8171a746a8ae6 100644
--- a/tensorflow/core/distributed_runtime/master_session.cc
+++ b/tensorflow/core/distributed_runtime/master_session.cc
@@ -512,25 +512,25 @@ class RunManyGraphs {
     if (resp->status_code() != error::Code::OK) {
       // resp->status_code will only be non-OK if s.ok().
       mutex_lock l(mu_);
-      UpdateStatusLocked(
+      ReportBadStatus(
           Status(resp->status_code(), resp->status_error_message()));
     } else if (!s.ok()) {
       mutex_lock l(mu_);
-      UpdateStatusLocked(s);
+      ReportBadStatus(s);
     }
     pending_.DecrementCount();
   }
 
   void StartCancel() {
     mutex_lock l(mu_);
-    UpdateStatusLocked(errors::Cancelled("RunManyGraphs"));
+    ReportBadStatus(errors::Cancelled("RunManyGraphs"));
   }
 
   void Wait() { pending_.Wait(); }
 
   Status status() const {
     mutex_lock l(mu_);
-    return status_;
+    return status_group_.as_status();
   }
 
  private:
@@ -538,15 +538,17 @@ class RunManyGraphs {
 
   BlockingCounter pending_;
   mutable mutex mu_;
-  Status status_ GUARDED_BY(mu_);
+  StatusGroup status_group_ GUARDED_BY(mu_);
 
-  void UpdateStatusLocked(const Status& s) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-    if (status_.ok()) {
-      status_ = s;
+  void ReportBadStatus(const Status& s) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+    // Start cancellation if we aren't already in an error state.
+    if (status_group_.ok()) {
       for (Call& call : calls_) {
         call.opts.StartCancel();
       }
     }
+
+    status_group_.Update(s);
   }
 
   TF_DISALLOW_COPY_AND_ASSIGN(RunManyGraphs);
@@ -1352,7 +1354,9 @@ Status MasterSession::DeleteWorkerSessions() {
         &workers[i].call_opts, &workers[i].request, &workers[i].response, cb);
   }
 
-  done.Wait();
+  if (!done.WaitFor(std::chrono::milliseconds(10000))) {
+    LOG(WARNING) << "Timeout for closing worker session";
+  }
   for (size_t i = 0; i < workers.size(); ++i) {
     status.Update(workers[i].status);
   }
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc b/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc
index 781b7d65cdd184363d7c7650305bd62f3129c271..e5634d38bd993572b877228f1c7b1af9706bcf0c 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc
@@ -60,6 +60,8 @@ Status ValidateHostPortPair(const string& host_port) {
   // TODO(mrry): Implement secure channels.
   ::grpc::ChannelArguments args;
   args.SetInt(GRPC_ARG_MAX_MESSAGE_LENGTH, std::numeric_limits<int32>::max());
+  args.SetInt(GRPC_ARG_KEEPALIVE_TIME_MS, std::numeric_limits<int>::max());
+  args.SetInt(GRPC_ARG_KEEPALIVE_TIMEOUT_MS, std::numeric_limits<int>::max());
   // NOTE(mrry): Some versions of gRPC use a 20-second minimum backoff
   // on connection failure, which makes our tests time out.
   args.SetInt("grpc.testing.fixed_reconnect_backoff_ms", 1000);
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
index cbd5cd927e7d73fd0ed28a910c89eef1f73b0d91..08518606f607bf733bcfe9f927890bb1e05f6cde 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
@@ -110,6 +110,8 @@ GrpcServer::~GrpcServer() {
   // - worker_env_.compute_pool
 }
 
+void GrpcServer::MaybeMutateBuilder(::grpc::ServerBuilder* builder) {}
+
 Status GrpcServer::Init(
     ServiceInitFunction service_func,
     const RendezvousMgrCreationFunction& rendezvous_mgr_func,
@@ -191,6 +193,11 @@ Status GrpcServer::Init(
   builder.AddListeningPort(strings::StrCat("0.0.0.0:", requested_port),
                            GetServerCredentials(server_def_), &bound_port_);
   builder.SetMaxMessageSize(std::numeric_limits<int32>::max());
+  builder.AddChannelArgument(GRPC_ARG_KEEPALIVE_TIME_MS,
+                             std::numeric_limits<int>::max());
+  builder.AddChannelArgument(GRPC_ARG_KEEPALIVE_TIMEOUT_MS,
+                             std::numeric_limits<int>::max());
+
   builder.SetOption(
       std::unique_ptr<::grpc::ServerBuilderOption>(new NoReusePortOption));
   // Allow subclasses to specify more args to pass to the gRPC server.
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
index c1395abddebd1af780ade4884b3f5af239c5fb0e..c7f543e5bfc0655a603da7436eaaca5351b2f07a 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
@@ -62,7 +62,7 @@ class GrpcServer : public ServerInterface {
   GrpcServer(const ServerDef& server_def, Env* env);
   // Allow children classes to override this and provide custom args to the
   // server before it is constructed. Default behavior is to do nothing.
-  virtual void MaybeMutateBuilder(::grpc::ServerBuilder* builder) {}
+  virtual void MaybeMutateBuilder(::grpc::ServerBuilder* builder);
 
  public:
   static Status Create(const ServerDef& server_def, Env* env,
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_session.cc b/tensorflow/core/distributed_runtime/rpc/grpc_session.cc
index 1ad40fe2971cfbd05fe868ad4ae39edd9f9775f9..32063fecbbef4347bcdbfbdfda32f008015b5975 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_session.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_session.cc
@@ -92,6 +92,12 @@ void ReEncodeConsts(GraphDef* gdef) {
 }
 }  // namespace
 
+void GrpcSession::SetHandleAndGraphVersion(string handle, int64 graph_version) {
+  mutex_lock l(mu_);
+  handle_ = std::move(handle);
+  current_graph_version_ = graph_version;
+}
+
 Status GrpcSession::Handle(string* out_handle) {
   mutex_lock l(mu_);
   if (handle_.empty()) {
@@ -117,9 +123,7 @@ Status GrpcSession::CreateImpl(CallOptions* call_options,
   CreateSessionResponse resp;
   Status s = master_->CreateSession(call_options, &req, &resp);
   if (s.ok()) {
-    mutex_lock l(mu_);
-    swap(handle_, *(resp.mutable_session_handle()));
-    current_graph_version_ = resp.graph_version();
+    SetHandleAndGraphVersion(resp.session_handle(), resp.graph_version());
   }
   return s;
 }
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_session.h b/tensorflow/core/distributed_runtime/rpc/grpc_session.h
index 63795117f9763434f5ff331d3d2d3bdb99413e81..a3ed3ec73669a0844c27af90e974131574174e88 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_session.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_session.h
@@ -106,9 +106,12 @@ class GrpcSession : public Session {
  protected:
   // Takes ownership of `*master`.
   void SetRemoteMaster(std::unique_ptr<MasterInterface> master);
+  // Allows subclasses to customize Session creation.
+  void SetHandleAndGraphVersion(string handle, int64 graph_version)
+      LOCKS_EXCLUDED(mu_);
 
  private:
-  SessionOptions options_;
+  const SessionOptions options_;
   std::unique_ptr<MasterInterface> master_;
   mutex mu_;
 
diff --git a/tensorflow/core/distributed_runtime/worker.cc b/tensorflow/core/distributed_runtime/worker.cc
index f42143e5824827e35a97ac25cb80b0e2c82e716e..c6e34c568e73d9dc4ccf007088c79a054f8f1a18 100644
--- a/tensorflow/core/distributed_runtime/worker.cc
+++ b/tensorflow/core/distributed_runtime/worker.cc
@@ -104,7 +104,8 @@ void Worker::AbortStep(int64 step_id) {
     // Delay a bit before aborting the step. This way, the root
     // cause may return first back to the client instead of this
     // cancellation generated abort error.
-    rendez->StartAbort(errors::Aborted("Step ", step_id));
+    rendez->StartAbort(errors::Aborted("Step ", step_id,
+                                       " cancelled.  Cancelling rendezvous."));
     rendez->Unref();
   });
 }
diff --git a/tensorflow/core/framework/dataset.cc b/tensorflow/core/framework/dataset.cc
index 6af14150b70fd1f1a6d73f24077814e0054a57d8..6e214332710c9f2e854db99ec588424c8df81145 100644
--- a/tensorflow/core/framework/dataset.cc
+++ b/tensorflow/core/framework/dataset.cc
@@ -17,6 +17,8 @@ limitations under the License.
 
 #include "tensorflow/core/framework/device_base.h"
 #include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/variant_encode_decode.h"
+#include "tensorflow/core/framework/variant_op_registry.h"
 #include "tensorflow/core/graph/graph_def_builder.h"
 #include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/platform/mutex.h"
@@ -74,6 +76,113 @@ class DatasetVariantWrapper {
   DatasetBase* const dataset_;  // Owns one reference.
 };
 
+const char kWrappedDatasetVariantTypeName[] =
+    "tensorflow::data::WrappedDatasetVariant";
+
+class WrappedDatasetVariantWrapper {
+ public:
+  WrappedDatasetVariantWrapper() {}
+
+  explicit WrappedDatasetVariantWrapper(const Tensor& ds_tensor)
+      : ds_tensor_(ds_tensor) {}
+
+  Tensor get() const { return ds_tensor_; }
+
+  string TypeName() const { return "tensorflow::WrappedDatasetVariantWrapper"; }
+
+  string DebugString() const {
+    return "tensorflow::WrappedDatasetVariantWrapper::DebugString";
+  }
+
+  void Encode(VariantTensorData* data) const {
+    *(data->add_tensors()) = ds_tensor_;
+  }
+
+  bool Decode(const VariantTensorData& data) {
+    ds_tensor_ = data.tensors(0);
+    return true;
+  }
+
+ private:
+  Tensor ds_tensor_;
+};
+
+class WrapDatasetVariantOp : public OpKernel {
+ public:
+  explicit WrapDatasetVariantOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& tensor = ctx->input(0);
+    OP_REQUIRES(ctx,
+                tensor.dtype() == DT_VARIANT &&
+                    TensorShapeUtils::IsScalar(tensor.shape()),
+                errors::InvalidArgument(
+                    "Dataset tensor must be a scalar of dtype DT_VARIANT."));
+    DatasetBase* unused;
+    OP_REQUIRES_OK(ctx, GetDatasetFromVariantTensor(tensor, &unused));
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &output));
+    output->scalar<Variant>()() = WrappedDatasetVariantWrapper(tensor);
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("WrapDatasetVariant").Device(DEVICE_CPU),
+                        WrapDatasetVariantOp);
+REGISTER_KERNEL_BUILDER(Name("WrapDatasetVariant")
+                            .HostMemory("input_handle")
+                            .HostMemory("output_handle")
+                            .Device(DEVICE_GPU),
+                        WrapDatasetVariantOp);
+
+class UnwrapDatasetVariantOp : public OpKernel {
+ public:
+  explicit UnwrapDatasetVariantOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& tensor = ctx->input(0);
+    OP_REQUIRES(ctx,
+                tensor.dtype() == DT_VARIANT &&
+                    TensorShapeUtils::IsScalar(tensor.shape()),
+                errors::InvalidArgument(
+                    "Dataset tensor must be a scalar of dtype DT_VARIANT."));
+    Variant variant = tensor.scalar<Variant>()();
+    const WrappedDatasetVariantWrapper* wrapper =
+        variant.get<WrappedDatasetVariantWrapper>();
+    OP_REQUIRES(ctx, wrapper != nullptr,
+                errors::InvalidArgument(
+                    "Tensor must be a WrappedDataset variant object."));
+    Tensor ds_tensor = wrapper->get();
+    OP_REQUIRES_OK(ctx, ctx->set_output("output_handle", ds_tensor));
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("UnwrapDatasetVariant").Device(DEVICE_CPU),
+                        UnwrapDatasetVariantOp);
+REGISTER_KERNEL_BUILDER(Name("UnwrapDatasetVariant")
+                            .HostMemory("input_handle")
+                            .HostMemory("output_handle")
+                            .Device(DEVICE_GPU),
+                        UnwrapDatasetVariantOp);
+
+static Status WrappedDatasetVariantDeviceCopy(
+    const WrappedDatasetVariantWrapper& from, WrappedDatasetVariantWrapper* to,
+    const UnaryVariantOpRegistry::AsyncTensorDeviceCopyFn& copy) {
+  *to = WrappedDatasetVariantWrapper(from);
+  return Status::OK();
+}
+
+#define REGISTER_OPTIONAL_COPY(DIRECTION)               \
+  INTERNAL_REGISTER_UNARY_VARIANT_DEVICE_COPY_FUNCTION( \
+      WrappedDatasetVariantWrapper, DIRECTION,          \
+      WrappedDatasetVariantDeviceCopy)
+
+REGISTER_OPTIONAL_COPY(VariantDeviceCopyDirection::HOST_TO_DEVICE);
+REGISTER_OPTIONAL_COPY(VariantDeviceCopyDirection::DEVICE_TO_HOST);
+REGISTER_OPTIONAL_COPY(VariantDeviceCopyDirection::DEVICE_TO_DEVICE);
+
+REGISTER_UNARY_VARIANT_DECODE_FUNCTION(WrappedDatasetVariantWrapper,
+                                       kWrappedDatasetVariantTypeName);
+
 }  // namespace
 
 Status GraphDefBuilderWrapper::AddDataset(
@@ -206,6 +315,20 @@ bool GraphDefBuilderWrapper::HasAttr(const string& name,
   return HasAttr(op_def, attr_name);
 }
 
+int64 GetAllocatedBytes(const std::vector<Tensor>& element) {
+  int64 allocated_bytes = 0;
+  DatasetBase* dataset;
+  for (auto& tensor : element) {
+    if (tensor.dtype() == DT_VARIANT &&
+        GetDatasetFromVariantTensor(tensor, &dataset).ok()) {
+      allocated_bytes += dataset->AllocatedBytes();
+    } else {
+      allocated_bytes += tensor.AllocatedBytes();
+    }
+  }
+  return allocated_bytes;
+}
+
 Status GetDatasetFromVariantTensor(const Tensor& tensor,
                                    DatasetBase** out_dataset) {
   if (!(tensor.dtype() == DT_VARIANT &&
diff --git a/tensorflow/core/framework/dataset.h b/tensorflow/core/framework/dataset.h
index b2689bbdb788710753b6a66e2d90e0a9ee10411b..cca10fa49e86c062a7d6fa8b25901c7c1fb87d95 100644
--- a/tensorflow/core/framework/dataset.h
+++ b/tensorflow/core/framework/dataset.h
@@ -50,8 +50,9 @@ class GraphDefBuilder;
 class Node;
 
 namespace data {
-// A constant that can be used to enable auto-tuning.
-constexpr int kAutoTune = -1;
+
+constexpr int kInfiniteCardinality = -1;
+constexpr int kUnknownCardinality = -2;
 
 class DatasetBase;
 class SerializationContext;
@@ -163,7 +164,7 @@ class GraphDefBuilderWrapper {
                     const std::vector<std::pair<StringPiece, AttrValue>>& attrs,
                     Node** output) {
     std::vector<std::pair<size_t, Node*>> enumerated_inputs(inputs.size());
-    for (int i = 0; i < inputs.size(); i++) {
+    for (size_t i = 0; i < inputs.size(); i++) {
       enumerated_inputs[i] = std::make_pair(i, inputs[i]);
     }
     return AddDataset(dataset, enumerated_inputs, {}, attrs, output);
@@ -282,6 +283,7 @@ class IteratorContext {
           function_library(ctx->function_library()),
           lib(ctx->lib()),
           function_handle_cache(ctx->function_handle_cache()),
+          resource_mgr(ctx->resource_mgr()),
           model(ctx->model()),
           runner(*(ctx->runner())),
           runner_threadpool_size(ctx->runner_threadpool_size()),
@@ -321,6 +323,10 @@ class IteratorContext {
     // A FunctionHandleCache that owns all the function handles. Not owned.
     FunctionHandleCache* function_handle_cache = nullptr;
 
+    // A resource manager for storing dataset-related state, e.g. random
+    // seeds or cached tensors. Not owned.
+    ResourceMgr* resource_mgr = nullptr;
+
     // If non-null, identifies the object used for performance modeling.
     std::shared_ptr<model::Model> model = nullptr;
 
@@ -360,6 +366,8 @@ class IteratorContext {
     return params_.function_handle_cache;
   }
 
+  ResourceMgr* resource_mgr() { return params_.resource_mgr; }
+
   const std::shared_ptr<model::Model>& model() { return params_.model; }
 
   std::function<void(std::function<void()>)>* runner() {
@@ -531,6 +539,25 @@ class DatasetContext {
   Params params_;
 };
 
+// Returns the number of bytes allocated for the given tensor.
+int64 GetAllocatedBytes(const std::vector<Tensor>& element);
+
+// Validates and extracts a `DatasetBase` object from `tensor`.
+//
+// `tensor` must have been written by a call to SetVariantTensorToDataset().
+//
+// The retrieved pointer is a borrowed reference to the dataset, which is owned
+// by the tensor. The consumer must either acquire its own reference to the
+// dataset by calling `(*out_dataset)->Ref()`, or ensure that `tensor` is not
+// destroyed or mutated while the retrieved pointer is in use.
+Status GetDatasetFromVariantTensor(const Tensor& tensor,
+                                   DatasetBase** out_dataset);
+
+// Stores a `DatasetBase` object in `tensor`.
+//
+// The ownership of `dataset` is transferred to `tensor`.
+Status StoreDatasetInVariantTensor(DatasetBase* dataset, Tensor* tensor);
+
 // Represents a (potentially infinite) range of outputs, where each
 // output is a tuple of tensors.
 class DatasetBase : public core::RefCounted {
@@ -584,6 +611,12 @@ class DatasetBase : public core::RefCounted {
   // in the outputs of this dataset.
   virtual const std::vector<PartialTensorShape>& output_shapes() const = 0;
 
+  // Returns the number of bytes allocated for tensors of this dataset.
+  virtual int64 AllocatedBytes() const { return 0; }
+
+  // Returns the cardinality of this dataset.
+  virtual int64 Cardinality() const { return kUnknownCardinality; }
+
   // A human-readable debug string for this dataset.
   virtual string DebugString() const = 0;
 
@@ -601,7 +634,6 @@ class DatasetBase : public core::RefCounted {
                            const DatasetBase* dataset, Node** output);
   };
 
-  // TODO(jsimsa): Consolidate overloading into a single method.
   virtual Status AsGraphDefInternal(SerializationContext* ctx,
                                     DatasetGraphDefBuilder* b,
                                     Node** node) const = 0;
@@ -689,18 +721,36 @@ class DatasetBaseIterator : public IteratorBase {
     return model::MakeUnknownNode(std::move(args));
   }
 
-  // When performance modeling is enabled, this method records the fact that
-  // this iterator has produced an element.
+  // When modeling is enabled, this method records the fact that this iterator
+  // has dequeued an element from an internal buffer.
+  void RecordBufferDequeue(IteratorContext* ctx,
+                           const std::vector<Tensor>& element) {
+    if (collect_resource_usage(ctx)) {
+      node_->add_buffered_bytes(-GetAllocatedBytes(element));
+    }
+  }
+
+  // When modeling is enabled, this method records the fact that this iterator
+  // has enqueued an element in an internal buffer.
+  void RecordBufferEnqueue(IteratorContext* ctx,
+                           const std::vector<Tensor>& element) {
+    if (collect_resource_usage(ctx)) {
+      node_->add_buffered_bytes(GetAllocatedBytes(element));
+    }
+  }
+
+  // When modeling is enabled, this method records the fact that this iterator
+  // has produced an element.
   void RecordElement(IteratorContext* ctx) {
     if (node_) {
       node_->record_element();
     }
   }
 
-  // When performance modeling is enabled, this method records the fact that
-  // a thread of this iterator has started work.
+  // When modeling is enabled, this method records the fact that a thread of
+  // this iterator has started work.
   void RecordStart(IteratorContext* ctx, bool stop_output = false) {
-    if (node_) {
+    if (collect_resource_usage(ctx)) {
       int64 now_nanos = Env::Default()->NowNanos();
       if (stop_output && node_->output()) {
         node_->output()->record_stop(now_nanos);
@@ -709,10 +759,10 @@ class DatasetBaseIterator : public IteratorBase {
     }
   }
 
-  // When performance modeling is enabled, this method records the fact that
-  // a thread of this iterator has stopped work.
+  // When modeling is enabled, this method records the fact that a thread of
+  // this iterator has stopped work.
   void RecordStop(IteratorContext* ctx, bool start_output = false) {
-    if (node_) {
+    if (collect_resource_usage(ctx)) {
       int64 now_nanos = Env::Default()->NowNanos();
       node_->record_stop(now_nanos);
       if (start_output && node_->output()) {
@@ -722,6 +772,11 @@ class DatasetBaseIterator : public IteratorBase {
   }
 
  private:
+  inline bool collect_resource_usage(IteratorContext* ctx) {
+    auto model = ctx->model();
+    return model && model->collect_resource_usage() && node_;
+  }
+
   BaseParams params_;
 };
 
@@ -821,22 +876,6 @@ class BinaryDatasetOpKernel : public DatasetOpKernel {
                            DatasetBase** output) = 0;
 };
 
-// Validates and extracts a `DatasetBase` object from `tensor`.
-//
-// `tensor` must have been written by a call to SetVariantTensorToDataset().
-//
-// The retrieved pointer is a borrowed reference to the dataset, which is owned
-// by the tensor. The consumer must either acquire its own reference to the
-// dataset by calling `(*out_dataset)->Ref()`, or ensure that `tensor` is not
-// destroyed or mutated while the retrieved pointer is in use.
-Status GetDatasetFromVariantTensor(const Tensor& tensor,
-                                   DatasetBase** out_dataset);
-
-// Stores a `DatasetBase` object in `tensor`.
-//
-// The ownership of `dataset` is transferred to `tensor`.
-Status StoreDatasetInVariantTensor(DatasetBase* dataset, Tensor* tensor);
-
 // A simple background worker that executes closures asynchronously and without
 // blocking.
 //
diff --git a/tensorflow/core/framework/device_base.h b/tensorflow/core/framework/device_base.h
index 446c31b17f2904da3143438304d6407bd65c450c..321947aca8e06008c3291fa43befa389b53f998c 100644
--- a/tensorflow/core/framework/device_base.h
+++ b/tensorflow/core/framework/device_base.h
@@ -82,6 +82,13 @@ class DeviceContext : public core::RefCounted {
     done(errors::Internal("Unrecognized device type in CPU-to-device Copy"));
   }
 
+  // Copies a tensor in this device.
+  virtual void CopyTensorInSameDevice(const Tensor* input_tensor,
+                                      Device* device, Tensor* output_tensor,
+                                      StatusCallback done) const {
+    done(errors::Unimplemented("Copy in same device not implemented."));
+  }
+
   // "device_tensor" is a tensor on a non-CPU device.  Copies
   // device_tensor into "cpu_tensor".  "cpu_tensor" must be allocated
   // to be of the same size as "device_tensor".
diff --git a/tensorflow/core/framework/function.cc b/tensorflow/core/framework/function.cc
index b69a40f3128905960cc054ddea7cc20b5d4583a3..94af4ee580b1e7dc1e760ed7d62575e3f8ddb817 100644
--- a/tensorflow/core/framework/function.cc
+++ b/tensorflow/core/framework/function.cc
@@ -673,6 +673,7 @@ Status AddDefaultAttrs(const string& op,
 
 }  // end namespace
 
+// TODO(shikharagarwal): Transmit original node names correctly in file.
 Status InstantiateFunction(const FunctionDef& fdef, AttrSlice attr_values,
                            GetFunctionSignature get_function,
                            InstantiationResult* result) {
diff --git a/tensorflow/core/framework/graph_to_functiondef.cc b/tensorflow/core/framework/graph_to_functiondef.cc
index b2bc414c496338c382b5f3f194fcb778c08706fa..44b22f93c1d4908e3c7765c2b8bddc74a8a22a37 100644
--- a/tensorflow/core/framework/graph_to_functiondef.cc
+++ b/tensorflow/core/framework/graph_to_functiondef.cc
@@ -165,6 +165,7 @@ Status GraphToFunctionDef(const Graph& graph, const string& name,
       node_def->set_device(node->assigned_device_name());
     }
     node_def->set_name(node_names.Uniquify(node->name()));
+    MergeDebugInfo(NodeDebugInfo(node->def()), node_def);
 
     // Reset input names based on graph rather than the NodeDef.
     node_def->clear_input();
diff --git a/tensorflow/core/framework/graph_to_functiondef_test.cc b/tensorflow/core/framework/graph_to_functiondef_test.cc
index 587e2c07ac046e7476a2da53a9ef4d8b3651410a..c3cc1a743311b71b6604e08c6ebf3ff2d130444b 100644
--- a/tensorflow/core/framework/graph_to_functiondef_test.cc
+++ b/tensorflow/core/framework/graph_to_functiondef_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/ops/function_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/function_testlib.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/graph_def_builder.h"
@@ -28,6 +29,14 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
+FunctionDef RemoveDebugInfo(const FunctionDef& def) {
+  FunctionDef copy = def;
+  for (auto& node_def : *copy.mutable_node_def()) {
+    node_def.clear_experimental_debug_info();
+  }
+  return copy;
+}
+
 bool EqualFunctionDef(const FunctionDef& a, const FunctionDef& b,
                       string* diff) {
   // TODO(phawkins) use a more sophisticated equality test.
@@ -78,7 +87,8 @@ TEST(GraphToFunctionDefTest, Basics) {
       {{"h_0", "G:sum:0"}});  // return values
 
   string diff;
-  bool fdefs_equal = EqualFunctionDef(fdef_expected, fdef, &diff);
+  bool fdefs_equal =
+      EqualFunctionDef(fdef_expected, RemoveDebugInfo(fdef), &diff);
   EXPECT_TRUE(fdefs_equal) << diff;
 }
 
@@ -111,7 +121,8 @@ TEST(GraphToFunctionDefTest, ControlDependencies) {
       {{"c", "b:y:0"}});  // return values
 
   string diff;
-  bool fdefs_equal = EqualFunctionDef(fdef_expected, fdef, &diff);
+  bool fdefs_equal =
+      EqualFunctionDef(fdef_expected, RemoveDebugInfo(fdef), &diff);
   EXPECT_TRUE(fdefs_equal) << diff;
 }
 
diff --git a/tensorflow/core/framework/model.cc b/tensorflow/core/framework/model.cc
index 3bd5b725b860ff522dba5be86ef7ab64b387b03e..b7c6d8091092ac64af8de7ab5daf3e60797970e8 100644
--- a/tensorflow/core/framework/model.cc
+++ b/tensorflow/core/framework/model.cc
@@ -356,6 +356,8 @@ std::shared_ptr<Node> Model::AddNode(Node::Factory factory, const string& name,
   if (output) {
     output->add_input(node);
   }
+  collect_resource_usage_ =
+      collect_resource_usage_ || node->has_tunable_parameters();
   lookup_table_.insert(std::make_pair(name, node));
   return node;
 }
@@ -441,7 +443,7 @@ void Model::RecordElement(const string& name) {
 void Model::RecordStart(const string& name, bool stop_output) {
   tf_shared_lock l(mu_);
   auto node = gtl::FindOrNull(lookup_table_, name);
-  if (node) {
+  if (collect_resource_usage_ && node) {
     int64 now_nanos = Env::Default()->NowNanos();
     if (stop_output && (*node)->output()) {
       (*node)->output()->record_stop(now_nanos);
@@ -453,7 +455,7 @@ void Model::RecordStart(const string& name, bool stop_output) {
 void Model::RecordStop(const string& name, bool start_output) {
   tf_shared_lock l(mu_);
   auto node = gtl::FindOrNull(lookup_table_, name);
-  if (node) {
+  if (collect_resource_usage_ && node) {
     int64 now_nanos = Env::Default()->NowNanos();
     (*node)->record_stop(now_nanos);
     if (start_output && (*node)->output()) {
diff --git a/tensorflow/core/framework/model.h b/tensorflow/core/framework/model.h
index 24aa5630cc38550789d6184500cff6b0394ecbee..c3a694227c229884aef60374e494ade6ca539383 100644
--- a/tensorflow/core/framework/model.h
+++ b/tensorflow/core/framework/model.h
@@ -34,18 +34,24 @@ namespace tensorflow {
 namespace data {
 namespace model {
 
+// A constant that can be used to enable auto-tuning.
+constexpr int kAutoTune = -1;
+
 // Represents thread-safe state that can be shared between an input pipeline and
 // the performance model.
 struct SharedState {
  public:
   SharedState(int64 value, std::shared_ptr<mutex> mu,
               std::shared_ptr<condition_variable> cond_var)
-      : value(value), mu(std::move(mu)), cond_var(std::move(cond_var)) {}
+      : value(value),
+        mu(std::move(mu)),
+        cond_var(std::move(cond_var)),
+        tunable(value == kAutoTune) {}
 
   int64 value;
   std::shared_ptr<mutex> mu;
   std::shared_ptr<condition_variable> cond_var;
-  bool tunable = false;
+  const bool tunable;
 };
 
 // Represents a parameter.
@@ -112,6 +118,12 @@ class Node {
   explicit Node(Args args)
       : id_(args.id), name_(args.name), output_(args.output.get()) {}
 
+  // Increments the bytes buffered by the given delta.
+  void add_buffered_bytes(int64 delta) LOCKS_EXCLUDED(mu_) {
+    mutex_lock l(mu_);
+    buffered_bytes_ += delta;
+  }
+
   // Adds an input.
   void add_input(std::shared_ptr<Node> node) LOCKS_EXCLUDED(mu_) {
     mutex_lock l(mu_);
@@ -124,18 +136,33 @@ class Node {
     processing_time_ += delta;
   }
 
+  // Returns the number of bytes stored in this node's buffer.
+  int64 buffered_bytes() const LOCKS_EXCLUDED(mu_) {
+    tf_shared_lock l(mu_);
+    return buffered_bytes_;
+  }
+
+  // Indicates whether the node has tunable parameters.
+  bool has_tunable_parameters() const LOCKS_EXCLUDED(mu_) {
+    tf_shared_lock l(mu_);
+    for (const auto& pair : parameters_) {
+      if (pair.second->state->tunable) return true;
+    }
+    return false;
+  }
+
   // Returns the unique node ID.
   int64 id() const LOCKS_EXCLUDED(mu_) { return id_; }
 
-  // Returns the node name.
-  const string& name() const { return name_; }
-
   // Returns the node inputs.
   std::list<std::shared_ptr<Node>> inputs() const LOCKS_EXCLUDED(mu_) {
     tf_shared_lock l(mu_);
     return inputs_;
   }
 
+  // Returns the node name.
+  const string& name() const { return name_; }
+
   // Returns the number of elements produced by the node.
   int64 num_elements() const LOCKS_EXCLUDED(mu_) {
     tf_shared_lock l(mu_);
@@ -185,7 +212,8 @@ class Node {
 
   // Collects tunable parameters in the subtree rooted in this node.
   void CollectTunableParameters(
-      std::vector<std::shared_ptr<Parameter>>* parameters) LOCKS_EXCLUDED(mu_) {
+      std::vector<std::shared_ptr<Parameter>>* parameters) const
+      LOCKS_EXCLUDED(mu_) {
     tf_shared_lock l(mu_);
     for (auto& pair : parameters_) {
       if (pair.second->state->tunable) {
@@ -219,6 +247,7 @@ class Node {
       LOCKS_EXCLUDED(mu_) {
     tf_shared_lock l(mu_);
     std::shared_ptr<Node> result = Clone(output);
+    result->buffered_bytes_ = buffered_bytes_;
     result->processing_time_ = processing_time_;
     result->num_elements_ = num_elements_;
     result->parameters_ = parameters_;
@@ -274,6 +303,7 @@ class Node {
   mutable mutex mu_;
   const int64 id_;
   const string name_;
+  int64 buffered_bytes_ GUARDED_BY(mu_) = 0;
   int64 processing_time_ GUARDED_BY(mu_) = 0;
   int64 num_elements_ GUARDED_BY(mu_) = 0;
   std::map<std::thread::id, int64> work_start_ GUARDED_BY(mu_);
@@ -329,7 +359,10 @@ std::shared_ptr<Node> MakeUnknownNode(Node::Args args);
 // implementation of `DatasetBase` and `DatasetBaseIterator` respectively.
 class Model {
  public:
-  Model() = default;
+  Model() : collect_resource_usage_(false) {}
+
+  // Indicates whether to collect resource usage.
+  bool collect_resource_usage() const { return collect_resource_usage_; }
 
   // Adds a node with the given name and given output.
   std::shared_ptr<Node> AddNode(Node::Factory factory, const string& name,
@@ -373,6 +406,14 @@ class Model {
   int64 id_counter_ GUARDED_BY(mu_) = 1;
   std::shared_ptr<Node> output_ GUARDED_BY(mu_);
   std::map<string, std::shared_ptr<Node>> lookup_table_ GUARDED_BY(mu_);
+
+  // Indicates whether the modeling framework should collect resource usage
+  // (e.g. CPU, memory). The logic for collecting this information assumes that
+  // the collection is not repeatedly disabled and enabled. As a consequence,
+  // the implementation starts collecting resource usage when it encounters a
+  // tunable parameter (because the information is used for for tuning the value
+  // of the parameter) and never stops.
+  std::atomic<bool> collect_resource_usage_;
 };
 
 }  // namespace model
diff --git a/tensorflow/core/framework/model_test.cc b/tensorflow/core/framework/model_test.cc
index 53e35f25b28cb3770b52e8f7de54eb0ff4e65d83..90bd570f90cdab2182f3d46e009b2cd972667ef9 100644
--- a/tensorflow/core/framework/model_test.cc
+++ b/tensorflow/core/framework/model_test.cc
@@ -330,6 +330,62 @@ TEST(UnknownTest, Model) {
   EXPECT_EQ(100, unknown->OutputTime(&input_times));
 }
 
+class TestNode : public model::Node {
+ public:
+  using model::Node::Node;
+
+  virtual ~TestNode() {}
+
+ protected:
+  std::shared_ptr<Node> Clone(std::shared_ptr<Node> output) const override
+      SHARED_LOCKS_REQUIRED(mu_) {
+    return nullptr;
+  }
+
+  int64 OutputTimeLocked(std::vector<int64>* input_times) const override
+      SHARED_LOCKS_REQUIRED(mu_) {
+    return 0;
+  }
+
+  int64 ProcessingTimeLocked() const override SHARED_LOCKS_REQUIRED(mu_) {
+    return 0;
+  }
+};
+
+TEST(SetterGetterTest, Node) {
+  std::shared_ptr<TestNode> node =
+      std::make_shared<TestNode>(model::Node::Args{-1, "TestNode", nullptr});
+  EXPECT_EQ(-1, node->id());
+  EXPECT_EQ("TestNode", node->name());
+  EXPECT_EQ(nullptr, node->output());
+
+  EXPECT_EQ(0, node->buffered_bytes());
+  node->add_buffered_bytes(42);
+  EXPECT_EQ(42, node->buffered_bytes());
+
+  EXPECT_EQ(0, node->processing_time());
+  node->record_start(1);
+  EXPECT_EQ(0, node->processing_time());
+  node->record_stop(41);
+  EXPECT_EQ(40, node->processing_time());
+  node->add_processing_time(2);
+  EXPECT_EQ(42, node->processing_time());
+
+  std::shared_ptr<TestNode> input =
+      std::make_shared<TestNode>(model::Node::Args{-1, "TestInput", node});
+  EXPECT_EQ(node.get(), input->output());
+  EXPECT_EQ(0, node->inputs().size());
+  node->add_input(input);
+  EXPECT_EQ(1, node->inputs().size());
+  EXPECT_EQ(input, node->inputs().front());
+  node->remove_input(input);
+  EXPECT_EQ(0, node->inputs().size());
+
+  EXPECT_EQ(0, node->num_elements());
+  node->record_element();
+  EXPECT_EQ(1, node->num_elements());
+}
+
 }  // namespace
 }  // namespace model
 }  // namespace data
diff --git a/tensorflow/core/framework/node_def.proto b/tensorflow/core/framework/node_def.proto
index 0a095f903f9f6b98b3247c547aaa4e21964f003e..73cbc9600c54e82a5e541d88eefcf679d241928c 100644
--- a/tensorflow/core/framework/node_def.proto
+++ b/tensorflow/core/framework/node_def.proto
@@ -60,4 +60,18 @@ message NodeDef {
   // attr's type field.
   // TODO(josh11b): Add some examples here showing best practices.
   map<string, AttrValue> attr = 5;
+
+  message ExperimentalDebugInfo {
+    // Opaque string inserted into error messages created by the runtime.
+    //
+    // This is intended to store the list of names of the nodes from the
+    // original graph that this node was derived. For example if this node, say
+    // C, was result of a fusion of 2 nodes A and B, then 'original_node' would
+    // be {A, B}. This information can be used to map errors originating at the
+    // current node to some top level source code.
+    repeated string original_node_names = 1;
+  };
+
+  // This stores debug information associated with the node.
+  ExperimentalDebugInfo experimental_debug_info = 6;
 };
diff --git a/tensorflow/core/framework/node_def_builder.cc b/tensorflow/core/framework/node_def_builder.cc
index 348a825af91f4c6093f35d9d564f111a971cde18..4808967ca6a1139cccf58ed1897306a5d54b3f1e 100644
--- a/tensorflow/core/framework/node_def_builder.cc
+++ b/tensorflow/core/framework/node_def_builder.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <vector>
 #include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_def_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/str_util.h"
@@ -37,7 +38,8 @@ void NodeDefBuilder::NodeOut::Reset(StringPiece n, int i, DataType dt) {
 }
 
 NodeDefBuilder::NodeDefBuilder(StringPiece name, StringPiece op_name,
-                               const OpRegistryInterface* op_registry) {
+                               const OpRegistryInterface* op_registry,
+                               const NodeDebugInfo* debug) {
   node_def_.set_name(string(name));
   const Status status = op_registry->LookUpOpDef(string(op_name), &op_def_);
   if (status.ok()) {
@@ -46,6 +48,13 @@ NodeDefBuilder::NodeDefBuilder(StringPiece name, StringPiece op_name,
     errors_.push_back(status.error_message());
     inputs_specified_ = 0;
   }
+  if (debug != nullptr) MergeDebugInfo(*debug, &node_def_);
+}
+
+NodeDefBuilder::NodeDefBuilder(StringPiece name, StringPiece op_name,
+                               const NodeDebugInfo& debug)
+    : NodeDefBuilder(name, op_name) {
+  MergeDebugInfo(debug, &node_def_);
 }
 
 NodeDefBuilder::NodeDefBuilder(StringPiece name, const OpDef* op_def)
diff --git a/tensorflow/core/framework/node_def_builder.h b/tensorflow/core/framework/node_def_builder.h
index ad07ec548003b5218179c75232c9247f3656574e..63d856d16c6e1dfedcfe44ff21b3222c8cc7c172 100644
--- a/tensorflow/core/framework/node_def_builder.h
+++ b/tensorflow/core/framework/node_def_builder.h
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_def.pb.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/strings/strcat.h"
@@ -63,7 +64,10 @@ class NodeDefBuilder {
   // specified by calling the methods below.
   // REQUIRES: The OpDef must satisfy ValidateOpDef().
   NodeDefBuilder(StringPiece name, StringPiece op_name,
-                 const OpRegistryInterface* op_registry = OpRegistry::Global());
+                 const OpRegistryInterface* op_registry = OpRegistry::Global(),
+                 const NodeDebugInfo* debug = nullptr);
+  NodeDefBuilder(StringPiece name, StringPiece op_name,
+                 const NodeDebugInfo& debug);
   // REQUIRES: in addition, *op_def must outlive *this.
   NodeDefBuilder(StringPiece name, const OpDef* op_def);
 
diff --git a/tensorflow/core/framework/node_def_util.cc b/tensorflow/core/framework/node_def_util.cc
index 95a787b2df02d48f316653ee5059b4f7e80f73e1..8071da5b6d454708a10c7d4a9d77b8a3ae6287bd 100644
--- a/tensorflow/core/framework/node_def_util.cc
+++ b/tensorflow/core/framework/node_def_util.cc
@@ -106,13 +106,50 @@ string SummarizeAttrs(const NodeDef& node_def) {
   return SummarizeAttrsHelper(node_def, node_def.device());
 }
 
+string FormatNodeForError(const NodeDebugInfo& debug_info) {
+  return debug_info.original_node_names.empty()
+             ? errors::FormatNodeNameForError(debug_info.name)
+             : errors::FormatNodeNamesForError(debug_info.original_node_names);
+}
+
 string FormatNodeForError(const Node& node) {
-  return FormatNodeDefForError(node.def());
+  return FormatNodeForError(NodeDebugInfo(node));
 }
 
 string FormatNodeDefForError(const NodeDef& node_def) {
-  VLOG(1) << "Error in the node: " << SummarizeNodeDef(node_def);
-  return errors::FormatNodeNameForError(node_def.name());
+  return FormatNodeForError(NodeDebugInfo(node_def));
+}
+
+void GetMergedOriginalNodeNames(const NodeDebugInfo& from,
+                                const NodeDebugInfo& to,
+                                std::set<string>* names) {
+  if (!from.original_node_names.empty()) {
+    names->insert(from.original_node_names.begin(),
+                  from.original_node_names.end());
+  } else {
+    names->insert(from.name);
+  }
+  names->insert(to.original_node_names.begin(), to.original_node_names.end());
+}
+
+void MergeDebugInfo(const NodeDebugInfo& from, Node* to) {
+  std::set<string> names;
+  GetMergedOriginalNodeNames(from, NodeDebugInfo(*to), &names);
+  to->set_original_node_names({names.begin(), names.end()});
+}
+
+void MergeDebugInfo(const NodeDebugInfo& from, NodeDef* to) {
+  std::set<string> names;
+  GetMergedOriginalNodeNames(from, NodeDebugInfo(*to), &names);
+  to->mutable_experimental_debug_info()->clear_original_node_names();
+  if (!names.empty()) {
+    *to->mutable_experimental_debug_info()->mutable_original_node_names() = {
+        names.begin(), names.end()};
+  }
+}
+
+void MergeDebugInfo(const NodeDef& from, NodeDef* to) {
+  MergeDebugInfo(NodeDebugInfo(from), to);
 }
 
 const AttrValue* AttrSlice::Find(StringPiece attr_name) const {
diff --git a/tensorflow/core/framework/node_def_util.h b/tensorflow/core/framework/node_def_util.h
index f682bb15355550622e8bbe384df790f1022bd630..4e4a5c38d5dc0248acb0d15f04cb37140e256dab 100644
--- a/tensorflow/core/framework/node_def_util.h
+++ b/tensorflow/core/framework/node_def_util.h
@@ -29,6 +29,7 @@ limitations under the License.
 namespace tensorflow {
 
 class Node;
+class NodeDebugInfo;
 
 // We forward declare protos so that kernels don't need to depend on them
 class NodeDef;
@@ -56,6 +57,12 @@ string SummarizeAttrs(const NodeDef& node_def);
 string FormatNodeForError(const Node& node);
 string FormatNodeDefForError(const NodeDef& node_def);
 
+// Merges the original node names from the debug information of 'from' to the
+// debug information of 'to'.
+void MergeDebugInfo(const NodeDebugInfo& from, Node* to);
+void MergeDebugInfo(const NodeDebugInfo& from, NodeDef* to);
+void MergeDebugInfo(const NodeDef& from, NodeDef* to);
+
 typedef protobuf::Map<string, AttrValue> AttrValueMap;
 
 // Adds an attr with name <name> and value <value> to *node_def.
diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h
index 9f4c57e880ad32afac8bfadaf2edd7ba9597f02b..19a0c5e5be2e8cbb16d55db21d4d425d9add2974 100644
--- a/tensorflow/core/framework/op_kernel.h
+++ b/tensorflow/core/framework/op_kernel.h
@@ -1527,6 +1527,7 @@ T* OpKernelContext::op_device_context() {
 
 template <typename T>
 T* OpKernelContext::input_device_context(int index) {
+  DCHECK_NE(params_->input_device_contexts, nullptr);
   DCHECK_GE(index, 0);
   DCHECK_LT(index, params_->input_device_contexts->size());
   static_assert(std::is_base_of<DeviceContext, T>::value,
@@ -1535,6 +1536,7 @@ T* OpKernelContext::input_device_context(int index) {
 }
 
 inline DeviceContext* OpKernelContext::input_device_context(int index) {
+  DCHECK_NE(params_->input_device_contexts, nullptr);
   DCHECK_GE(index, 0);
   DCHECK_LT(index, params_->input_device_contexts->size());
   return (*params_->input_device_contexts)[index];
diff --git a/tensorflow/core/framework/rendezvous_test.cc b/tensorflow/core/framework/rendezvous_test.cc
index de148f0bd3474421c1361cf7ae4aa681107aa883..7a777f064c7b517de9f9c1c14648e5ff32ca4b5e 100644
--- a/tensorflow/core/framework/rendezvous_test.cc
+++ b/tensorflow/core/framework/rendezvous_test.cc
@@ -278,6 +278,12 @@ class DummyDeviceContext : public DeviceContext {
   ~DummyDeviceContext() override {}
   int stream_id() const { return stream_id_; }
 
+  void CopyTensorInSameDevice(const Tensor* input_tensor, Device* device,
+                              Tensor* output_tensor,
+                              StatusCallback done) const override {
+    done(Status::OK());
+  }
+
  private:
   const int stream_id_;
 };
diff --git a/tensorflow/core/framework/resource_var.h b/tensorflow/core/framework/resource_var.h
index ff7b3e78a711a717d44e1e2ca307d6fef05243d9..f5de5dba8854adcfd5b94447da3ba42566a26bd8 100644
--- a/tensorflow/core/framework/resource_var.h
+++ b/tensorflow/core/framework/resource_var.h
@@ -20,14 +20,46 @@ limitations under the License.
 
 namespace tensorflow {
 
-// Resource stored by variables in the resource manager
-// (new, resource-style version).
+// Resource stored by variables in the resource manager (new, resource-style
+// version).
+//
+// These variables have a mixed access mode: they can operate on copy-on-write
+// mode (the default) or copy-on-read mode (used only for sparse access).
+//
+// When copy-on-write mode is enabled reading the value of the variable involves
+// grabbing its mutex in shared mode and aliasing the internal tensor as the
+// output of the read operation, increasing its reference count. Writing,
+// conversely, works by, under an exclusive lock, detecting whether there are
+// outstanding aliases of the tensor, using the reference count, copying the
+// tensor if they exist, and writing to either the original or a copy with no
+// outstanding aliases. Sparse operations are not supported in copy-on-write
+// mode.
+//
+// When a variable is accessed sparsely it switches to copy-on-read mode. To
+// switch we need to grab an exclusive lock and might (if there are aliases)
+// need to copy the entire tensor. Once copy-on-read mode is enabled, no tensor
+// is allowed to alias the variable's internal tensor. This means dense reads
+// must return a copy of the variable, done while holding a shared lock. Dense
+// writes do not need to check whether aliases exist, and can always write
+// directly to the buffer without making a copy, while holding an exclusive
+// lock. Sparse reads and sparse writes, on the other hand, can be done under a
+// shared or exclusive mutex (the damage from writes under a shared mutex is
+// limited since no other buffer is allowed to alias the variable's
+// buffer). Using an exclusive mutex disallows concurrent writes and concurrent
+// sparse reads, providing some extra safety at the expense of performance,
+// while shared mutex allow for "hogwild" behavior. Doing sparse writes under a
+// shared mutex prevents them from overlapping with dense writes, which is
+// necessary as dense writes can change the shape the of the tensor.
+//
+// Transitioning a variable from copy-on-read mode to copy-on-write mode is
+// currently not supported. To upgrade a variable from copy-on-write to
+// copy-on-read use `EnsureSparseVariableAccess()`, and then grab the variable's
+// mutex as desired. To access the variable in dense mode grab the mutex either
+// directly or via `MaybeLockVariableInputMutexesInOrder` on all variables being
+// modified and then call `PrepareToUpdateVariable` on them in any order.
 class Var : public ResourceBase {
  public:
   explicit Var(DataType dtype) : tensor_(dtype) {}
-  // Not copyable or movable.
-  Var(const Var&) = delete;
-  Var& operator=(const Var&) = delete;
 
   // When locking multiple variables, the locks must be acquired in order of
   // increasing mu() address.
@@ -48,11 +80,19 @@ class Var : public ResourceBase {
   bool is_initialized = false;  // GUARDED_BY(mu_) but annotalysis doesn't like
                                 // it.
 
+  // Also fake-guarded by mu_. Should be set to True whenever any sparse
+  // operation uses the variable. Once this is true no tensor is allowed to
+  // alias the memory of the variable, and we always copy the variable on
+  // reads. This allows sparse operations to happen with only a shared lock if
+  // so desired.
+  std::atomic<bool> copy_on_read_mode{false};
+
  private:
   mutex mu_;
   Tensor tensor_;
 
   ~Var() override {}
+  TF_DISALLOW_COPY_AND_ASSIGN(Var);
 };
 
 }  //  end namespace tensorflow
diff --git a/tensorflow/core/framework/tensor.cc b/tensorflow/core/framework/tensor.cc
index c7ddc6c21eda7af94379b07ab3dff8a25021665e..7e841489eb35d4ec3d18fe255472107ef9d60efe 100644
--- a/tensorflow/core/framework/tensor.cc
+++ b/tensorflow/core/framework/tensor.cc
@@ -68,7 +68,8 @@ namespace {
 // An un-templated base class for Buffer.
 class BufferBase : public TensorBuffer {
  public:
-  explicit BufferBase(Allocator* alloc) : alloc_(alloc) {}
+  explicit BufferBase(Allocator* alloc, void* data_ptr)
+      : TensorBuffer(data_ptr), alloc_(alloc) {}
 
   TensorBuffer* root_buffer() override { return this; }
   void FillAllocationDescription(AllocationDescription* proto) const override {
@@ -106,7 +107,6 @@ class Buffer : public BufferBase {
   Buffer(Allocator* a, int64 n);
   Buffer(Allocator* a, int64 n, const AllocationAttributes& allocation_attr);
 
-  void* data() const override { return data_; }
   size_t size() const override { return sizeof(T) * elem_; }
 
  private:
@@ -442,20 +442,20 @@ struct ProtoHelper<Eigen::half> {
 
 template <typename T>
 Buffer<T>::Buffer(Allocator* a, int64 n)
-    : BufferBase(a), data_(a->Allocate<T>(n)), elem_(n) {}
+    : BufferBase(a, a->Allocate<T>(n)), elem_(n) {}
 
 template <typename T>
 Buffer<T>::Buffer(Allocator* a, int64 n,
                   const AllocationAttributes& allocation_attr)
-    : BufferBase(a), data_(a->Allocate<T>(n, allocation_attr)), elem_(n) {}
+    : BufferBase(a, a->Allocate<T>(n, allocation_attr)), elem_(n) {}
 
 template <typename T>
 Buffer<T>::~Buffer() {
-  if (data_) {
+  if (data()) {
     if (LogMemory::IsEnabled()) {
       RecordDeallocation();
     }
-    alloc_->Deallocate<T>(data_, elem_);
+    alloc_->Deallocate<T>(static_cast<T*>(data()), elem_);
   }
 }
 
@@ -764,7 +764,9 @@ class SubBuffer : public TensorBuffer {
  public:
   // This buffer is an alias to buf[delta, delta + n).
   SubBuffer(TensorBuffer* buf, int64 delta, int64 n)
-      : root_(buf->root_buffer()), data_(buf->base<T>() + delta), elem_(n) {
+      : TensorBuffer(buf->base<T>() + delta),
+        root_(buf->root_buffer()),
+        elem_(n) {
     // Sanity check. The caller should ensure the sub buffer is valid.
     CHECK_LE(root_->base<T>(), this->base<T>());
     T* root_limit = root_->base<T>() + root_->size() / sizeof(T);
@@ -775,7 +777,6 @@ class SubBuffer : public TensorBuffer {
     root_->Ref();
   }
 
-  void* data() const override { return data_; }
   size_t size() const override { return sizeof(T) * elem_; }
   TensorBuffer* root_buffer() override { return root_; }
   void FillAllocationDescription(AllocationDescription* proto) const override {
diff --git a/tensorflow/core/framework/tensor.h b/tensorflow/core/framework/tensor.h
index 3177bbe7e93268444bc10f7a2de0bcc447109e39..009dd0846d2639eb9cf1ef47f8f12c10994dcb3b 100644
--- a/tensorflow/core/framework/tensor.h
+++ b/tensorflow/core/framework/tensor.h
@@ -45,6 +45,7 @@ class TensorBuffer;
 class TensorCApi;
 class TensorDescription;
 class TensorProto;
+class Var;
 
 namespace batch_util {
 Status CopyElementToSlice(Tensor element, Tensor* parent, int64 index);
@@ -581,11 +582,16 @@ class Tensor {
   friend class XlaTensor;             // For access to RefCountIsOne().
   friend class XlaTensorBuffer;  // For access to the private constructor taking
                                  // the buffer
+  friend class Var;
   template <typename Device, typename T>
   friend class AssignVariableOp;  // For access to RefCountIsOne().
   template <typename Device, typename T>
   friend Status PrepareToUpdateVariable(
-      OpKernelContext* ctx, Tensor* tensor);  // For access to RefCountIsOne().
+      OpKernelContext* ctx, Tensor* tensor,
+      bool copy_on_read_mode);  // For access to RefCountIsOne().
+  template <typename Device, typename T>
+  friend Status EnsureSparseVariableAccess(
+      OpKernelContext* ctx, Var* var);  // For access to RefCountIsOne().
   friend Status batch_util::CopyElementToSlice(
       Tensor element, Tensor* parent,
       int64 index);                // For access to RefCountIsOne().
@@ -636,10 +642,15 @@ class Tensor {
 // Interface to access the raw ref-counted data buffer.
 class TensorBuffer : public core::RefCounted {
  public:
+  explicit TensorBuffer(void* data_ptr) : data_(data_ptr) {}
   ~TensorBuffer() override {}
 
   // data() points to a memory region of size() bytes.
-  virtual void* data() const = 0;
+  //
+  // NOTE(mrry): The `data()` method is not virtual for performance reasons.
+  // It can be called multiple times when the contents of a `Tensor` are
+  // accessed, and so making it non-virtual allows the body to be inlined.
+  void* data() const { return data_; }
   virtual size_t size() const = 0;
 
   // If this TensorBuffer is sub-buffer of another TensorBuffer,
@@ -657,6 +668,9 @@ class TensorBuffer : public core::RefCounted {
 
   // Whether this TensorBuffer owns the underlying memory.
   virtual bool OwnsMemory() const { return true; }
+
+ private:
+  void* const data_;
 };
 
 template <typename T>
@@ -874,6 +888,7 @@ inline Tensor::Tensor(Tensor&& other)
 
 class Tensor::HostScalarTensorBufferBase : public TensorBuffer {
  public:
+  using TensorBuffer::TensorBuffer;
   void FillAllocationDescription(AllocationDescription* proto) const final;
 };
 
@@ -884,8 +899,7 @@ template <typename T>
 struct Tensor::ValueAndTensorBuffer {
   class HostScalarTensorBuffer : public Tensor::HostScalarTensorBufferBase {
    public:
-    HostScalarTensorBuffer(void* data) : data_(data) {}
-    void* data() const final { return const_cast<void*>(data_); }
+    HostScalarTensorBuffer(void* data) : HostScalarTensorBufferBase(data) {}
     size_t size() const final { return sizeof(T); }
     TensorBuffer* root_buffer() final { return this; }
 
@@ -904,8 +918,7 @@ struct Tensor::ValueAndTensorBuffer {
     }
 
    private:
-    ~HostScalarTensorBuffer() override { static_cast<T*>(data_)->~T(); }
-    void* const data_;
+    ~HostScalarTensorBuffer() override { static_cast<T*>(data())->~T(); }
   };
 
   T value;
diff --git a/tensorflow/core/framework/tensor_test.cc b/tensorflow/core/framework/tensor_test.cc
index 4fa9d1df6757768c8c6b00b6932ee8e3550ee2f8..713f91fe04c6fe498209d88193f6fbb1729ec57c 100644
--- a/tensorflow/core/framework/tensor_test.cc
+++ b/tensorflow/core/framework/tensor_test.cc
@@ -1491,5 +1491,26 @@ void BM_CreateAndMoveCtrWithBuf(int iters) {
 }
 BENCHMARK(BM_CreateAndMoveCtrWithBuf);
 
+// Benchmark creating and destroy a host-scalar tensor, using the allocator
+// interface.
+void BM_CreateAndDestroyHostScalarNonOptimized(int iters) {
+  TensorShape shape({});
+  Allocator* allocator = cpu_allocator();
+  while (--iters) {
+    Tensor a(allocator, DT_FLOAT, shape);
+    a.scalar<float>()() = 37.0;
+  }
+}
+BENCHMARK(BM_CreateAndDestroyHostScalarNonOptimized);
+
+// Benchmark creating and destroy a host-scalar tensor, using the specialized
+// constructor.
+void BM_CreateAndDestroyHostScalarOptimized(int iters) {
+  while (--iters) {
+    Tensor a(37.0);
+  }
+}
+BENCHMARK(BM_CreateAndDestroyHostScalarOptimized);
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/graph/edgeset.cc b/tensorflow/core/graph/edgeset.cc
index 2e0c67146169d4b0fe3bbb548c70451b2b1907b9..e3b88994b5e24fae7c76137e920bb46f4f01aa29 100644
--- a/tensorflow/core/graph/edgeset.cc
+++ b/tensorflow/core/graph/edgeset.cc
@@ -38,9 +38,8 @@ std::pair<EdgeSet::const_iterator, bool> EdgeSet::insert(value_type value) {
     }
     // array is full. convert to set.
     s = new std::set<const Edge*>;
-    for (int i = 0; i < kInline; i++) {
-      s->insert(static_cast<const Edge*>(ptrs_[i]));
-    }
+    s->insert(reinterpret_cast<const Edge**>(std::begin(ptrs_)),
+              reinterpret_cast<const Edge**>(std::end(ptrs_)));
     ptrs_[0] = this;
     ptrs_[1] = s;
     // fall through.
diff --git a/tensorflow/core/graph/graph.cc b/tensorflow/core/graph/graph.cc
index 466310d874279c1a2b3d293021f0cb0cf578c6c5..623dc855c4a4da245700bc840b5db3b74a97828a 100644
--- a/tensorflow/core/graph/graph.cc
+++ b/tensorflow/core/graph/graph.cc
@@ -216,6 +216,16 @@ void Node::set_requested_device(const string& device) {
   props_->node_def.set_device(device);
 }
 
+void Node::set_original_node_names(const std::vector<string>& names) {
+  MaybeCopyOnWrite();
+  props_->node_def.mutable_experimental_debug_info()
+      ->clear_original_node_names();
+  if (!names.empty()) {
+    *props_->node_def.mutable_experimental_debug_info()
+         ->mutable_original_node_names() = {names.begin(), names.end()};
+  }
+}
+
 Status Node::input_edge(int idx, const Edge** e) const {
   if (idx < 0 || idx >= num_inputs()) {
     return errors::InvalidArgument("Invalid input_edge index: ", idx, ", Node ",
@@ -293,6 +303,21 @@ Status Node::input_tensor(int idx, OutputTensor* t) const {
   return Status::OK();
 }
 
+// NodeDebugInfo
+
+NodeDebugInfo::NodeDebugInfo(const Node& n) : NodeDebugInfo(n.def()) {}
+NodeDebugInfo::NodeDebugInfo(const NodeDef& ndef)
+    : name(ndef.name()),
+      original_node_names(
+          ndef.has_experimental_debug_info()
+              ? std::vector<string>({ndef.experimental_debug_info()
+                                         .original_node_names()
+                                         .begin(),
+                                     ndef.experimental_debug_info()
+                                         .original_node_names()
+                                         .end()})
+              : std::vector<string>()) {}
+
 // InputTensor
 
 bool InputTensor::operator==(const InputTensor& other) const {
@@ -548,6 +573,28 @@ Status Graph::UpdateEdge(Node* new_src, int new_src_index, Node* dst,
   return Status::OK();
 }
 
+Status Graph::AddWhileInputHack(Node* new_src, int new_src_index, Node* dst) {
+  if (dst->type_string() != "While") {
+    return errors::Internal(
+        "dst argument to AddWhileEdgeHack should be a While op, got: ",
+        dst->DebugString());
+  }
+  TF_RETURN_IF_ERROR(IsValidOutputTensor(new_src, new_src_index));
+  // Find the current number of data inputs. We'll add the new edge to the next
+  // missing data input.
+  int dst_index = 0;
+  for (const Edge* edge : dst->in_edges()) {
+    if (edge->IsControlEdge()) continue;
+    ++dst_index;
+  }
+  TF_RETURN_IF_ERROR(IsValidInputTensor(dst, dst_index));
+  AddEdge(new_src, new_src_index, dst, dst_index);
+  dst->MaybeCopyOnWrite();
+  dst->props_->node_def.add_input(
+      strings::StrCat(new_src->name(), ":", new_src_index));
+  return Status::OK();
+}
+
 Status Graph::AddFunctionLibrary(const FunctionDefLibrary& fdef_lib) {
   // Need a new-enough consumer to support the functions we add to the graph.
   if (fdef_lib.function_size() > 0 && versions_->min_consumer() < 12) {
diff --git a/tensorflow/core/graph/graph.h b/tensorflow/core/graph/graph.h
index af0b1237062a9380d37417a119f9cff75054515a..0b31219d5f80be866faecf4081993303fc33fe21 100644
--- a/tensorflow/core/graph/graph.h
+++ b/tensorflow/core/graph/graph.h
@@ -120,6 +120,10 @@ class Node {
   int assigned_device_name_index() const { return assigned_device_name_index_; }
   void set_assigned_device_name_index(int index);
 
+  // Sets 'original_node_names' field of this node's DebugInfo proto to
+  // 'names'.
+  void set_original_node_names(const std::vector<string>& names);
+
   // Read only access to attributes
   AttrSlice attrs() const;
 
@@ -290,6 +294,15 @@ class Node {
   TF_DISALLOW_COPY_AND_ASSIGN(Node);
 };
 
+// Stores debug information associated with the Node.
+struct NodeDebugInfo {
+  const string name;
+  const std::vector<string> original_node_names;
+
+  NodeDebugInfo(const Node& n);
+  NodeDebugInfo(const NodeDef& ndef);
+};
+
 // Represents an input of a node, i.e., the `index`-th input to `node`.
 struct InputTensor {
   Node* node;
@@ -493,11 +506,17 @@ class Graph {
   // the corresponding NodeDef to reflect the change.
   // REQUIRES: The control edge must exist.
   void RemoveControlEdge(const Edge* e);
+
   // Updates the input to a node.  The existing edge to `dst` is removed and an
   // edge from `new_src` to `dst` is created. The NodeDef associated with `dst`
   // is also updated.
   Status UpdateEdge(Node* new_src, int new_src_index, Node* dst, int dst_index);
 
+  // Like AddEdge but updates dst's NodeDef. Used to add an input edge to a
+  // "While" op during gradient construction, see AddInputWhileHack in
+  // python_api.h for more details.
+  Status AddWhileInputHack(Node* new_src, int new_src_index, Node* dst);
+
   // Adds the function and gradient definitions in `fdef_lib` to this graph's op
   // registry. Ignores duplicate functions, and returns a bad status if an
   // imported function differs from an existing function or op with the same
diff --git a/tensorflow/core/graph/graph_constructor.cc b/tensorflow/core/graph/graph_constructor.cc
index f6d83d5f6fff9be372e512e2ff7b8366201bdd81..ac1b690df315a0086fe00f0a720ecf87534452f2 100644
--- a/tensorflow/core/graph/graph_constructor.cc
+++ b/tensorflow/core/graph/graph_constructor.cc
@@ -35,6 +35,8 @@ limitations under the License.
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/tensor_id.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/strings/scanner.h"
 #include "tensorflow/core/lib/strings/str_util.h"
@@ -268,22 +270,20 @@ class GraphConstructor {
     int gdef_index;
     Node* node;  // nullptr until the NodeDef is converted to a Node.
   };
-  // TODO(vrv): Profile this data structure to see if we should use an
-  // alternative implementation of std::unordered_map.
-  std::unordered_map<StringPiece, NodeInfo, StringPieceHasher> gdef_nodes_;
+  gtl::FlatMap<StringPiece, NodeInfo, StringPieceHasher> gdef_nodes_;
 
   // Prefixes already used in the GraphDef being imported.
-  std::unordered_set<StringPiece, StringPieceHasher> gdef_prefixes_;
+  gtl::FlatSet<StringPiece, StringPieceHasher> gdef_prefixes_;
 
   // Mapping from node name to the existing node in g_.
-  std::unordered_map<StringPiece, Node*, StringPieceHasher> existing_nodes_;
+  gtl::FlatMap<StringPiece, Node*, StringPieceHasher> existing_nodes_;
 
   // Prefixes already used in the graph.
-  std::unordered_set<StringPiece, StringPieceHasher> existing_prefixes_;
+  gtl::FlatSet<StringPiece, StringPieceHasher> existing_prefixes_;
 
   // Imported node names that have been uniquified. The key is the original
   // name, the value is the new unique name.
-  std::unordered_map<string, string> uniquified_names_;
+  gtl::FlatMap<string, string> uniquified_names_;
 
   // Index of NodeDefs in node_defs_ with all inputs already converted. We use a
   // (sorted) set so nodes are created in the order defined in the GraphDef.
@@ -360,7 +360,7 @@ bool NodeNameInValues(const std::vector<string>& control_dependencies,
 // Adds any prefixes of `node_name` (not including the full name itself) to
 // `prefixes`.
 void AddPrefixes(StringPiece node_name,
-                 std::unordered_set<StringPiece, StringPieceHasher>* prefixes) {
+                 gtl::FlatSet<StringPiece, StringPieceHasher>* prefixes) {
   size_t idx = -1;
   while ((idx = node_name.find('/', idx + 1)) != StringPiece::npos) {
     prefixes->insert(node_name.substr(0, idx));
@@ -857,7 +857,7 @@ void GraphConstructor::UpdateUniquifiedColocationNames() {
     for (int i = 0; i < coloc_values.size(); ++i) {
       StringPiece val(coloc_values[i]);
       if (str_util::ConsumePrefix(&val, kColocationGroupPrefix)) {
-        const auto& name_pair = uniquified_names_.find(string(val));
+        auto name_pair = uniquified_names_.find(string(val));
         if (name_pair == uniquified_names_.end()) continue;
         updated = true;
         coloc_values[i] =
diff --git a/tensorflow/core/graph/graph_partition.cc b/tensorflow/core/graph/graph_partition.cc
index 9c640c42a5891b632e18517c848cc9a0c76a0f45..f213eb7c107c92be55d4efcf7b8551f1ac282154 100644
--- a/tensorflow/core/graph/graph_partition.cc
+++ b/tensorflow/core/graph/graph_partition.cc
@@ -209,7 +209,8 @@ NodeDef* AddSend(const PartitionOptions& opts, const GraphInfo& g_info,
   // NOTE(yuanbyu): Only cast for cross-device send/recv.
   if (dtype != cast_dtype && !NeedSameDeviceSendRecv(edge, g_info)) {
     const string cast_op = (host_memory) ? "_HostCast" : "Cast";
-    NodeDefBuilder cast_builder(opts.new_name(src->name()), cast_op);
+    NodeDefBuilder cast_builder(opts.new_name(src->name()), cast_op,
+                                NodeDebugInfo(*src));
     cast_builder.Device(src->assigned_device_name()).Input(send_from);
     if (opts.scheduling_for_recvs) {
       cast_builder.Attr("_start_time", start_time);
@@ -233,7 +234,8 @@ NodeDef* AddSend(const PartitionOptions& opts, const GraphInfo& g_info,
 
   // Add the send node.
   const string send_op = (host_memory) ? "_HostSend" : "_Send";
-  NodeDefBuilder send_builder(opts.new_name(src->name()), send_op);
+  NodeDefBuilder send_builder(opts.new_name(src->name()), send_op,
+                              NodeDebugInfo(*src));
   SetSendRecvAttrs(opts, edge, &send_builder);
   send_builder.Device(src->assigned_device_name()).Input(send_from);
   if (opts.scheduling_for_recvs) {
@@ -268,7 +270,8 @@ NodeDef* AddRecv(const PartitionOptions& opts, const GraphInfo& g_info,
 
   // Add the recv node.
   const string recv_op = (host_memory) ? "_HostRecv" : "_Recv";
-  NodeDefBuilder recv_builder(opts.new_name(src->name()), recv_op);
+  NodeDefBuilder recv_builder(opts.new_name(src->name()), recv_op,
+                              NodeDebugInfo(*src));
   SetSendRecvAttrs(opts, edge, &recv_builder);
   recv_builder.Device(dst->assigned_device_name())
       .Attr("tensor_type", cast_dtype);
@@ -280,7 +283,8 @@ NodeDef* AddRecv(const PartitionOptions& opts, const GraphInfo& g_info,
   // Add the cast node (from cast_dtype to dtype) or an Identity node.
   if (dtype != cast_dtype) {
     const string cast_op = (host_memory) ? "_HostCast" : "Cast";
-    NodeDefBuilder cast_builder(opts.new_name(src->name()), cast_op);
+    NodeDefBuilder cast_builder(opts.new_name(src->name()), cast_op,
+                                NodeDebugInfo(*src));
     cast_builder.Attr("DstT", dtype);
     cast_builder.Device(dst->assigned_device_name())
         .Input(recv->name(), 0, cast_dtype);
@@ -290,7 +294,8 @@ NodeDef* AddRecv(const PartitionOptions& opts, const GraphInfo& g_info,
     return cast;
   } else if (edge->IsControlEdge()) {
     // An Identity is only needed for control edges.
-    NodeDefBuilder id_builder(opts.new_name(src->name()), "Identity");
+    NodeDefBuilder id_builder(opts.new_name(src->name()), "Identity",
+                              NodeDebugInfo(*src));
     id_builder.Device(dst->assigned_device_name())
         .Input(recv->name(), 0, cast_dtype);
     NodeDef* id = gdef->add_node();
@@ -982,6 +987,7 @@ Status Partition(const PartitionOptions& opts, Graph* g,
     GraphDef* dst_graph = &(*partitions)[dstp];
     NodeDef* dst_def = dst_graph->add_node();
     *dst_def = dst->def();
+    MergeDebugInfo(NodeDebugInfo(dst->def()), dst_def);
     dst_def->set_device(dst->assigned_device_name());
     dst_def->clear_input();  // Inputs are filled below
     if (opts.need_to_record_start_times) {
diff --git a/tensorflow/core/graph/graph_test.cc b/tensorflow/core/graph/graph_test.cc
index 333c32567fc9b922951b558c86f29087da770894..602578a83a3fcc01dbb61841051da92ffc366144 100644
--- a/tensorflow/core/graph/graph_test.cc
+++ b/tensorflow/core/graph/graph_test.cc
@@ -661,6 +661,10 @@ TEST_F(GraphTest, BuildNodeNameIndex) {
 }
 
 REGISTER_OP("Input").Output("y: float");
+REGISTER_OP("Output")
+    .Input("x: N * float")
+    .Attr("N: int >= 1")
+    .Output("y: float");
 REGISTER_OP("In2Out1").Input("a: float").Input("b: float").Output("y: float");
 REGISTER_OP("In4Out1")
     .Input("a: float")
@@ -713,7 +717,14 @@ GraphDef CreateGraphDef(int num_nodes, int num_edges_per_node) {
     }
     s += strings::Printf("'in%04d' ] } ", rnd.Uniform(kNumInNodes));
   }
-
+  // Add a single sink node. Otherwise a lot of time is spent in
+  // FixupSourceAndSinkEdges().
+  s += strings::Printf("node { name: 'out' op: 'Output' input: [ ");
+  for (int op = 0; op < num_nodes - 1; op++) {
+    s += strings::Printf("'op%05d', ", op);
+  }
+  s += strings::Printf("'op%05d' ], attr: { key: 'N' value { i: %d } } } ",
+                       num_nodes - 1, num_nodes);
   GraphDef graph_def;
   CHECK(protobuf::TextFormat::ParseFromString(s, &graph_def));
   return graph_def;
@@ -799,5 +810,44 @@ BENCHMARK(BM_GraphCreation)->ArgPair(1 << 9, 16);
 BENCHMARK(BM_GraphCreation)->ArgPair(1 << 12, 16);
 BENCHMARK(BM_GraphCreation)->ArgPair(1 << 15, 16);
 
+static void BM_ToGraphDef(int iters, int num_nodes, int num_edges_per_node) {
+  testing::StopTiming();
+  const GraphDef graph_def = CreateGraphDef(num_nodes, num_edges_per_node);
+  const auto registry = OpRegistry::Global();
+  GraphConstructorOptions opts;
+  // Warmup step.
+  Graph graph(registry);
+  TF_CHECK_OK(ConvertGraphDefToGraph(opts, graph_def, &graph));
+  int64 sum = 0;
+  testing::StartTiming();
+  for (int i = 0; i < iters; ++i) {
+    GraphDef graph_def;
+    graph.ToGraphDef(&graph_def);
+    sum += graph_def.node_size();
+  }
+  VLOG(1) << sum;
+  testing::StopTiming();
+}
+BENCHMARK(BM_ToGraphDef)->ArgPair(10, 2);
+BENCHMARK(BM_ToGraphDef)->ArgPair(1 << 6, 2);
+BENCHMARK(BM_ToGraphDef)->ArgPair(1 << 9, 2);
+BENCHMARK(BM_ToGraphDef)->ArgPair(1 << 12, 2);
+BENCHMARK(BM_ToGraphDef)->ArgPair(1 << 15, 2);
+BENCHMARK(BM_ToGraphDef)->ArgPair(10, 4);
+BENCHMARK(BM_ToGraphDef)->ArgPair(1 << 6, 4);
+BENCHMARK(BM_ToGraphDef)->ArgPair(1 << 9, 4);
+BENCHMARK(BM_ToGraphDef)->ArgPair(1 << 12, 4);
+BENCHMARK(BM_ToGraphDef)->ArgPair(1 << 15, 4);
+BENCHMARK(BM_ToGraphDef)->ArgPair(10, 8);
+BENCHMARK(BM_ToGraphDef)->ArgPair(1 << 6, 8);
+BENCHMARK(BM_ToGraphDef)->ArgPair(1 << 9, 8);
+BENCHMARK(BM_ToGraphDef)->ArgPair(1 << 12, 8);
+BENCHMARK(BM_ToGraphDef)->ArgPair(1 << 15, 8);
+BENCHMARK(BM_ToGraphDef)->ArgPair(10, 16);
+BENCHMARK(BM_ToGraphDef)->ArgPair(1 << 6, 16);
+BENCHMARK(BM_ToGraphDef)->ArgPair(1 << 9, 16);
+BENCHMARK(BM_ToGraphDef)->ArgPair(1 << 12, 16);
+BENCHMARK(BM_ToGraphDef)->ArgPair(1 << 15, 16);
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc
index 69735aac028a979d4deb2561fd0389cceb4a11de..3c868dc22261fae7ebc061ce7a0aec51477dfdc1 100644
--- a/tensorflow/core/graph/mkl_layout_pass.cc
+++ b/tensorflow/core/graph/mkl_layout_pass.cc
@@ -22,9 +22,12 @@ limitations under the License.
 #include <memory>
 #include <queue>
 #include <set>
+#include <stack>
+#include <tuple>
 #include <unordered_set>
 #include <utility>
 #include <vector>
+
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/optimization_registry.h"
 #include "tensorflow/core/framework/node_def_util.h"
@@ -257,7 +260,10 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     csinfo_.conv3d_grad_filter = "Conv3DBackpropFilterV2";
     csinfo_.fused_batch_norm = "FusedBatchNorm";
     csinfo_.fused_batch_norm_grad = "FusedBatchNormGrad";
+    csinfo_.fused_conv2d = "_FusedConv2D";
     csinfo_.identity = "Identity";
+    csinfo_.leakyrelu = "LeakyRelu";
+    csinfo_.leakyrelu_grad = "LeakyReluGrad";
     csinfo_.lrn = "LRN";
     csinfo_.lrn_grad = "LRNGrad";
     csinfo_.matmul = "MatMul";
@@ -271,6 +277,10 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     csinfo_.mkl_conv2d_with_bias = "_MklConv2DWithBias";
     csinfo_.mkl_conv2d_grad_filter_with_bias =
         "_MklConv2DBackpropFilterWithBias";
+    csinfo_.mkl_fused_conv2d = "_MklFusedConv2D";
+    csinfo_.mkl_pad_with_conv2d = "_MklPadWithConv2D";
+    csinfo_.pad = "Pad";
+    csinfo_.pad_with_conv2d = "__MklDummyPadWithConv2D";
 // Temporarily don't convert quantized operators into MKL versions for now.
 // TODO(Intel-tf) Once all the relevant PRs have been merged then remove
 // the ifdef.
@@ -310,6 +320,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     csinfo_.slice = "Slice";
     csinfo_.softmax = "Softmax";
     csinfo_.split = "Split";
+    csinfo_.transpose = "Transpose";
     // Element-wise ops. Ensure you also add any new ops to IsOpElementWise
     // in the MklUtil.h (IsMklElementWiseOp method) to ensure that the
     // MklInputConversion op is added before it.
@@ -373,6 +384,8 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
         {csinfo_.fused_batch_norm_grad,
          mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm_grad),
          CopyAttrsFusedBatchNorm, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.fused_conv2d, csinfo_.mkl_fused_conv2d,
+                      CopyAttrsFusedConv2D, FusedConv2DRewrite});
     rinfo_.push_back({csinfo_.identity,
                       mkl_op_registry::GetMklOpName(csinfo_.identity),
                       CopyAttrsDataType, AlwaysRewrite});
@@ -381,6 +394,12 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     rinfo_.push_back({csinfo_.lrn_grad,
                       mkl_op_registry::GetMklOpName(csinfo_.lrn_grad),
                       CopyAttrsLRN, LrnGradRewrite});
+    rinfo_.push_back({csinfo_.leakyrelu,
+                      mkl_op_registry::GetMklOpName(csinfo_.leakyrelu),
+                      CopyAttrsLeakyRelu, LeakyReluRewrite});
+    rinfo_.push_back({csinfo_.leakyrelu_grad,
+                      mkl_op_registry::GetMklOpName(csinfo_.leakyrelu_grad),
+                      CopyAttrsLeakyRelu, LeakyReluRewrite});
     rinfo_.push_back({csinfo_.max_pool,
                       mkl_op_registry::GetMklOpName(csinfo_.max_pool),
                       CopyAttrsPooling, NonDepthBatchWisePoolRewrite});
@@ -398,6 +417,8 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
                       CopyAttrsDataType, AlwaysRewrite});
     rinfo_.push_back({csinfo_.mul, mkl_op_registry::GetMklOpName(csinfo_.mul),
                       CopyAttrsDataType, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.pad_with_conv2d, csinfo_.mkl_pad_with_conv2d,
+                      CopyAttrsPadWithConv2D, AlwaysRewrite});
 #ifdef INTEL_MKL_QUANTIZED
     rinfo_.push_back({csinfo_.quantized_avg_pool,
                       mkl_op_registry::GetMklOpName(csinfo_.quantized_avg_pool),
@@ -508,6 +529,44 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     minfo_.push_back({csinfo_.conv2d_grad_filter, csinfo_.bias_add_grad,
                       csinfo_.conv2d_grad_filter_with_bias,
                       GetConv2DBackpropFilterOrBiasAddGrad});
+    minfo_.push_back(
+        {csinfo_.pad, csinfo_.conv2d, csinfo_.pad_with_conv2d, GetPadOrConv2D});
+    // Merge Pad and Conv2d, only if the pad op is "Pad"
+    // Doesn't merge if pad op is "PadV2" or "MirrorPad"
+
+    // The fusion patterns in "finfo_" that show up first will get applied
+    // first, for example, graph "A->B->C-D" and finfo_ is {A->B->C to ABC,
+    // A->B->C->D to ABCD}, since the first gets applied first, the final
+    // graph will be ABC->D.
+
+    //
+    // Add rules to fuse sequences such as "Transpose (NCHW -> NHWC) + Conv2D
+    // (NHWC) + Transpose (NHWC->
+    // NCHW)" into "Conv2D (NCHW)". Such patterns occur frequently in Keras.
+    // Note: we use the term "merge" to combine (exactly) 2 nodes into one,
+    // while "fusion" is for 3+ nodes situation.
+    //
+
+    // Transpose + Conv2d + Transpose:
+    std::vector<int> transpose_to_nhwc = {NCHW::dim::N, NCHW::dim::H,
+                                          NCHW::dim::W, NCHW::dim::C};
+    std::vector<int> transpose_to_nchw = {NHWC::dim::N, NHWC::dim::C,
+                                          NHWC::dim::H, NHWC::dim::W};
+    auto CheckForTransposeToNHWC =
+        std::bind(CheckForTranspose, std::placeholders::_1, transpose_to_nhwc);
+    auto CheckForConv2dOp =
+        std::bind(CheckForMklOp, std::placeholders::_1, csinfo_.conv2d);
+    auto CheckForTransposeToNCHW =
+        std::bind(CheckForTranspose, std::placeholders::_1, transpose_to_nchw);
+    auto FuseConv2D =
+        std::bind(FuseTransposeMklOpTranspose, std::placeholders::_1,
+                  std::placeholders::_2, std::placeholders::_3, "NCHW");
+    finfo_.push_back(
+        {"transpose-elimination for Conv2D",
+         {CheckForTransposeToNHWC, CheckForConv2dOp, CheckForTransposeToNCHW},
+         // CheckForMklOp
+         FuseConv2D,
+         CopyAttrsConv});
   }
 
   // Standard interface to run pass
@@ -530,7 +589,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     string name;      // Original name of op of the node in the graph
     string new_name;  // New name of the op of the node in the graph
     // A function handler to copy attributes from an old node to a new node.
-    std::function<void(const Node*, NodeBuilder*)> copy_attrs;
+    std::function<void(const Node*, NodeBuilder*, bool)> copy_attrs;
     // A rule under which to rewrite this node
     std::function<bool(const Node*)> rewrite_rule;
   } RewriteInfo;
@@ -560,6 +619,41 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     std::function<Node*(const Node*)> get_node_to_be_merged;
   } MergeInfo;
 
+  // Structure to specify information used in node fusion of 3+ operators
+  typedef struct {
+    std::string pattern_name;  // Name to describe this pattern, such as
+                               // "Transpose_Mklop_Transpose".
+    std::vector<std::function<bool(const Node*)> >
+        node_checkers;  // Extra restriction checker for these ops
+    std::function<Status(
+        std::unique_ptr<Graph>*, std::vector<Node*>&,
+        std::function<void(const Node*, NodeBuilder* nb, bool)>)>
+        fuse_func;
+    std::function<void(const Node*, NodeBuilder* nb, bool)> copy_attrs;
+  } FusionInfo;
+
+  //
+  // Dimension indices for 2D tensor.
+  //
+  struct NCHW {
+    enum dim { N = 0, C = 1, H = 2, W = 3 };
+  };
+
+  struct NHWC {
+    enum dim { N = 0, H = 1, W = 2, C = 3 };
+  };
+
+  //
+  // dimension indices for 3D tensor.
+  //
+  struct NCDHW {
+    enum dim { N = 0, C = 1, D = 2, H = 3, W = 4 };
+  };
+
+  struct NDHWC {
+    enum dim { N = 0, D = 1, H = 2, W = 3, C = 4 };
+  };
+
   /// Structure to store all constant strings
   /// NOTE: names are alphabetically sorted.
   typedef struct {
@@ -583,7 +677,10 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     string conv3d_grad_filter;
     string fused_batch_norm;
     string fused_batch_norm_grad;
+    string fused_conv2d;
     string identity;
+    string leakyrelu;
+    string leakyrelu_grad;
     string lrn;
     string lrn_grad;
     string matmul;
@@ -597,7 +694,11 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     string mkl_conv2d_grad_filter;
     string mkl_conv2d_grad_filter_with_bias;
     string mkl_conv2d_with_bias;
+    string mkl_fused_conv2d;
+    string mkl_pad_with_conv2d;
     string mul;
+    string pad;
+    string pad_with_conv2d;
     string quantized_avg_pool;
     string quantized_conv2d;
     string quantized_conv2d_with_requantize;
@@ -619,6 +720,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     string requantize;
     string tanh;
     string tanh_grad;
+    string transpose;
     string reshape;
     string slice;
     string softmax;
@@ -637,6 +739,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
   /// Maintain info about nodes to be merged
   std::vector<MergeInfo> minfo_;
 
+  /// Maintain info about nodes to be fused
+  std::vector<FusionInfo> finfo_;
+
   /// Maintain structure of constant strings
   static ConstStringsInfo csinfo_;
 
@@ -721,6 +826,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
 
   // Helper function to merge different nodes
   Status MergeConv2DWithBiasAdd(std::unique_ptr<Graph>* g, Node* m, Node* n);
+  Status MergePadWithConv2D(std::unique_ptr<Graph>* g, Node* m, Node* n);
   Status MergeConv2DBackpropFilterWithBiasAddGrad(std::unique_ptr<Graph>* g,
                                                   Node* m, Node* n);
 
@@ -758,6 +864,54 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     return n;
   }
 
+  // Find Pad or Conv2D node that can be merged with input node 'm'.
+  // If input 'm' is Pad, then check if there exists Conv2D node that can be
+  // merged with 'm'. If input 'm' is Conv2D, then check if there exists Pad
+  // node that can be merged with 'm'.
+  static Node* GetPadOrConv2D(const Node* m) {
+    DCHECK(m);
+    Node* n = nullptr;
+
+    const Node* conv_node;
+    if (m->type_string() == csinfo_.pad) {
+      // If m is Pad, then Conv2D is the output of Pad.
+      for (const Edge* e : m->out_edges()) {
+        if (!e->IsControlEdge() && e->dst()->type_string() == csinfo_.conv2d) {
+          n = e->dst();
+          conv_node = n;
+          break;
+        }
+      }
+    } else {
+      DCHECK_EQ(m->type_string(), csinfo_.conv2d);
+      // If m is conv2D, Go over all input edges
+      // and search for Pad  Node.
+      for (const Edge* e : m->in_edges()) {
+        if (!e->IsControlEdge() && e->src()->type_string() == csinfo_.pad) {
+          n = e->src();
+          conv_node = m;
+          break;
+        }
+      }
+    }
+    // Check if only VALID type of padding is used
+    // or not.
+    if (n != nullptr) {
+      string padding;
+      TF_CHECK_OK(GetNodeAttr(conv_node->def(), "padding", &padding));
+      if (padding != "VALID")
+        // Then do not merge.
+        // Only VALID type of padding in conv op can be
+        // merged with Pad op.
+        n = nullptr;
+    } else {
+      VLOG(1) << "MklLayoutRewritePass: Could not find matching "
+              << "Pad and Conv2D node for merging. Input node: "
+              << m->DebugString();
+    }
+
+    return n;
+  }
   // Find Conv2DBackpropFilter or BiasAddGrad node that can be merged with input
   // node 'm'. If input 'm' is Conv2DBackpropFilter, then check if there exists
   // BiasAddGrad node that can be merged with 'm'. If input 'm' is BiasAddGrad,
@@ -815,6 +969,119 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     return n;
   }
 
+  // Return a node that can be fused with input node 'n'
+  //
+  // @return tuple. If we can find such nodes, the first
+  // element of the tuple is a true. Otherwise, it's false.
+  std::tuple<bool, std::vector<Node*>, const MklLayoutRewritePass::FusionInfo>
+  CheckForNodeFusion(Node* n) const;
+
+  // Fuse nodes in the vector "nodes"
+  Status FuseNode(std::unique_ptr<Graph>* g, std::vector<Node*>& nodes,
+                  const MklLayoutRewritePass::FusionInfo fi);
+
+  // Fuse tranpose(to "NHWC") + mklop("NHWC") + transpose(to "NCHW") into
+  // mklop("NCHW").
+  // Here "mklop" can be any MKL-DNN supported op, such as Conv2D.
+  static Status FuseTransposeMklOpTranspose(
+      std::unique_ptr<Graph>* g, std::vector<Node*>& nodes,
+      std::function<void(const Node*, NodeBuilder* nb, bool)> copy_attrs,
+      string data_format);
+
+  static bool CheckForTranspose(const Node* node, std::vector<int> perm) {
+    // Check if node's type is "Transpose"
+    if (node->type_string() != "Transpose") return false;
+
+    // If "Transpose" has multiple output data edges, also don't fuse it.
+    if (node->num_outputs() > 1 || node->out_edges().size() > 1) return false;
+
+    // Check if has out control edge. If true, this is a training graph.
+    // Currently we focus on inference and do no fusion in training.
+    // Note: this constraint will eventually be removed, if we enabled this
+    // fusion for training
+    // in the future.
+    for (const Edge* e : node->out_edges()) {
+      if (e->IsControlEdge()) {
+        return false;
+      }
+    }
+
+    // If "Transpose" has input control edges, don't fuse on it.
+    for (const Edge* e : node->in_edges()) {
+      if (e->IsControlEdge()) {
+        return false;
+      }
+    }
+
+    // We compared the tensor containing the permutation order ("perm_node")
+    // with our desired order ("perm"). If they're exactly match, this check
+    // succeed and returns true.
+    for (const Edge* e : node->in_edges()) {
+      if (!e->IsControlEdge()) {
+        const Node* perm_node = e->src();
+
+        const int kPermTensorIndex = 1;
+        if (perm_node->type_string() == "Const" &&
+            e->dst_input() == kPermTensorIndex) {
+          // we find the "perm" node, now try to retrieve its value.
+          const TensorProto* proto = nullptr;
+          DCHECK(GetNodeAttr(perm_node->def(), "value", &proto).ok());
+
+          DataType type;
+          GetNodeAttr(perm_node->def(), "dtype", &type);
+
+          // Here we directly access to the "tensor_content", rather than
+          // "int_val". This is because we find "int_val" is
+          // not set properly under some circumstances.
+          if (type == DT_INT32) {
+            const int type_size = 4;
+            const int* tensor_content =
+                reinterpret_cast<const int*>(proto->tensor_content().c_str());
+            const int tensor_content_size =
+                proto->tensor_content().size() / type_size;
+
+            std::vector<int> perm_value(tensor_content,
+                                        tensor_content + tensor_content_size);
+
+            return perm_value == perm;
+          } else if (type == DT_INT64) {
+            const int type_size = 8;
+            const long* tensor_content =
+                reinterpret_cast<const long*>(proto->tensor_content().c_str());
+            const int tensor_content_size =
+                proto->tensor_content().size() / type_size;
+
+            std::vector<long> perm_value(tensor_content,
+                                         tensor_content + tensor_content_size);
+            std::vector<long> long_perm(perm.cbegin(), perm.cend());
+
+            return perm_value == long_perm;
+          }
+          return false;
+        }
+      }
+    }
+    return false;
+  }
+
+  static bool CheckForMklOp(const Node* node, string name = "") {
+    if (node == nullptr) return false;
+
+    if (!name.empty() && node->type_string() != name) {
+      return false;
+    }
+
+    // if mklop has multiple outputs, don't fuse it.
+    if (node->num_outputs() > 1) return false;
+
+    if (node->out_edges().size() > 1) return false;
+
+    DataType T;
+    TF_CHECK_OK(GetNodeAttr(node->def(), "T", &T));
+    return mkl_op_registry::IsMklOp(
+        mkl_op_registry::GetMklOpName(node->type_string()), T);
+  }
+
   // Check if the node 'n' has any applicable rewrite rule
   // We check for 2 scenarios for rewrite.
   //
@@ -891,6 +1158,30 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     return do_rewrite;
   }
 
+  // MKL-DNN's LeakyRelu(feature) = feature          (if feature > 0), or
+  //                                feature * alpha  (otherwise),
+  // while TensorFlow's LeakyRelu(feature) = max(feature, feature * alpha).
+  // These two algorithms are not consistent when alpha > 1,
+  // so we only rewrite LeakyRelu to MKL OP when alpha <= 1.
+  static bool LeakyReluRewrite(const Node* n) {
+    DCHECK(n);
+
+    float alpha;
+    bool has_attr = GetNodeAttr(n->def(), "alpha", &alpha).ok();
+    DCHECK(has_attr);
+
+    // If the alpha of LeakyRelu is less than 1, rewrite the node.
+    // Otherwise eigen node is used instead.
+    if (alpha <= 1) {
+      return true;
+    }
+    VLOG(1) << "LeakyReluRewrite: The model sets alpha is greater than 1 "
+            << "which case is not optimized by Intel MKL, thus using Eigen op"
+            << "for LeakyRelu ";
+
+    return false;
+  }
+
   static bool MaxpoolGradRewrite(const Node* n) {
     CHECK_NOTNULL(n);
     bool do_rewrite = false;
@@ -923,6 +1214,23 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     return false;
   }
 
+  static bool FusedConv2DRewrite(const Node* n) {
+    // MKL DNN currently doesn't support all fusions that grappler fuses
+    // together with Conv2D (ex. batchnorm). We rewrite _FusedConv2D only if
+    // it includes those we support.
+    DataType T;
+    if (!GetNodeAttr(n->def(), "T", &T).ok() ||
+        !mkl_op_registry::IsMklOp(csinfo_.mkl_fused_conv2d, T)) {
+      return false;
+    }
+
+    std::vector<string> fused_ops;
+    TF_CHECK_OK(GetNodeAttr(n->def(), "fused_ops", &fused_ops));
+    return (fused_ops == std::vector<string>{"BiasAdd"} ||
+            fused_ops == std::vector<string>{"Relu"} ||
+            fused_ops == std::vector<string>{"BiasAdd", "Relu"});
+  }
+
   // Rewrites input node to a new node specified by its matching rewrite info.
   //
   // Method first searches matching rewrite info for input node and then
@@ -1070,22 +1378,47 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
   // We need operator-specific function to copy attributes because the framework
   // does not provide any generic function for it.
   // NOTE: names are alphabetically sorted.
-  static void CopyAttrsAddN(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsBiasAddGrad(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsConcat(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsConcatV2(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsConv(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsDataType(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsFusedBatchNorm(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsLRN(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsPooling(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsQuantizedPooling(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsQuantizedConv2D(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsQuantizedConcat(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsReshape(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsRequantize(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsSlice(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsSplit(const Node* orig_node, NodeBuilder* nb);
+  static void CopyAttrsAddN(const Node* orig_node, NodeBuilder* nb,
+                            bool change_format = false);
+  static void CopyAttrsBiasAddGrad(const Node* orig_node, NodeBuilder* nb,
+                                   bool change_format = false);
+  static void CopyAttrsConcat(const Node* orig_node, NodeBuilder* nb,
+                              bool change_format = false);
+  static void CopyAttrsConcatV2(const Node* orig_node, NodeBuilder* nb,
+                                bool change_format = false);
+  static void CopyAttrsConv(const Node* orig_node, NodeBuilder* nb,
+                            bool change_format = false);
+  static void CopyAttrsDataType(const Node* orig_node, NodeBuilder* nb,
+                                bool change_format = false);
+  static void CopyAttrsFusedBatchNorm(const Node* orig_node, NodeBuilder* nb,
+                                      bool change_format = false);
+  static void CopyAttrsLeakyRelu(const Node* orig_node, NodeBuilder* nb,
+                                 bool change_format = false);
+  static void CopyAttrsFusedConv2D(const Node* orig_node, NodeBuilder* nb,
+                                   bool change_format = false);
+  static void CopyAttrsLRN(const Node* orig_node, NodeBuilder* nb,
+                           bool change_format = false);
+  static void CopyAttrsPadWithConv2D(const Node* orig_node, NodeBuilder* nb,
+                                     bool change_format = false);
+  static void CopyAttrsFromPadAndConv2D(const Node* orig_node1,
+                                        const Node* orig_node2, NodeBuilder* nb,
+                                        bool change_format = false);
+  static void CopyAttrsPooling(const Node* orig_node, NodeBuilder* nb,
+                               bool change_format = false);
+  static void CopyAttrsQuantizedPooling(const Node* orig_node, NodeBuilder* nb,
+                                        bool change_format = false);
+  static void CopyAttrsQuantizedConv2D(const Node* orig_node, NodeBuilder* nb,
+                                       bool change_format = false);
+  static void CopyAttrsQuantizedConcat(const Node* orig_node, NodeBuilder* nb,
+                                       bool change_format = false);
+  static void CopyAttrsReshape(const Node* orig_node, NodeBuilder* nb,
+                               bool change_format = false);
+  static void CopyAttrsRequantize(const Node* orig_node, NodeBuilder* nb,
+                                  bool change_format = false);
+  static void CopyAttrsSlice(const Node* orig_node, NodeBuilder* nb,
+                             bool change_format = false);
+  static void CopyAttrsSplit(const Node* orig_node, NodeBuilder* nb,
+                             bool change_format = false);
 
   // Generate a graph node in graph 'g' representing a dummy Mkl tensor node,
   // using node for original node 'orig_node' and return it in '*out'.
@@ -1282,10 +1615,13 @@ int MklLayoutRewritePass::SetUpContiguousInputs(
     CHECK_NOTNULL(filter_node);
 
     // Now check which nodes receive from filter_node. Filter feeds as
-    // 2nd input (slot 1) of _MklConv2D and _MklConv2DWithBias.
+    // 2nd input (slot 1) of _MklConv2D, _MklConv2DWithBias, and
+    // _MklFusedConv2D.
     for (const Edge* e : filter_node->out_edges()) {
       if ((e->dst()->type_string() == csinfo_.mkl_conv2d ||
-           e->dst()->type_string() == csinfo_.mkl_conv2d_with_bias) &&
+           e->dst()->type_string() == csinfo_.mkl_pad_with_conv2d ||
+           e->dst()->type_string() == csinfo_.mkl_conv2d_with_bias ||
+           e->dst()->type_string() == csinfo_.mkl_fused_conv2d) &&
           e->dst_input() == kConv2DFilterInputSlotIdx
           /* filter is 2nd input of Conv2D and _MklConv2D. */) {
         if (conv2d_node != nullptr) {
@@ -1586,13 +1922,71 @@ void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded(
 // Op-specific functions to copy attributes from old node to new node
 //////////////////////////////////////////////////////////////////////////
 
-void MklLayoutRewritePass::CopyAttrsConv(const Node* orig_node,
-                                         NodeBuilder* nb) {
+void MklLayoutRewritePass::CopyAttrsConv(const Node* orig_node, NodeBuilder* nb,
+                                         bool change_format) {
+  DataType T;
+  string data_format;
+  string padding;
+  std::vector<int32> strides;
+  std::vector<int32> dilations;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "strides", &strides));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "dilations", &dilations));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "padding", &padding));
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("padding", padding);
+
+  if (!change_format) {
+    nb->Attr("strides", strides);
+    nb->Attr("dilations", dilations);
+
+    TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
+    nb->Attr("data_format", data_format);
+  } else {
+    std::vector<int32> new_strides;
+    std::vector<int32> new_dilations;
+    if (strides.size() == 5) {
+      // "strides" and "dilations" also need to be changed according to
+      // "data_format",
+      // in this case, is "NDHWC" to "NCDHW".
+      new_strides = {strides[NDHWC::dim::N], strides[NDHWC::dim::C],
+                     strides[NDHWC::dim::D], strides[NDHWC::dim::H],
+                     strides[NDHWC::dim::W]};
+
+      new_dilations = {dilations[NDHWC::dim::N], dilations[NDHWC::dim::C],
+                       dilations[NDHWC::dim::D], dilations[NDHWC::dim::H],
+                       dilations[NDHWC::dim::W]};
+    } else {
+      // "strides" and "dilations" also need to be changed according to
+      // "data_format",
+      // in this case, is "NHWC" to "NCHW".
+
+      new_strides = {strides[NHWC::dim::N], strides[NHWC::dim::C],
+                     strides[NHWC::dim::H], strides[NHWC::dim::W]};
+
+      new_dilations = {dilations[NHWC::dim::N], dilations[NHWC::dim::C],
+                       dilations[NHWC::dim::H], dilations[NHWC::dim::W]};
+    }
+    nb->Attr("strides", new_strides);
+    nb->Attr("dilations", new_dilations);
+  }
+}
+
+// Used in rinfo when replacing __MklDummyPadWithConv2D by _MklPadWithConv2D
+void MklLayoutRewritePass::CopyAttrsPadWithConv2D(const Node* orig_node,
+                                                  NodeBuilder* nb,
+                                                  bool change_format) {
+  DataType Tpaddings;
   DataType T;
   string data_format;
   string padding;
   std::vector<int32> strides;
   std::vector<int32> dilations;
+  bool use_cudnn_on_gpu;
 
   // Get all attributes from old node.
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
@@ -1600,6 +1994,9 @@ void MklLayoutRewritePass::CopyAttrsConv(const Node* orig_node,
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "dilations", &dilations));
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "padding", &padding));
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
+  TF_CHECK_OK(
+      GetNodeAttr(orig_node->def(), "use_cudnn_on_gpu", &use_cudnn_on_gpu));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "Tpaddings", &Tpaddings));
 
   // Add attributes to new node.
   nb->Attr("T", T);
@@ -1607,10 +2004,46 @@ void MklLayoutRewritePass::CopyAttrsConv(const Node* orig_node,
   nb->Attr("dilations", dilations);
   nb->Attr("padding", padding);
   nb->Attr("data_format", data_format);
+  nb->Attr("use_cudnn_on_gpu", use_cudnn_on_gpu);
+  nb->Attr("Tpaddings", Tpaddings);
 }
 
-void MklLayoutRewritePass::CopyAttrsAddN(const Node* orig_node,
-                                         NodeBuilder* nb) {
+// Used with MergePadWithConv2D
+void MklLayoutRewritePass::CopyAttrsFromPadAndConv2D(const Node* orig_node1,
+                                                     const Node* orig_node2,
+                                                     NodeBuilder* nb,
+                                                     bool change_format) {
+  DataType Tpaddings;
+  DataType T;
+  string data_format;
+  string padding;
+  std::vector<int32> strides;
+  std::vector<int32> dilations;
+  bool use_cudnn_on_gpu;
+
+  // Get all attributes from old node 1.
+  TF_CHECK_OK(GetNodeAttr(orig_node1->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node1->def(), "strides", &strides));
+  TF_CHECK_OK(GetNodeAttr(orig_node1->def(), "dilations", &dilations));
+  TF_CHECK_OK(GetNodeAttr(orig_node1->def(), "padding", &padding));
+  TF_CHECK_OK(GetNodeAttr(orig_node1->def(), "data_format", &data_format));
+  TF_CHECK_OK(
+      GetNodeAttr(orig_node1->def(), "use_cudnn_on_gpu", &use_cudnn_on_gpu));
+  // Get all attributes from old node 2.
+  TF_CHECK_OK(GetNodeAttr(orig_node2->def(), "Tpaddings", &Tpaddings));
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("strides", strides);
+  nb->Attr("dilations", dilations);
+  nb->Attr("padding", padding);
+  nb->Attr("data_format", data_format);
+  nb->Attr("use_cudnn_on_gpu", use_cudnn_on_gpu);
+  nb->Attr("Tpaddings", Tpaddings);
+}
+
+void MklLayoutRewritePass::CopyAttrsAddN(const Node* orig_node, NodeBuilder* nb,
+                                         bool change_format) {
   DataType T;
   int N;
 
@@ -1624,7 +2057,8 @@ void MklLayoutRewritePass::CopyAttrsAddN(const Node* orig_node,
 }
 
 void MklLayoutRewritePass::CopyAttrsBiasAddGrad(const Node* orig_node,
-                                                NodeBuilder* nb) {
+                                                NodeBuilder* nb,
+                                                bool change_format) {
   DataType T;
   string data_format;
   std::vector<int32> strides;
@@ -1640,8 +2074,8 @@ void MklLayoutRewritePass::CopyAttrsBiasAddGrad(const Node* orig_node,
   nb->Attr("data_format", data_format);
 }
 
-void MklLayoutRewritePass::CopyAttrsLRN(const Node* orig_node,
-                                        NodeBuilder* nb) {
+void MklLayoutRewritePass::CopyAttrsLRN(const Node* orig_node, NodeBuilder* nb,
+                                        bool change_format) {
   DataType T;
   int depth_radius;
   float bias;
@@ -1663,8 +2097,24 @@ void MklLayoutRewritePass::CopyAttrsLRN(const Node* orig_node,
   nb->Attr("beta", beta);
 }
 
+void MklLayoutRewritePass::CopyAttrsLeakyRelu(const Node* orig_node,
+                                              NodeBuilder* nb,
+                                              bool change_format) {
+  DataType T;
+  float alpha;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "alpha", &alpha));
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("alpha", alpha);
+}
+
 void MklLayoutRewritePass::CopyAttrsPooling(const Node* orig_node,
-                                            NodeBuilder* nb) {
+                                            NodeBuilder* nb,
+                                            bool change_format) {
   DataType T;
   string data_format;
   string padding;
@@ -1686,7 +2136,8 @@ void MklLayoutRewritePass::CopyAttrsPooling(const Node* orig_node,
 }
 
 void MklLayoutRewritePass::CopyAttrsDataType(const Node* orig_node,
-                                             NodeBuilder* nb) {
+                                             NodeBuilder* nb,
+                                             bool change_format) {
   DataType T;
 
   // Get all attributes from old node.
@@ -1697,7 +2148,8 @@ void MklLayoutRewritePass::CopyAttrsDataType(const Node* orig_node,
 }
 
 void MklLayoutRewritePass::CopyAttrsQuantizedPooling(const Node* orig_node,
-                                                     NodeBuilder* nb) {
+                                                     NodeBuilder* nb,
+                                                     bool change_format) {
   DataType T;
   string data_format;
   string padding;
@@ -1717,7 +2169,8 @@ void MklLayoutRewritePass::CopyAttrsQuantizedPooling(const Node* orig_node,
 }
 
 void MklLayoutRewritePass::CopyAttrsQuantizedConv2D(const Node* orig_node,
-                                                    NodeBuilder* nb) {
+                                                    NodeBuilder* nb,
+                                                    bool change_format) {
   DataType Tinput, Tfilter, out_type;
   string padding;
   string data_format("NHWC");
@@ -1747,7 +2200,8 @@ void MklLayoutRewritePass::CopyAttrsQuantizedConv2D(const Node* orig_node,
 }
 
 void MklLayoutRewritePass::CopyAttrsRequantize(const Node* orig_node,
-                                               NodeBuilder* nb) {
+                                               NodeBuilder* nb,
+                                               bool change_format) {
   DataType Tinput, out_type;
 
   // Get all attributes from old node.
@@ -1760,7 +2214,8 @@ void MklLayoutRewritePass::CopyAttrsRequantize(const Node* orig_node,
 }
 
 void MklLayoutRewritePass::CopyAttrsReshape(const Node* orig_node,
-                                            NodeBuilder* nb) {
+                                            NodeBuilder* nb,
+                                            bool change_format) {
   DataType T;
   DataType Tshape;
 
@@ -1773,7 +2228,7 @@ void MklLayoutRewritePass::CopyAttrsReshape(const Node* orig_node,
 }
 
 void MklLayoutRewritePass::CopyAttrsSlice(const Node* orig_node,
-                                          NodeBuilder* nb) {
+                                          NodeBuilder* nb, bool change_format) {
   DataType T;
   DataType Index;
 
@@ -1786,7 +2241,7 @@ void MklLayoutRewritePass::CopyAttrsSlice(const Node* orig_node,
 }
 
 void MklLayoutRewritePass::CopyAttrsSplit(const Node* orig_node,
-                                          NodeBuilder* nb) {
+                                          NodeBuilder* nb, bool change_format) {
   DataType T;
   string data_format;
   int num_split;
@@ -1803,7 +2258,8 @@ void MklLayoutRewritePass::CopyAttrsSplit(const Node* orig_node,
 }
 
 void MklLayoutRewritePass::CopyAttrsConcat(const Node* orig_node,
-                                           NodeBuilder* nb) {
+                                           NodeBuilder* nb,
+                                           bool change_format) {
   DataType T;
   int N;
 
@@ -1817,7 +2273,8 @@ void MklLayoutRewritePass::CopyAttrsConcat(const Node* orig_node,
 }
 
 void MklLayoutRewritePass::CopyAttrsConcatV2(const Node* orig_node,
-                                             NodeBuilder* nb) {
+                                             NodeBuilder* nb,
+                                             bool change_format) {
   DataType T;
   int N;
   DataType tidx;
@@ -1834,7 +2291,8 @@ void MklLayoutRewritePass::CopyAttrsConcatV2(const Node* orig_node,
 }
 
 void MklLayoutRewritePass::CopyAttrsFusedBatchNorm(const Node* orig_node,
-                                                   NodeBuilder* nb) {
+                                                   NodeBuilder* nb,
+                                                   bool change_format) {
   DataType T;
   float epsilon;
   string data_format;
@@ -1853,6 +2311,39 @@ void MklLayoutRewritePass::CopyAttrsFusedBatchNorm(const Node* orig_node,
   nb->Attr("is_training", is_training);
 }
 
+void MklLayoutRewritePass::CopyAttrsFusedConv2D(const Node* orig_node,
+                                                NodeBuilder* nb,
+                                                bool change_format) {
+  DataType T;
+  int num_args;
+  float epsilon;
+  string data_format;
+  string padding;
+  std::vector<int32> strides;
+  std::vector<int32> dilations;
+  std::vector<string> fused_ops;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "num_args", &num_args));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "strides", &strides));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "padding", &padding));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "dilations", &dilations));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "fused_ops", &fused_ops));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "epsilon", &epsilon));
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("num_args", num_args);
+  nb->Attr("strides", strides);
+  nb->Attr("padding", padding);
+  nb->Attr("data_format", data_format);
+  nb->Attr("dilations", dilations);
+  nb->Attr("fused_ops", fused_ops);
+  nb->Attr("epsilon", epsilon);
+}
+
 //////////////////////////////////////////////////////////////////////////
 //           Helper functions related to node merge pass
 //////////////////////////////////////////////////////////////////////////
@@ -2050,6 +2541,165 @@ Status MklLayoutRewritePass::MergeConv2DWithBiasAdd(std::unique_ptr<Graph>* g,
   return Status::OK();
 }
 
+Status MklLayoutRewritePass::MergePadWithConv2D(std::unique_ptr<Graph>* g,
+                                                Node* m, Node* n) {
+  DCHECK(((m->type_string() == csinfo_.pad &&
+           n->type_string() == csinfo_.conv2d)) ||
+         ((n->type_string() == csinfo_.pad &&
+           m->type_string() == csinfo_.conv2d)));
+
+  // Conv2D is successor node, and Pad predecessor node.
+  Node* pred = m->type_string() == csinfo_.pad ? m : n;
+  Node* succ = m->type_string() == csinfo_.pad ? n : m;
+
+  // 1. Get all attributes from input nodes.
+  DataType T_pred, T_succ;
+  string padding;
+  std::vector<int32> strides;
+  std::vector<int32> dilations;
+  string data_format_pred, data_format_succ;
+  bool use_cudnn_on_gnu;
+  TF_CHECK_OK(GetNodeAttr(pred->def(), "T", &T_pred));
+  TF_CHECK_OK(GetNodeAttr(succ->def(), "T", &T_succ));
+  TF_CHECK_OK(GetNodeAttr(succ->def(), "padding", &padding));
+  TF_CHECK_OK(GetNodeAttr(succ->def(), "strides", &strides));
+  TF_CHECK_OK(GetNodeAttr(succ->def(), "dilations", &dilations));
+  // Data format for pad is not available and not necessary, thus
+  // dont need to match data format for Pad
+  TF_CHECK_OK(GetNodeAttr(succ->def(), "data_format", &data_format_succ));
+  TF_CHECK_OK(GetNodeAttr(succ->def(), "use_cudnn_on_gpu", &use_cudnn_on_gnu));
+  // Check if the data types and devices of both succ and pred are the same.
+  // Assert is not used,  because it can be too strict.
+  // Don't need to check for data formats because it is not available in Pad.
+  if (T_pred != T_succ ||
+      pred->assigned_device_name() != succ->assigned_device_name() ||
+      pred->def().device() != succ->def().device()) {
+    return Status(error::Code::INVALID_ARGUMENT,
+                  "T attribute or devices of Conv2D and "
+                  "Pad do not match. Will skip node merge optimization");
+  }
+
+  const int succ_num = succ->num_inputs();
+  gtl::InlinedVector<Node*, 4> succ_control_edges;
+  gtl::InlinedVector<std::pair<Node*, int>, 4> succ_in(succ_num);
+  FillInputs(succ, &succ_control_edges, &succ_in);
+
+  const int pred_num = pred->num_inputs();
+  gtl::InlinedVector<Node*, 4> pred_control_edges;
+  gtl::InlinedVector<std::pair<Node*, int>, 4> pred_in(pred_num);
+  FillInputs(pred, &pred_control_edges, &pred_in);
+
+  // We need to ensure that Pad only feeds to Conv2D (some other operator is
+  // not expecting output of Pad). If this is not the case, then we cannot
+  // merge Conv2D with Pad.
+  const int kFirstOutputSlot = 0;
+  for (const Edge* e : pred->out_edges()) {
+    if (e->src_output() == kFirstOutputSlot && e->dst() != succ) {
+      return Status(error::Code::INVALID_ARGUMENT,
+                    "Pad does not feed to Conv2D, or "
+                    "it feeds Conv2D but has multiple outputs. "
+                    "Will skip node merge optimization");
+    }
+  }
+
+  // 2. Get inputs from both the nodes.
+
+  // Pad must have 2 data inputs: "input" and paddings.
+  int PadDataInputEdges = 0;
+  for (const Edge* e : pred->in_edges()) {
+    if (!e->IsControlEdge()) {
+      PadDataInputEdges++;
+    }
+  }
+  DCHECK_EQ(PadDataInputEdges, 2);
+
+  // Conv2D must have 2 data inputs: pad output and Filter
+  int ConvDataInputEdges = 0;
+  for (const Edge* e : succ->in_edges()) {
+    if (!e->IsControlEdge()) {
+      ConvDataInputEdges++;
+    }
+  }
+  DCHECK_EQ(ConvDataInputEdges, 2);
+
+  // We will use the node name of Conv2D as the name of new node
+  // Build new node. We use same name as original node, but change the op
+  // name.
+  NodeBuilder nb(succ->name(), csinfo_.pad_with_conv2d);
+  nb.Input(pred_in[0].first, pred_in[0].second);  // In1 (input data)  of Pad
+  // pred_in[1] will be 2nd Tensorflow tensor for Conv2D.
+  nb.Input(succ_in[1].first, succ_in[1].second);  // In2 (filter) of conv2d
+  // In1 of Conv2D is same as output of Pad.
+  // Thus, only need to add In2 of Conv2D
+  nb.Input(pred_in[1].first, pred_in[1].second);  // In2 (paddings) of Pad
+
+  // Copy attributes from Pad and conv2D to PadWithConv2D.
+  CopyAttrsFromPadAndConv2D(const_cast<const Node*>(succ),
+                            const_cast<const Node*>(pred), &nb);
+
+  // Copy the device assigned to old node to new node.
+  nb.Device(succ->def().device());
+
+  // Create node.
+  Node* new_node;
+  TF_CHECK_OK(nb.Finalize(&**g, &new_node));
+  DCHECK(new_node);
+
+  // Incoming data edges from 'pred' node and 'succ' node to new 'new_node'
+  // node are already copied in BuildNode.
+  // We handle control edges now.
+  for (const Edge* e : pred->in_edges()) {
+    if (e->IsControlEdge()) {
+      // Don't allow duplicate edge
+      (*g)->AddControlEdge(e->src(), new_node, false);
+    }
+  }
+  for (const Edge* e : succ->in_edges()) {
+    if (e->IsControlEdge()) {
+      // Don't allow duplicate edge
+      (*g)->AddControlEdge(e->src(), new_node, false);
+    }
+  }
+
+  // Incoming edges are fixed, we will fix the outgoing edges now.
+  // First, we will fix outgoing control edges from 'pred' node.
+  for (const Edge* e : pred->out_edges()) {
+    if (e->IsControlEdge()) {
+      // Don't allow duplicate edge
+      (*g)->AddControlEdge(new_node, e->dst(), false);
+    }
+  }
+
+  // Second, we will fix outgoing control and data edges from 'succ' node.
+  for (const Edge* e : succ->out_edges()) {
+    if (e->IsControlEdge()) {
+      // Allow duplicate while adding control edge as it would fail (return
+      // NULL) if we try to add duplicate edge.
+      (*g)->AddControlEdge(new_node, e->dst(), false);
+    } else {
+      // Conv2D has only 1 output (at slot 0) and merged node also has only 1
+      // output (at slot 0).
+      const int kPadWithConv2DOutputSlot = 0;
+      (*g)->AddEdge(new_node, kPadWithConv2DOutputSlot, e->dst(),
+                    e->dst_input());
+    }
+  }
+
+  // Copy device assigned to old node to new node.
+  // It's ok to use pred or succ as we have enforced a check that
+  // both have same device assigned.
+  new_node->set_assigned_device_name(pred->assigned_device_name());
+
+  VLOG(1) << "MklLayoutRewritePass: Merged old node:" << pred->DebugString()
+          << ", and node: " << succ->DebugString()
+          << ", into node:" << new_node->DebugString();
+
+  (*g)->RemoveNode(succ);
+  (*g)->RemoveNode(pred);
+
+  return Status::OK();
+}
+
 Status MklLayoutRewritePass::MergeConv2DBackpropFilterWithBiasAddGrad(
     std::unique_ptr<Graph>* g, Node* m, Node* n) {
   CHECK_EQ(((m->type_string() == csinfo_.bias_add_grad &&
@@ -2183,6 +2833,12 @@ Status MklLayoutRewritePass::MergeNode(std::unique_ptr<Graph>* g, Node* m,
         m->type_string() == csinfo_.conv2d))) {
     return this->MergeConv2DWithBiasAdd(g, m, n);
   }
+  if (((m->type_string() == csinfo_.pad &&
+        n->type_string() == csinfo_.conv2d)) ||
+      ((n->type_string() == csinfo_.pad &&
+        m->type_string() == csinfo_.conv2d))) {
+    return this->MergePadWithConv2D(g, m, n);
+  }
 
   if (((m->type_string() == csinfo_.bias_add_grad &&
         n->type_string() == csinfo_.conv2d_grad_filter)) ||
@@ -2231,7 +2887,8 @@ Status MklLayoutRewritePass::RewriteNode(std::unique_ptr<Graph>* g,
     return s;
   }
 
-  ri->copy_attrs(const_cast<const Node*>(orig_node), &nb);
+  const bool kPartialCopyAttrs = false;
+  ri->copy_attrs(const_cast<const Node*>(orig_node), &nb, kPartialCopyAttrs);
 
   // Set the Mkl layer label for this op.
   if (DataTypeIsQuantized(orig_node->input_type(0)) ||
@@ -2328,11 +2985,13 @@ MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const {
     return nullptr;
   }
 
-  // We make an exception for __MklDummyConv2DWithBias and
-  // __MklConv2DBackpropFilterWithBias since their names do not match Mkl node
-  // names.
+  // We make an exception for __MklDummyConv2DWithBias,
+  // __MklConv2DBackpropFilterWithBias, and __MklDummyPadWithConv2D since their
+  // names do not match Mkl node names.
   if (n->type_string() != csinfo_.conv2d_with_bias &&
+      n->type_string() != csinfo_.pad_with_conv2d &&
       n->type_string() != csinfo_.conv2d_grad_filter_with_bias &&
+      n->type_string() != csinfo_.fused_conv2d &&
       !mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName(n->type_string()),
                                 T)) {
     return nullptr;
@@ -2391,6 +3050,143 @@ MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const {
   return nullptr;
 }
 
+//////////////////////////////////////////////////////////////////////////
+//           Helper functions for node fusion
+//////////////////////////////////////////////////////////////////////////
+Status MklLayoutRewritePass::FuseTransposeMklOpTranspose(
+    std::unique_ptr<Graph>* g, std::vector<Node*>& nodes,
+    std::function<void(const Node*, NodeBuilder* nb, bool)> copy_attrs,
+    string data_format) {
+  Node* transpose_to_nhwc = nodes[0];
+  Node* mklop = nodes[1];
+  Node* transpose_to_nchw = nodes[2];
+
+  const int transpose_nhwc_num_inputs = transpose_to_nhwc->num_inputs();
+  gtl::InlinedVector<Node*, 4> transpose_nhwc_control_edges;
+  gtl::InlinedVector<std::pair<Node*, int>, 4> transpose_nhwc_in(
+      transpose_nhwc_num_inputs);
+  FillInputs(transpose_to_nhwc, &transpose_nhwc_control_edges,
+             &transpose_nhwc_in);
+
+  const int mklop_num_inputs = mklop->num_inputs();
+  gtl::InlinedVector<Node*, 4> mklop_control_edges;
+  gtl::InlinedVector<std::pair<Node*, int>, 4> mklop_in(mklop_num_inputs);
+  FillInputs(mklop, &mklop_control_edges, &mklop_in);
+
+  const int transpose_nchw_num_inputs = transpose_to_nchw->num_inputs();
+  gtl::InlinedVector<Node*, 4> transpose_nchw_control_edges;
+  gtl::InlinedVector<std::pair<Node*, int>, 4> transpose_nchw_in(
+      transpose_nchw_num_inputs);
+  FillInputs(transpose_to_nchw, &transpose_nchw_control_edges,
+             &transpose_nchw_in);
+
+  // We use same name as original node, but change the op
+  // type.
+  NodeBuilder nb(mklop->name(), mklop->type_string());
+
+  // Storing the output slots of the input nodes.
+  for (int i = 0; i < mklop_num_inputs; i++) {
+    if (mklop_in[i].first == transpose_to_nhwc) {
+      // Fill "x":
+      nb.Input(transpose_nhwc_in[0].first, transpose_nhwc_in[0].second);
+    } else {
+      // Fill inputs other than "x":
+      nb.Input(mklop_in[i].first, mklop_in[i].second);
+    }
+  }
+
+  copy_attrs(const_cast<const Node*>(mklop), &nb, true);
+  nb.Attr("data_format", data_format);
+
+  // Copy the device assigned to old node to new node.
+  nb.Device(mklop->def().device());
+
+  // Create node.
+  Node* new_node;
+  TF_CHECK_OK(nb.Finalize(&**g, &new_node));
+  DCHECK(new_node);
+
+  // Fill outputs.
+  for (const Edge* e : transpose_to_nchw->out_edges()) {
+    if (!e->IsControlEdge()) {
+      const int kTransposeWithMklOpOutputSlot = 0;
+      DCHECK((*g)->AddEdge(new_node, kTransposeWithMklOpOutputSlot, e->dst(),
+                           e->dst_input()));
+    }
+  }
+
+  // Copy device assigned to old node to new node.
+  new_node->set_assigned_device_name(mklop->assigned_device_name());
+
+  // Copy requested_device and assigned_device_name_index
+  new_node->set_requested_device(mklop->requested_device());
+  new_node->set_assigned_device_name_index(mklop->assigned_device_name_index());
+
+  (*g)->RemoveNode(transpose_to_nhwc);
+  (*g)->RemoveNode(mklop);
+  (*g)->RemoveNode(transpose_to_nchw);
+
+  return Status::OK();
+}
+
+Status MklLayoutRewritePass::FuseNode(
+    std::unique_ptr<Graph>* g, std::vector<Node*>& nodes,
+    const MklLayoutRewritePass::FusionInfo fi) {
+  return fi.fuse_func(g, nodes, fi.copy_attrs);
+}
+
+std::tuple<bool, std::vector<Node*>, const MklLayoutRewritePass::FusionInfo>
+MklLayoutRewritePass::CheckForNodeFusion(Node* a) const {
+  // Stores matched nodes, in the same order as node_checkers.
+  std::vector<Node*> nodes;
+
+  for (auto fi = finfo_.begin(); fi != finfo_.end(); ++fi) {
+    //
+    // Make sure node "a" and its succeding nodes (b, c ...), match the pattern
+    // defined in fusion info (ops[0], ops[1], ...),
+    // a.k.a. "a->b->c" matches "op1->op2->op3"
+    //
+
+    // Stores the first unvisted outgoing edge of each matched node in "nodes".
+    std::stack<EdgeSet::const_iterator> current_neighbor_stack;
+    nodes.clear();
+
+    auto node_checker = fi->node_checkers.begin();
+    if (a != nullptr && (*node_checker)(a)) {
+      nodes.push_back(a);
+      current_neighbor_stack.push(a->out_edges().begin());
+      ++node_checker;
+    }
+
+    while (!nodes.empty()) {
+      auto& current_neighbor_iter = current_neighbor_stack.top();
+
+      if (current_neighbor_iter != nodes.back()->out_edges().end()) {
+        // Found an unvisited edge. Goes through the edge to get the neighbor.
+        Node* neighbor_node = (*current_neighbor_iter)->dst();
+        ++current_neighbor_stack.top();  // Retrieves the next unvisited edge.
+
+        if ((*node_checker)(neighbor_node)) {
+          // Found a match. Stores the node and moves to the next checker.
+          nodes.push_back(neighbor_node);
+          current_neighbor_stack.push(neighbor_node->out_edges().begin());
+          if (++node_checker == fi->node_checkers.end()) {
+            return make_tuple(true, nodes, *fi);
+          }
+        }
+      } else {
+        // Removes the current node since none of its neighbor leads to a
+        // further match.
+        nodes.pop_back();
+        current_neighbor_stack.pop();
+        --node_checker;
+      }
+    }
+  }
+
+  return make_tuple(false, std::vector<Node*>(), FusionInfo());
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 //              Post-rewrite Mkl metadata fixup pass
 ///////////////////////////////////////////////////////////////////////////////
@@ -2516,6 +3312,30 @@ bool MklLayoutRewritePass::RunPass(std::unique_ptr<Graph>* g) {
 
   DumpGraph("After running MklLayoutRewritePass(NodeMerge)", &**g);
 
+#ifdef ENABLE_TRANSPOSE_OPTIMIZATION
+  order.clear();
+  GetReversePostOrder(**g, &order);  // This will give us topological sort.
+  for (Node* n : order) {
+    // If node is not an op or it cannot run on CPU device, then skip.
+    if (!n->IsOp() || !CanOpRunOnCPUDevice(n)) {
+      continue;
+    }
+
+    auto check_result = CheckForNodeFusion(n);
+    bool found_pattern = std::get<0>(check_result);
+    std::vector<Node*> nodes = std::get<1>(check_result);
+    const FusionInfo fi = std::get<2>(check_result);
+
+    // if "found_pattern" is true, we can do the fusion.
+    if (found_pattern) {
+      if (FuseNode(g, nodes, fi) == Status::OK()) {
+        result = true;
+      }
+    }
+  }
+  DumpGraph("After running MklLayoutRewritePass(NodeFusion)", &**g);
+#endif  // ENABLE_TRANSPOSE_OPTIMIZATION
+
   order.clear();
   GetReversePostOrder(**g, &order);  // This will give us topological sort.
   for (Node* n : order) {
diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc
index 7e2d1f78785c59b2fe58d32c3f750923234419d2..197ec0c4aebcaab0a57c6b021dc146a9c6534db1 100644
--- a/tensorflow/core/graph/mkl_layout_pass_test.cc
+++ b/tensorflow/core/graph/mkl_layout_pass_test.cc
@@ -65,6 +65,13 @@ static void InitGraph(const string& s, Graph* graph,
 class MklLayoutPassTest : public ::testing::Test {
  public:
   MklLayoutPassTest() : graph_(OpRegistry::Global()) {}
+  // Ashraf added
+  Node* FindNode(const string& name) {
+    for (Node* node : graph_.nodes()) {
+      if (node->name() == name) return node;
+    }
+    LOG(FATAL) << name;
+  }
 
   void InitGraph(const string& s, const string& device = kCPUDevice) {
     ::tensorflow::InitGraph(s, &graph_, device);
@@ -126,14 +133,17 @@ REGISTER_OP("Input").Output("o: float").SetIsStateful();
 REGISTER_OP("InputList").Output("o: N * float").Attr("N: int").SetIsStateful();
 REGISTER_OP("HalfInput").Output("o: half").SetIsStateful();
 REGISTER_OP("Int32Input").Output("o: int32").SetIsStateful();
+REGISTER_OP("DoubleInput").Output("o: double").SetIsStateful();
 REGISTER_OP("_MklInput").Output("o: uint8").SetIsStateful();
 REGISTER_OP("_MklInput2")
     .Output("o: uint8")
     .Output("o1: uint8")
     .SetIsStateful();
+REGISTER_OP("Output2").Input("i: float").Input("i1: float").SetIsStateful();
+REGISTER_OP("Output").Input("i: float").SetIsStateful();
 
 /////////////////////////////////////////////////////////////////////
-//  Unit tests related to node merge optiimization
+//  Unit tests related to node merge optimization
 /////////////////////////////////////////////////////////////////////
 
 TEST_F(MklLayoutPassTest, Basic) {
@@ -455,6 +465,559 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_ConvBpropInput_FilterFwd) {
             "E:3->G:4;F->G;F:control->DMT/_3:control;G->Z;X->Y:1;X->Z:1");
 }
 
+// Test set 3: Pad + Conv2D fusion
+// padding is VALID type
+// A = input(image), B = input(paddings), C= Pad = input of conv2D,
+// D=input(filter), E = Conv2D, Z = Zeta
+// C=Pad(A,B); E=Conv2D(C,D); Z=Zeta(E,Y)
+// After layout pass
+// _MklPadWithConv2D(A, D, B, DMT/_0, DMT/_1, DMT/_2)
+TEST_F(MklLayoutPassTest, NodeMerge_PadWithConv2D_Positive) {
+  DCHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Pad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tpaddings'        value { type: DT_INT32 } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NHWC' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'VALID' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['C', 'D'] }"
+      "node { name: 'Y' op: 'Input'}"
+      "node { name: 'Z' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['E', 'Y']}");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);E(_MklPadWithConv2D);Y(Input);Z(Zeta)|A->E;"
+            "A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;B->E:2;D->E:1;DMT/_0->E:3;DMT/_1->E:4;"
+            "DMT/_2->E:5;E->Z;Y->Z:1");
+}
+// Test if input control edges do not duplicate after merge.
+// If both the merging ops have input control edge from a common op
+// then, the merged op will have only one control edge from that
+// common op.
+// padding is VALID type
+// A = input(image), A1 = input, B = input(paddings),
+// C= Pad = input of conv2D,
+// D=input(filter), E = Conv2D, Z = Zeta
+// C=Pad(A,B); E=Conv2D(C,D); Z=Zeta(E,Y)
+// A1:control->C:control
+// A1:control->E:control
+// After layout pass:
+// _MklPadWithConv2D(A, D, B, DMT/_0, DMT/_1, DMT/_2)
+// A1:control->E:control (only one control edge)
+TEST_F(MklLayoutPassTest, Input_ControlEdge_PadWithConv2D_Positive) {
+  DCHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+  InitGraph(
+      "node { name: 'A1' op: 'Input'}"
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Pad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tpaddings'        value { type: DT_INT32 } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NHWC' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'VALID' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['C', 'D'] }"
+      "node { name: 'Y' op: 'Input'}"
+      "node { name: 'Z' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['E', 'Y']}");
+  Node* a1 = FindNode("A1");
+  Node* c = FindNode("C");
+  Node* e = FindNode("E");
+  const Edge* edge = graph_.AddControlEdge(a1, c);
+  const Edge* edge_1 = graph_.AddControlEdge(a1, e);
+  ASSERT_NE(edge, nullptr);
+  ASSERT_NE(edge_1, nullptr);
+  EXPECT_EQ(
+      DoMklLayoutOptimizationPass(),
+      "A(Input);A1(Input);B(Int32Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+      "DMT/_2(Const);E(_MklPadWithConv2D);Y(Input);Z(Zeta)|A->E;"
+      "A1:control->E:control;A:control->DMT/_0:control;A:control->DMT/"
+      "_1:control;"
+      "A:control->DMT/_2:control;B->E:2;D->E:1;DMT/_0->E:3;DMT/_1->E:4;"
+      "DMT/_2->E:5;E->Z;Y->Z:1");
+}
+// Test if output control edges does not duplicate after merge.
+// If both the merging ops have output control edge to a common op,
+// then after merge, the merged op will have only one control edge
+// to that commom op.
+// padding is VALID type
+// A = input(image), B = input(paddings), C= Pad = input of conv2D,
+// D=input(filter), E = Conv2D, Z = Zeta
+// C=Pad(A,B); E=Conv2D(C,D); Z=Zeta(E,Y)
+// C:control->A1:control
+// E:control->A1:control
+// After layout pass:
+// _MklPadWithConv2D(A, D, B, DMT/_0, DMT/_1, DMT/_2)
+// E:control->A1:control (only one control edge)
+TEST_F(MklLayoutPassTest, Output_ControlEdge_PadWithConv2D_Positive) {
+  DCHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+  InitGraph(
+      "node { name: 'A1' op: 'Input'}"
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Pad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tpaddings'        value { type: DT_INT32 } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NHWC' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'VALID' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['C', 'D'] }"
+      "node { name: 'Y' op: 'Input'}"
+      "node { name: 'Z' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['E', 'Y']}");
+  Node* a1 = FindNode("A1");
+  Node* c = FindNode("C");
+  Node* e = FindNode("E");
+  const Edge* edge = graph_.AddControlEdge(c, a1);
+  const Edge* edge_1 = graph_.AddControlEdge(e, a1);
+  ASSERT_NE(edge, nullptr);
+  ASSERT_NE(edge_1, nullptr);
+  EXPECT_EQ(
+      DoMklLayoutOptimizationPass(),
+      "A(Input);A1(Input);B(Int32Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+      "DMT/_2(Const);E(_MklPadWithConv2D);Y(Input);Z(Zeta)|A->E;"
+      "A:control->DMT/_0:control;A:control->DMT/_1:control;"
+      "A:control->DMT/_2:control;B->E:2;D->E:1;DMT/_0->E:3;DMT/_1->E:4;"
+      "DMT/_2->E:5;E->Z;E:control->A1:control;Y->Z:1");
+}
+// Pad + Conv2D fusion with padding is VALID,
+// Input node pointing to both Pad and Conv2D
+// A = input(image), B = input(paddings), C= Pad
+// E = Conv2D, Z = Zeta
+// C=Pad(A,B); E=Conv2D(C,A); Z=Zeta(E,Y)
+// After layout pass
+// _MklPadWithConv2D(A, A, B, DMT/_0, DMT/_1, DMT/_2)
+TEST_F(MklLayoutPassTest, NodeMerge_PadWithConv2D_Common_Input) {
+  DCHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Pad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tpaddings'        value { type: DT_INT32 } }"
+      " input: ['A', 'B']}"
+      "node { name: 'E' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NHWC' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'VALID' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['C', 'A'] }"
+      "node { name: 'Y' op: 'Input'}"
+      "node { name: 'Z' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['E', 'Y']}");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);E(_MklPadWithConv2D);Y(Input);Z(Zeta)|A->E;A->E:1;"
+            "A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;B->E:2;DMT/_0->E:3;DMT/_1->E:4;"
+            "DMT/_2->E:5;E->Z;Y->Z:1");
+}
+// Pad + Conv2D with padding is VALID,
+// Input node pointing to both Pad and Conv2D
+// Output of both Pad and Conv2D feeds one node (Z as Output2)
+// A = input(as image), B = input(as paddings), C= Pad
+// E = Conv2D, Z = Output2
+// C=Pad(A,B); E=Conv2D(C,A); Z=Output(C,E)
+// After layout pass - No merging, since Pad and Conv2D both
+// feed to the same node (Z)
+TEST_F(MklLayoutPassTest, NodeMerge_PadWithConv2D_Common_InOutput) {
+  DCHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Pad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tpaddings'        value { type: DT_INT32 } }"
+      " input: ['A', 'B']}"
+      "node { name: 'E' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NHWC' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'VALID' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['C', 'A'] }"
+      "node { name: 'Z' op: 'Output2'"
+      " input: ['C', 'E']}");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);C(Pad);DMT/_0(Const);DMT/_1(Const);"
+            "E(_MklConv2D);Z(Output2)|A->C;A->E:1;B->C:1;C->E;C->Z;"
+            "C:control->DMT/_0:control;C:control->DMT/_1:control;"
+            "DMT/_0->E:2;DMT/_1->E:3;E->Z:1");
+}
+// Pad + Conv2D; padding is SAME
+// A = input(image), B = input(paddings), C= Pad = input of conv2D,
+// D=input(filter), E = Conv2D, Z = Zeta
+// C=Pad(A,B); E=Conv2D(C,D); Z=Zeta(E,Y)
+// After layout pass - No merging
+TEST_F(MklLayoutPassTest, NodeMerge_PadWithConv2D_Negative) {
+  DCHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Pad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tpaddings'        value { type: DT_INT32 } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NHWC' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['C', 'D'] }"
+      "node { name: 'Y' op: 'Input'}"
+      "node { name: 'Z' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['E', 'Y']}");
+  EXPECT_EQ(
+      DoMklLayoutOptimizationPass(),
+      "A(Input);B(Int32Input);C(Pad);D(Input);DMT/_0(Const);DMT/_1(Const);"
+      "E(_MklConv2D);Y(Input);Z(Zeta)|A->C;B->C:1;C->E;"
+      "C:control->DMT/_0:control;C:control->DMT/_1:control;"
+      "D->E:1;DMT/_0->E:2;DMT/_1->E:3;E->Z;Y->Z:1");
+}
+#ifdef ENABLE_TRANSPOSE_OPTIMIZATION
+TEST_F(MklLayoutPassTest, NodeMerge_TransposeConv2DTranspose_Positive) {
+  InitGraph(
+      "node { name: 'Input0' op: 'Input'}"
+      "node { name: 'Input1' op: 'Input'}"
+      "node { name: 'Const0' op: 'Const'"
+      "  attr {"
+      "   key: 'dtype'"
+      "   value {"
+      "     type: DT_INT32"
+      "   }"
+      "  }"
+      " attr {"
+      "   key: 'value'"
+      "   value {"
+      "     tensor {"
+      "       dtype: DT_INT32"
+      "       tensor_shape {"
+      "         dim {"
+      "           size: 4"
+      "         }"
+      "       }"
+      "       tensor_content: "
+      "'\\000\\000\\000\\000\\002\\000\\000\\000\\003\\000\\000\\000\\001\\000"
+      "\\000\\000'"
+      "     }"
+      "   }"
+      " }"
+      "}"
+      "node { name: 'Const1' op: 'Const'"
+      "  attr {"
+      "   key: 'dtype'"
+      "   value {"
+      "     type: DT_INT32"
+      "   }"
+      "  }"
+      " attr {"
+      "   key: 'value'"
+      "   value {"
+      "     tensor {"
+      "       dtype: DT_INT32"
+      "       tensor_shape {"
+      "         dim {"
+      "           size: 4"
+      "         }"
+      "       }"
+      "       tensor_content: "
+      "'\\000\\000\\000\\000\\003\\000\\000\\000\\001\\000\\000\\000\\002\\000"
+      "\\000\\000'"
+      "     }"
+      "   }"
+      " }"
+      "}"
+      "node {              \
+      name: 'Transpose0' \
+      op: 'Transpose'    \
+      input: 'Input0'    \
+      input: 'Const0'    \
+      attr {             \
+        key: 'T'         \
+        value {          \
+          type: DT_FLOAT \
+        }                \
+      }                  \
+      attr {             \
+        key: 'Tperm'     \
+        value {          \
+          type: DT_INT32 \
+        }                \
+      }                  \
+    }"
+      "node {                 \
+      name: 'Conv2D'        \
+      op: 'Conv2D'          \
+      input: 'Transpose0'   \
+      input: 'Input1'       \
+      attr {                \
+        key: 'T'            \
+        value {             \
+          type: DT_FLOAT    \
+        }                   \
+      }                     \
+      attr {                \
+        key: 'data_format'  \
+        value {             \
+          s: 'NHWC'         \
+        }                   \
+      }                     \
+      attr {                \
+        key: 'dilations'    \
+        value {             \
+          list {            \
+            i: 1            \
+            i: 1            \
+            i: 1            \
+            i: 1            \
+          }                 \
+        }                   \
+      }                     \
+      attr {                \
+        key: 'padding'      \
+        value {             \
+          s: 'SAME'         \
+        }                   \
+      }                     \
+      attr {                \
+        key: 'strides'      \
+        value {             \
+          list {            \
+            i: 1            \
+            i: 1            \
+            i: 1            \
+            i: 1            \
+          }                 \
+        }                   \
+      }                     \
+      attr {                \
+        key: 'use_cudnn_on_gpu' \
+        value {                 \
+          b: true               \
+        }                       \
+      }                         \
+    }"
+      "node {              \
+      name: 'Transpose1' \
+      op: 'Transpose'    \
+      input: 'Conv2D'    \
+      input: 'Const1'    \
+      attr {             \
+        key: 'T'         \
+        value {          \
+          type: DT_FLOAT \
+        }                \
+      }                  \
+      attr {             \
+        key: 'Tperm'     \
+        value {          \
+          type: DT_INT32 \
+        }                \
+      }                  \
+    }"
+      "node { name: 'Relu' op: 'Relu'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['Transpose1'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "Const0(Const);Const1(Const);"
+            "Conv2D(_MklConv2D);DMT/_0(Const);DMT/_1(Const);Input0(Input);"
+            "Input1(Input);Relu(_MklRelu)|Conv2D->Relu;Conv2D:2->Relu:1;DMT/"
+            "_0->Conv2D:2;DMT/_1->Conv2D:3;Input0->Conv2D;"
+            "Input0:control->DMT/_0:control;Input0:control->DMT/"
+            "_1:control;Input1->Conv2D:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeMerge_TransposeConv2DTranspose_Negative) {
+  InitGraph(
+      "node { name: 'Input0' op: 'Input'}"
+      "node { name: 'Input1' op: 'Input'}"
+      "node { name: 'Const0' op: 'Const'"
+      "  attr {"
+      "   key: 'dtype'"
+      "   value {"
+      "     type: DT_INT32"
+      "   }"
+      "  }"
+      " attr {"
+      "   key: 'value'"
+      "   value {"
+      "     tensor {"
+      "       dtype: DT_INT32"
+      "       tensor_shape {"
+      "         dim {"
+      "           size: 4"
+      "         }"
+      "       }"
+      "       tensor_content: "
+      "'\\000\\000\\000\\000\\002\\000\\000\\000\\003\\000\\000\\000\\001\\000"
+      "\\000\\000'"
+      "     }"
+      "   }"
+      " }"
+      "}"
+      "node { name: 'Const1' op: 'Const'"
+      "  attr {"
+      "   key: 'dtype'"
+      "   value {"
+      "     type: DT_INT32"
+      "   }"
+      "  }"
+      " attr {"
+      "   key: 'value'"
+      "   value {"
+      "     tensor {"
+      "       dtype: DT_INT32"
+      "       tensor_shape {"
+      "         dim {"
+      "           size: 4"
+      "         }"
+      "       }"
+      "       tensor_content: "
+      "'\\000\\000\\000\\000\\002\\000\\000\\000\\003\\000\\000\\000\\001\\000"
+      "\\000\\000'"
+      "     }"
+      "   }"
+      " }"
+      "}"
+      "node {              \
+      name: 'Transpose0' \
+      op: 'Transpose'    \
+      input: 'Input0'    \
+      input: 'Const0'    \
+      attr {             \
+        key: 'T'         \
+        value {          \
+          type: DT_FLOAT \
+        }                \
+      }                  \
+      attr {             \
+        key: 'Tperm'     \
+        value {          \
+          type: DT_INT32 \
+        }                \
+      }                  \
+    }"
+      "node {                 \
+      name: 'Conv2D'        \
+      op: 'Conv2D'          \
+      input: 'Transpose0'   \
+      input: 'Input1'       \
+      attr {                \
+        key: 'T'            \
+        value {             \
+          type: DT_FLOAT    \
+        }                   \
+      }                     \
+      attr {                \
+        key: 'data_format'  \
+        value {             \
+          s: 'NHWC'         \
+        }                   \
+      }                     \
+      attr {                \
+        key: 'dilations'    \
+        value {             \
+          list {            \
+            i: 1            \
+            i: 1            \
+            i: 1            \
+            i: 1            \
+          }                 \
+        }                   \
+      }                     \
+      attr {                \
+        key: 'padding'      \
+        value {             \
+          s: 'SAME'         \
+        }                   \
+      }                     \
+      attr {                \
+        key: 'strides'      \
+        value {             \
+          list {            \
+            i: 1            \
+            i: 1            \
+            i: 1            \
+            i: 1            \
+          }                 \
+        }                   \
+      }                     \
+      attr {                \
+        key: 'use_cudnn_on_gpu' \
+        value {                 \
+          b: true               \
+        }                       \
+      }                         \
+    }"
+      "node {              \
+      name: 'Transpose1' \
+      op: 'Transpose'    \
+      input: 'Conv2D'    \
+      input: 'Const1'    \
+      attr {             \
+        key: 'T'         \
+        value {          \
+          type: DT_FLOAT \
+        }                \
+      }                  \
+      attr {             \
+        key: 'Tperm'     \
+        value {          \
+          type: DT_INT32 \
+        }                \
+      }                  \
+    }"
+      "node { name: 'Relu' op: 'Relu'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['Transpose1'] }");
+  EXPECT_EQ(
+      DoMklLayoutOptimizationPass(),
+      "Const0(Const);Const1(Const);"
+      "Conv2D(_MklConv2D);DMT/_0(Const);DMT/_1(Const);DMT/_2(Const);"
+      "Input0(Input);Input1(Input);Relu(_MklRelu);"
+      "Transpose0(Transpose);Transpose1(Transpose)|Const0->Transpose0:1;Const1-"
+      ">Transpose1:1;"
+      "Conv2D->Transpose1;DMT/_0->Conv2D:2;DMT/_1->Conv2D:3;DMT/"
+      "_2->Relu:1;Input0->Transpose0;"
+      "Input1->Conv2D:1;Transpose0->Conv2D;Transpose0:control->DMT/_0:control;"
+      "Transpose0:control->DMT/"
+      "_1:control;Transpose1->Relu;Transpose1:control->DMT/_2:control");
+}
+#endif  // ENABLE_TRANSPOSE_OPTIMIZATION
+
 /////////////////////////////////////////////////////////////////////
 //  Unit tests related to rewriting node to Mkl node
 /////////////////////////////////////////////////////////////////////
@@ -534,6 +1097,131 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Negative_UnsupportedType) {
             "A->C;B->C:1;B->D;C->D:1");
 }
 
+// Rewrite test for _FusedConv2D Op with BiasAdd fusion
+TEST_F(MklLayoutPassTest, NodeRewrite_FusedConv2D_Positive1) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: '_FusedConv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'num_args'         value { i: 1 } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'fused_ops'        value { list: {s: 'BiasAdd'} } }"
+      " attr { key: 'epsilon'          value { f: 0.001 }}"
+      " input: ['A', 'B', 'C']}"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['D', 'C'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(_MklFusedConv2D);DMT/_0(Const);"
+            "DMT/_1(Const);DMT/_2(Const);E(Zeta)|A->D;"
+            "A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;B->D:1;C->D:2;C->E:1;D->E;"
+            "DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
+}
+
+// Rewrite test for _FusedConv2D Op with Relu fusion
+TEST_F(MklLayoutPassTest, NodeRewrite_FusedConv2D_Positive2) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: '_FusedConv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'num_args'         value { i: 1 } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'fused_ops'        value { list: {s: 'Relu'} } }"
+      " attr { key: 'epsilon'          value { f: 0.001 }}"
+      " input: ['A', 'B', 'C']}"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['D', 'C'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(_MklFusedConv2D);DMT/_0(Const);"
+            "DMT/_1(Const);DMT/_2(Const);E(Zeta)|A->D;"
+            "A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;B->D:1;C->D:2;C->E:1;D->E;"
+            "DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
+}
+
+// Rewrite test for _FusedConv2D Op with BiasAdd+Relu fusion
+TEST_F(MklLayoutPassTest, NodeRewrite_FusedConv2D_Positive3) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: '_FusedConv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'num_args'         value { i: 1 } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'fused_ops'"
+      "             value { list: {s: 'BiasAdd', s: 'Relu'} } }"
+      " attr { key: 'epsilon'          value { f: 0.001 }}"
+      " input: ['A', 'B', 'C']}"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['D', 'C'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(_MklFusedConv2D);DMT/_0(Const);"
+            "DMT/_1(Const);DMT/_2(Const);E(Zeta)|A->D;"
+            "A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;B->D:1;C->D:2;C->E:1;D->E;"
+            "DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
+}
+
+// Rewrite test for _FusedConv2D Op with unsupported fusion
+TEST_F(MklLayoutPassTest, NodeRewrite_FusedConv2D_Negative1) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: '_FusedConv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'num_args'         value { i: 1 } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'fused_ops'        value { list: {s: 'Unsupported'} } }"
+      " attr { key: 'epsilon'          value { f: 0.001 }}"
+      " input: ['A', 'B', 'C']}"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['D', 'C'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(_FusedConv2D);E(Zeta)|A->D;"
+            "B->D:1;C->D:2;C->E:1;D->E");
+}
+
+// Rewrite test for _FusedConv2D Op with unsupported type
+TEST_F(MklLayoutPassTest, NodeRewrite_FusedConv2D_Negative2) {
+  InitGraph(
+      "node { name: 'A' op: 'DoubleInput'}"
+      "node { name: 'B' op: 'DoubleInput'}"
+      "node { name: 'C' op: 'DoubleInput'}"
+      "node { name: 'D' op: '_FusedConv2D'"
+      " attr { key: 'T'                value { type: DT_DOUBLE } }"
+      " attr { key: 'num_args'         value { i: 1 } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'fused_ops'        value { list: {s: 'BiasAdd'} } }"
+      " attr { key: 'epsilon'          value { f: 0.001 }}"
+      " input: ['A', 'B', 'C']}"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_DOUBLE } }"
+      " input: ['D', 'C'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(DoubleInput);B(DoubleInput);C(DoubleInput);"
+            "D(_FusedConv2D);E(Zeta)|A->D;B->D:1;C->D:2;C->E:1;D->E");
+}
+
 TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradFilter_Positive) {
   InitGraph(
       "node { name: 'A' op: 'Input'}"
@@ -960,6 +1648,85 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Relu6Relu6Grad_Positive) {
             "DMT/_1->C:2");
 }
 
+TEST_F(MklLayoutPassTest, NodeRewrite_LeakyRelu_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'LeakyRelu'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'alpha'            value { f: 0.1 } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(_MklLeakyRelu);C(Zeta);DMT/_0(Const)|A->B;A->C;"
+            "A:control->DMT/_0:control;B->C:1;DMT/_0->B:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_LeakyRelu_Negative) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'LeakyRelu'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'alpha'            value { f: 2.0 } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(LeakyRelu);C(Zeta)|A->B;A->C;B->C:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_LeakyReluGrad_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'LeakyReluGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'alpha'            value { f: 0.1 } }"
+      " input: ['A', 'B'] }"
+      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'C'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(_MklLeakyReluGrad);D(Zeta);DMT/_0(Const);"
+            "DMT/_1(Const)|A->C;A->D;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;B->C:1;C->D:1;DMT/_0->C:2;DMT/_1->C:3");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_LeakyReluGrad_Negative) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'LeakyReluGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'alpha'            value { f: 2.0 } }"
+      " input: ['A', 'B'] }"
+      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'C'] }");
+  EXPECT_EQ(
+      DoMklLayoutOptimizationPass(),
+      "A(Input);B(Input);C(LeakyReluGrad);D(Zeta)|A->C;A->D;B->C:1;C->D:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_LeakyReluLeakyReluGrad_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'LeakyRelu'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'alpha'            value { f: 0.1 } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'LeakyReluGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'alpha'            value { f: 0.1 } }"
+      " input: ['A', 'B'] }"
+      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'C'] }");
+  EXPECT_EQ(
+      DoMklLayoutOptimizationPass(),
+      "A(Input);B(_MklLeakyRelu);C(_MklLeakyReluGrad);D(Zeta);DMT/_0(Const);"
+      "DMT/_1(Const)|A->B;A->C;A->D;A:control->DMT/_0:control;"
+      "A:control->DMT/_1:control;B->C:1;B:1->C:3;C->D:1;DMT/_0->B:1;"
+      "DMT/_1->C:2");
+}
+
 TEST_F(MklLayoutPassTest, NodeRewrite_AvgPool_Positive) {
   InitGraph(
       "node { name: 'A' op: 'Input'}"
diff --git a/tensorflow/core/graph/node_builder.cc b/tensorflow/core/graph/node_builder.cc
index a91e6dd05738ae8242c812970e8bbc4a10c7675a..6dc9a50b98a9b2fefc2a0e66809f528d6fc7567f 100644
--- a/tensorflow/core/graph/node_builder.cc
+++ b/tensorflow/core/graph/node_builder.cc
@@ -38,8 +38,9 @@ NodeBuilder::NodeOut::NodeOut()
     : node(nullptr), error(true), index(0), dt(DT_FLOAT) {}
 
 NodeBuilder::NodeBuilder(StringPiece name, StringPiece op_name,
-                         const OpRegistryInterface* op_registry)
-    : def_builder_(name, op_name, op_registry) {}
+                         const OpRegistryInterface* op_registry,
+                         const NodeDebugInfo* debug)
+    : def_builder_(name, op_name, op_registry, debug) {}
 
 NodeBuilder::NodeBuilder(StringPiece name, const OpDef* op_def)
     : def_builder_(name, op_def) {}
diff --git a/tensorflow/core/graph/node_builder.h b/tensorflow/core/graph/node_builder.h
index b1dc2ae92f14ba4519d98a4c556c1d06e14b6b5d..51e044cd8b2ee7a70dbf197c16925a0b972e9365 100644
--- a/tensorflow/core/graph/node_builder.h
+++ b/tensorflow/core/graph/node_builder.h
@@ -77,7 +77,8 @@ class NodeBuilder {
   // specified by calling the methods below.
   // REQUIRES: The OpDef must satisfy ValidateOpDef().
   NodeBuilder(StringPiece name, StringPiece op_name,
-              const OpRegistryInterface* op_registry = OpRegistry::Global());
+              const OpRegistryInterface* op_registry = OpRegistry::Global(),
+              const NodeDebugInfo* debug = nullptr);
   NodeBuilder(StringPiece name, const OpDef* op_def);
 
   // Create a NodeBuilder from an existing NodeDefBuilder.
diff --git a/tensorflow/core/graph/optimizer_cse.cc b/tensorflow/core/graph/optimizer_cse.cc
index 4073255db3f7cbcd697f3cb2781e04b3b01634c1..19afeb6badbc6c1528a3ea19b8b14eb98296c731 100644
--- a/tensorflow/core/graph/optimizer_cse.cc
+++ b/tensorflow/core/graph/optimizer_cse.cc
@@ -43,6 +43,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/hash/hash.h"
@@ -213,6 +214,7 @@ bool OptimizerCSE::Optimize(
         g_->AddEdge(*candidate, e->src_output(), e->dst(), e->dst_input());
       }
 
+      MergeDebugInfo(NodeDebugInfo(*n), *candidate);
       g_->RemoveNode(n);
       changed = true;
     }
diff --git a/tensorflow/core/grappler/BUILD b/tensorflow/core/grappler/BUILD
index 7982b358538d5cf9c8d6a31b7479f939c67d48d4..6e3012000fc82495615d3b6a53e41b3085d9bff1 100644
--- a/tensorflow/core/grappler/BUILD
+++ b/tensorflow/core/grappler/BUILD
@@ -107,6 +107,8 @@ cc_library(
         ":utils",
         "//tensorflow/core:framework",
         "//tensorflow/core:protos_all_cc",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -141,6 +143,7 @@ tf_cc_test(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
         "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
     ],
 )
@@ -173,12 +176,14 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":graph_view",
-        ":grappler_item",
+        ":op_types",
         ":utils",
         "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:span",
     ],
 )
 
@@ -188,7 +193,9 @@ tf_cc_test(
     deps = [
         ":grappler_item",
         ":mutable_graph_view",
+        ":utils",
         "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:graph",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
diff --git a/tensorflow/core/grappler/costs/BUILD b/tensorflow/core/grappler/costs/BUILD
index 5090e62b2ccfb00241e2b9c87d1922320646632e..f8af1232f773f896b3aa1406e7d365a091f923c3 100644
--- a/tensorflow/core/grappler/costs/BUILD
+++ b/tensorflow/core/grappler/costs/BUILD
@@ -132,9 +132,6 @@ tf_cuda_library(
     name = "utils",
     srcs = ["utils.cc"],
     hdrs = ["utils.h"],
-    cuda_deps = [
-        "@local_config_cuda//cuda:cudnn_header",
-    ],
     visibility = ["//visibility:public"],
     deps = [
         "//third_party/eigen3",
diff --git a/tensorflow/core/grappler/costs/utils.cc b/tensorflow/core/grappler/costs/utils.cc
index 7d868a3679e5b3d5759fdd951e726cfe7af3babf..d45bb14e07072fff1742e243f6b0bc15b51c62c6 100644
--- a/tensorflow/core/grappler/costs/utils.cc
+++ b/tensorflow/core/grappler/costs/utils.cc
@@ -20,12 +20,6 @@ limitations under the License.
 
 #include "third_party/eigen3/Eigen/Core"
 
-#if GOOGLE_CUDA
-#include "cuda/include/cuda.h"
-#include "cuda/include/cuda_runtime_api.h"
-#include "cuda/include/cudnn.h"
-#endif
-
 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
 #include "tensorflow/core/framework/allocation_description.pb.h"
diff --git a/tensorflow/core/grappler/graph_view.h b/tensorflow/core/grappler/graph_view.h
index 0a47b2256583f35e6ef413b50fdc8eea2bdc978d..16156d0f2042763a7518d5de2c57440343e50f2d 100644
--- a/tensorflow/core/grappler/graph_view.h
+++ b/tensorflow/core/grappler/graph_view.h
@@ -111,32 +111,37 @@ class GraphViewInternal {
 
   GraphDefT* graph() const { return graph_; }
 
-  // Find a node by name or return `nullptr` if it's not in a graph view.
+  // Finds a node by name or return `nullptr` if it's not in the graph view.
   NodeDefT* GetNode(absl::string_view node_name) const {
     return gtl::FindWithDefault(nodes_, node_name, nullptr);
   }
 
-  // Get the specified input port. Note that the special '-1' port_id can be
+  // Checks if a node by name is in the graph view.
+  bool HasNode(absl::string_view node_name) const {
+    return GetNode(node_name) != nullptr;
+  }
+
+  // Gets the specified input port. Note that the special '-1' port_id can be
   // used to access the controlling nodes (i.e. the nodes connected to node_name
   // through an incoming control dependency).
   InputPort GetInputPort(absl::string_view node_name, int port_id) const {
     return InputPort(GetNode(node_name), port_id);
   }
 
-  // Get the specified output port. Note that the special '-1' port_id can be
+  // Gets the specified output port. Note that the special '-1' port_id can be
   // used to access the controlled nodes (i.e. the nodes connected to node_name
   // through an outgoing control dependency).
   OutputPort GetOutputPort(absl::string_view node_name, int port_id) const {
     return OutputPort(GetNode(node_name), port_id);
   }
 
-  // Get the input (resp. output) port(s) in the immediate fanout (resp. fanin)
-  // of an output (resp. input) port.
+  // Gets the input port(s) in the immediate fanout of an output port.
   const absl::flat_hash_set<InputPort>& GetFanout(
       const OutputPort& port) const {
     return gtl::FindWithDefault(fanouts_, port, fanout_not_found_value_);
   }
 
+  // Gets the output port(s) in the immediate fanin of an input port.
   absl::flat_hash_set<OutputPort> GetFanin(const InputPort& port) const {
     if (port.port_id >= 0) return {GetRegularFanin(port)};
 
@@ -162,9 +167,22 @@ class GraphViewInternal {
     return GetOutputPort(tensor_id.node(), tensor_id.index());
   }
 
-  // Get all the input (resp. output) ports in the immediate fanout (resp
-  // fanin) of a node. Include the controlling nodes iff
-  // include_controlling_nodes is true.
+  // Checks if a tensor id is a fanin of the node.
+  bool HasFanin(const NodeDef& node, const TensorId& fanin) const {
+    if (fanin.index() < -1) {
+      return false;
+    }
+    string fanin_string = TensorIdToString(fanin);
+    for (int i = 0; i < node.input_size(); ++i) {
+      if (node.input(i) == fanin_string) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  // Gets all the input ports in the immediate fanout of a node. Include the
+  // controlled nodes iff include_controlled_nodes is true.
   absl::flat_hash_set<InputPort> GetFanouts(
       const NodeDef& node, bool include_controlled_nodes) const {
     absl::flat_hash_set<InputPort> result;
@@ -185,6 +203,8 @@ class GraphViewInternal {
     return result;
   }
 
+  // Gets all the output ports in the immediate fanin of a node. Include the
+  // controlling nodes iff include_controlling_nodes is true.
   absl::flat_hash_set<OutputPort> GetFanins(
       const NodeDef& node, bool include_controlling_nodes) const {
     absl::flat_hash_set<OutputPort> result;
@@ -198,7 +218,7 @@ class GraphViewInternal {
     return result;
   }
 
-  // Get the number of ports in the immediate fanin of a node. Count the
+  // Gets the number of ports in the immediate fanin of a node. Count the
   // controlling nodes iff include_controlling_nodes is true.
   int NumFanins(const NodeDef& node, bool include_controlling_nodes) const {
     int count = 0;
@@ -211,14 +231,14 @@ class GraphViewInternal {
     return count;
   }
 
-  // Get the number of ports in the immediate fanout of a node. Count the
-  // controlling nodes iff include_controlling_nodes is true.
-  int NumFanouts(const NodeDef& node, bool include_controlling_nodes) const {
+  // Gets the number of ports in the immediate fanout of a node. Count the
+  // controlled nodes iff include_controlled_nodes is true.
+  int NumFanouts(const NodeDef& node, bool include_controlled_nodes) const {
     int count = 0;
 
     OutputPort port;
     port.node = const_cast<NodeDefT*>(&node);
-    const int first_port_id = include_controlling_nodes ? -1 : 0;
+    const int first_port_id = include_controlled_nodes ? -1 : 0;
     const int last_port_id =
         gtl::FindWithDefault(max_regular_output_port_, port.node, -1);
 
@@ -231,8 +251,8 @@ class GraphViewInternal {
     return count;
   }
 
-  // Get all the edges in the immediate fanout (resp fanin) of a node.
-  // Include the control edges iff include_controlling_edges is true.
+  // Gets all the edges in the immediate fanout of a node. Include the
+  // controlled edges iff include_controlled_edges is true.
   absl::flat_hash_set<Edge> GetFanoutEdges(
       const NodeDef& node, bool include_controlled_edges) const {
     absl::flat_hash_set<Edge> result;
@@ -248,14 +268,16 @@ class GraphViewInternal {
       auto it = fanouts_.find(port);
       if (it != fanouts_.end()) {
         for (auto itr = it->second.begin(); itr != it->second.end(); ++itr) {
-          result.emplace(/*src*/ OutputPort(const_cast<NodeDefT*>(&node), i),
-                         /*dst*/ *itr);
+          result.emplace(/*src=*/OutputPort(const_cast<NodeDefT*>(&node), i),
+                         /*dst=*/*itr);
         }
       }
     }
     return result;
   }
 
+  // Gets all the edges in the immediate fanin of a node. Include the
+  // controlling edges iff include_controlling_edges is true.
   absl::flat_hash_set<Edge> GetFaninEdges(
       const NodeDef& node, bool include_controlling_edges) const {
     absl::flat_hash_set<Edge> result;
@@ -265,8 +287,8 @@ class GraphViewInternal {
 
       auto it = nodes_.find(tensor_id.node());
       if (it != nodes_.end()) {
-        result.emplace(/*src*/ OutputPort(it->second, tensor_id.index()),
-                       /*dst*/ InputPort(const_cast<NodeDefT*>(&node), i));
+        result.emplace(/*src=*/OutputPort(it->second, tensor_id.index()),
+                       /*dst=*/InputPort(const_cast<NodeDefT*>(&node), i));
       }
     }
     return result;
diff --git a/tensorflow/core/grappler/graph_view_test.cc b/tensorflow/core/grappler/graph_view_test.cc
index cbf859a4a99d7c434a4a65185c8962ea539c1aed..404dcd30c12781f2f9581ac6a1cb5986bb75f187 100644
--- a/tensorflow/core/grappler/graph_view_test.cc
+++ b/tensorflow/core/grappler/graph_view_test.cc
@@ -230,6 +230,40 @@ TEST_F(GraphViewTest, ControlDependencies) {
   EXPECT_EQ(0, (*fanin.begin()).port_id);
 }
 
+TEST_F(GraphViewTest, HasNode) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output a = ops::Const(s.WithOpName("a"), 0.0f, {10, 10});
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  GraphView graph(&item.graph);
+
+  EXPECT_EQ(true, graph.HasNode("a"));
+  EXPECT_EQ(false, graph.HasNode("b"));
+}
+
+TEST_F(GraphViewTest, HasFanin) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output a = ops::Const(s.WithOpName("a"), 0.0f, {10, 10});
+  Output b = ops::Square(s.WithOpName("b"), {a});
+  Output c = ops::Sqrt(s.WithOpName("c"), {b});
+  Output d = ops::AddN(s.WithOpName("d").WithControlDependencies(a), {b, c});
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  GraphView graph(&item.graph);
+
+  const NodeDef* d_node = graph.GetNode("d");
+  EXPECT_NE(nullptr, d_node);
+
+  EXPECT_EQ(true, graph.HasFanin(*d_node, {"a", Graph::kControlSlot}));
+  EXPECT_EQ(false, graph.HasFanin(*d_node, {"a", 0}));
+  EXPECT_EQ(true, graph.HasFanin(*d_node, {"b", 0}));
+  EXPECT_EQ(false, graph.HasFanin(*d_node, {"b", Graph::kControlSlot}));
+  EXPECT_EQ(true, graph.HasFanin(*d_node, {"c", 0}));
+  EXPECT_EQ(false, graph.HasFanin(*d_node, {"c", Graph::kControlSlot}));
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/grappler_item.cc b/tensorflow/core/grappler/grappler_item.cc
index 2c490f3966cb45f61a22ba0a858a928f46e9db1b..2d71ac54cc7af2b40e42ef34d198fd42f4b0a3d4 100644
--- a/tensorflow/core/grappler/grappler_item.cc
+++ b/tensorflow/core/grappler/grappler_item.cc
@@ -19,27 +19,33 @@ limitations under the License.
 #include <unordered_set>
 #include <vector>
 
+#include "absl/container/flat_hash_set.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/util/device_name_utils.h"
 
 namespace tensorflow {
 namespace grappler {
 
-GrapplerItem::GrapplerItem(const GrapplerItem& other, GraphDef* graph_def) {
-  id = other.id;
-  feed = other.feed;
-  fetch = other.fetch;
-  init_ops = other.init_ops;
-  keep_ops = other.keep_ops;
-  expected_init_time = other.expected_init_time;
-  save_op = other.save_op;
-  restore_op = other.restore_op;
-  save_restore_loc_tensor = other.save_restore_loc_tensor;
-  queue_runners = other.queue_runners;
-  allowed_optimizations = other.allowed_optimizations;
-  graph.Swap(graph_def);
+GrapplerItem GrapplerItem::WithGraph(GraphDef&& graph_def) const {
+  GrapplerItem item;
+  item.id = id;
+  item.feed = feed;
+  item.fetch = fetch;
+  item.init_ops = init_ops;
+  item.keep_ops = keep_ops;
+  item.expected_init_time = expected_init_time;
+  item.save_op = save_op;
+  item.restore_op = restore_op;
+  item.save_restore_loc_tensor = save_restore_loc_tensor;
+  item.queue_runners = queue_runners;
+  item.devices_ = devices_;
+  item.allowed_optimizations_ = allowed_optimizations_;
+  item.graph.Swap(&graph_def);
+  return item;
 }
 
 std::vector<const NodeDef*> GrapplerItem::MainOpsFanin() const {
@@ -108,9 +114,76 @@ std::unordered_set<string> GrapplerItem::NodesToPreserve() const {
       result.insert(NodeName(queue_runner.cancel_op_name()));
     }
   }
+
+  if (!allowed_optimizations_.prune_ops_with_side_effects) {
+    for (const NodeDef& node : graph.node()) {
+      if (!IsFreeOfSideEffect(node)) {
+        result.insert(node.name());
+      }
+    }
+  }
+
   return result;
 }
 
+const std::unordered_set<string>& GrapplerItem::devices() const {
+  return devices_;
+}
+
+Status GrapplerItem::AddDevice(const string& device) {
+  DeviceNameUtils::ParsedName name;
+
+  if (!DeviceNameUtils::ParseFullName(device, &name)) {
+    return errors::InvalidArgument("Invalid device name: device=", device);
+
+  } else if (!name.has_job || !name.has_replica || !name.has_task ||
+             !name.has_type || !name.has_id) {
+    return errors::InvalidArgument("Not a fully defined device name: device=",
+                                   device);
+  }
+
+  devices_.insert(DeviceNameUtils::ParsedNameToString(name));
+  return Status::OK();
+}
+
+Status GrapplerItem::AddDevices(const GrapplerItem& other) {
+  std::vector<absl::string_view> invalid_devices;
+  for (const string& device : other.devices()) {
+    Status added = AddDevice(device);
+    if (!added.ok()) invalid_devices.emplace_back(device);
+  }
+  return invalid_devices.empty()
+             ? Status::OK()
+             : errors::InvalidArgument("Skipped invalid devices: [",
+                                       absl::StrJoin(invalid_devices, ", "),
+                                       "]");
+}
+
+Status GrapplerItem::InferDevicesFromGraph() {
+  absl::flat_hash_set<absl::string_view> invalid_devices;
+  for (const NodeDef& node : graph.node()) {
+    Status added = AddDevice(node.device());
+    if (!added.ok()) invalid_devices.insert(node.device());
+  }
+  VLOG(2) << "Inferred device set: [" << absl::StrJoin(devices_, ", ") << "]";
+  return invalid_devices.empty()
+             ? Status::OK()
+             : errors::InvalidArgument("Skipped invalid devices: [",
+                                       absl::StrJoin(invalid_devices, ", "),
+                                       "]");
+}
+
+void GrapplerItem::ClearDevices() { devices_.clear(); }
+
+const GrapplerItem::AllowedOptimizations& GrapplerItem::allowed_optimizations()
+    const {
+  return allowed_optimizations_;
+}
+
+GrapplerItem::AllowedOptimizations& GrapplerItem::allowed_optimizations() {
+  return allowed_optimizations_;
+}
+
 std::vector<const NodeDef*> ComputeTransitiveFanin(
     const GraphDef& graph, const std::vector<string>& terminal_nodes) {
   bool ill_formed = false;
diff --git a/tensorflow/core/grappler/grappler_item.h b/tensorflow/core/grappler/grappler_item.h
index a0748abfe691334c6dc838c05e0d3f1cee2e2ecb..1ae551f5ac9f5ed09dbaf2c399bf1a464dfab138 100644
--- a/tensorflow/core/grappler/grappler_item.h
+++ b/tensorflow/core/grappler/grappler_item.h
@@ -35,12 +35,15 @@ namespace grappler {
 // nodes, and potentially a set of nodes to feed.
 struct GrapplerItem {
   GrapplerItem() = default;
-  GrapplerItem(const GrapplerItem& other, GraphDef&& graph_def)
-      : GrapplerItem(other, &graph_def) {}
-  // Swaps *graph_def with an empty GraphDef.
-  GrapplerItem(const GrapplerItem& other, GraphDef* graph_def);
+  GrapplerItem(const GrapplerItem& other) = default;
+  GrapplerItem(GrapplerItem&& other) = default;
+  GrapplerItem& operator=(const GrapplerItem& other) = default;
+  GrapplerItem& operator=(GrapplerItem&& other) = default;
   virtual ~GrapplerItem() = default;
 
+  // Create a copy of this GrapplerItem with graph swapped with the argument.
+  GrapplerItem WithGraph(GraphDef&& graph) const;
+
   string id;  // A unique id for this item
 
   // Inputs
@@ -83,9 +86,41 @@ struct GrapplerItem {
     // Is it allowed to add nodes to the graph that do not have registered
     // gradient function.
     bool non_differentiable_rewrites = true;
+
+    // By default we are allowed to prune ops with side-effects from the main
+    // graph if they are not in transitive fanin of the fetch nodes. If we are
+    // optimizing a graph that was instantiated by a function definition, we
+    // must keep all side effects intact.
+    bool prune_ops_with_side_effects = true;
   };
 
-  AllowedOptimizations allowed_optimizations;
+  const std::unordered_set<string>& devices() const;
+  // Adds a device to a set of available devices, only if it's a valid fully
+  // defined device name. Returns `Status::OK()` if successfully added a device,
+  // and an error otherwise.
+  Status AddDevice(const string& device);
+  // Adds all valid devices from the other Grappler item to the device set.
+  Status AddDevices(const GrapplerItem& other);
+  // Adds all valid devices from the nodes of the graph to the device set.
+  // Returns `Status::OK()` if all device annotations found in a graph are valid
+  // fully defined device names, and an error otherwise.
+  Status InferDevicesFromGraph();
+  // Clears a set of available devices.
+  void ClearDevices();
+
+  const AllowedOptimizations& allowed_optimizations() const;
+  AllowedOptimizations& allowed_optimizations();
+
+ private:
+  // TODO(ezhulenev) Make GrapplerItem a class and hide all public data members.
+  // TODO(ezhulenev): Migrate all unordered collections to absl.
+
+  // A set of fully defined device names that can be used to place the nodes of
+  // the `graph`.
+  // Example of a fully defined name: "/job:work/replica:1/task:1/device:CPU:0"
+  std::unordered_set<string> devices_;
+
+  AllowedOptimizations allowed_optimizations_;
 };
 
 // Return the transitive fanin of a set of terminal nodes.
diff --git a/tensorflow/core/grappler/grappler_item_test.cc b/tensorflow/core/grappler/grappler_item_test.cc
index 72a9f481cab6cc5dfdc5994459e149739e427ce6..a8fbe356829409ac3b472267cd22d4b5b54cd1f5 100644
--- a/tensorflow/core/grappler/grappler_item_test.cc
+++ b/tensorflow/core/grappler/grappler_item_test.cc
@@ -14,7 +14,9 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/framework/function_testlib.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -44,6 +46,32 @@ TEST_F(GrapplerItemTest, Basic) {
   EXPECT_EQ(main_ops, graph_nodes);
 }
 
+TEST_F(GrapplerItemTest, InferDevices) {
+  using test::function::NDef;
+
+  const string cpu0 = "/job:work/replica:1/task:1/device:CPU:0";
+  const string cpu1 = "/job:work/replica:1/task:1/device:CPU:1";
+  const string cpu2 = "/device:CPU:2";
+
+  GrapplerItem item;
+  item.graph = test::function::GDef(
+      {
+          NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, cpu0),
+          NDef("b", "Placeholder", {}, {{"dtype", DT_FLOAT}}, cpu1),
+          NDef("c", "Placeholder", {}, {{"dtype", DT_FLOAT}}, cpu2),
+      },
+      {} /* Empty function library */);
+
+  ASSERT_FALSE(item.InferDevicesFromGraph().ok());
+
+  EXPECT_EQ(item.devices().size(), 2);
+  EXPECT_NE(item.devices().find(cpu0), item.devices().end());
+  EXPECT_NE(item.devices().find(cpu1), item.devices().end());
+
+  item.ClearDevices();
+  EXPECT_EQ(item.devices().size(), 0);
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/mutable_graph_view.cc b/tensorflow/core/grappler/mutable_graph_view.cc
index 1a4754153bca9bb7ee019b9b9ea67e6ce3cb5f89..224b720328a36e37079244acf952873ec8fc47c8 100644
--- a/tensorflow/core/grappler/mutable_graph_view.cc
+++ b/tensorflow/core/grappler/mutable_graph_view.cc
@@ -14,14 +14,32 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/mutable_graph_view.h"
+
+#include <algorithm>
+#include <utility>
+
 #include "absl/strings/str_cat.h"
 #include "absl/strings/substitute.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/tensor_id.h"
+#include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 namespace grappler {
 
+namespace {
+
+bool IsTensorIdPortValid(const TensorId& tensor_id) {
+  return tensor_id.index() >= Graph::kControlSlot;
+}
+
+}  // namespace
+
 const absl::flat_hash_set<MutableGraphView::InputPort>&
 MutableGraphView::GetFanout(const GraphView::OutputPort& port) const {
   return GetFanout(MutableGraphView::OutputPort(const_cast<NodeDef*>(port.node),
@@ -68,7 +86,7 @@ void MutableGraphView::UpdateFanouts(absl::string_view from_node,
 }
 
 void MutableGraphView::UpdateFanouts(NodeDef* from_node, NodeDef* to_node) {
-  VLOG(0) << absl::Substitute("Update fanouts from '$0' to '$1'.",
+  VLOG(2) << absl::Substitute("Update fanouts from '$0' to '$1'.",
                               from_node->name(), to_node->name());
 
   // Update internal state with the new output_port->input_port edge.
@@ -160,17 +178,201 @@ void MutableGraphView::UpdateFanouts(NodeDef* from_node, NodeDef* to_node) {
   }
 }
 
+bool MutableGraphView::AddFanin(NodeDef* node, const TensorId& fanin) {
+  NodeDef* fanin_node = GetNode(fanin.node());
+  if (fanin_node == nullptr) {
+    return false;
+  }
+
+  int num_non_controlling_fanins =
+      NumFanins(*node, /*include_controlling_nodes=*/false);
+  InputPort input;
+  input.node = node;
+  input.port_id = fanin.index() == Graph::kControlSlot
+                      ? Graph::kControlSlot
+                      : num_non_controlling_fanins;
+
+  OutputPort fanin_port(fanin_node, fanin.index());
+
+  if (!gtl::InsertIfNotPresent(&fanouts()[fanin_port], input)) {
+    return false;
+  }
+  node->add_input(TensorIdToString(fanin));
+  if (fanin.index() > Graph::kControlSlot) {
+    int node_input_size = node->input_size() - 1;
+    // If there are control dependencies in node, move newly inserted fanin to
+    // be before such control dependencies.
+    if (num_non_controlling_fanins < node_input_size) {
+      node->mutable_input()->SwapElements(node_input_size,
+                                          num_non_controlling_fanins);
+    }
+  }
+  return true;
+}
+
+bool MutableGraphView::AddFanin(absl::string_view node_name,
+                                const TensorId& fanin) {
+  if (!IsTensorIdPortValid(fanin)) {
+    return false;
+  }
+  NodeDef* node = GetNode(node_name);
+  if (node == nullptr) {
+    return false;
+  }
+  return AddFanin(node, fanin);
+}
+
+bool MutableGraphView::RemoveFanins(NodeDef* node,
+                                    absl::Span<const TensorId> fanins) {
+  bool modified = false;
+  auto mutable_inputs = node->mutable_input();
+  int curr_pos = 0;
+  int num_inputs = node->input_size();
+  for (int i = 0; i < num_inputs; ++i) {
+    TensorId tensor_id = ParseTensorName(node->input(i));
+    bool remove_fanin =
+        std::find(fanins.begin(), fanins.end(), tensor_id) != fanins.end();
+    bool update_fanin = !remove_fanin && modified;
+    if (remove_fanin || update_fanin) {
+      OutputPort fanin(nodes()[tensor_id.node()], tensor_id.index());
+
+      InputPort input;
+      input.node = node;
+      input.port_id =
+          tensor_id.index() == Graph::kControlSlot ? Graph::kControlSlot : i;
+
+      if (remove_fanin) {
+        fanouts()[fanin].erase(input);
+      } else {
+        // Shift inputs to be retained.
+        if (tensor_id.index() > Graph::kControlSlot) {
+          fanouts()[fanin].erase(input);
+          fanouts()[fanin].insert(InputPort(node, i));
+        }
+        mutable_inputs->SwapElements(i, curr_pos++);
+      }
+
+      modified = true;
+    } else {
+      // Skip inputs to be retained until first modification.
+      curr_pos++;
+    }
+  }
+  if (modified) {
+    mutable_inputs->DeleteSubrange(curr_pos, num_inputs - curr_pos);
+  }
+  return modified;
+}
+
+bool MutableGraphView::RemoveFanin(absl::string_view node_name,
+                                   const TensorId& fanin) {
+  if (!IsTensorIdPortValid(fanin)) {
+    return false;
+  }
+  NodeDef* node = GetNode(node_name);
+  if (node == nullptr) {
+    return false;
+  }
+  return RemoveFanins(node, {fanin});
+}
+
+bool MutableGraphView::RemoveAllFanins(absl::string_view node_name,
+                                       bool keep_controlling_fanins) {
+  NodeDef* node = GetNode(node_name);
+  if (node == nullptr || node->input().empty()) {
+    return false;
+  }
+  RemoveFaninsInternal(node, keep_controlling_fanins);
+  if (keep_controlling_fanins) {
+    int num_non_controlling_fanins =
+        NumFanins(*node, /*include_controlling_nodes=*/false);
+    if (num_non_controlling_fanins == 0) {
+      return false;
+    } else if (num_non_controlling_fanins < node->input_size()) {
+      node->mutable_input()->DeleteSubrange(0, num_non_controlling_fanins);
+    } else {
+      node->clear_input();
+    }
+  } else {
+    node->clear_input();
+  }
+  return true;
+}
+
+bool MutableGraphView::UpdateFanin(absl::string_view node_name,
+                                   const TensorId& from_fanin,
+                                   const TensorId& to_fanin) {
+  if (from_fanin == to_fanin || !IsTensorIdPortValid(from_fanin) ||
+      !IsTensorIdPortValid(to_fanin)) {
+    return false;
+  }
+  NodeDef* node = GetNode(node_name);
+  if (node == nullptr) {
+    return false;
+  }
+
+  bool is_from_fanin_control = from_fanin.index() == Graph::kControlSlot;
+  bool is_to_fanin_control = to_fanin.index() == Graph::kControlSlot;
+  // When replacing a non control dependency fanin with a control dependency, or
+  // vice versa, remove and add, so ports can be updated properly in fanout(s).
+  if (is_from_fanin_control || is_to_fanin_control) {
+    bool modified = RemoveFanins(node, {from_fanin});
+    if (!HasFanin(*node, to_fanin)) {
+      modified |= AddFanin(node, to_fanin);
+    }
+    return modified;
+  }
+
+  // In place mutation, requires no shifting of ports.
+  NodeDef* from_fanin_node = GetNode(from_fanin.node());
+  NodeDef* to_fanin_node = GetNode(to_fanin.node());
+  if (from_fanin_node == nullptr || to_fanin_node == nullptr) {
+    return false;
+  }
+
+  string to_fanin_string = TensorIdToString(to_fanin);
+  int num_inputs = node->input_size();
+  bool modified = false;
+  for (int i = 0; i < num_inputs; ++i) {
+    if (ParseTensorName(node->input(i)) == from_fanin) {
+      OutputPort from_fanin_port(from_fanin_node, from_fanin.index());
+      InputPort old_input;
+      old_input.node = node;
+      old_input.port_id =
+          from_fanin.index() == Graph::kControlSlot ? Graph::kControlSlot : i;
+      fanouts()[from_fanin_port].erase(old_input);
+
+      OutputPort to_fanin_port(to_fanin_node, to_fanin.index());
+      InputPort new_input;
+      new_input.node = node;
+      new_input.port_id =
+          to_fanin.index() == Graph::kControlSlot ? Graph::kControlSlot : i;
+      fanouts()[to_fanin_port].insert(new_input);
+
+      node->set_input(i, to_fanin_string);
+      modified = true;
+    }
+  }
+
+  return modified;
+}
+
 void MutableGraphView::DeleteNodes(const std::set<string>& nodes_to_delete) {
   for (const string& node_name_to_delete : nodes_to_delete)
-    RemoveFanouts(nodes().at(node_name_to_delete));
+    RemoveFaninsInternal(nodes().at(node_name_to_delete),
+                         /*keep_controlling_fanins=*/false);
   for (const string& node_name_to_delete : nodes_to_delete)
     nodes().erase(node_name_to_delete);
   EraseNodesFromGraph(nodes_to_delete, graph());
 }
 
-void MutableGraphView::RemoveFanouts(NodeDef* deleted_node) {
+void MutableGraphView::RemoveFaninsInternal(NodeDef* deleted_node,
+                                            bool keep_controlling_fanins) {
   for (int i = 0; i < deleted_node->input_size(); ++i) {
     TensorId tensor_id = ParseTensorName(deleted_node->input(i));
+    if (keep_controlling_fanins && tensor_id.index() < 0) {
+      break;
+    }
     OutputPort fanin(nodes()[tensor_id.node()], tensor_id.index());
 
     InputPort input;
diff --git a/tensorflow/core/grappler/mutable_graph_view.h b/tensorflow/core/grappler/mutable_graph_view.h
index 355dd6c491763e96b509ce42977e2cf0f5db2eb5..8025b8ca778a0dd5950b2da6c0fb355be879e52d 100644
--- a/tensorflow/core/grappler/mutable_graph_view.h
+++ b/tensorflow/core/grappler/mutable_graph_view.h
@@ -16,7 +16,17 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_GRAPPLER_MUTABLE_GRAPH_VIEW_H_
 #define TENSORFLOW_CORE_GRAPPLER_MUTABLE_GRAPH_VIEW_H_
 
+#include <set>
+#include <string>
+
+#include "absl/container/flat_hash_set.h"
+#include "absl/strings/string_view.h"
+#include "absl/types/span.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/graph/tensor_id.h"
 #include "tensorflow/core/grappler/graph_view.h"
+#include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -60,6 +70,38 @@ class MutableGraphView : public internal::GraphViewInternal<GraphDef, NodeDef> {
   //   2. foo2(new_bar:1, other:1)
   void UpdateFanouts(absl::string_view from_node, absl::string_view to_node);
 
+  // Add fanin to node `node_name`. If the node or fanin do not exist in the
+  // graph, nothing will be modified in the graph. If fanin is a control
+  // dependency, existing control dependencies will be checked first before
+  // adding. Otherwise fanin will be added after existing non control dependency
+  // inputs.
+  //
+  // This will return true iff the node is modified. If a control dependency
+  // already exists, the node will not be modified.
+  bool AddFanin(absl::string_view node_name, const TensorId& fanin);
+
+  // Remove fanin from node `node_name`. If the node or fanin do not exist in
+  // the graph, nothing will be modified in the graph. If there are multiple
+  // inputs that match the fanin, all of them will be removed.
+  //
+  // This will return true iff the node is modified. If no inputs match the
+  // fanin, the node will not be modified.
+  bool RemoveFanin(absl::string_view node_name, const TensorId& fanin);
+
+  // Remove all fanins from node `node_name`. Control dependencies will be
+  // retained if keep_controlling_fanins is true.
+  //
+  // This will return true iff the node is modified.
+  bool RemoveAllFanins(absl::string_view node_name,
+                       bool keep_controlling_fanins);
+
+  // Replace all fanins `from_fanin` with `to_fanin` in node `node_name`. If
+  // the fanins or node do not exist, nothing will be modified in the graph.
+  //
+  // This will return true iff the node is modified.
+  bool UpdateFanin(absl::string_view node_name, const TensorId& from_fanin,
+                   const TensorId& to_fanin);
+
   // Deletes nodes from the graph.
   void DeleteNodes(const std::set<string>& nodes_to_delete);
 
@@ -79,9 +121,22 @@ class MutableGraphView : public internal::GraphViewInternal<GraphDef, NodeDef> {
   // behavior is undefined.
   void UpdateFanouts(NodeDef* from_node, NodeDef* to_node);
 
-  // Remove fanouts of the deleted node from internal state (including control
-  // dependencies).
-  void RemoveFanouts(NodeDef* deleted_node);
+  // Remove fanins of the deleted node from internal state. Control dependencies
+  // are retained iff keep_controlling_fanins is true.
+  void RemoveFaninsInternal(NodeDef* deleted_node,
+                            bool keep_controlling_fanins);
+
+  // Add fanin to node. If the node or fanin do not exist in the graph, nothing
+  // will be modified in the graph. If fanin is a control dependency, existing
+  // control dependencies will be checked first before adding. Otherwise fanin
+  // will be added after existing non control dependency inputs.
+  //
+  // This will return true iff the node is modified. If a control dependency
+  // already exists, the node will not be modified.
+  bool AddFanin(NodeDef* node, const TensorId& fanin);
+
+  // Remove any fanin in node that matches to a fanin in fanins.
+  bool RemoveFanins(NodeDef* node, absl::Span<const TensorId> fanins);
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/mutable_graph_view_test.cc b/tensorflow/core/grappler/mutable_graph_view_test.cc
index c1b3f8c01cf3dbb570d64845fb7097d1b309fc30..cd7e638595e0e4dc42ca70ca66b89457045e73cc 100644
--- a/tensorflow/core/grappler/mutable_graph_view_test.cc
+++ b/tensorflow/core/grappler/mutable_graph_view_test.cc
@@ -16,8 +16,10 @@ limitations under the License.
 #include "tensorflow/core/grappler/mutable_graph_view.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/graph/tensor_id.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
+#include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -110,6 +112,356 @@ TEST(MutableGraphViewTest, AddAndUpdateFanoutsWithoutSelfLoops) {
   EXPECT_EQ(new_bar_fanouts.count(MutableGraphView::InputPort(foo, -1)), 1);
 }
 
+GraphDef SimpleMutateFaninGraph() {
+  // Actual node.op() is not important in this test.
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "NotImportant", {}, {}),
+       NDef("c", "NotImportant", {}, {}), NDef("d", "NotImportant", {}, {}),
+       NDef("foo_1", "NotImportant", {"a"}),
+       NDef("foo_2", "NotImportant", {"b", "^a", "^c"}),
+       NDef("foo_3", "NotImportant", {"b", "a:1", "a:1"}),
+       NDef("foo_4", "NotImportant", {"a", "b:2", "b:2", "^c", "^d"}),
+       NDef("foo_5", "NotImportant", {}),
+       NDef("foo_6", "NotImportant", {"^a", "^b"})},
+      /*funcs=*/{});
+  return graph_def;
+}
+
+void CompareNodeInputs(const MutableGraphView& graph, const NodeDef* expected,
+                       NodeDef* actual) {
+  ASSERT_EQ(actual->input_size(), expected->input_size());
+  int port;
+  for (int i = 0; i < actual->input_size(); ++i) {
+    EXPECT_EQ(actual->input(i), expected->input(i));
+    TensorId tensor_id = ParseTensorName(expected->input(i));
+    if (tensor_id.index() == Graph::kControlSlot) {
+      port = Graph::kControlSlot;
+    } else {
+      port = i;
+    }
+    MutableGraphView::InputPort input_port(actual, port);
+    MutableGraphView::OutputPort output_port =
+        graph.GetOutputPort(tensor_id.node(), tensor_id.index());
+    EXPECT_EQ(graph.GetFanin(input_port).contains(output_port), true);
+    EXPECT_EQ(graph.GetFanout(output_port).contains(input_port), true);
+  }
+}
+
+void TestAddFanin(absl::string_view node_name, const TensorId& fanin_to_add,
+                  bool modified, const NodeDef* expected_node) {
+  GraphDef graph_def = SimpleMutateFaninGraph();
+
+  MutableGraphView graph(&graph_def);
+
+  auto node = graph.GetNode(node_name);
+  if (expected_node == nullptr) {
+    EXPECT_EQ(node, nullptr);
+  } else {
+    EXPECT_NE(node, nullptr);
+  }
+
+  EXPECT_EQ(modified, graph.AddFanin(node_name, fanin_to_add));
+  if (expected_node != nullptr) {
+    CompareNodeInputs(graph, expected_node, node);
+  }
+}
+
+TEST(MutableGraphViewTest, AddFanin) {
+  NodeDef expected_node;
+  // Add input to node with 1 input 0 controls.
+  expected_node = NDef("", "", {"a", "b:1"});
+  TestAddFanin("foo_1", {"b", 1}, /*modified=*/true, &expected_node);
+  // Add input to node with multiple inputs and 0 controls.
+  expected_node = NDef("", "", {"b", "a:1", "a:1", "b:2"});
+  TestAddFanin("foo_3", {"b", 2}, /*modified=*/true, &expected_node);
+  // Add input to node with 1 input multiple controls.
+  expected_node = NDef("", "", {"b", "a", "^c", "^a"});
+  TestAddFanin("foo_2", {"a", 0}, /*modified=*/true, &expected_node);
+  // Add input to node with multiple inputs and controls.
+  expected_node = NDef("", "", {"a", "b:2", "b:2", "a:1", "^d", "^c"});
+  TestAddFanin("foo_4", {"a", 1}, /*modified=*/true, &expected_node);
+  // Add input to node with 0 inputs 0 controls.
+  expected_node = NDef("", "", {"a:1"});
+  TestAddFanin("foo_5", {"a", 1}, /*modified=*/true, &expected_node);
+  // Add input to node with 0 inputs multiple controls.
+  expected_node = NDef("", "", {"c:1", "^b", "^a"});
+  TestAddFanin("foo_6", {"c", 1}, /*modified=*/true, &expected_node);
+
+  // Add control to node with 1 input 0 controls.
+  expected_node = NDef("", "", {"a", "^b"});
+  TestAddFanin("foo_1", {"b", Graph::kControlSlot}, /*modified=*/true,
+               &expected_node);
+  // Add control to node with multiple inputs and 0 controls.
+  expected_node = NDef("", "", {"b", "a:1", "a:1", "^c"});
+  TestAddFanin("foo_3", {"c", Graph::kControlSlot}, /*modified=*/true,
+               &expected_node);
+  // Add control to node with 1 input multiple controls.
+  expected_node = NDef("", "", {"b", "^a", "^c", "^d"});
+  TestAddFanin("foo_2", {"d", Graph::kControlSlot}, /*modified=*/true,
+               &expected_node);
+  // Add control to node with multiple input multiple controls.
+  expected_node = NDef("", "", {"a", "b:2", "b:2", "^c", "^d", "^a"});
+  TestAddFanin("foo_4", {"a", Graph::kControlSlot}, /*modified=*/true,
+               &expected_node);
+  // Add control to node with 0 inputs 0 controls.
+  expected_node = NDef("", "", {"^a"});
+  TestAddFanin("foo_5", {"a", Graph::kControlSlot}, /*modified=*/true,
+               &expected_node);
+  // Add control to node with 0 inputs multiple controls.
+  expected_node = NDef("", "", {"^a", "^b", "^c"});
+  TestAddFanin("foo_6", {"c", Graph::kControlSlot}, /*modified=*/true,
+               &expected_node);
+  // Add control to node with control that already exists.
+  expected_node = NDef("", "", {"b", "^a", "^c"});
+  TestAddFanin("foo_2", {"a", Graph::kControlSlot}, /*modified=*/false,
+               &expected_node);
+
+  // Add fanin to node where node is missing.
+  TestAddFanin("foo_missing", {"a", 0}, /*modified=*/false, nullptr);
+  // Add fanin to node where fanin is missing.
+  expected_node = NDef("", "", {"a"});
+  TestAddFanin("foo_1", {"bar_missing", 0}, /*modified=*/false, &expected_node);
+  // Add fanin to node where node and fanin are missing.
+  TestAddFanin("foo_missing", {"bar_missing", 0}, /*modified=*/false,
+               /*expected_node=*/nullptr);
+}
+
+void CheckFanout(const MutableGraphView& graph, const TensorId& fanin,
+                 absl::string_view node_name) {
+  MutableGraphView::OutputPort output_port =
+      graph.GetOutputPort(fanin.node(), fanin.index());
+  auto fanouts = graph.GetFanout(output_port);
+  for (auto fanout : fanouts) {
+    EXPECT_NE(fanout.node->name(), fanin.node());
+  }
+}
+
+void TestRemoveFanin(absl::string_view node_name,
+                     const TensorId& fanin_to_remove, bool modified,
+                     const NodeDef* expected_node) {
+  GraphDef graph_def = SimpleMutateFaninGraph();
+
+  MutableGraphView graph(&graph_def);
+
+  auto node = graph.GetNode(node_name);
+  if (expected_node == nullptr) {
+    EXPECT_EQ(nullptr, node);
+  } else {
+    EXPECT_NE(nullptr, node);
+  }
+
+  EXPECT_EQ(modified, graph.RemoveFanin(node_name, fanin_to_remove));
+  if (expected_node != nullptr) {
+    CompareNodeInputs(graph, expected_node, node);
+    if (modified) {
+      CheckFanout(graph, fanin_to_remove, node_name);
+    }
+  }
+}
+
+TEST(MutableGraphViewTest, RemoveFanin) {
+  NodeDef expected_node;
+  // Remove input from node with 1 input 0 controls.
+  expected_node = NDef("", "", {});
+  TestRemoveFanin("foo_1", {"a", 0}, /*modified=*/true, &expected_node);
+  // Remove input from node with multiple inputs and 0 controls.
+  expected_node = NDef("", "", {"b"});
+  TestRemoveFanin("foo_3", {"a", 1}, /*modified=*/true, &expected_node);
+  // Remove input from node with 1 input multiple controls.
+  expected_node = NDef("", "", {"^a", "^c"});
+  TestRemoveFanin("foo_2", {"b", 0}, /*modified=*/true, &expected_node);
+  // Remove input from node with multiple inputs and controls.
+  expected_node = NDef("", "", {"a", "^c", "^d"});
+  TestRemoveFanin("foo_4", {"b", 2}, /*modified=*/true, &expected_node);
+
+  // Remove control from node with 1 input multiple controls.
+  expected_node = NDef("", "", {"b", "^c"});
+  TestRemoveFanin("foo_2", {"a", Graph::kControlSlot}, /*modified=*/true,
+                  &expected_node);
+  // Remove control from node with multiple input multiple controls.
+  expected_node = NDef("", "", {"a", "b:2", "b:2", "^c"});
+  TestRemoveFanin("foo_4", {"d", Graph::kControlSlot}, /*modified=*/true,
+                  &expected_node);
+  // Remove control from node with 0 inputs multiple controls.
+  expected_node = NDef("", "", {"^b"});
+  TestRemoveFanin("foo_6", {"a", Graph::kControlSlot}, /*modified=*/true,
+                  &expected_node);
+
+  // Remove input from node with 0 inputs 0 controls.
+  expected_node = NDef("", "", {});
+  TestRemoveFanin("foo_5", {"a", 1}, /*modified=*/false, &expected_node);
+  // Remove input from node with 0 inputs multiple controls.
+  expected_node = NDef("", "", {"^a", "^b"});
+  TestRemoveFanin("foo_6", {"a", 1}, /*modified=*/false, &expected_node);
+  // Remove control from node with 1 input 0 controls.
+  expected_node = NDef("", "", {"a"});
+  TestRemoveFanin("foo_1", {"b", Graph::kControlSlot}, /*modified=*/false,
+                  &expected_node);
+  // Remove control from node with multiple inputs and 0 controls.
+  expected_node = NDef("", "", {"b", "a:1", "a:1"});
+  TestRemoveFanin("foo_3", {"c", Graph::kControlSlot}, /*modified=*/false,
+                  &expected_node);
+  // Remove control from node with 0 inputs 0 controls.
+  expected_node = NDef("", "", {});
+  TestRemoveFanin("foo_5", {"a", Graph::kControlSlot}, /*modified=*/false,
+                  &expected_node);
+
+  // Remove fanin from node where node is missing.
+  TestRemoveFanin("foo_missing", {"a", 0}, /*modified=*/false,
+                  /*expected_node=*/nullptr);
+  // Remove fanin from node where fanin is missing.
+  expected_node = NDef("", "", {"a"});
+  TestRemoveFanin("foo_1", {"bar_missing", 0}, /*modified=*/false,
+                  &expected_node);
+  // Remove fanin from node where node and fanin are missing.
+  TestRemoveFanin("foo_missing", {"bar_missing", 0}, /*modified=*/false,
+                  /*expected_node=*/nullptr);
+}
+
+void TestRemoveAllFanins(absl::string_view node_name,
+                         bool keep_controlling_nodes, bool modified,
+                         const NodeDef* expected_node) {
+  GraphDef graph_def = SimpleMutateFaninGraph();
+
+  MutableGraphView graph(&graph_def);
+
+  auto node = graph.GetNode(node_name);
+  absl::flat_hash_set<string> fanin_strings;
+  if (expected_node == nullptr) {
+    EXPECT_EQ(node, nullptr);
+  } else {
+    EXPECT_NE(node, nullptr);
+    fanin_strings.insert(node->input().begin(), node->input().end());
+  }
+
+  EXPECT_EQ(modified, graph.RemoveAllFanins(node_name, keep_controlling_nodes));
+  if (expected_node != nullptr) {
+    CompareNodeInputs(graph, expected_node, node);
+    if (modified) {
+      TensorId tensor_id;
+      auto retained_inputs = absl::flat_hash_set<string>(node->input().begin(),
+                                                         node->input().end());
+      for (const string& fanin : fanin_strings) {
+        if (!retained_inputs.contains(fanin)) {
+          tensor_id = ParseTensorName(fanin);
+          CheckFanout(graph, tensor_id, node_name);
+        }
+      }
+    }
+  }
+}
+
+TEST(MutableGraphViewTest, RemoveAllFanins) {
+  NodeDef expected_node;
+  // Remove all fanins from node with no control dependencies.
+  expected_node = NDef("", "", {});
+  TestRemoveAllFanins("foo_3", /*keep_controlling_nodes=*/false,
+                      /*modified=*/true, &expected_node);
+  // Remove all fanins from node with control dependencies.
+  TestRemoveAllFanins("foo_4", /*keep_controlling_nodes=*/false,
+                      /*modified=*/true, &expected_node);
+
+  // Remove all fanins from node with no control dependencies and preserve
+  // control dependencies.
+  TestRemoveAllFanins("foo_3", /*keep_controlling_nodes=*/true,
+                      /*modified=*/true, &expected_node);
+  // Remove all fanins from node with control dependencies and preserve control
+  // dependencies.
+  expected_node = NDef("", "", {"^c", "^d"});
+  TestRemoveAllFanins("foo_4", /*keep_controlling_nodes=*/true,
+                      /*modified=*/true, &expected_node);
+
+  // Remove all fanins from node with no fanins.
+  expected_node = NDef("", "", {});
+  TestRemoveAllFanins("foo_5", /*keep_controlling_nodes=*/false,
+                      /*modified=*/false, &expected_node);
+  TestRemoveAllFanins("foo_5", /*keep_controlling_nodes=*/true,
+                      /*modified=*/false, &expected_node);
+
+  // Remove all fanins from node with only control dependencies.
+  TestRemoveAllFanins("foo_6", /*keep_controlling_nodes=*/false,
+                      /*modified=*/true, &expected_node);
+  expected_node = NDef("", "", {"^a", "^b"});
+  TestRemoveAllFanins("foo_6", /*keep_controlling_nodes=*/true,
+                      /*modified=*/false, &expected_node);
+
+  // Remove all fanins from node where node is missing.
+  TestRemoveAllFanins("foo_missing", /*keep_controlling_nodes=*/false,
+                      /*modified=*/false, /*expected_node=*/nullptr);
+  TestRemoveAllFanins("foo_missing", /*keep_controlling_nodes=*/true,
+                      /*modified=*/false, /*expected_node=*/nullptr);
+}
+
+void TestUpdateFanin(absl::string_view node_name, const TensorId& from_fanin,
+                     const TensorId& to_fanin, bool modified,
+                     const NodeDef* expected_node) {
+  GraphDef graph_def = SimpleMutateFaninGraph();
+
+  MutableGraphView graph(&graph_def);
+
+  auto node = graph.GetNode(node_name);
+  if (expected_node == nullptr) {
+    EXPECT_EQ(node, nullptr);
+  } else {
+    EXPECT_NE(node, nullptr);
+  }
+
+  EXPECT_EQ(modified, graph.UpdateFanin(node_name, from_fanin, to_fanin));
+  if (expected_node != nullptr) {
+    CompareNodeInputs(graph, expected_node, node);
+    if (modified) {
+      CheckFanout(graph, from_fanin, node_name);
+    }
+  }
+}
+
+TEST(MutableGraphViewTest, UpdateFanin) {
+  NodeDef expected_node;
+  // Update fanin from non control to non control.
+  expected_node = NDef("", "", {"a", "b:3", "b:3", "^c", "^d"});
+  TestUpdateFanin("foo_4", {"b", 2}, {"b", 3}, /*modified=*/true,
+                  &expected_node);
+  // Update fanin from non control to control.
+  expected_node = NDef("", "", {"a", "^c", "^d", "^b"});
+  TestUpdateFanin("foo_4", {"b", 2}, {"b", Graph::kControlSlot},
+                  /*modified=*/true, &expected_node);
+  // Update fanin from control to non control.
+  expected_node = NDef("", "", {"a", "b:2", "b:2", "d:1", "^c"});
+  TestUpdateFanin("foo_4", {"d", Graph::kControlSlot}, {"d", 1},
+                  /*modified=*/true, &expected_node);
+  // Update fanin from control to control.
+  expected_node = NDef("", "", {"a", "b:2", "b:2", "^d", "^b"});
+  TestUpdateFanin("foo_4", {"c", Graph::kControlSlot},
+                  {"b", Graph::kControlSlot}, /*modified=*/true,
+                  &expected_node);
+  // Update fanin from control to existing control.
+  expected_node = NDef("", "", {"a", "b:2", "b:2", "^d"});
+  TestUpdateFanin("foo_4", {"c", Graph::kControlSlot},
+                  {"d", Graph::kControlSlot}, /*modified=*/true,
+                  &expected_node);
+
+  // Update fanin of node where from and to fanins are the same.
+  expected_node = NDef("", "", {"a"});
+  TestUpdateFanin("foo_1", {"a", -1}, {"a", -1}, /*modified=*/false,
+                  &expected_node);
+  TestUpdateFanin("foo_1", {"a", 0}, {"a", 0}, /*modified=*/false,
+                  &expected_node);
+  TestUpdateFanin("foo_1", {"a", 1}, {"a", 1}, /*modified=*/false,
+                  &expected_node);
+  // Update fanin of node where node is missing.
+  TestUpdateFanin("foo_missing", {"a", 0}, {"a", 1}, /*modified=*/false,
+                  /*expected_node=*/nullptr);
+  // Update fanin of node where from fanin is missing.
+  TestUpdateFanin("foo_1", {"from_bar_missing", 0}, {"a", 1},
+                  /*modified=*/false, &expected_node);
+  // Update fanin of node where to fanin is missing.
+  TestUpdateFanin("foo_1", {"a", 0}, {"to_bar_missing", 1}, /*modified=*/false,
+                  &expected_node);
+  // Update fanin of node where from/to fanins and node are missing.
+  TestUpdateFanin("foo_missing", {"from_bar_missing", 0}, {"to_bar_missing", 1},
+                  /*modified=*/false, /*expected_node=*/nullptr);
+}
+
 TEST(MutableGraphViewTest, DeleteNodes) {
   // Actual node.op() is not important in this test.
   GraphDef graph_def = test::function::GDef(
diff --git a/tensorflow/core/grappler/op_types.cc b/tensorflow/core/grappler/op_types.cc
index 38fc1fff329eda5b80bb771442f2c543bd27e85d..b201c3a7172a717d0d88003cf15b411721afdd34 100644
--- a/tensorflow/core/grappler/op_types.cc
+++ b/tensorflow/core/grappler/op_types.cc
@@ -247,6 +247,11 @@ bool IsIdentityNSingleInput(const NodeDef& node) {
          node.attr().at("T").list().type_size() == 1;
 }
 
+bool IsIf(const NodeDef& node) {
+  const auto& op = node.op();
+  return op == "If" || op == "StatelessIf";
+}
+
 bool IsIgamma(const NodeDef& node) { return node.op() == "Igamma"; }
 
 bool IsIgammac(const NodeDef& node) { return node.op() == "Igammac"; }
@@ -524,6 +529,11 @@ bool IsVariable(const NodeDef& node) {
          op == "VarHandleOp" || op == "ReadVariableOp";
 }
 
+bool IsWhile(const NodeDef& node) {
+  const auto& op = node.op();
+  return op == "While" || op == "StatelessWhile";
+}
+
 bool IsZeta(const NodeDef& node) { return node.op() == "Zeta"; }
 
 namespace {
diff --git a/tensorflow/core/grappler/op_types.h b/tensorflow/core/grappler/op_types.h
index 67897e8512d7dc6e4774c066297674629dd4f714..cb7781ec6ef4c131325b7103952754335653d674 100644
--- a/tensorflow/core/grappler/op_types.h
+++ b/tensorflow/core/grappler/op_types.h
@@ -75,6 +75,7 @@ bool IsHistogramSummary(const NodeDef& node);
 bool IsIdentity(const NodeDef& node);
 bool IsIdentityN(const NodeDef& node);
 bool IsIdentityNSingleInput(const NodeDef& node);
+bool IsIf(const NodeDef& node);
 bool IsIgamma(const NodeDef& node);
 bool IsIgammac(const NodeDef& node);
 bool IsImag(const NodeDef& node);
@@ -167,6 +168,7 @@ bool IsTruncateDiv(const NodeDef& node);
 bool IsTruncateMod(const NodeDef& node);
 bool IsUnpack(const NodeDef& node);
 bool IsVariable(const NodeDef& node);
+bool IsWhile(const NodeDef& node);
 bool IsZeta(const NodeDef& node);
 
 // Return true if the op is an aggregation (e.g. Add, AddN).
diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index 9149ee275db0bb368494fb3f477d9dba4363f6d5..79578cb3ce0733bcfce1a382414c20881879e3e3 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -141,6 +141,7 @@ cc_library(
     deps = [
         ":graph_optimizer",
         "//tensorflow/core:core_cpu_base",
+        "//tensorflow/core:core_cpu_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
@@ -149,6 +150,7 @@ cc_library(
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/utils:functions",
+        "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
     ],
@@ -383,7 +385,7 @@ cc_library(
     srcs = [
         "gpu_swapping_ops.cc",
     ],
-    visibility = ["//tensorflow:__subpackages__"],
+    visibility = ["//visibility:public"],
     deps = [
         "//tensorflow/core:core_cpu_base",
         "//tensorflow/core:framework",
@@ -459,7 +461,6 @@ cc_library(
         "//tensorflow/core/grappler:devices",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
-        "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/clusters:cluster",
         "//tensorflow/core/grappler/costs:graph_properties",
         "//tensorflow/core/grappler/costs:virtual_placer",
@@ -613,7 +614,6 @@ cc_library(
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:mutable_graph_view",
         "//tensorflow/core/grappler:op_types",
-        "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/costs:graph_properties",
         "//tensorflow/core/grappler/utils:frame",
         "@com_google_absl//absl/container:flat_hash_set",
@@ -765,7 +765,6 @@ cc_library(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
-        "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/costs:graph_properties",
         "//tensorflow/core/grappler/utils:frame",
     ],
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index e3ac89b50dbf383b29980eab4a16796af83586dc..e28f991e2dfa50c559c42f06e06d475f8017b323 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -2722,6 +2722,9 @@ class OptimizeMaxOrMinOfMonotonicStage : public ArithmeticOptimizerStage {
 
   Status TrySimplify(NodeDef* reduction_node,
                      string* simplified_node_name) override {
+    if (IsInPreserveSet(*reduction_node)) {
+      return Status::OK();
+    }
     NodeDef* inner_function;
     TF_RETURN_IF_ERROR(GetInputNode(reduction_node->input(0), &inner_function));
     // Optimize only if:
@@ -3561,8 +3564,7 @@ Status ArithmeticOptimizer::Optimize(Cluster* /*cluster*/,
   // Set up helper data structures.
   nodes_to_preserve_ = item.NodesToPreserve();
   fetch_nodes_known_ = !item.fetch.empty();
-  *optimized_graph = item.graph;
-  GrapplerItem optimized_item(item, optimized_graph);
+  GrapplerItem optimized_item(item);
   optimized_graph_ = &optimized_item.graph;
   node_map_.reset(new NodeMap(optimized_graph_));
 
@@ -3572,7 +3574,7 @@ Status ArithmeticOptimizer::Optimize(Cluster* /*cluster*/,
 
   // Disable restricted graph rewrites.
   options_.unary_ops_composition &=
-      item.allowed_optimizations.non_differentiable_rewrites;
+      item.allowed_optimizations().non_differentiable_rewrites;
 
   if (options_.dedup_computations) {
     DedupComputations();
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
index 35d22898f6c15afd63df8b6136fad1f346172cd5..94c59c68c8f1adf0ea6b234d8ebeb305c561b994 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
@@ -3490,6 +3490,35 @@ TEST_F(ArithmeticOptimizerTest,
   VerifyGraphsMatch(item.graph, output, __LINE__);
 }
 
+TEST_F(ArithmeticOptimizerTest,
+       OptimizeMaxOrMinOfMonotonicElementWiseDoNotChangeFetchNodeReduction) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto x = ops::Const(s.WithOpName("x"), {2, 3}, {1, 2});
+  Output reshape = ops::Reshape(s.WithOpName("reshape"), x, {-1});
+  Output y = ops::Neg(s.WithOpName("y"), reshape);
+  Output z = ops::Max(s.WithOpName("z"), y, {0});
+
+  GrapplerItem item;
+  item.fetch = {"z"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
+  ASSERT_EQ(1, tensors_expected.size());
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlyOptimizeMaxOrMinOfMonotonic(&optimizer);
+  OptimizeTwice(&optimizer, &item, &output);
+
+  // Should be a NoOp since we are not allowed to change the output of fetch
+  // nodes.
+  VerifyGraphsMatch(item.graph, output, __LINE__);
+
+  auto tensors = EvaluateNodes(output, item.fetch);
+  ASSERT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<int>(tensors[0], tensors_expected[0]);
+  test::ExpectTensorEqual<int>(tensors[0], Tensor(-2));
+}
+
 TEST_F(ArithmeticOptimizerTest,
        OptimizeMaxOrMinOfMonotonicElementWiseNonIncreasing) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc
index 5e3e5d6af9a7dd435a15f83e94434de0c25ed7aa..3882e3b3a9a0fa5788a298f0900ca545b792f56e 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding.cc
@@ -751,6 +751,12 @@ bool ConstantFolding::IsFoldable(const NodeDef& node) const {
   if (ModifiesFrameInfo(node)) {
     return false;
   }
+
+  // Removing LoopCond nodes can screw up the partitioner.
+  if (node.op() == "LoopCond") {
+    return false;
+  }
+
   // Skip constants, they're already folded
   if (IsConstant(node)) {
     return false;
diff --git a/tensorflow/core/grappler/optimizers/data/graph_test_utils.cc b/tensorflow/core/grappler/optimizers/data/graph_test_utils.cc
index affaeafb0fba27f6754e447de588e65b768d5d41..9d8b388a3a8bca1fb560e5acc94d50f3d82ed30d 100644
--- a/tensorflow/core/grappler/optimizers/data/graph_test_utils.cc
+++ b/tensorflow/core/grappler/optimizers/data/graph_test_utils.cc
@@ -41,7 +41,7 @@ NodeDef MakeMapAndBatchNode(StringPiece name, StringPiece input_node_name,
                             StringPiece drop_remainder_node_name,
                             StringPiece function_name) {
   return test::function::NDef(
-      name, "MapAndBatchDatasetV2",
+      name, "ExperimentalMapAndBatchDataset",
       {string(input_node_name), "", string(batch_size_node_name),
        string(num_parallel_calls_node_name), string(drop_remainder_node_name)},
       {{"f", FunctionDefHelper::FunctionRef(string(function_name))},
diff --git a/tensorflow/core/grappler/optimizers/data/hoist_random_uniform.cc b/tensorflow/core/grappler/optimizers/data/hoist_random_uniform.cc
index 5af9fbadf76bfde5b031df0978ff9447ea3afb57..60755256d83d74287748125e18ccd8a63a1b4759 100644
--- a/tensorflow/core/grappler/optimizers/data/hoist_random_uniform.cc
+++ b/tensorflow/core/grappler/optimizers/data/hoist_random_uniform.cc
@@ -67,7 +67,7 @@ NodeDef MakeStatelessMap(const NodeDef& map_node, const NodeDef& zip_node,
 NodeDef MakeRandomDataset(const NodeDef& random_uniform_node,
                           MutableGraphView* graph) {
   NodeDef random_dataset;
-  random_dataset.set_op("RandomDataset");
+  random_dataset.set_op("ExperimentalRandomDataset");
   graph_utils::SetUniqueGraphNodeName("RandomDataset", graph->graph(),
                                       &random_dataset);
 
diff --git a/tensorflow/core/grappler/optimizers/data/hoist_random_uniform_test.cc b/tensorflow/core/grappler/optimizers/data/hoist_random_uniform_test.cc
index 455459e3f67d9cb51bf24af24e2c73f30447b24f..b6a29a442ea3a3e62eeec8d1f571fef5225c3c80 100644
--- a/tensorflow/core/grappler/optimizers/data/hoist_random_uniform_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/hoist_random_uniform_test.cc
@@ -55,7 +55,7 @@ TEST(HoistRandomUniform, SimpleHoisting) {
   const int zip_dataset_id =
       graph_utils::FindGraphNodeWithOp("ZipDataset", output);
   const int random_dataset_id =
-      graph_utils::FindGraphNodeWithOp("RandomDataset", output);
+      graph_utils::FindGraphNodeWithOp("ExperimentalRandomDataset", output);
   const int batch_random_id =
       graph_utils::FindGraphNodeWithOp("BatchDatasetV2", output);
   ASSERT_NE(random_dataset_id, -1);
diff --git a/tensorflow/core/grappler/optimizers/data/latency_all_edges.cc b/tensorflow/core/grappler/optimizers/data/latency_all_edges.cc
index 16b2efb3ed3c25c4fa5b8b42205037c212140289..52b4b785a3d09ca7f3bec3373d9dd1c8de444a87 100644
--- a/tensorflow/core/grappler/optimizers/data/latency_all_edges.cc
+++ b/tensorflow/core/grappler/optimizers/data/latency_all_edges.cc
@@ -31,7 +31,7 @@ namespace tensorflow {
 namespace grappler {
 namespace {
 
-constexpr char kInsertOpName[] = "LatencyStatsDataset";
+constexpr char kInsertOpName[] = "ExperimentalLatencyStatsDataset";
 
 NodeDef MakeLatencyNode(const NodeDef& node, MutableGraphView* graph) {
   NodeDef new_node;
diff --git a/tensorflow/core/grappler/optimizers/data/latency_all_edges_test.cc b/tensorflow/core/grappler/optimizers/data/latency_all_edges_test.cc
index 6789cf5bd669cfa61e161397f792700098923e75..d428d04a66659cd3b961428e3762ea3ab81ad69e 100644
--- a/tensorflow/core/grappler/optimizers/data/latency_all_edges_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/latency_all_edges_test.cc
@@ -57,9 +57,10 @@ TEST(LatencyAllEdgesTest, AddLatenciesAfterTensorMapPrefetch) {
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
 
-  EXPECT_TRUE(graph_utils::ContainsNodeWithOp("LatencyStatsDataset", output));
-  std::vector<int> latency_node_indices =
-      graph_utils::FindAllGraphNodesWithOp("LatencyStatsDataset", output);
+  EXPECT_TRUE(graph_utils::ContainsNodeWithOp("ExperimentalLatencyStatsDataset",
+                                              output));
+  std::vector<int> latency_node_indices = graph_utils::FindAllGraphNodesWithOp(
+      "ExperimentalLatencyStatsDataset", output);
   EXPECT_EQ(latency_node_indices.size(), 3);
   std::vector<NodeDef> dataset_nodes = {std::move(from_tensor_node),
                                         std::move(map_node),
diff --git a/tensorflow/core/grappler/optimizers/data/make_numa_aware.cc b/tensorflow/core/grappler/optimizers/data/make_numa_aware.cc
index e5de981822376d2e4d1d78ac628f527d242f133a..72c27a1d4afb8f3766a1f7c56ade37b1e161a039 100644
--- a/tensorflow/core/grappler/optimizers/data/make_numa_aware.cc
+++ b/tensorflow/core/grappler/optimizers/data/make_numa_aware.cc
@@ -44,7 +44,7 @@ Status MakeNumaAware::Optimize(Cluster* cluster, const GrapplerItem& item,
   std::set<string> nodes_to_delete;
 
   for (const NodeDef& node : item.graph.node()) {
-    if (node.op() != "MapAndBatchDatasetV2") continue;
+    if (node.op() != "ExperimentalMapAndBatchDataset") continue;
 
     auto* numa_node = graph.AddNode(MakeNumaAwareNode(node, &graph));
     graph.UpdateFanouts(node.name(), numa_node->name());
diff --git a/tensorflow/core/grappler/optimizers/data/make_numa_aware_test.cc b/tensorflow/core/grappler/optimizers/data/make_numa_aware_test.cc
index 5d52bd6208f7f21ff44cfe4fef042146a97c5fb9..4b83fb6ef19f8ee241dd4f7b635c9672ef01bcc0 100644
--- a/tensorflow/core/grappler/optimizers/data/make_numa_aware_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/make_numa_aware_test.cc
@@ -57,7 +57,8 @@ TEST(MakeNumaAwareTest, ReplaceSimple) {
   TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
 
   EXPECT_FALSE(graph_utils::ContainsGraphNodeWithName("map_and_batch", output));
-  EXPECT_FALSE(graph_utils::ContainsNodeWithOp("MapAndBatchDatasetV2", output));
+  EXPECT_FALSE(graph_utils::ContainsNodeWithOp("ExperimentalMapAndBatchDataset",
+                                               output));
   EXPECT_TRUE(graph_utils::ContainsNodeWithOp(
       "ExperimentalNumaMapAndBatchDataset", output));
 }
@@ -91,7 +92,8 @@ TEST(MapAndBatchNumaAawareReplacementTest, ReplaceWithExtraChild) {
   TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
 
   EXPECT_FALSE(graph_utils::ContainsGraphNodeWithName("map_and_batch", output));
-  EXPECT_FALSE(graph_utils::ContainsNodeWithOp("MapAndBatchDatasetV2", output));
+  EXPECT_FALSE(graph_utils::ContainsNodeWithOp("ExperimentalMapAndBatchDataset",
+                                               output));
   EXPECT_TRUE(graph_utils::ContainsNodeWithOp(
       "ExperimentalNumaMapAndBatchDataset", output));
   EXPECT_TRUE(graph_utils::ContainsNodeWithOp("CacheDataset", output));
diff --git a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc
index 800050b840326d826328763a52c5447c8df70a99..84c4d82f6a38dd81e88374c6ce6a7a6082451a38 100644
--- a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc
@@ -30,7 +30,7 @@ namespace tensorflow {
 namespace grappler {
 namespace {
 
-constexpr char kFusedOpName[] = "MapAndBatchDatasetV2";
+constexpr char kFusedOpName[] = "ExperimentalMapAndBatchDataset";
 
 NodeDef MakeMapAndBatchNode(const NodeDef& map_node, const NodeDef& batch_node,
                             MutableGraphView* graph) {
@@ -77,15 +77,22 @@ NodeDef MakeMapAndBatchNode(const NodeDef& map_node, const NodeDef& batch_node,
     new_node.add_input(tmp->name());
   }
 
-  // Set `f` and `Targuments` attributes.
+  // Required attributes.
   for (auto key : {"f", "Targuments"}) {
     graph_utils::CopyAttribute(key, map_node, &new_node);
   }
-
-  // Set `output_types` and `output_shapes` attributes.
   for (auto key : {"output_shapes", "output_types"}) {
     graph_utils::CopyAttribute(key, batch_node, &new_node);
   }
+
+  // Optional attributes.
+  // TODO(jsimsa): Support `use_inter_op_parallelism` and `sloppy`.
+  for (auto key : {"preserve_cardinality"}) {
+    if (gtl::FindOrNull(map_node.attr(), key)) {
+      graph_utils::CopyAttribute(key, map_node, &new_node);
+    }
+  }
+
   return new_node;
 }
 
diff --git a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion_test.cc b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion_test.cc
index eed558de7eb42c5b7879e93bdc43fc8184b599b4..ef4e64826f030ae404a0a523ad5f09bbf7e325a4 100644
--- a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion_test.cc
@@ -84,9 +84,10 @@ TEST(MapAndBatchFusionTest, FuseMapAndBatchNodesIntoOne) {
       graph_utils::ContainsGraphNodeWithName(map_node->name(), output));
   EXPECT_FALSE(
       graph_utils::ContainsGraphNodeWithName(batch_node->name(), output));
-  EXPECT_TRUE(graph_utils::ContainsNodeWithOp("MapAndBatchDatasetV2", output));
-  NodeDef map_and_batch_node = output.node(
-      graph_utils::FindGraphNodeWithOp("MapAndBatchDatasetV2", output));
+  EXPECT_TRUE(graph_utils::ContainsNodeWithOp("ExperimentalMapAndBatchDataset",
+                                              output));
+  NodeDef map_and_batch_node = output.node(graph_utils::FindGraphNodeWithOp(
+      "ExperimentalMapAndBatchDataset", output));
   EXPECT_EQ(map_and_batch_node.input_size(), 5);
   EXPECT_EQ(map_and_batch_node.input(0), map_node->input(0));
   EXPECT_EQ(map_and_batch_node.input(1), map_node->input(1));
@@ -169,9 +170,10 @@ TEST(MapAndBatchFusionTest, FuseMapAndBatchV2NodesIntoOne) {
       graph_utils::ContainsGraphNodeWithName(map_node->name(), output));
   EXPECT_FALSE(
       graph_utils::ContainsGraphNodeWithName(batch_node->name(), output));
-  EXPECT_TRUE(graph_utils::ContainsNodeWithOp("MapAndBatchDatasetV2", output));
-  NodeDef map_and_batch_node = output.node(
-      graph_utils::FindGraphNodeWithOp("MapAndBatchDatasetV2", output));
+  EXPECT_TRUE(graph_utils::ContainsNodeWithOp("ExperimentalMapAndBatchDataset",
+                                              output));
+  NodeDef map_and_batch_node = output.node(graph_utils::FindGraphNodeWithOp(
+      "ExperimentalMapAndBatchDataset", output));
   EXPECT_EQ(map_and_batch_node.input_size(), 5);
   EXPECT_EQ(map_and_batch_node.input(0), map_node->input(0));
   EXPECT_EQ(map_and_batch_node.input(1), map_node->input(1));
@@ -252,9 +254,10 @@ TEST(MapAndBatchFusionTest, FuseParallelMapAndBatchNodesIntoOne) {
       graph_utils::ContainsGraphNodeWithName(map_node->name(), output));
   EXPECT_FALSE(
       graph_utils::ContainsGraphNodeWithName(batch_node->name(), output));
-  EXPECT_TRUE(graph_utils::ContainsNodeWithOp("MapAndBatchDatasetV2", output));
-  NodeDef map_and_batch_node = output.node(
-      graph_utils::FindGraphNodeWithOp("MapAndBatchDatasetV2", output));
+  EXPECT_TRUE(graph_utils::ContainsNodeWithOp("ExperimentalMapAndBatchDataset",
+                                              output));
+  NodeDef map_and_batch_node = output.node(graph_utils::FindGraphNodeWithOp(
+      "ExperimentalMapAndBatchDataset", output));
   EXPECT_EQ(map_and_batch_node.input_size(), 5);
   EXPECT_EQ(map_and_batch_node.input(0), map_node->input(0));
   EXPECT_EQ(map_and_batch_node.input(1), map_node->input(1));
diff --git a/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion.cc b/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion.cc
index 2b0a347ce625140be16d258964af06ef418e9f58..233d7968c8965a5ec2389aa297da72a9708b9257 100644
--- a/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion.cc
@@ -55,8 +55,9 @@ NodeDef MakeFusedNode(const NodeDef& map_node,
   }
 
   // Optional attrs.
-  for (auto key : {"use_inter_op_parallelism", "sloppy"}) {
-    if (const auto* attr = gtl::FindOrNull(map_node.attr(), key)) {
+  for (auto key :
+       {"use_inter_op_parallelism", "sloppy", "preserve_cardinality"}) {
+    if (gtl::FindOrNull(map_node.attr(), key)) {
       graph_utils::CopyAttribute(key, map_node, &fused_node);
     }
   }
diff --git a/tensorflow/core/grappler/optimizers/data/map_fusion.cc b/tensorflow/core/grappler/optimizers/data/map_fusion.cc
index 6ca0da27551bc78a9167d308eb229c662821c582..6b8015f96a29ac2fa2de3871a678a1b82efb12ff 100644
--- a/tensorflow/core/grappler/optimizers/data/map_fusion.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_fusion.cc
@@ -62,9 +62,16 @@ NodeDef MakeFusedNode(const NodeDef& parent_map_node, const NodeDef& map_node,
       gtl::FindOrNull(map_node.attr(), "use_inter_op_parallelism");
   // Some graphs cannot execute with use_inter_op_parallelism=False, so we need
   // to set it to true if one of the ops have it set to true.
-  if (value_or_false(first_parallelism) || value_or_false(second_parallelism)) {
-    (*fused_node.mutable_attr())["use_inter_op_parallelism"].set_b(true);
-  }
+  (*fused_node.mutable_attr())["use_inter_op_parallelism"].set_b(
+      value_or_false(first_parallelism) || value_or_false(second_parallelism));
+
+  const auto* first_cardinality =
+      gtl::FindOrNull(parent_map_node.attr(), "preserve_cardinality");
+  const auto* second_cardinality =
+      gtl::FindOrNull(map_node.attr(), "preserve_cardinality");
+  (*fused_node.mutable_attr())["preserve_cardinality"].set_b(
+      value_or_false(first_cardinality) && value_or_false(second_cardinality));
+
   return fused_node;
 }
 
diff --git a/tensorflow/core/grappler/optimizers/experimental_implementation_selector_test.cc b/tensorflow/core/grappler/optimizers/experimental_implementation_selector_test.cc
index e1ac7766d34af69668a57e20acc945a1c975fd1b..e330835e9bc4fea33928e376a3fd98ebe34a74ee 100644
--- a/tensorflow/core/grappler/optimizers/experimental_implementation_selector_test.cc
+++ b/tensorflow/core/grappler/optimizers/experimental_implementation_selector_test.cc
@@ -127,7 +127,7 @@ TEST_F(ExperimentalImplementationSelectorTest, SwapImplementationEval) {
                                  test::AsScalar<float>(4.0f));
 
   TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   const auto twice_boosted_tensor = EvaluateFetchNodes(optimized);
   test::ExpectTensorEqual<float>(twice_boosted_tensor[0],
                                  test::AsScalar<float>(2.0f));
@@ -223,7 +223,7 @@ TEST_F(ExperimentalImplementationSelectorTest, SwapImplementationWithGradient) {
                                  test::AsScalar<float>(4.0f));
 
   TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   const auto twice_boosted_tensor = EvaluateFetchNodes(optimized);
   test::ExpectTensorEqual<float>(twice_boosted_tensor[0],
                                  test::AsScalar<float>(2.0f));
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.cc b/tensorflow/core/grappler/optimizers/function_optimizer.cc
index 9faab6614cb34c4c684cc1f31365cbac8ecda14a..4ec68c7543c998f3551c374056efb8092d200133 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer.cc
@@ -18,11 +18,15 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
+#include "absl/algorithm/container.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/str_replace.h"
 #include "absl/strings/substitute.h"
+#include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/device_set.h"
 #include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/common_runtime/placer.h"
 #include "tensorflow/core/common_runtime/process_function_library_runtime.h"
 #include "tensorflow/core/framework/attr_value_util.h"
 #include "tensorflow/core/framework/function.h"
@@ -110,10 +114,26 @@ AttrSlice FunctionInstantiationAttributes(const FunctionDef& func,
   }
 }
 
-class FakeCPUDevice : public Device {
+// This is a fake device that should not be used for any op kernel execution,
+// the only purpose of this device is to be passed as a part of DeviceSet to the
+// Placer.
+class FakeDevice : public Device {
  public:
-  FakeCPUDevice(Env* env, const DeviceAttributes& attr) : Device(env, attr) {}
+  FakeDevice(Env* env, const string& device) : Device(env, attr(device)) {}
+  explicit FakeDevice(const string& device) : FakeDevice(nullptr, device) {}
   Status Sync() override { return Status::OK(); }
+
+ private:
+  static DeviceAttributes attr(const string& device) {
+    DeviceNameUtils::ParsedName parsed_name;
+    bool parsed = DeviceNameUtils::ParseFullName(device, &parsed_name);
+    DCHECK(parsed) << "Failed to parse full device name: " << device;
+
+    DeviceAttributes attr;
+    attr.set_name(device);
+    attr.set_device_type(parsed_name.type);
+    return attr;
+  }
 };
 
 // -------------------------------------------------------------------------- //
@@ -238,7 +258,9 @@ class FunctionOptimizerContext {
       : grappler_item_id_(item.id),
         graph_version_(item.graph.versions().producer()),
         opt_level_(opt_level),
+        allowed_optimizations_(item.allowed_optimizations()),
         function_library_(OpRegistry::Global(), item.graph.library()),
+        available_device_names_(item.devices().begin(), item.devices().end()),
         graph_view_(&item.graph) {
     InitializeTrulyConstNodes(item);
     InitializeFetchNodes(item);
@@ -246,6 +268,10 @@ class FunctionOptimizerContext {
 
   const RewriterConfig::Toggle opt_level() const { return opt_level_; }
 
+  const GrapplerItem::AllowedOptimizations& allowed_optimizations() const {
+    return allowed_optimizations_;
+  }
+
   const FunctionLibraryDefinition& function_library() const {
     return function_library_;
   }
@@ -264,12 +290,28 @@ class FunctionOptimizerContext {
     return tensor_mapping_;
   }
 
+  const gtl::FlatMap<string, std::vector<string>>& control_overrides() const {
+    return control_overrides_;
+  }
+
   const GraphView& graph_view() const { return graph_view_; }
 
   const string& grappler_item_id() const { return grappler_item_id_; }
 
   const gtl::FlatSet<string>& fetch_tensors() const { return fetch_tensors_; }
 
+  const DeviceSet* devices() const {
+    // Create fake devices lazily only if we need a DeviceSet.
+    if (available_devices_.empty() && !available_device_names_.empty()) {
+      for (const string& name : available_device_names_) {
+        auto device = absl::make_unique<FakeDevice>(name);
+        available_device_set_.AddDevice(device.get());
+        available_devices_.push_back(std::move(device));
+      }
+    }
+    return &available_device_set_;
+  }
+
   bool IsFetchNode(const string& node_name) const {
     return fetch_nodes_.find(node_name) != fetch_nodes_.end();
   }
@@ -292,6 +334,13 @@ class FunctionOptimizerContext {
     specialized_functions_.emplace(sig, specialized_func);
   }
 
+  void AddTensorMapping(const SafeTensorId& from, const SafeTensorId& to) {
+    auto inserted = tensor_mapping_.insert({from, to});
+    DCHECK(inserted.second)
+        << "Failed to insert duplicated tensor mapping: "
+        << "from=" << from.ToString() << " to=" << to.ToString();
+  }
+
   void AddTensorMapping(const string& func_node,
                         const FunctionSpecialization& specialized_func) {
     for (const auto& pair : specialized_func.output_mapping) {
@@ -306,6 +355,14 @@ class FunctionOptimizerContext {
     }
   }
 
+  void AddControlOverrides(const NodeDef& func_node,
+                           const std::vector<string>& control_overrides) {
+    control_overrides_[func_node.name()].reserve(control_overrides.size());
+    for (const string& control_override : control_overrides) {
+      control_overrides_[func_node.name()].push_back(control_override);
+    }
+  }
+
  private:
   void InitializeTrulyConstNodes(const GrapplerItem& item) {
     gtl::FlatSet<string> feed_nodes;
@@ -330,11 +387,8 @@ class FunctionOptimizerContext {
   void InitializeFunctionLibraryRuntime() {
     if (!flr_) {
       Env* env = Env::Default();
-      DeviceAttributes attr;
-      attr.set_name("/device:CPU:0");
-      attr.set_device_type("CPU");
       std::vector<std::unique_ptr<Device>> devices;
-      devices.push_back(absl::make_unique<FakeCPUDevice>(env, attr));
+      devices.push_back(absl::make_unique<FakeDevice>(env, "/device:CPU:0"));
       device_mgr_ = absl::make_unique<DeviceMgr>(std::move(devices));
       OptimizerOptions optimizer_opts;
       optimizer_opts.set_do_function_inlining(true);
@@ -348,6 +402,7 @@ class FunctionOptimizerContext {
   const string grappler_item_id_;
   const int graph_version_;
   const RewriterConfig::Toggle opt_level_;
+  const GrapplerItem::AllowedOptimizations allowed_optimizations_;
   FunctionLibraryDefinition function_library_;
 
   // These fields initialized lazily only if needed.
@@ -355,6 +410,16 @@ class FunctionOptimizerContext {
   std::unique_ptr<ProcessFunctionLibraryRuntime> process_flr_;
   FunctionLibraryRuntime* flr_ = nullptr;
 
+  // Fully defined names of the devices available to the GrapplerItem.
+  const gtl::FlatSet<string> available_device_names_;
+
+  // List of available `FakedDevices` (lazily initialized, see devices()).
+  mutable std::vector<std::unique_ptr<Device>> available_devices_;
+
+  // DeviceSet of fake devices (`FakeDevice`) constructed from
+  // available_devices_ (lazily initialized).
+  mutable DeviceSet available_device_set_;
+
   // Nodes that are Const and not in feed.
   std::unordered_map<string, const NodeDef*> truly_const_nodes_;
   // Specialized functions.
@@ -377,6 +442,14 @@ class FunctionOptimizerContext {
   gtl::FlatMap<SafeTensorId, SafeTensorId, SafeTensorId::Hasher>
       tensor_mapping_;
 
+  // When we inline a function into the optimized graph, we no longer have the
+  // function call node to anchor control dependencies. Instead we must expand
+  // each function call control output edge into multiple control dependencies
+  // to all side-effectful ops inside the function body.
+  //
+  // Invalidated function call node name -> Inlined side-effectful nodes
+  gtl::FlatMap<string, std::vector<string>> control_overrides_;
+
   // Use graph view to find active outputs of the function caller nodes.
   GraphView graph_view_;
 
@@ -426,7 +499,7 @@ bool HasTrulyConstInputs(const NodeDef& node,
   const auto is_truly_const = [&ctx](const string& input) {
     return ctx.IsTrulyConst(NodeName(input));
   };
-  return std::any_of(node.input().begin(), node.input().end(), is_truly_const);
+  return absl::c_any_of(node.input(), is_truly_const);
 }
 
 bool HasUnusedOutputs(const NodeDef& func_node, const FunctionDef& func,
@@ -799,22 +872,35 @@ Status SpecializeFunction(const NodeDef& func_node, const FunctionDef& func,
 // inputs into that, and we insert another IdentityN node to hook all function
 // outputs to it.
 
-// Returns true iff `node` is a direct function call of `func`, and we know how
-// to inline it into the main graph.
-bool IsInlinableDirectFunctionCall(const FunctionOptimizerContext& ctx,
-                                   const FunctionDef& func,
-                                   const NodeDef& node) {
+// Returns `Status::OK()` iff `node` is a direct function call of `func`, and we
+// know how to inline it into the main graph, otherwise returns and error
+// indicating why the function call is not inlinable.
+Status IsInlinableDirectFunctionCall(const FunctionOptimizerContext& ctx,
+                                     const FunctionDef& func,
+                                     const NodeDef& func_node) {
   // Indirect function calls (PartitionedCallOp) have automatic control
   // dependencies and inlined separately from direct function calls.
-  bool is_direct_function_call = IsDirectFunctionCall(func, node);
+  if (!IsDirectFunctionCall(func, func_node)) {
+    return errors::InvalidArgument("Unsupported function call type: ",
+                                   SummarizeNodeDef(func_node));
+  }
 
   // For direct function  calls we insert IdentityN nodes before/after inlined
   // function body to preserve function call semantics (all inputs evaluated
   // before function evaluation starts, and all function body nodes finished
   // before output consumed by other nodes).
-  bool has_inputs = func.signature().input_arg_size() > 0;
+  if (func.signature().input_arg_size() == 0) {
+    return errors::FailedPrecondition(
+        "Can't inline direct function call with empty inputs: ",
+        SummarizeNodeDef(func_node));
+  }
+
   // TODO(ezhulenev): Relax constraint on output args?
-  bool has_outputs = func.signature().output_arg_size() > 0;
+  if (func.signature().output_arg_size() == 0) {
+    return errors::FailedPrecondition(
+        "Can't inline direct function call with empty outputs: ",
+        SummarizeNodeDef(func_node));
+  }
 
   // Function must execute all the nodes in a function body that might have side
   // effects. After inlining these nodes into the main graph, we can no longer
@@ -825,21 +911,32 @@ bool IsInlinableDirectFunctionCall(const FunctionOptimizerContext& ctx,
   //
   // Indirect function calls (via PartitionedCallOp) have automatic dependency
   // tracking, and allow us to safely inline functions with side effects.
-  bool free_of_side_effects =
-      std::all_of(func.node_def().begin(), func.node_def().end(),
-                  [&ctx](const NodeDef& node) {
-                    return IsFreeOfSideEffect(node, &ctx.function_library());
-                  });
-
-  bool marked_noinline = MarkedNoInline(func);
-  bool marked_specialized = MarkedSpecialized(func);
+  bool has_side_effects =
+      absl::c_any_of(func.node_def(), [&ctx](const NodeDef& node) {
+        return !IsFreeOfSideEffect(node, &ctx.function_library());
+      });
+  if (has_side_effects) {
+    return errors::FailedPrecondition(
+        "Can't inline function with side-effects in the function body: ",
+        SummarizeNodeDef(func_node));
+  }
 
   // We ignore `_noinline` marker in aggressive mode.
   bool aggressive = ctx.opt_level() == RewriterConfig::AGGRESSIVE;
+  if (MarkedNoInline(func) && !aggressive) {
+    return errors::FailedPrecondition(
+        "Can't inline function marked with '_noinline': ",
+        SummarizeNodeDef(func_node));
+  }
+
+  // Function specialization and inlining must be mutually exclusive.
+  if (MarkedSpecialized(func)) {
+    return errors::FailedPrecondition(
+        "Can't inline function created in Grappler function specialization: ",
+        SummarizeNodeDef(func_node));
+  }
 
-  return is_direct_function_call && has_inputs && has_outputs &&
-         free_of_side_effects && !marked_specialized &&
-         (!marked_noinline || aggressive);
+  return Status::OK();
 }
 
 // Create an IdentityN node to hook the function inputs to: this ensures that
@@ -886,17 +983,11 @@ NodeDef InlinedFunctionOutputsNode(const NodeDef& func_node,
 
 Status InlineDirectFunctionCall(const NodeDef& func_node,
                                 const FunctionDef& func,
-                                const FunctionOptimizerContext& ctx,
                                 const int graph_def_version,
+                                const FunctionOptimizerContext& ctx,
                                 GraphDef* optimized_graph) {
   VLOG(2) << "Inline direct function call: " << SummarizeNodeDef(func_node);
-
-  // Indirect function calls (via PartitionedCallOp) have automatic control
-  // dependencies, and doesn't need IdentityN nodes before/after inlined
-  // function body, and we inline them separately.
-  if (!IsDirectFunctionCall(func, func_node)) {
-    return errors::InvalidArgument("Can't inline indirect function call");
-  }
+  TF_RETURN_IF_ERROR(IsInlinableDirectFunctionCall(ctx, func, func_node));
 
   const AttrSlice func_instantiation_attr =
       FunctionInstantiationAttributes(func, func_node);
@@ -959,23 +1050,35 @@ Status InlineDirectFunctionCall(const NodeDef& func_node,
     // Make sure the node is placed.
     func_body_node.set_device(func_node.device());
 
-    // Check if a body node is itself a function call and can be inlined.
-    const FunctionDef* func_body_node_func =
-        FindFunctionCall(ctx, func_body_node);
-    if (func_body_node_func != nullptr &&
-        IsInlinableDirectFunctionCall(ctx, *func_body_node_func,
-                                      func_body_node)) {
-      // Recursively inline function calls.
-      TF_RETURN_IF_ERROR(
-          InlineDirectFunctionCall(func_body_node, *func_body_node_func, ctx,
-                                   graph_def_version, optimized_graph));
-    } else {
+    // Move the function body node to the optimized graph.
+    const auto move_node_to_optimized_graph = [&]() {
       // Annotate the node with the function attributes.
       for (const auto& attr : func.attr()) {
         func_body_node.mutable_attr()->insert(attr);
       }
       // Move the node to the main graph.
       optimized_graph->add_node()->Swap(&func_body_node);
+    };
+
+    // Check if a body node is itself a function call and can be inlined.
+    const FunctionDef* func_body_node_func =
+        FindFunctionCall(ctx, func_body_node);
+
+    if (func_body_node_func != nullptr) {
+      Status inlinable = IsInlinableDirectFunctionCall(
+          ctx, *func_body_node_func, func_body_node);
+      if (inlinable.ok()) {
+        TF_RETURN_IF_ERROR(
+            InlineDirectFunctionCall(func_body_node, *func_body_node_func,
+                                     graph_def_version, ctx, optimized_graph));
+      } else {
+        VLOG(2) << "Can't inline nested direct function call: "
+                << inlinable.error_message();
+        move_node_to_optimized_graph();
+      }
+
+    } else {
+      move_node_to_optimized_graph();
     }
   }
 
@@ -1082,9 +1185,350 @@ Status InlineSymbolicGradient(const NodeDef& node,
   return Status::OK();
 }
 
+// -------------------------------------------------------------------------- //
+// Inline indirect functions calls (aka PartitionedCallOp).
+//
+// When we inline indirect function calls, we instantiate the function body from
+// its FunctionDef and caller node attributes, and embed the instantiated graph
+// into the "main graph".
+//
+// In contrast to direct function calls, `PartitionedCallOp` has automatic
+// dependency tracking via input/output control edges, and we relax some of the
+// constraints that we have for direct function call inlining.
+//
+// "When a `PartitionedCallOp` function has a resource (DT_RESOURCE data type)
+// input argument it "captures" the mutable resource.  This is implemented by
+// automatically adding a incoming control edge from the previous side-effectful
+// op touching that resource, and an outgoing control edge to the next
+// side-effectful op using the same resource. This serializes the mutations of
+// the resource to make graph execution deterministic.
+//
+// Function call inlining must preserve side effect visibility:
+//
+// 1) All side effects to the captured resources, that happened before function
+//    call must be visible to the function body nodes using that resources.
+// 2) All side effects to the captured resources, that happened inside function
+//    body, must be visible to every op/function using that resource after the
+//    function call completed.
+
+// To guarantee that these properties are preserved after inlining we do:
+//
+// 1) Forward all input control dependencies from the function call node to the
+//    inlined function inputs (Identity nodes).
+// 2) Each side-effectful op inside function body adds itself as a control
+//    dependency to all the nodes in output control set of function call node.
+//
+// We do not add any other control dependencies to/from function body nodes,
+// because they are pure functions of input tensors, and can be freely
+// reordered.
+
+// Returns `Status::OK()` iff `node` is an indirect function call of `func`, and
+// we know how to inline it into the main graph, otherwise returns and error
+// indicating why the function call is not inlinable.
+Status IsInlinableIndirectFunctionCall(const FunctionOptimizerContext& ctx,
+                                       const FunctionDef& func,
+                                       const NodeDef& func_node) {
+  // We inline direct function calls above, using different rules.
+  if (!IsIndirectFunctionCall(func, func_node)) {
+    return errors::InvalidArgument("Unsupported function call type: ",
+                                   SummarizeNodeDef(func_node));
+  }
+
+  if (MarkedNoInline(func)) {
+    return errors::FailedPrecondition(
+        "Can't inline function marked with '_noinline': ",
+        SummarizeNodeDef(func_node));
+  }
+
+  // Function specialization and inlining must be mutually exclusive.
+  if (MarkedSpecialized(func)) {
+    return errors::FailedPrecondition(
+        "Can't inline function created in Grappler function specialization: ",
+        SummarizeNodeDef(func_node));
+  }
+
+  // We can't inline functions that are in a fetch set, because it would
+  // invalidate fetch tensors (function call node fully inlined and doesn't
+  // exist in the optimized graph).
+  if (ctx.IsFetchNode(func_node.name())) {
+    return errors::FailedPrecondition(
+        "Can't inline function in a Grappler item fetch set: ",
+        SummarizeNodeDef(func_node));
+  }
+
+  // We can't inline functions with `Switch` nodes in the function body, because
+  // they might have dead tensors as a function output argument (we need all
+  // intermediate tensors to compute the function gradient). `PartitionedCallOp`
+  // invokes functions with `allow_dead_tensors = true` to reset dead flag,
+  // and return default initialized tensors instead of a dead tensors.
+  // TODO(ezhulenev): Do the liveness analysis and add
+  // `IdentitytWithResurrection` nodes after all potentially dead output
+  // tensors?
+  if (absl::c_any_of(func.node_def(), IsSwitch)) {
+    return errors::FailedPrecondition(
+        "Can't inline function with `Switch` nodes in the function body: ",
+        SummarizeNodeDef(func_node));
+  }
+
+  // TODO(b/120991525, b/120986912): We need to lower `If` and `While` nodes to
+  // `Switch` nodes after function inlining (one more PRE_PLACEMENT pass?), but
+  // because of the reason described above we are not sure that it's safe, for
+  // now just disable inlining functions with functional control flow.
+  const auto is_functional_ctrl_flow_op = [](const NodeDef& node) {
+    return IsIf(node) || IsWhile(node);
+  };
+  if (absl::c_any_of(func.node_def(), is_functional_ctrl_flow_op)) {
+    return errors::FailedPrecondition(
+        "Can't inline function with `If` or `While` nodes in the function "
+        "body: ",
+        SummarizeNodeDef(func_node));
+  }
+
+  return Status::OK();
+}
+
+Status InlineIndirectFunctionCall(const NodeDef& func_node,
+                                  const FunctionDef& func,
+                                  const int graph_def_version,
+                                  FunctionOptimizerContext* ctx,
+                                  GraphDef* optimized_graph) {
+  VLOG(2) << "Inline indirect function call: " << SummarizeNodeDef(func_node);
+  TF_RETURN_IF_ERROR(IsInlinableIndirectFunctionCall(*ctx, func, func_node));
+
+  const AttrSlice func_instantiation_attr =
+      FunctionInstantiationAttributes(func, func_node);
+
+  GrapplerFunctionItem item;
+  Status item_status = MakeGrapplerFunctionItem(func, func_instantiation_attr,
+                                                ctx->function_library(),
+                                                graph_def_version, &item);
+
+  if (!item_status.ok()) {
+    return errors::InvalidArgument("Failed to inline function ", func_node.op(),
+                                   " instantiated by ", func_node.name(),
+                                   ". Error: ", item_status.error_message());
+  }
+
+  GraphView::InputPort control_input_port =
+      ctx->graph_view().GetInputPort(func_node.name(), Graph::kControlSlot);
+  GraphView::OutputPort control_output_port =
+      ctx->graph_view().GetOutputPort(func_node.name(), Graph::kControlSlot);
+
+  // Nodes that have side effects to the captured resources.
+  std::vector<string> happens_before;
+  absl::c_transform(
+      ctx->graph_view().GetFanin(control_input_port),
+      std::back_inserter(happens_before),
+      [](const GraphView::OutputPort port) { return port.node->name(); });
+
+  VLOG(3) << "Happens before set (size = " << happens_before.size()
+          << "): " << absl::StrJoin(happens_before, ", ");
+
+  // Nodes that must observe side effects to the captured resources.
+  std::vector<string> happens_after;
+  absl::c_transform(
+      ctx->graph_view().GetFanout(control_output_port),
+      std::back_inserter(happens_after),
+      [](const GraphView::InputPort port) { return port.node->name(); });
+
+  VLOG(3) << "Happens after set (size = " << happens_after.size()
+          << "): " << absl::StrJoin(happens_after, ", ");
+
+  // Regular (positional) inputs to the function call.
+  std::vector<SafeTensorId> inputs;
+  for (const string& input : func_node.input()) {
+    SafeTensorId tensor_id = ParseTensorName(input);
+    if (tensor_id.index() == Graph::kControlSlot) break;
+    inputs.push_back(tensor_id);
+  }
+
+  // If we have a node inside the function body without inputs (e.g. Const), we
+  // must attach a control dependency to it, to make sure that if a function
+  // call happens inside a loop, the node will be evaluated in correct frame.
+  //
+  // If the function call node has no inputs and no control dependencies, it
+  // means that it can't be a function call inside a loop, and we can safely
+  // insert that node without inputs into the main graph.
+  //
+  // TODO(ezhulenev): Use FrameMap (see grappler/utils/frame.h) to find out if
+  // the function is called inside a loop.
+  std::vector<string> empty_inputs_hook;
+  if (!item.inputs().empty()) {
+    const InputArgExpansion& arg0 = item.inputs()[0];
+    DCHECK(!arg0.placeholders.empty());
+    empty_inputs_hook.push_back(AsControlDependency(AddPrefixToNodeName(
+        arg0.placeholders[0], /*prefix=*/func_node.name())));
+  } else if (!happens_before.empty()) {
+    empty_inputs_hook.push_back(AsControlDependency(happens_before[0]));
+  }
+
+  // Mapping from input placeholder name to function input position.
+  int idx = 0;
+  absl::flat_hash_map<absl::string_view, int> input_placeholders_idx;
+  for (const InputArgExpansion& input_arg : item.inputs()) {
+    for (const string& placeholder : input_arg.placeholders) {
+      input_placeholders_idx[placeholder] = idx++;
+    }
+  }
+
+  const string prefix = strings::StrCat(func_node.name(), "/");
+
+  // ------------------------------------------------------------------------ //
+  // First we need to assign device placements to all function body nodes.
+
+  GraphDef placed_graph_def;
+
+  const DeviceSet* devices = ctx->devices();
+
+  if (devices->devices().empty()) {
+    // If there are no devices available for placer, we just put all nodes to
+    // the same device as a function caller node. This can happen if Grappler is
+    // running "offline", without active runtime session, for example as a part
+    // of a batch job for graph analysis/optimization.
+    VLOG(3) << "Assign function call node device to all function body nodes. "
+            << "Device: " << func_node.device();
+    placed_graph_def = item.mutable_function_body();
+    for (NodeDef& node : *placed_graph_def.mutable_node()) {
+      node.set_device(func_node.device());
+    }
+  } else {
+    // If we are running in an active runtime session, Grappler will get the
+    // graph after initial placing is done, and we should have devices for the
+    // placer.
+    VLOG(3) << "Run placer for instantiated function body. Devices: ["
+            << absl::StrJoin(
+                   devices->devices(), ", ",
+                   [](string* out, const Device* d) { out->append(d->name()); })
+            << "]";
+
+    // Construct a Graph object from the instantiated function body.
+    GraphConstructorOptions opts;
+    Graph graph(ctx->function_library());
+    TF_RETURN_IF_ERROR(
+        ConvertGraphDefToGraph(opts, item.function_body(), &graph));
+
+    // Use function caller node device as a default for placer.
+    const Device* default_device =
+        devices->FindDeviceByName(func_node.device());
+
+    Placer placer(&graph, devices, nullptr, /* No session options */
+                  default_device);
+    TF_RETURN_IF_ERROR(placer.Run());
+
+    // Convert Graph back to the GraphDef.
+    graph.ToGraphDef(&placed_graph_def);
+  }
+
+  // ------------------------------------------------------------------------ //
+  // After all nodes placed we need to prepare them for inlining into the
+  // optimized graph: turn placeholders into identities, update nodes
+  // connectivity, etc...
+
+  for (NodeDef& func_body_node : *placed_graph_def.mutable_node()) {
+    if (item.IsInputPlaceholder(func_body_node.name())) {
+      // Turn input placeholders into identity node.
+      DCHECK_EQ(0, func_body_node.input_size());
+      func_body_node.set_op("Identity");
+      (*func_body_node.mutable_attr())["T"] = func_body_node.attr().at("dtype");
+      func_body_node.mutable_attr()->erase("dtype");
+      func_body_node.mutable_attr()->erase("shape");
+      int input_idx = input_placeholders_idx[func_body_node.name()];
+      func_body_node.add_input(strings::StrCat(inputs[input_idx].ToString()));
+
+      // All side effects must happen before inputs can start executing.
+      for (const string& hb_node : happens_before) {
+        func_body_node.add_input(AsControlDependency(hb_node));
+      }
+
+    } else {
+      // Update inputs of the regular function body nodes.
+      for (string& input : *func_body_node.mutable_input()) {
+        input = AddPrefixToNodeName(input, /*prefix=*/func_node.name());
+      }
+      if (func_body_node.input_size() == 0 && !empty_inputs_hook.empty()) {
+        *func_body_node.add_input() = empty_inputs_hook[0];
+      }
+    }
+
+    // Add the function node name as a prefix 1) to node name to avoid
+    // collisions; 2) to frame name to avoid multiple LoopCond nodes in one
+    // frame after inlining.
+    TF_RETURN_IF_ERROR(
+        AddPrefixAndSuffixToNode(prefix, /*suffix=*/"", &func_body_node));
+
+    // After inlining into the optimized graph, NodeDef must have all attributes
+    // defined, which is not required for a node in a FunctionDef.
+    const OpDef* op_def;
+    TF_RETURN_IF_ERROR(
+        ctx->function_library().LookUpOpDef(func_body_node.op(), &op_def));
+    AddDefaultsToNodeDef(*op_def, &func_body_node);
+  }
+
+  // Construct a graph view for the preprocessed function body graph.
+  GraphView placed_graph_view(&placed_graph_def);
+
+  // Keep track of side-effectful ops inside function body. Each outgoing
+  // control edge from the function call node, must be replaced with control
+  // edges from inlined side-effectful ops.
+  std::vector<string> side_effectful_nodes;
+
+  // We have to make sure that all side-effectful nodes inside a function body
+  // will be executed after function inlining.
+  for (NodeDef& func_body_node : *placed_graph_def.mutable_node()) {
+    if (!IsFreeOfSideEffect(func_body_node, &ctx->function_library())) {
+      int num_fanouts = placed_graph_view.NumFanouts(
+          func_body_node, /*include_controlling_nodes=*/true);
+
+      // If the node doesn't have any outgoing edges and we do not have any
+      // nodes in the `happens_after` set, we can't inline a function and
+      // guarantee that side-effects will be executed. The only exception if we
+      // do function library optimization, and the GrapplerItem was constructed
+      // for the function body, because functions have strict semantics.
+
+      if (num_fanouts == 0 && happens_after.empty() &&
+          ctx->allowed_optimizations().prune_ops_with_side_effects) {
+        return errors::Internal(
+            "Can't inline a function with a side-effectful op with empty "
+            "fanouts and empty output control edge set. Function body node: ",
+            SummarizeNodeDef(func_body_node));
+      }
+
+      side_effectful_nodes.push_back(func_body_node.name());
+    }
+  }
+
+  // Move all the nodes to the optimized graph after successful preprocessing.
+  for (NodeDef& func_body_node : *placed_graph_def.mutable_node()) {
+    optimized_graph->add_node()->Swap(&func_body_node);
+  }
+
+  // TODO(ezhulenev): Inline nested indirect function calls.
+
+  // Indirect function call is fully inlined into the optimized graph, and we do
+  // not copy the original function call node, so we have to setup tensor
+  // mapping from old output tensors, to the outputs of inlined nodes.
+  int output_idx = 0;
+  for (const OutputArgExpansion& output : item.outputs()) {
+    for (const string& output_tensor : output.output_tensors) {
+      const SafeTensorId from_tensor(func_node.name(), output_idx++);
+      const SafeTensorId to_tensor = ParseTensorName(
+          AddPrefixToNodeName(output_tensor, /*prefix=*/func_node.name()));
+      ctx->AddTensorMapping(from_tensor, to_tensor);
+    }
+  }
+
+  // After inlining we'll have to forward all control dependencies from function
+  // call node to all side-effectful ops inside function body.
+  ctx->AddControlOverrides(func_node, side_effectful_nodes);
+
+  VLOG(3) << "Successfully inlined indirect function call: "
+          << SummarizeNodeDef(func_node);
+  return Status::OK();
+}
+
 }  // namespace
 
-Status FunctionOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
+Status FunctionOptimizer::Optimize(Cluster*, const GrapplerItem& item,
                                    GraphDef* optimized_graph) {
   // Nothing to do here.
   if (item.graph.library().function_size() == 0) {
@@ -1149,17 +1593,36 @@ Status FunctionOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
 
     if (func != nullptr) {
       const string& func_name = func->signature().name();
+      const int graph_def_version = item.graph.versions().producer();
+
+      const bool is_direct_func = IsDirectFunctionCall(*func, node);
+      const bool is_indirect_func = IsIndirectFunctionCall(*func, node);
 
       // 2a. Inline direct function call if it's inlinable.
-      if (inline_func && IsInlinableDirectFunctionCall(ctx, *func, node)) {
-        // Inline function body into the optimized graph}
-        TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED(InlineDirectFunctionCall(
-            node, *func, ctx, item.graph.versions().producer(),
-            optimized_graph));
-        continue;
+      if (inline_func && is_direct_func) {
+        Status inlinable = IsInlinableDirectFunctionCall(ctx, *func, node);
+        if (inlinable.ok()) {
+          TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED(InlineDirectFunctionCall(
+              node, *func, graph_def_version, ctx, optimized_graph));
+          continue;
+        } else {
+          VLOG(2) << inlinable.error_message();
+        }
+      }
+
+      // 2b. Inline indirect function call if it's inlinable.
+      if (inline_func && is_indirect_func) {
+        Status inlinable = IsInlinableIndirectFunctionCall(ctx, *func, node);
+        if (inlinable.ok()) {
+          TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED(InlineIndirectFunctionCall(
+              node, *func, graph_def_version, &ctx, optimized_graph));
+          continue;
+        } else {
+          VLOG(2) << inlinable.error_message();
+        }
       }
 
-      // 2b. Specialize it to it's instantiation context if can't be inlined,
+      // 2c. Specialize it to its instantiation context if can't be inlined,
       // and it has something worth specializing.
       bool specialization_worthy = IsParametrized(*func) ||
                                    HasTrulyConstInputs(node, ctx) ||
@@ -1205,6 +1668,57 @@ Status FunctionOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
     }
   }
 
+  // Function inlining instantiates function body directly into the optimized
+  // graph, and we might end up with control dependencies to the nodes that no
+  // longer exist in a graph. We need to apply control overrides to all
+  // invalidated nodes, and rewire control dependencies to the inlined
+  // side-effectful function body nodes.
+
+  // TODO(ezhulenev): With nested function call inlining, single pass over
+  // `control_overrides` might not bring the graph into a valid state,
+  // continue until it converges and all invalidated control dependencies
+  // removed.
+
+  if (!ctx.control_overrides().empty()) {
+    for (NodeDef& node : *optimized_graph->mutable_node()) {
+      // Keep track of new control inputs to the node.
+      gtl::FlatSet<string> add_ctrl_inputs;
+
+      // Remove all invalidated control inputs.
+      for (int idx = 0; idx < node.input_size(); /* see below */) {
+        // TODO(ezhulenev): Use non-allocating TensorId after migrating
+        // `control_overrides()` to absl::flat_hash_set.
+        SafeTensorId input_tensor = ParseTensorName(node.input(idx));
+
+        auto overrides = ctx.control_overrides().find(input_tensor.node());
+        if (overrides != ctx.control_overrides().end()) {
+          // If this happens it's a bug in the function inlining.
+          if (input_tensor.index() != Graph::kControlSlot) {
+            return errors::Internal(
+                "Illegal input edge from inlined function call node");
+          }
+          // Remove control dependency to the inlined function call node.
+          node.mutable_input()->SwapElements(idx, node.input_size() - 1);
+          node.mutable_input()->RemoveLast();
+
+          // Keep track of all overrides.
+          for (const string& override : overrides->second) {
+            add_ctrl_inputs.insert(AsControlDependency(override));
+          }
+        } else {
+          // Go to the next input only if the current one was not invalidated,
+          // otherwise we need to check the swapped input as well.
+          ++idx;
+        }
+      }
+
+      // Add overrides to the node inputs.
+      for (const string& ctrl_input : add_ctrl_inputs) {
+        node.add_input(ctrl_input);
+      }
+    }
+  }
+
   *optimized_graph->mutable_versions() = item.graph.versions();
   *optimized_graph->mutable_library() =
       options_.enable_trim_function_library
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
index 9bb51c2641937fab68600fecddc5cbc439d4f4b2..c971eec3f4dae5cc3457ad802700ee4f3086eb90 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
@@ -108,7 +108,7 @@ TEST_F(FunctionOptimizerTest, InlineFunction_SimpleFunction) {
   item.fetch = {"z"};
   item.feed.emplace_back("x", pi);
   auto tensors_expected = EvaluateFetchNodes(item);
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 }
@@ -184,7 +184,7 @@ TEST_F(FunctionOptimizerTest, InlineFunction_SkipErrorsIfGraphNotModified) {
   item.fetch = {"z1"};
   item.feed.emplace_back("x", pi);
   auto tensors_expected = EvaluateFetchNodes(item);
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 }
@@ -284,7 +284,7 @@ TEST_F(FunctionOptimizerTest, InlineFunction_FixedTypeFunction) {
   item.fetch = {"z"};
   item.feed.emplace_back("x", pi);
   auto tensors_expected = EvaluateFetchNodes(item);
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 }
@@ -368,7 +368,7 @@ TEST_F(FunctionOptimizerTest, InlineFunction_FunctionWithOutputMapping) {
   item.fetch = {"z"};
   item.feed.emplace_back("x", pi);
   auto tensors_expected = EvaluateFetchNodes(item);
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 }
@@ -418,7 +418,7 @@ TEST_F(FunctionOptimizerTest, InlineFunction_FunctionWithInputForwarding) {
   item.feed.emplace_back("x4", test::AsScalar<float>(-1.0f));
   item.feed.emplace_back("x3", test::AsScalar<int>(1234));
   auto tensors_expected = EvaluateFetchNodes(item);
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
   test::ExpectTensorEqual<float>(tensors_expected[1], tensors[1]);
@@ -549,7 +549,7 @@ TEST_F(FunctionOptimizerTest, InlineFunction_FunctionWithNestedFunctionCall) {
   item.feed.emplace_back("a", test::AsScalar<float>(2.0f));
   auto tensors_expected = EvaluateFetchNodes(item);
 
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
 
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
@@ -699,6 +699,331 @@ TEST_F(FunctionOptimizerTest, InlineSymbolicGradient_NoInlineFunc) {
   CompareGraphs(item.graph, output);
 }
 
+TEST_F(FunctionOptimizerTest, InlineIndirectFunctionSimpleFunction) {
+  using test::function::NDef;
+  using FDH = FunctionDefHelper;
+
+  FunctionOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+
+  FunctionDef mul_func = FunctionDefHelper::Create(
+      "MyMul", {"x:T", "y:T"}, {"z:T"}, {"T: {float, double}"},
+      {{{"mul"}, "Mul", {"x", "y"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "mul:z:0"}});
+
+  // Build a graph to compute c = MyMul(a, b)
+  GrapplerItem item;
+  item.fetch = {"d"};
+  item.graph = test::function::GDef(
+      {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("b", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("c", "PartitionedCall", {"a", "b"},
+            {{"Tin", DataTypeSlice{DT_FLOAT, DT_FLOAT}},
+             {"Tout", DataTypeSlice{DT_FLOAT}},
+             {"f", FDH::FunctionRef("MyMul", {{"T", DT_FLOAT}})}},
+            kDevice),
+       NDef("d", "Identity", {"c"}, {{"T", DT_FLOAT}}, kDevice)},
+      // Function library.
+      {mul_func} /* Function library */);
+
+  GraphDef optimized_graph;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &optimized_graph));
+
+  GraphDef expected = test::function::GDef(
+      {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("b", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+
+       // Function must be inlined and all nodes placed on a valid device.
+       NDef("c/x", "Identity", {"a:0"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("c/y", "Identity", {"b:0"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("c/mul", "Mul", {"c/x", "c/y"}, {{"T", DT_FLOAT}}, kDevice),
+
+       NDef("d", "Identity", {"c/mul:0"}, {{"T", DT_FLOAT}}, kDevice)},
+      // Function library.
+      {mul_func});
+
+  CompareGraphs(expected, optimized_graph);
+
+  Tensor pi = test::AsScalar<float>(3.14f);
+  item.feed.emplace_back("a", pi);
+  item.feed.emplace_back("b", pi);
+
+  GrapplerItem optimized = item.WithGraph(std::move(optimized_graph));
+  auto tensors_expected = EvaluateFetchNodes(item);
+  auto tensors = EvaluateFetchNodes(optimized);
+  ASSERT_EQ(tensors_expected.size(), 1);
+  ASSERT_EQ(tensors.size(), tensors_expected.size());
+  test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
+}
+
+TEST_F(FunctionOptimizerTest, InlineIndirectFunctionWithControlDependencies) {
+  using test::function::NDef;
+  using FDH = FunctionDefHelper;
+
+  FunctionOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+
+  const Tensor kOne = test::AsScalar<float>(1.0);
+  const Tensor kTwo = test::AsScalar<float>(2.0);
+  const TensorShape scalar = TensorShape({});
+
+  // Compute `x*y` and add `1.0` to the variable.
+  FunctionDef mul_func = FunctionDefHelper::Create(
+      "MyMul", {"x:T", "y:T", "v: resource"}, {"z:T"}, {"T: {float, double}"},
+      {{{"one"}, "Const", {}, {{"value", kOne}, {"dtype", DT_FLOAT}}},
+       {{"add"},
+        "AssignAddVariableOp",
+        {"v", "one:output:0"},
+        {{"dtype", DT_FLOAT}}},
+       {{"mul"}, "Mul", {"x", "y", "^add"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "mul:z:0"}});
+
+  // Build a graph to compute:
+  //   a = Placeholder
+  //   b = Placeholder
+  //   v = VarHandleOp(init = a)
+  //   f1 = MyMul(a, b, v)
+  //   f2 = MyMul(f1, f1, v)
+  //   return [f2, v]
+  GrapplerItem item;
+  item.fetch = {"out_1", "out_2"};
+  item.graph = test::function::GDef(
+      {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("b", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+
+       // Initialize variable with one of the placeholders.
+       NDef("v", "VarHandleOp", {}, {{"dtype", DT_FLOAT}, {"shape", scalar}}),
+       NDef("init_v", "AssignVariableOp", {"v", "a"}, {{"dtype", DT_FLOAT}},
+            kDevice),
+
+       // Call function first time.
+       NDef("f1", "PartitionedCall", {"a", "b", "v", "^init_v"},
+            {{"Tin", DataTypeSlice{DT_FLOAT, DT_FLOAT, DT_RESOURCE}},
+             {"Tout", DataTypeSlice{DT_FLOAT}},
+             {"f", FDH::FunctionRef("MyMul", {{"T", DT_FLOAT}})}},
+            kDevice),
+
+       // Call function second time.
+       NDef("f2", "PartitionedCall", {"f1", "f1", "v", "^f1"},
+            {{"Tin", DataTypeSlice{DT_FLOAT, DT_FLOAT, DT_RESOURCE}},
+             {"Tout", DataTypeSlice{DT_FLOAT}},
+             {"f", FDH::FunctionRef("MyMul", {{"T", DT_FLOAT}})}},
+            kDevice),
+
+       // Return result of multiplication and a current value of the variable.
+       NDef("out_1", "Identity", {"f2"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("out_2", "ReadVariableOp", {"v", "^f1", "^f2"},
+            {{"dtype", DT_FLOAT}}, kDevice)},
+
+      // Function library.
+      {mul_func});
+
+  GraphDef optimized_graph;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &optimized_graph));
+
+  GraphDef expected = test::function::GDef(
+      {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("b", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+
+       // Initialize variable with one of the placeholders.
+       NDef("v", "VarHandleOp", {}, {{"dtype", DT_FLOAT}, {"shape", scalar}}),
+       NDef("init_v", "AssignVariableOp", {"v", "a"}, {{"dtype", DT_FLOAT}},
+            kDevice),
+
+       // Function body of a first function call inlined into the graph.
+       NDef("f1/x", "Identity", {"a:0", "^init_v"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("f1/y", "Identity", {"b:0", "^init_v"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("f1/v", "Identity", {"v:0", "^init_v"}, {{"T", DT_RESOURCE}},
+            kDevice),
+       NDef("f1/one", "Const", {"^f1/x"},
+            {{"dtype", DT_FLOAT}, {"value", kOne}}, kDevice),
+       NDef("f1/add", "AssignAddVariableOp", {"f1/v", "f1/one"},
+            {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("f1/mul", "Mul", {"f1/x", "f1/y", "^f1/add"}, {{"T", DT_FLOAT}},
+            kDevice),
+
+       // Function body of a second function call also inlined into the graph,
+       // and input nodes read directly from the inlined nodes of the first
+       // function call.
+       NDef("f2/x", "Identity", {"f1/mul:0", "^f1/add"}, {{"T", DT_FLOAT}},
+            kDevice),
+       NDef("f2/y", "Identity", {"f1/mul:0", "^f1/add"}, {{"T", DT_FLOAT}},
+            kDevice),
+       NDef("f2/v", "Identity", {"v:0", "^f1/add"}, {{"T", DT_RESOURCE}},
+            kDevice),
+       NDef("f2/one", "Const", {"^f2/x"},
+            {{"dtype", DT_FLOAT}, {"value", kOne}}, kDevice),
+       NDef("f2/add", "AssignAddVariableOp", {"f2/v", "f2/one"},
+            {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("f2/mul", "Mul", {"f2/x", "f2/y", "^f2/add"}, {{"T", DT_FLOAT}},
+            kDevice),
+
+       // Return values read directly from inlined nodes.
+       NDef("out_1", "Identity", {"f2/mul:0"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("out_2", "ReadVariableOp", {"v", "^f1/add", "^f2/add"},
+            {{"dtype", DT_FLOAT}}, kDevice)},
+
+      // Function library.
+      {mul_func});
+
+  CompareGraphs(expected, optimized_graph);
+
+  item.feed.emplace_back("a", kOne);
+  item.feed.emplace_back("b", kTwo);
+
+  auto tensors_expected = EvaluateFetchNodes(item);
+  ASSERT_EQ(tensors_expected.size(), 2);
+  EXPECT_EQ(tensors_expected[0].flat<float>()(0), 4.0);  // mul
+  EXPECT_EQ(tensors_expected[1].flat<float>()(0), 3.0);  // read variable
+
+  GrapplerItem optimized = item.WithGraph(std::move(optimized_graph));
+  auto tensors = EvaluateFetchNodes(optimized);
+  ASSERT_EQ(tensors.size(), 2);
+  test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
+  test::ExpectTensorEqual<float>(tensors_expected[1], tensors[1]);
+}
+
+TEST_F(FunctionOptimizerTest, InlineIndirectFunctionWithDevicePlacement) {
+  using test::function::NDef;
+  using FDH = FunctionDefHelper;
+
+  FunctionOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+
+  FunctionDef mul_func = FunctionDefHelper::Create(
+      "MyMul", {"x:T", "y:T"}, {"z:T"}, {"T: {float, double}"},
+      {{{"mul"}, "Mul", {"x", "y"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "mul:z:0"}});
+  // Add device placement spec to the function body node.
+  (*mul_func.mutable_node_def())[0].set_device("/device:CPU:1");
+
+  // We need fully defined device names to run the placer for inlined function.
+  const string cpu0 = "/job:work/replica:1/task:1/device:CPU:0";
+  const string cpu1 = "/job:work/replica:1/task:1/device:CPU:1";
+
+  // Build a graph to compute c = MyMul(a, b)
+  GrapplerItem item;
+  item.fetch = {"d"};
+  item.graph = test::function::GDef(
+      {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, cpu0),
+       NDef("b", "Placeholder", {}, {{"dtype", DT_FLOAT}}, cpu1),
+       NDef("c", "PartitionedCall", {"a", "b"},
+            {{"Tin", DataTypeSlice{DT_FLOAT, DT_FLOAT}},
+             {"Tout", DataTypeSlice{DT_FLOAT}},
+             {"f", FDH::FunctionRef("MyMul", {{"T", DT_FLOAT}})}},
+            cpu0),
+       NDef("d", "Identity", {"c"}, {{"T", DT_FLOAT}}, cpu0)},
+      // Function library.
+      {mul_func});
+  ASSERT_TRUE(item.InferDevicesFromGraph().ok());
+
+  GraphDef optimized_graph;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &optimized_graph));
+
+  GraphDef expected = test::function::GDef(
+      {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, cpu0),
+       NDef("b", "Placeholder", {}, {{"dtype", DT_FLOAT}}, cpu1),
+
+       // Function must be inlined and `mul` node placed on a requested device.
+       NDef("c/x", "Identity", {"a:0"}, {{"T", DT_FLOAT}}, cpu1),
+       NDef("c/y", "Identity", {"b:0"}, {{"T", DT_FLOAT}}, cpu1),
+       NDef("c/mul", "Mul", {"c/x", "c/y"}, {{"T", DT_FLOAT}}, cpu1),
+
+       NDef("d", "Identity", {"c/mul:0"}, {{"T", DT_FLOAT}}, cpu0)},
+      // Function library.
+      {mul_func});
+
+  CompareGraphs(expected, optimized_graph);
+}
+
+TEST_F(FunctionOptimizerTest, InlineIndirectFunctionWithoutSideEffects) {
+  using test::function::NDef;
+  using FDH = FunctionDefHelper;
+
+  FunctionOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+
+  const Tensor kOne = test::AsScalar<float>(1.0);
+  const Tensor kTwo = test::AsScalar<float>(2.0);
+  const TensorShape scalar = TensorShape({});
+
+  // MyMul doesn't have any side-effectful nodes in the function body, but the
+  // optimized graph has a control dependency edge `f1->f2`.
+  FunctionDef mul_func = FunctionDefHelper::Create(
+      "MyMul", {"x:T", "y:T"}, {"z:T"}, {"T: {float, double}"},
+      {{{"mul"}, "Mul", {"x", "y"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "mul:z:0"}});
+
+  // Build a graph to compute:
+  //   a = Placeholder
+  //   b = Placeholder
+  //   f1 = MyMul(a, b)
+  //   f2 = MyMul(a, b, ^f1)  <-- control dependency on inlined function!
+  //   return f2
+  GrapplerItem item;
+  item.fetch = {"out"};
+  item.graph = test::function::GDef(
+      {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("b", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+
+       // Call function first time.
+       NDef("f1", "PartitionedCall", {"a", "b"},
+            {{"Tin", DataTypeSlice{DT_FLOAT, DT_FLOAT}},
+             {"Tout", DataTypeSlice{DT_FLOAT}},
+             {"f", FDH::FunctionRef("MyMul", {{"T", DT_FLOAT}})}},
+            kDevice),
+
+       // Call function second time.
+       NDef("f2", "PartitionedCall", {"f1", "f1", "^f1"},
+            {{"Tin", DataTypeSlice{DT_FLOAT, DT_FLOAT}},
+             {"Tout", DataTypeSlice{DT_FLOAT}},
+             {"f", FDH::FunctionRef("MyMul", {{"T", DT_FLOAT}})}},
+            kDevice),
+
+       // Return result of f2.
+       NDef("out", "Identity", {"f2"}, {{"T", DT_FLOAT}}, kDevice)},
+
+      // Function library.
+      {mul_func});
+
+  GraphDef optimized_graph;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &optimized_graph));
+
+  GraphDef expected = test::function::GDef(
+      {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("b", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+
+       // Function body of a first function call inlined into the graph.
+       NDef("f1/x", "Identity", {"a:0"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("f1/y", "Identity", {"b:0"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("f1/mul", "Mul", {"f1/x", "f1/y"}, {{"T", DT_FLOAT}}, kDevice),
+
+       // Function body of a second function call also inlined into the graph,
+       // and input nodes read directly from the inlined nodes of the first
+       // function call, and control dependency edge removed.
+       NDef("f2/x", "Identity", {"f1/mul:0"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("f2/y", "Identity", {"f1/mul:0"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("f2/mul", "Mul", {"f2/x", "f2/y"}, {{"T", DT_FLOAT}}, kDevice),
+
+       // Return directly from inlined node of f2.
+       NDef("out", "Identity", {"f2/mul:0"}, {{"T", DT_FLOAT}}, kDevice)},
+
+      // Function library.
+      {mul_func});
+
+  CompareGraphs(expected, optimized_graph);
+
+  item.feed.emplace_back("a", kOne);
+  item.feed.emplace_back("b", kTwo);
+
+  auto tensors_expected = EvaluateFetchNodes(item);
+  ASSERT_EQ(tensors_expected.size(), 1);
+
+  GrapplerItem optimized = item.WithGraph(std::move(optimized_graph));
+  auto tensors = EvaluateFetchNodes(optimized);
+  test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
+}
+
 TEST_F(FunctionOptimizerTest, SpecializeFunctionXTimesTwo) {
   using test::function::NDef;
 
@@ -742,7 +1067,7 @@ TEST_F(FunctionOptimizerTest, SpecializeFunctionXTimesTwo) {
   item.feed.emplace_back("x", pi);
 
   auto tensors_expected = EvaluateFetchNodes(item);
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 }
@@ -806,7 +1131,7 @@ TEST_F(FunctionOptimizerTest, SpecializeIndirectFunctionXTimesTwo) {
   item.feed.emplace_back("x", pi);
 
   auto tensors_expected = EvaluateFetchNodes(item);
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 }
@@ -869,7 +1194,7 @@ TEST_F(FunctionOptimizerTest, SpecializeFunctionPushDownConstInput) {
   item.feed.emplace_back("x", pi);
 
   auto tensors_expected = EvaluateFetchNodes(item);
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 }
@@ -949,7 +1274,7 @@ TEST_F(FunctionOptimizerTest, SpecializeIndirectFunctionPushDownConstInput) {
   item.feed.emplace_back("x", pi);
 
   auto tensors_expected = EvaluateFetchNodes(item);
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 }
@@ -1065,7 +1390,7 @@ TEST_F(FunctionOptimizerTest, SpecializeFunction_OncePerUniqueContext) {
   item.feed = {{"xf", pi}, {"yf", pi}, {"xi", four}, {"yi", four}};
 
   auto tensors_expected = EvaluateFetchNodes(item);
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
 
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
@@ -1174,7 +1499,7 @@ TEST_F(FunctionOptimizerTest, SpecializeFunctionForUsedOutputTensors) {
   item.feed = {{"xf", pi}, {"yf", pi}};
 
   auto tensors_expected = EvaluateFetchNodes(item);
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
 
   ASSERT_EQ(tensors_expected.size(), tensors.size());
@@ -1335,7 +1660,7 @@ TEST_F(FunctionOptimizerTest, SpecializeIndirectFunctionForUsedOutputTensors) {
   item.feed = {{"xf", pi}, {"yf", pi}};
 
   auto tensors_expected = EvaluateFetchNodes(item);
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
 
   ASSERT_EQ(tensors_expected.size(), tensors.size());
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer.cc b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
index f4653505f71c537b708ce99a62bedb24cbcb06ca..8f25a1c8c1c48281fb44c01a142348863836d5aa 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
@@ -29,7 +29,6 @@ limitations under the License.
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/optimizers/layout_optimizer.h"
-#include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/grappler/utils/frame.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/str_util.h"
@@ -1969,9 +1968,9 @@ class DataLayoutOptimizer : GraphProcessor {
   // Expand all nodes which is in NHWC, but supports NCHW or is layout agnostic.
   Status Expand() {
     int node_size_original = graph_->node_size();
-    std::unordered_map<const NodeDef*, std::vector<int>> frames;
-    int num_frames;
-    TF_RETURN_IF_ERROR(IdentifyFrames(*graph_, &frames, &num_frames));
+
+    FrameView frame_view;
+    TF_RETURN_IF_ERROR(frame_view.InferFromGraph(*graph_));
 
     // This is the first pass where we expand the nodes which support NCHW.
     std::set<string> ops_format_supported = GetOpsFormatSupported();
@@ -1983,7 +1982,7 @@ class DataLayoutOptimizer : GraphProcessor {
       if (ops_format_supported.find(graph_->node(i).op()) !=
           ops_format_supported.end()) {
         auto node = graph_->mutable_node(i);
-        bool is_in_frame = !frames[node].empty();
+        bool is_in_frame = frame_view.IsInFrame(*node);
         OptimizeContext opt_cxt(graph_, node, node_map_, graph_properties_,
                                 virtual_placer_, nodes_to_preserve_,
                                 is_in_frame);
@@ -2033,7 +2032,7 @@ class DataLayoutOptimizer : GraphProcessor {
         if (ops_format_agnostic.find(graph_->node(i).op()) !=
             ops_format_agnostic.end()) {
           auto node = graph_->mutable_node(i);
-          bool is_in_frame = !frames[node].empty();
+          bool is_in_frame = frame_view.IsInFrame(*node);
           OptimizeContext opt_cxt(graph_, node, node_map_, graph_properties_,
                                   virtual_placer_, nodes_to_preserve_,
                                   is_in_frame);
diff --git a/tensorflow/core/grappler/optimizers/loop_optimizer.cc b/tensorflow/core/grappler/optimizers/loop_optimizer.cc
index 775fb9a95f2a7107d013bfafa3779ef465138b20..36064738408c744db53cb9e95645d6a2968b1746 100644
--- a/tensorflow/core/grappler/optimizers/loop_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/loop_optimizer.cc
@@ -35,7 +35,6 @@ limitations under the License.
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
 #include "tensorflow/core/grappler/optimizers/evaluation_utils.h"
-#include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/grappler/utils/frame.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
@@ -380,14 +379,14 @@ Status LoopInvariantNodeMotionOptimizer::FindInvariantNodes(
 
 Status LoopInvariantNodeMotionOptimizer::Optimize() {
   node_map_.reset(new NodeMap(optimized_graph_));
-  FrameMap frame_map;
-  int num_frames;
-  TF_RETURN_IF_ERROR(IdentifyFramesWithNodeMap(*optimized_graph_, *node_map_,
-                                               &frame_map, &num_frames));
+  FrameView frame_view;
+  // TODO(ezhulenev): Use GraphView when migrated from NodeMap.
+  TF_RETURN_IF_ERROR(frame_view.InferFromGraph(*optimized_graph_));
+
   std::deque<int> worklist;
-  for (auto iter = frame_map.begin(); iter != frame_map.end(); ++iter) {
-    auto* node = iter->first;
-    auto& frame_ids = iter->second;
+  for (const NodeDef& node : optimized_graph_->node()) {
+    const std::vector<int>& frame_ids = frame_view.Frames(node);
+
     if (frame_ids.size() >= 3) {
       for (unsigned int i = 1; i < frame_ids.size() - 1; ++i) {
         frame_parent_[frame_ids[i]] = frame_ids[i - 1];
@@ -400,18 +399,18 @@ Status LoopInvariantNodeMotionOptimizer::Optimize() {
     }
     if (!frame_ids.empty()) {
       frame_children_.insert(std::make_pair(frame_ids.back(), empty_set_));
-      if (node->op() == "LoopCond") {
+      if (node.op() == "LoopCond") {
         if (loop_cond_.count(frame_ids.back())) {
           return errors::InvalidArgument(
               "Loop ", frame_ids.back(),
-              " has more than one LoopCond node: ", node->name(), " and ",
+              " has more than one LoopCond node: ", node.name(), " and ",
               loop_cond_[frame_ids.back()]->name());
         }
-        loop_cond_[frame_ids.back()] = node;
+        loop_cond_[frame_ids.back()] = &node;
       }
-      if (IsEnter(*node) && node->attr().at("is_constant").b()) {
+      if (IsEnter(node) && node.attr().at("is_constant").b()) {
         invariant_enters_[frame_ids.back()].push_back(
-            const_cast<NodeDef*>(node));
+            const_cast<NodeDef*>(&node));
       }
     }
   }
diff --git a/tensorflow/core/grappler/optimizers/loop_optimizer.h b/tensorflow/core/grappler/optimizers/loop_optimizer.h
index 7c04f55381edca8f6a6679edb73479414f4c6f0b..d467237a9a704a81a0ecc1da71531868c7f3a49b 100644
--- a/tensorflow/core/grappler/optimizers/loop_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/loop_optimizer.h
@@ -19,7 +19,6 @@ limitations under the License.
 #include <unordered_set>
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
-#include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/grappler/utils/frame.h"
 #include "tensorflow/core/protobuf/rewriter_config.pb.h"
 
diff --git a/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc b/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc
index 7332839128eef23bdf77bcdfc6b22a19413c3dfa..587767c23c370ca1f747fc5b4e2bfa4cba3ae10d 100644
--- a/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc
@@ -101,27 +101,30 @@ TEST_F(LoopOptimizerTest, Basic) {
   LoopOptimizer optimizer;
   EnableOnlyLoopInvariantNodeMotion(&optimizer);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  {  // Original graph.
+    GraphView view(&graph);
+    FrameView frames;
+    TF_EXPECT_OK(frames.InferFromGraphView(view));
+
+    EXPECT_EQ(frames.num_frames(), 1);
+    ASSERT_EQ(frames.Frames(*view.GetNode("InvariantAdd")).size(), 1);
+    EXPECT_EQ(frames.Frames(*view.GetNode("InvariantAdd")).back(), 0);
+    ASSERT_EQ(frames.Frames(*view.GetNode("VariantAdd")).size(), 1);
+    EXPECT_EQ(frames.Frames(*view.GetNode("VariantAdd")).back(), 0);
+  }
 
-  std::unique_ptr<NodeMap> node_map;
-  std::unordered_map<const NodeDef*, std::vector<int>> frames;
-  int num_frames;
-
-  node_map.reset(new NodeMap(&graph));
-  EXPECT_TRUE(IdentifyFrames(graph, &frames, &num_frames).ok());
-  EXPECT_EQ(num_frames, 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).size(), 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).back(), 0);
-  EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd")).size(), 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd")).back(), 0);
-
-  node_map.reset(new NodeMap(&output));
-  EXPECT_TRUE(IdentifyFrames(output, &frames, &num_frames).ok());
-  EXPECT_EQ(num_frames, 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).size(), 0);
-  EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd")).size(), 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd")).back(), 0);
+  {  // Optimized graph.
+    GraphView view(&output);
+    FrameView frames;
+    TF_EXPECT_OK(frames.InferFromGraphView(view));
+
+    EXPECT_EQ(frames.num_frames(), 1);
+    ASSERT_EQ(frames.Frames(*view.GetNode("InvariantAdd")).size(), 0);
+    ASSERT_EQ(frames.Frames(*view.GetNode("VariantAdd")).size(), 1);
+    EXPECT_EQ(frames.Frames(*view.GetNode("VariantAdd")).back(), 0);
+  }
 }
 
 TEST_F(LoopOptimizerTest, Const) {
@@ -149,26 +152,29 @@ TEST_F(LoopOptimizerTest, Const) {
   LoopOptimizer optimizer;
   EnableOnlyLoopInvariantNodeMotion(&optimizer);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  {  // Original graph.
+    GraphView view(&graph);
+    FrameView frames;
+    TF_EXPECT_OK(frames.InferFromGraphView(view));
+
+    EXPECT_EQ(frames.num_frames(), 1);
+    ASSERT_EQ(frames.Frames(*view.GetNode("InvariantAdd")).size(), 1);
+    EXPECT_EQ(frames.Frames(*view.GetNode("InvariantAdd")).back(), 0);
+    ASSERT_EQ(frames.Frames(*view.GetNode("Const")).size(), 1);
+    EXPECT_EQ(frames.Frames(*view.GetNode("Const")).back(), 0);
+  }
+
+  {  // Optimized graph.
+    GraphView view(&output);
+    FrameView frames;
+    TF_EXPECT_OK(frames.InferFromGraphView(view));
 
-  std::unique_ptr<NodeMap> node_map;
-  std::unordered_map<const NodeDef*, std::vector<int>> frames;
-  int num_frames;
-
-  node_map.reset(new NodeMap(&graph));
-  EXPECT_TRUE(IdentifyFrames(graph, &frames, &num_frames).ok());
-  EXPECT_EQ(num_frames, 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).size(), 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).back(), 0);
-  EXPECT_EQ(frames.at(node_map->GetNode("Const")).size(), 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("Const")).back(), 0);
-
-  node_map.reset(new NodeMap(&output));
-  EXPECT_TRUE(IdentifyFrames(output, &frames, &num_frames).ok());
-  EXPECT_EQ(num_frames, 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).size(), 0);
-  EXPECT_EQ(frames.at(node_map->GetNode("Const")).size(), 0);
+    EXPECT_EQ(frames.num_frames(), 1);
+    ASSERT_EQ(frames.Frames(*view.GetNode("InvariantAdd")).size(), 0);
+    ASSERT_EQ(frames.Frames(*view.GetNode("Const")).size(), 0);
+  }
 }
 
 TEST_F(LoopOptimizerTest, ControlOutput) {
@@ -197,24 +203,27 @@ TEST_F(LoopOptimizerTest, ControlOutput) {
   LoopOptimizer optimizer;
   EnableOnlyLoopInvariantNodeMotion(&optimizer);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
 
-  std::unique_ptr<NodeMap> node_map;
-  std::unordered_map<const NodeDef*, std::vector<int>> frames;
-  int num_frames;
-
-  node_map.reset(new NodeMap(&graph));
-  EXPECT_TRUE(IdentifyFrames(graph, &frames, &num_frames).ok());
-  EXPECT_EQ(num_frames, 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).size(), 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).back(), 0);
-
-  node_map.reset(new NodeMap(&output));
-  EXPECT_TRUE(IdentifyFrames(output, &frames, &num_frames).ok());
-  EXPECT_EQ(num_frames, 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).size(), 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).back(), 0);
+  {  // Original graph.
+    GraphView view(&graph);
+    FrameView frames;
+    TF_EXPECT_OK(frames.InferFromGraphView(view));
+
+    EXPECT_EQ(frames.num_frames(), 1);
+    ASSERT_EQ(frames.Frames(*view.GetNode("InvariantAdd")).size(), 1);
+    EXPECT_EQ(frames.Frames(*view.GetNode("InvariantAdd")).back(), 0);
+  }
+
+  {  // Optimized graph.
+    GraphView view(&output);
+    FrameView frames;
+    TF_EXPECT_OK(frames.InferFromGraphView(view));
+
+    EXPECT_EQ(frames.num_frames(), 1);
+    ASSERT_EQ(frames.Frames(*view.GetNode("InvariantAdd")).size(), 1);
+    EXPECT_EQ(frames.Frames(*view.GetNode("InvariantAdd")).back(), 0);
+  }
 }
 
 TEST_F(LoopOptimizerTest, NestedLoop1) {
@@ -258,31 +267,34 @@ TEST_F(LoopOptimizerTest, NestedLoop1) {
   LoopOptimizer optimizer;
   EnableOnlyLoopInvariantNodeMotion(&optimizer);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  {  // Original graph.
+    GraphView view(&graph);
+    FrameView frames;
+    TF_EXPECT_OK(frames.InferFromGraphView(view));
+
+    EXPECT_EQ(frames.num_frames(), 2);
+    ASSERT_EQ(frames.Frames(*view.GetNode("InvariantAdd2")).size(), 2);
+    EXPECT_EQ(frames.Frames(*view.GetNode("InvariantAdd2")).back(), 1);
+    ASSERT_EQ(frames.Frames(*view.GetNode("VariantAdd2")).size(), 2);
+    EXPECT_EQ(frames.Frames(*view.GetNode("VariantAdd2")).back(), 1);
+    ASSERT_EQ(frames.Frames(*view.GetNode("InvariantAdd")).size(), 1);
+    EXPECT_EQ(frames.Frames(*view.GetNode("InvariantAdd")).back(), 0);
+  }
 
-  std::unique_ptr<NodeMap> node_map;
-  std::unordered_map<const NodeDef*, std::vector<int>> frames;
-  int num_frames;
-
-  node_map.reset(new NodeMap(&graph));
-  EXPECT_TRUE(IdentifyFrames(graph, &frames, &num_frames).ok());
-  EXPECT_EQ(num_frames, 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).size(), 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).back(), 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd2")).size(), 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd2")).back(), 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).size(), 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).back(), 0);
-
-  node_map.reset(new NodeMap(&output));
-  EXPECT_TRUE(IdentifyFrames(output, &frames, &num_frames).ok());
-  EXPECT_EQ(num_frames, 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).size(), 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).back(), 0);
-  EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd2")).size(), 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd2")).back(), 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).size(), 0);
+  {  // Optimized graph.
+    GraphView view(&output);
+    FrameView frames;
+    TF_EXPECT_OK(frames.InferFromGraphView(view));
+
+    EXPECT_EQ(frames.num_frames(), 2);
+    ASSERT_EQ(frames.Frames(*view.GetNode("InvariantAdd2")).size(), 1);
+    EXPECT_EQ(frames.Frames(*view.GetNode("InvariantAdd2")).back(), 0);
+    ASSERT_EQ(frames.Frames(*view.GetNode("VariantAdd2")).size(), 2);
+    EXPECT_EQ(frames.Frames(*view.GetNode("VariantAdd2")).back(), 1);
+    ASSERT_EQ(frames.Frames(*view.GetNode("InvariantAdd")).size(), 0);
+  }
 }
 
 TEST_F(LoopOptimizerTest, NestedLoop2) {
@@ -326,27 +338,30 @@ TEST_F(LoopOptimizerTest, NestedLoop2) {
   LoopOptimizer optimizer;
   EnableOnlyLoopInvariantNodeMotion(&optimizer);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  {  // Original graph.
+    GraphView view(&graph);
+    FrameView frames;
+    TF_EXPECT_OK(frames.InferFromGraphView(view));
+
+    EXPECT_EQ(frames.num_frames(), 2);
+    ASSERT_EQ(frames.Frames(*view.GetNode("InvariantAdd2")).size(), 2);
+    EXPECT_EQ(frames.Frames(*view.GetNode("InvariantAdd2")).back(), 1);
+    ASSERT_EQ(frames.Frames(*view.GetNode("VariantAdd2")).size(), 2);
+    EXPECT_EQ(frames.Frames(*view.GetNode("VariantAdd2")).back(), 1);
+  }
+
+  {  // Optimized graph.
+    GraphView view(&output);
+    FrameView frames;
+    TF_EXPECT_OK(frames.InferFromGraphView(view));
 
-  std::unique_ptr<NodeMap> node_map;
-  std::unordered_map<const NodeDef*, std::vector<int>> frames;
-  int num_frames;
-
-  node_map.reset(new NodeMap(&graph));
-  EXPECT_TRUE(IdentifyFrames(graph, &frames, &num_frames).ok());
-  EXPECT_EQ(num_frames, 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).size(), 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).back(), 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd2")).size(), 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd2")).back(), 1);
-
-  node_map.reset(new NodeMap(&output));
-  EXPECT_TRUE(IdentifyFrames(output, &frames, &num_frames).ok());
-  EXPECT_EQ(num_frames, 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).size(), 0);
-  EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd2")).size(), 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd2")).back(), 1);
+    EXPECT_EQ(frames.num_frames(), 2);
+    ASSERT_EQ(frames.Frames(*view.GetNode("InvariantAdd2")).size(), 0);
+    ASSERT_EQ(frames.Frames(*view.GetNode("VariantAdd2")).size(), 2);
+    EXPECT_EQ(frames.Frames(*view.GetNode("VariantAdd2")).back(), 1);
+  }
 }
 
 TEST_F(LoopOptimizerTest, NestedLoopConst1) {
@@ -390,28 +405,31 @@ TEST_F(LoopOptimizerTest, NestedLoopConst1) {
   LoopOptimizer optimizer;
   EnableOnlyLoopInvariantNodeMotion(&optimizer);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  {  // Original graph.
+    GraphView view(&graph);
+    FrameView frames;
+    TF_EXPECT_OK(frames.InferFromGraphView(view));
+
+    EXPECT_EQ(frames.num_frames(), 2);
+    ASSERT_EQ(frames.Frames(*view.GetNode("InvariantAdd2")).size(), 2);
+    EXPECT_EQ(frames.Frames(*view.GetNode("InvariantAdd2")).back(), 1);
+    ASSERT_EQ(frames.Frames(*view.GetNode("Const2")).size(), 2);
+    EXPECT_EQ(frames.Frames(*view.GetNode("Const2")).back(), 1);
+  }
+
+  {  // Optimized graph.
+    GraphView view(&output);
+    FrameView frames;
+    TF_EXPECT_OK(frames.InferFromGraphView(view));
 
-  std::unique_ptr<NodeMap> node_map;
-  std::unordered_map<const NodeDef*, std::vector<int>> frames;
-  int num_frames;
-
-  node_map.reset(new NodeMap(&graph));
-  EXPECT_TRUE(IdentifyFrames(graph, &frames, &num_frames).ok());
-  EXPECT_EQ(num_frames, 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).size(), 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).back(), 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("Const2")).size(), 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("Const2")).back(), 1);
-
-  node_map.reset(new NodeMap(&output));
-  EXPECT_TRUE(IdentifyFrames(output, &frames, &num_frames).ok());
-  EXPECT_EQ(num_frames, 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).size(), 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).back(), 0);
-  EXPECT_EQ(frames.at(node_map->GetNode("Const2")).size(), 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("Const2")).back(), 0);
+    EXPECT_EQ(frames.num_frames(), 2);
+    ASSERT_EQ(frames.Frames(*view.GetNode("InvariantAdd2")).size(), 1);
+    EXPECT_EQ(frames.Frames(*view.GetNode("InvariantAdd2")).back(), 0);
+    ASSERT_EQ(frames.Frames(*view.GetNode("Const2")).size(), 1);
+    EXPECT_EQ(frames.Frames(*view.GetNode("Const2")).back(), 0);
+  }
 }
 
 TEST_F(LoopOptimizerTest, NestedLoopConst2) {
@@ -455,26 +473,29 @@ TEST_F(LoopOptimizerTest, NestedLoopConst2) {
   LoopOptimizer optimizer;
   EnableOnlyLoopInvariantNodeMotion(&optimizer);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  {  // Original graph.
+    GraphView view(&graph);
+    FrameView frames;
+    TF_EXPECT_OK(frames.InferFromGraphView(view));
+
+    EXPECT_EQ(frames.num_frames(), 2);
+    ASSERT_EQ(frames.Frames(*view.GetNode("InvariantAdd2")).size(), 2);
+    EXPECT_EQ(frames.Frames(*view.GetNode("InvariantAdd2")).back(), 1);
+    ASSERT_EQ(frames.Frames(*view.GetNode("Const2")).size(), 2);
+    EXPECT_EQ(frames.Frames(*view.GetNode("Const2")).back(), 1);
+  }
 
-  std::unique_ptr<NodeMap> node_map;
-  std::unordered_map<const NodeDef*, std::vector<int>> frames;
-  int num_frames;
-
-  node_map.reset(new NodeMap(&graph));
-  EXPECT_TRUE(IdentifyFrames(graph, &frames, &num_frames).ok());
-  EXPECT_EQ(num_frames, 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).size(), 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).back(), 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("Const2")).size(), 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("Const2")).back(), 1);
-
-  node_map.reset(new NodeMap(&output));
-  EXPECT_TRUE(IdentifyFrames(output, &frames, &num_frames).ok());
-  EXPECT_EQ(num_frames, 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).size(), 0);
-  EXPECT_EQ(frames.at(node_map->GetNode("Const2")).size(), 0);
+  {  // Optimized graph.
+    GraphView view(&output);
+    FrameView frames;
+    TF_EXPECT_OK(frames.InferFromGraphView(view));
+
+    EXPECT_EQ(frames.num_frames(), 2);
+    ASSERT_EQ(frames.Frames(*view.GetNode("InvariantAdd2")).size(), 0);
+    ASSERT_EQ(frames.Frames(*view.GetNode("Const2")).size(), 0);
+  }
 }
 
 void VerifyGraphsEqual(const GraphDef& original_graph,
diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer.cc b/tensorflow/core/grappler/optimizers/memory_optimizer.cc
index 453db5d91e71910405d3bd388c400276c8ae099f..227c2bb8b0f3d3e6809f65f3b3716270b0c2c6e5 100644
--- a/tensorflow/core/grappler/optimizers/memory_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/memory_optimizer.cc
@@ -1306,13 +1306,12 @@ Status RelaxAllocatorConstraints(GraphDef* optimized_graph) {
 
 Status MemoryOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
                                  GraphDef* optimized_graph) {
-  *optimized_graph = item.graph;
+  GrapplerItem optimized_item(item);
 
   RecomputationRewritingPass(optimization_level_,
-                             recomputation_targets_name_scope_, optimized_graph,
-                             item);
+                             recomputation_targets_name_scope_,
+                             &optimized_item.graph, item);
 
-  GrapplerItem optimized_item(item, optimized_graph);
   std::unordered_set<string> skip_list;
   // Bound the number of rewrite passes to avoid long processing times on graphs
   // that simply won't fit in memory.
diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc b/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc
index 75285b07bb9305fb78e37ef4918b3daf997015f6..356b23dec0de7d8648fd92b977413720654f2451 100644
--- a/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc
@@ -279,7 +279,7 @@ TEST_F(MemoryOptimizerTest, SimpleSwapping) {
   EXPECT_EQ("^swap_out_e_0", new_c.input(1));
 
   // Run the optimizer a second time to ensure it's idempotent.
-  GrapplerItem item_copy(item, std::move(output));
+  GrapplerItem item_copy = item.WithGraph(std::move(output));
   status = optimizer.Optimize(cluster.get(), item_copy, &output);
   TF_EXPECT_OK(status);
 
@@ -287,7 +287,7 @@ TEST_F(MemoryOptimizerTest, SimpleSwapping) {
   item.fetch = {"e"};
   item.init_ops = {init.name()};
   auto tensors_expected = EvaluateFetchNodes(item);
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 #endif
@@ -337,7 +337,7 @@ TEST_F(MemoryOptimizerTest, SwappingHeuristics) {
 
 #if GOOGLE_CUDA
   auto tensors_expected = EvaluateFetchNodes(item);
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
   for (int i = 0; i < item.fetch.size(); ++i) {
     test::ExpectTensorEqual<float>(tensors_expected[i], tensors[i]);
@@ -386,7 +386,7 @@ TEST_F(MemoryOptimizerTest, UnswappableInputs) {
 
 #if GOOGLE_CUDA
   auto tensors_expected = EvaluateFetchNodes(item);
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 #endif
@@ -474,7 +474,7 @@ TEST_F(RelaxAllocatorConstraintsTest, SameDevice) {
   item.fetch = {"exp"};
   item.init_ops = {"variable"};
   auto tensors_expected = EvaluateFetchNodes(item);
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 }
@@ -505,7 +505,7 @@ TEST_F(RelaxAllocatorConstraintsTest, DifferentDevice) {
   item.fetch = {"exp"};
   item.init_ops = {"variable"};
   auto tensors_expected = EvaluateFetchNodes(item);
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 #endif
@@ -598,7 +598,7 @@ TEST_F(RelaxAllocatorConstraintsTest, AssignNodeInFanout) {
   item.fetch = {"assign0", "assign1"};
   item.init_ops = {"exp1", "variable1"};
   auto tensors_expected = EvaluateFetchNodes(item);
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
   for (int i = 0; i < tensors_expected.size(); ++i) {
     test::ExpectTensorEqual<float>(tensors_expected[i], tensors[i]);
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
index 304ddc7710abbcc7130dc5c892c18390127def61..7b788c613c9c1c42e62f69bf2dab1122b08c4f9a 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
@@ -440,7 +440,7 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
           item.graph)
           .ToProto();
 
-  GrapplerItem trimmed_item(item, std::move(trimmed_graph));
+  GrapplerItem trimmed_item = item.WithGraph(std::move(trimmed_graph));
 
   VLOG(1) << absl::Substitute(
       "Deleted $0 unreachable functions from the graph (library size = $1)",
@@ -524,9 +524,20 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
       // can't perform non-differentiable rewrites.
       if (differentiable_functions.find(func_name) !=
           differentiable_functions.end()) {
-        func_item.allowed_optimizations.non_differentiable_rewrites = false;
+        func_item.allowed_optimizations().non_differentiable_rewrites = false;
       }
 
+      // Function item is allowed to use all devices from the main graph.
+      Status added_devices = func_item.AddDevices(item);
+      if (!added_devices.ok()) {
+        VLOG(3) << added_devices.error_message();
+      }
+
+      // We are not allowed to prune side effects from the graph instantiated
+      // by the function definition, because we must guarantee function
+      // execution semantics wrt side effects (see function_optimizer.cc).
+      func_item.allowed_optimizations().prune_ops_with_side_effects = false;
+
       // Optimize function body graph.
       GraphDef optimized_func_graph;
       TF_RETURN_IF_ERROR(
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc b/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
index b60aa256676416ba4c7045b2b127d49b99a14f1f..12db5d6ca9b001fa04e42e6d228fe6289d87726e 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
@@ -108,7 +108,7 @@ class GrapplerItemPropertiesAccumulator : public CustomGraphOptimizer {
                   GraphDef* optimized_graph) override {
     *optimized_graph = item.graph;
     if (allowed_optimizations_) {
-      allowed_optimizations_->insert({item.id, item.allowed_optimizations});
+      allowed_optimizations_->insert({item.id, item.allowed_optimizations()});
     }
     return Status::OK();
   }
@@ -396,7 +396,7 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibrary) {
   item.feed.emplace_back("b", test::AsScalar<int>(4));
   auto tensors_expected = EvaluateFetchNodes(item);
 
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
 
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
@@ -502,7 +502,7 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibraryPruneFunctionBody) {
   item.feed.emplace_back("b", test::AsScalar<float>(3.123f));
   auto tensors_expected = EvaluateFetchNodes(item);
 
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
 
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
diff --git a/tensorflow/core/grappler/optimizers/remapper.cc b/tensorflow/core/grappler/optimizers/remapper.cc
index d8e62e0b24e19033090ea19e1c5698dbc7e3bbe9..3fb3f2b0ec75d1a628445a2f5e4d58e7a498c893 100644
--- a/tensorflow/core/grappler/optimizers/remapper.cc
+++ b/tensorflow/core/grappler/optimizers/remapper.cc
@@ -665,7 +665,7 @@ Status Remapper::Optimize(Cluster* /*cluster*/, const GrapplerItem& item,
   std::reverse(topo_sorted_graph.mutable_node()->begin(),
                topo_sorted_graph.mutable_node()->end());
 
-  GrapplerItem topo_sorted_item(item, std::move(topo_sorted_graph));
+  GrapplerItem topo_sorted_item = item.WithGraph(std::move(topo_sorted_graph));
   RemapperContext ctx(topo_sorted_item);
 
   // Skip nodes that were invalidated by a remapper, e.g. do not process BiasAdd
diff --git a/tensorflow/core/grappler/optimizers/scoped_allocator_optimizer.cc b/tensorflow/core/grappler/optimizers/scoped_allocator_optimizer.cc
index 0d4aaf646218f1a784878bd099e68f166dd0340b..e537b3df07deea17b1a53d1abf18be7bad3a6d23 100644
--- a/tensorflow/core/grappler/optimizers/scoped_allocator_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/scoped_allocator_optimizer.cc
@@ -790,20 +790,17 @@ Tree* ComputeScopeTree(const string& op_name,
   return root;
 }
 
-void PartitionByLoopStructure(const FrameMap& frame_map,
+void PartitionByLoopStructure(const FrameView& frame_view,
                               std::vector<NodeDef*> nodes,
                               std::vector<std::vector<NodeDef*>>* loop_groups) {
   // It is assumed that two nodes with identical loop containment have
-  // identical integer vectors.  Represent those by 64 bit hashes.
+  // identical integer vectors. Represent those by 64 bit hashes.
   std::unordered_map<uint64, std::vector<NodeDef*>> loop_sets;
   for (NodeDef* nd : nodes) {
     uint64 hash = 0;
-    const auto& it = frame_map.find(nd);
-    if (it != frame_map.end()) {
-      const std::vector<int>& loop_ids = it->second;
-      for (int id : loop_ids) {
-        hash = Hash64Combine(hash, static_cast<uint64>(id));
-      }
+    const std::vector<int>& loop_ids = frame_view.Frames(*nd);
+    for (int id : loop_ids) {
+      hash = Hash64Combine(hash, static_cast<uint64>(id));
     }
     loop_sets[hash].push_back(nd);
   }
@@ -821,10 +818,11 @@ Status ScopedAllocatorOptimizer::ProcessGraphDef(
   GraphOpOccurrences occ;
   FindOpOccurrences(graph, op_name_set_, &occ);
   if (!occ.empty()) {
-    FrameMap frame_map;
-    int num_frames;
-    LOG_WARNING_AND_RETURN_IF_ERROR(
-        IdentifyFramesWithNodeMap(*graph, *node_map_, &frame_map, &num_frames));
+    FrameView frame_view;
+    // TODO(ezhulenev): Pass a GraphView when this optimizer will be migrated
+    // from NodeMap.
+    LOG_WARNING_AND_RETURN_IF_ERROR(frame_view.InferFromGraph(*graph));
+
     for (auto& dt : occ) {
       VLOG(2) << "Processing device " << dt.first;
       const DevOpOccurrences& dev_occ = dt.second;
@@ -841,26 +839,26 @@ Status ScopedAllocatorOptimizer::ProcessGraphDef(
         // Nodes with a common depth and root path are now grouped
         // in the same Tree struct.  Split those groups into subgroups that
         // share identical loop nesting.
-        status = ApplyToAll(
-            root.get(), [this, rewriter, graph, &frame_map, &op_name](Tree* t) {
-              VLOG(2) << "applied to tree node " << t->edge_ << " at depth "
-                      << t->depth_ << " of size " << t->nodes_.size();
-              if (t->nodes_.size() > 1) {
-                std::vector<std::vector<NodeDef*>> loop_groups;
-                PartitionByLoopStructure(frame_map, t->nodes_, &loop_groups);
-                for (auto& lg : loop_groups) {
-                  if (lg.size() > 1) {
-                    bool applied = false;
-                    Status s = OrderNodeSet(&lg);
-                    TF_RETURN_IF_ERROR(s);
-                    VLOG(1) << "Applying Rewriter for " << op_name;
-                    s = rewriter->Rewrite(this, graph, op_name, lg, &applied);
-                    LOG_WARNING_AND_RETURN_IF_ERROR(s);
-                  }
-                }
+        status = ApplyToAll(root.get(), [this, rewriter, graph, &frame_view,
+                                         &op_name](Tree* t) {
+          VLOG(2) << "applied to tree node " << t->edge_ << " at depth "
+                  << t->depth_ << " of size " << t->nodes_.size();
+          if (t->nodes_.size() > 1) {
+            std::vector<std::vector<NodeDef*>> loop_groups;
+            PartitionByLoopStructure(frame_view, t->nodes_, &loop_groups);
+            for (auto& lg : loop_groups) {
+              if (lg.size() > 1) {
+                bool applied = false;
+                Status s = OrderNodeSet(&lg);
+                TF_RETURN_IF_ERROR(s);
+                VLOG(1) << "Applying Rewriter for " << op_name;
+                s = rewriter->Rewrite(this, graph, op_name, lg, &applied);
+                LOG_WARNING_AND_RETURN_IF_ERROR(s);
               }
-              return Status::OK();
-            });
+            }
+          }
+          return Status::OK();
+        });
         if (!status.ok()) {
           break;
         }
diff --git a/tensorflow/core/grappler/utils.cc b/tensorflow/core/grappler/utils.cc
index 29775442629dd5a56776f2d0005f9ba50c2da84b..90ad04cf47b7ec7d8d80f90d65ea4aafa7722464 100644
--- a/tensorflow/core/grappler/utils.cc
+++ b/tensorflow/core/grappler/utils.cc
@@ -144,11 +144,16 @@ void NodeMap::UpdateOutput(const string& node_name,
   outputs.insert(nodes_[NodeName(new_output_name)]);
 }
 
+string TensorIdToString(const TensorId& tensor_id) {
+  return tensor_id.index() == 0 ? string(tensor_id.node())
+                                : tensor_id.ToString();
+}
+
 bool IsSameInput(const string& name1, const string& name2) {
   if (name1 == name2) return true;
   TensorId tensor1 = ParseTensorName(name1);
   TensorId tensor2 = ParseTensorName(name2);
-  return tensor1.node() == tensor2.node() && tensor1.index() == tensor2.index();
+  return tensor1 == tensor2;
 }
 
 bool IsControlInput(const string& name) {
diff --git a/tensorflow/core/grappler/utils.h b/tensorflow/core/grappler/utils.h
index b1e2d4e9cb5bbe15508695595de4e00f7313c401..89a87af323a4b40e3ce0a997d4a68a243498b046 100644
--- a/tensorflow/core/grappler/utils.h
+++ b/tensorflow/core/grappler/utils.h
@@ -100,6 +100,10 @@ class SetVector {
   std::vector<T> vector_;
 };
 
+// Returns formatted string from TensorId specific to grappler. Specifically,
+// for the 0 port (first output), only the node name is returned.
+string TensorIdToString(const TensorId& tensor_id);
+
 // True iff 'name' refers to a control inputs, i.e. a node name prefixed with
 // the ^ character.
 bool IsControlInput(const string& name);
diff --git a/tensorflow/core/grappler/utils/BUILD b/tensorflow/core/grappler/utils/BUILD
index 2b9448e40344c16d7b4bf636d6252569674a9c85..c0f19d3828ac1581a937531318ff62875fbf3bc7 100644
--- a/tensorflow/core/grappler/utils/BUILD
+++ b/tensorflow/core/grappler/utils/BUILD
@@ -74,8 +74,9 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:graph_view",
         "//tensorflow/core/grappler:op_types",
-        "//tensorflow/core/grappler:utils",
+        "@com_google_absl//absl/container:flat_hash_map",
     ],
 )
 
diff --git a/tensorflow/core/grappler/utils/frame.cc b/tensorflow/core/grappler/utils/frame.cc
index df5f4ff7cf38dbc7ab3038346cd4ea65031c8227..2484b35de06c74659c583c7d34d4881729e00f21 100644
--- a/tensorflow/core/grappler/utils/frame.cc
+++ b/tensorflow/core/grappler/utils/frame.cc
@@ -15,77 +15,128 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/utils/frame.h"
 #include <deque>
-#include <stack>
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/graph_view.h"
 #include "tensorflow/core/grappler/op_types.h"
-#include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/lib/core/errors.h"
 
 namespace tensorflow {
 namespace grappler {
 
-Status IdentifyFrames(const GraphDef& graph, FrameMap* frame_map,
-                      int* num_frames) {
-  NodeMap node_map(const_cast<GraphDef*>(&graph));
-  return IdentifyFramesWithNodeMap(graph, node_map, frame_map, num_frames);
-}
+namespace {}  // namespace
+
+Status FrameView::InferFromGraphView(const GraphView& graph_view) {
+  if (is_inferred_) {
+    return errors::Internal("FrameView was already inferred from the graph");
+  }
+  is_inferred_ = true;
+
+  std::deque<const NodeDef*> ready_nodes;
 
-Status IdentifyFramesWithNodeMap(const GraphDef& graph, const NodeMap& node_map,
-                                 FrameMap* frame_map, int* num_frames) {
-  std::deque<std::pair<const NodeDef*, std::vector<int>>> ready_nodes;
-  for (const NodeDef& node : graph.node()) {
+  // All nodes without inputs are automatically added to the ready queue.
+  for (const NodeDef& node : graph_view.graph()->node()) {
     if (node.input_size() == 0) {
-      std::vector<int> empty;
-      ready_nodes.emplace_back(&node, empty);
-      (*frame_map)[&node] = empty;
+      ready_nodes.push_back(&node);
+      node_to_frames_[&node] = node_has_no_frames_;
     }
   }
-  std::map<string, int> name_to_id;
+
+  // We assign unique int id to each frame, and use this map to track what
+  // frames we've already seen in the graph.
+  absl::flat_hash_map<string, int> frame_name_to_id;
+
   while (!ready_nodes.empty()) {
-    auto ready_node = ready_nodes.front();
-    for (const auto& fanout : node_map.GetOutputs(ready_node.first->name())) {
-      if (frame_map->count(fanout) < 1) {
-        std::vector<int> frame_ids = ready_node.second;
-        if (IsExit(*ready_node.first)) {
+    const NodeDef* ready_node = ready_nodes.front();
+
+    absl::flat_hash_set<GraphView::InputPort> fanouts =
+        graph_view.GetFanouts(*ready_node, /*include_controlled_nodes=*/true);
+
+    for (const GraphView::InputPort& fanout : fanouts) {
+      if (node_to_frames_.count(fanout.node) < 1) {
+        // If we have never seen this node before, we add all frames from the
+        // incoming node (and pop/push frames if coming from Exit/Enter nodes).
+        std::vector<int> frame_ids = node_to_frames_[ready_node];
+
+        if (IsExit(*ready_node)) {
           frame_ids.pop_back();
         }
-        if (IsEnter(*fanout)) {
-          CHECK(fanout->attr().count("frame_name"))
-              << "Missing frame name for the Enter node " << fanout->name();
-          string name = fanout->attr().at("frame_name").s();
-          int id;
-          if (name_to_id.count(name)) {
-            id = name_to_id[name];
+
+        if (IsEnter(*fanout.node)) {
+          const AttrValue* frame_name_attr =
+              AttrSlice(*fanout.node).Find("frame_name");
+
+          if (!frame_name_attr) {
+            return errors::InvalidArgument(
+                "Missing frame name for the Enter node: ",
+                SummarizeNodeDef(*fanout.node));
+          }
+
+          absl::string_view frame_name = frame_name_attr->s();
+          int frame_id;
+
+          if (frame_name_to_id.count(frame_name)) {
+            frame_id = frame_name_to_id[frame_name];
           } else {
-            id = name_to_id.size();
-            name_to_id[name] = id;
+            frame_id = static_cast<int>(frame_name_to_id.size());
+            frame_name_to_id[frame_name] = frame_id;
           }
-          frame_ids.push_back(id);
+
+          frame_ids.push_back(frame_id);
         }
-        ready_nodes.emplace_back(fanout, frame_ids);
-        (*frame_map)[fanout] = frame_ids;
+
+        ready_nodes.push_back(fanout.node);
+        node_to_frames_[fanout.node] = std::move(frame_ids);
+
       } else {
-        auto frame_ids_fanout = (*frame_map)[fanout];
-        auto frame_ids_node = ready_node.second;
-        if (IsEnter(*fanout)) {
+        // If we've already seen this node before, we need to make sure that
+        // graph is correct and same nodes doesn't have incoming edges with
+        // conflicting frames (all inputs must be produces in the same frame).
+
+        std::vector<int> frame_ids_fanout = node_to_frames_[fanout.node];
+        std::vector<int> frame_ids_node = node_to_frames_[ready_node];
+
+        if (IsEnter(*fanout.node)) {
           frame_ids_fanout.pop_back();
         }
-        if (IsExit(*ready_node.first)) {
+        if (IsExit(*ready_node)) {
           frame_ids_node.pop_back();
         }
+
         if (frame_ids_node != frame_ids_fanout) {
           return errors::InvalidArgument(
-              "Invalid graph: Frame ids for node ", ready_node.first->name(),
-              " does not match frame ids for it's fanout.");
+              "Invalid graph: Frame ids for node ", ready_node->name(),
+              " does not match frame ids for it's fanout ",
+              fanout.node->name());
         }
       }
     }
+
     ready_nodes.pop_front();
   }
-  *num_frames = name_to_id.size();
+
+  num_frames_ = static_cast<int>(frame_name_to_id.size());
   return Status::OK();
 }
 
+Status FrameView::InferFromGraph(const GraphDef& graph) {
+  return InferFromGraphView(GraphView(&graph));
+}
+
+const std::vector<int>& FrameView::Frames(const NodeDef& node) const {
+  DCHECK(is_inferred_) << "FrameView is not initialized";
+  auto frames = node_to_frames_.find(&node);
+  if (frames == node_to_frames_.end()) {
+    LOG(WARNING) << "Node doesn't belong to the graph used for initialization";
+    return node_has_no_frames_;
+  } else {
+    return frames->second;
+  }
+}
+
+bool FrameView::IsInFrame(const NodeDef& node) const {
+  return !Frames(node).empty();
+}
+
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/frame.h b/tensorflow/core/grappler/utils/frame.h
index 95b72748f4e1f13f1c61d64c4a457287e9d7d46b..04c6588275098a0a3f7110be7af4e2e9207b0ac2 100644
--- a/tensorflow/core/grappler/utils/frame.h
+++ b/tensorflow/core/grappler/utils/frame.h
@@ -17,25 +17,52 @@ limitations under the License.
 #define TENSORFLOW_CORE_GRAPPLER_UTILS_FRAME_H_
 
 #include <unordered_map>
+#include "absl/container/flat_hash_map.h"
 #include "tensorflow/core/framework/graph.pb.h"
-#include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/grappler/graph_view.h"
 #include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
 namespace grappler {
 
-using FrameMap = std::unordered_map<const NodeDef*, std::vector<int>>;
+// FrameView is a helper class that allows to find in what execution frames (if
+// any) the given node can be running in. It's constructed from an immutable
+// GraphView, and any modification of the underlying graph might invalidate it.
+//
+// All execution frames assigned an unique integer id, but they do not have any
+// meaning whatsoever, it's just a sequence number.
+//
+// See the paper "Dynamic Control Flow in Large-Scale Machine Learning" for
+// detailed explanation of execution frames (https://arxiv.org/abs/1805.01772).
+class FrameView {
+ public:
+  FrameView() : is_inferred_(false), num_frames_(0) {}
 
-// Returns the number of frames present in the graph, and populates
-// the 'frames' argument with the collection of frames (denoted by their
-// frame ids) in the outermost-to-innermost order. Frame ids are arbitrary.
-Status IdentifyFrames(const GraphDef& graph, FrameMap* frame_map,
-                      int* num_frames);
+  // Infers nodes execution frames from the GraphView. Returns an error if
+  // called multiple times.
+  Status InferFromGraphView(const GraphView& graph_view);
+  // Infers nodes execution by constructing temporary GraphView and passing it
+  // to InferFromGraphView.
+  Status InferFromGraph(const GraphDef& graph);
 
-// As above, but use an existing NodeMap for graph instead of building it
-// from scratch.
-Status IdentifyFramesWithNodeMap(const GraphDef& graph, const NodeMap& node_map,
-                                 FrameMap* frame_map, int* num_frames);
+  // Returns all frames of the given node (denoted by their frame ids) in
+  // outermost-to-innermost order.
+  const std::vector<int>& Frames(const NodeDef& node) const;
+
+  // Returns true iff the node is at least in one execution frame.
+  bool IsInFrame(const NodeDef& node) const;
+
+  int num_frames() const { return num_frames_; }
+  bool is_inferred() const { return is_inferred_; }
+
+ private:
+  bool is_inferred_;  // true if it was inferred from the graph
+  int num_frames_;    // number of frames present in a graph
+  absl::flat_hash_map<const NodeDef*, std::vector<int>> node_to_frames_;
+
+  // We return a reference to this vector if node has no frames.
+  const std::vector<int> node_has_no_frames_;
+};
 
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/frame_test.cc b/tensorflow/core/grappler/utils/frame_test.cc
index df76083fc3a0334172ac93998e0b549a2c723431..cc82e0ed3a39dd117e2197fa9a47fe2f3372051d 100644
--- a/tensorflow/core/grappler/utils/frame_test.cc
+++ b/tensorflow/core/grappler/utils/frame_test.cc
@@ -23,7 +23,7 @@ namespace tensorflow {
 namespace grappler {
 namespace {
 
-class IdentifyFramesTest : public ::testing::Test {
+class FrameViewTest : public ::testing::Test {
  protected:
   static NodeDef CreateNode(const string& name,
                             const std::vector<string>& inputs) {
@@ -53,19 +53,17 @@ class IdentifyFramesTest : public ::testing::Test {
   }
 };
 
-TEST_F(IdentifyFramesTest, NestedLoop) {
+TEST_F(FrameViewTest, NestedLoop) {
   GraphDef graph;
   // Create a two-level nested loop
   *graph.add_node() = CreateNode("0", {});
-  *graph.add_node() =
-      CreateNode("1", "Enter", "map/while/while_context1", {"0"});
+  *graph.add_node() = CreateNode("1", "Enter", "while/context1", {"0"});
   *graph.add_node() = CreateNode("2", {"1"});
   *graph.add_node() = CreateNode("3", "Merge", {"2", "14"});
   *graph.add_node() = CreateNode("4", {"3"});
   *graph.add_node() = CreateNode("5", "Switch", {"4"});
   *graph.add_node() = CreateNode("6", {"5"});
-  *graph.add_node() =
-      CreateNode("7", "Enter", "map/while/while_context2", {"6"});
+  *graph.add_node() = CreateNode("7", "Enter", "while/context2", {"6"});
   *graph.add_node() = CreateNode("8", {"7"});
   *graph.add_node() = CreateNode("9", "Merge", {"8", "12"});
   *graph.add_node() = CreateNode("10", {"9"});
@@ -77,118 +75,106 @@ TEST_F(IdentifyFramesTest, NestedLoop) {
   *graph.add_node() = CreateNode("16", "Exit", {"15"});
   *graph.add_node() = CreateNode("17", {"16"});
 
-  std::unordered_map<const NodeDef*, std::vector<int>> frames;
-  int num_frames;
-  EXPECT_TRUE(IdentifyFrames(graph, &frames, &num_frames).ok());
+  FrameView frame_view;
+  ASSERT_TRUE(frame_view.InferFromGraph(graph).ok());
+
   std::unordered_map<string, std::vector<int>> expected = {
       {"0", {}},      {"1", {0}},     {"2", {0}},     {"3", {0}},
       {"4", {0}},     {"5", {0}},     {"6", {0}},     {"7", {0, 1}},
       {"8", {0, 1}},  {"9", {0, 1}},  {"10", {0, 1}}, {"11", {0, 1}},
       {"12", {0, 1}}, {"13", {0, 1}}, {"14", {0}},    {"15", {0}},
       {"16", {0}},    {"17", {}}};
-  EXPECT_EQ(num_frames, 2);
-  EXPECT_EQ(frames.size(), expected.size());
-  std::cout << "Number of frame: " << num_frames << std::endl;
-  for (const auto& node : frames) {
-    std::cout << node.first->name() << ": ";
-    EXPECT_EQ(node.second.size(), expected[node.first->name()].size());
-    for (int i = 0; i < node.second.size(); i++) {
-      EXPECT_EQ(expected[node.first->name()][i], node.second[i]);
-      std::cout << node.second[i] << " ";
-    }
-    std::cout << std::endl;
+
+  EXPECT_EQ(frame_view.num_frames(), 2);
+  for (const NodeDef& node : graph.node()) {
+    std::vector<int> expected_frames = expected[node.name()];
+    std::vector<int> node_frames = frame_view.Frames(node);
+    EXPECT_EQ(expected_frames, node_frames);
   }
 }
 
-TEST_F(IdentifyFramesTest, MultipleInputsToEnter) {
+TEST_F(FrameViewTest, MultipleInputsToEnter) {
   GraphDef graph;
   *graph.add_node() = CreateNode("0", {});
   *graph.add_node() = CreateNode("1", {});
-  *graph.add_node() =
-      CreateNode("2", "Enter", "map/while/while_context", {"0", "1"});
+  *graph.add_node() = CreateNode("2", "Enter", "while/context", {"0", "1"});
   *graph.add_node() = CreateNode("3", "Exit", {"2"});
 
-  std::unordered_map<const NodeDef*, std::vector<int>> frames;
-  int num_frames;
-  EXPECT_TRUE(IdentifyFrames(graph, &frames, &num_frames).ok());
+  FrameView frame_view;
+  ASSERT_TRUE(frame_view.InferFromGraph(graph).ok());
+
   std::unordered_map<string, std::vector<int>> expected = {
       {"0", {}}, {"1", {}}, {"2", {0}}, {"3", {0}}};
-  EXPECT_EQ(num_frames, 1);
-  EXPECT_EQ(frames.size(), expected.size());
-  std::cout << "Number of frame: " << num_frames << std::endl;
-  for (const auto& node : frames) {
-    std::cout << node.first->name() << ": ";
-    EXPECT_EQ(node.second.size(), expected[node.first->name()].size());
-    for (int i = 0; i < node.second.size(); i++) {
-      EXPECT_EQ(expected[node.first->name()][i], node.second[i]);
-      std::cout << node.second[i] << " ";
-    }
-    std::cout << std::endl;
+
+  EXPECT_EQ(frame_view.num_frames(), 1);
+  for (const NodeDef& node : graph.node()) {
+    std::vector<int> expected_frames = expected[node.name()];
+    std::vector<int> node_frames = frame_view.Frames(node);
+    EXPECT_EQ(expected_frames, node_frames);
   }
 }
 
-TEST_F(IdentifyFramesTest, ExitOutput) {
+TEST_F(FrameViewTest, ExitOutput) {
   GraphDef graph;
   *graph.add_node() = CreateNode("0", {});
-  *graph.add_node() =
-      CreateNode("1", "Enter", "map/while/while_context", {"0"});
+  *graph.add_node() = CreateNode("1", "Enter", "while/context", {"0"});
   *graph.add_node() = CreateNode("2", "Exit", {"1"});
   *graph.add_node() = CreateNode("3", {});
   *graph.add_node() = CreateNode("4", {"2", "3"});
 
-  std::unordered_map<const NodeDef*, std::vector<int>> frames;
-  int num_frames;
-  EXPECT_TRUE(IdentifyFrames(graph, &frames, &num_frames).ok());
+  FrameView frame_view;
+  ASSERT_TRUE(frame_view.InferFromGraph(graph).ok());
+
   std::unordered_map<string, std::vector<int>> expected = {
       {"0", {}}, {"1", {0}}, {"2", {0}}, {"3", {}}, {"4", {}}};
-  EXPECT_EQ(num_frames, 1);
-  EXPECT_EQ(frames.size(), expected.size());
-  std::cout << "Number of frame: " << num_frames << std::endl;
-  for (const auto& node : frames) {
-    std::cout << node.first->name() << ": ";
-    EXPECT_EQ(node.second.size(), expected[node.first->name()].size());
-    for (int i = 0; i < node.second.size(); i++) {
-      EXPECT_EQ(expected[node.first->name()][i], node.second[i]);
-      std::cout << node.second[i] << " ";
-    }
-    std::cout << std::endl;
+
+  EXPECT_EQ(frame_view.num_frames(), 1);
+  for (const NodeDef& node : graph.node()) {
+    std::vector<int> expected_frames = expected[node.name()];
+    std::vector<int> node_frames = frame_view.Frames(node);
+    EXPECT_EQ(expected_frames, node_frames);
   }
 }
 
-TEST_F(IdentifyFramesTest, MultipleEnterNodes) {
+TEST_F(FrameViewTest, MultipleEnterNodes) {
   GraphDef graph;
   *graph.add_node() = CreateNode("0", {});
-  string frame = "map/while/while_context";
-  *graph.add_node() = CreateNode("1", "Enter", frame, {"0"});
+  *graph.add_node() = CreateNode("1", "Enter", "while/context", {"0"});
   *graph.add_node() = CreateNode("2", {"1"});
   *graph.add_node() = CreateNode("5", {});
-  *graph.add_node() = CreateNode("4", "Enter", frame, {"5"});
+  *graph.add_node() = CreateNode("4", "Enter", "while/context", {"5"});
   *graph.add_node() = CreateNode("3", {"4", "2"});
   *graph.add_node() = CreateNode("6", "Merge", {"3", "8"});
   *graph.add_node() = CreateNode("7", "Switch", {"6"});
   *graph.add_node() = CreateNode("8", "NextIteration", {"7"});
   *graph.add_node() = CreateNode("9", "Exit", {"7"});
 
-  std::unordered_map<const NodeDef*, std::vector<int>> frames;
-  int num_frames;
-  EXPECT_TRUE(IdentifyFrames(graph, &frames, &num_frames).ok());
+  FrameView frame_view;
+  ASSERT_TRUE(frame_view.InferFromGraph(graph).ok());
+
   std::unordered_map<string, std::vector<int>> expected = {
       {"0", {}}, {"1", {0}}, {"2", {0}}, {"3", {0}}, {"4", {0}},
       {"5", {}}, {"6", {0}}, {"7", {0}}, {"8", {0}}, {"9", {0}}};
-  EXPECT_EQ(num_frames, 1);
-  EXPECT_EQ(frames.size(), expected.size());
-  std::cout << "Number of frame: " << num_frames << std::endl;
-  for (const auto& node : frames) {
-    std::cout << node.first->name() << ": ";
-    EXPECT_EQ(node.second.size(), expected[node.first->name()].size());
-    for (int i = 0; i < node.second.size(); i++) {
-      EXPECT_EQ(expected[node.first->name()][i], node.second[i]);
-      std::cout << node.second[i] << " ";
-    }
-    std::cout << std::endl;
+
+  EXPECT_EQ(frame_view.num_frames(), 1);
+  for (const NodeDef& node : graph.node()) {
+    std::vector<int> expected_frames = expected[node.name()];
+    std::vector<int> node_frames = frame_view.Frames(node);
+    EXPECT_EQ(expected_frames, node_frames);
   }
 }
 
+TEST_F(FrameViewTest, ConflictingFrames) {
+  GraphDef graph;
+  *graph.add_node() = CreateNode("0", {});
+  *graph.add_node() = CreateNode("1", "Enter", "while/context1", {"0"});
+  *graph.add_node() = CreateNode("2", "Enter", "while/context2", {"1"});
+  *graph.add_node() = CreateNode("3", {"1", "2"});
+
+  FrameView frame_view;
+  ASSERT_FALSE(frame_view.InferFromGraph(graph).ok());
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/functions.cc b/tensorflow/core/grappler/utils/functions.cc
index 7756c73967b1b169deae59e1647d19f17f89f8f5..f2894a942bd3dac3e22748787eaa24717ed61555 100644
--- a/tensorflow/core/grappler/utils/functions.cc
+++ b/tensorflow/core/grappler/utils/functions.cc
@@ -94,7 +94,7 @@ void GrapplerFunctionConnectivity::RegisterInputArgExpansion(
   for (int i = 0; i < placeholders.size(); ++i) {
     const string& placeholder = input_arg_expansion.placeholders[i];
     input_arg_placeholders_.insert(
-        {placeholder, InputArgPlaceholder{input_name, /*position=*/i}});
+        {placeholder, InputArgPlaceholder{input_name, /*input_position=*/i}});
   }
   input_arg_expansions_.insert(
       {std::move(input_name), std::move(input_arg_expansion)});
@@ -248,8 +248,8 @@ Status GrapplerFunctionConnectivity::AsFunctionDefInput(
     const InputArgPlaceholder* placeholder =
         FindOrNull(input_arg_placeholders_, node_name);
     if (placeholder != nullptr) {
-      *func_def_input =
-          strings::StrCat(placeholder->input_name, ":", placeholder->position);
+      *func_def_input = strings::StrCat(placeholder->input_name, ":",
+                                        placeholder->input_position);
       return Status::OK();
     }
   }
@@ -347,6 +347,10 @@ GrapplerFunctionItem::GrapplerFunctionItem(
       fetch.push_back(output_tensor);
     }
   }
+
+  // It's unsafe to prune side-effectful ops from the graph instantiated from a
+  // function definition (see inlining in function_optimizer.cc).
+  allowed_optimizations().prune_ops_with_side_effects = false;
 }
 
 const string& GrapplerFunctionItem::description() const { return description_; }
@@ -561,7 +565,6 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
     inputs.push_back(std::move(input_expansion));
   }
 
-  std::vector<string> keep_nodes;
   // Add all function nodes to the function body
   for (const NodeDef& func_def_node : func.node_def()) {
     NodeDef* new_node = function_body.add_node();
@@ -577,11 +580,6 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
     // Register node output range in a function connectivity.
     TF_RETURN_IF_ERROR(RegisterFunctionBodyOutputs(*registration, func_def_node,
                                                    &connectivity));
-
-    // Ops with side effects must be preserved in a function body.
-    if (!IsFreeOfSideEffect(func_def_node)) {
-      keep_nodes.push_back(func_def_node.name());
-    }
   }
 
   // Rewrite inputs to use GraphDef format
@@ -612,12 +610,14 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
     outputs.push_back(std::move(output));
   }
 
+  std::vector<string> keep_ops;
   bool is_stateful = signature.is_stateful();
 
   *item = GrapplerFunctionItem(
-      /*func_name=*/signature.name(), /*description=*/signature.description(),
+      /*func_name=*/signature.name(),
+      /*description=*/signature.description(),
       /*func_attr=*/AttrSlice(&func.attr()), std::move(inputs),
-      std::move(outputs), std::move(keep_nodes), graph_def_version, is_stateful,
+      std::move(outputs), std::move(keep_ops), graph_def_version, is_stateful,
       std::move(function_body));
   return Status::OK();
 }
diff --git a/tensorflow/core/grappler/utils/functions.h b/tensorflow/core/grappler/utils/functions.h
index ba9950e4843bf40ae524ef11abcfeacd7b079827..038cf5f527e0f32cc10e123bb0cab357e5902463 100644
--- a/tensorflow/core/grappler/utils/functions.h
+++ b/tensorflow/core/grappler/utils/functions.h
@@ -111,8 +111,10 @@ class GrapplerFunctionConnectivity {
   std::unordered_map<string, tensorflow::NameRangeMap> function_body_outputs_;
 
   struct InputArgPlaceholder {
-    string input_name;
-    int position;
+    string input_name;   // Name of the function input argument.
+    int input_position;  // Index of a tensor in the function input argument
+                         // expansion, it can be greater than `0` if input
+                         // argument is a list of tensors (aka list(type)).
   };
 
   // Mapping from input arg placeholder to the function input tensor.
diff --git a/tensorflow/core/grappler/utils/functions_test.cc b/tensorflow/core/grappler/utils/functions_test.cc
index 8639dec05a1eb8aa7afcadc20ee9f8949bfeae14..5923850eca65a219fe3c452947751509a2bcf445 100644
--- a/tensorflow/core/grappler/utils/functions_test.cc
+++ b/tensorflow/core/grappler/utils/functions_test.cc
@@ -599,8 +599,7 @@ TEST_F(FunctionsTest, FromFunctionDefWithSideEffectfulOps) {
   EXPECT_EQ(3, item.function_body().node_size());
   EXPECT_EQ(1, item.input_size());
   EXPECT_EQ(0, item.output_size());
-  ASSERT_EQ(1, item.keep_ops.size());
-  EXPECT_EQ("update", item.keep_ops[0]);
+  EXPECT_EQ(false, item.allowed_optimizations().prune_ops_with_side_effects);
 }
 
 TEST_F(FunctionsTest, MakeFunctionDef) {
diff --git a/tensorflow/core/grappler/utils/grappler_test.cc b/tensorflow/core/grappler/utils/grappler_test.cc
index 6266733f3e6588af9e06a5a279ecabf5adbd009a..576494cad55e22ba8457f30d0ea79b53f6f5de78 100644
--- a/tensorflow/core/grappler/utils/grappler_test.cc
+++ b/tensorflow/core/grappler/utils/grappler_test.cc
@@ -114,9 +114,13 @@ void GrapplerTest::CompareGraphs(GraphDef want, GraphDef got) const {
   for (int i = 0; i < want.node_size(); ++i) {
     EXPECT_EQ(want.node(i).op(), got.node(i).op());
     EXPECT_EQ(want.node(i).name(), got.node(i).name());
+    EXPECT_EQ(want.node(i).device(), got.node(i).device());
+
     ASSERT_EQ(want.node(i).input_size(), got.node(i).input_size());
     for (int j = 0; j < want.node(i).input_size(); ++j) {
-      EXPECT_TRUE(IsSameInput(want.node(i).input(j), got.node(i).input(j)));
+      const TensorId want_tensor = ParseTensorName(want.node(i).input(j));
+      const TensorId got_tensor = ParseTensorName(got.node(i).input(j));
+      EXPECT_EQ(want_tensor.ToString(), got_tensor.ToString());
     }
   }
 }
diff --git a/tensorflow/core/grappler/utils_test.cc b/tensorflow/core/grappler/utils_test.cc
index e993391b51bfe882a1e662f220ace0542db4ffba..f5ae39867ac758efa52d9109b5f85b020c1e7ae4 100644
--- a/tensorflow/core/grappler/utils_test.cc
+++ b/tensorflow/core/grappler/utils_test.cc
@@ -464,6 +464,13 @@ TEST_F(UtilsTest, SetTensorValueBFloat16IntMin) {
       Tensor(bfloat16(std::numeric_limits<int>::min())), t);
 }
 
+TEST_F(UtilsTest, TensorIdToString) {
+  EXPECT_EQ("^foo", TensorIdToString({"foo", -1}));
+  EXPECT_EQ("foo", TensorIdToString({"foo", 0}));
+  EXPECT_EQ("foo:1", TensorIdToString({"foo", 1}));
+  EXPECT_EQ("foo:2", TensorIdToString({"foo", 2}));
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 60accc0f9bd04a1ac405c5b94331351e09f7d4a9..d519b2426e485aa4c790594f8c719bb2c388197a 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -29,26 +29,26 @@ package_group(
 
 load(
     "//tensorflow:tensorflow.bzl",
+    "cc_header_only_library",
     "if_android",
+    "if_not_windows",
+    "tf_cc_binary",
     "tf_cc_test",
     "tf_cc_test_mkl",
     "tf_cc_tests",
-    "tf_cc_binary",
     "tf_copts",
     "tf_cuda_library",
-    "tf_opts_nortti_if_android",
     "tf_kernel_library",
     "tf_mkl_kernel_library",
-    "cc_header_only_library",
-    "if_not_windows",
+    "tf_opts_nortti_if_android",
 )
 load("@local_config_sycl//sycl:build_defs.bzl", "if_sycl")
 load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
 load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_tests")
 load(
     "//tensorflow/core:platform/default/build_config.bzl",
-    "tf_proto_library",
     "tf_kernel_tests_linkstatic",
+    "tf_proto_library",
 )
 load(
     "//tensorflow/core:platform/default/build_config_root.bzl",
@@ -592,6 +592,7 @@ cc_library(
 #   #endif
 cc_library(
     name = "eigen_contraction_kernel",
+    srcs = ["eigen_contraction_kernel.cc"],
     hdrs = ["eigen_contraction_kernel.h"],
     defines = select({
         ":mkldnn_contraction_kernel": [
@@ -603,7 +604,7 @@ cc_library(
     deps = [
         "//third_party/eigen3",
     ] + select({
-        ":mkldnn_contraction_kernel": ["//third_party/intel_mkl_dnn:mkldnn_single_threaded"],
+        ":mkldnn_contraction_kernel": ["@mkl_dnn//:mkldnn_single_threaded"],
         "//conditions:default": [],
     }),
 )
@@ -2196,6 +2197,7 @@ tf_kernel_library(
         ":state",
         ":training_op_helpers",
         ":variable_ops",
+        "//tensorflow/core:core_cpu_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
@@ -2755,6 +2757,7 @@ cc_library(
         ":cholesky_grad",
         ":cholesky_op",
         ":determinant_op",
+        ":lu_op",
         ":matrix_exponential_op",
         ":matrix_inverse_op",
         ":matrix_logarithm_op",
@@ -2900,6 +2903,19 @@ tf_kernel_library(
     deps = LINALG_DEPS,
 )
 
+tf_kernel_library(
+    name = "lu_op",
+    prefix = "lu_op",
+    deps = if_cuda([
+        ":cuda_solvers",
+        ":transpose_functor",
+    ]) + [
+        "//third_party/eigen3",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+)
+
 cc_library(
     name = "linalg_ops_common",
     srcs = ["linalg_ops_common.cc"],
@@ -3698,7 +3714,6 @@ NN_DEPS = [
     ":bounds_check",
     ":conv_2d",
     ":eigen_contraction_kernel",
-    ":fused_batch_norm_util_gpu",
     ":ops_util",
     ":pooling_ops",
     "//tensorflow/core:framework",
@@ -3957,19 +3972,6 @@ tf_kernel_library(
     alwayslink = 1,
 )
 
-tf_kernel_library(
-    name = "fused_batch_norm_util",
-    gpu_srcs = [
-        "fused_batch_norm_op.h",
-        "fused_batch_norm_op.cu.cc",
-    ],
-    deps = [
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//third_party/eigen3",
-    ],
-)
-
 cc_library(
     name = "pooling_ops_hdrs",
     hdrs = [
@@ -6741,6 +6743,31 @@ tf_mkl_kernel_library(
     deps = NN_DEPS + mkl_deps() + [":cwise_op"],
 )
 
+tf_cc_test_mkl(
+    name = "mkl_fused_ops_test",
+    size = "small",
+    srcs = ["mkl_fused_ops_test.cc"],
+    linkstatic = 1,
+    deps = [
+        ":conv_ops",
+        ":image",
+        ":mkl_conv_op",
+        ":mkl_tfconv_op",
+        ":ops_testutil",
+        ":ops_util",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:tensorflow",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_mkl_kernel_library(
     name = "mkl_transpose_op",
     srcs = [
diff --git a/tensorflow/core/kernels/constant_op.cc b/tensorflow/core/kernels/constant_op.cc
index 426c404f4388d4366dec4cec84c01accb5ec6cd6..33b9243dfea29b687d1cde15ac6971b886400d4f 100644
--- a/tensorflow/core/kernels/constant_op.cc
+++ b/tensorflow/core/kernels/constant_op.cc
@@ -47,7 +47,7 @@ namespace {
 std::unique_ptr<const NodeDef> StripTensorDataFromNodeDef(
     OpKernelConstruction* ctx) {
 #ifndef __ANDROID__
-  DCHECK_EQ(NodeDef::descriptor()->field_count(), 5)
+  DCHECK_EQ(NodeDef::descriptor()->field_count(), 6)
       << "The NodeDef format has changed, and the attr-stripping code may need "
       << "to be updated.";
 #endif
@@ -61,6 +61,7 @@ std::unique_ptr<const NodeDef> StripTensorDataFromNodeDef(
   // attrs that affect the cardinality of list-typed inputs and outputs, so it
   // is safe to drop other attrs from the NodeDef.
   AddNodeAttr("dtype", ctx->output_type(0), ret);
+  MergeDebugInfo(original, ret);
   return std::unique_ptr<const NodeDef>(ret);
 }
 
diff --git a/tensorflow/core/kernels/control_flow_ops.cc b/tensorflow/core/kernels/control_flow_ops.cc
index 081ef72c15dc20245a7ad1a409023465543a64c8..36def4a53065e2c6ac68a8b67818096012104753 100644
--- a/tensorflow/core/kernels/control_flow_ops.cc
+++ b/tensorflow/core/kernels/control_flow_ops.cc
@@ -601,6 +601,13 @@ LoopCondOp::LoopCondOp(OpKernelConstruction* context) : OpKernel(context) {}
 LoopCondOp::~LoopCondOp() = default;
 
 void LoopCondOp::Compute(OpKernelContext* context) {
+  CancellationManager* cm = context->cancellation_manager();
+  if (cm != nullptr) {
+    bool already_cancelled = cm->IsCancelled();
+    OP_REQUIRES(context, !already_cancelled,
+                errors::Cancelled("Loop execution was cancelled."));
+  }
+
   context->set_output(0, context->input(0));
 }
 
diff --git a/tensorflow/core/kernels/conv_grad_filter_ops.cc b/tensorflow/core/kernels/conv_grad_filter_ops.cc
index bc30da40991b56adc136bbe6115db16c00a04666..4e3de33e83a34e0ec6a4c4d87f93127ec134c822 100644
--- a/tensorflow/core/kernels/conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_filter_ops.cc
@@ -903,7 +903,7 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
   auto input_ptr = AsDeviceMemory(transformed_input.template flat<T>().data(),
                                   transformed_input.template flat<T>().size());
 
-  static int64 ConvolveBackwardFilterScratchSize = GetCudnnWorkspaceLimit(
+  static int64 ConvolveBackwardFilterScratchSize = GetDnnWorkspaceLimit(
       "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32  // 4GB by default
   );
   int device_id = stream->parent()->device_ordinal();
@@ -939,8 +939,8 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
     for (auto profile_algorithm : algorithms) {
       // TODO(zhengxq): profile each algorithm multiple times to better
       // accuracy.
-      CudnnScratchAllocator scratch_allocator(ConvolveBackwardFilterScratchSize,
-                                              ctx);
+      DnnScratchAllocator scratch_allocator(ConvolveBackwardFilterScratchSize,
+                                            ctx);
       ProfileResult profile_result;
       bool cudnn_launch_status =
           stream
@@ -977,8 +977,7 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
     AutoTuneConvBwdFilter::GetInstance()->Insert(conv_parameters,
                                                  algorithm_config);
   }
-  CudnnScratchAllocator scratch_allocator(ConvolveBackwardFilterScratchSize,
-                                          ctx);
+  DnnScratchAllocator scratch_allocator(ConvolveBackwardFilterScratchSize, ctx);
   bool cudnn_launch_status =
       stream
           ->ThenConvolveBackwardFilterWithAlgorithm(
diff --git a/tensorflow/core/kernels/conv_grad_input_ops.cc b/tensorflow/core/kernels/conv_grad_input_ops.cc
index e06af15f2fc5558e9810c3da525fbf3cb385e893..9f983ed8166d51a720b4ea0ff360a974a7b4fb86 100644
--- a/tensorflow/core/kernels/conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_input_ops.cc
@@ -951,10 +951,10 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
       AsDeviceMemory(pre_transformed_in_backprop.template flat<T>().data(),
                      pre_transformed_in_backprop.template flat<T>().size());
 
-  static int64 ConvolveBackwardDataScratchSize = GetCudnnWorkspaceLimit(
+  static int64 ConvolveBackwardDataScratchSize = GetDnnWorkspaceLimit(
       "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32  // 4GB by default
   );
-  CudnnScratchAllocator scratch_allocator(ConvolveBackwardDataScratchSize, ctx);
+  DnnScratchAllocator scratch_allocator(ConvolveBackwardDataScratchSize, ctx);
   int device_id = stream->parent()->device_ordinal();
   DataType dtype = out_backprop.dtype();
   ConvParameters conv_parameters = {
@@ -988,8 +988,8 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
     for (auto profile_algorithm : algorithms) {
       // TODO(zhengxq): profile each algorithm multiple times to better
       // accuracy.
-      CudnnScratchAllocator scratch_allocator(ConvolveBackwardDataScratchSize,
-                                              ctx);
+      DnnScratchAllocator scratch_allocator(ConvolveBackwardDataScratchSize,
+                                            ctx);
       ProfileResult profile_result;
       bool cudnn_launch_status =
           stream
diff --git a/tensorflow/core/kernels/conv_grad_ops_3d.cc b/tensorflow/core/kernels/conv_grad_ops_3d.cc
index e4c49efea0bd87fdbaa3fbdad3d5612d6b4f8a82..562a9c8aed5850418aa8acecec35a7860ae99921 100644
--- a/tensorflow/core/kernels/conv_grad_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_grad_ops_3d.cc
@@ -1333,7 +1333,7 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel {
         AsDeviceMemory(pre_transformed_in_backprop.template flat<T>().data(),
                        pre_transformed_in_backprop.template flat<T>().size());
 
-    static int64 ConvolveBackwardDataScratchSize = GetCudnnWorkspaceLimit(
+    static int64 ConvolveBackwardDataScratchSize = GetDnnWorkspaceLimit(
         "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32);  // 4GB by default
 
     const int device_id = stream->parent()->device_ordinal();
@@ -1368,8 +1368,8 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel {
       for (auto profile_algorithm : algorithms) {
         // TODO(zhengxq): profile each algorithm multiple times to better
         // accuracy.
-        CudnnScratchAllocator scratch_allocator(ConvolveBackwardDataScratchSize,
-                                                context);
+        DnnScratchAllocator scratch_allocator(ConvolveBackwardDataScratchSize,
+                                              context);
         ProfileResult profile_result;
         bool cudnn_launch_status =
             stream
@@ -1405,8 +1405,8 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel {
       AutoTuneConv3dBwdData::GetInstance()->Insert(conv_parameters,
                                                    algorithm_config);
     }
-    CudnnScratchAllocator scratch_allocator(ConvolveBackwardDataScratchSize,
-                                            context);
+    DnnScratchAllocator scratch_allocator(ConvolveBackwardDataScratchSize,
+                                          context);
     bool cudnn_launch_status =
         stream
             ->ThenConvolveBackwardDataWithAlgorithm(
@@ -1739,7 +1739,7 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
         AsDeviceMemory(transformed_input.template flat<T>().data(),
                        transformed_input.template flat<T>().size());
 
-    static int64 ConvolveBackwardFilterScratchSize = GetCudnnWorkspaceLimit(
+    static int64 ConvolveBackwardFilterScratchSize = GetDnnWorkspaceLimit(
         "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32);  // 4GB by default
 
     const int device_id = stream->parent()->device_ordinal();
@@ -1774,8 +1774,8 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
       for (auto profile_algorithm : algorithms) {
         // TODO(zhengxq): profile each algorithm multiple times to better
         // accuracy.
-        CudnnScratchAllocator scratch_allocator(
-            ConvolveBackwardFilterScratchSize, context);
+        DnnScratchAllocator scratch_allocator(ConvolveBackwardFilterScratchSize,
+                                              context);
         ProfileResult profile_result;
         bool cudnn_launch_status =
             stream
@@ -1812,8 +1812,8 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
       AutoTuneConv3dBwdFilter::GetInstance()->Insert(conv_parameters,
                                                      algorithm_config);
     }
-    CudnnScratchAllocator scratch_allocator(ConvolveBackwardFilterScratchSize,
-                                            context);
+    DnnScratchAllocator scratch_allocator(ConvolveBackwardFilterScratchSize,
+                                          context);
     bool cudnn_launch_status =
         stream
             ->ThenConvolveBackwardFilterWithAlgorithm(
diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc
index 74857fc2078dc3ee5e17959fc32febcdcb38a689..dfba15792dcf5d293d894027b51c56df31a0e520 100644
--- a/tensorflow/core/kernels/conv_ops.cc
+++ b/tensorflow/core/kernels/conv_ops.cc
@@ -521,8 +521,8 @@ template struct LaunchConv2DOp<CPUDevice, float>;
 template struct LaunchConv2DOp<CPUDevice, double>;
 
 #if GOOGLE_CUDA
-int64 GetCudnnWorkspaceLimit(const string& envvar_in_mb,
-                             int64 default_value_in_bytes) {
+int64 GetDnnWorkspaceLimit(const string& envvar_in_mb,
+                           int64 default_value_in_bytes) {
   const char* workspace_limit_in_mb_str = getenv(envvar_in_mb.c_str());
   if (workspace_limit_in_mb_str != nullptr &&
       strcmp(workspace_limit_in_mb_str, "") != 0) {
@@ -759,7 +759,7 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
       AsDeviceMemory(transformed_output.template flat<T>().data(),
                      transformed_output.template flat<T>().size());
 
-  static int64 ConvolveScratchSize = GetCudnnWorkspaceLimit(
+  static int64 ConvolveScratchSize = GetDnnWorkspaceLimit(
       // default value is in bytes despite the name of the environment variable
       "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32  // 4GB
   );
@@ -803,7 +803,7 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
     for (auto profile_algorithm : algorithms) {
       // TODO(zhengxq): profile each algorithm multiple times to better
       // accuracy.
-      CudnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
+      DnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
       ProfileResult profile_result;
       bool cudnn_launch_status =
           stream
@@ -841,7 +841,7 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
     AutoTuneConv::GetInstance()->Insert(conv_parameters, algorithm_config);
   }
 
-  CudnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
+  DnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
   bool cudnn_launch_status =
       stream
           ->ThenConvolveWithAlgorithm(input_desc, input_ptr, filter_desc,
diff --git a/tensorflow/core/kernels/conv_ops_3d.cc b/tensorflow/core/kernels/conv_ops_3d.cc
index f20ac93b5a01cf2dbd1c53ce55c832727f49979f..5a59e20cc27cb7fe7b6fc6d9fdd160f2e3c4a983 100644
--- a/tensorflow/core/kernels/conv_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_ops_3d.cc
@@ -407,7 +407,7 @@ struct LaunchConvOp<GPUDevice, T> {
         AsDeviceMemory(transformed_output.template flat<T>().data(),
                        transformed_output.template flat<T>().size());
 
-    static int64 ConvolveScratchSize = GetCudnnWorkspaceLimit(
+    static int64 ConvolveScratchSize = GetDnnWorkspaceLimit(
         "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32);  // 4GB by default
 
     int device_id = stream->parent()->device_ordinal();
@@ -450,7 +450,7 @@ struct LaunchConvOp<GPUDevice, T> {
       for (auto profile_algorithm : algorithms) {
         // TODO(zhengxq): profile each algorithm multiple times to better
         // accuracy.
-        CudnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
+        DnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
         ProfileResult profile_result;
         bool cudnn_launch_status =
             stream
@@ -486,7 +486,7 @@ struct LaunchConvOp<GPUDevice, T> {
       AutoTuneConv3d::GetInstance()->Insert(conv_parameters, algorithm_config);
     }
 
-    CudnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
+    DnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
     bool cudnn_launch_status =
         stream
             ->ThenConvolveWithAlgorithm(input_desc, input_ptr, filter_desc,
diff --git a/tensorflow/core/kernels/conv_ops_fused.cc b/tensorflow/core/kernels/conv_ops_fused.cc
index a0484e9235dd3235f8074bf956914772a0d8c84e..798a7325cd25494d8b12447c86f4883ca038c8ca 100644
--- a/tensorflow/core/kernels/conv_ops_fused.cc
+++ b/tensorflow/core/kernels/conv_ops_fused.cc
@@ -14,897 +14,30 @@ limitations under the License.
 ==============================================================================*/
 
 // Implements convolution operations with other kernels baked into the
-// processing, to optimize latency and memory usage.
+// processing, to optimize latency and memory usage:
+//  - Conv2D + BiasAdd + <Activation>
+//  - Conv2D + FusedBatchNorm + <Activation>
+//
+// Activation: Relu, Relu6, Elu, etc...
+//
+// Kernels for convolutions fused with image transformations (resize and mirror
+// padding) defined in `conv_ops_fused_image_transform.cc`.
 
 #define EIGEN_USE_THREADS
 
-#include <string.h>
-#include <map>
+#include <string>
 #include <vector>
-#include "tensorflow/core/framework/common_shape_fns.h"
-#include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/framework/tensor_slice.h"
 #include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/conv_2d.h"
 #include "tensorflow/core/kernels/conv_ops.h"
-#include "tensorflow/core/kernels/gemm_functors.h"
-#include "tensorflow/core/kernels/image_resizer_state.h"
 #include "tensorflow/core/kernels/ops_util.h"
-#include "tensorflow/core/lib/core/threadpool.h"
-#include "tensorflow/core/util/mirror_pad_mode.h"
-#include "tensorflow/core/util/padding.h"
 #include "tensorflow/core/util/tensor_format.h"
 
 namespace tensorflow {
-
-namespace {
-
-// We don't want to allocate a buffer to hold all the patches if the size is
-// going to be extremely large, so break it into chunks if it's bigger than
-// a limit. Each chunk will be processed serially, so we can refill the
-// buffer for the next chunk and reuse it, keeping maximum memory size down.
-// In this case, we've picked 16 megabytes as a reasonable limit for Android and
-// other platforms using Eigen, and 1MB for iOS devices, from experimentation.
-#if defined(__APPLE__) && defined(IS_MOBILE_PLATFORM)
-const size_t kMaxChunkSize = (1 * 1024 * 1024);
-#else
-const size_t kMaxChunkSize = (16 * 1024 * 1024);
-#endif
-const size_t kResizeCacheSize = (8 * 1024 * 1024);
-
-// Lookup method used when resizing.
-enum SamplingMode {
-  BILINEAR = 0,
-  NEAREST = 1,
-};
-
-// Simple utility function used by FusedConv to multithread basic workloads. To
-// use it, pass begin and end values for the full workload and a std::function
-// that receives a subset of that through the begin and end values for each
-// worker's task. The division of the full workload into worker tasks is handled
-// by the multithreading logic. Here's an example of how to use it:
-// std::vector<float> my_vector(100);
-// ...
-// FusedConvParallelFor(context, 0, 100,
-//   [&my_vector](int64 task_begin, int64 task_end) {
-//     for (int64 current = task_begin; current != task_end; ++current) {
-//       my_vector[current] *= 10.0f;
-//     }
-// });
-void FusedConvParallelFor(
-    OpKernelContext* context, int64 begin, int64 end,
-    const std::function<void(int64, int64)>& task_function) {
-// On iOS, the thread management imposes a very big performance penalty, so
-// just call the function directly with no multithreading.
-#if defined(__APPLE__) && defined(IS_MOBILE_PLATFORM)
-  task_function(begin, end);
-#else
-  auto& worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
-  thread::ThreadPool* thread_pool = worker_threads.workers;
-  const int64 total_elements = end - begin;
-  // This is a bit of an arbitrary number, but was found to work well for
-  // typical models we've been profiling on various devices.
-  const int64 element_cost = 10000000;
-  thread_pool->ParallelFor(
-      total_elements, element_cost,
-      [begin, task_function](int64 begin_offset, int64 end_offset) {
-        const int64 task_begin = begin + begin_offset;
-        const int64 task_end = begin + end_offset;
-        task_function(task_begin, task_end);
-      });
-#endif
-}
-
-// Holds the state needed for the resizing subtasks.
-template <class T1>
-struct ResizeTaskParameters {
-  ResizeTaskParameters() : st(false) {}
-
-  int cache_height;
-  T1* resize_cache;
-  int cache_line_width;
-  int input_width;
-  int input_depth;
-  int top_padding;
-  int pad_offset;
-  int64 resized_height;
-  ImageResizerState st;
-  const T1* input_batch_start;
-  int64 cache_start_x;
-  int64 cache_end_x;
-  int left_padding;
-  int64 resized_width;
-  int64 padded_width;
-  int64 padded_height;
-};
-
-template <class T1>
-struct PerCacheLineParameters {
-  PerCacheLineParameters() {}
-  PerCacheLineParameters(const PerCacheLineParameters<T1>& other)
-      : cache_line_start(other.cache_line_start),
-        input_top_row_start(other.input_top_row_start),
-        input_bottom_row_start(other.input_bottom_row_start),
-        y_lerp(other.y_lerp) {}
-
-  T1* cache_line_start;
-  const T1* input_top_row_start;
-  const T1* input_bottom_row_start;
-  T1 y_lerp;
-};
-
-// Helper class to simplify bilinear filtering
-template <class T1>
-struct SampleRect {
-  EIGEN_ALWAYS_INLINE SampleRect(const T1* in_top_left, const T1* in_top_right,
-                                 const T1* in_bottom_left,
-                                 const T1* in_bottom_right)
-      : top_left(in_top_left),
-        top_right(in_top_right),
-        bottom_left(in_bottom_left),
-        bottom_right(in_bottom_right) {}
-
-  EIGEN_ALWAYS_INLINE T1 BilinearSample(int channel, T1 x_lerp,
-                                        T1 y_lerp) const {
-    const T1 top =
-        top_left[channel] + (top_right[channel] - top_left[channel]) * x_lerp;
-    const T1 bottom = bottom_left[channel] +
-                      (bottom_right[channel] - bottom_left[channel]) * x_lerp;
-    return top + (bottom - top) * y_lerp;
-  }
-
-  const T1* top_left;
-  const T1* top_right;
-  const T1* bottom_left;
-  const T1* bottom_right;
-};
-
-// Calculates parameters which remain constant through a resize cache row.
-template <class T1>
-EIGEN_ALWAYS_INLINE PerCacheLineParameters<T1> CalculatePerCacheLineParameters(
-    int64 cache_height, int64 cache_y, T1* resize_cache, int64 cache_line_width,
-    int64 input_width, int64 input_depth, int64 top_padding, int64 pad_offset,
-    int64 resized_height, const ImageResizerState& st,
-    const T1* input_batch_start) {
-  PerCacheLineParameters<T1> result;
-  // The cache is organized so that the real y values of the resized image map
-  // onto the actual cache values through a modulo scheme. This means that as we
-  // progress downwards through the image, we keep reusing a small cache and so
-  // keep memory usage down.
-  int64 cache_index_y;
-  if (cache_y < 0) {
-    cache_index_y = cache_height + (cache_y % cache_height);
-  } else {
-    cache_index_y = cache_y % cache_height;
-  }
-  result.cache_line_start =
-      resize_cache + (cache_index_y * cache_line_width * input_depth);
-  // This part is implementing the mirror padding that happens before resizing.
-  float in_y = (cache_y - top_padding);
-  if (in_y < 0) {
-    in_y = -(in_y + 1.0f - pad_offset);
-  } else if (in_y >= resized_height) {
-    in_y = (resized_height * 2.0f) - (in_y + 1.0f + pad_offset);
-  }
-  // Here's where do do the actual resize.
-  in_y *= st.height_scale;
-  const int64 top_y_index = static_cast<int64>(std::floor(in_y));
-  const int64 bottom_y_index =
-      std::min(static_cast<int64>(std::ceil(in_y)), (st.in_height - 1));
-  // Lerp is used for bilinear filtering when that's needed.
-  result.y_lerp = static_cast<T1>(in_y - top_y_index);
-  // Which rows of the original input image to pull the values from.
-  result.input_top_row_start =
-      input_batch_start + (top_y_index * input_width * input_depth);
-  result.input_bottom_row_start =
-      input_batch_start + (bottom_y_index * input_width * input_depth);
-  return result;
-}
-
-template <class T1>
-struct PerCachePixelParameters {
-  PerCachePixelParameters() {}
-  PerCachePixelParameters(const PerCachePixelParameters<T1>& other)
-      : cache_line_pixel(other.cache_line_pixel),
-        left_x_index(other.left_x_index),
-        right_x_index(other.right_x_index),
-        x_lerp(other.x_lerp) {}
-
-  T1* cache_line_pixel;
-  int64 left_x_index;
-  int64 right_x_index;
-  T1 x_lerp;
-};
-
-// Pulls out common parameters used for every resized pixel.
-template <class T1>
-EIGEN_ALWAYS_INLINE PerCachePixelParameters<T1>
-CalculatePerCachePixelParameters(int64 cache_x, int64 cache_start_x,
-                                 T1* cache_line_start, int64 input_depth,
-                                 int64 left_padding, int64 pad_offset,
-                                 int64 resized_width,
-                                 const ImageResizerState& st) {
-  PerCachePixelParameters<T1> result;
-  // Figure out where we're going to store the results of our transform.
-  const int cache_index_x = cache_x - cache_start_x;
-  result.cache_line_pixel = cache_line_start + (cache_index_x * input_depth);
-  // Implement mirror padding by flipping in_x if it's off the edge.
-  float in_x = (cache_x - left_padding);
-  if (in_x < 0) {
-    in_x = -(in_x + 1.0f - pad_offset);
-  } else if (in_x >= resized_width) {
-    in_x = (resized_width * 2.0f) - (in_x + 1.0f + pad_offset);
-  }
-  // Resize the x parameters.
-  in_x *= st.width_scale;
-  // Get the x coordinates for the left and right pixels to pull from.
-  result.left_x_index = static_cast<int64>(std::floor(in_x));
-  result.right_x_index =
-      std::min(static_cast<int64>(std::ceil(in_x)), (st.in_width - 1));
-  // This x_lerp is used to blend pixels in bilinear filtering.
-  result.x_lerp = static_cast<T1>(in_x - result.left_x_index);
-  return result;
-}
-
-// Combines bilinear resizing and mirror padding into the im2col transformation
-// stage of convolution.
-template <class T1, class T2, class T3, class TGemmFunctor,
-          SamplingMode SampleMode>
-class FusedResizeAndPadConvFunctor {
- public:
-  void operator()(OpKernelContext* context, const Tensor& input,
-                  int input_batches, int resized_height, int resized_width,
-                  int padded_height, int padded_width, int input_depth,
-                  const T2* filter_data, int filter_height, int filter_width,
-                  int filter_count, int stride_rows, int stride_cols,
-                  Padding padding, T3* output_data, int output_height,
-                  int output_width, const ImageResizerState& st,
-                  int top_padding, int bottom_padding, int left_padding,
-                  int right_padding, int pad_offset) {
-    if ((input_batches <= 0) || (padded_width <= 0) || (padded_height <= 0) ||
-        (input_depth <= 0)) {
-      LOG(WARNING) << "Conv2D was called with bad input dimensions: "
-                   << input_batches << ", " << padded_height << ", "
-                   << padded_width << ", " << input_depth;
-      return;
-    }
-    if ((filter_width <= 0) || (filter_height <= 0) || (filter_count <= 0)) {
-      LOG(WARNING) << "Conv2D was called with bad filter dimensions: "
-                   << filter_width << ", " << filter_height << ", "
-                   << filter_count;
-      return;
-    }
-    if ((output_width <= 0) || (output_height <= 0)) {
-      LOG(WARNING) << "Conv2D was called with bad output width or height: "
-                   << output_width << ", " << output_height;
-      return;
-    }
-    OP_REQUIRES(
-        context, ((SampleMode == NEAREST) || (SampleMode == BILINEAR)),
-        errors::InvalidArgument("Bad sample mode passed in", SampleMode));
-
-    // These calculations define how the patches will be positioned within the
-    // input image. The actual definitions are quite complex, and rely on the
-    // previously-calculated output size.
-    int filter_left_offset;
-    int filter_top_offset;
-    if (padding == VALID) {
-      filter_left_offset =
-          ((output_width - 1) * stride_cols + filter_width - padded_width + 1) /
-          2;
-      filter_top_offset = ((output_height - 1) * stride_rows + filter_height -
-                           padded_height + 1) /
-                          2;
-    } else {
-      filter_left_offset =
-          ((output_width - 1) * stride_cols + filter_width - padded_width) / 2;
-      filter_top_offset =
-          ((output_height - 1) * stride_rows + filter_height - padded_height) /
-          2;
-    }
-
-    ResizeTaskParameters<T1> task_params;
-    task_params.input_depth = input_depth;
-    task_params.top_padding = top_padding;
-    task_params.pad_offset = pad_offset;
-    task_params.resized_height = resized_height;
-    task_params.st = st;
-    task_params.left_padding = left_padding;
-    task_params.resized_width = resized_width;
-    task_params.padded_width = padded_width;
-    task_params.padded_height = padded_height;
-
-    // The im2col buffer has # of patches rows, and # of filters cols.
-    // It's laid out like this, in row major order in memory:
-    //        < filter value count >
-    //   ^   +---------------------+
-    // patch |                     |
-    // count |                     |
-    //   v   +---------------------+
-    // Each patch row contains a filter_width x filter_height patch of the
-    // input, with the depth channel as the most contiguous in memory, followed
-    // by the width, then the height. This is the standard memory order in the
-    // image world if it helps to visualize it.
-    const int filter_value_count = filter_width * filter_height * input_depth;
-
-    OP_REQUIRES(context, (filter_value_count * sizeof(T1)) <= kMaxChunkSize,
-                errors::InvalidArgument("Im2Col patch too large for buffer"));
-    const size_t patches_per_chunk =
-        kMaxChunkSize / (filter_value_count * sizeof(T1));
-    // Because memory allocation is very expensive on mobile platforms, try to
-    // allocate a persistent buffer that will be kept around between calls. We
-    // use TensorFlow's resource management to ensure that the memory will be
-    // released when the session is over.
-    Im2ColBufferResource<T1, kMaxChunkSize>* im2col_buffer_resource;
-    std::function<Status(Im2ColBufferResource<T1, kMaxChunkSize>**)> creator =
-        [](Im2ColBufferResource<T1, kMaxChunkSize>** resource) {
-          *resource = new Im2ColBufferResource<T1, kMaxChunkSize>();
-          return Status::OK();
-        };
-    OP_REQUIRES_OK(context, context->resource_manager()->LookupOrCreate(
-                                "Conv2d", "im2col_buffer",
-                                &im2col_buffer_resource, creator));
-
-    // Create a resize cache memory buffer that will hold the rows of
-    // transformed and mirror padded input pixels, ready to be copied
-    // into filter patches by im2col.
-    // It's laid out like this, in row major order in memory:
-    //         < cache line width >
-    //   ^    +--------------------+
-    // cache  |                    |
-    // height |                    |
-    //   v    +--------------------+
-    // Each cache row contains a cache_line_width number of resized pixels,
-    // each with input_depth channels. The cache height is typically less than
-    // the full height the resized image would be, so it's filled up
-    // incrementally as we progress downwards through the input creating im2col
-    // patches.
-    task_params.cache_start_x = -filter_left_offset;
-    task_params.cache_end_x =
-        (((output_width - 1) * stride_cols) - filter_left_offset) +
-        filter_width;
-    task_params.cache_line_width =
-        task_params.cache_end_x - task_params.cache_start_x;
-    task_params.cache_height =
-        kResizeCacheSize / (task_params.cache_line_width * input_depth);
-    const int needed_resize_cache_count =
-        filter_height * task_params.cache_line_width * input_depth;
-    OP_REQUIRES(context,
-                (needed_resize_cache_count * sizeof(T1)) <= kResizeCacheSize,
-                errors::InvalidArgument("Input too large for resize cache"));
-    Im2ColBufferResource<T1, kResizeCacheSize>* resize_cache_resource;
-    std::function<Status(Im2ColBufferResource<T1, kResizeCacheSize>**)>
-        resize_creator =
-            [](Im2ColBufferResource<T1, kResizeCacheSize>** resource) {
-              *resource = new Im2ColBufferResource<T1, kResizeCacheSize>();
-              return Status::OK();
-            };
-    OP_REQUIRES_OK(context, context->resource_manager()->LookupOrCreate(
-                                "Conv2d", "resize_cache",
-                                &resize_cache_resource, resize_creator));
-
-    // This means that multiple ops can't be run simultaneously on different
-    // threads, because we have a single shared resource. The platforms this is
-    // aimed at have intra-op parallelism as their focus though, so it shouldn't
-    // be an issue.
-    mutex_lock lock_buffer(im2col_buffer_resource->mu);
-    core::ScopedUnref unref_buffer(im2col_buffer_resource);
-    T1* im2col_buffer = im2col_buffer_resource->data;
-
-    // This buffer is used as a fairly heavy-weight cache for the resized and
-    // mirrored inputs to the im2col operation. The problem is that we want to
-    // keep the memory usage down by not rendering the fully resized and padded
-    // input tensor to the convolution into an entire buffer. The first approach
-    // to avoid this was to fold the bilinear filtering and padding spatial
-    // transformations into the im2col lookup itself. This successfully reduced
-    // memory usage, but because im2col can access an individual pixel for many
-    // different patches, the extra overhead of doing the same bilinear lookups
-    // repeatedly became too expensive.
-    // The resize cache is designed to avoid this problem by keeping a
-    // horizontal slice of the resized and padded input to the im2col
-    // precalculated, so that repeated accesses to the same pixel from different
-    // filter patches can just be copied from this cache. It's organized as a
-    // horizontal slice stretching across the whole virtual image, and as high
-    // as the filter window, so that as the patch processing moves across all
-    // the pixels are present, and before a new row of patches is started any
-    // previously calculated rows that are needed are maintained, with new rows
-    // calculated as required.
-    mutex_lock resize_lock_buffer(resize_cache_resource->mu);
-    core::ScopedUnref unref_resized_cache(resize_cache_resource);
-    task_params.resize_cache = resize_cache_resource->data;
-
-    const T1* input_data = input.flat<T1>().data();
-    const int64 input_height = input.shape().dim_sizes()[1];
-    task_params.input_width = input.shape().dim_sizes()[2];
-
-    int end_cached_lines = std::numeric_limits<int>::min();
-
-    for (int batch = 0; batch < input_batches; ++batch) {
-      task_params.input_batch_start =
-          input_data +
-          (batch * input_height * task_params.input_width * input_depth);
-      const int in_y_end =
-          ((output_height * stride_rows) - filter_top_offset) + filter_height;
-      for (int out_y = 0; out_y < output_height; ++out_y) {
-        const int in_y_origin = (out_y * stride_rows) - filter_top_offset;
-        const int cache_start_y = std::max(in_y_origin, end_cached_lines);
-        const int cache_end_y = std::min(
-            in_y_end, std::max((in_y_origin + task_params.cache_height),
-                               end_cached_lines));
-        if (end_cached_lines < (in_y_origin + filter_height)) {
-          // This call breaks up the work required for calculating the mirror
-          // padding and resizing across multiple threads.
-          FusedConvParallelFor(
-              context, cache_start_y, cache_end_y,
-              [task_params](int64 task_cache_start_y, int64 task_cache_end_y) {
-                // This is a long and confusing function, but it's been laid out
-                // this way to help with performance on some intensive models.
-                // What it's doing is populating a cache of the original input
-                // image, after it's been bilinear resized and had its edges
-                // mirrored. This allows the following im2col code to access the
-                // transformed pixels from this cache, without having to
-                // repeatedly apply the expensive bilinear calculations as the
-                // same pixels are accessed by different patches.
-                // This is most effective when the stride is small and the
-                // filter size is large, since that's when pixels are reused
-                // most frequently as patches overlap.
-                for (int cache_y = task_cache_start_y;
-                     cache_y < task_cache_end_y; ++cache_y) {
-                  // We organize the cache as a series of rows, each containing
-                  // all the transformed pixels for a given line in the image.
-                  // This cache is big enough to hold at least a filter's height
-                  // worth of rows, but typically more, limited by the size of
-                  // the cache buffer.
-                  // We don't allocate an entire image's worth of rows though,
-                  // because we're trying to keep memory usage down, so as we
-                  // progress downwards through the im2col we periodically
-                  // refresh the cache so that the next lines that are needed
-                  // for that operation are always present.
-                  // Work out the parameters that remain constant across the
-                  // row we're calculating.
-                  PerCacheLineParameters<T1> line_params(
-                      CalculatePerCacheLineParameters<T1>(
-                          task_params.cache_height, cache_y,
-                          task_params.resize_cache,
-                          task_params.cache_line_width, task_params.input_width,
-                          task_params.input_depth, task_params.top_padding,
-                          task_params.pad_offset, task_params.resized_height,
-                          task_params.st, task_params.input_batch_start));
-                  // Iterate through the resize cache row we're filling in.
-                  for (int cache_x = task_params.cache_start_x;
-                       cache_x < task_params.cache_end_x; ++cache_x) {
-                    // Figure out what we need for the cache pixel we're
-                    // populating.
-                    PerCachePixelParameters<T1> pixel_params(
-                        CalculatePerCachePixelParameters<T1>(
-                            cache_x, task_params.cache_start_x,
-                            line_params.cache_line_start,
-                            task_params.input_depth, task_params.left_padding,
-                            task_params.pad_offset, task_params.resized_width,
-                            task_params.st));
-                    // If the access is off the left, right, top, or bottom of
-                    // the resized image, the conv padding means we should set
-                    // it to zero.
-                    if ((cache_x < 0) ||
-                        (cache_x >= task_params.padded_width) ||
-                        (cache_y < 0) ||
-                        (cache_y >= task_params.padded_height)) {
-                      std::fill_n(pixel_params.cache_line_pixel,
-                                  task_params.input_depth, T1(0));
-                    } else {
-                      // There are two different sampling strategies for
-                      // resizing. When using nearest, we can just do a
-                      // straight copy of the pixel closest to our sample point,
-                      // but bilinear requires a more complex calculation.
-                      if (SampleMode == NEAREST) {
-                        const T1* input_top_left_pixel =
-                            line_params.input_top_row_start +
-                            (pixel_params.left_x_index *
-                             task_params.input_depth);
-
-                        std::copy_n(input_top_left_pixel,
-                                    task_params.input_depth,
-                                    pixel_params.cache_line_pixel);
-                      } else {
-                        const SampleRect<T1> rect(
-                            line_params.input_top_row_start +
-                                (pixel_params.left_x_index *
-                                 task_params.input_depth),
-                            line_params.input_top_row_start +
-                                (pixel_params.right_x_index *
-                                 task_params.input_depth),
-                            line_params.input_bottom_row_start +
-                                (pixel_params.left_x_index *
-                                 task_params.input_depth),
-                            line_params.input_bottom_row_start +
-                                (pixel_params.right_x_index *
-                                 task_params.input_depth));
-                        for (int in_channel = 0;
-                             in_channel < task_params.input_depth;
-                             ++in_channel) {
-                          pixel_params.cache_line_pixel[in_channel] =
-                              rect.BilinearSample(in_channel,
-                                                  pixel_params.x_lerp,
-                                                  line_params.y_lerp);
-                        }
-                      }
-                    }
-                  }
-                }
-              });
-          end_cached_lines = cache_end_y;
-        }
-        for (int out_x = 0; out_x < output_width; ++out_x) {
-          const int in_x_origin = (out_x * stride_cols) - filter_left_offset;
-          const int patch_index = (batch * output_width * output_height) +
-                                  (out_y * output_width) + out_x;
-          const int patch_index_within_chunk = patch_index % patches_per_chunk;
-          T1* im2col_patch_start =
-              im2col_buffer + (patch_index_within_chunk * filter_value_count);
-          for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
-            T1* im2col_row_start =
-                im2col_patch_start +
-                (filter_y * filter_width * task_params.input_depth);
-            const int conv_in_y = in_y_origin + filter_y;
-            int cache_index_y;
-            if (conv_in_y < 0) {
-              cache_index_y = task_params.cache_height +
-                              (conv_in_y % task_params.cache_height);
-            } else {
-              cache_index_y = conv_in_y % task_params.cache_height;
-            }
-            T1* cache_line_start =
-                task_params.resize_cache +
-                (cache_index_y * task_params.cache_line_width *
-                 task_params.input_depth);
-            T1* cache_filter_row_start =
-                cache_line_start + ((in_x_origin - task_params.cache_start_x) *
-                                    task_params.input_depth);
-            std::copy_n(cache_filter_row_start,
-                        (filter_width * task_params.input_depth),
-                        im2col_row_start);
-          }
-          const bool is_last_in_chunk =
-              (patch_index_within_chunk == (patches_per_chunk - 1));
-          const bool is_last_overall =
-              ((batch == (input_batches - 1)) &&
-               (out_y == (output_height - 1)) && (out_x == (output_width - 1)));
-          if (is_last_in_chunk || is_last_overall) {
-            // Now we've assembled a set of image patches into a matrix, apply
-            // a GEMM matrix multiply of the patches as rows, times the filter
-            // weights in columns, to get partial results in the output
-            // matrix.
-            const int how_many_patches = patch_index_within_chunk + 1;
-            const int m = how_many_patches;
-            const int n = filter_count;
-            const int k = filter_value_count;
-            const int lda = filter_value_count;
-            const int ldb = filter_count;
-            const int ldc = filter_count;
-            const size_t start_patch_index =
-                patch_index - (how_many_patches - 1);
-            T3* chunk_output_data =
-                output_data + (start_patch_index * filter_count);
-            TGemmFunctor gemm_functor;
-            gemm_functor(context, m, n, k, im2col_buffer, lda, filter_data, ldb,
-                         chunk_output_data, ldc);
-          }
-        }
-      }
-    }
-  }
-};
-
-}  // namespace
-
-// Implements a version of convolution with bilinear resizing and mirror padding
-// included.
-template <class T, class TConvFunctor, bool DoResize>
-class FusedResizeConv2DUsingGemmOp : public OpKernel {
- public:
-  explicit FusedResizeConv2DUsingGemmOp(OpKernelConstruction* context)
-      : OpKernel(context) {
-    if (DoResize) {
-      OP_REQUIRES_OK(context,
-                     context->GetAttr("resize_align_corners", &align_corners_));
-    }
-    MirrorPadMode mode;
-    OP_REQUIRES_OK(context, context->GetAttr("mode", &mode));
-
-    switch (mode) {
-      case MirrorPadMode::SYMMETRIC: {
-        offset_ = 0;
-        break;
-      }
-      case MirrorPadMode::REFLECT: {
-        offset_ = 1;
-        break;
-      }
-      default:
-        OP_REQUIRES(context, false,
-                    errors::InvalidArgument(
-                        "mode must be either REFLECT or SYMMETRIC."));
-    }
-    OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
-    OP_REQUIRES(context, strides_.size() == 4,
-                errors::InvalidArgument("Sliding window strides field must "
-                                        "specify 4 dimensions"));
-    const int64 stride_n = GetTensorDim(strides_, FORMAT_NHWC, 'N');
-    const int64 stride_c = GetTensorDim(strides_, FORMAT_NHWC, 'C');
-    OP_REQUIRES(
-        context, stride_n == 1 && stride_c == 1,
-        errors::InvalidArgument("Current implementation does not yet support "
-                                "strides in the batch and depth dimensions."));
-    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
-  }
-
-  void Compute(OpKernelContext* context) override {
-    // Input tensor is of the following dimensions:
-    // [ batch, in_rows, in_cols, in_depth ]
-    const Tensor& input = context->input(0);
-    OP_REQUIRES(context, (input.shape().num_elements() > 0),
-                errors::InvalidArgument("Input tensor can't be empty"));
-
-    ImageResizerState st(false);
-    if (DoResize) {
-      st = ImageResizerState(align_corners_);
-      st.ValidateAndCalculateOutputSize(context, input);
-      if (!context->status().ok()) return;
-    } else {
-      // Set up the resize parameters to do no scaling at all.
-      st.batch_size = input.dim_size(0);
-      st.out_height = input.dim_size(1);
-      st.out_width = input.dim_size(2);
-      st.in_height = input.dim_size(1);
-      st.in_width = input.dim_size(2);
-      st.channels = input.dim_size(3);
-      st.height_scale = 1.0f;
-      st.width_scale = 1.0f;
-    }
-    TensorShape resized_shape(
-        {input.dim_size(0), st.out_height, st.out_width, input.dim_size(3)});
-    int paddings_index;
-    int filter_index;
-    if (DoResize) {
-      paddings_index = 2;
-      filter_index = 3;
-    } else {
-      paddings_index = 1;
-      filter_index = 2;
-    }
-    const Tensor& paddings = context->input(paddings_index);
-
-    const int dims = resized_shape.dims();
-    OP_REQUIRES(
-        context,
-        TensorShapeUtils::IsMatrix(paddings.shape()) &&
-            paddings.dim_size(1) == 2,
-        errors::InvalidArgument("paddings must be a matrix with 2 columns: ",
-                                paddings.shape().DebugString()));
-    const int fixed_dims =
-        (allow_legacy_scalars() && dims == 0 && paddings.dim_size(0) == 1)
-            ? 1
-            : dims;
-    OP_REQUIRES(
-        context, fixed_dims == paddings.dim_size(0),
-        errors::InvalidArgument(
-            "The first dimension of paddings must be the rank of inputs: ",
-            fixed_dims, " ", paddings.shape().DebugString(), " ",
-            resized_shape.DebugString()));
-    OP_REQUIRES(
-        context, dims == paddings.dim_size(0),
-        errors::InvalidArgument(
-            "The first dimension of paddings must be the rank of inputs: ",
-            dims, " ", paddings.shape().DebugString(), " ",
-            resized_shape.DebugString()));
-
-    OP_REQUIRES(
-        context, dims == 4,
-        errors::InvalidArgument(
-            "Fused mirror padding only supports four-dimensional inputs, but ",
-            dims, " requested"));
-
-    // Compute the shape of the output tensor, and allocate it.
-    TensorShape padded_shape;
-    TTypes<int32>::ConstMatrix paddings_matrix = paddings.matrix<int32>();
-    for (int d = 0; d < dims; ++d) {
-      const int32 before =
-          paddings_matrix(d, 0);  // Pad before existing elements.
-      const int32 after =
-          paddings_matrix(d, 1);  // Pad after existing elements.
-      OP_REQUIRES(context, before >= 0 && after >= 0,
-                  errors::InvalidArgument(
-                      "paddings must be non-negative: ", before, " ", after));
-      if (offset_ == 0) {  // SYMMETRIC mode.
-        OP_REQUIRES(
-            context,
-            before <= resized_shape.dim_size(d) &&
-                after <= resized_shape.dim_size(d),
-            errors::InvalidArgument("paddings must be no greater "
-                                    "than the dimension size: ",
-                                    before, ", ", after, " greater than ",
-                                    resized_shape.dim_size(d)));
-      } else if (offset_ == 1) {  // REFLECT mode.
-        OP_REQUIRES(
-            context,
-            before < resized_shape.dim_size(d) &&
-                after < resized_shape.dim_size(d),
-            errors::InvalidArgument("paddings must be less than"
-                                    " the dimension size: ",
-                                    before, ", ", after, " not less than ",
-                                    resized_shape.dim_size(d)));
-      }
-      padded_shape.AddDim(before + resized_shape.dim_size(d) + after);
-    }
-
-    OP_REQUIRES(
-        context, ((paddings_matrix(0, 0) == 0) && (paddings_matrix(0, 1) == 0)),
-        errors::InvalidArgument(
-            "Fused mirror padding only support spatial padding, not batches: ",
-            paddings.DebugString()));
-    OP_REQUIRES(
-        context, ((paddings_matrix(3, 0) == 0) && (paddings_matrix(3, 1) == 0)),
-        errors::InvalidArgument(
-            "Fused mirror padding only support spatial padding, not channels: ",
-            paddings.DebugString()));
-    const int32 top_padding = paddings_matrix(1, 0);
-    const int32 bottom_padding = paddings_matrix(1, 1);
-    const int32 left_padding = paddings_matrix(2, 0);
-    const int32 right_padding = paddings_matrix(2, 1);
-
-    // Input filter is of the following dimensions:
-    // [ filter_rows, filter_cols, in_depth, out_depth]
-    const Tensor& filter = context->input(filter_index);
-
-    // For 2D convolution, there should be 4 dimensions.
-    OP_REQUIRES(context, padded_shape.dims() == 4,
-                errors::InvalidArgument("input must be 4-dimensional",
-                                        padded_shape.DebugString()));
-    OP_REQUIRES(context, filter.dims() == 4,
-                errors::InvalidArgument("filter must be 4-dimensional: ",
-                                        filter.shape().DebugString()));
-
-    // We only check the first three dims, since the depth is accessed as an
-    // int64 below.
-    for (int i = 0; i < 3; i++) {
-      OP_REQUIRES(
-          context,
-          FastBoundsCheck(filter.dim_size(i), std::numeric_limits<int>::max()),
-          errors::InvalidArgument("filter too large"));
-    }
-
-    // The last dimension for input is in_depth. It must be the same as the
-    // filter's in_depth.
-    const int64 in_depth = padded_shape.dim_size(3);
-    OP_REQUIRES(context, in_depth == filter.dim_size(2),
-                errors::InvalidArgument(
-                    "input and filter must have the same depth: ", in_depth,
-                    " vs ", filter.dim_size(2)));
-
-    // The last dimension for filter is out_depth.
-    const int out_depth = static_cast<int>(filter.dim_size(3));
-
-    // The second dimension for input is rows/height.
-    // The first dimension for filter is rows/height.
-    const int64 padded_rows_raw = padded_shape.dim_size(1);
-    OP_REQUIRES(
-        context,
-        FastBoundsCheck(padded_rows_raw, std::numeric_limits<int>::max()),
-        errors::InvalidArgument("Input rows too large"));
-    const int padded_rows = static_cast<int>(padded_rows_raw);
-    const int filter_rows = static_cast<int>(filter.dim_size(0));
-    const int resized_rows = static_cast<int>(resized_shape.dim_size(1));
-
-    // The third dimension for input is columns/width.
-    // The second dimension for filter is columns/width.
-    const int64 padded_cols_raw = padded_shape.dim_size(2);
-    OP_REQUIRES(
-        context,
-        FastBoundsCheck(padded_cols_raw, std::numeric_limits<int>::max()),
-        errors::InvalidArgument("Input cols too large"));
-    const int padded_cols = static_cast<int>(padded_cols_raw);
-    const int filter_cols = static_cast<int>(filter.dim_size(1));
-    const int resized_cols = static_cast<int>(resized_shape.dim_size(2));
-
-    // The first dimension for input is batch.
-    const int64 batch_raw = padded_shape.dim_size(0);
-    OP_REQUIRES(context,
-                FastBoundsCheck(batch_raw, std::numeric_limits<int>::max()),
-                errors::InvalidArgument("batch is too large"));
-    const int batch = static_cast<int>(batch_raw);
-
-    // For now we take the stride from the second and third dimensions only (we
-    // do not support striding on the batch or depth dimension).
-    const int stride_rows = GetTensorDim(strides_, FORMAT_NHWC, 'H');
-    const int stride_cols = GetTensorDim(strides_, FORMAT_NHWC, 'W');
-
-    int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0;
-    OP_REQUIRES_OK(context,
-                   GetWindowedOutputSize(padded_rows, filter_rows, stride_rows,
-                                         padding_, &out_rows, &pad_rows));
-    OP_REQUIRES_OK(context,
-                   GetWindowedOutputSize(padded_cols, filter_cols, stride_cols,
-                                         padding_, &out_cols, &pad_cols));
-    TensorShape out_shape =
-        ShapeFromFormat(FORMAT_NHWC, batch, out_rows, out_cols, out_depth);
-    OP_REQUIRES(context, (out_shape.num_elements() > 0),
-                errors::InvalidArgument("Output tensor can't be empty"));
-
-    // Output tensor is of the following dimensions:
-    // [ in_batch, out_rows, out_cols, out_depth ]
-    Tensor* output = nullptr;
-    OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
-
-    VLOG(2) << "FusedConv2D: " << name() << ", in_depth = " << in_depth
-            << ", padded_cols = " << padded_cols
-            << ", resized_cols = " << resized_cols
-            << ", filter_cols = " << filter_cols
-            << ", padded_rows = " << padded_rows
-            << ", resized_rows = " << resized_rows
-            << ", filter_rows = " << filter_rows
-            << ", stride_rows = " << stride_rows
-            << ", stride_cols = " << stride_cols
-            << ", out_depth = " << out_depth << ", DoResize=" << DoResize;
-
-    // If there is nothing to compute, return.
-    if (out_shape.num_elements() == 0) {
-      return;
-    }
-    TConvFunctor conv_functor;
-    conv_functor(context, input, batch, resized_rows, resized_cols, padded_rows,
-                 padded_cols, in_depth, filter.flat<T>().data(), filter_rows,
-                 filter_cols, out_depth, stride_rows, stride_cols, padding_,
-                 output->flat<T>().data(), out_rows, out_cols, st, top_padding,
-                 bottom_padding, left_padding, right_padding, offset_);
-  }
-
- private:
-  std::vector<int32> strides_;
-  Padding padding_;
-  bool align_corners_;
-  int offset_;
-
-  TF_DISALLOW_COPY_AND_ASSIGN(FusedResizeConv2DUsingGemmOp);
-};
-
-#define REGISTER_FUSED(T)                                                 \
-  REGISTER_KERNEL_BUILDER(                                                \
-      Name("FusedResizeAndPadConv2D")                                     \
-          .Device(DEVICE_CPU)                                             \
-          .TypeConstraint<T>("T"),                                        \
-      FusedResizeConv2DUsingGemmOp<                                       \
-          T,                                                              \
-          FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \
-                                       BILINEAR>,                         \
-          true>);
-
-TF_CALL_half(REGISTER_FUSED);
-TF_CALL_float(REGISTER_FUSED);
-TF_CALL_double(REGISTER_FUSED);
-
-#define REGISTER_PAD_ONLY_FUSED(T)                                        \
-  REGISTER_KERNEL_BUILDER(                                                \
-      Name("FusedPadConv2D").Device(DEVICE_CPU).TypeConstraint<T>("T"),   \
-      FusedResizeConv2DUsingGemmOp<                                       \
-          T,                                                              \
-          FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \
-                                       NEAREST>,                          \
-          false>);
-
-TF_CALL_half(REGISTER_PAD_ONLY_FUSED);
-TF_CALL_float(REGISTER_PAD_ONLY_FUSED);
-TF_CALL_double(REGISTER_PAD_ONLY_FUSED);
-
-// Support for fusing computationally cheap, but memory bandwidth expensive
-// computations into the output of convolution to reduce the overall latency.
-//
-// Example: Fuse Conv2D+BiasAdd+Relu.
-
 namespace {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
diff --git a/tensorflow/core/kernels/conv_ops_fused_image_transform.cc b/tensorflow/core/kernels/conv_ops_fused_image_transform.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7be1de29c951dca16085e35587d02eeeec01354f
--- /dev/null
+++ b/tensorflow/core/kernels/conv_ops_fused_image_transform.cc
@@ -0,0 +1,902 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Implements convolution operations with image transformations (resize and
+// mirror padding) baked into the processing, to optimize latency and memory
+// usage.
+
+#define EIGEN_USE_THREADS
+
+#include <string>
+#include <vector>
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_slice.h"
+#include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/kernels/conv_2d.h"
+#include "tensorflow/core/kernels/conv_ops.h"
+#include "tensorflow/core/kernels/gemm_functors.h"
+#include "tensorflow/core/kernels/image_resizer_state.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/util/mirror_pad_mode.h"
+#include "tensorflow/core/util/padding.h"
+#include "tensorflow/core/util/tensor_format.h"
+
+namespace tensorflow {
+namespace {
+
+// We don't want to allocate a buffer to hold all the patches if the size is
+// going to be extremely large, so break it into chunks if it's bigger than
+// a limit. Each chunk will be processed serially, so we can refill the
+// buffer for the next chunk and reuse it, keeping maximum memory size down.
+// In this case, we've picked 16 megabytes as a reasonable limit for Android and
+// other platforms using Eigen, and 1MB for iOS devices, from experimentation.
+#if defined(__APPLE__) && defined(IS_MOBILE_PLATFORM)
+const size_t kMaxChunkSize = (1 * 1024 * 1024);
+#else
+const size_t kMaxChunkSize = (16 * 1024 * 1024);
+#endif
+const size_t kResizeCacheSize = (8 * 1024 * 1024);
+
+// Lookup method used when resizing.
+enum SamplingMode {
+  BILINEAR = 0,
+  NEAREST = 1,
+};
+
+// Simple utility function used by FusedConv to multithread basic workloads. To
+// use it, pass begin and end values for the full workload and a std::function
+// that receives a subset of that through the begin and end values for each
+// worker's task. The division of the full workload into worker tasks is handled
+// by the multithreading logic. Here's an example of how to use it:
+// std::vector<float> my_vector(100);
+// ...
+// FusedConvParallelFor(context, 0, 100,
+//   [&my_vector](int64 task_begin, int64 task_end) {
+//     for (int64 current = task_begin; current != task_end; ++current) {
+//       my_vector[current] *= 10.0f;
+//     }
+// });
+void FusedConvParallelFor(
+    OpKernelContext* context, int64 begin, int64 end,
+    const std::function<void(int64, int64)>& task_function) {
+// On iOS, the thread management imposes a very big performance penalty, so
+// just call the function directly with no multithreading.
+#if defined(__APPLE__) && defined(IS_MOBILE_PLATFORM)
+  task_function(begin, end);
+#else
+  auto& worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
+  thread::ThreadPool* thread_pool = worker_threads.workers;
+  const int64 total_elements = end - begin;
+  // This is a bit of an arbitrary number, but was found to work well for
+  // typical models we've been profiling on various devices.
+  const int64 element_cost = 10000000;
+  thread_pool->ParallelFor(
+      total_elements, element_cost,
+      [begin, task_function](int64 begin_offset, int64 end_offset) {
+        const int64 task_begin = begin + begin_offset;
+        const int64 task_end = begin + end_offset;
+        task_function(task_begin, task_end);
+      });
+#endif
+}
+
+// Holds the state needed for the resizing subtasks.
+template <class T1>
+struct ResizeTaskParameters {
+  ResizeTaskParameters() : st(false) {}
+
+  int cache_height;
+  T1* resize_cache;
+  int cache_line_width;
+  int input_width;
+  int input_depth;
+  int top_padding;
+  int pad_offset;
+  int64 resized_height;
+  ImageResizerState st;
+  const T1* input_batch_start;
+  int64 cache_start_x;
+  int64 cache_end_x;
+  int left_padding;
+  int64 resized_width;
+  int64 padded_width;
+  int64 padded_height;
+};
+
+template <class T1>
+struct PerCacheLineParameters {
+  PerCacheLineParameters() {}
+  PerCacheLineParameters(const PerCacheLineParameters<T1>& other)
+      : cache_line_start(other.cache_line_start),
+        input_top_row_start(other.input_top_row_start),
+        input_bottom_row_start(other.input_bottom_row_start),
+        y_lerp(other.y_lerp) {}
+
+  T1* cache_line_start;
+  const T1* input_top_row_start;
+  const T1* input_bottom_row_start;
+  T1 y_lerp;
+};
+
+// Helper class to simplify bilinear filtering
+template <class T1>
+struct SampleRect {
+  EIGEN_ALWAYS_INLINE SampleRect(const T1* in_top_left, const T1* in_top_right,
+                                 const T1* in_bottom_left,
+                                 const T1* in_bottom_right)
+      : top_left(in_top_left),
+        top_right(in_top_right),
+        bottom_left(in_bottom_left),
+        bottom_right(in_bottom_right) {}
+
+  EIGEN_ALWAYS_INLINE T1 BilinearSample(int channel, T1 x_lerp,
+                                        T1 y_lerp) const {
+    const T1 top =
+        top_left[channel] + (top_right[channel] - top_left[channel]) * x_lerp;
+    const T1 bottom = bottom_left[channel] +
+                      (bottom_right[channel] - bottom_left[channel]) * x_lerp;
+    return top + (bottom - top) * y_lerp;
+  }
+
+  const T1* top_left;
+  const T1* top_right;
+  const T1* bottom_left;
+  const T1* bottom_right;
+};
+
+// Calculates parameters which remain constant through a resize cache row.
+template <class T1>
+EIGEN_ALWAYS_INLINE PerCacheLineParameters<T1> CalculatePerCacheLineParameters(
+    int64 cache_height, int64 cache_y, T1* resize_cache, int64 cache_line_width,
+    int64 input_width, int64 input_depth, int64 top_padding, int64 pad_offset,
+    int64 resized_height, const ImageResizerState& st,
+    const T1* input_batch_start) {
+  PerCacheLineParameters<T1> result;
+  // The cache is organized so that the real y values of the resized image map
+  // onto the actual cache values through a modulo scheme. This means that as we
+  // progress downwards through the image, we keep reusing a small cache and so
+  // keep memory usage down.
+  int64 cache_index_y;
+  if (cache_y < 0) {
+    cache_index_y = cache_height + (cache_y % cache_height);
+  } else {
+    cache_index_y = cache_y % cache_height;
+  }
+  result.cache_line_start =
+      resize_cache + (cache_index_y * cache_line_width * input_depth);
+  // This part is implementing the mirror padding that happens before resizing.
+  float in_y = (cache_y - top_padding);
+  if (in_y < 0) {
+    in_y = -(in_y + 1.0f - pad_offset);
+  } else if (in_y >= resized_height) {
+    in_y = (resized_height * 2.0f) - (in_y + 1.0f + pad_offset);
+  }
+  // Here's where do do the actual resize.
+  in_y *= st.height_scale;
+  const int64 top_y_index = static_cast<int64>(std::floor(in_y));
+  const int64 bottom_y_index =
+      std::min(static_cast<int64>(std::ceil(in_y)), (st.in_height - 1));
+  // Lerp is used for bilinear filtering when that's needed.
+  result.y_lerp = static_cast<T1>(in_y - top_y_index);
+  // Which rows of the original input image to pull the values from.
+  result.input_top_row_start =
+      input_batch_start + (top_y_index * input_width * input_depth);
+  result.input_bottom_row_start =
+      input_batch_start + (bottom_y_index * input_width * input_depth);
+  return result;
+}
+
+template <class T1>
+struct PerCachePixelParameters {
+  PerCachePixelParameters() {}
+  PerCachePixelParameters(const PerCachePixelParameters<T1>& other)
+      : cache_line_pixel(other.cache_line_pixel),
+        left_x_index(other.left_x_index),
+        right_x_index(other.right_x_index),
+        x_lerp(other.x_lerp) {}
+
+  T1* cache_line_pixel;
+  int64 left_x_index;
+  int64 right_x_index;
+  T1 x_lerp;
+};
+
+// Pulls out common parameters used for every resized pixel.
+template <class T1>
+EIGEN_ALWAYS_INLINE PerCachePixelParameters<T1>
+CalculatePerCachePixelParameters(int64 cache_x, int64 cache_start_x,
+                                 T1* cache_line_start, int64 input_depth,
+                                 int64 left_padding, int64 pad_offset,
+                                 int64 resized_width,
+                                 const ImageResizerState& st) {
+  PerCachePixelParameters<T1> result;
+  // Figure out where we're going to store the results of our transform.
+  const int cache_index_x = cache_x - cache_start_x;
+  result.cache_line_pixel = cache_line_start + (cache_index_x * input_depth);
+  // Implement mirror padding by flipping in_x if it's off the edge.
+  float in_x = (cache_x - left_padding);
+  if (in_x < 0) {
+    in_x = -(in_x + 1.0f - pad_offset);
+  } else if (in_x >= resized_width) {
+    in_x = (resized_width * 2.0f) - (in_x + 1.0f + pad_offset);
+  }
+  // Resize the x parameters.
+  in_x *= st.width_scale;
+  // Get the x coordinates for the left and right pixels to pull from.
+  result.left_x_index = static_cast<int64>(std::floor(in_x));
+  result.right_x_index =
+      std::min(static_cast<int64>(std::ceil(in_x)), (st.in_width - 1));
+  // This x_lerp is used to blend pixels in bilinear filtering.
+  result.x_lerp = static_cast<T1>(in_x - result.left_x_index);
+  return result;
+}
+
+// Combines bilinear resizing and mirror padding into the im2col transformation
+// stage of convolution.
+template <class T1, class T2, class T3, class TGemmFunctor,
+          SamplingMode SampleMode>
+class FusedResizeAndPadConvFunctor {
+ public:
+  void operator()(OpKernelContext* context, const Tensor& input,
+                  int input_batches, int resized_height, int resized_width,
+                  int padded_height, int padded_width, int input_depth,
+                  const T2* filter_data, int filter_height, int filter_width,
+                  int filter_count, int stride_rows, int stride_cols,
+                  Padding padding, T3* output_data, int output_height,
+                  int output_width, const ImageResizerState& st,
+                  int top_padding, int bottom_padding, int left_padding,
+                  int right_padding, int pad_offset) {
+    if ((input_batches <= 0) || (padded_width <= 0) || (padded_height <= 0) ||
+        (input_depth <= 0)) {
+      LOG(WARNING) << "Conv2D was called with bad input dimensions: "
+                   << input_batches << ", " << padded_height << ", "
+                   << padded_width << ", " << input_depth;
+      return;
+    }
+    if ((filter_width <= 0) || (filter_height <= 0) || (filter_count <= 0)) {
+      LOG(WARNING) << "Conv2D was called with bad filter dimensions: "
+                   << filter_width << ", " << filter_height << ", "
+                   << filter_count;
+      return;
+    }
+    if ((output_width <= 0) || (output_height <= 0)) {
+      LOG(WARNING) << "Conv2D was called with bad output width or height: "
+                   << output_width << ", " << output_height;
+      return;
+    }
+    OP_REQUIRES(
+        context, ((SampleMode == NEAREST) || (SampleMode == BILINEAR)),
+        errors::InvalidArgument("Bad sample mode passed in", SampleMode));
+
+    // These calculations define how the patches will be positioned within the
+    // input image. The actual definitions are quite complex, and rely on the
+    // previously-calculated output size.
+    int filter_left_offset;
+    int filter_top_offset;
+    if (padding == VALID) {
+      filter_left_offset =
+          ((output_width - 1) * stride_cols + filter_width - padded_width + 1) /
+          2;
+      filter_top_offset = ((output_height - 1) * stride_rows + filter_height -
+                           padded_height + 1) /
+                          2;
+    } else {
+      filter_left_offset =
+          ((output_width - 1) * stride_cols + filter_width - padded_width) / 2;
+      filter_top_offset =
+          ((output_height - 1) * stride_rows + filter_height - padded_height) /
+          2;
+    }
+
+    ResizeTaskParameters<T1> task_params;
+    task_params.input_depth = input_depth;
+    task_params.top_padding = top_padding;
+    task_params.pad_offset = pad_offset;
+    task_params.resized_height = resized_height;
+    task_params.st = st;
+    task_params.left_padding = left_padding;
+    task_params.resized_width = resized_width;
+    task_params.padded_width = padded_width;
+    task_params.padded_height = padded_height;
+
+    // The im2col buffer has # of patches rows, and # of filters cols.
+    // It's laid out like this, in row major order in memory:
+    //        < filter value count >
+    //   ^   +---------------------+
+    // patch |                     |
+    // count |                     |
+    //   v   +---------------------+
+    // Each patch row contains a filter_width x filter_height patch of the
+    // input, with the depth channel as the most contiguous in memory, followed
+    // by the width, then the height. This is the standard memory order in the
+    // image world if it helps to visualize it.
+    const int filter_value_count = filter_width * filter_height * input_depth;
+
+    OP_REQUIRES(context, (filter_value_count * sizeof(T1)) <= kMaxChunkSize,
+                errors::InvalidArgument("Im2Col patch too large for buffer"));
+    const size_t patches_per_chunk =
+        kMaxChunkSize / (filter_value_count * sizeof(T1));
+    // Because memory allocation is very expensive on mobile platforms, try to
+    // allocate a persistent buffer that will be kept around between calls. We
+    // use TensorFlow's resource management to ensure that the memory will be
+    // released when the session is over.
+    Im2ColBufferResource<T1, kMaxChunkSize>* im2col_buffer_resource;
+    std::function<Status(Im2ColBufferResource<T1, kMaxChunkSize>**)> creator =
+        [](Im2ColBufferResource<T1, kMaxChunkSize>** resource) {
+          *resource = new Im2ColBufferResource<T1, kMaxChunkSize>();
+          return Status::OK();
+        };
+    OP_REQUIRES_OK(context, context->resource_manager()->LookupOrCreate(
+                                "Conv2d", "im2col_buffer",
+                                &im2col_buffer_resource, creator));
+
+    // Create a resize cache memory buffer that will hold the rows of
+    // transformed and mirror padded input pixels, ready to be copied
+    // into filter patches by im2col.
+    // It's laid out like this, in row major order in memory:
+    //         < cache line width >
+    //   ^    +--------------------+
+    // cache  |                    |
+    // height |                    |
+    //   v    +--------------------+
+    // Each cache row contains a cache_line_width number of resized pixels,
+    // each with input_depth channels. The cache height is typically less than
+    // the full height the resized image would be, so it's filled up
+    // incrementally as we progress downwards through the input creating im2col
+    // patches.
+    task_params.cache_start_x = -filter_left_offset;
+    task_params.cache_end_x =
+        (((output_width - 1) * stride_cols) - filter_left_offset) +
+        filter_width;
+    task_params.cache_line_width =
+        task_params.cache_end_x - task_params.cache_start_x;
+    task_params.cache_height =
+        kResizeCacheSize / (task_params.cache_line_width * input_depth);
+    const int needed_resize_cache_count =
+        filter_height * task_params.cache_line_width * input_depth;
+    OP_REQUIRES(context,
+                (needed_resize_cache_count * sizeof(T1)) <= kResizeCacheSize,
+                errors::InvalidArgument("Input too large for resize cache"));
+    Im2ColBufferResource<T1, kResizeCacheSize>* resize_cache_resource;
+    std::function<Status(Im2ColBufferResource<T1, kResizeCacheSize>**)>
+        resize_creator =
+            [](Im2ColBufferResource<T1, kResizeCacheSize>** resource) {
+              *resource = new Im2ColBufferResource<T1, kResizeCacheSize>();
+              return Status::OK();
+            };
+    OP_REQUIRES_OK(context, context->resource_manager()->LookupOrCreate(
+                                "Conv2d", "resize_cache",
+                                &resize_cache_resource, resize_creator));
+
+    // This means that multiple ops can't be run simultaneously on different
+    // threads, because we have a single shared resource. The platforms this is
+    // aimed at have intra-op parallelism as their focus though, so it shouldn't
+    // be an issue.
+    mutex_lock lock_buffer(im2col_buffer_resource->mu);
+    core::ScopedUnref unref_buffer(im2col_buffer_resource);
+    T1* im2col_buffer = im2col_buffer_resource->data;
+
+    // This buffer is used as a fairly heavy-weight cache for the resized and
+    // mirrored inputs to the im2col operation. The problem is that we want to
+    // keep the memory usage down by not rendering the fully resized and padded
+    // input tensor to the convolution into an entire buffer. The first approach
+    // to avoid this was to fold the bilinear filtering and padding spatial
+    // transformations into the im2col lookup itself. This successfully reduced
+    // memory usage, but because im2col can access an individual pixel for many
+    // different patches, the extra overhead of doing the same bilinear lookups
+    // repeatedly became too expensive.
+    // The resize cache is designed to avoid this problem by keeping a
+    // horizontal slice of the resized and padded input to the im2col
+    // precalculated, so that repeated accesses to the same pixel from different
+    // filter patches can just be copied from this cache. It's organized as a
+    // horizontal slice stretching across the whole virtual image, and as high
+    // as the filter window, so that as the patch processing moves across all
+    // the pixels are present, and before a new row of patches is started any
+    // previously calculated rows that are needed are maintained, with new rows
+    // calculated as required.
+    mutex_lock resize_lock_buffer(resize_cache_resource->mu);
+    core::ScopedUnref unref_resized_cache(resize_cache_resource);
+    task_params.resize_cache = resize_cache_resource->data;
+
+    const T1* input_data = input.flat<T1>().data();
+    const int64 input_height = input.shape().dim_sizes()[1];
+    task_params.input_width = input.shape().dim_sizes()[2];
+
+    int end_cached_lines = std::numeric_limits<int>::min();
+
+    for (int batch = 0; batch < input_batches; ++batch) {
+      task_params.input_batch_start =
+          input_data +
+          (batch * input_height * task_params.input_width * input_depth);
+      const int in_y_end =
+          ((output_height * stride_rows) - filter_top_offset) + filter_height;
+      for (int out_y = 0; out_y < output_height; ++out_y) {
+        const int in_y_origin = (out_y * stride_rows) - filter_top_offset;
+        const int cache_start_y = std::max(in_y_origin, end_cached_lines);
+        const int cache_end_y = std::min(
+            in_y_end, std::max((in_y_origin + task_params.cache_height),
+                               end_cached_lines));
+        if (end_cached_lines < (in_y_origin + filter_height)) {
+          // This call breaks up the work required for calculating the mirror
+          // padding and resizing across multiple threads.
+          FusedConvParallelFor(
+              context, cache_start_y, cache_end_y,
+              [task_params](int64 task_cache_start_y, int64 task_cache_end_y) {
+                // This is a long and confusing function, but it's been laid out
+                // this way to help with performance on some intensive models.
+                // What it's doing is populating a cache of the original input
+                // image, after it's been bilinear resized and had its edges
+                // mirrored. This allows the following im2col code to access the
+                // transformed pixels from this cache, without having to
+                // repeatedly apply the expensive bilinear calculations as the
+                // same pixels are accessed by different patches.
+                // This is most effective when the stride is small and the
+                // filter size is large, since that's when pixels are reused
+                // most frequently as patches overlap.
+                for (int cache_y = task_cache_start_y;
+                     cache_y < task_cache_end_y; ++cache_y) {
+                  // We organize the cache as a series of rows, each containing
+                  // all the transformed pixels for a given line in the image.
+                  // This cache is big enough to hold at least a filter's height
+                  // worth of rows, but typically more, limited by the size of
+                  // the cache buffer.
+                  // We don't allocate an entire image's worth of rows though,
+                  // because we're trying to keep memory usage down, so as we
+                  // progress downwards through the im2col we periodically
+                  // refresh the cache so that the next lines that are needed
+                  // for that operation are always present.
+                  // Work out the parameters that remain constant across the
+                  // row we're calculating.
+                  PerCacheLineParameters<T1> line_params(
+                      CalculatePerCacheLineParameters<T1>(
+                          task_params.cache_height, cache_y,
+                          task_params.resize_cache,
+                          task_params.cache_line_width, task_params.input_width,
+                          task_params.input_depth, task_params.top_padding,
+                          task_params.pad_offset, task_params.resized_height,
+                          task_params.st, task_params.input_batch_start));
+                  // Iterate through the resize cache row we're filling in.
+                  for (int cache_x = task_params.cache_start_x;
+                       cache_x < task_params.cache_end_x; ++cache_x) {
+                    // Figure out what we need for the cache pixel we're
+                    // populating.
+                    PerCachePixelParameters<T1> pixel_params(
+                        CalculatePerCachePixelParameters<T1>(
+                            cache_x, task_params.cache_start_x,
+                            line_params.cache_line_start,
+                            task_params.input_depth, task_params.left_padding,
+                            task_params.pad_offset, task_params.resized_width,
+                            task_params.st));
+                    // If the access is off the left, right, top, or bottom of
+                    // the resized image, the conv padding means we should set
+                    // it to zero.
+                    if ((cache_x < 0) ||
+                        (cache_x >= task_params.padded_width) ||
+                        (cache_y < 0) ||
+                        (cache_y >= task_params.padded_height)) {
+                      std::fill_n(pixel_params.cache_line_pixel,
+                                  task_params.input_depth, T1(0));
+                    } else {
+                      // There are two different sampling strategies for
+                      // resizing. When using nearest, we can just do a
+                      // straight copy of the pixel closest to our sample point,
+                      // but bilinear requires a more complex calculation.
+                      if (SampleMode == NEAREST) {
+                        const T1* input_top_left_pixel =
+                            line_params.input_top_row_start +
+                            (pixel_params.left_x_index *
+                             task_params.input_depth);
+
+                        std::copy_n(input_top_left_pixel,
+                                    task_params.input_depth,
+                                    pixel_params.cache_line_pixel);
+                      } else {
+                        const SampleRect<T1> rect(
+                            line_params.input_top_row_start +
+                                (pixel_params.left_x_index *
+                                 task_params.input_depth),
+                            line_params.input_top_row_start +
+                                (pixel_params.right_x_index *
+                                 task_params.input_depth),
+                            line_params.input_bottom_row_start +
+                                (pixel_params.left_x_index *
+                                 task_params.input_depth),
+                            line_params.input_bottom_row_start +
+                                (pixel_params.right_x_index *
+                                 task_params.input_depth));
+                        for (int in_channel = 0;
+                             in_channel < task_params.input_depth;
+                             ++in_channel) {
+                          pixel_params.cache_line_pixel[in_channel] =
+                              rect.BilinearSample(in_channel,
+                                                  pixel_params.x_lerp,
+                                                  line_params.y_lerp);
+                        }
+                      }
+                    }
+                  }
+                }
+              });
+          end_cached_lines = cache_end_y;
+        }
+        for (int out_x = 0; out_x < output_width; ++out_x) {
+          const int in_x_origin = (out_x * stride_cols) - filter_left_offset;
+          const int patch_index = (batch * output_width * output_height) +
+                                  (out_y * output_width) + out_x;
+          const int patch_index_within_chunk = patch_index % patches_per_chunk;
+          T1* im2col_patch_start =
+              im2col_buffer + (patch_index_within_chunk * filter_value_count);
+          for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+            T1* im2col_row_start =
+                im2col_patch_start +
+                (filter_y * filter_width * task_params.input_depth);
+            const int conv_in_y = in_y_origin + filter_y;
+            int cache_index_y;
+            if (conv_in_y < 0) {
+              cache_index_y = task_params.cache_height +
+                              (conv_in_y % task_params.cache_height);
+            } else {
+              cache_index_y = conv_in_y % task_params.cache_height;
+            }
+            T1* cache_line_start =
+                task_params.resize_cache +
+                (cache_index_y * task_params.cache_line_width *
+                 task_params.input_depth);
+            T1* cache_filter_row_start =
+                cache_line_start + ((in_x_origin - task_params.cache_start_x) *
+                                    task_params.input_depth);
+            std::copy_n(cache_filter_row_start,
+                        (filter_width * task_params.input_depth),
+                        im2col_row_start);
+          }
+          const bool is_last_in_chunk =
+              (patch_index_within_chunk == (patches_per_chunk - 1));
+          const bool is_last_overall =
+              ((batch == (input_batches - 1)) &&
+               (out_y == (output_height - 1)) && (out_x == (output_width - 1)));
+          if (is_last_in_chunk || is_last_overall) {
+            // Now we've assembled a set of image patches into a matrix, apply
+            // a GEMM matrix multiply of the patches as rows, times the filter
+            // weights in columns, to get partial results in the output
+            // matrix.
+            const int how_many_patches = patch_index_within_chunk + 1;
+            const int m = how_many_patches;
+            const int n = filter_count;
+            const int k = filter_value_count;
+            const int lda = filter_value_count;
+            const int ldb = filter_count;
+            const int ldc = filter_count;
+            const size_t start_patch_index =
+                patch_index - (how_many_patches - 1);
+            T3* chunk_output_data =
+                output_data + (start_patch_index * filter_count);
+            TGemmFunctor gemm_functor;
+            gemm_functor(context, m, n, k, im2col_buffer, lda, filter_data, ldb,
+                         chunk_output_data, ldc);
+          }
+        }
+      }
+    }
+  }
+};
+
+}  // namespace
+
+// Implements a version of convolution with bilinear resizing and mirror padding
+// included.
+template <class T, class TConvFunctor, bool DoResize>
+class FusedResizeConv2DUsingGemmOp : public OpKernel {
+ public:
+  explicit FusedResizeConv2DUsingGemmOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    if (DoResize) {
+      OP_REQUIRES_OK(context,
+                     context->GetAttr("resize_align_corners", &align_corners_));
+    }
+    MirrorPadMode mode;
+    OP_REQUIRES_OK(context, context->GetAttr("mode", &mode));
+
+    switch (mode) {
+      case MirrorPadMode::SYMMETRIC: {
+        offset_ = 0;
+        break;
+      }
+      case MirrorPadMode::REFLECT: {
+        offset_ = 1;
+        break;
+      }
+      default:
+        OP_REQUIRES(context, false,
+                    errors::InvalidArgument(
+                        "mode must be either REFLECT or SYMMETRIC."));
+    }
+    OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
+    OP_REQUIRES(context, strides_.size() == 4,
+                errors::InvalidArgument("Sliding window strides field must "
+                                        "specify 4 dimensions"));
+    const int64 stride_n = GetTensorDim(strides_, FORMAT_NHWC, 'N');
+    const int64 stride_c = GetTensorDim(strides_, FORMAT_NHWC, 'C');
+    OP_REQUIRES(
+        context, stride_n == 1 && stride_c == 1,
+        errors::InvalidArgument("Current implementation does not yet support "
+                                "strides in the batch and depth dimensions."));
+    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    // Input tensor is of the following dimensions:
+    // [ batch, in_rows, in_cols, in_depth ]
+    const Tensor& input = context->input(0);
+    OP_REQUIRES(context, (input.shape().num_elements() > 0),
+                errors::InvalidArgument("Input tensor can't be empty"));
+
+    ImageResizerState st(false);
+    if (DoResize) {
+      st = ImageResizerState(align_corners_);
+      st.ValidateAndCalculateOutputSize(context, input);
+      if (!context->status().ok()) return;
+    } else {
+      // Set up the resize parameters to do no scaling at all.
+      st.batch_size = input.dim_size(0);
+      st.out_height = input.dim_size(1);
+      st.out_width = input.dim_size(2);
+      st.in_height = input.dim_size(1);
+      st.in_width = input.dim_size(2);
+      st.channels = input.dim_size(3);
+      st.height_scale = 1.0f;
+      st.width_scale = 1.0f;
+    }
+    TensorShape resized_shape(
+        {input.dim_size(0), st.out_height, st.out_width, input.dim_size(3)});
+    int paddings_index;
+    int filter_index;
+    if (DoResize) {
+      paddings_index = 2;
+      filter_index = 3;
+    } else {
+      paddings_index = 1;
+      filter_index = 2;
+    }
+    const Tensor& paddings = context->input(paddings_index);
+
+    const int dims = resized_shape.dims();
+    OP_REQUIRES(
+        context,
+        TensorShapeUtils::IsMatrix(paddings.shape()) &&
+            paddings.dim_size(1) == 2,
+        errors::InvalidArgument("paddings must be a matrix with 2 columns: ",
+                                paddings.shape().DebugString()));
+    const int fixed_dims =
+        (allow_legacy_scalars() && dims == 0 && paddings.dim_size(0) == 1)
+            ? 1
+            : dims;
+    OP_REQUIRES(
+        context, fixed_dims == paddings.dim_size(0),
+        errors::InvalidArgument(
+            "The first dimension of paddings must be the rank of inputs: ",
+            fixed_dims, " ", paddings.shape().DebugString(), " ",
+            resized_shape.DebugString()));
+    OP_REQUIRES(
+        context, dims == paddings.dim_size(0),
+        errors::InvalidArgument(
+            "The first dimension of paddings must be the rank of inputs: ",
+            dims, " ", paddings.shape().DebugString(), " ",
+            resized_shape.DebugString()));
+
+    OP_REQUIRES(
+        context, dims == 4,
+        errors::InvalidArgument(
+            "Fused mirror padding only supports four-dimensional inputs, but ",
+            dims, " requested"));
+
+    // Compute the shape of the output tensor, and allocate it.
+    TensorShape padded_shape;
+    TTypes<int32>::ConstMatrix paddings_matrix = paddings.matrix<int32>();
+    for (int d = 0; d < dims; ++d) {
+      const int32 before =
+          paddings_matrix(d, 0);  // Pad before existing elements.
+      const int32 after =
+          paddings_matrix(d, 1);  // Pad after existing elements.
+      OP_REQUIRES(context, before >= 0 && after >= 0,
+                  errors::InvalidArgument(
+                      "paddings must be non-negative: ", before, " ", after));
+      if (offset_ == 0) {  // SYMMETRIC mode.
+        OP_REQUIRES(
+            context,
+            before <= resized_shape.dim_size(d) &&
+                after <= resized_shape.dim_size(d),
+            errors::InvalidArgument("paddings must be no greater "
+                                    "than the dimension size: ",
+                                    before, ", ", after, " greater than ",
+                                    resized_shape.dim_size(d)));
+      } else if (offset_ == 1) {  // REFLECT mode.
+        OP_REQUIRES(
+            context,
+            before < resized_shape.dim_size(d) &&
+                after < resized_shape.dim_size(d),
+            errors::InvalidArgument("paddings must be less than"
+                                    " the dimension size: ",
+                                    before, ", ", after, " not less than ",
+                                    resized_shape.dim_size(d)));
+      }
+      padded_shape.AddDim(before + resized_shape.dim_size(d) + after);
+    }
+
+    OP_REQUIRES(
+        context, ((paddings_matrix(0, 0) == 0) && (paddings_matrix(0, 1) == 0)),
+        errors::InvalidArgument(
+            "Fused mirror padding only support spatial padding, not batches: ",
+            paddings.DebugString()));
+    OP_REQUIRES(
+        context, ((paddings_matrix(3, 0) == 0) && (paddings_matrix(3, 1) == 0)),
+        errors::InvalidArgument(
+            "Fused mirror padding only support spatial padding, not channels: ",
+            paddings.DebugString()));
+    const int32 top_padding = paddings_matrix(1, 0);
+    const int32 bottom_padding = paddings_matrix(1, 1);
+    const int32 left_padding = paddings_matrix(2, 0);
+    const int32 right_padding = paddings_matrix(2, 1);
+
+    // Input filter is of the following dimensions:
+    // [ filter_rows, filter_cols, in_depth, out_depth]
+    const Tensor& filter = context->input(filter_index);
+
+    // For 2D convolution, there should be 4 dimensions.
+    OP_REQUIRES(context, padded_shape.dims() == 4,
+                errors::InvalidArgument("input must be 4-dimensional",
+                                        padded_shape.DebugString()));
+    OP_REQUIRES(context, filter.dims() == 4,
+                errors::InvalidArgument("filter must be 4-dimensional: ",
+                                        filter.shape().DebugString()));
+
+    // We only check the first three dims, since the depth is accessed as an
+    // int64 below.
+    for (int i = 0; i < 3; i++) {
+      OP_REQUIRES(
+          context,
+          FastBoundsCheck(filter.dim_size(i), std::numeric_limits<int>::max()),
+          errors::InvalidArgument("filter too large"));
+    }
+
+    // The last dimension for input is in_depth. It must be the same as the
+    // filter's in_depth.
+    const int64 in_depth = padded_shape.dim_size(3);
+    OP_REQUIRES(context, in_depth == filter.dim_size(2),
+                errors::InvalidArgument(
+                    "input and filter must have the same depth: ", in_depth,
+                    " vs ", filter.dim_size(2)));
+
+    // The last dimension for filter is out_depth.
+    const int out_depth = static_cast<int>(filter.dim_size(3));
+
+    // The second dimension for input is rows/height.
+    // The first dimension for filter is rows/height.
+    const int64 padded_rows_raw = padded_shape.dim_size(1);
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(padded_rows_raw, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("Input rows too large"));
+    const int padded_rows = static_cast<int>(padded_rows_raw);
+    const int filter_rows = static_cast<int>(filter.dim_size(0));
+    const int resized_rows = static_cast<int>(resized_shape.dim_size(1));
+
+    // The third dimension for input is columns/width.
+    // The second dimension for filter is columns/width.
+    const int64 padded_cols_raw = padded_shape.dim_size(2);
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(padded_cols_raw, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("Input cols too large"));
+    const int padded_cols = static_cast<int>(padded_cols_raw);
+    const int filter_cols = static_cast<int>(filter.dim_size(1));
+    const int resized_cols = static_cast<int>(resized_shape.dim_size(2));
+
+    // The first dimension for input is batch.
+    const int64 batch_raw = padded_shape.dim_size(0);
+    OP_REQUIRES(context,
+                FastBoundsCheck(batch_raw, std::numeric_limits<int>::max()),
+                errors::InvalidArgument("batch is too large"));
+    const int batch = static_cast<int>(batch_raw);
+
+    // For now we take the stride from the second and third dimensions only (we
+    // do not support striding on the batch or depth dimension).
+    const int stride_rows = GetTensorDim(strides_, FORMAT_NHWC, 'H');
+    const int stride_cols = GetTensorDim(strides_, FORMAT_NHWC, 'W');
+
+    int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0;
+    OP_REQUIRES_OK(context,
+                   GetWindowedOutputSize(padded_rows, filter_rows, stride_rows,
+                                         padding_, &out_rows, &pad_rows));
+    OP_REQUIRES_OK(context,
+                   GetWindowedOutputSize(padded_cols, filter_cols, stride_cols,
+                                         padding_, &out_cols, &pad_cols));
+    TensorShape out_shape =
+        ShapeFromFormat(FORMAT_NHWC, batch, out_rows, out_cols, out_depth);
+    OP_REQUIRES(context, (out_shape.num_elements() > 0),
+                errors::InvalidArgument("Output tensor can't be empty"));
+
+    // Output tensor is of the following dimensions:
+    // [ in_batch, out_rows, out_cols, out_depth ]
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
+
+    VLOG(2) << "FusedConv2D: " << name() << ", in_depth = " << in_depth
+            << ", padded_cols = " << padded_cols
+            << ", resized_cols = " << resized_cols
+            << ", filter_cols = " << filter_cols
+            << ", padded_rows = " << padded_rows
+            << ", resized_rows = " << resized_rows
+            << ", filter_rows = " << filter_rows
+            << ", stride_rows = " << stride_rows
+            << ", stride_cols = " << stride_cols
+            << ", out_depth = " << out_depth << ", DoResize=" << DoResize;
+
+    // If there is nothing to compute, return.
+    if (out_shape.num_elements() == 0) {
+      return;
+    }
+    TConvFunctor conv_functor;
+    conv_functor(context, input, batch, resized_rows, resized_cols, padded_rows,
+                 padded_cols, in_depth, filter.flat<T>().data(), filter_rows,
+                 filter_cols, out_depth, stride_rows, stride_cols, padding_,
+                 output->flat<T>().data(), out_rows, out_cols, st, top_padding,
+                 bottom_padding, left_padding, right_padding, offset_);
+  }
+
+ private:
+  std::vector<int32> strides_;
+  Padding padding_;
+  bool align_corners_;
+  int offset_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(FusedResizeConv2DUsingGemmOp);
+};
+
+#define REGISTER_FUSED(T)                                                 \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("FusedResizeAndPadConv2D")                                     \
+          .Device(DEVICE_CPU)                                             \
+          .TypeConstraint<T>("T"),                                        \
+      FusedResizeConv2DUsingGemmOp<                                       \
+          T,                                                              \
+          FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \
+                                       BILINEAR>,                         \
+          true>);
+
+TF_CALL_half(REGISTER_FUSED);
+TF_CALL_float(REGISTER_FUSED);
+TF_CALL_double(REGISTER_FUSED);
+
+#define REGISTER_PAD_ONLY_FUSED(T)                                        \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("FusedPadConv2D").Device(DEVICE_CPU).TypeConstraint<T>("T"),   \
+      FusedResizeConv2DUsingGemmOp<                                       \
+          T,                                                              \
+          FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \
+                                       NEAREST>,                          \
+          false>);
+
+TF_CALL_half(REGISTER_PAD_ONLY_FUSED);
+TF_CALL_float(REGISTER_PAD_ONLY_FUSED);
+TF_CALL_double(REGISTER_PAD_ONLY_FUSED);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/conv_ops_gpu.h b/tensorflow/core/kernels/conv_ops_gpu.h
index 21d135decdd459fc9bb6551f00ee5b6f546d2540..7a67658c4d88b9a5dc66635527f97719773e6f83 100644
--- a/tensorflow/core/kernels/conv_ops_gpu.h
+++ b/tensorflow/core/kernels/conv_ops_gpu.h
@@ -27,19 +27,19 @@ limitations under the License.
 
 namespace tensorflow {
 
-// Get the Cudnn workspace limit from the environment variable, which is in MB.
+// Get the Dnn workspace limit from the environment variable, which is in MB.
 // Return the workspace memory limit in bytes. If no value is set, return the
 // default value.
-int64 GetCudnnWorkspaceLimit(const string& envvar_in_mb,
-                             int64 default_value_in_bytes);
+int64 GetDnnWorkspaceLimit(const string& envvar_in_mb,
+                           int64 default_value_in_bytes);
 
 // A class to provide scratch-space allocator for Stream-Executor Cudnn
 // callback. TensorFlow is responsible for releasing the temporary buffers after
 // the kernel finishes.
-class CudnnScratchAllocator : public se::ScratchAllocator {
+class DnnScratchAllocator : public se::ScratchAllocator {
  public:
-  virtual ~CudnnScratchAllocator() {}
-  CudnnScratchAllocator(int64 memory_limit, OpKernelContext* context)
+  virtual ~DnnScratchAllocator() {}
+  DnnScratchAllocator(int64 memory_limit, OpKernelContext* context)
       : memory_limit_(memory_limit), total_byte_size_(0), context_(context) {}
   int64 GetMemoryLimitInBytes(se::Stream* stream) override {
     return memory_limit_;
diff --git a/tensorflow/core/kernels/cwise_op_arg.cc b/tensorflow/core/kernels/cwise_op_arg.cc
index 62ffa0718ff5287167c702841ff00511da4866b5..ea659facdc4eb5605ad6327e3c073c47eefedeec 100644
--- a/tensorflow/core/kernels/cwise_op_arg.cc
+++ b/tensorflow/core/kernels/cwise_op_arg.cc
@@ -26,9 +26,7 @@ namespace tensorflow {
 REGISTER_COMPLEX(CPU, float, complex64);
 REGISTER_COMPLEX(CPU, double, complex128);
 
-// TODO: Enable GPU support for angle op after resolving
-// build failures on GPU (See #10643 for context).
-#if 0 && GOOGLE_CUDA
+#if GOOGLE_CUDA
 REGISTER_COMPLEX(GPU, float, complex64);
 REGISTER_COMPLEX(GPU, double, complex128);
 #endif
diff --git a/tensorflow/core/kernels/cwise_op_gpu_arg.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_arg.cu.cc
index 9b3f8200bd77d3179700c1abcc0b9a74484f3f52..34028e936e483035c1d410502252261b3e424ec9 100644
--- a/tensorflow/core/kernels/cwise_op_gpu_arg.cu.cc
+++ b/tensorflow/core/kernels/cwise_op_gpu_arg.cu.cc
@@ -13,9 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// TODO: Enable GPU support for angle op after resolving
-// build failures on GPU (See #10643 for context).
-#if 0 && GOOGLE_CUDA
+#if GOOGLE_CUDA
 
 #include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
 
diff --git a/tensorflow/core/kernels/cwise_op_squared_difference.cc b/tensorflow/core/kernels/cwise_op_squared_difference.cc
index 78fefc69c776e2f7b7c44c941e0a1afefdbaf143..d0ff271df6ad0475b970b7303292c8f7ea14396e 100644
--- a/tensorflow/core/kernels/cwise_op_squared_difference.cc
+++ b/tensorflow/core/kernels/cwise_op_squared_difference.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER5(BinaryOp, CPU, "SquaredDifference", functor::squared_difference,
-          float, Eigen::half, double, int32, int64);
+REGISTER7(BinaryOp, CPU, "SquaredDifference", functor::squared_difference,
+          float, Eigen::half, double, int32, int64, complex64, complex128);
 #if GOOGLE_CUDA
 REGISTER4(BinaryOp, GPU, "SquaredDifference", functor::squared_difference,
           float, Eigen::half, double, int64);
diff --git a/tensorflow/core/kernels/cwise_ops.h b/tensorflow/core/kernels/cwise_ops.h
index 313def9a75fd7d5c796b59fce474119731d15d53..a22d76717a50e0869d38b77f0ec7f0cc46f8c7ac 100644
--- a/tensorflow/core/kernels/cwise_ops.h
+++ b/tensorflow/core/kernels/cwise_ops.h
@@ -29,6 +29,28 @@ limitations under the License.
 namespace Eigen {
 namespace internal {
 
+#if GOOGLE_CUDA
+template <>
+struct scalar_arg_op<std::complex<float>> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_arg_op)
+  typedef typename Eigen::NumTraits<std::complex<float>>::Real result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const float operator()(
+      const std::complex<float>& a) const {
+    return ::atan2f(a.imag(), a.real());
+  }
+};
+
+template <>
+struct scalar_arg_op<std::complex<double>> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_arg_op)
+  typedef typename Eigen::NumTraits<std::complex<double>>::Real result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const double operator()(
+      const std::complex<double>& a) const {
+    return ::atan2(a.imag(), a.real());
+  }
+};
+#endif
+
 template <typename T>
 struct scalar_asinh_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_asinh_op)
@@ -296,27 +318,32 @@ struct less_equal : std::binary_function<T, T, bool> {
   }
 };
 
-// Functor that enables composition of multiple Eigen functors.
-template <typename Scalar, typename UnaryFunctor, typename BinaryFunctor>
-struct scalar_compose_op {
+// Functor that enables squared difference functor.
+template <typename Scalar>
+struct scalar_squared_difference_op {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar
   operator()(const Scalar& a, const Scalar& b) const {
-    return UnaryFunctor()(BinaryFunctor()(a, b));
+    const Scalar v = scalar_difference_op<Scalar>()(a, b);
+    return scalar_product_op<Scalar>()(v, scalar_conjugate_op<Scalar>()(v));
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet
   packetOp(const Packet& a, const Packet& b) const {
-    return UnaryFunctor().packetOp(BinaryFunctor().packetOp(a, b));
+    const Packet v = scalar_difference_op<Scalar>().packetOp(a, b);
+    return scalar_product_op<Scalar>().packetOp(
+        v, scalar_conjugate_op<Scalar>().packetOp(v));
   }
 };
 
-template <typename Scalar, typename UnaryFunctor, typename BinaryFunctor>
-struct functor_traits<scalar_compose_op<Scalar, UnaryFunctor, BinaryFunctor>> {
+template <typename Scalar>
+struct functor_traits<scalar_squared_difference_op<Scalar>> {
   enum {
-    Cost = functor_traits<UnaryFunctor>::Cost +
-           functor_traits<BinaryFunctor>::Cost,
-    PacketAccess = functor_traits<UnaryFunctor>::PacketAccess &&
-                   functor_traits<BinaryFunctor>::PacketAccess
+    Cost = functor_traits<scalar_difference_op<Scalar>>::Cost +
+           functor_traits<scalar_conjugate_op<Scalar>>::Cost +
+           functor_traits<scalar_product_op<Scalar>>::Cost,
+    PacketAccess = functor_traits<scalar_difference_op<Scalar>>::PacketAccess &&
+                   functor_traits<scalar_conjugate_op<Scalar>>::PacketAccess &&
+                   functor_traits<scalar_product_op<Scalar>>::PacketAccess
   };
 };
 
@@ -775,7 +802,7 @@ struct rint : base<T, scalar_rint_op<T>> {};
 // pow(x, y) = x ^ y
 // maximum(x, y) = x > y ? x : y
 // minimum(x, y) = x < y ? x : y
-// squared_difference(x, y) = (x - y) * (x - y)
+// squared_difference(x, y) = conj(x - y) * (x - y)
 
 template <typename T>
 struct add : base<T, Eigen::internal::scalar_sum_op<T>> {
@@ -885,9 +912,7 @@ struct atan2 : base<T, scalar_atan2_op<T>> {};
 
 template <typename T>
 struct squared_difference
-    : base<T, Eigen::internal::scalar_compose_op<
-                  T, Eigen::internal::scalar_square_op<T>,
-                  Eigen::internal::scalar_difference_op<T>>> {};
+    : base<T, Eigen::internal::scalar_squared_difference_op<T>> {};
 
 template <typename T>
 struct xdivy : base<T, Eigen::internal::xdivy_op<T>> {};
diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD
index f1eeda20d301e70c336632bae5e98581a3b3772c..e2ab77632da4830f63d63c95c6ace5465fb46b9e 100644
--- a/tensorflow/core/kernels/data/BUILD
+++ b/tensorflow/core/kernels/data/BUILD
@@ -13,16 +13,6 @@ load(
     "tf_cc_test",
 )
 
-tf_kernel_library(
-    name = "stats_aggregator_ops",
-    srcs = ["stats_aggregator_ops.cc"],
-    deps = [
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:protos_all_cc",
-    ],
-)
-
 # TODO(mrry): Remove this empty forwarding library.
 cc_library(
     name = "dataset",
@@ -139,17 +129,6 @@ tf_kernel_library(
     ],
 )
 
-tf_kernel_library(
-    name = "slide_dataset_op",
-    srcs = ["slide_dataset_op.cc"],
-    deps = [
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
 tf_kernel_library(
     name = "padded_batch_dataset_op",
     srcs = ["padded_batch_dataset_op.cc"],
@@ -161,44 +140,6 @@ tf_kernel_library(
     ],
 )
 
-tf_kernel_library(
-    name = "dense_to_sparse_batch_dataset_op",
-    srcs = ["dense_to_sparse_batch_dataset_op.cc"],
-    deps = [
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "group_by_reducer_dataset_op",
-    srcs = ["group_by_reducer_dataset_op.cc"],
-    deps = [
-        ":captured_function",
-        "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "group_by_window_dataset_op",
-    srcs = ["group_by_window_dataset_op.cc"],
-    deps = [
-        ":captured_function",
-        ":window_dataset",
-        "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
 tf_kernel_library(
     name = "filter_dataset_op",
     srcs = ["filter_dataset_op.cc"],
@@ -238,21 +179,6 @@ tf_kernel_library(
     ],
 )
 
-tf_kernel_library(
-    name = "map_and_batch_dataset_op",
-    srcs = ["map_and_batch_dataset_op.cc"],
-    deps = [
-        ":captured_function",
-        ":dataset_utils",
-        "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-        "//tensorflow/core/kernels:inplace_ops",
-    ],
-)
-
 cc_library(
     name = "parallel_map_iterator",
     srcs = ["parallel_map_iterator.cc"],
@@ -267,16 +193,6 @@ cc_library(
     ],
 )
 
-tf_kernel_library(
-    name = "parse_example_dataset_op",
-    srcs = ["parse_example_dataset_op.cc"],
-    deps = [
-        ":parallel_map_iterator",
-        "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:framework",
-    ],
-)
-
 tf_kernel_library(
     name = "parallel_map_dataset_op",
     srcs = ["parallel_map_dataset_op.cc"],
@@ -307,19 +223,6 @@ tf_kernel_library(
     ],
 )
 
-tf_kernel_library(
-    name = "scan_dataset_op",
-    srcs = ["scan_dataset_op.cc"],
-    deps = [
-        ":captured_function",
-        "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
 tf_kernel_library(
     name = "flat_map_dataset_op",
     srcs = ["flat_map_dataset_op.cc"],
@@ -359,7 +262,6 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:protos_all_cc",
     ],
 )
 
@@ -430,39 +332,6 @@ tf_kernel_library(
     ],
 )
 
-tf_kernel_library(
-    name = "stats_dataset_ops",
-    srcs = ["stats_dataset_ops.cc"],
-    deps = [
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-        "//tensorflow/core:protos_all_cc",
-    ],
-)
-
-tf_kernel_library(
-    name = "stats_aggregator_dataset_op",
-    srcs = ["stats_aggregator_dataset_op.cc"],
-    deps = [
-        "//tensorflow/core:core_cpu_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "random_dataset_op",
-    srcs = ["random_dataset_op.cc"],
-    deps = [
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
 tf_kernel_library(
     name = "range_dataset_op",
     srcs = ["range_dataset_op.cc"],
@@ -506,17 +375,6 @@ tf_kernel_library(
     ],
 )
 
-tf_kernel_library(
-    name = "tensor_queue_dataset_op",
-    srcs = ["tensor_queue_dataset_op.cc"],
-    deps = [
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
 tf_kernel_library(
     name = "tensor_slice_dataset_op",
     srcs = ["tensor_slice_dataset_op.cc"],
@@ -527,17 +385,6 @@ tf_kernel_library(
     ],
 )
 
-tf_kernel_library(
-    name = "unbatch_dataset_op",
-    srcs = ["unbatch_dataset_op.cc"],
-    deps = [
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
 tf_kernel_library(
     name = "zip_dataset_op",
     srcs = ["zip_dataset_op.cc"],
@@ -571,20 +418,6 @@ tf_kernel_library(
     ],
 )
 
-tf_kernel_library(
-    name = "sql_dataset_ops",
-    srcs = [
-        "sql_dataset_ops.cc",
-    ],
-    deps = [
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-        "//tensorflow/core/kernels/data/sql",
-    ],
-)
-
 tf_kernel_library(
     name = "iterator_ops",
     srcs = ["iterator_ops.cc"],
@@ -655,7 +488,6 @@ tf_kernel_library(
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:graph_view",
@@ -664,6 +496,7 @@ tf_kernel_library(
         "//tensorflow/core/grappler/clusters:virtual_cluster",
         "//tensorflow/core/grappler/optimizers:meta_optimizer",
         "//tensorflow/core/grappler/optimizers/data",
+        "//tensorflow/core/grappler/optimizers/data:function_utils",
         "//tensorflow/core/grappler/optimizers/data:graph_utils",
     ],
 )
@@ -700,16 +533,12 @@ tf_kernel_library(
         ":cache_dataset_ops",
         ":concatenate_dataset_op",
         ":dataset_ops",
-        ":dense_to_sparse_batch_dataset_op",
         ":filter_by_component_dataset_op",
         ":filter_dataset_op",
         ":flat_map_dataset_op",
         ":generator_dataset_op",
-        ":group_by_reducer_dataset_op",
-        ":group_by_window_dataset_op",
         ":interleave_dataset_op",
         ":iterator_ops",
-        ":map_and_batch_dataset_op",
         ":map_dataset_op",
         ":map_defun_op",
         ":model_dataset_op",
@@ -719,45 +548,22 @@ tf_kernel_library(
         ":padded_batch_dataset_op",
         ":parallel_interleave_dataset_op",
         ":parallel_map_dataset_op",
-        ":parse_example_dataset_op",
         ":prefetch_dataset_op",
-        ":random_dataset_op",
         ":range_dataset_op",
         ":reader_dataset_ops",
         ":repeat_dataset_op",
-        ":scan_dataset_op",
         ":shuffle_dataset_op",
         ":skip_dataset_op",
-        ":slide_dataset_op",
         ":sparse_tensor_slice_dataset_op",
-        ":sql_dataset_ops",
-        ":stats_aggregator_dataset_op",
-        ":stats_aggregator_ops",
-        ":stats_dataset_ops",
         ":take_dataset_op",
         ":tensor_dataset_op",
-        ":tensor_queue_dataset_op",
         ":tensor_slice_dataset_op",
-        ":unbatch_dataset_op",
         ":window_dataset_op",
-        ":writer_ops",
         ":zip_dataset_op",
         "//tensorflow/core/kernels/data/experimental:dataset_kernels",
     ],
 )
 
-tf_kernel_library(
-    name = "writer_ops",
-    srcs = ["writer_ops.cc"],
-    deps = [
-        ":dataset_utils",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-        "//tensorflow/core/kernels:ops_util",
-    ],
-)
-
 tf_kernel_library(
     name = "map_defun_op",
     srcs = ["map_defun_op.cc"],
diff --git a/tensorflow/core/kernels/data/batch_dataset_op.cc b/tensorflow/core/kernels/data/batch_dataset_op.cc
index 41b04346ebdd20dedd00f0a9575e349dc6403e03..1f8d2bdbae897e471113375150935b69e47f6d84 100644
--- a/tensorflow/core/kernels/data/batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/batch_dataset_op.cc
@@ -95,6 +95,15 @@ class BatchDatasetOp : public UnaryDatasetOpKernel {
       return strings::StrCat("BatchDatasetOp(", batch_size_, ")::Dataset");
     }
 
+    int64 Cardinality() const override {
+      int64 n = input_->Cardinality();
+      if (n == kInfiniteCardinality || n == kUnknownCardinality) {
+        return n;
+      }
+      return n / batch_size_ +
+             (n % batch_size_ == 0 || drop_remainder_ ? 0 : 1);
+    }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
diff --git a/tensorflow/core/kernels/data/cache_dataset_ops.cc b/tensorflow/core/kernels/data/cache_dataset_ops.cc
index ce6fd09aee53a4bb94fde1cfd332e34f4d608b17..f00b38e732a7835896a275d14507e75eade05fa1 100644
--- a/tensorflow/core/kernels/data/cache_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/cache_dataset_ops.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/env.h"
@@ -84,6 +85,8 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
       return "CacheDatasetOp::FileDataset";
     }
 
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -562,9 +565,7 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
   class MemoryDataset : public DatasetBase {
    public:
     explicit MemoryDataset(OpKernelContext* ctx, const DatasetBase* input)
-        : DatasetBase(DatasetContext(ctx)),
-          input_(input),
-          cache_(new MemoryCache()) {
+        : DatasetBase(DatasetContext(ctx)), input_(input) {
       input->Ref();
     }
 
@@ -572,8 +573,8 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(new MemoryIterator(
-          {this, strings::StrCat(prefix, "::MemoryCache")}, cache_));
+      return std::unique_ptr<IteratorBase>(
+          new MemoryIterator({this, strings::StrCat(prefix, "::MemoryCache")}));
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -588,6 +589,8 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
       return "CacheDatasetOp::MemoryDataset";
     }
 
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -607,10 +610,12 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
     // The expected use is that a single `MemoryWriterIterator` populates the
     // cache with dataset elements. Once all elements are cached, the cache can
     // be used by one or more `MemoryReaderIterator`s.
-    class MemoryCache {
+    class MemoryCache : public ResourceBase {
      public:
       MemoryCache() = default;
 
+      string DebugString() override { return "CacheDataset::MemoryCache"; }
+
       // Marks the cache as completed.
       void Complete() {
         mutex_lock l(mu_);
@@ -677,15 +682,25 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
 
     class MemoryIterator : public DatasetIterator<MemoryDataset> {
      public:
-      explicit MemoryIterator(const Params& params,
-                              const std::shared_ptr<MemoryCache>& cache)
-          : DatasetIterator<MemoryDataset>(params), cache_(cache) {
-        mode_ = cache->MaybeClaim() ? Mode::write : Mode::read;
-        InitializeIterator();
-      }
+      explicit MemoryIterator(const Params& params)
+          : DatasetIterator<MemoryDataset>(params) {}
+
+      ~MemoryIterator() override { cache_->Unref(); }
 
       Status Initialize(IteratorContext* ctx) override {
         mutex_lock l(mu_);
+        // Use the resource manager in the iterator context to get / create
+        // a cache.
+        ResourceMgr* mgr = ctx->resource_mgr();
+        const string name =
+            strings::StrCat(prefix(), "::", dataset()->name(), "::MemoryCache");
+        TF_RETURN_IF_ERROR(mgr->LookupOrCreate<MemoryCache>(
+            "tf_data", name, &cache_, [](MemoryCache** cache) {
+              *cache = new MemoryCache();
+              return Status::OK();
+            }));
+        mode_ = cache_->MaybeClaim() ? Mode::write : Mode::read;
+        InitializeIterator();
         if (mode_ == Mode::read && !cache_->IsCompleted()) {
           return errors::Internal(
               "Cache should only be read after it has been completed.");
@@ -784,8 +799,7 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
      private:
       class MemoryWriterIterator : public DatasetIterator<MemoryDataset> {
        public:
-        explicit MemoryWriterIterator(const Params& params,
-                                      const std::shared_ptr<MemoryCache>& cache)
+        explicit MemoryWriterIterator(const Params& params, MemoryCache* cache)
             : DatasetIterator<MemoryDataset>(params), cache_(cache) {
           CHECK(cache_);
         }
@@ -818,6 +832,7 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
             cache_->Complete();
             return Status::OK();
           }
+          RecordBufferEnqueue(ctx, *out_tensors);
           cache_->emplace_back(*out_tensors);
           return Status::OK();
         }
@@ -843,17 +858,46 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
        private:
         mutex mu_;
         std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
-        std::shared_ptr<MemoryCache> cache_;
+        MemoryCache* const cache_ GUARDED_BY(mu_);  // not owned.
       };  // MemoryWriterIterator
 
       class MemoryReaderIterator : public DatasetIterator<MemoryDataset> {
        public:
-        explicit MemoryReaderIterator(const Params& params,
-                                      const std::shared_ptr<MemoryCache>& cache)
+        explicit MemoryReaderIterator(const Params& params, MemoryCache* cache)
             : DatasetIterator<MemoryDataset>(params), cache_(cache), index_(0) {
           CHECK(cache);
         }
 
+        Status Initialize(IteratorContext* ctx) override {
+          // The memory allocated for the cache is owned by the parent
+          // dataset but performance modeling uses the iterator abstraction and
+          // thus we record the memory allocated for the cache here. The caveat
+          // is that this is incorrect if there are concurrent instances of this
+          // iterator.
+          tf_shared_lock l(mu_);
+          for (size_t i = 0; i < cache_->size(); ++i) {
+            RecordBufferEnqueue(ctx, cache_->at(i));
+          }
+          return Status::OK();
+        }
+
+        Status GetNextInternal(IteratorContext* ctx,
+                               std::vector<Tensor>* out_tensors,
+                               bool* end_of_sequence) override {
+          mutex_lock l(mu_);
+          if (index_ < cache_->size()) {
+            const std::vector<Tensor>& cache_tensors = cache_->at(index_);
+            out_tensors->insert(out_tensors->begin(), cache_tensors.begin(),
+                                cache_tensors.end());
+            index_++;
+            *end_of_sequence = false;
+            return Status::OK();
+          } else {
+            *end_of_sequence = true;
+            return Status::OK();
+          }
+        }
+
        protected:
         std::shared_ptr<model::Node> CreateNode(
             IteratorContext* ctx, model::Node::Args args) const override {
@@ -878,26 +922,9 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
           return Status::OK();
         }
 
-        Status GetNextInternal(IteratorContext* ctx,
-                               std::vector<Tensor>* out_tensors,
-                               bool* end_of_sequence) override {
-          mutex_lock l(mu_);
-          if (index_ < cache_->size()) {
-            const std::vector<Tensor>& cache_tensors = cache_->at(index_);
-            out_tensors->insert(out_tensors->begin(), cache_tensors.begin(),
-                                cache_tensors.end());
-            index_++;
-            *end_of_sequence = false;
-            return Status::OK();
-          } else {
-            *end_of_sequence = true;
-            return Status::OK();
-          }
-        }
-
        private:
         mutex mu_;
-        const std::shared_ptr<MemoryCache> cache_;
+        MemoryCache* const cache_ GUARDED_BY(mu_);  // not owned.
         size_t index_ GUARDED_BY(mu_);
       };  // MemoryReaderIterator
 
@@ -914,14 +941,13 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
       }
 
       mutex mu_;
-      std::shared_ptr<MemoryCache> cache_;
+      MemoryCache* cache_ GUARDED_BY(mu_);  // not owned.
       enum Mode { read, write };
       Mode mode_ GUARDED_BY(mu_);
       std::unique_ptr<IteratorBase> iterator_ GUARDED_BY(mu_);
     };  // MemoryIterator
 
     const DatasetBase* const input_;
-    const std::shared_ptr<MemoryCache> cache_;
   };  // MemoryDataset
 };    // CacheDatasetOp
 
diff --git a/tensorflow/core/kernels/data/concatenate_dataset_op.cc b/tensorflow/core/kernels/data/concatenate_dataset_op.cc
index d5a0abc64b4e8769aff260d2580bd44e7af7e9ac..066b2c9aef4faaf23981b207e46c301e99360119 100644
--- a/tensorflow/core/kernels/data/concatenate_dataset_op.cc
+++ b/tensorflow/core/kernels/data/concatenate_dataset_op.cc
@@ -79,6 +79,18 @@ class ConcatenateDatasetOp : public BinaryDatasetOpKernel {
       return "ConcatenateDatasetOp::Dataset";
     }
 
+    int64 Cardinality() const override {
+      int64 n1 = input_->Cardinality();
+      int64 n2 = to_concatenate_->Cardinality();
+      if (n1 == kInfiniteCardinality || n2 == kInfiniteCardinality) {
+        return kInfiniteCardinality;
+      }
+      if (n1 == kUnknownCardinality || n2 == kUnknownCardinality) {
+        return kUnknownCardinality;
+      }
+      return n1 + n2;
+    }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
diff --git a/tensorflow/core/kernels/data/dataset_ops.cc b/tensorflow/core/kernels/data/dataset_ops.cc
index 36e9714736a5725f69e33a49bd5d1389994213ac..0abfdbb56b577764bbd48dbe0903148b2cf691d6 100644
--- a/tensorflow/core/kernels/data/dataset_ops.cc
+++ b/tensorflow/core/kernels/data/dataset_ops.cc
@@ -46,8 +46,25 @@ class DatasetToGraphOp : public OpKernel {
   }
 };
 
+class DatasetCardinalityOp : public OpKernel {
+ public:
+  explicit DatasetCardinalityOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    DatasetBase* dataset;
+    OP_REQUIRES_OK(ctx, GetDatasetFromVariantTensor(ctx->input(0), &dataset));
+    Tensor* result;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &result));
+    result->scalar<int64>()() = dataset->Cardinality();
+  }
+};
+
 REGISTER_KERNEL_BUILDER(Name("DatasetToGraph").Device(DEVICE_CPU),
                         DatasetToGraphOp);
 
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalDatasetCardinality").Device(DEVICE_CPU),
+    DatasetCardinalityOp);
+
 }  // namespace data
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/experimental/BUILD b/tensorflow/core/kernels/data/experimental/BUILD
index 958c42a22a6800b32085a918caf640579999703a..7433303f77671cbf67a6365fb1d552edc7b471e0 100644
--- a/tensorflow/core/kernels/data/experimental/BUILD
+++ b/tensorflow/core/kernels/data/experimental/BUILD
@@ -11,35 +11,31 @@ load(
     "tf_kernel_library",
 )
 
-cc_library(
-    name = "indexed_dataset_headers",
-    hdrs = ["indexed_dataset.h"],
+tf_kernel_library(
+    name = "assert_next_dataset_op",
+    srcs = ["assert_next_dataset_op.cc"],
     deps = [
+        "//tensorflow/core:experimental_dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//third_party/eigen3",
     ],
 )
 
 tf_kernel_library(
-    name = "indexed_dataset",
-    srcs = [
-        "identity_indexed_dataset.cc",
-        "indexed_dataset.cc",
-    ],
+    name = "csv_dataset_op",
+    srcs = ["csv_dataset_op.cc"],
     deps = [
-        ":indexed_dataset_headers",
         "//tensorflow/core:experimental_dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//third_party/eigen3",
+        "//tensorflow/core:lib_internal",
     ],
 )
 
 tf_kernel_library(
-    name = "prefetching_kernels",
-    srcs = ["prefetching_kernels.cc"],
+    name = "dense_to_sparse_batch_dataset_op",
+    srcs = ["dense_to_sparse_batch_dataset_op.cc"],
     deps = [
-        "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:experimental_dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -59,13 +55,29 @@ tf_kernel_library(
 )
 
 tf_kernel_library(
-    name = "csv_dataset_op",
-    srcs = ["csv_dataset_op.cc"],
+    name = "group_by_reducer_dataset_op",
+    srcs = ["group_by_reducer_dataset_op.cc"],
     deps = [
-        "//tensorflow/core:experimental_dataset_ops_op_lib",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels/data:captured_function",
+    ],
+)
+
+tf_kernel_library(
+    name = "group_by_window_dataset_op",
+    srcs = ["group_by_window_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels/data:captured_function",
+        "//tensorflow/core/kernels/data:window_dataset",
     ],
 )
 
@@ -79,6 +91,18 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "indexed_dataset_op",
+    srcs = ["indexed_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:experimental_dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/kernels/data:dataset_utils",
+        "//third_party/eigen3",
+    ],
+)
+
 tf_kernel_library(
     name = "lmdb_dataset_op",
     srcs = ["lmdb_dataset_op.cc"],
@@ -92,12 +116,38 @@ tf_kernel_library(
 )
 
 tf_kernel_library(
-    name = "threadpool_dataset_op",
-    srcs = ["threadpool_dataset_op.cc"],
+    name = "map_and_batch_dataset_op",
+    srcs = ["map_and_batch_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels:inplace_ops",
+        "//tensorflow/core/kernels/data:captured_function",
+        "//tensorflow/core/kernels/data:dataset_utils",
+    ],
+)
+
+tf_kernel_library(
+    name = "matching_files_dataset_op",
+    srcs = ["matching_files_dataset_op.cc"],
     deps = [
         "//tensorflow/core:experimental_dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels/data:dataset",
+    ],
+)
+
+tf_kernel_library(
+    name = "non_serializable_dataset_op",
+    srcs = ["non_serializable_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:experimental_dataset_ops_op_lib",
+        "//tensorflow/core:framework",
         "//third_party/eigen3",
     ],
 )
@@ -118,23 +168,72 @@ tf_kernel_library(
 )
 
 tf_kernel_library(
-    name = "unique_dataset_op",
-    srcs = ["unique_dataset_op.cc"],
+    name = "parallel_interleave_dataset_op",
+    srcs = ["parallel_interleave_dataset_op.cc"],
     deps = [
-        "//tensorflow/core:experimental_dataset_ops_op_lib",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//third_party/eigen3",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels/data:captured_function",
+        "//tensorflow/core/kernels/data:dataset_utils",
     ],
 )
 
 tf_kernel_library(
-    name = "assert_next_dataset_op",
-    srcs = ["assert_next_dataset_op.cc"],
+    name = "parse_example_dataset_op",
+    srcs = ["parse_example_dataset_op.cc"],
     deps = [
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core/kernels/data:parallel_map_iterator",
+    ],
+)
+
+tf_kernel_library(
+    name = "prefetching_kernels",
+    srcs = ["prefetching_kernels.cc"],
+    deps = [
+        "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:experimental_dataset_ops_op_lib",
         "//tensorflow/core:framework",
-        "//third_party/eigen3",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "random_dataset_op",
+    srcs = ["random_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "scan_dataset_op",
+    srcs = ["scan_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels/data:captured_function",
+    ],
+)
+
+tf_kernel_library(
+    name = "set_stats_aggregator_dataset_op",
+    srcs = ["set_stats_aggregator_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:core_cpu_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib_internal",
     ],
 )
 
@@ -148,24 +247,94 @@ tf_kernel_library(
 )
 
 tf_kernel_library(
-    name = "non_serializable_dataset_op",
-    srcs = ["non_serializable_dataset_op.cc"],
+    name = "sliding_window_dataset_op",
+    srcs = ["sliding_window_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "sql_dataset_op",
+    srcs = [
+        "sql_dataset_op.cc",
+    ],
     deps = [
         "//tensorflow/core:experimental_dataset_ops_op_lib",
         "//tensorflow/core:framework",
-        "//third_party/eigen3",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels/data/experimental/sql",
     ],
 )
 
 tf_kernel_library(
-    name = "matching_files_dataset_op",
-    srcs = ["matching_files_dataset_op.cc"],
+    name = "stats_aggregator_ops",
+    srcs = ["stats_aggregator_ops.cc"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+tf_kernel_library(
+    name = "stats_dataset_ops",
+    srcs = ["stats_dataset_ops.cc"],
+    deps = [
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+tf_kernel_library(
+    name = "to_tf_record_op",
+    srcs = ["to_tf_record_op.cc"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels:ops_util",
+        "//tensorflow/core/kernels/data:dataset_utils",
+    ],
+)
+
+tf_kernel_library(
+    name = "threadpool_dataset_op",
+    srcs = ["threadpool_dataset_op.cc"],
     deps = [
         "//tensorflow/core:experimental_dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//third_party/eigen3",
+    ],
+)
+
+tf_kernel_library(
+    name = "unbatch_dataset_op",
+    srcs = ["unbatch_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core/kernels/data:dataset",
+    ],
+)
+
+tf_kernel_library(
+    name = "unique_dataset_op",
+    srcs = ["unique_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:experimental_dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//third_party/eigen3",
     ],
 )
 
@@ -174,16 +343,31 @@ tf_kernel_library(
     deps = [
         ":assert_next_dataset_op",
         ":csv_dataset_op",
+        ":dense_to_sparse_batch_dataset_op",
         ":directed_interleave_dataset_op",
+        ":group_by_reducer_dataset_op",
+        ":group_by_window_dataset_op",
         ":ignore_errors_dataset_op",
-        ":indexed_dataset",
+        ":indexed_dataset_op",
         ":lmdb_dataset_op",
+        ":map_and_batch_dataset_op",
         ":matching_files_dataset_op",
         ":non_serializable_dataset_op",
         ":numa_map_and_batch_dataset_op",
+        ":parallel_interleave_dataset_op",
+        ":parse_example_dataset_op",
         ":prefetching_kernels",
+        ":random_dataset_op",
+        ":scan_dataset_op",
+        ":set_stats_aggregator_dataset_op",
         ":sleep_dataset_op",
+        ":sliding_window_dataset_op",
+        ":sql_dataset_op",
+        ":stats_aggregator_ops",
+        ":stats_dataset_ops",
         ":threadpool_dataset_op",
+        ":to_tf_record_op",
+        ":unbatch_dataset_op",
         ":unique_dataset_op",
     ],
 )
diff --git a/tensorflow/core/kernels/data/experimental/assert_next_dataset_op.cc b/tensorflow/core/kernels/data/experimental/assert_next_dataset_op.cc
index 3b5ee9b783c7c6b123ae220221f82a10c59dbd4c..3e87f484b940b336ed68099df7427250a4304207 100644
--- a/tensorflow/core/kernels/data/experimental/assert_next_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/assert_next_dataset_op.cc
@@ -76,6 +76,8 @@ class AssertNextDatasetOp : public UnaryDatasetOpKernel {
       return "AssertNextDatasetOp::Dataset";
     }
 
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
diff --git a/tensorflow/core/kernels/data/dense_to_sparse_batch_dataset_op.cc b/tensorflow/core/kernels/data/experimental/dense_to_sparse_batch_dataset_op.cc
similarity index 97%
rename from tensorflow/core/kernels/data/dense_to_sparse_batch_dataset_op.cc
rename to tensorflow/core/kernels/data/experimental/dense_to_sparse_batch_dataset_op.cc
index d684d23b24212e629c8082091b1b310e82eedb70..97e64dd7444e93660afa6defa31314c909a31c7b 100644
--- a/tensorflow/core/kernels/data/dense_to_sparse_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/dense_to_sparse_batch_dataset_op.cc
@@ -114,6 +114,14 @@ class DenseToSparseBatchDatasetOp : public UnaryDatasetOpKernel {
                              ")::Dataset");
     }
 
+    int64 Cardinality() const override {
+      int64 n = input_->Cardinality();
+      if (n == kInfiniteCardinality || n == kUnknownCardinality) {
+        return n;
+      }
+      return n / batch_size_ + (n % batch_size_ == 0 ? 0 : 1);
+    }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -304,8 +312,9 @@ class DenseToSparseBatchDatasetOp : public UnaryDatasetOpKernel {
   };
 };
 
-REGISTER_KERNEL_BUILDER(Name("DenseToSparseBatchDataset").Device(DEVICE_CPU),
-                        DenseToSparseBatchDatasetOp);
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalDenseToSparseBatchDataset").Device(DEVICE_CPU),
+    DenseToSparseBatchDatasetOp);
 
 }  // namespace
 }  // namespace data
diff --git a/tensorflow/core/kernels/data/group_by_reducer_dataset_op.cc b/tensorflow/core/kernels/data/experimental/group_by_reducer_dataset_op.cc
similarity index 99%
rename from tensorflow/core/kernels/data/group_by_reducer_dataset_op.cc
rename to tensorflow/core/kernels/data/experimental/group_by_reducer_dataset_op.cc
index a01085491ea18122e16f712ec21de62c69c04dd5..1c298cfdd6a3a39aabd81cb5226e03b1c3e3de63 100644
--- a/tensorflow/core/kernels/data/group_by_reducer_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/group_by_reducer_dataset_op.cc
@@ -438,8 +438,9 @@ class GroupByReducerDatasetOp : public UnaryDatasetOpKernel {
   NameAttrList finalize_func_;
 };
 
-REGISTER_KERNEL_BUILDER(Name("GroupByReducerDataset").Device(DEVICE_CPU),
-                        GroupByReducerDatasetOp);
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalGroupByReducerDataset").Device(DEVICE_CPU),
+    GroupByReducerDatasetOp);
 
 }  // namespace
 }  // namespace data
diff --git a/tensorflow/core/kernels/data/group_by_window_dataset_op.cc b/tensorflow/core/kernels/data/experimental/group_by_window_dataset_op.cc
similarity index 99%
rename from tensorflow/core/kernels/data/group_by_window_dataset_op.cc
rename to tensorflow/core/kernels/data/experimental/group_by_window_dataset_op.cc
index d9f504a170e977d5c12d7c22d7daa14eaaed2cd5..98603d5a732c8143db61535e6704d6a7b214413c 100644
--- a/tensorflow/core/kernels/data/group_by_window_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/group_by_window_dataset_op.cc
@@ -523,8 +523,9 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
   NameAttrList window_size_func_;
 };
 
-REGISTER_KERNEL_BUILDER(Name("GroupByWindowDataset").Device(DEVICE_CPU),
-                        GroupByWindowDatasetOp);
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalGroupByWindowDataset").Device(DEVICE_CPU),
+    GroupByWindowDatasetOp);
 
 }  // namespace
 }  // namespace data
diff --git a/tensorflow/core/kernels/data/experimental/identity_indexed_dataset.cc b/tensorflow/core/kernels/data/experimental/identity_indexed_dataset.cc
deleted file mode 100644
index d10a3dea110c9cd29919b89e0814bc815ac13500..0000000000000000000000000000000000000000
--- a/tensorflow/core/kernels/data/experimental/identity_indexed_dataset.cc
+++ /dev/null
@@ -1,163 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/core/kernels/data/experimental/indexed_dataset.h"
-#include "tensorflow/core/lib/core/errors.h"
-
-namespace tensorflow {
-namespace data {
-namespace {
-
-class IdentityIndexedDatasetOp : public IndexedDatasetOpKernel {
- public:
-  using IndexedDatasetOpKernel::IndexedDatasetOpKernel;
-
-  void MakeIndexedDataset(OpKernelContext* ctx,
-                          IndexedDataset** output) override {
-    uint64 size = -1;
-    OP_REQUIRES_OK(ctx, ParseScalarArgument<uint64>(ctx, "size", &size));
-    OP_REQUIRES(ctx, size > 0, errors::InvalidArgument("`size` must be > 0"));
-    *output = new Dataset(ctx, size);
-  }
-
-  class Dataset : public IndexedDataset {
-   public:
-    Dataset(OpKernelContext* ctx, uint64 size)
-        : IndexedDataset(DatasetContext(ctx)), size_(size) {}
-
-    Status MaterializeDataset(
-        std::shared_ptr<MaterializedIndexedDataset>* materialized) override {
-      materialized->reset(new Materialized(this));
-      return Status::OK();
-    }
-
-    const DataTypeVector& output_dtypes() const override {
-      static DataTypeVector* dtypes = new DataTypeVector({DT_UINT64});
-      return *dtypes;
-    }
-
-    const std::vector<PartialTensorShape>& output_shapes() const override {
-      static std::vector<PartialTensorShape>* shapes =
-          new std::vector<PartialTensorShape>({{}});
-      return *shapes;
-    }
-
-    std::unique_ptr<IteratorBase> MakeIteratorInternal(
-        const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(new Iterator(
-          {this, strings::StrCat(prefix, "::IdentityIndexedDataset")}));
-    }
-
-    string DebugString() const override {
-      return "IdentityIndexedDataset::Dataset";
-    }
-
-    Status AsGraphDefInternal(SerializationContext* ctx,
-                              DatasetGraphDefBuilder* b,
-                              Node** node) const override {
-      return errors::Unimplemented(
-          "identity_indexed_dataset.AsGraphDefInternal");
-    }
-
-   private:
-    class Iterator : public DatasetIterator<Dataset> {
-     public:
-      explicit Iterator(const Params& params)
-          : DatasetIterator<Dataset>(params) {}
-      Status GetNextInternal(IteratorContext* ctx,
-                             std::vector<Tensor>* out_tensors,
-                             bool* end_of_sequence) override {
-        mutex_lock l(mu_);
-        if (cur_ < dataset()->size_) {
-          out_tensors->emplace_back(ctx->allocator({}), DT_UINT64,
-                                    TensorShape({}));
-          out_tensors->back().scalar<uint64>()() = cur_++;
-          *end_of_sequence = false;
-          return Status::OK();
-        }
-        *end_of_sequence = true;
-        return Status::OK();
-      }
-
-     protected:
-      std::shared_ptr<model::Node> CreateNode(
-          IteratorContext* ctx, model::Node::Args args) const override {
-        return model::MakeKnownRatioNode(std::move(args),
-                                         /*ratio=*/1);
-      }
-
-     private:
-      mutex mu_;
-      uint64 cur_ GUARDED_BY(mu_);
-    };
-
-    class Materialized : public MaterializedIndexedDataset {
-     public:
-      explicit Materialized(Dataset* dataset) : dataset_(dataset) {
-        dataset->Ref();
-      }
-
-      ~Materialized() override {
-        // TODO(saeta): Pull this into MaterializedIndexedDataset
-        dataset_->Unref();
-      }
-
-      const DataTypeVector& output_dtypes() const override {
-        return dataset_->output_dtypes();
-      }
-
-      const std::vector<PartialTensorShape>& output_shapes() const override {
-        return dataset_->output_shapes();
-      }
-
-      Status Get(IteratorContext&& ctx, uint64 index,
-                 std::vector<Tensor>* out_tensors) const override {
-        LOG(INFO) << "Materialized(" << dataset_->size_ << ")::Get(" << index
-                  << ")";
-        if (index >= dataset_->size_) {
-          // Note: use InvalidArgument instead of OutOfRange error because many
-          // things consider OutOfRange to be a "clean termination" error.
-          return errors::InvalidArgument(
-              "Index ", index,
-              " is out of range for this dataset. (Size is: ", dataset_->size_,
-              ".)");
-        }
-        out_tensors->emplace_back(ctx.allocator({}), DT_UINT64,
-                                  TensorShape({}));
-        out_tensors->back().scalar<uint64>()() = index;
-        return Status::OK();
-      }
-
-      Status Size(uint64* size) const override {
-        *size = dataset_->size_;
-        return Status::OK();
-      }
-
-     private:
-      const Dataset* const dataset_;  // Not owned.
-    };
-
-    const uint64 size_;
-    std::shared_ptr<Materialized> materialized_;
-  };
-};
-
-REGISTER_KERNEL_BUILDER(
-    Name("ExperimentalIdentityIndexedDataset").Device(DEVICE_CPU),
-    IdentityIndexedDatasetOp);
-
-}  // namespace
-}  // namespace data
-}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/experimental/ignore_errors_dataset_op.cc b/tensorflow/core/kernels/data/experimental/ignore_errors_dataset_op.cc
index 57cb44335b17f368d41178ed7cef0b3daefef3c3..d445d9c8094eec5c9a2bff9c45e2dc28e264d096 100644
--- a/tensorflow/core/kernels/data/experimental/ignore_errors_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/ignore_errors_dataset_op.cc
@@ -60,6 +60,8 @@ class IgnoreErrorsDatasetOp : public UnaryDatasetOpKernel {
       return "IgnoreErrorsDatasetOp::Dataset";
     }
 
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
diff --git a/tensorflow/core/kernels/data/experimental/indexed_dataset.h b/tensorflow/core/kernels/data/experimental/indexed_dataset.h
deleted file mode 100644
index 27a8360cbcffc55c2f4f8ce437e5080e070845df..0000000000000000000000000000000000000000
--- a/tensorflow/core/kernels/data/experimental/indexed_dataset.h
+++ /dev/null
@@ -1,119 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_INDEXED_DATASET_H_
-#define TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_INDEXED_DATASET_H_
-
-#include "tensorflow/core/framework/dataset.h"
-#include "tensorflow/core/framework/op_kernel.h"
-
-namespace tensorflow {
-namespace data {
-
-// TODO(saeta): Urgh, this is ugly.
-class MaterializedIndexedDataset {
- public:
-  virtual ~MaterializedIndexedDataset() = default;
-
-  // Retrieve the element at a given index. The output tensors are stored in
-  // out_tensors.
-  //
-  // If `index` is greater than `Size()`, tensorflow::errors::OutOfRangeError is
-  // returned.
-  //
-  // Get is thread-safe.
-  virtual Status Get(IteratorContext&& ctx, uint64 index,
-                     std::vector<Tensor>* out_tensors) const = 0;
-
-  // Size determines the number of elements in this IndexedDataset.
-  //
-  // Size is thread-safe.
-  virtual Status Size(uint64* size) const = 0;
-
-  // Returns a vector of DataType values, representing the respective
-  // element types of each tuple component in the outputs of this dataset.
-  virtual const DataTypeVector& output_dtypes() const = 0;
-
-  // Returns a vector of tensor shapes, representing the respective
-  // (and possibly partially defined) shapes of each tuple component
-  // in the outputs of this dataset.
-  virtual const std::vector<PartialTensorShape>& output_shapes() const = 0;
-};
-
-// IndexedDataset represents a dataset that supports random access in addition
-// to iterator-based sequential access.
-//
-// Note: IndexedDatasets are HIGHLY experimental at this time. Expect
-// significant (backwards incompatible) changes!
-class IndexedDataset : public DatasetBase {
- public:
-  IndexedDataset(DatasetContext&& ctx) : DatasetBase(std::move(ctx)) {}
-
-  // Materialize (if necessary) the dataset, and return a pointer.
-  // TODO(saeta): Add in `IteratorContext* ctx` when materializing.
-  virtual Status MaterializeDataset(
-      std::shared_ptr<MaterializedIndexedDataset>* materialized) = 0;
-};
-
-// IndexedDatasetOpKernel abstracts away interfacing IndexedDatasets with the
-// rest of the TensorFlow runtime.
-//
-// Most IndexedDataset's will be private members of classes inheriting from this
-// class.
-class IndexedDatasetOpKernel : public OpKernel {
- public:
-  IndexedDatasetOpKernel(OpKernelConstruction* ctx) : OpKernel(ctx) {}
-  void Compute(OpKernelContext* ctx) final;
-
- protected:
-  // Subclasses should implement this method. It will be called during Compute
-  // execution.
-  virtual void MakeIndexedDataset(OpKernelContext* ctx,
-                                  IndexedDataset** output) = 0;
-
-  template <typename T>
-  Status ParseScalarArgument(OpKernelContext* ctx,
-                             const StringPiece& argument_name, T* output) {
-    const Tensor* argument_t;
-    TF_RETURN_IF_ERROR(ctx->input(argument_name, &argument_t));
-    if (!TensorShapeUtils::IsScalar(argument_t->shape())) {
-      return errors::InvalidArgument(argument_name, " must be a scalar");
-    }
-    *output = argument_t->scalar<T>()();
-    return Status::OK();
-  }
-};
-
-// Validates and extracts an `IndexedDataset` object from `tensor`.
-//
-// `tensor` must have been written by a call to
-// `StoreIndexedDatasetInVariantTensor`
-//
-// The retrieved pointer isa  borrowed reference to the dataset, which is owned
-// by the tensor. The consumer must either acquire its own reference to the
-// dataset by calling `(*out_dataset)->Ref()`, or ensure that `tensor` is not
-// destroyed or mutated while the retrieved pointer is in use.
-Status GetIndexedDatasetFromVariantTensor(const Tensor& tensor,
-                                          IndexedDataset** out_dataset);
-
-// Stores an `IndexedDataset` object in `tensor.`
-//
-// The ownership of `dataset` is transferred to `tensor`.
-Status StoreIndexedDatasetInVariantTensor(IndexedDataset* dataset,
-                                          Tensor* tensor);
-
-}  // namespace data
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_INDEXED_DATASET_H_
diff --git a/tensorflow/core/kernels/data/experimental/indexed_dataset.cc b/tensorflow/core/kernels/data/experimental/indexed_dataset_op.cc
similarity index 62%
rename from tensorflow/core/kernels/data/experimental/indexed_dataset.cc
rename to tensorflow/core/kernels/data/experimental/indexed_dataset_op.cc
index 75ea462f4020bbf02ab05597a23869f90a90cc30..a07eaebdf9d645fba51945d7bd3e79b72b5e5dc2 100644
--- a/tensorflow/core/kernels/data/experimental/indexed_dataset.cc
+++ b/tensorflow/core/kernels/data/experimental/indexed_dataset_op.cc
@@ -12,10 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/data/experimental/indexed_dataset.h"
 
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 
@@ -23,42 +25,79 @@ namespace tensorflow {
 namespace data {
 namespace {
 
-Status VerifyTypesMatch(const DataTypeVector& expected,
-                        const DataTypeVector& received) {
-  if (expected.size() != received.size()) {
-    return errors::InvalidArgument(
-        "Number of components does not match: expected ", expected.size(),
-        " types but got ", received.size(), ".");
-  }
-  for (size_t i = 0; i < expected.size(); ++i) {
-    if (expected[i] != received[i]) {
-      return errors::InvalidArgument("Data type mismatch at component ", i,
-                                     ": expected ", DataTypeString(expected[i]),
-                                     " but got ", DataTypeString(received[i]),
-                                     ".");
-    }
-  }
-  return Status::OK();
-}
+// TODO(saeta): Urgh, this is ugly.
+class MaterializedIndexedDataset {
+ public:
+  virtual ~MaterializedIndexedDataset() = default;
+
+  // Retrieve the element at a given index. The output tensors are stored in
+  // out_tensors.
+  //
+  // If `index` is greater than `Size()`, tensorflow::errors::OutOfRangeError is
+  // returned.
+  //
+  // Get is thread-safe.
+  virtual Status Get(IteratorContext&& ctx, uint64 index,
+                     std::vector<Tensor>* out_tensors) const = 0;
+
+  // Size determines the number of elements in this IndexedDataset.
+  //
+  // Size is thread-safe.
+  virtual Status Size(uint64* size) const = 0;
+
+  // Returns a vector of DataType values, representing the respective
+  // element types of each tuple component in the outputs of this dataset.
+  virtual const DataTypeVector& output_dtypes() const = 0;
+
+  // Returns a vector of tensor shapes, representing the respective
+  // (and possibly partially defined) shapes of each tuple component
+  // in the outputs of this dataset.
+  virtual const std::vector<PartialTensorShape>& output_shapes() const = 0;
+};
 
-Status VerifyShapesCompatible(const std::vector<PartialTensorShape>& expected,
-                              const std::vector<PartialTensorShape>& received) {
-  if (expected.size() != received.size()) {
-    return errors::InvalidArgument(
-        "Number of components does not match: expected ", expected.size(),
-        " shapes but got ", received.size(), ".");
-  }
-  for (size_t i = 0; i < expected.size(); ++i) {
-    if (!expected[i].IsCompatibleWith(received[i])) {
-      return errors::InvalidArgument("Incompatible shapes at component ", i,
-                                     ": expected ", expected[i].DebugString(),
-                                     " but got ", received[i].DebugString(),
-                                     ".");
+// IndexedDataset represents a dataset that supports random access in addition
+// to iterator-based sequential access.
+//
+// Note: IndexedDatasets are HIGHLY experimental at this time. Expect
+// significant (backwards incompatible) changes!
+class IndexedDataset : public DatasetBase {
+ public:
+  explicit IndexedDataset(DatasetContext&& ctx) : DatasetBase(std::move(ctx)) {}
+
+  // Materialize (if necessary) the dataset, and return a pointer.
+  // TODO(saeta): Add in `IteratorContext* ctx` when materializing.
+  virtual Status MaterializeDataset(
+      std::shared_ptr<MaterializedIndexedDataset>* materialized) = 0;
+};
+
+// IndexedDatasetOpKernel abstracts away interfacing IndexedDatasets with the
+// rest of the TensorFlow runtime.
+//
+// Most IndexedDataset's will be private members of classes inheriting from this
+// class.
+class IndexedDatasetOpKernel : public OpKernel {
+ public:
+  explicit IndexedDatasetOpKernel(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+  void Compute(OpKernelContext* ctx) final;
+
+ protected:
+  // Subclasses should implement this method. It will be called during Compute
+  // execution.
+  virtual void MakeIndexedDataset(OpKernelContext* ctx,
+                                  IndexedDataset** output) = 0;
+
+  template <typename T>
+  Status ParseScalarArgument(OpKernelContext* ctx,
+                             const StringPiece argument_name, T* output) {
+    const Tensor* argument_t;
+    TF_RETURN_IF_ERROR(ctx->input(argument_name, &argument_t));
+    if (!TensorShapeUtils::IsScalar(argument_t->shape())) {
+      return errors::InvalidArgument(argument_name, " must be a scalar");
     }
+    *output = argument_t->scalar<T>()();
+    return Status::OK();
   }
-
-  return Status::OK();
-}
+};
 
 class MaterializedDatasetResource : public ResourceBase {
  public:
@@ -164,8 +203,6 @@ class IndexedDatasetVariantWrapper {
   IndexedDataset* const dataset_;  // Owns one reference.
 };
 
-}  // namespace
-
 Status GetIndexedDatasetFromVariantTensor(const Tensor& tensor,
                                           IndexedDataset** out_dataset) {
   if (!(tensor.dtype() == DT_VARIANT ||
@@ -211,8 +248,6 @@ void IndexedDatasetOpKernel::Compute(OpKernelContext* ctx) {
   }
 }
 
-namespace {
-
 class MaterializedHandleOp : public OpKernel {
  public:
   explicit MaterializedHandleOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
@@ -370,6 +405,144 @@ REGISTER_KERNEL_BUILDER(
     Name("ExperimentalIndexedDatasetGet").Device(DEVICE_CPU),
     IndexedDatasetGet);
 
+class IdentityIndexedDatasetOp : public IndexedDatasetOpKernel {
+ public:
+  using IndexedDatasetOpKernel::IndexedDatasetOpKernel;
+
+  void MakeIndexedDataset(OpKernelContext* ctx,
+                          IndexedDataset** output) override {
+    uint64 size = -1;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument<uint64>(ctx, "size", &size));
+    OP_REQUIRES(ctx, size > 0, errors::InvalidArgument("`size` must be > 0"));
+    *output = new Dataset(ctx, size);
+  }
+
+  class Dataset : public IndexedDataset {
+   public:
+    Dataset(OpKernelContext* ctx, uint64 size)
+        : IndexedDataset(DatasetContext(ctx)), size_(size) {}
+
+    Status MaterializeDataset(
+        std::shared_ptr<MaterializedIndexedDataset>* materialized) override {
+      materialized->reset(new Materialized(this));
+      return Status::OK();
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      static DataTypeVector* dtypes = new DataTypeVector({DT_UINT64});
+      return *dtypes;
+    }
+
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      static std::vector<PartialTensorShape>* shapes =
+          new std::vector<PartialTensorShape>({{}});
+      return *shapes;
+    }
+
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(new Iterator(
+          {this, strings::StrCat(prefix, "::IdentityIndexedDataset")}));
+    }
+
+    string DebugString() const override {
+      return "IdentityIndexedDataset::Dataset";
+    }
+
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
+                              Node** node) const override {
+      return errors::Unimplemented(
+          "identity_indexed_dataset.AsGraphDefInternal");
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params) {}
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+        if (cur_ < dataset()->size_) {
+          out_tensors->emplace_back(ctx->allocator({}), DT_UINT64,
+                                    TensorShape({}));
+          out_tensors->back().scalar<uint64>()() = cur_++;
+          *end_of_sequence = false;
+          return Status::OK();
+        }
+        *end_of_sequence = true;
+        return Status::OK();
+      }
+
+     protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeKnownRatioNode(std::move(args),
+                                         /*ratio=*/1);
+      }
+
+     private:
+      mutex mu_;
+      uint64 cur_ GUARDED_BY(mu_);
+    };
+
+    class Materialized : public MaterializedIndexedDataset {
+     public:
+      explicit Materialized(Dataset* dataset) : dataset_(dataset) {
+        dataset->Ref();
+      }
+
+      ~Materialized() override {
+        // TODO(saeta): Pull this into MaterializedIndexedDataset
+        dataset_->Unref();
+      }
+
+      const DataTypeVector& output_dtypes() const override {
+        return dataset_->output_dtypes();
+      }
+
+      const std::vector<PartialTensorShape>& output_shapes() const override {
+        return dataset_->output_shapes();
+      }
+
+      Status Get(IteratorContext&& ctx, uint64 index,
+                 std::vector<Tensor>* out_tensors) const override {
+        LOG(INFO) << "Materialized(" << dataset_->size_ << ")::Get(" << index
+                  << ")";
+        if (index >= dataset_->size_) {
+          // Note: use InvalidArgument instead of OutOfRange error because many
+          // things consider OutOfRange to be a "clean termination" error.
+          return errors::InvalidArgument(
+              "Index ", index,
+              " is out of range for this dataset. (Size is: ", dataset_->size_,
+              ".)");
+        }
+        out_tensors->emplace_back(ctx.allocator({}), DT_UINT64,
+                                  TensorShape({}));
+        out_tensors->back().scalar<uint64>()() = index;
+        return Status::OK();
+      }
+
+      Status Size(uint64* size) const override {
+        *size = dataset_->size_;
+        return Status::OK();
+      }
+
+     private:
+      const Dataset* const dataset_;  // Not owned.
+    };
+
+    const uint64 size_;
+    std::shared_ptr<Materialized> materialized_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalIdentityIndexedDataset").Device(DEVICE_CPU),
+    IdentityIndexedDatasetOp);
+
 }  // namespace
 }  // namespace data
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/experimental/map_and_batch_dataset_op.cc
similarity index 93%
rename from tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
rename to tensorflow/core/kernels/data/experimental/map_and_batch_dataset_op.cc
index ac001c22b91ec39730cb27dbbc36894a0d8ff8f7..3ff313559365051635d02d90281668fbca3e02a5 100644
--- a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/map_and_batch_dataset_op.cc
@@ -51,11 +51,12 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
                          std::shared_ptr<std::vector<Tensor>>, StatusCallback)>;
 
   explicit MapAndBatchDatasetOp(OpKernelConstruction* ctx)
-      : UnaryDatasetOpKernel(ctx),
-        op_version_(ctx->def().op() == "MapAndBatchDataset" ? 1 : 2) {
+      : UnaryDatasetOpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &func_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+    OP_REQUIRES_OK(
+        ctx, ctx->GetAttr("preserve_cardinality", &preserve_cardinality_));
   }
 
  protected:
@@ -68,29 +69,12 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         errors::InvalidArgument("batch_size must be greater than zero."));
 
     int64 num_parallel_calls;
-    switch (op_version_) {
-      case 1:
-        int64 num_parallel_batches;
-        OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "num_parallel_batches",
-                                                &num_parallel_batches));
-        num_parallel_calls = num_parallel_batches * batch_size;
-        OP_REQUIRES(ctx, num_parallel_batches > 0,
-                    errors::InvalidArgument(
-                        "num_parallel_batches must be greater than zero."));
-        break;
-      case 2:
-        OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "num_parallel_calls",
-                                                &num_parallel_calls));
-        OP_REQUIRES(ctx,
-                    num_parallel_calls > 0 || num_parallel_calls == kAutoTune,
-                    errors::InvalidArgument(
-                        "num_parallel_calls must be greater than zero."));
-        break;
-      default:
-        OP_REQUIRES(ctx, false,
-                    errors::Unimplemented("Unsupported operation version %d.",
-                                          op_version_));
-    }
+    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "num_parallel_calls",
+                                            &num_parallel_calls));
+    OP_REQUIRES(
+        ctx, num_parallel_calls > 0 || num_parallel_calls == model::kAutoTune,
+        errors::InvalidArgument(
+            "num_parallel_calls must be greater than zero."));
 
     bool drop_remainder;
     OP_REQUIRES_OK(ctx,
@@ -146,7 +130,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
     *output = new Dataset(ctx, input, func_, batch_size, num_parallel_calls,
                           drop_remainder, output_types_, output_shapes_,
                           std::move(captured_func), &ctx->eigen_cpu_device(),
-                          std::move(map_func));
+                          std::move(map_func), preserve_cardinality_);
   }
 
  private:
@@ -159,7 +143,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
             const std::vector<PartialTensorShape>& output_shapes,
             std::unique_ptr<CapturedFunction> captured_func,
             const Eigen::ThreadPoolDevice* device,
-            MapAndBatchIteratorFunction map_func)
+            MapAndBatchIteratorFunction map_func, bool preserve_cardinality)
         : DatasetBase(DatasetContext(ctx)),
           input_(input),
           func_(func),
@@ -170,7 +154,8 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
           output_shapes_(output_shapes),
           captured_func_(std::move(captured_func)),
           device_(device),
-          map_func_(std::move(map_func)) {
+          map_func_(std::move(map_func)),
+          preserve_cardinality_(preserve_cardinality) {
       input_->Ref();
     }
 
@@ -195,6 +180,15 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
       return "MapAndBatchDatasetOp::Dataset";
     }
 
+    int64 Cardinality() const override {
+      int64 n = input_->Cardinality();
+      if (n == kInfiniteCardinality || n == kUnknownCardinality) {
+        return n;
+      }
+      return n / batch_size_ +
+             (n % batch_size_ == 0 || drop_remainder_ ? 0 : 1);
+    }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -224,6 +218,8 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
       b->BuildAttrValue(func_, &f);
       AttrValue other_arguments_types_attr;
       b->BuildAttrValue(other_arguments_types, &other_arguments_types_attr);
+      AttrValue preserve_cardinality_attr;
+      b->BuildAttrValue(preserve_cardinality_, &preserve_cardinality_attr);
 
       TF_RETURN_IF_ERROR(b->AddDataset(
           this,
@@ -233,7 +229,9 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
            std::make_pair(4, drop_remainder_node)},  // Single tensor inputs.
           {std::make_pair(1, other_arguments)},      // Tensor list inputs.
           {std::make_pair("f", f),
-           std::make_pair("Targuments", other_arguments_types_attr)},  // Attrs
+           std::make_pair("Targuments", other_arguments_types_attr),
+           std::make_pair("preserve_cardinality",
+                          preserve_cardinality_attr)},  // Attrs
           output));
       return Status::OK();
     }
@@ -271,9 +269,8 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
 
       Status Initialize(IteratorContext* ctx) override {
         mutex_lock l(*mu_);
-        if (num_parallel_calls_->value == kAutoTune) {
+        if (num_parallel_calls_->value == model::kAutoTune) {
           num_parallel_calls_->value = ctx->runner_threadpool_size();
-          num_parallel_calls_->tunable = true;
         }
         TF_RETURN_IF_ERROR(
             dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
@@ -318,7 +315,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         while (num_calls_ > 0) {
           cond_var_->wait(l);
         }
-        CHECK_EQ(num_calls_, 0);
+        DCHECK_EQ(num_calls_, 0);
         TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
         TF_RETURN_IF_ERROR(
             writer->WriteScalar(full_name("call_counter"), call_counter_));
@@ -423,6 +420,15 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         std::shared_ptr<std::vector<Tensor>> return_values =
             std::make_shared<std::vector<Tensor>>();
         auto done = [this, ctx, result, return_values, offset](Status status) {
+          if (dataset()->preserve_cardinality_ &&
+              errors::IsOutOfRange(status)) {
+            // To guarantee that the transformation preserves the cardinality of
+            // the dataset, we convert `OutOfRange` to `InvalidArgument` as the
+            // former may be interpreted by a caller as the end of sequence.
+            status = errors::InvalidArgument(
+                "Function invocation produced OutOfRangeError: ",
+                status.error_message());
+          }
           result->UpdateStatus(status, offset);
           if (status.ok()) {
             Status allocate_status =
@@ -449,8 +455,8 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
                 // TODO(mrry): Add a version of DoParallelConcat that allows us
                 // to move `tensor` where possible, to speed up string tensor
                 // batching.
-                Status copy_status = ::tensorflow::functor::DoParallelConcat(
-                    *dataset()->device_, tensor, offset, batch);
+                Status copy_status =
+                    batch_util::CopyElementToSlice(tensor, batch, offset);
                 if (!copy_status.ok()) {
                   result->UpdateStatus(copy_status, offset);
                   break;
@@ -535,11 +541,14 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
                            bool* end_of_sequence) {
         mutex_lock l(result->mu);
         if (result->num_elements == 0) {
-          *end_of_sequence = true;
-          return Status::OK();
+          if (result->status.ok() || errors::IsOutOfRange(result->status)) {
+            *end_of_sequence = true;
+            return Status::OK();
+          } else {
+            *end_of_sequence = false;
+            return result->status;
+          }
         }
-        // `f` may deliberately raise `errors::OutOfRange` to indicate that we
-        // should terminate the iteration early.
         if (!result->status.ok() && !errors::IsOutOfRange(result->status)) {
           // Deallocate tensors allocated for the output.
           result->output.clear();
@@ -569,7 +578,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         } else {
           *out_tensors = std::move(result->output);
         }
-        *end_of_sequence = result->num_elements == 0;
+        *end_of_sequence = false;
         return Status::OK();
       }
 
@@ -802,19 +811,18 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
     const std::unique_ptr<CapturedFunction> captured_func_;
     const Eigen::ThreadPoolDevice* device_;  // not owned
     const MapAndBatchIteratorFunction map_func_;
+    const bool preserve_cardinality_;
   };
 
-  const int op_version_;
   DataTypeVector output_types_;
   std::vector<PartialTensorShape> output_shapes_;
   NameAttrList func_;
+  bool preserve_cardinality_;
 };
 
-REGISTER_KERNEL_BUILDER(Name("MapAndBatchDataset").Device(DEVICE_CPU),
-                        MapAndBatchDatasetOp);
-
-REGISTER_KERNEL_BUILDER(Name("MapAndBatchDatasetV2").Device(DEVICE_CPU),
-                        MapAndBatchDatasetOp);
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalMapAndBatchDataset").Device(DEVICE_CPU),
+    MapAndBatchDatasetOp);
 
 }  // namespace
 }  // namespace data
diff --git a/tensorflow/core/kernels/data/experimental/non_serializable_dataset_op.cc b/tensorflow/core/kernels/data/experimental/non_serializable_dataset_op.cc
index 953e086de3786bcb101da9b8a15d5a19c0f8cc57..61811ea14eddc9f40987e12ce6343268da24a503 100644
--- a/tensorflow/core/kernels/data/experimental/non_serializable_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/non_serializable_dataset_op.cc
@@ -75,6 +75,8 @@ class NonSerializableDatasetOp : public UnaryDatasetOpKernel {
       return errors::Unimplemented(DebugString(), "::AsGraphDefInternal");
     }
 
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
    private:
     class Iterator : public DatasetIterator<Dataset> {
      public:
diff --git a/tensorflow/core/kernels/data/experimental/numa_map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/experimental/numa_map_and_batch_dataset_op.cc
index 75eacb4b5b2d15d525592e620dfc193ee5d8ca53..921f8ad58401dd0f06e205ce5ab04f3b27b45cd8 100644
--- a/tensorflow/core/kernels/data/experimental/numa_map_and_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/numa_map_and_batch_dataset_op.cc
@@ -59,6 +59,9 @@ class NumaMapAndBatchDatasetOp : public UnaryDatasetOpKernel {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &func_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+    // TODO(saeta): Implement support for preserve_cardinality logic.
+    OP_REQUIRES_OK(
+        ctx, ctx->GetAttr("preserve_cardinality", &preserve_cardinality_));
   }
 
  protected:
@@ -73,9 +76,10 @@ class NumaMapAndBatchDatasetOp : public UnaryDatasetOpKernel {
     int64 num_parallel_calls;
     OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "num_parallel_calls",
                                             &num_parallel_calls));
-    OP_REQUIRES(ctx, num_parallel_calls > 0 || num_parallel_calls == kAutoTune,
-                errors::InvalidArgument(
-                    "num_parallel_calls must be greater than zero."));
+    OP_REQUIRES(
+        ctx, num_parallel_calls > 0 || num_parallel_calls == model::kAutoTune,
+        errors::InvalidArgument(
+            "num_parallel_calls must be greater than zero."));
 
     bool drop_remainder;
     OP_REQUIRES_OK(ctx,
@@ -133,6 +137,17 @@ class NumaMapAndBatchDatasetOp : public UnaryDatasetOpKernel {
       return "NumaMapAndBatchDatasetOp::Dataset";
     }
 
+    // TODO(b/120482302): Note that this is inaccurate until
+    // NumaMapAndBatchMapDataset modified to preserve cardinality.
+    int64 Cardinality() const override {
+      int64 n = input_->Cardinality();
+      if (n == kInfiniteCardinality || n == kUnknownCardinality) {
+        return n;
+      }
+      return n / batch_size_ +
+             (n % batch_size_ == 0 || drop_remainder_ ? 0 : 1);
+    }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -200,9 +215,8 @@ class NumaMapAndBatchDatasetOp : public UnaryDatasetOpKernel {
 
       Status Initialize(IteratorContext* ctx) override {
         mutex_lock l(*mu_);
-        if (num_parallel_calls_->value == kAutoTune) {
+        if (num_parallel_calls_->value == model::kAutoTune) {
           num_parallel_calls_->value = ctx->runner_threadpool_size();
-          num_parallel_calls_->tunable = true;
         }
         TF_RETURN_IF_ERROR(
             dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
@@ -1129,6 +1143,7 @@ class NumaMapAndBatchDatasetOp : public UnaryDatasetOpKernel {
   DataTypeVector output_types_;
   std::vector<PartialTensorShape> output_shapes_;
   NameAttrList func_;
+  bool preserve_cardinality_;
 };
 
 REGISTER_KERNEL_BUILDER(
diff --git a/tensorflow/core/kernels/data/experimental/parallel_interleave_dataset_op.cc b/tensorflow/core/kernels/data/experimental/parallel_interleave_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0230f90aba1c849483da5f8d7297c44c8a1174de
--- /dev/null
+++ b/tensorflow/core/kernels/data/experimental/parallel_interleave_dataset_op.cc
@@ -0,0 +1,1085 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <atomic>
+#include <deque>
+#include <utility>
+
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/stats_aggregator.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/captured_function.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
+#include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/util/ptr_util.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+// See documentation in ../../ops/dataset_ops.cc for a high-level
+// description of the following op.
+
+class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
+ public:
+  explicit ParallelInterleaveDatasetOp(OpKernelConstruction* ctx)
+      : UnaryDatasetOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &interleave_func_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+  }
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    int64 cycle_length = 0;
+    OP_REQUIRES_OK(ctx,
+                   ParseScalarArgument(ctx, "cycle_length", &cycle_length));
+    OP_REQUIRES(ctx, cycle_length > 0,
+                errors::InvalidArgument("`cycle_length` must be > 0"));
+
+    int64 block_length = 0;
+    OP_REQUIRES_OK(ctx,
+                   ParseScalarArgument(ctx, "block_length", &block_length));
+    OP_REQUIRES(ctx, block_length > 0,
+                errors::InvalidArgument("`block_length` must be > 0"));
+
+    bool sloppy = false;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "sloppy", &sloppy));
+
+    int64 buffer_output_elements = 0;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "buffer_output_elements",
+                                            &buffer_output_elements));
+    OP_REQUIRES(
+        ctx, buffer_output_elements > 0,
+        errors::InvalidArgument("`buffer_output_elements` must be > 0"));
+
+    int64 prefetch_input_elements = 0;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "prefetch_input_elements",
+                                            &prefetch_input_elements));
+    OP_REQUIRES(
+        ctx, prefetch_input_elements >= 0,
+        errors::InvalidArgument("`prefetch_input_elements` must be >= 0"));
+
+    std::unique_ptr<CapturedFunction> captured_func;
+    OP_REQUIRES_OK(
+        ctx, CapturedFunction::Create(interleave_func_, ctx, "other_arguments",
+                                      &captured_func));
+
+    *output =
+        new Dataset(ctx, input, interleave_func_, std::move(captured_func),
+                    cycle_length, block_length, sloppy, buffer_output_elements,
+                    prefetch_input_elements, output_types_, output_shapes_);
+  }
+
+ private:
+  class Dataset : public DatasetBase {
+   public:
+    Dataset(OpKernelContext* ctx, const DatasetBase* input,
+            const NameAttrList& func,
+            std::unique_ptr<CapturedFunction> captured_func, int64 cycle_length,
+            int64 block_length, bool sloppy, int64 buffer_output_elements,
+            int64 prefetch_input_elements, const DataTypeVector& output_types,
+            const std::vector<PartialTensorShape>& output_shapes)
+        : DatasetBase(DatasetContext(ctx)),
+          input_(input),
+          interleave_func_(func),
+          captured_func_(std::move(captured_func)),
+          cycle_length_(cycle_length),
+          block_length_(block_length),
+          sloppy_(sloppy),
+          buffer_output_elements_(buffer_output_elements),
+          prefetch_input_elements_(prefetch_input_elements),
+          output_types_(output_types),
+          output_shapes_(output_shapes) {
+      input_->Ref();
+    }
+
+    ~Dataset() override { input_->Unref(); }
+
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(new Iterator(
+          {this, strings::StrCat(prefix, "::ParallelInterleave")}));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return output_types_;
+    }
+
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return output_shapes_;
+    }
+
+    string DebugString() const override {
+      return "ParallelInterleaveDatasetOp::Dataset";
+    }
+
+   protected:
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      TF_RETURN_IF_ERROR(b->AddFunction(ctx, interleave_func_.name()));
+      Node* input_node;
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_node));
+      Node* cycle_length_node;
+      TF_RETURN_IF_ERROR(b->AddScalar(cycle_length_, &cycle_length_node));
+      Node* block_length_node;
+      TF_RETURN_IF_ERROR(b->AddScalar(block_length_, &block_length_node));
+      Node* sloppy_node;
+      TF_RETURN_IF_ERROR(b->AddScalar(sloppy_, &sloppy_node));
+      Node* buffer_output_elements_node;
+      TF_RETURN_IF_ERROR(
+          b->AddScalar(buffer_output_elements_, &buffer_output_elements_node));
+      Node* prefetch_input_elements_node;
+      TF_RETURN_IF_ERROR(b->AddScalar(prefetch_input_elements_,
+                                      &prefetch_input_elements_node));
+      DataTypeVector other_arguments_types;
+      other_arguments_types.reserve(captured_func_->captured_inputs().size());
+      std::vector<Node*> other_arguments;
+      other_arguments.reserve(captured_func_->captured_inputs().size());
+      for (const Tensor& t : captured_func_->captured_inputs()) {
+        Node* node;
+        TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+        other_arguments.emplace_back(node);
+        other_arguments_types.emplace_back(t.dtype());
+      }
+      AttrValue f;
+      b->BuildAttrValue(interleave_func_, &f);
+      AttrValue other_arguments_types_attr;
+      b->BuildAttrValue(other_arguments_types, &other_arguments_types_attr);
+
+      TF_RETURN_IF_ERROR(b->AddDataset(
+          this,
+          {{0, input_node},
+           {2, cycle_length_node},
+           {3, block_length_node},
+           {4, sloppy_node},
+           {5, buffer_output_elements_node},
+           {6, prefetch_input_elements_node}},
+          {{1, other_arguments}},
+          {{"f", f}, {"Targuments", other_arguments_types_attr}}, output));
+      return Status::OK();
+    }
+
+   private:
+    int64 num_threads() const {
+      return cycle_length_ + prefetch_input_elements_;
+    }
+
+    // Parallel interleave's implementation is designed around a few principles:
+    //  1. Thread creation is relatively expensive. (Not reusing
+    //     threads causes a number of indirect costs such as poorer tcmalloc
+    //     performance due to thread-local caches, etc.) We allocate a fixed
+    //     number of threads at the start and never change. This is why we've
+    //     fused functionality that is theoretically orthogonal (i.e.
+    //     .prefetch()) into the implementation.
+    //  2. Drop-in replacement for standard interleave. The goal will be to
+    //     auto-opt people into an optimized implementation without any work
+    //     on the customer's part. We thus go through great pains to maintain
+    //     identical iteration orders, full determinism (disabled only via a
+    //     flag, etc.)
+    //  3. Performance across a variety of environments and I/O envelopes.
+    //
+    // The actual implementation centers around a collection of worker threads
+    // and their corresponding worker state (tracked in the `workers_` vector).
+    // Worker threads repeatedly receive a vector of Tensors that are used as
+    // input to the flat-map function (`captured_func_`). The output of this
+    // function must be a dataset. The worker thread then repeatedly calls
+    // `GetNext()`, maintaining a buffer of elements to minimize the likelihood
+    // that a caller will block waiting for an element to be produced.
+    //
+    // Pointers to these worker states are kept in 2 disjoint data structures:
+    //  1. `interleave_indices_` is a vector containing indices of WorkerStates
+    //     in `workers_` that we are interleaving. Worker threads backing these
+    //     WorkerStates should be regularly producing values.
+    //  2. `staging_indices_` is a deque containing indices of WorkerStates in
+    //     `workers_` that we will move to `interleave_indices_` when an
+    //     iterator in `interleave_indices_` is exhausted.
+    //
+    // The client calls `GetNext[Internal]()` to retrieve an output element. The
+    // internal implementation updates the state of `interleave_indices_` and
+    // `staging_indices_` as output iterators (run by the worker threads) are
+    // exhausted.
+    //
+    // `input_impl_` is the input iterator that generates arguments for the
+    // flat-map function (`captured_func_`). It is set to an iterator at
+    // Iterator construction, and is fixed until we consume all input elements.
+    // Once it is exhausted, we reset the unique_ptr to eagerly deallocate
+    // memory.
+    //
+    // A few invariants are maintained:
+    //  1. No element in interleave_indices_ should be a -1 unless
+    //     `staging_indices_` is empty and `input_impl_` is empty.
+    //  2. Every `worker_` element is pointed to by at most one element of the
+    //     union of `interleave_indices_` and `staging_indices_`.
+    //  3. Unless `input_impl_` is empty, every `worker_` must be pointed to by
+    //     an element in `interleave_indices_` or `staging_indices_`.
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params),
+            workers_(dataset()->num_threads()),
+            worker_thread_states_(dataset()->num_threads()) {}
+
+      ~Iterator() override {
+        mutex_lock l(mu_);
+        cancelled_ = true;
+        // Notify all workers in case they are blocked.
+        for (auto& worker : workers_) {
+          worker.cond_var.notify_all();
+        }
+      }
+
+      Status Initialize(IteratorContext* ctx) override {
+        TF_RETURN_IF_ERROR(
+            dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
+        return dataset()->captured_func_->Instantiate(
+            ctx, &instantiated_captured_func_);
+      }
+
+      // It is implemented so that it matches the deterministic interleave
+      // unless getting the next element would block and we are allowed to be
+      // sloppy.
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(EnsureWorkerThreadsStarted(ctx));
+        while (!cancelled_) {
+          // Wait for an item to become available, blocking if necessary. If we
+          // are allowed to be sloppy, we can skip over input datasets that do
+          // not have an item readily available.
+          bool can_produce_elements = false;
+          bool must_wait_for_input = true;
+          for (int64 i = 0; i < interleave_indices_.size(); ++i) {
+            int64 index = (next_index_ + i) % interleave_indices_.size();
+            int64 current_worker_index = interleave_indices_[index];
+            if (current_worker_index < 0) {
+              continue;  // Empty interleave elements.
+            }
+            WorkerState* current_worker = &workers_[current_worker_index];
+            can_produce_elements |= current_worker->MayHaveElements();
+            if (!current_worker->outputs.empty()) {
+              // We have an element!
+              next_index_ = index;
+              const bool element_acquired_sloppily =
+                  dataset()->sloppy_ && i > 1;
+              if (!element_acquired_sloppily) {
+                // If the element was acquired in the regular (non-sloppy)
+                // order, then advance the current block and cycle pointers to
+                // the next element in the regular order.
+                block_count_++;
+                if (block_count_ == dataset()->block_length_) {
+                  next_index_ = (index + 1) % interleave_indices_.size();
+                  block_count_ = 0;
+                }
+              } else {
+                block_count_ = 0;
+              }
+              *end_of_sequence = false;
+              Status s = current_worker->outputs.front().status;
+              current_worker->outputs.front().output.swap(*out_tensors);
+              current_worker->outputs.pop_front();
+              current_worker->cond_var.notify_one();
+              return s;
+            } else if (current_worker->is_producing && !dataset()->sloppy_) {
+              // current_worker.outputs.empty(), and we must wait for this
+              // iterator.
+              if (next_index_ != index) {
+                // We have advanced to a new iterator; reset block counts.
+                next_index_ = index;
+                block_count_ = 0;
+              }
+              break;
+            } else if (!current_worker->is_producing) {
+              // This iterator has reached end of input.
+              interleave_indices_[index] = -1;
+              if (input_impl_) {
+                // Start prefetching a new iterator.
+                std::vector<Tensor> args;
+                bool end_of_input = false;
+                Status s = input_impl_->GetNext(ctx, &args, &end_of_input);
+                if (end_of_input) {
+                  input_impl_.reset();
+                } else {
+                  current_worker->SetInputs(s, std::move(args));
+                  staging_indices_.emplace_back(current_worker_index);
+                }
+              }
+
+              if (!staging_indices_.empty()) {
+                // Move a worker from `staging_indices_` to
+                // `interleave_indices_`.
+                interleave_indices_[index] = staging_indices_.front();
+                staging_indices_.pop_front();
+
+                next_index_ = (index + 1) % interleave_indices_.size();
+                block_count_ = 0;
+                // Restart the inner [for] loop
+                can_produce_elements = true;
+                must_wait_for_input = false;
+                break;
+              }
+            }
+          }
+
+          if (!can_produce_elements && !input_impl_) {
+            // No potential for future values.
+            *end_of_sequence = true;
+            return Status::OK();
+          }
+
+          if (must_wait_for_input) {
+            // Wait for elements to become available.
+            RecordStop(ctx);
+            if (dataset()->sloppy_) {
+              sloppy_cond_var_.wait(l);
+            } else {
+              workers_[interleave_indices_[next_index_]].cond_var.wait(l);
+            }
+            RecordStart(ctx);
+          }
+        }
+        return errors::Cancelled(
+            "ParallelInterleaveDatasetOp::Dataset::Iterator::GetNext");
+      }
+
+     protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeAsyncInterleaveManyNode(std::move(args),
+                                                  /*parameters=*/{});
+      }
+
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        // The order of locking is important here to avoid deadlock.
+        mutex_lock l(mu_);
+        mutex_lock ckpt_l(ckpt_mu_);
+        if (input_impl_) {
+          TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
+        } else {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("input_exhausted"), ""));
+        }
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("next_index"), next_index_));
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("block_count"), block_count_));
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("workers_size"), workers_.size()));
+        for (int i = 0; i < workers_.size(); ++i) {
+          TF_RETURN_IF_ERROR(WriteWorkerStateLocked(writer, i));
+        }
+        for (int i = 0; i < worker_thread_states_.size(); ++i) {
+          TF_RETURN_IF_ERROR(WriteWorkerThreadStateLocked(writer, i));
+        }
+        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("interleave_size"),
+                                               interleave_indices_.size()));
+        for (int i = 0; i < interleave_indices_.size(); ++i) {
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              full_name(strings::StrCat("interleave_indices_", i)),
+              interleave_indices_[i]));
+        }
+        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("staging_size"),
+                                               staging_indices_.size()));
+        for (int i = 0; i < staging_indices_.size(); ++i) {
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              full_name(strings::StrCat("staging_indices_", i)),
+              staging_indices_[i]));
+        }
+        if (!worker_threads_.empty()) {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("worker_threads_running"), ""));
+        }
+        return Status::OK();
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        // The order of locking is important here to avoid deadlock.
+        mutex_lock l(mu_);
+        mutex_lock ckpt_l(ckpt_mu_);
+        if (!reader->Contains(full_name("input_exhausted"))) {
+          TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
+        } else {
+          input_impl_.reset();
+        }
+        int64 temp;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("next_index"), &temp));
+        next_index_ = size_t(temp);
+        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("block_count"), &temp));
+        block_count_ = size_t(temp);
+
+        // Restore WorkerStates.
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(full_name("workers_size"), &temp));
+        if (temp != dataset()->num_threads()) {
+          return errors::Internal("Expected ", dataset()->num_threads(),
+                                  " worker states but found ", temp, ".");
+        }
+        for (size_t i = 0; i < dataset()->num_threads(); ++i) {
+          TF_RETURN_IF_ERROR(ReadWorkerStateLocked(reader, i, ctx));
+        }
+        for (size_t i = 0; i < dataset()->num_threads(); ++i) {
+          TF_RETURN_IF_ERROR(ReadWorkerThreadStateLocked(reader, i, ctx));
+        }
+
+        // Restore `interleave_indices_`.
+        std::set<int64> all_indices;
+        {
+          int64 interleave_size;
+          TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("interleave_size"),
+                                                &interleave_size));
+          interleave_indices_.reserve(interleave_size);
+          for (int64 i = 0; i < interleave_size; ++i) {
+            int64 temp;
+            TF_RETURN_IF_ERROR(reader->ReadScalar(
+                full_name(strings::StrCat("interleave_indices_", i)), &temp));
+            if (temp >= 0 && all_indices.find(temp) != all_indices.end()) {
+              return errors::Internal(
+                  "Duplicate entry for ", temp,
+                  " found when reading interleave and staging indices.");
+            }
+            if (temp >= 0) {
+              all_indices.insert(temp);
+            }
+            interleave_indices_.emplace_back(temp);
+          }
+        }
+
+        // Restore `staging_indices_`.
+        {
+          int64 staging_size;
+          TF_RETURN_IF_ERROR(
+              reader->ReadScalar(full_name("staging_size"), &staging_size));
+          for (int i = 0; i < staging_size; ++i) {
+            int64 temp;
+            TF_RETURN_IF_ERROR(reader->ReadScalar(
+                full_name(strings::StrCat("staging_indices_", i)), &temp));
+            if (all_indices.find(temp) != all_indices.end()) {
+              return errors::Internal(
+                  "Duplicate entry for ", temp,
+                  " found when reading interleave and staging indices.");
+            }
+            if (temp >= 0) {
+              all_indices.insert(temp);
+            }
+            staging_indices_.emplace_back(temp);
+          }
+        }
+
+        // Start Worker threads.
+        if (reader->Contains(full_name("worker_threads_running"))) {
+          worker_threads_.reserve(dataset()->num_threads());
+          for (size_t i = 0; i < dataset()->num_threads(); ++i) {
+            std::shared_ptr<IteratorContext> new_ctx(new IteratorContext(*ctx));
+            worker_threads_.emplace_back(ctx->env()->StartThread(
+                {}, strings::StrCat("tf_data_parallel_interleave_worker_", i),
+                [this, new_ctx, i]() { WorkerThread(new_ctx, i); }));
+          }
+        }
+        return Status::OK();
+      }
+
+     private:
+      // OutputElem contains the information from a call to GetNext by an output
+      // iterator.
+      struct OutputElem {
+        // The output iterator sets `status` if getting the output element
+        // fails.
+        Status status;
+        // The buffered data element.
+        std::vector<Tensor> output;
+
+        explicit OutputElem(const Status& s) : status(s) {}
+      };
+
+      // Worker threads operate on their relevant WorkerState structs.
+      //
+      // WorkerState's fields are all protected by mu_;
+      struct WorkerState {
+        // The arguments to be used to construct an output iterator.
+        std::vector<Tensor> input;
+        // The buffered output elements.
+        std::deque<OutputElem> outputs;
+        // Set to true iff the worker thread expects to append more elements to
+        // outputs. is_producing can be false despite !outputs.empty().
+        // Concretely, all output elements will have been consumed only when:
+        // is_producing == false && outputs.empty();
+        bool is_producing = false;
+        // Condition variable used to coordinate between threads. The worker
+        // thread waits on this condition variable when it is either (1) waiting
+        // for the main thread to add arguments to `input`, or (2) waiting for
+        // the main thread to consume an element of `outputs`. The main thread
+        // waits on cond_var if it is waiting for the worker thread to produce
+        // an element into `outputs` (this implies sloppy_==false).
+        condition_variable cond_var;
+
+        inline bool MayHaveElements() const {
+          return is_producing || !outputs.empty();
+        }
+
+        // Sets inputs for a worker thread and notifies it to start processing.
+        void SetInputs(const Status& s, std::vector<Tensor> input_arguments) {
+          if (s.ok()) {
+            DCHECK(!MayHaveElements())
+                << "Tried to start inputs, despite already producing!";
+            input = std::move(input_arguments);
+            is_producing = true;
+            cond_var.notify_one();
+          } else {
+            outputs.emplace_back(s);
+          }
+        }
+      };
+
+      // The internal state of a worker thread that is not already captured
+      // in its `WorkerState`.
+      //
+      // This is needed only for checkpointing purposes. We keep this
+      // separate from `WorkerState` and guard its fields using a separate
+      // lock `ckpt_mu_` so as to not affect the performance of main pipeline.
+      struct WorkerThreadState {
+        // The output element that has been produced from the input iterator
+        // and is waiting to be added to `WorkerState.outputs`.
+        OutputElem output_elem;
+
+        // Whether the input iterator returned an `end_of_sequence`.
+        bool end_of_sequence = false;
+
+        // Status returned from `MakeIteratorFromInputElement`.
+        Status iterator_creation_status;
+
+        // The arguments to be used to construct `iterator`.
+        std::vector<Tensor> input;
+
+        std::unique_ptr<IteratorBase> iterator;
+
+        WorkerThreadState() : output_elem(Status::OK()) {}
+      };
+
+      Status EnsureWorkerThreadsStarted(IteratorContext* ctx)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        if (worker_threads_.empty()) {
+          worker_threads_.reserve(dataset()->num_threads());
+          for (int64 i = 0; i < dataset()->num_threads(); ++i) {
+            std::vector<Tensor> args;
+            bool end_of_input = false;
+            Status s = input_impl_->GetNext(ctx, &args, &end_of_input);
+            if (end_of_input) {
+              input_impl_.reset();
+              return Status::OK();
+            }
+            workers_[i].SetInputs(s, std::move(args));
+            std::shared_ptr<IteratorContext> new_ctx(new IteratorContext(*ctx));
+            worker_threads_.emplace_back(ctx->env()->StartThread(
+                {}, strings::StrCat("tf_data_parallel_interleave_worker_", i),
+                [this, new_ctx, i]() { WorkerThread(new_ctx, i); }));
+            if (i < dataset()->cycle_length_) {
+              interleave_indices_.push_back(i);
+            } else {
+              staging_indices_.push_back(i);
+            }
+          }
+          DCHECK(interleave_indices_.size() == dataset()->cycle_length_);
+          DCHECK(staging_indices_.size() ==
+                 dataset()->prefetch_input_elements_);
+        }
+        return Status::OK();
+      }
+
+      // Produces elements into the worker's output buffers.
+      void WorkerThread(const std::shared_ptr<IteratorContext>& ctx,
+                        const int64 thread_index) {
+        // Notes on checkpointing thread local state, i.e., `WorkerThreadState`:
+        //
+        // 1. Any local state that may need to be checkpointed should be kept
+        //    in `worker_thread_states_[thread_index]`.
+        // 2. `WorkerThreadState` should contain state that is needed only for
+        //    checkpointing, i.e., if we were to remove checkpointing support,
+        //    we could keep that state as local variables in this thread.
+        // 3. This thread should only read/write state at `thread_index`
+        //    and should not access other thread states.
+        // 4. When restoring from checkpoint, threads are started only after
+        //    the restore is complete.
+        // 5. Once restored from a checkpoint, the local state is edited only
+        //    by this thread. 3 & 4 allow making assumptions like temporarily
+        //    caching local state in this thread and using it outside a lock
+        //    e.g. `make_new_iterator`.
+        // 6. `ckpt_mu_` should be wisely used to create *consistent*
+        //    checkpoint markers.
+
+        // std::function arguments are copy-constructable, so we pass raw
+        // pointers, and then immediately wrap them to ensure correct ownership.
+        RecordStart(ctx.get());
+        auto cleanup = gtl::MakeCleanup([this, thread_index, ctx] {
+          mutex_lock l(mu_);
+          workers_[thread_index].cond_var.notify_all();
+          RecordStop(ctx.get());
+        });
+        bool make_new_iterator;
+        {
+          tf_shared_lock l(ckpt_mu_);
+          // Decide whether a new iterator should be built.
+          // 1. If there is an existing iterator, we use it.
+          // 2. If there was an error in iterator creation that could not be
+          //    notified to the client we attempt to send that to the client
+          //    first.
+          make_new_iterator =
+              worker_thread_states_[thread_index].iterator == nullptr &&
+              worker_thread_states_[thread_index].iterator_creation_status.ok();
+        }
+        // Even though `make_new_iterator` has cached values from
+        // `worker_thread_states_[thread_index]` which is guarded by ckpt_mu_,
+        // it is safe to *read* `make_new_iterator`outside of a lock without
+        // worrying about concurrent changes to values in
+        // `worker_thread_states_[thread_index]`. See comment at the start of
+        // this function for details.
+        while (true) {
+          // Whether creation of the iterator succeeded.
+          Status iterator_creation_status;
+          // 1. Build a new iterator or use the existing one.
+          if (make_new_iterator) {
+            // 1a. Get new input tensors or use the exiting ones.
+            bool read_new_input;
+            {
+              tf_shared_lock l(ckpt_mu_);
+              // worker_thread_states_[thread_index].input will be non-empty
+              // if checkpointing happened at CHECKPOINT_MARKER_A.
+              read_new_input =
+                  worker_thread_states_[thread_index].input.empty();
+            }
+
+            if (read_new_input) {
+              mutex_lock l(mu_);
+              while (!cancelled_ && !workers_[thread_index].is_producing) {
+                RecordStop(ctx.get());
+                workers_[thread_index].cond_var.wait(l);
+                RecordStart(ctx.get());
+              }
+              if (cancelled_) return;
+              // Copy the input tensors so that we do not need to block on `mu_`
+              // when building the iterator.
+              // We keep a copy of the input tensors in
+              // `WorkerThreadState.input` till the iterator is in use. This is
+              // used in `RestoreInternal` to re-build the iterator.
+              // TODO(b/78046638): Explore ways to avoid tracking the input
+              // tensors.
+              tf_shared_lock ckpt_l(ckpt_mu_);
+              worker_thread_states_[thread_index].input.swap(
+                  workers_[thread_index].input);
+              // CHECKPOINT_MARKER_A
+              // We have the input tensors but have not built the iterator yet.
+            }
+
+            // 1b. Run the user defined function to produce a new iterator.
+            {
+              tf_shared_lock l(ckpt_mu_);
+              worker_thread_states_[thread_index].iterator_creation_status =
+                  MakeIteratorFromInputElement(
+                      ctx.get(), worker_thread_states_[thread_index].input,
+                      thread_index, *instantiated_captured_func_, prefix(),
+                      &worker_thread_states_[thread_index].iterator);
+              iterator_creation_status =
+                  worker_thread_states_[thread_index].iterator_creation_status;
+              if (!iterator_creation_status.ok()) {
+                worker_thread_states_[thread_index].input.clear();
+              }
+              // CHECKPOINT_MARKER_B
+              // Either an iterator has been successfully built and placed in
+              // `worker_thread_states_[thread_index].iterator` or it failed and
+              // a non-OK status has been put in
+              // `worker_thread_states_[thread_index].iterator_creation_status`.
+            }
+          } else {
+            tf_shared_lock l(ckpt_mu_);
+            iterator_creation_status =
+                worker_thread_states_[thread_index].iterator_creation_status;
+            // Mark that we have used up the restored iterator.
+            make_new_iterator = true;
+          }
+          // 2. Start producing elements or send error state to client if
+          //    iterator creation failed.
+          if (!iterator_creation_status.ok()) {
+            mutex_lock l(mu_);
+            // Wait for space in the prefetch queue.
+            while (!cancelled_ && workers_[thread_index].outputs.size() ==
+                                      dataset()->buffer_output_elements_) {
+              RecordStop(ctx.get());
+              workers_[thread_index].cond_var.wait(l);
+              RecordStart(ctx.get());
+            }
+            if (cancelled_) return;
+            tf_shared_lock ckpt_l(ckpt_mu_);
+            workers_[thread_index].outputs.emplace_back(
+                iterator_creation_status);
+            workers_[thread_index].is_producing = false;
+            worker_thread_states_[thread_index].iterator_creation_status =
+                Status::OK();
+            // CHECKPOINT_MARKER_C
+            // Non-OK iterator creation status has been notified to the
+            // client.
+            workers_[thread_index].cond_var.notify_one();
+          } else {
+            bool end_of_sequence = false;
+            while (!end_of_sequence) {
+              // 3.a Produce an element!
+              {
+                tf_shared_lock ckpt_l(ckpt_mu_);
+                if (worker_thread_states_[thread_index]
+                        .output_elem.status.ok() &&
+                    worker_thread_states_[thread_index]
+                        .output_elem.output.empty() &&
+                    !worker_thread_states_[thread_index].end_of_sequence) {
+                  worker_thread_states_[thread_index].output_elem.status =
+                      worker_thread_states_[thread_index].iterator->GetNext(
+                          ctx.get(),
+                          &worker_thread_states_[thread_index]
+                               .output_elem.output,
+                          &worker_thread_states_[thread_index].end_of_sequence);
+                  end_of_sequence =
+                      worker_thread_states_[thread_index].end_of_sequence;
+                } else {
+                  end_of_sequence =
+                      worker_thread_states_[thread_index].end_of_sequence;
+                }
+                // CHECKPOINT_MARKER_D
+                // An element has been read or an error or end_of_sequence has
+                // been received from the input iterator and is waiting to be
+                // sent to client.
+              }
+
+              // 3.b Make it available to the client.
+              {
+                mutex_lock l(mu_);
+
+                // Wait for space in the prefetch queue.
+                while (!cancelled_ && workers_[thread_index].outputs.size() ==
+                                          dataset()->buffer_output_elements_) {
+                  RecordStop(ctx.get());
+                  workers_[thread_index].cond_var.wait(l);
+                  RecordStart(ctx.get());
+                }
+                if (cancelled_) return;
+
+                tf_shared_lock ckpt_l(ckpt_mu_);
+                workers_[thread_index].is_producing = !end_of_sequence;
+
+                // Output the element.
+
+                // Move the temporary state in WorkerThreadState to WorkerState
+                // and mark it as used.
+                if (end_of_sequence) {
+                  worker_thread_states_[thread_index].iterator.reset();
+                  worker_thread_states_[thread_index].input.clear();
+                  worker_thread_states_[thread_index].end_of_sequence = false;
+                } else {
+                  workers_[thread_index].outputs.emplace_back(
+                      worker_thread_states_[thread_index].output_elem.status);
+                  workers_[thread_index].outputs.back().output.swap(
+                      worker_thread_states_[thread_index].output_elem.output);
+                }
+                worker_thread_states_[thread_index].output_elem.status =
+                    Status::OK();
+                if (dataset()->sloppy_) {
+                  sloppy_cond_var_.notify_one();
+                } else {
+                  workers_[thread_index].cond_var.notify_one();
+                }
+                // CHECKPOINT_MARKER_E
+                // Output element or iterator status has been sent to the
+                // client.
+              }
+            }
+          }
+        }
+      }
+
+      Status WriteWorkerStateLocked(IteratorStateWriter* writer, int index)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
+        string prefix = strings::StrCat("worker_", index);
+        TF_RETURN_IF_ERROR(writer->WriteScalar(
+            full_name(strings::StrCat(prefix, "_input_size")),
+            workers_[index].input.size()));
+        for (int i = 0; i < workers_[index].input.size(); ++i) {
+          TF_RETURN_IF_ERROR(writer->WriteTensor(
+              full_name(strings::StrCat(prefix, "_input_", i)),
+              workers_[index].input[i]));
+        }
+        TF_RETURN_IF_ERROR(writer->WriteScalar(
+            full_name(strings::StrCat(prefix, "_outputs_size")),
+            workers_[index].outputs.size()));
+        for (int i = 0; i < workers_[index].outputs.size(); ++i) {
+          TF_RETURN_IF_ERROR(WriteOutputElemLocked(
+              writer, workers_[index].outputs[i],
+              full_name(strings::StrCat(prefix, "_outputs_", i))));
+        }
+        if (workers_[index].is_producing) {
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              full_name(strings::StrCat(prefix, "_is_producing")), ""));
+        }
+        return Status::OK();
+      }
+
+      Status ReadWorkerStateLocked(IteratorStateReader* reader, int index,
+                                   IteratorContext* ctx)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
+        string worker_prefix = strings::StrCat("worker_", index);
+        // Restore inputs.
+        int64 input_size;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(
+            full_name(strings::StrCat(worker_prefix, "_input_size")),
+            &input_size));
+        workers_[index].input.reserve(input_size);
+        for (int i = 0; i < input_size; ++i) {
+          workers_[index].input.emplace_back();
+          TF_RETURN_IF_ERROR(reader->ReadTensor(
+              full_name(strings::StrCat(worker_prefix, "_input_", i)),
+              &workers_[index].input.back()));
+        }
+        int64 outputs_size;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(
+            full_name(strings::StrCat(worker_prefix, "_outputs_size")),
+            &outputs_size));
+        for (int i = 0; i < outputs_size; ++i) {
+          workers_[index].outputs.emplace_back(Status::OK());
+          TF_RETURN_IF_ERROR(ReadOutputElemLocked(
+              reader, &workers_[index].outputs.back(),
+              full_name(strings::StrCat(worker_prefix, "_outputs_", i))));
+        }
+        if (reader->Contains(
+                full_name(strings::StrCat(worker_prefix, "_is_producing")))) {
+          workers_[index].is_producing = true;
+        } else {
+          workers_[index].is_producing = false;
+        }
+        return Status::OK();
+      }
+
+      Status WriteWorkerThreadStateLocked(IteratorStateWriter* writer,
+                                          int index)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
+        string prefix = strings::StrCat("worker_thread_", index);
+        if (worker_thread_states_[index].iterator != nullptr) {
+          TF_RETURN_IF_ERROR(
+              SaveInput(writer, worker_thread_states_[index].iterator));
+        } else {
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              full_name(strings::StrCat(prefix, "_iterator_exhausted")), ""));
+        }
+        TF_RETURN_IF_ERROR(writer->WriteScalar(
+            full_name(strings::StrCat(prefix, "_input_size")),
+            worker_thread_states_[index].input.size()));
+        for (int i = 0; i < worker_thread_states_[index].input.size(); ++i) {
+          TF_RETURN_IF_ERROR(writer->WriteTensor(
+              full_name(strings::StrCat(prefix, "_input_", i)),
+              worker_thread_states_[index].input[i]));
+        }
+        TF_RETURN_IF_ERROR(WriteStatusLocked(
+            writer, strings::StrCat(prefix, "_iterator_creation_status"),
+            worker_thread_states_[index].iterator_creation_status));
+        TF_RETURN_IF_ERROR(WriteOutputElemLocked(
+            writer, worker_thread_states_[index].output_elem,
+            full_name(strings::StrCat(prefix, "_output"))));
+        if (worker_thread_states_[index].end_of_sequence) {
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              full_name(strings::StrCat(prefix, "_end_of_sequence")), ""));
+        }
+        return Status::OK();
+      }
+
+      Status ReadWorkerThreadStateLocked(IteratorStateReader* reader, int index,
+                                         IteratorContext* ctx)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
+        string worker_prefix = strings::StrCat("worker_thread_", index);
+        // Restore inputs.
+        int64 input_size;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(
+            full_name(strings::StrCat(worker_prefix, "_input_size")),
+            &input_size));
+        worker_thread_states_[index].input.reserve(input_size);
+        for (int i = 0; i < input_size; ++i) {
+          worker_thread_states_[index].input.emplace_back();
+          TF_RETURN_IF_ERROR(reader->ReadTensor(
+              full_name(strings::StrCat(worker_prefix, "_input_", i)),
+              &worker_thread_states_[index].input.back()));
+        }
+        // Restore iterator.
+        if (reader->Contains(full_name(
+                strings::StrCat(worker_prefix, "_iterator_exhausted")))) {
+          worker_thread_states_[index].iterator.reset();
+        } else {
+          std::unique_ptr<IteratorBase> iterator;
+          Status s = MakeIteratorFromInputElement(
+              ctx, worker_thread_states_[index].input, index,
+              *instantiated_captured_func_, prefix(), &iterator);
+          TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, iterator));
+          worker_thread_states_[index].iterator.swap(iterator);
+        }
+        TF_RETURN_IF_ERROR(ReadStatusLocked(
+            reader, strings::StrCat(worker_prefix, "_iterator_creation_status"),
+            &worker_thread_states_[index].iterator_creation_status));
+        TF_RETURN_IF_ERROR(ReadOutputElemLocked(
+            reader, &worker_thread_states_[index].output_elem,
+            full_name(strings::StrCat(worker_prefix, "_output"))));
+        if (reader->Contains(full_name(
+                strings::StrCat(worker_prefix, "_end_of_sequence")))) {
+          worker_thread_states_[index].end_of_sequence = true;
+        } else {
+          worker_thread_states_[index].end_of_sequence = false;
+        }
+        return Status::OK();
+      }
+
+      Status WriteOutputElemLocked(IteratorStateWriter* writer,
+                                   const OutputElem& output_elem,
+                                   const string& prefix)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
+        TF_RETURN_IF_ERROR(WriteStatusLocked(
+            writer, strings::StrCat(prefix, "_status"), output_elem.status));
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(strings::StrCat(prefix, "_output_size"),
+                                output_elem.output.size()));
+        for (int i = 0; i < output_elem.output.size(); ++i) {
+          TF_RETURN_IF_ERROR(writer->WriteTensor(
+              strings::StrCat(prefix, "_output_", i), output_elem.output[i]));
+        }
+        return Status::OK();
+      }
+
+      Status ReadOutputElemLocked(IteratorStateReader* reader,
+                                  OutputElem* output_elem, const string& prefix)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
+        TF_RETURN_IF_ERROR(ReadStatusLocked(
+            reader, strings::StrCat(prefix, "_status"), &output_elem->status));
+        int64 output_size;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(
+            strings::StrCat(prefix, "_output_size"), &output_size));
+        output_elem->output.reserve(output_size);
+        for (int i = 0; i < output_size; ++i) {
+          output_elem->output.emplace_back();
+          TF_RETURN_IF_ERROR(
+              reader->ReadTensor(strings::StrCat(prefix, "_output_", i),
+                                 &output_elem->output.back()));
+        }
+        return Status::OK();
+      }
+
+      Status WriteStatusLocked(IteratorStateWriter* writer,
+                               const string& prefix, const Status& status)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name(strings::StrCat(prefix, "_code")),
+                                static_cast<int64>(status.code())));
+        if (!status.ok()) {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name(strings::StrCat(prefix, "_msg")),
+                                  status.error_message()));
+        }
+        return Status::OK();
+      }
+
+      Status ReadStatusLocked(IteratorStateReader* reader, const string& prefix,
+                              Status* status)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
+        int64 code_int;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(
+            full_name(strings::StrCat(prefix, "_code")), &code_int));
+        error::Code code = static_cast<error::Code>(code_int);
+
+        if (code != error::Code::OK) {
+          string error_message;
+          TF_RETURN_IF_ERROR(reader->ReadScalar(
+              full_name(strings::StrCat(prefix, "_msg")), &error_message));
+          *status = Status(code, error_message);
+        } else {
+          *status = Status::OK();
+        }
+        return Status::OK();
+      }
+
+      // Mutex & condition variable to guard mutable iterator internals and
+      // coordinate among worker threads and client thread[s].
+      mutex mu_ ACQUIRED_BEFORE(ckpt_mu_);
+      // The main thread waits on this condition variable if running in sloppy
+      // mode and no values are available.
+      condition_variable sloppy_cond_var_;
+      // Mutex used to wait for a consistent state while checkpointing.
+      // Only Save and Restore require an exclusive lock on this mutex. In
+      // other scenarios we just acquire a shared lock so the pipeline's
+      // performance should not be affected in the absence of checkpointing.
+      // A thread must not wait on any condition variable while holding
+      // `ckpt_mu_` in either shared or exclusive modes.
+      mutex ckpt_mu_;
+
+      // The iterator producing elements which are converted to datasets by
+      // the dataset()->captured_func_ then interleaved together.
+      // input_impl_ is reset when we have exhausted its input.
+      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
+
+      std::unique_ptr<InstantiatedCapturedFunction> instantiated_captured_func_;
+
+      // The WorkerState structs the worker threads operate on.
+      // workers_ elements are in at most one of interleave_ and staging_.
+      std::vector<WorkerState> workers_ GUARDED_BY(mu_);
+
+      // Stores the temporary state of WorkerThreads which is not stored in
+      // WorkerState. This is used for checkpointing purposes only.
+      std::vector<WorkerThreadState> worker_thread_states_ GUARDED_BY(ckpt_mu_);
+
+      // Indices in `workers_` of iterators to interleave.
+      std::vector<int64> interleave_indices_ GUARDED_BY(mu_);
+      // Indices in `workers_` of prefetched iterators.
+      std::deque<int64> staging_indices_ GUARDED_BY(mu_);
+
+      // The index into output_elements_ for next element to produce.
+      size_t next_index_ GUARDED_BY(mu_) = 0;
+      // The number of items produced so far within the block
+      size_t block_count_ GUARDED_BY(mu_) = 0;
+      // Flag to instruct the worker threads to exit.
+      bool cancelled_ GUARDED_BY(mu_) = false;
+      // The worker threads. This must be last to ensure the
+      // threads have exited before any other members are deallocated.
+      // TODO(b/65178177): Avoid allocating additional threads.
+      std::vector<std::unique_ptr<Thread>> worker_threads_ GUARDED_BY(mu_);
+    };
+
+    const DatasetBase* const input_;
+    const NameAttrList interleave_func_;
+    const std::unique_ptr<CapturedFunction> captured_func_;
+    const int64 cycle_length_;
+    const int64 block_length_;
+    const bool sloppy_;
+    const int64 buffer_output_elements_;
+    const int64 prefetch_input_elements_;
+    const DataTypeVector output_types_;
+    const std::vector<PartialTensorShape> output_shapes_;
+  };
+
+  DataTypeVector output_types_;
+  std::vector<PartialTensorShape> output_shapes_;
+  NameAttrList interleave_func_;
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalParallelInterleaveDataset").Device(DEVICE_CPU),
+    ParallelInterleaveDatasetOp);
+
+}  // namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/parse_example_dataset_op.cc b/tensorflow/core/kernels/data/experimental/parse_example_dataset_op.cc
similarity index 98%
rename from tensorflow/core/kernels/data/parse_example_dataset_op.cc
rename to tensorflow/core/kernels/data/experimental/parse_example_dataset_op.cc
index c4ab4b28df2fb282eeebc96ff74ee96e210a535d..ea99a8b32c5a945f30945369ef2ed4f4b6725887 100644
--- a/tensorflow/core/kernels/data/parse_example_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/parse_example_dataset_op.cc
@@ -187,7 +187,8 @@ class ParseExampleDatasetOp : public UnaryDatasetOpKernel {
           new ParseExampleFunctor(this));
       return NewParallelMapIterator(
           {this, strings::StrCat(prefix, "::ParseExample")}, input_,
-          std::move(parse_example_functor), num_parallel_calls_, sloppy_);
+          std::move(parse_example_functor), num_parallel_calls_, sloppy_,
+          /*preserve_cardinality=*/true);
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -202,6 +203,8 @@ class ParseExampleDatasetOp : public UnaryDatasetOpKernel {
       return "ParseExampleDatasetOp::Dataset";
     }
 
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -386,8 +389,9 @@ class ParseExampleDatasetOp : public UnaryDatasetOpKernel {
   std::vector<std::size_t> elements_per_stride_;
 };
 
-REGISTER_KERNEL_BUILDER(Name("ParseExampleDataset").Device(DEVICE_CPU),
-                        ParseExampleDatasetOp);
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalParseExampleDataset").Device(DEVICE_CPU),
+    ParseExampleDatasetOp);
 
 }  // namespace
 }  // namespace data
diff --git a/tensorflow/core/kernels/data/random_dataset_op.cc b/tensorflow/core/kernels/data/experimental/random_dataset_op.cc
similarity index 97%
rename from tensorflow/core/kernels/data/random_dataset_op.cc
rename to tensorflow/core/kernels/data/experimental/random_dataset_op.cc
index 816405fea90ef5de5fb5da9db03818b8775b0d3e..6d85cd5c450640a0042add2ead26836433166ade 100644
--- a/tensorflow/core/kernels/data/random_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/random_dataset_op.cc
@@ -76,6 +76,8 @@ class RandomDatasetOp : public DatasetOpKernel {
                              ")::Dataset");
     }
 
+    int64 Cardinality() const override { return kInfiniteCardinality; }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -152,7 +154,7 @@ class RandomDatasetOp : public DatasetOpKernel {
   };
 };
 
-REGISTER_KERNEL_BUILDER(Name("RandomDataset").Device(DEVICE_CPU),
+REGISTER_KERNEL_BUILDER(Name("ExperimentalRandomDataset").Device(DEVICE_CPU),
                         RandomDatasetOp);
 
 }  // namespace
diff --git a/tensorflow/core/kernels/data/scan_dataset_op.cc b/tensorflow/core/kernels/data/experimental/scan_dataset_op.cc
similarity index 88%
rename from tensorflow/core/kernels/data/scan_dataset_op.cc
rename to tensorflow/core/kernels/data/experimental/scan_dataset_op.cc
index 12d07f191929f24dcce06bcd27cbb047d991aae6..0d9a629a27f907fca2214a574db1ea0074a9ed2e 100644
--- a/tensorflow/core/kernels/data/scan_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/scan_dataset_op.cc
@@ -37,6 +37,8 @@ class ScanDatasetOp : public UnaryDatasetOpKernel {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("Tstate", &state_types_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+    OP_REQUIRES_OK(
+        ctx, ctx->GetAttr("preserve_cardinality", &preserve_cardinality_));
   }
 
   void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
@@ -53,7 +55,7 @@ class ScanDatasetOp : public UnaryDatasetOpKernel {
 
     *output = new Dataset(ctx, input, func_, std::move(initial_state),
                           std::move(captured_func), state_types_, output_types_,
-                          output_shapes_);
+                          output_shapes_, preserve_cardinality_);
   }
 
  private:
@@ -64,7 +66,8 @@ class ScanDatasetOp : public UnaryDatasetOpKernel {
             std::unique_ptr<CapturedFunction> captured_func,
             const DataTypeVector& state_types,
             const DataTypeVector& output_types,
-            const std::vector<PartialTensorShape>& output_shapes)
+            const std::vector<PartialTensorShape>& output_shapes,
+            bool preserve_cardinality)
         : DatasetBase(DatasetContext(ctx)),
           input_(input),
           func_(func),
@@ -72,7 +75,8 @@ class ScanDatasetOp : public UnaryDatasetOpKernel {
           captured_func_(std::move(captured_func)),
           state_types_(state_types),
           output_types_(output_types),
-          output_shapes_(output_shapes) {
+          output_shapes_(output_shapes),
+          preserve_cardinality_(preserve_cardinality) {
       input_->Ref();
     }
 
@@ -93,6 +97,8 @@ class ScanDatasetOp : public UnaryDatasetOpKernel {
 
     string DebugString() const override { return "ScanDatasetOp::Dataset"; }
 
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -123,12 +129,15 @@ class ScanDatasetOp : public UnaryDatasetOpKernel {
       b->BuildAttrValue(state_types_, &state_types);
       AttrValue other_arguments_types_attr;
       b->BuildAttrValue(other_arguments_types, &other_arguments_types_attr);
+      AttrValue preserve_cardinality_attr;
+      b->BuildAttrValue(preserve_cardinality_, &preserve_cardinality_attr);
       TF_RETURN_IF_ERROR(
           b->AddDataset(this, {{0, input_node}},
                         {{1, initial_state_nodes}, {2, other_arguments}},
                         {{"f", f},
                          {"Tstate", state_types},
-                         {"Targuments", other_arguments_types_attr}},
+                         {"Targuments", other_arguments_types_attr},
+                         {"preserve_cardinality", preserve_cardinality_attr}},
                         output));
       return Status::OK();
     }
@@ -203,10 +212,19 @@ class ScanDatasetOp : public UnaryDatasetOpKernel {
             out_tensors->push_back(std::move(state_and_output[i]));
           }
         } else if (errors::IsOutOfRange(s)) {
-          // `f` may deliberately raise `errors::OutOfRange` to indicate
-          // that we should terminate the iteration early.
-          *end_of_sequence = true;
-          return Status::OK();
+          if (dataset()->preserve_cardinality_) {
+            // To guarantee that the transformation preserves the cardinality of
+            // the dataset, we convert `OutOfRange` to `InvalidArgument` as the
+            // former may be interpreted by a caller as the end of sequence.
+            return errors::InvalidArgument(
+                "Function invocation produced OutOfRangeError: ",
+                s.error_message());
+          } else {
+            // `f` may deliberately raise `errors::OutOfRange` to indicate
+            // that we should terminate the iteration early.
+            *end_of_sequence = true;
+            return Status::OK();
+          }
         }
         return s;
       }
@@ -263,15 +281,18 @@ class ScanDatasetOp : public UnaryDatasetOpKernel {
     const DataTypeVector state_types_;
     const DataTypeVector output_types_;
     const std::vector<PartialTensorShape> output_shapes_;
+    const bool preserve_cardinality_;
   };
 
   DataTypeVector state_types_;
   DataTypeVector output_types_;
   std::vector<PartialTensorShape> output_shapes_;
   NameAttrList func_;
+  bool preserve_cardinality_;
 };
 
-REGISTER_KERNEL_BUILDER(Name("ScanDataset").Device(DEVICE_CPU), ScanDatasetOp);
+REGISTER_KERNEL_BUILDER(Name("ExperimentalScanDataset").Device(DEVICE_CPU),
+                        ScanDatasetOp);
 
 }  // namespace
 }  // namespace data
diff --git a/tensorflow/core/kernels/data/stats_aggregator_dataset_op.cc b/tensorflow/core/kernels/data/experimental/set_stats_aggregator_dataset_op.cc
similarity index 97%
rename from tensorflow/core/kernels/data/stats_aggregator_dataset_op.cc
rename to tensorflow/core/kernels/data/experimental/set_stats_aggregator_dataset_op.cc
index a21b3fc16b7a93978bd2e03081aec9e7aa5e5ba4..fe128005faca9bd986e7c85600f7f871ebb97a25 100644
--- a/tensorflow/core/kernels/data/stats_aggregator_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/set_stats_aggregator_dataset_op.cc
@@ -129,6 +129,8 @@ class SetStatsAggregatorDatasetOp : public UnaryDatasetOpKernel {
       return "SetStatsAggregatorDatasetOp::Dataset";
     }
 
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -203,8 +205,9 @@ class SetStatsAggregatorDatasetOp : public UnaryDatasetOpKernel {
   };
 };
 
-REGISTER_KERNEL_BUILDER(Name("SetStatsAggregatorDataset").Device(DEVICE_CPU),
-                        SetStatsAggregatorDatasetOp);
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalSetStatsAggregatorDataset").Device(DEVICE_CPU),
+    SetStatsAggregatorDatasetOp);
 }  // namespace
 }  // namespace data
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/experimental/sleep_dataset_op.cc b/tensorflow/core/kernels/data/experimental/sleep_dataset_op.cc
index c7bf89cbdeb4f81da4346ceef68b46598b032d0c..d2fb8ac4f33b1e844bb39cc70a47ccb15424ace7 100644
--- a/tensorflow/core/kernels/data/experimental/sleep_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/sleep_dataset_op.cc
@@ -68,6 +68,8 @@ class SleepDatasetOp : public UnaryDatasetOpKernel {
 
     string DebugString() const override { return "SleepDatasetOp::Dataset"; }
 
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
diff --git a/tensorflow/core/kernels/data/slide_dataset_op.cc b/tensorflow/core/kernels/data/experimental/sliding_window_dataset_op.cc
similarity index 95%
rename from tensorflow/core/kernels/data/slide_dataset_op.cc
rename to tensorflow/core/kernels/data/experimental/sliding_window_dataset_op.cc
index e67c5272b6fa3eee4bd852da45dd5081e7ce12e4..1ce4fbd3136d7fbd245fbb920ff658c4eae794c6 100644
--- a/tensorflow/core/kernels/data/slide_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/sliding_window_dataset_op.cc
@@ -29,9 +29,9 @@ namespace {
 // See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following op.
 
-class SlideDatasetOp : public UnaryDatasetOpKernel {
+class SlidingWindowDatasetOp : public UnaryDatasetOpKernel {
  public:
-  explicit SlideDatasetOp(OpKernelConstruction* ctx)
+  explicit SlidingWindowDatasetOp(OpKernelConstruction* ctx)
       : UnaryDatasetOpKernel(ctx) {}
 
   void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
@@ -99,10 +99,18 @@ class SlideDatasetOp : public UnaryDatasetOpKernel {
     }
 
     string DebugString() const override {
-      return strings::StrCat("SlideDatasetOp(", window_size_, ", ",
+      return strings::StrCat("SlidingWindowDatasetOp(", window_size_, ", ",
                              window_shift_, ", ", window_stride_, ")::Dataset");
     }
 
+    int64 Cardinality() const override {
+      int64 n = input_->Cardinality();
+      if (n == kInfiniteCardinality || n == kUnknownCardinality) {
+        return n;
+      }
+      return n / window_shift_;
+    }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -295,8 +303,9 @@ class SlideDatasetOp : public UnaryDatasetOpKernel {
   };
 };
 
-REGISTER_KERNEL_BUILDER(Name("SlideDataset").Device(DEVICE_CPU),
-                        SlideDatasetOp);
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalSlidingWindowDataset").Device(DEVICE_CPU),
+    SlidingWindowDatasetOp);
 
 }  // namespace
 }  // namespace data
diff --git a/tensorflow/core/kernels/data/sql/BUILD b/tensorflow/core/kernels/data/experimental/sql/BUILD
similarity index 100%
rename from tensorflow/core/kernels/data/sql/BUILD
rename to tensorflow/core/kernels/data/experimental/sql/BUILD
diff --git a/tensorflow/core/kernels/data/sql/driver_manager.cc b/tensorflow/core/kernels/data/experimental/sql/driver_manager.cc
similarity index 88%
rename from tensorflow/core/kernels/data/sql/driver_manager.cc
rename to tensorflow/core/kernels/data/experimental/sql/driver_manager.cc
index 783d1e6cb28fdd3f2e42caecc300ba9bd8b22c04..58174f69a44a5e28dd2d4fd018ee45688d407054 100644
--- a/tensorflow/core/kernels/data/sql/driver_manager.cc
+++ b/tensorflow/core/kernels/data/experimental/sql/driver_manager.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/data/sql/driver_manager.h"
-#include "tensorflow/core/kernels/data/sql/sqlite_query_connection.h"
+#include "tensorflow/core/kernels/data/experimental/sql/driver_manager.h"
+#include "tensorflow/core/kernels/data/experimental/sql/sqlite_query_connection.h"
 
 namespace tensorflow {
 namespace data {
diff --git a/tensorflow/core/kernels/data/sql/driver_manager.h b/tensorflow/core/kernels/data/experimental/sql/driver_manager.h
similarity index 81%
rename from tensorflow/core/kernels/data/sql/driver_manager.h
rename to tensorflow/core/kernels/data/experimental/sql/driver_manager.h
index c5428f396b03f03390f53b6a2e50fca3821dac0c..6afadf91a478e5da470897c3aa2977462337b5e5 100644
--- a/tensorflow/core/kernels/data/sql/driver_manager.h
+++ b/tensorflow/core/kernels/data/experimental/sql/driver_manager.h
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CORE_KERNELS_DATA_SQL_DRIVER_MANAGER_H_
-#define TENSORFLOW_CORE_KERNELS_DATA_SQL_DRIVER_MANAGER_H_
+#ifndef TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_SQL_DRIVER_MANAGER_H_
+#define TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_SQL_DRIVER_MANAGER_H_
 
-#include "tensorflow/core/kernels/data/sql/query_connection.h"
+#include "tensorflow/core/kernels/data/experimental/sql/query_connection.h"
 
 namespace tensorflow {
 namespace data {
@@ -38,4 +38,4 @@ class DriverManager {
 }  // namespace data
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_KERNELS_DATA_SQL_DRIVER_MANAGER_H_
+#endif  // TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_SQL_DRIVER_MANAGER_H_
diff --git a/tensorflow/core/kernels/data/sql/query_connection.h b/tensorflow/core/kernels/data/experimental/sql/query_connection.h
similarity index 92%
rename from tensorflow/core/kernels/data/sql/query_connection.h
rename to tensorflow/core/kernels/data/experimental/sql/query_connection.h
index 2fd229a9bfd4dd4f6e49eaa2452dbd9140050523..10c66436792a9794112a38a4a590e2e9fc3c05c5 100644
--- a/tensorflow/core/kernels/data/sql/query_connection.h
+++ b/tensorflow/core/kernels/data/experimental/sql/query_connection.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CORE_KERNELS_DATA_SQL_QUERY_CONNECTION_H_
-#define TENSORFLOW_CORE_KERNELS_DATA_SQL_QUERY_CONNECTION_H_
+#ifndef TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_SQL_QUERY_CONNECTION_H_
+#define TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_SQL_QUERY_CONNECTION_H_
 
 #include "tensorflow/core/framework/tensor.h"
 
@@ -67,4 +67,4 @@ class QueryConnection {
 }  // namespace data
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_KERNELS_DATA_SQL_QUERY_CONNECTION_H_
+#endif  // TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_SQL_QUERY_CONNECTION_H_
diff --git a/tensorflow/core/kernels/data/sql/sqlite_query_connection.cc b/tensorflow/core/kernels/data/experimental/sql/sqlite_query_connection.cc
similarity index 97%
rename from tensorflow/core/kernels/data/sql/sqlite_query_connection.cc
rename to tensorflow/core/kernels/data/experimental/sql/sqlite_query_connection.cc
index 1d374898dc321b28b091b90c232afefb0457697b..cadceee8f516c08a45b63702aa321944e8f0a21e 100644
--- a/tensorflow/core/kernels/data/sql/sqlite_query_connection.cc
+++ b/tensorflow/core/kernels/data/experimental/sql/sqlite_query_connection.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/data/sql/sqlite_query_connection.h"
+#include "tensorflow/core/kernels/data/experimental/sql/sqlite_query_connection.h"
 
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/register_types.h"
@@ -106,7 +106,7 @@ void SqliteQueryConnection::FillTensorWithResultSetEntry(
       break;
     // Error preemptively thrown by SqlDatasetOp::MakeDataset in this case.
     default:
-      LOG(FATAL)
+      LOG(ERROR)
           << "Use of unsupported TensorFlow data type by 'SqlQueryConnection': "
           << DataTypeString(data_type) << ".";
   }
diff --git a/tensorflow/core/kernels/data/sql/sqlite_query_connection.h b/tensorflow/core/kernels/data/experimental/sql/sqlite_query_connection.h
similarity index 84%
rename from tensorflow/core/kernels/data/sql/sqlite_query_connection.h
rename to tensorflow/core/kernels/data/experimental/sql/sqlite_query_connection.h
index 175492c49dba512f602c7153f1ab66ba6427aa3d..61df29065e15281067ec0fbcb499d382b0ba73f8 100644
--- a/tensorflow/core/kernels/data/sql/sqlite_query_connection.h
+++ b/tensorflow/core/kernels/data/experimental/sql/sqlite_query_connection.h
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CORE_KERNELS_DATA_SQL_SQLITE_QUERY_CONNECTION_H_
-#define TENSORFLOW_CORE_KERNELS_DATA_SQL_SQLITE_QUERY_CONNECTION_H_
+#ifndef TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_SQL_SQLITE_QUERY_CONNECTION_H_
+#define TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_SQL_SQLITE_QUERY_CONNECTION_H_
 
 #include <memory>
 
-#include "tensorflow/core/kernels/data/sql/query_connection.h"
+#include "tensorflow/core/kernels/data/experimental/sql/query_connection.h"
 #include "tensorflow/core/lib/db/sqlite.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -53,4 +53,4 @@ class SqliteQueryConnection : public QueryConnection {
 }  // namespace data
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_KERNELS_DATA_SQL_SQLITE_QUERY_CONNECTION_H_
+#endif  // TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_SQL_SQLITE_QUERY_CONNECTION_H_
diff --git a/tensorflow/core/kernels/data/sql_dataset_ops.cc b/tensorflow/core/kernels/data/experimental/sql_dataset_op.cc
similarity index 96%
rename from tensorflow/core/kernels/data/sql_dataset_ops.cc
rename to tensorflow/core/kernels/data/experimental/sql_dataset_op.cc
index f01ecf84afab05fcbf87f70668489d2358e66817..c16d8ed02ccdfb01a41ff9206a003f4a8c04a667 100644
--- a/tensorflow/core/kernels/data/sql_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/experimental/sql_dataset_op.cc
@@ -17,8 +17,8 @@ limitations under the License.
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/data/sql/driver_manager.h"
-#include "tensorflow/core/kernels/data/sql/query_connection.h"
+#include "tensorflow/core/kernels/data/experimental/sql/driver_manager.h"
+#include "tensorflow/core/kernels/data/experimental/sql/query_connection.h"
 #include "tensorflow/core/lib/io/inputbuffer.h"
 #include "tensorflow/core/lib/io/record_reader.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
@@ -214,7 +214,8 @@ class SqlDatasetOp : public DatasetOpKernel {
   std::vector<PartialTensorShape> output_shapes_;
 };
 
-REGISTER_KERNEL_BUILDER(Name("SqlDataset").Device(DEVICE_CPU), SqlDatasetOp);
+REGISTER_KERNEL_BUILDER(Name("ExperimentalSqlDataset").Device(DEVICE_CPU),
+                        SqlDatasetOp);
 
 }  // namespace
 }  // namespace data
diff --git a/tensorflow/core/kernels/data/stats_aggregator_ops.cc b/tensorflow/core/kernels/data/experimental/stats_aggregator_ops.cc
similarity index 95%
rename from tensorflow/core/kernels/data/stats_aggregator_ops.cc
rename to tensorflow/core/kernels/data/experimental/stats_aggregator_ops.cc
index 2d5146761631f8ed28ebcafac9fd670da9e3b47d..894465e1814cf93b02ecbbb053494d4c032fe243 100644
--- a/tensorflow/core/kernels/data/stats_aggregator_ops.cc
+++ b/tensorflow/core/kernels/data/experimental/stats_aggregator_ops.cc
@@ -141,10 +141,12 @@ class StatsAggregatorSummaryOp : public OpKernel {
   }
 };
 
-REGISTER_KERNEL_BUILDER(Name("StatsAggregatorHandle").Device(DEVICE_CPU),
-                        StatsAggregatorHandleOp);
-REGISTER_KERNEL_BUILDER(Name("StatsAggregatorSummary").Device(DEVICE_CPU),
-                        StatsAggregatorSummaryOp);
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalStatsAggregatorHandle").Device(DEVICE_CPU),
+    StatsAggregatorHandleOp);
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalStatsAggregatorSummary").Device(DEVICE_CPU),
+    StatsAggregatorSummaryOp);
 
 }  // namespace
 }  // namespace data
diff --git a/tensorflow/core/kernels/data/stats_dataset_ops.cc b/tensorflow/core/kernels/data/experimental/stats_dataset_ops.cc
similarity index 95%
rename from tensorflow/core/kernels/data/stats_dataset_ops.cc
rename to tensorflow/core/kernels/data/experimental/stats_dataset_ops.cc
index da0039773cee1bcdf313c03b0b01198b03c71cc5..1961f25df846e8773bf6b0266d089c9d3bac355b 100644
--- a/tensorflow/core/kernels/data/stats_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/experimental/stats_dataset_ops.cc
@@ -78,6 +78,8 @@ class LatencyStatsDatasetOp : public UnaryDatasetOpKernel {
       return "LatencyStatsDatasetOp::Dataset";
     }
 
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -186,6 +188,8 @@ class BytesProducedStatsDatasetOp : public UnaryDatasetOpKernel {
       return "BytesProducedStatsDatasetOp::Dataset";
     }
 
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -255,10 +259,12 @@ class BytesProducedStatsDatasetOp : public UnaryDatasetOpKernel {
   };
 };
 
-REGISTER_KERNEL_BUILDER(Name("LatencyStatsDataset").Device(DEVICE_CPU),
-                        LatencyStatsDatasetOp);
-REGISTER_KERNEL_BUILDER(Name("BytesProducedStatsDataset").Device(DEVICE_CPU),
-                        BytesProducedStatsDatasetOp);
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalLatencyStatsDataset").Device(DEVICE_CPU),
+    LatencyStatsDatasetOp);
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalBytesProducedStatsDataset").Device(DEVICE_CPU),
+    BytesProducedStatsDatasetOp);
 
 }  // namespace
 }  // namespace data
diff --git a/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc b/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc
index 335f2b7a4b3f4695a73bbba884a0c6bc83a67288..8ae45ed5c9d9fe199ef392a1430f359172ec5c73 100644
--- a/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc
@@ -169,6 +169,8 @@ class ThreadPoolDatasetOp : public UnaryDatasetOpKernel {
       return "ThreadPoolDatasetOp::Dataset";
     }
 
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -248,7 +250,7 @@ class MaxIntraOpParallelismDatasetOp : public UnaryDatasetOpKernel {
   class Dataset : public DatasetBase {
    public:
     Dataset(OpKernelContext* ctx, const DatasetBase* input,
-            int max_intra_op_parallelism)
+            int64 max_intra_op_parallelism)
         : DatasetBase(DatasetContext(ctx)),
           input_(input),
           max_intra_op_parallelism_(max_intra_op_parallelism) {
@@ -274,6 +276,8 @@ class MaxIntraOpParallelismDatasetOp : public UnaryDatasetOpKernel {
       return "MaxIntraOpParallelismDatasetOp::Dataset";
     }
 
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -332,7 +336,7 @@ class MaxIntraOpParallelismDatasetOp : public UnaryDatasetOpKernel {
     };
 
     const DatasetBase* const input_;
-    const int max_intra_op_parallelism_;
+    const int64 max_intra_op_parallelism_;
   };
 };
 
@@ -383,6 +387,8 @@ class PrivateThreadPoolDatasetOp : public UnaryDatasetOpKernel {
       return "PrivateThreadPoolDatasetOp::Dataset";
     }
 
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
diff --git a/tensorflow/core/kernels/data/writer_ops.cc b/tensorflow/core/kernels/data/experimental/to_tf_record_op.cc
similarity index 97%
rename from tensorflow/core/kernels/data/writer_ops.cc
rename to tensorflow/core/kernels/data/experimental/to_tf_record_op.cc
index c00089d06f9d76c0ec2f3a3120bb5c65d5800d04..7728baf1507c6cec2b44f41561f2ab3d04a80cc8 100644
--- a/tensorflow/core/kernels/data/writer_ops.cc
+++ b/tensorflow/core/kernels/data/experimental/to_tf_record_op.cc
@@ -101,8 +101,8 @@ class ToTFRecordOp : public AsyncOpKernel {
   BackgroundWorker background_worker_;
 };
 
-REGISTER_KERNEL_BUILDER(Name("DatasetToTFRecord").Device(DEVICE_CPU),
-                        ToTFRecordOp);
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalDatasetToTFRecord").Device(DEVICE_CPU), ToTFRecordOp);
 
 }  // namespace
 }  // namespace data
diff --git a/tensorflow/core/kernels/data/unbatch_dataset_op.cc b/tensorflow/core/kernels/data/experimental/unbatch_dataset_op.cc
similarity index 99%
rename from tensorflow/core/kernels/data/unbatch_dataset_op.cc
rename to tensorflow/core/kernels/data/experimental/unbatch_dataset_op.cc
index af7f676370af9aa1b39e788c0809543f631bdba0..2626ec3ed7250b725650a76b8674e0a76ebc638f 100644
--- a/tensorflow/core/kernels/data/unbatch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/unbatch_dataset_op.cc
@@ -221,7 +221,7 @@ class UnbatchDatasetOp : public UnaryDatasetOpKernel {
   };
 };
 
-REGISTER_KERNEL_BUILDER(Name("UnbatchDataset").Device(DEVICE_CPU),
+REGISTER_KERNEL_BUILDER(Name("ExperimentalUnbatchDataset").Device(DEVICE_CPU),
                         UnbatchDatasetOp);
 
 }  // namespace
diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc
index cb7477f9e26c397ba7854ab26387ebd194362d97..9f5881563b5db2b6b5a678b777789091756a6e7a 100644
--- a/tensorflow/core/kernels/data/iterator_ops.cc
+++ b/tensorflow/core/kernels/data/iterator_ops.cc
@@ -38,6 +38,7 @@ limitations under the License.
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/public/session_options.h"
+#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
@@ -60,8 +61,8 @@ class IteratorResource : public ResourceBase {
                    std::unique_ptr<ProcessFunctionLibraryRuntime> pflr,
                    FunctionLibraryRuntime* lib)
       : device_mgr_(std::move(device_mgr)),
-        iterator_state_(
-            new State(std::move(flib_def), std::move(pflr), lib, nullptr)),
+        iterator_state_(std::make_shared<State>(
+            std::move(flib_def), std::move(pflr), lib, nullptr /* iterator */)),
         output_dtypes_(output_dtypes),
         output_shapes_(output_shapes) {}
 
@@ -77,6 +78,7 @@ class IteratorResource : public ResourceBase {
       params.lib = captured_state->lib;
       params.function_handle_cache =
           captured_state->function_handle_cache.get();
+      params.resource_mgr = &captured_state->resource_mgr;
       return captured_state->iterator->GetNext(
           IteratorContext(std::move(params)), out_tensors, end_of_sequence);
     } else {
@@ -135,8 +137,8 @@ class IteratorResource : public ResourceBase {
     std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(nullptr);
     TF_RETURN_IF_ERROR(ctx->function_library()->Clone(&flib_def, &pflr, &lib));
     TF_RETURN_IF_ERROR(flib_def->AddLibrary(graph_def.library()));
-    std::unique_ptr<State> new_state(
-        new State(std::move(flib_def), std::move(pflr), lib, nullptr));
+    std::unique_ptr<State> new_state = MakeUnique<State>(
+        std::move(flib_def), std::move(pflr), lib, nullptr /* iterator */);
 
     TF_RETURN_IF_ERROR(
         graph_runner.Run(&graph, new_state->lib, {}, {output_node}, &outputs));
@@ -145,6 +147,7 @@ class IteratorResource : public ResourceBase {
     IteratorContext::Params params(ctx);
     params.lib = new_state->lib;
     params.function_handle_cache = new_state->function_handle_cache.get();
+    params.resource_mgr = &new_state->resource_mgr;
     TF_RETURN_IF_ERROR(dataset->MakeIterator(IteratorContext(std::move(params)),
                                              "Iterator", &new_state->iterator));
     TF_RETURN_IF_ERROR(
@@ -156,6 +159,7 @@ class IteratorResource : public ResourceBase {
       IteratorContext::Params params(ctx);
       params.lib = new_state->lib;
       params.function_handle_cache = new_state->function_handle_cache.get();
+      params.resource_mgr = &new_state->resource_mgr;
       DeviceBase* device = new_state->lib->device();
       params.allocator_getter = [device](AllocatorAttributes attrs) {
         return device->GetAllocator(attrs);
@@ -178,9 +182,10 @@ class IteratorResource : public ResourceBase {
     std::shared_ptr<State> new_state;
     {
       tf_shared_lock l(mu_);
-      new_state.reset(new State(iterator_state_->flib_def,
-                                iterator_state_->pflr, iterator_state_->lib,
-                                nullptr, nullptr));
+      new_state = std::make_shared<State>(
+          iterator_state_->flib_def, iterator_state_->pflr,
+          iterator_state_->lib, nullptr /* function_handle_cache */,
+          nullptr /* iterator */);
     }
 
     // Ensure that the iterator has access to all functions in the current
@@ -205,13 +210,14 @@ class IteratorResource : public ResourceBase {
       new_state->lib = lib;
     }
 
-    new_state->function_handle_cache.reset(
-        new FunctionHandleCache(new_state->lib));
+    new_state->function_handle_cache =
+        MakeUnique<FunctionHandleCache>(new_state->lib);
     // Create new iterator.
     std::unique_ptr<IteratorBase> iterator;
     IteratorContext::Params params(ctx);
     params.lib = new_state->lib;
     params.function_handle_cache = new_state->function_handle_cache.get();
+    params.resource_mgr = &new_state->resource_mgr;
     TF_RETURN_IF_ERROR(dataset->MakeIterator(IteratorContext(std::move(params)),
                                              "Iterator", &iterator));
     TF_RETURN_IF_ERROR(
@@ -241,7 +247,7 @@ class IteratorResource : public ResourceBase {
         : flib_def(flib_def),
           pflr(pflr),
           lib(lib),
-          function_handle_cache(absl::make_unique<FunctionHandleCache>(lib)),
+          function_handle_cache(MakeUnique<FunctionHandleCache>(lib)),
           iterator(std::move(iterator)) {}
 
     State(std::shared_ptr<FunctionLibraryDefinition> flib_def,
@@ -259,6 +265,7 @@ class IteratorResource : public ResourceBase {
     std::shared_ptr<ProcessFunctionLibraryRuntime> pflr;
     FunctionLibraryRuntime* lib = nullptr;  // not owned.
     std::unique_ptr<FunctionHandleCache> function_handle_cache;
+    ResourceMgr resource_mgr;
     std::unique_ptr<IteratorBase> iterator;
   };
 
@@ -427,7 +434,7 @@ class IteratorStateVariant {
     SerializationContext::Params params;
     params.flib_def = ctx->function_library()->GetFunctionLibraryDefinition();
     SerializationContext serialization_ctx(params);
-    data_.reset(new VariantTensorData());
+    data_ = MakeUnique<VariantTensorData>();
     data_->set_type_name(TypeName());
     VariantTensorDataWriter writer(data_.get());
     TF_RETURN_IF_ERROR(iterator_resource->Save(&serialization_ctx, &writer));
@@ -440,10 +447,11 @@ class IteratorStateVariant {
     if (data.type_name() != TypeName()) {
       return false;
     }
-    std::unique_ptr<VariantTensorData> tensor_data(new VariantTensorData);
+    std::unique_ptr<VariantTensorData> tensor_data =
+        MakeUnique<VariantTensorData>();
     std::swap(*tensor_data, data);
-    std::unique_ptr<VariantTensorDataReader> reader(
-        new VariantTensorDataReader(tensor_data.get()));
+    std::unique_ptr<VariantTensorDataReader> reader =
+        MakeUnique<VariantTensorDataReader>(tensor_data.get());
     status_ = reader->status();
     if (!status_.ok()) {
       return false;
@@ -577,12 +585,12 @@ FunctionLibraryRuntime* IteratorHandleOp::CreatePrivateFLR(
   *device_mgr = absl::make_unique<DeviceMgr>(RenamedDevice::NewRenamedDevice(
       ctx->device()->name(), down_cast<Device*>(ctx->device()),
       false /* owns_underlying */, false /* isolate_session_state */));
-  flib_def->reset(new FunctionLibraryDefinition(
-      *ctx->function_library()->GetFunctionLibraryDefinition()));
-  pflr->reset(new ProcessFunctionLibraryRuntime(
+  *flib_def = MakeUnique<FunctionLibraryDefinition>(
+      *ctx->function_library()->GetFunctionLibraryDefinition());
+  *pflr = MakeUnique<ProcessFunctionLibraryRuntime>(
       device_mgr->get(), ctx->env(), graph_def_version_, flib_def->get(),
-      {} /* TODO(mrry): OptimizerOptions? */,
-      nullptr /* TODO(mrry): ClusterFLR */));
+      OptimizerOptions{} /* TODO(mrry): OptimizerOptions? */,
+      nullptr /* TODO(mrry): ClusterFLR */);
 
   return (*pflr)->GetFLR(ctx->device()->name());
 }
@@ -670,9 +678,11 @@ class ToSingleElementOp : public AsyncOpKernel {
           ctx, GetDatasetFromVariantTensor(ctx->input(0), &dataset), done);
       std::unique_ptr<IteratorBase> iterator;
       IteratorContext::Params params(ctx);
-      std::unique_ptr<FunctionHandleCache> function_handle_cache(
-          new FunctionHandleCache(params.lib));
+      std::unique_ptr<FunctionHandleCache> function_handle_cache =
+          MakeUnique<FunctionHandleCache>(params.lib);
       params.function_handle_cache = function_handle_cache.get();
+      std::unique_ptr<ResourceMgr> resource_mgr = MakeUnique<ResourceMgr>();
+      params.resource_mgr = resource_mgr.get();
       IteratorContext iter_ctx(std::move(params));
 
       OP_REQUIRES_OK_ASYNC(
@@ -758,9 +768,11 @@ class ReduceDatasetOp : public AsyncOpKernel {
           done);
 
       IteratorContext::Params params(ctx);
-      std::unique_ptr<FunctionHandleCache> function_handle_cache(
-          new FunctionHandleCache(params.lib));
+      std::unique_ptr<FunctionHandleCache> function_handle_cache =
+          MakeUnique<FunctionHandleCache>(params.lib);
       params.function_handle_cache = function_handle_cache.get();
+      std::unique_ptr<ResourceMgr> resource_mgr = MakeUnique<ResourceMgr>();
+      params.resource_mgr = resource_mgr.get();
       IteratorContext iter_ctx(std::move(params));
       std::unique_ptr<InstantiatedCapturedFunction> instantiated_captured_func;
       OP_REQUIRES_OK_ASYNC(
diff --git a/tensorflow/core/kernels/data/map_dataset_op.cc b/tensorflow/core/kernels/data/map_dataset_op.cc
index a4e61e02e2339e838118463414aadc86a3604d49..fc6e93a81cb47372fa023a2f793d35008ab830c8 100644
--- a/tensorflow/core/kernels/data/map_dataset_op.cc
+++ b/tensorflow/core/kernels/data/map_dataset_op.cc
@@ -40,6 +40,8 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("use_inter_op_parallelism",
                                      &use_inter_op_parallelism_));
+    OP_REQUIRES_OK(
+        ctx, ctx->GetAttr("preserve_cardinality", &preserve_cardinality_));
   }
 
   void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
@@ -86,9 +88,10 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
       };
     }
 
-    *output = new Dataset(ctx, input, func_, std::move(captured_func),
-                          output_types_, output_shapes_,
-                          use_inter_op_parallelism_, std::move(map_func));
+    *output =
+        new Dataset(ctx, input, func_, std::move(captured_func), output_types_,
+                    output_shapes_, use_inter_op_parallelism_,
+                    std::move(map_func), preserve_cardinality_);
   }
 
  private:
@@ -99,11 +102,13 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
             std::unique_ptr<CapturedFunction> captured_func,
             const DataTypeVector& output_types,
             const std::vector<PartialTensorShape>& output_shapes,
-            bool use_inter_op_parallelism, MapIteratorFunction map_func)
+            bool use_inter_op_parallelism, MapIteratorFunction map_func,
+            bool preserve_cardinality)
         : DatasetBase(DatasetContext(ctx)),
           input_(input),
           func_(func),
           use_inter_op_parallelism_(use_inter_op_parallelism),
+          preserve_cardinality_(preserve_cardinality),
           captured_func_(std::move(captured_func)),
           output_types_(output_types),
           output_shapes_(output_shapes),
@@ -128,6 +133,8 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
 
     string DebugString() const override { return "MapDatasetOp::Dataset"; }
 
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -161,13 +168,19 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
       b->BuildAttrValue(use_inter_op_parallelism_,
                         &use_inter_op_parallelism_attr);
 
+      // Attr: preserve_cardinality
+      AttrValue preserve_cardinality_attr;
+      b->BuildAttrValue(preserve_cardinality_, &preserve_cardinality_attr);
+
       TF_RETURN_IF_ERROR(b->AddDataset(
           this, {std::make_pair(0, input_graph_node)},  // Single tensor inputs.
           {std::make_pair(1, other_arguments)},         // Tensor list inputs.
           {std::make_pair("f", f_attr),
            std::make_pair("Targuments", other_arguments_types_attr),
            std::make_pair("use_inter_op_parallelism",
-                          use_inter_op_parallelism_attr)},  // Attrs
+                          use_inter_op_parallelism_attr),
+           std::make_pair("preserve_cardinality",
+                          preserve_cardinality_attr)},  // Attrs
           output));
       return Status::OK();
     }
@@ -202,10 +215,19 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
         Status s = map_func_(ctx, instantiated_captured_func_.get(), args,
                              out_tensors);
         if (errors::IsOutOfRange(s)) {
-          // `f` may deliberately raise `errors::OutOfRange` to indicate
-          // that we should terminate the iteration early.
-          *end_of_sequence = true;
-          return Status::OK();
+          if (dataset()->preserve_cardinality_) {
+            // To guarantee that the transformation preserves the cardinality of
+            // the dataset, we convert `OutOfRange` to `InvalidArgument` as the
+            // former may be interpreted by a caller as the end of sequence.
+            return errors::InvalidArgument(
+                "Function invocation produced OutOfRangeError: ",
+                s.error_message());
+          } else {
+            // `f` may deliberately raise `errors::OutOfRange` to indicate
+            // that we should terminate the iteration early.
+            *end_of_sequence = true;
+            return Status::OK();
+          }
         } else {
           return s;
         }
@@ -238,6 +260,7 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
     const DatasetBase* const input_;
     const NameAttrList func_;
     const bool use_inter_op_parallelism_;
+    const bool preserve_cardinality_;
     const std::unique_ptr<CapturedFunction> captured_func_;
     const DataTypeVector output_types_;
     const std::vector<PartialTensorShape> output_shapes_;
@@ -248,6 +271,7 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
   std::vector<PartialTensorShape> output_shapes_;
   NameAttrList func_;
   bool use_inter_op_parallelism_;
+  bool preserve_cardinality_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("MapDataset").Device(DEVICE_CPU), MapDatasetOp);
diff --git a/tensorflow/core/kernels/data/model_dataset_op.cc b/tensorflow/core/kernels/data/model_dataset_op.cc
index dcd23095968493a9051fe918f6c79c527dad638e..069d61d80d4f00eecdd77356626d7278c0842445 100644
--- a/tensorflow/core/kernels/data/model_dataset_op.cc
+++ b/tensorflow/core/kernels/data/model_dataset_op.cc
@@ -60,6 +60,8 @@ class ModelDatasetOp : public UnaryDatasetOpKernel {
 
     string DebugString() const override { return "ModelDatasetOp::Dataset"; }
 
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
diff --git a/tensorflow/core/kernels/data/multi_device_iterator_ops.cc b/tensorflow/core/kernels/data/multi_device_iterator_ops.cc
index a070456414c5686fb54acd0e4952cc2b8d92b27d..ba2125a66eb98985ebd0ae8f55bfc239997ad6df 100644
--- a/tensorflow/core/kernels/data/multi_device_iterator_ops.cc
+++ b/tensorflow/core/kernels/data/multi_device_iterator_ops.cc
@@ -98,6 +98,7 @@ class MultiDeviceIterator : public ResourceBase {
       IteratorContext::Params params(ctx);
       params.lib = lib_;
       params.function_handle_cache = function_handle_cache_.get();
+      params.resource_mgr = &resource_mgr_;
       IteratorContext iter_ctx(std::move(params));
       tf_shared_lock l(mu_);
       multi_device_buffer_->GetNextFromShard(
@@ -125,6 +126,8 @@ class MultiDeviceIterator : public ResourceBase {
     return function_handle_cache_.get();
   }
 
+  ResourceMgr* resource_mgr() { return &resource_mgr_; }
+
  private:
   // A private class that uses a background thread to keep a per device buffer
   // full.
@@ -350,6 +353,7 @@ class MultiDeviceIterator : public ResourceBase {
   const std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_;
   FunctionLibraryRuntime* const lib_ = nullptr;  // not owned.
   const std::unique_ptr<FunctionHandleCache> function_handle_cache_;
+  ResourceMgr resource_mgr_;
   std::shared_ptr<const FunctionLibraryDefinition> lib_def_ GUARDED_BY(mu_);
 
   int64 incarnation_id_ GUARDED_BY(mu_) = 0;
@@ -477,6 +481,7 @@ class MultiDeviceIteratorInitOp : public OpKernel {
     IteratorContext::Params params(ctx);
     params.lib = resource->lib();
     params.function_handle_cache = resource->function_handle_cache();
+    params.resource_mgr = resource->resource_mgr();
     IteratorContext iter_ctx(std::move(params));
     OP_REQUIRES_OK(
         ctx, dataset->MakeIterator(std::move(iter_ctx), "Iterator", &iterator));
diff --git a/tensorflow/core/kernels/data/optimize_dataset_op.cc b/tensorflow/core/kernels/data/optimize_dataset_op.cc
index cac6c43565660820ace144f66da353fc23c617eb..9c50d8050a82397f1578ab3f577ef5ad77f81767 100644
--- a/tensorflow/core/kernels/data/optimize_dataset_op.cc
+++ b/tensorflow/core/kernels/data/optimize_dataset_op.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/graph_view.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/grappler_item_builder.h"
+#include "tensorflow/core/grappler/optimizers/data/function_utils.h"
 #include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
 #include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
 #include "tensorflow/core/lib/core/refcount.h"
@@ -161,6 +162,8 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel {
 
     string DebugString() const override { return "OptimizeDatasetOp::Dataset"; }
 
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -217,6 +220,39 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel {
       std::unique_ptr<IteratorBase> input_impl_;
     };
 
+    void AddFakeSinks(FunctionDef* function_def) {
+      int counter = 0;
+      for (const auto& output : function_def->signature().output_arg()) {
+        NodeDef* node = function_def->add_node_def();
+        tensorflow::grappler::function_utils::SetUniqueFunctionNodeName(
+            strings::StrCat("FakeSink", counter++), function_def, node);
+        node->set_op("Identity");
+        node->add_input(function_def->ret().at(output.name()));
+        (*node->mutable_attr())["T"].set_type(output.type());
+
+        (*function_def->mutable_ret())[output.name()] =
+            strings::StrCat(node->name(), ":output:0");
+      }
+    }
+
+    void RemoveFakeSinks(FunctionDef* function_def) {
+      // Map from identity node names to their input tensor strings
+      std::map<string, string> identity_map;
+      for (const auto& node : function_def->node_def()) {
+        if (node.op() == "Identity" && node.input_size() == 1) {
+          identity_map[node.name()] = node.input(0);
+        }
+      }
+      for (const auto& output_arg : function_def->signature().output_arg()) {
+        const string& tensor = function_def->ret().at(output_arg.name());
+        const string& output_node = tensor.substr(0, tensor.find(':'));
+        if (identity_map.find(output_node) != identity_map.end()) {
+          (*function_def->mutable_ret())[output_arg.name()] =
+              identity_map.at(output_node);
+        }
+      }
+    }
+
     Status ApplyOptimizations(OpKernelContext* ctx, GraphDef* graph_def,
                               string* output_node) {
       // Add an identity node as the fetch node, otherwise we might get
@@ -230,6 +266,15 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel {
       (*node->mutable_attr())["T"].set_type(DT_VARIANT);
       *output_node = node->name();
 
+      // Add fake sink node to graph and functions to allow rewriting the actual
+      // sink nodes.
+      // TODO(b/118820916): When MetaOptimizer adds provisions for function
+      // retvals to be optimizable, we will no longer need this.
+      for (auto& function_def :
+           *graph_def->mutable_library()->mutable_function()) {
+        AddFakeSinks(&function_def);
+      }
+
       // Create metagraph.
       MetaGraphDef meta_graph_def;
       (*meta_graph_def.mutable_graph_def()) = *graph_def;
@@ -260,8 +305,9 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel {
         // removing unused graph nodes)
         // TODO(b/118175421): This should be part of the tf.data optimization
         // pass manager.
-        for (const auto& optimizer : {"pruning", "function", "constfold",
-                                      "shape", "arithmetic", "dependency"}) {
+        // TODO(b/120437209): Apply `constfold` optimization when it is fixed.
+        for (const auto& optimizer :
+             {"pruning", "function", "shape", "arithmetic", "dependency"}) {
           rewriter_config.add_optimizers(optimizer);
         }
       }
@@ -283,6 +329,14 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel {
       TF_RETURN_IF_ERROR(tensorflow::grappler::RunMetaOptimizer(
           *grappler_item, config, ctx->device(), &cluster, graph_def));
 
+      // Remove fake sinks after optimizations are done.
+      // TODO(b/118820916): When MetaOptimizer adds provisions for function
+      // retvals to be optimizable, we will no longer need this.
+      for (auto& function_def :
+           *graph_def->mutable_library()->mutable_function()) {
+        RemoveFakeSinks(&function_def);
+      }
+
       return Status::OK();
     }
 
diff --git a/tensorflow/core/kernels/data/optional_ops.cc b/tensorflow/core/kernels/data/optional_ops.cc
index d8a7f21c5f99c6d99e506847e00cabc6bd49168f..a406f7467fe1a1d221ee1d5bd9b2e858fb0044d3 100644
--- a/tensorflow/core/kernels/data/optional_ops.cc
+++ b/tensorflow/core/kernels/data/optional_ops.cc
@@ -159,9 +159,13 @@ static Status OptionalDeviceCopy(
     to_values.reserve(from_values.size());
     for (const Tensor& t : from_values) {
       if (DMAHelper::CanUseDMA(&t) || t.dtype() == DT_VARIANT) {
-        Tensor tmp(t.dtype());
-        TF_RETURN_IF_ERROR(copy(t, &tmp));
-        to_values.push_back(std::move(tmp));
+        // NOTE(skyewm): we're careful to make sure the lifetime of the 'to'
+        // Tensor passed to `copy` (i.e. to_values.back()) is the same as the
+        // returned 'to' OptionalVariant. This is because `copy` may spawn async
+        // callbacks that don't run until after this function returns and access
+        // the 'to' Tensor (e.g. BaseGPUDevice::MaybeCopyTensorToGPU).
+        to_values.emplace_back(t.dtype());
+        TF_RETURN_IF_ERROR(copy(t, &to_values.back()));
       } else {
         to_values.push_back(t);
       }
diff --git a/tensorflow/core/kernels/data/padded_batch_dataset_op.cc b/tensorflow/core/kernels/data/padded_batch_dataset_op.cc
index 594a9ce7ec2d3ac634de4d643f19b6ec6c53ddc8..0fff4c53706269538f770889744e21fffcae3601 100644
--- a/tensorflow/core/kernels/data/padded_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/padded_batch_dataset_op.cc
@@ -152,6 +152,15 @@ class PaddedBatchDatasetOp : public UnaryDatasetOpKernel {
                              ")::Dataset");
     }
 
+    int64 Cardinality() const override {
+      int64 n = input_->Cardinality();
+      if (n == kInfiniteCardinality || n == kUnknownCardinality) {
+        return n;
+      }
+      return n / batch_size_ +
+             (n % batch_size_ == 0 || drop_remainder_ ? 0 : 1);
+    }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
diff --git a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
index b5a167908225ea4e77b4bace6b464dd359b442f0..f844a005768bce33fe94e09cb2a3dbdd4011acf3 100644
--- a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
+++ b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
@@ -23,7 +23,6 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/data/captured_function.h"
 #include "tensorflow/core/kernels/data/dataset_utils.h"
-#include "tensorflow/core/lib/core/error_codes.pb.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/random/random.h"
@@ -36,1050 +35,6 @@ namespace {
 // See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following op.
 
-class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
- public:
-  explicit ParallelInterleaveDatasetOp(OpKernelConstruction* ctx)
-      : UnaryDatasetOpKernel(ctx) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &interleave_func_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
-  }
-
-  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
-                   DatasetBase** output) override {
-    int64 cycle_length = 0;
-    OP_REQUIRES_OK(ctx,
-                   ParseScalarArgument(ctx, "cycle_length", &cycle_length));
-    OP_REQUIRES(ctx, cycle_length > 0,
-                errors::InvalidArgument("`cycle_length` must be > 0"));
-
-    int64 block_length = 0;
-    OP_REQUIRES_OK(ctx,
-                   ParseScalarArgument(ctx, "block_length", &block_length));
-    OP_REQUIRES(ctx, block_length > 0,
-                errors::InvalidArgument("`block_length` must be > 0"));
-
-    bool sloppy = false;
-    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "sloppy", &sloppy));
-
-    int64 buffer_output_elements = 0;
-    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "buffer_output_elements",
-                                            &buffer_output_elements));
-    OP_REQUIRES(
-        ctx, buffer_output_elements > 0,
-        errors::InvalidArgument("`buffer_output_elements` must be > 0"));
-
-    int64 prefetch_input_elements = 0;
-    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "prefetch_input_elements",
-                                            &prefetch_input_elements));
-    OP_REQUIRES(
-        ctx, prefetch_input_elements >= 0,
-        errors::InvalidArgument("`prefetch_input_elements` must be >= 0"));
-
-    std::unique_ptr<CapturedFunction> captured_func;
-    OP_REQUIRES_OK(
-        ctx, CapturedFunction::Create(interleave_func_, ctx, "other_arguments",
-                                      &captured_func));
-
-    *output =
-        new Dataset(ctx, input, interleave_func_, std::move(captured_func),
-                    cycle_length, block_length, sloppy, buffer_output_elements,
-                    prefetch_input_elements, output_types_, output_shapes_);
-  }
-
- private:
-  class Dataset : public DatasetBase {
-   public:
-    Dataset(OpKernelContext* ctx, const DatasetBase* input,
-            const NameAttrList& func,
-            std::unique_ptr<CapturedFunction> captured_func, int64 cycle_length,
-            int64 block_length, bool sloppy, int64 buffer_output_elements,
-            int64 prefetch_input_elements, const DataTypeVector& output_types,
-            const std::vector<PartialTensorShape>& output_shapes)
-        : DatasetBase(DatasetContext(ctx)),
-          input_(input),
-          interleave_func_(func),
-          captured_func_(std::move(captured_func)),
-          cycle_length_(cycle_length),
-          block_length_(block_length),
-          sloppy_(sloppy),
-          buffer_output_elements_(buffer_output_elements),
-          prefetch_input_elements_(prefetch_input_elements),
-          output_types_(output_types),
-          output_shapes_(output_shapes) {
-      input_->Ref();
-    }
-
-    ~Dataset() override { input_->Unref(); }
-
-    std::unique_ptr<IteratorBase> MakeIteratorInternal(
-        const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(new Iterator(
-          {this, strings::StrCat(prefix, "::ParallelInterleave")}));
-    }
-
-    const DataTypeVector& output_dtypes() const override {
-      return output_types_;
-    }
-
-    const std::vector<PartialTensorShape>& output_shapes() const override {
-      return output_shapes_;
-    }
-
-    string DebugString() const override {
-      return "ParallelInterleaveDatasetOp::Dataset";
-    }
-
-   protected:
-    Status AsGraphDefInternal(SerializationContext* ctx,
-                              DatasetGraphDefBuilder* b,
-                              Node** output) const override {
-      TF_RETURN_IF_ERROR(b->AddFunction(ctx, interleave_func_.name()));
-      Node* input_node;
-      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_node));
-      Node* cycle_length_node;
-      TF_RETURN_IF_ERROR(b->AddScalar(cycle_length_, &cycle_length_node));
-      Node* block_length_node;
-      TF_RETURN_IF_ERROR(b->AddScalar(block_length_, &block_length_node));
-      Node* sloppy_node;
-      TF_RETURN_IF_ERROR(b->AddScalar(sloppy_, &sloppy_node));
-      Node* buffer_output_elements_node;
-      TF_RETURN_IF_ERROR(
-          b->AddScalar(buffer_output_elements_, &buffer_output_elements_node));
-      Node* prefetch_input_elements_node;
-      TF_RETURN_IF_ERROR(b->AddScalar(prefetch_input_elements_,
-                                      &prefetch_input_elements_node));
-      DataTypeVector other_arguments_types;
-      other_arguments_types.reserve(captured_func_->captured_inputs().size());
-      std::vector<Node*> other_arguments;
-      other_arguments.reserve(captured_func_->captured_inputs().size());
-      for (const Tensor& t : captured_func_->captured_inputs()) {
-        Node* node;
-        TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
-        other_arguments.emplace_back(node);
-        other_arguments_types.emplace_back(t.dtype());
-      }
-      AttrValue f;
-      b->BuildAttrValue(interleave_func_, &f);
-      AttrValue other_arguments_types_attr;
-      b->BuildAttrValue(other_arguments_types, &other_arguments_types_attr);
-
-      TF_RETURN_IF_ERROR(b->AddDataset(
-          this,
-          {{0, input_node},
-           {2, cycle_length_node},
-           {3, block_length_node},
-           {4, sloppy_node},
-           {5, buffer_output_elements_node},
-           {6, prefetch_input_elements_node}},
-          {{1, other_arguments}},
-          {{"f", f}, {"Targuments", other_arguments_types_attr}}, output));
-      return Status::OK();
-    }
-
-   private:
-    int64 num_threads() const {
-      return cycle_length_ + prefetch_input_elements_;
-    }
-
-    // Parallel interleave's implementation is designed around a few principles:
-    //  1. Thread creation is relatively expensive. (Not reusing
-    //     threads causes a number of indirect costs such as poorer tcmalloc
-    //     performance due to thread-local caches, etc.) We allocate a fixed
-    //     number of threads at the start and never change. This is why we've
-    //     fused functionality that is theoretically orthogonal (i.e.
-    //     .prefetch()) into the implementation.
-    //  2. Drop-in replacement for standard interleave. The goal will be to
-    //     auto-opt people into an optimized implementation without any work
-    //     on the customer's part. We thus go through great pains to maintain
-    //     identical iteration orders, full determinism (disabled only via a
-    //     flag, etc.)
-    //  3. Performance across a variety of environments and I/O envelopes.
-    //
-    // The actual implementation centers around a collection of worker threads
-    // and their corresponding worker state (tracked in the `workers_` vector).
-    // Worker threads repeatedly receive a vector of Tensors that are used as
-    // input to the flat-map function (`captured_func_`). The output of this
-    // function must be a dataset. The worker thread then repeatedly calls
-    // `GetNext()`, maintaining a buffer of elements to minimize the likelihood
-    // that a caller will block waiting for an element to be produced.
-    //
-    // Pointers to these worker states are kept in 2 disjoint data structures:
-    //  1. `interleave_indices_` is a vector containing indices of WorkerStates
-    //     in `workers_` that we are interleaving. Worker threads backing these
-    //     WorkerStates should be regularly producing values.
-    //  2. `staging_indices_` is a deque containing indices of WorkerStates in
-    //     `workers_` that we will move to `interleave_indices_` when an
-    //     iterator in `interleave_indices_` is exhausted.
-    //
-    // The client calls `GetNext[Internal]()` to retrieve an output element. The
-    // internal implementation updates the state of `interleave_indices_` and
-    // `staging_indices_` as output iterators (run by the worker threads) are
-    // exhausted.
-    //
-    // `input_impl_` is the input iterator that generates arguments for the
-    // flat-map function (`captured_func_`). It is set to an iterator at
-    // Iterator construction, and is fixed until we consume all input elements.
-    // Once it is exhausted, we reset the unique_ptr to eagerly deallocate
-    // memory.
-    //
-    // A few invariants are maintained:
-    //  1. No element in interleave_indices_ should be a -1 unless
-    //     `staging_indices_` is empty and `input_impl_` is empty.
-    //  2. Every `worker_` element is pointed to by at most one element of the
-    //     union of `interleave_indices_` and `staging_indices_`.
-    //  3. Unless `input_impl_` is empty, every `worker_` must be pointed to by
-    //     an element in `interleave_indices_` or `staging_indices_`.
-    class Iterator : public DatasetIterator<Dataset> {
-     public:
-      explicit Iterator(const Params& params)
-          : DatasetIterator<Dataset>(params),
-            workers_(dataset()->num_threads()),
-            worker_thread_states_(dataset()->num_threads()) {}
-
-      ~Iterator() override {
-        mutex_lock l(mu_);
-        cancelled_ = true;
-        // Notify all workers in case they are blocked.
-        for (auto& worker : workers_) {
-          worker.cond_var.notify_all();
-        }
-      }
-
-      Status Initialize(IteratorContext* ctx) override {
-        TF_RETURN_IF_ERROR(
-            dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
-        return dataset()->captured_func_->Instantiate(
-            ctx, &instantiated_captured_func_);
-      }
-
-      // It is implemented so that it matches the deterministic interleave
-      // unless getting the next element would block and we are allowed to be
-      // sloppy.
-      Status GetNextInternal(IteratorContext* ctx,
-                             std::vector<Tensor>* out_tensors,
-                             bool* end_of_sequence) override {
-        mutex_lock l(mu_);
-        TF_RETURN_IF_ERROR(EnsureWorkerThreadsStarted(ctx));
-        while (!cancelled_) {
-          // Wait for an item to become available, blocking if necessary. If we
-          // are allowed to be sloppy, we can skip over input datasets that do
-          // not have an item readily available.
-          bool can_produce_elements = false;
-          bool must_wait_for_input = true;
-          for (int64 i = 0; i < interleave_indices_.size(); ++i) {
-            int64 index = (next_index_ + i) % interleave_indices_.size();
-            int64 current_worker_index = interleave_indices_[index];
-            if (current_worker_index < 0) {
-              continue;  // Empty interleave elements.
-            }
-            WorkerState* current_worker = &workers_[current_worker_index];
-            can_produce_elements |= current_worker->MayHaveElements();
-            if (!current_worker->outputs.empty()) {
-              // We have an element!
-              next_index_ = index;
-              const bool element_acquired_sloppily =
-                  dataset()->sloppy_ && i > 1;
-              if (!element_acquired_sloppily) {
-                // If the element was acquired in the regular (non-sloppy)
-                // order, then advance the current block and cycle pointers to
-                // the next element in the regular order.
-                block_count_++;
-                if (block_count_ == dataset()->block_length_) {
-                  next_index_ = (index + 1) % interleave_indices_.size();
-                  block_count_ = 0;
-                }
-              } else {
-                block_count_ = 0;
-              }
-              *end_of_sequence = false;
-              Status s = current_worker->outputs.front().status;
-              current_worker->outputs.front().output.swap(*out_tensors);
-              current_worker->outputs.pop_front();
-              current_worker->cond_var.notify_one();
-              return s;
-            } else if (current_worker->is_producing && !dataset()->sloppy_) {
-              // current_worker.outputs.empty(), and we must wait for this
-              // iterator.
-              if (next_index_ != index) {
-                // We have advanced to a new iterator; reset block counts.
-                next_index_ = index;
-                block_count_ = 0;
-              }
-              break;
-            } else if (!current_worker->is_producing) {
-              // This iterator has reached end of input.
-              interleave_indices_[index] = -1;
-              if (input_impl_) {
-                // Start prefetching a new iterator.
-                std::vector<Tensor> args;
-                bool end_of_input = false;
-                Status s = input_impl_->GetNext(ctx, &args, &end_of_input);
-                if (end_of_input) {
-                  input_impl_.reset();
-                } else {
-                  current_worker->SetInputs(s, std::move(args));
-                  staging_indices_.emplace_back(current_worker_index);
-                }
-              }
-
-              if (!staging_indices_.empty()) {
-                // Move a worker from `staging_indices_` to
-                // `interleave_indices_`.
-                interleave_indices_[index] = staging_indices_.front();
-                staging_indices_.pop_front();
-
-                next_index_ = (index + 1) % interleave_indices_.size();
-                block_count_ = 0;
-                // Restart the inner [for] loop
-                can_produce_elements = true;
-                must_wait_for_input = false;
-                break;
-              }
-            }
-          }
-
-          if (!can_produce_elements && !input_impl_) {
-            // No potential for future values.
-            *end_of_sequence = true;
-            return Status::OK();
-          }
-
-          if (must_wait_for_input) {
-            // Wait for elements to become available.
-            RecordStop(ctx);
-            if (dataset()->sloppy_) {
-              sloppy_cond_var_.wait(l);
-            } else {
-              workers_[interleave_indices_[next_index_]].cond_var.wait(l);
-            }
-            RecordStart(ctx);
-          }
-        }
-        return errors::Cancelled(
-            "ParallelInterleaveDatasetOp::Dataset::Iterator::GetNext");
-      }
-
-     protected:
-      std::shared_ptr<model::Node> CreateNode(
-          IteratorContext* ctx, model::Node::Args args) const override {
-        return model::MakeAsyncInterleaveManyNode(std::move(args),
-                                                  /*parameters=*/{});
-      }
-
-      Status SaveInternal(IteratorStateWriter* writer) override {
-        // The order of locking is important here to avoid deadlock.
-        mutex_lock l(mu_);
-        mutex_lock ckpt_l(ckpt_mu_);
-        if (input_impl_) {
-          TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
-        } else {
-          TF_RETURN_IF_ERROR(
-              writer->WriteScalar(full_name("input_exhausted"), ""));
-        }
-        TF_RETURN_IF_ERROR(
-            writer->WriteScalar(full_name("next_index"), next_index_));
-        TF_RETURN_IF_ERROR(
-            writer->WriteScalar(full_name("block_count"), block_count_));
-        TF_RETURN_IF_ERROR(
-            writer->WriteScalar(full_name("workers_size"), workers_.size()));
-        for (int i = 0; i < workers_.size(); ++i) {
-          TF_RETURN_IF_ERROR(WriteWorkerStateLocked(writer, i));
-        }
-        for (int i = 0; i < worker_thread_states_.size(); ++i) {
-          TF_RETURN_IF_ERROR(WriteWorkerThreadStateLocked(writer, i));
-        }
-        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("interleave_size"),
-                                               interleave_indices_.size()));
-        for (int i = 0; i < interleave_indices_.size(); ++i) {
-          TF_RETURN_IF_ERROR(writer->WriteScalar(
-              full_name(strings::StrCat("interleave_indices_", i)),
-              interleave_indices_[i]));
-        }
-        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("staging_size"),
-                                               staging_indices_.size()));
-        for (int i = 0; i < staging_indices_.size(); ++i) {
-          TF_RETURN_IF_ERROR(writer->WriteScalar(
-              full_name(strings::StrCat("staging_indices_", i)),
-              staging_indices_[i]));
-        }
-        if (!worker_threads_.empty()) {
-          TF_RETURN_IF_ERROR(
-              writer->WriteScalar(full_name("worker_threads_running"), ""));
-        }
-        return Status::OK();
-      }
-
-      Status RestoreInternal(IteratorContext* ctx,
-                             IteratorStateReader* reader) override {
-        // The order of locking is important here to avoid deadlock.
-        mutex_lock l(mu_);
-        mutex_lock ckpt_l(ckpt_mu_);
-        if (!reader->Contains(full_name("input_exhausted"))) {
-          TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
-        } else {
-          input_impl_.reset();
-        }
-        int64 temp;
-        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("next_index"), &temp));
-        next_index_ = size_t(temp);
-        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("block_count"), &temp));
-        block_count_ = size_t(temp);
-
-        // Restore WorkerStates.
-        TF_RETURN_IF_ERROR(
-            reader->ReadScalar(full_name("workers_size"), &temp));
-        if (temp != dataset()->num_threads()) {
-          return errors::Internal("Expected ", dataset()->num_threads(),
-                                  " worker states but found ", temp, ".");
-        }
-        for (size_t i = 0; i < dataset()->num_threads(); ++i) {
-          TF_RETURN_IF_ERROR(ReadWorkerStateLocked(reader, i, ctx));
-        }
-        for (size_t i = 0; i < dataset()->num_threads(); ++i) {
-          TF_RETURN_IF_ERROR(ReadWorkerThreadStateLocked(reader, i, ctx));
-        }
-
-        // Restore `interleave_indices_`.
-        std::set<int64> all_indices;
-        {
-          int64 interleave_size;
-          TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("interleave_size"),
-                                                &interleave_size));
-          interleave_indices_.reserve(interleave_size);
-          for (int64 i = 0; i < interleave_size; ++i) {
-            int64 temp;
-            TF_RETURN_IF_ERROR(reader->ReadScalar(
-                full_name(strings::StrCat("interleave_indices_", i)), &temp));
-            if (temp >= 0 && all_indices.find(temp) != all_indices.end()) {
-              return errors::Internal(
-                  "Duplicate entry for ", temp,
-                  " found when reading interleave and staging indices.");
-            }
-            if (temp >= 0) {
-              all_indices.insert(temp);
-            }
-            interleave_indices_.emplace_back(temp);
-          }
-        }
-
-        // Restore `staging_indices_`.
-        {
-          int64 staging_size;
-          TF_RETURN_IF_ERROR(
-              reader->ReadScalar(full_name("staging_size"), &staging_size));
-          for (int i = 0; i < staging_size; ++i) {
-            int64 temp;
-            TF_RETURN_IF_ERROR(reader->ReadScalar(
-                full_name(strings::StrCat("staging_indices_", i)), &temp));
-            if (all_indices.find(temp) != all_indices.end()) {
-              return errors::Internal(
-                  "Duplicate entry for ", temp,
-                  " found when reading interleave and staging indices.");
-            }
-            if (temp >= 0) {
-              all_indices.insert(temp);
-            }
-            staging_indices_.emplace_back(temp);
-          }
-        }
-
-        // Start Worker threads.
-        if (reader->Contains(full_name("worker_threads_running"))) {
-          worker_threads_.reserve(dataset()->num_threads());
-          for (size_t i = 0; i < dataset()->num_threads(); ++i) {
-            std::shared_ptr<IteratorContext> new_ctx(new IteratorContext(*ctx));
-            worker_threads_.emplace_back(ctx->env()->StartThread(
-                {}, strings::StrCat("tf_data_parallel_interleave_worker_", i),
-                [this, new_ctx, i]() { WorkerThread(new_ctx, i); }));
-          }
-        }
-        return Status::OK();
-      }
-
-     private:
-      // OutputElem contains the information from a call to GetNext by an output
-      // iterator.
-      struct OutputElem {
-        // The output iterator sets `status` if getting the output element
-        // fails.
-        Status status;
-        // The buffered data element.
-        std::vector<Tensor> output;
-
-        explicit OutputElem(const Status& s) : status(s) {}
-      };
-
-      // Worker threads operate on their relevant WorkerState structs.
-      //
-      // WorkerState's fields are all protected by mu_;
-      struct WorkerState {
-        // The arguments to be used to construct an output iterator.
-        std::vector<Tensor> input;
-        // The buffered output elements.
-        std::deque<OutputElem> outputs;
-        // Set to true iff the worker thread expects to append more elements to
-        // outputs. is_producing can be false despite !outputs.empty().
-        // Concretely, all output elements will have been consumed only when:
-        // is_producing == false && outputs.empty();
-        bool is_producing = false;
-        // Condition variable used to coordinate between threads. The worker
-        // thread waits on this condition variable when it is either (1) waiting
-        // for the main thread to add arguments to `input`, or (2) waiting for
-        // the main thread to consume an element of `outputs`. The main thread
-        // waits on cond_var if it is waiting for the worker thread to produce
-        // an element into `outputs` (this implies sloppy_==false).
-        condition_variable cond_var;
-
-        inline bool MayHaveElements() const {
-          return is_producing || !outputs.empty();
-        }
-
-        // Sets inputs for a worker thread and notifies it to start processing.
-        void SetInputs(const Status& s, std::vector<Tensor> input_arguments) {
-          if (s.ok()) {
-            DCHECK(!MayHaveElements())
-                << "Tried to start inputs, despite already producing!";
-            input = std::move(input_arguments);
-            is_producing = true;
-            cond_var.notify_one();
-          } else {
-            outputs.emplace_back(s);
-          }
-        }
-      };
-
-      // The internal state of a worker thread that is not already captured
-      // in its `WorkerState`.
-      //
-      // This is needed only for checkpointing purposes. We keep this
-      // separate from `WorkerState` and guard its fields using a separate
-      // lock `ckpt_mu_` so as to not affect the performance of main pipeline.
-      struct WorkerThreadState {
-        // The output element that has been produced from the input iterator
-        // and is waiting to be added to `WorkerState.outputs`.
-        OutputElem output_elem;
-
-        // Whether the input iterator returned an `end_of_sequence`.
-        bool end_of_sequence = false;
-
-        // Status returned from `MakeIteratorFromInputElement`.
-        Status iterator_creation_status;
-
-        // The arguments to be used to construct `iterator`.
-        std::vector<Tensor> input;
-
-        std::unique_ptr<IteratorBase> iterator;
-
-        WorkerThreadState() : output_elem(Status::OK()) {}
-      };
-
-      Status EnsureWorkerThreadsStarted(IteratorContext* ctx)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-        if (worker_threads_.empty()) {
-          worker_threads_.reserve(dataset()->num_threads());
-          for (int64 i = 0; i < dataset()->num_threads(); ++i) {
-            std::vector<Tensor> args;
-            bool end_of_input = false;
-            Status s = input_impl_->GetNext(ctx, &args, &end_of_input);
-            if (end_of_input) {
-              input_impl_.reset();
-              return Status::OK();
-            }
-            workers_[i].SetInputs(s, std::move(args));
-            std::shared_ptr<IteratorContext> new_ctx(new IteratorContext(*ctx));
-            worker_threads_.emplace_back(ctx->env()->StartThread(
-                {}, strings::StrCat("tf_data_parallel_interleave_worker_", i),
-                [this, new_ctx, i]() { WorkerThread(new_ctx, i); }));
-            if (i < dataset()->cycle_length_) {
-              interleave_indices_.push_back(i);
-            } else {
-              staging_indices_.push_back(i);
-            }
-          }
-          DCHECK(interleave_indices_.size() == dataset()->cycle_length_);
-          DCHECK(staging_indices_.size() ==
-                 dataset()->prefetch_input_elements_);
-        }
-        return Status::OK();
-      }
-
-      // Produces elements into the worker's output buffers.
-      void WorkerThread(const std::shared_ptr<IteratorContext>& ctx,
-                        const int64 thread_index) {
-        // Notes on checkpointing thread local state, i.e., `WorkerThreadState`:
-        //
-        // 1. Any local state that may need to be checkpointed should be kept
-        //    in `worker_thread_states_[thread_index]`.
-        // 2. `WorkerThreadState` should contain state that is needed only for
-        //    checkpointing, i.e., if we were to remove checkpointing support,
-        //    we could keep that state as local variables in this thread.
-        // 3. This thread should only read/write state at `thread_index`
-        //    and should not access other thread states.
-        // 4. When restoring from checkpoint, threads are started only after
-        //    the restore is complete.
-        // 5. Once restored from a checkpoint, the local state is edited only
-        //    by this thread. 3 & 4 allow making assumptions like temporarily
-        //    caching local state in this thread and using it outside a lock
-        //    e.g. `make_new_iterator`.
-        // 6. `ckpt_mu_` should be wisely used to create *consistent*
-        //    checkpoint markers.
-
-        // std::function arguments are copy-constructable, so we pass raw
-        // pointers, and then immediately wrap them to ensure correct ownership.
-        RecordStart(ctx.get());
-        auto cleanup = gtl::MakeCleanup([this, thread_index, ctx] {
-          mutex_lock l(mu_);
-          workers_[thread_index].cond_var.notify_all();
-          RecordStop(ctx.get());
-        });
-        bool make_new_iterator;
-        {
-          tf_shared_lock l(ckpt_mu_);
-          // Decide whether a new iterator should be built.
-          // 1. If there is an existing iterator, we use it.
-          // 2. If there was an error in iterator creation that could not be
-          //    notified to the client we attempt to send that to the client
-          //    first.
-          make_new_iterator =
-              worker_thread_states_[thread_index].iterator == nullptr &&
-              worker_thread_states_[thread_index].iterator_creation_status.ok();
-        }
-        // Even though `make_new_iterator` has cached values from
-        // `worker_thread_states_[thread_index]` which is guarded by ckpt_mu_,
-        // it is safe to *read* `make_new_iterator`outside of a lock without
-        // worrying about concurrent changes to values in
-        // `worker_thread_states_[thread_index]`. See comment at the start of
-        // this function for details.
-        while (true) {
-          // Whether creation of the iterator succeeded.
-          Status iterator_creation_status;
-          // 1. Build a new iterator or use the existing one.
-          if (make_new_iterator) {
-            // 1a. Get new input tensors or use the exiting ones.
-            bool read_new_input;
-            {
-              tf_shared_lock l(ckpt_mu_);
-              // worker_thread_states_[thread_index].input will be non-empty
-              // if checkpointing happened at CHECKPOINT_MARKER_A.
-              read_new_input =
-                  worker_thread_states_[thread_index].input.empty();
-            }
-
-            if (read_new_input) {
-              mutex_lock l(mu_);
-              while (!cancelled_ && !workers_[thread_index].is_producing) {
-                RecordStop(ctx.get());
-                workers_[thread_index].cond_var.wait(l);
-                RecordStart(ctx.get());
-              }
-              if (cancelled_) return;
-              // Copy the input tensors so that we do not need to block on `mu_`
-              // when building the iterator.
-              // We keep a copy of the input tensors in
-              // `WorkerThreadState.input` till the iterator is in use. This is
-              // used in `RestoreInternal` to re-build the iterator.
-              // TODO(b/78046638): Explore ways to avoid tracking the input
-              // tensors.
-              tf_shared_lock ckpt_l(ckpt_mu_);
-              worker_thread_states_[thread_index].input.swap(
-                  workers_[thread_index].input);
-              // CHECKPOINT_MARKER_A
-              // We have the input tensors but have not built the iterator yet.
-            }
-
-            // 1b. Run the user defined function to produce a new iterator.
-            {
-              tf_shared_lock l(ckpt_mu_);
-              worker_thread_states_[thread_index].iterator_creation_status =
-                  MakeIteratorFromInputElement(
-                      ctx.get(), worker_thread_states_[thread_index].input,
-                      thread_index, *instantiated_captured_func_, prefix(),
-                      &worker_thread_states_[thread_index].iterator);
-              iterator_creation_status =
-                  worker_thread_states_[thread_index].iterator_creation_status;
-              if (!iterator_creation_status.ok()) {
-                worker_thread_states_[thread_index].input.clear();
-              }
-              // CHECKPOINT_MARKER_B
-              // Either an iterator has been successfully built and placed in
-              // `worker_thread_states_[thread_index].iterator` or it failed and
-              // a non-OK status has been put in
-              // `worker_thread_states_[thread_index].iterator_creation_status`.
-            }
-          } else {
-            tf_shared_lock l(ckpt_mu_);
-            iterator_creation_status =
-                worker_thread_states_[thread_index].iterator_creation_status;
-            // Mark that we have used up the restored iterator.
-            make_new_iterator = true;
-          }
-          // 2. Start producing elements or send error state to client if
-          //    iterator creation failed.
-          if (!iterator_creation_status.ok()) {
-            mutex_lock l(mu_);
-            // Wait for space in the prefetch queue.
-            while (!cancelled_ && workers_[thread_index].outputs.size() ==
-                                      dataset()->buffer_output_elements_) {
-              RecordStop(ctx.get());
-              workers_[thread_index].cond_var.wait(l);
-              RecordStart(ctx.get());
-            }
-            if (cancelled_) return;
-            tf_shared_lock ckpt_l(ckpt_mu_);
-            workers_[thread_index].outputs.emplace_back(
-                iterator_creation_status);
-            workers_[thread_index].is_producing = false;
-            worker_thread_states_[thread_index].iterator_creation_status =
-                Status::OK();
-            // CHECKPOINT_MARKER_C
-            // Non-OK iterator creation status has been notified to the
-            // client.
-            workers_[thread_index].cond_var.notify_one();
-          } else {
-            bool end_of_sequence = false;
-            while (!end_of_sequence) {
-              // 3.a Produce an element!
-              {
-                tf_shared_lock ckpt_l(ckpt_mu_);
-                if (worker_thread_states_[thread_index]
-                        .output_elem.status.ok() &&
-                    worker_thread_states_[thread_index]
-                        .output_elem.output.empty() &&
-                    !worker_thread_states_[thread_index].end_of_sequence) {
-                  worker_thread_states_[thread_index].output_elem.status =
-                      worker_thread_states_[thread_index].iterator->GetNext(
-                          ctx.get(),
-                          &worker_thread_states_[thread_index]
-                               .output_elem.output,
-                          &worker_thread_states_[thread_index].end_of_sequence);
-                  end_of_sequence =
-                      worker_thread_states_[thread_index].end_of_sequence;
-                } else {
-                  end_of_sequence =
-                      worker_thread_states_[thread_index].end_of_sequence;
-                }
-                // CHECKPOINT_MARKER_D
-                // An element has been read or an error or end_of_sequence has
-                // been received from the input iterator and is waiting to be
-                // sent to client.
-              }
-
-              // 3.b Make it available to the client.
-              {
-                mutex_lock l(mu_);
-
-                // Wait for space in the prefetch queue.
-                while (!cancelled_ && workers_[thread_index].outputs.size() ==
-                                          dataset()->buffer_output_elements_) {
-                  RecordStop(ctx.get());
-                  workers_[thread_index].cond_var.wait(l);
-                  RecordStart(ctx.get());
-                }
-                if (cancelled_) return;
-
-                tf_shared_lock ckpt_l(ckpt_mu_);
-                workers_[thread_index].is_producing = !end_of_sequence;
-
-                // Output the element.
-
-                // Move the temporary state in WorkerThreadState to WorkerState
-                // and mark it as used.
-                if (end_of_sequence) {
-                  worker_thread_states_[thread_index].iterator.reset();
-                  worker_thread_states_[thread_index].input.clear();
-                  worker_thread_states_[thread_index].end_of_sequence = false;
-                } else {
-                  workers_[thread_index].outputs.emplace_back(
-                      worker_thread_states_[thread_index].output_elem.status);
-                  workers_[thread_index].outputs.back().output.swap(
-                      worker_thread_states_[thread_index].output_elem.output);
-                }
-                worker_thread_states_[thread_index].output_elem.status =
-                    Status::OK();
-                if (dataset()->sloppy_) {
-                  sloppy_cond_var_.notify_one();
-                } else {
-                  workers_[thread_index].cond_var.notify_one();
-                }
-                // CHECKPOINT_MARKER_E
-                // Output element or iterator status has been sent to the
-                // client.
-              }
-            }
-          }
-        }
-      }
-
-      Status WriteWorkerStateLocked(IteratorStateWriter* writer, int index)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
-        string prefix = strings::StrCat("worker_", index);
-        TF_RETURN_IF_ERROR(writer->WriteScalar(
-            full_name(strings::StrCat(prefix, "_input_size")),
-            workers_[index].input.size()));
-        for (int i = 0; i < workers_[index].input.size(); ++i) {
-          TF_RETURN_IF_ERROR(writer->WriteTensor(
-              full_name(strings::StrCat(prefix, "_input_", i)),
-              workers_[index].input[i]));
-        }
-        TF_RETURN_IF_ERROR(writer->WriteScalar(
-            full_name(strings::StrCat(prefix, "_outputs_size")),
-            workers_[index].outputs.size()));
-        for (int i = 0; i < workers_[index].outputs.size(); ++i) {
-          TF_RETURN_IF_ERROR(WriteOutputElemLocked(
-              writer, workers_[index].outputs[i],
-              full_name(strings::StrCat(prefix, "_outputs_", i))));
-        }
-        if (workers_[index].is_producing) {
-          TF_RETURN_IF_ERROR(writer->WriteScalar(
-              full_name(strings::StrCat(prefix, "_is_producing")), ""));
-        }
-        return Status::OK();
-      }
-
-      Status ReadWorkerStateLocked(IteratorStateReader* reader, int index,
-                                   IteratorContext* ctx)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
-        string worker_prefix = strings::StrCat("worker_", index);
-        // Restore inputs.
-        int64 input_size;
-        TF_RETURN_IF_ERROR(reader->ReadScalar(
-            full_name(strings::StrCat(worker_prefix, "_input_size")),
-            &input_size));
-        workers_[index].input.reserve(input_size);
-        for (int i = 0; i < input_size; ++i) {
-          workers_[index].input.emplace_back();
-          TF_RETURN_IF_ERROR(reader->ReadTensor(
-              full_name(strings::StrCat(worker_prefix, "_input_", i)),
-              &workers_[index].input.back()));
-        }
-        int64 outputs_size;
-        TF_RETURN_IF_ERROR(reader->ReadScalar(
-            full_name(strings::StrCat(worker_prefix, "_outputs_size")),
-            &outputs_size));
-        for (int i = 0; i < outputs_size; ++i) {
-          workers_[index].outputs.emplace_back(Status::OK());
-          TF_RETURN_IF_ERROR(ReadOutputElemLocked(
-              reader, &workers_[index].outputs.back(),
-              full_name(strings::StrCat(worker_prefix, "_outputs_", i))));
-        }
-        if (reader->Contains(
-                full_name(strings::StrCat(worker_prefix, "_is_producing")))) {
-          workers_[index].is_producing = true;
-        } else {
-          workers_[index].is_producing = false;
-        }
-        return Status::OK();
-      }
-
-      Status WriteWorkerThreadStateLocked(IteratorStateWriter* writer,
-                                          int index)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
-        string prefix = strings::StrCat("worker_thread_", index);
-        if (worker_thread_states_[index].iterator != nullptr) {
-          TF_RETURN_IF_ERROR(
-              SaveInput(writer, worker_thread_states_[index].iterator));
-        } else {
-          TF_RETURN_IF_ERROR(writer->WriteScalar(
-              full_name(strings::StrCat(prefix, "_iterator_exhausted")), ""));
-        }
-        TF_RETURN_IF_ERROR(writer->WriteScalar(
-            full_name(strings::StrCat(prefix, "_input_size")),
-            worker_thread_states_[index].input.size()));
-        for (int i = 0; i < worker_thread_states_[index].input.size(); ++i) {
-          TF_RETURN_IF_ERROR(writer->WriteTensor(
-              full_name(strings::StrCat(prefix, "_input_", i)),
-              worker_thread_states_[index].input[i]));
-        }
-        TF_RETURN_IF_ERROR(WriteStatusLocked(
-            writer, strings::StrCat(prefix, "_iterator_creation_status"),
-            worker_thread_states_[index].iterator_creation_status));
-        TF_RETURN_IF_ERROR(WriteOutputElemLocked(
-            writer, worker_thread_states_[index].output_elem,
-            full_name(strings::StrCat(prefix, "_output"))));
-        if (worker_thread_states_[index].end_of_sequence) {
-          TF_RETURN_IF_ERROR(writer->WriteScalar(
-              full_name(strings::StrCat(prefix, "_end_of_sequence")), ""));
-        }
-        return Status::OK();
-      }
-
-      Status ReadWorkerThreadStateLocked(IteratorStateReader* reader, int index,
-                                         IteratorContext* ctx)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
-        string worker_prefix = strings::StrCat("worker_thread_", index);
-        // Restore inputs.
-        int64 input_size;
-        TF_RETURN_IF_ERROR(reader->ReadScalar(
-            full_name(strings::StrCat(worker_prefix, "_input_size")),
-            &input_size));
-        worker_thread_states_[index].input.reserve(input_size);
-        for (int i = 0; i < input_size; ++i) {
-          worker_thread_states_[index].input.emplace_back();
-          TF_RETURN_IF_ERROR(reader->ReadTensor(
-              full_name(strings::StrCat(worker_prefix, "_input_", i)),
-              &worker_thread_states_[index].input.back()));
-        }
-        // Restore iterator.
-        if (reader->Contains(full_name(
-                strings::StrCat(worker_prefix, "_iterator_exhausted")))) {
-          worker_thread_states_[index].iterator.reset();
-        } else {
-          std::unique_ptr<IteratorBase> iterator;
-          Status s = MakeIteratorFromInputElement(
-              ctx, worker_thread_states_[index].input, index,
-              *instantiated_captured_func_, prefix(), &iterator);
-          TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, iterator));
-          worker_thread_states_[index].iterator.swap(iterator);
-        }
-        TF_RETURN_IF_ERROR(ReadStatusLocked(
-            reader, strings::StrCat(worker_prefix, "_iterator_creation_status"),
-            &worker_thread_states_[index].iterator_creation_status));
-        TF_RETURN_IF_ERROR(ReadOutputElemLocked(
-            reader, &worker_thread_states_[index].output_elem,
-            full_name(strings::StrCat(worker_prefix, "_output"))));
-        if (reader->Contains(full_name(
-                strings::StrCat(worker_prefix, "_end_of_sequence")))) {
-          worker_thread_states_[index].end_of_sequence = true;
-        } else {
-          worker_thread_states_[index].end_of_sequence = false;
-        }
-        return Status::OK();
-      }
-
-      Status WriteOutputElemLocked(IteratorStateWriter* writer,
-                                   const OutputElem& output_elem,
-                                   const string& prefix)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
-        TF_RETURN_IF_ERROR(WriteStatusLocked(
-            writer, strings::StrCat(prefix, "_status"), output_elem.status));
-        TF_RETURN_IF_ERROR(
-            writer->WriteScalar(strings::StrCat(prefix, "_output_size"),
-                                output_elem.output.size()));
-        for (int i = 0; i < output_elem.output.size(); ++i) {
-          TF_RETURN_IF_ERROR(writer->WriteTensor(
-              strings::StrCat(prefix, "_output_", i), output_elem.output[i]));
-        }
-        return Status::OK();
-      }
-
-      Status ReadOutputElemLocked(IteratorStateReader* reader,
-                                  OutputElem* output_elem, const string& prefix)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
-        TF_RETURN_IF_ERROR(ReadStatusLocked(
-            reader, strings::StrCat(prefix, "_status"), &output_elem->status));
-        int64 output_size;
-        TF_RETURN_IF_ERROR(reader->ReadScalar(
-            strings::StrCat(prefix, "_output_size"), &output_size));
-        output_elem->output.reserve(output_size);
-        for (int i = 0; i < output_size; ++i) {
-          output_elem->output.emplace_back();
-          TF_RETURN_IF_ERROR(
-              reader->ReadTensor(strings::StrCat(prefix, "_output_", i),
-                                 &output_elem->output.back()));
-        }
-        return Status::OK();
-      }
-
-      Status WriteStatusLocked(IteratorStateWriter* writer,
-                               const string& prefix, const Status& status)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
-        TF_RETURN_IF_ERROR(
-            writer->WriteScalar(full_name(strings::StrCat(prefix, "_code")),
-                                static_cast<int64>(status.code())));
-        if (!status.ok()) {
-          TF_RETURN_IF_ERROR(
-              writer->WriteScalar(full_name(strings::StrCat(prefix, "_msg")),
-                                  status.error_message()));
-        }
-        return Status::OK();
-      }
-
-      Status ReadStatusLocked(IteratorStateReader* reader, const string& prefix,
-                              Status* status)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
-        int64 code_int;
-        TF_RETURN_IF_ERROR(reader->ReadScalar(
-            full_name(strings::StrCat(prefix, "_code")), &code_int));
-        error::Code code = static_cast<error::Code>(code_int);
-
-        if (code != error::Code::OK) {
-          string error_message;
-          TF_RETURN_IF_ERROR(reader->ReadScalar(
-              full_name(strings::StrCat(prefix, "_msg")), &error_message));
-          *status = Status(code, error_message);
-        } else {
-          *status = Status::OK();
-        }
-        return Status::OK();
-      }
-
-      // Mutex & condition variable to guard mutable iterator internals and
-      // coordinate among worker threads and client thread[s].
-      mutex mu_ ACQUIRED_BEFORE(ckpt_mu_);
-      // The main thread waits on this condition variable if running in sloppy
-      // mode and no values are available.
-      condition_variable sloppy_cond_var_;
-      // Mutex used to wait for a consistent state while checkpointing.
-      // Only Save and Restore require an exclusive lock on this mutex. In
-      // other scenarios we just acquire a shared lock so the pipeline's
-      // performance should not be affected in the absence of checkpointing.
-      // A thread must not wait on any condition variable while holding
-      // `ckpt_mu_` in either shared or exclusive modes.
-      mutex ckpt_mu_;
-
-      // The iterator producing elements which are converted to datasets by
-      // the dataset()->captured_func_ then interleaved together.
-      // input_impl_ is reset when we have exhausted its input.
-      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
-
-      std::unique_ptr<InstantiatedCapturedFunction> instantiated_captured_func_;
-
-      // The WorkerState structs the worker threads operate on.
-      // workers_ elements are in at most one of interleave_ and staging_.
-      std::vector<WorkerState> workers_ GUARDED_BY(mu_);
-
-      // Stores the temporary state of WorkerThreads which is not stored in
-      // WorkerState. This is used for checkpointing purposes only.
-      std::vector<WorkerThreadState> worker_thread_states_ GUARDED_BY(ckpt_mu_);
-
-      // Indices in `workers_` of iterators to interleave.
-      std::vector<int64> interleave_indices_ GUARDED_BY(mu_);
-      // Indices in `workers_` of prefetched iterators.
-      std::deque<int64> staging_indices_ GUARDED_BY(mu_);
-
-      // The index into output_elements_ for next element to produce.
-      size_t next_index_ GUARDED_BY(mu_) = 0;
-      // The number of items produced so far within the block
-      size_t block_count_ GUARDED_BY(mu_) = 0;
-      // Flag to instruct the worker threads to exit.
-      bool cancelled_ GUARDED_BY(mu_) = false;
-      // The worker threads. This must be last to ensure the
-      // threads have exited before any other members are deallocated.
-      // TODO(b/65178177): Avoid allocating additional threads.
-      std::vector<std::unique_ptr<Thread>> worker_threads_ GUARDED_BY(mu_);
-    };
-
-    const DatasetBase* const input_;
-    const NameAttrList interleave_func_;
-    const std::unique_ptr<CapturedFunction> captured_func_;
-    const int64 cycle_length_;
-    const int64 block_length_;
-    const bool sloppy_;
-    const int64 buffer_output_elements_;
-    const int64 prefetch_input_elements_;
-    const DataTypeVector output_types_;
-    const std::vector<PartialTensorShape> output_shapes_;
-  };
-
-  DataTypeVector output_types_;
-  std::vector<PartialTensorShape> output_shapes_;
-  NameAttrList interleave_func_;
-};
-
-REGISTER_KERNEL_BUILDER(Name("ParallelInterleaveDataset").Device(DEVICE_CPU),
-                        ParallelInterleaveDatasetOp);
-
 // The motivation for creating an alternative implementation of parallel
 // interleave is to decouple the degree of parallelism from the cycle length.
 // This makes it possible to change the degree of parallelism (e.g. through
@@ -1094,9 +49,9 @@ REGISTER_KERNEL_BUILDER(Name("ParallelInterleaveDataset").Device(DEVICE_CPU),
 // The above design choices were made with automated optimizations in mind,
 // isolating the degree of parallelism as the single tunable knob of this
 // implementation.
-class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
+class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
  public:
-  explicit ParallelInterleaveDatasetV2Op(OpKernelConstruction* ctx)
+  explicit ParallelInterleaveDatasetOp(OpKernelConstruction* ctx)
       : UnaryDatasetOpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &interleave_func_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
@@ -1121,9 +76,10 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
     int64 num_parallel_calls;
     OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "num_parallel_calls",
                                             &num_parallel_calls));
-    OP_REQUIRES(ctx, num_parallel_calls > 0 || num_parallel_calls == kAutoTune,
-                errors::InvalidArgument(
-                    "num_parallel_calls must be greater than zero."));
+    OP_REQUIRES(
+        ctx, num_parallel_calls > 0 || num_parallel_calls == model::kAutoTune,
+        errors::InvalidArgument(
+            "num_parallel_calls must be greater than zero."));
     OP_REQUIRES(
         ctx, num_parallel_calls <= cycle_length,
         errors::InvalidArgument(
@@ -1265,9 +221,8 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
 
       Status Initialize(IteratorContext* ctx) override {
         mutex_lock l(*mu_);
-        if (num_parallel_calls_->value == kAutoTune) {
+        if (num_parallel_calls_->value == model::kAutoTune) {
           num_parallel_calls_->value = dataset()->cycle_length_;
-          num_parallel_calls_->tunable = true;
         }
         TF_RETURN_IF_ERROR(
             dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
@@ -1301,6 +256,7 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
 
         if (result->status.ok()) {
           *out_tensors = std::move(result->return_values);
+          RecordBufferDequeue(ctx, *out_tensors);
         }
         *end_of_sequence = false;
         return result->status;
@@ -1439,6 +395,7 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
           if (end_of_input) {
             result->skip = true;
           }
+          RecordBufferEnqueue(ctx.get(), result->return_values);
           {
             mutex_lock l(*mu_);
             result->notification.Notify();
@@ -1747,7 +704,7 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
 };
 
 REGISTER_KERNEL_BUILDER(Name("ParallelInterleaveDatasetV2").Device(DEVICE_CPU),
-                        ParallelInterleaveDatasetV2Op);
+                        ParallelInterleaveDatasetOp);
 
 }  // namespace
 }  // namespace data
diff --git a/tensorflow/core/kernels/data/parallel_map_dataset_op.cc b/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
index 6fe582c9ae335894690c1a634e107bdcfb01f246..5c09b2d5dc88f512b8a63bcdec6340c2ec6d4600 100644
--- a/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
+++ b/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
@@ -41,6 +41,8 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("use_inter_op_parallelism",
                                      &use_inter_op_parallelism_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("sloppy", &sloppy_));
+    OP_REQUIRES_OK(
+        ctx, ctx->GetAttr("preserve_cardinality", &preserve_cardinality_));
   }
 
  protected:
@@ -49,9 +51,10 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
     int32 num_parallel_calls;
     OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "num_parallel_calls",
                                             &num_parallel_calls));
-    OP_REQUIRES(ctx, num_parallel_calls > 0 || num_parallel_calls == kAutoTune,
-                errors::InvalidArgument(
-                    "num_parallel_calls must be greater than zero."));
+    OP_REQUIRES(
+        ctx, num_parallel_calls > 0 || num_parallel_calls == model::kAutoTune,
+        errors::InvalidArgument(
+            "num_parallel_calls must be greater than zero."));
 
     std::unique_ptr<CapturedFunction> captured_func;
     OP_REQUIRES_OK(ctx, CapturedFunction::Create(func_, ctx, "other_arguments",
@@ -61,9 +64,10 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
     std::vector<int> indices;
     OP_REQUIRES_OK(ctx, ComputeShortCircuitIndices(ctx, func_, &indices));
 
-    *output = new Dataset(ctx, input, func_, num_parallel_calls, output_types_,
-                          output_shapes_, use_inter_op_parallelism_, sloppy_,
-                          std::move(captured_func), indices);
+    *output =
+        new Dataset(ctx, input, func_, num_parallel_calls, output_types_,
+                    output_shapes_, use_inter_op_parallelism_, sloppy_,
+                    std::move(captured_func), indices, preserve_cardinality_);
   }
 
  private:
@@ -75,7 +79,7 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
             const std::vector<PartialTensorShape>& output_shapes,
             bool use_inter_op_parallelism, bool sloppy,
             std::unique_ptr<CapturedFunction> captured_func,
-            const std::vector<int> indices)
+            const std::vector<int> indices, bool preserve_cardinality)
         : DatasetBase(DatasetContext(ctx)),
           input_(input),
           func_(func),
@@ -84,6 +88,7 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
           output_shapes_(output_shapes),
           use_inter_op_parallelism_(use_inter_op_parallelism),
           sloppy_(sloppy),
+          preserve_cardinality_(preserve_cardinality),
           captured_func_(std::move(captured_func)),
           indices_(indices),
           can_move_(indices.empty() ? std::vector<bool>()
@@ -103,7 +108,8 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
       }
       return NewParallelMapIterator(
           {this, strings::StrCat(prefix, "::ParallelMap")}, input_,
-          std::move(parallel_map_functor), num_parallel_calls_, sloppy_);
+          std::move(parallel_map_functor), num_parallel_calls_, sloppy_,
+          preserve_cardinality_);
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -118,6 +124,8 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
       return "ParallelMapDatasetOp::Dataset";
     }
 
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -161,6 +169,10 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
       AttrValue sloppy_attr;
       b->BuildAttrValue(sloppy_, &sloppy_attr);
 
+      // Attr: preserve_cardinality
+      AttrValue preserve_cardinality_attr;
+      b->BuildAttrValue(preserve_cardinality_, &preserve_cardinality_attr);
+
       TF_RETURN_IF_ERROR(b->AddDataset(
           this,
           {std::make_pair(0, input_graph_node),
@@ -170,7 +182,9 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
            std::make_pair("Targuments", other_arguments_types_attr),
            std::make_pair("use_inter_op_parallelism",
                           use_inter_op_parallelism_attr),
-           std::make_pair("sloppy", sloppy_attr)},  // Attrs
+           std::make_pair("sloppy", sloppy_attr),
+           std::make_pair("preserve_cardinality",
+                          preserve_cardinality_attr)},  // Attrs
           output));
       return Status::OK();
     }
@@ -248,6 +262,7 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
     const std::vector<PartialTensorShape> output_shapes_;
     const bool use_inter_op_parallelism_;
     const bool sloppy_;
+    const bool preserve_cardinality_;
     const std::unique_ptr<CapturedFunction> captured_func_;
     const std::vector<int> indices_;
     const std::vector<bool> can_move_;
@@ -257,6 +272,7 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
   std::vector<PartialTensorShape> output_shapes_;
   bool use_inter_op_parallelism_;
   bool sloppy_;
+  bool preserve_cardinality_;
   NameAttrList func_;
 };
 
diff --git a/tensorflow/core/kernels/data/parallel_map_iterator.cc b/tensorflow/core/kernels/data/parallel_map_iterator.cc
index 02ccf6b004cad20c4864322d70e88b8727f54abf..b62e7059bab42d7ace20c3fe9d681e2c129b926e 100644
--- a/tensorflow/core/kernels/data/parallel_map_iterator.cc
+++ b/tensorflow/core/kernels/data/parallel_map_iterator.cc
@@ -32,20 +32,34 @@ namespace {
 
 class ParallelMapIterator : public DatasetBaseIterator {
  public:
-  ParallelMapIterator(const typename DatasetBaseIterator::BaseParams& params,
-                      const DatasetBase* input_dataset,
-                      std::unique_ptr<ParallelMapFunctor> parallel_map_functor,
-                      int32 num_parallel_calls, bool sloppy)
-      : DatasetBaseIterator(params),
+  struct Params {
+    Params(std::unique_ptr<ParallelMapFunctor> parallel_map_functor,
+           int32 num_parallel_calls, bool sloppy, bool preserve_cardinality)
+        : parallel_map_functor(std::move(parallel_map_functor)),
+          num_parallel_calls(num_parallel_calls),
+          sloppy(sloppy),
+          preserve_cardinality(preserve_cardinality) {}
+
+    std::unique_ptr<ParallelMapFunctor> parallel_map_functor;
+    int32 num_parallel_calls;
+    bool sloppy;
+    bool preserve_cardinality;
+  };
+
+  ParallelMapIterator(
+      const typename DatasetBaseIterator::BaseParams& base_params,
+      const DatasetBase* input_dataset, Params params)
+      : DatasetBaseIterator(base_params),
         input_dataset_(input_dataset),
-        parallel_map_functor_(std::move(parallel_map_functor)),
+        parallel_map_functor_(std::move(params.parallel_map_functor)),
         mu_(std::make_shared<mutex>()),
         cond_var_(std::make_shared<condition_variable>()),
         num_parallel_calls_(std::make_shared<model::SharedState>(
-            num_parallel_calls, mu_, cond_var_)),
-        sloppy_(sloppy) {
+            params.num_parallel_calls, mu_, cond_var_)),
+        sloppy_(params.sloppy),
+        preserve_cardinality_(params.preserve_cardinality) {
     std::vector<string> components =
-        str_util::Split(params.prefix, "::", str_util::SkipEmpty());
+        str_util::Split(base_params.prefix, "::", str_util::SkipEmpty());
     prefix_end_ = components.back();
   }
 
@@ -62,9 +76,8 @@ class ParallelMapIterator : public DatasetBaseIterator {
 
   Status Initialize(IteratorContext* ctx) override {
     mutex_lock l(*mu_);
-    if (num_parallel_calls_->value == kAutoTune) {
+    if (num_parallel_calls_->value == model::kAutoTune) {
       num_parallel_calls_->value = ctx->runner_threadpool_size();
-      num_parallel_calls_->tunable = true;
     }
     TF_RETURN_IF_ERROR(
         input_dataset_->MakeIterator(ctx, prefix(), &input_impl_));
@@ -86,7 +99,7 @@ class ParallelMapIterator : public DatasetBaseIterator {
     RecordStop(ctx);
     result->notification.WaitForNotification();
     RecordStart(ctx);
-    return ProcessResult(result, out_tensors, end_of_sequence);
+    return ProcessResult(ctx, result, out_tensors, end_of_sequence);
   }
 
  protected:
@@ -197,6 +210,7 @@ class ParallelMapIterator : public DatasetBaseIterator {
           strings::StrCat(prefix_end_, "::active_parallel_calls"),
           static_cast<float>(num_calls_));
     }
+    RecordBufferEnqueue(ctx.get(), result->return_values);
     result->notification.Notify();
     cond_var_->notify_all();
   }
@@ -225,19 +239,30 @@ class ParallelMapIterator : public DatasetBaseIterator {
                                    &result->return_values, std::move(done));
   }
 
-  Status ProcessResult(const std::shared_ptr<InvocationResult>& result,
+  Status ProcessResult(IteratorContext* ctx,
+                       const std::shared_ptr<InvocationResult>& result,
                        std::vector<Tensor>* out_tensors, bool* end_of_sequence)
       LOCKS_EXCLUDED(*mu_) {
     if (!result->end_of_input && result->status.ok()) {
       *out_tensors = std::move(result->return_values);
+      RecordBufferDequeue(ctx, *out_tensors);
       *end_of_sequence = false;
       return Status::OK();
     }
     if (errors::IsOutOfRange(result->status)) {
-      // `f` may deliberately raise `errors::OutOfRange` to indicate that we
-      // should terminate the iteration early.
-      *end_of_sequence = true;
-      return Status::OK();
+      if (preserve_cardinality_) {
+        // To guarantee that the transformation preserves the cardinality of the
+        // dataset, we convert `OutOfRange` to `InvalidArgument` as the former
+        // may be interpreted by a caller as the end of sequence.
+        return errors::InvalidArgument(
+            "Function invocation produced OutOfRangeError: ",
+            result->status.error_message());
+      } else {
+        // `f` may deliberately raise `errors::OutOfRange` to indicate
+        // that we should terminate the iteration early.
+        *end_of_sequence = true;
+        return Status::OK();
+      }
     }
     *end_of_sequence = result->end_of_input;
     return result->status;
@@ -369,6 +394,7 @@ class ParallelMapIterator : public DatasetBaseIterator {
   const std::shared_ptr<model::SharedState> num_parallel_calls_;
   // Determines whether outputs can be produced in non-deterministic order.
   const bool sloppy_;
+  const bool preserve_cardinality_;
   // Counts the number of outstanding calls.
   int64 num_calls_ GUARDED_BY(*mu_) = 0;
   std::unique_ptr<IteratorBase> input_impl_;
@@ -386,10 +412,12 @@ std::unique_ptr<IteratorBase> NewParallelMapIterator(
     const DatasetBaseIterator::BaseParams& params,
     const DatasetBase* input_dataset,
     std::unique_ptr<ParallelMapFunctor> parallel_map_functor,
-    int32 num_parallel_calls, bool sloppy) {
-  return MakeUnique<ParallelMapIterator>(params, input_dataset,
-                                         std::move(parallel_map_functor),
-                                         num_parallel_calls, sloppy);
+    int32 num_parallel_calls, bool sloppy, bool preserve_cardinality) {
+  return MakeUnique<ParallelMapIterator>(
+      params, input_dataset,
+      ParallelMapIterator::Params{std::move(parallel_map_functor),
+                                  num_parallel_calls, sloppy,
+                                  preserve_cardinality});
 }
 
 }  // namespace data
diff --git a/tensorflow/core/kernels/data/parallel_map_iterator.h b/tensorflow/core/kernels/data/parallel_map_iterator.h
index 08c16a6c112c0aa48b0dbcd1205f8893284cc978..de30446f2631c7e40e090a03517dcc53fdd873b9 100644
--- a/tensorflow/core/kernels/data/parallel_map_iterator.h
+++ b/tensorflow/core/kernels/data/parallel_map_iterator.h
@@ -48,7 +48,7 @@ std::unique_ptr<IteratorBase> NewParallelMapIterator(
     const DatasetBaseIterator::BaseParams& params,
     const DatasetBase* input_dataset,
     std::unique_ptr<ParallelMapFunctor> parallel_map_functor,
-    int32 num_parallel_calls, bool sloppy);
+    int32 num_parallel_calls, bool sloppy, bool preserve_cardinality);
 
 }  // namespace data
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/prefetch_dataset_op.cc b/tensorflow/core/kernels/data/prefetch_dataset_op.cc
index 9e518131ebb51728d231ebbeefc932a5d47e169b..08d6de4bf9a654d433e3cb6dddd6ab0cc1435136 100644
--- a/tensorflow/core/kernels/data/prefetch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/prefetch_dataset_op.cc
@@ -56,6 +56,8 @@ class PrefetchDatasetOp::Dataset : public DatasetBase {
 
   string DebugString() const override { return "PrefetchDatasetOp::Dataset"; }
 
+  int64 Cardinality() const override { return input_->Cardinality(); }
+
  protected:
   Status AsGraphDefInternal(SerializationContext* ctx,
                             DatasetGraphDefBuilder* b,
@@ -123,7 +125,7 @@ class PrefetchDatasetOp::Dataset : public DatasetBase {
         }
 
         if (!buffer_.empty()) {
-          return Consume(out_tensors, end_of_sequence, ctx);
+          return Consume(ctx, out_tensors, end_of_sequence);
         }
 
         if (prefetch_thread_finished_) {
@@ -226,8 +228,8 @@ class PrefetchDatasetOp::Dataset : public DatasetBase {
       std::vector<Tensor> value;
     };
 
-    Status Consume(std::vector<Tensor>* out_tensors, bool* end_of_sequence,
-                   IteratorContext* ctx) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+    Status Consume(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
+                   bool* end_of_sequence) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
       const auto& stats_aggregator = ctx->stats_aggregator();
       if (stats_aggregator) {
         stats_aggregator->AddToHistogram(
@@ -246,6 +248,7 @@ class PrefetchDatasetOp::Dataset : public DatasetBase {
       Status s = buffer_.front().status;
       if (s.ok()) {
         *out_tensors = std::move(buffer_.front().value);
+        RecordBufferDequeue(ctx, *out_tensors);
       }
       auto_tuner_.RecordConsumption(buffer_.size());
       buffer_.pop_front();
@@ -316,6 +319,7 @@ class PrefetchDatasetOp::Dataset : public DatasetBase {
         // 3. Signal that the element has been produced.
         {
           mutex_lock l(mu_);
+          RecordBufferEnqueue(ctx.get(), buffer_element.value);
           buffer_.push_back(std::move(buffer_element));
           cond_var_.notify_all();
         }
diff --git a/tensorflow/core/kernels/data/range_dataset_op.cc b/tensorflow/core/kernels/data/range_dataset_op.cc
index 207e957e3747e4a03a7f91cc5502f92fb6953e1b..580702f741814b6bd86cab2d537b3ad49b4f6177 100644
--- a/tensorflow/core/kernels/data/range_dataset_op.cc
+++ b/tensorflow/core/kernels/data/range_dataset_op.cc
@@ -73,6 +73,14 @@ class RangeDatasetOp : public DatasetOpKernel {
                              step_, ")::Dataset");
     }
 
+    int64 Cardinality() const override {
+      if (step_ > 0) {
+        return std::max(0LL, (stop_ - start_ - 1) / step_ + 1);
+      } else {
+        return std::max(0LL, (start_ - stop_ - 1) / -step_ + 1);
+      }
+    }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
diff --git a/tensorflow/core/kernels/data/repeat_dataset_op.cc b/tensorflow/core/kernels/data/repeat_dataset_op.cc
index cee14df69d07a1e477b4f10a569a7ec268cfe2ad..8100f2695b6ee529da252b7b012a7c87ebb0a670 100644
--- a/tensorflow/core/kernels/data/repeat_dataset_op.cc
+++ b/tensorflow/core/kernels/data/repeat_dataset_op.cc
@@ -71,6 +71,23 @@ class RepeatDatasetOp : public UnaryDatasetOpKernel {
 
     string DebugString() const override { return "RepeatDatasetOp::Dataset"; }
 
+    int64 Cardinality() const override {
+      int64 n = input_->Cardinality();
+      if (count_ < 0) {
+        if (n == 0) {
+          return 0;
+        }
+        return kInfiniteCardinality;
+      }
+      if (count_ == 0) {
+        return 0;
+      }
+      if (n == kInfiniteCardinality || n == kUnknownCardinality) {
+        return n;
+      }
+      return count_ * n;
+    }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
diff --git a/tensorflow/core/kernels/data/shuffle_dataset_op.cc b/tensorflow/core/kernels/data/shuffle_dataset_op.cc
index ad6960685e4284f129d96baa4eaffa7df99f3946..7134793e26da82e39f53ac21030a9e56e16e26ab 100644
--- a/tensorflow/core/kernels/data/shuffle_dataset_op.cc
+++ b/tensorflow/core/kernels/data/shuffle_dataset_op.cc
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/random/philox_random.h"
 #include "tensorflow/core/lib/random/random.h"
@@ -61,6 +62,8 @@ class ShuffleDatasetOpBase : public UnaryDatasetOpKernel {
       return input_->output_shapes();
     }
 
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
    protected:
     template <class T>
     class Iterator : public DatasetIterator<T> {
@@ -68,9 +71,9 @@ class ShuffleDatasetOpBase : public UnaryDatasetOpKernel {
       explicit Iterator(const typename DatasetIterator<T>::Params& params,
                         int64 seed, int64 seed2)
           : DatasetIterator<T>(params),
-            input_impl_(nullptr),
             seed_(seed),
             seed2_(seed2),
+            input_impl_(nullptr),
             epoch_(0),
             num_elements_(0),
             parent_generator_(seed, seed2),
@@ -124,6 +127,7 @@ class ShuffleDatasetOpBase : public UnaryDatasetOpKernel {
                 ctx, this->prefix(), &input_impl_));
           }
           if (!end_of_input_sequence) {
+            this->RecordBufferEnqueue(ctx, input_element);
             buffer_[slices_.back()->end % this->dataset()->buffer_size_] =
                 std::move(input_element);
             num_elements_++;
@@ -151,6 +155,7 @@ class ShuffleDatasetOpBase : public UnaryDatasetOpKernel {
           int64 index =
               (slices_.front()->start + offset) % this->dataset()->buffer_size_;
           *out_tensors = std::move(buffer_[index]);
+          this->RecordBufferDequeue(ctx, *out_tensors);
           std::swap(
               buffer_[index],
               buffer_[slices_.front()->start % this->dataset()->buffer_size_]);
@@ -170,6 +175,14 @@ class ShuffleDatasetOpBase : public UnaryDatasetOpKernel {
                                          /*ratio=*/1);
       }
 
+      void ResetRngs() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        // Reset the generators based on the current iterator seeds.
+        parent_generator_ = random::PhiloxRandom(seed_, seed2_);
+        generator_ = random::SingleSampleAdapter<random::PhiloxRandom>(
+            &parent_generator_);
+        generator_.Skip(num_random_samples_);
+      }
+
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
         // Save state needed to restore the random number generators.
@@ -277,6 +290,10 @@ class ShuffleDatasetOpBase : public UnaryDatasetOpKernel {
         return Status::OK();
       }
 
+      mutex mu_;
+      int64 seed_ GUARDED_BY(mu_);
+      int64 seed2_ GUARDED_BY(mu_);
+
      private:
       // Used to represent slices of `buffer_` that belong to different epochs.
       // The invariant maintained by the implementation is: `start` <= `end`.
@@ -297,19 +314,8 @@ class ShuffleDatasetOpBase : public UnaryDatasetOpKernel {
         return out;
       }
 
-      void ResetRngs() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-        // Reset the generators based on the current iterator seeds.
-        parent_generator_ = random::PhiloxRandom(seed_, seed2_);
-        generator_ = random::SingleSampleAdapter<random::PhiloxRandom>(
-            &parent_generator_);
-        generator_.Skip(num_random_samples_);
-      }
-
-      mutex mu_;
       std::unique_ptr<std::vector<Tensor>[]> buffer_ GUARDED_BY(mu_);
       std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
-      int64 seed_ GUARDED_BY(mu_);
-      int64 seed2_ GUARDED_BY(mu_);
       int64 epoch_ GUARDED_BY(mu_);
       int64 num_elements_ GUARDED_BY(mu_);
       std::deque<std::unique_ptr<Slice>> slices_ GUARDED_BY(mu_);
@@ -366,7 +372,7 @@ class ShuffleDatasetOp : public ShuffleDatasetOpBase {
   }
 
  private:
-  // A dataset that uses a pseduorandom sequence of seeds for the iterators
+  // A dataset that uses a pseudorandom sequence of seeds for the iterators
   // created from it. Used when `reshuffle_each_iteration` is true.
   class ReshufflingDataset : public ShuffleDatasetBase {
    public:
@@ -374,37 +380,114 @@ class ShuffleDatasetOp : public ShuffleDatasetOpBase {
                        int64 buffer_size, int64 seed, int64 seed2, int64 count)
         : ShuffleDatasetBase(ctx, input, buffer_size, count),
           seed_(seed),
-          seed2_(seed2),
-          parent_generator_(seed, seed2),
-          generator_(&parent_generator_) {}
+          seed2_(seed2) {}
 
     string DebugString() const override {
-      mutex_lock l(mu_);
       return strings::StrCat("ShuffleDatasetOp(", buffer_size_, ", ", seed_,
                              ", ", seed2_, ")::ReshufflingDataset");
     }
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      int64 iterator_seed;
-      int64 iterator_seed2;
-      {
-        mutex_lock l(mu_);
-        iterator_seed = Random();
-        iterator_seed2 = Random();
-      }
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::Shuffle")},
-                       iterator_seed, iterator_seed2));
+      return std::unique_ptr<IteratorBase>(new Iterator(
+          {this, strings::StrCat(prefix, "::Shuffle")}, seed_, seed2_));
     }
 
    protected:
+    class RandomSeedGenerator : public ResourceBase {
+     public:
+      RandomSeedGenerator(int64 seed, int64 seed2)
+          : seed_(seed),
+            seed2_(seed2),
+            parent_generator_(seed, seed2),
+            generator_(&parent_generator_) {}
+
+      string DebugString() override {
+        return "ReshufflingDataset::RandomSeedGenerator";
+      }
+
+      void GenerateRandomSeeds(int64* seed1, int64* seed2) {
+        mutex_lock l(mu_);
+        num_random_samples_++;
+        *seed1 = generator_();
+        num_random_samples_++;
+        *seed2 = generator_();
+      }
+
+      int64 num_random_samples() {
+        tf_shared_lock l(mu_);
+        return num_random_samples_;
+      }
+
+      void set_num_random_samples(int64 num_random_samples) {
+        mutex_lock l(mu_);
+        num_random_samples_ = num_random_samples;
+      }
+
+      void Reset() {
+        mutex_lock l(mu_);
+        // Reset the generators based on the current seeds.
+        parent_generator_ = random::PhiloxRandom(seed_, seed2_);
+        generator_ = random::SingleSampleAdapter<random::PhiloxRandom>(
+            &parent_generator_);
+        generator_.Skip(num_random_samples_);
+      }
+
+     private:
+      const int64 seed_;
+      const int64 seed2_;
+      mutex mu_;
+      random::PhiloxRandom parent_generator_ GUARDED_BY(mu_);
+      random::SingleSampleAdapter<random::PhiloxRandom> generator_
+          GUARDED_BY(mu_);
+      int64 num_random_samples_ GUARDED_BY(mu_) = 0;
+    };
+
     class Iterator : public ShuffleDatasetBase::Iterator<ReshufflingDataset> {
      public:
       explicit Iterator(const Params& params, int64 seed, int64 seed2)
           : ShuffleDatasetBase::Iterator<ReshufflingDataset>(params, seed,
                                                              seed2) {}
 
+      ~Iterator() override { seed_generator_->Unref(); }
+
+      Status Initialize(IteratorContext* ctx) override {
+        // Firstly, lookup or create a seed generator from the IteratorResource
+        // resource_mgr.
+        ResourceMgr* mgr = ctx->resource_mgr();
+        RandomSeedGenerator* seed_generator;
+        const string name = strings::StrCat(prefix(), "::", dataset()->name(),
+                                            "::RandomSeedGenerator");
+
+        int64 dataset_seed, dataset_seed2;
+        {
+          tf_shared_lock l(mu_);
+          // Ideally we'd like to hold this lock in the LookupOrCreate method,
+          // but that trips up our Deadlock detection code.
+          dataset_seed = seed_;
+          dataset_seed2 = seed2_;
+        }
+        TF_RETURN_IF_ERROR(mgr->LookupOrCreate<RandomSeedGenerator>(
+            "tf_data", name, &seed_generator,
+            [dataset_seed,
+             dataset_seed2](RandomSeedGenerator** seed_generator) {
+              // On the first iterator creation, use the original seeds from the
+              // dataset to seed a `RandomSeedGenerator` that will provide seeds
+              // for subsequent repetitions of the same dataset.
+              *seed_generator =
+                  new RandomSeedGenerator(dataset_seed, dataset_seed2);
+              return Status::OK();
+            }));
+        // Now use the seed generator to update the base class Iterator seeds
+        // and random number generator with generated seeds for the current
+        // repetition.
+        mutex_lock l(mu_);
+        seed_generator->GenerateRandomSeeds(&seed_, &seed2_);
+        ResetRngs();
+        seed_generator_ = seed_generator;
+        return Status::OK();
+      }
+
      protected:
       std::shared_ptr<model::Node> CreateNode(
           IteratorContext* ctx, model::Node::Args args) const override {
@@ -413,12 +496,10 @@ class ShuffleDatasetOp : public ShuffleDatasetOpBase {
       }
 
       Status SaveInternal(IteratorStateWriter* writer) override {
-        mutex_lock l(dataset()->mu_);
-
         // Save RNG state of Dataset.
         TF_RETURN_IF_ERROR(
             writer->WriteScalar(full_name("ds_num_random_samples"),
-                                dataset()->num_random_samples_));
+                                seed_generator_->num_random_samples()));
 
         // Save the Iterator.
         return ShuffleDatasetBase::Iterator<ReshufflingDataset>::SaveInternal(
@@ -427,24 +508,25 @@ class ShuffleDatasetOp : public ShuffleDatasetOpBase {
 
       Status RestoreInternal(IteratorContext* ctx,
                              IteratorStateReader* reader) override {
-        mutex_lock l(dataset()->mu_);
-
         // Restore RNG state of Dataset.
-        TF_RETURN_IF_ERROR(
-            reader->ReadScalar(full_name("ds_num_random_samples"),
-                               &dataset()->num_random_samples_));
-        dataset()->ResetRngs();
+        int64 num_random_samples;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(
+            full_name("ds_num_random_samples"), &num_random_samples));
+        seed_generator_->set_num_random_samples(num_random_samples);
+        seed_generator_->Reset();
 
         // Restore the Iterator.
         return ShuffleDatasetBase::Iterator<
             ReshufflingDataset>::RestoreInternal(ctx, reader);
       }
+
+     private:
+      RandomSeedGenerator* seed_generator_;
     };
 
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
                               Node** output) const override {
-      mutex_lock l(mu_);
       Node* input_graph_node = nullptr;
       TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
       Node* buffer_size = nullptr;
@@ -465,28 +547,8 @@ class ShuffleDatasetOp : public ShuffleDatasetOpBase {
     }
 
    private:
-    random::SingleSampleAdapter<random::PhiloxRandom>::ResultType Random() const
-        EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-      num_random_samples_++;
-      auto out = generator_();
-      return out;
-    }
-
-    void ResetRngs() const EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-      // Reset the generators based on the current seeds.
-      parent_generator_ = random::PhiloxRandom(seed_, seed2_);
-      generator_ =
-          random::SingleSampleAdapter<random::PhiloxRandom>(&parent_generator_);
-      generator_.Skip(num_random_samples_);
-    }
-
-    mutable int64 seed_ GUARDED_BY(mu_);
-    mutable int64 seed2_ GUARDED_BY(mu_);
-    mutable mutex mu_;
-    mutable random::PhiloxRandom parent_generator_ GUARDED_BY(mu_);
-    mutable random::SingleSampleAdapter<random::PhiloxRandom> generator_
-        GUARDED_BY(mu_);
-    mutable int64 num_random_samples_ GUARDED_BY(mu_) = 0;
+    const int64 seed_;
+    const int64 seed2_;
   };
 
   // A dataset that uses the same fixed seed for all iterators created from it.
diff --git a/tensorflow/core/kernels/data/skip_dataset_op.cc b/tensorflow/core/kernels/data/skip_dataset_op.cc
index 8379383662a760ba54b0b2542371890221e1a8c6..e321066a715d180f0791c9afdfa947560a0fd9ce 100644
--- a/tensorflow/core/kernels/data/skip_dataset_op.cc
+++ b/tensorflow/core/kernels/data/skip_dataset_op.cc
@@ -67,6 +67,14 @@ class SkipDatasetOp : public UnaryDatasetOpKernel {
 
     string DebugString() const override { return "SkipDatasetOp::Dataset"; }
 
+    int64 Cardinality() const override {
+      int64 n = input_->Cardinality();
+      if (n == kInfiniteCardinality || n == kUnknownCardinality) {
+        return n;
+      }
+      return std::max(0LL, n - count_);
+    }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
diff --git a/tensorflow/core/kernels/data/sparse_tensor_slice_dataset_op.cc b/tensorflow/core/kernels/data/sparse_tensor_slice_dataset_op.cc
index a002c605357381071884489df8079da3ddbfaa28..be105f8170b8fff79c0c60a76a699a6ee6ba13f9 100644
--- a/tensorflow/core/kernels/data/sparse_tensor_slice_dataset_op.cc
+++ b/tensorflow/core/kernels/data/sparse_tensor_slice_dataset_op.cc
@@ -54,6 +54,8 @@ class Dataset : public DatasetBase {
     return "SparseTensorSliceDatasetOp::Dataset";
   }
 
+  int64 Cardinality() const override { return sparse_tensor_.shape()[0]; }
+
  protected:
   Status AsGraphDefInternal(SerializationContext* ctx,
                             DatasetGraphDefBuilder* b,
diff --git a/tensorflow/core/kernels/data/take_dataset_op.cc b/tensorflow/core/kernels/data/take_dataset_op.cc
index 57c9b0d57f68129e5b00462be79bb5864b7853a7..0a3d5869534ddad9f7ed295171d8deefc2154107 100644
--- a/tensorflow/core/kernels/data/take_dataset_op.cc
+++ b/tensorflow/core/kernels/data/take_dataset_op.cc
@@ -68,6 +68,17 @@ class TakeDatasetOp : public UnaryDatasetOpKernel {
 
     string DebugString() const override { return "TakeDatasetOp::Dataset"; }
 
+    int64 Cardinality() const override {
+      int64 n = input_->Cardinality();
+      if (n == kUnknownCardinality) {
+        return kUnknownCardinality;
+      }
+      if (n == kInfiniteCardinality) {
+        return count_;
+      }
+      return std::min(n, count_);
+    }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
diff --git a/tensorflow/core/kernels/data/tensor_dataset_op.cc b/tensorflow/core/kernels/data/tensor_dataset_op.cc
index c7d374f489740a62b837690a4a80278212e98cce..98c23f23b202dee580fb89f5473f69c61d57c640 100644
--- a/tensorflow/core/kernels/data/tensor_dataset_op.cc
+++ b/tensorflow/core/kernels/data/tensor_dataset_op.cc
@@ -61,6 +61,8 @@ class TensorDatasetOp : public DatasetOpKernel {
 
     string DebugString() const override { return "TensorDatasetOp::Dataset"; }
 
+    int64 Cardinality() const override { return 1LL; }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
diff --git a/tensorflow/core/kernels/data/tensor_queue_dataset_op.cc b/tensorflow/core/kernels/data/tensor_queue_dataset_op.cc
deleted file mode 100644
index 7fd1c4c9e0488ac47de7e8b2a618eb941f70f507..0000000000000000000000000000000000000000
--- a/tensorflow/core/kernels/data/tensor_queue_dataset_op.cc
+++ /dev/null
@@ -1,657 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <deque>
-
-#include "tensorflow/core/framework/dataset.h"
-#include "tensorflow/core/framework/partial_tensor_shape.h"
-#include "tensorflow/core/framework/resource_mgr.h"
-#include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/framework/variant.h"
-#include "tensorflow/core/framework/variant_encode_decode.h"
-#include "tensorflow/core/util/batch_util.h"
-
-namespace tensorflow {
-namespace data {
-namespace {
-
-bool IsGreaterEqualToOrCompatibleWith(const PartialTensorShape& a,
-                                      const PartialTensorShape& b) {
-  // Returns true if dims[a] >= dims[b], or are compatible.
-  if (a.unknown_rank()) return true;
-  if (a.dims() != b.dims()) return false;
-  for (int d = 0; d < a.dims(); ++d) {
-    if (a.dim_size(d) == -1 || b.dim_size(d) == -1) continue;
-    if (a.dim_size(d) < b.dim_size(d)) return false;
-  }
-  return true;
-}
-
-DataTypeVector PrependQueueType(const DataTypeVector& dtypes) {
-  DataTypeVector out;
-  out.reserve(dtypes.size() + 1);
-  out.push_back(DT_VARIANT);  // The queue component.
-  for (const DataType& d : dtypes) out.push_back(d);
-  return out;
-}
-
-std::vector<PartialTensorShape> PrependQueueShapeWithBatch(
-    const std::vector<PartialTensorShape>& shapes) {
-  std::vector<PartialTensorShape> out;
-  out.reserve(shapes.size() + 1);
-  out.emplace_back(PartialTensorShape({-1}));  // The queue component.
-  for (PartialTensorShape s : shapes) {
-    s.InsertDim(0, -1);  // Unknown batch size.
-    out.push_back(std::move(s));
-  }
-  return out;
-}
-
-class EnqueueInQueueDatasetOp;
-
-class PrependFromQueueAndPaddedBatchDataset : public DatasetBase {
- public:
-  PrependFromQueueAndPaddedBatchDataset(
-      OpKernelContext* ctx, const int64 batch_size, const DatasetBase* input,
-      const DataTypeVector& dtypes,
-      const std::vector<PartialTensorShape>& shapes,
-      std::vector<Tensor> padding_values)
-      : DatasetBase(DatasetContext(ctx)),
-        batch_size_(batch_size),
-        input_(input),
-        dtypes_(dtypes),
-        shapes_(shapes),
-        padding_values_(std::move(padding_values)),
-        dtypes_with_queue_(PrependQueueType(dtypes)),
-        batched_shapes_with_queue_(PrependQueueShapeWithBatch(shapes)) {
-    input_->Ref();
-  }
-
-  ~PrependFromQueueAndPaddedBatchDataset() override { input_->Unref(); }
-
-  std::unique_ptr<IteratorBase> MakeIteratorInternal(
-      const string& prefix) const override {
-    return std::unique_ptr<IteratorBase>(new Iterator(
-        {this, strings::StrCat(prefix, "::PrependFromQueueAndPaddedBatch")}));
-  }
-
-  const DataTypeVector& output_dtypes() const override {
-    return dtypes_with_queue_;
-  }
-  const std::vector<PartialTensorShape>& output_shapes() const override {
-    return batched_shapes_with_queue_;
-  }
-
-  string DebugString() const override {
-    return "PrependFromQueueAndPaddedBatchDatasetOp::Dataset";
-  }
-
- protected:
-  Status AsGraphDefInternal(SerializationContext* ctx,
-                            DatasetGraphDefBuilder* b,
-                            Node** output) const override {
-    Node* input_graph = nullptr;
-    TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph));
-    Node* batch_size = nullptr;
-    TF_RETURN_IF_ERROR(b->AddScalar(batch_size_, &batch_size));
-
-    std::vector<Node*> padded_shapes;
-    padded_shapes.reserve(shapes_.size());
-    for (int i = 0; i < shapes_.size(); i++) {
-      Node* node;
-      Tensor t(DT_INT64, TensorShape({shapes_[i].dims()}));
-      for (int j = 0; j < shapes_[i].dims(); j++) {
-        t.vec<int64>()(j) = shapes_[i].dim_size(j);
-      }
-      TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
-      padded_shapes.emplace_back(node);
-    }
-
-    std::vector<Node*> padding_values;
-    padding_values.reserve(padding_values_.size());
-    for (const Tensor& t : padding_values_) {
-      Node* node;
-      TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
-      padding_values.emplace_back(node);
-    }
-
-    AttrValue output_types;
-    b->BuildAttrValue(dtypes_, &output_types);
-
-    AttrValue output_shapes;
-    b->BuildAttrValue(batched_shapes_with_queue_, &output_shapes);
-
-    AttrValue N;
-    b->BuildAttrValue<int64>(shapes_.size(), &N);
-
-    TF_RETURN_IF_ERROR(b->AddDataset(this, {{0, input_graph}, {1, batch_size}},
-                                     {{2, padded_shapes}, {3, padding_values}},
-                                     {{"Toutput_types", output_types},
-                                      {"output_shapes", output_shapes},
-                                      {"N", N}},
-                                     output));
-
-    return Status::OK();
-  }
-
- private:
-  friend class EnqueueInQueueDatasetOp;
-
-  class Iterator
-      : public DatasetIterator<PrependFromQueueAndPaddedBatchDataset> {
-   public:
-    explicit Iterator(const Params& params)
-        : DatasetIterator<PrependFromQueueAndPaddedBatchDataset>(params) {}
-
-    ~Iterator() override { queue_->Unref(); }
-
-    Status Initialize(IteratorContext* ctx) override {
-      std::unique_ptr<IteratorBase> iterator;
-      TF_RETURN_IF_ERROR(
-          dataset()->input_->MakeIterator(ctx, prefix(), &iterator));
-      queue_ = new TensorQueue(std::move(iterator), dataset()->dtypes_,
-                               dataset()->shapes_);
-      return Status::OK();
-    }
-
-    Status GetNextInternal(IteratorContext* ctx,
-                           std::vector<Tensor>* out_tensors,
-                           bool* end_of_sequence) override {
-      std::vector<std::vector<Tensor>> batch;
-      TF_RETURN_IF_ERROR(queue_->GetNext(ctx, dataset()->batch_size_, &batch,
-                                         end_of_sequence));
-      const auto& dtypes = dataset()->dtypes_;
-      const auto& shapes = dataset()->shapes_;
-      const auto& input_shapes = dataset()->input_->output_shapes();
-      const auto& padding_values = dataset()->padding_values_;
-      const int64 batch_size = batch.size();
-      out_tensors->reserve(dtypes.size());
-
-      std::vector<TensorShape> max_shapes;  // Of non-queue components.
-      for (int i = 0; i < dtypes.size(); ++i) {
-        const PartialTensorShape& shape = shapes[i];
-        TensorShape out_shape({batch_size});
-        for (int r = 0; r < shape.dims(); ++r) {
-          if (shape.dim_size(r) >= 0) {
-            // padded_shape[r] is known.
-            out_shape.AddDim(shape.dim_size(r));
-          } else {
-            // padded_shape[r] is unknown, find the maximum across
-            // the batch.
-            int64 dim = 0;
-            for (int b = 0; b < batch.size(); ++b) {
-              dim = std::max(dim, batch[b][i].dim_size(r));
-            }
-            out_shape.AddDim(dim);
-          }
-        }
-        max_shapes.push_back(std::move(out_shape));
-      }
-
-      out_tensors->emplace_back(ctx->allocator({}), DT_VARIANT,
-                                TensorShape({batch_size}));
-      if (!batch.empty()) {
-        auto queues = out_tensors->back().flat<Variant>();
-        Variant& queue_inserter = queues(0);
-        queue_inserter = TensorQueueInserter();
-        queue_inserter.get<TensorQueueInserter>()->set_queue(queue_);
-        for (int b = 1; b < batch.size(); ++b) {
-          // Copy the TensorQueueInserter.  Each copy increments the
-          // Ref on the queue_.
-          queues(b) = queues(0);
-        }
-      }
-
-      for (int i = 0; i < max_shapes.size(); ++i) {
-        out_tensors->emplace_back(ctx->allocator({}), dtypes[i], max_shapes[i]);
-        Tensor& component = out_tensors->back();
-        // Try hard to take the fast path.
-        if (shapes[i].IsFullyDefined() &&
-            shapes[i].IsIdenticalTo(input_shapes[i])) {
-          // Take the fast path if we know all the shapes statically.
-          for (int64 b = 0; b < batch.size(); ++b) {
-            TF_RETURN_IF_ERROR(
-                batch_util::CopyElementToSlice(batch[b][i], &component, b));
-          }
-        } else {
-          TF_RETURN_IF_ERROR(
-              batch_util::SetElementZero(&component, padding_values[i]));
-          for (int64 b = 0; b < batch.size(); ++b) {
-            if (batch[b][i].shape() == max_shapes[i]) {
-              TF_RETURN_IF_ERROR(
-                  batch_util::CopyElementToSlice(batch[b][i], &component, b));
-            } else {
-              TF_RETURN_IF_ERROR(batch_util::CopyElementToLargerSlice(
-                  batch[b][i], &component, b));
-            }
-          }
-        }
-      }
-
-      // end_of_sequence was set before we populated out_tensors, so
-      // it's ok to return now.
-      return Status::OK();
-    }
-
-   protected:
-    // Work around bug in MSVC that disallows access to protected
-    // members of Iterator from within TensorQueue.
-    class TensorQueue;
-    friend class TensorQueue;
-
-    class TensorQueue : public core::RefCounted {
-     public:
-      TensorQueue(std::unique_ptr<IteratorBase> input_impl,
-                  const DataTypeVector& dtypes,
-                  const std::vector<PartialTensorShape>& shapes)
-          : dtypes_(dtypes),
-            shapes_(shapes),
-            input_impl_(std::move(input_impl)) {}
-
-      void MaybeWaitForNotificationLocked(mutex_lock* lock)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-        // This essentially just releases the lock and immediately relocks.
-        cv_.wait_for(*lock, std::chrono::milliseconds(0));
-      }
-
-      void NotifyLocked() EXCLUSIVE_LOCKS_REQUIRED(mu_) { cv_.notify_all(); }
-
-      Status GetNext(IteratorContext* ctx, const int64 batch_size,
-                     std::vector<std::vector<Tensor>>* batch,
-                     bool* end_of_sequence) {
-        mutex_lock lock(mu_);
-
-        *end_of_sequence = false;
-
-        for (int64 b = 0; b < batch_size;) {
-          if (!entries_.empty()) {
-            batch->push_back(std::move(entries_.front()));
-            entries_.pop_front();
-            ++b;
-            continue;
-          } else {
-            if (input_impl_) {
-              // There's still input coming in.
-              std::vector<Tensor> tensors;
-              bool input_end;
-              TF_RETURN_IF_ERROR(
-                  input_impl_->GetNext(ctx, &tensors, &input_end));
-              if (!input_end) {
-                batch->push_back(std::move(tensors));
-                ++b;
-                continue;
-              } else {
-                input_impl_.reset();
-              }
-            }
-            if (!input_impl_) {
-              // There's no more input coming in.
-              if (RefCountIsOne()) {
-                // No TensorQueueInserters in the wild.
-                if (batch->empty()) {
-                  *end_of_sequence = true;
-                }
-                break;
-              } else {
-                MaybeWaitForNotificationLocked(&lock);
-                // If there's data available, try to add entries again.
-                // Otherwise return a smaller batch and hope the next
-                // iterator request has a non-empty or unused queue_.
-                if (entries_.empty()) {
-                  break;
-                }
-              }
-            }
-          }
-        }  // for (int64 b = ... batch_size)
-        return Status::OK();
-      }
-
-      Status Insert(const std::vector<Tensor>& tensors) {
-        if (tensors.size() != dtypes_.size()) {
-          return errors::InvalidArgument(
-              "TensorQueue::Insert: mismatched number of tensors.  Queue "
-              "expects ",
-              dtypes_.size(), " tensors but tried to insert ", tensors.size());
-        }
-        for (int i = 0; i < tensors.size(); ++i) {
-          if (tensors[i].dtype() != dtypes_[i]) {
-            return errors::InvalidArgument(
-                "TensorQueue::Insert: mismatched dtypes at component ", i,
-                ".  Attempted "
-                "to insert tensor of type ",
-                DataTypeString(tensors[i].dtype()),
-                " but queue expected type: ", DataTypeString(dtypes_[i]));
-          }
-          if (!shapes_[i].IsCompatibleWith(tensors[i].shape())) {
-            return errors::InvalidArgument(
-                "TensorQueue::Insert: mismatched shapes at component ", i,
-                ".  Attempted "
-                "to insert tensor with shape ",
-                tensors[i].shape().DebugString(),
-                " but queue expected shape: ", shapes_[i].DebugString());
-          }
-        }
-        mutex_lock lock(mu_);
-        entries_.push_back(tensors);
-        NotifyLocked();
-        return Status::OK();
-      }
-
-      Status Save(Iterator* iter, IteratorStateWriter* writer) {
-        mutex_lock lock(mu_);
-        if (input_impl_) {
-          TF_RETURN_IF_ERROR(iter->SaveInput(writer, input_impl_));
-        } else {
-          TF_RETURN_IF_ERROR(
-              writer->WriteScalar(iter->full_name("input_exhausted"), ""));
-        }
-        TF_RETURN_IF_ERROR(writer->WriteScalar(iter->full_name("entries_size"),
-                                               entries_.size()));
-        for (int64 b = 0; b < entries_.size(); ++b) {
-          for (int i = 0; i < dtypes_.size(); ++i) {
-            TF_RETURN_IF_ERROR(
-                writer->WriteTensor(strings::StrCat(iter->full_name("entries"),
-                                                    "[", b, "][", i, "]"),
-                                    entries_[b][i]));
-          }
-        }
-        return Status::OK();
-      }
-
-      Status Restore(Iterator* iter, IteratorContext* ctx,
-                     IteratorStateReader* reader) {
-        mutex_lock l(mu_);
-        if (reader->Contains(iter->full_name("input_exhausted"))) {
-          input_impl_.reset();
-        } else {
-          TF_RETURN_IF_ERROR(iter->dataset_input()->MakeIterator(
-              ctx, iter->prefix(), &input_impl_));
-          TF_RETURN_IF_ERROR(iter->RestoreInput(ctx, reader, input_impl_));
-        }
-        entries_.clear();
-        int64 entries_size = -1;
-        TF_RETURN_IF_ERROR(
-            reader->ReadScalar(iter->full_name("entries_size"), &entries_size));
-        if (entries_size < 0) {
-          return errors::DataLoss(
-              "Expected entries_size key '", iter->full_name("entries_size"),
-              "' to have nonnegative value, but saw: ", entries_size);
-        }
-        for (int64 b = 0; b < entries_size; ++b) {
-          std::vector<Tensor> entry;
-          for (int i = 0; i < dtypes_.size(); ++i) {
-            Tensor value;
-            TF_RETURN_IF_ERROR(
-                reader->ReadTensor(strings::StrCat(iter->full_name("entries"),
-                                                   "[", b, "][", i, "]"),
-                                   &value));
-            entry.push_back(std::move(value));
-          }
-          entries_.push_back(std::move(entry));
-        }
-        return Status::OK();
-      }
-
-      mutex* mu() { return &mu_; }
-
-     private:
-      DataTypeVector dtypes_;
-      std::vector<PartialTensorShape> shapes_;
-
-      mutex mu_;
-      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
-      std::deque<std::vector<Tensor>> entries_ GUARDED_BY(mu_);
-      condition_variable cv_ GUARDED_BY(mu_);
-    };
-
-    const DatasetBase* dataset_input() const { return dataset()->input_; }
-
-    std::shared_ptr<model::Node> CreateNode(
-        IteratorContext* ctx, model::Node::Args args) const override {
-      return model::MakeKnownRatioNode(std::move(args), dataset()->batch_size_);
-    }
-
-    Status SaveInternal(IteratorStateWriter* writer) override {
-      return queue_->Save(this, writer);
-    }
-
-    Status RestoreInternal(IteratorContext* ctx,
-                           IteratorStateReader* reader) override {
-      return queue_->Restore(this, ctx, reader);
-    }
-
-   public:
-    class TensorQueueInserter {
-     public:
-      TensorQueueInserter() : queue_(nullptr) {}
-
-      void set_queue(TensorQueue* queue) {
-        queue_ = queue;
-        queue_->Ref();
-      }
-
-      TensorQueueInserter(const TensorQueueInserter& rhs) {
-        queue_ = rhs.queue_;
-        queue_->Ref();
-      }
-
-      TensorQueueInserter(TensorQueueInserter&& rhs) {
-        queue_ = rhs.queue_;
-        rhs.queue_ = nullptr;
-      }
-
-      TensorQueueInserter& operator=(const TensorQueueInserter& rhs) = delete;
-
-      string TypeName() const { return "tensorflow::TensorQueueInserter"; }
-      string DebugString() const { return TypeName(); }
-
-      void Encode(VariantTensorData*) const {}
-      bool Decode(const VariantTensorData&) { return false; }
-
-      ~TensorQueueInserter() {
-        if (queue_) {
-          mutex_lock lock(*queue_->mu());
-          queue_->Unref();
-          queue_->NotifyLocked();
-          queue_ = nullptr;
-        }
-      }
-
-      Status Insert(const std::vector<Tensor>& tensors) const {
-        CHECK(queue_);
-        return queue_->Insert(tensors);
-      }
-
-     private:
-      mutable TensorQueue* queue_;
-    };
-
-   private:
-    TensorQueue* queue_;
-  };
-
- private:
-  const int64 batch_size_;
-  const DatasetBase* input_;
-  const DataTypeVector dtypes_;
-  const std::vector<PartialTensorShape> shapes_;
-  const std::vector<Tensor> padding_values_;
-  const DataTypeVector dtypes_with_queue_;
-  const std::vector<PartialTensorShape> batched_shapes_with_queue_;
-};
-
-class PrependFromQueueAndPaddedBatchDatasetOp : public UnaryDatasetOpKernel {
- public:
-  explicit PrependFromQueueAndPaddedBatchDatasetOp(OpKernelConstruction* ctx)
-      : UnaryDatasetOpKernel(ctx) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("Toutput_types", &output_types_));
-  }
-
-  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
-                   DatasetBase** output) override {
-    int64 batch_size = 0;
-    OP_REQUIRES_OK(ctx,
-                   ParseScalarArgument<int64>(ctx, "batch_size", &batch_size));
-    OP_REQUIRES(
-        ctx, batch_size > 0,
-        errors::InvalidArgument("Batch size must be greater than zero."));
-
-    OpInputList padded_shape_tensors;
-    OP_REQUIRES_OK(ctx,
-                   ctx->input_list("padded_shapes", &padded_shape_tensors));
-    std::vector<PartialTensorShape> padded_shapes;
-    padded_shapes.reserve(padded_shape_tensors.size());
-    OP_REQUIRES(ctx,
-                padded_shape_tensors.size() == input->output_shapes().size(),
-                errors::InvalidArgument("Number of padded shapes (",
-                                        padded_shape_tensors.size(),
-                                        ") must match the number of components "
-                                        "in the input dataset's elements (",
-                                        input->output_shapes().size(), ")"));
-    for (const Tensor& padded_shape_t : padded_shape_tensors) {
-      OP_REQUIRES(ctx, TensorShapeUtils::IsVector(padded_shape_t.shape()),
-                  errors::InvalidArgument("All padded shapes must be vectors"));
-      PartialTensorShape padded_shape;
-      OP_REQUIRES_OK(ctx, PartialTensorShape::MakePartialShape(
-                              padded_shape_t.vec<int64>().data(),
-                              padded_shape_t.NumElements(), &padded_shape));
-      padded_shapes.push_back(std::move(padded_shape));
-    }
-
-    OP_REQUIRES(
-        ctx, input->output_dtypes() == output_types_,
-        errors::InvalidArgument("Input dataset and this dataset "
-                                "have different output_types: ",
-                                DataTypeVectorString(input->output_dtypes()),
-                                " and ", DataTypeVectorString(output_types_)));
-
-    for (int i = 0; i < input->output_shapes().size(); ++i) {
-      // Exclude the queue from the tensor_shapes calculation.
-      const PartialTensorShape& tensor_shape = padded_shapes[i];
-      OP_REQUIRES(
-          ctx,
-          IsGreaterEqualToOrCompatibleWith(tensor_shape,
-                                           input->output_shapes()[i]),
-          errors::InvalidArgument("Incompatible input shapes at component ", i,
-                                  " between input dataset this dataset: ",
-                                  input->output_shapes()[i].DebugString(),
-                                  " vs. ", tensor_shape.DebugString()));
-    }
-
-    OpInputList padding_values_list;
-    OP_REQUIRES_OK(ctx,
-                   ctx->input_list("padding_values", &padding_values_list));
-    std::vector<Tensor> padding_values;
-    OP_REQUIRES(ctx,
-                padding_values_list.size() == input->output_shapes().size(),
-                errors::InvalidArgument(
-                    "Number of padding values (", padding_values_list.size(),
-                    ") must match the number of components in the input "
-                    "dataset's elements (",
-                    input->output_shapes().size(), ")"));
-    for (int i = 0; i < padding_values_list.size(); ++i) {
-      const Tensor& padding_value_t = padding_values_list[i];
-      OP_REQUIRES(
-          ctx, TensorShapeUtils::IsScalar(padding_value_t.shape()),
-          errors::InvalidArgument(
-              "All padding values must be scalars; but at component ", i,
-              " saw shape: ", padding_value_t.shape().DebugString()));
-      OP_REQUIRES(ctx, padding_value_t.dtype() == input->output_dtypes()[i],
-                  errors::InvalidArgument(
-                      "Mismatched type between padding value ", i,
-                      " and input dataset's component ", i, ": ",
-                      DataTypeString(padding_value_t.dtype()), " vs. ",
-                      DataTypeString(input->output_dtypes()[i])));
-      padding_values.push_back(padding_value_t);
-    }
-
-    *output = new PrependFromQueueAndPaddedBatchDataset(
-        ctx, batch_size, input, output_types_, padded_shapes,
-        std::move(padding_values));
-  }
-
- private:
-  DataTypeVector output_types_;
-};
-
-REGISTER_KERNEL_BUILDER(
-    Name("PrependFromQueueAndPaddedBatchDataset").Device(DEVICE_CPU),
-    PrependFromQueueAndPaddedBatchDatasetOp);
-
-class EnqueueInQueueDatasetOp : public OpKernel {
- public:
-  explicit EnqueueInQueueDatasetOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
-  void Compute(OpKernelContext* ctx) override {
-    using TensorQueueInserter =
-        PrependFromQueueAndPaddedBatchDataset::Iterator::TensorQueueInserter;
-
-    // TODO(ebrevdo): accept list of sequence lengths to do proper
-    // sub-slicing of tensors for placement into the queue?
-    const Tensor& tensor_queue_t = ctx->input(0);
-    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(tensor_queue_t.shape()),
-                errors::InvalidArgument("queue must be a vector, saw shape: ",
-                                        tensor_queue_t.shape().DebugString()));
-    std::vector<const TensorQueueInserter*> inserters;
-    const int64 batch_size = tensor_queue_t.NumElements();
-    inserters.reserve(batch_size);
-    const Variant* variants = tensor_queue_t.flat<Variant>().data();
-    for (int i = 0; i < batch_size; ++i) {
-      const auto* inserter = variants[i].get<TensorQueueInserter>();
-      OP_REQUIRES(ctx, inserter != nullptr,
-                  errors::InvalidArgument(
-                      "Could not access TensorQueueInserter from queue[", i,
-                      "].  Received variant: ", variants[i].DebugString()));
-      inserters.push_back(inserter);
-    }
-
-    OpInputList components;
-    OP_REQUIRES_OK(ctx, ctx->input_list("components", &components));
-    for (int i = 0; i < components.size(); ++i) {
-      OP_REQUIRES(
-          ctx,
-          components[i].dims() > 0 && components[i].dim_size(0) == batch_size,
-          errors::InvalidArgument(
-              "Expected component ", i, " to have batched shape [", batch_size,
-              ",...], but saw shape: ", components[i].shape().DebugString()));
-    }
-    std::vector<TensorShape> element_shapes;
-    for (int i = 0; i < components.size(); ++i) {
-      TensorShape element_shape = components[i].shape();
-      element_shape.RemoveDim(0);
-      element_shapes.push_back(std::move(element_shape));
-    }
-    for (int64 b = 0; b < batch_size; ++b) {
-      std::vector<Tensor> tensors;
-      tensors.reserve(components.size());
-      for (int i = 0; i < components.size(); ++i) {
-        Tensor t(components[i].dtype(), element_shapes[i]);
-        OP_REQUIRES_OK(ctx,
-                       batch_util::CopySliceToElement(components[i], &t, b));
-        tensors.push_back(std::move(t));
-      }
-      // TODO(ebrevdo): Acquire the lock once for all inserters with
-      // the same underlying queue?  Add InsertLocked?
-      OP_REQUIRES_OK(ctx, inserters[b]->Insert(tensors));
-    }
-  }
-};
-
-REGISTER_KERNEL_BUILDER(Name("EnqueueInQueueDataset").Device(DEVICE_CPU),
-                        EnqueueInQueueDatasetOp);
-
-}  // namespace
-}  // namespace data
-}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc b/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
index 6291bfc110bafe028114b8f9ed010fdd2f97f1cd..4ba2bde718a6351ff13bc17cf14ae5c60332c6ca 100644
--- a/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
+++ b/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
@@ -84,6 +84,8 @@ class TensorSliceDatasetOp : public DatasetOpKernel {
       return "TensorSliceDatasetOp::Dataset";
     }
 
+    int64 Cardinality() const override { return tensors_[0].dim_size(0); }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
diff --git a/tensorflow/core/kernels/data/window_dataset.cc b/tensorflow/core/kernels/data/window_dataset.cc
index 2ad4711aabe40bc6af771396c40006670eaf6b9b..c295631550aa008ccbf1abee0a91b27d64a6ba35 100644
--- a/tensorflow/core/kernels/data/window_dataset.cc
+++ b/tensorflow/core/kernels/data/window_dataset.cc
@@ -41,6 +41,16 @@ class WindowDataset : public DatasetBase {
     return output_shapes_;
   }
 
+  int64 AllocatedBytes() const override {
+    int64 allocated_bytes = 0;
+    for (auto& element : elements_) {
+      allocated_bytes += GetAllocatedBytes(element);
+    }
+    return allocated_bytes;
+  }
+
+  int64 Cardinality() const override { return elements_.size(); }
+
   string DebugString() const override { return "WindowDataset"; }
 
  protected:
diff --git a/tensorflow/core/kernels/data/window_dataset_op.cc b/tensorflow/core/kernels/data/window_dataset_op.cc
index 2c68e1ee05b542663e85839444560bdd8085393a..ae13ae5da8d4c093bdb4d6e168584bda234e4502 100644
--- a/tensorflow/core/kernels/data/window_dataset_op.cc
+++ b/tensorflow/core/kernels/data/window_dataset_op.cc
@@ -98,6 +98,15 @@ class WindowDatasetOp : public UnaryDatasetOpKernel {
                              window_stride_, drop_remainder_, ")::Dataset");
     }
 
+    int64 Cardinality() const override {
+      int64 n = input_->Cardinality();
+      if (n == kInfiniteCardinality || n == kUnknownCardinality) {
+        return n;
+      }
+      return n / window_shift_ +
+             (n % window_shift_ == 0 || drop_remainder_ ? 0 : 1);
+    }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -155,6 +164,7 @@ class WindowDatasetOp : public UnaryDatasetOpKernel {
               Status status =
                   input_impl_->GetNext(ctx, &element, end_of_sequence);
               if (!*end_of_sequence) {
+                RecordBufferEnqueue(ctx, element);
                 buffer_.emplace_back(std::move(element), status);
               } else {
                 input_impl_.reset();
@@ -192,8 +202,14 @@ class WindowDatasetOp : public UnaryDatasetOpKernel {
                 input_impl_.reset();
               }
             }
+            for (size_t i = 0; i < buffer_.size(); ++i) {
+              RecordBufferDequeue(ctx, buffer_.at(i).result);
+            }
             buffer_.clear();
           } else {
+            for (size_t i = 0; i < window_shift; ++i) {
+              RecordBufferDequeue(ctx, buffer_.at(i).result);
+            }
             buffer_.erase(buffer_.begin(), buffer_.begin() + window_shift);
           }
         }
diff --git a/tensorflow/core/kernels/data/zip_dataset_op.cc b/tensorflow/core/kernels/data/zip_dataset_op.cc
index 6e94d77867168df3aeaeae19b310ef93b0f654f5..1760e63a9e1c6b6262c19baa8354052d7d73fd3c 100644
--- a/tensorflow/core/kernels/data/zip_dataset_op.cc
+++ b/tensorflow/core/kernels/data/zip_dataset_op.cc
@@ -76,6 +76,21 @@ class ZipDatasetOp : public DatasetOpKernel {
 
     string DebugString() const override { return "ZipDatasetOp::Dataset"; }
 
+    int64 Cardinality() const override {
+      int64 result = kInfiniteCardinality;
+      for (const auto& input : inputs_) {
+        int64 n = input->Cardinality();
+        if (n == kUnknownCardinality) {
+          return kUnknownCardinality;
+        }
+        if (n != kInfiniteCardinality &&
+            (result == kInfiniteCardinality || n < result)) {
+          result = n;
+        }
+      }
+      return result;
+    }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
diff --git a/tensorflow/core/kernels/eigen_contraction_kernel.cc b/tensorflow/core/kernels/eigen_contraction_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..da42001781757e200d90108182905cb6b65ec0e3
--- /dev/null
+++ b/tensorflow/core/kernels/eigen_contraction_kernel.cc
@@ -0,0 +1,55 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/eigen_contraction_kernel.h"
+
+#include <mutex>  // NOLINT(build/c++11)
+
+// We need a pair of compile time and runtime flags to disable compilation of
+// custom contraction kernels for unsupported architectures (e.g. Android,
+// iOS, ARM and PPC CPUs, etc...), and to be able to fallback on default Eigen
+// matrix multiplication at runtime.
+//
+// It's not allowed to use absl flags library in Tensorflow, so we have to pass
+// the configuration through the environment variable.
+//
+// Example:
+//   bazel test --test_env=TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL=false //test
+
+#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
+
+namespace Eigen {
+namespace internal {
+
+// TODO(ezhulenev): This is a temporary workaround for disabling custom kernels
+// at runtime in tests. We should always rely on compile time flags for that.
+// Example: ... --test_env=TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL=false //test
+bool UseCustomContractionKernels() {
+  static bool use_custom_contraction_kernel = true;
+
+  static std::once_flag initialized;
+  std::call_once(initialized, [&] {
+    char* flag = std::getenv("TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL");
+    if (flag && (strcmp(flag, "false") == 0 || strcmp(flag, "0") == 0)) {
+      use_custom_contraction_kernel = false;
+    }
+  });
+
+  return use_custom_contraction_kernel;
+}
+
+}  // namespace internal
+}  // namespace Eigen
+#endif
diff --git a/tensorflow/core/kernels/eigen_contraction_kernel.h b/tensorflow/core/kernels/eigen_contraction_kernel.h
index 92d29e39958e3cd30ee80776f2abb5c67f1a07e2..3d8e52ca0e49828b54604f7c5107f5dfd05d6891 100644
--- a/tensorflow/core/kernels/eigen_contraction_kernel.h
+++ b/tensorflow/core/kernels/eigen_contraction_kernel.h
@@ -33,11 +33,20 @@ limitations under the License.
 //   #endif
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "third_party/intel_mkl_dnn/include/mkldnn.h"
+
+#if defined(TENSORFLOW_USE_MKLDNN_CONTRACTION_KERNEL)
+#include "mkldnn.h"
+#endif
 
 namespace Eigen {
 namespace internal {
 
+#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
+// Returns `true` iff we can use custom contraction kernels. This is a runtime
+// check, that uses environment variables.
+bool UseCustomContractionKernels();
+#endif  // TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL
+
 // Enabled by build option: "--define tensorflow_mkldnn_contraction_kernel=1"
 #if defined(TENSORFLOW_USE_MKLDNN_CONTRACTION_KERNEL)
 
@@ -126,6 +135,11 @@ struct mkldnn_gemm_kernel</*Scalar*/ float, IndexType, OutputMapper,
                                       &alpha, blockA, &ldA, blockB, &ldB, &beta,
                                       const_cast<float*>(output.data()), &ldC);
     eigen_assert(st == 0);
+
+    // eigen_assert is a no-op in optimized mode so we add these to avoid
+    // compiler's unused-variable errors.
+    EIGEN_UNUSED_VARIABLE(max_index);
+    EIGEN_UNUSED_VARIABLE(st);
   }
 };
 
@@ -143,8 +157,8 @@ class TensorContractionBlocking<float, float, float, StorageIndex,
   // Multiply default choice of block size along M and N dimensions.
   // TODO(ezhulenev): Explore if this can work in general (kScaleM=2.0 worked
   // well in some of models).
-  static const float kScaleM = 1.5;
-  static const float kScaleN = 1.0;
+  static constexpr float kScaleM = 1.5;
+  static constexpr float kScaleN = 1.0;
 
   // Mkldnn Avx/Avx2/Avx512 unroll factors are: 8/16/48.
   static const StorageIndex kUnrollM = 48;
@@ -165,6 +179,10 @@ class TensorContractionBlocking<float, float, float, StorageIndex,
                                                      num_threads);
     }
 
+    // If we are using default Eigen gebp kernel there is no need to adjust the
+    // block sizes for MKL-DNN.
+    if (!UseCustomContractionKernels()) return;
+
     // 2. And refine them to work well with mkldnn sgemm.
     mc_ = (std::min)(
         m, Eigen::divup(static_cast<StorageIndex>(mc_ * kScaleM), kUnrollM) *
@@ -206,23 +224,52 @@ struct TensorContractionKernel<float, float, float, StorageIndex, OutputMapper,
                                      typename RhsMapper::SubMapper, ColMajor>;
   using GemmKernel = mkldnn_gemm_kernel<Scalar, StorageIndex, OutputMapper>;
 
+  // Fallback on default Eigen pack and GEBP kernel if custom contraction
+  // kernels disabled at runtime.
+  using EigenLhsPacker =
+      gemm_pack_lhs<Scalar, StorageIndex, typename LhsMapper::SubMapper,
+                    Traits::mr, Traits::LhsProgress,
+                    typename Traits::LhsPacket4Packing, ColMajor>;
+  using EigenRhsPacker =
+      gemm_pack_rhs<Scalar, StorageIndex, typename RhsMapper::SubMapper,
+                    Traits::nr, ColMajor>;
+  using GebpKernel =
+      gebp_kernel<Scalar, Scalar, StorageIndex, OutputMapper, Traits::mr,
+                  Traits::nr,
+                  /*ConjugateLhs*/ false, /*ConjugateRhs*/ false>;
+
   EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void packLhs(
       Scalar* lhsBlock, const typename LhsMapper::SubMapper& data_mapper,
       const StorageIndex depth, const StorageIndex rows) {
-    LhsPacker()(lhsBlock, data_mapper, rows, depth);
+    if (UseCustomContractionKernels()) {
+      LhsPacker()(lhsBlock, data_mapper, rows, depth);
+    } else {
+      EigenLhsPacker()(lhsBlock, data_mapper, depth, rows, /*stride*/ 0,
+                       /*offset*/ 0);
+    }
   }
 
   EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void packRhs(
       Scalar* rhsBlock, const typename RhsMapper::SubMapper& data_mapper,
       const StorageIndex depth, const StorageIndex cols) {
-    RhsPacker()(rhsBlock, data_mapper, depth, cols);
+    if (UseCustomContractionKernels()) {
+      RhsPacker()(rhsBlock, data_mapper, depth, cols);
+    } else {
+      EigenRhsPacker()(rhsBlock, data_mapper, depth, cols);
+    }
   }
 
   EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void invoke(
       const OutputMapper& output_mapper, const Scalar* lhsBlock,
       const Scalar* rhsBlock, const StorageIndex rows, const StorageIndex depth,
       const StorageIndex cols, const Scalar alpha) {
-    GemmKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha);
+    if (UseCustomContractionKernels()) {
+      GemmKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha);
+    } else {
+      GebpKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha,
+                   /*strideA*/ -1, /*strideB*/ -1,
+                   /*offsetA*/ 0, /*offsetB*/ 0);
+    }
   }
 };
 
diff --git a/tensorflow/core/kernels/eigen_spatial_convolutions.h b/tensorflow/core/kernels/eigen_spatial_convolutions.h
index 25c735d080e1cef54b7c8cd87d25eb31612192b3..86d8c98ee65aebb2927b338dfb236f470a3a1d39 100644
--- a/tensorflow/core/kernels/eigen_spatial_convolutions.h
+++ b/tensorflow/core/kernels/eigen_spatial_convolutions.h
@@ -871,11 +871,9 @@ struct gemm_pack_rhs<
             const bool pad_col2 = dm2.padCol(c);
             const bool pad_col3 = dm3.padCol(c);
 
-            // We can squeeze reads along the `row` and `depth` dimensions if
-            // the row stride is `1`, which means that `row` and `depth`
-            // dimensions are contiguous (two innermost dimensions).
-            if (rhs.rowStride() == 1 &&                                //
-                !pad_col0 && !pad_col1 && !pad_col2 && !pad_col3 &&    //
+            // Check if we can squeeze reads along the `row` and `depth`
+            // dimensions (two innermost dimensions).
+            if (!pad_col0 && !pad_col1 && !pad_col2 && !pad_col3 &&    //
                 !dm0.padRow(start_row) && !dm0.padRow(max_row - 1) &&  //
                 !dm1.padRow(start_row) && !dm1.padRow(max_row - 1) &&  //
                 !dm2.padRow(start_row) && !dm2.padRow(max_row - 1) &&  //
diff --git a/tensorflow/core/kernels/function_ops.cc b/tensorflow/core/kernels/function_ops.cc
index cca3cfbd7c0bc4729016c54bf1c9b417f9d4c28a..88a8a523e4780045c81f495959b157e44fe709dc 100644
--- a/tensorflow/core/kernels/function_ops.cc
+++ b/tensorflow/core/kernels/function_ops.cc
@@ -122,11 +122,15 @@ REGISTER_KERNEL_BUILDER(Name(kArgOp)
                             .TypeConstraint<string>("T"),
                         ArgOp);
 
+REGISTER_KERNEL_BUILDER(
+    Name(kArgOp).Device(DEVICE_GPU).TypeConstraint<Variant>("T"), ArgOp);
+
 #define REGISTER(type)     \
   REGISTER_KERNEL_BUILDER( \
       Name(kRetOp).Device(DEVICE_GPU).TypeConstraint<type>("T"), RetvalOp);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER)
 TF_CALL_QUANTIZED_TYPES(REGISTER)
+REGISTER(Variant)
 TF_CALL_bool(REGISTER) REGISTER_KERNEL_BUILDER(Name(kRetOp)
                                                    .Device(DEVICE_GPU)
                                                    .HostMemory("input")
@@ -134,6 +138,7 @@ TF_CALL_bool(REGISTER) REGISTER_KERNEL_BUILDER(Name(kRetOp)
                                                RetvalOp);
 REGISTER_KERNEL_BUILDER(
     Name(kDeviceRetOp).Device(DEVICE_GPU).TypeConstraint<int32>("T"), RetvalOp);
+
 REGISTER_KERNEL_BUILDER(Name(kRetOp)
                             .Device(DEVICE_GPU)
                             .TypeConstraint<ResourceHandle>("T")
diff --git a/tensorflow/core/kernels/fuzzing/BUILD b/tensorflow/core/kernels/fuzzing/BUILD
index 72088e0de24192c7a41780a51367288e97d76c8c..7300f7a4e249dd436fad9c1cdd3463e5bc73cbdc 100644
--- a/tensorflow/core/kernels/fuzzing/BUILD
+++ b/tensorflow/core/kernels/fuzzing/BUILD
@@ -64,3 +64,11 @@ tf_ops_fuzz_target_lib("parse_tensor_op")
 tf_ops_fuzz_target_lib("decode_compressed")
 
 tf_ops_fuzz_target_lib("decode_json_example")
+
+tf_oss_fuzz_corpus("decode_json_example")
+
+tf_oss_fuzz_dict("decode_json_example")
+
+tf_ops_fuzz_target_lib("check_numerics")
+
+tf_ops_fuzz_target_lib("one_hot")
diff --git a/tensorflow/core/kernels/fuzzing/check_numerics_fuzz.cc b/tensorflow/core/kernels/fuzzing/check_numerics_fuzz.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2258a094d973e8e10f9ce6d1868d6b9913c41a17
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/check_numerics_fuzz.cc
@@ -0,0 +1,50 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/cc/ops/array_ops.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
+
+namespace tensorflow {
+namespace fuzzing {
+
+class FuzzCheckNumerics : public FuzzSession {
+  void BuildGraph(const Scope& scope) override {
+    auto input =
+        tensorflow::ops::Placeholder(scope.WithOpName("input"), DT_FLOAT);
+    auto prefix = "Error: ";
+    (void)tensorflow::ops::CheckNumerics(scope.WithOpName("output"), input,
+                                         prefix);
+  }
+
+  void FuzzImpl(const uint8_t* data, size_t size) override {
+    size_t ratio = sizeof(float) / sizeof(uint8_t);
+    size_t num_floats = size / ratio;
+    const float* float_data = reinterpret_cast<const float*>(data);
+
+    Tensor input_tensor(tensorflow::DT_FLOAT,
+                        TensorShape({static_cast<int64>(num_floats)}));
+    auto flat_tensor = input_tensor.flat<float>();
+    for (size_t i = 0; i < num_floats; i++) {
+      flat_tensor(i) = float_data[i];
+    }
+    RunInputs({{"input", input_tensor}});
+  }
+};
+
+STANDARD_TF_FUZZ_FUNCTION(FuzzCheckNumerics);
+
+}  // end namespace fuzzing
+}  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/013a29ea098a178f8a36741c9fd91144 b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/013a29ea098a178f8a36741c9fd91144
new file mode 100644
index 0000000000000000000000000000000000000000..06fd8044808ff9cae8663cec970645bd22bf8ab8
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/013a29ea098a178f8a36741c9fd91144
@@ -0,0 +1,48 @@
+{
+  features: {
+    feature: {
+      age: {
+        float_list: {
+          value: 29.0
+        }
+      }
+    },
+    feature: {
+      movie: {
+        bytes_list: {
+          value: "VGhlIFNoYXdzaGFuayBSZWRlbXB0aW9u",
+          value: "RmlnaHQgQ2x1Yg=="
+        }
+      }
+    },
+    feature: {
+      movie_ratings: {
+        float_list: {
+          value: 9.0,
+          value: 9.7
+        }
+      }
+    },
+    feature: {
+      suggestion: {
+        bytes_list: {
+          value: "SW5jZXB0aW9u"
+        }
+      }
+    },
+    feature: {
+      suggestion_purchased: {
+        float_list: {
+          value: 1.0
+        }
+      }
+    },
+    feature: {
+      purchase_price: {
+        float_list: {
+          value: 9.99
+        }
+      }
+    }
+  }
+}
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/0875575fb76d630ccb19c5da8aab66b2 b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/0875575fb76d630ccb19c5da8aab66b2
new file mode 100644
index 0000000000000000000000000000000000000000..4ae686974e2be25e49e3a25064dcfdfb91a41b5b
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/0875575fb76d630ccb19c5da8aab66b2
@@ -0,0 +1 @@
+{features:{feature:{age:{float_list:{value:[29.0,2,3,4]}}},feature:{movie_ratings:{float_list:{value:[9.0,9.7]}}},feature:{suggestion_purchased:{float_list:{value:[1.0,2,3,4,5]}}},feature:{purchase_price:{float_list:{value:[9.99,8.88,7.77,6.66,5.55],value:[4.44,3.33,2.22,1.11],value:[1.11,2.22,3.33],value:[4.44,5.55],value:0}}}}}
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/7e7f58fc443a11a0a2c5d9b643b7e99b b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/7e7f58fc443a11a0a2c5d9b643b7e99b
new file mode 100644
index 0000000000000000000000000000000000000000..150f8710f7dc094ad1189f1d3c659910d2e1b3e2
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/7e7f58fc443a11a0a2c5d9b643b7e99b
@@ -0,0 +1 @@
+{features:{feature:{age:{float_list:{value:29.0}}},feature:{movie_ratings:{float_list:{value:[9.0,9.7]}}},feature:{suggestion_purchased:{float_list:{value:1.0}}},feature:{purchase_price:{float_list:{value:9.99}}}}}
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/849a23936269a261c0370b5e9abe2416 b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/849a23936269a261c0370b5e9abe2416
new file mode 100644
index 0000000000000000000000000000000000000000..fcfdfedd1b090871954e1d9b99d90480f6082dae
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/849a23936269a261c0370b5e9abe2416
@@ -0,0 +1 @@
+{features:{feature:{age:{float_list:{value:29.0}}},feature:{movie_ratings:{float_list:{value:[[[[[[9.0,9.7]]]]]],value:[[[9.0,-9.2]]]}}},feature:{suggestion_purchased:{float_list:{value:[1.0,[2,3,[4,5,6,[7,8,9,0]]]]}}},feature:{purchase_price:{float_list:{value:9.99}}}}}
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/85282c1696d98b9843ce3e8bd1cd899f b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/85282c1696d98b9843ce3e8bd1cd899f
new file mode 100644
index 0000000000000000000000000000000000000000..7c9981d482fcf5a2a138cc2583ea0dca9589e756
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/85282c1696d98b9843ce3e8bd1cd899f
@@ -0,0 +1 @@
+{features:{feature:{age:{float_list:{value:29.0}}},feature:{movie_ratings:{float_list:{value:9.0,value:9.7}}},feature:{suggestion_purchased:{float_list:{value:1.0}}},feature:{purchase_price:{float_list:{value:9.99}}}}}
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/90388b9c8093d8adedad0644b618da87 b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/90388b9c8093d8adedad0644b618da87
new file mode 100644
index 0000000000000000000000000000000000000000..a1315bb8f9363858c6d79066cac3e93dc40f1602
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/90388b9c8093d8adedad0644b618da87
@@ -0,0 +1,33 @@
+{
+  features: {
+    feature: {
+      age: {
+        float_list: {
+          value: 29.0
+        }
+      }
+    },
+    feature: {
+      movie_ratings: {
+        float_list: {
+          value: [[[[[[9.0,9.7]]]]]],
+          value: [[[9.0, -9.2]]]
+        }
+      }
+    },
+    feature: {
+      suggestion_purchased: {
+        float_list: {
+          value: [1.0, [2, 3, [4, 5, 6, [7, 8, 9, 0]]]]
+        }
+      }
+    },
+    feature: {
+      purchase_price: {
+        float_list: {
+          value: 9.99
+        }
+      }
+    }
+  }
+}
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/9fa2f86ea6d3ade36e961247c3026f8d b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/9fa2f86ea6d3ade36e961247c3026f8d
new file mode 100644
index 0000000000000000000000000000000000000000..d4f9494bbd3f945ed6926f8669c9fab62ae3ede6
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/9fa2f86ea6d3ade36e961247c3026f8d
@@ -0,0 +1,33 @@
+{
+  features: {
+    feature: {
+      age: {
+        float_list: {
+          value: 29.0
+        }
+      }
+    },
+    feature: {
+      movie_ratings: {
+        float_list: {
+          value: 9.0,
+          value: 9.7
+        }
+      }
+    },
+    feature: {
+      suggestion_purchased: {
+        float_list: {
+          value: 1.0
+        }
+      }
+    },
+    feature: {
+      purchase_price: {
+        float_list: {
+          value: 9.99
+        }
+      }
+    }
+  }
+}
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/c4f18ca60a84e9869a28faf6f65dc758 b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/c4f18ca60a84e9869a28faf6f65dc758
new file mode 100644
index 0000000000000000000000000000000000000000..e8ba267eb27b84fb427f33dea60623b8dace79cf
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/c4f18ca60a84e9869a28faf6f65dc758
@@ -0,0 +1,32 @@
+{
+  features: {
+    feature: {
+      age: {
+        float_list: {
+          value: 29.0
+        }
+      }
+    },
+    feature: {
+      movie_ratings: {
+        float_list: {
+          value: [9.0,9.7]
+        }
+      }
+    },
+    feature: {
+      suggestion_purchased: {
+        float_list: {
+          value: 1.0
+        }
+      }
+    },
+    feature: {
+      purchase_price: {
+        float_list: {
+          value: 9.99
+        }
+      }
+    }
+  }
+}
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/d456ee029700adef5d28438593010223 b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/d456ee029700adef5d28438593010223
new file mode 100644
index 0000000000000000000000000000000000000000..3428a1e0fcd730a5e0bce03f0dfd1d5fec90ea74
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/d456ee029700adef5d28438593010223
@@ -0,0 +1 @@
+{features:{feature:{age:{float_list:{value:29.0}}},feature:{movie:{bytes_list:{value:"VGhlIFNoYXdzaGFuayBSZWRlbXB0aW9u",value:"RmlnaHQgQ2x1Yg=="}}},feature:{movie_ratings:{float_list:{value:9.0,value:9.7}}},feature:{suggestion:{bytes_list:{value:"SW5jZXB0aW9u"}}},feature:{suggestion_purchased:{float_list:{value:1.0}}},feature:{purchase_price:{float_list:{value:9.99}}}}}
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/e9f0ff6ee8d691ae69d2ecb4710030a2 b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/e9f0ff6ee8d691ae69d2ecb4710030a2
new file mode 100644
index 0000000000000000000000000000000000000000..ef0923c4500ecc3c6e8f01a87d1109066a752f48
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/e9f0ff6ee8d691ae69d2ecb4710030a2
@@ -0,0 +1,36 @@
+{
+  features: {
+    feature: {
+      age: {
+        float_list: {
+          value: [29.0, 2, 3, 4]
+        }
+      }
+    },
+    feature: {
+      movie_ratings: {
+        float_list: {
+          value: [9.0,9.7]
+        }
+      }
+    },
+    feature: {
+      suggestion_purchased: {
+        float_list: {
+          value: [1.0, 2, 3, 4, 5]
+        }
+      }
+    },
+    feature: {
+      purchase_price: {
+        float_list: {
+          value: [9.99, 8.88, 7.77, 6.66, 5.55],
+          value: [4.44, 3.33, 2.22, 1.11],
+          value: [1.11, 2.22, 3.33],
+          value: [4.44, 5.55],
+          value: 0
+        }
+      }
+    }
+  }
+}
diff --git a/tensorflow/core/kernels/fuzzing/decode_compressed_fuzz.cc b/tensorflow/core/kernels/fuzzing/decode_compressed_fuzz.cc
index 0a56f4b63f4574d3a6fc62a5d770915255b93bf3..b9fc014b868801fd0fe7299802bbc72cfa141102 100644
--- a/tensorflow/core/kernels/fuzzing/decode_compressed_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/decode_compressed_fuzz.cc
@@ -22,7 +22,7 @@ namespace fuzzing {
 class FuzzDecodeCompressed : public FuzzStringInputOp {
   void BuildGraph(const Scope& scope) override {
     auto input =
-        tensorflow::ops::Placeholder(scope.WithOpName("input1"), DT_STRING);
+        tensorflow::ops::Placeholder(scope.WithOpName("input"), DT_STRING);
     auto d1 = tensorflow::ops::DecodeCompressed(
         scope.WithOpName("d1"), input,
         tensorflow::ops::DecodeCompressed::CompressionType(""));
diff --git a/tensorflow/core/kernels/fuzzing/dictionaries/decode_json_example.dict b/tensorflow/core/kernels/fuzzing/dictionaries/decode_json_example.dict
new file mode 100644
index 0000000000000000000000000000000000000000..5fe4ca23d1f9403b6ac7fc3084c9165b55391caf
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/dictionaries/decode_json_example.dict
@@ -0,0 +1,6 @@
+"features"
+"feature"
+"bytes_list"
+"float_list"
+"int64_list"
+"value"
diff --git a/tensorflow/core/kernels/fuzzing/encode_jpeg_fuzz.cc b/tensorflow/core/kernels/fuzzing/encode_jpeg_fuzz.cc
index f5dd47a052cd098937d66394ed04c66831ee5972..09d196147c86556a3277c96dcf1a3677acb5fca0 100644
--- a/tensorflow/core/kernels/fuzzing/encode_jpeg_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/encode_jpeg_fuzz.cc
@@ -52,8 +52,7 @@ class FuzzEncodeJpeg : public FuzzSession {
     for (size_t i = 0; i < actual_pixels; i++) {
       flat_tensor(i) = data[i];
     }
-    // TODO(b/32704451): Don't just ignore the ::tensorflow::Status object!
-    RunOneInput(input_tensor).IgnoreError();
+    RunInputs({{"input", input_tensor}});
   }
 };
 
diff --git a/tensorflow/core/kernels/fuzzing/example_proto_fast_parsing_fuzz.cc b/tensorflow/core/kernels/fuzzing/example_proto_fast_parsing_fuzz.cc
index 5b029bf5ec0f20bb160ff7d0091d6a7fd3a627ed..f72dfb39b31ef058e85e6c8e7e71de22d5e288c9 100644
--- a/tensorflow/core/kernels/fuzzing/example_proto_fast_parsing_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/example_proto_fast_parsing_fuzz.cc
@@ -30,7 +30,7 @@ class FuzzExampleProtoFastParsing : public FuzzSession {
   void BuildGraph(const Scope& scope) final {
     using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
     // The serialized proto.
-    auto input = Placeholder(scope.WithOpName("input1"), DT_STRING);
+    auto input = Placeholder(scope.WithOpName("input"), DT_STRING);
 
     auto in_expanded = ExpandDims(scope, input, Const<int>(scope, 0));
 
@@ -53,8 +53,7 @@ class FuzzExampleProtoFastParsing : public FuzzSession {
     Tensor input_tensor(tensorflow::DT_STRING, TensorShape({}));
     input_tensor.scalar<string>()() =
         string(reinterpret_cast<const char*>(data), size);
-    // TODO(b/32704451): Don't just ignore the ::tensorflow::Status object!
-    RunOneInput(input_tensor).IgnoreError();
+    RunInputs({{"input", input_tensor}});
   }
 };
 
diff --git a/tensorflow/core/kernels/fuzzing/fuzz_session.h b/tensorflow/core/kernels/fuzzing/fuzz_session.h
index 57d562ddf43142e47e5d52e4c0dfbbcbbb4bdfe0..4b036b181de127ca996251b538b983971ff12172 100644
--- a/tensorflow/core/kernels/fuzzing/fuzz_session.h
+++ b/tensorflow/core/kernels/fuzzing/fuzz_session.h
@@ -35,11 +35,11 @@ limitations under the License.
 #endif
 
 // Standard builder for hooking one placeholder to one op.
-#define SINGLE_INPUT_OP_BUILDER(dtype, opName)                           \
-  void BuildGraph(const Scope& scope) override {                         \
-    auto op_node =                                                       \
-        tensorflow::ops::Placeholder(scope.WithOpName("input1"), dtype); \
-    (void)tensorflow::ops::opName(scope.WithOpName("output"), op_node);  \
+#define SINGLE_INPUT_OP_BUILDER(dtype, opName)                          \
+  void BuildGraph(const Scope& scope) override {                        \
+    auto op_node =                                                      \
+        tensorflow::ops::Placeholder(scope.WithOpName("input"), dtype); \
+    (void)tensorflow::ops::opName(scope.WithOpName("output"), op_node); \
   }
 
 namespace tensorflow {
@@ -61,7 +61,7 @@ namespace fuzzing {
 //   SINGLE_INPUT_OP_BUILDER(DT_INT8, Identity);
 //   void FuzzImpl(const uint8_t* data, size_t size) {
 //      ... convert data and size to a Tensor, pass it to:
-//      RunOneInput(input_tensor);
+//      RunInputs({{"input", input_tensor}});
 //
 class FuzzSession {
  public:
@@ -107,15 +107,18 @@ class FuzzSession {
   }
 
   // Runs the TF session by pulling on the "output" node, attaching
-  // the supplied input_tensor to the "input1" node, and discarding
+  // the supplied input_tensor to the input node(s), and discarding
   // any returned output.
-  Status RunOneInput(const Tensor& input_tensor) {
-    return session_->Run({{"input1", input_tensor}}, {}, {"output"}, nullptr);
+  // Note: We are ignoring Status from Run here since fuzzers don't need to
+  // check it (as that will slow them down and printing/logging is useless).
+  void RunInputs(const std::vector<std::pair<string, Tensor> >& inputs) {
+    RunInputsWithStatus(inputs).IgnoreError();
   }
 
-  Status RunTwoInputs(const Tensor& input1, const Tensor& input2) {
-    return session_->Run({{"input1", input1}, {"input2", input2}}, {},
-                         {"output"}, nullptr);
+  // Same as RunInputs but don't ignore status
+  Status RunInputsWithStatus(
+      const std::vector<std::pair<string, Tensor> >& inputs) {
+    return session_->Run(inputs, {}, {"output"}, nullptr);
   }
 
   // Dispatches to FuzzImpl;  small amount of sugar to keep the code
@@ -144,8 +147,7 @@ class FuzzStringInputOp : public FuzzSession {
     Tensor input_tensor(tensorflow::DT_STRING, TensorShape({}));
     input_tensor.scalar<string>()() =
         string(reinterpret_cast<const char*>(data), size);
-    // TODO(b/32704451): Don't just ignore the ::tensorflow::Status object!
-    RunOneInput(input_tensor).IgnoreError();
+    RunInputs({{"input", input_tensor}});
   }
 };
 
diff --git a/tensorflow/core/kernels/fuzzing/identity_fuzz.cc b/tensorflow/core/kernels/fuzzing/identity_fuzz.cc
index 5c3fc4a2795430d1f8f269f42131e882106db7b0..4c1049d381b458f674cbc8f20e5b64649ff53b22 100644
--- a/tensorflow/core/kernels/fuzzing/identity_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/identity_fuzz.cc
@@ -30,9 +30,9 @@ class FuzzIdentity : public FuzzSession {
       flat_tensor(i) = data[i];
     }
 
-    Status s = RunOneInput(input_tensor);
     // Note:  For many ops, we don't care about this success -- but when
     // testing to make sure the harness actually works, it's useful.
+    Status s = RunInputsWithStatus({{"input", input_tensor}});
     if (!s.ok()) {
       LOG(ERROR) << "Execution failed: " << s.error_message();
     }
diff --git a/tensorflow/core/kernels/fuzzing/one_hot_fuzz.cc b/tensorflow/core/kernels/fuzzing/one_hot_fuzz.cc
new file mode 100644
index 0000000000000000000000000000000000000000..85cbe51ba8bd10ef904d8b27e566c0353118a3c4
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/one_hot_fuzz.cc
@@ -0,0 +1,78 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/cc/ops/array_ops.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
+
+namespace tensorflow {
+namespace fuzzing {
+
+class FuzzOneHot : public FuzzSession {
+  void BuildGraph(const Scope& scope) override {
+    auto input =
+        tensorflow::ops::Placeholder(scope.WithOpName("input"), DT_UINT8);
+    auto depth =
+        tensorflow::ops::Placeholder(scope.WithOpName("depth"), DT_INT32);
+    auto on = tensorflow::ops::Placeholder(scope.WithOpName("on"), DT_UINT8);
+    auto off = tensorflow::ops::Placeholder(scope.WithOpName("off"), DT_UINT8);
+    (void)tensorflow::ops::OneHot(scope.WithOpName("output"), input, depth, on,
+                                  off);
+  }
+
+  void FuzzImpl(const uint8_t* data, size_t size) override {
+    int64 input_size;
+    int32 depth;
+    uint8 on, off;
+    const uint8_t* input_data;
+
+    if (size > 3) {
+      depth = static_cast<int32>(data[0]);
+      on = data[1];
+      off = data[2];
+      input_size = static_cast<int64>(size - 3);
+      input_data = data + 3;
+    } else {
+      depth = 1;
+      on = 1;
+      off = 0;
+      input_size = static_cast<int64>(size);
+      input_data = data;
+    }
+
+    Tensor input_tensor(tensorflow::DT_UINT8, TensorShape({input_size}));
+    Tensor depth_tensor(tensorflow::DT_INT32, TensorShape({}));
+    Tensor on_tensor(tensorflow::DT_UINT8, TensorShape({}));
+    Tensor off_tensor(tensorflow::DT_UINT8, TensorShape({}));
+
+    auto flat_tensor = input_tensor.flat<uint8>();
+    for (size_t i = 0; i < input_size; i++) {
+      flat_tensor(i) = input_data[i];
+    }
+    depth_tensor.scalar<int32>()() = depth;
+    on_tensor.scalar<uint8>()() = on;
+    off_tensor.scalar<uint8>()() = off;
+
+    RunInputs({{"input", input_tensor},
+               {"depth", depth_tensor},
+               {"on", on_tensor},
+               {"off", off_tensor}});
+  }
+};
+
+STANDARD_TF_FUZZ_FUNCTION(FuzzOneHot);
+
+}  // end namespace fuzzing
+}  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/fuzzing/parse_tensor_op_fuzz.cc b/tensorflow/core/kernels/fuzzing/parse_tensor_op_fuzz.cc
index ab6812c5f1534426da15fbe73a282ddf21d02931..0ce4206fc3c329beeeb6bf5f43eea77aebb0c8ab 100644
--- a/tensorflow/core/kernels/fuzzing/parse_tensor_op_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/parse_tensor_op_fuzz.cc
@@ -25,7 +25,7 @@ class FuzzParseTensor : public FuzzSession {
   void BuildGraph(const Scope& scope) final {
     using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
     // The serialized proto.
-    auto input = Placeholder(scope.WithOpName("input1"), DT_STRING);
+    auto input = Placeholder(scope.WithOpName("input"), DT_STRING);
 
     (void)ParseTensor(scope.WithOpName("output"), input, DT_FLOAT);
   }
@@ -62,8 +62,7 @@ class FuzzParseTensor : public FuzzSession {
     // Now we can do the actual fuzz implementation
     Tensor input_tensor(tensorflow::DT_STRING, TensorShape({}));
     input_tensor.scalar<string>()() = as_string;
-    // TODO(b/32704451): Don't just ignore the ::tensorflow::Status object!
-    RunOneInput(input_tensor).IgnoreError();
+    RunInputs({{"input", input_tensor}});
   }
 };
 
diff --git a/tensorflow/core/kernels/fuzzing/string_split_fuzz.cc b/tensorflow/core/kernels/fuzzing/string_split_fuzz.cc
index 2564f8ed0303d1c80bad32181507eb678b18345b..10958602b2fe3fd53d4acde8dce2fff0ccb5cd1d 100644
--- a/tensorflow/core/kernels/fuzzing/string_split_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/string_split_fuzz.cc
@@ -22,16 +22,16 @@ namespace fuzzing {
 class FuzzStringSplit : public FuzzSession {
   void BuildGraph(const Scope& scope) override {
     auto input =
-        tensorflow::ops::Placeholder(scope.WithOpName("input1"), DT_STRING);
-    auto delimeter =
-        tensorflow::ops::Placeholder(scope.WithOpName("input2"), DT_STRING);
+        tensorflow::ops::Placeholder(scope.WithOpName("input"), DT_STRING);
+    auto delimiter =
+        tensorflow::ops::Placeholder(scope.WithOpName("delimiter"), DT_STRING);
     (void)tensorflow::ops::StringSplit(scope.WithOpName("output"), input,
-                                       delimeter);
+                                       delimiter);
   }
 
   void FuzzImpl(const uint8_t* data, size_t size) final {
     Tensor input_tensor(tensorflow::DT_STRING, TensorShape({}));
-    Tensor delimeter_tensor(tensorflow::DT_STRING, TensorShape({}));
+    Tensor delimiter_tensor(tensorflow::DT_STRING, TensorShape({}));
 
     if (size > 0) {
       // The spec for split is that the delimeter should be 0 or 1 characters.
@@ -42,14 +42,13 @@ class FuzzStringSplit : public FuzzSession {
       if (delim_len > size) {
         delim_len = size - 1;
       }
-      delimeter_tensor.scalar<string>()() =
+      delimiter_tensor.scalar<string>()() =
           string(reinterpret_cast<const char*>(data), delim_len);
       input_tensor.scalar<string>()() = string(
           reinterpret_cast<const char*>(data + delim_len), size - delim_len);
     }
 
-    // TODO(b/32704451): Don't just ignore the ::tensorflow::Status object!
-    RunTwoInputs(input_tensor, delimeter_tensor).IgnoreError();
+    RunInputs({{"input", input_tensor}, {"delimiter", delimiter_tensor}});
   }
 };
 
diff --git a/tensorflow/core/kernels/fuzzing/string_split_v2_fuzz.cc b/tensorflow/core/kernels/fuzzing/string_split_v2_fuzz.cc
index 787bccc15ba3987edc64056bdad091d382b07500..969821dbba70907a1d1d26e84cc4887acd604a82 100644
--- a/tensorflow/core/kernels/fuzzing/string_split_v2_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/string_split_v2_fuzz.cc
@@ -22,9 +22,9 @@ namespace fuzzing {
 class FuzzStringSplitV2 : public FuzzSession {
   void BuildGraph(const Scope& scope) override {
     auto input =
-        tensorflow::ops::Placeholder(scope.WithOpName("input1"), DT_STRING);
+        tensorflow::ops::Placeholder(scope.WithOpName("input"), DT_STRING);
     auto separator =
-        tensorflow::ops::Placeholder(scope.WithOpName("input2"), DT_STRING);
+        tensorflow::ops::Placeholder(scope.WithOpName("separator"), DT_STRING);
     (void)tensorflow::ops::StringSplitV2(scope.WithOpName("output"),
                                                input, separator);
   }
@@ -52,7 +52,7 @@ class FuzzStringSplitV2 : public FuzzSession {
           reinterpret_cast<const char*>(data + sep_len), size - sep_len);
     }
 
-    RunTwoInputs(input_tensor, separator_tensor).IgnoreError();
+    RunInputs({{"input", input_tensor}, {"separator", separator_tensor}});
   }
 
  private:
diff --git a/tensorflow/core/kernels/list_kernels.cc b/tensorflow/core/kernels/list_kernels.cc
index 5f244b1b10f65c60becc1ce3c0e87836a48e3ae3..42fad1d4b053f84a7f5eaae4382f0a090ba628da 100644
--- a/tensorflow/core/kernels/list_kernels.cc
+++ b/tensorflow/core/kernels/list_kernels.cc
@@ -483,9 +483,19 @@ REGISTER_KERNEL_BUILDER(Name("TensorListGetItem").Device(DEVICE_CPU),
 
 #if GOOGLE_CUDA
 
-REGISTER_KERNEL_BUILDER(
-    Name("TensorListGetItem").Device(DEVICE_GPU).HostMemory("index"),
-    TensorListGetItem);
+#define REGISTER_TENSOR_LIST_GET_ITEM_GPU(T)                      \
+  REGISTER_KERNEL_BUILDER(Name("TensorListGetItem")               \
+                              .TypeConstraint<T>("element_dtype") \
+                              .Device(DEVICE_GPU)                 \
+                              .HostMemory("index"),               \
+                          TensorListGetItem);
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_TENSOR_LIST_GET_ITEM_GPU);
+TF_CALL_complex64(REGISTER_TENSOR_LIST_GET_ITEM_GPU);
+TF_CALL_complex128(REGISTER_TENSOR_LIST_GET_ITEM_GPU);
+TF_CALL_int64(REGISTER_TENSOR_LIST_GET_ITEM_GPU);
+REGISTER_TENSOR_LIST_GET_ITEM_GPU(bfloat16)
+#undef REGISTER_TENSOR_LIST_GET_ITEM_GPU
 
 #endif  // GOOGLE_CUDA
 
@@ -537,9 +547,19 @@ REGISTER_KERNEL_BUILDER(Name("TensorListSetItem").Device(DEVICE_CPU),
 
 #if GOOGLE_CUDA
 
-REGISTER_KERNEL_BUILDER(
-    Name("TensorListSetItem").Device(DEVICE_GPU).HostMemory("index"),
-    TensorListSetItem);
+#define REGISTER_TENSOR_LIST_SET_ITEM_GPU(T)                      \
+  REGISTER_KERNEL_BUILDER(Name("TensorListSetItem")               \
+                              .TypeConstraint<T>("element_dtype") \
+                              .Device(DEVICE_GPU)                 \
+                              .HostMemory("index"),               \
+                          TensorListSetItem);
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_TENSOR_LIST_SET_ITEM_GPU);
+TF_CALL_complex64(REGISTER_TENSOR_LIST_SET_ITEM_GPU);
+TF_CALL_complex128(REGISTER_TENSOR_LIST_SET_ITEM_GPU);
+TF_CALL_int64(REGISTER_TENSOR_LIST_SET_ITEM_GPU);
+REGISTER_TENSOR_LIST_SET_ITEM_GPU(bfloat16)
+#undef REGISTER_TENSOR_LIST_SET_ITEM_GPU
 
 #endif  // GOOGLE_CUDA
 
@@ -660,7 +680,11 @@ REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU(bfloat16);
   REGISTER_KERNEL_BUILDER(Name("TensorListGather")                \
                               .TypeConstraint<T>("element_dtype") \
                               .Device(DEVICE_CPU),                \
-                          TensorListGather<CPUDevice, T>)
+                          TensorListGather<CPUDevice, T>)         \
+  REGISTER_KERNEL_BUILDER(Name("TensorListConcat")                \
+                              .TypeConstraint<T>("element_dtype") \
+                              .Device(DEVICE_CPU),                \
+                          TensorListConcat<CPUDevice, T>)
 
 TF_CALL_POD_STRING_TYPES(REGISTER_TENSOR_LIST_STACK_CPU);
 REGISTER_TENSOR_LIST_STACK_CPU(quint8);
@@ -680,7 +704,11 @@ REGISTER_TENSOR_LIST_STACK_CPU(bfloat16);
   REGISTER_KERNEL_BUILDER(Name("TensorListScatter")               \
                               .TypeConstraint<T>("element_dtype") \
                               .Device(DEVICE_CPU),                \
-                          TensorListScatter<CPUDevice, T>)
+                          TensorListScatter<CPUDevice, T>)        \
+  REGISTER_KERNEL_BUILDER(Name("TensorListSplit")                 \
+                              .TypeConstraint<T>("element_dtype") \
+                              .Device(DEVICE_CPU),                \
+                          TensorListSplit<CPUDevice, T>)
 
 TF_CALL_POD_STRING_TYPES(REGISTER_TENSOR_LIST_FROM_TENSOR_CPU);
 REGISTER_TENSOR_LIST_FROM_TENSOR_CPU(quint8);
diff --git a/tensorflow/core/kernels/list_kernels.cu.cc b/tensorflow/core/kernels/list_kernels.cu.cc
index a00bf700ca21ea2a69fdcc84815ca473375b333c..23f552642cac273cf53b25a6d43e1e6ca23ea0cc 100644
--- a/tensorflow/core/kernels/list_kernels.cu.cc
+++ b/tensorflow/core/kernels/list_kernels.cu.cc
@@ -45,7 +45,12 @@ typedef Eigen::GpuDevice GPUDevice;
                               .TypeConstraint<T>("element_dtype") \
                               .Device(DEVICE_GPU)                 \
                               .HostMemory("indices"),             \
-                          TensorListGather<GPUDevice, T>)
+                          TensorListGather<GPUDevice, T>)         \
+  REGISTER_KERNEL_BUILDER(Name("TensorListConcat")                \
+                              .TypeConstraint<T>("element_dtype") \
+                              .Device(DEVICE_GPU)                 \
+                              .HostMemory("lengths"),             \
+                          TensorListConcat<GPUDevice, T>)
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_TENSOR_LIST_STACK_GPU);
 REGISTER_TENSOR_LIST_STACK_GPU(bfloat16);
@@ -82,7 +87,13 @@ REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU(bool);
                               .Device(DEVICE_GPU)                 \
                               .HostMemory("element_shape")        \
                               .HostMemory("indices"),             \
-                          TensorListScatter<GPUDevice, T>)
+                          TensorListScatter<GPUDevice, T>)        \
+  REGISTER_KERNEL_BUILDER(Name("TensorListSplit")                 \
+                              .TypeConstraint<T>("element_dtype") \
+                              .Device(DEVICE_GPU)                 \
+                              .HostMemory("element_shape")        \
+                              .HostMemory("lengths"),             \
+                          TensorListSplit<GPUDevice, T>)
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_TENSOR_LIST_FROM_TENSOR_GPU);
 REGISTER_TENSOR_LIST_FROM_TENSOR_GPU(bfloat16);
diff --git a/tensorflow/core/kernels/list_kernels.h b/tensorflow/core/kernels/list_kernels.h
index d4adc068a212a5a057d96add6a322e3dd15ec5b1..686679474c40dc922683786cdfe65ffb3fbc03e2 100644
--- a/tensorflow/core/kernels/list_kernels.h
+++ b/tensorflow/core/kernels/list_kernels.h
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/concat_lib.h"
 #include "tensorflow/core/lib/core/coding.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/util/tensor_ops_util.h"
 #include "tensorflow/core/util/util.h"
 
@@ -77,26 +78,30 @@ class TensorListStack : public OpKernel {
   ~TensorListStack() {}
 
   void Compute(OpKernelContext* c) override {
-    const TensorList* l = c->input(0).scalar<Variant>()().get<TensorList>();
-    OP_REQUIRES(c, l != nullptr,
+    const TensorList* tensor_list =
+        c->input(0).scalar<Variant>()().get<TensorList>();
+    OP_REQUIRES(c, tensor_list != nullptr,
                 errors::InvalidArgument(
                     "Input handle is not a list. Saw: '",
                     c->input(0).scalar<Variant>()().DebugString(), "'"));
-    OP_REQUIRES(c, element_dtype_ == l->element_dtype,
-                errors::InvalidArgument("Invalid data types; op elements ",
-                                        DataTypeString(element_dtype_),
-                                        " but list elements ",
-                                        DataTypeString(l->element_dtype)));
-    OP_REQUIRES(c, !l->tensors.empty() || l->element_shape.IsFullyDefined(),
-                errors::InvalidArgument("Tried to stack elements of a empty ",
-                                        "list with non-fully-defined shape: ",
-                                        l->element_shape.DebugString()));
+    OP_REQUIRES(
+        c, element_dtype_ == tensor_list->element_dtype,
+        errors::InvalidArgument(
+            "Invalid data types; op elements ", DataTypeString(element_dtype_),
+            " but list elements ", DataTypeString(tensor_list->element_dtype)));
+    OP_REQUIRES(
+        c,
+        !tensor_list->tensors.empty() ||
+            tensor_list->element_shape.IsFullyDefined(),
+        errors::InvalidArgument("Tried to stack elements of a empty ",
+                                "list with non-fully-defined shape: ",
+                                tensor_list->element_shape.DebugString()));
     if (num_elements_ != -1) {
-      OP_REQUIRES(c, l->tensors.size() == num_elements_,
-                  errors::InvalidArgument("Operation expected a list with ",
-                                          num_elements_,
-                                          " elements but got a list with ",
-                                          l->tensors.size(), " elements."));
+      OP_REQUIRES(c, tensor_list->tensors.size() == num_elements_,
+                  errors::InvalidArgument(
+                      "Operation expected a list with ", num_elements_,
+                      " elements but got a list with ",
+                      tensor_list->tensors.size(), " elements."));
     }
     // Compute the shape of the output tensor.
     // If `element_shape` is fully-defined it gets used. It is assumed that all
@@ -105,11 +110,11 @@ class TensorListStack : public OpKernel {
     // tensor is used and it is checked that all other tensors have the same
     // shape.
     TensorShape resulting_shape;
-    if (!l->element_shape.AsTensorShape(&resulting_shape)) {
-      const Tensor& t = l->tensors[0];
+    if (!tensor_list->element_shape.AsTensorShape(&resulting_shape)) {
+      const Tensor& t = tensor_list->tensors[0];
       resulting_shape = t.shape();
-      for (int i = 1; i < l->tensors.size(); ++i) {
-        const Tensor& t = l->tensors[i];
+      for (int i = 1; i < tensor_list->tensors.size(); ++i) {
+        const Tensor& t = tensor_list->tensors[i];
         OP_REQUIRES(c, t.shape() == resulting_shape,
                     errors::InvalidArgument(
                         "Tried to stack tensors with unequal shapes: ",
@@ -117,7 +122,7 @@ class TensorListStack : public OpKernel {
                         t.shape().DebugString()));
       }
     }
-    resulting_shape.InsertDim(0, l->tensors.size());
+    resulting_shape.InsertDim(0, tensor_list->tensors.size());
     Tensor* output;
     OP_REQUIRES_OK(c, c->allocate_output(0, resulting_shape, &output));
     if (output->NumElements() == 0) {
@@ -125,8 +130,8 @@ class TensorListStack : public OpKernel {
     }
 
     ConstMatrixVector inputs_flat;
-    inputs_flat.reserve(l->tensors.size());
-    for (const auto& t : l->tensors) {
+    inputs_flat.reserve(tensor_list->tensors.size());
+    for (const auto& t : tensor_list->tensors) {
       inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
           t.shaped<T, 2>({1, t.NumElements()})));
     }
@@ -146,6 +151,200 @@ class TensorListStack : public OpKernel {
   DataType element_dtype_;
 };
 
+template <typename Device, typename T>
+class TensorListConcat : public OpKernel {
+ public:
+  using ConstMatrixVector =
+      std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>;
+  explicit TensorListConcat(OpKernelConstruction* c) : OpKernel(c) {
+    OP_REQUIRES_OK(c, c->GetAttr("element_dtype", &element_dtype_));
+  }
+
+  ~TensorListConcat() {}
+
+  void Compute(OpKernelContext* c) override {
+    // Check that the input Variant tensor is indeed a TensorList and has the
+    // correct element type.
+    const TensorList* tensor_list =
+        c->input(0).scalar<Variant>()().get<TensorList>();
+    OP_REQUIRES(c, tensor_list != nullptr,
+                errors::InvalidArgument(
+                    "Input handle is not a list. Saw: '",
+                    c->input(0).scalar<Variant>()().DebugString(), "'"));
+    OP_REQUIRES(
+        c, element_dtype_ == tensor_list->element_dtype,
+        errors::InvalidArgument(
+            "Invalid data types; op elements ", DataTypeString(element_dtype_),
+            " but list elements ", DataTypeString(tensor_list->element_dtype)));
+    // If the TensorList is empty, its element_shape must be fully defined
+    // except for the first dimension.
+    PartialTensorShape shape_except_first_dim;
+    if (!tensor_list->element_shape.unknown_rank()) {
+      OP_REQUIRES(c, tensor_list->element_shape.dims() >= 1,
+                  errors::InvalidArgument(
+                      "Concat requires elements to be at least vectors, ",
+                      "found scalars instead."));
+      shape_except_first_dim = PartialTensorShape(
+          gtl::ArraySlice<int64>(tensor_list->element_shape.dim_sizes())
+              .subspan(1));
+    }
+    OP_REQUIRES(c,
+                !tensor_list->tensors.empty() ||
+                    shape_except_first_dim.IsFullyDefined(),
+                errors::InvalidArgument(
+                    "All except the first dimension must be fully defined ",
+                    "when concating an empty tensor list. element_shape: ",
+                    tensor_list->element_shape.DebugString()));
+    // 1. Compute the shape of the output tensor.
+    // If `shape_except_first_dim` is fully-defined we just prepend the leading
+    // dim to it. Otherwise we use the shape of the first element tensor and
+    // check to make sure shapes of all tensors are compatible.
+    TensorShape output_shape;
+    if (!shape_except_first_dim.AsTensorShape(&output_shape)) {
+      const Tensor& element_tensor = tensor_list->tensors[0];
+      OP_REQUIRES(
+          c, TensorShapeUtils::IsVectorOrHigher(element_tensor.shape()),
+          errors::InvalidArgument("Concat saw a scalar shape at index ", 0,
+                                  " but requires at least vectors."));
+      output_shape =
+          TensorShape(gtl::ArraySlice<int64>(element_tensor.shape().dim_sizes())
+                          .subspan(1));
+      for (int i = 1; i < tensor_list->tensors.size(); ++i) {
+        const Tensor& element_tensor = tensor_list->tensors[i];
+        OP_REQUIRES(
+            c, TensorShapeUtils::IsVectorOrHigher(element_tensor.shape()),
+            errors::InvalidArgument("Concat saw a scalar shape at index ", i,
+                                    " but requires at least vectors."));
+        TensorShape actual_shape(
+            gtl::ArraySlice<int64>(element_tensor.shape().dim_sizes())
+                .subspan(1));
+        OP_REQUIRES(c, actual_shape.dim_sizes() == output_shape.dim_sizes(),
+                    errors::InvalidArgument(
+                        "Tried to concat tensors with unequal shapes: ",
+                        output_shape.DebugString(), " vs ",
+                        actual_shape.DebugString()));
+      }
+    }
+    // 2. Build the lengths_tensor and leading dim of the output tensor by
+    // iterating over all element tensors.
+    Tensor* lengths_tensor = nullptr;
+    OP_REQUIRES_OK(
+        c,
+        c->allocate_output(
+            1, TensorShape({static_cast<int64>(tensor_list->tensors.size())}),
+            &lengths_tensor));
+    auto lengths_tensor_vec = lengths_tensor->vec<int64>();
+    int64 leading_dim = 0;
+    for (size_t i = 0; i < tensor_list->tensors.size(); i++) {
+      int64 dim = tensor_list->tensors[i].shape().dim_size(0);
+      leading_dim += dim;
+      lengths_tensor_vec(i) = dim;
+    }
+    output_shape.InsertDim(0, leading_dim);
+    Tensor* output;
+    // 3. Allocate the output tensor and fill it up with the concated element
+    // tensors.
+    OP_REQUIRES_OK(c, c->allocate_output(0, output_shape, &output));
+    if (output->NumElements() == 0) {
+      return;
+    }
+
+    ConstMatrixVector inputs_flat;
+    inputs_flat.reserve(tensor_list->tensors.size());
+    for (const auto& element_tensor : tensor_list->tensors) {
+      inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
+          element_tensor.shaped<T, 2>({1, element_tensor.NumElements()})));
+    }
+    auto output_flat = output->shaped<T, 2>({1, output->NumElements()});
+
+#if GOOGLE_CUDA
+    if (std::is_same<Device, Eigen::GpuDevice>::value) {
+      ConcatGPU<T>(c, inputs_flat, output, &output_flat);
+      return;
+    }
+#endif  // GOOGLE_CUDA
+    ConcatCPU<T>(c->device(), inputs_flat, &output_flat);
+  }
+
+ private:
+  DataType element_dtype_;
+};
+
+template <typename Device, typename T>
+class TensorListSplit : public OpKernel {
+ public:
+  TensorListSplit(OpKernelConstruction* c) : OpKernel(c) {}
+
+  void Compute(OpKernelContext* c) override {
+    Tensor* output_tensor;
+    AllocatorAttributes attr;
+    attr.set_on_host(true);
+    OP_REQUIRES_OK(c, c->allocate_output(0, {}, &output_tensor, attr));
+    PartialTensorShape element_shape;
+    OP_REQUIRES_OK(c, TensorShapeFromTensor(c->input(1), &element_shape));
+    OP_REQUIRES(c, element_shape.unknown_rank() || element_shape.dims() >= 1,
+                errors::InvalidArgument(
+                    "TensorListSplit requires element_shape to be at least of ",
+                    "rank 1, but saw: ", element_shape.DebugString()));
+    TensorList output_list;
+    const Tensor& input_tensor = c->input(0);
+    output_list.element_dtype = input_tensor.dtype();
+    OP_REQUIRES(c, TensorShapeUtils::IsVectorOrHigher(input_tensor.shape()),
+                errors::InvalidArgument(
+                    "Tensor must be at least a vector, but saw shape: ",
+                    input_tensor.shape().DebugString()));
+    TensorShape tensor_shape_without_first_dim(input_tensor.shape());
+    tensor_shape_without_first_dim.RemoveDim(0);
+    PartialTensorShape element_shape_without_first_dim;
+    if (!element_shape.unknown_rank()) {
+      element_shape_without_first_dim =
+          PartialTensorShape(element_shape.dim_sizes());
+      element_shape_without_first_dim.RemoveDim(0);
+    }
+    OP_REQUIRES(c,
+                element_shape_without_first_dim.IsCompatibleWith(
+                    tensor_shape_without_first_dim),
+                errors::InvalidArgument(
+                    "tensor shape ", input_tensor.shape().DebugString(),
+                    " is not compatible with element_shape ",
+                    element_shape.DebugString()));
+    output_list.element_shape = element_shape;
+    const Tensor& lengths = c->input(2);
+    OP_REQUIRES(c, TensorShapeUtils::IsVector(lengths.shape()),
+                errors::InvalidArgument(
+                    "Expected lengths to be a vector, received shape: ",
+                    lengths.shape().DebugString()));
+    output_list.tensors.reserve(lengths.shape().dim_size(0));
+    int64 start = 0;
+    int64 end = 0;
+    for (int i = 0; i < lengths.shape().dim_size(0); ++i) {
+      int64 length = lengths.vec<int64>()(i);
+      OP_REQUIRES(
+          c, length >= 0,
+          errors::InvalidArgument("Invalid value in lengths: ", length));
+      end = start + length;
+      OP_REQUIRES(c, end <= input_tensor.shape().dim_size(0),
+                  errors::InvalidArgument("Attempting to slice [", start, ", ",
+                                          end, "] from tensor with length ",
+                                          input_tensor.shape().dim_size(0)));
+      Tensor tmp = input_tensor.Slice(start, end);
+      start = end;
+      // TODO(apassos) maybe not always align; but weird compiler bugs seem to
+      // prevent this.
+      Tensor aligned;
+      OP_REQUIRES_OK(c, c->allocate_temp(tmp.dtype(), tmp.shape(), &aligned));
+      aligned.flat<T>().device(c->eigen_device<Device>()) =
+          tmp.unaligned_flat<T>();
+      output_list.tensors.emplace_back(aligned);
+    }
+    OP_REQUIRES(c, end == input_tensor.shape().dim_size(0),
+                errors::InvalidArgument(
+                    "Unused values in tensor. Length of tensor: ",
+                    input_tensor.shape().dim_size(0), " Values used: ", end));
+    output_tensor->scalar<Variant>()() = std::move(output_list);
+  }
+};
+
 template <typename Device, typename T>
 class TensorListGather : public OpKernel {
  public:
@@ -156,22 +355,25 @@ class TensorListGather : public OpKernel {
   }
 
   void Compute(OpKernelContext* c) override {
-    const TensorList* l = c->input(0).scalar<Variant>()().get<TensorList>();
-    OP_REQUIRES(c, l != nullptr,
+    const TensorList* tensor_list =
+        c->input(0).scalar<Variant>()().get<TensorList>();
+    OP_REQUIRES(c, tensor_list != nullptr,
                 errors::InvalidArgument(
                     "Input handle is not a list. Saw: '",
                     c->input(0).scalar<Variant>()().DebugString(), "'"));
-    OP_REQUIRES(c, element_dtype_ == l->element_dtype,
-                errors::InvalidArgument("Invalid data types; op elements ",
-                                        DataTypeString(element_dtype_),
-                                        " but list elements ",
-                                        DataTypeString(l->element_dtype)));
+    OP_REQUIRES(
+        c, element_dtype_ == tensor_list->element_dtype,
+        errors::InvalidArgument(
+            "Invalid data types; op elements ", DataTypeString(element_dtype_),
+            " but list elements ", DataTypeString(tensor_list->element_dtype)));
     Tensor indices = c->input(1);
-    OP_REQUIRES(c,
-                indices.NumElements() > 0 || l->element_shape.IsFullyDefined(),
-                errors::InvalidArgument("Tried to gather 0-elements from "
-                                        "a list with non-fully-defined shape: ",
-                                        l->element_shape.DebugString()));
+    OP_REQUIRES(
+        c,
+        indices.NumElements() > 0 ||
+            tensor_list->element_shape.IsFullyDefined(),
+        errors::InvalidArgument("Tried to gather 0-elements from "
+                                "a list with non-fully-defined shape: ",
+                                tensor_list->element_shape.DebugString()));
     // Compute the shape of the output tensor.
     // If `element_shape` is fully-defined it gets used. It is assumed that all
     // requested tensors have the same shape.
@@ -179,17 +381,17 @@ class TensorListGather : public OpKernel {
     // tensor is used and it is checked that all other tensors have the same
     // shape.
     TensorShape resulting_shape;
-    if (!l->element_shape.AsTensorShape(&resulting_shape)) {
+    if (!tensor_list->element_shape.AsTensorShape(&resulting_shape)) {
       const int i = indices.flat<int32>()(0);
       OP_REQUIRES(
-          c, i < l->tensors.size(),
+          c, i < tensor_list->tensors.size(),
           errors::InvalidArgument("Index ", i, " out o range; list only has ",
-                                  l->tensors.size(), " elements."));
-      const Tensor& t = l->tensors[i];
+                                  tensor_list->tensors.size(), " elements."));
+      const Tensor& t = tensor_list->tensors[i];
       resulting_shape = t.shape();
       for (int index = 1; index < indices.NumElements(); ++index) {
         const int i = indices.flat<int32>()(index);
-        const Tensor& t = l->tensors[i];
+        const Tensor& t = tensor_list->tensors[i];
         OP_REQUIRES(c, t.shape() == resulting_shape,
                     errors::InvalidArgument(
                         "Tried to gather elements with unequal shapes: ",
@@ -205,14 +407,14 @@ class TensorListGather : public OpKernel {
     }
 
     ConstMatrixVector inputs_flat;
-    inputs_flat.reserve(l->tensors.size());
+    inputs_flat.reserve(tensor_list->tensors.size());
     for (int index = 0; index < indices.NumElements(); ++index) {
       const int i = indices.flat<int32>()(index);
       OP_REQUIRES(
-          c, i < l->tensors.size(),
+          c, i < tensor_list->tensors.size(),
           errors::InvalidArgument("Index ", i, " out o range; list only has ",
-                                  l->tensors.size(), " elements."));
-      const Tensor& t = l->tensors[i];
+                                  tensor_list->tensors.size(), " elements."));
+      const Tensor& t = tensor_list->tensors[i];
       inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
           t.shaped<T, 2>({1, t.NumElements()})));
     }
@@ -290,13 +492,13 @@ class TensorListScatter : public OpKernel {
     PartialTensorShape element_shape;
     OP_REQUIRES_OK(c, TensorShapeFromTensor(c->input(2), &element_shape));
     TensorList output_list;
-    const Tensor& t = c->input(0);
-    output_list.element_dtype = t.dtype();
-    OP_REQUIRES(c, TensorShapeUtils::IsVectorOrHigher(t.shape()),
+    const Tensor& input_tensor = c->input(0);
+    output_list.element_dtype = input_tensor.dtype();
+    OP_REQUIRES(c, TensorShapeUtils::IsVectorOrHigher(input_tensor.shape()),
                 errors::InvalidArgument(
                     "Tensor must be at least a vector, but saw shape: ",
-                    t.shape().DebugString()));
-    TensorShape output_shape(t.shape());
+                    input_tensor.shape().DebugString()));
+    TensorShape output_shape(input_tensor.shape());
     output_shape.RemoveDim(0);
     OP_REQUIRES(c, element_shape.IsCompatibleWith(output_shape),
                 errors::InvalidArgument(
@@ -306,11 +508,11 @@ class TensorListScatter : public OpKernel {
     output_list.tensors.reserve(indices.NumElements());
     for (int index = 0; index < indices.NumElements(); ++index) {
       const int i = indices.flat<int32>()(index);
-      OP_REQUIRES(c, i < t.shape().dim_size(0),
-                  errors::InvalidArgument("Trying to scatter index ", i,
-                                          " from tensor with ",
-                                          t.shape().dim_size(0), " rows."));
-      Tensor tmp = t.Slice(i, i + 1);
+      OP_REQUIRES(c, i < input_tensor.shape().dim_size(0),
+                  errors::InvalidArgument(
+                      "Trying to scatter index ", i, " from tensor with ",
+                      input_tensor.shape().dim_size(0), " rows."));
+      Tensor tmp = input_tensor.Slice(i, i + 1);
       TensorShape tmp_shape = tmp.shape();
       tmp_shape.RemoveDim(0);
       OP_REQUIRES(c, tmp.CopyFrom(tmp, tmp_shape),
diff --git a/tensorflow/core/kernels/lu_op.cc b/tensorflow/core/kernels/lu_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f9591d1bdf2fddea7b9d6265d4a8dd6c3f5f5df6
--- /dev/null
+++ b/tensorflow/core/kernels/lu_op.cc
@@ -0,0 +1,193 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "third_party/eigen3/Eigen/Core"
+#include "third_party/eigen3/Eigen/LU"
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/lib/math/math_util.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/work_sharder.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+template <typename Scalar, typename Tidx>
+class LuOp : public OpKernel {
+ public:
+  explicit LuOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ protected:
+  using TensorShapes = gtl::InlinedVector<TensorShape, 4>;
+  using TensorOutputs = gtl::InlinedVector<Tensor*, 4>;
+
+  using Matrix =
+      Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+  using ConstMatrixMap = Eigen::Map<const Matrix>;
+  using MatrixMap = Eigen::Map<Matrix>;
+
+  using RealScalar = typename Eigen::NumTraits<Scalar>::Real;
+
+  using Indices =
+      Eigen::Matrix<Tidx, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+  using IndicesMap = Eigen::Map<Indices>;
+  using ConstIndicesMap = Eigen::Map<const Indices>;
+
+ public:
+  // Returns the cost per matrix operation. This is used to determine the
+  // number of threads to use for parallelizing factorization in batch mode.
+  // Cost per unit is assumed to be roughly 1ns, based on comments
+  // in core/util/work_sharder.cc.
+  // LU decomposition for a square matrix takes roughly (2/3) * (num_rows)^3.
+  // TODO(anudhyan): Refine this estimate after taking constant factors into
+  // account.
+  int64 GetCostPerUnit(const TensorShape& input_matrix_shape) const {
+    double num_rows = static_cast<double>(input_matrix_shape.dim_size(0));
+    double cost = (2 / 3.0) * MathUtil::IPow(num_rows, 3);
+    return cost >= static_cast<double>(kint64max) ? kint64max
+                                                  : static_cast<int64>(cost);
+  }
+
+  void Compute(OpKernelContext* context) override {
+    OP_REQUIRES(context, context->num_inputs() == 1,
+                errors::InvalidArgument("Expecting exactly one input, got ",
+                                        context->num_inputs()));
+
+    const Tensor& input = context->input(0);
+    int input_rank = input.dims();
+    OP_REQUIRES(context, input_rank >= 2,
+                errors::InvalidArgument(
+                    "Input tensor must have rank >= 2, got ", input_rank));
+
+    // If the tensor rank is greater than 2, we consider the inner-most
+    // dimensions as matrices, and loop over all the other outer ("batch")
+    // dimensions to compute the results.
+    TensorShape input_matrix_shape;
+    TensorShape batch_shape;
+    for (int dim = 0; dim < input_rank - 2; ++dim) {
+      batch_shape.AddDim(input.dim_size(dim));
+    }
+    const int64 num_rows = input.dim_size(input_rank - 2);
+    const int64 num_cols = input.dim_size(input_rank - 1);
+
+    input_matrix_shape.AppendShape({num_rows, num_cols});
+    OP_REQUIRES(context, TensorShapeUtils::IsSquareMatrix(input_matrix_shape),
+                errors::InvalidArgument("Input matrix must be square."));
+
+    // packed_triangular_factors is a matrix with the same shape as the input;
+    // permutation is a vector.
+    TensorShape permutation_shape = batch_shape;
+    permutation_shape.AddDim(num_rows);
+
+    TensorShapes output_matrix_shapes({input.shape(), permutation_shape});
+
+    TensorOutputs outputs;
+    Tensor* output_packed_triangular_factors = nullptr;
+    OP_REQUIRES_OK(
+        context, context->forward_input_or_allocate_output(
+                     {0}, 0, input.shape(), &output_packed_triangular_factors));
+    outputs.emplace_back(output_packed_triangular_factors);
+
+    Tensor* output_permutation = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(1, permutation_shape,
+                                                     &output_permutation));
+    outputs.emplace_back(output_permutation);
+
+    if (num_rows == 0) {
+      return;
+    }
+
+    // Process the individual matrix problems in parallel using a threadpool.
+    auto shard = [this, &input, &num_rows, &num_cols, &outputs,
+                  &output_matrix_shapes, context](int64 begin, int64 end) {
+      for (int64 i = begin; i < end; ++i) {
+        ComputeTensorSlice(context, i, input, num_rows, num_cols, outputs,
+                           output_matrix_shapes);
+      }
+    };
+    auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
+    Shard(worker_threads.num_threads, worker_threads.workers,
+          batch_shape.num_elements(), GetCostPerUnit(input_matrix_shape),
+          shard);
+  }
+
+  void ComputeTensorSlice(OpKernelContext* context, int64 matrix_index,
+                          const Tensor& input, int64 num_rows, int64 num_cols,
+                          const TensorOutputs& outputs,
+                          const TensorShapes& output_matrix_shapes) {
+    // TODO(kalakris): Handle alignment if possible. Eigen::Map is
+    // unaligned by default.
+    ConstMatrixMap input_matrix(
+        input.flat<Scalar>().data() + matrix_index * num_rows * num_cols,
+        num_rows, num_cols);
+
+    // packed_triangular_factors has shape [num_rows, num_cols]
+    MatrixMap packed_triangular_factors(
+        outputs[0]->flat<Scalar>().data() + matrix_index * num_rows * num_cols,
+        num_rows, num_rows);
+
+    // permutation has shape [num_rows, 1]
+    IndicesMap permutation_indices(
+        outputs[1]->flat<Tidx>().data() + matrix_index * num_rows, num_rows, 1);
+
+    Eigen::PartialPivLU<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>>
+        lu_decomposition(input_matrix);
+
+    // Output the packed triangular factors in a dense form.
+    // The lower triangular factor L corresponds to the strictly lower
+    // triangular part of packed_triangular_factors with an implicit unit
+    // diagonal. The upper triangular factor U is the upper triangular part of
+    // packed_triangular_factors. The triangular factors satisfy the equation
+    //     P * input_matrix = L * U
+    // where P is the permutation matrix corresponding to the indices in
+    // permutation_indices.
+    packed_triangular_factors = lu_decomposition.matrixLU();
+    // Output the permutation matrix used for pivoting.
+    Eigen::PermutationMatrix<-1, -1, Tidx> permutation =
+        lu_decomposition.permutationP().transpose();
+    permutation_indices = permutation.indices();
+
+    // PartialPivLU cannot give strong guarantees on invertibility,
+    // but we can at least guard against exact zero pivots. This can occur as
+    // a result of basic user mistakes such providing integer valued
+    // matrices that are exactly singular, or due to underflow if this
+    // code is run with denormals being flushed to zero.
+    const RealScalar min_abs_pivot =
+        packed_triangular_factors.diagonal().cwiseAbs().minCoeff();
+    OP_REQUIRES(context, min_abs_pivot > RealScalar(0),
+                errors::InvalidArgument("Input is not invertible."));
+  }
+};
+
+#define REGISTER_LU(type, idx_type)                                         \
+  REGISTER_KERNEL_BUILDER(Name("Lu")                                        \
+                              .Device(DEVICE_CPU)                           \
+                              .TypeConstraint<type>("T")                    \
+                              .TypeConstraint<idx_type>("output_idx_type"), \
+                          LuOp<type, idx_type>);
+
+REGISTER_LU(float, int32);
+REGISTER_LU(double, int32);
+REGISTER_LU(complex64, int32);
+REGISTER_LU(complex128, int32);
+
+REGISTER_LU(float, int64);
+REGISTER_LU(double, int64);
+REGISTER_LU(complex64, int64);
+REGISTER_LU(complex128, int64);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/lu_op_gpu.cu.cc b/tensorflow/core/kernels/lu_op_gpu.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f83744b50de5ca7fd247b17e3fcac52889f5f288
--- /dev/null
+++ b/tensorflow/core/kernels/lu_op_gpu.cu.cc
@@ -0,0 +1,275 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include <algorithm>
+#include <vector>
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/cuda_solvers.h"
+#include "tensorflow/core/kernels/transpose_functor.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/cuda_kernel_helper.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+namespace {
+template <typename Scalar>
+__device__ void ComputePermutationFromTranspositions(
+    int64 num_rows, const int* pivots, Scalar* permutation_indices) {
+  // Fill in the output array with the identity permutation.
+  for (int i = 0; i < num_rows; ++i) {
+    permutation_indices[i] = Scalar(i);
+  }
+
+  // Compute the permutation from a sequence of transpositions encoded
+  // in the pivot array by applying the transpositions in order on the
+  // identity permutation.
+  for (int i = 0; i < num_rows; ++i) {
+    // Note: Internally, the cuBlas code uses Fortran convention (1-based)
+    // indexing so ith row was swapped with (pivots[i]-1)'th row in 0-based
+    // indexing.
+    Scalar t = permutation_indices[i];
+    permutation_indices[i] = permutation_indices[pivots[i] - 1];
+    permutation_indices[pivots[i] - 1] = t;
+  }
+}
+}  // namespace
+
+// Kernel to compute the inverse of a permutation from a sequence of
+// transpositions.
+template <typename Scalar>
+__global__ void ComputePermutationFromTranspositionsKernel(
+    CudaLaunchConfig config, const int64 num_rows, const int* all_pivots,
+    Scalar* all_permutation_indices) {
+  // We only parallelize over batches here. Performance is not critical,
+  // since this cheap O(num_rows) kernel always follows an O(num_rows^3)
+  // LU factorization.
+  CUDA_1D_KERNEL_LOOP(index, config.virtual_thread_count) {
+    ComputePermutationFromTranspositions(
+        num_rows, all_pivots + index * num_rows,
+        all_permutation_indices + index * num_rows);
+  }
+}
+
+template <class Scalar, class Tidx>
+class LuOpGpu : public AsyncOpKernel {
+ public:
+  explicit LuOpGpu(OpKernelConstruction* context) : AsyncOpKernel(context) {}
+
+  void ComputeAsync(OpKernelContext* context, DoneCallback done) final {
+    const Tensor& input = context->input(0);
+
+    // Analyze shape and validate inputs.
+    const int input_rank = input.dims();
+
+    OP_REQUIRES_ASYNC(
+        context, input_rank >= 2,
+        errors::InvalidArgument("Input must have rank >= 2, got ", input_rank),
+        done);
+
+    const int64 num_rows = input.dim_size(input_rank - 2);
+    const int64 num_cols = input.dim_size(input_rank - 1);
+
+    OP_REQUIRES_ASYNC(
+        context, num_rows == num_cols,
+        errors::InvalidArgument("Input matrices must be squares, got", num_rows,
+                                " != ", num_cols),
+        done);
+
+    TensorShape batch_shape;
+    for (int dim = 0; dim < input_rank - 2; ++dim) {
+      batch_shape.AddDim(input.dim_size(dim));
+    }
+    TensorShape permutation_indices_shape = batch_shape;
+    permutation_indices_shape.AddDim(num_rows);
+
+    const GPUDevice& device = context->eigen_device<GPUDevice>();
+    auto solver = absl::make_unique<CudaSolver>(context);
+
+    // We output the packed triangular factors in a dense form.
+    // The lower triangular factor L corresponds to the strictly lower
+    // triangular part of packed_triangular_factors with an implicit unit
+    // diagonal. The upper triangular factor U is the upper triangular part of
+    // packed_triangular_factors. The triangular factors satisfy the equation
+    //     P * input_matrix = L * U
+    // where P is the permutation matrix corresponding to the indices in
+    // permutation_indices.
+    //
+    // Reuse the input buffer or make a copy for the factorization step,
+    // depending on whether this ops owns it exclusively.
+    Tensor* packed_triangular_factors;
+    OP_REQUIRES_OK_ASYNC(context,
+                         context->forward_input_or_allocate_output(
+                             {0}, 0, input.shape(), &packed_triangular_factors),
+                         done);
+    if (!packed_triangular_factors->SharesBufferWith(input)) {
+      device.memcpy(packed_triangular_factors->flat<Scalar>().data(),
+                    input.flat<Scalar>().data(),
+                    input.NumElements() * sizeof(Scalar));
+    }
+
+    // Allocate output permutation.
+    Tensor* permutation_indices = nullptr;
+    OP_REQUIRES_OK_ASYNC(context,
+                         context->allocate_output(1, permutation_indices_shape,
+                                                  &permutation_indices),
+                         done);
+
+    if (input.NumElements() == 0) {
+      done();
+      return;
+    }
+
+    // Allocate a temporary Tensor to store the transposed packed triangular
+    // factors.
+    Tensor packed_triangular_factors_transpose;
+    OP_REQUIRES_OK_ASYNC(
+        context,
+        context->allocate_temp(DataTypeToEnum<Scalar>::value, input.shape(),
+                               &packed_triangular_factors_transpose),
+        done);
+    auto packed_triangular_factors_transpose_reshaped =
+        packed_triangular_factors_transpose
+            .template flat_inner_dims<Scalar, 3>();
+    const int64 batch_size =
+        packed_triangular_factors_transpose_reshaped.dimension(0);
+
+    // Allocate pivots on the device.
+    Tensor pivots;
+    OP_REQUIRES_OK_ASYNC(context,
+                         solver->allocate_scoped_tensor(
+                             DataTypeToEnum<int32>::value,
+                             TensorShape{batch_size, num_rows}, &pivots),
+                         done);
+    auto pivots_mat = pivots.template matrix<int32>();
+
+    // Transpose the input. This is necessary because cuBLAS assumes
+    // column-major storage while TensorFlow uses row-major.
+    OP_REQUIRES_OK_ASYNC(
+        context,
+        DoMatrixTranspose(device, *packed_triangular_factors,
+                          &packed_triangular_factors_transpose),
+        done);
+
+    std::vector<DeviceLapackInfo> dev_info;
+    if (num_rows == num_cols && num_rows / batch_size <= 128) {
+      // For small matrices or large batch sizes, we use the batched
+      // interface from cuBlas.
+      auto packed_triangular_factors_ptrs = solver->GetScratchSpace<uint8>(
+          sizeof(Scalar*) * batch_size, "packed_triangular_factors_ptrs",
+          /* on_host */ true);
+      const Scalar** packed_triangular_factors_ptrs_base =
+          reinterpret_cast<const Scalar**>(
+              packed_triangular_factors_ptrs.mutable_data());
+      for (int batch = 0; batch < batch_size; ++batch) {
+        packed_triangular_factors_ptrs_base[batch] =
+            &packed_triangular_factors_transpose_reshaped(batch, 0, 0);
+      }
+      dev_info.push_back(
+          solver->GetDeviceLapackInfo(batch_size, "getrfBatched"));
+      OP_REQUIRES_OK_ASYNC(
+          context,
+          solver->GetrfBatched(num_rows, packed_triangular_factors_ptrs_base,
+                               num_rows, pivots_mat.data(), &dev_info.back(),
+                               batch_size),
+          done);
+    } else {
+      // For small batch sizes we use the non-batched interface from cuSolver,
+      // which is much faster for large matrices.
+      dev_info.push_back(solver->GetDeviceLapackInfo(batch_size, "getrf"));
+      for (int batch = 0; batch < batch_size; ++batch) {
+        OP_REQUIRES_OK_ASYNC(
+            context,
+            solver->Getrf(
+                num_rows, num_cols,
+                &packed_triangular_factors_transpose_reshaped(batch, 0, 0),
+                num_rows, &pivots_mat(batch, 0), &dev_info.back()(batch)),
+            done);
+      }
+    }
+
+    // Transpose the result since we had transposed the input.
+    OP_REQUIRES_OK_ASYNC(
+        context,
+        DoMatrixTranspose(device, packed_triangular_factors_transpose,
+                          packed_triangular_factors),
+        done);
+
+    // Pivots encode the permutation of the rows as a sequences of row swaps.
+    // For each index i, row i is swapped with row pivots[i].
+    int* pivots_ptr = pivots.flat<int>().data();
+    Tidx* permutation_indices_ptr =
+        permutation_indices->template flat<Tidx>().data();
+    CudaLaunchConfig cfgPivots = GetCudaLaunchConfig(batch_size, device);
+    ComputePermutationFromTranspositionsKernel<<<cfgPivots.block_count,
+                                                 cfgPivots.thread_per_block, 0,
+                                                 device.stream()>>>(
+        cfgPivots, num_rows, pivots_ptr, permutation_indices_ptr);
+
+    // Callback for checking info after kernels finish. Also capture the
+    // temporary Tensors/ScratchSpace so they don't get deallocated before the
+    // kernels run.
+    // TODO(rmlarsen): Use move capture once C++14 becomes available.
+    auto info_checker = [context, done, dev_info](
+                            const Status& status,
+                            const std::vector<HostLapackInfo>& host_infos) {
+      if (!status.ok() && errors::IsInvalidArgument(status) &&
+          !host_infos.empty()) {
+        for (int i = 0; i < host_infos[0].size(); ++i) {
+          // Match the CPU error message for singular matrices. Otherwise
+          // just print the original error message from the status below.
+          OP_REQUIRES_ASYNC(context, host_infos[0].data()[i] <= 0,
+                            errors::InvalidArgument("Input is not invertible."),
+                            done);
+        }
+      }
+      OP_REQUIRES_OK_ASYNC(context, status, done);
+      done();
+    };
+
+    CudaSolver::CheckLapackInfoAndDeleteSolverAsync(std::move(solver), dev_info,
+                                                    std::move(info_checker));
+  }
+};
+
+#define REGISTER_LU_GPU(type, idx_type)                                     \
+  REGISTER_KERNEL_BUILDER(Name("Lu")                                        \
+                              .Device(DEVICE_GPU)                           \
+                              .TypeConstraint<type>("T")                    \
+                              .TypeConstraint<idx_type>("output_idx_type"), \
+                          LuOpGpu<type, idx_type>);
+
+REGISTER_LU_GPU(float, int32);
+REGISTER_LU_GPU(double, int32);
+REGISTER_LU_GPU(complex64, int32);
+REGISTER_LU_GPU(complex128, int32);
+
+REGISTER_LU_GPU(float, int64);
+REGISTER_LU_GPU(double, int64);
+REGISTER_LU_GPU(complex64, int64);
+REGISTER_LU_GPU(complex128, int64);
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc
index 4b0ced3340e8f25cca6b33aa7fcdad4a422808de..029c539277f46704680eb10067ffbef85ddcbc9c 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_ops.cc
@@ -850,7 +850,8 @@ REGISTER_KERNEL_BUILDER(Name("_MklConv2DWithBias")
 
 // Base class for convolution forward operations
 template <typename Device, typename Tinput, typename Tfilter, typename Tbias,
-          typename Toutput, typename Ttemp_output, bool biasEnabled>
+          typename Toutput, typename Ttemp_output, typename Tpadding,
+          bool biasEnabled, bool padEnabled>
 class MklConvOp : public OpKernel {
  public:
   ~MklConvOp() {}
@@ -928,6 +929,11 @@ class MklConvOp : public OpKernel {
           dilations, strides;
       memory::dims dst_dims_tf_order, dst_dims_mkl_order;
 
+      // If pad with conv2d fusion is enabled
+      if (padEnabled) {
+        PadWithConvFusion(context, padding_left, padding_right);
+      }
+
       // Get shapes of input tensors in MKL-DNN order
       MklDnnConvUtil conv_utl(context, strides_, padding_, data_format_,
                               dilations_);
@@ -936,7 +942,7 @@ class MklConvOp : public OpKernel {
       conv_utl.GetConvFwdSizesInMklOrder(
           src_tf_shape, filter_tf_shape, &src_dims, &filter_dims, &strides,
           &dilations, &dst_dims_tf_order, &dst_dims_mkl_order, &padding_left,
-          &padding_right);
+          &padding_right, padEnabled);
       if (!context->status().ok()) return;
 
       // Check for corner case - if there is nothing to compute, return.
@@ -967,7 +973,12 @@ class MklConvOp : public OpKernel {
       }
 
       bool isConv2D = (strides_.size() == 4);
-
+      // TODO(Intel-tf) Add check to make sure padEnabled is true only for 2D
+      if (!isConv2D) {
+        OP_REQUIRES(
+            context, !padEnabled,
+            errors::InvalidArgument("Pad+Conv fusion only works for 2D"));
+      }
       // Create memory for user data.
       // Describe how the inputs and outputs of Convolution look like. Also
       // specify buffers containing actual input and output data.
@@ -1011,7 +1022,7 @@ class MklConvOp : public OpKernel {
       // get a conv2d fwd from primitive pool
       MklConvFwdPrimitive<float, Tinput, Tfilter, Tbias, Ttemp_output>*
           conv_fwd = nullptr;
-      if (biasEnabled) {
+      if (fuse_biasadd_) {
         memory::dims bias_dims = {};
         conv_utl.GetBiasSizeInMklOrder(kInputIndex_Bias, &bias_dims);
         MklConvFwdParams convFwdDims(src_dims, filter_dims, bias_dims,
@@ -1083,7 +1094,7 @@ class MklConvOp : public OpKernel {
       }
 
       // execute convolution
-      if (biasEnabled) {
+      if (fuse_biasadd_) {
         const Tensor& bias_tensor = MklGetInput(context, kInputIndex_Bias);
         Tbias* bias_data =
             this->GetBiasHandle(context, conv_fwd_pd, bias_tensor);
@@ -1104,7 +1115,51 @@ class MklConvOp : public OpKernel {
     }
   }
 
+  void PadWithConvFusion(OpKernelContext* context, memory::dims& padding_left,
+                         memory::dims& padding_right) {
+    const Tensor& paddings_tf = MklGetInput(context, 2);
+    OP_REQUIRES(context, paddings_tf.dims() == 2,
+                errors::InvalidArgument("paddings must be 2-dimensional: ",
+                                        paddings_tf.shape().DebugString()));
+    Tpadding* paddings = nullptr;
+    // To get individual pad, need to flatten the tensor
+    paddings = static_cast<Tpadding*>(
+        const_cast<Tpadding*>(paddings_tf.flat<Tpadding>().data()));
+    // For NHWC format:
+    // paddings[0], paddings[1], paddings[6], paddings[7] should be zero
+    // if the paddings_tf is [ [0, 0] [1,2] [3,4] [0,0] ]
+    // paddings = {0, 0, 1, 2, 3, 4, 0, 0} ; flat method is row major
+    // then, values are: top = 1, bottom =2, left=3, right=4
+    // For NCHW format:
+    // paddings[0], paddings[1], paddings[2], paddings[3] should be zero
+    // similar explanation as NHWC format will apply.
+    int64 pad_top, pad_left;
+    int64 pad_bottom, pad_right;
+    string data_format = ToString(data_format_);
+    if (data_format == "NHWC") {
+      pad_top = paddings[2];
+      pad_bottom = paddings[3];
+      pad_left = paddings[4];
+      pad_right = paddings[5];
+    } else if (data_format == "NCHW") {
+      pad_top = paddings[4];
+      pad_bottom = paddings[5];
+      pad_left = paddings[6];
+      pad_right = paddings[7];
+    }
+    // Create padding arrays for MKL DNN convolutions.
+    // MKL-DNN uses asymetric padding.
+    padding_left = {static_cast<int>(pad_top), static_cast<int>(pad_left)};
+    padding_right = {static_cast<int>(pad_bottom), static_cast<int>(pad_right)};
+  }
+
  protected:
+  void set_fuse_biasadd(bool fuse_biasadd) { fuse_biasadd_ = fuse_biasadd; }
+  void set_fuse_relu(bool fuse_relu) { fuse_relu_ = fuse_relu; }
+
+  // This method is for the base class MklConvOp, which handles the
+  // floating point implementation of Conv. The quantized conv implementations
+  // will use overidden versions of this method.
   virtual void ExtendConvFwdParams(OpKernelContext* context,
                                    MklConvFwdParams& params) {
     // Create a string from data types of input, filter, bias, and output.
@@ -1112,6 +1167,11 @@ class MklConvOp : public OpKernel {
     params.dtypes.append(typeid(Tfilter).name());
     params.dtypes.append(typeid(Tbias).name());
     params.dtypes.append(typeid(Toutput).name());
+
+    // Add fusions as post ops
+    // Note: Fusion of BiasAdd is handled directly inside MklConvOp by
+    // checking fuse_biasadd_ flag.
+    if (fuse_relu_) params.post_op_params.push_back({"relu", {1.0, 0.0, 0.0}});
   }
 
   virtual Tbias* GetBiasHandle(
@@ -1119,7 +1179,7 @@ class MklConvOp : public OpKernel {
       std::shared_ptr<mkldnn::convolution_forward::primitive_desc>&
           conv2d_fwd_pd,
       const Tensor& bias_tensor) {
-    if (biasEnabled) {
+    if (fuse_biasadd_) {
       return static_cast<Tbias*>(
           const_cast<Tbias*>(bias_tensor.flat<Tbias>().data()));
     } else {
@@ -1165,7 +1225,13 @@ class MklConvOp : public OpKernel {
   std::vector<int32> dilations_;
   Padding padding_;
   TensorFormat data_format_;
+
+  // Initialize to values the template is instantiated with
+  bool fuse_biasadd_ = biasEnabled;
+  bool fuse_relu_ = false;
+
   const int kInputIndex_Src = 0, kInputIndex_Filter = 1, kInputIndex_Bias = 2;
+  const int kInputIndex_Pad = 2;
   const int kOutputIndex_Dst = 0, kOutputIndex_Filter = 1;
   const int kDilationH = 0, kDilationW = 1;
 
@@ -1217,12 +1283,12 @@ class MklConvOp : public OpKernel {
     // Create convolution primitive and add it to net.
     std::vector<primitive> net;
     if (bias) {
-      DCHECK(biasEnabled);
+      DCHECK(fuse_biasadd_);
       net.push_back(convolution_forward(conv_prim_desc, src->GetOpMem(),
                                         filter->GetOpMem(), bias->GetOpMem(),
                                         output->GetOpMem()));
     } else {
-      DCHECK(!biasEnabled);
+      DCHECK(!fuse_biasadd_);
       net.push_back(convolution_forward(conv_prim_desc, src->GetOpMem(),
                                         filter->GetOpMem(),
                                         output->GetOpMem()));
@@ -1232,13 +1298,56 @@ class MklConvOp : public OpKernel {
   }
 };
 
+// Base class for fused convolution forward operations
+template <typename Device, typename Tinput, typename Tfilter, typename Tbias,
+          typename Toutput, typename Ttemp_output>
+class MklFusedConvOp : public MklConvOp<Device, Tinput, Tfilter, Tbias, Toutput,
+                                        Ttemp_output, int32, false, false> {
+ public:
+  explicit MklFusedConvOp(OpKernelConstruction* context)
+      : MklConvOp<Device, Tinput, Tfilter, Tbias, Toutput, Ttemp_output, int32,
+                  false, false>(context) {
+    // Since we came here through the registration of _MklFusedConv2D, get
+    // all information from 'fused_ops' and 'num_args'
+    std::vector<string> fused_ops;
+    OP_REQUIRES_OK(context, context->GetAttr("fused_ops", &fused_ops));
+
+    int num_args;
+    OP_REQUIRES_OK(context, context->GetAttr("num_args", &num_args));
+    OP_REQUIRES(context, !fused_ops.empty(),
+                errors::InvalidArgument(
+                    "Fused Conv2D must have at least one fused op."));
+
+    if (fused_ops == std::vector<string>{"BiasAdd"}) {
+      this->set_fuse_biasadd(true);
+      OP_REQUIRES(context, num_args == 1,
+                  errors::InvalidArgument(
+                      "Fused Conv2D must have one extra argument: bias."));
+    } else if (fused_ops == std::vector<string>{"Relu"}) {
+      this->set_fuse_relu(true);
+    } else if (fused_ops == std::vector<string>{"BiasAdd", "Relu"}) {
+      this->set_fuse_biasadd(true);
+      this->set_fuse_relu(true);
+      OP_REQUIRES(context, num_args == 1,
+                  errors::InvalidArgument(
+                      "Fused Conv2D must have one extra argument: bias."));
+    } else {
+      OP_REQUIRES(context, false,
+                  errors::Unimplemented("Fusion is not implemented: [",
+                                        str_util::Join(fused_ops, ","), "]"));
+    }
+  }
+
+  virtual ~MklFusedConvOp() {}
+};
+
 // We create new class for each verison of Quantized Convolution and inherit
 // from the FP32 version of the base class
 template <typename Device, typename Tbias, typename Toutput,
           typename Ttemp_output, bool biasEnabled>
 class MklQuantizedConv2DOp
     : public MklConvOp<Device, quint8, qint8, Tbias, Toutput, Ttemp_output,
-                       biasEnabled> {
+                       int32, biasEnabled, false> {
  public:
   virtual ~MklQuantizedConv2DOp() {
     if (this->input_bias_ != nullptr) {
@@ -1253,13 +1362,13 @@ class MklQuantizedConv2DOp
   }
 
   explicit MklQuantizedConv2DOp(OpKernelConstruction* context)
-      : MklConvOp<Device, quint8, qint8, Tbias, Toutput, Ttemp_output,
-                  biasEnabled>(context) {}
+      : MklConvOp<Device, quint8, qint8, Tbias, Toutput, Ttemp_output, int32,
+                  biasEnabled, false>(context) {}
 
   void Compute(OpKernelContext* context) override {
     // Compute int32 output tensor
-    MklConvOp<Device, quint8, qint8, Tbias, Toutput, Ttemp_output,
-              biasEnabled>::Compute(context);
+    MklConvOp<Device, quint8, qint8, Tbias, Toutput, Ttemp_output, int32,
+              biasEnabled, false>::Compute(context);
 
     // Compute additional outputs: min/max scalars.
     int bias_index_offset;
@@ -1305,8 +1414,8 @@ class MklQuantizedConv2DOp
  protected:
   void ExtendConvFwdParams(OpKernelContext* context,
                            MklConvFwdParams& params) override {
-    MklConvOp<Device, quint8, qint8, Tbias, Toutput, Ttemp_output,
-              biasEnabled>::ExtendConvFwdParams(context, params);
+    MklConvOp<Device, quint8, qint8, Tbias, Toutput, Ttemp_output, int32,
+              biasEnabled, false>::ExtendConvFwdParams(context, params);
 
     // When the output type is quint8, the output data id requantized
     // into quint8. A post_op "output_scale" is added to do the conversion.
@@ -1517,11 +1626,11 @@ class MklQuantizedConv2DSumReluOp
       }
     }
     // TODO(mdfaijul): Add cleaner code for non-mkl tensor
-    MklConvOp<Device, quint8, qint8, Tbias, Toutput, Ttemp_output,
-              biasEnabled>::AllocateOutputTensor(context, conv_prim_desc,
-                                                 output_dims_mkl_order,
-                                                 output_tf_format,
-                                                 output_tensor);
+    MklConvOp<Device, quint8, qint8, Tbias, Toutput, Ttemp_output, int32,
+              biasEnabled, false>::AllocateOutputTensor(context, conv_prim_desc,
+                                                        output_dims_mkl_order,
+                                                        output_tf_format,
+                                                        output_tensor);
     const Tensor& summand = MklGetInput(context, summand_idx);
     if (summand.dtype() != DT_FLOAT)
       TF_CHECK_OK(Status(error::Code::FAILED_PRECONDITION,
@@ -1790,34 +1899,65 @@ REGISTER_KERNEL_BUILDER(
 #endif  // INTEL_MKL_ML
 
 // Register 2D operations
-#define REGISTER_MKL_CPU_2D(T)                                         \
-  REGISTER_KERNEL_BUILDER(                                             \
-      Name("_MklConv2D")                                               \
-          .Device(DEVICE_CPU)                                          \
-          .TypeConstraint<float>("T")                                  \
-          .Label(mkl_op_registry::kMklOpLabel),                        \
-      MklConvOp<CPUDevice, float, float, float, float, float, false>); \
-  REGISTER_KERNEL_BUILDER(                                             \
-      Name("_MklConv2DWithBias")                                       \
-          .Device(DEVICE_CPU)                                          \
-          .TypeConstraint<float>("T")                                  \
-          .Label(mkl_op_registry::kMklOpLabel),                        \
-      MklConvOp<CPUDevice, float, float, float, float, float, true>);  \
-  REGISTER_KERNEL_BUILDER(Name("__MklDummyConv2DWithBias")             \
-                              .Device(DEVICE_CPU)                      \
-                              .TypeConstraint<T>("T")                  \
-                              .Label(mkl_op_registry::kMklOpLabel),    \
+#define REGISTER_MKL_CPU_2D(T)                                             \
+  REGISTER_KERNEL_BUILDER(Name("_MklConv2D")                               \
+                              .Device(DEVICE_CPU)                          \
+                              .TypeConstraint<T>("T")                      \
+                              .Label(mkl_op_registry::kMklOpLabel),        \
+                          MklConvOp<CPUDevice, float, float, float, float, \
+                                    float, int32, false, false>);          \
+  REGISTER_KERNEL_BUILDER(Name("_MklConv2DWithBias")                       \
+                              .Device(DEVICE_CPU)                          \
+                              .TypeConstraint<T>("T")                      \
+                              .Label(mkl_op_registry::kMklOpLabel),        \
+                          MklConvOp<CPUDevice, float, float, float, float, \
+                                    float, int32, true, false>);           \
+  REGISTER_KERNEL_BUILDER(Name("__MklDummyConv2DWithBias")                 \
+                              .Device(DEVICE_CPU)                          \
+                              .TypeConstraint<T>("T")                      \
+                              .Label(mkl_op_registry::kMklOpLabel),        \
+                          MklDummyOp<CPUDevice, T>);                       \
+  REGISTER_KERNEL_BUILDER(Name("_MklPadWithConv2D")                        \
+                              .Device(DEVICE_CPU)                          \
+                              .TypeConstraint<T>("T")                      \
+                              .TypeConstraint<int32>("Tpaddings")          \
+                              .Label(mkl_op_registry::kMklOpLabel),        \
+                          MklConvOp<CPUDevice, float, float, float, float, \
+                                    float, int32, false, true>);           \
+  REGISTER_KERNEL_BUILDER(Name("_MklPadWithConv2D")                        \
+                              .Device(DEVICE_CPU)                          \
+                              .TypeConstraint<T>("T")                      \
+                              .TypeConstraint<int64>("Tpaddings")          \
+                              .Label(mkl_op_registry::kMklOpLabel),        \
+                          MklConvOp<CPUDevice, float, float, float, float, \
+                                    float, int64, false, true>);           \
+  REGISTER_KERNEL_BUILDER(Name("__MklDummyPadWithConv2D")                  \
+                              .Device(DEVICE_CPU)                          \
+                              .TypeConstraint<T>("T")                      \
+                              .TypeConstraint<int32>("Tpaddings")          \
+                              .Label(mkl_op_registry::kMklOpLabel),        \
                           MklDummyOp<CPUDevice, T>);
 
 TF_CALL_float(REGISTER_MKL_CPU_2D);
 
-// Register 3D operations
-#define REGISTER_MKL_CPU_3D(T)                                      \
-  REGISTER_KERNEL_BUILDER(Name("_MklConv3D")                        \
+#define REGISTER_MKL_CPU_2D_FUSED(T)                                \
+  REGISTER_KERNEL_BUILDER(Name("_MklFusedConv2D")                   \
                               .Device(DEVICE_CPU)                   \
                               .TypeConstraint<T>("T")               \
                               .Label(mkl_op_registry::kMklOpLabel), \
-                          MklConvOp<CPUDevice, T, T, T, T, T, false>);
+                          MklFusedConvOp<CPUDevice, T, T, T, T, T>);
+// We check the fused_ops attributes to decide if bias is enabled or not.
+
+TF_CALL_float(REGISTER_MKL_CPU_2D_FUSED);
+
+// Register 3D operations
+#define REGISTER_MKL_CPU_3D(T)                  \
+  REGISTER_KERNEL_BUILDER(                      \
+      Name("_MklConv3D")                        \
+          .Device(DEVICE_CPU)                   \
+          .TypeConstraint<T>("T")               \
+          .Label(mkl_op_registry::kMklOpLabel), \
+      MklConvOp<CPUDevice, T, T, T, T, T, int32, false, false>);
 TF_CALL_float(REGISTER_MKL_CPU_3D);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/mkl_conv_ops.h b/tensorflow/core/kernels/mkl_conv_ops.h
index e6989d884d68f59e4bfe9d102dcdfcaa0946c2ed..e61c20dea9f8c3f8749c302f88a46233dab270b7 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.h
+++ b/tensorflow/core/kernels/mkl_conv_ops.h
@@ -17,8 +17,8 @@ limitations under the License.
 #define TENSORFLOW_CORE_KERNELS_MKL_CONV_OPS_H_
 
 #include <limits>
-#include <vector>
 #include <memory>
+#include <vector>
 
 #include "mkldnn.hpp"
 #include "tensorflow/core/framework/numeric_op.h"
@@ -85,7 +85,7 @@ class MklDnnConvUtil {
   }
 
   // Calculate Convolution dilations
-  virtual inline void GetDilationsInMklOrder(memory::dims *dilations) {
+  virtual inline void GetDilationsInMklOrder(memory::dims* dilations) {
     // For now we take the dilation from the second and third dimensions only
     // (we do not support dilation on the batch or depth dimension).
     CHECK_NOTNULL(dilations);
@@ -288,7 +288,7 @@ class MklDnnConvUtil {
       const TensorShape& input_shape, const TensorShape& filter_shape,
       const memory::dims& strides, const memory::dims& dilations,
       memory::dims* output_dims_tf_order, memory::dims* output_dims_mkl_order,
-      memory::dims* pad_l, memory::dims* pad_r) {
+      memory::dims* pad_l, memory::dims* pad_r, bool padEnabled = false) {
     CHECK_NOTNULL(output_dims_tf_order);
     CHECK_NOTNULL(output_dims_mkl_order);
     CHECK_NOTNULL(pad_l);
@@ -373,6 +373,36 @@ class MklDnnConvUtil {
                                    padding_, &out_cols, &pad_left, &pad_right));
     }
 
+    if (isConv2D) {
+      // Conv + pad fusion is enabled only for 2D
+      // If padEnabled, i.e., pad and conv op are fused, then
+      // all pads are already passed from pad op through
+      // *pad_l and *pad_r
+      if (padEnabled) {
+        pad_top = static_cast<int64>((*pad_l)[0]);
+        pad_left = static_cast<int64>((*pad_l)[1]);
+        pad_bottom = static_cast<int64>((*pad_r)[0]);
+        pad_right = static_cast<int64>((*pad_r)[1]);
+        // update the out_rows and out_cols based on all
+        // sides of the pads coming from pad op.
+        out_rows = out_rows + (pad_top + pad_bottom) / stride_rows;
+        out_cols = out_cols + (pad_left + pad_right) / stride_cols;
+      }
+      // Handle padding. MKL-DNN uses asymetric padding.
+      // But, if padEnabled, i.e., pad and conv op are fused,
+      // then, *pad_l and *pad_r are already set from pad op.
+      // In that case they need not set here.
+      else {
+        *pad_l = {static_cast<int>(pad_top), static_cast<int>(pad_left)};
+        *pad_r = {static_cast<int>(pad_bottom), static_cast<int>(pad_right)};
+      }
+    } else {
+      // Set padding for Conv3D here
+      *pad_l = {static_cast<int>(pad_D1), static_cast<int>(pad_top),
+                static_cast<int>(pad_left)};
+      *pad_r = {static_cast<int>(pad_D2), static_cast<int>(pad_bottom),
+                static_cast<int>(pad_right)};
+    }
     // Tensorflow output is in data_format order.
     //     Conv2D: NHWC or NCHW
     //     Conv3D: NDHWC or NCDHW
@@ -393,9 +423,6 @@ class MklDnnConvUtil {
       mkldnn_sizes[MklDnnDims::Dim_H] = static_cast<int>(out_rows);
       mkldnn_sizes[MklDnnDims::Dim_W] = static_cast<int>(out_cols);
       *output_dims_mkl_order = mkldnn_sizes;
-
-      *pad_l = {static_cast<int>(pad_top), static_cast<int>(pad_left)};
-      *pad_r = {static_cast<int>(pad_bottom), static_cast<int>(pad_right)};
     } else {
       std::vector<int> mkldnn_sizes(5, -1);
       mkldnn_sizes[MklDnnDims3D::Dim3d_N] = out_batch;
@@ -404,11 +431,6 @@ class MklDnnConvUtil {
       mkldnn_sizes[MklDnnDims3D::Dim3d_H] = static_cast<int>(out_rows);
       mkldnn_sizes[MklDnnDims3D::Dim3d_W] = static_cast<int>(out_cols);
       *output_dims_mkl_order = mkldnn_sizes;
-
-      *pad_l = {static_cast<int>(pad_D1), static_cast<int>(pad_top),
-                static_cast<int>(pad_left)};
-      *pad_r = {static_cast<int>(pad_D2), static_cast<int>(pad_bottom),
-                static_cast<int>(pad_right)};
     }
   }
 
@@ -441,8 +463,8 @@ class MklDnnConvUtil {
                                           input_tf_shape.DebugString()));
     }
 
-    GetOutputAndPadSizeInMklOrder(input_tf_shape, filter_tf_shape,
-                                  strides, dilations, output_dims_tf_order,
+    GetOutputAndPadSizeInMklOrder(input_tf_shape, filter_tf_shape, strides,
+                                  dilations, output_dims_tf_order,
                                   output_dims_mkl_order, pad_l, pad_r);
   }
 
@@ -457,10 +479,9 @@ class MklDnnConvUtil {
   inline void GetConvFwdSizesInMklOrder(
       const TensorShape& input_shape, const TensorShape& filter_shape,
       memory::dims* input_dims, memory::dims* filter_dims,
-      memory::dims* strides, memory::dims *dilations,
-      memory::dims* output_dims_tf_order,
-      memory::dims* output_dims_mkl_order, memory::dims* pad_l,
-      memory::dims* pad_r) {
+      memory::dims* strides, memory::dims* dilations,
+      memory::dims* output_dims_tf_order, memory::dims* output_dims_mkl_order,
+      memory::dims* pad_l, memory::dims* pad_r, bool padEnabled = false) {
     CHECK_NOTNULL(input_dims);
     CHECK_NOTNULL(filter_dims);
     CHECK_NOTNULL(strides);
@@ -476,10 +497,9 @@ class MklDnnConvUtil {
     if (!context_->status().ok()) return;
     GetStridesInMklOrder(strides);
     GetDilationsInMklOrder(dilations);
-    GetOutputAndPadSizeInMklOrder(input_shape, filter_shape,
-                                  *strides, *dilations,
-                                  output_dims_tf_order, output_dims_mkl_order,
-                                  pad_l, pad_r);
+    GetOutputAndPadSizeInMklOrder(
+        input_shape, filter_shape, *strides, *dilations, output_dims_tf_order,
+        output_dims_mkl_order, pad_l, pad_r, padEnabled);
     if (!context_->status().ok()) return;
   }
 };
@@ -536,7 +556,6 @@ class MklConvBackpropCommonOp : public OpKernel {
   TensorFormat data_format_;  // NCHW or NHWC
 };
 
-
 /////////////////////////////////////////////////////////////////////
 ///  Dummy Mkl op that is just used for operators that are intermediate
 ///  output of node fusion in the graph
diff --git a/tensorflow/core/kernels/mkl_fused_ops_test.cc b/tensorflow/core/kernels/mkl_fused_ops_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..258cca9332b5b86adbf0bbcb285210552729243e
--- /dev/null
+++ b/tensorflow/core/kernels/mkl_fused_ops_test.cc
@@ -0,0 +1,405 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifdef INTEL_MKL
+#include "tensorflow/cc/ops/const_op.h"
+#include "tensorflow/cc/ops/image_ops.h"
+#include "tensorflow/cc/ops/nn_ops.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/conv_ops_gpu.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/public/session.h"
+
+namespace tensorflow {
+
+// Helper class for converting MKL tensors to TF tensors and comparing to
+// expected values
+
+static const uint8 dummy_tensor[] = {0, 0, 0, 0, 0, 0, 0, 0};
+static const TensorShape dummy_shape({8});
+
+template <typename T>
+class ConvMklToTF : public OpsTestBase {
+ public:
+  void PerformConversion(DataType dtype, const Tensor& tensor,
+                         const Tensor& mkl_meta_tensor, Tensor* output) {
+    // Create an MKL to TF conversion node and execute it
+    TF_EXPECT_OK(NodeDefBuilder("mkl_to_tf_op", "_MklToTf")
+                     .Input(FakeInput(dtype))     // Input
+                     .Input(FakeInput(DT_UINT8))  // Mkl second tensor
+                     .Attr("T", dtype)
+                     .Attr("_kernel", "MklOp")
+                     .Finalize(node_def()));
+    TF_EXPECT_OK(InitOp());
+    AddInputFromArray<T>(tensor.shape(), tensor.flat<T>());
+    AddInputFromArray<uint8>(mkl_meta_tensor.shape(),
+                             mkl_meta_tensor.flat<uint8>());
+    TF_ASSERT_OK(RunOpKernel());
+
+    *output = *GetOutput(0);
+  }
+
+  void ConvertAndCompare(DataType dtype, const Tensor& tensor,
+                         const Tensor& mkl_meta_tensor,
+                         const Tensor& expected) {
+    Tensor output;
+    PerformConversion(dtype, tensor, mkl_meta_tensor, &output);
+    test::ExpectTensorNear<T>(expected, output, 1e-5);
+  }
+  void TestBody() {}
+};
+
+// Testing MKL's fused convolution ops
+
+template <typename T>
+class MklFusedConv2DOpTest : public OpsTestBase {
+ protected:
+  static constexpr int kDepth = 3;
+  static constexpr int kImageWidth = 32;
+  static constexpr int kImageHeight = 32;
+  static constexpr int kImageBatchCount = 8;
+
+  using BiasAddGraphRunner =
+      std::function<void(const Tensor& input_data, const Tensor& filter_data,
+                         const Tensor& bias_data, Tensor* out)>;
+
+  // Runs a Tensorflow graph defined by the root scope, and fetches the result
+  // of 'fetch' node into the output Tensor.
+  void RunAndFetch(const tensorflow::Scope& root, const string& fetch,
+                   Tensor* output) {
+    tensorflow::GraphDef graph;
+    TF_ASSERT_OK(root.ToGraphDef(&graph));
+
+    std::unique_ptr<tensorflow::Session> session(
+        tensorflow::NewSession(tensorflow::SessionOptions()));
+    TF_ASSERT_OK(session->Create(graph));
+
+    std::vector<Tensor> unfused_tensors;
+    TF_ASSERT_OK(session->Run({}, {fetch}, {}, &unfused_tensors));
+
+    *output = unfused_tensors[0];
+  }
+
+  void RunConv2DWithBias(const Tensor& input_data, const Tensor& filter_data,
+                         const Tensor& bias_data, Tensor* output,
+                         int stride = 1) {
+    auto root = tensorflow::Scope::NewRootScope();
+
+    auto conv = ops::Conv2D(
+        root.WithOpName("conv"),
+        ops::Const(root.WithOpName("input"), Input::Initializer(input_data)),
+        ops::Const(root.WithOpName("filter"), Input::Initializer(filter_data)),
+        {1, stride, stride, 1}, "SAME");
+
+    auto with_bias = ops::BiasAdd(
+        root.WithOpName("with_bias"), conv,
+        ops::Const(root.WithOpName("bias"), Input::Initializer(bias_data)));
+
+    RunAndFetch(root, "with_bias", output);
+  }
+
+  void RunConv2DWithBiasAndRelu(const Tensor& input_data,
+                                const Tensor& filter_data,
+                                const Tensor& bias_data, Tensor* output,
+                                int stride = 1) {
+    auto root = tensorflow::Scope::NewRootScope();
+
+    auto conv = ops::Conv2D(
+        root.WithOpName("conv"),
+        ops::Const(root.WithOpName("input"), Input::Initializer(input_data)),
+        ops::Const(root.WithOpName("filter"), Input::Initializer(filter_data)),
+        {1, stride, stride, 1}, "SAME");
+
+    auto with_bias = ops::BiasAdd(
+        root.WithOpName("with_bias"), conv,
+        ops::Const(root.WithOpName("bias"), Input::Initializer(bias_data)));
+
+    auto with_relu = ops::Relu(root.WithOpName("with_relu"), with_bias);
+
+    RunAndFetch(root, "with_relu", output);
+  }
+
+  void RunMklFusedConv2DOp(const Tensor& image, const Tensor& filter,
+                           const std::vector<Tensor>& args,
+                           const std::vector<string>& fused_ops, Tensor* output,
+                           int stride = 1) {
+    DataType dtype = DataTypeToEnum<T>::v();
+    int num_args = static_cast<int>(args.size());
+
+    TF_EXPECT_OK(NodeDefBuilder("fused_conv_op", "_MklFusedConv2D")
+                     .Input(FakeInput(dtype))
+                     .Input(FakeInput(dtype))
+                     .Attr("num_args", num_args)
+                     .Input(FakeInput(num_args, dtype))
+                     .Input(FakeInput(DT_UINT8))
+                     .Input(FakeInput(DT_UINT8))
+                     .Input(FakeInput(num_args, DT_UINT8))
+                     .Attr("T", dtype)
+                     .Attr("strides", {1, stride, stride, 1})
+                     .Attr("padding", "SAME")
+                     .Attr("fused_ops", fused_ops)
+                     .Attr("_kernel", "MklOp")
+                     .Finalize(node_def()));
+
+    TF_EXPECT_OK(InitOp());
+
+    AddInputFromArray<T>(image.shape(), image.flat<T>());
+    AddInputFromArray<T>(filter.shape(), filter.flat<T>());
+    for (const Tensor& arg : args)
+      AddInputFromArray<T>(arg.shape(), arg.flat<T>());
+    AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+    AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+    for (const Tensor& arg : args)
+      AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+    TF_ASSERT_OK(RunOpKernel());
+
+    // Compare output to expected results
+    const Tensor& output_tensor = *GetOutput(0);
+    // Index 2 will need to be changed if the number of outputs produced
+    // by MklConv2D change.
+    const Tensor& output_meta_tensor = *GetOutput(2);
+    ConvMklToTF<T> conv_comp;
+    conv_comp.PerformConversion(dtype, output_tensor, output_meta_tensor,
+                                output);
+  }
+
+  void VerifyBiasAddTensorsNear(int depth, int image_width, int image_height,
+                                int image_batch_count, int filter_size,
+                                int filter_count,
+                                const BiasAddGraphRunner& run_default,
+                                const BiasAddGraphRunner& run_fused) {
+    DataType dtype = DataTypeToEnum<T>::v();
+
+    Tensor image(dtype, {image_batch_count, image_height, image_width, depth});
+    image.flat<T>() = image.flat<T>().setRandom();
+
+    Tensor filter(dtype, {filter_size, filter_size, depth, filter_count});
+    filter.flat<T>() = filter.flat<T>().setRandom();
+
+    const int bias_size = filter_count;
+    Tensor bias(dtype, {bias_size});
+    bias.flat<T>() = bias.flat<T>().setRandom();
+
+    Tensor conv_2d;
+    Tensor fused_conv_2d;
+
+    run_default(image, filter, bias, &conv_2d);
+    run_fused(image, filter, bias, &fused_conv_2d);
+
+    ASSERT_EQ(conv_2d.dtype(), fused_conv_2d.dtype());
+    ASSERT_EQ(conv_2d.shape(), fused_conv_2d.shape());
+
+    test::ExpectClose(conv_2d, fused_conv_2d);
+  }
+
+  // Verifies that computing Conv2D+BiasAdd in a graph is identical to
+  // FusedConv2D.
+  void VerifyConv2DWithBias(int filter_size, int filter_count,
+                            int depth = kDepth, int image_width = kImageWidth,
+                            int image_height = kImageHeight,
+                            int image_batch_count = kImageBatchCount) {
+    const BiasAddGraphRunner run_default =
+        [this](const Tensor& input_data, const Tensor& filter_data,
+               const Tensor& bias_data, Tensor* out) {
+          RunConv2DWithBias(input_data, filter_data, bias_data, out);
+        };
+
+    const BiasAddGraphRunner run_fused =
+        [this](const Tensor& input_data, const Tensor& filter_data,
+               const Tensor& bias_data, Tensor* out) {
+          RunMklFusedConv2DOp(input_data, filter_data, {bias_data}, {"BiasAdd"},
+                              out);
+        };
+
+    VerifyBiasAddTensorsNear(depth, image_width, image_height,
+                             image_batch_count, filter_size, filter_count,
+                             run_default, run_fused);
+  }
+
+  // Verifies that computing Conv2D+BiasAdd+Relu in a graph is identical to
+  // FusedConv2D.
+  void VerifyConv2DWithBiasAndRelu(int filter_size, int filter_count,
+                                   int depth = kDepth,
+                                   int image_width = kImageWidth,
+                                   int image_height = kImageHeight,
+                                   int image_batch_count = kImageBatchCount) {
+    const BiasAddGraphRunner run_default =
+        [this](const Tensor& input_data, const Tensor& filter_data,
+               const Tensor& bias_data, Tensor* out) {
+          RunConv2DWithBiasAndRelu(input_data, filter_data, bias_data, out);
+        };
+
+    const BiasAddGraphRunner run_fused =
+        [this](const Tensor& input_data, const Tensor& filter_data,
+               const Tensor& bias_data, Tensor* out) {
+          RunMklFusedConv2DOp(input_data, filter_data, {bias_data},
+                              {"BiasAdd", "Relu"}, out);
+        };
+
+    VerifyBiasAddTensorsNear(depth, image_width, image_height,
+                             image_batch_count, filter_size, filter_count,
+                             run_default, run_fused);
+  }
+};
+
+template <typename T>
+class MklFusedConv2DWithBiasOpTest : public MklFusedConv2DOpTest<T> {};
+
+TYPED_TEST_CASE_P(MklFusedConv2DWithBiasOpTest);
+
+// -------------------------------------------------------------------------- //
+// Conv2D + BiasAdd + {Relu}                                                  //
+// -------------------------------------------------------------------------- //
+
+TYPED_TEST_P(MklFusedConv2DWithBiasOpTest, OneByOneConvolution) {
+  const int filter_size = 1;
+  const int filter_count = 12;
+  this->VerifyConv2DWithBias(filter_size, filter_count);
+}
+
+TYPED_TEST_P(MklFusedConv2DWithBiasOpTest, SpatialConvolution) {
+  const int filter_size = 3;
+  const int filter_count = 12;
+  this->VerifyConv2DWithBias(filter_size, filter_count);
+}
+
+TYPED_TEST_P(MklFusedConv2DWithBiasOpTest, OneByOneConvolutionAndRelu) {
+  const int filter_size = 1;
+  const int filter_count = 12;
+  this->VerifyConv2DWithBiasAndRelu(filter_size, filter_count);
+}
+
+TYPED_TEST_P(MklFusedConv2DWithBiasOpTest, SpatialConvolutionAndRelu) {
+  const int filter_size = 3;
+  const int filter_count = 12;
+  this->VerifyConv2DWithBiasAndRelu(filter_size, filter_count);
+}
+
+REGISTER_TYPED_TEST_CASE_P(MklFusedConv2DWithBiasOpTest,  //
+                           OneByOneConvolution,           //
+                           SpatialConvolution,            //
+                           OneByOneConvolutionAndRelu,    //
+                           SpatialConvolutionAndRelu);
+
+using MklFusedBiasAddDataTypes = ::testing::Types<float>;
+INSTANTIATE_TYPED_TEST_CASE_P(Test, MklFusedConv2DWithBiasOpTest,
+                              MklFusedBiasAddDataTypes);
+// Testing fusion of pad and convolution
+
+class FusedPadConvOpTest : public OpsTestBase {
+ public:
+  template <typename T>
+  void Run(DataType dtype, Tensor& image, Tensor& filter, Tensor& padding,
+           Tensor& expected, const string data_format) {
+    const int stride = 1;
+
+    // Create a fused pad+conv2d node
+    TF_EXPECT_OK(NodeDefBuilder("fused_pad_conv_op", "_MklPadWithConv2D")
+                     .Input(FakeInput(dtype))     // Input
+                     .Input(FakeInput(dtype))     // Filter
+                     .Input(FakeInput(DT_INT32))  // Padding
+                     .Input(FakeInput(DT_UINT8))  // MKl second tensor
+                     .Input(FakeInput(DT_UINT8))  // MKl second tensor
+                     .Input(FakeInput(DT_UINT8))  // MKl second tensor
+                     .Attr("padding", "VALID")
+                     .Attr("data_format", data_format)
+                     .Attr("T", dtype)
+                     .Attr("strides", {1, stride, stride, 1})
+                     .Attr("_kernel", "MklOp")
+                     .Finalize(node_def()));
+    TF_EXPECT_OK(InitOp());
+
+    // Setting up inputs and execute
+    AddInputFromArray<T>(image.shape(), image.flat<T>());
+    AddInputFromArray<T>(filter.shape(), filter.flat<T>());
+    AddInputFromArray<int32>(padding.shape(), padding.flat<int32>());
+    AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+    AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+    AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+    TF_ASSERT_OK(RunOpKernel());
+
+    // Compare output to expected results
+    const Tensor& first = *GetOutput(0);
+    const Tensor& second = *GetOutput(2);
+    ConvMklToTF<T> conv_comp;
+    conv_comp.ConvertAndCompare(dtype, first, second, expected);
+  }
+};
+
+TEST_F(FusedPadConvOpTest, PaddingConvTest) {
+  const int depth = 1;
+  const int image_width = 4;
+  const int image_height = 3;
+  const int image_batch_count = 1;
+  Tensor image(DT_FLOAT, {image_batch_count, image_height, image_width, depth});
+  test::FillValues<float>(&image, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+
+  const int filter_size = 3;
+  const int filter_count = 1;
+  Tensor filter(DT_FLOAT, {filter_size, filter_size, depth, filter_count});
+  test::FillValues<float>(&filter, {1, 4, 7, 2, 5, 8, 3, 6, 9});
+
+  const int padding_height = 4;
+  const int padding_width = 2;
+  Tensor padding(DT_INT32, {padding_height, padding_width});
+  test::FillValues<int32>(&padding, {0, 0, 3, 4, 1, 2, 0, 0});
+
+  Tensor expected(DT_FLOAT, TensorShape({1, 8, 5, 1}));
+  test::FillValues<float>(
+      &expected,
+      {0,  0,   0,   0,   0,   24, 42,  60,  33,  12,  105, 150, 183, 95,
+       32, 235, 312, 357, 178, 56, 187, 234, 261, 121, 32,  106, 126, 138,
+       59, 12,  0,   0,   0,   0,  0,   0,   0,   0,   0,   0});
+
+  Run<float>(DT_FLOAT, image, filter, padding, expected, "NHWC");
+}
+
+TEST_F(FusedPadConvOpTest, PaddingConvTestNchw) {
+  const int depth = 1;
+  const int image_width = 4;
+  const int image_height = 3;
+  const int image_batch_count = 1;
+  Tensor image(DT_FLOAT, {image_batch_count, depth, image_height, image_width});
+  test::FillValues<float>(&image, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+
+  const int filter_size = 3;
+  const int filter_count = 1;
+  Tensor filter(DT_FLOAT, {filter_size, filter_size, depth, filter_count});
+  test::FillValues<float>(&filter, {1, 4, 7, 2, 5, 8, 3, 6, 9});
+
+  const int padding_height = 4;
+  const int padding_width = 2;
+  Tensor padding(DT_INT32, {padding_height, padding_width});
+  test::FillValues<int32>(&padding, {0, 0, 0, 0, 3, 4, 1, 2});
+
+  Tensor expected(DT_FLOAT, TensorShape({1, 1, 8, 5}));
+  test::FillValues<float>(
+      &expected,
+      {0,  0,   0,   0,   0,   24, 42,  60,  33,  12,  105, 150, 183, 95,
+       32, 235, 312, 357, 178, 56, 187, 234, 261, 121, 32,  106, 126, 138,
+       59, 12,  0,   0,   0,   0,  0,   0,   0,   0,   0,   0});
+
+  Run<float>(DT_FLOAT, image, filter, padding, expected, "NCHW");
+}
+}  // namespace tensorflow
+#endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_lrn_op.cc b/tensorflow/core/kernels/mkl_lrn_op.cc
index 22ff4cd80fe6d4d0b8a85c88dd65a58b7288a351..4d46abb0a4dd232ef13c8b6b0547b0779af1f98f 100644
--- a/tensorflow/core/kernels/mkl_lrn_op.cc
+++ b/tensorflow/core/kernels/mkl_lrn_op.cc
@@ -22,6 +22,7 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 #include <vector>
+#include "mkldnn.hpp"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
@@ -29,25 +30,18 @@ limitations under the License.
 #include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/util/mkl_util.h"
 #include "tensorflow/core/util/tensor_format.h"
 
 #if !defined(IS_MOBILE_PLATFORM)
 #include "tensorflow/core/util/work_sharder.h"
 #endif
 
-#ifndef INTEL_MKL_ML_ONLY
-#include "mkldnn.hpp"
 using mkldnn::lrn_across_channels;
 using mkldnn::lrn_backward;
 using mkldnn::lrn_forward;
 using mkldnn::prop_kind;
 using mkldnn::stream;
-#else
-#include "mkl_dnn.h"
-#include "mkl_dnn_types.h"
-#endif
-
-#include "tensorflow/core/util/mkl_util.h"
 
 namespace tensorflow {
 
@@ -69,672 +63,6 @@ void GetBandMatrix(int depth, int depth_radius,
 
 }  // namespace
 
-#ifdef INTEL_MKL_ML_ONLY
-
-template <typename T>
-class MklLRNOp : public OpKernel {
- public:
-  ~MklLRNOp() {}
-
-  explicit MklLRNOp(OpKernelConstruction* context) : OpKernel(context) {
-    int64 depth_radius64;
-    OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64));
-    OP_REQUIRES(
-        context,
-        FastBoundsCheck(depth_radius64, std::numeric_limits<int>::max()),
-        errors::InvalidArgument("depth_radius = ", depth_radius64,
-                                " larger than int max"));
-    depth_radius_ = static_cast<size_t>(depth_radius64);
-
-    OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_));
-    OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha_));
-    OP_REQUIRES_OK(context, context->GetAttr("beta", &beta_));
-    workspace_enabled_ = false;
-    OP_REQUIRES_OK(context,
-                   context->GetAttr("workspace_enabled", &workspace_enabled_));
-  }
-
-  void Compute(OpKernelContext* context) override {
-    MklLRNOpContext mkl_context;
-
-    const Tensor& input = MklGetInput(context, 0);
-    GetMklShape(context, 0, &mkl_context.input_shape);
-    bool input_in_mkl_format = mkl_context.input_shape.IsMklTensor();
-
-    // Sanity checks
-    mkl_context.in_dims = input_in_mkl_format
-                              ? mkl_context.input_shape.GetDimension()
-                              : input.dims();
-    OP_REQUIRES(context, mkl_context.in_dims == 4,
-                errors::InvalidArgument("input must be 4-dimensional"));
-    OP_REQUIRES(
-        context,
-        FastBoundsCheck(input.NumElements(), std::numeric_limits<int>::max()),
-        errors::InvalidArgument("argument to LRN too large"));
-
-    if (!input_in_mkl_format) {
-      mkl_context.MklDefaultToEigen(context, depth_radius_, bias_, alpha_,
-                                    beta_, input);
-      return;
-    }
-
-    if (input_in_mkl_format) {
-      // MKL supports normalization over channel dimension only
-      if (mkl_context.input_shape.tf_dim_idx(mkl_context.in_dims - 1) ==
-          MklDims::C) {
-        mkl_context.lt_input =
-            static_cast<dnnLayout_t>(mkl_context.input_shape.GetCurLayout());
-        workspace_enabled_ = true;
-      } else {
-        Tensor converted_tensor =
-            ConvertMklToTF<T>(context, input, mkl_context.input_shape);
-        mkl_context.MklDefaultToEigen(context, depth_radius_, bias_, alpha_,
-                                      beta_, converted_tensor);
-        return;
-      }
-    }
-
-    int kernel_size = 2 * depth_radius_ + 1;
-
-    CHECK_EQ(dnnLRNCreateForward_F32(
-                 &mkl_context.lrn_fwd, NULL, mkl_context.lt_input, kernel_size,
-                 static_cast<float>(alpha_ * kernel_size), beta_, bias_),
-             E_SUCCESS);
-
-    // Allocate output tensor and shape
-    Tensor* output = nullptr;
-    Tensor* workspace = nullptr;
-
-    // Convert Inputs if needed
-    Tensor mkl_tmp_input_buf_tensor;
-    mkl_context.MklPrepareLRNInputs(context, &mkl_tmp_input_buf_tensor);
-
-    // Allocate Layer Outputs
-    mkl_context.MklAllocateOutputs(context, &output, &workspace,
-                                   workspace_enabled_);
-
-    Tensor mkl_tmp_workspace_buf_tensor;
-    mkl_context.MklPrepareLRNOutputs(context, output, workspace,
-                                     &mkl_tmp_workspace_buf_tensor,
-                                     workspace_enabled_);
-
-    // Execute LRN.
-    CHECK_EQ(dnnExecute_F32(mkl_context.lrn_fwd, mkl_context.lrn_res),
-             E_SUCCESS);
-
-    // Release MKL resources.
-    mkl_context.MklCleanup();
-  }
-
- private:
-  typedef struct {
-    size_t in_dims;
-    size_t in_sizes[4];
-    size_t in_strides[4];
-    size_t out_sizes[4];
-    size_t out_strides[4];
-    MklShape input_shape;
-    dnnPrimitive_t lrn_fwd = nullptr;
-    dnnPrimitive_t convert_input = nullptr;
-    dnnLayout_t lt_input = nullptr;
-    dnnLayout_t lt_internal_input = nullptr;
-    dnnLayout_t lt_internal_workspace = nullptr;
-    dnnLayout_t lt_internal_output = nullptr;
-    void* lrn_res[dnnResourceNumber];
-
-    // Convert Inputs if needed
-    void MklPrepareLRNInputs(OpKernelContext* context,
-                             Tensor* mkl_tmp_input_buf_tensor) {
-      const Tensor& input = MklGetInput(context, 0);
-      void* mkl_buf_input =
-          const_cast<void*>(static_cast<const void*>(input.flat<T>().data()));
-
-      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&lt_internal_input, lrn_fwd,
-                                                dnnResourceSrc),
-               E_SUCCESS);
-
-      void* mkl_buf_convert_input = nullptr;
-      bool mkl_convert_input = false;
-      mkl_convert_input = !dnnLayoutCompare_F32(lt_internal_input, lt_input);
-
-      if (mkl_convert_input) {
-        CHECK_EQ(dnnConversionCreate_F32(&convert_input, lt_input,
-                                         lt_internal_input),
-                 E_SUCCESS);
-        AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, lt_internal_input,
-                       &mkl_buf_convert_input);
-        CHECK_EQ(dnnConversionExecute_F32(convert_input, mkl_buf_input,
-                                          mkl_buf_convert_input),
-                 E_SUCCESS);
-        dnnDelete_F32(convert_input);
-      }
-
-      lrn_res[dnnResourceSrc] =
-          (mkl_convert_input) ? mkl_buf_convert_input : mkl_buf_input;
-    }
-
-    // Allocate Layer Outputs
-    void MklAllocateOutputs(OpKernelContext* context, Tensor** output,
-                            Tensor** workspace, bool workspace_enabled_) {
-      TensorShape mkl_output_tf_shape; /* First tensor */
-      MklShape mkl_output_mkl_shape;   /* Second tensor */
-
-      mkl_output_mkl_shape.SetMklTensor(true);
-      mkl_output_mkl_shape.SetMklLayout(lrn_fwd, dnnResourceDst);
-      mkl_output_mkl_shape.SetTfLayout(in_dims, input_shape.GetSizes(),
-                                       input_shape.GetStrides());
-      mkl_output_mkl_shape.SetTfDimOrder(in_dims,
-                                         input_shape.GetTfToMklDimMap());
-      mkl_output_tf_shape.AddDim(
-          dnnLayoutGetMemorySize_F32(
-              static_cast<dnnLayout_t>(mkl_output_mkl_shape.GetMklLayout())) /
-          sizeof(T));
-      AllocateOutputSetMklShape(context, 0, output,
-                                mkl_output_tf_shape /* First tensor */,
-                                mkl_output_mkl_shape /* Second Tensor */);
-
-      if (workspace_enabled_) {
-        TensorShape mkl_workspace_tf_shape; /* First tensor */
-        MklShape mkl_workspace_mkl_shape;   /* Second tensor */
-        mkl_workspace_mkl_shape.SetMklTensor(false);
-        mkl_workspace_mkl_shape.SetMklLayout(lrn_fwd, dnnResourceWorkspace);
-        // Assumes workspace has same TF layout and TF dim order as input
-        mkl_workspace_mkl_shape.SetTfLayout(in_dims, input_shape.GetSizes(),
-                                            input_shape.GetStrides());
-        mkl_workspace_mkl_shape.SetTfDimOrder(in_dims,
-                                              input_shape.GetTfToMklDimMap());
-        mkl_workspace_tf_shape.AddDim(
-            dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(
-                mkl_workspace_mkl_shape.GetMklLayout())) /
-            sizeof(T));
-        AllocateOutputSetMklShape(context, 1, workspace,
-                                  mkl_workspace_tf_shape /* First tensor */,
-                                  mkl_workspace_mkl_shape /* Second Tensor */);
-      }
-    }
-
-    void MklPrepareLRNOutputs(OpKernelContext* context, Tensor* output,
-                              Tensor* workspace,
-                              Tensor* mkl_tmp_workspace_buf_tensor,
-                              bool workspace_enabled_) {
-      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&lt_internal_workspace, lrn_fwd,
-                                                dnnResourceWorkspace),
-               E_SUCCESS);
-
-      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&lt_internal_output, lrn_fwd,
-                                                dnnResourceDst),
-               E_SUCCESS);
-
-      void* mkl_buf_output =
-          const_cast<void*>(static_cast<const void*>(output->flat<T>().data()));
-      lrn_res[dnnResourceDst] = mkl_buf_output;
-
-      void* mkl_buf_workspace = nullptr;
-      if (workspace_enabled_) {
-        mkl_buf_workspace = const_cast<void*>(
-            static_cast<const void*>(workspace->flat<T>().data()));
-      } else {
-        AllocTmpBuffer(context, mkl_tmp_workspace_buf_tensor,
-                       lt_internal_workspace, &mkl_buf_workspace);
-      }
-      lrn_res[dnnResourceWorkspace] = mkl_buf_workspace;
-    }
-
-    // Fallback implementation - Taken from lrn_op.cc
-    // TODO(inteltf) Check if we can use EigenLRNOp directly instead of making a
-    // copy.
-    void MklDefaultToEigen(OpKernelContext* context, int depth_radius_,
-                           float bias_, float alpha_, float beta_,
-                           const Tensor& input) {
-      const int batch = static_cast<int>(input.dim_size(0));
-      const int rows = static_cast<int>(input.dim_size(1));
-      const int cols = static_cast<int>(input.dim_size(2));
-      const int depth = static_cast<int>(input.dim_size(3));
-      const int nodes = cols * rows;
-
-      auto in_shaped = input.shaped<T, 2>({nodes * batch, depth});
-      // Multiplying the input with the band matrix has the effect of reducing
-      // the
-      // correct patch along the depth.
-      Eigen::Tensor<T, 2, Eigen::RowMajor> multiplier(depth, depth);
-      GetBandMatrix<T>(depth, depth_radius_, &multiplier);
-
-      Tensor *output, *workspace;
-      MklShape mkl_output_mkl_shape, mkl_workspace_mkl_shape;
-      mkl_output_mkl_shape.SetMklTensor(false);
-      mkl_output_mkl_shape.SetDimensions(4);
-      AllocateOutputSetMklShape(context, 0, &output, input.shape(),
-                                mkl_output_mkl_shape);
-
-      mkl_workspace_mkl_shape.SetMklTensor(false);
-      mkl_workspace_mkl_shape.SetDimensions(4);
-      AllocateOutputSetMklShape(context, 1, &workspace, input.shape(),
-                                mkl_workspace_mkl_shape);
-
-      auto out_shaped = output->shaped<T, 2>({nodes * batch, depth});
-      Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
-      auto tmp = in_shaped.square().contract(multiplier, dims) * alpha_ + bias_;
-      if (beta_ == T(1)) {
-        out_shaped.device(context->eigen_cpu_device()) =
-            in_shaped * tmp.inverse();
-      } else if (beta_ == T(0.5)) {
-        out_shaped.device(context->eigen_cpu_device()) =
-            in_shaped * tmp.rsqrt();
-      } else {
-        out_shaped.device(context->eigen_cpu_device()) =
-            in_shaped * (tmp.log() * -beta_).exp();
-      }
-    }
-
-    // Release MKL resources.
-    void MklCleanup() {
-      dnnDelete_F32(lrn_fwd);
-      dnnLayoutDelete_F32(lt_internal_input);
-      dnnLayoutDelete_F32(lt_internal_workspace);
-      dnnLayoutDelete_F32(lt_internal_output);
-    }
-  } MklLRNOpContext;
-
-  typedef typename Eigen::Tensor<T, 1, Eigen::RowMajor>::DimensionPair DimPair;
-
-  bool workspace_enabled_;
-  int depth_radius_;
-  float bias_;
-  float alpha_;
-  float beta_;
-};
-
-template <typename T>
-class MklLRNGradOp : public OpKernel {
- public:
-  explicit MklLRNGradOp(OpKernelConstruction* context) : OpKernel(context) {
-    int64 depth_radius64;
-    OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64));
-    OP_REQUIRES(
-        context,
-        FastBoundsCheck(depth_radius64, std::numeric_limits<int>::max()),
-        errors::InvalidArgument("depth_radius = ", depth_radius64,
-                                " larger than int max"));
-    depth_radius_ = static_cast<int>(depth_radius64);
-    OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_));
-    OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha_));
-    OP_REQUIRES_OK(context, context->GetAttr("beta", &beta_));
-    workspace_enabled_ = false;
-    OP_REQUIRES_OK(context,
-                   context->GetAttr("workspace_enabled", &workspace_enabled_));
-  }
-
-  void Compute(OpKernelContext* context) override {
-    MklLRNGradOpContext mkl_context;
-    mkl_context.depth_radius_ = depth_radius_;
-    mkl_context.bias_ = bias_;
-    mkl_context.alpha_ = alpha_;
-    mkl_context.beta_ = beta_;
-
-    const Tensor& in_grads = MklGetInput(context, 0);
-    const Tensor& in_image = MklGetInput(context, 1);
-    const Tensor& out_image = MklGetInput(context, 2);
-
-    GetMklShape(context, 0, &mkl_context.ingrad_shape);
-    GetMklShape(context, 1, &mkl_context.inimage_shape);
-    GetMklShape(context, 2, &mkl_context.outimage_shape);
-
-    bool ingrad_in_mkl_format = mkl_context.ingrad_shape.IsMklTensor();
-    bool inimage_in_mkl_format = mkl_context.inimage_shape.IsMklTensor();
-    bool outimage_in_mkl_format = mkl_context.outimage_shape.IsMklTensor();
-
-    mkl_context.in_dims = inimage_in_mkl_format
-                              ? mkl_context.inimage_shape.GetDimension()
-                              : in_image.dims();
-    OP_REQUIRES(context, mkl_context.in_dims == 4,
-                errors::InvalidArgument("input images must be 4-dimensional"));
-
-    if (!workspace_enabled_) {
-      mkl_context.MklDefaultToEigen(context);
-      return;
-    }
-
-    if (ingrad_in_mkl_format || inimage_in_mkl_format) {
-      const MklShape* tmp_mkl_shape = (ingrad_in_mkl_format)
-                                          ? &mkl_context.ingrad_shape
-                                          : &mkl_context.inimage_shape;
-      if (tmp_mkl_shape->tf_dim_idx(mkl_context.in_dims - 1) != MklDims::C) {
-        // Fallback to eigen
-        mkl_context.MklDefaultToEigen(context);
-        return;
-      } else {  // MKL supports normalization over channel dimension only
-        for (int i = 0; i < mkl_context.in_dims; i++) {
-          mkl_context.in_sizes[i] = mkl_context.out_sizes[i] =
-              tmp_mkl_shape->GetSizes()[i];
-          mkl_context.in_strides[i] = mkl_context.out_strides[i] =
-              tmp_mkl_shape->GetStrides()[i];
-        }
-      }
-    } else {
-      // Fallback to eigen
-      mkl_context.MklDefaultToEigen(context);
-      return;
-    }
-
-    // Dimensions check for sanity purpose
-    if (ingrad_in_mkl_format) {
-      OP_REQUIRES(
-          context, mkl_context.ingrad_shape.GetDimension() == 4,
-          errors::InvalidArgument("input gradient must be 4-dimensional"));
-    } else {
-      OP_REQUIRES(
-          context, in_grads.dims() == 4,
-          errors::InvalidArgument("input gradient must be 4-dimensional"));
-    }
-
-    if (outimage_in_mkl_format) {
-      OP_REQUIRES(
-          context, mkl_context.outimage_shape.GetDimension() == 4,
-          errors::InvalidArgument("Output image must be 4-dimensional"));
-    } else {
-      OP_REQUIRES(
-          context, out_image.dims() == 4,
-          errors::InvalidArgument("Output image must be 4-dimensional"));
-    }
-
-    // Prepare mkl input layout
-    mkl_context.MklPrepareLRNInputsLayouts(context);
-    int ksize = 2 * depth_radius_ + 1;
-
-    CHECK_EQ(dnnLRNCreateBackward_F32(
-                 &mkl_context.lrn_bwd, NULL, mkl_context.lt_input,
-                 mkl_context.lt_output, ksize,
-                 static_cast<float>(alpha_ * ksize), beta_, bias_),
-             E_SUCCESS);
-
-    // Allocate output tensor and shape.
-    TensorShape mkl_output_tf_shape; /* First tensor */
-    MklShape mkl_output_mkl_shape;   /* Second tensor */
-    mkl_output_mkl_shape.SetMklTensor(true);
-    CHECK_NE(mkl_context.lrn_bwd, nullptr);
-    mkl_output_mkl_shape.SetMklLayout(mkl_context.lrn_bwd, dnnResourceDiffSrc);
-    mkl_output_mkl_shape.SetTfLayout(mkl_context.in_dims, mkl_context.out_sizes,
-                                     mkl_context.out_strides);
-    if (ingrad_in_mkl_format) {
-      mkl_output_mkl_shape.SetTfDimOrder(
-          mkl_context.in_dims, mkl_context.ingrad_shape.GetTfToMklDimMap());
-    } else {
-      mkl_output_mkl_shape.SetTfDimOrder(
-          mkl_context.in_dims, mkl_context.inimage_shape.GetTfToMklDimMap());
-    }
-    mkl_output_tf_shape.AddDim(
-        dnnLayoutGetMemorySize_F32(
-            static_cast<dnnLayout_t>(mkl_output_mkl_shape.GetMklLayout())) /
-        sizeof(T));
-    Tensor* output = nullptr;
-    AllocateOutputSetMklShape(context, 0, &output, mkl_output_tf_shape,
-                              mkl_output_mkl_shape);
-
-    // Get pointers to output data.
-    void* user_output =
-        const_cast<void*>(static_cast<const void*>(output->flat<T>().data()));
-
-    Tensor mkl_tmp_input_buf_tensor, mkl_tmp_image_buf_tensor,
-        mkl_tmp_outimage_buf_tensor;
-    // Convert Inputs if needed
-    mkl_context.MklPrepareLRNGradInput(context, &mkl_tmp_input_buf_tensor,
-                                       &mkl_tmp_image_buf_tensor,
-                                       &mkl_tmp_outimage_buf_tensor);
-
-    // We do not do any conversion for output. But we simply emit it
-    // in MKL format.
-    mkl_context.res_lrn_bwd[dnnResourceDiffSrc] = user_output;
-    // Execute LRN backward using dnnExecute
-    CHECK_EQ(dnnExecute_F32(mkl_context.lrn_bwd, mkl_context.res_lrn_bwd),
-             E_SUCCESS);
-    // Release MKL resources.
-    mkl_context.Mklcleanup();
-  }
-
- private:
-  typedef struct {
-    int depth_radius_;
-    float bias_;
-    float alpha_;
-    float beta_;
-    size_t in_dims;
-    size_t in_sizes[4];
-    size_t in_strides[4];
-    size_t out_sizes[4];
-    size_t out_strides[4];
-    MklShape ingrad_shape, inimage_shape, outimage_shape;
-    dnnPrimitive_t lrn_bwd = nullptr;
-    dnnPrimitive_t convert_input = nullptr;
-    dnnLayout_t lt_input = nullptr;
-    dnnLayout_t lt_output = nullptr;
-    dnnLayout_t lt_bdw_input = nullptr;
-    dnnLayout_t lt_workspace = nullptr;
-    dnnLayout_t lt_internal_input = nullptr;
-    void* res_lrn_bwd[dnnResourceNumber];
-
-    // prepare mkl input
-    void MklPrepareLRNInputsLayouts(OpKernelContext* context) {
-      bool ingrad_in_mkl_format = ingrad_shape.IsMklTensor();
-      bool inimage_in_mkl_format = inimage_shape.IsMklTensor();
-      if (!ingrad_in_mkl_format) {
-        CHECK_EQ(dnnLayoutCreate_F32(&lt_input, in_dims, in_sizes, in_strides),
-                 E_SUCCESS);
-      } else {
-        lt_input = static_cast<dnnLayout_t>(ingrad_shape.GetCurLayout());
-      }
-
-      if (!inimage_in_mkl_format) {
-        CHECK_EQ(
-            dnnLayoutCreate_F32(&lt_output, in_dims, out_sizes, out_strides),
-            E_SUCCESS);
-      } else {
-        lt_output = static_cast<dnnLayout_t>(inimage_shape.GetCurLayout());
-      }
-    }
-
-    // convert input if needed
-    void MklPrepareLRNGradInput(OpKernelContext* context,
-                                Tensor* mkl_tmp_input_buf_tensor,
-                                Tensor* mkl_tmp_image_buf_tensor,
-                                Tensor* mkl_tmp_outimage_buf_tensor) {
-      const Tensor& in_grads = MklGetInput(context, 0);
-      const Tensor& in_image = MklGetInput(context, 1);
-      const Tensor& workspace = MklGetInput(
-          context,
-          3); /*Worskpsace is enabled, get the buffer to the workspace */
-
-      void* user_input = const_cast<void*>(
-          static_cast<const void*>(in_grads.flat<T>().data()));
-      void* user_fwd_input = const_cast<void*>(
-          static_cast<const void*>(in_image.flat<T>().data()));
-      void* workspace_buffer = const_cast<void*>(
-          static_cast<const void*>(workspace.flat<T>().data()));
-
-      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&lt_workspace, lrn_bwd,
-                                                dnnResourceWorkspace),
-               E_SUCCESS);
-      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&lt_bdw_input, lrn_bwd,
-                                                dnnResourceDiffDst),
-               E_SUCCESS);
-      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&lt_internal_input, lrn_bwd,
-                                                dnnResourceSrc),
-               E_SUCCESS);
-
-      bool ingrad_in_mkl_format = ingrad_shape.IsMklTensor();
-      if (ingrad_in_mkl_format) {
-        if (!dnnLayoutCompare_F32(lt_bdw_input, lt_input)) {
-          AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, lt_bdw_input,
-                         &res_lrn_bwd[dnnResourceDiffDst]);
-          ingrad_shape.GetConvertedFlatData(lt_bdw_input, user_input,
-                                            res_lrn_bwd[dnnResourceDiffDst]);
-        } else {
-          res_lrn_bwd[dnnResourceDiffDst] = user_input;
-        }
-      } else {
-        if (!dnnLayoutCompare_F32(lt_bdw_input, lt_input)) {
-          CHECK_EQ(
-              dnnConversionCreate_F32(&convert_input, lt_input, lt_bdw_input),
-              E_SUCCESS);
-
-          AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, lt_bdw_input,
-                         &res_lrn_bwd[dnnResourceDiffDst]);
-          CHECK_EQ(dnnConversionExecute_F32(convert_input, user_input,
-                                            res_lrn_bwd[dnnResourceDiffDst]),
-                   E_SUCCESS);
-          dnnDelete_F32(convert_input);
-        } else {
-          res_lrn_bwd[dnnResourceDiffDst] = user_input;
-        }
-      }
-
-      bool inimage_in_mkl_format = inimage_shape.IsMklTensor();
-      if (inimage_in_mkl_format) {
-        if (!dnnLayoutCompare_F32(
-                lt_internal_input,
-                static_cast<dnnLayout_t>(inimage_shape.GetCurLayout()))) {
-          AllocTmpBuffer(context, mkl_tmp_image_buf_tensor, lt_internal_input,
-                         &res_lrn_bwd[dnnResourceSrc]);
-          ingrad_shape.GetConvertedFlatData(lt_internal_input, user_fwd_input,
-                                            res_lrn_bwd[dnnResourceSrc]);
-        } else {
-          res_lrn_bwd[dnnResourceSrc] = user_fwd_input;
-        }
-      } else {
-        if (!dnnLayoutCompare_F32(
-                lt_internal_input,
-                static_cast<dnnLayout_t>(inimage_shape.GetCurLayout()))) {
-          CHECK_EQ(dnnConversionCreate_F32(
-                       &convert_input,
-                       static_cast<dnnLayout_t>(inimage_shape.GetCurLayout()),
-                       lt_internal_input),
-                   E_SUCCESS);
-
-          AllocTmpBuffer(context, mkl_tmp_image_buf_tensor, lt_internal_input,
-                         &res_lrn_bwd[dnnResourceSrc]);
-          CHECK_EQ(dnnConversionExecute_F32(convert_input, user_fwd_input,
-                                            res_lrn_bwd[dnnResourceSrc]),
-                   E_SUCCESS);
-          dnnDelete_F32(convert_input);
-        } else {
-          res_lrn_bwd[dnnResourceSrc] = user_fwd_input;
-        }
-      }
-
-      res_lrn_bwd[dnnResourceWorkspace] = workspace_buffer;
-    }
-
-    // Fallback implementation - Taken from lrn_op.cc
-    // TODO(intelft) Check if we can use EigenLRNOp directly instead of making a
-    // copy.
-    void MklDefaultToEigen(OpKernelContext* context) {
-      Tensor in_grads;
-      Tensor in_image;
-      Tensor out_image;
-
-      GetMklShape(context, 0, &ingrad_shape);
-      GetMklShape(context, 1, &inimage_shape);
-      GetMklShape(context, 2, &outimage_shape);
-
-      if (ingrad_shape.IsMklTensor()) {
-        in_grads =
-            ConvertMklToTF<T>(context, MklGetInput(context, 0), ingrad_shape);
-      } else {
-        in_grads = MklGetInput(context, 0);
-      }
-
-      if (inimage_shape.IsMklTensor()) {
-        in_image =
-            ConvertMklToTF<T>(context, MklGetInput(context, 1), inimage_shape);
-      } else {
-        in_image = MklGetInput(context, 1);
-      }
-
-      if (outimage_shape.IsMklTensor()) {
-        out_image =
-            ConvertMklToTF<T>(context, MklGetInput(context, 2), outimage_shape);
-      } else {
-        out_image = MklGetInput(context, 2);
-      }
-
-      const int64 batch = static_cast<int64>(in_grads.dim_size(0));
-      const int64 rows = static_cast<int64>(in_grads.dim_size(1));
-      const int64 cols = static_cast<int64>(in_grads.dim_size(2));
-      const int64 depth = static_cast<int64>(in_grads.dim_size(3));
-      const auto nodes = cols * rows;
-
-      auto grads_shaped = in_grads.shaped<T, 2>({nodes * batch, depth});
-
-      auto in_shaped = in_image.shaped<T, 2>({nodes * batch, depth});
-      auto activations = out_image.shaped<T, 2>({nodes * batch, depth});
-
-      Tensor* output;
-      MklShape mkl_output_mkl_shape;
-      mkl_output_mkl_shape.SetMklTensor(false);
-      mkl_output_mkl_shape.SetDimensions(4);
-      AllocateOutputSetMklShape(context, 0, &output, in_grads.shape(),
-                                mkl_output_mkl_shape);
-
-      auto out_shaped = output->shaped<T, 2>({nodes * batch, depth});
-      out_shaped.setZero();
-      auto shard = [this, activations, in_shaped, grads_shaped, out_shaped,
-                    depth](int64 begin, int64 end) {
-        for (int64 i = begin; i < end; ++i) {
-          for (int64 j = 0; j < depth; ++j) {
-            int64 depth_begin = std::max<int64>(0, j - depth_radius_);
-            int64 depth_end = std::min<int64>(depth, j + depth_radius_ + 1);
-
-            T norm(0);
-            for (int64 k = depth_begin; k < depth_end; ++k) {
-              norm += in_shaped(i, k) * in_shaped(i, k);
-            }
-            norm = alpha_ * norm + bias_;
-            DCHECK_GT(norm, T(1e-6));
-            for (int64 k = depth_begin; k < depth_end; ++k) {
-              T dyi = T(-2) * alpha_ * beta_ * in_shaped(i, k) *
-                      activations(i, j) / norm;
-              if (k == j) {
-                dyi += Eigen::numext::pow(norm, -beta_);
-              }
-              dyi *= grads_shaped(i, j);
-              const_cast<typename TTypes<T, 2>::Tensor&>(out_shaped)(i, k) +=
-                  dyi;
-            }
-          }
-        }
-      };
-      auto worker_threads =
-          *(context->device()->tensorflow_cpu_worker_threads());
-      Shard(worker_threads.num_threads, worker_threads.workers, nodes * batch,
-            depth * depth, shard);
-    }
-
-    // release mkl resources
-    void Mklcleanup() {
-      bool ingrad_in_mkl_format = ingrad_shape.IsMklTensor();
-      bool inimage_in_mkl_format = inimage_shape.IsMklTensor();
-      if (!ingrad_in_mkl_format) {
-        CHECK_EQ(dnnLayoutDelete_F32(lt_input), E_SUCCESS);
-      }
-
-      if (!inimage_in_mkl_format) {
-        CHECK_EQ(dnnLayoutDelete_F32(lt_output), E_SUCCESS);
-      }
-      dnnDelete_F32(lrn_bwd);
-      dnnLayoutDelete_F32(lt_bdw_input);
-      dnnLayoutDelete_F32(lt_workspace);
-    }
-  } MklLRNGradOpContext;
-
-  typedef typename Eigen::Tensor<T, 1, Eigen::RowMajor>::DimensionPair DimPair;
-  bool workspace_enabled_;
-  int depth_radius_;
-  float bias_;
-  float alpha_;
-  float beta_;
-};
-
-#else
-
 template <typename T>
 class MklLRNOp : public OpKernel {
  public:
@@ -847,7 +175,6 @@ class MklLRNOp : public OpKernel {
                             MklDnnData<T>* src_dnn_data,
                             MklDnnData<T>* dst_dnn_data,
                             MklDnnData<uint8>* wksp_dnn_data = nullptr) {
-
     // Check for input reorder
     src_dnn_data->CheckReorderToOpMem(lrn_fwd_desc.src_primitive_desc());
 
@@ -1160,7 +487,6 @@ class MklLRNGradOp : public OpKernel {
       MklDnnData<T>* output_diff_src,
       const memory::primitive_desc& target_diff_dst_pd,
       const MklDnnData<uint8>* workspace_dnn_data = nullptr) {
-
     // Check for input reordering on the diff dst input
     input_gradient_diff_dst->CheckReorderToOpMem(
         lrn_bkwd_desc.diff_dst_primitive_desc());
@@ -1345,8 +671,6 @@ class MklLRNGradOp : public OpKernel {
   float beta_;
 };
 
-#endif  // INTEL_MKL_ML_ONLY
-
 #define REGISTER_MKL_LRN_CPU(T)                                     \
   REGISTER_KERNEL_BUILDER(Name("_MklLRN")                           \
                               .Device(DEVICE_CPU)                   \
diff --git a/tensorflow/core/kernels/mkl_relu_op.cc b/tensorflow/core/kernels/mkl_relu_op.cc
index 708213648b48e2dfbbfe9a63851428aa97c72b64..d8ab1cd25b9e09e6b25e2b0454567caa3dcea9e0 100644
--- a/tensorflow/core/kernels/mkl_relu_op.cc
+++ b/tensorflow/core/kernels/mkl_relu_op.cc
@@ -204,7 +204,7 @@ class MklEltwiseFwdPrimitiveFactory : public MklPrimitiveFactory<T> {
   ~MklEltwiseFwdPrimitiveFactory() {}
 
   static string CreateKey(const MklEltwiseFwdParams<T>& fwdParams,
-                               memory::format src_fmt) {
+                          memory::format src_fmt) {
     string prefix = "eltwise_fwd";
     FactoryKeyCreator key_creator;
     key_creator.AddAsKey(prefix);
@@ -422,8 +422,8 @@ class MklEltwiseBwdPrimitiveFactory : public MklPrimitiveFactory<T> {
 
  private:
   static string CreateKey(const MklEltwiseBwdParams<T>& bwdParams,
-                               const memory::format& src_fmt,
-                               const memory::format& diff_dst_fmt) {
+                          const memory::format& src_fmt,
+                          const memory::format& diff_dst_fmt) {
     string prefix = "eltwise_bwd";
     FactoryKeyCreator key_creator;
     key_creator.AddAsKey(prefix);
@@ -856,9 +856,9 @@ class MklReluOpBase : public OpKernel {
 
       Tensor* dst_tensor = nullptr;
       OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
-                                      {static_cast<const int>(src_index)},
-                                      static_cast<const int>(dst_index),
-                                      tf_shape_dst, &dst_tensor));
+                                  {static_cast<const int>(src_index)},
+                                  static_cast<const int>(dst_index),
+                                  tf_shape_dst, &dst_tensor));
       AllocateOutputSetMklShape(context, dst_index, dnn_shape_dst);
 
       T* dst_data = dst_tensor->flat<T>().data();
@@ -867,18 +867,19 @@ class MklReluOpBase : public OpKernel {
       eltwise_fwd->Execute(src_data, dst_data);
     } catch (mkldnn::error& e) {
       string error_msg = "Status: " + std::to_string(e.status) +
-                         ", message: " + string(e.message) +
-                         ", in file " + string(__FILE__) + ":" +
-                         std::to_string(__LINE__);
-      OP_REQUIRES_OK(context,
-                     errors::Aborted("Operation received an exception:",
-                        error_msg));
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(
+          context,
+          errors::Aborted("Operation received an exception:", error_msg));
     }
   }
 
  private:
   engine cpu_engine = engine(engine::cpu, 0);
   std::shared_ptr<relu_forward::primitive_desc> relu_fwd_pd;
+
+ protected:
   float alpha_;
   float beta_;
 };
@@ -947,11 +948,11 @@ class MklReluGradOpBase : public OpKernel {
         auto diff_dst_tf_data_format =
             MklDnnDataFormatToTFDataFormat(diff_dst_mkl_data_format);
 
-        src_dims = (src_tensor.dims() == 4) 
-                 ? TFShapeToMklDnnDimsInNCHW(src_tensor.shape(),
-                                             diff_dst_tf_data_format)
-                 : TFShapeToMklDnnDimsInNCDHW(src_tensor.shape(),
-                                              diff_dst_tf_data_format);
+        src_dims = (src_tensor.dims() == 4)
+                       ? TFShapeToMklDnnDimsInNCHW(src_tensor.shape(),
+                                                   diff_dst_tf_data_format)
+                       : TFShapeToMklDnnDimsInNCDHW(src_tensor.shape(),
+                                                    diff_dst_tf_data_format);
         src_md =
             memory::desc(src_dims, MklDnnType<T>(), diff_dst_mkl_data_format);
       } else {
@@ -1001,8 +1002,7 @@ class MklReluGradOpBase : public OpKernel {
       // allocate diff_src tensor
       MklDnnShape dnn_shape_diff_src;
       TensorShape tf_shape_diff_src;
-      if (dnn_shape_src.IsMklTensor() ||
-              dnn_shape_diff_dst.IsMklTensor()) {
+      if (dnn_shape_src.IsMklTensor() || dnn_shape_diff_dst.IsMklTensor()) {
         auto diff_src_pd = eltwise_bwd_pd->diff_src_primitive_desc();
         dnn_shape_diff_src.SetMklTensor(true);
         dnn_shape_diff_src.SetMklLayout(&diff_src_pd);
@@ -1012,9 +1012,10 @@ class MklReluGradOpBase : public OpKernel {
                                          dnn_shape_src.GetSizesAsMklDnnDims(),
                                          dnn_shape_src.GetTfDataFormat());
         } else {
-          dnn_shape_diff_src.SetTfLayout(dnn_shape_diff_dst.GetDimension(),
-                                 dnn_shape_diff_dst.GetSizesAsMklDnnDims(),
-                                 dnn_shape_diff_dst.GetTfDataFormat());
+          dnn_shape_diff_src.SetTfLayout(
+              dnn_shape_diff_dst.GetDimension(),
+              dnn_shape_diff_dst.GetSizesAsMklDnnDims(),
+              dnn_shape_diff_dst.GetTfDataFormat());
         }
         tf_shape_diff_src.AddDim(diff_src_pd.get_size() / sizeof(T));
       } else {
@@ -1045,6 +1046,8 @@ class MklReluGradOpBase : public OpKernel {
  private:
   engine cpu_engine = engine(engine::cpu, 0);
   std::shared_ptr<relu_forward::primitive_desc> relu_fwd_pd;
+
+ protected:
   float alpha_;
   float beta_;
 };
@@ -1312,8 +1315,86 @@ class MklRelu6GradOp
     T* out_o = diff_src_tensor->flat<T>().data();
     T* user_i = const_cast<T*>(src_tensor.flat<T>().data());
     T* user_g = const_cast<T*>(diff_dst_tensor.flat<T>().data());
-    out_o[0] = user_g[0] * user_i[0] > 0 &&
-               (user_i[0] < static_cast<T>(RELU6_UPPER_BOUND));
+    out_o[0] = user_g[0] * (user_i[0] > 0 &&
+                            (user_i[0] < static_cast<T>(RELU6_UPPER_BOUND)));
+    return;
+  }
+};
+
+template <typename Device, typename T>
+class MklLeakyReluOp : public MklReluOpBase<Device, T, eltwise_relu> {
+ public:
+  ~MklLeakyReluOp() {}
+
+  explicit MklLeakyReluOp(OpKernelConstruction* context)
+      : MklReluOpBase<Device, T, eltwise_relu>(context, 0.0f, 0.0f) {
+    float alpha;
+    OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha));
+    OP_REQUIRES(
+        context, alpha <= 1,
+        errors::InvalidArgument("MKL LeakyRelu only supports alpha <= 1. "
+                                "alpha is: ",
+                                alpha));
+
+    this->alpha_ = alpha;
+  }
+
+  virtual void Compute_Scalar(OpKernelContext* context) {
+    const size_t src_index = 0;  // index of src input tensor
+    const size_t dst_index = 0;  // index of dst output tensor
+    const Tensor& src_tensor = MklGetInput(context, src_index);
+    MklDnnShape dnn_shape_src;
+    GetMklShape(context, src_index, &dnn_shape_src);
+
+    Tensor* dst_tensor = nullptr;
+    T* user_i = const_cast<T*>(src_tensor.flat<T>().data());
+    MklDnnShape dnn_shape_dst;
+    dnn_shape_dst.SetMklTensor(false);
+    AllocateOutputSetMklShape(context, dst_index, &dst_tensor,
+                              src_tensor.shape(), dnn_shape_dst);
+    T* out_o = dst_tensor->flat<T>().data();
+    out_o[0] = user_i[0] >= 0 ? user_i[0] : user_i[0] * this->alpha_;
+    return;
+  }
+};
+
+template <typename Device, typename T>
+class MklLeakyReluGradOp : public MklReluGradOpBase<Device, T, eltwise_relu> {
+ public:
+  ~MklLeakyReluGradOp() {}
+
+  explicit MklLeakyReluGradOp(OpKernelConstruction* context)
+      : MklReluGradOpBase<Device, T, eltwise_relu>(context, 0.0f, 0.0f) {
+    float alpha;
+    OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha));
+    OP_REQUIRES(
+        context, alpha <= 1,
+        errors::InvalidArgument("MKL LeakyRelu only supports alpha <= 1. "
+                                "alpha is: ",
+                                alpha));
+
+    this->alpha_ = alpha;
+  }
+
+  virtual void Compute_Scalar(OpKernelContext* context) {
+    const size_t diff_dst_index = 0;  // index of diff_dst input tensor
+    const size_t src_index = 1;       // index of src input tensor
+    const size_t diff_src_index = 0;  // index of diff_src output tensor
+    const Tensor& src_tensor = MklGetInput(context, src_index);
+    const Tensor& diff_dst_tensor = MklGetInput(context, diff_dst_index);
+    Tensor* diff_src_tensor = nullptr;
+
+    MklDnnShape dnn_shape_diff_dst;
+    GetMklShape(context, diff_dst_index, &dnn_shape_diff_dst);
+
+    MklDnnShape dnn_shape_diff_src;
+    dnn_shape_diff_src.SetMklTensor(false);
+    AllocateOutputSetMklShape(context, diff_src_index, &diff_src_tensor,
+                              diff_dst_tensor.shape(), dnn_shape_diff_src);
+    T* out_o = diff_src_tensor->flat<T>().data();
+    T* user_i = const_cast<T*>(src_tensor.flat<T>().data());
+    T* user_g = const_cast<T*>(diff_dst_tensor.flat<T>().data());
+    out_o[0] = user_i[0] >= 0 ? user_g[0] : user_g[0] * this->alpha_;
     return;
   }
 };
@@ -1376,6 +1457,19 @@ TF_CALL_float(REGISTER_TANH_MKL_SUPPORTED_KERNELS_TYPES);
                           MklRelu6GradOp<CPUDevice, type>);
 TF_CALL_float(REGISTER_RELU6_MKL_SUPPORTED_KERNELS_TYPES);
 
+#define REGISTER_LeakyRelu_MKL_SUPPORTED_KERNELS_TYPES(type)        \
+  REGISTER_KERNEL_BUILDER(Name("_MklLeakyRelu")                     \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<type>("T")            \
+                              .Label(mkl_op_registry::kMklOpLabel), \
+                          MklLeakyReluOp<CPUDevice, type>);         \
+  REGISTER_KERNEL_BUILDER(Name("_MklLeakyReluGrad")                 \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<type>("T")            \
+                              .Label(mkl_op_registry::kMklOpLabel), \
+                          MklLeakyReluGradOp<CPUDevice, type>);
+TF_CALL_float(REGISTER_LeakyRelu_MKL_SUPPORTED_KERNELS_TYPES);
+
 #endif
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/mkl_slice_op.cc b/tensorflow/core/kernels/mkl_slice_op.cc
index 85cabeb92b69653787ebeebd2eae4f17017063bc..e2cbeec2d2831b0dd18e325af71489ef7d8c03bc 100644
--- a/tensorflow/core/kernels/mkl_slice_op.cc
+++ b/tensorflow/core/kernels/mkl_slice_op.cc
@@ -59,9 +59,10 @@ gtl::InlinedVector<int64, 4> IntTensorToInt64Vec(const Tensor& tensor) {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
 // A version of SharedValidation (slice_op.h) written for input that is in
-// either Mkl layout or Tensorflow layout.
-// A shared code to validate input shapes and check for identity, which is not dependent on the type of T.
-// We do this to reduce code size by not duplicating all this for all T (float, double, int32, etc.)
+// either Mkl layout or Tensorflow layout. A shared code to validate input
+// shapes and check for identity, which is not dependent on the type of T.
+// We do this to reduce code size by not duplicating all this for all T
+// (float, double, int32, etc.)
 static void ValidateMklInputs(OpKernelContext* context, bool* is_identity,
                               gtl::InlinedVector<int64, 4>* begin,
                               gtl::InlinedVector<int64, 4>* size) {
@@ -157,13 +158,156 @@ static void CheckCommonCasesForMklInputs(OpKernelContext* context,
   }
 }
 
+// This structure aggregates multiple inputs to Slice methods.
+struct MklSliceParams {
+  // Parameters from & to represents memory pointing to reorder.
+  const memory* from;
+  const memory* to;
+
+  // Parameters begin_dims & size_dims represents offset and length
+  // passed to view primitive.
+  memory::dims begin_dims;
+  memory::dims size_dims;
+
+  MklSliceParams(const memory* from, const memory* to, memory::dims begin_dims,
+                 memory::dims size_dims)
+      : from(from), to(to), begin_dims(begin_dims), size_dims(size_dims) {}
+};
+
+// This implements the shared interface of Slice reorders.
+template <typename T>
+class MklSlicePrimitive : public MklPrimitive {
+ public:
+  explicit MklSlicePrimitive(const MklSliceParams& sliceParams) {
+    context_.slice_stream.reset(new stream(stream::kind::eager));
+    Setup(sliceParams);
+  }
+
+  ~MklSlicePrimitive() {}
+
+  void Execute(const MklSliceParams& sliceParams) {
+    context_.src_mem->set_data_handle(sliceParams.from->get_data_handle());
+    context_.dst_mem->set_data_handle(sliceParams.to->get_data_handle());
+    context_.slice_stream->submit(context_.slice_primitives);
+
+    // We should set it back to DummyData so as to make the primitive
+    // in cache pool stateless. Otherwise, if the result for previous
+    // iteration is kept, problems of current iteration won't be
+    // thrown immediately, and wrong data would be reused.
+    context_.src_mem->set_data_handle(DummyData);
+    context_.dst_mem->set_data_handle(DummyData);
+    return;
+  }
+
+  std::shared_ptr<primitive> GetPrimitive() { return context_.reorder_prim; }
+
+ private:
+  struct SliceContext {
+    std::shared_ptr<mkldnn::memory> src_mem;
+    std::shared_ptr<mkldnn::memory> dst_mem;
+    std::shared_ptr<primitive> reorder_prim;
+    std::shared_ptr<reorder::primitive_desc> reorder_pd;
+    std::shared_ptr<view::primitive_desc> view_pd;
+    std::shared_ptr<mkldnn::stream> slice_stream;
+    std::vector<mkldnn::primitive> slice_primitives;
+    SliceContext()
+        : src_mem(nullptr), dst_mem(nullptr), reorder_prim(nullptr) {}
+  } context_;
+
+  engine cpu_engine_ = engine(engine::cpu, 0);
+
+  void Setup(const MklSliceParams& sliceParams) {
+    // Actually, this DummyData will not be used in computation,
+    // because the real data will be filled before real execution.
+    context_.src_mem.reset(
+        new memory({sliceParams.from->get_primitive_desc().desc(), cpu_engine_},
+                   DummyData));
+    context_.dst_mem.reset(new memory(
+        {sliceParams.to->get_primitive_desc().desc(), cpu_engine_}, DummyData));
+    auto src_pd = context_.src_mem->get_primitive_desc();
+    auto dst_pd = context_.dst_mem->get_primitive_desc();
+    context_.view_pd =
+        std::make_shared<view::primitive_desc>(view::primitive_desc(
+            src_pd, sliceParams.size_dims, sliceParams.begin_dims));
+    context_.reorder_pd =
+        std::make_shared<reorder::primitive_desc>(reorder::primitive_desc(
+            context_.view_pd->dst_primitive_desc(), dst_pd));
+    context_.reorder_prim = std::make_shared<mkldnn::reorder>(
+        reorder(*context_.reorder_pd, *context_.src_mem, *context_.dst_mem));
+    context_.slice_primitives.push_back(*context_.reorder_prim);
+  }
+};
+
+template <typename T>
+class MklSlicePrimitiveFactory : public MklPrimitiveFactory<T> {
+ public:
+  static MklSlicePrimitive<T>* Get(const MklSliceParams& sliceParams) {
+    auto reorderPrim = static_cast<MklSlicePrimitive<T>*>(
+        MklSlicePrimitiveFactory<T>::GetInstance().GetReorder(sliceParams));
+    if (reorderPrim == nullptr) {
+      reorderPrim = new MklSlicePrimitive<T>(sliceParams);
+      MklSlicePrimitiveFactory<T>::GetInstance().SetReorder(sliceParams,
+                                                            reorderPrim);
+    }
+    return reorderPrim;
+  }
+
+  static MklSlicePrimitiveFactory& GetInstance() {
+    static MklSlicePrimitiveFactory instance_;
+    return instance_;
+  }
+
+ private:
+  MklSlicePrimitiveFactory() {}
+  ~MklSlicePrimitiveFactory() {}
+
+  static string CreateKey(const MklSliceParams& sliceParams) {
+    string prefix = "reorder";
+    FactoryKeyCreator key_creator;
+    auto const& from_desc = sliceParams.from->get_primitive_desc().desc().data;
+    auto const& to_desc = sliceParams.to->get_primitive_desc().desc().data;
+    const int kIdxFirstStride = 0;
+    memory::dims from_dims(from_desc.dims, &from_desc.dims[from_desc.ndims]);
+    memory::dims to_dims(to_desc.dims, &to_desc.dims[to_desc.ndims]);
+    memory::dims from_strides(
+        from_desc.layout_desc.blocking.strides[kIdxFirstStride],
+        &from_desc.layout_desc.blocking
+             .strides[kIdxFirstStride][from_desc.ndims]);
+    memory::dims to_strides(
+        to_desc.layout_desc.blocking.strides[kIdxFirstStride],
+        &to_desc.layout_desc.blocking.strides[kIdxFirstStride][to_desc.ndims]);
+    key_creator.AddAsKey(prefix);
+    key_creator.AddAsKey(static_cast<int>(from_desc.format));
+    key_creator.AddAsKey(static_cast<int>(from_desc.data_type));
+    key_creator.AddAsKey(from_dims);
+    key_creator.AddAsKey(from_strides);
+    key_creator.AddAsKey(static_cast<int>(to_desc.format));
+    key_creator.AddAsKey(static_cast<int>(to_desc.data_type));
+    key_creator.AddAsKey(to_dims);
+    key_creator.AddAsKey(to_strides);
+    key_creator.AddAsKey(sliceParams.begin_dims);
+    key_creator.AddAsKey(sliceParams.size_dims);
+    return key_creator.GetKey();
+  }
+
+  MklPrimitive* GetReorder(const MklSliceParams& sliceParams) {
+    string key = CreateKey(sliceParams);
+    return this->GetOp(key);
+  }
+
+  void SetReorder(const MklSliceParams& sliceParams, MklPrimitive* op) {
+    string key = CreateKey(sliceParams);
+    this->SetOp(key, op);
+  }
+};
+
 // MKL-DNN implementation of Slice
 template <typename Device, typename T>
-class MklDnnSliceOp : public OpKernel {
+class MklSliceOp : public OpKernel {
  public:
-  explicit MklDnnSliceOp(OpKernelConstruction* context) : OpKernel(context) {}
+  explicit MklSliceOp(OpKernelConstruction* context) : OpKernel(context) {}
 
-  ~MklDnnSliceOp() {}
+  ~MklSliceOp() {}
 
   void Compute(OpKernelContext* context) override {
     gtl::InlinedVector<int64, 4> begin;
@@ -179,17 +323,17 @@ class MklDnnSliceOp : public OpKernel {
     if (begin.size() >= 8) {
       OP_REQUIRES(
           context, false,
-          errors::Unimplemented("MklDnnSliceOp : Unhandled input dimensions"));
+          errors::Unimplemented("MklSliceOp : Unhandled input dimensions"));
     }
 
-    ComputeMklDnnSlice(context, begin, size);
+    ComputeMklSlice(context, begin, size);
   }
 
  private:
   // Slice op implemented using MKL-DNN APIs.
-  void ComputeMklDnnSlice(OpKernelContext* context,
-                          const gtl::InlinedVector<int64, 4>& begin,
-                          const gtl::InlinedVector<int64, 4>& size) {
+  void ComputeMklSlice(OpKernelContext* context,
+                       const gtl::InlinedVector<int64, 4>& begin,
+                       const gtl::InlinedVector<int64, 4>& size) {
     try {
       // MKL-DNN API usage below is guided by description at:
       //  https://github.com/01org/mkl-dnn/issues/69
@@ -200,16 +344,15 @@ class MklDnnSliceOp : public OpKernel {
       // probably change the format). Then your steps are:
       //
       // 1. create memory primitive descriptor in_mem_pd and memory primitive
-      //    in_mem_p for the entire source data.
-      // 2. create view primitive descriptor in_submem_pd based on in_mem_pd,
-      //    initial offsets, and sub-sizes
-      // 3. create memory primitive descriptor out_mem_pd and memory primitive
+      //    in_mem_p for the entire source data. create view primitive
+      //    descriptor in_submem_pd based on in_mem_pd, initial offsets,
+      //    and sub-sizes
+      // 2. create memory primitive descriptor out_mem_pd and memory primitive
       //    out_mem_p for the output (the logical sizes should match sub-sizes
-      //    used in step 2, but the format might be arbitrary)
-      // 4. create reorder primitive descriptor reorder_pd based on in_submem_pd
-      //    and out_mem_pd
-      // 5. create reorder primitive itself based on reorder_pd, in_mem_p, and
-      //    out_mem_p.
+      //    used in step 1, but the format might be arbitrary)
+      // 3. create reorder primitive descriptor reorder_pd based on in_submem_pd
+      //    and out_mem_pd. create reorder primitive itself based on reorder_pd,
+      //    in_mem_p, and out_mem_p.
       //
       // Please notice that there is no view primitive. There is only view
       // primitive descriptor. And the reorder uses source memory as input but
@@ -268,32 +411,24 @@ class MklDnnSliceOp : public OpKernel {
         src.SetUsrMem(input_md, &input_tensor);
       }
 
-      // Step 2 - create view primitive descriptor
-      auto view_pd =
-          view::primitive_desc(src.GetUsrMemPrimDesc(), size_dims, begin_dims)
-              .dst_primitive_desc();
+      // Step 2 - Create memory for output.
       auto output_strides = CalculateTFStrides(size_dims);
       auto output_md =
           MklDnnData<T>::CreateBlockedMemDesc(size_dims, output_strides);
       auto output_pd = memory::primitive_desc(output_md, cpu_engine);
-
-      // Step 3 - Create memory for output. If input is in MklDnn layout, then
-      // output is also in MklDnn layout. Otherwise, output is in Tensorflow
-      // layout.
       AllocateOutputTensor(context, input_mkl_shape, &output_pd, size_dims,
                            &output_tensor, &output_mkl_shape);
       DCHECK(output_tensor);
       DCHECK_EQ(input_mkl_shape.IsMklTensor(), output_mkl_shape.IsMklTensor());
       output.SetUsrMem(output_md, output_tensor);
 
-      std::vector<primitive> net;
-      // Step 4 - create reorder primitive desc between view_pd and output_pd.
-      auto reorder_pd =
-          reorder::primitive_desc(view_pd, output.GetUsrMemPrimDesc());
-      // Step 5 - create reorder primitive itself.
-      net.push_back(reorder(reorder_pd, *src.GetUsrMem(), *output.GetUsrMem()));
-      // Execute the reorder primitive.
-      stream(stream::kind::eager).submit(net).wait();
+      // Step 3 - create reorder primitive.
+      MklSliceParams sliceParams(src.GetUsrMem(), output.GetUsrMem(),
+                                 begin_dims, size_dims);
+      MklSlicePrimitive<T>* reorder_prim =
+          MklSlicePrimitiveFactory<T>::Get(sliceParams);
+      // Execute slice reorder.
+      reorder_prim->Execute(sliceParams);
     } catch (mkldnn::error& e) {
       string error_msg = "Status: " + std::to_string(e.status) + ", message: " +
                          string(e.message) + ", in file " + string(__FILE__) +
@@ -347,7 +482,7 @@ class MklDnnSliceOp : public OpKernel {
                               .HostMemory("begin")                  \
                               .HostMemory("size")                   \
                               .Label(mkl_op_registry::kMklOpLabel), \
-                          MklDnnSliceOp<CPUDevice, type>);
+                          MklSliceOp<CPUDevice, type>);
 
 TF_CALL_float(REGISTER_MKL_SLICE);
 #undef REGISTER_MKL_SLICE
diff --git a/tensorflow/core/kernels/mkl_softmax_op.cc b/tensorflow/core/kernels/mkl_softmax_op.cc
index 094129ae3efe87e070f8a27c8584f67c927bbec3..dc3ae3d93471e3af78da63a3fcbaa51644163aa2 100644
--- a/tensorflow/core/kernels/mkl_softmax_op.cc
+++ b/tensorflow/core/kernels/mkl_softmax_op.cc
@@ -50,8 +50,6 @@ class MklSoftmaxOp : public OpKernel {
       // src_tensor now points to the 0-th input of global data struct "context"
       size_t src_idx = 0;
       const Tensor& src_tensor = MklGetInput(context, src_idx);
-      const int input_dims = src_tensor.dims();
-
       // Add: get MklShape
       MklDnnShape src_mkl_shape;
       GetMklShape(context, src_idx, &src_mkl_shape);
@@ -61,15 +59,27 @@ class MklSoftmaxOp : public OpKernel {
       auto src_tf_shape = src_mkl_shape.IsMklTensor()
                               ? src_mkl_shape.GetTfShape()
                               : src_tensor.shape();
+      const int input_dims = src_tf_shape.dims();
       auto src_dims = TFShapeToMklDnnDims(src_tf_shape);
-      auto output_dims = src_dims;
+      memory::dims output_dims;
+      int axis;
+      if (src_mkl_shape.IsMklTensor()) {
+        axis = 1;
+        output_dims = src_mkl_shape.GetSizesAsMklDnnDims();
+      } else {
+        axis = input_dims - 1;
+        output_dims = src_dims;
+      }
       memory::format layout_type;
       // In MKL, data format passed to mkl softmax op depends on dimension of
       // the input tensor. Here "x" data format in MKL is used for 1 dim tensor,
       // "nc" for 2 dim tensor, "tnc" for 3 dim tensor, "nchw" for 4 dim tensor,
-      // and "ncdhw" for 5 dim tensor. Each of the simbols has the following
+      // and "ncdhw" for 5 dim tensor. Each of the symbols has the following
       // meaning: n = batch, c = channels, t = sequence length, h = height, w =
-      // width, d = depth
+      // width, d = depth. When src tensor is MKL, layout_type here is only used
+      // for setting TF layout type of output tensor. When input is TF Tensor,
+      // layout here is no special sense. We use axis to define on which
+      // dimension to do softmax.
       switch (input_dims) {
         case 1:
           layout_type = memory::format::x;
@@ -81,13 +91,22 @@ class MklSoftmaxOp : public OpKernel {
           layout_type = memory::format::tnc;
           break;
         case 4:
-          layout_type = memory::format::nchw;
+          if (src_mkl_shape.IsMklTensor()) {
+            layout_type = memory::format::nhwc;
+          } else {
+            layout_type = memory::format::nchw;
+          }
           break;
         case 5:
-          layout_type = memory::format::ncdhw;
+          if (src_mkl_shape.IsMklTensor()) {
+            layout_type = memory::format::ndhwc;
+          } else {
+            layout_type = memory::format::ncdhw;
+          }
           break;
         default:
-          OP_REQUIRES_OK(context, errors::Aborted("Input dims must be <= 5 and >=1"));
+          OP_REQUIRES_OK(context,
+                         errors::Aborted("Input dims must be <= 5 and >=1"));
           return;
       }
       // Create softmax memory for src, dst: both are defined in mkl_util.h,
@@ -99,25 +118,17 @@ class MklSoftmaxOp : public OpKernel {
       // construct input Tf layout. For TF layout, although input shape
       // (src_dims) required is in MKL-DNN order, the layout is Tensorflow's
       // layout
-      auto src_md =
-          src_mkl_shape.IsMklTensor()
-              ? src_mkl_shape.GetMklLayout()
-              : memory::desc(src_dims, MklDnnType<T>(), layout_type);
-
-      // src: setting memory descriptor and op memory descriptor
-      // Basically following two functions maps the TF "src_tensor" to mkl
-      // tensor object "src"
+      auto src_md = src_mkl_shape.IsMklTensor()
+                        ? src_mkl_shape.GetMklLayout()
+                        : memory::desc(src_dims, MklDnnType<T>(), layout_type);
+
+      // src: setting memory descriptor
       // following functions are in mkl_util.h
-      // data format is "nc" for src and dst; since the src and dst buffer is
-      // always in 2D shape
       src.SetUsrMem(src_md, &src_tensor);
-      src.SetOpMemDesc(src_dims, layout_type);
 
       // creating a memory descriptor
-      // passing outermost dim as default axis, where the softmax is applied
-      int axis = input_dims - 1;
       auto softmax_fwd_desc = softmax_forward::desc(prop_kind::forward_scoring,
-                                                    src.GetOpMemDesc(), axis);
+                                                    src.GetUsrMemDesc(), axis);
       auto softmax_fwd_pd =
           softmax_forward::primitive_desc(softmax_fwd_desc, cpu_engine);
 
diff --git a/tensorflow/core/kernels/partitioned_function_ops.cc b/tensorflow/core/kernels/partitioned_function_ops.cc
index 6c90ffd75e5d3021e68662818fa2d31ce4911d07..fbecd909beacd88d80384a259345727981b64b6c 100644
--- a/tensorflow/core/kernels/partitioned_function_ops.cc
+++ b/tensorflow/core/kernels/partitioned_function_ops.cc
@@ -184,12 +184,6 @@ class PartitionedCallOp : public AsyncOpKernel {
             OptimizationPassRegistry::Global()->RunGrouping(
                 OptimizationPassRegistry::POST_PLACEMENT, optimization_options),
             done);
-        OP_REQUIRES_OK_ASYNC(
-            ctx,
-            OptimizationPassRegistry::Global()->RunGrouping(
-                OptimizationPassRegistry::POST_REWRITE_FOR_EXEC,
-                optimization_options),
-            done);
 
         Device* cpu_device;
         OP_REQUIRES_OK_ASYNC(
@@ -197,10 +191,19 @@ class PartitionedCallOp : public AsyncOpKernel {
 
         // Run grappler passes on the graph. It is possible that these are
         // optimized by the graph executor already.
-        OP_REQUIRES_OK_ASYNC(ctx,
-                             OptimizeGraph(ctx, fbody->ret_nodes, overlay_lib,
-                                           device_set, cpu_device, &graph),
-                             done);
+        Status optimized = OptimizeGraph(ctx, fbody->ret_nodes, overlay_lib,
+                                         device_set, cpu_device, &graph);
+        if (!optimized.ok()) {
+          LOG(WARNING) << "Grappler optimization failed. Error: "
+                       << optimized.error_message();
+        }
+
+        OP_REQUIRES_OK_ASYNC(
+            ctx,
+            OptimizationPassRegistry::Global()->RunGrouping(
+                OptimizationPassRegistry::POST_REWRITE_FOR_EXEC,
+                optimization_options),
+            done);
 
         std::unordered_map<string, std::unique_ptr<Graph>> subgraphs;
         OP_REQUIRES_OK_ASYNC(
@@ -531,6 +534,12 @@ class PartitionedCallOp : public AsyncOpKernel {
 
     tensorflow::grappler::GrapplerItem item;
 
+    // Add all available devices so that inlined function can be placed.
+    for (const Device* d : device_set.devices()) {
+      Status added_device = item.AddDevice(d->name());
+      if (!added_device.ok()) VLOG(3) << added_device.error_message();
+    }
+
     // Add fetches so that the graph can be pruned.
     for (Node* node : ret_nodes) {
       item.fetch.push_back(node->name());
diff --git a/tensorflow/core/kernels/resource_variable_ops.cc b/tensorflow/core/kernels/resource_variable_ops.cc
index 170b08b4b7f6c8a6842dd12ad7389900b2d83b86..4167b6005194409d780b3698fda688728a50b3cc 100644
--- a/tensorflow/core/kernels/resource_variable_ops.cc
+++ b/tensorflow/core/kernels/resource_variable_ops.cc
@@ -55,6 +55,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/strings/str_join.h"
+#include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/resource_mgr.h"
@@ -84,6 +85,47 @@ ReadVariableOp::ReadVariableOp(OpKernelConstruction* c) : OpKernel(c) {
   OP_REQUIRES_OK(c, c->GetAttr("dtype", &dtype_));
 }
 
+namespace {
+Status CopyVariable(int output_idx, OpKernelContext* ctx, const Tensor* t) {
+  Tensor* output;
+  Notification n;
+  Status status;
+  AllocatorAttributes attr;
+  if (t->dtype() == DT_VARIANT) {
+    attr.set_on_host(true);
+  }
+  TF_RETURN_IF_ERROR(
+      ctx->allocate_output(output_idx, t->shape(), &output, attr));
+  if (t->dtype() == DT_VARIANT) {
+    output->flat<Variant>() = t->flat<Variant>();
+  } else if (ctx->op_device_context() != nullptr) {
+    // TODO(apassos): remove the down_cast by just returning Device* from
+    // OpKernelContext
+    Device* device = static_cast<Device*>(ctx->device());
+    ctx->op_device_context()->CopyTensorInSameDevice(
+        t, device, output, [&n, &status](const Status& s) {
+          status = s;
+          n.Notify();
+        });
+    n.WaitForNotification();
+    return status;
+  } else {
+    switch (t->dtype()) {
+#define HANDLER(type)                       \
+  case DataTypeToEnum<type>::value:         \
+    output->flat<type>() = t->flat<type>(); \
+    break;
+      TF_CALL_ALL_TYPES(HANDLER);
+#undef HANDLER
+      default:
+        return errors::Internal("Unsupported dtype", t->dtype());
+    }
+  }
+  return Status::OK();
+}
+
+}  // namespace
+
 void ReadVariableOp::Compute(OpKernelContext* ctx) {
   Var* variable = nullptr;
   const ResourceHandle& handle = HandleFromInput(ctx, 0);
@@ -100,12 +142,16 @@ void ReadVariableOp::Compute(OpKernelContext* ctx) {
   // holding a shared lock to guarantee ordering of reads and
   // writes.
   tf_shared_lock ml(*variable->mu());
-  const Tensor& t = *variable->tensor();
-  OP_REQUIRES(ctx, dtype_ == t.dtype(),
+  const Tensor* t = variable->tensor();
+  OP_REQUIRES(ctx, dtype_ == t->dtype(),
               errors::InvalidArgument(
                   "Trying to read variable with wrong dtype. Expected ",
-                  DataTypeString(dtype_), " got ", DataTypeString(t.dtype())));
-  ctx->set_output(0, t);
+                  DataTypeString(dtype_), " got ", DataTypeString(t->dtype())));
+  if (variable->copy_on_read_mode.load()) {
+    OP_REQUIRES_OK(ctx, CopyVariable(0, ctx, t));
+  } else {
+    ctx->set_output(0, *t);
+  }
 }
 
 ReadVariablesOp::ReadVariablesOp(OpKernelConstruction* c) : OpKernel(c) {
@@ -146,14 +192,18 @@ void ReadVariablesOp::Compute(OpKernelContext* ctx) {
     // holding a shared lock to guarantee ordering of reads and
     // writes.
     tf_shared_lock ml(*variables[i]->mu());
-    const Tensor& t = *variables[i]->tensor();
-    OP_REQUIRES(ctx, dtypes_[i] == t.dtype(),
+    OP_REQUIRES(ctx, dtypes_[i] == variables[i]->tensor()->dtype(),
                 errors::InvalidArgument(
                     "Trying to read variable ", handles[i]->name(),
                     " from Container: ", handles[i]->container(),
                     " with wrong dtype. Expected ", DataTypeString(dtypes_[i]),
-                    " got ", DataTypeString(t.dtype())));
-    ctx->set_output(i, t);
+                    " got ", DataTypeString(variables[i]->tensor()->dtype())));
+    if (variables[i]->copy_on_read_mode.load()) {
+      OP_REQUIRES_OK(ctx, CopyVariable(i, ctx, variables[i]->tensor()));
+    } else {
+      const Tensor& t = *variables[i]->tensor();
+      ctx->set_output(i, t);
+    }
   }
 }
 
@@ -308,8 +358,23 @@ class AssignVariableOp : public OpKernel {
                     "Trying to assign variable with wrong dtype. Expected ",
                     DataTypeString(variable->tensor()->dtype()), " got ",
                     DataTypeString(dtype_)));
+    if (variable->copy_on_read_mode.load()) {
+      PersistentTensor unused;
+      Tensor* tmp;
+      AllocatorAttributes attr;
+      attr.set_gpu_compatible(true);
+      attr.set_nic_compatible(true);
+      OP_REQUIRES_OK(context,
+                     context->allocate_persistent(value.dtype(), value.shape(),
+                                                  &unused, &tmp, attr));
+      functor::DenseUpdate<Device, T, ASSIGN> copy_functor;
+      copy_functor(context->eigen_device<Device>(), tmp->flat<T>(),
+                   value.flat<T>());
+      *variable->tensor() = *tmp;
+    } else {
+      *variable->tensor() = value;
+    }
     variable->is_initialized = true;
-    *variable->tensor() = value;
   }
 
  private:
@@ -442,8 +507,9 @@ class AssignUpdateVariableOp : public OpKernel {
                                         " using a Tensor with shape ",
                                         value.shape().DebugString(),
                                         ", shapes must be equal."));
-    OP_REQUIRES_OK(context,
-                   PrepareToUpdateVariable<Device, T>(context, var_tensor));
+    OP_REQUIRES_OK(
+        context, PrepareToUpdateVariable<Device, T>(
+                     context, var_tensor, variable->copy_on_read_mode.load()));
     functor::DenseUpdate<Device, T, Op> update_functor;
     update_functor(context->eigen_device<Device>(), var_tensor->flat<T>(),
                    value.flat<T>());
@@ -524,6 +590,7 @@ class ResourceGatherOp : public OpKernel {
     Var* v = nullptr;
     OP_REQUIRES_OK(c, LookupResource(c, HandleFromInput(c, 0), &v));
     core::ScopedUnref su(v);
+    OP_REQUIRES_OK(c, EnsureSparseVariableAccess<Device, T>(c, v));
     // NOTE: We hold the lock for the whole gather operation instead
     // of increasing the reference count of v->tensor() to avoid a
     // situation where a write to the same variable will see a
@@ -639,9 +706,9 @@ class ResourceScatterUpdateOp : public OpKernel {
     Var* v = nullptr;
     OP_REQUIRES_OK(c, LookupResource(c, HandleFromInput(c, 0), &v));
     core::ScopedUnref unref_v(v);
-    mutex_lock ml(*v->mu());
+    OP_REQUIRES_OK(c, EnsureSparseVariableAccess<Device, T>(c, v));
+    tf_shared_lock ml(*v->mu());
     Tensor* params = v->tensor();
-    OP_REQUIRES_OK(c, PrepareToUpdateVariable<Device, T>(c, params));
     const Tensor& indices = c->input(1);
     const Tensor& updates = c->input(2);
 
diff --git a/tensorflow/core/kernels/scatter_nd_op.cc b/tensorflow/core/kernels/scatter_nd_op.cc
index 63bb793fdcb7eb20daeee1708cb4ba78274cb9f7..b466e572495ae709d0fb05d58d964ee358077558 100644
--- a/tensorflow/core/kernels/scatter_nd_op.cc
+++ b/tensorflow/core/kernels/scatter_nd_op.cc
@@ -231,6 +231,7 @@ class ScatterNdUpdateOp : public OpKernel {
       Var* v;
       OP_REQUIRES_OK(c, LookupResource(c, HandleFromInput(c, 0), &v));
       core::ScopedUnref scoped_unref(v);
+      OP_REQUIRES_OK(c, EnsureSparseVariableAccess<Device, T>(c, v));
       mutex_lock m(*v->mu());
       DoCompute(c);
     } else if (use_exclusive_lock_) {
@@ -258,7 +259,6 @@ class ScatterNdUpdateOp : public OpKernel {
       Var* v;
       OP_REQUIRES_OK(c, LookupResource(c, HandleFromInput(c, 0), &v));
       Tensor* t = v->tensor();
-      OP_REQUIRES_OK(c, PrepareToUpdateVariable<Device, T>(c, t));
       params = *t;
       params_shape = params.shape();
     } else if (IsRefType(c->input_dtype(0))) {
diff --git a/tensorflow/core/kernels/scatter_op.cc b/tensorflow/core/kernels/scatter_op.cc
index 0fbde764d57eb661314b699ef9902238ad38b2cf..ee3c5833470eca54121ab73209e484578b42149e 100644
--- a/tensorflow/core/kernels/scatter_op.cc
+++ b/tensorflow/core/kernels/scatter_op.cc
@@ -288,7 +288,7 @@ TF_CALL_ALL_TYPES(REGISTER_SCATTER_UPDATE_CPU);
 #define REGISTER_SCATTER_UPDATE_GPU(type) REGISTER_SCATTER_UPDATE(type, GPU);
 
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ARITHMETIC_GPU);
-TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_MINMAX_GPU);
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_SCATTER_MINMAX_GPU);
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_UPDATE_GPU);
 
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/scatter_op_gpu.cu.cc b/tensorflow/core/kernels/scatter_op_gpu.cu.cc
index 0df329310f0dc51bbe91b784a40fd7bf68b012f0..d4defb8503679f3b2b6d479719f1378bd53cff19 100644
--- a/tensorflow/core/kernels/scatter_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/scatter_op_gpu.cu.cc
@@ -41,6 +41,7 @@ typedef Eigen::GpuDevice GPUDevice;
   DEFINE_GPU_SPECS_INDEX(T, int32); \
   DEFINE_GPU_SPECS_INDEX(T, int64);
 
+DEFINE_GPU_SPECS(Eigen::half);
 DEFINE_GPU_SPECS(float);
 DEFINE_GPU_SPECS(double);
 // TODO: The following fails to compile.
diff --git a/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc b/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc
index ac48202ada2204ea36478257630f20f7892be50b..a4e89f439ed9f5711253924ad120f7a6751e1728 100644
--- a/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc
+++ b/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc
@@ -88,12 +88,12 @@ class SparseDenseBinaryOpShared : public OpKernel {
     const auto rhs_dims = BCast::FromShape(dense_t->shape());
     BCast b(lhs_dims, rhs_dims, false);  // false for keeping the same num dims.
 
-    // True iff (size(lhs) > size(rhs)), or (sizes equal, lhs cwise rhs).
+    // True iff (size(lhs) >= size(rhs)) and all dims in lhs is greater or equal
+    // to dims in rhs (from right to left).
     auto VecGreaterEq = [](ArraySlice<int64> lhs, ArraySlice<int64> rhs) {
-      if (lhs.size() > rhs.size()) return true;
       if (lhs.size() < rhs.size()) return false;
-      for (size_t i = 0; i < lhs.size(); ++i) {
-        if (lhs[i] < rhs[i]) return false;
+      for (size_t i = 0; i < rhs.size(); ++i) {
+        if (lhs[lhs.size() - 1 - i] < rhs[rhs.size() - 1 - i]) return false;
       }
       return true;
     };
diff --git a/tensorflow/core/kernels/strided_slice_op.cc b/tensorflow/core/kernels/strided_slice_op.cc
index 70a7ddbd0643e88655e1c0e1ad197316078267de..6db68f937def6fb4827b7fc85bff873b651a0002 100644
--- a/tensorflow/core/kernels/strided_slice_op.cc
+++ b/tensorflow/core/kernels/strided_slice_op.cc
@@ -307,9 +307,9 @@ class StridedSliceAssignOp : public OpKernel {
       OP_REQUIRES_OK(context,
                      LookupResource(context, HandleFromInput(context, 0), &v));
       core::ScopedUnref scoped_unref(v);
-      mutex_lock ml(*v->mu());
       OP_REQUIRES_OK(context,
-                     PrepareToUpdateVariable<Device, T>(context, v->tensor()));
+                     EnsureSparseVariableAccess<Device, T>(context, v));
+      mutex_lock ml(*v->mu());
       old_lhs = v->tensor();
       OP_REQUIRES(context, old_lhs->dtype() == DataTypeToEnum<T>::value,
                   errors::InvalidArgument(
diff --git a/tensorflow/core/kernels/summary_image_op.cc b/tensorflow/core/kernels/summary_image_op.cc
index 29b21ee7353fe03ce87bc03dad72b05ca8fd4311..68f17c2e78d53ade46dead0bf040967cd2957bb1 100644
--- a/tensorflow/core/kernels/summary_image_op.cc
+++ b/tensorflow/core/kernels/summary_image_op.cc
@@ -78,6 +78,11 @@ class SummaryImageOp : public OpKernel {
     const int hw = h * w;  // Compact these two dims for simplicity
     const int depth = static_cast<int>(tensor.dim_size(3));
 
+    OP_REQUIRES(c, hw > 0 && depth > 0,
+                errors::InvalidArgument(
+                    "input tensor must have non-zero dims. Found: [",
+                    batch_size, ", ", h, ", ", w, ", ", depth, "]."));
+
     Summary s;
     if (tensor.dtype() == DT_UINT8) {
       // For uint8 input, no normalization is necessary
diff --git a/tensorflow/core/kernels/tile_ops.cc b/tensorflow/core/kernels/tile_ops.cc
index d714876bdaa964a35c9f011e34b6ec1d7b962ce7..b9b37612ad569fa8c23f4bb06d641a8c9215383d 100644
--- a/tensorflow/core/kernels/tile_ops.cc
+++ b/tensorflow/core/kernels/tile_ops.cc
@@ -325,6 +325,7 @@ class TileGradientOp : public OpKernel {
     TF_CALL_int16(HANDLE_TYPE_NAME);
     TF_CALL_int64(HANDLE_TYPE_NAME);
     TF_CALL_half(HANDLE_TYPE_NAME);
+    TF_CALL_bfloat16(HANDLE_TYPE_NAME);
     TF_CALL_complex64(HANDLE_TYPE_NAME);
     TF_CALL_complex128(HANDLE_TYPE_NAME);
 
diff --git a/tensorflow/core/kernels/training_op_helpers.cc b/tensorflow/core/kernels/training_op_helpers.cc
index 4262a5404b6ac233d0fe7a8453e3e875eb9caf1f..20c08cf8fbb6b911c8b89b719237ac4677151e3c 100644
--- a/tensorflow/core/kernels/training_op_helpers.cc
+++ b/tensorflow/core/kernels/training_op_helpers.cc
@@ -19,70 +19,6 @@ limitations under the License.
 
 namespace tensorflow {
 
-mutex* GetTrainingVariableMutex(OpKernelContext* ctx, int input,
-                                Var** maybe_resource) {
-  *maybe_resource = nullptr;
-  if (ctx->input_dtype(input) == DT_RESOURCE) {
-    if (LookupResource(ctx, HandleFromInput(ctx, input), maybe_resource).ok()) {
-      return (*maybe_resource)->mu();
-    } else {
-      ctx->CtxFailureWithWarning(
-          errors::Internal("Invalid variable reference."));
-      return nullptr;
-    }
-  }
-  return ctx->input_ref_mutex(input);
-}
-
-// MaybeLockVariableInputMutexesInOrder is a helper function to acquire mutexes
-// in address order to mitigate deadlock.  Returns a structure that, when
-// deleted, will release the acquired mutexes. Safe to pass duplicates - will
-// only lock each distinct mutex once.  If do_lock is false, returns
-// immediately.  Note that this silently doesn't lock mutexes for invalid
-// variable references; in all usages this is followed by GetInputTensor which
-// will signal a failure.
-VariableInputLockHolder MaybeLockVariableInputMutexesInOrder(
-    OpKernelContext* ctx, bool do_lock, const std::vector<int>& input_ids) {
-  bool any_resource = false;
-  for (auto i : input_ids) {
-    if (ctx->input_dtype(i) == DT_RESOURCE) {
-      any_resource = true;
-      break;
-    }
-  }
-  if (!do_lock && !any_resource) {
-    return VariableInputLockHolder({}, {});
-  }
-  std::vector<Var*> vars;
-  std::vector<mutex*> mutexes;
-  std::vector<int> acquire_order;
-  for (auto input : input_ids) {
-    Var* var;
-    mutex* mutex = GetTrainingVariableMutex(ctx, input, &var);
-    if (var) vars.push_back(var);
-    // Only lock each mutex once if duplicates exist (n^2 but n is 2 or 3).
-    if (std::find(mutexes.begin(), mutexes.end(), mutex) == mutexes.end()) {
-      acquire_order.push_back(mutexes.size());
-      mutexes.push_back(mutex);
-    }
-  }
-  std::sort(acquire_order.begin(), acquire_order.end(),
-            [&mutexes](int a, int b) { return mutexes[a] < mutexes[b]; });
-
-  std::unique_ptr<std::vector<mutex_lock>> locks =
-      MakeUnique<std::vector<mutex_lock>>();
-  locks->reserve(acquire_order.size());
-
-  for (auto input : acquire_order) {
-    Var* var;
-    mutex* mu = GetTrainingVariableMutex(ctx, input, &var);
-    core::ScopedUnref scoped_unref(var);
-    if (mu != nullptr) {
-      locks->emplace_back(*mu);
-    }
-  }
-  return VariableInputLockHolder(std::move(vars), std::move(locks));
-}
 
 void MaybeForwardRefInputToRefOutput(OpKernelContext* ctx, int input,
                                      int output) {
diff --git a/tensorflow/core/kernels/training_op_helpers.h b/tensorflow/core/kernels/training_op_helpers.h
index 9f173a80f74612beaa4da265658eafb5b9e92360..715dd8af7daa1d31587a0efe5965025461231ec4 100644
--- a/tensorflow/core/kernels/training_op_helpers.h
+++ b/tensorflow/core/kernels/training_op_helpers.h
@@ -17,30 +17,72 @@ limitations under the License.
 #define TENSORFLOW_CORE_KERNELS_TRAINING_OP_HELPERS_H_
 
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/variant_op_registry.h"
 #include "tensorflow/core/kernels/dense_update_functor.h"
 #include "tensorflow/core/kernels/variable_ops.h"
 
 namespace tensorflow {
 
-// Returns a borrowed pointer to the mutex for the variable `input` in `ctx`.
-//
-// If `input` corresponds to a `DT_RESOURCE`-type variable input,
-// `*maybe_resource` will be updated to contain the underlying resource, and the
-// caller will be responsible for calling `Unref()` on that resource.
-mutex* GetTrainingVariableMutex(OpKernelContext* ctx, int input,
-                                Var** maybe_resource);
+// Must be called before performing a sparse operation on a variable. Ensures
+// that no concurrent dense operations can happen while holding the variable's
+// lock.
+template <typename Device, typename T>
+Status EnsureSparseVariableAccess(OpKernelContext* ctx, Var* var) {
+  if (var->copy_on_read_mode.load()) {
+    return Status::OK();
+  }
+  mutex_lock ml(*var->mu());
+  // Once copy-on-read mode is True the refcount is guaranteed to be 1. This can
+  // also happen if there are no concurrent reads of the variable and
+  // copy-on-read mode is false.
+  if (var->tensor()->RefCountIsOne()) {
+    var->copy_on_read_mode.store(true);
+    return Status::OK();
+  }
+  PersistentTensor unused;
+  Tensor* tmp;
+  if (std::is_same<T, Variant>::value) {
+    AllocatorAttributes attr;
+    attr.set_on_host(true);
+    TF_RETURN_IF_ERROR(ctx->allocate_persistent(
+        var->tensor()->dtype(), var->tensor()->shape(), &unused, &tmp, attr));
+
+    const auto elements_in = var->tensor()->flat<Variant>();
+    auto elements_out = tmp->flat<Variant>();
+    for (int64 i = 0; i < elements_in.size(); ++i) {
+      elements_out(i) = elements_in(i);
+    }
+  } else {
+    AllocatorAttributes attr;
+    attr.set_gpu_compatible(true);
+    attr.set_nic_compatible(true);
+    TF_RETURN_IF_ERROR(ctx->allocate_persistent(
+        var->tensor()->dtype(), var->tensor()->shape(), &unused, &tmp, attr));
+    functor::DenseUpdate<Device, T, ASSIGN> copy_functor;
+    copy_functor(ctx->eigen_device<Device>(), tmp->flat<T>(),
+                 const_cast<const Tensor*>(var->tensor())->flat<T>());
+  }
+  *var->tensor() = *tmp;
+  var->copy_on_read_mode.store(true);
+  return Status::OK();
+}
 
 // Utility structure that releases a sequence of borrowed mutexes when it is
 // deleted.
 struct VariableInputLockHolder {
  public:
-  VariableInputLockHolder(std::vector<Var*> vars,
-                          std::unique_ptr<std::vector<mutex_lock>> locks)
-      : vars_(std::move(vars)), locks_(std::move(locks)) {}
+  VariableInputLockHolder(
+      std::vector<Var*> vars, std::unique_ptr<std::vector<mutex_lock>> locks,
+      std::unique_ptr<std::vector<tf_shared_lock>> shared_locks)
+      : vars_(std::move(vars)),
+        locks_(std::move(locks)),
+        shared_locks_(std::move(shared_locks)) {}
 
   VariableInputLockHolder(VariableInputLockHolder&& other)
-      : vars_(std::move(other.vars_)), locks_(std::move(other.locks_)) {}
+      : vars_(std::move(other.vars_)),
+        locks_(std::move(other.locks_)),
+        shared_locks_(std::move(other.shared_locks_)) {}
 
   ~VariableInputLockHolder() {
     // Release the locks before unreffing the Vars, because each lock
@@ -56,10 +98,96 @@ struct VariableInputLockHolder {
   // NOTE: Use a `std::unique_ptr` instead of moving in a vector directly,
   // because a `std::vector<mutex_lock>` is not movable on all platforms.
   std::unique_ptr<std::vector<mutex_lock>> locks_;
+  std::unique_ptr<std::vector<tf_shared_lock>> shared_locks_;
 };
 
+// Returns a borrowed pointer to the mutex for the variable `input` in `ctx`.
+//
+// If `input` corresponds to a `DT_RESOURCE`-type variable input,
+// `*maybe_resource` will be updated to contain the underlying resource, and the
+// caller will be responsible for calling `Unref()` on that resource.
+template <typename Device, typename T>
+mutex* GetTrainingVariableMutex(OpKernelContext* ctx, int input, bool sparse,
+                                Var** maybe_resource) {
+  *maybe_resource = nullptr;
+  if (ctx->input_dtype(input) == DT_RESOURCE) {
+    if (LookupResource(ctx, HandleFromInput(ctx, input), maybe_resource).ok()) {
+      if (sparse) {
+        EnsureSparseVariableAccess<Device, T>(ctx, *maybe_resource)
+            .IgnoreError();
+      }
+      return (*maybe_resource)->mu();
+    } else {
+      ctx->CtxFailureWithWarning(
+          errors::Internal("Invalid variable reference."));
+      return nullptr;
+    }
+  }
+  return ctx->input_ref_mutex(input);
+}
+
+// MaybeLockVariableInputMutexesInOrder is a helper function to acquire mutexes
+// in address order to mitigate deadlock.  Returns a structure that, when
+// deleted, will release the acquired mutexes. Safe to pass duplicates - will
+// only lock each distinct mutex once. If sparse is true will ensure the
+// variable gets switched to copy-on-read mode before trying to acquire the
+// locks. If do_lock is false, returns immediately for reference variables. For
+// resource variables in copy-on-read-mode it will grab a shared lock if do_lock
+// is false, exclusive lock otherwise.  Note that this silently doesn't lock
+// mutexes for invalid variable references; in all usages this is followed by
+// GetInputTensor which will signal a failure.
+template <typename Device, typename T>
 VariableInputLockHolder MaybeLockVariableInputMutexesInOrder(
-    OpKernelContext* ctx, bool do_lock, const std::vector<int>& input_ids);
+    OpKernelContext* ctx, bool do_lock, bool sparse,
+    const std::vector<int>& input_ids) {
+  bool any_resource = false;
+  for (auto i : input_ids) {
+    if (ctx->input_dtype(i) == DT_RESOURCE) {
+      any_resource = true;
+      break;
+    }
+  }
+  if (!do_lock && !any_resource) {
+    return VariableInputLockHolder({}, {}, {});
+  }
+  std::vector<Var*> vars;
+  std::vector<mutex*> mutexes;
+  std::vector<int> acquire_order;
+  for (auto input : input_ids) {
+    Var* var;
+    mutex* mutex =
+        GetTrainingVariableMutex<Device, T>(ctx, input, sparse, &var);
+    if (var) vars.push_back(var);
+    // Only lock each mutex once if duplicates exist (n^2 but n is 2 or 3).
+    if (std::find(mutexes.begin(), mutexes.end(), mutex) == mutexes.end()) {
+      acquire_order.push_back(mutexes.size());
+      mutexes.push_back(mutex);
+    }
+  }
+  std::sort(acquire_order.begin(), acquire_order.end(),
+            [&mutexes](int a, int b) { return mutexes[a] < mutexes[b]; });
+
+  std::unique_ptr<std::vector<mutex_lock>> locks =
+      absl::make_unique<std::vector<mutex_lock>>();
+  std::unique_ptr<std::vector<tf_shared_lock>> shared_locks =
+      absl::make_unique<std::vector<tf_shared_lock>>();
+  locks->reserve(acquire_order.size());
+
+  for (auto input : acquire_order) {
+    Var* var;
+    mutex* mu = GetTrainingVariableMutex<Device, T>(ctx, input, sparse, &var);
+    core::ScopedUnref scoped_unref(var);
+    if (mu != nullptr) {
+      if (!sparse || do_lock) {
+        locks->emplace_back(*mu);
+      } else {
+        shared_locks->emplace_back(*mu);
+      }
+    }
+  }
+  return VariableInputLockHolder(std::move(vars), std::move(locks),
+                                 std::move(shared_locks));
+}
 
 void MaybeForwardRefInputToRefOutput(OpKernelContext* ctx, int input,
                                      int output);
@@ -68,8 +196,9 @@ void MaybeForwardRefInputToRefOutput(OpKernelContext* ctx, int input,
 // reference count of 1 before you update it.
 // REQUIRES: If you pass in variable->tensor(), *variable->mu() must be held.
 template <typename Device, typename T>
-Status PrepareToUpdateVariable(OpKernelContext* ctx, Tensor* tensor) {
-  if (!tensor->RefCountIsOne()) {
+Status PrepareToUpdateVariable(OpKernelContext* ctx, Tensor* tensor,
+                               bool copy_on_read_mode) {
+  if (copy_on_read_mode || !tensor->RefCountIsOne()) {
     // Tensor's buffer is in use by some read, so we need to copy before
     // updating.
     PersistentTensor unused;
@@ -100,12 +229,14 @@ Status PrepareToUpdateVariable(OpKernelContext* ctx, Tensor* tensor) {
   return Status::OK();
 }
 
-// This gives you `*out`, a tensor you can update, corresponding to a
-// variable passed as input index `input`.  This handles the
-// differences between reference and resource variables.  For resource
-// variables, we ensure `*out` has a reference count of 1 (using
-// PrepareToUpdateVariable() to copy if necessary) unless
-// sparse && !lock_held, in which case it never copies.
+// This gives you `*out`, a tensor you can update, corresponding to a variable
+// passed as input index `input`.  This handles the differences between
+// reference and resource variables. For reference variables we can just grab
+// the tensor, grabbing the lock if lock_held is False.
+//
+// For resource variables we, if sparse is true, ensure it's in copy-on-read
+// mode, and then, regardless of the value of sparse, ensure its refcount is 1
+// (by potentially copying its contents). In this case lock_held is ignored.
 template <typename Device, typename T>
 Status GetInputTensorFromVariable(OpKernelContext* ctx, int input,
                                   bool lock_held, bool sparse, Tensor* out) {
@@ -113,7 +244,13 @@ Status GetInputTensorFromVariable(OpKernelContext* ctx, int input,
     Var* var;
     TF_RETURN_IF_ERROR(LookupResource(ctx, HandleFromInput(ctx, input), &var));
     core::ScopedUnref unref_var(var);
-    TF_RETURN_IF_ERROR(PrepareToUpdateVariable<Device, T>(ctx, var->tensor()));
+    if (sparse) {
+      TF_RETURN_IF_ERROR(EnsureSparseVariableAccess<Device, T>(ctx, var));
+      *out = *var->tensor();
+      return Status::OK();
+    }
+    TF_RETURN_IF_ERROR(PrepareToUpdateVariable<Device, T>(
+        ctx, var->tensor(), var->copy_on_read_mode.load()));
     *out = *var->tensor();
     return Status::OK();
   }
diff --git a/tensorflow/core/kernels/training_ops.cc b/tensorflow/core/kernels/training_ops.cc
index acf162deec9bdb05183103ce6b47f364106a2036..b2239ab5c39fea33fc70b6aaf170d456cd1ba3fe 100644
--- a/tensorflow/core/kernels/training_ops.cc
+++ b/tensorflow/core/kernels/training_ops.cc
@@ -283,6 +283,22 @@ struct ApplyMomentum<CPUDevice, T> {
   }
 };
 
+template <typename T>
+struct ApplyKerasMomentum<CPUDevice, T> {
+  void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat accum,
+                  typename TTypes<T>::ConstScalar lr,
+                  typename TTypes<T>::ConstFlat grad,
+                  typename TTypes<T>::ConstScalar momentum, bool use_nesterov) {
+    accum.device(d) = accum * momentum() - grad * lr();
+    if (use_nesterov) {
+      var.device(d) += (accum * momentum() - grad * lr());
+    } else {
+      var.device(d) += accum;
+    }
+  }
+};
+
 template <typename Device, typename T>
 struct ApplyAdamNonCuda {
   void operator()(const Device& d, typename TTypes<T>::Flat var,
@@ -331,6 +347,28 @@ struct ApplyAdamSYCL {
 template <typename T>
 struct ApplyAdam<CPUDevice, T> : ApplyAdamNonCuda<CPUDevice, T> {};
 
+template <typename T>
+struct ApplyAdamWithAmsgrad<CPUDevice, T> {
+  void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat m, typename TTypes<T>::Flat v,
+                  typename TTypes<T>::Flat vhat,
+                  typename TTypes<T>::ConstScalar beta1_power,
+                  typename TTypes<T>::ConstScalar beta2_power,
+                  typename TTypes<T>::ConstScalar lr,
+                  typename TTypes<T>::ConstScalar beta1,
+                  typename TTypes<T>::ConstScalar beta2,
+                  typename TTypes<T>::ConstScalar epsilon,
+                  typename TTypes<T>::ConstFlat grad) {
+    const T alpha = lr() * Eigen::numext::sqrt(T(1) - beta2_power()) /
+                    (T(1) - beta1_power());
+
+    m.device(d) += (grad - m) * (T(1) - beta1());
+    v.device(d) += (grad.square() - v) * (T(1) - beta2());
+    vhat.device(d) = vhat.cwiseMax(v);
+    var.device(d) -= (m * alpha) / (vhat.sqrt() + epsilon());
+  }
+};
+
 template <typename Device, typename T>
 struct ApplyAdaMaxNonCuda {
   void operator()(const Device& d, typename TTypes<T>::Flat var,
@@ -427,11 +465,12 @@ class ApplyGradientDescentOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks =
-        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0});
+    const bool sparse = false;
+    auto locks = MaybeLockVariableInputMutexesInOrder<Device, T>(
+        ctx, use_exclusive_lock_, sparse, {0});
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 0, use_exclusive_lock_, false, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
 
     OP_REQUIRES(
         ctx, var.IsInitialized(),
@@ -468,11 +507,12 @@ class ApplyGradientDescentOp<SYCLDevice, T> : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks =
-        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0});
+    const bool sparse = false;
+    auto locks = MaybeLockVariableInputMutexesInOrder<SYCLDevice, T>(
+        ctx, use_exclusive_lock_, sparse, {0});
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<SYCLDevice, T>(
-                            ctx, 0, use_exclusive_lock_, false, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
 
     OP_REQUIRES(
         ctx, var.IsInitialized(),
@@ -562,7 +602,8 @@ class ApplyAdadeltaOp : public OpKernel {
 
   void Compute(OpKernelContext* ctx) override {
     Var* resource;
-    mutex* mu = GetTrainingVariableMutex(ctx, 0, &resource);
+    const bool sparse = false;
+    mutex* mu = GetTrainingVariableMutex<Device, T>(ctx, 0, sparse, &resource);
     core::ScopedUnref scoped_unref(resource);
     if (use_exclusive_lock_ && mu != nullptr) {
       mutex_lock l1(*mu);
@@ -586,14 +627,16 @@ class ApplyAdadeltaOp : public OpKernel {
 
   void DoValidate(OpKernelContext* ctx) {
     Tensor var;
+    const bool sparse = false;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 0, use_exclusive_lock_, false, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
     Tensor accum;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 1, use_exclusive_lock_, false, &accum));
+                            ctx, 1, use_exclusive_lock_, sparse, &accum));
     Tensor accum_update;
-    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 2, use_exclusive_lock_, false, &accum_update));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable<Device, T>(ctx, 2, use_exclusive_lock_,
+                                                   sparse, &accum_update));
 
     OP_REQUIRES(
         ctx, var.IsInitialized(),
@@ -640,14 +683,16 @@ class ApplyAdadeltaOp : public OpKernel {
   void DoCompute(OpKernelContext* ctx) {
     const Device& device = ctx->template eigen_device<Device>();
     Tensor var;
+    const bool sparse = false;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 0, use_exclusive_lock_, false, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
     Tensor accum;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 1, use_exclusive_lock_, false, &accum));
+                            ctx, 1, use_exclusive_lock_, sparse, &accum));
     Tensor accum_update;
-    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 2, use_exclusive_lock_, false, &accum_update));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable<Device, T>(ctx, 2, use_exclusive_lock_,
+                                                   sparse, &accum_update));
 
     const Tensor& lr = ctx->input(3);
     const Tensor& rho = ctx->input(4);
@@ -713,7 +758,8 @@ class SparseApplyAdadeltaOp : public OpKernel {
 
   void Compute(OpKernelContext* ctx) override {
     Var* var;
-    mutex* mu = GetTrainingVariableMutex(ctx, 0, &var);
+    const bool sparse = true;
+    mutex* mu = GetTrainingVariableMutex<CPUDevice, T>(ctx, 0, sparse, &var);
     core::ScopedUnref scoped_unref(var);
     // mu_accum is actually the same mutex as mu_var since currently we use a
     // global mutex.
@@ -729,14 +775,16 @@ class SparseApplyAdadeltaOp : public OpKernel {
 
   void DoCompute(OpKernelContext* ctx) {
     Tensor var;
+    const bool sparse = true;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<CPUDevice, T>(
-                            ctx, 0, use_exclusive_lock_, true, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
     Tensor accum_grad;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<CPUDevice, T>(
-                            ctx, 1, use_exclusive_lock_, true, &accum_grad));
+                            ctx, 1, use_exclusive_lock_, sparse, &accum_grad));
     Tensor accum_update;
-    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<CPUDevice, T>(
-                            ctx, 2, use_exclusive_lock_, true, &accum_update));
+    OP_REQUIRES_OK(ctx,
+                   GetInputTensorFromVariable<CPUDevice, T>(
+                       ctx, 2, use_exclusive_lock_, sparse, &accum_update));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -869,11 +917,12 @@ class ApplyProximalGradientDescentOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks =
-        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0});
+    const bool sparse = false;
+    auto locks = MaybeLockVariableInputMutexesInOrder<Device, T>(
+        ctx, use_exclusive_lock_, sparse, {0});
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 0, use_exclusive_lock_, false, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
 
     OP_REQUIRES(
         ctx, var.IsInitialized(),
@@ -938,11 +987,12 @@ class SparseApplyProximalGradientDescentOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
-    auto locks =
-        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0});
+    const bool sparse = true;
+    auto locks = MaybeLockVariableInputMutexesInOrder<CPUDevice, T>(
+        ctx, use_exclusive_lock_, sparse, {0});
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<CPUDevice, T>(
-                            ctx, 0, use_exclusive_lock_, true, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
     OP_REQUIRES(ctx, TensorShapeUtils::IsVectorOrHigher(var.shape()),
                 errors::InvalidArgument("var must be at least 1 dimensional"));
 
@@ -1083,14 +1133,15 @@ class ApplyAdagradOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks =
-        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
+    const bool sparse = false;
+    auto locks = MaybeLockVariableInputMutexesInOrder<Device, T>(
+        ctx, use_exclusive_lock_, sparse, {0, 1});
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 0, use_exclusive_lock_, false, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
     Tensor accum;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 1, use_exclusive_lock_, false, &accum));
+                            ctx, 1, use_exclusive_lock_, sparse, &accum));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -1176,14 +1227,15 @@ class ApplyProximalAdagradOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks =
-        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
+    const bool sparse = false;
+    auto locks = MaybeLockVariableInputMutexesInOrder<Device, T>(
+        ctx, use_exclusive_lock_, sparse, {0, 1});
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 0, use_exclusive_lock_, false, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
     Tensor accum;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 1, use_exclusive_lock_, false, &accum));
+                            ctx, 1, use_exclusive_lock_, sparse, &accum));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -1278,14 +1330,15 @@ class SparseApplyAdagradOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
-    auto locks =
-        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
+    const bool sparse = true;
+    auto locks = MaybeLockVariableInputMutexesInOrder<CPUDevice, T>(
+        ctx, use_exclusive_lock_, sparse, {0, 1});
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<CPUDevice, T>(
-                            ctx, 0, use_exclusive_lock_, true, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
     Tensor accum;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<CPUDevice, T>(
-                            ctx, 1, use_exclusive_lock_, true, &accum));
+                            ctx, 1, use_exclusive_lock_, sparse, &accum));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -1418,14 +1471,15 @@ class SparseApplyProximalAdagradOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
-    auto locks =
-        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
+    const bool sparse = true;
+    auto locks = MaybeLockVariableInputMutexesInOrder<CPUDevice, T>(
+        ctx, use_exclusive_lock_, sparse, {0, 1});
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<CPUDevice, T>(
-                            ctx, 0, use_exclusive_lock_, true, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
     Tensor accum;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<CPUDevice, T>(
-                            ctx, 1, use_exclusive_lock_, true, &accum));
+                            ctx, 1, use_exclusive_lock_, sparse, &accum));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -1590,19 +1644,20 @@ class ApplyAdagradDAOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_,
-                                                      {0, 1, 2});
+    const bool sparse = false;
+    auto locks = MaybeLockVariableInputMutexesInOrder<Device, T>(
+        ctx, use_exclusive_lock_, sparse, {0, 1, 2});
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 0, use_exclusive_lock_, false, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
     Tensor gradient_accum;
     OP_REQUIRES_OK(
         ctx, GetInputTensorFromVariable<Device, T>(ctx, 1, use_exclusive_lock_,
-                                                   false, &gradient_accum));
+                                                   sparse, &gradient_accum));
     Tensor gradient_squared_accum;
     OP_REQUIRES_OK(
         ctx, GetInputTensorFromVariable<Device, T>(
-                 ctx, 2, use_exclusive_lock_, false, &gradient_squared_accum));
+                 ctx, 2, use_exclusive_lock_, sparse, &gradient_squared_accum));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -1691,19 +1746,20 @@ class SparseApplyAdagradDAOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
-    auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_,
-                                                      {0, 1, 2});
+    const bool sparse = true;
+    auto locks = MaybeLockVariableInputMutexesInOrder<CPUDevice, T>(
+        ctx, use_exclusive_lock_, sparse, {0, 1, 2});
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<CPUDevice, T>(
-                            ctx, 0, use_exclusive_lock_, true, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
     Tensor gradient_accum;
     OP_REQUIRES_OK(ctx,
                    GetInputTensorFromVariable<CPUDevice, T>(
-                       ctx, 1, use_exclusive_lock_, true, &gradient_accum));
+                       ctx, 1, use_exclusive_lock_, sparse, &gradient_accum));
     Tensor gradient_squared_accum;
     OP_REQUIRES_OK(
         ctx, GetInputTensorFromVariable<CPUDevice, T>(
-                 ctx, 2, use_exclusive_lock_, true, &gradient_squared_accum));
+                 ctx, 2, use_exclusive_lock_, sparse, &gradient_squared_accum));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -1889,18 +1945,19 @@ class ApplyFtrlOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_,
-                                                      {0, 1, 2});
+    const bool sparse = false;
+    auto locks = MaybeLockVariableInputMutexesInOrder<Device, T>(
+        ctx, use_exclusive_lock_, sparse, {0, 1, 2});
 
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 0, use_exclusive_lock_, false, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
     Tensor accum;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 1, use_exclusive_lock_, false, &accum));
+                            ctx, 1, use_exclusive_lock_, sparse, &accum));
     Tensor linear;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 2, use_exclusive_lock_, false, &linear));
+                            ctx, 2, use_exclusive_lock_, sparse, &linear));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -2041,17 +2098,18 @@ class SparseApplyFtrlOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
-    auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_,
-                                                      {0, 1, 2});
+    const bool sparse = true;
+    auto locks = MaybeLockVariableInputMutexesInOrder<Device, T>(
+        ctx, use_exclusive_lock_, sparse, {0, 1, 2});
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 0, use_exclusive_lock_, true, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
     Tensor accum;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 1, use_exclusive_lock_, true, &accum));
+                            ctx, 1, use_exclusive_lock_, sparse, &accum));
     Tensor linear;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 2, use_exclusive_lock_, true, &linear));
+                            ctx, 2, use_exclusive_lock_, sparse, &linear));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -2315,15 +2373,16 @@ class ApplyMomentumOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks =
-        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
+    const bool sparse = false;
+    auto locks = MaybeLockVariableInputMutexesInOrder<Device, T>(
+        ctx, use_exclusive_lock_, sparse, {0, 1});
 
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 0, use_exclusive_lock_, false, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
     Tensor accum;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 1, use_exclusive_lock_, false, &accum));
+                            ctx, 1, use_exclusive_lock_, sparse, &accum));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -2416,15 +2475,16 @@ class SparseApplyMomentumOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
-    auto locks =
-        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
+    const bool sparse = true;
+    auto locks = MaybeLockVariableInputMutexesInOrder<CPUDevice, T>(
+        ctx, use_exclusive_lock_, sparse, {0, 1});
 
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<CPUDevice, T>(
-                            ctx, 0, use_exclusive_lock_, true, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
     Tensor accum;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<CPUDevice, T>(
-                            ctx, 1, use_exclusive_lock_, true, &accum));
+                            ctx, 1, use_exclusive_lock_, sparse, &accum));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -2525,6 +2585,219 @@ TF_CALL_double(REGISTER_CPU_KERNELS);
 #undef REGISTER_CPU_KERNELS
 #undef REGISTER_KERNELS
 
+template <typename Device, typename T>
+class ApplyKerasMomentumOp : public OpKernel {
+ public:
+  explicit ApplyKerasMomentumOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_nesterov", &use_nesterov_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    const bool sparse = false;
+    auto locks = MaybeLockVariableInputMutexesInOrder<Device, T>(
+        ctx, use_exclusive_lock_, sparse, {0, 1});
+
+    Tensor var;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
+    Tensor accum;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
+                            ctx, 1, use_exclusive_lock_, sparse, &accum));
+    OP_REQUIRES(
+        ctx, var.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", requested_input(0)));
+    OP_REQUIRES(
+        ctx, accum.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", requested_input(1)));
+    const Tensor& lr = ctx->input(2);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()),
+                errors::InvalidArgument("lr is not a scalar: ",
+                                        lr.shape().DebugString()));
+    const Tensor& grad = ctx->input(3);
+    OP_REQUIRES(
+        ctx, var.shape().IsSameSize(accum.shape()),
+        errors::InvalidArgument("var and accum do not have the same shape",
+                                var.shape().DebugString(), " ",
+                                accum.shape().DebugString()));
+    OP_REQUIRES(
+        ctx, var.shape().IsSameSize(grad.shape()),
+        errors::InvalidArgument("var and grad do not have the same shape",
+                                var.shape().DebugString(), " ",
+                                grad.shape().DebugString()));
+
+    const Tensor& momentum = ctx->input(4);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(momentum.shape()),
+                errors::InvalidArgument("momentum is not a scalar: ",
+                                        momentum.shape().DebugString()));
+
+    const Device& device = ctx->template eigen_device<Device>();
+    functor::ApplyKerasMomentum<Device, T>()(
+        device, var.flat<T>(), accum.flat<T>(), lr.scalar<T>(), grad.flat<T>(),
+        momentum.scalar<T>(), use_nesterov_);
+    MaybeForwardRefInputToRefOutput(ctx, 0, 0);
+  }
+
+ private:
+  bool use_exclusive_lock_;
+  bool use_nesterov_;
+};
+
+#define REGISTER_KERNELS(D, T)                               \
+  REGISTER_KERNEL_BUILDER(Name("ResourceApplyKerasMomentum") \
+                              .Device(DEVICE_##D)            \
+                              .HostMemory("var")             \
+                              .HostMemory("accum")           \
+                              .TypeConstraint<T>("T"),       \
+                          ApplyKerasMomentumOp<D##Device, T>);
+#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
+
+TF_CALL_half(REGISTER_CPU_KERNELS);
+TF_CALL_bfloat16(REGISTER_CPU_KERNELS);
+TF_CALL_float(REGISTER_CPU_KERNELS);
+TF_CALL_double(REGISTER_CPU_KERNELS);
+
+#if GOOGLE_CUDA
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T)                                               \
+  template <>                                                             \
+  void ApplyKerasMomentum<GPUDevice, T>::operator()(                      \
+      const GPUDevice& d, typename TTypes<T>::Flat var,                   \
+      typename TTypes<T>::Flat accum, typename TTypes<T>::ConstScalar lr, \
+      typename TTypes<T>::ConstFlat grad,                                 \
+      typename TTypes<T>::ConstScalar momentum, bool use_nesterov);       \
+  extern template struct ApplyKerasMomentum<GPUDevice, T>;
+DECLARE_GPU_SPEC(Eigen::half);
+DECLARE_GPU_SPEC(float);
+DECLARE_GPU_SPEC(double);
+#undef DECLARE_GPU_SPEC
+}  // namespace functor
+
+REGISTER_KERNELS(GPU, Eigen::half);
+REGISTER_KERNELS(GPU, float);
+REGISTER_KERNELS(GPU, double);
+#endif
+#undef REGISTER_CPU_KERNELS
+#undef REGISTER_KERNELS
+
+// Note, this op works on cpu only.
+template <typename T, typename Tindex>
+class SparseApplyKerasMomentumOp : public OpKernel {
+ public:
+  explicit SparseApplyKerasMomentumOp(OpKernelConstruction* ctx)
+      : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_nesterov", &use_nesterov_));
+  }
+
+  void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
+    const bool sparse = true;
+    auto locks = MaybeLockVariableInputMutexesInOrder<CPUDevice, T>(
+        ctx, use_exclusive_lock_, sparse, {0, 1});
+
+    Tensor var;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<CPUDevice, T>(
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
+    Tensor accum;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<CPUDevice, T>(
+                            ctx, 1, use_exclusive_lock_, sparse, &accum));
+    OP_REQUIRES(
+        ctx, var.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", requested_input(0)));
+    OP_REQUIRES(
+        ctx, accum.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", requested_input(1)));
+    OP_REQUIRES(
+        ctx, var.shape().IsSameSize(accum.shape()),
+        errors::InvalidArgument("var and accum do not have the same shape",
+                                var.shape().DebugString(), " ",
+                                accum.shape().DebugString()));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsVectorOrHigher(var.shape()),
+                errors::InvalidArgument("var must be at least 1 dimensional"));
+
+    const Tensor& lr = ctx->input(2);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()),
+                errors::InvalidArgument("lr is not a scalar : ",
+                                        lr.shape().DebugString()));
+    const Tensor& grad = ctx->input(3);
+    const Tensor& indices = ctx->input(4);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(indices.shape()),
+                errors::InvalidArgument("indices must be one-dimensional"));
+
+    for (int d = 1; d < var.dims(); d++) {
+      OP_REQUIRES(ctx, var.dim_size(d) == grad.dim_size(d),
+                  errors::InvalidArgument(strings::StrCat(
+                      "var and grad must match in dimension ", d)));
+    }
+    const Tindex N = indices.dim_size(0);
+    OP_REQUIRES(
+        ctx, grad.dim_size(0) == N,
+        errors::InvalidArgument(
+            "grad must be the same size as indices in the first dimension."));
+
+    const Tensor& momentum = ctx->input(5);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(momentum.shape()),
+                errors::InvalidArgument("momentum is not a scalar: ",
+                                        momentum.shape().DebugString()));
+
+    if (N > 0) {
+      const Tindex first_dim_size = var.dim_size(0);
+      auto indices_vec = indices.vec<Tindex>();
+      auto var_flat = var.flat_outer_dims<T>();
+      auto accum_flat = accum.flat_outer_dims<T>();
+      auto grad_flat = grad.flat_outer_dims<T>();
+      T lr_scalar = lr.scalar<T>()();
+      T momentum_scalar = momentum.scalar<T>()();
+
+      for (Tindex i = 0; i < N; i++) {
+        const Tindex index = internal::SubtleMustCopy(indices_vec(i));
+        OP_REQUIRES(ctx, FastBoundsCheck(index, first_dim_size),
+                    errors::InvalidArgument(
+                        strings::StrCat("Index ", index, " at offset ", i,
+                                        " in indices is out of range")));
+        auto a = accum_flat.template chip<0>(index);
+        auto g = grad_flat.template chip<0>(i);
+        auto v = var_flat.template chip<0>(index);
+        a = a * a.constant(momentum_scalar) - g * g.constant(lr_scalar);
+        if (use_nesterov_) {
+          v += a * a.constant(momentum_scalar) - g * g.constant(lr_scalar);
+        } else {
+          v += a;
+        }
+      }
+    }
+
+    MaybeForwardRefInputToRefOutput(ctx, 0, 0);
+  }
+
+ private:
+  bool use_exclusive_lock_;
+  bool use_nesterov_;
+};
+
+#define REGISTER_KERNELS(T, Tindices)                                \
+  REGISTER_KERNEL_BUILDER(Name("ResourceSparseApplyKerasMomentum")   \
+                              .Device(DEVICE_CPU)                    \
+                              .TypeConstraint<T>("T")                \
+                              .TypeConstraint<Tindices>("Tindices"), \
+                          SparseApplyKerasMomentumOp<T, Tindices>);
+#define REGISTER_CPU_KERNELS(T) \
+  REGISTER_KERNELS(T, int32);   \
+  REGISTER_KERNELS(T, int64);
+
+TF_CALL_half(REGISTER_CPU_KERNELS);
+TF_CALL_bfloat16(REGISTER_CPU_KERNELS);
+TF_CALL_float(REGISTER_CPU_KERNELS);
+TF_CALL_double(REGISTER_CPU_KERNELS);
+
+#undef REGISTER_CPU_KERNELS
+#undef REGISTER_KERNELS
+
 template <typename Device, typename T>
 class ApplyAdamOp : public OpKernel {
  public:
@@ -2534,18 +2807,19 @@ class ApplyAdamOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_,
-                                                      {0, 1, 2});
+    const bool sparse = false;
+    auto locks = MaybeLockVariableInputMutexesInOrder<Device, T>(
+        ctx, use_exclusive_lock_, sparse, {0, 1, 2});
 
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 0, use_exclusive_lock_, false, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
     Tensor m;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 1, use_exclusive_lock_, false, &m));
+                            ctx, 1, use_exclusive_lock_, sparse, &m));
     Tensor v;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 2, use_exclusive_lock_, false, &v));
+                            ctx, 2, use_exclusive_lock_, sparse, &v));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -2624,18 +2898,19 @@ class ApplyAdamOp<SYCLDevice, T> : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_,
-                                                      {0, 1, 2});
+    const bool sparse = false;
+    auto locks = MaybeLockVariableInputMutexesInOrder<SYCLDevice, T>(
+        ctx, use_exclusive_lock_, sparse, {0, 1, 2});
 
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<SYCLDevice, T>(
-                            ctx, 0, use_exclusive_lock_, false, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
     Tensor m;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<SYCLDevice, T>(
-                            ctx, 1, use_exclusive_lock_, false, &m));
+                            ctx, 1, use_exclusive_lock_, sparse, &m));
     Tensor v;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<SYCLDevice, T>(
-                            ctx, 2, use_exclusive_lock_, false, &v));
+                            ctx, 2, use_exclusive_lock_, sparse, &v));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -2786,6 +3061,148 @@ REGISTER_KERNELS(GPU, double);
 #undef REGISTER_CPU_KERNELS
 #undef REGISTER_KERNELS
 
+template <typename Device, typename T>
+class ApplyAdamWithAmsgradOp : public OpKernel {
+ public:
+  explicit ApplyAdamWithAmsgradOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    const bool sparse = false;
+    auto locks = MaybeLockVariableInputMutexesInOrder<Device, T>(
+        ctx, use_exclusive_lock_, sparse, {0, 1, 2});
+
+    Tensor var;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
+    Tensor m;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
+                            ctx, 1, use_exclusive_lock_, sparse, &m));
+    Tensor v;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
+                            ctx, 2, use_exclusive_lock_, sparse, &v));
+    Tensor vhat;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
+                            ctx, 3, use_exclusive_lock_, sparse, &vhat));
+    OP_REQUIRES(
+        ctx, var.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", requested_input(0)));
+    OP_REQUIRES(
+        ctx, m.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", requested_input(1)));
+    OP_REQUIRES(
+        ctx, v.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", requested_input(2)));
+    OP_REQUIRES(
+        ctx, vhat.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", requested_input(2)));
+
+    const Tensor& beta1_power = ctx->input(4);
+    const Tensor& beta2_power = ctx->input(5);
+    const Tensor& lr = ctx->input(6);
+    const Tensor& beta1 = ctx->input(7);
+    const Tensor& beta2 = ctx->input(8);
+    const Tensor& epsilon = ctx->input(9);
+
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1_power.shape()),
+                errors::InvalidArgument("beta1_power is not a scalar: ",
+                                        beta1_power.shape().DebugString()));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta2_power.shape()),
+                errors::InvalidArgument("beta2_power is not a scalar: ",
+                                        beta2_power.shape().DebugString()));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()),
+                errors::InvalidArgument("lr is not a scalar : ",
+                                        lr.shape().DebugString()));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1.shape()),
+                errors::InvalidArgument("beta1 is not a scalar: ",
+                                        beta1.shape().DebugString()));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta2.shape()),
+                errors::InvalidArgument("beta2 is not a scalar: ",
+                                        beta2.shape().DebugString()));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(epsilon.shape()),
+                errors::InvalidArgument("epsilon is not a scalar: ",
+                                        epsilon.shape().DebugString()));
+
+    const Tensor& grad = ctx->input(10);
+    OP_REQUIRES(ctx, var.shape().IsSameSize(m.shape()),
+                errors::InvalidArgument("var and m do not have the same shape",
+                                        var.shape().DebugString(), " ",
+                                        m.shape().DebugString()));
+    OP_REQUIRES(ctx, var.shape().IsSameSize(v.shape()),
+                errors::InvalidArgument("var and v do not have the same shape",
+                                        var.shape().DebugString(), " ",
+                                        v.shape().DebugString()));
+    OP_REQUIRES(
+        ctx, var.shape().IsSameSize(grad.shape()),
+        errors::InvalidArgument("var and grad do not have the same shape",
+                                var.shape().DebugString(), " ",
+                                grad.shape().DebugString()));
+
+    const Device& device = ctx->template eigen_device<Device>();
+    functor::ApplyAdamWithAmsgrad<Device, T>()(
+        device, var.flat<T>(), m.flat<T>(), v.flat<T>(), vhat.flat<T>(),
+        beta1_power.scalar<T>(), beta2_power.scalar<T>(), lr.scalar<T>(),
+        beta1.scalar<T>(), beta2.scalar<T>(), epsilon.scalar<T>(),
+        grad.flat<T>());
+
+    MaybeForwardRefInputToRefOutput(ctx, 0, 0);
+  }
+
+ private:
+  bool use_exclusive_lock_;
+};
+
+#define REGISTER_KERNELS(D, T)                                 \
+  REGISTER_KERNEL_BUILDER(Name("ResourceApplyAdamWithAmsgrad") \
+                              .HostMemory("var")               \
+                              .HostMemory("m")                 \
+                              .HostMemory("v")                 \
+                              .HostMemory("vhat")              \
+                              .Device(DEVICE_##D)              \
+                              .TypeConstraint<T>("T"),         \
+                          ApplyAdamWithAmsgradOp<D##Device, T>);
+#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
+
+TF_CALL_half(REGISTER_CPU_KERNELS);
+TF_CALL_bfloat16(REGISTER_CPU_KERNELS);
+TF_CALL_float(REGISTER_CPU_KERNELS);
+TF_CALL_double(REGISTER_CPU_KERNELS);
+
+#if GOOGLE_CUDA
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T)                                   \
+  template <>                                                 \
+  void ApplyAdamWithAmsgrad<GPUDevice, T>::operator()(        \
+      const GPUDevice& d, typename TTypes<T>::Flat var,       \
+      typename TTypes<T>::Flat m, typename TTypes<T>::Flat v, \
+      typename TTypes<T>::Flat vhat,                          \
+      typename TTypes<T>::ConstScalar beta1_power,            \
+      typename TTypes<T>::ConstScalar beta2_power,            \
+      typename TTypes<T>::ConstScalar lr,                     \
+      typename TTypes<T>::ConstScalar beta1,                  \
+      typename TTypes<T>::ConstScalar beta2,                  \
+      typename TTypes<T>::ConstScalar epsilon,                \
+      typename TTypes<T>::ConstFlat grad);                    \
+  extern template struct ApplyAdamWithAmsgrad<GPUDevice, T>;
+DECLARE_GPU_SPEC(Eigen::half);
+DECLARE_GPU_SPEC(float);
+DECLARE_GPU_SPEC(double);
+#undef DECLARE_GPU_SPEC
+}  // namespace functor
+
+REGISTER_KERNELS(GPU, Eigen::half);
+REGISTER_KERNELS(GPU, float);
+REGISTER_KERNELS(GPU, double);
+#endif
+#undef REGISTER_CPU_KERNELS
+#undef REGISTER_KERNELS
+
 template <typename Device, typename T>
 class ApplyAdaMaxOp : public OpKernel {
  public:
@@ -2794,18 +3211,19 @@ class ApplyAdaMaxOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_,
-                                                      {0, 1, 2});
+    const bool sparse = false;
+    auto locks = MaybeLockVariableInputMutexesInOrder<Device, T>(
+        ctx, use_exclusive_lock_, sparse, {0, 1, 2});
 
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 0, use_exclusive_lock_, false, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
     Tensor m;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 1, use_exclusive_lock_, false, &m));
+                            ctx, 1, use_exclusive_lock_, sparse, &m));
     Tensor v;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 2, use_exclusive_lock_, false, &v));
+                            ctx, 2, use_exclusive_lock_, sparse, &v));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -2922,18 +3340,19 @@ class ApplyRMSPropOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_,
-                                                      {0, 1, 2});
+    const bool sparse = false;
+    auto locks = MaybeLockVariableInputMutexesInOrder<Device, T>(
+        ctx, use_exclusive_lock_, sparse, {0, 1, 2});
 
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 0, use_exclusive_lock_, false, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
     Tensor ms;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 1, use_exclusive_lock_, false, &ms));
+                            ctx, 1, use_exclusive_lock_, sparse, &ms));
     Tensor mom;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 2, use_exclusive_lock_, false, &mom));
+                            ctx, 2, use_exclusive_lock_, sparse, &mom));
 
     OP_REQUIRES(
         ctx, var.IsInitialized(),
@@ -3004,21 +3423,22 @@ class ApplyCenteredRMSPropOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_,
-                                                      {0, 1, 2, 3});
+    const bool sparse = false;
+    auto locks = MaybeLockVariableInputMutexesInOrder<Device, T>(
+        ctx, use_exclusive_lock_, sparse, {0, 1, 2, 3});
 
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 0, use_exclusive_lock_, false, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
     Tensor mg;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 1, use_exclusive_lock_, false, &mg));
+                            ctx, 1, use_exclusive_lock_, sparse, &mg));
     Tensor ms;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 2, use_exclusive_lock_, false, &ms));
+                            ctx, 2, use_exclusive_lock_, sparse, &ms));
     Tensor mom;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 3, use_exclusive_lock_, false, &mom));
+                            ctx, 3, use_exclusive_lock_, sparse, &mom));
 
     OP_REQUIRES(
         ctx, var.IsInitialized(),
@@ -3163,18 +3583,19 @@ class SparseApplyRMSPropOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
-    auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_,
-                                                      {0, 1, 2});
+    const bool sparse = true;
+    auto locks = MaybeLockVariableInputMutexesInOrder<CPUDevice, T>(
+        ctx, use_exclusive_lock_, sparse, {0, 1, 2});
 
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<CPUDevice, T>(
-                            ctx, 0, use_exclusive_lock_, true, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
     Tensor ms;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<CPUDevice, T>(
-                            ctx, 1, use_exclusive_lock_, true, &ms));
+                            ctx, 1, use_exclusive_lock_, sparse, &ms));
     Tensor mom;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<CPUDevice, T>(
-                            ctx, 2, use_exclusive_lock_, true, &mom));
+                            ctx, 2, use_exclusive_lock_, sparse, &mom));
 
     OP_REQUIRES(
         ctx, var.IsInitialized(),
@@ -3292,21 +3713,22 @@ class SparseApplyCenteredRMSPropOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
-    auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_,
-                                                      {0, 1, 2, 3});
+    const bool sparse = true;
+    auto locks = MaybeLockVariableInputMutexesInOrder<CPUDevice, T>(
+        ctx, use_exclusive_lock_, sparse, {0, 1, 2, 3});
 
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<CPUDevice, T>(
-                            ctx, 0, use_exclusive_lock_, true, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
     Tensor mg;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<CPUDevice, T>(
-                            ctx, 1, use_exclusive_lock_, true, &mg));
+                            ctx, 1, use_exclusive_lock_, sparse, &mg));
     Tensor ms;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<CPUDevice, T>(
-                            ctx, 2, use_exclusive_lock_, true, &ms));
+                            ctx, 2, use_exclusive_lock_, sparse, &ms));
     Tensor mom;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<CPUDevice, T>(
-                            ctx, 3, use_exclusive_lock_, true, &mom));
+                            ctx, 3, use_exclusive_lock_, sparse, &mom));
 
     OP_REQUIRES(
         ctx, var.IsInitialized(),
@@ -3462,15 +3884,16 @@ class ApplyAddSignOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks =
-        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
+    const bool sparse = false;
+    auto locks = MaybeLockVariableInputMutexesInOrder<Device, T>(
+        ctx, use_exclusive_lock_, sparse, {0, 1});
 
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 0, use_exclusive_lock_, false, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
     Tensor m;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 1, use_exclusive_lock_, false, &m));
+                            ctx, 1, use_exclusive_lock_, sparse, &m));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -3568,15 +3991,16 @@ class ApplyPowerSignOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks =
-        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
+    const bool sparse = false;
+    auto locks = MaybeLockVariableInputMutexesInOrder<Device, T>(
+        ctx, use_exclusive_lock_, sparse, {0, 1});
 
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 0, use_exclusive_lock_, false, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
     Tensor m;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 1, use_exclusive_lock_, false, &m));
+                            ctx, 1, use_exclusive_lock_, sparse, &m));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
diff --git a/tensorflow/core/kernels/training_ops.h b/tensorflow/core/kernels/training_ops.h
index e10a4cb125410dee383932f134e0339ba1c19b93..054f07350e60cd8a0c3713efc31d5a606fa6d2bc 100644
--- a/tensorflow/core/kernels/training_ops.h
+++ b/tensorflow/core/kernels/training_ops.h
@@ -126,6 +126,15 @@ struct ApplyMomentum {
                   typename TTypes<T>::ConstScalar momentum, bool use_nesterov);
 };
 
+template <typename Device, typename T>
+struct ApplyKerasMomentum {
+  void operator()(const Device& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat accum,
+                  typename TTypes<T>::ConstScalar lr,
+                  typename TTypes<T>::ConstFlat grad,
+                  typename TTypes<T>::ConstScalar momentum, bool use_nesterov);
+};
+
 template <typename Device, typename T>
 struct ApplyAdam {
   void operator()(const Device& d, typename TTypes<T>::Flat var,
@@ -139,6 +148,20 @@ struct ApplyAdam {
                   typename TTypes<T>::ConstFlat grad, bool use_nesterov);
 };
 
+template <typename Device, typename T>
+struct ApplyAdamWithAmsgrad {
+  void operator()(const Device& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat m, typename TTypes<T>::Flat v,
+                  typename TTypes<T>::Flat vhat,
+                  typename TTypes<T>::ConstScalar beta1_power,
+                  typename TTypes<T>::ConstScalar beta2_power,
+                  typename TTypes<T>::ConstScalar lr,
+                  typename TTypes<T>::ConstScalar beta1,
+                  typename TTypes<T>::ConstScalar beta2,
+                  typename TTypes<T>::ConstScalar epsilon,
+                  typename TTypes<T>::ConstFlat grad);
+};
+
 template <typename Device, typename T>
 struct ApplyAdaMax {
   void operator()(const Device& d, typename TTypes<T>::Flat var,
diff --git a/tensorflow/core/kernels/training_ops_gpu.cu.cc b/tensorflow/core/kernels/training_ops_gpu.cu.cc
index 4bd32592db16b70b2731a6cf775dbf774263d283..f45b9ffca7c9970ca2aee1416d2c5bf4d90f413a 100644
--- a/tensorflow/core/kernels/training_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/training_ops_gpu.cu.cc
@@ -101,6 +101,27 @@ struct ApplyMomentum<GPUDevice, T> {
   }
 };
 
+template <typename T>
+struct ApplyKerasMomentum<GPUDevice, T> {
+  void operator()(const GPUDevice& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat accum,
+                  typename TTypes<T>::ConstScalar lr,
+                  typename TTypes<T>::ConstFlat grad,
+                  typename TTypes<T>::ConstScalar momentum, bool use_nesterov) {
+    Eigen::array<typename TTypes<T>::Tensor::Index, 1> bcast;
+    bcast[0] = grad.dimension(0);
+    Eigen::Sizes<1> single;
+    accum.device(d) = (accum * momentum.reshape(single).broadcast(bcast) -
+                       grad * lr.reshape(single).broadcast(bcast));
+    if (use_nesterov) {
+      var.device(d) += (accum * momentum.reshape(single).broadcast(bcast) -
+                        grad * lr.reshape(single).broadcast(bcast));
+    } else {
+      var.device(d) += accum;
+    }
+  }
+};
+
 template <typename T>
 struct ApplyAdam<GPUDevice, T> {
   void operator()(const GPUDevice& d, typename TTypes<T>::Flat var,
@@ -144,6 +165,39 @@ struct ApplyAdam<GPUDevice, T> {
   }
 };
 
+template <typename T>
+struct ApplyAdamWithAmsgrad<GPUDevice, T> {
+  void operator()(const GPUDevice& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat m, typename TTypes<T>::Flat v,
+                  typename TTypes<T>::Flat vhat,
+                  typename TTypes<T>::ConstScalar beta1_power,
+                  typename TTypes<T>::ConstScalar beta2_power,
+                  typename TTypes<T>::ConstScalar lr,
+                  typename TTypes<T>::ConstScalar beta1,
+                  typename TTypes<T>::ConstScalar beta2,
+                  typename TTypes<T>::ConstScalar epsilon,
+                  typename TTypes<T>::ConstFlat grad) {
+    Eigen::array<typename TTypes<T>::Tensor::Index, 1> bcast;
+    bcast[0] = grad.dimension(0);
+    Eigen::Sizes<1> single;
+    const auto one = static_cast<T>(1.0);
+    m.device(d) =
+        m + (beta1.constant(one) - beta1).reshape(single).broadcast(bcast) *
+                (grad - m);
+    v.device(d) =
+        v + (beta2.constant(one) - beta2).reshape(single).broadcast(bcast) *
+                (grad.square() - v);
+    vhat.device(d) = vhat.cwiseMax(v);
+
+    var.device(d) -= (lr * (beta2_power.constant(one) - beta2_power).sqrt() /
+                      (beta1_power.constant(one) - beta1_power))
+                         .reshape(single)
+                         .broadcast(bcast) *
+                     m /
+                     (epsilon.reshape(single).broadcast(bcast) + vhat.sqrt());
+  }
+};
+
 template <typename T>
 struct ApplyAdaMax<GPUDevice, T> {
   void operator()(const GPUDevice& d, typename TTypes<T>::Flat var,
@@ -302,10 +356,18 @@ template struct functor::ApplyMomentum<GPUDevice, Eigen::half>;
 template struct functor::ApplyMomentum<GPUDevice, float>;
 template struct functor::ApplyMomentum<GPUDevice, double>;
 
+template struct functor::ApplyKerasMomentum<GPUDevice, Eigen::half>;
+template struct functor::ApplyKerasMomentum<GPUDevice, float>;
+template struct functor::ApplyKerasMomentum<GPUDevice, double>;
+
 template struct functor::ApplyAdam<GPUDevice, Eigen::half>;
 template struct functor::ApplyAdam<GPUDevice, float>;
 template struct functor::ApplyAdam<GPUDevice, double>;
 
+template struct functor::ApplyAdamWithAmsgrad<GPUDevice, Eigen::half>;
+template struct functor::ApplyAdamWithAmsgrad<GPUDevice, float>;
+template struct functor::ApplyAdamWithAmsgrad<GPUDevice, double>;
+
 template struct functor::ApplyAdaMax<GPUDevice, Eigen::half>;
 template struct functor::ApplyAdaMax<GPUDevice, float>;
 template struct functor::ApplyAdaMax<GPUDevice, double>;
diff --git a/tensorflow/core/kernels/unicode_ops.cc b/tensorflow/core/kernels/unicode_ops.cc
index 6c4ed1eaaf21649420039771e9490af4b150d6f9..c9c2ac1e69c431957b3db60f10e598b102ba9ebe 100644
--- a/tensorflow/core/kernels/unicode_ops.cc
+++ b/tensorflow/core/kernels/unicode_ops.cc
@@ -350,10 +350,10 @@ class UnicodeTranscodeOp : public OpKernel {
 REGISTER_KERNEL_BUILDER(Name("UnicodeTranscode").Device(DEVICE_CPU),
                         UnicodeTranscodeOp);
 
-class UnicodeDecodeWithOffsetsOp : public OpKernel {
+class UnicodeDecodeBaseOp : public OpKernel {
  public:
-  explicit UnicodeDecodeWithOffsetsOp(OpKernelConstruction* ctx)
-      : OpKernel(ctx) {
+  explicit UnicodeDecodeBaseOp(OpKernelConstruction* ctx, bool generate_offsets)
+      : OpKernel(ctx), generate_offsets_(generate_offsets) {
     OP_REQUIRES_OK(ctx, GetErrorOptions(ctx, &error_options_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("input_encoding", &input_encoding_));
     // Make a temporary UConverter to ensure it will create without error
@@ -369,7 +369,7 @@ class UnicodeDecodeWithOffsetsOp : public OpKernel {
   }
 
   void Decode(OpKernelContext* ctx, std::vector<UChar32>* char_values,
-              std::vector<int64>* offset_values, int* string_length,
+              std::vector<int64>* offset_values, int* current_offset,
               int64* next_row_split, UChar32 char_value, int char_length,
               bool found_any_format_error) {
     if (error_options_.error_on_malformatting && found_any_format_error) {
@@ -379,7 +379,8 @@ class UnicodeDecodeWithOffsetsOp : public OpKernel {
     UChar32 decoded_value = char_value;
     if (ShouldHandleFormatError(error_options_, char_value,
                                 found_any_format_error)) {
-      if (error_options_.elide_replacement) {
+      if (error_options_.elide_replacement && (offset_values != nullptr)) {
+        *current_offset += char_length;
         return;
       } else {
         decoded_value = error_options_.subst;
@@ -390,8 +391,10 @@ class UnicodeDecodeWithOffsetsOp : public OpKernel {
     char_values->push_back(decoded_value);
 
     // Emit the byte offset
-    offset_values->push_back(*string_length);
-    *string_length += char_length;
+    if (offset_values != nullptr) {
+      offset_values->push_back(*current_offset);
+      *current_offset += char_length;
+    }
     *next_row_split += 1;
   }
 
@@ -428,42 +431,63 @@ class UnicodeDecodeWithOffsetsOp : public OpKernel {
       // the fields needed to construct a RaggedTensor.
       out_row_splits(row_split_index) = next_row_split;
       row_split_index++;
-      int string_length = 0;
+      int current_offset = 0;
       IterateUnicodeString(
           input, input_encoder->converter_,
-          std::bind(&UnicodeDecodeWithOffsetsOp::Decode, this, ctx,
-                    &char_values, &offset_values, &string_length,
-                    &next_row_split, std::placeholders::_1,
-                    std::placeholders::_2, std::placeholders::_3));
+          std::bind(&UnicodeDecodeBaseOp::Decode, this, ctx, &char_values,
+                    &offset_values, &current_offset, &next_row_split,
+                    std::placeholders::_1, std::placeholders::_2,
+                    std::placeholders::_3));
     }
     out_row_splits(row_split_index) = next_row_split;
 
-    DCHECK(offset_values.size() == char_values.size());
     Tensor* output_char_values;
     OP_REQUIRES_OK(
         ctx, ctx->allocate_output("char_values",
                                   {static_cast<int64>(char_values.size())},
                                   &output_char_values));
-    Tensor* output_offset_values;
-    OP_REQUIRES_OK(
-        ctx, ctx->allocate_output("char_to_byte_starts",
-                                  {static_cast<int64>(offset_values.size())},
-                                  &output_offset_values));
     auto out_char_values = output_char_values->vec<int32>();
-    auto out_offset_values = output_offset_values->vec<int64>();
-
-    // Load output tensors from intermediate value arrays.
-    for (int i = 0; i < char_values.size(); ++i) {
-      out_char_values(i) = static_cast<int32>(char_values[i]);
-      out_offset_values(i) = offset_values[i];
+    if (generate_offsets_) {
+      DCHECK(offset_values.size() == char_values.size());
+      Tensor* output_offset_values;
+      OP_REQUIRES_OK(
+          ctx, ctx->allocate_output("char_to_byte_starts",
+                                    {static_cast<int64>(offset_values.size())},
+                                    &output_offset_values));
+      auto out_offset_values = output_offset_values->vec<int64>();
+
+      // Load output tensors from intermediate value arrays.
+      for (int i = 0; i < char_values.size(); ++i) {
+        out_char_values(i) = static_cast<int32>(char_values[i]);
+        out_offset_values(i) = offset_values[i];
+      }
+    } else {
+      for (int i = 0; i < char_values.size(); ++i) {
+        out_char_values(i) = static_cast<int32>(char_values[i]);
+      }
     }
   }
 
  private:
   string input_encoding_;
   ErrorOptions error_options_;
+  bool generate_offsets_ = false;
+};
+
+class UnicodeDecodeOp : public UnicodeDecodeBaseOp {
+ public:
+  explicit UnicodeDecodeOp(OpKernelConstruction* ctx)
+      : UnicodeDecodeBaseOp(ctx, false) {}
+};
+
+class UnicodeDecodeWithOffsetsOp : public UnicodeDecodeBaseOp {
+ public:
+  explicit UnicodeDecodeWithOffsetsOp(OpKernelConstruction* ctx)
+      : UnicodeDecodeBaseOp(ctx, true) {}
 };
 
+REGISTER_KERNEL_BUILDER(Name("UnicodeDecode").Device(DEVICE_CPU),
+                        UnicodeDecodeOp);
 REGISTER_KERNEL_BUILDER(Name("UnicodeDecodeWithOffsets").Device(DEVICE_CPU),
                         UnicodeDecodeWithOffsetsOp);
 
@@ -493,7 +517,7 @@ class UnicodeEncodeOp : public OpKernel {
     const Tensor& input_splits = context->input(1);
     const auto input_splits_flat = input_splits.flat<int64>();
 
-    // Since we limit to a 2-D input (inner_values of rank 1 and a single splits
+    // Since we limit to a 2-D input (flat_values of rank 1 and a single splits
     // tensor), our output dimension will be 1 with it's size equal to the
     // number of splits (outer dimension or ragged tensor).
     TensorShape output_shape({input_splits.dim_size(0) - 1});
diff --git a/tensorflow/core/lib/core/status.cc b/tensorflow/core/lib/core/status.cc
index cb2a06e620cab34f35d2b6398234ad8cb6d71dc9..7be5b9b51316d5c325e5f7eb4186819d3e1476b8 100644
--- a/tensorflow/core/lib/core/status.cc
+++ b/tensorflow/core/lib/core/status.cc
@@ -15,6 +15,11 @@ limitations under the License.
 
 #include "tensorflow/core/lib/core/status.h"
 #include <stdio.h>
+#include <map>
+#include "tensorflow/core/lib/core/error_codes.pb.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
 
 namespace tensorflow {
 
@@ -44,68 +49,72 @@ const string& Status::empty_string() {
   return *empty;
 }
 
+string error_name(error::Code code) {
+  switch (code) {
+    case tensorflow::error::OK:
+      return "OK";
+      break;
+    case tensorflow::error::CANCELLED:
+      return "Cancelled";
+      break;
+    case tensorflow::error::UNKNOWN:
+      return "Unknown";
+      break;
+    case tensorflow::error::INVALID_ARGUMENT:
+      return "Invalid argument";
+      break;
+    case tensorflow::error::DEADLINE_EXCEEDED:
+      return "Deadline exceeded";
+      break;
+    case tensorflow::error::NOT_FOUND:
+      return "Not found";
+      break;
+    case tensorflow::error::ALREADY_EXISTS:
+      return "Already exists";
+      break;
+    case tensorflow::error::PERMISSION_DENIED:
+      return "Permission denied";
+      break;
+    case tensorflow::error::UNAUTHENTICATED:
+      return "Unauthenticated";
+      break;
+    case tensorflow::error::RESOURCE_EXHAUSTED:
+      return "Resource exhausted";
+      break;
+    case tensorflow::error::FAILED_PRECONDITION:
+      return "Failed precondition";
+      break;
+    case tensorflow::error::ABORTED:
+      return "Aborted";
+      break;
+    case tensorflow::error::OUT_OF_RANGE:
+      return "Out of range";
+      break;
+    case tensorflow::error::UNIMPLEMENTED:
+      return "Unimplemented";
+      break;
+    case tensorflow::error::INTERNAL:
+      return "Internal";
+      break;
+    case tensorflow::error::UNAVAILABLE:
+      return "Unavailable";
+      break;
+    case tensorflow::error::DATA_LOSS:
+      return "Data loss";
+      break;
+    default:
+      char tmp[30];
+      snprintf(tmp, sizeof(tmp), "Unknown code(%d)", static_cast<int>(code));
+      return tmp;
+      break;
+  }
+}
+
 string Status::ToString() const {
   if (state_ == nullptr) {
     return "OK";
   } else {
-    char tmp[30];
-    const char* type;
-    switch (code()) {
-      case tensorflow::error::CANCELLED:
-        type = "Cancelled";
-        break;
-      case tensorflow::error::UNKNOWN:
-        type = "Unknown";
-        break;
-      case tensorflow::error::INVALID_ARGUMENT:
-        type = "Invalid argument";
-        break;
-      case tensorflow::error::DEADLINE_EXCEEDED:
-        type = "Deadline exceeded";
-        break;
-      case tensorflow::error::NOT_FOUND:
-        type = "Not found";
-        break;
-      case tensorflow::error::ALREADY_EXISTS:
-        type = "Already exists";
-        break;
-      case tensorflow::error::PERMISSION_DENIED:
-        type = "Permission denied";
-        break;
-      case tensorflow::error::UNAUTHENTICATED:
-        type = "Unauthenticated";
-        break;
-      case tensorflow::error::RESOURCE_EXHAUSTED:
-        type = "Resource exhausted";
-        break;
-      case tensorflow::error::FAILED_PRECONDITION:
-        type = "Failed precondition";
-        break;
-      case tensorflow::error::ABORTED:
-        type = "Aborted";
-        break;
-      case tensorflow::error::OUT_OF_RANGE:
-        type = "Out of range";
-        break;
-      case tensorflow::error::UNIMPLEMENTED:
-        type = "Unimplemented";
-        break;
-      case tensorflow::error::INTERNAL:
-        type = "Internal";
-        break;
-      case tensorflow::error::UNAVAILABLE:
-        type = "Unavailable";
-        break;
-      case tensorflow::error::DATA_LOSS:
-        type = "Data loss";
-        break;
-      default:
-        snprintf(tmp, sizeof(tmp), "Unknown code(%d)",
-                 static_cast<int>(code()));
-        type = tmp;
-        break;
-    }
-    string result(type);
+    string result(error_name(code()));
     result += ": ";
     result += state_->msg;
     return result;
@@ -131,4 +140,93 @@ string* TfCheckOpHelperOutOfLine(const ::tensorflow::Status& v,
   return new string(r);
 }
 
+void StatusGroup::Update(const Status& s) {
+  if (s.ok()) {
+    ++num_ok_;
+  } else {
+    ok_ = false;
+    children_.push_back(s);
+  }
+}
+
+const int kMaxChildMessageSize = 2048;
+
+Status StatusGroup::as_status() const {
+  if (ok_) {
+    return Status::OK();
+  }
+
+  // If there is only one message, or all of the messages are identical, return
+  // the original status.  This reduces verbosity and preserves existing
+  // behavior when possible.
+  bool single_status = true;
+  for (const Status& s : children_) {
+    if (s != children_[0]) {
+      single_status = false;
+      break;
+    }
+  }
+
+  if (single_status) {
+    return children_[0];
+  }
+
+  std::vector<string> fmt;
+
+  // Compute a final output string with status codes sorted by frequency in
+  // increasing order.  This prefers more "interesting" messages over child
+  // messages that may come from cancellation.
+  std::map<error::Code, std::vector<Status>> code_to_status;
+  for (const Status& s : children_) {
+    code_to_status[s.code()].push_back(s);
+  }
+
+  std::vector<std::pair<error::Code, int>> count_vec;
+  count_vec.reserve(code_to_status.size());
+  for (auto& p : code_to_status) {
+    count_vec.push_back(std::make_pair(p.first, p.second.size()));
+  }
+
+  std::sort(
+      count_vec.begin(), count_vec.end(),
+      [](const std::pair<error::Code, int>& a,
+         const std::pair<error::Code, int>& b) { return a.second < b.second; });
+
+  fmt.push_back(
+      strings::Printf("Combined status information from %lu operations:\n",
+                      num_ok_ + children_.size()));
+
+  for (const auto& p : count_vec) {
+    // Deduplicate error messages
+    std::map<string, int> child_errors;
+    for (const Status& s : code_to_status[p.first]) {
+      ++child_errors[s.error_message()];
+    }
+
+    string child_fmt;
+    for (auto& m : child_errors) {
+      child_fmt.append(strings::Printf(
+          "  %s [%dx]",
+          str_util::StringReplace(m.first, "\n", "\n  ", true).c_str(),
+          m.second));
+      child_fmt.append("\n");
+    }
+    // Strip last newline.
+    child_fmt = child_fmt.substr(0, child_fmt.size() - 1);
+
+    if (child_fmt.size() > kMaxChildMessageSize) {
+      child_fmt =
+          strings::StrCat(child_fmt.substr(0, kMaxChildMessageSize), "...");
+    }
+    fmt.push_back(strings::Printf("Status code: %s [%dx]\n%s",
+                                  error_name(p.first).c_str(), p.second,
+                                  child_fmt.c_str()));
+  }
+
+  fmt.push_back(strings::Printf("(%zd successful operations.)", num_ok_));
+
+  // TODO(power): use the least-frequently occurring status for the return code
+  return Status(children_[0].code(), str_util::Join(fmt, "\n"));
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/core/status.h b/tensorflow/core/lib/core/status.h
index eb0ff555a5f2d8f6464067c51e6ac197fa1aab2c..fe3eec1be00ff7a48b5166b9b9f2d1eb18dd03cd 100644
--- a/tensorflow/core/lib/core/status.h
+++ b/tensorflow/core/lib/core/status.h
@@ -97,6 +97,26 @@ class Status {
   void SlowCopyFrom(const State* src);
 };
 
+// Helper class to manage multiple child status values.
+class StatusGroup {
+ public:
+  // Return a merged status with combined child status messages.
+  //
+  // The status code returned is OK if all children were successful, otherwise
+  // the first non-OK child status code is reported.
+  Status as_status() const;
+
+  bool ok() const { return ok_; }
+
+  // Augment this group with the child status `status`.
+  void Update(const Status& status);
+
+ private:
+  bool ok_ = true;
+  size_t num_ok_ = 0;
+  std::vector<Status> children_;
+};
+
 inline Status::Status(const Status& s)
     : state_((s.state_ == NULL) ? NULL : new State(*s.state_)) {}
 
diff --git a/tensorflow/core/lib/core/status_test.cc b/tensorflow/core/lib/core/status_test.cc
index d95d8f20aa354603f37358c7047f6171cca08f1c..d3296b4fac451215fa2b13d6713965740966da9a 100644
--- a/tensorflow/core/lib/core/status_test.cc
+++ b/tensorflow/core/lib/core/status_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 
@@ -97,6 +98,49 @@ TEST(Status, EqualsDifferentMessage) {
   ASSERT_NE(a, b);
 }
 
+TEST(StatusGroup, AcceptsFirstCode) {
+  StatusGroup c;
+  const Status internal(errors::Internal("Original error."));
+  c.Update(internal);
+  c.Update(Status::OK());
+  c.Update(Status::OK());
+  c.Update(Status::OK());
+  ASSERT_EQ(c.as_status().code(), internal.code());
+  ASSERT_EQ(c.ok(), false);
+}
+
+TEST(StatusGroup, ContainsChildMessages) {
+  StatusGroup c;
+  const Status internal(errors::Internal("Original error."));
+  const Status cancelled(errors::Cancelled("Cancelled after 10 steps."));
+  const Status aborted(errors::Aborted("Aborted after 10 steps."));
+  c.Update(internal);
+  for (size_t i = 0; i < 5; ++i) {
+    c.Update(cancelled);
+  }
+  for (size_t i = 0; i < 10; ++i) {
+    c.Update(aborted);
+  }
+  for (size_t i = 0; i < 100; ++i) {
+    c.Update(Status::OK());
+  }
+
+  ASSERT_EQ(c.as_status().code(), internal.code());
+  EXPECT_TRUE(str_util::StrContains(c.as_status().error_message(),
+                                    internal.error_message()));
+  EXPECT_TRUE(str_util::StrContains(c.as_status().error_message(),
+                                    cancelled.error_message()));
+  EXPECT_TRUE(str_util::StrContains(c.as_status().error_message(),
+                                    aborted.error_message()));
+  StatusGroup d;
+  d.Update(c.as_status());
+  c.Update(errors::FailedPrecondition("Failed!"));
+  d.Update(c.as_status());
+  c.Update(errors::DataLoss("Data loss!"));
+  d.Update(c.as_status());
+  LOG(INFO) << d.as_status();
+}
+
 static void BM_TF_CHECK_OK(int iters) {
   tensorflow::Status s =
       (iters < 0) ? errors::InvalidArgument("Invalid") : Status::OK();
diff --git a/tensorflow/core/nccl/BUILD b/tensorflow/core/nccl/BUILD
index 50d9a2e8daa8ae8abf0c61fb1a74dd8ad72d949f..4be33b2a0cf10a2525f9a93b5d4942b381d92629 100644
--- a/tensorflow/core/nccl/BUILD
+++ b/tensorflow/core/nccl/BUILD
@@ -11,6 +11,10 @@ exports_files(["LICENSE"])
 load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
 load("//tensorflow:tensorflow.bzl", "tf_copts")
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
+load(
+    "//tensorflow/core:platform/default/build_config_root.bzl",
+    "tf_cuda_tests_tags",
+)
 
 cc_library(
     name = "nccl_lib",
@@ -34,27 +38,17 @@ cc_library(
 tf_cuda_cc_test(
     name = "nccl_manager_test",
     size = "medium",
-    srcs = if_cuda(
-        [
-            "nccl_manager_test.cc",
-        ],
-        [],
-    ),
-    # Disabled on jenkins until errors finding nvmlShutdown are found.
-    tags = [
-        "manual",
-        "multi_gpu",
-        "no_oss",
-        "noguitar",
-        "notap",
+    srcs = ["nccl_manager_test.cc"],
+    tags = tf_cuda_tests_tags() + [
+        "no_cuda_on_cpu_tap",  # TODO(b/120284216): re-enable multi_gpu
     ],
-    deps =
-        if_cuda([
-            ":nccl_lib",
-            "@local_config_nccl//:nccl",
-            "//tensorflow/core:cuda",
-            "//tensorflow/core:test",
-            "//tensorflow/core:test_main",
-            "//tensorflow/core:testlib",
-        ]),
+    deps = [
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ] + if_cuda([
+        ":nccl_lib",
+        "@local_config_nccl//:nccl",
+        "//tensorflow/core:cuda",
+    ]),
 )
diff --git a/tensorflow/core/nccl/nccl_manager.cc b/tensorflow/core/nccl/nccl_manager.cc
index f8e8c752227a414f6fbe2739314a2efd6d9e0063..df49bf1b976726b3c1cbc3917c881dbc380f2f9a 100644
--- a/tensorflow/core/nccl/nccl_manager.cc
+++ b/tensorflow/core/nccl/nccl_manager.cc
@@ -24,6 +24,22 @@ limitations under the License.
 
 namespace tensorflow {
 
+#define NCCL_RETURN_IF_ERROR(...)                               \
+  do {                                                          \
+    ncclResult_t nccl_status = (__VA_ARGS__);                   \
+    if (nccl_status != ncclSuccess) {                           \
+      return errors::Internal(ncclGetErrorString(nccl_status)); \
+    }                                                           \
+  } while (0)
+
+#define CUDA_RETURN_IF_ERROR(...)                               \
+  do {                                                          \
+    cudaError_t cuda_status = (__VA_ARGS__);                    \
+    if (cuda_status != cudaSuccess) {                           \
+      return errors::Internal(cudaGetErrorString(cuda_status)); \
+    }                                                           \
+  } while (0)
+
 using se::cuda::ScopedActivateExecutorContext;
 
 // Contains data for a single stream used for nccl communication; this includes
@@ -177,8 +193,8 @@ NcclManager* NcclManager::instance() {
   return instance;
 }
 
-NcclManager::Communicator* NcclManager::GetCommunicator(
-    NcclManager::Collective* collective) {
+Status NcclManager::GetCommunicator(NcclManager::Collective* collective,
+                                    NcclManager::Communicator** communicator) {
   // Sort by executor to make ordering of executors deterministic.
   std::sort(collective->participants.begin(), collective->participants.end(),
             [](const std::unique_ptr<Participant>& a,
@@ -217,7 +233,10 @@ NcclManager::Communicator* NcclManager::GetCommunicator(
           break;
         }
       }
-      if (i == num_devices) return comm.get();
+      if (i == num_devices) {
+        *communicator = comm.get();
+        return Status::OK();
+      }
     }
   }
 
@@ -264,37 +283,36 @@ NcclManager::Communicator* NcclManager::GetCommunicator(
   // NCCL2 prevents InitAll for more communicators than devices (but doesn't
   // check that device ids are unique). Work around it by initializing each
   // rank individually.
-  cudaGetDeviceCount(&device_count);
+  CUDA_RETURN_IF_ERROR(cudaGetDeviceCount(&device_count));
 #endif
   std::vector<ncclComm_t> nccl_comms(num_devices);
   if (num_devices <= device_count) {
-    auto result =
-        ncclCommInitAll(nccl_comms.data(), num_devices, devices.data());
-    CHECK_EQ(result, ncclSuccess) << ncclGetErrorString(result);
+    NCCL_RETURN_IF_ERROR(
+        ncclCommInitAll(nccl_comms.data(), num_devices, devices.data()));
   } else {
     int savedDevice = 0;
-    CHECK_EQ(cudaGetDevice(&savedDevice), cudaSuccess);
+    CUDA_RETURN_IF_ERROR(cudaGetDevice(&savedDevice));
     ncclUniqueId commId;
-    ncclGetUniqueId(&commId);
+    NCCL_RETURN_IF_ERROR(ncclGetUniqueId(&commId));
 #if NCCL_MAJOR >= 2
-    CHECK_EQ(ncclGroupStart(), ncclSuccess);
+    NCCL_RETURN_IF_ERROR(ncclGroupStart());
 #endif
     for (int rank = 0; rank < num_devices; ++rank) {
-      cudaSetDevice(devices[rank]);
-      auto result =
-          ncclCommInitRank(nccl_comms.data() + rank, num_devices, commId, rank);
-      CHECK_EQ(result, ncclSuccess) << ncclGetErrorString(result);
+      CUDA_RETURN_IF_ERROR(cudaSetDevice(devices[rank]));
+      NCCL_RETURN_IF_ERROR(ncclCommInitRank(nccl_comms.data() + rank,
+                                            num_devices, commId, rank));
     }
 #if NCCL_MAJOR >= 2
-    CHECK_EQ(ncclGroupEnd(), ncclSuccess);
+    NCCL_RETURN_IF_ERROR(ncclGroupEnd());
 #endif
-    cudaSetDevice(savedDevice);
+    CUDA_RETURN_IF_ERROR(cudaSetDevice(savedDevice));
   }
   for (int rank = 0; rank < num_devices; ++rank) {
     members[rank].nccl_comm = nccl_comms[rank];
   }
   communicators_.emplace_back(new Communicator(std::move(members)));
-  return communicators_.back().get();
+  *communicator = communicators_.back().get();
+  return Status::OK();
 }
 
 void NcclManager::AddToAllReduce(int num_devices, const string& key,
@@ -400,10 +418,18 @@ void NcclManager::AddParticipant(int num_devices, const string& key,
 void NcclManager::RunCollective(const string& key, Collective* collective) {
   static mutex collective_mu(LINKER_INITIALIZED);
 
-  auto* communicator = GetCommunicator(collective);
-  collective->communicator = communicator;
-  const int size = communicator->num_devices;
+  Communicator* communicator = nullptr;
+  const int size = static_cast<int>(collective->participants.size());
+  Status s = GetCommunicator(collective, &communicator);
+  if (!s.ok()) {
+    for (int i = 0; i < size; ++i) {
+      collective->participants[i]->done_callback(s);
+    }
+    delete collective;
+    return;
+  }
 
+  collective->communicator = communicator;
   for (int rank = 0; rank < size; ++rank) {
     Participant* p = collective->participants[rank].get();
     NcclStream* nccl_stream = communicator->members[rank].nccl_stream;
diff --git a/tensorflow/core/nccl/nccl_manager.h b/tensorflow/core/nccl/nccl_manager.h
index 76b49101d47559d47783d91aaec56fa604fc26b9..5da4fe5554d134f79c279542666c841a4e205485 100644
--- a/tensorflow/core/nccl/nccl_manager.h
+++ b/tensorflow/core/nccl/nccl_manager.h
@@ -103,7 +103,13 @@ class NcclManager {
   struct NcclStream;
   struct Participant;
 
-  Communicator* GetCommunicator(Collective* collective);
+  // Gets the `Communicator` object that will be used to enqueue NCCL kernels
+  // for `collective`, and returns it via `communicator`.
+  //
+  // This may involve creating CUDA streams and NCCL initialization.  If a NCCL
+  // or CUDA error occurs in the process, this returns an INTERNAL error with
+  // the corresponding NCCL/CUDA error string.
+  Status GetCommunicator(Collective* collective, Communicator** communicator);
 
   void AddParticipant(int num_devices, const string& key,
                       std::unique_ptr<Participant> participant,
diff --git a/tensorflow/core/nccl/nccl_manager_test.cc b/tensorflow/core/nccl/nccl_manager_test.cc
index dbc07865f0b7a96b941d21131b689a7be32c445e..f9ed4d0b9a26c390bc5974f206faea16c8b5b974 100644
--- a/tensorflow/core/nccl/nccl_manager_test.cc
+++ b/tensorflow/core/nccl/nccl_manager_test.cc
@@ -28,8 +28,8 @@ limitations under the License.
 
 namespace tensorflow {
 
-static std::vector<BaseGPUDevice*> GetGPUDevices() {
-  std::vector<Device*> devices;
+static std::vector<std::unique_ptr<BaseGPUDevice>> GetGPUDevices() {
+  std::vector<std::unique_ptr<Device>> devices;
   SessionOptions session_options;
   session_options.config.mutable_gpu_options()
       ->set_per_process_gpu_memory_fraction(0.1);
@@ -37,12 +37,12 @@ static std::vector<BaseGPUDevice*> GetGPUDevices() {
   Status s = DeviceFactory::GetFactory(DEVICE_GPU)
                  ->AddDevices(session_options, "", &devices);
   TF_CHECK_OK(s);
-  std::vector<BaseGPUDevice*> gpus;
-  for (Device* d : devices) {
-    if (d->device_type() == "GPU") {
-      gpus.push_back(static_cast<BaseGPUDevice*>(d));
-    } else {
-      delete d;
+  std::vector<std::unique_ptr<BaseGPUDevice>> gpus;
+  for (std::unique_ptr<Device>& device : devices) {
+    if (device->device_type() == "GPU") {
+      // If `device_type()` is GPU, this `Device` is guaranteed to be a
+      // `BaseGPUDevice`, which is a subclass of `Device`.
+      gpus.emplace_back(static_cast<BaseGPUDevice*>(device.release()));
     }
   }
   return gpus;
@@ -64,16 +64,15 @@ class NcclManagerTest : public ::testing::Test {
   };
 
   static void SetUpTestCase() {
-    setenv("NCCL_DEBUG", "INFO", 1 /* replace */);
-    devices_ = new std::vector<BaseGPUDevice*>(GetGPUDevices());
-    CHECK(!devices_->empty());
+    setenv("NCCL_DEBUG", "WARN", 1 /* replace */);
+    setenv("NCCL_LAUNCH_MODE", "PARALLEL", 1 /* replace */);
+    devices_ = new std::vector<std::unique_ptr<BaseGPUDevice>>(GetGPUDevices());
     LOG(ERROR) << "Running test with " << devices_->size() << " gpus";
   }
 
-  static void TearDownTestCase() {
-    for (auto device : *devices_) delete device;
-    delete devices_;
-  }
+  static int32 NumGPUs() { return static_cast<int32>(devices_->size()); }
+
+  static void TearDownTestCase() { delete devices_; }
 
   TestCase* MakeTestCase(int num_ranks, ncclRedOp_t reduction_op,
                          TensorShape shape, float value_offset) {
@@ -153,7 +152,7 @@ class NcclManagerTest : public ::testing::Test {
       stream->ThenMemcpy(out_cpu.flat<Scalar>().data(), out_gpu_mem,
                          out_cpu.TotalBytes());
       SE_ASSERT_OK(stream->BlockHostUntilDone());
-      test::ExpectTensorNear<Scalar>(test_case->expected, out_cpu, 0.01);
+      test::ExpectClose(test_case->expected, out_cpu);
     }
   }
 
@@ -166,7 +165,7 @@ class NcclManagerTest : public ::testing::Test {
   }
 
   static BaseGPUDevice* GetDevice(size_t rank) {
-    return devices_->at(rank % devices_->size());
+    return devices_->at(rank % devices_->size()).get();
   }
 
  private:
@@ -181,13 +180,14 @@ class NcclManagerTest : public ::testing::Test {
   }
 
  private:
-  static std::vector<BaseGPUDevice*>* devices_;
+  static std::vector<std::unique_ptr<BaseGPUDevice>>* devices_;
   static const DataType data_type_;
   static const Scalar max_;
 };
 
 template <typename Scalar>
-std::vector<BaseGPUDevice*>* NcclManagerTest<Scalar>::devices_ = nullptr;
+std::vector<std::unique_ptr<BaseGPUDevice>>* NcclManagerTest<Scalar>::devices_ =
+    nullptr;
 template <typename Scalar>
 const DataType NcclManagerTest<Scalar>::data_type_ =
     DataTypeToEnum<Scalar>::value;
@@ -195,13 +195,13 @@ template <typename Scalar>
 const Scalar NcclManagerTest<Scalar>::max_ =
     Eigen::NumTraits<Scalar>::highest();
 
-// Instantiate tests for float and half.
-using TypeList = ::testing::Types<float, Eigen::half>;
+// Instantiate tests for float and double.
+using TypeList = ::testing::Types<float, double>;
 TYPED_TEST_CASE(NcclManagerTest, TypeList);
 
 // Test basic sum reduction.
 TYPED_TEST(NcclManagerTest, BasicSumReduction) {
-  const int num_ranks = 3;
+  const int num_ranks = 4;
 
   for (int op = 0; op < 4; ++op) {
     ncclRedOp_t reduction_op = static_cast<ncclRedOp_t>(op);
@@ -209,6 +209,7 @@ TYPED_TEST(NcclManagerTest, BasicSumReduction) {
         this->MakeTestCase(num_ranks, reduction_op, TensorShape({2, 3}), 0.0f));
     for (int rank = 0; rank < num_ranks; ++rank) {
       auto* device = this->GetDevice(rank);
+      VLOG(2) << "rank " << rank << " device " << device->name();
       auto* event_mgr = device->tensorflow_gpu_device_info()->event_mgr;
       auto* stream = device->tensorflow_gpu_device_info()->stream;
       NcclManager::instance()->AddToAllReduce(
@@ -225,15 +226,13 @@ TYPED_TEST(NcclManagerTest, BasicSumReduction) {
 // Same as the Basic test, but with multiple threads launching parts of many
 // reductions.
 //
-// Testing the multi-rank execution is currently reduced as it can hang when run
-// with num_ranks > devices->size(), for some GPUs (e.g. K20m).
-// To test the higher settings, increase num_ranks,
-// num_collectives_per_iteration and time_limit_micros.
+// To run test longer, increase num_ranks, num_collectives_per_iteration and
+// time_limit_micros.
 TYPED_TEST(NcclManagerTest, MultipleCallers) {
-  const int num_ranks = 1;                      // 2;
-  const int num_collectives_per_iteration = 1;  // 1000;
-  const int num_threads = 3;
-  const int time_limit_micros = 1;  // 60 * 30 * 1000 * 1000;
+  const int num_ranks = 4;
+  const int num_collectives_per_iteration = 10;  // 1000;
+  const int num_threads = num_ranks * 2;
+  const int time_limit_micros = 100;  // 60 * 30 * 1000 * 1000;
 
   int64 start = Env::Default()->NowMicros();
   srand(Env::Default()->NowMicros());
diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 735ba6b0f8f9fa51d300eac0da81e9a3061a11ee..602d4a009d08155a2dee6bb62c34f85ac610ec39 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -12076,33 +12076,6 @@ op {
     type: "list(float)"
   }
 }
-op {
-  name: "BytesProducedStatsDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "tag"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
 op {
   name: "CTCBeamSearchDecoder"
   input_arg {
@@ -17326,21 +17299,6 @@ op {
     minimum: 1
   }
 }
-op {
-  name: "DatasetToTFRecord"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "filename"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "compression_type"
-    type: DT_STRING
-  }
-}
 op {
   name: "DebugGradientIdentity"
   input_arg {
@@ -18487,69 +18445,6 @@ op {
     }
   }
 }
-op {
-  name: "DenseToSparseBatchDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "batch_size"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "row_shape"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
-op {
-  name: "DenseToSparseBatchDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "batch_size"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "row_shape"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
 op {
   name: "DenseToSparseSetOperation"
   input_arg {
@@ -21153,24 +21048,6 @@ op {
     type: DT_STRING
   }
 }
-op {
-  name: "EnqueueInQueueDataset"
-  input_arg {
-    name: "queue"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "components"
-    type_list_attr: "Tcomponents"
-  }
-  attr {
-    name: "Tcomponents"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
 op {
   name: "EnsureShape"
   input_arg {
@@ -21626,6 +21503,33 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "ExperimentalBytesProducedStatsDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "ExperimentalCSVDataset"
   input_arg {
@@ -21692,15 +21596,44 @@ op {
   is_stateful: true
 }
 op {
-  name: "ExperimentalDirectedInterleaveDataset"
+  name: "ExperimentalDatasetCardinality"
   input_arg {
-    name: "selector_input_dataset"
+    name: "input_dataset"
     type: DT_VARIANT
   }
+  output_arg {
+    name: "cardinality"
+    type: DT_INT64
+  }
+}
+op {
+  name: "ExperimentalDatasetToTFRecord"
   input_arg {
-    name: "data_input_datasets"
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "filename"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "compression_type"
+    type: DT_STRING
+  }
+}
+op {
+  name: "ExperimentalDenseToSparseBatchDataset"
+  input_arg {
+    name: "input_dataset"
     type: DT_VARIANT
-    number_attr: "N"
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "row_shape"
+    type: DT_INT64
   }
   output_arg {
     name: "handle"
@@ -21718,31 +21651,22 @@ op {
     has_minimum: true
     minimum: 1
   }
-  attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 1
-  }
-}
-op {
-  name: "ExperimentalIdentityIndexedDataset"
-  input_arg {
-    name: "size"
-    type: DT_UINT64
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
   is_stateful: true
 }
 op {
-  name: "ExperimentalIgnoreErrorsDataset"
+  name: "ExperimentalDenseToSparseBatchDataset"
   input_arg {
     name: "input_dataset"
     type: DT_VARIANT
   }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "row_shape"
+    type: DT_INT64
+  }
   output_arg {
     name: "handle"
     type: DT_VARIANT
@@ -21761,18 +21685,19 @@ op {
   }
 }
 op {
-  name: "ExperimentalIndexedDatasetGet"
+  name: "ExperimentalDirectedInterleaveDataset"
   input_arg {
-    name: "materialized"
-    type: DT_RESOURCE
+    name: "selector_input_dataset"
+    type: DT_VARIANT
   }
   input_arg {
-    name: "index"
-    type: DT_UINT64
+    name: "data_input_datasets"
+    type: DT_VARIANT
+    number_attr: "N"
   }
   output_arg {
-    name: "components"
-    type_list_attr: "output_types"
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
     name: "output_types"
@@ -21786,42 +21711,75 @@ op {
     has_minimum: true
     minimum: 1
   }
-  is_stateful: true
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
 }
 op {
-  name: "ExperimentalIndexedDatasetMaterialize"
+  name: "ExperimentalGroupByReducerDataset"
   input_arg {
-    name: "dataset"
+    name: "input_dataset"
     type: DT_VARIANT
   }
   input_arg {
-    name: "materialized"
-    type: DT_RESOURCE
+    name: "key_func_other_arguments"
+    type_list_attr: "Tkey_func_other_arguments"
   }
-  is_stateful: true
-}
-op {
-  name: "ExperimentalIteratorGetDevice"
   input_arg {
-    name: "resource"
-    type: DT_RESOURCE
+    name: "init_func_other_arguments"
+    type_list_attr: "Tinit_func_other_arguments"
   }
-  output_arg {
-    name: "device"
-    type: DT_STRING
+  input_arg {
+    name: "reduce_func_other_arguments"
+    type_list_attr: "Treduce_func_other_arguments"
   }
-  is_stateful: true
-}
-op {
-  name: "ExperimentalLMDBDataset"
   input_arg {
-    name: "filenames"
-    type: DT_STRING
+    name: "finalize_func_other_arguments"
+    type_list_attr: "Tfinalize_func_other_arguments"
   }
   output_arg {
     name: "handle"
     type: DT_VARIANT
   }
+  attr {
+    name: "key_func"
+    type: "func"
+  }
+  attr {
+    name: "init_func"
+    type: "func"
+  }
+  attr {
+    name: "reduce_func"
+    type: "func"
+  }
+  attr {
+    name: "finalize_func"
+    type: "func"
+  }
+  attr {
+    name: "Tkey_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tinit_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Treduce_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tfinalize_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
   attr {
     name: "output_types"
     type: "list(type)"
@@ -21837,25 +21795,51 @@ op {
   is_stateful: true
 }
 op {
-  name: "ExperimentalMapDataset"
+  name: "ExperimentalGroupByWindowDataset"
   input_arg {
     name: "input_dataset"
     type: DT_VARIANT
   }
   input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
+    name: "key_func_other_arguments"
+    type_list_attr: "Tkey_func_other_arguments"
+  }
+  input_arg {
+    name: "reduce_func_other_arguments"
+    type_list_attr: "Treduce_func_other_arguments"
+  }
+  input_arg {
+    name: "window_size_func_other_arguments"
+    type_list_attr: "Twindow_size_func_other_arguments"
   }
   output_arg {
     name: "handle"
     type: DT_VARIANT
   }
   attr {
-    name: "f"
+    name: "key_func"
     type: "func"
   }
   attr {
-    name: "Targuments"
+    name: "reduce_func"
+    type: "func"
+  }
+  attr {
+    name: "window_size_func"
+    type: "func"
+  }
+  attr {
+    name: "Tkey_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Treduce_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Twindow_size_func_other_arguments"
     type: "list(type)"
     has_minimum: true
   }
@@ -21871,67 +21855,56 @@ op {
     has_minimum: true
     minimum: 1
   }
-  attr {
-    name: "use_inter_op_parallelism"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
+  is_stateful: true
 }
 op {
-  name: "ExperimentalMatchingFilesDataset"
+  name: "ExperimentalGroupByWindowDataset"
   input_arg {
-    name: "patterns"
-    type: DT_STRING
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "key_func_other_arguments"
+    type_list_attr: "Tkey_func_other_arguments"
+  }
+  input_arg {
+    name: "reduce_func_other_arguments"
+    type_list_attr: "Treduce_func_other_arguments"
+  }
+  input_arg {
+    name: "window_size_func_other_arguments"
+    type_list_attr: "Twindow_size_func_other_arguments"
   }
   output_arg {
     name: "handle"
     type: DT_VARIANT
   }
-  is_stateful: true
-}
-op {
-  name: "ExperimentalMaterializedIndexDatasetHandle"
-  output_arg {
-    name: "handle"
-    type: DT_RESOURCE
+  attr {
+    name: "key_func"
+    type: "func"
   }
   attr {
-    name: "container"
-    type: "string"
+    name: "reduce_func"
+    type: "func"
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "window_size_func"
+    type: "func"
   }
   attr {
-    name: "output_types"
+    name: "Tkey_func_other_arguments"
     type: "list(type)"
     has_minimum: true
-    minimum: 1
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
+    name: "Treduce_func_other_arguments"
+    type: "list(type)"
     has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
-op {
-  name: "ExperimentalMaxIntraOpParallelismDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
   }
-  input_arg {
-    name: "max_intra_op_parallelism"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
+  attr {
+    name: "Twindow_size_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
   }
   attr {
     name: "output_types"
@@ -21947,7 +21920,19 @@ op {
   }
 }
 op {
-  name: "ExperimentalNonSerializableDataset"
+  name: "ExperimentalIdentityIndexedDataset"
+  input_arg {
+    name: "size"
+    type: DT_UINT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalIgnoreErrorsDataset"
   input_arg {
     name: "input_dataset"
     type: DT_VARIANT
@@ -21969,6 +21954,443 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "ExperimentalIndexedDatasetGet"
+  input_arg {
+    name: "materialized"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "index"
+    type: DT_UINT64
+  }
+  output_arg {
+    name: "components"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalIndexedDatasetMaterialize"
+  input_arg {
+    name: "dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "materialized"
+    type: DT_RESOURCE
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalIteratorGetDevice"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "device"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalLMDBDataset"
+  input_arg {
+    name: "filenames"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalLatencyStatsDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalMapAndBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "drop_remainder"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalMapAndBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "drop_remainder"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ExperimentalMapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "use_inter_op_parallelism"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "ExperimentalMapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "use_inter_op_parallelism"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ExperimentalMatchingFilesDataset"
+  input_arg {
+    name: "patterns"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalMaterializedIndexDatasetHandle"
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalMaxIntraOpParallelismDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "max_intra_op_parallelism"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalNonSerializableDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalNumaMapAndBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "drop_remainder"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "ExperimentalNumaMapAndBatchDataset"
   input_arg {
@@ -22016,6 +22438,216 @@ op {
     has_minimum: true
     minimum: 1
   }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ExperimentalParallelInterleaveDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "cycle_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "block_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sloppy"
+    type: DT_BOOL
+  }
+  input_arg {
+    name: "buffer_output_elements"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "prefetch_input_elements"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalParseExampleDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "dense_defaults"
+    type_list_attr: "Tdense"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "sparse_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "dense_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "sparse_types"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "Tdense"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "dense_shapes"
+    type: "list(shape)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalParseExampleDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "dense_defaults"
+    type_list_attr: "Tdense"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "sparse_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "dense_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "sparse_types"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "Tdense"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "dense_shapes"
+    type: "list(shape)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "sloppy"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
   name: "ExperimentalPrivateThreadPoolDataset"
@@ -22044,6 +22676,169 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "ExperimentalRandomDataset"
+  input_arg {
+    name: "seed"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "seed2"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalScanDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "initial_state"
+    type_list_attr: "Tstate"
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Tstate"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalScanDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "initial_state"
+    type_list_attr: "Tstate"
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Tstate"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ExperimentalSetStatsAggregatorDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "stats_aggregator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "counter_prefix"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
 op {
   name: "ExperimentalSleepDataset"
   input_arg {
@@ -22071,6 +22866,107 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "ExperimentalSlidingWindowDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "window_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "window_shift"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "window_stride"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalSqlDataset"
+  input_arg {
+    name: "driver_name"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "data_source_name"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "query"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalStatsAggregatorHandle"
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalStatsAggregatorSummary"
+  input_arg {
+    name: "iterator"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "summary"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
 op {
   name: "ExperimentalThreadPoolDataset"
   input_arg {
@@ -22136,6 +23032,29 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ExperimentalUnbatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "ExperimentalUniqueDataset"
   input_arg {
@@ -26360,207 +27279,6 @@ op {
     }
   }
 }
-op {
-  name: "GroupByReducerDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "key_func_other_arguments"
-    type_list_attr: "Tkey_func_other_arguments"
-  }
-  input_arg {
-    name: "init_func_other_arguments"
-    type_list_attr: "Tinit_func_other_arguments"
-  }
-  input_arg {
-    name: "reduce_func_other_arguments"
-    type_list_attr: "Treduce_func_other_arguments"
-  }
-  input_arg {
-    name: "finalize_func_other_arguments"
-    type_list_attr: "Tfinalize_func_other_arguments"
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "key_func"
-    type: "func"
-  }
-  attr {
-    name: "init_func"
-    type: "func"
-  }
-  attr {
-    name: "reduce_func"
-    type: "func"
-  }
-  attr {
-    name: "finalize_func"
-    type: "func"
-  }
-  attr {
-    name: "Tkey_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "Tinit_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "Treduce_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "Tfinalize_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
-op {
-  name: "GroupByWindowDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "key_func_other_arguments"
-    type_list_attr: "Tkey_func_other_arguments"
-  }
-  input_arg {
-    name: "reduce_func_other_arguments"
-    type_list_attr: "Treduce_func_other_arguments"
-  }
-  input_arg {
-    name: "window_size_func_other_arguments"
-    type_list_attr: "Twindow_size_func_other_arguments"
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "key_func"
-    type: "func"
-  }
-  attr {
-    name: "reduce_func"
-    type: "func"
-  }
-  attr {
-    name: "window_size_func"
-    type: "func"
-  }
-  attr {
-    name: "Tkey_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "Treduce_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "Twindow_size_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
-op {
-  name: "GroupByWindowDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "key_func_other_arguments"
-    type_list_attr: "Tkey_func_other_arguments"
-  }
-  input_arg {
-    name: "reduce_func_other_arguments"
-    type_list_attr: "Treduce_func_other_arguments"
-  }
-  input_arg {
-    name: "window_size_func_other_arguments"
-    type_list_attr: "Twindow_size_func_other_arguments"
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "key_func"
-    type: "func"
-  }
-  attr {
-    name: "reduce_func"
-    type: "func"
-  }
-  attr {
-    name: "window_size_func"
-    type: "func"
-  }
-  attr {
-    name: "Tkey_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "Treduce_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "Twindow_size_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
 op {
   name: "GuaranteeConst"
   input_arg {
@@ -29025,119 +29743,59 @@ op {
   }
 }
 op {
-  name: "LRN"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "depth_radius"
-    type: "int"
-    default_value {
-      i: 5
-    }
-  }
-  attr {
-    name: "bias"
-    type: "float"
-    default_value {
-      f: 1
-    }
-  }
-  attr {
-    name: "alpha"
-    type: "float"
-    default_value {
-      f: 1
-    }
-  }
-  attr {
-    name: "beta"
-    type: "float"
-    default_value {
-      f: 0.5
-    }
-  }
-  attr {
-    name: "T"
-    type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-      }
-    }
-  }
-}
-op {
-  name: "LRNGrad"
-  input_arg {
-    name: "input_grads"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "input_image"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "output_image"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "depth_radius"
-    type: "int"
-    default_value {
-      i: 5
-    }
-  }
-  attr {
-    name: "bias"
-    type: "float"
-    default_value {
-      f: 1
-    }
-  }
-  attr {
-    name: "alpha"
-    type: "float"
-    default_value {
-      f: 1
-    }
-  }
-  attr {
-    name: "beta"
-    type: "float"
-    default_value {
-      f: 0.5
-    }
-  }
-  attr {
-    name: "T"
-    type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_HALF
-      }
-    }
-  }
-}
-op {
+  name: "LRN"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "depth_radius"
+    type: "int"
+    default_value {
+      i: 5
+    }
+  }
+  attr {
+    name: "bias"
+    type: "float"
+    default_value {
+      f: 1
+    }
+  }
+  attr {
+    name: "alpha"
+    type: "float"
+    default_value {
+      f: 1
+    }
+  }
+  attr {
+    name: "beta"
+    type: "float"
+    default_value {
+      f: 0.5
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+      }
+    }
+  }
+}
+op {
   name: "LRNGrad"
   input_arg {
     name: "input_grads"
@@ -29191,38 +29849,71 @@ op {
     }
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_BFLOAT16
         type: DT_FLOAT
+        type: DT_HALF
       }
     }
   }
 }
 op {
-  name: "LatencyStatsDataset"
+  name: "LRNGrad"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "input_grads"
+    type_attr: "T"
   }
   input_arg {
-    name: "tag"
-    type: DT_STRING
+    name: "input_image"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_image"
+    type_attr: "T"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "depth_radius"
+    type: "int"
+    default_value {
+      i: 5
+    }
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "bias"
+    type: "float"
+    default_value {
+      f: 1
+    }
+  }
+  attr {
+    name: "alpha"
+    type: "float"
+    default_value {
+      f: 1
+    }
+  }
+  attr {
+    name: "beta"
+    type: "float"
+    default_value {
+      f: 0.5
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+      }
+    }
   }
 }
 op {
@@ -30670,6 +31361,46 @@ op {
     }
   }
 }
+op {
+  name: "Lu"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "lu"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "p"
+    type_attr: "output_idx_type"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  attr {
+    name: "output_idx_type"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "MakeIterator"
   input_arg {
@@ -30683,7 +31414,45 @@ op {
   is_stateful: true
 }
 op {
-  name: "MapAndBatchDataset"
+  name: "MapClear"
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MapDataset"
   input_arg {
     name: "input_dataset"
     type: DT_VARIANT
@@ -30692,17 +31461,42 @@ op {
     name: "other_arguments"
     type_list_attr: "Targuments"
   }
-  input_arg {
-    name: "batch_size"
-    type: DT_INT64
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
   }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "MapDataset"
   input_arg {
-    name: "num_parallel_batches"
-    type: DT_INT64
+    name: "input_dataset"
+    type: DT_VARIANT
   }
   input_arg {
-    name: "drop_remainder"
-    type: DT_BOOL
+    name: "other_arguments"
+    type_list_attr: "Targuments"
   }
   output_arg {
     name: "handle"
@@ -30731,7 +31525,7 @@ op {
   }
 }
 op {
-  name: "MapAndBatchDatasetV2"
+  name: "MapDataset"
   input_arg {
     name: "input_dataset"
     type: DT_VARIANT
@@ -30740,17 +31534,48 @@ op {
     name: "other_arguments"
     type_list_attr: "Targuments"
   }
-  input_arg {
-    name: "batch_size"
-    type: DT_INT64
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
   }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "use_inter_op_parallelism"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "MapDataset"
   input_arg {
-    name: "num_parallel_calls"
-    type: DT_INT64
+    name: "input_dataset"
+    type: DT_VARIANT
   }
   input_arg {
-    name: "drop_remainder"
-    type: DT_BOOL
+    name: "other_arguments"
+    type_list_attr: "Targuments"
   }
   output_arg {
     name: "handle"
@@ -30777,158 +31602,18 @@ op {
     has_minimum: true
     minimum: 1
   }
-}
-op {
-  name: "MapClear"
   attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
-  }
-  attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
-  }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
-  }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "shared_name"
-    type: "string"
+    name: "use_inter_op_parallelism"
+    type: "bool"
     default_value {
-      s: ""
+      b: true
     }
   }
-  is_stateful: true
-}
-op {
-  name: "MapDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "f"
-    type: "func"
-  }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
-op {
-  name: "MapDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "f"
-    type: "func"
-  }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
-op {
-  name: "MapDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "f"
-    type: "func"
-  }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "use_inter_op_parallelism"
+    name: "preserve_cardinality"
     type: "bool"
     default_value {
-      b: true
+      b: false
     }
   }
 }
@@ -39047,7 +39732,7 @@ op {
   }
 }
 op {
-  name: "ParallelInterleaveDataset"
+  name: "ParallelInterleaveDatasetV2"
   input_arg {
     name: "input_dataset"
     type: DT_VARIANT
@@ -39065,15 +39750,7 @@ op {
     type: DT_INT64
   }
   input_arg {
-    name: "sloppy"
-    type: DT_BOOL
-  }
-  input_arg {
-    name: "buffer_output_elements"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "prefetch_input_elements"
+    name: "num_parallel_calls"
     type: DT_INT64
   }
   output_arg {
@@ -39149,9 +39826,16 @@ op {
     has_minimum: true
     minimum: 1
   }
+  attr {
+    name: "sloppy"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "ParallelInterleaveDatasetV2"
+  name: "ParallelMapDataset"
   input_arg {
     name: "input_dataset"
     type: DT_VARIANT
@@ -39160,17 +39844,9 @@ op {
     name: "other_arguments"
     type_list_attr: "Targuments"
   }
-  input_arg {
-    name: "cycle_length"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "block_length"
-    type: DT_INT64
-  }
   input_arg {
     name: "num_parallel_calls"
-    type: DT_INT64
+    type: DT_INT32
   }
   output_arg {
     name: "handle"
@@ -39197,13 +39873,7 @@ op {
     has_minimum: true
     minimum: 1
   }
-  attr {
-    name: "sloppy"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
+  is_stateful: true
 }
 op {
   name: "ParallelMapDataset"
@@ -39244,7 +39914,6 @@ op {
     has_minimum: true
     minimum: 1
   }
-  is_stateful: true
 }
 op {
   name: "ParallelMapDataset"
@@ -39285,6 +39954,13 @@ op {
     has_minimum: true
     minimum: 1
   }
+  attr {
+    name: "use_inter_op_parallelism"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
 }
 op {
   name: "ParallelMapDataset"
@@ -39332,6 +40008,13 @@ op {
       b: true
     }
   }
+  attr {
+    name: "sloppy"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
   name: "ParallelMapDataset"
@@ -39386,69 +40069,13 @@ op {
       b: false
     }
   }
-}
-op {
-  name: "ParameterizedTruncatedNormal"
-  input_arg {
-    name: "shape"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "means"
-    type_attr: "dtype"
-  }
-  input_arg {
-    name: "stdevs"
-    type_attr: "dtype"
-  }
-  input_arg {
-    name: "minvals"
-    type_attr: "dtype"
-  }
-  input_arg {
-    name: "maxvals"
-    type_attr: "dtype"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "dtype"
-  }
   attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
-    }
-  }
-  attr {
-    name: "seed2"
-    type: "int"
+    name: "preserve_cardinality"
+    type: "bool"
     default_value {
-      i: 0
-    }
-  }
-  attr {
-    name: "dtype"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+      b: false
     }
   }
-  is_stateful: true
 }
 op {
   name: "ParameterizedTruncatedNormal"
@@ -39496,7 +40123,6 @@ op {
     allowed_values {
       list {
         type: DT_HALF
-        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -39515,183 +40141,119 @@ op {
   is_stateful: true
 }
 op {
-  name: "ParseExample"
+  name: "ParameterizedTruncatedNormal"
   input_arg {
-    name: "serialized"
-    type: DT_STRING
+    name: "shape"
+    type_attr: "T"
   }
   input_arg {
-    name: "names"
-    type: DT_STRING
+    name: "means"
+    type_attr: "dtype"
   }
   input_arg {
-    name: "sparse_keys"
-    type: DT_STRING
-    number_attr: "Nsparse"
+    name: "stdevs"
+    type_attr: "dtype"
   }
   input_arg {
-    name: "dense_keys"
-    type: DT_STRING
-    number_attr: "Ndense"
+    name: "minvals"
+    type_attr: "dtype"
   }
   input_arg {
-    name: "dense_defaults"
-    type_list_attr: "Tdense"
-  }
-  output_arg {
-    name: "sparse_indices"
-    type: DT_INT64
-    number_attr: "Nsparse"
-  }
-  output_arg {
-    name: "sparse_values"
-    type_list_attr: "sparse_types"
-  }
-  output_arg {
-    name: "sparse_shapes"
-    type: DT_INT64
-    number_attr: "Nsparse"
+    name: "maxvals"
+    type_attr: "dtype"
   }
   output_arg {
-    name: "dense_values"
-    type_list_attr: "Tdense"
+    name: "output"
+    type_attr: "dtype"
   }
   attr {
-    name: "Nsparse"
+    name: "seed"
     type: "int"
-    has_minimum: true
+    default_value {
+      i: 0
+    }
   }
   attr {
-    name: "Ndense"
+    name: "seed2"
     type: "int"
-    has_minimum: true
+    default_value {
+      i: 0
+    }
   }
   attr {
-    name: "sparse_types"
-    type: "list(type)"
-    has_minimum: true
+    name: "dtype"
+    type: "type"
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
-        type: DT_INT64
-        type: DT_STRING
+        type: DT_DOUBLE
       }
     }
   }
   attr {
-    name: "Tdense"
-    type: "list(type)"
-    has_minimum: true
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
+        type: DT_INT32
         type: DT_INT64
-        type: DT_STRING
       }
     }
   }
-  attr {
-    name: "dense_shapes"
-    type: "list(shape)"
-    has_minimum: true
-  }
+  is_stateful: true
 }
 op {
-  name: "ParseExampleDataset"
+  name: "ParseExample"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "serialized"
+    type: DT_STRING
   }
   input_arg {
-    name: "num_parallel_calls"
-    type: DT_INT64
+    name: "names"
+    type: DT_STRING
   }
   input_arg {
-    name: "dense_defaults"
-    type_list_attr: "Tdense"
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
     name: "sparse_keys"
-    type: "list(string)"
-    has_minimum: true
+    type: DT_STRING
+    number_attr: "Nsparse"
   }
-  attr {
+  input_arg {
     name: "dense_keys"
-    type: "list(string)"
-    has_minimum: true
-  }
-  attr {
-    name: "sparse_types"
-    type: "list(type)"
-    has_minimum: true
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-        type: DT_STRING
-      }
-    }
-  }
-  attr {
-    name: "Tdense"
-    type: "list(type)"
-    has_minimum: true
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-        type: DT_STRING
-      }
-    }
-  }
-  attr {
-    name: "dense_shapes"
-    type: "list(shape)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    type: DT_STRING
+    number_attr: "Ndense"
   }
-}
-op {
-  name: "ParseExampleDataset"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "dense_defaults"
+    type_list_attr: "Tdense"
   }
-  input_arg {
-    name: "num_parallel_calls"
+  output_arg {
+    name: "sparse_indices"
     type: DT_INT64
+    number_attr: "Nsparse"
   }
-  input_arg {
-    name: "dense_defaults"
-    type_list_attr: "Tdense"
+  output_arg {
+    name: "sparse_values"
+    type_list_attr: "sparse_types"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "sparse_shapes"
+    type: DT_INT64
+    number_attr: "Nsparse"
+  }
+  output_arg {
+    name: "dense_values"
+    type_list_attr: "Tdense"
   }
   attr {
-    name: "sparse_keys"
-    type: "list(string)"
+    name: "Nsparse"
+    type: "int"
     has_minimum: true
   }
   attr {
-    name: "dense_keys"
-    type: "list(string)"
+    name: "Ndense"
+    type: "int"
     has_minimum: true
   }
   attr {
@@ -39723,25 +40285,6 @@ op {
     type: "list(shape)"
     has_minimum: true
   }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "sloppy"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
 }
 op {
   name: "ParseSequenceExample"
@@ -40545,69 +41088,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
-  }
-}
-op {
-  name: "Pow"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "y"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "z"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
-  }
-}
-op {
-  name: "Pow"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "y"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "z"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_HALF
+        type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
@@ -40618,32 +41099,66 @@ op {
   }
 }
 op {
-  name: "PrefetchDataset"
+  name: "Pow"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "x"
+    type_attr: "T"
   }
   input_arg {
-    name: "buffer_size"
-    type: DT_INT64
+    name: "y"
+    type_attr: "T"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "z"
+    type_attr: "T"
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Pow"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
   }
-  is_stateful: true
 }
 op {
   name: "PrefetchDataset"
@@ -40671,32 +41186,24 @@ op {
     has_minimum: true
     minimum: 1
   }
+  is_stateful: true
 }
 op {
-  name: "PrependFromQueueAndPaddedBatchDataset"
+  name: "PrefetchDataset"
   input_arg {
     name: "input_dataset"
     type: DT_VARIANT
   }
   input_arg {
-    name: "batch_size"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "padded_shapes"
+    name: "buffer_size"
     type: DT_INT64
-    number_attr: "N"
-  }
-  input_arg {
-    name: "padding_values"
-    type_list_attr: "Toutput_types"
   }
   output_arg {
     name: "handle"
     type: DT_VARIANT
   }
   attr {
-    name: "Toutput_types"
+    name: "output_types"
     type: "list(type)"
     has_minimum: true
     minimum: 1
@@ -40707,12 +41214,6 @@ op {
     has_minimum: true
     minimum: 1
   }
-  attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 1
-  }
 }
 op {
   name: "PreventGradient"
@@ -44990,34 +45491,6 @@ op {
   }
   is_stateful: true
 }
-op {
-  name: "RandomDataset"
-  input_arg {
-    name: "seed"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "seed2"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
 op {
   name: "RandomGamma"
   input_arg {
@@ -49402,6 +49875,86 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ResourceApplyAdamWithAmsgrad"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "m"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "v"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "vhat"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "beta1_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "ResourceApplyAddSign"
   input_arg {
@@ -50050,21 +50603,89 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyFtrl"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "linear"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
@@ -50078,7 +50699,7 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceApplyFtrl"
+  name: "ResourceApplyFtrlV2"
   input_arg {
     name: "var"
     type: DT_RESOURCE
@@ -50107,6 +50728,10 @@ op {
     name: "l2"
     type_attr: "T"
   }
+  input_arg {
+    name: "l2_shrinkage"
+    type_attr: "T"
+  }
   input_arg {
     name: "lr_power"
     type_attr: "T"
@@ -50118,21 +50743,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_INT64
+        type: DT_COMPLEX128
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
@@ -50202,6 +50824,8 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -50273,6 +50897,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -50330,21 +50955,21 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
@@ -50358,41 +50983,17 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceApplyFtrlV2"
+  name: "ResourceApplyGradientDescent"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "accum"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "linear"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l1"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l2"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l2_shrinkage"
+    name: "alpha"
     type_attr: "T"
   }
   input_arg {
-    name: "lr_power"
+    name: "delta"
     type_attr: "T"
   }
   attr {
@@ -50402,21 +51003,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_INT64
+        type: DT_COMPLEX128
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
@@ -50462,6 +51060,8 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -50509,6 +51109,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -50542,21 +51143,21 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
@@ -50570,17 +51171,25 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceApplyGradientDescent"
+  name: "ResourceApplyKerasMomentum"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "alpha"
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "delta"
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
     type_attr: "T"
   }
   attr {
@@ -50615,6 +51224,13 @@ op {
       b: false
     }
   }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
   is_stateful: true
 }
 op {
@@ -52670,17 +53286,173 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyAdadelta"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum_update"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyAdadelta"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum_update"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -52707,7 +53479,7 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyAdadelta"
+  name: "ResourceSparseApplyAdagrad"
   input_arg {
     name: "var"
     type: DT_RESOURCE
@@ -52716,20 +53488,71 @@ op {
     name: "accum"
     type: DT_RESOURCE
   }
-  input_arg {
-    name: "accum_update"
-    type: DT_RESOURCE
-  }
   input_arg {
     name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "rho"
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
-    name: "epsilon"
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyAdagrad"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
@@ -52761,7 +53584,6 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
@@ -52785,7 +53607,7 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyAdadelta"
+  name: "ResourceSparseApplyAdagrad"
   input_arg {
     name: "var"
     type: DT_RESOURCE
@@ -52794,20 +53616,74 @@ op {
     name: "accum"
     type: DT_RESOURCE
   }
-  input_arg {
-    name: "accum_update"
-    type: DT_RESOURCE
-  }
   input_arg {
     name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "rho"
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
-    name: "epsilon"
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyAdagrad"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
@@ -52891,18 +53767,21 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -52923,21 +53802,28 @@ op {
       b: false
     }
   }
+  attr {
+    name: "update_slots"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyAdagrad"
+  name: "ResourceSparseApplyAdagradDA"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "accum"
+    name: "gradient_accumulator"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "lr"
-    type_attr: "T"
+    name: "gradient_squared_accumulator"
+    type: DT_RESOURCE
   }
   input_arg {
     name: "grad"
@@ -52947,6 +53833,22 @@ op {
     name: "indices"
     type_attr: "Tindices"
   }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "global_step"
+    type: DT_INT64
+  }
   attr {
     name: "T"
     type: "type"
@@ -52966,8 +53868,6 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
@@ -52991,18 +53891,18 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyAdagrad"
+  name: "ResourceSparseApplyAdagradDA"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "accum"
+    name: "gradient_accumulator"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "lr"
-    type_attr: "T"
+    name: "gradient_squared_accumulator"
+    type: DT_RESOURCE
   }
   input_arg {
     name: "grad"
@@ -53012,6 +53912,22 @@ op {
     name: "indices"
     type_attr: "Tindices"
   }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "global_step"
+    type: DT_INT64
+  }
   attr {
     name: "T"
     type: "type"
@@ -53033,7 +53949,6 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
@@ -53057,18 +53972,18 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyAdagrad"
+  name: "ResourceSparseApplyAdagradDA"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "accum"
+    name: "gradient_accumulator"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "lr"
-    type_attr: "T"
+    name: "gradient_squared_accumulator"
+    type: DT_RESOURCE
   }
   input_arg {
     name: "grad"
@@ -53078,6 +53993,22 @@ op {
     name: "indices"
     type_attr: "Tindices"
   }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "global_step"
+    type: DT_INT64
+  }
   attr {
     name: "T"
     type: "type"
@@ -53085,21 +54016,21 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_INT64
+        type: DT_COMPLEX128
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -53123,18 +54054,18 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyAdagrad"
+  name: "ResourceSparseApplyAdagradDA"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "accum"
+    name: "gradient_accumulator"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "lr"
-    type_attr: "T"
+    name: "gradient_squared_accumulator"
+    type: DT_RESOURCE
   }
   input_arg {
     name: "grad"
@@ -53144,6 +54075,22 @@ op {
     name: "indices"
     type_attr: "Tindices"
   }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "global_step"
+    type: DT_INT64
+  }
   attr {
     name: "T"
     type: "type"
@@ -53186,107 +54133,41 @@ op {
       b: false
     }
   }
-  attr {
-    name: "update_slots"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyAdagradDA"
+  name: "ResourceSparseApplyCenteredRMSProp"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "gradient_accumulator"
+    name: "mg"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "gradient_squared_accumulator"
+    name: "ms"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "mom"
+    type: DT_RESOURCE
   }
   input_arg {
     name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "l1"
+    name: "rho"
     type_attr: "T"
   }
   input_arg {
-    name: "l2"
+    name: "momentum"
     type_attr: "T"
   }
   input_arg {
-    name: "global_step"
-    type: DT_INT64
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-      }
-    }
-  }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  is_stateful: true
-}
-op {
-  name: "ResourceSparseApplyAdagradDA"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "gradient_accumulator"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "gradient_squared_accumulator"
-    type: DT_RESOURCE
+    name: "epsilon"
+    type_attr: "T"
   }
   input_arg {
     name: "grad"
@@ -53296,22 +54177,6 @@ op {
     name: "indices"
     type_attr: "Tindices"
   }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l1"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l2"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "global_step"
-    type: DT_INT64
-  }
   attr {
     name: "T"
     type: "type"
@@ -53331,8 +54196,6 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
@@ -53356,42 +54219,46 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyAdagradDA"
+  name: "ResourceSparseApplyCenteredRMSProp"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "gradient_accumulator"
+    name: "mg"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "gradient_squared_accumulator"
+    name: "ms"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "grad"
+    name: "mom"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "rho"
+    type_attr: "T"
   }
   input_arg {
-    name: "lr"
+    name: "momentum"
     type_attr: "T"
   }
   input_arg {
-    name: "l1"
+    name: "epsilon"
     type_attr: "T"
   }
   input_arg {
-    name: "l2"
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
-    name: "global_step"
-    type: DT_INT64
+    name: "indices"
+    type_attr: "Tindices"
   }
   attr {
     name: "T"
@@ -53414,7 +54281,6 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
@@ -53438,42 +54304,46 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyAdagradDA"
+  name: "ResourceSparseApplyCenteredRMSProp"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "gradient_accumulator"
+    name: "mg"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "gradient_squared_accumulator"
+    name: "ms"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "grad"
+    name: "mom"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "rho"
+    type_attr: "T"
   }
   input_arg {
-    name: "lr"
+    name: "momentum"
     type_attr: "T"
   }
   input_arg {
-    name: "l1"
+    name: "epsilon"
     type_attr: "T"
   }
   input_arg {
-    name: "l2"
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
-    name: "global_step"
-    type: DT_INT64
+    name: "indices"
+    type_attr: "Tindices"
   }
   attr {
     name: "T"
@@ -53482,21 +54352,21 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_INT64
+        type: DT_COMPLEX128
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -53568,18 +54438,21 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -53603,46 +54476,42 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyCenteredRMSProp"
+  name: "ResourceSparseApplyFtrl"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "mg"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "ms"
+    name: "accum"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "mom"
+    name: "linear"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "lr"
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
-    name: "rho"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "momentum"
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "epsilon"
+    name: "l1"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "l2"
     type_attr: "T"
   }
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "lr_power"
+    type_attr: "T"
   }
   attr {
     name: "T"
@@ -53663,8 +54532,6 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
@@ -53688,46 +54555,42 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyCenteredRMSProp"
+  name: "ResourceSparseApplyFtrl"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "mg"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "ms"
+    name: "accum"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "mom"
+    name: "linear"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "lr"
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
-    name: "rho"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "momentum"
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "epsilon"
+    name: "l1"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "l2"
     type_attr: "T"
   }
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "lr_power"
+    type_attr: "T"
   }
   attr {
     name: "T"
@@ -53750,7 +54613,6 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
@@ -53774,46 +54636,42 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyCenteredRMSProp"
+  name: "ResourceSparseApplyFtrl"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "mg"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "ms"
+    name: "accum"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "mom"
+    name: "linear"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "lr"
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
-    name: "rho"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "momentum"
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "epsilon"
+    name: "l1"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "l2"
     type_attr: "T"
   }
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "lr_power"
+    type_attr: "T"
   }
   attr {
     name: "T"
@@ -53822,21 +54680,21 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_INT64
+        type: DT_COMPLEX128
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -53904,96 +54762,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
-        type: DT_HALF
-      }
-    }
-  }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  is_stateful: true
-}
-op {
-  name: "ResourceSparseApplyFtrl"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "accum"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "linear"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
-  }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l1"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l2"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "lr_power"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
+        type: DT_BFLOAT16
         type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
         type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -54020,7 +54800,7 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyFtrl"
+  name: "ResourceSparseApplyFtrlV2"
   input_arg {
     name: "var"
     type: DT_RESOURCE
@@ -54054,85 +54834,7 @@ op {
     type_attr: "T"
   }
   input_arg {
-    name: "lr_power"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-        type: DT_BFLOAT16
-      }
-    }
-  }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  is_stateful: true
-}
-op {
-  name: "ResourceSparseApplyFtrl"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "accum"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "linear"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
-  }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l1"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l2"
+    name: "l2_shrinkage"
     type_attr: "T"
   }
   input_arg {
@@ -54146,21 +54848,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_INT64
+        type: DT_COMPLEX128
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
@@ -54244,6 +54943,8 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -54329,6 +55030,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -54400,21 +55102,21 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
@@ -54438,7 +55140,7 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyFtrlV2"
+  name: "ResourceSparseApplyKerasMomentum"
   input_arg {
     name: "var"
     type: DT_RESOURCE
@@ -54448,8 +55150,8 @@ op {
     type: DT_RESOURCE
   }
   input_arg {
-    name: "linear"
-    type: DT_RESOURCE
+    name: "lr"
+    type_attr: "T"
   }
   input_arg {
     name: "grad"
@@ -54460,23 +55162,7 @@ op {
     type_attr: "Tindices"
   }
   input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l1"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l2"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l2_shrinkage"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "lr_power"
+    name: "momentum"
     type_attr: "T"
   }
   attr {
@@ -54521,6 +55207,13 @@ op {
       b: false
     }
   }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
   is_stateful: true
 }
 op {
@@ -57257,52 +57950,6 @@ op {
     }
   }
 }
-op {
-  name: "ScanDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "initial_state"
-    type_list_attr: "Tstate"
-  }
-  input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "f"
-    type: "func"
-  }
-  attr {
-    name: "Tstate"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
 op {
   name: "ScatterAdd"
   input_arg {
@@ -61035,42 +61682,6 @@ op {
     }
   }
 }
-op {
-  name: "SetStatsAggregatorDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "stats_aggregator"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "tag"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "counter_prefix"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
 op {
   name: "Shape"
   input_arg {
@@ -61917,41 +62528,6 @@ op {
     }
   }
 }
-op {
-  name: "SlideDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "window_size"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "window_shift"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "window_stride"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
 op {
   name: "Snapshot"
   input_arg {
@@ -71110,38 +71686,6 @@ op {
     }
   }
 }
-op {
-  name: "SqlDataset"
-  input_arg {
-    name: "driver_name"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "data_source_name"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "query"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
 op {
   name: "Sqrt"
   input_arg {
@@ -72702,40 +73246,6 @@ op {
     }
   }
 }
-op {
-  name: "StatsAggregatorHandle"
-  output_arg {
-    name: "handle"
-    type: DT_RESOURCE
-  }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  is_stateful: true
-}
-op {
-  name: "StatsAggregatorSummary"
-  input_arg {
-    name: "iterator"
-    type: DT_RESOURCE
-  }
-  output_arg {
-    name: "summary"
-    type: DT_STRING
-  }
-  is_stateful: true
-}
 op {
   name: "StopGradient"
   input_arg {
@@ -75673,6 +76183,25 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "TensorListConcat"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  output_arg {
+    name: "lengths"
+    type: DT_INT64
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+}
 op {
   name: "TensorListConcatLists"
   input_arg {
@@ -75933,6 +76462,39 @@ op {
     type: "type"
   }
 }
+op {
+  name: "TensorListSplit"
+  input_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  input_arg {
+    name: "element_shape"
+    type_attr: "shape_type"
+  }
+  input_arg {
+    name: "lengths"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "TensorListStack"
   input_arg {
@@ -77204,29 +77766,6 @@ op {
     type: "type"
   }
 }
-op {
-  name: "UnbatchDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
 op {
   name: "UnbatchGrad"
   input_arg {
@@ -77268,6 +77807,53 @@ op {
     type: "type"
   }
 }
+op {
+  name: "UnicodeDecode"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "row_splits"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "char_values"
+    type: DT_INT32
+  }
+  attr {
+    name: "input_encoding"
+    type: "string"
+  }
+  attr {
+    name: "errors"
+    type: "string"
+    default_value {
+      s: "replace"
+    }
+    allowed_values {
+      list {
+        s: "strict"
+        s: "replace"
+        s: "ignore"
+      }
+    }
+  }
+  attr {
+    name: "replacement_char"
+    type: "int"
+    default_value {
+      i: 65533
+    }
+  }
+  attr {
+    name: "replace_control_characters"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
 op {
   name: "UnicodeDecodeWithOffsets"
   input_arg {
@@ -78523,6 +79109,17 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "UnwrapDatasetVariant"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+}
 op {
   name: "UpperBound"
   input_arg {
@@ -78956,6 +79553,17 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "WrapDatasetVariant"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+}
 op {
   name: "WriteAudioSummary"
   input_arg {
diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index e7212b7004f6cc9906f8621642e14b5719dc5daf..1c117166de029d40b84bbd2335b9315cdc53bcba 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -83,13 +83,6 @@ REGISTER_OP("GeneratorDataset")
                       // stateful to inhibit constant folding.
     .SetShapeFn(shape_inference::ScalarShape);
 
-REGISTER_OP("UnbatchDataset")
-    .Input("input_dataset: variant")
-    .Output("handle: variant")
-    .Attr("output_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
-
 REGISTER_OP("ZipDataset")
     .Input("input_datasets: N * variant")
     .Output("handle: variant")
@@ -142,57 +135,6 @@ REGISTER_OP("SkipDataset")
       return shape_inference::ScalarShape(c);
     });
 
-REGISTER_OP("BytesProducedStatsDataset")
-    .Input("input_dataset: variant")
-    .Input("tag: string")
-    .Output("handle: variant")
-    .Attr("output_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      shape_inference::ShapeHandle tag_shape;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &tag_shape));
-      return shape_inference::ScalarShape(c);
-    });
-
-REGISTER_OP("LatencyStatsDataset")
-    .Input("input_dataset: variant")
-    .Input("tag: string")
-    .Output("handle: variant")
-    .Attr("output_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      shape_inference::ShapeHandle tag_shape;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &tag_shape));
-      return shape_inference::ScalarShape(c);
-    });
-
-REGISTER_OP("ParseExampleDataset")
-    .Input("input_dataset: variant")
-    .Input("num_parallel_calls: int64")
-    .Input("dense_defaults: Tdense")
-    .Output("handle: variant")
-    .Attr("sparse_keys: list(string) >= 0")
-    .Attr("dense_keys: list(string) >= 0")
-    .Attr("sparse_types: list({float,int64,string}) >= 0")
-    .Attr("Tdense: list({float,int64,string}) >= 0")
-    .Attr("dense_shapes: list(shape) >= 0")
-    .Attr("output_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")  // Output components will be
-                                              // sorted by key (dense_keys and
-                                              // sparse_keys combined) here.
-    .Attr("sloppy: bool = false")
-    .SetShapeFn(shape_inference::ScalarShape);
-
-REGISTER_OP("SetStatsAggregatorDataset")
-    .Input("input_dataset: variant")
-    .Input("stats_aggregator: resource")
-    .Input("tag: string")
-    .Input("counter_prefix: string")
-    .Output("handle: variant")
-    .Attr("output_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
-
 REGISTER_OP("MapDataset")
     .Input("input_dataset: variant")
     .Input("other_arguments: Targuments")
@@ -202,6 +144,7 @@ REGISTER_OP("MapDataset")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("use_inter_op_parallelism: bool = true")
+    .Attr("preserve_cardinality: bool = false")
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("ParallelMapDataset")
@@ -215,60 +158,9 @@ REGISTER_OP("ParallelMapDataset")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("use_inter_op_parallelism: bool = true")
     .Attr("sloppy: bool = false")
+    .Attr("preserve_cardinality: bool = false")
     .SetShapeFn(shape_inference::ScalarShape);
 
-REGISTER_OP("MapAndBatchDataset")
-    .Input("input_dataset: variant")
-    .Input("other_arguments: Targuments")
-    .Input("batch_size: int64")
-    .Input("num_parallel_batches: int64")
-    .Input("drop_remainder: bool")
-    .Output("handle: variant")
-    .Attr("f: func")
-    .Attr("Targuments: list(type) >= 0")
-    .Attr("output_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      // Use index from the end to retrieve the Input shapes,
-      // so that to avoid guessing the length of "other_arguments".
-      // batch_size, num_parallel_batches, and drop_remainder are 0-D scalars.
-      shape_inference::ShapeHandle unused;
-      TF_RETURN_IF_ERROR(
-          c->WithRank(c->input(c->num_inputs() - 3), 0, &unused));
-      TF_RETURN_IF_ERROR(
-          c->WithRank(c->input(c->num_inputs() - 2), 0, &unused));
-      TF_RETURN_IF_ERROR(
-          c->WithRank(c->input(c->num_inputs() - 1), 0, &unused));
-
-      return shape_inference::ScalarShape(c);
-    });
-
-REGISTER_OP("MapAndBatchDatasetV2")
-    .Input("input_dataset: variant")
-    .Input("other_arguments: Targuments")
-    .Input("batch_size: int64")
-    .Input("num_parallel_calls: int64")
-    .Input("drop_remainder: bool")
-    .Output("handle: variant")
-    .Attr("f: func")
-    .Attr("Targuments: list(type) >= 0")
-    .Attr("output_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      // Use index from the end to retrieve the Input shapes,
-      // so that to avoid guessing the length of "other_arguments".
-      // batch_size, num_parallel_calls, and drop_remainder are 0-D scalars.
-      shape_inference::ShapeHandle unused;
-      TF_RETURN_IF_ERROR(
-          c->WithRank(c->input(c->num_inputs() - 3), 0, &unused));
-      TF_RETURN_IF_ERROR(
-          c->WithRank(c->input(c->num_inputs() - 2), 0, &unused));
-      TF_RETURN_IF_ERROR(
-          c->WithRank(c->input(c->num_inputs() - 1), 0, &unused));
-
-      return shape_inference::ScalarShape(c);
-    });
-
 REGISTER_OP("PrefetchDataset")
     .Input("input_dataset: variant")
     .Input("buffer_size: int64")
@@ -282,18 +174,6 @@ REGISTER_OP("PrefetchDataset")
       return shape_inference::ScalarShape(c);
     });
 
-REGISTER_OP("ScanDataset")
-    .Input("input_dataset: variant")
-    .Input("initial_state: Tstate")
-    .Input("other_arguments: Targuments")
-    .Output("handle: variant")
-    .Attr("f: func")
-    .Attr("Tstate: list(type) >= 1")
-    .Attr("Targuments: list(type) >= 0")
-    .Attr("output_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
-
 REGISTER_OP("FlatMapDataset")
     .Input("input_dataset: variant")
     .Input("other_arguments: Targuments")
@@ -316,21 +196,6 @@ REGISTER_OP("InterleaveDataset")
     .Attr("output_shapes: list(shape) >= 1")
     .SetShapeFn(shape_inference::ScalarShape);
 
-REGISTER_OP("ParallelInterleaveDataset")
-    .Input("input_dataset: variant")
-    .Input("other_arguments: Targuments")
-    .Input("cycle_length: int64")
-    .Input("block_length: int64")
-    .Input("sloppy: bool")
-    .Input("buffer_output_elements: int64")
-    .Input("prefetch_input_elements: int64")
-    .Output("handle: variant")
-    .Attr("f: func")
-    .Attr("Targuments: list(type) >= 0")
-    .Attr("output_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
-
 REGISTER_OP("ParallelInterleaveDatasetV2")
     .Input("input_dataset: variant")
     .Input("other_arguments: Targuments")
@@ -345,43 +210,6 @@ REGISTER_OP("ParallelInterleaveDatasetV2")
     .Attr("sloppy: bool = false")
     .SetShapeFn(shape_inference::ScalarShape);
 
-REGISTER_OP("GroupByReducerDataset")
-    .Input("input_dataset: variant")
-    .Input("key_func_other_arguments: Tkey_func_other_arguments")
-    .Input("init_func_other_arguments: Tinit_func_other_arguments")
-    .Input("reduce_func_other_arguments: Treduce_func_other_arguments")
-    .Input("finalize_func_other_arguments: Tfinalize_func_other_arguments")
-    .Output("handle: variant")
-    .Attr("key_func: func")
-    .Attr("init_func: func")
-    .Attr("reduce_func: func")
-    .Attr("finalize_func: func")
-    .Attr("Tkey_func_other_arguments: list(type) >= 0")
-    .Attr("Tinit_func_other_arguments: list(type) >= 0")
-    .Attr("Treduce_func_other_arguments: list(type) >= 0")
-    .Attr("Tfinalize_func_other_arguments: list(type) >= 0")
-    .Attr("output_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")
-    .SetIsStateful()
-    .SetShapeFn(shape_inference::ScalarShape);
-
-REGISTER_OP("GroupByWindowDataset")
-    .Input("input_dataset: variant")
-    .Input("key_func_other_arguments: Tkey_func_other_arguments")
-    .Input("reduce_func_other_arguments: Treduce_func_other_arguments")
-    .Input(
-        "window_size_func_other_arguments: Twindow_size_func_other_arguments")
-    .Output("handle: variant")
-    .Attr("key_func: func")
-    .Attr("reduce_func: func")
-    .Attr("window_size_func: func")
-    .Attr("Tkey_func_other_arguments: list(type) >= 0")
-    .Attr("Treduce_func_other_arguments: list(type) >= 0")
-    .Attr("Twindow_size_func_other_arguments: list(type) >= 0")
-    .Attr("output_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
-
 REGISTER_OP("FilterDataset")
     .Input("input_dataset: variant")
     .Input("other_arguments: Targuments")
@@ -447,23 +275,6 @@ REGISTER_OP("BatchDatasetV2")
       return shape_inference::ScalarShape(c);
     });
 
-REGISTER_OP("SlideDataset")
-    .Input("input_dataset: variant")
-    .Input("window_size: int64")
-    .Input("window_shift: int64")
-    .Input("window_stride: int64")
-    .Output("handle: variant")
-    .Attr("output_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      shape_inference::ShapeHandle unused;
-      // window_size, window_shift, and window_stride should be scalars.
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
-      return shape_inference::ScalarShape(c);
-    });
-
 // TODO(mrry): Validate that `padded_shapes` are all vectors, the lengths of
 // `output_types` and `output_shapes` are `N` the `output_shapes` are (as far as
 // possible to tell statically) compatible with `padded_shapes`, and that
@@ -504,22 +315,6 @@ REGISTER_OP("PaddedBatchDatasetV2")
       return shape_inference::ScalarShape(c);
     });
 
-REGISTER_OP("DenseToSparseBatchDataset")
-    .Input("input_dataset: variant")
-    .Input("batch_size: int64")
-    .Input("row_shape: int64")
-    .Output("handle: variant")
-    .Attr("output_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      shape_inference::ShapeHandle unused;
-      // batch_size should be a scalar.
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
-      // row_shape should be a 1-D vector.
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused));
-      return shape_inference::ScalarShape(c);
-    });
-
 REGISTER_OP("RangeDataset")
     .Input("start: int64")
     .Input("stop: int64")
@@ -538,22 +333,6 @@ REGISTER_OP("RangeDataset")
       return shape_inference::ScalarShape(c);
     });
 
-REGISTER_OP("RandomDataset")
-    .Input("seed: int64")
-    .Input("seed2: int64")
-    .Output("handle: variant")
-    .Attr("output_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
-                      // stateful to inhibit constant folding.
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      shape_inference::ShapeHandle unused;
-      // buffer_size, seed, and seed2 should be scalars.
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
-      return shape_inference::ScalarShape(c);
-    });
-
 REGISTER_OP("ShuffleDataset")
     .Input("input_dataset: variant")
     .Input("buffer_size: int64")
@@ -622,24 +401,6 @@ REGISTER_OP("TextLineDataset")
       return shape_inference::ScalarShape(c);
     });
 
-REGISTER_OP("SqlDataset")
-    .Input("driver_name: string")
-    .Input("data_source_name: string")
-    .Input("query: string")
-    .Output("handle: variant")
-    .Attr("output_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
-                      // stateful to inhibit constant folding.
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      shape_inference::ShapeHandle unused;
-      // driver_name, data_source_name, and query should be scalars.
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
-      return shape_inference::ScalarShape(c);
-    });
-
 REGISTER_OP("FixedLengthRecordDataset")
     .Input("filenames: string")
     .Input("header_bytes: int64")
@@ -826,53 +587,6 @@ REGISTER_OP("DeserializeIterator")
     .Input("serialized: variant")
     .SetShapeFn(shape_inference::NoOutputs);
 
-REGISTER_OP("StatsAggregatorHandle")
-    .Output("handle: resource")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Attr("container: string = ''")
-    .Attr("shared_name: string = ''");
-
-REGISTER_OP("StatsAggregatorSummary")
-    .Input("iterator: resource")
-    .Output("summary: string")
-    .SetShapeFn(shape_inference::ScalarShape);
-
-REGISTER_OP("PrependFromQueueAndPaddedBatchDataset")
-    .Input("input_dataset: variant")
-    .Input("batch_size: int64")
-    .Input("padded_shapes: N * int64")
-    .Input("padding_values: Toutput_types")
-    .Output("handle: variant")
-    .Attr("Toutput_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")
-    .Attr("N: int >= 1")
-    // TODO(ebrevdo): Validate that `padded_shapes` are all vectors, the lengths
-    // of `Toutput_types` and `output_shapes` are `N`, that the
-    // length of `output_types` is `N`, the `output_shapes` are
-    // (as far as possible to tell statically) compatible with `padded_shapes`,
-    // and that `padding_values` are all scalars.
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      shape_inference::ShapeHandle unused;
-      // batch_size should be a scalar.
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
-      return shape_inference::ScalarShape(c);
-    });
-
-REGISTER_OP("EnqueueInQueueDataset")
-    .Input("queue: variant")
-    .Input("components: Tcomponents")
-    .Attr("Tcomponents: list(type) >= 1")
-    .SetIsStateful()  // To avoid CSE on multiple calls to Enqueue.
-    // TODO(ebrevdo): SetShapeFn to test input dtypes and shapes by
-    // reading from queue handle (is that even possible?).
-    .SetShapeFn(shape_inference::NoOutputs);
-
-REGISTER_OP("DatasetToTFRecord")
-    .Input("input_dataset: variant")
-    .Input("filename: string")
-    .Input("compression_type: string")
-    .SetShapeFn(shape_inference::NoOutputs);
-
 REGISTER_OP("DatasetToGraph")
     .Input("input_dataset: variant")
     .Output("graph: string")
@@ -973,6 +687,16 @@ REGISTER_OP("MapDefun")
       return Status::OK();
     });
 
+REGISTER_OP("WrapDatasetVariant")
+    .Input("input_handle: variant")
+    .Output("output_handle: variant")
+    .SetShapeFn(shape_inference::ScalarShape);
+
+REGISTER_OP("UnwrapDatasetVariant")
+    .Input("input_handle: variant")
+    .Output("output_handle: variant")
+    .SetShapeFn(shape_inference::ScalarShape);
+
 REGISTER_OP("MultiDeviceIterator")
     .Output("handle: resource")
     .Attr("devices: list(string) >= 1")
diff --git a/tensorflow/core/ops/experimental_dataset_ops.cc b/tensorflow/core/ops/experimental_dataset_ops.cc
index 2fd6cd0cd65b73a5e57d3f21cc174fefaa1085cf..f904e2536dfe67facc25335dc3f86b3d45fd116f 100644
--- a/tensorflow/core/ops/experimental_dataset_ops.cc
+++ b/tensorflow/core/ops/experimental_dataset_ops.cc
@@ -17,14 +17,17 @@ limitations under the License.
 
 namespace tensorflow {
 
-REGISTER_OP("ExperimentalDirectedInterleaveDataset")
-    .Input("selector_input_dataset: variant")
-    .Input("data_input_datasets: N * variant")
+REGISTER_OP("ExperimentalBytesProducedStatsDataset")
+    .Input("input_dataset: variant")
+    .Input("tag: string")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .Attr("N: int >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle tag_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &tag_shape));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("ExperimentalCSVDataset")
     .Input("filenames: string")
@@ -68,6 +71,79 @@ REGISTER_OP("ExperimentalCSVDataset")
       return shape_inference::ScalarShape(c);
     });
 
+REGISTER_OP("ExperimentalDatasetCardinality")
+    .Input("input_dataset: variant")
+    .Output("cardinality: int64")
+    .SetShapeFn(shape_inference::ScalarShape);
+
+REGISTER_OP("ExperimentalDatasetToTFRecord")
+    .Input("input_dataset: variant")
+    .Input("filename: string")
+    .Input("compression_type: string")
+    .SetShapeFn(shape_inference::NoOutputs);
+
+REGISTER_OP("ExperimentalDenseToSparseBatchDataset")
+    .Input("input_dataset: variant")
+    .Input("batch_size: int64")
+    .Input("row_shape: int64")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // batch_size should be a scalar.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      // row_shape should be a 1-D vector.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused));
+      return shape_inference::ScalarShape(c);
+    });
+
+REGISTER_OP("ExperimentalDirectedInterleaveDataset")
+    .Input("selector_input_dataset: variant")
+    .Input("data_input_datasets: N * variant")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .Attr("N: int >= 1")
+    .SetShapeFn(shape_inference::ScalarShape);
+
+REGISTER_OP("ExperimentalGroupByReducerDataset")
+    .Input("input_dataset: variant")
+    .Input("key_func_other_arguments: Tkey_func_other_arguments")
+    .Input("init_func_other_arguments: Tinit_func_other_arguments")
+    .Input("reduce_func_other_arguments: Treduce_func_other_arguments")
+    .Input("finalize_func_other_arguments: Tfinalize_func_other_arguments")
+    .Output("handle: variant")
+    .Attr("key_func: func")
+    .Attr("init_func: func")
+    .Attr("reduce_func: func")
+    .Attr("finalize_func: func")
+    .Attr("Tkey_func_other_arguments: list(type) >= 0")
+    .Attr("Tinit_func_other_arguments: list(type) >= 0")
+    .Attr("Treduce_func_other_arguments: list(type) >= 0")
+    .Attr("Tfinalize_func_other_arguments: list(type) >= 0")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::ScalarShape);
+
+REGISTER_OP("ExperimentalGroupByWindowDataset")
+    .Input("input_dataset: variant")
+    .Input("key_func_other_arguments: Tkey_func_other_arguments")
+    .Input("reduce_func_other_arguments: Treduce_func_other_arguments")
+    .Input(
+        "window_size_func_other_arguments: Twindow_size_func_other_arguments")
+    .Output("handle: variant")
+    .Attr("key_func: func")
+    .Attr("reduce_func: func")
+    .Attr("window_size_func: func")
+    .Attr("Tkey_func_other_arguments: list(type) >= 0")
+    .Attr("Treduce_func_other_arguments: list(type) >= 0")
+    .Attr("Twindow_size_func_other_arguments: list(type) >= 0")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape);
+
 REGISTER_OP("ExperimentalIgnoreErrorsDataset")
     .Input("input_dataset: variant")
     .Output("handle: variant")
@@ -75,6 +151,45 @@ REGISTER_OP("ExperimentalIgnoreErrorsDataset")
     .Attr("output_shapes: list(shape) >= 1")
     .SetShapeFn(shape_inference::ScalarShape);
 
+REGISTER_OP("ExperimentalLatencyStatsDataset")
+    .Input("input_dataset: variant")
+    .Input("tag: string")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle tag_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &tag_shape));
+      return shape_inference::ScalarShape(c);
+    });
+
+REGISTER_OP("ExperimentalMapAndBatchDataset")
+    .Input("input_dataset: variant")
+    .Input("other_arguments: Targuments")
+    .Input("batch_size: int64")
+    .Input("num_parallel_calls: int64")
+    .Input("drop_remainder: bool")
+    .Output("handle: variant")
+    .Attr("f: func")
+    .Attr("Targuments: list(type) >= 0")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .Attr("preserve_cardinality: bool = false")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      // Use index from the end to retrieve the Input shapes,
+      // so that to avoid guessing the length of "other_arguments".
+      // batch_size, num_parallel_calls, and drop_remainder are 0-D scalars.
+      shape_inference::ShapeHandle unused;
+      TF_RETURN_IF_ERROR(
+          c->WithRank(c->input(c->num_inputs() - 3), 0, &unused));
+      TF_RETURN_IF_ERROR(
+          c->WithRank(c->input(c->num_inputs() - 2), 0, &unused));
+      TF_RETURN_IF_ERROR(
+          c->WithRank(c->input(c->num_inputs() - 1), 0, &unused));
+
+      return shape_inference::ScalarShape(c);
+    });
+
 REGISTER_OP("ExperimentalMapDataset")
     .Input("input_dataset: variant")
     .Input("other_arguments: Targuments")
@@ -84,6 +199,7 @@ REGISTER_OP("ExperimentalMapDataset")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("use_inter_op_parallelism: bool = true")
+    .Attr("preserve_cardinality: bool = false")
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("ExperimentalMatchingFilesDataset")
@@ -105,6 +221,77 @@ REGISTER_OP("ExperimentalNonSerializableDataset")
     .Attr("output_shapes: list(shape) >= 1")
     .SetShapeFn(shape_inference::ScalarShape);
 
+REGISTER_OP("ExperimentalParallelInterleaveDataset")
+    .Input("input_dataset: variant")
+    .Input("other_arguments: Targuments")
+    .Input("cycle_length: int64")
+    .Input("block_length: int64")
+    .Input("sloppy: bool")
+    .Input("buffer_output_elements: int64")
+    .Input("prefetch_input_elements: int64")
+    .Output("handle: variant")
+    .Attr("f: func")
+    .Attr("Targuments: list(type) >= 0")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape);
+
+REGISTER_OP("ExperimentalParseExampleDataset")
+    .Input("input_dataset: variant")
+    .Input("num_parallel_calls: int64")
+    .Input("dense_defaults: Tdense")
+    .Output("handle: variant")
+    .Attr("sparse_keys: list(string) >= 0")
+    .Attr("dense_keys: list(string) >= 0")
+    .Attr("sparse_types: list({float,int64,string}) >= 0")
+    .Attr("Tdense: list({float,int64,string}) >= 0")
+    .Attr("dense_shapes: list(shape) >= 0")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")  // Output components will be
+                                              // sorted by key (dense_keys and
+                                              // sparse_keys combined) here.
+    .Attr("sloppy: bool = false")
+    .SetShapeFn(shape_inference::ScalarShape);
+
+REGISTER_OP("ExperimentalRandomDataset")
+    .Input("seed: int64")
+    .Input("seed2: int64")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+                      // stateful to inhibit constant folding.
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // buffer_size, seed, and seed2 should be scalars.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
+
+REGISTER_OP("ExperimentalScanDataset")
+    .Input("input_dataset: variant")
+    .Input("initial_state: Tstate")
+    .Input("other_arguments: Targuments")
+    .Output("handle: variant")
+    .Attr("f: func")
+    .Attr("Tstate: list(type) >= 1")
+    .Attr("Targuments: list(type) >= 0")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .Attr("preserve_cardinality: bool = false")
+    .SetShapeFn(shape_inference::ScalarShape);
+
+REGISTER_OP("ExperimentalSetStatsAggregatorDataset")
+    .Input("input_dataset: variant")
+    .Input("stats_aggregator: resource")
+    .Input("tag: string")
+    .Input("counter_prefix: string")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape);
+
 REGISTER_OP("ExperimentalSleepDataset")
     .Input("input_dataset: variant")
     .Input("sleep_microseconds: int64")
@@ -119,6 +306,59 @@ REGISTER_OP("ExperimentalSleepDataset")
       return shape_inference::ScalarShape(c);
     });
 
+REGISTER_OP("ExperimentalSlidingWindowDataset")
+    .Input("input_dataset: variant")
+    .Input("window_size: int64")
+    .Input("window_shift: int64")
+    .Input("window_stride: int64")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // window_size, window_shift, and window_stride should be scalars.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
+
+REGISTER_OP("ExperimentalSqlDataset")
+    .Input("driver_name: string")
+    .Input("data_source_name: string")
+    .Input("query: string")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+                      // stateful to inhibit constant folding.
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // driver_name, data_source_name, and query should be scalars.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
+
+REGISTER_OP("ExperimentalStatsAggregatorHandle")
+    .Output("handle: resource")
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''");
+
+REGISTER_OP("ExperimentalStatsAggregatorSummary")
+    .Input("iterator: resource")
+    .Output("summary: string")
+    .SetShapeFn(shape_inference::ScalarShape);
+
+REGISTER_OP("ExperimentalUnbatchDataset")
+    .Input("input_dataset: variant")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape);
+
 REGISTER_OP("ExperimentalUniqueDataset")
     .Input("input_dataset: variant")
     .Output("handle: variant")
@@ -188,6 +428,7 @@ REGISTER_OP("ExperimentalNumaMapAndBatchDataset")
     .Attr("Targuments: list(type) >= 0")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
+    .Attr("preserve_cardinality: bool = false")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       // Use index from the end to retrieve the Input shapes,
       // so that to avoid guessing the length of "other_arguments".
diff --git a/tensorflow/core/ops/linalg_ops.cc b/tensorflow/core/ops/linalg_ops.cc
index 525b19e51e013278211e8961a17588514796c4d8..952ee4bee2e5a49edeea168f4184767dbebc2527 100644
--- a/tensorflow/core/ops/linalg_ops.cc
+++ b/tensorflow/core/ops/linalg_ops.cc
@@ -109,6 +109,30 @@ Status SelfAdjointEigV2ShapeFn(InferenceContext* c) {
   return Status::OK();
 }
 
+// Input is [...,N,N].
+// First and second outputs are:
+//   [...,N,N]; [...,N].
+Status LuShapeFn(InferenceContext* c) {
+  ShapeHandle input;
+  TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(0), 2, &input));
+
+  DimensionHandle n;
+  TF_RETURN_IF_ERROR(c->Merge(c->Dim(input, -2), c->Dim(input, -1), &n));
+
+  ShapeHandle batch_shape;
+  TF_RETURN_IF_ERROR(c->Subshape(input, 0, -2, &batch_shape));
+
+  ShapeHandle lu_shape;
+  ShapeHandle p_shape;
+
+  TF_RETURN_IF_ERROR(c->Concatenate(batch_shape, c->Matrix(n, n), &lu_shape));
+  TF_RETURN_IF_ERROR(c->Concatenate(batch_shape, c->Vector(n), &p_shape));
+
+  c->set_output(0, lu_shape);
+  c->set_output(1, p_shape);
+  return Status::OK();
+}
+
 // Input is [...,M,N].
 // First and second outputs are:
 //   [...,M,M]; [...,M,N], if full_matrices is true,
@@ -289,6 +313,14 @@ REGISTER_OP("SelfAdjointEigV2")
     .Attr("T: {double, float, complex64, complex128}")
     .SetShapeFn(SelfAdjointEigV2ShapeFn);
 
+REGISTER_OP("Lu")
+    .Input("input: T")
+    .Output("lu: T")
+    .Output("p: output_idx_type")
+    .Attr("T: {double, float, complex64, complex128}")
+    .Attr("output_idx_type: {int32, int64} = DT_INT32")
+    .SetShapeFn(LuShapeFn);
+
 REGISTER_OP("MatrixSolve")
     .Input("matrix: T")
     .Input("rhs: T")
diff --git a/tensorflow/core/ops/linalg_ops_test.cc b/tensorflow/core/ops/linalg_ops_test.cc
index f4be820defa3d4b4e2a45ba2038d9250570f59a5..bfacee14efa41408865fecb103bc63b5f6de73ff 100644
--- a/tensorflow/core/ops/linalg_ops_test.cc
+++ b/tensorflow/core/ops/linalg_ops_test.cc
@@ -274,4 +274,23 @@ TEST(LinalgOpsTest, Svd_ShapeFn) {
   INFER_ERROR("Shape must be at least rank 2 but is rank 1", op, "[1]");
 }
 
+TEST(LinalgOpsTest, Lu_ShapeFn) {
+  ShapeInferenceTestOp op("Lu");
+  INFER_OK(op, "?", "?;?");
+  INFER_ERROR("Shape must be at least rank 2 but is rank 1", op, "[1]");
+  INFER_ERROR("Dimensions must be equal, but are 1 and 2", op, "[1,?,3,4,1,2]");
+
+  INFER_OK(op, "[?,?]", "[d0_0,d0_0];[d0_0]");
+  INFER_OK(op, "[1,?]", "[d0_0,d0_0];[d0_0]");
+  INFER_OK(op, "[?,1]", "[d0_1,d0_1];[d0_1]");
+
+  // Repeat previous block of tests with input rank > 2.
+  INFER_OK(op, "[1,?,3,4,?,?]",
+           "[d0_0,d0_1,d0_2,d0_3,d0_4,d0_4];[d0_0,d0_1,d0_2,d0_3,d0_4]");
+  INFER_OK(op, "[1,?,3,4,1,?]",
+           "[d0_0,d0_1,d0_2,d0_3,d0_4,d0_4];[d0_0,d0_1,d0_2,d0_3,d0_4]");
+  INFER_OK(op, "[1,?,3,4,?,1]",
+           "[d0_0,d0_1,d0_2,d0_3,d0_5,d0_5];[d0_0,d0_1,d0_2,d0_3,d0_5]");
+}
+
 }  // end namespace tensorflow
diff --git a/tensorflow/core/ops/list_ops.cc b/tensorflow/core/ops/list_ops.cc
index 88d6d14c306f5f6e3bd2317692524d6bdce62621..01ebcd15439d670274d7e2a784ce78c5c1ee44ef 100644
--- a/tensorflow/core/ops/list_ops.cc
+++ b/tensorflow/core/ops/list_ops.cc
@@ -28,13 +28,14 @@ REGISTER_OP("EmptyTensorList")
     .Attr("shape_type: {int32, int64}")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       c->set_output(0, c->Scalar());
-      DataType t;
-      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
-      shape_inference::ShapeHandle s;
-      TF_RETURN_IF_ERROR(
-          c->MakeShapeFromShapeTensorTreatScalarAsUnknownShape(0, &s));
+      DataType element_dtype;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
+      shape_inference::ShapeHandle element_shape;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensorTreatScalarAsUnknownShape(
+          0, &element_shape));
       c->set_output_handle_shapes_and_types(
-          0, std::vector<shape_inference::ShapeAndType>{{s, t}});
+          0, std::vector<shape_inference::ShapeAndType>{
+                 {element_shape, element_dtype}});
       return Status::OK();
     });
 
@@ -45,9 +46,9 @@ REGISTER_OP("TensorListPushBack")
     .Attr("element_dtype: type")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       c->set_output(0, c->Scalar());
-      DataType t;
-      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
-      shape_inference::ShapeHandle s = c->UnknownShape();
+      DataType element_dtype;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
+      shape_inference::ShapeHandle element_shape = c->UnknownShape();
 
       auto* handle_data = c->input_handle_shapes_and_types(0);
       if (handle_data != nullptr && handle_data->size() != 1) {
@@ -57,18 +58,21 @@ REGISTER_OP("TensorListPushBack")
       if (handle_data != nullptr) {
         const shape_inference::ShapeAndType& list_shape_type =
             (*handle_data)[0];
-        if (list_shape_type.dtype != t) {
+        if (list_shape_type.dtype != element_dtype) {
           return errors::InvalidArgument(
               "Trying to push to list with wrong element dtype. List has type ",
               DataTypeString(list_shape_type.dtype),
-              " but trying to push element with type ", DataTypeString(t));
+              " but trying to push element with type ",
+              DataTypeString(element_dtype));
         }
         shape_inference::ShapeHandle ignored;
-        TF_RETURN_IF_ERROR(c->Merge(s, list_shape_type.shape, &ignored));
-        s = list_shape_type.shape;
+        TF_RETURN_IF_ERROR(
+            c->Merge(element_shape, list_shape_type.shape, &ignored));
+        element_shape = list_shape_type.shape;
       }
       c->set_output_handle_shapes_and_types(
-          0, std::vector<shape_inference::ShapeAndType>{{s, t}});
+          0, std::vector<shape_inference::ShapeAndType>{
+                 {element_shape, element_dtype}});
       return Status::OK();
     });
 
@@ -89,9 +93,9 @@ REGISTER_OP("TensorListPushBackBatch")
 
       c->set_output(0, input_handles);
 
-      DataType t;
-      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
-      shape_inference::ShapeHandle s = c->UnknownShape();
+      DataType element_dtype;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
+      shape_inference::ShapeHandle element_shape = c->UnknownShape();
 
       auto* handle_data = c->input_handle_shapes_and_types(0);
       if (handle_data != nullptr && handle_data->size() != 1) {
@@ -101,18 +105,21 @@ REGISTER_OP("TensorListPushBackBatch")
       if (handle_data != nullptr) {
         const shape_inference::ShapeAndType& list_shape_type =
             (*handle_data)[0];
-        if (list_shape_type.dtype != t) {
+        if (list_shape_type.dtype != element_dtype) {
           return errors::InvalidArgument(
               "Trying to push to list with wrong element dtype. List has type ",
               DataTypeString(list_shape_type.dtype),
-              " but trying to push element with type ", DataTypeString(t));
+              " but trying to push element with type ",
+              DataTypeString(element_dtype));
         }
         shape_inference::ShapeHandle ignored;
-        TF_RETURN_IF_ERROR(c->Merge(s, list_shape_type.shape, &ignored));
-        s = list_shape_type.shape;
+        TF_RETURN_IF_ERROR(
+            c->Merge(element_shape, list_shape_type.shape, &ignored));
+        element_shape = list_shape_type.shape;
       }
       c->set_output_handle_shapes_and_types(
-          0, std::vector<shape_inference::ShapeAndType>{{s, t}});
+          0, std::vector<shape_inference::ShapeAndType>{
+                 {element_shape, element_dtype}});
       return Status::OK();
     });
 
@@ -127,9 +134,9 @@ REGISTER_OP("TensorListPopBack")
     .Output("tensor: element_dtype")
     .Attr("element_dtype: type")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
-      DataType t;
-      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
-      shape_inference::ShapeHandle s = c->UnknownShape();
+      DataType element_dtype;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
+      shape_inference::ShapeHandle tensor_shape = c->UnknownShape();
       auto* handle_data = c->input_handle_shapes_and_types(0);
       if (handle_data != nullptr && handle_data->size() != 1) {
         return errors::InvalidArgument(
@@ -138,19 +145,21 @@ REGISTER_OP("TensorListPopBack")
       if (handle_data != nullptr) {
         const shape_inference::ShapeAndType& list_shape_type =
             (*handle_data)[0];
-        if (list_shape_type.dtype != t) {
+        if (list_shape_type.dtype != element_dtype) {
           return errors::InvalidArgument(
               "Trying to read from list with wrong element dtype. List has "
               "type ",
               DataTypeString(list_shape_type.dtype),
-              " but trying to push element with type ", DataTypeString(t));
+              " but trying to push element with type ",
+              DataTypeString(element_dtype));
         }
         shape_inference::ShapeHandle ignored;
-        TF_RETURN_IF_ERROR(c->Merge(s, list_shape_type.shape, &ignored));
+        TF_RETURN_IF_ERROR(
+            c->Merge(tensor_shape, list_shape_type.shape, &ignored));
         c->set_output_handle_shapes_and_types(0, *handle_data);
-        s = list_shape_type.shape;
+        tensor_shape = list_shape_type.shape;
       }
-      c->set_output(1, s);
+      c->set_output(1, tensor_shape);
       c->set_output(0, c->Scalar());
       return Status::OK();
     });
@@ -161,9 +170,9 @@ REGISTER_OP("TensorListStack")
     .Attr("element_dtype: type")
     .Attr("num_elements: int = -1")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
-      DataType t;
-      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
-      shape_inference::ShapeHandle s = c->UnknownShape();
+      DataType element_dtype;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
+      shape_inference::ShapeHandle element_shape = c->UnknownShape();
       auto* handle_data = c->input_handle_shapes_and_types(0);
       if (handle_data != nullptr && handle_data->size() != 1) {
         return errors::InvalidArgument(
@@ -172,16 +181,17 @@ REGISTER_OP("TensorListStack")
       if (handle_data != nullptr) {
         const shape_inference::ShapeAndType& list_shape_type =
             (*handle_data)[0];
-        if (list_shape_type.dtype != t) {
+        if (list_shape_type.dtype != element_dtype) {
           return errors::InvalidArgument(
               "Trying to read from list with wrong element dtype. List has "
               "type ",
               DataTypeString(list_shape_type.dtype), " but expectec type ",
-              DataTypeString(t));
+              DataTypeString(element_dtype));
         }
         shape_inference::ShapeHandle ignored;
-        TF_RETURN_IF_ERROR(c->Merge(s, list_shape_type.shape, &ignored));
-        s = list_shape_type.shape;
+        TF_RETURN_IF_ERROR(
+            c->Merge(element_shape, list_shape_type.shape, &ignored));
+        element_shape = list_shape_type.shape;
       }
       int expected_num_elements = -1;
       TF_RETURN_IF_ERROR(c->GetAttr("num_elements", &expected_num_elements));
@@ -192,11 +202,88 @@ REGISTER_OP("TensorListStack")
         num_elements = c->MakeShape({expected_num_elements});
       }
       shape_inference::ShapeHandle result;
-      TF_RETURN_IF_ERROR(c->Concatenate(num_elements, s, &result));
+      TF_RETURN_IF_ERROR(c->Concatenate(num_elements, element_shape, &result));
       c->set_output(0, result);
       return Status::OK();
     });
 
+REGISTER_OP("TensorListConcat")
+    .Input("input_handle: variant")
+    .Output("tensor: element_dtype")
+    .Output("lengths: int64")
+    .Attr("element_dtype: type")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      DataType element_dtype;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
+      shape_inference::ShapeHandle element_shape = c->UnknownShape();
+      auto* handle_data = c->input_handle_shapes_and_types(0);
+      if (handle_data != nullptr && handle_data->size() != 1) {
+        return errors::InvalidArgument(
+            "Trying to read from list with wrong variant data.");
+      }
+      if (handle_data != nullptr) {
+        const shape_inference::ShapeAndType& list_shape_type =
+            (*handle_data)[0];
+        if (list_shape_type.dtype != element_dtype) {
+          return errors::InvalidArgument(
+              "Trying to read from list with wrong element dtype. List has "
+              "type ",
+              DataTypeString(list_shape_type.dtype), " but expected type ",
+              DataTypeString(element_dtype));
+        }
+        shape_inference::ShapeHandle ignored;
+        TF_RETURN_IF_ERROR(
+            c->Merge(element_shape, list_shape_type.shape, &ignored));
+        element_shape = list_shape_type.shape;
+      }
+      if (c->RankKnown(element_shape)) {
+        shape_inference::ShapeHandle result;
+        TF_RETURN_IF_ERROR(c->Subshape(element_shape, 1, &result));
+        TF_RETURN_IF_ERROR(
+            c->Concatenate(c->MakeShape({c->UnknownDim()}), result, &result));
+        c->set_output(0, result);
+      } else {
+        c->set_output(0, c->UnknownShape());
+      }
+      c->set_output(1, c->MakeShape({c->UnknownDim()}));
+      return Status::OK();
+    });
+
+REGISTER_OP("TensorListSplit")
+    .Input("tensor: element_dtype")
+    .Input("element_shape: shape_type")
+    .Input("lengths: int64")
+    .Output("output_handle: variant")
+    .Attr("element_dtype: type")
+    .Attr("shape_type: {int32, int64}")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      c->set_output(0, c->Scalar());
+      DataType element_dtype;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
+      shape_inference::ShapeHandle tensor_shape = c->input(0);
+      shape_inference::ShapeHandle ignored;
+      // Check that tensor is at least a vector.
+      TF_RETURN_IF_ERROR(c->WithRankAtLeast(tensor_shape, 1, &ignored));
+      // Check that lengths is a vector.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &ignored));
+      shape_inference::ShapeHandle element_shape_from_tensor_shape;
+      TF_RETURN_IF_ERROR(
+          c->Subshape(tensor_shape, 1, &element_shape_from_tensor_shape));
+      TF_RETURN_IF_ERROR(c->Concatenate(c->MakeShape({c->UnknownDim()}),
+                                        element_shape_from_tensor_shape,
+                                        &element_shape_from_tensor_shape));
+      shape_inference::ShapeHandle element_shape;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensorTreatScalarAsUnknownShape(
+          1, &element_shape));
+      TF_RETURN_IF_ERROR(c->Merge(element_shape_from_tensor_shape,
+                                  element_shape,
+                                  &element_shape_from_tensor_shape));
+      c->set_output_handle_shapes_and_types(
+          0, std::vector<shape_inference::ShapeAndType>{
+                 {element_shape, element_dtype}});
+      return Status::OK();
+    });
+
 REGISTER_OP("TensorListFromTensor")
     .Input("tensor: element_dtype")
     .Input("element_shape: shape_type")
@@ -205,17 +292,20 @@ REGISTER_OP("TensorListFromTensor")
     .Attr("shape_type: {int32, int64}")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       c->set_output(0, c->Scalar());
-      DataType t;
-      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
-      shape_inference::ShapeHandle s = c->input(0);
-      shape_inference::ShapeHandle o;
-      TF_RETURN_IF_ERROR(c->Subshape(s, 1, &o));
+      DataType element_dtype;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
+      shape_inference::ShapeHandle tensor_shape = c->input(0);
+      shape_inference::ShapeHandle tensor_shape_except_first_dim;
+      TF_RETURN_IF_ERROR(
+          c->Subshape(tensor_shape, 1, &tensor_shape_except_first_dim));
       shape_inference::ShapeHandle element_shape;
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensorTreatScalarAsUnknownShape(
           1, &element_shape));
-      TF_RETURN_IF_ERROR(c->Merge(o, element_shape, &o));
+      TF_RETURN_IF_ERROR(c->Merge(tensor_shape_except_first_dim, element_shape,
+                                  &tensor_shape_except_first_dim));
       c->set_output_handle_shapes_and_types(
-          0, std::vector<shape_inference::ShapeAndType>{{element_shape, t}});
+          0, std::vector<shape_inference::ShapeAndType>{
+                 {element_shape, element_dtype}});
       return Status::OK();
     });
 
@@ -241,13 +331,14 @@ REGISTER_OP("TensorListReserve")
     .Attr("shape_type: {int32, int64}")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       c->set_output(0, c->Scalar());
-      shape_inference::ShapeHandle s;
-      TF_RETURN_IF_ERROR(
-          c->MakeShapeFromShapeTensorTreatScalarAsUnknownShape(0, &s));
-      DataType t;
-      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
+      shape_inference::ShapeHandle element_shape;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensorTreatScalarAsUnknownShape(
+          0, &element_shape));
+      DataType element_dtype;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
       c->set_output_handle_shapes_and_types(
-          0, std::vector<shape_inference::ShapeAndType>{{s, t}});
+          0, std::vector<shape_inference::ShapeAndType>{
+                 {element_shape, element_dtype}});
       return Status::OK();
     });
 
@@ -257,17 +348,17 @@ REGISTER_OP("TensorListGetItem")
     .Output("item: element_dtype")
     .Attr("element_dtype: type")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
-      DataType t;
-      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
+      DataType element_dtype;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
       auto* handle_data = c->input_handle_shapes_and_types(0);
       shape_inference::ShapeHandle element_shape = c->UnknownShape();
       if (handle_data != nullptr) {
         const shape_inference::ShapeAndType& list_shape_type =
             (*handle_data)[0];
         element_shape = list_shape_type.shape;
-        if (list_shape_type.dtype != t) {
+        if (list_shape_type.dtype != element_dtype) {
           return errors::InvalidArgument("Expected list with element dtype ",
-                                         DataTypeString(t),
+                                         DataTypeString(element_dtype),
                                          " but got list with element dtype ",
                                          DataTypeString(list_shape_type.dtype));
         }
@@ -283,17 +374,19 @@ REGISTER_OP("TensorListSetItem")
     .Output("output_handle: variant")
     .Attr("element_dtype: type")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
-      DataType t;
-      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
+      DataType element_dtype;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
       auto* handle_data = c->input_handle_shapes_and_types(0);
       c->set_output(0, c->Scalar());
       if (handle_data == nullptr) {
-        c->set_output_handle_shapes_and_types(0, {{c->UnknownShape(), t}});
+        c->set_output_handle_shapes_and_types(
+            0, {{c->UnknownShape(), element_dtype}});
         return Status::OK();
       }
       const shape_inference::ShapeAndType& list_shape_type = (*handle_data)[0];
-      shape_inference::ShapeHandle s = c->input(2);
-      TF_RETURN_IF_ERROR(c->Merge(s, list_shape_type.shape, &s));
+      shape_inference::ShapeHandle item_shape = c->input(2);
+      TF_RETURN_IF_ERROR(
+          c->Merge(item_shape, list_shape_type.shape, &item_shape));
       c->set_output_handle_shapes_and_types(0, *handle_data);
       return Status::OK();
     });
@@ -304,17 +397,17 @@ REGISTER_OP("TensorListGather")
     .Output("values: element_dtype")
     .Attr("element_dtype: type")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
-      DataType t;
-      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
+      DataType element_dtype;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
       auto* handle_data = c->input_handle_shapes_and_types(0);
       shape_inference::ShapeHandle element_shape = c->UnknownShape();
       if (handle_data != nullptr) {
         const shape_inference::ShapeAndType& list_shape_type =
             (*handle_data)[0];
         element_shape = list_shape_type.shape;
-        if (list_shape_type.dtype != t) {
+        if (list_shape_type.dtype != element_dtype) {
           return errors::InvalidArgument("Expected list with element dtype ",
-                                         DataTypeString(t),
+                                         DataTypeString(element_dtype),
                                          " but got list with element dtype ",
                                          DataTypeString(list_shape_type.dtype));
         }
@@ -333,12 +426,13 @@ REGISTER_OP("TensorListScatter")
     .Attr("element_dtype: type")
     .Attr("shape_type: {int32, int64}")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
-      DataType t;
-      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
-      shape_inference::ShapeHandle s;
-      TF_RETURN_IF_ERROR(
-          c->MakeShapeFromShapeTensorTreatScalarAsUnknownShape(2, &s));
-      c->set_output_handle_shapes_and_types(0, {{s, t}});
+      DataType element_dtype;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
+      shape_inference::ShapeHandle element_shape;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensorTreatScalarAsUnknownShape(
+          2, &element_shape));
+      c->set_output_handle_shapes_and_types(0,
+                                            {{element_shape, element_dtype}});
       c->set_output(0, c->Scalar());
       return Status::OK();
     });
@@ -354,28 +448,29 @@ REGISTER_OP("TensorListConcatLists")
       TF_RETURN_IF_ERROR(c->Merge(input_a, input_b, &input_a));
       c->set_output(0, input_a);
 
-      DataType t;
-      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
+      DataType element_dtype;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
 
       auto* handle_data_a = c->input_handle_shapes_and_types(0);
       auto* handle_data_b = c->input_handle_shapes_and_types(1);
       if (handle_data_a == nullptr && handle_data_b == nullptr) {
-        c->set_output_handle_shapes_and_types(0, {{c->UnknownShape(), t}});
+        c->set_output_handle_shapes_and_types(
+            0, {{c->UnknownShape(), element_dtype}});
         return Status::OK();
       }
       shape_inference::ShapeAndType list_shape_type_a =
           (handle_data_a) ? handle_data_a->at(0) : handle_data_b->at(0);
       const shape_inference::ShapeAndType& list_shape_type_b =
           (handle_data_b) ? handle_data_b->at(0) : handle_data_a->at(0);
-      if (list_shape_type_a.dtype != t) {
+      if (list_shape_type_a.dtype != element_dtype) {
         return errors::InvalidArgument("input_a.type != element_dtype: ",
                                        DataTypeString(list_shape_type_a.dtype),
-                                       " vs. ", DataTypeString(t));
+                                       " vs. ", DataTypeString(element_dtype));
       }
-      if (list_shape_type_b.dtype != t) {
+      if (list_shape_type_b.dtype != element_dtype) {
         return errors::InvalidArgument("input_b.type != element_dtype: ",
                                        DataTypeString(list_shape_type_b.dtype),
-                                       " vs. ", DataTypeString(t));
+                                       " vs. ", DataTypeString(element_dtype));
       }
       TF_RETURN_IF_ERROR(c->Merge(list_shape_type_a.shape,
                                   list_shape_type_b.shape,
diff --git a/tensorflow/core/ops/mkl_nn_ops.cc b/tensorflow/core/ops/mkl_nn_ops.cc
index 9be3470820eb523e8d41f8bf63434cbb534034d8..658afd99013485ce3c6c16906d3d6f9415ad48f6 100644
--- a/tensorflow/core/ops/mkl_nn_ops.cc
+++ b/tensorflow/core/ops/mkl_nn_ops.cc
@@ -32,6 +32,33 @@ using shape_inference::DimensionHandle;
 using shape_inference::InferenceContext;
 using shape_inference::ShapeHandle;
 
+REGISTER_OP("_MklFusedConv2D")
+    .Input("input: T")
+    .Input("filter: T")
+    .Input("args: num_args * T")
+    .Input("mkl_input: uint8")
+    .Input("mkl_filter: uint8")
+    .Input("mkl_args: num_args * uint8")
+    .Output("output: T")
+    .Output("filter_output: T")
+    .Output("mkl_output: uint8")
+    .Output("mkl_filter_output: uint8")
+    .Attr("T: {float}")
+    .Attr("num_args: int >= 0")
+    .Attr("strides: list(int)")
+    .Attr(GetPaddingAttrString())
+    .Attr(GetConvnetDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .Attr("fused_ops: list(string) = []")
+    // Attributes for the FusedBatchNorm ------------------------------------ //
+    .Attr("epsilon: float = 0.0001")
+    // ---------------------------------------------------------------------- //
+    .SetShapeFn(shape_inference::Conv2DShape)
+    .Doc(R"doc(
+*NOTE*: Do not invoke this operator directly in Python. MKL DNN graph transformer
+ is expected to create these operators.
+)doc");
+
 REGISTER_OP("_MklQuantizedMaxPool")
     .Input("input:         T")
     .Input("min_input:     float")
diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index efa84d6c22c6de6d5fdd576d834f6b660ead61e1..ee528c706d1b5ad8d753875442df6f4fbf601578 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -1209,9 +1209,9 @@ Status TopKShapeFn(InferenceContext* c) {
   DimensionHandle last_dim = c->Dim(input, -1);
   if (c->ValueKnown(last_dim) && c->ValueKnown(k_dim) &&
       c->Value(last_dim) < c->Value(k_dim)) {
-    return errors::InvalidArgument("input must have last dimension >= k = ",
-                                   c->Value(k_dim), " but is ",
-                                   c->Value(last_dim));
+    return errors::InvalidArgument(
+        "input must have last dimension >= k = ", c->Value(k_dim), " but is ",
+        c->Value(last_dim));
   }
 
   // Replace last_dim with k_dim.
@@ -1265,9 +1265,9 @@ REGISTER_OP("NthElement")
       DimensionHandle last_dim = c->Dim(input, -1);
       if (c->ValueKnown(last_dim) && c->ValueKnown(n_dim) &&
           c->Value(last_dim) <= c->Value(n_dim)) {
-        return errors::InvalidArgument("Input must have last dimension > n = ",
-                                       c->Value(n_dim), " but is ",
-                                       c->Value(last_dim));
+        return errors::InvalidArgument(
+            "Input must have last dimension > n = ", c->Value(n_dim),
+            " but is ", c->Value(last_dim));
       }
 
       // Reduce last_dim for output tensor
@@ -1609,6 +1609,55 @@ NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
 expected to invoke these operators.
 )doc");
 
+REGISTER_OP("__MklDummyPadWithConv2D")
+    .Input("input: T")
+    .Input("filter: T")
+    .Input("paddings: Tpaddings")
+    .Output("output: T")
+    .Attr("T: {half, float, double}")
+    .Attr("strides: list(int)")
+    .Attr("use_cudnn_on_gpu: bool = true")
+    .Attr(GetPaddingAttrString())
+    .Attr(GetConvnetDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .Attr("Tpaddings: {int32, int64} = DT_INT32")
+    .SetShapeFn(shape_inference::Conv2DShape)
+    .Doc(R"doc(
+Dummy node that enables fusing Pad and Conv2D operator for MKL. This node
+does not perform anything. It is just created as an intermediate output of
+merging Pad and Conv2D.
+
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
+REGISTER_OP("_MklPadWithConv2D")
+    .Input("input: T")
+    .Input("filter: T")
+    .Input("paddings: Tpaddings")
+    .Input("mkl_input: uint8")
+    .Input("mkl_filter: uint8")
+    .Input("mkl_paddings: uint8")
+    .Output("output: T")
+    .Output("filter_output: T")
+    .Output("mkl_output: uint8")
+    .Output("mkl_filter_output: uint8")
+    .Attr("T: {half, float, double}")
+    .Attr("strides: list(int)")
+    .Attr("use_cudnn_on_gpu: bool = true")
+    .Attr(GetPaddingAttrString())
+    .Attr(GetConvnetDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .Attr("Tpaddings: {int32, int64} = DT_INT32")
+    .SetShapeFn(shape_inference::Conv2DShape)
+    .Doc(R"doc(
+MKL version of Pad and Conv2D operator. Uses MKL DNN APIs to perform
+Pad and 2D convolution to the output of convolution.
+
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
 REGISTER_OP("_MklConv2DBackpropFilter")
     .Input("input: T")
     .Input("filter_sizes: int32")
@@ -1915,6 +1964,40 @@ NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
 expected to invoke these operators.
 )doc");
 
+REGISTER_OP("_MklLeakyRelu")
+    .Input("features: T")
+    .Input("mkl_features: uint8")
+    .Output("activations: T")
+    .Output("mkl_activations: uint8")
+    .Attr("T: {half, float, double} = DT_FLOAT")
+    .Attr("alpha: float = 0.2")
+    .SetShapeFn(shape_inference::UnchangedShape)
+    .Doc(R"doc(
+MKL version of LeakyRelu operator. Uses MKL DNN APIs to implement
+LeakyRelu operator.
+
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
+REGISTER_OP("_MklLeakyReluGrad")
+    .Input("gradients: T")
+    .Input("features: T")
+    .Input("mkl_gradients: uint8")
+    .Input("mkl_features: uint8")
+    .Output("backprops: T")
+    .Output("mkl_backprops: uint8")
+    .Attr("T: {half, float, double} = DT_FLOAT")
+    .Attr("alpha: float = 0.2")
+    .SetShapeFn(shape_inference::MergeBothInputsShapeFn)
+    .Doc(R"doc(
+MKL version of LeakyReluGrad operator. Uses MKL DNN APIs to compute rectified
+linear gradients for LeakyReluGrad operation.
+
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
 REGISTER_OP("_MklElu")
     .Input("features: T")
     .Input("mkl_features: uint8")
@@ -2110,7 +2193,6 @@ NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
 expected to invoke these operators.
 )doc");
 
-
 REGISTER_OP("_MklAvgPool3DGrad")
     .Input("orig_input_shape: int32")
     .Input("grad: T")
@@ -2193,11 +2275,7 @@ REGISTER_OP("_MklLRN")
     .Input("input: T")
     .Input("mkl_input: uint8")
     .Output("output: T")
-#ifdef INTEL_MKL_ML_ONLY
-    .Output("workspace: T")
-#else
     .Output("workspace: uint8")
-#endif
     .Output("mkl_output: uint8")
     .Output("mkl_workspace: uint8")
     .Attr("depth_radius: int = 5")
@@ -2221,11 +2299,7 @@ REGISTER_OP("_MklLRNGrad")
     .Input("input_grads: T")
     .Input("input_image: T")
     .Input("output_image: T")
-#ifdef INTEL_MKL_ML_ONLY
-    .Input("workspace: T")
-#else
     .Input("workspace: uint8")
-#endif
     .Input("mkl_input_grads: uint8")
     .Input("mkl_input_image: uint8")
     .Input("mkl_output_image: uint8")
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 699646651e14f980b0e784ea5a5c0c2e5f515628..779d4297c7c98360b6a414630c0ee184fb549058 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -4944,33 +4944,6 @@ op {
     type: "list(float)"
   }
 }
-op {
-  name: "BytesProducedStatsDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "tag"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
 op {
   name: "CTCBeamSearchDecoder"
   input_arg {
@@ -7915,21 +7888,6 @@ op {
     minimum: 1
   }
 }
-op {
-  name: "DatasetToTFRecord"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "filename"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "compression_type"
-    type: DT_STRING
-  }
-}
 op {
   name: "DebugGradientIdentity"
   input_arg {
@@ -8599,37 +8557,6 @@ op {
     }
   }
 }
-op {
-  name: "DenseToSparseBatchDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "batch_size"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "row_shape"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
 op {
   name: "DenseToSparseSetOperation"
   input_arg {
@@ -9834,24 +9761,6 @@ op {
     type: DT_STRING
   }
 }
-op {
-  name: "EnqueueInQueueDataset"
-  input_arg {
-    name: "queue"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "components"
-    type_list_attr: "Tcomponents"
-  }
-  attr {
-    name: "Tcomponents"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
 op {
   name: "EnsureShape"
   input_arg {
@@ -10089,6 +9998,33 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "ExperimentalBytesProducedStatsDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "ExperimentalCSVDataset"
   input_arg {
@@ -10154,6 +10090,63 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ExperimentalDatasetCardinality"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "cardinality"
+    type: DT_INT64
+  }
+}
+op {
+  name: "ExperimentalDatasetToTFRecord"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "filename"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "compression_type"
+    type: DT_STRING
+  }
+}
+op {
+  name: "ExperimentalDenseToSparseBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "row_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "ExperimentalDirectedInterleaveDataset"
   input_arg {
@@ -10188,6 +10181,144 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "ExperimentalGroupByReducerDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "key_func_other_arguments"
+    type_list_attr: "Tkey_func_other_arguments"
+  }
+  input_arg {
+    name: "init_func_other_arguments"
+    type_list_attr: "Tinit_func_other_arguments"
+  }
+  input_arg {
+    name: "reduce_func_other_arguments"
+    type_list_attr: "Treduce_func_other_arguments"
+  }
+  input_arg {
+    name: "finalize_func_other_arguments"
+    type_list_attr: "Tfinalize_func_other_arguments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "key_func"
+    type: "func"
+  }
+  attr {
+    name: "init_func"
+    type: "func"
+  }
+  attr {
+    name: "reduce_func"
+    type: "func"
+  }
+  attr {
+    name: "finalize_func"
+    type: "func"
+  }
+  attr {
+    name: "Tkey_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tinit_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Treduce_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tfinalize_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalGroupByWindowDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "key_func_other_arguments"
+    type_list_attr: "Tkey_func_other_arguments"
+  }
+  input_arg {
+    name: "reduce_func_other_arguments"
+    type_list_attr: "Treduce_func_other_arguments"
+  }
+  input_arg {
+    name: "window_size_func_other_arguments"
+    type_list_attr: "Twindow_size_func_other_arguments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "key_func"
+    type: "func"
+  }
+  attr {
+    name: "reduce_func"
+    type: "func"
+  }
+  attr {
+    name: "window_size_func"
+    type: "func"
+  }
+  attr {
+    name: "Tkey_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Treduce_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Twindow_size_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "ExperimentalIdentityIndexedDataset"
   input_arg {
@@ -10299,6 +10430,88 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ExperimentalLatencyStatsDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalMapAndBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "drop_remainder"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
 op {
   name: "ExperimentalMapDataset"
   input_arg {
@@ -10341,6 +10554,13 @@ op {
       b: true
     }
   }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
   name: "ExperimentalMatchingFilesDataset"
@@ -10479,6 +10699,146 @@ op {
     has_minimum: true
     minimum: 1
   }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ExperimentalParallelInterleaveDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "cycle_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "block_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sloppy"
+    type: DT_BOOL
+  }
+  input_arg {
+    name: "buffer_output_elements"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "prefetch_input_elements"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalParseExampleDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "dense_defaults"
+    type_list_attr: "Tdense"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "sparse_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "dense_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "sparse_types"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "Tdense"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "dense_shapes"
+    type: "list(shape)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "sloppy"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
   name: "ExperimentalPrivateThreadPoolDataset"
@@ -10507,6 +10867,123 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "ExperimentalRandomDataset"
+  input_arg {
+    name: "seed"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "seed2"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalScanDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "initial_state"
+    type_list_attr: "Tstate"
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Tstate"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ExperimentalSetStatsAggregatorDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "stats_aggregator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "counter_prefix"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
 op {
   name: "ExperimentalSleepDataset"
   input_arg {
@@ -10534,6 +11011,107 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "ExperimentalSlidingWindowDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "window_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "window_shift"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "window_stride"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalSqlDataset"
+  input_arg {
+    name: "driver_name"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "data_source_name"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "query"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalStatsAggregatorHandle"
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalStatsAggregatorSummary"
+  input_arg {
+    name: "iterator"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "summary"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
 op {
   name: "ExperimentalThreadPoolDataset"
   input_arg {
@@ -10599,6 +11177,29 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ExperimentalUnbatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "ExperimentalUniqueDataset"
   input_arg {
@@ -12791,144 +13392,6 @@ op {
     }
   }
 }
-op {
-  name: "GroupByReducerDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "key_func_other_arguments"
-    type_list_attr: "Tkey_func_other_arguments"
-  }
-  input_arg {
-    name: "init_func_other_arguments"
-    type_list_attr: "Tinit_func_other_arguments"
-  }
-  input_arg {
-    name: "reduce_func_other_arguments"
-    type_list_attr: "Treduce_func_other_arguments"
-  }
-  input_arg {
-    name: "finalize_func_other_arguments"
-    type_list_attr: "Tfinalize_func_other_arguments"
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "key_func"
-    type: "func"
-  }
-  attr {
-    name: "init_func"
-    type: "func"
-  }
-  attr {
-    name: "reduce_func"
-    type: "func"
-  }
-  attr {
-    name: "finalize_func"
-    type: "func"
-  }
-  attr {
-    name: "Tkey_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "Tinit_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "Treduce_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "Tfinalize_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
-op {
-  name: "GroupByWindowDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "key_func_other_arguments"
-    type_list_attr: "Tkey_func_other_arguments"
-  }
-  input_arg {
-    name: "reduce_func_other_arguments"
-    type_list_attr: "Treduce_func_other_arguments"
-  }
-  input_arg {
-    name: "window_size_func_other_arguments"
-    type_list_attr: "Twindow_size_func_other_arguments"
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "key_func"
-    type: "func"
-  }
-  attr {
-    name: "reduce_func"
-    type: "func"
-  }
-  attr {
-    name: "window_size_func"
-    type: "func"
-  }
-  attr {
-    name: "Tkey_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "Treduce_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "Twindow_size_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
 op {
   name: "GuaranteeConst"
   input_arg {
@@ -14469,33 +14932,6 @@ op {
     }
   }
 }
-op {
-  name: "LatencyStatsDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "tag"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
 op {
   name: "LeakyRelu"
   input_arg {
@@ -15354,6 +15790,46 @@ op {
     }
   }
 }
+op {
+  name: "Lu"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "lu"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "p"
+    type_attr: "output_idx_type"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  attr {
+    name: "output_idx_type"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "MakeIterator"
   input_arg {
@@ -15367,55 +15843,45 @@ op {
   is_stateful: true
 }
 op {
-  name: "MapAndBatchDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
-  }
-  input_arg {
-    name: "batch_size"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "num_parallel_batches"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "drop_remainder"
-    type: DT_BOOL
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
+  name: "MapClear"
   attr {
-    name: "f"
-    type: "func"
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
   }
   attr {
-    name: "Targuments"
-    type: "list(type)"
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
     has_minimum: true
   }
   attr {
-    name: "output_types"
+    name: "dtypes"
     type: "list(type)"
-    has_minimum: true
-    minimum: 1
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
 }
 op {
-  name: "MapAndBatchDatasetV2"
+  name: "MapDataset"
   input_arg {
     name: "input_dataset"
     type: DT_VARIANT
@@ -15424,18 +15890,6 @@ op {
     name: "other_arguments"
     type_list_attr: "Targuments"
   }
-  input_arg {
-    name: "batch_size"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "num_parallel_calls"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "drop_remainder"
-    type: DT_BOOL
-  }
   output_arg {
     name: "handle"
     type: DT_VARIANT
@@ -15461,85 +15915,18 @@ op {
     has_minimum: true
     minimum: 1
   }
-}
-op {
-  name: "MapClear"
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
-  }
-  attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
-  }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
-  }
   attr {
-    name: "container"
-    type: "string"
+    name: "use_inter_op_parallelism"
+    type: "bool"
     default_value {
-      s: ""
+      b: true
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  is_stateful: true
-}
-op {
-  name: "MapDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "f"
-    type: "func"
-  }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "use_inter_op_parallelism"
+    name: "preserve_cardinality"
     type: "bool"
     default_value {
-      b: true
+      b: false
     }
   }
 }
@@ -19484,62 +19871,6 @@ op {
     type: "type"
   }
 }
-op {
-  name: "ParallelInterleaveDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
-  }
-  input_arg {
-    name: "cycle_length"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "block_length"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "sloppy"
-    type: DT_BOOL
-  }
-  input_arg {
-    name: "buffer_output_elements"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "prefetch_input_elements"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "f"
-    type: "func"
-  }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
 op {
   name: "ParallelInterleaveDatasetV2"
   input_arg {
@@ -19648,6 +19979,13 @@ op {
       b: false
     }
   }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
   name: "ParameterizedTruncatedNormal"
@@ -19795,83 +20133,6 @@ op {
     has_minimum: true
   }
 }
-op {
-  name: "ParseExampleDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "num_parallel_calls"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "dense_defaults"
-    type_list_attr: "Tdense"
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "sparse_keys"
-    type: "list(string)"
-    has_minimum: true
-  }
-  attr {
-    name: "dense_keys"
-    type: "list(string)"
-    has_minimum: true
-  }
-  attr {
-    name: "sparse_types"
-    type: "list(type)"
-    has_minimum: true
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-        type: DT_STRING
-      }
-    }
-  }
-  attr {
-    name: "Tdense"
-    type: "list(type)"
-    has_minimum: true
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-        type: DT_STRING
-      }
-    }
-  }
-  attr {
-    name: "dense_shapes"
-    type: "list(shape)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "sloppy"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-}
 op {
   name: "ParseSequenceExample"
   input_arg {
@@ -20558,48 +20819,6 @@ op {
     minimum: 1
   }
 }
-op {
-  name: "PrependFromQueueAndPaddedBatchDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "batch_size"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "padded_shapes"
-    type: DT_INT64
-    number_attr: "N"
-  }
-  input_arg {
-    name: "padding_values"
-    type_list_attr: "Toutput_types"
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "Toutput_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 1
-  }
-}
 op {
   name: "PreventGradient"
   input_arg {
@@ -22871,34 +23090,6 @@ op {
   }
   is_stateful: true
 }
-op {
-  name: "RandomDataset"
-  input_arg {
-    name: "seed"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "seed2"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
 op {
   name: "RandomGamma"
   input_arg {
@@ -25150,6 +25341,86 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ResourceApplyAdamWithAmsgrad"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "m"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "v"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "vhat"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "beta1_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "ResourceApplyAddSign"
   input_arg {
@@ -25474,6 +25745,69 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ResourceApplyKerasMomentum"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "ResourceApplyMomentum"
   input_arg {
@@ -26745,6 +27079,83 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ResourceSparseApplyKerasMomentum"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "ResourceSparseApplyMomentum"
   input_arg {
@@ -27844,52 +28255,6 @@ op {
     }
   }
 }
-op {
-  name: "ScanDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "initial_state"
-    type_list_attr: "Tstate"
-  }
-  input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "f"
-    type: "func"
-  }
-  attr {
-    name: "Tstate"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
 op {
   name: "ScatterAdd"
   input_arg {
@@ -29327,42 +29692,6 @@ op {
     }
   }
 }
-op {
-  name: "SetStatsAggregatorDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "stats_aggregator"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "tag"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "counter_prefix"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
 op {
   name: "Shape"
   input_arg {
@@ -29826,41 +30155,6 @@ op {
     }
   }
 }
-op {
-  name: "SlideDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "window_size"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "window_shift"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "window_stride"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
 op {
   name: "Snapshot"
   input_arg {
@@ -33051,38 +33345,6 @@ op {
     }
   }
 }
-op {
-  name: "SqlDataset"
-  input_arg {
-    name: "driver_name"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "data_source_name"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "query"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
 op {
   name: "Sqrt"
   input_arg {
@@ -33975,40 +34237,6 @@ op {
     }
   }
 }
-op {
-  name: "StatsAggregatorHandle"
-  output_arg {
-    name: "handle"
-    type: DT_RESOURCE
-  }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  is_stateful: true
-}
-op {
-  name: "StatsAggregatorSummary"
-  input_arg {
-    name: "iterator"
-    type: DT_RESOURCE
-  }
-  output_arg {
-    name: "summary"
-    type: DT_STRING
-  }
-  is_stateful: true
-}
 op {
   name: "StopGradient"
   input_arg {
@@ -36087,6 +36315,25 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "TensorListConcat"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  output_arg {
+    name: "lengths"
+    type: DT_INT64
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+}
 op {
   name: "TensorListConcatLists"
   input_arg {
@@ -36347,6 +36594,39 @@ op {
     type: "type"
   }
 }
+op {
+  name: "TensorListSplit"
+  input_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  input_arg {
+    name: "element_shape"
+    type_attr: "shape_type"
+  }
+  input_arg {
+    name: "lengths"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "TensorListStack"
   input_arg {
@@ -37085,29 +37365,6 @@ op {
     type: "type"
   }
 }
-op {
-  name: "UnbatchDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
 op {
   name: "UnbatchGrad"
   input_arg {
@@ -37149,6 +37406,53 @@ op {
     type: "type"
   }
 }
+op {
+  name: "UnicodeDecode"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "row_splits"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "char_values"
+    type: DT_INT32
+  }
+  attr {
+    name: "input_encoding"
+    type: "string"
+  }
+  attr {
+    name: "errors"
+    type: "string"
+    default_value {
+      s: "replace"
+    }
+    allowed_values {
+      list {
+        s: "strict"
+        s: "replace"
+        s: "ignore"
+      }
+    }
+  }
+  attr {
+    name: "replacement_char"
+    type: "int"
+    default_value {
+      i: 65533
+    }
+  }
+  attr {
+    name: "replace_control_characters"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
 op {
   name: "UnicodeDecodeWithOffsets"
   input_arg {
@@ -37896,6 +38200,17 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "UnwrapDatasetVariant"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+}
 op {
   name: "UpperBound"
   input_arg {
@@ -38214,6 +38529,17 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "WrapDatasetVariant"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+}
 op {
   name: "WriteAudioSummary"
   input_arg {
diff --git a/tensorflow/core/ops/string_ops.cc b/tensorflow/core/ops/string_ops.cc
index 8ea74f1d43e5baa3f14398e6ea17c19466ea2973..d012ce67fd0c6e8ba0b29fee8da6407f3927ef70 100644
--- a/tensorflow/core/ops/string_ops.cc
+++ b/tensorflow/core/ops/string_ops.cc
@@ -296,6 +296,27 @@ REGISTER_OP("UnicodeTranscode")
     .Attr("replace_control_characters: bool = false")
     .SetShapeFn(shape_inference::UnchangedShape);
 
+REGISTER_OP("UnicodeDecode")
+    .Input("input: string")
+    .Output("row_splits: int64")
+    .Output("char_values: int32")
+    .Attr("input_encoding: string")
+    .Attr("errors: {'strict', 'replace', 'ignore'} = 'replace'")
+    .Attr("replacement_char: int = 65533")  // 0xFFFD unicode replacement char
+    .Attr("replace_control_characters: bool = false")
+    .SetShapeFn([](InferenceContext* c) {
+      // row_splits.shape == [input.size() + 1]
+      DimensionHandle num_row_splits;
+      DimensionHandle input_size = c->NumElements(c->input(0));
+      TF_RETURN_IF_ERROR(c->Add(input_size, 1, &num_row_splits));
+      c->set_output(0, c->Vector(num_row_splits));
+
+      // char_values.shape == [num_chars]
+      DimensionHandle num_chars = c->UnknownDim();
+      c->set_output(1, c->Vector(num_chars));
+      return Status::OK();
+    });
+
 REGISTER_OP("UnicodeDecodeWithOffsets")
     .Input("input: string")
     .Output("row_splits: int64")
diff --git a/tensorflow/core/ops/training_ops.cc b/tensorflow/core/ops/training_ops.cc
index 94ff092a85d512e602da5e97fc3007d4c68c5937..995ed42d53dd286e5068f0067b35849c4e36e64b 100644
--- a/tensorflow/core/ops/training_ops.cc
+++ b/tensorflow/core/ops/training_ops.cc
@@ -685,6 +685,34 @@ REGISTER_OP("ResourceSparseApplyMomentum")
       return ApplyMomentumShapeFn(c, true /* sparse */);
     });
 
+REGISTER_OP("ResourceApplyKerasMomentum")
+    .Input("var: resource")
+    .Input("accum: resource")
+    .Input("lr: T")
+    .Input("grad: T")
+    .Input("momentum: T")
+    .Attr("T: numbertype")
+    .Attr("use_locking: bool = false")
+    .Attr("use_nesterov: bool = false")
+    .SetShapeFn([](InferenceContext* c) {
+      return ApplyMomentumShapeFn(c, false /* sparse */);
+    });
+
+REGISTER_OP("ResourceSparseApplyKerasMomentum")
+    .Input("var: resource")
+    .Input("accum: resource")
+    .Input("lr: T")
+    .Input("grad: T")
+    .Input("indices: Tindices")
+    .Input("momentum: T")
+    .Attr("T: numbertype")
+    .Attr("Tindices: {int32, int64}")
+    .Attr("use_locking: bool = false")
+    .Attr("use_nesterov: bool = false")
+    .SetShapeFn([](InferenceContext* c) {
+      return ApplyMomentumShapeFn(c, true /* sparse */);
+    });
+
 static Status ApplyAdamShapeFn(InferenceContext* c, bool sparse) {
   ShapeHandle unused;
   ShapeHandle s = ShapeOrHandleShape(c, 0);                       // var
@@ -741,6 +769,44 @@ REGISTER_OP("ResourceApplyAdam")
       return ApplyAdamShapeFn(c, false /* sparse */);
     });
 
+static Status ApplyAdamWithAmsgradShapeFn(InferenceContext* c, bool sparse) {
+  ShapeHandle unused;
+  ShapeHandle s = ShapeOrHandleShape(c, 0);                       // var
+  TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s));  // m
+  TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 2), &s));  // v
+  TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 3), &s));  // vhat
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));       // beta1_power
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused));       // beta2_power
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused));       // lr
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 0, &unused));       // beta1
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(8), 0, &unused));       // beta2
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(9), 0, &unused));       // epsilon
+  TF_RETURN_IF_ERROR(
+      HandleGradAndIndicesInputs(c, sparse, 10 /* grad_idx */, &s));
+  if (c->num_outputs() > 0) {
+    c->set_output(0, s);
+  }
+  return Status::OK();
+}
+
+REGISTER_OP("ResourceApplyAdamWithAmsgrad")
+    .Input("var: resource")
+    .Input("m: resource")
+    .Input("v: resource")
+    .Input("vhat: resource")
+    .Input("beta1_power: T")
+    .Input("beta2_power: T")
+    .Input("lr: T")
+    .Input("beta1: T")
+    .Input("beta2: T")
+    .Input("epsilon: T")
+    .Input("grad: T")
+    .Attr("T: numbertype")
+    .Attr("use_locking: bool = false")
+    .SetShapeFn([](InferenceContext* c) {
+      return ApplyAdamWithAmsgradShapeFn(c, false /* sparse */);
+    });
+
 static Status ApplyAdaMaxShapeFn(InferenceContext* c, bool sparse) {
   ShapeHandle unused;
   ShapeHandle s = ShapeOrHandleShape(c, 0);                       // var
diff --git a/tensorflow/core/platform/cloud/gcs_file_system.cc b/tensorflow/core/platform/cloud/gcs_file_system.cc
index c61b68aeebf4823ff70119a0349c318dd3887790..26eff8f834a85a09343663b214e0fcd1aa5a652b 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system.cc
@@ -1433,9 +1433,16 @@ Status GcsFileSystem::CreateDir(const string& dirname) {
                      : errors::NotFound("The specified bucket ", dirname,
                                         " was not found.");
   }
+
+  const string dirname_with_slash = MaybeAppendSlash(dirname);
+
+  if (FileExists(dirname_with_slash).ok()) {
+    return errors::AlreadyExists(dirname);
+  }
+
   // Create a zero-length directory marker object.
   std::unique_ptr<WritableFile> file;
-  TF_RETURN_IF_ERROR(NewWritableFile(MaybeAppendSlash(dirname), &file));
+  TF_RETURN_IF_ERROR(NewWritableFile(dirname_with_slash, &file));
   TF_RETURN_IF_ERROR(file->Close());
   return Status::OK();
 }
diff --git a/tensorflow/core/platform/cloud/gcs_file_system_test.cc b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
index 702802b185aa4ce3243e777694d5fd9e77ec7ee8..f0f5f592fae28a59e17e086dd68b3cf0abcbf8dc 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system_test.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/platform/cloud/gcs_file_system.h"
 #include <fstream>
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/cloud/http_request_fake.h"
@@ -2789,6 +2790,12 @@ TEST(GcsFileSystemTest, IsDirectory_BucketNotFound) {
 TEST(GcsFileSystemTest, CreateDir_Folder) {
   std::vector<HttpRequest*> requests(
       {new FakeHttpRequest(
+           "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
+           "subpath%2F?fields=size%2Cgeneration%2Cupdated\n"
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
+           "{}"),
+       new FakeHttpRequest(
            "Uri: https://www.googleapis.com/upload/storage/v1/b/bucket/o?"
            "uploadType=resumable&name=subpath%2F\n"
            "Auth Token: fake_token\n"
@@ -2802,18 +2809,12 @@ TEST(GcsFileSystemTest, CreateDir_Folder) {
                            "Put body: \n",
                            ""),
        new FakeHttpRequest(
-           "Uri: https://www.googleapis.com/upload/storage/v1/b/bucket/o?"
-           "uploadType=resumable&name=subpath%2F\n"
+           "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
+           "subpath%2F?fields=size%2Cgeneration%2Cupdated\n"
            "Auth Token: fake_token\n"
-           "Header X-Upload-Content-Length: 0\n"
-           "Post: yes\n"
            "Timeouts: 5 1 10\n",
-           "", {{"Location", "https://custom/upload/location"}}),
-       new FakeHttpRequest("Uri: https://custom/upload/location\n"
-                           "Auth Token: fake_token\n"
-                           "Timeouts: 5 1 30\n"
-                           "Put body: \n",
-                           "")});
+           strings::StrCat("{\"size\": \"1010\",\"generation\": \"1\","
+                           "\"updated\": \"2016-04-29T23:15:24.896Z\"}"))});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
@@ -2826,7 +2827,8 @@ TEST(GcsFileSystemTest, CreateDir_Folder) {
                    nullptr /* gcs additional header */);
 
   TF_EXPECT_OK(fs.CreateDir("gs://bucket/subpath"));
-  TF_EXPECT_OK(fs.CreateDir("gs://bucket/subpath/"));
+  EXPECT_EQ(errors::AlreadyExists("gs://bucket/subpath/"),
+            fs.CreateDir("gs://bucket/subpath/"));
 }
 
 TEST(GcsFileSystemTest, CreateDir_Bucket) {
diff --git a/tensorflow/core/platform/cpu_info.h b/tensorflow/core/platform/cpu_info.h
index 6eba83224a4b861f7b4a469d82116ef63d4814d9..c9208cc75536732b9274440a4e5e48b51ffeb4e3 100644
--- a/tensorflow/core/platform/cpu_info.h
+++ b/tensorflow/core/platform/cpu_info.h
@@ -32,9 +32,22 @@ namespace port {
 // Returns an estimate of the number of schedulable CPUs for this
 // process.  Usually, it's constant throughout the lifetime of a
 // process, but it might change if the underlying cluster management
-// software can change it dynamically.
+// software can change it dynamically.  If the underlying call fails, a default
+// value (e.g. `4`) may be returned.
 int NumSchedulableCPUs();
 
+// Returns the total number of CPUs on the system.  This number should
+// not change even if the underlying cluster management software may
+// change the number of schedulable CPUs.  Unlike `NumSchedulableCPUs`, if the
+// underlying call fails, an invalid value of -1 will be returned;
+// the user must check for validity.
+static constexpr int kUnknownCPU = -1;
+int NumTotalCPUs();
+
+// Returns the id of the current CPU.  Returns -1 if the current CPU cannot be
+// identified.  If successful, the return value will be in [0, NumTotalCPUs()).
+int GetCurrentCPU();
+
 // Returns an estimate of the number of hyperthreads per physical core
 // on the CPU
 int NumHyperthreadsPerCore();
diff --git a/tensorflow/core/platform/cuda_libdevice_path.h b/tensorflow/core/platform/cuda_libdevice_path.h
index 6ef565ecd3c6460791b49a25fd4277e9393cfdd0..f2dbff9043a77dc8766092e89d29f642dd443966 100644
--- a/tensorflow/core/platform/cuda_libdevice_path.h
+++ b/tensorflow/core/platform/cuda_libdevice_path.h
@@ -16,16 +16,14 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_PLATFORM_CUDA_LIBDEVICE_PATH_H_
 #define TENSORFLOW_CORE_PLATFORM_CUDA_LIBDEVICE_PATH_H_
 
+#include <vector>
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 
-// Returns the root directory of the CUDA SDK, which contains sub-folders such
-// as bin, lib64, and nvvm.
-string CudaRoot();
-
-// Returns the directory that contains nvvm libdevice files in the CUDA SDK.
-string LibdeviceRoot();
+// Returns, in order of preference, potential locations of the root directory of
+// the CUDA SDK, which contains sub-folders such as bin, lib64, and nvvm.
+std::vector<string> CandidateCudaRoots();
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/platform/cuda_libdevice_path_test.cc b/tensorflow/core/platform/cuda_libdevice_path_test.cc
deleted file mode 100644
index 2d34239a9958d722a1cb84213657ca8229ebaf2c..0000000000000000000000000000000000000000
--- a/tensorflow/core/platform/cuda_libdevice_path_test.cc
+++ /dev/null
@@ -1,35 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/core/platform/cuda_libdevice_path.h"
-
-#include "tensorflow/core/lib/core/status_test_util.h"
-#include "tensorflow/core/lib/io/path.h"
-#include "tensorflow/core/platform/env.h"
-#include "tensorflow/core/platform/test.h"
-
-namespace tensorflow {
-
-#if GOOGLE_CUDA
-TEST(CudaLibdevicePathTest, LibdevicePath) {
-  VLOG(2) << "Libdevice root = " << LibdeviceRoot();
-  std::vector<string> libdevice_files;
-  TF_EXPECT_OK(Env::Default()->GetMatchingPaths(
-      io::JoinPath(LibdeviceRoot(), "libdevice.*.bc"), &libdevice_files));
-  EXPECT_LT(0, libdevice_files.size());
-}
-#endif
-
-}  // namespace tensorflow
diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index 04287151301dd0c6eb25ec7bc8b12a207f44ab90..3a4415f229b5f625576cf85bd1852894300e109a 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -543,9 +543,6 @@ def tf_additional_proto_srcs():
 def tf_additional_human_readable_json_deps():
     return []
 
-def tf_additional_logger_deps():
-    return []
-
 def tf_additional_all_protos():
     return ["//tensorflow/core:protos_all"]
 
diff --git a/tensorflow/core/platform/default/build_config/BUILD b/tensorflow/core/platform/default/build_config/BUILD
index da1f66dc6763121819fe443066acc40c1d5fa79d..ee6936b372acf35c0568331d73615451b2675dd4 100644
--- a/tensorflow/core/platform/default/build_config/BUILD
+++ b/tensorflow/core/platform/default/build_config/BUILD
@@ -275,3 +275,8 @@ alias(
     actual = ":mobile_srcs",
     visibility = ["//visibility:public"],
 )
+
+alias(
+    name = "logger",
+    actual = "//tensorflow/core:default_logger",
+)
diff --git a/tensorflow/core/platform/default/cuda_libdevice_path.cc b/tensorflow/core/platform/default/cuda_libdevice_path.cc
index 20ee3ad621a0688013802c37184aca1342dbe45e..a8b2e7202ac79d821d88b711d1476a1893a6e5fa 100644
--- a/tensorflow/core/platform/default/cuda_libdevice_path.cc
+++ b/tensorflow/core/platform/default/cuda_libdevice_path.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/platform/cuda_libdevice_path.h"
 
 #include <stdlib.h>
+#include <vector>
 
 #if !defined(PLATFORM_GOOGLE)
 #include "cuda/cuda_config.h"
@@ -24,9 +25,9 @@ limitations under the License.
 
 namespace tensorflow {
 
-string CudaRoot() {
+std::vector<string> CandidateCudaRoots() {
   VLOG(3) << "CUDA root = " << TF_CUDA_TOOLKIT_PATH;
-  return TF_CUDA_TOOLKIT_PATH;
+  return {TF_CUDA_TOOLKIT_PATH};
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/default/human_readable_json.cc b/tensorflow/core/platform/default/human_readable_json.cc
index 9f97c8272c10c9036901ac0405c27806d59fdab0..bf9c7b76206b79ad43969a1e3e2de6e6cbdacc46 100644
--- a/tensorflow/core/platform/default/human_readable_json.cc
+++ b/tensorflow/core/platform/default/human_readable_json.cc
@@ -20,7 +20,7 @@ limitations under the License.
 
 namespace tensorflow {
 
-Status ProtoToHumanReadableJson(const ::google::protobuf::Message& proto,
+Status ProtoToHumanReadableJson(const protobuf::Message& proto,
                                 string* result) {
 #ifdef TENSORFLOW_LITE_PROTOS
   *result = "[human readable output not available on Android]";
@@ -28,7 +28,7 @@ Status ProtoToHumanReadableJson(const ::google::protobuf::Message& proto,
 #else
   result->clear();
 
-  auto status = google::protobuf::util::MessageToJsonString(proto, result);
+  auto status = protobuf::util::MessageToJsonString(proto, result);
   if (!status.ok()) {
     // Convert error_msg google::protobuf::StringPiece to
     // tensorflow::StringPiece.
@@ -41,8 +41,7 @@ Status ProtoToHumanReadableJson(const ::google::protobuf::Message& proto,
 #endif
 }
 
-Status HumanReadableJsonToProto(const string& str,
-                                ::google::protobuf::Message* proto) {
+Status HumanReadableJsonToProto(const string& str, protobuf::Message* proto) {
 #ifdef TENSORFLOW_LITE_PROTOS
   return errors::Internal("Cannot parse JSON protos on Android");
 #else
diff --git a/tensorflow/core/platform/env.h b/tensorflow/core/platform/env.h
index 7374fccdc2cd2af4cfaec5a83b93fdb8d368cf2c..1b5382841574e6b8843079ae9cb359c5c9b475d0 100644
--- a/tensorflow/core/platform/env.h
+++ b/tensorflow/core/platform/env.h
@@ -167,11 +167,24 @@ class Env {
   Status DeleteFile(const string& fname);
 
   /// \brief Deletes the specified directory and all subdirectories and files
-  /// underneath it. undeleted_files and undeleted_dirs stores the number of
-  /// files and directories that weren't deleted (unspecified if the return
-  /// status is not OK).
+  /// underneath it. This is accomplished by traversing the directory tree
+  /// rooted at dirname and deleting entries as they are encountered.
+  ///
+  /// If dirname itself is not readable or does not exist, *undeleted_dir_count
+  /// is set to 1, *undeleted_file_count is set to 0 and an appropriate status
+  /// (e.g. NOT_FOUND) is returned.
+  ///
+  /// If dirname and all its descendants were successfully deleted, TF_OK is
+  /// returned and both error counters are set to zero.
+  ///
+  /// Otherwise, while traversing the tree, undeleted_file_count and
+  /// undeleted_dir_count are updated if an entry of the corresponding type
+  /// could not be deleted. The returned error status represents the reason that
+  /// any one of these entries could not be deleted.
+  ///
   /// REQUIRES: undeleted_files, undeleted_dirs to be not null.
-  /// Typical return codes
+  ///
+  /// Typical return codes:
   ///  * OK - dirname exists and we were able to delete everything underneath.
   ///  * NOT_FOUND - dirname doesn't exist
   ///  * PERMISSION_DENIED - dirname or some descendant is not writable
diff --git a/tensorflow/core/platform/file_system.h b/tensorflow/core/platform/file_system.h
index 156af6cdeaa015429d60e4599f59c5a4b806f5e6..c84a93b1bf59be7cb19352825cc4bb82b48e2246 100644
--- a/tensorflow/core/platform/file_system.h
+++ b/tensorflow/core/platform/file_system.h
@@ -167,10 +167,23 @@ class FileSystem {
   virtual Status DeleteDir(const string& dirname) = 0;
 
   /// \brief Deletes the specified directory and all subdirectories and files
-  /// underneath it. undeleted_files and undeleted_dirs stores the number of
-  /// files and directories that weren't deleted (unspecified if the return
-  /// status is not OK).
+  /// underneath it. This is accomplished by traversing the directory tree
+  /// rooted at dirname and deleting entries as they are encountered.
+  ///
+  /// If dirname itself is not readable or does not exist, *undeleted_dir_count
+  /// is set to 1, *undeleted_file_count is set to 0 and an appropriate status
+  /// (e.g. NOT_FOUND) is returned.
+  ///
+  /// If dirname and all its descendants were successfully deleted, TF_OK is
+  /// returned and both error counters are set to zero.
+  ///
+  /// Otherwise, while traversing the tree, undeleted_file_count and
+  /// undeleted_dir_count are updated if an entry of the corresponding type
+  /// could not be deleted. The returned error status represents the reason that
+  /// any one of these entries could not be deleted.
+  ///
   /// REQUIRES: undeleted_files, undeleted_dirs to be not null.
+  ///
   /// Typical return codes:
   ///  * OK - dirname exists and we were able to delete everything underneath.
   ///  * NOT_FOUND - dirname doesn't exist
diff --git a/tensorflow/core/platform/port_test.cc b/tensorflow/core/platform/port_test.cc
index 15c3cb24f046b9111d66839ba03ffaf427ba70eb..33c66a6f25a7349041efad766c03674531201d95 100644
--- a/tensorflow/core/platform/port_test.cc
+++ b/tensorflow/core/platform/port_test.cc
@@ -33,6 +33,14 @@ TEST(Port, AlignedMalloc) {
   }
 }
 
+TEST(Port, GetCurrentCPU) {
+  const int cpu = GetCurrentCPU();
+  // TODO(b/120919972): Re-enable this EXPECT_GE after fixing MacOS Kokoro
+  // failures.
+  // EXPECT_GE(cpu, 0);
+  EXPECT_LT(cpu, NumTotalCPUs());
+}
+
 TEST(ConditionVariable, WaitForMilliseconds_Timeout) {
   mutex m;
   mutex_lock l(m);
diff --git a/tensorflow/core/platform/posix/port.cc b/tensorflow/core/platform/posix/port.cc
index acdd7798ea961f2b5aed59b6eebb3f6dcafa40a5..0fac8b1a8895fe2353c6cf9589f7541fae2ecf67 100644
--- a/tensorflow/core/platform/posix/port.cc
+++ b/tensorflow/core/platform/posix/port.cc
@@ -25,7 +25,14 @@ limitations under the License.
 #if defined(__linux__) && !defined(__ANDROID__)
 #include <sched.h>
 #include <sys/sysinfo.h>
+#else
+#include <sys/syscall.h>
+#endif
+
+#if !defined(__APPLE__) && (__x86_64__ || __i386__)
+#include <cpuid.h>
 #endif
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -69,6 +76,34 @@ int NumSchedulableCPUs() {
   return kDefaultCores;
 }
 
+int NumTotalCPUs() {
+  int count = absl::base_internal::NumCPUs();
+  return (count == 0) ? kUnknownCPU : count;
+}
+
+int GetCurrentCPU() {
+#if defined(__linux__) && !defined(__ANDROID__)
+  return sched_getcpu();
+#elif defined(__cpuid_count)
+  // Attempt to use cpuid on all other platforms.  If that fails, perform a
+  // syscall.
+  uint32_t eax, ebx, ecx, edx;
+  __cpuid_count(/*leaf=*/1, /*subleaf=*/0, eax, ebx, ecx, edx);
+  if ((edx & (1 << 9)) != 0) {
+    // EBX bits 24-31 are APIC ID
+    return static_cast<unsigned int>(ebx >> 24);
+  }
+#elif defined(__NR_getcpu)
+  unsigned int cpu;
+  if (syscall(__NR_getcpu, &cpu, NULL, NULL) < 0) {
+    return kUnknownCPU;
+  } else {
+    return static_cast<int>(cpu);
+  }
+#endif
+  return kUnknownCPU;
+}
+
 int NumHyperthreadsPerCore() {
   static const int ht_per_core = tensorflow::port::CPUIDNumSMT();
   return (ht_per_core > 0) ? ht_per_core : 1;
@@ -83,9 +118,7 @@ int NUMANumNodes() { return 1; }
 
 void NUMASetThreadNodeAffinity(int node) {}
 
-int NUMAGetThreadNodeAffinity() {
-  return kNUMANoAffinity;
-}
+int NUMAGetThreadNodeAffinity() { return kNUMANoAffinity; }
 
 void* AlignedMalloc(size_t size, int minimum_alignment) {
 #if defined(__ANDROID__)
diff --git a/tensorflow/core/platform/windows/port.cc b/tensorflow/core/platform/windows/port.cc
index 911ea1902f800c795c60505b2d91a6a6b31b7b01..b902c85cdcfd567d0b77322bfe30d7ba26e25e5a 100644
--- a/tensorflow/core/platform/windows/port.cc
+++ b/tensorflow/core/platform/windows/port.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #endif
 
 #include <Windows.h>
+#include <processthreadsapi.h>
 #include <shlwapi.h>
 
 #include "tensorflow/core/platform/cpu_info.h"
@@ -54,6 +55,30 @@ int NumSchedulableCPUs() {
   return system_info.dwNumberOfProcessors;
 }
 
+int NumTotalCPUs() {
+  // TODO(ebrevdo): Make this more accurate.
+  //
+  // This only returns the number of processors in the current
+  // processor group; which may be undercounting if you have more than 64 cores.
+  // For that case, one needs to call
+  // GetLogicalProcessorInformationEx(RelationProcessorCore, ...) and accumulate
+  // the Size fields by iterating over the written-to buffer.  Since I can't
+  // easily test this on Windows, I'm deferring this to someone who can!
+  //
+  // If you fix this, also consider updatig GetCurrentCPU below.
+  return NumSchedulableCPUs();
+}
+
+int GetCurrentCPU() {
+  // NOTE(ebrevdo): This returns the processor number within the processor
+  // group on systems with >64 processors.  Therefore it doesn't necessarily map
+  // naturally to an index in NumSchedulableCPUs().
+  //
+  // On the plus side, this number is probably guaranteed to be within
+  // [0, NumTotalCPUs()) due to its incomplete implementation.
+  return GetCurrentProcessorNumber();
+}
+
 bool NUMAEnabled() {
   // Not yet implemented: coming soon.
   return false;
diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 07eeeb4f032f199fe50b315c39b5e9835770d5c7..a55fe17dd5fa6f7ba7c0eaebb345c69f9dce2a5c 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -24,7 +24,7 @@ limitations under the License.
 
 // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
 // "-beta", "-rc", "-rc.1")
-#define TF_VERSION_SUFFIX "-rc0"
+#define TF_VERSION_SUFFIX ""
 
 #define TF_STR_HELPER(x) #x
 #define TF_STR(x) TF_STR_HELPER(x)
diff --git a/tensorflow/core/util/cuda_launch_config.h b/tensorflow/core/util/cuda_launch_config.h
index 080d4067cec69084b54ba1c096d01198a8e48d20..c0ae6349f755dcbd643493ccfe82374d12bc2baf 100644
--- a/tensorflow/core/util/cuda_launch_config.h
+++ b/tensorflow/core/util/cuda_launch_config.h
@@ -21,7 +21,6 @@ limitations under the License.
 #include <algorithm>
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "cuda/include/cuda.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor.h"
diff --git a/tensorflow/core/util/dump_graph.cc b/tensorflow/core/util/dump_graph.cc
index 5df5cb51cb74f18dcd3985b007e2d50430ec5510..523d37ecc244b3634545ea82385b377c871569c8 100644
--- a/tensorflow/core/util/dump_graph.cc
+++ b/tensorflow/core/util/dump_graph.cc
@@ -75,18 +75,24 @@ Status WriteToFile(const string& filepath,
 
 template <class T>
 string WriteTextProtoToUniqueFile(Env* env, const string& name,
-                                  const char* proto_type, T& proto) {
-  const char* dirname = getenv("TF_DUMP_GRAPH_PREFIX");
-  if (!dirname) {
+                                  const char* proto_type, T& proto,
+                                  const string& dirname) {
+  const char* dir = nullptr;
+  if (!dirname.empty()) {
+    dir = dirname.c_str();
+  } else {
+    dir = getenv("TF_DUMP_GRAPH_PREFIX");
+  }
+  if (!dir) {
     return "(TF_DUMP_GRAPH_PREFIX not specified)";
   }
-  Status status = env->RecursivelyCreateDir(dirname);
+  Status status = env->RecursivelyCreateDir(dir);
   if (!status.ok()) {
-    LOG(WARNING) << "Failed to create " << dirname << " for dumping "
-                 << proto_type << ": " << status;
+    LOG(WARNING) << "Failed to create " << dir << " for dumping " << proto_type
+                 << ": " << status;
     return "(unavailable)";
   }
-  string filepath = absl::StrCat(dirname, "/", MakeUniqueFilename(name));
+  string filepath = absl::StrCat(dir, "/", MakeUniqueFilename(name));
   status = WriteToFile(filepath, proto);
   if (!status.ok()) {
     LOG(WARNING) << "Failed to dump " << proto_type << " to file: " << filepath
@@ -99,23 +105,27 @@ string WriteTextProtoToUniqueFile(Env* env, const string& name,
 
 }  // anonymous namespace
 
-string DumpGraphDefToFile(const string& name, GraphDef const& graph_def) {
-  return WriteTextProtoToUniqueFile(Env::Default(), name, "GraphDef",
-                                    graph_def);
+string DumpGraphDefToFile(const string& name, GraphDef const& graph_def,
+                          const string& dirname) {
+  return WriteTextProtoToUniqueFile(Env::Default(), name, "GraphDef", graph_def,
+                                    dirname);
 }
 
 string DumpGraphToFile(const string& name, Graph const& graph,
-                       const FunctionLibraryDefinition* flib_def) {
+                       const FunctionLibraryDefinition* flib_def,
+                       const string& dirname) {
   GraphDef graph_def;
   graph.ToGraphDef(&graph_def);
   if (flib_def) {
     *graph_def.mutable_library() = flib_def->ToProto();
   }
-  return DumpGraphDefToFile(name, graph_def);
+  return DumpGraphDefToFile(name, graph_def, dirname);
 }
 
-string DumpFunctionDefToFile(const string& name, FunctionDef const& fdef) {
-  return WriteTextProtoToUniqueFile(Env::Default(), name, "FunctionDef", fdef);
+string DumpFunctionDefToFile(const string& name, FunctionDef const& fdef,
+                             const string& dirname) {
+  return WriteTextProtoToUniqueFile(Env::Default(), name, "FunctionDef", fdef,
+                                    dirname);
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/util/dump_graph.h b/tensorflow/core/util/dump_graph.h
index 05e0b79f559b6444f1a363d23edc16c2f76c5f14..03dc807a2b342edaea57ad8558495462a6af0109 100644
--- a/tensorflow/core/util/dump_graph.h
+++ b/tensorflow/core/util/dump_graph.h
@@ -29,19 +29,23 @@ namespace tensorflow {
 // chosen.
 //
 // Automatically picks a file name. Prefixes 'name' with the value of the
-// TF_DUMP_GRAPH_PREFIX environment variable and suffixes it with ".pbtxt" to
-// form a name. If a graph has already been dumped by this process with the same
-// name, suffixes with "_n.pbtxt", where 'n' is a sequence number.
-string DumpGraphDefToFile(const string& name, GraphDef const& graph_def);
+// TF_DUMP_GRAPH_PREFIX environment variable if 'dirname' is empty, and suffixes
+// 'name' with ".pbtxt" to form a name. If a graph has already been dumped by
+// this process with the same name, suffixes with "_n.pbtxt", where 'n' is a
+// sequence number.
+string DumpGraphDefToFile(const string& name, GraphDef const& graph_def,
+                          const string& dirname = "");
 
 // Similar to DumpGraphDefToFile, but builds the GraphDef to dump from a 'graph'
 // and an optional function library 'flib_def'. Returns the file name chosen.
 string DumpGraphToFile(const string& name, Graph const& graph,
-                       const FunctionLibraryDefinition* flib_def = nullptr);
+                       const FunctionLibraryDefinition* flib_def = nullptr,
+                       const string& dirname = "");
 
 // Similar to DumpGraphDefToFile, but dumps a function as a FunctionDef text
 // proto. Returns the file name chosen.
-string DumpFunctionDefToFile(const string& name, FunctionDef const& fdef);
+string DumpFunctionDefToFile(const string& name, FunctionDef const& fdef,
+                             const string& dirname = "");
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/util/port.cc b/tensorflow/core/util/port.cc
index e01058dff6cd70eecece10285f485c2b36352bdd..7dc8ddda06ae77bd058e472ab375d2ed3f760437 100644
--- a/tensorflow/core/util/port.cc
+++ b/tensorflow/core/util/port.cc
@@ -15,9 +15,6 @@ limitations under the License.
 
 #include "tensorflow/core/util/port.h"
 
-#if GOOGLE_CUDA
-#include "cuda/include/cuda.h"
-#endif
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/util/proto/decode.h b/tensorflow/core/util/proto/decode.h
index cbcb203ee76471674429f133d54d4d0875dd9d5d..8dde14dffcdc5ffe4d64360f3af40521efe29bf8 100644
--- a/tensorflow/core/util/proto/decode.h
+++ b/tensorflow/core/util/proto/decode.h
@@ -318,7 +318,7 @@ inline int ReadPackedPrimitives(const void* bufp, const size_t len,
   return count;
 }
 
-// Reads a primitive value field from a serialized proto.
+// Reads a value of a primitive type field from a serialized proto.
 // The value is parsed from the serialized format, then static_cast
 // to the desired type for TensorFlow and stored.
 template <class ValueType, class TensorType,
diff --git a/tensorflow/core/util/stats_calculator.cc b/tensorflow/core/util/stats_calculator.cc
index eb077546501327c62aff5c9d68eb5d0ba1c9aa1c..bce650f2456029b578356e572393c0ec08df2441 100644
--- a/tensorflow/core/util/stats_calculator.cc
+++ b/tensorflow/core/util/stats_calculator.cc
@@ -53,7 +53,7 @@ std::string StatsCalculator::HeaderString(const std::string& title) const {
          << " ==============================" << std::endl;
 
   InitField(stream, 24) << "[node type]";
-  InitField(stream, 9) << "[start]";
+  InitField(stream, 17) << "[start]";
   InitField(stream, 9) << "[first]";
   InitField(stream, 9) << "[avg ms]";
   InitField(stream, 8) << "[%]";
@@ -77,7 +77,7 @@ std::string StatsCalculator::ColumnString(const Detail& detail,
 
   std::stringstream stream;
   InitField(stream, 24) << detail.type;
-  InitField(stream, 9) << start_ms;
+  InitField(stream, 17) << start_ms;
   InitField(stream, 9) << first_time_ms;
   InitField(stream, 9) << avg_time_ms;
   InitField(stream, 7) << percentage << "%";
diff --git a/tensorflow/core/util/tensor_format.h b/tensorflow/core/util/tensor_format.h
index b0c349dd907b71f1a33854930802e1692b3cfb69..a296fb447e252e62809aeb17d9d00cf35ad15fc9 100644
--- a/tensorflow/core/util/tensor_format.h
+++ b/tensorflow/core/util/tensor_format.h
@@ -498,7 +498,8 @@ inline TensorShape ShapeFromFormat(TensorFormat format, int64 N,
   dim_sizes[GetTensorBatchDimIndex(dims, format)] = N;
   for (int dim = 0; static_cast<size_t>(dim) < spatial.size(); dim++) {
     auto dim_size = spatial[dim];
-    if (format == FORMAT_NHWC_VECT_W && dim == spatial.size() - 1) {
+    if (format == FORMAT_NHWC_VECT_W &&
+        static_cast<size_t>(dim) == spatial.size() - 1) {
       CHECK_EQ(0, dim_size % 4)
           << "FORMAT_NHWC_VECT_W requires W to be a multiple of 4, but W="
           << dim_size;
diff --git a/tensorflow/examples/android/README.md b/tensorflow/examples/android/README.md
index 82bc3ffda9635a97af5acb8715d5b98fc10d440c..4e4e1685f6db128eb8cb09986e4924567f35ea75 100644
--- a/tensorflow/examples/android/README.md
+++ b/tensorflow/examples/android/README.md
@@ -180,7 +180,7 @@ After editing your WORKSPACE file to update the SDK/NDK configuration, you may
 build the APK. Run this from your workspace root:
 
 ```bash
-bazel build -c opt //tensorflow/examples/android:tensorflow_demo
+bazel build --cxxopt='--std=c++11' -c opt //tensorflow/examples/android:tensorflow_demo
 ```
 
 ##### Install
diff --git a/tensorflow/examples/get_started/regression/custom_regression.py b/tensorflow/examples/get_started/regression/custom_regression.py
index 2e34362c5ced96ac6aec5a9258519bb49ef9157d..7b7cbb78666f0de5e77858b79eda721adc493ecb 100644
--- a/tensorflow/examples/get_started/regression/custom_regression.py
+++ b/tensorflow/examples/get_started/regression/custom_regression.py
@@ -100,12 +100,11 @@ def main(argv):
         # that the examples are well mixed.
         train.shuffle(1000).batch(128)
         # Repeat forever
-        .repeat().make_one_shot_iterator().get_next())
+        .repeat())
 
   # Build the validation input_fn.
   def input_test():
-    return (test.shuffle(1000).batch(128)
-            .make_one_shot_iterator().get_next())
+    return test.shuffle(1000).batch(128)
 
   # The first way assigns a unique weight to each category. To do this you must
   # specify the category's vocabulary (values outside this specification will
diff --git a/tensorflow/examples/get_started/regression/dnn_regression.py b/tensorflow/examples/get_started/regression/dnn_regression.py
index 951c93b52e73a8e7f4497e9c4b0e91038de85620..94669a5082b26cac79e2879da43cc8aa6e5e83d0 100644
--- a/tensorflow/examples/get_started/regression/dnn_regression.py
+++ b/tensorflow/examples/get_started/regression/dnn_regression.py
@@ -45,12 +45,11 @@ def main(argv):
         # that the examples are well mixed.
         train.shuffle(1000).batch(128)
         # Repeat forever
-        .repeat().make_one_shot_iterator().get_next())
+        .repeat())
 
   # Build the validation input_fn.
   def input_test():
-    return (test.shuffle(1000).batch(128)
-            .make_one_shot_iterator().get_next())
+    return test.shuffle(1000).batch(128)
 
   # The first way assigns a unique weight to each category. To do this you must
   # specify the category's vocabulary (values outside this specification will
diff --git a/tensorflow/examples/get_started/regression/linear_regression_categorical.py b/tensorflow/examples/get_started/regression/linear_regression_categorical.py
index e2ad415fbcb161a599cff7d123597e5156d11770..5312272a9592973e757e6cdd5a2305c0c04372a9 100644
--- a/tensorflow/examples/get_started/regression/linear_regression_categorical.py
+++ b/tensorflow/examples/get_started/regression/linear_regression_categorical.py
@@ -45,12 +45,11 @@ def main(argv):
         # that the examples are well mixed.
         train.shuffle(1000).batch(128)
         # Repeat forever
-        .repeat().make_one_shot_iterator().get_next())
+        .repeat())
 
   # Build the validation input_fn.
   def input_test():
-    return (test.shuffle(1000).batch(128)
-            .make_one_shot_iterator().get_next())
+    return test.shuffle(1000).batch(128)
 
   # The following code demonstrates two of the ways that `feature_columns` can
   # be used to build a model with categorical inputs.
diff --git a/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py b/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py
index 740224744860fdd76bea9c4531242a4976b20784..5c52a2c8461660e19ef6e98c01a6a58a3f3c0920 100644
--- a/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py
+++ b/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py
@@ -126,7 +126,7 @@ def inputs(train, batch_size, num_epochs):
     dataset = dataset.repeat(num_epochs)
     dataset = dataset.batch(batch_size)
 
-    iterator = dataset.make_one_shot_iterator()
+    iterator = tf.compat.v1.data.make_one_shot_iterator(dataset)
   return iterator.get_next()
 
 
diff --git a/tensorflow/examples/saved_model/BUILD b/tensorflow/examples/saved_model/BUILD
deleted file mode 100644
index ebefc6576d646467426a784d03f4be206aeaba38..0000000000000000000000000000000000000000
--- a/tensorflow/examples/saved_model/BUILD
+++ /dev/null
@@ -1,22 +0,0 @@
-# Description: SavedModel half plus two example.
-
-package(
-    default_visibility = ["//tensorflow:internal"],
-)
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-py_binary(
-    name = "saved_model_half_plus_two",
-    srcs = [
-        "saved_model_half_plus_two.py",
-    ],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/python:lib",
-        "//tensorflow/python/saved_model:main_op",
-    ],
-)
diff --git a/tensorflow/examples/saved_model/saved_model_half_plus_two.py b/tensorflow/examples/saved_model/saved_model_half_plus_two.py
deleted file mode 100644
index dfdde445404a5ec99f3d821dff6d9f217bfadefc..0000000000000000000000000000000000000000
--- a/tensorflow/examples/saved_model/saved_model_half_plus_two.py
+++ /dev/null
@@ -1,271 +0,0 @@
-## Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-r"""Exports an example linear regression inference graph.
-
-Exports a TensorFlow graph to `/tmp/saved_model/half_plus_two/` based on the
-`SavedModel` format.
-
-This graph calculates,
-
-\\(
-  y = a*x + b
-\\)
-
-and/or, independently,
-
-\\(
-  y2 = a*x2 + c
-\\)
-
-where `a`, `b` and `c` are variables with `a=0.5` and `b=2` and `c=3`.
-
-Output from this program is typically used to exercise SavedModel load and
-execution code.
-
-To create a CPU model:
-  bazel run -c opt saved_half_plus_two -- --device=cpu
-
-To create GPU model:
-  bazel run --config=cuda -c opt saved_half_plus_two -- \
-  --device=gpu
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-import os
-import sys
-
-import tensorflow as tf
-
-from tensorflow.python.lib.io import file_io
-
-FLAGS = None
-
-
-def _write_assets(assets_directory, assets_filename):
-  """Writes asset files to be used with SavedModel for half plus two.
-
-  Args:
-    assets_directory: The directory to which the assets should be written.
-    assets_filename: Name of the file to which the asset contents should be
-        written.
-
-  Returns:
-    The path to which the assets file was written.
-  """
-  if not file_io.file_exists(assets_directory):
-    file_io.recursive_create_dir(assets_directory)
-
-  path = os.path.join(
-      tf.compat.as_bytes(assets_directory), tf.compat.as_bytes(assets_filename))
-  file_io.write_string_to_file(path, "asset-file-contents")
-  return path
-
-
-def _build_regression_signature(input_tensor, output_tensor):
-  """Helper function for building a regression SignatureDef."""
-  input_tensor_info = tf.saved_model.utils.build_tensor_info(input_tensor)
-  signature_inputs = {
-      tf.saved_model.signature_constants.REGRESS_INPUTS: input_tensor_info
-  }
-  output_tensor_info = tf.saved_model.utils.build_tensor_info(output_tensor)
-  signature_outputs = {
-      tf.saved_model.signature_constants.REGRESS_OUTPUTS: output_tensor_info
-  }
-  return tf.saved_model.signature_def_utils.build_signature_def(
-      signature_inputs, signature_outputs,
-      tf.saved_model.signature_constants.REGRESS_METHOD_NAME)
-
-
-# Possibly extend this to allow passing in 'classes', but for now this is
-# sufficient for testing purposes.
-def _build_classification_signature(input_tensor, scores_tensor):
-  """Helper function for building a classification SignatureDef."""
-  input_tensor_info = tf.saved_model.utils.build_tensor_info(input_tensor)
-  signature_inputs = {
-      tf.saved_model.signature_constants.CLASSIFY_INPUTS: input_tensor_info
-  }
-  output_tensor_info = tf.saved_model.utils.build_tensor_info(scores_tensor)
-  signature_outputs = {
-      tf.saved_model.signature_constants.CLASSIFY_OUTPUT_SCORES:
-          output_tensor_info
-  }
-  return tf.saved_model.signature_def_utils.build_signature_def(
-      signature_inputs, signature_outputs,
-      tf.saved_model.signature_constants.CLASSIFY_METHOD_NAME)
-
-
-def _generate_saved_model_for_half_plus_two(export_dir,
-                                            as_text=False,
-                                            use_main_op=False,
-                                            device_type="cpu"):
-  """Generates SavedModel for half plus two.
-
-  Args:
-    export_dir: The directory to which the SavedModel should be written.
-    as_text: Writes the SavedModel protocol buffer in text format to disk.
-    use_main_op: Whether to supply a main op during SavedModel build time.
-    device_name: Device to force ops to run on.
-  """
-  builder = tf.saved_model.builder.SavedModelBuilder(export_dir)
-
-  device_name = "/cpu:0"
-  if device_type == "gpu":
-    device_name = "/gpu:0"
-
-  with tf.Session(
-      graph=tf.Graph(),
-      config=tf.ConfigProto(log_device_placement=True)) as sess:
-    with tf.device(device_name):
-      # Set up the model parameters as variables to exercise variable loading
-      # functionality upon restore.
-      a = tf.Variable(0.5, name="a")
-      b = tf.Variable(2.0, name="b")
-      c = tf.Variable(3.0, name="c")
-
-      # Create a placeholder for serialized tensorflow.Example messages to be
-      # fed.
-      serialized_tf_example = tf.placeholder(tf.string, name="tf_example")
-
-      # Parse the tensorflow.Example looking for a feature named "x" with a
-      # single floating point value.
-      feature_configs = {
-          "x": tf.FixedLenFeature([1], dtype=tf.float32),
-          "x2": tf.FixedLenFeature([1], dtype=tf.float32, default_value=[0.0])
-      }
-      # parse_example only works on CPU
-      with tf.device("/cpu:0"):
-        tf_example = tf.parse_example(serialized_tf_example, feature_configs)
-      # Use tf.identity() to assign name
-      x = tf.identity(tf_example["x"], name="x")
-      y = tf.add(tf.multiply(a, x), b)
-      y = tf.identity(y, name="y")
-      y2 = tf.add(tf.multiply(a, x), c)
-      y2 = tf.identity(y2, name="y2")
-
-      x2 = tf.identity(tf_example["x2"], name="x2")
-      y3 = tf.add(tf.multiply(a, x2), c)
-      y3 = tf.identity(y3, name="y3")
-
-    # Create an assets file that can be saved and restored as part of the
-    # SavedModel.
-    original_assets_directory = "/tmp/original/export/assets"
-    original_assets_filename = "foo.txt"
-    original_assets_filepath = _write_assets(original_assets_directory,
-                                             original_assets_filename)
-
-    # Set up the assets collection.
-    assets_filepath = tf.constant(original_assets_filepath)
-    tf.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, assets_filepath)
-    filename_tensor = tf.Variable(
-        original_assets_filename,
-        name="filename_tensor",
-        trainable=False,
-        collections=[])
-    assign_filename_op = filename_tensor.assign(original_assets_filename)
-
-    # Set up the signature for Predict with input and output tensor
-    # specification.
-    predict_input_tensor = tf.saved_model.utils.build_tensor_info(x)
-    predict_signature_inputs = {"x": predict_input_tensor}
-
-    predict_output_tensor = tf.saved_model.utils.build_tensor_info(y)
-    predict_signature_outputs = {"y": predict_output_tensor}
-    predict_signature_def = (
-        tf.saved_model.signature_def_utils.build_signature_def(
-            predict_signature_inputs, predict_signature_outputs,
-            tf.saved_model.signature_constants.PREDICT_METHOD_NAME))
-
-    signature_def_map = {
-        "regress_x_to_y":
-            _build_regression_signature(serialized_tf_example, y),
-        "regress_x_to_y2":
-            _build_regression_signature(serialized_tf_example, y2),
-        "regress_x2_to_y3":
-            _build_regression_signature(x2, y3),
-        "classify_x_to_y":
-            _build_classification_signature(serialized_tf_example, y),
-        tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
-            predict_signature_def
-    }
-    # Initialize all variables and then save the SavedModel.
-    sess.run(tf.global_variables_initializer())
-
-    if use_main_op:
-      builder.add_meta_graph_and_variables(
-          sess, [tf.saved_model.tag_constants.SERVING],
-          signature_def_map=signature_def_map,
-          assets_collection=tf.get_collection(tf.GraphKeys.ASSET_FILEPATHS),
-          main_op=tf.group(tf.saved_model.main_op.main_op(),
-                           assign_filename_op))
-    else:
-      builder.add_meta_graph_and_variables(
-          sess, [tf.saved_model.tag_constants.SERVING],
-          signature_def_map=signature_def_map,
-          assets_collection=tf.get_collection(tf.GraphKeys.ASSET_FILEPATHS),
-          main_op=tf.group(assign_filename_op))
-  builder.save(as_text)
-
-
-def main(_):
-  _generate_saved_model_for_half_plus_two(
-      FLAGS.output_dir, device_type=FLAGS.device)
-  print("SavedModel generated for %(device)s at: %(dir)s" % {
-      "device": FLAGS.device,
-      "dir": FLAGS.output_dir
-  })
-
-  _generate_saved_model_for_half_plus_two(
-      FLAGS.output_dir_pbtxt, as_text=True, device_type=FLAGS.device)
-  print("SavedModel generated for %(device)s at: %(dir)s" % {
-      "device": FLAGS.device,
-      "dir": FLAGS.output_dir_pbtxt
-  })
-
-  _generate_saved_model_for_half_plus_two(
-      FLAGS.output_dir_main_op, use_main_op=True, device_type=FLAGS.device)
-  print("SavedModel generated for %(device)s at: %(dir)s " % {
-      "device": FLAGS.device,
-      "dir": FLAGS.output_dir_main_op
-  })
-
-
-if __name__ == "__main__":
-  parser = argparse.ArgumentParser()
-  parser.add_argument(
-      "--output_dir",
-      type=str,
-      default="/tmp/saved_model_half_plus_two",
-      help="Directory where to output SavedModel.")
-  parser.add_argument(
-      "--output_dir_pbtxt",
-      type=str,
-      default="/tmp/saved_model_half_plus_two_pbtxt",
-      help="Directory where to output the text format of SavedModel.")
-  parser.add_argument(
-      "--output_dir_main_op",
-      type=str,
-      default="/tmp/saved_model_half_plus_two_main_op",
-      help="Directory where to output the SavedModel with a main op.")
-  parser.add_argument(
-      "--device",
-      type=str,
-      default="cpu",
-      help="Force model to run on 'cpu' or 'gpu'")
-  FLAGS, unparsed = parser.parse_known_args()
-  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py b/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py
index 7967e22d6a0319a530cb2f00e54872f022ac0095..1854e84d490d6c2ff462ee3bc3cc57b48c4d9328 100644
--- a/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py
+++ b/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py
@@ -183,7 +183,8 @@ def main(_):
   if tf.gfile.Exists(FLAGS.log_dir):
     tf.gfile.DeleteRecursively(FLAGS.log_dir)
   tf.gfile.MakeDirs(FLAGS.log_dir)
-  train()
+  with tf.Graph().as_default():
+    train()
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
index b09ee9976897fcab2e90fdc17e8030532080aca8..77889effc8e61210445d87976e4bbfbed2c62440 100644
--- a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
+++ b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
@@ -91,7 +91,7 @@ vocabulary_size = 50000
 
 def build_dataset(words, n_words):
   """Process raw inputs into a dataset."""
-  count = [['UNK', -1]]
+  count = [('UNK', -1)]
   count.extend(collections.Counter(words).most_common(n_words - 1))
   dictionary = dict()
   for word, _ in count:
diff --git a/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.cc b/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.cc
index 4a429837b7b997f0f6571060280a9a15543b9f54..464484dab830e73fbc11cc9a2bfd9310bac88653 100644
--- a/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.cc
+++ b/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.cc
@@ -33,7 +33,6 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 
 using tensorflow::DT_FLOAT;
 using tensorflow::DT_UINT8;
diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index ad979c58be82103e0501c1fd458a00f89ccbe337..4624d120618bddf71e24b9c4355cb21dfb0cfbdf 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -5645,620 +5645,543 @@ func MapSize(scope *Scope, dtypes []tf.DataType, optional ...MapSizeAttr) (size
 	return op.Output(0)
 }
 
-// Compute the regularized incomplete beta integral \\(I_x(a, b)\\).
-//
-// The regularized incomplete beta integral is defined as:
-//
-//
-// \\(I_x(a, b) = \frac{B(x; a, b)}{B(a, b)}\\)
-//
-// where
-//
-//
-// \\(B(x; a, b) = \int_0^x t^{a-1} (1 - t)^{b-1} dt\\)
-//
+// MapUnstageAttr is an optional argument to MapUnstage.
+type MapUnstageAttr func(optionalAttr)
+
+// MapUnstageCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// is the incomplete beta function and \\(B(a, b)\\) is the *complete*
-// beta function.
-func Betainc(scope *Scope, a tf.Output, b tf.Output, x tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
+// REQUIRES: value >= 0
+func MapUnstageCapacity(value int64) MapUnstageAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "Betainc",
-		Input: []tf.Input{
-			a, b, x,
-		},
+}
+
+// MapUnstageMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapUnstageMemoryLimit(value int64) MapUnstageAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Return a tensor with the same shape and contents as the input tensor or value.
-func Identity(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
+// MapUnstageContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapUnstageContainer(value string) MapUnstageAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "Identity",
-		Input: []tf.Input{
-			input,
-		},
+}
+
+// MapUnstageSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func MapUnstageSharedName(value string) MapUnstageAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes arctangent of `y/x` element-wise, respecting signs of the arguments.
+// Op removes and returns the values associated with the key
 //
-// This is the angle \( \theta \in [-\pi, \pi] \) such that
-// \[ x = r \cos(\theta) \]
-// and
-// \[ y = r \sin(\theta) \]
-// where \(r = \sqrt(x^2 + y^2) \).
-func Atan2(scope *Scope, y tf.Output, x tf.Output) (z tf.Output) {
+// from the underlying container.   If the underlying container
+// does not contain this key, the op will block until it does.
+func MapUnstage(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...MapUnstageAttr) (values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Atan2",
+		Type: "MapUnstage",
 		Input: []tf.Input{
-			y, x,
+			key, indices,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Creates a dataset that passes a sliding window over `input_dataset`.
-//
-// Arguments:
-//
-//	window_size: A scalar representing the number of elements in the
-// sliding window.
-//	window_shift: A scalar representing the steps moving the sliding window
-// forward in one iteration. It must be positive.
-//	window_stride: A scalar representing the stride of the input elements of the sliding window.
-// It must be positive.
-//
-//
-func SlideDataset(scope *Scope, input_dataset tf.Output, window_size tf.Output, window_shift tf.Output, window_stride tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "SlideDataset",
-		Input: []tf.Input{
-			input_dataset, window_size, window_shift, window_stride,
-		},
-		Attrs: attrs,
+	var idx int
+	var err error
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("MapUnstage", err)
+		return
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return values
 }
 
-// EditDistanceAttr is an optional argument to EditDistance.
-type EditDistanceAttr func(optionalAttr)
+// MapPeekAttr is an optional argument to MapPeek.
+type MapPeekAttr func(optionalAttr)
 
-// EditDistanceNormalize sets the optional normalize attribute to value.
-//
-// value: boolean (if true, edit distances are normalized by length of truth).
+// MapPeekCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// The output is:
-// If not specified, defaults to true
-func EditDistanceNormalize(value bool) EditDistanceAttr {
+// REQUIRES: value >= 0
+func MapPeekCapacity(value int64) MapPeekAttr {
 	return func(m optionalAttr) {
-		m["normalize"] = value
+		m["capacity"] = value
 	}
 }
 
-// Computes the (possibly normalized) Levenshtein Edit Distance.
-//
-// The inputs are variable-length sequences provided by SparseTensors
-//   (hypothesis_indices, hypothesis_values, hypothesis_shape)
-// and
-//   (truth_indices, truth_values, truth_shape).
-//
-// The inputs are:
-//
-// Arguments:
-//	hypothesis_indices: The indices of the hypothesis list SparseTensor.
-// This is an N x R int64 matrix.
-//	hypothesis_values: The values of the hypothesis list SparseTensor.
-// This is an N-length vector.
-//	hypothesis_shape: The shape of the hypothesis list SparseTensor.
-// This is an R-length vector.
-//	truth_indices: The indices of the truth list SparseTensor.
-// This is an M x R int64 matrix.
-//	truth_values: The values of the truth list SparseTensor.
-// This is an M-length vector.
-//	truth_shape: truth indices, vector.
-//
-// Returns A dense float tensor with rank R - 1.
-//
-// For the example input:
-//
-//     // hypothesis represents a 2x1 matrix with variable-length values:
-//     //   (0,0) = ["a"]
-//     //   (1,0) = ["b"]
-//     hypothesis_indices = [[0, 0, 0],
-//                           [1, 0, 0]]
-//     hypothesis_values = ["a", "b"]
-//     hypothesis_shape = [2, 1, 1]
-//
-//     // truth represents a 2x2 matrix with variable-length values:
-//     //   (0,0) = []
-//     //   (0,1) = ["a"]
-//     //   (1,0) = ["b", "c"]
-//     //   (1,1) = ["a"]
-//     truth_indices = [[0, 1, 0],
-//                      [1, 0, 0],
-//                      [1, 0, 1],
-//                      [1, 1, 0]]
-//     truth_values = ["a", "b", "c", "a"]
-//     truth_shape = [2, 2, 2]
-//     normalize = true
+// MapPeekMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// The output will be:
+// REQUIRES: value >= 0
+func MapPeekMemoryLimit(value int64) MapPeekAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// MapPeekContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapPeekContainer(value string) MapPeekAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MapPeekSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func MapPeekSharedName(value string) MapPeekAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op peeks at the values at the specified key.  If the
 //
-//     // output is a 2x2 matrix with edit distances normalized by truth lengths.
-//     output = [[inf, 1.0],  // (0,0): no truth, (0,1): no hypothesis
-//               [0.5, 1.0]]  // (1,0): addition, (1,1): no hypothesis
-func EditDistance(scope *Scope, hypothesis_indices tf.Output, hypothesis_values tf.Output, hypothesis_shape tf.Output, truth_indices tf.Output, truth_values tf.Output, truth_shape tf.Output, optional ...EditDistanceAttr) (output tf.Output) {
+// underlying container does not contain this key
+// this op will block until it does.
+func MapPeek(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...MapPeekAttr) (values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "EditDistance",
+		Type: "MapPeek",
 		Input: []tf.Input{
-			hypothesis_indices, hypothesis_values, hypothesis_shape, truth_indices, truth_values, truth_shape,
+			key, indices,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns 0 if x == 0, and x * log(y) otherwise, elementwise.
-func Xlogy(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "Xlogy",
-		Input: []tf.Input{
-			x, y,
-		},
+	var idx int
+	var err error
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("MapPeek", err)
+		return
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return values
 }
 
-// DepthwiseConv2dNativeBackpropInputAttr is an optional argument to DepthwiseConv2dNativeBackpropInput.
-type DepthwiseConv2dNativeBackpropInputAttr func(optionalAttr)
+// MapStageAttr is an optional argument to MapStage.
+type MapStageAttr func(optionalAttr)
 
-// DepthwiseConv2dNativeBackpropInputDataFormat sets the optional data_format attribute to value.
+// MapStageCapacity sets the optional capacity attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, height, width, channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, channels, height, width].
-// If not specified, defaults to "NHWC"
-func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dNativeBackpropInputAttr {
+// value: Maximum number of elements in the Staging Area. If > 0, inserts
+// on the container will block when the capacity is reached.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapStageCapacity(value int64) MapStageAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["capacity"] = value
 	}
 }
 
-// DepthwiseConv2dNativeBackpropInputDilations sets the optional dilations attribute to value.
+// MapStageMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
-// element on that dimension. The dimension order is determined by the value of
-// `data_format`, see above for details. Dilations in the batch and depth
-// dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr {
+// REQUIRES: value >= 0
+func MapStageMemoryLimit(value int64) MapStageAttr {
 	return func(m optionalAttr) {
-		m["dilations"] = value
+		m["memory_limit"] = value
 	}
 }
 
-// Computes the gradients of depthwise convolution with respect to the input.
-//
-// Arguments:
-//	input_sizes: An integer vector representing the shape of `input`, based
-// on `data_format`.  For example, if `data_format` is 'NHWC' then
-//  `input` is a 4-D `[batch, height, width, channels]` tensor.
-//	filter: 4-D with shape
-// `[filter_height, filter_width, in_channels, depthwise_multiplier]`.
-//	out_backprop: 4-D with shape  based on `data_format`.
-// For example, if `data_format` is 'NHWC' then
-// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
-// Gradients w.r.t. the output of the convolution.
-//	strides: The stride of the sliding window for each dimension of the input
-// of the convolution.
-//	padding: The type of padding algorithm to use.
+// MapStageContainer sets the optional container attribute to value.
 //
-// Returns 4-D with shape according to `data_format`.  For example, if
-// `data_format` is 'NHWC', output shape is `[batch, in_height,
-// in_width, in_channels]`.  Gradient w.r.t. the input of the
-// convolution.
-func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropInputAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "DepthwiseConv2dNativeBackpropInput",
-		Input: []tf.Input{
-			input_sizes, filter, out_backprop,
-		},
-		Attrs: attrs,
+// value: If non-empty, this queue is placed in the given container. Otherwise,
+// a default container is used.
+// If not specified, defaults to ""
+func MapStageContainer(value string) MapStageAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// ApproximateEqualAttr is an optional argument to ApproximateEqual.
-type ApproximateEqualAttr func(optionalAttr)
-
-// ApproximateEqualTolerance sets the optional tolerance attribute to value.
-// If not specified, defaults to 1e-05
-func ApproximateEqualTolerance(value float32) ApproximateEqualAttr {
+// MapStageSharedName sets the optional shared_name attribute to value.
+//
+// value: It is necessary to match this name to the matching Unstage Op.
+// If not specified, defaults to ""
+func MapStageSharedName(value string) MapStageAttr {
 	return func(m optionalAttr) {
-		m["tolerance"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Returns the truth value of abs(x-y) < tolerance element-wise.
-func ApproximateEqual(scope *Scope, x tf.Output, y tf.Output, optional ...ApproximateEqualAttr) (z tf.Output) {
+// Stage (key, values) in the underlying container which behaves like a hashtable.
+//
+// Arguments:
+//	key: int64
+//
+//	values: a list of tensors
+// dtypes A list of data types that inserted values should adhere to.
+//
+//
+// Returns the created operation.
+func MapStage(scope *Scope, key tf.Output, indices tf.Output, values []tf.Output, dtypes []tf.DataType, optional ...MapStageAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ApproximateEqual",
+		Type: "MapStage",
 		Input: []tf.Input{
-			x, y,
+			key, indices, tf.OutputList(values),
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Returns x / y element-wise.
+// StageClearAttr is an optional argument to StageClear.
+type StageClearAttr func(optionalAttr)
+
+// StageClearCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// *NOTE*: `Div` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Div(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Div",
-		Input: []tf.Input{
-			x, y,
-		},
+// REQUIRES: value >= 0
+func StageClearCapacity(value int64) StageClearAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Returns x * y element-wise.
+// StageClearMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// *NOTE*: `Multiply` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Mul(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Mul",
-		Input: []tf.Input{
-			x, y,
-		},
+// REQUIRES: value >= 0
+func StageClearMemoryLimit(value int64) StageClearAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// BiasAddAttr is an optional argument to BiasAdd.
-type BiasAddAttr func(optionalAttr)
+// StageClearContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func StageClearContainer(value string) StageClearAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
 
-// BiasAddDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the bias tensor will be added to the last dimension
-// of the value tensor.
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// The tensor will be added to "in_channels", the third-to-the-last
-//     dimension.
-// If not specified, defaults to "NHWC"
-func BiasAddDataFormat(value string) BiasAddAttr {
+// StageClearSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func StageClearSharedName(value string) StageClearAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Adds `bias` to `value`.
-//
-// This is a special case of `tf.add` where `bias` is restricted to be 1-D.
-// Broadcasting is supported, so `value` may have any number of dimensions.
-//
-// Arguments:
-//	value: Any number of dimensions.
-//	bias: 1-D with size the last dimension of `value`.
+// Op removes all elements in the underlying container.
 //
-// Returns Broadcasted sum of `value` and `bias`.
-func BiasAdd(scope *Scope, value tf.Output, bias tf.Output, optional ...BiasAddAttr) (output tf.Output) {
+// Returns the created operation.
+func StageClear(scope *Scope, dtypes []tf.DataType, optional ...StageClearAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "BiasAdd",
-		Input: []tf.Input{
-			value, bias,
-		},
+		Type: "StageClear",
+
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// SparseReduceSumSparseAttr is an optional argument to SparseReduceSumSparse.
-type SparseReduceSumSparseAttr func(optionalAttr)
+// StageSizeAttr is an optional argument to StageSize.
+type StageSizeAttr func(optionalAttr)
 
-// SparseReduceSumSparseKeepDims sets the optional keep_dims attribute to value.
+// StageSizeCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func SparseReduceSumSparseKeepDims(value bool) SparseReduceSumSparseAttr {
+// REQUIRES: value >= 0
+func StageSizeCapacity(value int64) StageSizeAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["capacity"] = value
 	}
 }
 
-// Computes the sum of elements across dimensions of a SparseTensor.
-//
-// This Op takes a SparseTensor and is the sparse counterpart to
-// `tf.reduce_sum()`.  In contrast to SparseReduceSum, this Op returns a
-// SparseTensor.
-//
-// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-// with length 1.
-//
-// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-// with a single element is returned.  Additionally, the axes can be negative,
-// which are interpreted according to the indexing rules in Python.
+// StageSizeMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
-//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
-func SparseReduceSumSparse(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumSparseAttr) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+// REQUIRES: value >= 0
+func StageSizeMemoryLimit(value int64) StageSizeAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// StageSizeContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func StageSizeContainer(value string) StageSizeAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// StageSizeSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func StageSizeSharedName(value string) StageSizeAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op returns the number of elements in the underlying container.
+func StageSize(scope *Scope, dtypes []tf.DataType, optional ...StageSizeAttr) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseReduceSumSparse",
-		Input: []tf.Input{
-			input_indices, input_values, input_shape, reduction_axes,
-		},
+		Type: "StageSize",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// AllCandidateSamplerAttr is an optional argument to AllCandidateSampler.
-type AllCandidateSamplerAttr func(optionalAttr)
+// StagePeekAttr is an optional argument to StagePeek.
+type StagePeekAttr func(optionalAttr)
 
-// AllCandidateSamplerSeed sets the optional seed attribute to value.
+// StagePeekCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func AllCandidateSamplerSeed(value int64) AllCandidateSamplerAttr {
+// REQUIRES: value >= 0
+func StagePeekCapacity(value int64) StagePeekAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["capacity"] = value
 	}
 }
 
-// AllCandidateSamplerSeed2 sets the optional seed2 attribute to value.
-//
-// value: An second seed to avoid seed collision.
+// StagePeekMemoryLimit sets the optional memory_limit attribute to value.
 // If not specified, defaults to 0
-func AllCandidateSamplerSeed2(value int64) AllCandidateSamplerAttr {
+//
+// REQUIRES: value >= 0
+func StagePeekMemoryLimit(value int64) StagePeekAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["memory_limit"] = value
 	}
 }
 
-// Generates labels for candidate sampling with a learned unigram distribution.
-//
-// See explanations of candidate sampling and the data formats at
-// go/candidate-sampling.
-//
-// For each batch, this op picks a single set of sampled candidate labels.
-//
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
-//
-// Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to produce.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
+// StagePeekContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func StagePeekContainer(value string) StagePeekAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// StagePeekSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func StagePeekSharedName(value string) StagePeekAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op peeks at the values at the specified index.  If the
 //
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func AllCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, optional ...AllCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+// underlying container does not contain sufficient elements
+// this op will block until it does.   This Op is optimized for
+// performance.
+func StagePeek(scope *Scope, index tf.Output, dtypes []tf.DataType, optional ...StagePeekAttr) (values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AllCandidateSampler",
+		Type: "StagePeek",
 		Input: []tf.Input{
-			true_classes,
+			index,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("StagePeek", err)
+		return
+	}
+	return values
 }
 
-// Returns x + y element-wise.
+// Compute the regularized incomplete beta integral \\(I_x(a, b)\\).
 //
-// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func AddV2(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// The regularized incomplete beta integral is defined as:
+//
+//
+// \\(I_x(a, b) = \frac{B(x; a, b)}{B(a, b)}\\)
+//
+// where
+//
+//
+// \\(B(x; a, b) = \int_0^x t^{a-1} (1 - t)^{b-1} dt\\)
+//
+//
+// is the incomplete beta function and \\(B(a, b)\\) is the *complete*
+// beta function.
+func Betainc(scope *Scope, a tf.Output, b tf.Output, x tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "AddV2",
+		Type: "Betainc",
 		Input: []tf.Input{
-			x, y,
+			a, b, x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns an element-wise indication of the sign of a number.
-//
-// `y = sign(x) = -1` if `x < 0`; 0 if `x == 0`; 1 if `x > 0`.
-//
-// For complex numbers, `y = sign(x) = x / |x|` if `x != 0`, otherwise `y = 0`.
-func Sign(scope *Scope, x tf.Output) (y tf.Output) {
+// Return a tensor with the same shape and contents as the input tensor or value.
+func Identity(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Sign",
+		Type: "Identity",
 		Input: []tf.Input{
-			x,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns which elements of x are finite.
+// Computes arctangent of `y/x` element-wise, respecting signs of the arguments.
 //
-// @compatibility(numpy)
-// Equivalent to np.isfinite
-// @end_compatibility
-func IsFinite(scope *Scope, x tf.Output) (y tf.Output) {
+// This is the angle \( \theta \in [-\pi, \pi] \) such that
+// \[ x = r \cos(\theta) \]
+// and
+// \[ y = r \sin(\theta) \]
+// where \(r = \sqrt(x^2 + y^2) \).
+func Atan2(scope *Scope, y tf.Output, x tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IsFinite",
+		Type: "Atan2",
 		Input: []tf.Input{
-			x,
+			y, x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceStridedSliceAssignAttr is an optional argument to ResourceStridedSliceAssign.
-type ResourceStridedSliceAssignAttr func(optionalAttr)
-
-// ResourceStridedSliceAssignBeginMask sets the optional begin_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignBeginMask(value int64) ResourceStridedSliceAssignAttr {
-	return func(m optionalAttr) {
-		m["begin_mask"] = value
-	}
-}
-
-// ResourceStridedSliceAssignEndMask sets the optional end_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignEndMask(value int64) ResourceStridedSliceAssignAttr {
-	return func(m optionalAttr) {
-		m["end_mask"] = value
-	}
-}
-
-// ResourceStridedSliceAssignEllipsisMask sets the optional ellipsis_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignEllipsisMask(value int64) ResourceStridedSliceAssignAttr {
-	return func(m optionalAttr) {
-		m["ellipsis_mask"] = value
-	}
-}
-
-// ResourceStridedSliceAssignNewAxisMask sets the optional new_axis_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignNewAxisMask(value int64) ResourceStridedSliceAssignAttr {
-	return func(m optionalAttr) {
-		m["new_axis_mask"] = value
-	}
-}
+// EditDistanceAttr is an optional argument to EditDistance.
+type EditDistanceAttr func(optionalAttr)
 
-// ResourceStridedSliceAssignShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignShrinkAxisMask(value int64) ResourceStridedSliceAssignAttr {
+// EditDistanceNormalize sets the optional normalize attribute to value.
+//
+// value: boolean (if true, edit distances are normalized by length of truth).
+//
+// The output is:
+// If not specified, defaults to true
+func EditDistanceNormalize(value bool) EditDistanceAttr {
 	return func(m optionalAttr) {
-		m["shrink_axis_mask"] = value
+		m["normalize"] = value
 	}
 }
 
-// Assign `value` to the sliced l-value reference of `ref`.
+// Computes the (possibly normalized) Levenshtein Edit Distance.
 //
-// The values of `value` are assigned to the positions in the variable
-// `ref` that are selected by the slice parameters. The slice parameters
-// `begin, `end`, `strides`, etc. work exactly as in `StridedSlice`.
+// The inputs are variable-length sequences provided by SparseTensors
+//   (hypothesis_indices, hypothesis_values, hypothesis_shape)
+// and
+//   (truth_indices, truth_values, truth_shape).
 //
-// NOTE this op currently does not support broadcasting and so `value`'s
-// shape must be exactly the shape produced by the slice of `ref`.
+// The inputs are:
 //
-// Returns the created operation.
-func ResourceStridedSliceAssign(scope *Scope, ref tf.Output, begin tf.Output, end tf.Output, strides tf.Output, value tf.Output, optional ...ResourceStridedSliceAssignAttr) (o *tf.Operation) {
+// Arguments:
+//	hypothesis_indices: The indices of the hypothesis list SparseTensor.
+// This is an N x R int64 matrix.
+//	hypothesis_values: The values of the hypothesis list SparseTensor.
+// This is an N-length vector.
+//	hypothesis_shape: The shape of the hypothesis list SparseTensor.
+// This is an R-length vector.
+//	truth_indices: The indices of the truth list SparseTensor.
+// This is an M x R int64 matrix.
+//	truth_values: The values of the truth list SparseTensor.
+// This is an M-length vector.
+//	truth_shape: truth indices, vector.
+//
+// Returns A dense float tensor with rank R - 1.
+//
+// For the example input:
+//
+//     // hypothesis represents a 2x1 matrix with variable-length values:
+//     //   (0,0) = ["a"]
+//     //   (1,0) = ["b"]
+//     hypothesis_indices = [[0, 0, 0],
+//                           [1, 0, 0]]
+//     hypothesis_values = ["a", "b"]
+//     hypothesis_shape = [2, 1, 1]
+//
+//     // truth represents a 2x2 matrix with variable-length values:
+//     //   (0,0) = []
+//     //   (0,1) = ["a"]
+//     //   (1,0) = ["b", "c"]
+//     //   (1,1) = ["a"]
+//     truth_indices = [[0, 1, 0],
+//                      [1, 0, 0],
+//                      [1, 0, 1],
+//                      [1, 1, 0]]
+//     truth_values = ["a", "b", "c", "a"]
+//     truth_shape = [2, 2, 2]
+//     normalize = true
+//
+// The output will be:
+//
+//     // output is a 2x2 matrix with edit distances normalized by truth lengths.
+//     output = [[inf, 1.0],  // (0,0): no truth, (0,1): no hypothesis
+//               [0.5, 1.0]]  // (1,0): addition, (1,1): no hypothesis
+func EditDistance(scope *Scope, hypothesis_indices tf.Output, hypothesis_values tf.Output, hypothesis_shape tf.Output, truth_indices tf.Output, truth_values tf.Output, truth_shape tf.Output, optional ...EditDistanceAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -6267,94 +6190,94 @@ func ResourceStridedSliceAssign(scope *Scope, ref tf.Output, begin tf.Output, en
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceStridedSliceAssign",
+		Type: "EditDistance",
 		Input: []tf.Input{
-			ref, begin, end, strides, value,
+			hypothesis_indices, hypothesis_values, hypothesis_shape, truth_indices, truth_values, truth_shape,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// ArgMaxAttr is an optional argument to ArgMax.
-type ArgMaxAttr func(optionalAttr)
-
-// ArgMaxOutputType sets the optional output_type attribute to value.
-// If not specified, defaults to DT_INT64
-func ArgMaxOutputType(value tf.DataType) ArgMaxAttr {
-	return func(m optionalAttr) {
-		m["output_type"] = value
-	}
-}
-
-// Returns the index with the largest value across dimensions of a tensor.
-//
-// Note that in case of ties the identity of the return value is not guaranteed.
-//
-// Arguments:
-//
-//	dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
-// Describes which dimension of the input Tensor to reduce across. For vectors,
-// use dimension = 0.
-func ArgMax(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMaxAttr) (output tf.Output) {
+// Returns 0 if x == 0, and x * log(y) otherwise, elementwise.
+func Xlogy(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ArgMax",
+		Type: "Xlogy",
 		Input: []tf.Input{
-			input, dimension,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// PreventGradientAttr is an optional argument to PreventGradient.
-type PreventGradientAttr func(optionalAttr)
+// DepthwiseConv2dNativeBackpropInputAttr is an optional argument to DepthwiseConv2dNativeBackpropInput.
+type DepthwiseConv2dNativeBackpropInputAttr func(optionalAttr)
 
-// PreventGradientMessage sets the optional message attribute to value.
+// DepthwiseConv2dNativeBackpropInputDataFormat sets the optional data_format attribute to value.
 //
-// value: Will be printed in the error when anyone tries to differentiate
-// this operation.
-// If not specified, defaults to ""
-func PreventGradientMessage(value string) PreventGradientAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, height, width, channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, channels, height, width].
+// If not specified, defaults to "NHWC"
+func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dNativeBackpropInputAttr {
 	return func(m optionalAttr) {
-		m["message"] = value
+		m["data_format"] = value
 	}
 }
 
-// An identity op that triggers an error if a gradient is requested.
-//
-// When executed in a graph, this op outputs its input tensor as-is.
+// DepthwiseConv2dNativeBackpropInputDilations sets the optional dilations attribute to value.
 //
-// When building ops to compute gradients, the TensorFlow gradient system
-// will return an error when trying to lookup the gradient of this op,
-// because no gradient must ever be registered for this function.  This
-// op exists to prevent subtle bugs from silently returning unimplemented
-// gradients in some corner cases.
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes the gradients of depthwise convolution with respect to the input.
 //
 // Arguments:
-//	input: any tensor.
+//	input_sizes: An integer vector representing the shape of `input`, based
+// on `data_format`.  For example, if `data_format` is 'NHWC' then
+//  `input` is a 4-D `[batch, height, width, channels]` tensor.
+//	filter: 4-D with shape
+// `[filter_height, filter_width, in_channels, depthwise_multiplier]`.
+//	out_backprop: 4-D with shape  based on `data_format`.
+// For example, if `data_format` is 'NHWC' then
+// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
+// Gradients w.r.t. the output of the convolution.
+//	strides: The stride of the sliding window for each dimension of the input
+// of the convolution.
+//	padding: The type of padding algorithm to use.
 //
-// Returns the same input tensor.
-func PreventGradient(scope *Scope, input tf.Output, optional ...PreventGradientAttr) (output tf.Output) {
+// Returns 4-D with shape according to `data_format`.  For example, if
+// `data_format` is 'NHWC', output shape is `[batch, in_height,
+// in_width, in_channels]`.  Gradient w.r.t. the input of the
+// convolution.
+func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropInputAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "PreventGradient",
+		Type: "DepthwiseConv2dNativeBackpropInput",
 		Input: []tf.Input{
-			input,
+			input_sizes, filter, out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -6362,246 +6285,263 @@ func PreventGradient(scope *Scope, input tf.Output, optional ...PreventGradientA
 	return op.Output(0)
 }
 
-// Computes asin of x element-wise.
-func Asin(scope *Scope, x tf.Output) (y tf.Output) {
+// Returns x / y element-wise.
+//
+// *NOTE*: `Div` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Div(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Asin",
+		Type: "Div",
 		Input: []tf.Input{
-			x,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the sum along sparse segments of a tensor.
-//
-// Like `SparseSegmentSum`, but allows missing ids in `segment_ids`. If an id is
-// misisng, the `output` tensor at that position will be zeroed.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
-//
-// For example:
-//
-// ```python
-// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
-//
-// tf.sparse_segment_sum_with_num_segments(
-//     c, tf.constant([0, 1]), tf.constant([0, 0]), num_segments=3)
-// # => [[0 0 0 0]
-// #     [0 0 0 0]
-// #     [0 0 0 0]]
-//
-// tf.sparse_segment_sum_with_num_segments(c,
-//                                         tf.constant([0, 1]),
-//                                         tf.constant([0, 2],
-//                                         num_segments=4))
-// # => [[ 1  2  3  4]
-// #     [ 0  0  0  0]
-// #     [-1 -2 -3 -4]
-// #     [ 0  0  0  0]]
-// ```
-//
-// Arguments:
-//
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
-//	num_segments: Should equal the number of distinct segment IDs.
+// Returns x * y element-wise.
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `num_segments`.
-func SparseSegmentSumWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+// *NOTE*: `Multiply` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Mul(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentSumWithNumSegments",
+		Type: "Mul",
 		Input: []tf.Input{
-			data, indices, segment_ids, num_segments,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the determinant of one or more square matrices.
+// BiasAddAttr is an optional argument to BiasAdd.
+type BiasAddAttr func(optionalAttr)
+
+// BiasAddDataFormat sets the optional data_format attribute to value.
 //
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. The output is a tensor containing the determinants
-// for all input submatrices `[..., :, :]`.
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the bias tensor will be added to the last dimension
+// of the value tensor.
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// The tensor will be added to "in_channels", the third-to-the-last
+//     dimension.
+// If not specified, defaults to "NHWC"
+func BiasAddDataFormat(value string) BiasAddAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Adds `bias` to `value`.
+//
+// This is a special case of `tf.add` where `bias` is restricted to be 1-D.
+// Broadcasting is supported, so `value` may have any number of dimensions.
 //
 // Arguments:
-//	input: Shape is `[..., M, M]`.
+//	value: Any number of dimensions.
+//	bias: 1-D with size the last dimension of `value`.
 //
-// Returns Shape is `[...]`.
-func MatrixDeterminant(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns Broadcasted sum of `value` and `bias`.
+func BiasAdd(scope *Scope, value tf.Output, bias tf.Output, optional ...BiasAddAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "MatrixDeterminant",
+		Type: "BiasAdd",
 		Input: []tf.Input{
-			input,
+			value, bias,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes sin of x element-wise.
-func Sin(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Sin",
-		Input: []tf.Input{
-			x,
-		},
+// SparseReduceSumSparseAttr is an optional argument to SparseReduceSumSparse.
+type SparseReduceSumSparseAttr func(optionalAttr)
+
+// SparseReduceSumSparseKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SparseReduceSumSparseKeepDims(value bool) SparseReduceSumSparseAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes Psi, the derivative of Lgamma (the log of the absolute value of
+// Computes the sum of elements across dimensions of a SparseTensor.
 //
-// `Gamma(x)`), element-wise.
-func Digamma(scope *Scope, x tf.Output) (y tf.Output) {
+// This Op takes a SparseTensor and is the sparse counterpart to
+// `tf.reduce_sum()`.  In contrast to SparseReduceSum, this Op returns a
+// SparseTensor.
+//
+// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+// with length 1.
+//
+// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+// with a single element is returned.  Additionally, the axes can be negative,
+// which are interpreted according to the indexing rules in Python.
+//
+// Arguments:
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+func SparseReduceSumSparse(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumSparseAttr) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Digamma",
+		Type: "SparseReduceSumSparse",
 		Input: []tf.Input{
-			x,
+			input_indices, input_values, input_shape, reduction_axes,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Conv2DBackpropFilterAttr is an optional argument to Conv2DBackpropFilter.
-type Conv2DBackpropFilterAttr func(optionalAttr)
+// AllCandidateSamplerAttr is an optional argument to AllCandidateSampler.
+type AllCandidateSamplerAttr func(optionalAttr)
 
-// Conv2DBackpropFilterUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
-// If not specified, defaults to true
-func Conv2DBackpropFilterUseCudnnOnGpu(value bool) Conv2DBackpropFilterAttr {
-	return func(m optionalAttr) {
-		m["use_cudnn_on_gpu"] = value
-	}
-}
-
-// Conv2DBackpropFilterDataFormat sets the optional data_format attribute to value.
+// AllCandidateSamplerSeed sets the optional seed attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr {
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func AllCandidateSamplerSeed(value int64) AllCandidateSamplerAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["seed"] = value
 	}
 }
 
-// Conv2DBackpropFilterDilations sets the optional dilations attribute to value.
+// AllCandidateSamplerSeed2 sets the optional seed2 attribute to value.
 //
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
-// element on that dimension. The dimension order is determined by the value of
-// `data_format`, see above for details. Dilations in the batch and depth
-// dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr {
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func AllCandidateSamplerSeed2(value int64) AllCandidateSamplerAttr {
 	return func(m optionalAttr) {
-		m["dilations"] = value
+		m["seed2"] = value
 	}
 }
 
-// Computes the gradients of convolution with respect to the filter.
+// Generates labels for candidate sampling with a learned unigram distribution.
+//
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
+//
+// For each batch, this op picks a single set of sampled candidate labels.
+//
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
 //
 // Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
-//	filter_sizes: An integer vector representing the tensor shape of `filter`,
-// where `filter` is a 4-D
-// `[filter_height, filter_width, in_channels, out_channels]` tensor.
-//	out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`.
-// Gradients w.r.t. the output of the convolution.
-//	strides: The stride of the sliding window for each dimension of the input
-// of the convolution. Must be in the same order as the dimension specified with
-// format.
-//	padding: The type of padding algorithm to use.
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to produce.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
 //
-// Returns 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
-// the `filter` input of the convolution.
-func Conv2DBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv2DBackpropFilterAttr) (output tf.Output) {
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func AllCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, optional ...AllCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv2DBackpropFilter",
+		Type: "AllCandidateSampler",
 		Input: []tf.Input{
-			input, filter_sizes, out_backprop,
+			true_classes,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Returns the number of work units this Reader has finished processing.
+// Returns x + y element-wise.
 //
-// Arguments:
-//	reader_handle: Handle to a Reader.
-func ReaderNumWorkUnitsCompletedV2(scope *Scope, reader_handle tf.Output) (units_completed tf.Output) {
+// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func AddV2(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReaderNumWorkUnitsCompletedV2",
+		Type: "AddV2",
 		Input: []tf.Input{
-			reader_handle,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a dataset that contains the elements of `input_dataset` ignoring errors.
-func ExperimentalIgnoreErrorsDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns an element-wise indication of the sign of a number.
+//
+// `y = sign(x) = -1` if `x < 0`; 0 if `x == 0`; 1 if `x > 0`.
+//
+// For complex numbers, `y = sign(x) = x / |x|` if `x != 0`, otherwise `y = 0`.
+func Sign(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ExperimentalIgnoreErrorsDataset",
+		Type: "Sign",
 		Input: []tf.Input{
-			input_dataset,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the log of the absolute value of `Gamma(x)` element-wise.
-func Lgamma(scope *Scope, x tf.Output) (y tf.Output) {
+// Returns which elements of x are finite.
+//
+// @compatibility(numpy)
+// Equivalent to np.isfinite
+// @end_compatibility
+func IsFinite(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Lgamma",
+		Type: "IsFinite",
 		Input: []tf.Input{
 			x,
 		},
@@ -6610,163 +6550,170 @@ func Lgamma(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// Computes the reverse mode backpropagated gradient of the Cholesky algorithm.
+// ResourceStridedSliceAssignAttr is an optional argument to ResourceStridedSliceAssign.
+type ResourceStridedSliceAssignAttr func(optionalAttr)
+
+// ResourceStridedSliceAssignBeginMask sets the optional begin_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignBeginMask(value int64) ResourceStridedSliceAssignAttr {
+	return func(m optionalAttr) {
+		m["begin_mask"] = value
+	}
+}
+
+// ResourceStridedSliceAssignEndMask sets the optional end_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignEndMask(value int64) ResourceStridedSliceAssignAttr {
+	return func(m optionalAttr) {
+		m["end_mask"] = value
+	}
+}
+
+// ResourceStridedSliceAssignEllipsisMask sets the optional ellipsis_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignEllipsisMask(value int64) ResourceStridedSliceAssignAttr {
+	return func(m optionalAttr) {
+		m["ellipsis_mask"] = value
+	}
+}
+
+// ResourceStridedSliceAssignNewAxisMask sets the optional new_axis_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignNewAxisMask(value int64) ResourceStridedSliceAssignAttr {
+	return func(m optionalAttr) {
+		m["new_axis_mask"] = value
+	}
+}
+
+// ResourceStridedSliceAssignShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignShrinkAxisMask(value int64) ResourceStridedSliceAssignAttr {
+	return func(m optionalAttr) {
+		m["shrink_axis_mask"] = value
+	}
+}
+
+// Assign `value` to the sliced l-value reference of `ref`.
 //
-// For an explanation see "Differentiation of the Cholesky algorithm" by
-// Iain Murray http://arxiv.org/abs/1602.07527.
+// The values of `value` are assigned to the positions in the variable
+// `ref` that are selected by the slice parameters. The slice parameters
+// `begin, `end`, `strides`, etc. work exactly as in `StridedSlice`.
 //
-// Arguments:
-//	l: Output of batch Cholesky algorithm l = cholesky(A). Shape is `[..., M, M]`.
-// Algorithm depends only on lower triangular part of the innermost matrices of
-// this tensor.
-//	grad: df/dl where f is some scalar function. Shape is `[..., M, M]`.
-// Algorithm depends only on lower triangular part of the innermost matrices of
-// this tensor.
+// NOTE this op currently does not support broadcasting and so `value`'s
+// shape must be exactly the shape produced by the slice of `ref`.
 //
-// Returns Symmetrized version of df/dA . Shape is `[..., M, M]`
-func CholeskyGrad(scope *Scope, l tf.Output, grad tf.Output) (output tf.Output) {
+// Returns the created operation.
+func ResourceStridedSliceAssign(scope *Scope, ref tf.Output, begin tf.Output, end tf.Output, strides tf.Output, value tf.Output, optional ...ResourceStridedSliceAssignAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "CholeskyGrad",
+		Type: "ResourceStridedSliceAssign",
 		Input: []tf.Input{
-			l, grad,
+			ref, begin, end, strides, value,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Creates a dataset that emits each dim-0 slice of `components` once.
-func TensorSliceDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "TensorSliceDataset",
-		Input: []tf.Input{
-			tf.OutputList(components),
-		},
-		Attrs: attrs,
+// ArgMaxAttr is an optional argument to ArgMax.
+type ArgMaxAttr func(optionalAttr)
+
+// ArgMaxOutputType sets the optional output_type attribute to value.
+// If not specified, defaults to DT_INT64
+func ArgMaxOutputType(value tf.DataType) ArgMaxAttr {
+	return func(m optionalAttr) {
+		m["output_type"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes hyperbolic sine of x element-wise.
-func Sinh(scope *Scope, x tf.Output) (y tf.Output) {
+// Returns the index with the largest value across dimensions of a tensor.
+//
+// Note that in case of ties the identity of the return value is not guaranteed.
+//
+// Arguments:
+//
+//	dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
+// Describes which dimension of the input Tensor to reduce across. For vectors,
+// use dimension = 0.
+func ArgMax(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMaxAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Sinh",
+		Type: "ArgMax",
 		Input: []tf.Input{
-			x,
+			input, dimension,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the sum along sparse segments of a tensor.
+// PreventGradientAttr is an optional argument to PreventGradient.
+type PreventGradientAttr func(optionalAttr)
+
+// PreventGradientMessage sets the optional message attribute to value.
 //
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
+// value: Will be printed in the error when anyone tries to differentiate
+// this operation.
+// If not specified, defaults to ""
+func PreventGradientMessage(value string) PreventGradientAttr {
+	return func(m optionalAttr) {
+		m["message"] = value
+	}
+}
+
+// An identity op that triggers an error if a gradient is requested.
 //
-// Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first
-// dimension, selecting a subset of dimension 0, specified by `indices`.
+// When executed in a graph, this op outputs its input tensor as-is.
 //
-// For example:
-//
-// ```python
-// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
-//
-// # Select two rows, one segment.
-// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 0]))
-// # => [[0 0 0 0]]
-//
-// # Select two rows, two segment.
-// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 1]))
-// # => [[ 1  2  3  4]
-// #     [-1 -2 -3 -4]]
-//
-// # Select all rows, two segments.
-// tf.sparse_segment_sum(c, tf.constant([0, 1, 2]), tf.constant([0, 0, 1]))
-// # => [[0 0 0 0]
-// #     [5 6 7 8]]
-//
-// # Which is equivalent to:
-// tf.segment_sum(c, tf.constant([0, 0, 1]))
-// ```
+// When building ops to compute gradients, the TensorFlow gradient system
+// will return an error when trying to lookup the gradient of this op,
+// because no gradient must ever be registered for this function.  This
+// op exists to prevent subtle bugs from silently returning unimplemented
+// gradients in some corner cases.
 //
 // Arguments:
+//	input: any tensor.
 //
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SparseSegmentSum(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseSegmentSum",
-		Input: []tf.Input{
-			data, indices, segment_ids,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes natural logarithm of x element-wise.
-//
-// I.e., \\(y = \log_e x\\).
-func Log(scope *Scope, x tf.Output) (y tf.Output) {
+// Returns the same input tensor.
+func PreventGradient(scope *Scope, input tf.Output, optional ...PreventGradientAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "Log",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Rounds the values of a tensor to the nearest integer, element-wise.
-//
-// Rounds half to even.  Also known as bankers rounding. If you want to round
-// according to the current system rounding mode use std::cint.
-func Round(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Round",
+		Type: "PreventGradient",
 		Input: []tf.Input{
-			x,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes reciprocal of square root of x element-wise.
-//
-// I.e., \\(y = 1 / \sqrt{x}\\).
-func Rsqrt(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes asin of x element-wise.
+func Asin(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Rsqrt",
+		Type: "Asin",
 		Input: []tf.Input{
 			x,
 		},
@@ -6775,40 +6722,53 @@ func Rsqrt(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// MatrixInverseAttr is an optional argument to MatrixInverse.
-type MatrixInverseAttr func(optionalAttr)
+// SparseToDenseAttr is an optional argument to SparseToDense.
+type SparseToDenseAttr func(optionalAttr)
 
-// MatrixInverseAdjoint sets the optional adjoint attribute to value.
-// If not specified, defaults to false
-func MatrixInverseAdjoint(value bool) MatrixInverseAttr {
+// SparseToDenseValidateIndices sets the optional validate_indices attribute to value.
+//
+// value: If true, indices are checked to make sure they are sorted in
+// lexicographic order and that there are no repeats.
+// If not specified, defaults to true
+func SparseToDenseValidateIndices(value bool) SparseToDenseAttr {
 	return func(m optionalAttr) {
-		m["adjoint"] = value
+		m["validate_indices"] = value
 	}
 }
 
-// Computes the inverse of one or more square invertible matrices or their
+// Converts a sparse representation into a dense tensor.
 //
-// adjoints (conjugate transposes).
+// Builds an array `dense` with shape `output_shape` such that
 //
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. The output is a tensor of the same shape as the input
-// containing the inverse for all input submatrices `[..., :, :]`.
+// ```
+// # If sparse_indices is scalar
+// dense[i] = (i == sparse_indices ? sparse_values : default_value)
 //
-// The op uses LU decomposition with partial pivoting to compute the inverses.
+// # If sparse_indices is a vector, then for each i
+// dense[sparse_indices[i]] = sparse_values[i]
 //
-// If a matrix is not invertible there is no guarantee what the op does. It
-// may detect the condition and raise an exception or it may simply return a
-// garbage result.
+// # If sparse_indices is an n by d matrix, then for each i in [0, n)
+// dense[sparse_indices[i][0], ..., sparse_indices[i][d-1]] = sparse_values[i]
+// ```
 //
-// Arguments:
-//	input: Shape is `[..., M, M]`.
+// All other values in `dense` are set to `default_value`.  If `sparse_values` is a
+// scalar, all sparse indices are set to this single value.
 //
-// Returns Shape is `[..., M, M]`.
+// Indices should be sorted in lexicographic order, and indices must not
+// contain any repeats. If `validate_indices` is true, these properties
+// are checked during execution.
 //
-// @compatibility(numpy)
-// Equivalent to np.linalg.inv
-// @end_compatibility
-func MatrixInverse(scope *Scope, input tf.Output, optional ...MatrixInverseAttr) (output tf.Output) {
+// Arguments:
+//	sparse_indices: 0-D, 1-D, or 2-D.  `sparse_indices[i]` contains the complete
+// index where `sparse_values[i]` will be placed.
+//	output_shape: 1-D.  Shape of the dense output tensor.
+//	sparse_values: 1-D.  Values corresponding to each row of `sparse_indices`,
+// or a scalar value to be used for all sparse indices.
+//	default_value: Scalar value to set for indices not specified in
+// `sparse_indices`.
+//
+// Returns Dense output tensor of shape `output_shape`.
+func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Output, sparse_values tf.Output, default_value tf.Output, optional ...SparseToDenseAttr) (dense tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -6817,9 +6777,9 @@ func MatrixInverse(scope *Scope, input tf.Output, optional ...MatrixInverseAttr)
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixInverse",
+		Type: "SparseToDense",
 		Input: []tf.Input{
-			input,
+			sparse_indices, output_shape, sparse_values, default_value,
 		},
 		Attrs: attrs,
 	}
@@ -6827,83 +6787,106 @@ func MatrixInverse(scope *Scope, input tf.Output, optional ...MatrixInverseAttr)
 	return op.Output(0)
 }
 
-// Returns x + y element-wise.
+// Computes the sum along sparse segments of a tensor.
 //
-// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Add(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Add",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the derivative of a Gamma random sample w.r.t. `alpha`.
-func RandomGammaGrad(scope *Scope, alpha tf.Output, sample tf.Output) (output tf.Output) {
+// Like `SparseSegmentSum`, but allows missing ids in `segment_ids`. If an id is
+// misisng, the `output` tensor at that position will be zeroed.
+//
+// Read
+// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+// for an explanation of segments.
+//
+// For example:
+//
+// ```python
+// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
+//
+// tf.sparse_segment_sum_with_num_segments(
+//     c, tf.constant([0, 1]), tf.constant([0, 0]), num_segments=3)
+// # => [[0 0 0 0]
+// #     [0 0 0 0]
+// #     [0 0 0 0]]
+//
+// tf.sparse_segment_sum_with_num_segments(c,
+//                                         tf.constant([0, 1]),
+//                                         tf.constant([0, 2],
+//                                         num_segments=4))
+// # => [[ 1  2  3  4]
+// #     [ 0  0  0  0]
+// #     [-1 -2 -3 -4]
+// #     [ 0  0  0  0]]
+// ```
+//
+// Arguments:
+//
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//	num_segments: Should equal the number of distinct segment IDs.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `num_segments`.
+func SparseSegmentSumWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomGammaGrad",
+		Type: "SparseSegmentSumWithNumSegments",
 		Input: []tf.Input{
-			alpha, sample,
+			data, indices, segment_ids, num_segments,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes square of x element-wise.
+// Computes the determinant of one or more square matrices.
 //
-// I.e., \\(y = x * x = x^2\\).
-func Square(scope *Scope, x tf.Output) (y tf.Output) {
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. The output is a tensor containing the determinants
+// for all input submatrices `[..., :, :]`.
+//
+// Arguments:
+//	input: Shape is `[..., M, M]`.
+//
+// Returns Shape is `[...]`.
+func MatrixDeterminant(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Square",
+		Type: "MatrixDeterminant",
 		Input: []tf.Input{
-			x,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes exponential linear: `exp(features) - 1` if < 0, `features` otherwise.
-//
-// See [Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)
-// ](http://arxiv.org/abs/1511.07289)
-func Elu(scope *Scope, features tf.Output) (activations tf.Output) {
+// Computes sin of x element-wise.
+func Sin(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Elu",
+		Type: "Sin",
 		Input: []tf.Input{
-			features,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the reciprocal of x element-wise.
+// Computes Psi, the derivative of Lgamma (the log of the absolute value of
 //
-// I.e., \\(y = 1 / x\\).
-func Reciprocal(scope *Scope, x tf.Output) (y tf.Output) {
+// `Gamma(x)`), element-wise.
+func Digamma(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Reciprocal",
+		Type: "Digamma",
 		Input: []tf.Input{
 			x,
 		},
@@ -6912,54 +6895,74 @@ func Reciprocal(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// ParseExampleDatasetAttr is an optional argument to ParseExampleDataset.
-type ParseExampleDatasetAttr func(optionalAttr)
+// Conv2DBackpropFilterAttr is an optional argument to Conv2DBackpropFilter.
+type Conv2DBackpropFilterAttr func(optionalAttr)
 
-// ParseExampleDatasetSloppy sets the optional sloppy attribute to value.
-// If not specified, defaults to false
-func ParseExampleDatasetSloppy(value bool) ParseExampleDatasetAttr {
+// Conv2DBackpropFilterUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
+// If not specified, defaults to true
+func Conv2DBackpropFilterUseCudnnOnGpu(value bool) Conv2DBackpropFilterAttr {
 	return func(m optionalAttr) {
-		m["sloppy"] = value
+		m["use_cudnn_on_gpu"] = value
 	}
 }
 
-// Transforms `input_dataset` containing `Example` protos as vectors of DT_STRING into a dataset of `Tensor` or `SparseTensor` objects representing the parsed features.
-//
-// Arguments:
-//
+// Conv2DBackpropFilterDataFormat sets the optional data_format attribute to value.
 //
-//	dense_defaults: A dict mapping string keys to `Tensor`s.
-// The keys of the dict must match the dense_keys of the feature.
-//	sparse_keys: A list of string keys in the examples features.
-// The results for these keys will be returned as `SparseTensor` objects.
-//	dense_keys: A list of Ndense string Tensors (scalars).
-// The keys expected in the Examples features associated with dense values.
-//	sparse_types: A list of `DTypes` of the same length as `sparse_keys`.
-// Only `tf.float32` (`FloatList`), `tf.int64` (`Int64List`),
-// and `tf.string` (`BytesList`) are supported.
-//	dense_shapes: List of tuples with the same length as `dense_keys`.
-// The shape of the data for each dense feature referenced by `dense_keys`.
-// Required for any input tensors identified by `dense_keys`.  Must be
-// either fully defined, or may contain an unknown first dimension.
-// An unknown first dimension means the feature is treated as having
-// a variable number of blocks, and the output shape along this dimension
-// is considered unknown at graph build time.  Padding is applied for
-// minibatch elements smaller than the maximum number of blocks for the
-// given feature along this dimension.
-//	output_types: The type list for the return values.
-//	output_shapes: The list of shapes being produced.
-func ParseExampleDataset(scope *Scope, input_dataset tf.Output, num_parallel_calls tf.Output, dense_defaults []tf.Output, sparse_keys []string, dense_keys []string, sparse_types []tf.DataType, dense_shapes []tf.Shape, output_types []tf.DataType, output_shapes []tf.Shape, optional ...ParseExampleDatasetAttr) (handle tf.Output) {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Conv2DBackpropFilterDilations sets the optional dilations attribute to value.
+//
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes the gradients of convolution with respect to the filter.
+//
+// Arguments:
+//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
+//	filter_sizes: An integer vector representing the tensor shape of `filter`,
+// where `filter` is a 4-D
+// `[filter_height, filter_width, in_channels, out_channels]` tensor.
+//	out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`.
+// Gradients w.r.t. the output of the convolution.
+//	strides: The stride of the sliding window for each dimension of the input
+// of the convolution. Must be in the same order as the dimension specified with
+// format.
+//	padding: The type of padding algorithm to use.
+//
+// Returns 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
+// the `filter` input of the convolution.
+func Conv2DBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv2DBackpropFilterAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"sparse_keys": sparse_keys, "dense_keys": dense_keys, "sparse_types": sparse_types, "dense_shapes": dense_shapes, "output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ParseExampleDataset",
+		Type: "Conv2DBackpropFilter",
 		Input: []tf.Input{
-			input_dataset, num_parallel_calls, tf.OutputList(dense_defaults),
+			input, filter_sizes, out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -6967,77 +6970,48 @@ func ParseExampleDataset(scope *Scope, input_dataset tf.Output, num_parallel_cal
 	return op.Output(0)
 }
 
-// Returns a batched matrix tensor with new batched diagonal values.
-//
-// Given `input` and `diagonal`, this operation returns a tensor with the
-// same shape and values as `input`, except for the main diagonal of the
-// innermost matrices.  These will be overwritten by the values in `diagonal`.
-//
-// The output is computed as follows:
-//
-// Assume `input` has `k+1` dimensions `[I, J, K, ..., M, N]` and `diagonal` has
-// `k` dimensions `[I, J, K, ..., min(M, N)]`.  Then the output is a
-// tensor of rank `k+1` with dimensions `[I, J, K, ..., M, N]` where:
-//
-//   * `output[i, j, k, ..., m, n] = diagonal[i, j, k, ..., n]` for `m == n`.
-//   * `output[i, j, k, ..., m, n] = input[i, j, k, ..., m, n]` for `m != n`.
+// Returns the number of work units this Reader has finished processing.
 //
 // Arguments:
-//	input: Rank `k+1`, where `k >= 1`.
-//	diagonal: Rank `k`, where `k >= 1`.
-//
-// Returns Rank `k+1`, with `output.shape = input.shape`.
-func MatrixSetDiag(scope *Scope, input tf.Output, diagonal tf.Output) (output tf.Output) {
+//	reader_handle: Handle to a Reader.
+func ReaderNumWorkUnitsCompletedV2(scope *Scope, reader_handle tf.Output) (units_completed tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixSetDiag",
+		Type: "ReaderNumWorkUnitsCompletedV2",
 		Input: []tf.Input{
-			input, diagonal,
+			reader_handle,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the element-wise max of two SparseTensors.
-//
-// Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
-//
-// Arguments:
-//	a_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, in the canonical lexicographic ordering.
-//	a_values: 1-D.  `N` non-empty values corresponding to `a_indices`.
-//	a_shape: 1-D.  Shape of the input SparseTensor.
-//	b_indices: counterpart to `a_indices` for the other operand.
-//	b_values: counterpart to `a_values` for the other operand; must be of the same dtype.
-//	b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal.
-//
-// Returns 2-D.  The indices of the output SparseTensor.1-D.  The values of the output SparseTensor.
-func SparseSparseMaximum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
+// Creates a dataset that contains the elements of `input_dataset` ignoring errors.
+func ExperimentalIgnoreErrorsDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "SparseSparseMaximum",
+		Type: "ExperimentalIgnoreErrorsDataset",
 		Input: []tf.Input{
-			a_indices, a_values, a_shape, b_indices, b_values, b_shape,
+			input_dataset,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Computes the reciprocal of x element-wise.
-//
-// I.e., \\(y = 1 / x\\).
-func Inv(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes the log of the absolute value of `Gamma(x)` element-wise.
+func Lgamma(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Inv",
+		Type: "Lgamma",
 		Input: []tf.Input{
 			x,
 		},
@@ -7046,506 +7020,405 @@ func Inv(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// ComplexAbsAttr is an optional argument to ComplexAbs.
-type ComplexAbsAttr func(optionalAttr)
-
-// ComplexAbsTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func ComplexAbsTout(value tf.DataType) ComplexAbsAttr {
-	return func(m optionalAttr) {
-		m["Tout"] = value
-	}
-}
-
-// Computes the complex absolute value of a tensor.
+// Computes the reverse mode backpropagated gradient of the Cholesky algorithm.
 //
-// Given a tensor `x` of complex numbers, this operation returns a tensor of type
-// `float` or `double` that is the absolute value of each element in `x`. All
-// elements in `x` must be complex numbers of the form \\(a + bj\\). The absolute
-// value is computed as \\( \sqrt{a^2 + b^2}\\).
-func ComplexAbs(scope *Scope, x tf.Output, optional ...ComplexAbsAttr) (y tf.Output) {
+// For an explanation see "Differentiation of the Cholesky algorithm" by
+// Iain Murray http://arxiv.org/abs/1602.07527.
+//
+// Arguments:
+//	l: Output of batch Cholesky algorithm l = cholesky(A). Shape is `[..., M, M]`.
+// Algorithm depends only on lower triangular part of the innermost matrices of
+// this tensor.
+//	grad: df/dl where f is some scalar function. Shape is `[..., M, M]`.
+// Algorithm depends only on lower triangular part of the innermost matrices of
+// this tensor.
+//
+// Returns Symmetrized version of df/dA . Shape is `[..., M, M]`
+func CholeskyGrad(scope *Scope, l tf.Output, grad tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ComplexAbs",
+		Type: "CholeskyGrad",
 		Input: []tf.Input{
-			x,
+			l, grad,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the truth value of x AND y element-wise.
-//
-// *NOTE*: `LogicalAnd` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func LogicalAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Creates a dataset that emits each dim-0 slice of `components` once.
+func TensorSliceDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "LogicalAnd",
+		Type: "TensorSliceDataset",
 		Input: []tf.Input{
-			x, y,
+			tf.OutputList(components),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// CastAttr is an optional argument to Cast.
-type CastAttr func(optionalAttr)
-
-// CastTruncate sets the optional Truncate attribute to value.
-// If not specified, defaults to false
-func CastTruncate(value bool) CastAttr {
-	return func(m optionalAttr) {
-		m["Truncate"] = value
-	}
-}
-
-// Cast x of type SrcT to y of DstT.
-func Cast(scope *Scope, x tf.Output, DstT tf.DataType, optional ...CastAttr) (y tf.Output) {
+// Computes hyperbolic sine of x element-wise.
+func Sinh(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"DstT": DstT}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Cast",
+		Type: "Sinh",
 		Input: []tf.Input{
 			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Outputs a tensor containing the reduction across all input tensors.
+// Computes the sum along sparse segments of a tensor.
 //
-// Outputs a tensor containing the reduction across all input tensors passed to ops
-// within the same `shared_name.
+// Read
+// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+// for an explanation of segments.
 //
-// The graph should be constructed so if one op runs with shared_name value `c`,
-// then `num_devices` ops will run with shared_name value `c`.  Failure to do so
-// will cause the graph execution to fail to complete.
+// Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first
+// dimension, selecting a subset of dimension 0, specified by `indices`.
 //
-// input: the input to the reduction
-// data: the value of the reduction across all `num_devices` devices.
-// reduction: the reduction operation to perform.
-// num_devices: The number of devices participating in this reduction.
-// shared_name: Identifier that shared between ops of the same reduction.
-func NcclAllReduce(scope *Scope, input tf.Output, reduction string, num_devices int64, shared_name string) (data tf.Output) {
+// For example:
+//
+// ```python
+// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
+//
+// # Select two rows, one segment.
+// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 0]))
+// # => [[0 0 0 0]]
+//
+// # Select two rows, two segment.
+// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 1]))
+// # => [[ 1  2  3  4]
+// #     [-1 -2 -3 -4]]
+//
+// # Select all rows, two segments.
+// tf.sparse_segment_sum(c, tf.constant([0, 1, 2]), tf.constant([0, 0, 1]))
+// # => [[0 0 0 0]
+// #     [5 6 7 8]]
+//
+// # Which is equivalent to:
+// tf.segment_sum(c, tf.constant([0, 0, 1]))
+// ```
+//
+// Arguments:
+//
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentSum(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"reduction": reduction, "num_devices": num_devices, "shared_name": shared_name}
 	opspec := tf.OpSpec{
-		Type: "NcclAllReduce",
+		Type: "SparseSegmentSum",
 		Input: []tf.Input{
-			input,
+			data, indices, segment_ids,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// RegexReplaceAttr is an optional argument to RegexReplace.
-type RegexReplaceAttr func(optionalAttr)
-
-// RegexReplaceReplaceGlobal sets the optional replace_global attribute to value.
+// Computes natural logarithm of x element-wise.
 //
-// value: If True, the replacement is global, otherwise the replacement
-// is done only on the first match.
-// If not specified, defaults to true
-func RegexReplaceReplaceGlobal(value bool) RegexReplaceAttr {
-	return func(m optionalAttr) {
-		m["replace_global"] = value
+// I.e., \\(y = \log_e x\\).
+func Log(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// Replaces the match of pattern in input with rewrite.
-//
-// It follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)
-//
-// Arguments:
-//	input: The text to be processed.
-//	pattern: The regular expression to match the input.
-//	rewrite: The rewrite to be applied to the matched expresion.
+	opspec := tf.OpSpec{
+		Type: "Log",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Rounds the values of a tensor to the nearest integer, element-wise.
 //
-// Returns The text after applying pattern and rewrite.
-func RegexReplace(scope *Scope, input tf.Output, pattern tf.Output, rewrite tf.Output, optional ...RegexReplaceAttr) (output tf.Output) {
+// Rounds half to even.  Also known as bankers rounding. If you want to round
+// according to the current system rounding mode use std::cint.
+func Round(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "RegexReplace",
+		Type: "Round",
 		Input: []tf.Input{
-			input, pattern, rewrite,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Quantized Batch normalization.
-//
-// This op is deprecated and will be removed in the future. Prefer
-// `tf.nn.batch_normalization`.
-//
-// Arguments:
-//	t: A 4D input Tensor.
-//	t_min: The value represented by the lowest quantized input.
-//	t_max: The value represented by the highest quantized input.
-//	m: A 1D mean Tensor with size matching the last dimension of t.
-// This is the first output from tf.nn.moments,
-// or a saved moving average thereof.
-//	m_min: The value represented by the lowest quantized mean.
-//	m_max: The value represented by the highest quantized mean.
-//	v: A 1D variance Tensor with size matching the last dimension of t.
-// This is the second output from tf.nn.moments,
-// or a saved moving average thereof.
-//	v_min: The value represented by the lowest quantized variance.
-//	v_max: The value represented by the highest quantized variance.
-//	beta: A 1D beta Tensor with size matching the last dimension of t.
-// An offset to be added to the normalized tensor.
-//	beta_min: The value represented by the lowest quantized offset.
-//	beta_max: The value represented by the highest quantized offset.
-//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
-// If "scale_after_normalization" is true, this tensor will be multiplied
-// with the normalized tensor.
-//	gamma_min: The value represented by the lowest quantized gamma.
-//	gamma_max: The value represented by the highest quantized gamma.
+// Computes reciprocal of square root of x element-wise.
 //
-//	variance_epsilon: A small float number to avoid dividing by 0.
-//	scale_after_normalization: A bool indicating whether the resulted tensor
-// needs to be multiplied with gamma.
-func QuantizedBatchNormWithGlobalNormalization(scope *Scope, t tf.Output, t_min tf.Output, t_max tf.Output, m tf.Output, m_min tf.Output, m_max tf.Output, v tf.Output, v_min tf.Output, v_max tf.Output, beta tf.Output, beta_min tf.Output, beta_max tf.Output, gamma tf.Output, gamma_min tf.Output, gamma_max tf.Output, out_type tf.DataType, variance_epsilon float32, scale_after_normalization bool) (result tf.Output, result_min tf.Output, result_max tf.Output) {
+// I.e., \\(y = 1 / \sqrt{x}\\).
+func Rsqrt(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type, "variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
 	opspec := tf.OpSpec{
-		Type: "QuantizedBatchNormWithGlobalNormalization",
+		Type: "Rsqrt",
 		Input: []tf.Input{
-			t, t_min, t_max, m, m_min, m_max, v, v_min, v_max, beta, beta_min, beta_max, gamma, gamma_min, gamma_max,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Adds Tensor 'bias' to Tensor 'input' for Quantized types.
+// MatrixInverseAttr is an optional argument to MatrixInverse.
+type MatrixInverseAttr func(optionalAttr)
+
+// MatrixInverseAdjoint sets the optional adjoint attribute to value.
+// If not specified, defaults to false
+func MatrixInverseAdjoint(value bool) MatrixInverseAttr {
+	return func(m optionalAttr) {
+		m["adjoint"] = value
+	}
+}
+
+// Computes the inverse of one or more square invertible matrices or their
 //
-// Broadcasts the values of bias on dimensions 0..N-2 of 'input'.
+// adjoints (conjugate transposes).
 //
-// Arguments:
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. The output is a tensor of the same shape as the input
+// containing the inverse for all input submatrices `[..., :, :]`.
 //
-//	bias: A 1D bias Tensor with size matching the last dimension of 'input'.
-//	min_input: The float value that the lowest quantized input value represents.
-//	max_input: The float value that the highest quantized input value represents.
-//	min_bias: The float value that the lowest quantized bias value represents.
-//	max_bias: The float value that the highest quantized bias value represents.
+// The op uses LU decomposition with partial pivoting to compute the inverses.
 //
+// If a matrix is not invertible there is no guarantee what the op does. It
+// may detect the condition and raise an exception or it may simply return a
+// garbage result.
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedBiasAdd(scope *Scope, input tf.Output, bias tf.Output, min_input tf.Output, max_input tf.Output, min_bias tf.Output, max_bias tf.Output, out_type tf.DataType) (output tf.Output, min_out tf.Output, max_out tf.Output) {
+// Arguments:
+//	input: Shape is `[..., M, M]`.
+//
+// Returns Shape is `[..., M, M]`.
+//
+// @compatibility(numpy)
+// Equivalent to np.linalg.inv
+// @end_compatibility
+func MatrixInverse(scope *Scope, input tf.Output, optional ...MatrixInverseAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedBiasAdd",
+		Type: "MatrixInverse",
 		Input: []tf.Input{
-			input, bias, min_input, max_input, min_bias, max_bias,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Produces the average pool of the input tensor for quantized types.
-//
-// Arguments:
-//	input: 4-D with shape `[batch, height, width, channels]`.
-//	min_input: The float value that the lowest quantized input value represents.
-//	max_input: The float value that the highest quantized input value represents.
-//	ksize: The size of the window for each dimension of the input tensor.
-// The length must be 4 to match the number of dimensions of the input.
-//	strides: The stride of the sliding window for each dimension of the input
-// tensor.  The length must be 4 to match the number of dimensions of the input.
-//	padding: The type of padding algorithm to use.
+// Returns x + y element-wise.
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedAvgPool(scope *Scope, input tf.Output, min_input tf.Output, max_input tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output, min_output tf.Output, max_output tf.Output) {
+// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Add(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "QuantizedAvgPool",
+		Type: "Add",
 		Input: []tf.Input{
-			input, min_input, max_input,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Extract `patches` from `input` and put them in the "depth" output dimension. 3D extension of `extract_image_patches`.
-//
-// Arguments:
-//	input: 5-D Tensor with shape `[batch, in_planes, in_rows, in_cols, depth]`.
-//	ksizes: The size of the sliding window for each dimension of `input`.
-//	strides: 1-D of length 5. How far the centers of two consecutive patches are in
-// `input`. Must be: `[1, stride_planes, stride_rows, stride_cols, 1]`.
-//	padding: The type of padding algorithm to use.
-//
-// We specify the size-related attributes as:
-//
-// ```python
-//       ksizes = [1, ksize_planes, ksize_rows, ksize_cols, 1]
-//       strides = [1, stride_planes, strides_rows, strides_cols, 1]
-// ```
-//
-// Returns 5-D Tensor with shape `[batch, out_planes, out_rows, out_cols,
-// ksize_planes * ksize_rows * ksize_cols * depth]` containing patches
-// with size `ksize_planes x ksize_rows x ksize_cols x depth` vectorized
-// in the "depth" dimension. Note `out_planes`, `out_rows` and `out_cols`
-// are the dimensions of the output patches.
-func ExtractVolumePatches(scope *Scope, input tf.Output, ksizes []int64, strides []int64, padding string) (patches tf.Output) {
+// Computes the derivative of a Gamma random sample w.r.t. `alpha`.
+func RandomGammaGrad(scope *Scope, alpha tf.Output, sample tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksizes": ksizes, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "ExtractVolumePatches",
+		Type: "RandomGammaGrad",
 		Input: []tf.Input{
-			input,
+			alpha, sample,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// FractionalAvgPoolAttr is an optional argument to FractionalAvgPool.
-type FractionalAvgPoolAttr func(optionalAttr)
-
-// FractionalAvgPoolPseudoRandom sets the optional pseudo_random attribute to value.
+// Computes square of x element-wise.
 //
-// value: When set to True, generates the pooling sequence in a
-// pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
-// Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
-// difference between pseudorandom and random.
-// If not specified, defaults to false
-func FractionalAvgPoolPseudoRandom(value bool) FractionalAvgPoolAttr {
-	return func(m optionalAttr) {
-		m["pseudo_random"] = value
+// I.e., \\(y = x * x = x^2\\).
+func Square(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Square",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// FractionalAvgPoolOverlapping sets the optional overlapping attribute to value.
-//
-// value: When set to True, it means when pooling, the values at the boundary
-// of adjacent pooling cells are used by both cells. For example:
-//
-// `index  0  1  2  3  4`
-//
-// `value  20 5  16 3  7`
+// Computes exponential linear: `exp(features) - 1` if < 0, `features` otherwise.
 //
-// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
-// The result would be [41/3, 26/3] for fractional avg pooling.
-// If not specified, defaults to false
-func FractionalAvgPoolOverlapping(value bool) FractionalAvgPoolAttr {
-	return func(m optionalAttr) {
-		m["overlapping"] = value
+// See [Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)
+// ](http://arxiv.org/abs/1511.07289)
+func Elu(scope *Scope, features tf.Output) (activations tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Elu",
+		Input: []tf.Input{
+			features,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// FractionalAvgPoolDeterministic sets the optional deterministic attribute to value.
+// Computes the reciprocal of x element-wise.
 //
-// value: When set to True, a fixed pooling region will be used when
-// iterating over a FractionalAvgPool node in the computation graph. Mainly used
-// in unit test to make FractionalAvgPool deterministic.
-// If not specified, defaults to false
-func FractionalAvgPoolDeterministic(value bool) FractionalAvgPoolAttr {
-	return func(m optionalAttr) {
-		m["deterministic"] = value
+// I.e., \\(y = 1 / x\\).
+func Reciprocal(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Reciprocal",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// FractionalAvgPoolSeed sets the optional seed attribute to value.
+// Returns a batched matrix tensor with new batched diagonal values.
 //
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func FractionalAvgPoolSeed(value int64) FractionalAvgPoolAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// FractionalAvgPoolSeed2 sets the optional seed2 attribute to value.
+// Given `input` and `diagonal`, this operation returns a tensor with the
+// same shape and values as `input`, except for the main diagonal of the
+// innermost matrices.  These will be overwritten by the values in `diagonal`.
 //
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func FractionalAvgPoolSeed2(value int64) FractionalAvgPoolAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Performs fractional average pooling on the input.
+// The output is computed as follows:
 //
-// Fractional average pooling is similar to Fractional max pooling in the pooling
-// region generation step. The only difference is that after pooling regions are
-// generated, a mean operation is performed instead of a max operation in each
-// pooling region.
+// Assume `input` has `k+1` dimensions `[I, J, K, ..., M, N]` and `diagonal` has
+// `k` dimensions `[I, J, K, ..., min(M, N)]`.  Then the output is a
+// tensor of rank `k+1` with dimensions `[I, J, K, ..., M, N]` where:
+//
+//   * `output[i, j, k, ..., m, n] = diagonal[i, j, k, ..., n]` for `m == n`.
+//   * `output[i, j, k, ..., m, n] = input[i, j, k, ..., m, n]` for `m != n`.
 //
 // Arguments:
-//	value: 4-D with shape `[batch, height, width, channels]`.
-//	pooling_ratio: Pooling ratio for each dimension of `value`, currently only
-// supports row and col dimension and should be >= 1.0. For example, a valid
-// pooling ratio looks like [1.0, 1.44, 1.73, 1.0]. The first and last elements
-// must be 1.0 because we don't allow pooling on batch and channels
-// dimensions. 1.44 and 1.73 are pooling ratio on height and width dimensions
-// respectively.
+//	input: Rank `k+1`, where `k >= 1`.
+//	diagonal: Rank `k`, where `k >= 1`.
 //
-// Returns output tensor after fractional avg pooling.row pooling sequence, needed to calculate gradient.column pooling sequence, needed to calculate gradient.
-func FractionalAvgPool(scope *Scope, value tf.Output, pooling_ratio []float32, optional ...FractionalAvgPoolAttr) (output tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output) {
+// Returns Rank `k+1`, with `output.shape = input.shape`.
+func MatrixSetDiag(scope *Scope, input tf.Output, diagonal tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"pooling_ratio": pooling_ratio}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "FractionalAvgPool",
+		Type: "MatrixSetDiag",
 		Input: []tf.Input{
-			value,
+			input, diagonal,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// RandomCropAttr is an optional argument to RandomCrop.
-type RandomCropAttr func(optionalAttr)
-
-// RandomCropSeed sets the optional seed attribute to value.
+// Returns the element-wise max of two SparseTensors.
 //
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomCropSeed(value int64) RandomCropAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// RandomCropSeed2 sets the optional seed2 attribute to value.
+// Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
 //
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomCropSeed2(value int64) RandomCropAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
+// Arguments:
+//	a_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, in the canonical lexicographic ordering.
+//	a_values: 1-D.  `N` non-empty values corresponding to `a_indices`.
+//	a_shape: 1-D.  Shape of the input SparseTensor.
+//	b_indices: counterpart to `a_indices` for the other operand.
+//	b_values: counterpart to `a_values` for the other operand; must be of the same dtype.
+//	b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal.
+//
+// Returns 2-D.  The indices of the output SparseTensor.1-D.  The values of the output SparseTensor.
+func SparseSparseMaximum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseSparseMaximum",
+		Input: []tf.Input{
+			a_indices, a_values, a_shape, b_indices, b_values, b_shape,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
 }
 
-// Randomly crop `image`.
-//
-// DEPRECATED at GraphDef version 8: Random crop is now pure Python
-//
-// `size` is a 1-D int64 tensor with 2 elements representing the crop height and
-// width.  The values must be non negative.
-//
-// This Op picks a random location in `image` and crops a `height` by `width`
-// rectangle from that location.  The random location is picked so the cropped
-// area will fit inside the original image.
-//
-// Arguments:
-//	image: 3-D of shape `[height, width, channels]`.
-//	size: 1-D of length 2 containing: `crop_height`, `crop_width`..
+// Computes the reciprocal of x element-wise.
 //
-// Returns 3-D of shape `[crop_height, crop_width, channels].`
-func RandomCrop(scope *Scope, image tf.Output, size tf.Output, optional ...RandomCropAttr) (output tf.Output) {
+// I.e., \\(y = 1 / x\\).
+func Inv(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "RandomCrop",
+		Type: "Inv",
 		Input: []tf.Input{
-			image, size,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TopKV2Attr is an optional argument to TopKV2.
-type TopKV2Attr func(optionalAttr)
+// ComplexAbsAttr is an optional argument to ComplexAbs.
+type ComplexAbsAttr func(optionalAttr)
 
-// TopKV2Sorted sets the optional sorted attribute to value.
-//
-// value: If true the resulting `k` elements will be sorted by the values in
-// descending order.
-// If not specified, defaults to true
-func TopKV2Sorted(value bool) TopKV2Attr {
+// ComplexAbsTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func ComplexAbsTout(value tf.DataType) ComplexAbsAttr {
 	return func(m optionalAttr) {
-		m["sorted"] = value
+		m["Tout"] = value
 	}
 }
 
-// Finds values and indices of the `k` largest elements for the last dimension.
-//
-// If the input is a vector (rank-1), finds the `k` largest entries in the vector
-// and outputs their values and indices as vectors.  Thus `values[j]` is the
-// `j`-th largest entry in `input`, and its index is `indices[j]`.
-//
-// For matrices (resp. higher rank input), computes the top `k` entries in each
-// row (resp. vector along the last dimension).  Thus,
-//
-//     values.shape = indices.shape = input.shape[:-1] + [k]
-//
-// If two elements are equal, the lower-index element appears first.
-//
-// Arguments:
-//	input: 1-D or higher with last dimension at least `k`.
-//	k: 0-D.  Number of top elements to look for along the last dimension (along each
-// row for matrices).
+// Computes the complex absolute value of a tensor.
 //
-// Returns The `k` largest elements along each last dimensional slice.The indices of `values` within the last dimension of `input`.
-func TopKV2(scope *Scope, input tf.Output, k tf.Output, optional ...TopKV2Attr) (values tf.Output, indices tf.Output) {
+// Given a tensor `x` of complex numbers, this operation returns a tensor of type
+// `float` or `double` that is the absolute value of each element in `x`. All
+// elements in `x` must be complex numbers of the form \\(a + bj\\). The absolute
+// value is computed as \\( \sqrt{a^2 + b^2}\\).
+func ComplexAbs(scope *Scope, x tf.Output, optional ...ComplexAbsAttr) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -7554,26 +7427,26 @@ func TopKV2(scope *Scope, input tf.Output, k tf.Output, optional ...TopKV2Attr)
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TopKV2",
+		Type: "ComplexAbs",
 		Input: []tf.Input{
-			input, k,
+			x,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Returns x // y element-wise.
+// Returns the truth value of x AND y element-wise.
 //
-// *NOTE*: `FloorDiv` supports broadcasting. More about broadcasting
+// *NOTE*: `LogicalAnd` supports broadcasting. More about broadcasting
 // [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func FloorDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+func LogicalAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "FloorDiv",
+		Type: "LogicalAnd",
 		Input: []tf.Input{
 			x, y,
 		},
@@ -7582,325 +7455,452 @@ func FloorDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
-// Computes the inverse permutation of a tensor.
-//
-// This operation computes the inverse of an index permutation. It takes a 1-D
-// integer tensor `x`, which represents the indices of a zero-based array, and
-// swaps each value with its index position. In other words, for an output tensor
-// `y` and an input tensor `x`, this operation computes the following:
-//
-// `y[x[i]] = i for i in [0, 1, ..., len(x) - 1]`
-//
-// The values must include 0. There can be no duplicate values or negative values.
-//
-// For example:
-//
-// ```
-// # tensor `x` is [3, 4, 0, 2, 1]
-// invert_permutation(x) ==> [2, 4, 3, 0, 1]
-// ```
-//
-// Arguments:
-//	x: 1-D.
-//
-// Returns 1-D.
-func InvertPermutation(scope *Scope, x tf.Output) (y tf.Output) {
+// CastAttr is an optional argument to Cast.
+type CastAttr func(optionalAttr)
+
+// CastTruncate sets the optional Truncate attribute to value.
+// If not specified, defaults to false
+func CastTruncate(value bool) CastAttr {
+	return func(m optionalAttr) {
+		m["Truncate"] = value
+	}
+}
+
+// Cast x of type SrcT to y of DstT.
+func Cast(scope *Scope, x tf.Output, DstT tf.DataType, optional ...CastAttr) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"DstT": DstT}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "InvertPermutation",
+		Type: "Cast",
 		Input: []tf.Input{
 			x,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes log softmax activations.
-//
-// For each batch `i` and class `j` we have
+// Outputs a tensor containing the reduction across all input tensors.
 //
-//     logsoftmax[i, j] = logits[i, j] - log(sum(exp(logits[i])))
+// Outputs a tensor containing the reduction across all input tensors passed to ops
+// within the same `shared_name.
 //
-// Arguments:
-//	logits: 2-D with shape `[batch_size, num_classes]`.
+// The graph should be constructed so if one op runs with shared_name value `c`,
+// then `num_devices` ops will run with shared_name value `c`.  Failure to do so
+// will cause the graph execution to fail to complete.
 //
-// Returns Same shape as `logits`.
-func LogSoftmax(scope *Scope, logits tf.Output) (logsoftmax tf.Output) {
+// input: the input to the reduction
+// data: the value of the reduction across all `num_devices` devices.
+// reduction: the reduction operation to perform.
+// num_devices: The number of devices participating in this reduction.
+// shared_name: Identifier that shared between ops of the same reduction.
+func NcclAllReduce(scope *Scope, input tf.Output, reduction string, num_devices int64, shared_name string) (data tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"reduction": reduction, "num_devices": num_devices, "shared_name": shared_name}
 	opspec := tf.OpSpec{
-		Type: "LogSoftmax",
+		Type: "NcclAllReduce",
 		Input: []tf.Input{
-			logits,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the truth value of (x <= y) element-wise.
-//
-// *NOTE*: `LessEqual` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func LessEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// RegexReplaceAttr is an optional argument to RegexReplace.
+type RegexReplaceAttr func(optionalAttr)
+
+// RegexReplaceReplaceGlobal sets the optional replace_global attribute to value.
+//
+// value: If True, the replacement is global, otherwise the replacement
+// is done only on the first match.
+// If not specified, defaults to true
+func RegexReplaceReplaceGlobal(value bool) RegexReplaceAttr {
+	return func(m optionalAttr) {
+		m["replace_global"] = value
+	}
+}
+
+// Replaces the match of pattern in input with rewrite.
+//
+// It follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)
+//
+// Arguments:
+//	input: The text to be processed.
+//	pattern: The regular expression to match the input.
+//	rewrite: The rewrite to be applied to the matched expresion.
+//
+// Returns The text after applying pattern and rewrite.
+func RegexReplace(scope *Scope, input tf.Output, pattern tf.Output, rewrite tf.Output, optional ...RegexReplaceAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "LessEqual",
+		Type: "RegexReplace",
 		Input: []tf.Input{
-			x, y,
+			input, pattern, rewrite,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes softmax activations.
-//
-// For each batch `i` and class `j` we have
+// Quantized Batch normalization.
 //
-//     $$softmax[i, j] = exp(logits[i, j]) / sum_j(exp(logits[i, j]))$$
+// This op is deprecated and will be removed in the future. Prefer
+// `tf.nn.batch_normalization`.
 //
 // Arguments:
-//	logits: 2-D with shape `[batch_size, num_classes]`.
+//	t: A 4D input Tensor.
+//	t_min: The value represented by the lowest quantized input.
+//	t_max: The value represented by the highest quantized input.
+//	m: A 1D mean Tensor with size matching the last dimension of t.
+// This is the first output from tf.nn.moments,
+// or a saved moving average thereof.
+//	m_min: The value represented by the lowest quantized mean.
+//	m_max: The value represented by the highest quantized mean.
+//	v: A 1D variance Tensor with size matching the last dimension of t.
+// This is the second output from tf.nn.moments,
+// or a saved moving average thereof.
+//	v_min: The value represented by the lowest quantized variance.
+//	v_max: The value represented by the highest quantized variance.
+//	beta: A 1D beta Tensor with size matching the last dimension of t.
+// An offset to be added to the normalized tensor.
+//	beta_min: The value represented by the lowest quantized offset.
+//	beta_max: The value represented by the highest quantized offset.
+//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
+// If "scale_after_normalization" is true, this tensor will be multiplied
+// with the normalized tensor.
+//	gamma_min: The value represented by the lowest quantized gamma.
+//	gamma_max: The value represented by the highest quantized gamma.
 //
-// Returns Same shape as `logits`.
-func Softmax(scope *Scope, logits tf.Output) (softmax tf.Output) {
+//	variance_epsilon: A small float number to avoid dividing by 0.
+//	scale_after_normalization: A bool indicating whether the resulted tensor
+// needs to be multiplied with gamma.
+func QuantizedBatchNormWithGlobalNormalization(scope *Scope, t tf.Output, t_min tf.Output, t_max tf.Output, m tf.Output, m_min tf.Output, m_max tf.Output, v tf.Output, v_min tf.Output, v_max tf.Output, beta tf.Output, beta_min tf.Output, beta_max tf.Output, gamma tf.Output, gamma_min tf.Output, gamma_max tf.Output, out_type tf.DataType, variance_epsilon float32, scale_after_normalization bool) (result tf.Output, result_min tf.Output, result_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"out_type": out_type, "variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
 	opspec := tf.OpSpec{
-		Type: "Softmax",
+		Type: "QuantizedBatchNormWithGlobalNormalization",
 		Input: []tf.Input{
-			logits,
+			t, t_min, t_max, m, m_min, m_max, v, v_min, v_max, beta, beta_min, beta_max, gamma, gamma_min, gamma_max,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// DecodeBmpAttr is an optional argument to DecodeBmp.
-type DecodeBmpAttr func(optionalAttr)
-
-// DecodeBmpChannels sets the optional channels attribute to value.
-// If not specified, defaults to 0
-func DecodeBmpChannels(value int64) DecodeBmpAttr {
-	return func(m optionalAttr) {
-		m["channels"] = value
-	}
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Decode the first frame of a BMP-encoded image to a uint8 tensor.
+// Adds Tensor 'bias' to Tensor 'input' for Quantized types.
 //
-// The attr `channels` indicates the desired number of color channels for the
-// decoded image.
+// Broadcasts the values of bias on dimensions 0..N-2 of 'input'.
 //
-// Accepted values are:
+// Arguments:
 //
-// *   0: Use the number of channels in the BMP-encoded image.
-// *   3: output an RGB image.
-// *   4: output an RGBA image.
+//	bias: A 1D bias Tensor with size matching the last dimension of 'input'.
+//	min_input: The float value that the lowest quantized input value represents.
+//	max_input: The float value that the highest quantized input value represents.
+//	min_bias: The float value that the lowest quantized bias value represents.
+//	max_bias: The float value that the highest quantized bias value represents.
 //
-// Arguments:
-//	contents: 0-D.  The BMP-encoded image.
 //
-// Returns 3-D with shape `[height, width, channels]`. RGB order
-func DecodeBmp(scope *Scope, contents tf.Output, optional ...DecodeBmpAttr) (image tf.Output) {
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedBiasAdd(scope *Scope, input tf.Output, bias tf.Output, min_input tf.Output, max_input tf.Output, min_bias tf.Output, max_bias tf.Output, out_type tf.DataType) (output tf.Output, min_out tf.Output, max_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "DecodeBmp",
+		Type: "QuantizedBiasAdd",
 		Input: []tf.Input{
-			contents,
+			input, bias, min_input, max_input, min_bias, max_bias,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes softsign gradients for a softsign operation.
+// Produces the average pool of the input tensor for quantized types.
 //
 // Arguments:
-//	gradients: The backpropagated gradients to the corresponding softsign operation.
-//	features: The features passed as input to the corresponding softsign operation.
+//	input: 4-D with shape `[batch, height, width, channels]`.
+//	min_input: The float value that the lowest quantized input value represents.
+//	max_input: The float value that the highest quantized input value represents.
+//	ksize: The size of the window for each dimension of the input tensor.
+// The length must be 4 to match the number of dimensions of the input.
+//	strides: The stride of the sliding window for each dimension of the input
+// tensor.  The length must be 4 to match the number of dimensions of the input.
+//	padding: The type of padding algorithm to use.
 //
-// Returns The gradients: `gradients / (1 + abs(features)) ** 2`.
-func SoftsignGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedAvgPool(scope *Scope, input tf.Output, min_input tf.Output, max_input tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output, min_output tf.Output, max_output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "SoftsignGrad",
+		Type: "QuantizedAvgPool",
 		Input: []tf.Input{
-			gradients, features,
+			input, min_input, max_input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Provides the time since epoch in seconds.
+// Extract `patches` from `input` and put them in the "depth" output dimension. 3D extension of `extract_image_patches`.
 //
-// Returns the timestamp as a `float64` for seconds since the Unix epoch.
+// Arguments:
+//	input: 5-D Tensor with shape `[batch, in_planes, in_rows, in_cols, depth]`.
+//	ksizes: The size of the sliding window for each dimension of `input`.
+//	strides: 1-D of length 5. How far the centers of two consecutive patches are in
+// `input`. Must be: `[1, stride_planes, stride_rows, stride_cols, 1]`.
+//	padding: The type of padding algorithm to use.
 //
-// Note: the timestamp is computed when the op is executed, not when it is added
-// to the graph.
-func Timestamp(scope *Scope) (ts tf.Output) {
+// We specify the size-related attributes as:
+//
+// ```python
+//       ksizes = [1, ksize_planes, ksize_rows, ksize_cols, 1]
+//       strides = [1, stride_planes, strides_rows, strides_cols, 1]
+// ```
+//
+// Returns 5-D Tensor with shape `[batch, out_planes, out_rows, out_cols,
+// ksize_planes * ksize_rows * ksize_cols * depth]` containing patches
+// with size `ksize_planes x ksize_rows x ksize_cols x depth` vectorized
+// in the "depth" dimension. Note `out_planes`, `out_rows` and `out_cols`
+// are the dimensions of the output patches.
+func ExtractVolumePatches(scope *Scope, input tf.Output, ksizes []int64, strides []int64, padding string) (patches tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"ksizes": ksizes, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "Timestamp",
+		Type: "ExtractVolumePatches",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// BatchMatMulAttr is an optional argument to BatchMatMul.
-type BatchMatMulAttr func(optionalAttr)
+// FractionalAvgPoolAttr is an optional argument to FractionalAvgPool.
+type FractionalAvgPoolAttr func(optionalAttr)
 
-// BatchMatMulAdjX sets the optional adj_x attribute to value.
+// FractionalAvgPoolPseudoRandom sets the optional pseudo_random attribute to value.
 //
-// value: If `True`, adjoint the slices of `x`. Defaults to `False`.
+// value: When set to True, generates the pooling sequence in a
+// pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
+// Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
+// difference between pseudorandom and random.
 // If not specified, defaults to false
-func BatchMatMulAdjX(value bool) BatchMatMulAttr {
+func FractionalAvgPoolPseudoRandom(value bool) FractionalAvgPoolAttr {
 	return func(m optionalAttr) {
-		m["adj_x"] = value
+		m["pseudo_random"] = value
 	}
 }
 
-// BatchMatMulAdjY sets the optional adj_y attribute to value.
+// FractionalAvgPoolOverlapping sets the optional overlapping attribute to value.
 //
-// value: If `True`, adjoint the slices of `y`. Defaults to `False`.
+// value: When set to True, it means when pooling, the values at the boundary
+// of adjacent pooling cells are used by both cells. For example:
+//
+// `index  0  1  2  3  4`
+//
+// `value  20 5  16 3  7`
+//
+// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
+// The result would be [41/3, 26/3] for fractional avg pooling.
 // If not specified, defaults to false
-func BatchMatMulAdjY(value bool) BatchMatMulAttr {
+func FractionalAvgPoolOverlapping(value bool) FractionalAvgPoolAttr {
 	return func(m optionalAttr) {
-		m["adj_y"] = value
+		m["overlapping"] = value
 	}
 }
 
-// Multiplies slices of two tensors in batches.
-//
-// Multiplies all slices of `Tensor` `x` and `y` (each slice can be
-// viewed as an element of a batch), and arranges the individual results
-// in a single output tensor of the same batch size. Each of the
-// individual slices can optionally be adjointed (to adjoint a matrix
-// means to transpose and conjugate it) before multiplication by setting
-// the `adj_x` or `adj_y` flag to `True`, which are by default `False`.
-//
-// The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]`
-// and `[..., r_y, c_y]`.
+// FractionalAvgPoolDeterministic sets the optional deterministic attribute to value.
 //
-// The output tensor is 2-D or higher with shape `[..., r_o, c_o]`, where:
+// value: When set to True, a fixed pooling region will be used when
+// iterating over a FractionalAvgPool node in the computation graph. Mainly used
+// in unit test to make FractionalAvgPool deterministic.
+// If not specified, defaults to false
+func FractionalAvgPoolDeterministic(value bool) FractionalAvgPoolAttr {
+	return func(m optionalAttr) {
+		m["deterministic"] = value
+	}
+}
+
+// FractionalAvgPoolSeed sets the optional seed attribute to value.
 //
-//     r_o = c_x if adj_x else r_x
-//     c_o = r_y if adj_y else c_y
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func FractionalAvgPoolSeed(value int64) FractionalAvgPoolAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// FractionalAvgPoolSeed2 sets the optional seed2 attribute to value.
 //
-// It is computed as:
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func FractionalAvgPoolSeed2(value int64) FractionalAvgPoolAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Performs fractional average pooling on the input.
 //
-//     output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :])
+// Fractional average pooling is similar to Fractional max pooling in the pooling
+// region generation step. The only difference is that after pooling regions are
+// generated, a mean operation is performed instead of a max operation in each
+// pooling region.
 //
 // Arguments:
-//	x: 2-D or higher with shape `[..., r_x, c_x]`.
-//	y: 2-D or higher with shape `[..., r_y, c_y]`.
+//	value: 4-D with shape `[batch, height, width, channels]`.
+//	pooling_ratio: Pooling ratio for each dimension of `value`, currently only
+// supports row and col dimension and should be >= 1.0. For example, a valid
+// pooling ratio looks like [1.0, 1.44, 1.73, 1.0]. The first and last elements
+// must be 1.0 because we don't allow pooling on batch and channels
+// dimensions. 1.44 and 1.73 are pooling ratio on height and width dimensions
+// respectively.
 //
-// Returns 3-D or higher with shape `[..., r_o, c_o]`
-func BatchMatMul(scope *Scope, x tf.Output, y tf.Output, optional ...BatchMatMulAttr) (output tf.Output) {
+// Returns output tensor after fractional avg pooling.row pooling sequence, needed to calculate gradient.column pooling sequence, needed to calculate gradient.
+func FractionalAvgPool(scope *Scope, value tf.Output, pooling_ratio []float32, optional ...FractionalAvgPoolAttr) (output tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"pooling_ratio": pooling_ratio}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "BatchMatMul",
+		Type: "FractionalAvgPool",
 		Input: []tf.Input{
-			x, y,
+			value,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Returns which elements of x are NaN.
+// RandomCropAttr is an optional argument to RandomCrop.
+type RandomCropAttr func(optionalAttr)
+
+// RandomCropSeed sets the optional seed attribute to value.
 //
-// @compatibility(numpy)
-// Equivalent to np.isnan
-// @end_compatibility
-func IsNan(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomCropSeed(value int64) RandomCropAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "IsNan",
-		Input: []tf.Input{
-			x,
-		},
+}
+
+// RandomCropSeed2 sets the optional seed2 attribute to value.
+//
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomCropSeed2(value int64) RandomCropAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Identity op for gradient debugging.
+// Randomly crop `image`.
 //
-// This op is hidden from public in Python. It is used by TensorFlow Debugger to
-// register gradient tensors for gradient debugging.
-// This op operates on non-reference-type tensors.
-func DebugGradientIdentity(scope *Scope, input tf.Output) (output tf.Output) {
+// DEPRECATED at GraphDef version 8: Random crop is now pure Python
+//
+// `size` is a 1-D int64 tensor with 2 elements representing the crop height and
+// width.  The values must be non negative.
+//
+// This Op picks a random location in `image` and crops a `height` by `width`
+// rectangle from that location.  The random location is picked so the cropped
+// area will fit inside the original image.
+//
+// Arguments:
+//	image: 3-D of shape `[height, width, channels]`.
+//	size: 1-D of length 2 containing: `crop_height`, `crop_width`..
+//
+// Returns 3-D of shape `[crop_height, crop_width, channels].`
+func RandomCrop(scope *Scope, image tf.Output, size tf.Output, optional ...RandomCropAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "DebugGradientIdentity",
+		Type: "RandomCrop",
 		Input: []tf.Input{
-			input,
+			image, size,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceSparseApplyAdadeltaAttr is an optional argument to ResourceSparseApplyAdadelta.
-type ResourceSparseApplyAdadeltaAttr func(optionalAttr)
+// TopKV2Attr is an optional argument to TopKV2.
+type TopKV2Attr func(optionalAttr)
 
-// ResourceSparseApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
+// TopKV2Sorted sets the optional sorted attribute to value.
 //
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceSparseApplyAdadeltaUseLocking(value bool) ResourceSparseApplyAdadeltaAttr {
+// value: If true the resulting `k` elements will be sorted by the values in
+// descending order.
+// If not specified, defaults to true
+func TopKV2Sorted(value bool) TopKV2Attr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["sorted"] = value
 	}
 }
 
-// var: Should be from a Variable().
+// Finds values and indices of the `k` largest elements for the last dimension.
 //
-// Arguments:
+// If the input is a vector (rank-1), finds the `k` largest entries in the vector
+// and outputs their values and indices as vectors.  Thus `values[j]` is the
+// `j`-th largest entry in `input`, and its index is `indices[j]`.
 //
-//	accum: Should be from a Variable().
-//	accum_update: : Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
-//	rho: Decay factor. Must be a scalar.
-//	epsilon: Constant factor. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
+// For matrices (resp. higher rank input), computes the top `k` entries in each
+// row (resp. vector along the last dimension).  Thus,
 //
-// Returns the created operation.
-func ResourceSparseApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdadeltaAttr) (o *tf.Operation) {
+//     values.shape = indices.shape = input.shape[:-1] + [k]
+//
+// If two elements are equal, the lower-index element appears first.
+//
+// Arguments:
+//	input: 1-D or higher with last dimension at least `k`.
+//	k: 0-D.  Number of top elements to look for along the last dimension (along each
+// row for matrices).
+//
+// Returns The `k` largest elements along each last dimensional slice.The indices of `values` within the last dimension of `input`.
+func TopKV2(scope *Scope, input tf.Output, k tf.Output, optional ...TopKV2Attr) (values tf.Output, indices tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -7909,453 +7909,398 @@ func ResourceSparseApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyAdadelta",
+		Type: "TopKV2",
 		Input: []tf.Input{
-			var_, accum, accum_update, lr, rho, epsilon, grad, indices,
+			input, k,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
 }
 
-// Gets next element for the provided shard number.
-//
-// Arguments:
-//	multi_device_iterator: A MultiDeviceIterator resource.
-//	shard_num: Integer representing which shard to fetch data for.
-//	incarnation_id: Which incarnation of the MultiDeviceIterator is running.
-//	output_types: The type list for the return values.
-//	output_shapes: The list of shapes being produced.
+// Returns x // y element-wise.
 //
-// Returns Result of the get_next on the dataset.
-func MultiDeviceIteratorGetNextFromShard(scope *Scope, multi_device_iterator tf.Output, shard_num tf.Output, incarnation_id tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
+// *NOTE*: `FloorDiv` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func FloorDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "MultiDeviceIteratorGetNextFromShard",
+		Type: "FloorDiv",
 		Input: []tf.Input{
-			multi_device_iterator, shard_num, incarnation_id,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the inverse permutation of a tensor.
+//
+// This operation computes the inverse of an index permutation. It takes a 1-D
+// integer tensor `x`, which represents the indices of a zero-based array, and
+// swaps each value with its index position. In other words, for an output tensor
+// `y` and an input tensor `x`, this operation computes the following:
+//
+// `y[x[i]] = i for i in [0, 1, ..., len(x) - 1]`
+//
+// The values must include 0. There can be no duplicate values or negative values.
+//
+// For example:
+//
+// ```
+// # tensor `x` is [3, 4, 0, 2, 1]
+// invert_permutation(x) ==> [2, 4, 3, 0, 1]
+// ```
+//
+// Arguments:
+//	x: 1-D.
+//
+// Returns 1-D.
+func InvertPermutation(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("MultiDeviceIteratorGetNextFromShard", err)
-		return
-	}
-	return components
-}
-
-// LeakyReluGradAttr is an optional argument to LeakyReluGrad.
-type LeakyReluGradAttr func(optionalAttr)
-
-// LeakyReluGradAlpha sets the optional alpha attribute to value.
-// If not specified, defaults to 0.2
-func LeakyReluGradAlpha(value float32) LeakyReluGradAttr {
-	return func(m optionalAttr) {
-		m["alpha"] = value
+	opspec := tf.OpSpec{
+		Type: "InvertPermutation",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes rectified linear gradients for a LeakyRelu operation.
+// Computes log softmax activations.
+//
+// For each batch `i` and class `j` we have
+//
+//     logsoftmax[i, j] = logits[i, j] - log(sum(exp(logits[i])))
 //
 // Arguments:
-//	gradients: The backpropagated gradients to the corresponding LeakyRelu operation.
-//	features: The features passed as input to the corresponding LeakyRelu operation,
-// OR the outputs of that operation (both work equivalently).
+//	logits: 2-D with shape `[batch_size, num_classes]`.
 //
-// Returns `gradients * (features > 0) + alpha * gradients * (featurs <= 0)`.
-func LeakyReluGrad(scope *Scope, gradients tf.Output, features tf.Output, optional ...LeakyReluGradAttr) (backprops tf.Output) {
+// Returns Same shape as `logits`.
+func LogSoftmax(scope *Scope, logits tf.Output) (logsoftmax tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "LeakyReluGrad",
+		Type: "LogSoftmax",
 		Input: []tf.Input{
-			gradients, features,
+			logits,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Deprecated. Use TensorArrayGradV3
+// Returns the truth value of (x <= y) element-wise.
 //
-// DEPRECATED at GraphDef version 26: Use TensorArrayWriteV3
-func TensorArrayWriteV2(scope *Scope, handle tf.Output, index tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+// *NOTE*: `LessEqual` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func LessEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayWriteV2",
+		Type: "LessEqual",
 		Input: []tf.Input{
-			handle, index, value, flow_in,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// LeakyReluAttr is an optional argument to LeakyRelu.
-type LeakyReluAttr func(optionalAttr)
-
-// LeakyReluAlpha sets the optional alpha attribute to value.
-// If not specified, defaults to 0.2
-func LeakyReluAlpha(value float32) LeakyReluAttr {
-	return func(m optionalAttr) {
-		m["alpha"] = value
-	}
-}
-
-// Computes rectified linear: `max(features, features * alpha)`.
-func LeakyRelu(scope *Scope, features tf.Output, optional ...LeakyReluAttr) (activations tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "LeakyRelu",
-		Input: []tf.Input{
-			features,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Writes the given dataset to the given file using the TFRecord format.
+// Computes softmax activations.
+//
+// For each batch `i` and class `j` we have
+//
+//     $$softmax[i, j] = exp(logits[i, j]) / sum_j(exp(logits[i, j]))$$
 //
 // Arguments:
-//	input_dataset: A variant tensor representing the dataset to write.
-//	filename: A scalar string tensor representing the filename to use.
-//	compression_type: A scalar string tensor containing either (i) the empty string (no
-// compression), (ii) "ZLIB", or (iii) "GZIP".
+//	logits: 2-D with shape `[batch_size, num_classes]`.
 //
-// Returns the created operation.
-func DatasetToTFRecord(scope *Scope, input_dataset tf.Output, filename tf.Output, compression_type tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "DatasetToTFRecord",
-		Input: []tf.Input{
-			input_dataset, filename, compression_type,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Computes rectified linear 6: `min(max(features, 0), 6)`.
-func Relu6(scope *Scope, features tf.Output) (activations tf.Output) {
+// Returns Same shape as `logits`.
+func Softmax(scope *Scope, logits tf.Output) (softmax tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Relu6",
+		Type: "Softmax",
 		Input: []tf.Input{
-			features,
+			logits,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SdcaOptimizerV2Attr is an optional argument to SdcaOptimizerV2.
-type SdcaOptimizerV2Attr func(optionalAttr)
+// DecodeBmpAttr is an optional argument to DecodeBmp.
+type DecodeBmpAttr func(optionalAttr)
 
-// SdcaOptimizerV2Adaptive sets the optional adaptive attribute to value.
-//
-// value: Whether to use Adaptive SDCA for the inner loop.
-// If not specified, defaults to true
-func SdcaOptimizerV2Adaptive(value bool) SdcaOptimizerV2Attr {
+// DecodeBmpChannels sets the optional channels attribute to value.
+// If not specified, defaults to 0
+func DecodeBmpChannels(value int64) DecodeBmpAttr {
 	return func(m optionalAttr) {
-		m["adaptive"] = value
+		m["channels"] = value
 	}
 }
 
-// Distributed version of Stochastic Dual Coordinate Ascent (SDCA) optimizer for
-//
-// linear models with L1 + L2 regularization. As global optimization objective is
-// strongly-convex, the optimizer optimizes the dual objective at each step. The
-// optimizer applies each update one example at a time. Examples are sampled
-// uniformly, and the optimizer is learning rate free and enjoys linear convergence
-// rate.
-//
-// [Proximal Stochastic Dual Coordinate Ascent](http://arxiv.org/pdf/1211.2717v1.pdf).<br>
-// Shai Shalev-Shwartz, Tong Zhang. 2012
+// Decode the first frame of a BMP-encoded image to a uint8 tensor.
 //
-// $$Loss Objective = \sum f_{i} (wx_{i}) + (l2 / 2) * |w|^2 + l1 * |w|$$
+// The attr `channels` indicates the desired number of color channels for the
+// decoded image.
 //
-// [Adding vs. Averaging in Distributed Primal-Dual Optimization](http://arxiv.org/abs/1502.03508).<br>
-// Chenxin Ma, Virginia Smith, Martin Jaggi, Michael I. Jordan,
-// Peter Richtarik, Martin Takac. 2015
+// Accepted values are:
 //
-// [Stochastic Dual Coordinate Ascent with Adaptive Probabilities](https://arxiv.org/abs/1502.08053).<br>
-// Dominik Csiba, Zheng Qu, Peter Richtarik. 2015
+// *   0: Use the number of channels in the BMP-encoded image.
+// *   3: output an RGB image.
+// *   4: output an RGBA image.
 //
 // Arguments:
-//	sparse_example_indices: a list of vectors which contain example indices.
-//	sparse_feature_indices: a list of vectors which contain feature indices.
-//	sparse_feature_values: a list of vectors which contains feature value
-// associated with each feature group.
-//	dense_features: a list of matrices which contains the dense feature values.
-//	example_weights: a vector which contains the weight associated with each
-// example.
-//	example_labels: a vector which contains the label/target associated with each
-// example.
-//	sparse_indices: a list of vectors where each value is the indices which has
-// corresponding weights in sparse_weights. This field maybe omitted for the
-// dense approach.
-//	sparse_weights: a list of vectors where each value is the weight associated with
-// a sparse feature group.
-//	dense_weights: a list of vectors where the values are the weights associated
-// with a dense feature group.
-//	example_state_data: a list of vectors containing the example state data.
-//	loss_type: Type of the primal loss. Currently SdcaSolver supports logistic,
-// squared and hinge losses.
-//	l1: Symmetric l1 regularization strength.
-//	l2: Symmetric l2 regularization strength.
-//	num_loss_partitions: Number of partitions of the global loss function.
-//	num_inner_iterations: Number of iterations per mini-batch.
+//	contents: 0-D.  The BMP-encoded image.
 //
-// Returns a list of vectors containing the updated example state
-// data.a list of vectors where each value is the delta
-// weights associated with a sparse feature group.a list of vectors where the values are the delta
-// weights associated with a dense feature group.
-func SdcaOptimizerV2(scope *Scope, sparse_example_indices []tf.Output, sparse_feature_indices []tf.Output, sparse_feature_values []tf.Output, dense_features []tf.Output, example_weights tf.Output, example_labels tf.Output, sparse_indices []tf.Output, sparse_weights []tf.Output, dense_weights []tf.Output, example_state_data tf.Output, loss_type string, l1 float32, l2 float32, num_loss_partitions int64, num_inner_iterations int64, optional ...SdcaOptimizerV2Attr) (out_example_state_data tf.Output, out_delta_sparse_weights []tf.Output, out_delta_dense_weights []tf.Output) {
+// Returns 3-D with shape `[height, width, channels]`. RGB order
+func DecodeBmp(scope *Scope, contents tf.Output, optional ...DecodeBmpAttr) (image tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"loss_type": loss_type, "l1": l1, "l2": l2, "num_loss_partitions": num_loss_partitions, "num_inner_iterations": num_inner_iterations}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SdcaOptimizerV2",
+		Type: "DecodeBmp",
 		Input: []tf.Input{
-			tf.OutputList(sparse_example_indices), tf.OutputList(sparse_feature_indices), tf.OutputList(sparse_feature_values), tf.OutputList(dense_features), example_weights, example_labels, tf.OutputList(sparse_indices), tf.OutputList(sparse_weights), tf.OutputList(dense_weights), example_state_data,
+			contents,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	out_example_state_data = op.Output(idx)
-	if out_delta_sparse_weights, idx, err = makeOutputList(op, idx, "out_delta_sparse_weights"); err != nil {
-		scope.UpdateErr("SdcaOptimizerV2", err)
-		return
-	}
-	if out_delta_dense_weights, idx, err = makeOutputList(op, idx, "out_delta_dense_weights"); err != nil {
-		scope.UpdateErr("SdcaOptimizerV2", err)
-		return
+	return op.Output(0)
+}
+
+// BatchMatMulAttr is an optional argument to BatchMatMul.
+type BatchMatMulAttr func(optionalAttr)
+
+// BatchMatMulAdjX sets the optional adj_x attribute to value.
+//
+// value: If `True`, adjoint the slices of `x`. Defaults to `False`.
+// If not specified, defaults to false
+func BatchMatMulAdjX(value bool) BatchMatMulAttr {
+	return func(m optionalAttr) {
+		m["adj_x"] = value
 	}
-	return out_example_state_data, out_delta_sparse_weights, out_delta_dense_weights
 }
 
-// Computes the minimum along segments of a tensor.
+// BatchMatMulAdjY sets the optional adj_y attribute to value.
 //
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#segmentation)
-// for an explanation of segments.
+// value: If `True`, adjoint the slices of `y`. Defaults to `False`.
+// If not specified, defaults to false
+func BatchMatMulAdjY(value bool) BatchMatMulAttr {
+	return func(m optionalAttr) {
+		m["adj_y"] = value
+	}
+}
+
+// Multiplies slices of two tensors in batches.
 //
-// This operator is similar to the unsorted segment sum operator found
-// [(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
-// Instead of computing the sum over segments, it computes the minimum such that:
+// Multiplies all slices of `Tensor` `x` and `y` (each slice can be
+// viewed as an element of a batch), and arranges the individual results
+// in a single output tensor of the same batch size. Each of the
+// individual slices can optionally be adjointed (to adjoint a matrix
+// means to transpose and conjugate it) before multiplication by setting
+// the `adj_x` or `adj_y` flag to `True`, which are by default `False`.
 //
-// \\(output_i = \min_{j...} data_[j...]\\) where min is over tuples `j...` such
-// that `segment_ids[j...] == i`.
+// The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]`
+// and `[..., r_y, c_y]`.
 //
-// If the minimum is empty for a given segment ID `i`, it outputs the largest
-// possible value for the specific numeric type,
-// `output[i] = numeric_limits<T>::max()`.
+// The output tensor is 2-D or higher with shape `[..., r_o, c_o]`, where:
 //
-// If the given segment ID `i` is negative, then the corresponding value is
-// dropped, and will not be included in the result.
+//     r_o = c_x if adj_x else r_x
+//     c_o = r_y if adj_y else c_y
 //
-// Arguments:
+// It is computed as:
 //
-//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
+//     output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :])
 //
+// Arguments:
+//	x: 2-D or higher with shape `[..., r_x, c_x]`.
+//	y: 2-D or higher with shape `[..., r_y, c_y]`.
 //
-// Returns Has same shape as data, except for the first `segment_ids.rank`
-// dimensions, which are replaced with a single dimension which has size
-// `num_segments`.
-func UnsortedSegmentMin(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+// Returns 3-D or higher with shape `[..., r_o, c_o]`
+func BatchMatMul(scope *Scope, x tf.Output, y tf.Output, optional ...BatchMatMulAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "UnsortedSegmentMin",
+		Type: "BatchMatMul",
 		Input: []tf.Input{
-			data, segment_ids, num_segments,
+			x, y,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes rectified linear gradients for a Relu operation.
-//
-// Arguments:
-//	gradients: The backpropagated gradients to the corresponding Relu operation.
-//	features: The features passed as input to the corresponding Relu operation, OR
-// the outputs of that operation (both work equivalently).
+// Returns which elements of x are NaN.
 //
-// Returns `gradients * (features > 0)`.
-func ReluGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.isnan
+// @end_compatibility
+func IsNan(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReluGrad",
+		Type: "IsNan",
 		Input: []tf.Input{
-			gradients, features,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the gradient of morphological 2-D dilation with respect to the input.
-//
-// Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
-//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
-//	out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`.
-//	strides: 1-D of length 4. The stride of the sliding window for each dimension of
-// the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
-//	rates: 1-D of length 4. The input stride for atrous morphological dilation.
-// Must be: `[1, rate_height, rate_width, 1]`.
-//	padding: The type of padding algorithm to use.
+// Identity op for gradient debugging.
 //
-// Returns 4-D with shape `[batch, in_height, in_width, depth]`.
-func Dilation2DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, rates []int64, padding string) (in_backprop tf.Output) {
+// This op is hidden from public in Python. It is used by TensorFlow Debugger to
+// register gradient tensors for gradient debugging.
+// This op operates on non-reference-type tensors.
+func DebugGradientIdentity(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "Dilation2DBackpropInput",
+		Type: "DebugGradientIdentity",
 		Input: []tf.Input{
-			input, filter, out_backprop,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Compute the polygamma function \\(\psi^{(n)}(x)\\).
-//
-// The polygamma function is defined as:
-//
-//
-// \\(\psi^{(n)}(x) = \frac{d^n}{dx^n} \psi(x)\\)
+// ResourceSparseApplyAdadeltaAttr is an optional argument to ResourceSparseApplyAdadelta.
+type ResourceSparseApplyAdadeltaAttr func(optionalAttr)
+
+// ResourceSparseApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
 //
-// where \\(\psi(x)\\) is the digamma function.
-func Polygamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceSparseApplyAdadeltaUseLocking(value bool) ResourceSparseApplyAdadeltaAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// var: Should be from a Variable().
+//
+// Arguments:
+//
+//	accum: Should be from a Variable().
+//	accum_update: : Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	rho: Decay factor. Must be a scalar.
+//	epsilon: Constant factor. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//
+// Returns the created operation.
+func ResourceSparseApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdadeltaAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Polygamma",
+		Type: "ResourceSparseApplyAdadelta",
 		Input: []tf.Input{
-			a, x,
+			var_, accum, accum_update, lr, rho, epsilon, grad, indices,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Computes second-order gradients of the maxpooling function.
+// Gets next element for the provided shard number.
 //
 // Arguments:
-//	input: The original input.
-//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t. the
-// input of `max_pool`.
-//	argmax: The indices of the maximum values chosen for each output of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+//	multi_device_iterator: A MultiDeviceIterator resource.
+//	shard_num: Integer representing which shard to fetch data for.
+//	incarnation_id: Which incarnation of the MultiDeviceIterator is running.
+//	output_types: The type list for the return values.
+//	output_shapes: The list of shapes being produced.
 //
-// Returns Gradients of gradients w.r.t. the input of `max_pool`.
-func MaxPoolGradGradWithArgmax(scope *Scope, input tf.Output, grad tf.Output, argmax tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output) {
+// Returns Result of the get_next on the dataset.
+func MultiDeviceIteratorGetNextFromShard(scope *Scope, multi_device_iterator tf.Output, shard_num tf.Output, incarnation_id tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolGradGradWithArgmax",
+		Type: "MultiDeviceIteratorGetNextFromShard",
 		Input: []tf.Input{
-			input, grad, argmax,
+			multi_device_iterator, shard_num, incarnation_id,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("MultiDeviceIteratorGetNextFromShard", err)
+		return
+	}
+	return components
 }
 
-// MaxPoolGradGradV2Attr is an optional argument to MaxPoolGradGradV2.
-type MaxPoolGradGradV2Attr func(optionalAttr)
+// LeakyReluGradAttr is an optional argument to LeakyReluGrad.
+type LeakyReluGradAttr func(optionalAttr)
 
-// MaxPoolGradGradV2DataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolGradGradV2DataFormat(value string) MaxPoolGradGradV2Attr {
+// LeakyReluGradAlpha sets the optional alpha attribute to value.
+// If not specified, defaults to 0.2
+func LeakyReluGradAlpha(value float32) LeakyReluGradAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["alpha"] = value
 	}
 }
 
-// Computes second-order gradients of the maxpooling function.
+// Computes rectified linear gradients for a LeakyRelu operation.
 //
 // Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: 4-D.  Gradients of gradients w.r.t. the input of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+//	gradients: The backpropagated gradients to the corresponding LeakyRelu operation.
+//	features: The features passed as input to the corresponding LeakyRelu operation,
+// OR the outputs of that operation (both work equivalently).
 //
-// Returns Gradients of gradients w.r.t. the input to `max_pool`.
-func MaxPoolGradGradV2(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolGradGradV2Attr) (output tf.Output) {
+// Returns `gradients * (features > 0) + alpha * gradients * (featurs <= 0)`.
+func LeakyReluGrad(scope *Scope, gradients tf.Output, features tf.Output, optional ...LeakyReluGradAttr) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolGradGradV2",
+		Type: "LeakyReluGrad",
 		Input: []tf.Input{
-			orig_input, orig_output, grad, ksize, strides,
+			gradients, features,
 		},
 		Attrs: attrs,
 	}
@@ -8363,64 +8308,36 @@ func MaxPoolGradGradV2(scope *Scope, orig_input tf.Output, orig_output tf.Output
 	return op.Output(0)
 }
 
-// Computes gradients of the maxpooling function.
-//
-// Arguments:
-//	input: The original input.
-//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t. the
-// output of `max_pool`.
-//	argmax: The indices of the maximum values chosen for each output of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+// Deprecated. Use TensorArrayGradV3
 //
-// Returns Gradients w.r.t. the input of `max_pool`.
-func MaxPoolGradWithArgmax(scope *Scope, input tf.Output, grad tf.Output, argmax tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output) {
+// DEPRECATED at GraphDef version 26: Use TensorArrayWriteV3
+func TensorArrayWriteV2(scope *Scope, handle tf.Output, index tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolGradWithArgmax",
+		Type: "TensorArrayWriteV2",
 		Input: []tf.Input{
-			input, grad, argmax,
+			handle, index, value, flow_in,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MutexV2Attr is an optional argument to MutexV2.
-type MutexV2Attr func(optionalAttr)
-
-// MutexV2Container sets the optional container attribute to value.
-//
-// value: If non-empty, this variable is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func MutexV2Container(value string) MutexV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
+// LeakyReluAttr is an optional argument to LeakyRelu.
+type LeakyReluAttr func(optionalAttr)
 
-// MutexV2SharedName sets the optional shared_name attribute to value.
-//
-// value: If non-empty, this variable is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func MutexV2SharedName(value string) MutexV2Attr {
+// LeakyReluAlpha sets the optional alpha attribute to value.
+// If not specified, defaults to 0.2
+func LeakyReluAlpha(value float32) LeakyReluAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["alpha"] = value
 	}
 }
 
-// Creates a Mutex resource that can be locked by `MutexLock`.
-//
-// Returns The mutex resource.
-func MutexV2(scope *Scope, optional ...MutexV2Attr) (resource tf.Output) {
+// Computes rectified linear: `max(features, features * alpha)`.
+func LeakyRelu(scope *Scope, features tf.Output, optional ...LeakyReluAttr) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -8429,110 +8346,107 @@ func MutexV2(scope *Scope, optional ...MutexV2Attr) (resource tf.Output) {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MutexV2",
-
+		Type: "LeakyRelu",
+		Input: []tf.Input{
+			features,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// AvgPool3DAttr is an optional argument to AvgPool3D.
-type AvgPool3DAttr func(optionalAttr)
-
-// AvgPool3DDataFormat sets the optional data_format attribute to value.
-//
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func AvgPool3DDataFormat(value string) AvgPool3DAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Performs 3D average pooling on the input.
-//
-// Arguments:
-//	input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-//
-// Returns The average pooled output tensor.
-func AvgPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPool3DAttr) (output tf.Output) {
+// Computes rectified linear 6: `min(max(features, 0), 6)`.
+func Relu6(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "AvgPool3D",
+		Type: "Relu6",
 		Input: []tf.Input{
-			input,
+			features,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns element-wise remainder of division. This emulates C semantics in that
-//
-// the result here is consistent with a truncating divide. E.g.
-// `tf.truncatediv(x, y) * y + truncate_mod(x, y) = x`.
+// SdcaOptimizerV2Attr is an optional argument to SdcaOptimizerV2.
+type SdcaOptimizerV2Attr func(optionalAttr)
+
+// SdcaOptimizerV2Adaptive sets the optional adaptive attribute to value.
 //
-// *NOTE*: `Mod` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Mod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Mod",
-		Input: []tf.Input{
-			x, y,
-		},
+// value: Whether to use Adaptive SDCA for the inner loop.
+// If not specified, defaults to true
+func SdcaOptimizerV2Adaptive(value bool) SdcaOptimizerV2Attr {
+	return func(m optionalAttr) {
+		m["adaptive"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes offsets of concat inputs within its output.
+// Distributed version of Stochastic Dual Coordinate Ascent (SDCA) optimizer for
 //
-// For example:
+// linear models with L1 + L2 regularization. As global optimization objective is
+// strongly-convex, the optimizer optimizes the dual objective at each step. The
+// optimizer applies each update one example at a time. Examples are sampled
+// uniformly, and the optimizer is learning rate free and enjoys linear convergence
+// rate.
 //
-// ```
-// # 'x' is [2, 2, 7]
-// # 'y' is [2, 3, 7]
-// # 'z' is [2, 5, 7]
-// concat_offset(2, [x, y, z]) => [0, 0, 0], [0, 2, 0], [0, 5, 0]
-// ```
+// [Proximal Stochastic Dual Coordinate Ascent](http://arxiv.org/pdf/1211.2717v1.pdf).<br>
+// Shai Shalev-Shwartz, Tong Zhang. 2012
 //
-// This is typically used by gradient computations for a concat operation.
+// $$Loss Objective = \sum f_{i} (wx_{i}) + (l2 / 2) * |w|^2 + l1 * |w|$$
+//
+// [Adding vs. Averaging in Distributed Primal-Dual Optimization](http://arxiv.org/abs/1502.03508).<br>
+// Chenxin Ma, Virginia Smith, Martin Jaggi, Michael I. Jordan,
+// Peter Richtarik, Martin Takac. 2015
+//
+// [Stochastic Dual Coordinate Ascent with Adaptive Probabilities](https://arxiv.org/abs/1502.08053).<br>
+// Dominik Csiba, Zheng Qu, Peter Richtarik. 2015
 //
 // Arguments:
-//	concat_dim: The dimension along which to concatenate.
-//	shape: The `N` int32 vectors representing shape of tensors being concatenated.
+//	sparse_example_indices: a list of vectors which contain example indices.
+//	sparse_feature_indices: a list of vectors which contain feature indices.
+//	sparse_feature_values: a list of vectors which contains feature value
+// associated with each feature group.
+//	dense_features: a list of matrices which contains the dense feature values.
+//	example_weights: a vector which contains the weight associated with each
+// example.
+//	example_labels: a vector which contains the label/target associated with each
+// example.
+//	sparse_indices: a list of vectors where each value is the indices which has
+// corresponding weights in sparse_weights. This field maybe omitted for the
+// dense approach.
+//	sparse_weights: a list of vectors where each value is the weight associated with
+// a sparse feature group.
+//	dense_weights: a list of vectors where the values are the weights associated
+// with a dense feature group.
+//	example_state_data: a list of vectors containing the example state data.
+//	loss_type: Type of the primal loss. Currently SdcaSolver supports logistic,
+// squared and hinge losses.
+//	l1: Symmetric l1 regularization strength.
+//	l2: Symmetric l2 regularization strength.
+//	num_loss_partitions: Number of partitions of the global loss function.
+//	num_inner_iterations: Number of iterations per mini-batch.
 //
-// Returns The `N` int32 vectors representing the starting offset
-// of input tensors within the concatenated output.
-func ConcatOffset(scope *Scope, concat_dim tf.Output, shape []tf.Output) (offset []tf.Output) {
+// Returns a list of vectors containing the updated example state
+// data.a list of vectors where each value is the delta
+// weights associated with a sparse feature group.a list of vectors where the values are the delta
+// weights associated with a dense feature group.
+func SdcaOptimizerV2(scope *Scope, sparse_example_indices []tf.Output, sparse_feature_indices []tf.Output, sparse_feature_values []tf.Output, dense_features []tf.Output, example_weights tf.Output, example_labels tf.Output, sparse_indices []tf.Output, sparse_weights []tf.Output, dense_weights []tf.Output, example_state_data tf.Output, loss_type string, l1 float32, l2 float32, num_loss_partitions int64, num_inner_iterations int64, optional ...SdcaOptimizerV2Attr) (out_example_state_data tf.Output, out_delta_sparse_weights []tf.Output, out_delta_dense_weights []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"loss_type": loss_type, "l1": l1, "l2": l2, "num_loss_partitions": num_loss_partitions, "num_inner_iterations": num_inner_iterations}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ConcatOffset",
+		Type: "SdcaOptimizerV2",
 		Input: []tf.Input{
-			concat_dim, tf.OutputList(shape),
+			tf.OutputList(sparse_example_indices), tf.OutputList(sparse_feature_indices), tf.OutputList(sparse_feature_values), tf.OutputList(dense_features), example_weights, example_labels, tf.OutputList(sparse_indices), tf.OutputList(sparse_weights), tf.OutputList(dense_weights), example_state_data,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	if scope.Err() != nil {
@@ -8540,160 +8454,104 @@ func ConcatOffset(scope *Scope, concat_dim tf.Output, shape []tf.Output) (offset
 	}
 	var idx int
 	var err error
-	if offset, idx, err = makeOutputList(op, idx, "offset"); err != nil {
-		scope.UpdateErr("ConcatOffset", err)
+	out_example_state_data = op.Output(idx)
+	if out_delta_sparse_weights, idx, err = makeOutputList(op, idx, "out_delta_sparse_weights"); err != nil {
+		scope.UpdateErr("SdcaOptimizerV2", err)
 		return
 	}
-	return offset
+	if out_delta_dense_weights, idx, err = makeOutputList(op, idx, "out_delta_dense_weights"); err != nil {
+		scope.UpdateErr("SdcaOptimizerV2", err)
+		return
+	}
+	return out_example_state_data, out_delta_sparse_weights, out_delta_dense_weights
 }
 
-// Compute the lower regularized incomplete Gamma function `P(a, x)`.
+// Computes the minimum along segments of a tensor.
 //
-// The lower regularized incomplete Gamma function is defined as:
+// Read
+// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#segmentation)
+// for an explanation of segments.
 //
+// This operator is similar to the unsorted segment sum operator found
+// [(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
+// Instead of computing the sum over segments, it computes the minimum such that:
 //
-// \\(P(a, x) = gamma(a, x) / Gamma(a) = 1 - Q(a, x)\\)
+// \\(output_i = \min_{j...} data_[j...]\\) where min is over tuples `j...` such
+// that `segment_ids[j...] == i`.
 //
-// where
+// If the minimum is empty for a given segment ID `i`, it outputs the largest
+// possible value for the specific numeric type,
+// `output[i] = numeric_limits<T>::max()`.
 //
-// \\(gamma(a, x) = \\int_{0}^{x} t^{a-1} exp(-t) dt\\)
+// If the given segment ID `i` is negative, then the corresponding value is
+// dropped, and will not be included in the result.
 //
-// is the lower incomplete Gamma function.
+// Arguments:
 //
-// Note, above `Q(a, x)` (`Igammac`) is the upper regularized complete
-// Gamma function.
-func Igamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
+//
+//
+// Returns Has same shape as data, except for the first `segment_ids.rank`
+// dimensions, which are replaced with a single dimension which has size
+// `num_segments`.
+func UnsortedSegmentMin(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Igamma",
+		Type: "UnsortedSegmentMin",
 		Input: []tf.Input{
-			a, x,
+			data, segment_ids, num_segments,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DepthToSpaceAttr is an optional argument to DepthToSpace.
-type DepthToSpaceAttr func(optionalAttr)
-
-// DepthToSpaceDataFormat sets the optional data_format attribute to value.
-// If not specified, defaults to "NHWC"
-func DepthToSpaceDataFormat(value string) DepthToSpaceAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
+// Computes rectified linear gradients for a Relu operation.
+//
+// Arguments:
+//	gradients: The backpropagated gradients to the corresponding Relu operation.
+//	features: The features passed as input to the corresponding Relu operation, OR
+// the outputs of that operation (both work equivalently).
+//
+// Returns `gradients * (features > 0)`.
+func ReluGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReluGrad",
+		Input: []tf.Input{
+			gradients, features,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// DepthToSpace for tensors of type T.
-//
-// Rearranges data from depth into blocks of spatial data.
-// This is the reverse transformation of SpaceToDepth. More specifically,
-// this op outputs a copy of the input tensor where values from the `depth`
-// dimension are moved in spatial blocks to the `height` and `width` dimensions.
-// The attr `block_size` indicates the input block size and how the data is moved.
-//
-//   * Chunks of data of size `block_size * block_size` from depth are rearranged
-//     into non-overlapping blocks of size `block_size x block_size`
-//   * The width the output tensor is `input_depth * block_size`, whereas the
-//     height is `input_height * block_size`.
-//   * The Y, X coordinates within each block of the output image are determined
-//     by the high order component of the input channel index.
-//   * The depth of the input tensor must be divisible by
-//     `block_size * block_size`.
-//
-// The `data_format` attr specifies the layout of the input and output tensors
-// with the following options:
-//   "NHWC": `[ batch, height, width, channels ]`
-//   "NCHW": `[ batch, channels, height, width ]`
-//   "NCHW_VECT_C":
-//       `qint8 [ batch, channels / 4, height, width, 4 ]`
-//
-// It is useful to consider the operation as transforming a 6-D Tensor.
-// e.g. for data_format = NHWC,
-//      Each element in the input tensor can be specified via 6 coordinates,
-//      ordered by decreasing memory layout significance as:
-//      n,iY,iX,bY,bX,oC  (where n=batch index, iX, iY means X or Y coordinates
-//                         within the input image, bX, bY means coordinates
-//                         within the output block, oC means output channels).
-//      The output would be the input transposed to the following layout:
-//      n,iY,bY,iX,bX,oC
-//
-// This operation is useful for resizing the activations between convolutions
-// (but keeping all data), e.g. instead of pooling. It is also useful for training
-// purely convolutional models.
-//
-// For example, given an input of shape `[1, 1, 1, 4]`, data_format = "NHWC" and
-// block_size = 2:
-//
-// ```
-// x = [[[[1, 2, 3, 4]]]]
-//
-// ```
-//
-// This operation will output a tensor of shape `[1, 2, 2, 1]`:
-//
-// ```
-//    [[[[1], [2]],
-//      [[3], [4]]]]
-// ```
-//
-// Here, the input has a batch of 1 and each batch element has shape `[1, 1, 4]`,
-// the corresponding output will have 2x2 elements and will have a depth of
-// 1 channel (1 = `4 / (block_size * block_size)`).
-// The output element shape is `[2, 2, 1]`.
-//
-// For an input tensor with larger depth, here of shape `[1, 1, 1, 12]`, e.g.
-//
-// ```
-// x = [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]
-// ```
-//
-// This operation, for block size of 2, will return the following tensor of shape
-// `[1, 2, 2, 3]`
-//
-// ```
-//    [[[[1, 2, 3], [4, 5, 6]],
-//      [[7, 8, 9], [10, 11, 12]]]]
-//
-// ```
-//
-// Similarly, for the following input of shape `[1 2 2 4]`, and a block size of 2:
-//
-// ```
-// x =  [[[[1, 2, 3, 4],
-//        [5, 6, 7, 8]],
-//       [[9, 10, 11, 12],
-//        [13, 14, 15, 16]]]]
-// ```
-//
-// the operator will return the following tensor of shape `[1 4 4 1]`:
-//
-// ```
-// x = [[[ [1],   [2],  [5],  [6]],
-//       [ [3],   [4],  [7],  [8]],
-//       [ [9],  [10], [13],  [14]],
-//       [ [11], [12], [15],  [16]]]]
-//
-// ```
+// Computes the gradient of morphological 2-D dilation with respect to the input.
 //
 // Arguments:
+//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
+//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
+//	out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`.
+//	strides: 1-D of length 4. The stride of the sliding window for each dimension of
+// the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
+//	rates: 1-D of length 4. The input stride for atrous morphological dilation.
+// Must be: `[1, rate_height, rate_width, 1]`.
+//	padding: The type of padding algorithm to use.
 //
-//	block_size: The size of the spatial block, same as in Space2Depth.
-func DepthToSpace(scope *Scope, input tf.Output, block_size int64, optional ...DepthToSpaceAttr) (output tf.Output) {
+// Returns 4-D with shape `[batch, in_height, in_width, depth]`.
+func Dilation2DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, rates []int64, padding string) (in_backprop tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"block_size": block_size}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "DepthToSpace",
+		Type: "Dilation2DBackpropInput",
 		Input: []tf.Input{
-			input,
+			input, filter, out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -8701,122 +8559,98 @@ func DepthToSpace(scope *Scope, input tf.Output, block_size int64, optional ...D
 	return op.Output(0)
 }
 
-// Conv3DBackpropInputV2Attr is an optional argument to Conv3DBackpropInputV2.
-type Conv3DBackpropInputV2Attr func(optionalAttr)
-
-// Conv3DBackpropInputV2DataFormat sets the optional data_format attribute to value.
+// Compute the polygamma function \\(\psi^{(n)}(x)\\).
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Conv3DBackpropInputV2Dilations sets the optional dilations attribute to value.
+// The polygamma function is defined as:
 //
-// value: 1-D tensor of length 5.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each
-// filter element on that dimension. The dimension order is determined by the
-// value of `data_format`, see above for details. Dilations in the batch and
-// depth dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
-func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
-	}
-}
-
-// Computes the gradients of 3-D convolution with respect to the input.
 //
-// Arguments:
-//	input_sizes: An integer vector representing the tensor shape of `input`,
-// where `input` is a 5-D
-// `[batch, depth, rows, cols, in_channels]` tensor.
-//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
-// `in_channels` must match between `input` and `filter`.
-//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
-// out_channels]`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3DBackpropInputV2(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropInputV2Attr) (output tf.Output) {
+// \\(\psi^{(n)}(x) = \frac{d^n}{dx^n} \psi(x)\\)
+//
+// where \\(\psi(x)\\) is the digamma function.
+func Polygamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Conv3DBackpropInputV2",
+		Type: "Polygamma",
 		Input: []tf.Input{
-			input_sizes, filter, out_backprop,
+			a, x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes square root of x element-wise.
+// Computes second-order gradients of the maxpooling function.
 //
-// I.e., \\(y = \sqrt{x} = x^{1/2}\\).
-func Sqrt(scope *Scope, x tf.Output) (y tf.Output) {
+// Arguments:
+//	input: The original input.
+//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t. the
+// input of `max_pool`.
+//	argmax: The indices of the maximum values chosen for each output of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns Gradients of gradients w.r.t. the input of `max_pool`.
+func MaxPoolGradGradWithArgmax(scope *Scope, input tf.Output, grad tf.Output, argmax tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "Sqrt",
+		Type: "MaxPoolGradGradWithArgmax",
 		Input: []tf.Input{
-			x,
+			input, grad, argmax,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Conv3DBackpropFilterAttr is an optional argument to Conv3DBackpropFilter.
-type Conv3DBackpropFilterAttr func(optionalAttr)
+// MaxPoolGradGradV2Attr is an optional argument to MaxPoolGradGradV2.
+type MaxPoolGradGradV2Attr func(optionalAttr)
 
-// Conv3DBackpropFilterDilations sets the optional dilations attribute to value.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
-func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr {
+// MaxPoolGradGradV2DataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolGradGradV2DataFormat(value string) MaxPoolGradGradV2Attr {
 	return func(m optionalAttr) {
-		m["dilations"] = value
+		m["data_format"] = value
 	}
 }
 
-// Computes the gradients of 3-D convolution with respect to the filter.
-//
-// DEPRECATED at GraphDef version 10: Use Conv3DBackpropFilterV2
+// Computes second-order gradients of the maxpooling function.
 //
 // Arguments:
-//	input: Shape `[batch, depth, rows, cols, in_channels]`.
-//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
-// `in_channels` must match between `input` and `filter`.
-//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
-// out_channels]`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: 4-D.  Gradients of gradients w.r.t. the input of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
 //	padding: The type of padding algorithm to use.
-func Conv3DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropFilterAttr) (output tf.Output) {
+//
+// Returns Gradients of gradients w.r.t. the input to `max_pool`.
+func MaxPoolGradGradV2(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolGradGradV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv3DBackpropFilter",
+		Type: "MaxPoolGradGradV2",
 		Input: []tf.Input{
-			input, filter, out_backprop,
+			orig_input, orig_output, grad, ksize, strides,
 		},
 		Attrs: attrs,
 	}
@@ -8824,142 +8658,120 @@ func Conv3DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_b
 	return op.Output(0)
 }
 
-// Computes the gradient for the rsqrt of `x` wrt its input.
+// Computes gradients of the maxpooling function.
 //
-// Specifically, `grad = dy * -0.5 * y^3`, where `y = rsqrt(x)`, and `dy`
-// is the corresponding input gradient.
-func RsqrtGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+// Arguments:
+//	input: The original input.
+//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t. the
+// output of `max_pool`.
+//	argmax: The indices of the maximum values chosen for each output of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns Gradients w.r.t. the input of `max_pool`.
+func MaxPoolGradWithArgmax(scope *Scope, input tf.Output, grad tf.Output, argmax tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "RsqrtGrad",
+		Type: "MaxPoolGradWithArgmax",
 		Input: []tf.Input{
-			y, dy,
+			input, grad, argmax,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DepthwiseConv2dNativeAttr is an optional argument to DepthwiseConv2dNative.
-type DepthwiseConv2dNativeAttr func(optionalAttr)
+// MutexV2Attr is an optional argument to MutexV2.
+type MutexV2Attr func(optionalAttr)
 
-// DepthwiseConv2dNativeDataFormat sets the optional data_format attribute to value.
+// MutexV2Container sets the optional container attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, height, width, channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, channels, height, width].
-// If not specified, defaults to "NHWC"
-func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr {
+// value: If non-empty, this variable is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func MutexV2Container(value string) MutexV2Attr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["container"] = value
 	}
 }
 
-// DepthwiseConv2dNativeDilations sets the optional dilations attribute to value.
+// MutexV2SharedName sets the optional shared_name attribute to value.
 //
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
-// element on that dimension. The dimension order is determined by the value of
-// `data_format`, see above for details. Dilations in the batch and depth
-// dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr {
+// value: If non-empty, this variable is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func MutexV2SharedName(value string) MutexV2Attr {
 	return func(m optionalAttr) {
-		m["dilations"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Computes a 2-D depthwise convolution given 4-D `input` and `filter` tensors.
-//
-// Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
-// and a filter / kernel tensor of shape
-// `[filter_height, filter_width, in_channels, channel_multiplier]`, containing
-// `in_channels` convolutional filters of depth 1, `depthwise_conv2d` applies
-// a different filter to each input channel (expanding from 1 channel to
-// `channel_multiplier` channels for each), then concatenates the results
-// together. Thus, the output has `in_channels * channel_multiplier` channels.
-//
-// ```
-// for k in 0..in_channels-1
-//   for q in 0..channel_multiplier-1
-//     output[b, i, j, k * channel_multiplier + q] =
-//       sum_{di, dj} input[b, strides[1] * i + di, strides[2] * j + dj, k] *
-//                         filter[di, dj, k, q]
-// ```
-//
-// Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
-// horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
-//
-// Arguments:
-//
+// Creates a Mutex resource that can be locked by `MutexLock`.
 //
-//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
-// of `input`.
-//	padding: The type of padding algorithm to use.
-func DepthwiseConv2dNative(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeAttr) (output tf.Output) {
+// Returns The mutex resource.
+func MutexV2(scope *Scope, optional ...MutexV2Attr) (resource tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DepthwiseConv2dNative",
-		Input: []tf.Input{
-			input, filter,
-		},
+		Type: "MutexV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MaxPoolGradV2Attr is an optional argument to MaxPoolGradV2.
-type MaxPoolGradV2Attr func(optionalAttr)
+// AvgPool3DAttr is an optional argument to AvgPool3D.
+type AvgPool3DAttr func(optionalAttr)
 
-// MaxPoolGradV2DataFormat sets the optional data_format attribute to value.
+// AvgPool3DDataFormat sets the optional data_format attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolGradV2DataFormat(value string) MaxPoolGradV2Attr {
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func AvgPool3DDataFormat(value string) AvgPool3DAttr {
 	return func(m optionalAttr) {
 		m["data_format"] = value
 	}
 }
 
-// Computes gradients of the maxpooling function.
+// Performs 3D average pooling on the input.
 //
 // Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: 4-D.  Gradients w.r.t. the output of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
+//	input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
 //	padding: The type of padding algorithm to use.
 //
-// Returns Gradients w.r.t. the input to `max_pool`.
-func MaxPoolGradV2(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolGradV2Attr) (output tf.Output) {
+// Returns The average pooled output tensor.
+func AvgPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPool3DAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"padding": padding}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolGradV2",
+		Type: "AvgPool3D",
 		Input: []tf.Input{
-			orig_input, orig_output, grad, ksize, strides,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -8967,291 +8779,216 @@ func MaxPoolGradV2(scope *Scope, orig_input tf.Output, orig_output tf.Output, gr
 	return op.Output(0)
 }
 
-// Restore a reader to a previously saved state.
-//
-// Not all Readers support being restored, so this can produce an
-// Unimplemented error.
+// Returns element-wise remainder of division. This emulates C semantics in that
 //
-// Arguments:
-//	reader_handle: Handle to a Reader.
-//	state: Result of a ReaderSerializeState of a Reader with type
-// matching reader_handle.
+// the result here is consistent with a truncating divide. E.g.
+// `tf.truncatediv(x, y) * y + truncate_mod(x, y) = x`.
 //
-// Returns the created operation.
-func ReaderRestoreStateV2(scope *Scope, reader_handle tf.Output, state tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ReaderRestoreStateV2",
-		Input: []tf.Input{
-			reader_handle, state,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// MaxPoolGradAttr is an optional argument to MaxPoolGrad.
-type MaxPoolGradAttr func(optionalAttr)
-
-// MaxPoolGradDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolGradDataFormat(value string) MaxPoolGradAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Computes gradients of the maxpooling function.
-//
-// Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: 4-D.  Gradients w.r.t. the output of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
-//
-// Returns Gradients w.r.t. the input to `max_pool`.
-func MaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradAttr) (output tf.Output) {
+// *NOTE*: `Mod` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Mod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolGrad",
+		Type: "Mod",
 		Input: []tf.Input{
-			orig_input, orig_output, grad,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// CropAndResizeAttr is an optional argument to CropAndResize.
-type CropAndResizeAttr func(optionalAttr)
-
-// CropAndResizeMethod sets the optional method attribute to value.
-//
-// value: A string specifying the sampling method for resizing. It can be either
-// `"bilinear"` or `"nearest"` and default to `"bilinear"`. Currently two sampling
-// methods are supported: Bilinear and Nearest Neighbor.
-// If not specified, defaults to "bilinear"
-func CropAndResizeMethod(value string) CropAndResizeAttr {
-	return func(m optionalAttr) {
-		m["method"] = value
-	}
-}
-
-// CropAndResizeExtrapolationValue sets the optional extrapolation_value attribute to value.
+// Computes offsets of concat inputs within its output.
 //
-// value: Value used for extrapolation, when applicable.
-// If not specified, defaults to 0
-func CropAndResizeExtrapolationValue(value float32) CropAndResizeAttr {
-	return func(m optionalAttr) {
-		m["extrapolation_value"] = value
-	}
-}
-
-// Extracts crops from the input image tensor and resizes them.
+// For example:
 //
-// Extracts crops from the input image tensor and resizes them using bilinear
-// sampling or nearest neighbor sampling (possibly with aspect ratio change) to a
-// common output size specified by `crop_size`. This is more general than the
-// `crop_to_bounding_box` op which extracts a fixed size slice from the input image
-// and does not allow resizing or aspect ratio change.
+// ```
+// # 'x' is [2, 2, 7]
+// # 'y' is [2, 3, 7]
+// # 'z' is [2, 5, 7]
+// concat_offset(2, [x, y, z]) => [0, 0, 0], [0, 2, 0], [0, 5, 0]
+// ```
 //
-// Returns a tensor with `crops` from the input `image` at positions defined at the
-// bounding box locations in `boxes`. The cropped boxes are all resized (with
-// bilinear or nearest neighbor interpolation) to a fixed
-// `size = [crop_height, crop_width]`. The result is a 4-D tensor
-// `[num_boxes, crop_height, crop_width, depth]`. The resizing is corner aligned.
-// In particular, if `boxes = [[0, 0, 1, 1]]`, the method will give identical
-// results to using `tf.image.resize_bilinear()` or
-// `tf.image.resize_nearest_neighbor()`(depends on the `method` argument) with
-// `align_corners=True`.
+// This is typically used by gradient computations for a concat operation.
 //
 // Arguments:
-//	image: A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
-// Both `image_height` and `image_width` need to be positive.
-//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
-// specifies the coordinates of a box in the `box_ind[i]` image and is specified
-// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
-// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
-// `[0, 1]` interval of normalized image height is mapped to
-// `[0, image_height - 1]` in image height coordinates. We do allow `y1` > `y2`, in
-// which case the sampled crop is an up-down flipped version of the original
-// image. The width dimension is treated similarly. Normalized coordinates
-// outside the `[0, 1]` range are allowed, in which case we use
-// `extrapolation_value` to extrapolate the input image values.
-//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
-// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
-//	crop_size: A 1-D tensor of 2 elements, `size = [crop_height, crop_width]`. All
-// cropped image patches are resized to this size. The aspect ratio of the image
-// content is not preserved. Both `crop_height` and `crop_width` need to be
-// positive.
+//	concat_dim: The dimension along which to concatenate.
+//	shape: The `N` int32 vectors representing shape of tensors being concatenated.
 //
-// Returns A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
-func CropAndResize(scope *Scope, image tf.Output, boxes tf.Output, box_ind tf.Output, crop_size tf.Output, optional ...CropAndResizeAttr) (crops tf.Output) {
+// Returns The `N` int32 vectors representing the starting offset
+// of input tensors within the concatenated output.
+func ConcatOffset(scope *Scope, concat_dim tf.Output, shape []tf.Output) (offset []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "CropAndResize",
+		Type: "ConcatOffset",
 		Input: []tf.Input{
-			image, boxes, box_ind, crop_size,
+			concat_dim, tf.OutputList(shape),
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Fills empty rows in the input 2-D `SparseTensor` with a default value.
-//
-// The input `SparseTensor` is represented via the tuple of inputs
-// (`indices`, `values`, `dense_shape`).  The output `SparseTensor` has the
-// same `dense_shape` but with indices `output_indices` and values
-// `output_values`.
-//
-// This op inserts a single entry for every row that doesn't have any values.
-// The index is created as `[row, 0, ..., 0]` and the inserted value
-// is `default_value`.
-//
-// For example, suppose `sp_input` has shape `[5, 6]` and non-empty values:
-//
-//     [0, 1]: a
-//     [0, 3]: b
-//     [2, 0]: c
-//     [3, 1]: d
-//
-// Rows 1 and 4 are empty, so the output will be of shape `[5, 6]` with values:
-//
-//     [0, 1]: a
-//     [0, 3]: b
-//     [1, 0]: default_value
-//     [2, 0]: c
-//     [3, 1]: d
-//     [4, 0]: default_value
-//
-// The output `SparseTensor` will be in row-major order and will have the
-// same shape as the input.
-//
-// This op also returns an indicator vector shaped `[dense_shape[0]]` such that
-//
-//     empty_row_indicator[i] = True iff row i was an empty row.
-//
-// And a reverse index map vector shaped `[indices.shape[0]]` that is used during
-// backpropagation,
-//
-//     reverse_index_map[j] = out_j s.t. indices[j, :] == output_indices[out_j, :]
-//
-// Arguments:
-//	indices: 2-D. the indices of the sparse tensor.
-//	values: 1-D. the values of the sparse tensor.
-//	dense_shape: 1-D. the shape of the sparse tensor.
-//	default_value: 0-D. default value to insert into location `[row, 0, ..., 0]`
-//   for rows missing from the input sparse tensor.
-// output indices: 2-D. the indices of the filled sparse tensor.
-//
-// Returns 1-D. the values of the filled sparse tensor.1-D. whether the dense row was missing in the
-// input sparse tensor.1-D. a map from the input indices to the output indices.
-func SparseFillEmptyRows(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output, default_value tf.Output) (output_indices tf.Output, output_values tf.Output, empty_row_indicator tf.Output, reverse_index_map tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "SparseFillEmptyRows",
-		Input: []tf.Input{
-			indices, values, dense_shape, default_value,
-		},
+	var idx int
+	var err error
+	if offset, idx, err = makeOutputList(op, idx, "offset"); err != nil {
+		scope.UpdateErr("ConcatOffset", err)
+		return
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
+	return offset
 }
 
-// Reduces `input` from `num_devices` using `reduction` to a single device.
+// Compute the lower regularized incomplete Gamma function `P(a, x)`.
 //
-// Reduces `input` from `num_devices` using `reduction` to a single device.
+// The lower regularized incomplete Gamma function is defined as:
 //
-// The graph should be constructed so that all inputs have a valid device
-// assignment, and the op itself is assigned one of these devices.
 //
-// input: The input to the reduction.
-// data: the value of the reduction across all `num_devices` devices.
-// reduction: the reduction operation to perform.
-func NcclReduce(scope *Scope, input []tf.Output, reduction string) (data tf.Output) {
+// \\(P(a, x) = gamma(a, x) / Gamma(a) = 1 - Q(a, x)\\)
+//
+// where
+//
+// \\(gamma(a, x) = \\int_{0}^{x} t^{a-1} exp(-t) dt\\)
+//
+// is the lower incomplete Gamma function.
+//
+// Note, above `Q(a, x)` (`Igammac`) is the upper regularized complete
+// Gamma function.
+func Igamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"reduction": reduction}
 	opspec := tf.OpSpec{
-		Type: "NcclReduce",
+		Type: "Igamma",
 		Input: []tf.Input{
-			tf.OutputList(input),
+			a, x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// BiasAddGradAttr is an optional argument to BiasAddGrad.
-type BiasAddGradAttr func(optionalAttr)
+// DepthToSpaceAttr is an optional argument to DepthToSpace.
+type DepthToSpaceAttr func(optionalAttr)
 
-// BiasAddGradDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the bias tensor will be added to the last dimension
-// of the value tensor.
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// The tensor will be added to "in_channels", the third-to-the-last
-//     dimension.
+// DepthToSpaceDataFormat sets the optional data_format attribute to value.
 // If not specified, defaults to "NHWC"
-func BiasAddGradDataFormat(value string) BiasAddGradAttr {
+func DepthToSpaceDataFormat(value string) DepthToSpaceAttr {
 	return func(m optionalAttr) {
 		m["data_format"] = value
 	}
 }
 
-// The backward operation for "BiasAdd" on the "bias" tensor.
-//
-// It accumulates all the values from out_backprop into the feature dimension.
-// For NHWC data format, the feature dimension is the last. For NCHW data format,
-// the feature dimension is the third-to-last.
+// DepthToSpace for tensors of type T.
 //
-// Arguments:
-//	out_backprop: Any number of dimensions.
+// Rearranges data from depth into blocks of spatial data.
+// This is the reverse transformation of SpaceToDepth. More specifically,
+// this op outputs a copy of the input tensor where values from the `depth`
+// dimension are moved in spatial blocks to the `height` and `width` dimensions.
+// The attr `block_size` indicates the input block size and how the data is moved.
 //
-// Returns 1-D with size the feature dimension of `out_backprop`.
-func BiasAddGrad(scope *Scope, out_backprop tf.Output, optional ...BiasAddGradAttr) (output tf.Output) {
+//   * Chunks of data of size `block_size * block_size` from depth are rearranged
+//     into non-overlapping blocks of size `block_size x block_size`
+//   * The width the output tensor is `input_depth * block_size`, whereas the
+//     height is `input_height * block_size`.
+//   * The Y, X coordinates within each block of the output image are determined
+//     by the high order component of the input channel index.
+//   * The depth of the input tensor must be divisible by
+//     `block_size * block_size`.
+//
+// The `data_format` attr specifies the layout of the input and output tensors
+// with the following options:
+//   "NHWC": `[ batch, height, width, channels ]`
+//   "NCHW": `[ batch, channels, height, width ]`
+//   "NCHW_VECT_C":
+//       `qint8 [ batch, channels / 4, height, width, 4 ]`
+//
+// It is useful to consider the operation as transforming a 6-D Tensor.
+// e.g. for data_format = NHWC,
+//      Each element in the input tensor can be specified via 6 coordinates,
+//      ordered by decreasing memory layout significance as:
+//      n,iY,iX,bY,bX,oC  (where n=batch index, iX, iY means X or Y coordinates
+//                         within the input image, bX, bY means coordinates
+//                         within the output block, oC means output channels).
+//      The output would be the input transposed to the following layout:
+//      n,iY,bY,iX,bX,oC
+//
+// This operation is useful for resizing the activations between convolutions
+// (but keeping all data), e.g. instead of pooling. It is also useful for training
+// purely convolutional models.
+//
+// For example, given an input of shape `[1, 1, 1, 4]`, data_format = "NHWC" and
+// block_size = 2:
+//
+// ```
+// x = [[[[1, 2, 3, 4]]]]
+//
+// ```
+//
+// This operation will output a tensor of shape `[1, 2, 2, 1]`:
+//
+// ```
+//    [[[[1], [2]],
+//      [[3], [4]]]]
+// ```
+//
+// Here, the input has a batch of 1 and each batch element has shape `[1, 1, 4]`,
+// the corresponding output will have 2x2 elements and will have a depth of
+// 1 channel (1 = `4 / (block_size * block_size)`).
+// The output element shape is `[2, 2, 1]`.
+//
+// For an input tensor with larger depth, here of shape `[1, 1, 1, 12]`, e.g.
+//
+// ```
+// x = [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]
+// ```
+//
+// This operation, for block size of 2, will return the following tensor of shape
+// `[1, 2, 2, 3]`
+//
+// ```
+//    [[[[1, 2, 3], [4, 5, 6]],
+//      [[7, 8, 9], [10, 11, 12]]]]
+//
+// ```
+//
+// Similarly, for the following input of shape `[1 2 2 4]`, and a block size of 2:
+//
+// ```
+// x =  [[[[1, 2, 3, 4],
+//        [5, 6, 7, 8]],
+//       [[9, 10, 11, 12],
+//        [13, 14, 15, 16]]]]
+// ```
+//
+// the operator will return the following tensor of shape `[1 4 4 1]`:
+//
+// ```
+// x = [[[ [1],   [2],  [5],  [6]],
+//       [ [3],   [4],  [7],  [8]],
+//       [ [9],  [10], [13],  [14]],
+//       [ [11], [12], [15],  [16]]]]
+//
+// ```
+//
+// Arguments:
+//
+//	block_size: The size of the spatial block, same as in Space2Depth.
+func DepthToSpace(scope *Scope, input tf.Output, block_size int64, optional ...DepthToSpaceAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"block_size": block_size}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "BiasAddGrad",
+		Type: "DepthToSpace",
 		Input: []tf.Input{
-			out_backprop,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -9259,244 +8996,217 @@ func BiasAddGrad(scope *Scope, out_backprop tf.Output, optional ...BiasAddGradAt
 	return op.Output(0)
 }
 
-// Returns 0 if x == 0, and x / y otherwise, elementwise.
-func Xdivy(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Xdivy",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// FusedBatchNormV2Attr is an optional argument to FusedBatchNormV2.
-type FusedBatchNormV2Attr func(optionalAttr)
-
-// FusedBatchNormV2Epsilon sets the optional epsilon attribute to value.
-//
-// value: A small float number added to the variance of x.
-// If not specified, defaults to 0.0001
-func FusedBatchNormV2Epsilon(value float32) FusedBatchNormV2Attr {
-	return func(m optionalAttr) {
-		m["epsilon"] = value
-	}
-}
+// Conv3DBackpropInputV2Attr is an optional argument to Conv3DBackpropInputV2.
+type Conv3DBackpropInputV2Attr func(optionalAttr)
 
-// FusedBatchNormV2DataFormat sets the optional data_format attribute to value.
+// Conv3DBackpropInputV2DataFormat sets the optional data_format attribute to value.
 //
-// value: The data format for x and y. Either "NHWC" (default) or "NCHW".
-// If not specified, defaults to "NHWC"
-func FusedBatchNormV2DataFormat(value string) FusedBatchNormV2Attr {
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr {
 	return func(m optionalAttr) {
 		m["data_format"] = value
 	}
 }
 
-// FusedBatchNormV2IsTraining sets the optional is_training attribute to value.
+// Conv3DBackpropInputV2Dilations sets the optional dilations attribute to value.
 //
-// value: A bool value to indicate the operation is for training (default)
-// or inference.
-// If not specified, defaults to true
-func FusedBatchNormV2IsTraining(value bool) FusedBatchNormV2Attr {
+// value: 1-D tensor of length 5.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
+func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr {
 	return func(m optionalAttr) {
-		m["is_training"] = value
+		m["dilations"] = value
 	}
 }
 
-// Batch normalization.
-//
-// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-// The size of 1D Tensors matches the dimension C of the 4D Tensors.
+// Computes the gradients of 3-D convolution with respect to the input.
 //
 // Arguments:
-//	x: A 4D Tensor for input data.
-//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
-//	offset: A 1D Tensor for offset, to shift to the normalized x.
-//	mean: A 1D Tensor for population mean. Used for inference only;
-// must be empty for training.
-//	variance: A 1D Tensor for population variance. Used for inference only;
-// must be empty for training.
-//
-// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow
-// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by
-// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused
-// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance
-// in the cuDNN case), to be reused in the gradient computation.
-func FusedBatchNormV2(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormV2Attr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) {
+//	input_sizes: An integer vector representing the tensor shape of `input`,
+// where `input` is a 5-D
+// `[batch, depth, rows, cols, in_channels]` tensor.
+//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
+// `in_channels` must match between `input` and `filter`.
+//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
+// out_channels]`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3DBackpropInputV2(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropInputV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FusedBatchNormV2",
+		Type: "Conv3DBackpropInputV2",
 		Input: []tf.Input{
-			x, scale, offset, mean, variance,
+			input_sizes, filter, out_backprop,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+	return op.Output(0)
 }
 
-// Reverses specific dimensions of a tensor.
-//
-// NOTE `tf.reverse` has now changed behavior in preparation for 1.0.
-// `tf.reverse_v2` is currently an alias that will be deprecated before TF 1.0.
-//
-// Given a `tensor`, and a `int32` tensor `axis` representing the set of
-// dimensions of `tensor` to reverse. This operation reverses each dimension
-// `i` for which there exists `j` s.t. `axis[j] == i`.
-//
-// `tensor` can have up to 8 dimensions. The number of dimensions specified
-// in `axis` may be 0 or more entries. If an index is specified more than
-// once, a InvalidArgument error is raised.
-//
-// For example:
-//
-// ```
-// # tensor 't' is [[[[ 0,  1,  2,  3],
-// #                  [ 4,  5,  6,  7],
-// #                  [ 8,  9, 10, 11]],
-// #                 [[12, 13, 14, 15],
-// #                  [16, 17, 18, 19],
-// #                  [20, 21, 22, 23]]]]
-// # tensor 't' shape is [1, 2, 3, 4]
-//
-// # 'dims' is [3] or 'dims' is [-1]
-// reverse(t, dims) ==> [[[[ 3,  2,  1,  0],
-//                         [ 7,  6,  5,  4],
-//                         [ 11, 10, 9, 8]],
-//                        [[15, 14, 13, 12],
-//                         [19, 18, 17, 16],
-//                         [23, 22, 21, 20]]]]
-//
-// # 'dims' is '[1]' (or 'dims' is '[-3]')
-// reverse(t, dims) ==> [[[[12, 13, 14, 15],
-//                         [16, 17, 18, 19],
-//                         [20, 21, 22, 23]
-//                        [[ 0,  1,  2,  3],
-//                         [ 4,  5,  6,  7],
-//                         [ 8,  9, 10, 11]]]]
-//
-// # 'dims' is '[2]' (or 'dims' is '[-2]')
-// reverse(t, dims) ==> [[[[8, 9, 10, 11],
-//                         [4, 5, 6, 7],
-//                         [0, 1, 2, 3]]
-//                        [[20, 21, 22, 23],
-//                         [16, 17, 18, 19],
-//                         [12, 13, 14, 15]]]]
-// ```
-//
-// Arguments:
-//	tensor: Up to 8-D.
-//	axis: 1-D. The indices of the dimensions to reverse. Must be in the range
-// `[-rank(tensor), rank(tensor))`.
+// Computes square root of x element-wise.
 //
-// Returns The same shape as `tensor`.
-func ReverseV2(scope *Scope, tensor tf.Output, axis tf.Output) (output tf.Output) {
+// I.e., \\(y = \sqrt{x} = x^{1/2}\\).
+func Sqrt(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReverseV2",
+		Type: "Sqrt",
 		Input: []tf.Input{
-			tensor, axis,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Adds `bias` to `value`.
-//
-// This is a deprecated version of BiasAdd and will be soon removed.
-//
-// This is a special case of `tf.add` where `bias` is restricted to be 1-D.
-// Broadcasting is supported, so `value` may have any number of dimensions.
+// Conv3DBackpropFilterAttr is an optional argument to Conv3DBackpropFilter.
+type Conv3DBackpropFilterAttr func(optionalAttr)
+
+// Conv3DBackpropFilterDilations sets the optional dilations attribute to value.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
+func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes the gradients of 3-D convolution with respect to the filter.
 //
-// Arguments:
-//	value: Any number of dimensions.
-//	bias: 1-D with size the last dimension of `value`.
+// DEPRECATED at GraphDef version 10: Use Conv3DBackpropFilterV2
 //
-// Returns Broadcasted sum of `value` and `bias`.
-func BiasAddV1(scope *Scope, value tf.Output, bias tf.Output) (output tf.Output) {
+// Arguments:
+//	input: Shape `[batch, depth, rows, cols, in_channels]`.
+//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
+// `in_channels` must match between `input` and `filter`.
+//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
+// out_channels]`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropFilterAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "BiasAddV1",
+		Type: "Conv3DBackpropFilter",
 		Input: []tf.Input{
-			value, bias,
+			input, filter, out_backprop,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Shuffle dimensions of x according to a permutation.
+// Computes the gradient for the rsqrt of `x` wrt its input.
 //
-// The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy:
-//   `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]`
-func Transpose(scope *Scope, x tf.Output, perm tf.Output) (y tf.Output) {
+// Specifically, `grad = dy * -0.5 * y^3`, where `y = rsqrt(x)`, and `dy`
+// is the corresponding input gradient.
+func RsqrtGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Transpose",
+		Type: "RsqrtGrad",
 		Input: []tf.Input{
-			x, perm,
+			y, dy,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MinAttr is an optional argument to Min.
-type MinAttr func(optionalAttr)
+// DepthwiseConv2dNativeAttr is an optional argument to DepthwiseConv2dNative.
+type DepthwiseConv2dNativeAttr func(optionalAttr)
 
-// MinKeepDims sets the optional keep_dims attribute to value.
+// DepthwiseConv2dNativeDataFormat sets the optional data_format attribute to value.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func MinKeepDims(value bool) MinAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, height, width, channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, channels, height, width].
+// If not specified, defaults to "NHWC"
+func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["data_format"] = value
 	}
 }
 
-// Computes the minimum of elements across dimensions of a tensor.
+// DepthwiseConv2dNativeDilations sets the optional dilations attribute to value.
 //
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes a 2-D depthwise convolution given 4-D `input` and `filter` tensors.
+//
+// Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
+// and a filter / kernel tensor of shape
+// `[filter_height, filter_width, in_channels, channel_multiplier]`, containing
+// `in_channels` convolutional filters of depth 1, `depthwise_conv2d` applies
+// a different filter to each input channel (expanding from 1 channel to
+// `channel_multiplier` channels for each), then concatenates the results
+// together. Thus, the output has `in_channels * channel_multiplier` channels.
+//
+// ```
+// for k in 0..in_channels-1
+//   for q in 0..channel_multiplier-1
+//     output[b, i, j, k * channel_multiplier + q] =
+//       sum_{di, dj} input[b, strides[1] * i + di, strides[2] * j + dj, k] *
+//                         filter[di, dj, k, q]
+// ```
+//
+// Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
+// horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
 //
 // Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
 //
-// Returns The reduced tensor.
-func Min(scope *Scope, input tf.Output, axis tf.Output, optional ...MinAttr) (output tf.Output) {
+//
+//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
+// of `input`.
+//	padding: The type of padding algorithm to use.
+func DepthwiseConv2dNative(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Min",
+		Type: "DepthwiseConv2dNative",
 		Input: []tf.Input{
-			input, axis,
+			input, filter,
 		},
 		Attrs: attrs,
 	}
@@ -9504,142 +9214,191 @@ func Min(scope *Scope, input tf.Output, axis tf.Output, optional ...MinAttr) (ou
 	return op.Output(0)
 }
 
-// Computes the Bessel i1e function of `x` element-wise.
+// MaxPoolGradV2Attr is an optional argument to MaxPoolGradV2.
+type MaxPoolGradV2Attr func(optionalAttr)
+
+// MaxPoolGradV2DataFormat sets the optional data_format attribute to value.
 //
-// Exponentially scaled modified Bessel function of order 0 defined as
-// `bessel_i1e(x) = exp(-abs(x)) bessel_i1(x)`.
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolGradV2DataFormat(value string) MaxPoolGradV2Attr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Computes gradients of the maxpooling function.
 //
-// This function is faster and numerically stabler than `bessel_i1(x)`.
-func BesselI1e(scope *Scope, x tf.Output) (y tf.Output) {
+// Arguments:
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: 4-D.  Gradients w.r.t. the output of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns Gradients w.r.t. the input to `max_pool`.
+func MaxPoolGradV2(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolGradV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "BesselI1e",
+		Type: "MaxPoolGradV2",
 		Input: []tf.Input{
-			x,
+			orig_input, orig_output, grad, ksize, strides,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MapClearAttr is an optional argument to MapClear.
-type MapClearAttr func(optionalAttr)
-
-// MapClearCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// Restore a reader to a previously saved state.
 //
-// REQUIRES: value >= 0
-func MapClearCapacity(value int64) MapClearAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// MapClearMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// Not all Readers support being restored, so this can produce an
+// Unimplemented error.
 //
-// REQUIRES: value >= 0
-func MapClearMemoryLimit(value int64) MapClearAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
+// Arguments:
+//	reader_handle: Handle to a Reader.
+//	state: Result of a ReaderSerializeState of a Reader with type
+// matching reader_handle.
+//
+// Returns the created operation.
+func ReaderRestoreStateV2(scope *Scope, reader_handle tf.Output, state tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// MapClearContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func MapClearContainer(value string) MapClearAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
+	opspec := tf.OpSpec{
+		Type: "ReaderRestoreStateV2",
+		Input: []tf.Input{
+			reader_handle, state,
+		},
 	}
+	return scope.AddOperation(opspec)
 }
 
-// MapClearSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapClearSharedName(value string) MapClearAttr {
+// MaxPoolGradAttr is an optional argument to MaxPoolGrad.
+type MaxPoolGradAttr func(optionalAttr)
+
+// MaxPoolGradDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolGradDataFormat(value string) MaxPoolGradAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["data_format"] = value
 	}
 }
 
-// Op removes all elements in the underlying container.
+// Computes gradients of the maxpooling function.
 //
-// Returns the created operation.
-func MapClear(scope *Scope, dtypes []tf.DataType, optional ...MapClearAttr) (o *tf.Operation) {
+// Arguments:
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: 4-D.  Gradients w.r.t. the output of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns Gradients w.r.t. the input to `max_pool`.
+func MaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MapClear",
-
+		Type: "MaxPoolGrad",
+		Input: []tf.Input{
+			orig_input, orig_output, grad,
+		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// DecodeCSVAttr is an optional argument to DecodeCSV.
-type DecodeCSVAttr func(optionalAttr)
+// CropAndResizeAttr is an optional argument to CropAndResize.
+type CropAndResizeAttr func(optionalAttr)
 
-// DecodeCSVFieldDelim sets the optional field_delim attribute to value.
+// CropAndResizeMethod sets the optional method attribute to value.
 //
-// value: char delimiter to separate fields in a record.
-// If not specified, defaults to ","
-func DecodeCSVFieldDelim(value string) DecodeCSVAttr {
+// value: A string specifying the sampling method for resizing. It can be either
+// `"bilinear"` or `"nearest"` and default to `"bilinear"`. Currently two sampling
+// methods are supported: Bilinear and Nearest Neighbor.
+// If not specified, defaults to "bilinear"
+func CropAndResizeMethod(value string) CropAndResizeAttr {
 	return func(m optionalAttr) {
-		m["field_delim"] = value
+		m["method"] = value
 	}
 }
 
-// DecodeCSVUseQuoteDelim sets the optional use_quote_delim attribute to value.
+// CropAndResizeExtrapolationValue sets the optional extrapolation_value attribute to value.
 //
-// value: If false, treats double quotation marks as regular
-// characters inside of the string fields (ignoring RFC 4180, Section 2,
-// Bullet 5).
-// If not specified, defaults to true
-func DecodeCSVUseQuoteDelim(value bool) DecodeCSVAttr {
+// value: Value used for extrapolation, when applicable.
+// If not specified, defaults to 0
+func CropAndResizeExtrapolationValue(value float32) CropAndResizeAttr {
 	return func(m optionalAttr) {
-		m["use_quote_delim"] = value
+		m["extrapolation_value"] = value
 	}
 }
 
-// DecodeCSVNaValue sets the optional na_value attribute to value.
+// Extracts crops from the input image tensor and resizes them.
 //
-// value: Additional string to recognize as NA/NaN.
-// If not specified, defaults to ""
-func DecodeCSVNaValue(value string) DecodeCSVAttr {
-	return func(m optionalAttr) {
-		m["na_value"] = value
-	}
-}
-
-// DecodeCSVSelectCols sets the optional select_cols attribute to value.
-// If not specified, defaults to <>
-func DecodeCSVSelectCols(value []int64) DecodeCSVAttr {
-	return func(m optionalAttr) {
-		m["select_cols"] = value
-	}
-}
-
-// Convert CSV records to tensors. Each column maps to one tensor.
+// Extracts crops from the input image tensor and resizes them using bilinear
+// sampling or nearest neighbor sampling (possibly with aspect ratio change) to a
+// common output size specified by `crop_size`. This is more general than the
+// `crop_to_bounding_box` op which extracts a fixed size slice from the input image
+// and does not allow resizing or aspect ratio change.
 //
-// RFC 4180 format is expected for the CSV records.
-// (https://tools.ietf.org/html/rfc4180)
-// Note that we allow leading and trailing spaces with int or float field.
+// Returns a tensor with `crops` from the input `image` at positions defined at the
+// bounding box locations in `boxes`. The cropped boxes are all resized (with
+// bilinear or nearest neighbor interpolation) to a fixed
+// `size = [crop_height, crop_width]`. The result is a 4-D tensor
+// `[num_boxes, crop_height, crop_width, depth]`. The resizing is corner aligned.
+// In particular, if `boxes = [[0, 0, 1, 1]]`, the method will give identical
+// results to using `tf.image.resize_bilinear()` or
+// `tf.image.resize_nearest_neighbor()`(depends on the `method` argument) with
+// `align_corners=True`.
 //
 // Arguments:
-//	records: Each string is a record/row in the csv and all records should have
-// the same format.
-//	record_defaults: One tensor per column of the input record, with either a
-// scalar default value for that column or an empty vector if the column is
-// required.
+//	image: A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
+// Both `image_height` and `image_width` need to be positive.
+//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
+// specifies the coordinates of a box in the `box_ind[i]` image and is specified
+// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
+// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
+// `[0, 1]` interval of normalized image height is mapped to
+// `[0, image_height - 1]` in image height coordinates. We do allow `y1` > `y2`, in
+// which case the sampled crop is an up-down flipped version of the original
+// image. The width dimension is treated similarly. Normalized coordinates
+// outside the `[0, 1]` range are allowed, in which case we use
+// `extrapolation_value` to extrapolate the input image values.
+//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
+// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
+//	crop_size: A 1-D tensor of 2 elements, `size = [crop_height, crop_width]`. All
+// cropped image patches are resized to this size. The aspect ratio of the image
+// content is not preserved. Both `crop_height` and `crop_width` need to be
+// positive.
 //
-// Returns Each tensor will have the same shape as records.
-func DecodeCSV(scope *Scope, records tf.Output, record_defaults []tf.Output, optional ...DecodeCSVAttr) (output []tf.Output) {
+// Returns A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
+func CropAndResize(scope *Scope, image tf.Output, boxes tf.Output, box_ind tf.Output, crop_size tf.Output, optional ...CropAndResizeAttr) (crops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -9648,96 +9407,135 @@ func DecodeCSV(scope *Scope, records tf.Output, record_defaults []tf.Output, opt
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeCSV",
+		Type: "CropAndResize",
 		Input: []tf.Input{
-			records, tf.OutputList(record_defaults),
+			image, boxes, box_ind, crop_size,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("DecodeCSV", err)
-		return
-	}
-	return output
+	return op.Output(0)
 }
 
-// Transforms a Tensor into a serialized TensorProto proto.
+// Fills empty rows in the input 2-D `SparseTensor` with a default value.
+//
+// The input `SparseTensor` is represented via the tuple of inputs
+// (`indices`, `values`, `dense_shape`).  The output `SparseTensor` has the
+// same `dense_shape` but with indices `output_indices` and values
+// `output_values`.
+//
+// This op inserts a single entry for every row that doesn't have any values.
+// The index is created as `[row, 0, ..., 0]` and the inserted value
+// is `default_value`.
+//
+// For example, suppose `sp_input` has shape `[5, 6]` and non-empty values:
+//
+//     [0, 1]: a
+//     [0, 3]: b
+//     [2, 0]: c
+//     [3, 1]: d
+//
+// Rows 1 and 4 are empty, so the output will be of shape `[5, 6]` with values:
+//
+//     [0, 1]: a
+//     [0, 3]: b
+//     [1, 0]: default_value
+//     [2, 0]: c
+//     [3, 1]: d
+//     [4, 0]: default_value
+//
+// The output `SparseTensor` will be in row-major order and will have the
+// same shape as the input.
+//
+// This op also returns an indicator vector shaped `[dense_shape[0]]` such that
+//
+//     empty_row_indicator[i] = True iff row i was an empty row.
+//
+// And a reverse index map vector shaped `[indices.shape[0]]` that is used during
+// backpropagation,
+//
+//     reverse_index_map[j] = out_j s.t. indices[j, :] == output_indices[out_j, :]
 //
 // Arguments:
-//	tensor: A Tensor of type `T`.
+//	indices: 2-D. the indices of the sparse tensor.
+//	values: 1-D. the values of the sparse tensor.
+//	dense_shape: 1-D. the shape of the sparse tensor.
+//	default_value: 0-D. default value to insert into location `[row, 0, ..., 0]`
+//   for rows missing from the input sparse tensor.
+// output indices: 2-D. the indices of the filled sparse tensor.
 //
-// Returns A serialized TensorProto proto of the input tensor.
-func SerializeTensor(scope *Scope, tensor tf.Output) (serialized tf.Output) {
+// Returns 1-D. the values of the filled sparse tensor.1-D. whether the dense row was missing in the
+// input sparse tensor.1-D. a map from the input indices to the output indices.
+func SparseFillEmptyRows(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output, default_value tf.Output) (output_indices tf.Output, output_values tf.Output, empty_row_indicator tf.Output, reverse_index_map tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SerializeTensor",
+		Type: "SparseFillEmptyRows",
 		Input: []tf.Input{
-			tensor,
+			indices, values, dense_shape, default_value,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
 }
 
-// Computes acos of x element-wise.
-func Acos(scope *Scope, x tf.Output) (y tf.Output) {
+// Reduces `input` from `num_devices` using `reduction` to a single device.
+//
+// Reduces `input` from `num_devices` using `reduction` to a single device.
+//
+// The graph should be constructed so that all inputs have a valid device
+// assignment, and the op itself is assigned one of these devices.
+//
+// input: The input to the reduction.
+// data: the value of the reduction across all `num_devices` devices.
+// reduction: the reduction operation to perform.
+func NcclReduce(scope *Scope, input []tf.Output, reduction string) (data tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"reduction": reduction}
 	opspec := tf.OpSpec{
-		Type: "Acos",
+		Type: "NcclReduce",
 		Input: []tf.Input{
-			x,
+			tf.OutputList(input),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// UnbatchGradAttr is an optional argument to UnbatchGrad.
-type UnbatchGradAttr func(optionalAttr)
-
-// UnbatchGradContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func UnbatchGradContainer(value string) UnbatchGradAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
+// BiasAddGradAttr is an optional argument to BiasAddGrad.
+type BiasAddGradAttr func(optionalAttr)
 
-// UnbatchGradSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func UnbatchGradSharedName(value string) UnbatchGradAttr {
+// BiasAddGradDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the bias tensor will be added to the last dimension
+// of the value tensor.
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// The tensor will be added to "in_channels", the third-to-the-last
+//     dimension.
+// If not specified, defaults to "NHWC"
+func BiasAddGradDataFormat(value string) BiasAddGradAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["data_format"] = value
 	}
 }
 
-// Gradient of Unbatch.
+// The backward operation for "BiasAdd" on the "bias" tensor.
 //
-// Acts like Batch but using the given batch_index index of batching things as they
-// become available. This ensures that the gradients are propagated back in the
-// same session which did the forward pass.
+// It accumulates all the values from out_backprop into the feature dimension.
+// For NHWC data format, the feature dimension is the last. For NCHW data format,
+// the feature dimension is the third-to-last.
 //
-// original_input: The input to the Unbatch operation this is the gradient of.
-// batch_index: The batch_index given to the Unbatch operation this is the gradient
-// of.
-// grad: The downstream gradient.
-// id: The id scalar emitted by Batch.
-// batched_grad: The return value, either an empty tensor or the batched gradient.
-// container: Container to control resource sharing.
-// shared_name: Instances of UnbatchGrad with the same container and shared_name
-//  are assumed to possibly belong to the same batch. If left empty, the op name
-//  will be used as the shared name.
-func UnbatchGrad(scope *Scope, original_input tf.Output, batch_index tf.Output, grad tf.Output, id tf.Output, optional ...UnbatchGradAttr) (batched_grad tf.Output) {
+// Arguments:
+//	out_backprop: Any number of dimensions.
+//
+// Returns 1-D with size the feature dimension of `out_backprop`.
+func BiasAddGrad(scope *Scope, out_backprop tf.Output, optional ...BiasAddGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -9746,9 +9544,9 @@ func UnbatchGrad(scope *Scope, original_input tf.Output, batch_index tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "UnbatchGrad",
+		Type: "BiasAddGrad",
 		Input: []tf.Input{
-			original_input, batch_index, grad, id,
+			out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -9756,265 +9554,233 @@ func UnbatchGrad(scope *Scope, original_input tf.Output, batch_index tf.Output,
 	return op.Output(0)
 }
 
-// AvgPool3DGradAttr is an optional argument to AvgPool3DGrad.
-type AvgPool3DGradAttr func(optionalAttr)
+// Returns 0 if x == 0, and x / y otherwise, elementwise.
+func Xdivy(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Xdivy",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
 
-// AvgPool3DGradDataFormat sets the optional data_format attribute to value.
+// FusedBatchNormV2Attr is an optional argument to FusedBatchNormV2.
+type FusedBatchNormV2Attr func(optionalAttr)
+
+// FusedBatchNormV2Epsilon sets the optional epsilon attribute to value.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func AvgPool3DGradDataFormat(value string) AvgPool3DGradAttr {
+// value: A small float number added to the variance of x.
+// If not specified, defaults to 0.0001
+func FusedBatchNormV2Epsilon(value float32) FusedBatchNormV2Attr {
+	return func(m optionalAttr) {
+		m["epsilon"] = value
+	}
+}
+
+// FusedBatchNormV2DataFormat sets the optional data_format attribute to value.
+//
+// value: The data format for x and y. Either "NHWC" (default) or "NCHW".
+// If not specified, defaults to "NHWC"
+func FusedBatchNormV2DataFormat(value string) FusedBatchNormV2Attr {
 	return func(m optionalAttr) {
 		m["data_format"] = value
 	}
 }
 
-// Computes gradients of average pooling function.
+// FusedBatchNormV2IsTraining sets the optional is_training attribute to value.
+//
+// value: A bool value to indicate the operation is for training (default)
+// or inference.
+// If not specified, defaults to true
+func FusedBatchNormV2IsTraining(value bool) FusedBatchNormV2Attr {
+	return func(m optionalAttr) {
+		m["is_training"] = value
+	}
+}
+
+// Batch normalization.
+//
+// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
+// The size of 1D Tensors matches the dimension C of the 4D Tensors.
 //
 // Arguments:
-//	orig_input_shape: The original input dimensions.
-//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
+//	x: A 4D Tensor for input data.
+//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
+//	offset: A 1D Tensor for offset, to shift to the normalized x.
+//	mean: A 1D Tensor for population mean. Used for inference only;
+// must be empty for training.
+//	variance: A 1D Tensor for population variance. Used for inference only;
+// must be empty for training.
 //
-// Returns The backprop for input.
-func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPool3DGradAttr) (output tf.Output) {
+// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow
+// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by
+// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused
+// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance
+// in the cuDNN case), to be reused in the gradient computation.
+func FusedBatchNormV2(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormV2Attr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AvgPool3DGrad",
+		Type: "FusedBatchNormV2",
 		Input: []tf.Input{
-			orig_input_shape, grad,
+			x, scale, offset, mean, variance,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// ParseSingleSequenceExampleAttr is an optional argument to ParseSingleSequenceExample.
-type ParseSingleSequenceExampleAttr func(optionalAttr)
-
-// ParseSingleSequenceExampleContextSparseTypes sets the optional context_sparse_types attribute to value.
+// Reverses specific dimensions of a tensor.
 //
-// value: A list of Ncontext_sparse types; the data types of data in
-// each context Feature given in context_sparse_keys.
-// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
-// DT_INT64 (Int64List), and DT_STRING (BytesList).
-// If not specified, defaults to <>
+// NOTE `tf.reverse` has now changed behavior in preparation for 1.0.
+// `tf.reverse_v2` is currently an alias that will be deprecated before TF 1.0.
 //
-// REQUIRES: len(value) >= 0
-func ParseSingleSequenceExampleContextSparseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr {
-	return func(m optionalAttr) {
-		m["context_sparse_types"] = value
-	}
-}
-
-// ParseSingleSequenceExampleFeatureListDenseTypes sets the optional feature_list_dense_types attribute to value.
-// If not specified, defaults to <>
+// Given a `tensor`, and a `int32` tensor `axis` representing the set of
+// dimensions of `tensor` to reverse. This operation reverses each dimension
+// `i` for which there exists `j` s.t. `axis[j] == i`.
 //
-// REQUIRES: len(value) >= 0
-func ParseSingleSequenceExampleFeatureListDenseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr {
-	return func(m optionalAttr) {
-		m["feature_list_dense_types"] = value
-	}
-}
-
-// ParseSingleSequenceExampleContextDenseShapes sets the optional context_dense_shapes attribute to value.
+// `tensor` can have up to 8 dimensions. The number of dimensions specified
+// in `axis` may be 0 or more entries. If an index is specified more than
+// once, a InvalidArgument error is raised.
 //
-// value: A list of Ncontext_dense shapes; the shapes of data in
-// each context Feature given in context_dense_keys.
-// The number of elements in the Feature corresponding to context_dense_key[j]
-// must always equal context_dense_shapes[j].NumEntries().
-// The shape of context_dense_values[j] will match context_dense_shapes[j].
-// If not specified, defaults to <>
+// For example:
 //
-// REQUIRES: len(value) >= 0
-func ParseSingleSequenceExampleContextDenseShapes(value []tf.Shape) ParseSingleSequenceExampleAttr {
-	return func(m optionalAttr) {
-		m["context_dense_shapes"] = value
-	}
-}
-
-// ParseSingleSequenceExampleFeatureListSparseTypes sets the optional feature_list_sparse_types attribute to value.
+// ```
+// # tensor 't' is [[[[ 0,  1,  2,  3],
+// #                  [ 4,  5,  6,  7],
+// #                  [ 8,  9, 10, 11]],
+// #                 [[12, 13, 14, 15],
+// #                  [16, 17, 18, 19],
+// #                  [20, 21, 22, 23]]]]
+// # tensor 't' shape is [1, 2, 3, 4]
 //
-// value: A list of Nfeature_list_sparse types; the data types
-// of data in each FeatureList given in feature_list_sparse_keys.
-// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
-// DT_INT64 (Int64List), and DT_STRING (BytesList).
-// If not specified, defaults to <>
+// # 'dims' is [3] or 'dims' is [-1]
+// reverse(t, dims) ==> [[[[ 3,  2,  1,  0],
+//                         [ 7,  6,  5,  4],
+//                         [ 11, 10, 9, 8]],
+//                        [[15, 14, 13, 12],
+//                         [19, 18, 17, 16],
+//                         [23, 22, 21, 20]]]]
 //
-// REQUIRES: len(value) >= 0
-func ParseSingleSequenceExampleFeatureListSparseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr {
-	return func(m optionalAttr) {
-		m["feature_list_sparse_types"] = value
-	}
-}
-
-// ParseSingleSequenceExampleFeatureListDenseShapes sets the optional feature_list_dense_shapes attribute to value.
+// # 'dims' is '[1]' (or 'dims' is '[-3]')
+// reverse(t, dims) ==> [[[[12, 13, 14, 15],
+//                         [16, 17, 18, 19],
+//                         [20, 21, 22, 23]
+//                        [[ 0,  1,  2,  3],
+//                         [ 4,  5,  6,  7],
+//                         [ 8,  9, 10, 11]]]]
 //
-// value: A list of Nfeature_list_dense shapes; the shapes of
-// data in each FeatureList given in feature_list_dense_keys.
-// The shape of each Feature in the FeatureList corresponding to
-// feature_list_dense_key[j] must always equal
-// feature_list_dense_shapes[j].NumEntries().
-// If not specified, defaults to <>
+// # 'dims' is '[2]' (or 'dims' is '[-2]')
+// reverse(t, dims) ==> [[[[8, 9, 10, 11],
+//                         [4, 5, 6, 7],
+//                         [0, 1, 2, 3]]
+//                        [[20, 21, 22, 23],
+//                         [16, 17, 18, 19],
+//                         [12, 13, 14, 15]]]]
+// ```
 //
-// REQUIRES: len(value) >= 0
-func ParseSingleSequenceExampleFeatureListDenseShapes(value []tf.Shape) ParseSingleSequenceExampleAttr {
-	return func(m optionalAttr) {
-		m["feature_list_dense_shapes"] = value
+// Arguments:
+//	tensor: Up to 8-D.
+//	axis: 1-D. The indices of the dimensions to reverse. Must be in the range
+// `[-rank(tensor), rank(tensor))`.
+//
+// Returns The same shape as `tensor`.
+func ReverseV2(scope *Scope, tensor tf.Output, axis tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	opspec := tf.OpSpec{
+		Type: "ReverseV2",
+		Input: []tf.Input{
+			tensor, axis,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Transforms a scalar brain.SequenceExample proto (as strings) into typed tensors.
+// Adds `bias` to `value`.
+//
+// This is a deprecated version of BiasAdd and will be soon removed.
+//
+// This is a special case of `tf.add` where `bias` is restricted to be 1-D.
+// Broadcasting is supported, so `value` may have any number of dimensions.
 //
 // Arguments:
-//	serialized: A scalar containing a binary serialized SequenceExample proto.
-//	feature_list_dense_missing_assumed_empty: A vector listing the
-// FeatureList keys which may be missing from the SequenceExample.  If the
-// associated FeatureList is missing, it is treated as empty.  By default,
-// any FeatureList not listed in this vector must exist in the SequenceExample.
-//	context_sparse_keys: A list of Ncontext_sparse string Tensors (scalars).
-// The keys expected in the Examples' features associated with context_sparse
-// values.
-//	context_dense_keys: A list of Ncontext_dense string Tensors (scalars).
-// The keys expected in the SequenceExamples' context features associated with
-// dense values.
-//	feature_list_sparse_keys: A list of Nfeature_list_sparse string Tensors
-// (scalars).  The keys expected in the FeatureLists associated with sparse
-// values.
-//	feature_list_dense_keys: A list of Nfeature_list_dense string Tensors (scalars).
-// The keys expected in the SequenceExamples' feature_lists associated
-// with lists of dense values.
-//	context_dense_defaults: A list of Ncontext_dense Tensors (some may be empty).
-// context_dense_defaults[j] provides default values
-// when the SequenceExample's context map lacks context_dense_key[j].
-// If an empty Tensor is provided for context_dense_defaults[j],
-// then the Feature context_dense_keys[j] is required.
-// The input type is inferred from context_dense_defaults[j], even when it's
-// empty.  If context_dense_defaults[j] is not empty, its shape must match
-// context_dense_shapes[j].
-//	debug_name: A scalar containing the name of the serialized proto.
-// May contain, for example, table key (descriptive) name for the
-// corresponding serialized proto.  This is purely useful for debugging
-// purposes, and the presence of values here has no effect on the output.
-// May also be an empty scalar if no name is available.
-func ParseSingleSequenceExample(scope *Scope, serialized tf.Output, feature_list_dense_missing_assumed_empty tf.Output, context_sparse_keys []tf.Output, context_dense_keys []tf.Output, feature_list_sparse_keys []tf.Output, feature_list_dense_keys []tf.Output, context_dense_defaults []tf.Output, debug_name tf.Output, optional ...ParseSingleSequenceExampleAttr) (context_sparse_indices []tf.Output, context_sparse_values []tf.Output, context_sparse_shapes []tf.Output, context_dense_values []tf.Output, feature_list_sparse_indices []tf.Output, feature_list_sparse_values []tf.Output, feature_list_sparse_shapes []tf.Output, feature_list_dense_values []tf.Output) {
+//	value: Any number of dimensions.
+//	bias: 1-D with size the last dimension of `value`.
+//
+// Returns Broadcasted sum of `value` and `bias`.
+func BiasAddV1(scope *Scope, value tf.Output, bias tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ParseSingleSequenceExample",
+		Type: "BiasAddV1",
 		Input: []tf.Input{
-			serialized, feature_list_dense_missing_assumed_empty, tf.OutputList(context_sparse_keys), tf.OutputList(context_dense_keys), tf.OutputList(feature_list_sparse_keys), tf.OutputList(feature_list_dense_keys), tf.OutputList(context_dense_defaults), debug_name,
+			value, bias,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Shuffle dimensions of x according to a permutation.
+//
+// The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy:
+//   `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]`
+func Transpose(scope *Scope, x tf.Output, perm tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	if context_sparse_indices, idx, err = makeOutputList(op, idx, "context_sparse_indices"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if context_sparse_values, idx, err = makeOutputList(op, idx, "context_sparse_values"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if context_sparse_shapes, idx, err = makeOutputList(op, idx, "context_sparse_shapes"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if context_dense_values, idx, err = makeOutputList(op, idx, "context_dense_values"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if feature_list_sparse_indices, idx, err = makeOutputList(op, idx, "feature_list_sparse_indices"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if feature_list_sparse_values, idx, err = makeOutputList(op, idx, "feature_list_sparse_values"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if feature_list_sparse_shapes, idx, err = makeOutputList(op, idx, "feature_list_sparse_shapes"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if feature_list_dense_values, idx, err = makeOutputList(op, idx, "feature_list_dense_values"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	return context_sparse_indices, context_sparse_values, context_sparse_shapes, context_dense_values, feature_list_sparse_indices, feature_list_sparse_values, feature_list_sparse_shapes, feature_list_dense_values
-}
-
-// QuantizeAndDequantizeAttr is an optional argument to QuantizeAndDequantize.
-type QuantizeAndDequantizeAttr func(optionalAttr)
-
-// QuantizeAndDequantizeSignedInput sets the optional signed_input attribute to value.
-// If not specified, defaults to true
-func QuantizeAndDequantizeSignedInput(value bool) QuantizeAndDequantizeAttr {
-	return func(m optionalAttr) {
-		m["signed_input"] = value
+	opspec := tf.OpSpec{
+		Type: "Transpose",
+		Input: []tf.Input{
+			x, perm,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// QuantizeAndDequantizeNumBits sets the optional num_bits attribute to value.
-// If not specified, defaults to 8
-func QuantizeAndDequantizeNumBits(value int64) QuantizeAndDequantizeAttr {
-	return func(m optionalAttr) {
-		m["num_bits"] = value
-	}
-}
+// MinAttr is an optional argument to Min.
+type MinAttr func(optionalAttr)
 
-// QuantizeAndDequantizeRangeGiven sets the optional range_given attribute to value.
+// MinKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If true, retain reduced dimensions with length 1.
 // If not specified, defaults to false
-func QuantizeAndDequantizeRangeGiven(value bool) QuantizeAndDequantizeAttr {
-	return func(m optionalAttr) {
-		m["range_given"] = value
-	}
-}
-
-// QuantizeAndDequantizeInputMin sets the optional input_min attribute to value.
-// If not specified, defaults to 0
-func QuantizeAndDequantizeInputMin(value float32) QuantizeAndDequantizeAttr {
-	return func(m optionalAttr) {
-		m["input_min"] = value
-	}
-}
-
-// QuantizeAndDequantizeInputMax sets the optional input_max attribute to value.
-// If not specified, defaults to 0
-func QuantizeAndDequantizeInputMax(value float32) QuantizeAndDequantizeAttr {
+func MinKeepDims(value bool) MinAttr {
 	return func(m optionalAttr) {
-		m["input_max"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Use QuantizeAndDequantizeV2 instead.
+// Computes the minimum of elements across dimensions of a tensor.
 //
-// DEPRECATED at GraphDef version 22: Replaced by QuantizeAndDequantizeV2
-func QuantizeAndDequantize(scope *Scope, input tf.Output, optional ...QuantizeAndDequantizeAttr) (output tf.Output) {
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
+//
+// Arguments:
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
+//
+// Returns The reduced tensor.
+func Min(scope *Scope, input tf.Output, axis tf.Output, optional ...MinAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -10023,9 +9789,9 @@ func QuantizeAndDequantize(scope *Scope, input tf.Output, optional ...QuantizeAn
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizeAndDequantize",
+		Type: "Min",
 		Input: []tf.Input{
-			input,
+			input, axis,
 		},
 		Attrs: attrs,
 	}
@@ -10033,182 +9799,339 @@ func QuantizeAndDequantize(scope *Scope, input tf.Output, optional ...QuantizeAn
 	return op.Output(0)
 }
 
-// Returns locations of nonzero / true values in a tensor.
-//
-// This operation returns the coordinates of true elements in `condition`. The
-// coordinates are returned in a 2-D tensor where the first dimension (rows)
-// represents the number of true elements, and the second dimension (columns)
-// represents the coordinates of the true elements. Keep in mind, the shape of
-// the output tensor can vary depending on how many true values there are in
-// `condition`. Indices are output in row-major order.
-//
-// For example:
-//
-// ```
-// # 'input' tensor is [[True, False]
-// #                    [True, False]]
-// # 'input' has two true values, so output has two coordinates.
-// # 'input' has rank of 2, so coordinates have two indices.
-// where(input) ==> [[0, 0],
-//                   [1, 0]]
-//
-// # `condition` tensor is [[[True, False]
-// #                     [True, False]]
-// #                    [[False, True]
-// #                     [False, True]]
-// #                    [[False, False]
-// #                     [False, True]]]
-// # 'input' has 5 true values, so output has 5 coordinates.
-// # 'input' has rank of 3, so coordinates have three indices.
-// where(input) ==> [[0, 0, 0],
-//                   [0, 1, 0],
-//                   [1, 0, 1],
-//                   [1, 1, 1],
-//                   [2, 1, 1]]
+// Computes the Bessel i1e function of `x` element-wise.
 //
-// # `condition` tensor is [[[1.5,  0.0]
-// #                     [-0.5, 0.0]]
-// #                    [[0.0,  0.25]
-// #                     [0.0,  0.75]]
-// #                    [[0.0,  0.0]
-// #                     [0.0,  0.01]]]
-// # 'input' has 5 nonzero values, so output has 5 coordinates.
-// # 'input' has rank of 3, so coordinates have three indices.
-// where(input) ==> [[0, 0, 0],
-//                   [0, 1, 0],
-//                   [1, 0, 1],
-//                   [1, 1, 1],
-//                   [2, 1, 1]]
+// Exponentially scaled modified Bessel function of order 0 defined as
+// `bessel_i1e(x) = exp(-abs(x)) bessel_i1(x)`.
 //
-// # `condition` tensor is [[[1.5 + 0.0j, 0.0  + 0.0j]
-// #                     [0.0 + 0.5j, 0.0  + 0.0j]]
-// #                    [[0.0 + 0.0j, 0.25 + 1.5j]
-// #                     [0.0 + 0.0j, 0.75 + 0.0j]]
-// #                    [[0.0 + 0.0j, 0.0  + 0.0j]
-// #                     [0.0 + 0.0j, 0.01 + 0.0j]]]
-// # 'input' has 5 nonzero magnitude values, so output has 5 coordinates.
-// # 'input' has rank of 3, so coordinates have three indices.
-// where(input) ==> [[0, 0, 0],
-//                   [0, 1, 0],
-//                   [1, 0, 1],
-//                   [1, 1, 1],
-//                   [2, 1, 1]]
-// ```
-func Where(scope *Scope, condition tf.Output) (index tf.Output) {
+// This function is faster and numerically stabler than `bessel_i1(x)`.
+func BesselI1e(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Where",
+		Type: "BesselI1e",
 		Input: []tf.Input{
-			condition,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QueueDequeueV2Attr is an optional argument to QueueDequeueV2.
-type QueueDequeueV2Attr func(optionalAttr)
+// MapClearAttr is an optional argument to MapClear.
+type MapClearAttr func(optionalAttr)
 
-// QueueDequeueV2TimeoutMs sets the optional timeout_ms attribute to value.
+// MapClearCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// value: If the queue is empty, this operation will block for up to
-// timeout_ms milliseconds.
-// Note: This option is not supported yet.
-// If not specified, defaults to -1
-func QueueDequeueV2TimeoutMs(value int64) QueueDequeueV2Attr {
+// REQUIRES: value >= 0
+func MapClearCapacity(value int64) MapClearAttr {
 	return func(m optionalAttr) {
-		m["timeout_ms"] = value
+		m["capacity"] = value
 	}
 }
 
-// Dequeues a tuple of one or more tensors from the given queue.
-//
-// This operation has k outputs, where k is the number of components
-// in the tuples stored in the given queue, and output i is the ith
-// component of the dequeued tuple.
-//
-// N.B. If the queue is empty, this operation will block until an element
-// has been dequeued (or 'timeout_ms' elapses, if specified).
+// MapClearMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// Arguments:
-//	handle: The handle to a queue.
-//	component_types: The type of each component in a tuple.
+// REQUIRES: value >= 0
+func MapClearMemoryLimit(value int64) MapClearAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// MapClearContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapClearContainer(value string) MapClearAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MapClearSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func MapClearSharedName(value string) MapClearAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op removes all elements in the underlying container.
 //
-// Returns One or more tensors that were dequeued as a tuple.
-func QueueDequeueV2(scope *Scope, handle tf.Output, component_types []tf.DataType, optional ...QueueDequeueV2Attr) (components []tf.Output) {
+// Returns the created operation.
+func MapClear(scope *Scope, dtypes []tf.DataType, optional ...MapClearAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"component_types": component_types}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QueueDequeueV2",
-		Input: []tf.Input{
-			handle,
-		},
+		Type: "MapClear",
+
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("QueueDequeueV2", err)
-		return
-	}
-	return components
+	return scope.AddOperation(opspec)
 }
 
-// ParseSequenceExampleAttr is an optional argument to ParseSequenceExample.
-type ParseSequenceExampleAttr func(optionalAttr)
+// DecodeCSVAttr is an optional argument to DecodeCSV.
+type DecodeCSVAttr func(optionalAttr)
 
-// ParseSequenceExampleNcontextSparse sets the optional Ncontext_sparse attribute to value.
-// If not specified, defaults to 0
+// DecodeCSVFieldDelim sets the optional field_delim attribute to value.
 //
-// REQUIRES: value >= 0
-func ParseSequenceExampleNcontextSparse(value int64) ParseSequenceExampleAttr {
+// value: char delimiter to separate fields in a record.
+// If not specified, defaults to ","
+func DecodeCSVFieldDelim(value string) DecodeCSVAttr {
 	return func(m optionalAttr) {
-		m["Ncontext_sparse"] = value
+		m["field_delim"] = value
 	}
 }
 
-// ParseSequenceExampleNcontextDense sets the optional Ncontext_dense attribute to value.
-// If not specified, defaults to 0
+// DecodeCSVUseQuoteDelim sets the optional use_quote_delim attribute to value.
 //
-// REQUIRES: value >= 0
-func ParseSequenceExampleNcontextDense(value int64) ParseSequenceExampleAttr {
+// value: If false, treats double quotation marks as regular
+// characters inside of the string fields (ignoring RFC 4180, Section 2,
+// Bullet 5).
+// If not specified, defaults to true
+func DecodeCSVUseQuoteDelim(value bool) DecodeCSVAttr {
 	return func(m optionalAttr) {
-		m["Ncontext_dense"] = value
+		m["use_quote_delim"] = value
 	}
 }
 
-// ParseSequenceExampleNfeatureListSparse sets the optional Nfeature_list_sparse attribute to value.
-// If not specified, defaults to 0
+// DecodeCSVNaValue sets the optional na_value attribute to value.
 //
-// REQUIRES: value >= 0
-func ParseSequenceExampleNfeatureListSparse(value int64) ParseSequenceExampleAttr {
+// value: Additional string to recognize as NA/NaN.
+// If not specified, defaults to ""
+func DecodeCSVNaValue(value string) DecodeCSVAttr {
 	return func(m optionalAttr) {
-		m["Nfeature_list_sparse"] = value
+		m["na_value"] = value
 	}
 }
 
-// ParseSequenceExampleNfeatureListDense sets the optional Nfeature_list_dense attribute to value.
-// If not specified, defaults to 0
+// DecodeCSVSelectCols sets the optional select_cols attribute to value.
+// If not specified, defaults to <>
+func DecodeCSVSelectCols(value []int64) DecodeCSVAttr {
+	return func(m optionalAttr) {
+		m["select_cols"] = value
+	}
+}
+
+// Convert CSV records to tensors. Each column maps to one tensor.
 //
-// REQUIRES: value >= 0
-func ParseSequenceExampleNfeatureListDense(value int64) ParseSequenceExampleAttr {
+// RFC 4180 format is expected for the CSV records.
+// (https://tools.ietf.org/html/rfc4180)
+// Note that we allow leading and trailing spaces with int or float field.
+//
+// Arguments:
+//	records: Each string is a record/row in the csv and all records should have
+// the same format.
+//	record_defaults: One tensor per column of the input record, with either a
+// scalar default value for that column or an empty vector if the column is
+// required.
+//
+// Returns Each tensor will have the same shape as records.
+func DecodeCSV(scope *Scope, records tf.Output, record_defaults []tf.Output, optional ...DecodeCSVAttr) (output []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "DecodeCSV",
+		Input: []tf.Input{
+			records, tf.OutputList(record_defaults),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("DecodeCSV", err)
+		return
+	}
+	return output
+}
+
+// Convert JSON-encoded Example records to binary protocol buffer strings.
+//
+// This op translates a tensor containing Example records, encoded using
+// the [standard JSON
+// mapping](https://developers.google.com/protocol-buffers/docs/proto3#json),
+// into a tensor containing the same records encoded as binary protocol
+// buffers. The resulting tensor can then be fed to any of the other
+// Example-parsing ops.
+//
+// Arguments:
+//	json_examples: Each string is a JSON object serialized according to the JSON
+// mapping of the Example proto.
+//
+// Returns Each string is a binary Example protocol buffer corresponding
+// to the respective element of `json_examples`.
+func DecodeJSONExample(scope *Scope, json_examples tf.Output) (binary_examples tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "DecodeJSONExample",
+		Input: []tf.Input{
+			json_examples,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Transforms a Tensor into a serialized TensorProto proto.
+//
+// Arguments:
+//	tensor: A Tensor of type `T`.
+//
+// Returns A serialized TensorProto proto of the input tensor.
+func SerializeTensor(scope *Scope, tensor tf.Output) (serialized tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SerializeTensor",
+		Input: []tf.Input{
+			tensor,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes acos of x element-wise.
+func Acos(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Acos",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// UnbatchGradAttr is an optional argument to UnbatchGrad.
+type UnbatchGradAttr func(optionalAttr)
+
+// UnbatchGradContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func UnbatchGradContainer(value string) UnbatchGradAttr {
 	return func(m optionalAttr) {
-		m["Nfeature_list_dense"] = value
+		m["container"] = value
 	}
 }
 
-// ParseSequenceExampleContextSparseTypes sets the optional context_sparse_types attribute to value.
+// UnbatchGradSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func UnbatchGradSharedName(value string) UnbatchGradAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Gradient of Unbatch.
+//
+// Acts like Batch but using the given batch_index index of batching things as they
+// become available. This ensures that the gradients are propagated back in the
+// same session which did the forward pass.
+//
+// original_input: The input to the Unbatch operation this is the gradient of.
+// batch_index: The batch_index given to the Unbatch operation this is the gradient
+// of.
+// grad: The downstream gradient.
+// id: The id scalar emitted by Batch.
+// batched_grad: The return value, either an empty tensor or the batched gradient.
+// container: Container to control resource sharing.
+// shared_name: Instances of UnbatchGrad with the same container and shared_name
+//  are assumed to possibly belong to the same batch. If left empty, the op name
+//  will be used as the shared name.
+func UnbatchGrad(scope *Scope, original_input tf.Output, batch_index tf.Output, grad tf.Output, id tf.Output, optional ...UnbatchGradAttr) (batched_grad tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "UnbatchGrad",
+		Input: []tf.Input{
+			original_input, batch_index, grad, id,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// AvgPool3DGradAttr is an optional argument to AvgPool3DGrad.
+type AvgPool3DGradAttr func(optionalAttr)
+
+// AvgPool3DGradDataFormat sets the optional data_format attribute to value.
+//
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func AvgPool3DGradDataFormat(value string) AvgPool3DGradAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Computes gradients of average pooling function.
+//
+// Arguments:
+//	orig_input_shape: The original input dimensions.
+//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The backprop for input.
+func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPool3DGradAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "AvgPool3DGrad",
+		Input: []tf.Input{
+			orig_input_shape, grad,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ParseSingleSequenceExampleAttr is an optional argument to ParseSingleSequenceExample.
+type ParseSingleSequenceExampleAttr func(optionalAttr)
+
+// ParseSingleSequenceExampleContextSparseTypes sets the optional context_sparse_types attribute to value.
 //
 // value: A list of Ncontext_sparse types; the data types of data in
 // each context Feature given in context_sparse_keys.
@@ -10217,23 +10140,23 @@ func ParseSequenceExampleNfeatureListDense(value int64) ParseSequenceExampleAttr
 // If not specified, defaults to <>
 //
 // REQUIRES: len(value) >= 0
-func ParseSequenceExampleContextSparseTypes(value []tf.DataType) ParseSequenceExampleAttr {
+func ParseSingleSequenceExampleContextSparseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr {
 	return func(m optionalAttr) {
 		m["context_sparse_types"] = value
 	}
 }
 
-// ParseSequenceExampleFeatureListDenseTypes sets the optional feature_list_dense_types attribute to value.
+// ParseSingleSequenceExampleFeatureListDenseTypes sets the optional feature_list_dense_types attribute to value.
 // If not specified, defaults to <>
 //
 // REQUIRES: len(value) >= 0
-func ParseSequenceExampleFeatureListDenseTypes(value []tf.DataType) ParseSequenceExampleAttr {
+func ParseSingleSequenceExampleFeatureListDenseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr {
 	return func(m optionalAttr) {
 		m["feature_list_dense_types"] = value
 	}
 }
 
-// ParseSequenceExampleContextDenseShapes sets the optional context_dense_shapes attribute to value.
+// ParseSingleSequenceExampleContextDenseShapes sets the optional context_dense_shapes attribute to value.
 //
 // value: A list of Ncontext_dense shapes; the shapes of data in
 // each context Feature given in context_dense_keys.
@@ -10243,13 +10166,13 @@ func ParseSequenceExampleFeatureListDenseTypes(value []tf.DataType) ParseSequenc
 // If not specified, defaults to <>
 //
 // REQUIRES: len(value) >= 0
-func ParseSequenceExampleContextDenseShapes(value []tf.Shape) ParseSequenceExampleAttr {
+func ParseSingleSequenceExampleContextDenseShapes(value []tf.Shape) ParseSingleSequenceExampleAttr {
 	return func(m optionalAttr) {
 		m["context_dense_shapes"] = value
 	}
 }
 
-// ParseSequenceExampleFeatureListSparseTypes sets the optional feature_list_sparse_types attribute to value.
+// ParseSingleSequenceExampleFeatureListSparseTypes sets the optional feature_list_sparse_types attribute to value.
 //
 // value: A list of Nfeature_list_sparse types; the data types
 // of data in each FeatureList given in feature_list_sparse_keys.
@@ -10258,13 +10181,13 @@ func ParseSequenceExampleContextDenseShapes(value []tf.Shape) ParseSequenceExamp
 // If not specified, defaults to <>
 //
 // REQUIRES: len(value) >= 0
-func ParseSequenceExampleFeatureListSparseTypes(value []tf.DataType) ParseSequenceExampleAttr {
+func ParseSingleSequenceExampleFeatureListSparseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr {
 	return func(m optionalAttr) {
 		m["feature_list_sparse_types"] = value
 	}
 }
 
-// ParseSequenceExampleFeatureListDenseShapes sets the optional feature_list_dense_shapes attribute to value.
+// ParseSingleSequenceExampleFeatureListDenseShapes sets the optional feature_list_dense_shapes attribute to value.
 //
 // value: A list of Nfeature_list_dense shapes; the shapes of
 // data in each FeatureList given in feature_list_dense_keys.
@@ -10274,33 +10197,20 @@ func ParseSequenceExampleFeatureListSparseTypes(value []tf.DataType) ParseSequen
 // If not specified, defaults to <>
 //
 // REQUIRES: len(value) >= 0
-func ParseSequenceExampleFeatureListDenseShapes(value []tf.Shape) ParseSequenceExampleAttr {
+func ParseSingleSequenceExampleFeatureListDenseShapes(value []tf.Shape) ParseSingleSequenceExampleAttr {
 	return func(m optionalAttr) {
 		m["feature_list_dense_shapes"] = value
 	}
 }
 
-// Transforms a vector of brain.SequenceExample protos (as strings) into typed tensors.
+// Transforms a scalar brain.SequenceExample proto (as strings) into typed tensors.
 //
 // Arguments:
-//	serialized: A vector containing binary serialized SequenceExample protos.
-//	debug_name: A vector containing the names of the serialized protos.
-// May contain, for example, table key (descriptive) name for the
-// corresponding serialized proto.  This is purely useful for debugging
-// purposes, and the presence of values here has no effect on the output.
-// May also be an empty vector if no name is available.
-//	context_dense_defaults: A list of Ncontext_dense Tensors (some may be empty).
-// context_dense_defaults[j] provides default values
-// when the SequenceExample's context map lacks context_dense_key[j].
-// If an empty Tensor is provided for context_dense_defaults[j],
-// then the Feature context_dense_keys[j] is required.
-// The input type is inferred from context_dense_defaults[j], even when it's
-// empty.  If context_dense_defaults[j] is not empty, its shape must match
-// context_dense_shapes[j].
+//	serialized: A scalar containing a binary serialized SequenceExample proto.
 //	feature_list_dense_missing_assumed_empty: A vector listing the
-// FeatureList keys which may be missing from the SequenceExamples.  If the
+// FeatureList keys which may be missing from the SequenceExample.  If the
 // associated FeatureList is missing, it is treated as empty.  By default,
-// any FeatureList not listed in this vector must exist in the SequenceExamples.
+// any FeatureList not listed in this vector must exist in the SequenceExample.
 //	context_sparse_keys: A list of Ncontext_sparse string Tensors (scalars).
 // The keys expected in the Examples' features associated with context_sparse
 // values.
@@ -10313,18 +10223,31 @@ func ParseSequenceExampleFeatureListDenseShapes(value []tf.Shape) ParseSequenceE
 //	feature_list_dense_keys: A list of Nfeature_list_dense string Tensors (scalars).
 // The keys expected in the SequenceExamples' feature_lists associated
 // with lists of dense values.
-func ParseSequenceExample(scope *Scope, serialized tf.Output, debug_name tf.Output, context_dense_defaults []tf.Output, feature_list_dense_missing_assumed_empty []string, context_sparse_keys []string, context_dense_keys []string, feature_list_sparse_keys []string, feature_list_dense_keys []string, optional ...ParseSequenceExampleAttr) (context_sparse_indices []tf.Output, context_sparse_values []tf.Output, context_sparse_shapes []tf.Output, context_dense_values []tf.Output, feature_list_sparse_indices []tf.Output, feature_list_sparse_values []tf.Output, feature_list_sparse_shapes []tf.Output, feature_list_dense_values []tf.Output, feature_list_dense_lengths []tf.Output) {
+//	context_dense_defaults: A list of Ncontext_dense Tensors (some may be empty).
+// context_dense_defaults[j] provides default values
+// when the SequenceExample's context map lacks context_dense_key[j].
+// If an empty Tensor is provided for context_dense_defaults[j],
+// then the Feature context_dense_keys[j] is required.
+// The input type is inferred from context_dense_defaults[j], even when it's
+// empty.  If context_dense_defaults[j] is not empty, its shape must match
+// context_dense_shapes[j].
+//	debug_name: A scalar containing the name of the serialized proto.
+// May contain, for example, table key (descriptive) name for the
+// corresponding serialized proto.  This is purely useful for debugging
+// purposes, and the presence of values here has no effect on the output.
+// May also be an empty scalar if no name is available.
+func ParseSingleSequenceExample(scope *Scope, serialized tf.Output, feature_list_dense_missing_assumed_empty tf.Output, context_sparse_keys []tf.Output, context_dense_keys []tf.Output, feature_list_sparse_keys []tf.Output, feature_list_dense_keys []tf.Output, context_dense_defaults []tf.Output, debug_name tf.Output, optional ...ParseSingleSequenceExampleAttr) (context_sparse_indices []tf.Output, context_sparse_values []tf.Output, context_sparse_shapes []tf.Output, context_dense_values []tf.Output, feature_list_sparse_indices []tf.Output, feature_list_sparse_values []tf.Output, feature_list_sparse_shapes []tf.Output, feature_list_dense_values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"feature_list_dense_missing_assumed_empty": feature_list_dense_missing_assumed_empty, "context_sparse_keys": context_sparse_keys, "context_dense_keys": context_dense_keys, "feature_list_sparse_keys": feature_list_sparse_keys, "feature_list_dense_keys": feature_list_dense_keys}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ParseSequenceExample",
+		Type: "ParseSingleSequenceExample",
 		Input: []tf.Input{
-			serialized, debug_name, tf.OutputList(context_dense_defaults),
+			serialized, feature_list_dense_missing_assumed_empty, tf.OutputList(context_sparse_keys), tf.OutputList(context_dense_keys), tf.OutputList(feature_list_sparse_keys), tf.OutputList(feature_list_dense_keys), tf.OutputList(context_dense_defaults), debug_name,
 		},
 		Attrs: attrs,
 	}
@@ -10335,729 +10258,221 @@ func ParseSequenceExample(scope *Scope, serialized tf.Output, debug_name tf.Outp
 	var idx int
 	var err error
 	if context_sparse_indices, idx, err = makeOutputList(op, idx, "context_sparse_indices"); err != nil {
-		scope.UpdateErr("ParseSequenceExample", err)
+		scope.UpdateErr("ParseSingleSequenceExample", err)
 		return
 	}
 	if context_sparse_values, idx, err = makeOutputList(op, idx, "context_sparse_values"); err != nil {
-		scope.UpdateErr("ParseSequenceExample", err)
+		scope.UpdateErr("ParseSingleSequenceExample", err)
 		return
 	}
 	if context_sparse_shapes, idx, err = makeOutputList(op, idx, "context_sparse_shapes"); err != nil {
-		scope.UpdateErr("ParseSequenceExample", err)
+		scope.UpdateErr("ParseSingleSequenceExample", err)
 		return
 	}
 	if context_dense_values, idx, err = makeOutputList(op, idx, "context_dense_values"); err != nil {
-		scope.UpdateErr("ParseSequenceExample", err)
+		scope.UpdateErr("ParseSingleSequenceExample", err)
 		return
 	}
 	if feature_list_sparse_indices, idx, err = makeOutputList(op, idx, "feature_list_sparse_indices"); err != nil {
-		scope.UpdateErr("ParseSequenceExample", err)
+		scope.UpdateErr("ParseSingleSequenceExample", err)
 		return
 	}
 	if feature_list_sparse_values, idx, err = makeOutputList(op, idx, "feature_list_sparse_values"); err != nil {
-		scope.UpdateErr("ParseSequenceExample", err)
+		scope.UpdateErr("ParseSingleSequenceExample", err)
 		return
 	}
 	if feature_list_sparse_shapes, idx, err = makeOutputList(op, idx, "feature_list_sparse_shapes"); err != nil {
-		scope.UpdateErr("ParseSequenceExample", err)
+		scope.UpdateErr("ParseSingleSequenceExample", err)
 		return
 	}
 	if feature_list_dense_values, idx, err = makeOutputList(op, idx, "feature_list_dense_values"); err != nil {
-		scope.UpdateErr("ParseSequenceExample", err)
+		scope.UpdateErr("ParseSingleSequenceExample", err)
 		return
 	}
-	if feature_list_dense_lengths, idx, err = makeOutputList(op, idx, "feature_list_dense_lengths"); err != nil {
-		scope.UpdateErr("ParseSequenceExample", err)
-		return
+	return context_sparse_indices, context_sparse_values, context_sparse_shapes, context_dense_values, feature_list_sparse_indices, feature_list_sparse_values, feature_list_sparse_shapes, feature_list_dense_values
+}
+
+// QuantizeAndDequantizeAttr is an optional argument to QuantizeAndDequantize.
+type QuantizeAndDequantizeAttr func(optionalAttr)
+
+// QuantizeAndDequantizeSignedInput sets the optional signed_input attribute to value.
+// If not specified, defaults to true
+func QuantizeAndDequantizeSignedInput(value bool) QuantizeAndDequantizeAttr {
+	return func(m optionalAttr) {
+		m["signed_input"] = value
 	}
-	return context_sparse_indices, context_sparse_values, context_sparse_shapes, context_dense_values, feature_list_sparse_indices, feature_list_sparse_values, feature_list_sparse_shapes, feature_list_dense_values, feature_list_dense_lengths
 }
 
-// Computes the Gauss error function of `x` element-wise.
-func Erf(scope *Scope, x tf.Output) (y tf.Output) {
+// QuantizeAndDequantizeNumBits sets the optional num_bits attribute to value.
+// If not specified, defaults to 8
+func QuantizeAndDequantizeNumBits(value int64) QuantizeAndDequantizeAttr {
+	return func(m optionalAttr) {
+		m["num_bits"] = value
+	}
+}
+
+// QuantizeAndDequantizeRangeGiven sets the optional range_given attribute to value.
+// If not specified, defaults to false
+func QuantizeAndDequantizeRangeGiven(value bool) QuantizeAndDequantizeAttr {
+	return func(m optionalAttr) {
+		m["range_given"] = value
+	}
+}
+
+// QuantizeAndDequantizeInputMin sets the optional input_min attribute to value.
+// If not specified, defaults to 0
+func QuantizeAndDequantizeInputMin(value float32) QuantizeAndDequantizeAttr {
+	return func(m optionalAttr) {
+		m["input_min"] = value
+	}
+}
+
+// QuantizeAndDequantizeInputMax sets the optional input_max attribute to value.
+// If not specified, defaults to 0
+func QuantizeAndDequantizeInputMax(value float32) QuantizeAndDequantizeAttr {
+	return func(m optionalAttr) {
+		m["input_max"] = value
+	}
+}
+
+// Use QuantizeAndDequantizeV2 instead.
+//
+// DEPRECATED at GraphDef version 22: Replaced by QuantizeAndDequantizeV2
+func QuantizeAndDequantize(scope *Scope, input tf.Output, optional ...QuantizeAndDequantizeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Erf",
+		Type: "QuantizeAndDequantize",
 		Input: []tf.Input{
-			x,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns element-wise largest integer not greater than x.
-func Floor(scope *Scope, x tf.Output) (y tf.Output) {
+// Returns locations of nonzero / true values in a tensor.
+//
+// This operation returns the coordinates of true elements in `condition`. The
+// coordinates are returned in a 2-D tensor where the first dimension (rows)
+// represents the number of true elements, and the second dimension (columns)
+// represents the coordinates of the true elements. Keep in mind, the shape of
+// the output tensor can vary depending on how many true values there are in
+// `condition`. Indices are output in row-major order.
+//
+// For example:
+//
+// ```
+// # 'input' tensor is [[True, False]
+// #                    [True, False]]
+// # 'input' has two true values, so output has two coordinates.
+// # 'input' has rank of 2, so coordinates have two indices.
+// where(input) ==> [[0, 0],
+//                   [1, 0]]
+//
+// # `condition` tensor is [[[True, False]
+// #                     [True, False]]
+// #                    [[False, True]
+// #                     [False, True]]
+// #                    [[False, False]
+// #                     [False, True]]]
+// # 'input' has 5 true values, so output has 5 coordinates.
+// # 'input' has rank of 3, so coordinates have three indices.
+// where(input) ==> [[0, 0, 0],
+//                   [0, 1, 0],
+//                   [1, 0, 1],
+//                   [1, 1, 1],
+//                   [2, 1, 1]]
+//
+// # `condition` tensor is [[[1.5,  0.0]
+// #                     [-0.5, 0.0]]
+// #                    [[0.0,  0.25]
+// #                     [0.0,  0.75]]
+// #                    [[0.0,  0.0]
+// #                     [0.0,  0.01]]]
+// # 'input' has 5 nonzero values, so output has 5 coordinates.
+// # 'input' has rank of 3, so coordinates have three indices.
+// where(input) ==> [[0, 0, 0],
+//                   [0, 1, 0],
+//                   [1, 0, 1],
+//                   [1, 1, 1],
+//                   [2, 1, 1]]
+//
+// # `condition` tensor is [[[1.5 + 0.0j, 0.0  + 0.0j]
+// #                     [0.0 + 0.5j, 0.0  + 0.0j]]
+// #                    [[0.0 + 0.0j, 0.25 + 1.5j]
+// #                     [0.0 + 0.0j, 0.75 + 0.0j]]
+// #                    [[0.0 + 0.0j, 0.0  + 0.0j]
+// #                     [0.0 + 0.0j, 0.01 + 0.0j]]]
+// # 'input' has 5 nonzero magnitude values, so output has 5 coordinates.
+// # 'input' has rank of 3, so coordinates have three indices.
+// where(input) ==> [[0, 0, 0],
+//                   [0, 1, 0],
+//                   [1, 0, 1],
+//                   [1, 1, 1],
+//                   [2, 1, 1]]
+// ```
+func Where(scope *Scope, condition tf.Output) (index tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Floor",
+		Type: "Where",
 		Input: []tf.Input{
-			x,
+			condition,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// OneHotAttr is an optional argument to OneHot.
-type OneHotAttr func(optionalAttr)
+// QueueDequeueV2Attr is an optional argument to QueueDequeueV2.
+type QueueDequeueV2Attr func(optionalAttr)
 
-// OneHotAxis sets the optional axis attribute to value.
+// QueueDequeueV2TimeoutMs sets the optional timeout_ms attribute to value.
 //
-// value: The axis to fill (default: -1, a new inner-most axis).
+// value: If the queue is empty, this operation will block for up to
+// timeout_ms milliseconds.
+// Note: This option is not supported yet.
 // If not specified, defaults to -1
-func OneHotAxis(value int64) OneHotAttr {
+func QueueDequeueV2TimeoutMs(value int64) QueueDequeueV2Attr {
 	return func(m optionalAttr) {
-		m["axis"] = value
+		m["timeout_ms"] = value
 	}
 }
 
-// Returns a one-hot tensor.
-//
-// The locations represented by indices in `indices` take value `on_value`,
-// while all other locations take value `off_value`.
-//
-// If the input `indices` is rank `N`, the output will have rank `N+1`,
-// The new axis is created at dimension `axis` (default: the new axis is
-// appended at the end).
-//
-// If `indices` is a scalar the output shape will be a vector of length `depth`.
-//
-// If `indices` is a vector of length `features`, the output shape will be:
-// ```
-//   features x depth if axis == -1
-//   depth x features if axis == 0
-// ```
-//
-// If `indices` is a matrix (batch) with shape `[batch, features]`,
-// the output shape will be:
-// ```
-//   batch x features x depth if axis == -1
-//   batch x depth x features if axis == 1
-//   depth x batch x features if axis == 0
-// ```
-//
-//
-// Examples
-// =========
+// Dequeues a tuple of one or more tensors from the given queue.
 //
-// Suppose that
+// This operation has k outputs, where k is the number of components
+// in the tuples stored in the given queue, and output i is the ith
+// component of the dequeued tuple.
 //
-// ```
-//   indices = [0, 2, -1, 1]
-//   depth = 3
-//   on_value = 5.0
-//   off_value = 0.0
-//   axis = -1
-// ```
+// N.B. If the queue is empty, this operation will block until an element
+// has been dequeued (or 'timeout_ms' elapses, if specified).
 //
-// Then output is `[4 x 3]`:
+// Arguments:
+//	handle: The handle to a queue.
+//	component_types: The type of each component in a tuple.
 //
-//     ```output =
-//       [5.0 0.0 0.0]  // one_hot(0)
-//       [0.0 0.0 5.0]  // one_hot(2)
-//       [0.0 0.0 0.0]  // one_hot(-1)
-//       [0.0 5.0 0.0]  // one_hot(1)
-//     ```
-//
-// Suppose that
-//
-// ```
-//   indices = [0, 2, -1, 1]
-//   depth = 3
-//   on_value = 0.0
-//   off_value = 3.0
-//   axis = 0
-// ```
-//
-// Then output is `[3 x 4]`:
-//
-//     ```output =
-//       [0.0 3.0 3.0 3.0]
-//       [3.0 3.0 3.0 0.0]
-//       [3.0 3.0 3.0 3.0]
-//       [3.0 0.0 3.0 3.0]
-//     //  ^                one_hot(0)
-//     //      ^            one_hot(2)
-//     //          ^        one_hot(-1)
-//     //              ^    one_hot(1)
-//     ```
-// Suppose that
-//
-// ```
-//   indices = [[0, 2], [1, -1]]
-//   depth = 3
-//   on_value = 1.0
-//   off_value = 0.0
-//   axis = -1
-// ```
-//
-// Then output is `[2 x 2 x 3]`:
-//
-//     ```output =
-//       [
-//         [1.0, 0.0, 0.0]  // one_hot(0)
-//         [0.0, 0.0, 1.0]  // one_hot(2)
-//       ][
-//         [0.0, 1.0, 0.0]  // one_hot(1)
-//         [0.0, 0.0, 0.0]  // one_hot(-1)
-//       ]```
-//
-// Arguments:
-//	indices: A tensor of indices.
-//	depth: A scalar defining the depth of the one hot dimension.
-//	on_value: A scalar defining the value to fill in output when `indices[j] = i`.
-//	off_value: A scalar defining the value to fill in output when `indices[j] != i`.
-//
-// Returns The one-hot tensor.
-func OneHot(scope *Scope, indices tf.Output, depth tf.Output, on_value tf.Output, off_value tf.Output, optional ...OneHotAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "OneHot",
-		Input: []tf.Input{
-			indices, depth, on_value, off_value,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes exponential of x element-wise.  \\(y = e^x\\).
-func Exp(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Exp",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// NthElementAttr is an optional argument to NthElement.
-type NthElementAttr func(optionalAttr)
-
-// NthElementReverse sets the optional reverse attribute to value.
-//
-// value: When set to True, find the nth-largest value in the vector and vice
-// versa.
-// If not specified, defaults to false
-func NthElementReverse(value bool) NthElementAttr {
-	return func(m optionalAttr) {
-		m["reverse"] = value
-	}
-}
-
-// Finds values of the `n`-th order statistic for the last dimension.
-//
-// If the input is a vector (rank-1), finds the entries which is the nth-smallest
-// value in the vector and outputs their values as scalar tensor.
-//
-// For matrices (resp. higher rank input), computes the entries which is the
-// nth-smallest value in each row (resp. vector along the last dimension). Thus,
-//
-//     values.shape = input.shape[:-1]
-//
-// Arguments:
-//	input: 1-D or higher with last dimension at least `n+1`.
-//	n: 0-D. Position of sorted vector to select along the last dimension (along
-// each row for matrices). Valid range of n is `[0, input.shape[:-1])`
-//
-// Returns The `n`-th order statistic along each last dimensional slice.
-func NthElement(scope *Scope, input tf.Output, n tf.Output, optional ...NthElementAttr) (values tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "NthElement",
-		Input: []tf.Input{
-			input, n,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the maximum along segments of a tensor.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
-//
-// This operator is similar to the unsorted segment sum operator found
-// [(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
-// Instead of computing the sum over segments, it computes the maximum such that:
-//
-// \\(output_i = \max_{j...} data[j...]\\) where max is over tuples `j...` such
-// that `segment_ids[j...] == i`.
-//
-// If the maximum is empty for a given segment ID `i`, it outputs the smallest
-// possible value for the specific numeric type,
-// `output[i] = numeric_limits<T>::lowest()`.
-//
-// If the given segment ID `i` is negative, then the corresponding value is
-// dropped, and will not be included in the result.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentMax.png" alt>
-// </div>
-//
-// Arguments:
-//
-//	segment_ids: A tensor whose shape is a prefix of `data.shape`.END
-//   }
-//   out_arg {
-//     name: "output"
-//     description: <<END
-// Has same shape as data, except for the first `segment_ids.rank`
-// dimensions, which are replaced with a single dimension which has size
-// `num_segments`.
-//
-func UnsortedSegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "UnsortedSegmentMax",
-		Input: []tf.Input{
-			data, segment_ids, num_segments,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Transforms a vector of brain.Example protos (as strings) into typed tensors.
-//
-// Arguments:
-//	serialized: A vector containing a batch of binary serialized Example protos.
-//	names: A vector containing the names of the serialized protos.
-// May contain, for example, table key (descriptive) names for the
-// corresponding serialized protos.  These are purely useful for debugging
-// purposes, and the presence of values here has no effect on the output.
-// May also be an empty vector if no names are available.
-// If non-empty, this vector must be the same length as "serialized".
-//	sparse_keys: A list of Nsparse string Tensors (scalars).
-// The keys expected in the Examples' features associated with sparse values.
-//	dense_keys: A list of Ndense string Tensors (scalars).
-// The keys expected in the Examples' features associated with dense values.
-//	dense_defaults: A list of Ndense Tensors (some may be empty).
-// dense_defaults[j] provides default values
-// when the example's feature_map lacks dense_key[j].  If an empty Tensor is
-// provided for dense_defaults[j], then the Feature dense_keys[j] is required.
-// The input type is inferred from dense_defaults[j], even when it's empty.
-// If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
-// then the shape of dense_defaults[j] must match that of dense_shapes[j].
-// If dense_shapes[j] has an undefined major dimension (variable strides dense
-// feature), dense_defaults[j] must contain a single element:
-// the padding element.
-//	sparse_types: A list of Nsparse types; the data types of data in each Feature
-// given in sparse_keys.
-// Currently the ParseExample supports DT_FLOAT (FloatList),
-// DT_INT64 (Int64List), and DT_STRING (BytesList).
-//	dense_shapes: A list of Ndense shapes; the shapes of data in each Feature
-// given in dense_keys.
-// The number of elements in the Feature corresponding to dense_key[j]
-// must always equal dense_shapes[j].NumEntries().
-// If dense_shapes[j] == (D0, D1, ..., DN) then the shape of output
-// Tensor dense_values[j] will be (|serialized|, D0, D1, ..., DN):
-// The dense outputs are just the inputs row-stacked by batch.
-// This works for dense_shapes[j] = (-1, D1, ..., DN).  In this case
-// the shape of the output Tensor dense_values[j] will be
-// (|serialized|, M, D1, .., DN), where M is the maximum number of blocks
-// of elements of length D1 * .... * DN, across all minibatch entries
-// in the input.  Any minibatch entry with less than M blocks of elements of
-// length D1 * ... * DN will be padded with the corresponding default_value
-// scalar element along the second dimension.
-func ParseExample(scope *Scope, serialized tf.Output, names tf.Output, sparse_keys []tf.Output, dense_keys []tf.Output, dense_defaults []tf.Output, sparse_types []tf.DataType, dense_shapes []tf.Shape) (sparse_indices []tf.Output, sparse_values []tf.Output, sparse_shapes []tf.Output, dense_values []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"sparse_types": sparse_types, "dense_shapes": dense_shapes}
-	opspec := tf.OpSpec{
-		Type: "ParseExample",
-		Input: []tf.Input{
-			serialized, names, tf.OutputList(sparse_keys), tf.OutputList(dense_keys), tf.OutputList(dense_defaults),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if sparse_indices, idx, err = makeOutputList(op, idx, "sparse_indices"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
-	}
-	if sparse_values, idx, err = makeOutputList(op, idx, "sparse_values"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
-	}
-	if sparse_shapes, idx, err = makeOutputList(op, idx, "sparse_shapes"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
-	}
-	if dense_values, idx, err = makeOutputList(op, idx, "dense_values"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
-	}
-	return sparse_indices, sparse_values, sparse_shapes, dense_values
-}
-
-// CudnnRNNAttr is an optional argument to CudnnRNN.
-type CudnnRNNAttr func(optionalAttr)
-
-// CudnnRNNRnnMode sets the optional rnn_mode attribute to value.
-// If not specified, defaults to "lstm"
-func CudnnRNNRnnMode(value string) CudnnRNNAttr {
-	return func(m optionalAttr) {
-		m["rnn_mode"] = value
-	}
-}
-
-// CudnnRNNInputMode sets the optional input_mode attribute to value.
-// If not specified, defaults to "linear_input"
-func CudnnRNNInputMode(value string) CudnnRNNAttr {
-	return func(m optionalAttr) {
-		m["input_mode"] = value
-	}
-}
-
-// CudnnRNNDirection sets the optional direction attribute to value.
-// If not specified, defaults to "unidirectional"
-func CudnnRNNDirection(value string) CudnnRNNAttr {
-	return func(m optionalAttr) {
-		m["direction"] = value
-	}
-}
-
-// CudnnRNNDropout sets the optional dropout attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNDropout(value float32) CudnnRNNAttr {
-	return func(m optionalAttr) {
-		m["dropout"] = value
-	}
-}
-
-// CudnnRNNSeed sets the optional seed attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNSeed(value int64) CudnnRNNAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// CudnnRNNSeed2 sets the optional seed2 attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNSeed2(value int64) CudnnRNNAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// CudnnRNNIsTraining sets the optional is_training attribute to value.
-// If not specified, defaults to true
-func CudnnRNNIsTraining(value bool) CudnnRNNAttr {
-	return func(m optionalAttr) {
-		m["is_training"] = value
-	}
-}
-
-// A RNN backed by cuDNN.
-//
-// Computes the RNN from the input and initial states, with respect to the params
-// buffer.
-//
-// rnn_mode: Indicates the type of the RNN model.
-// input_mode: Indicate whether there is a linear projection between the input and
-//   the actual computation before the first layer. 'skip_input' is only allowed
-//   when input_size == num_units; 'auto_select' implies 'skip_input' when
-//   input_size == num_units; otherwise, it implies 'linear_input'.
-// direction: Indicates whether a bidirectional model will be used. Should be
-//   "unidirectional" or "bidirectional".
-// dropout: Dropout probability. When set to 0., dropout is disabled.
-// seed: The 1st part of a seed to initialize dropout.
-// seed2: The 2nd part of a seed to initialize dropout.
-// input: A 3-D tensor with the shape of [seq_length, batch_size, input_size].
-// input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size,
-//     num_units].
-// input_c: For LSTM, a 3-D tensor with the shape of
-//     [num_layer * dir, batch, num_units]. For other models, it is ignored.
-// params: A 1-D tensor that contains the weights and biases in an opaque layout.
-//     The size must be created through CudnnRNNParamsSize, and initialized
-//     separately. Note that they might not be compatible across different
-//     generations. So it is a good idea to save and restore
-// output: A 3-D tensor with the shape of [seq_length, batch_size,
-//     dir * num_units].
-// output_h: The same shape has input_h.
-// output_c: The same shape as input_c for LSTM. An empty tensor for other models.
-// is_training: Indicates whether this operation is used for inferenece or
-//   training.
-// reserve_space: An opaque tensor that can be used in backprop calculation. It
-//   is only produced if is_training is false.
-func CudnnRNN(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, optional ...CudnnRNNAttr) (output tf.Output, output_h tf.Output, output_c tf.Output, reserve_space tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "CudnnRNN",
-		Input: []tf.Input{
-			input, input_h, input_c, params,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
-}
-
-// DecodeCompressedAttr is an optional argument to DecodeCompressed.
-type DecodeCompressedAttr func(optionalAttr)
-
-// DecodeCompressedCompressionType sets the optional compression_type attribute to value.
-//
-// value: A scalar containing either (i) the empty string (no
-// compression), (ii) "ZLIB", or (iii) "GZIP".
-// If not specified, defaults to ""
-func DecodeCompressedCompressionType(value string) DecodeCompressedAttr {
-	return func(m optionalAttr) {
-		m["compression_type"] = value
-	}
-}
-
-// Decompress strings.
-//
-// This op decompresses each element of the `bytes` input `Tensor`, which
-// is assumed to be compressed using the given `compression_type`.
-//
-// The `output` is a string `Tensor` of the same shape as `bytes`,
-// each element containing the decompressed data from the corresponding
-// element in `bytes`.
-//
-// Arguments:
-//	bytes: A Tensor of string which is compressed.
-//
-// Returns A Tensor with the same shape as input `bytes`, uncompressed
-// from bytes.
-func DecodeCompressed(scope *Scope, bytes tf.Output, optional ...DecodeCompressedAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "DecodeCompressed",
-		Input: []tf.Input{
-			bytes,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// DecodeRawAttr is an optional argument to DecodeRaw.
-type DecodeRawAttr func(optionalAttr)
-
-// DecodeRawLittleEndian sets the optional little_endian attribute to value.
-//
-// value: Whether the input `bytes` are in little-endian order.
-// Ignored for `out_type` values that are stored in a single byte like
-// `uint8`.
-// If not specified, defaults to true
-func DecodeRawLittleEndian(value bool) DecodeRawAttr {
-	return func(m optionalAttr) {
-		m["little_endian"] = value
-	}
-}
-
-// Reinterpret the bytes of a string as a vector of numbers.
-//
-// Arguments:
-//	bytes: All the elements must have the same length.
-//
-//
-// Returns A Tensor with one more dimension than the input `bytes`.  The
-// added dimension will have size equal to the length of the elements
-// of `bytes` divided by the number of bytes to represent `out_type`.
-func DecodeRaw(scope *Scope, bytes tf.Output, out_type tf.DataType, optional ...DecodeRawAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"out_type": out_type}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "DecodeRaw",
-		Input: []tf.Input{
-			bytes,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes natural logarithm of (1 + x) element-wise.
-//
-// I.e., \\(y = \log_e (1 + x)\\).
-func Log1p(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Log1p",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes rectified linear 6 gradients for a Relu6 operation.
-//
-// Arguments:
-//	gradients: The backpropagated gradients to the corresponding Relu6 operation.
-//	features: The features passed as input to the corresponding Relu6 operation, or
-// its output; using either one produces the same result.
-//
-// Returns The gradients:
-// `gradients * (features > 0) * (features < 6)`.
-func Relu6Grad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Relu6Grad",
-		Input: []tf.Input{
-			gradients, features,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResizeBicubicAttr is an optional argument to ResizeBicubic.
-type ResizeBicubicAttr func(optionalAttr)
-
-// ResizeBicubicAlignCorners sets the optional align_corners attribute to value.
-//
-// value: If true, the centers of the 4 corner pixels of the input and output tensors are
-// aligned, preserving the values at the corner pixels. Defaults to false.
-// If not specified, defaults to false
-func ResizeBicubicAlignCorners(value bool) ResizeBicubicAttr {
-	return func(m optionalAttr) {
-		m["align_corners"] = value
-	}
-}
-
-// Resize `images` to `size` using bicubic interpolation.
-//
-// Input images can be of different types but output images are always float.
-//
-// Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
-//
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func ResizeBicubic(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBicubicAttr) (resized_images tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResizeBicubic",
-		Input: []tf.Input{
-			images, size,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Gather ragged slices from `params` axis `0` according to `indices`.
-//
-// Outputs a `RaggedTensor` output composed from `output_dense_values` and
-// `output_nested_splits`, such that:
-//
-// ```python
-// output.shape = indices.shape + params.shape[1:]
-// output.ragged_rank = indices.shape.ndims + params.ragged_rank
-// output[i...j, d0...dn] = params[indices[i...j], d0...dn]
-// ```
-//
-// where
-//
-// * `params =
-//    ragged.from_nested_row_splits(params_dense_values, params_nested_splits)`
-//    provides the values that should be gathered.
-// * `indices` ia a dense tensor with dtype `int32` or `int64`, indicating which
-//    values should be gathered.
-// * `output =
-//    ragged.from_nested_row_splits(output_dense_values, output_nested_splits)`
-//    is the output tensor.
-//
-// (Note: This c++ op is used to implement the higher-level python
-// `tf.ragged.gather` op, which also supports ragged indices.)
-//
-//
-// Arguments:
-//	params_nested_splits: The `nested_row_splits` tensors that define the row-partitioning for the
-// `params` RaggedTensor input.
-//	params_dense_values: The `inner_values` for the `params` RaggedTensor. There was a terminology change
-// at the python level from dense_values to inner_values, so dense_values is the
-// deprecated name.
-//	indices: Indices in the outermost dimension of `params` of the values that should be
-// gathered.
-//	OUTPUT_RAGGED_RANK: The ragged rank of the output RaggedTensor. `output_nested_splits` will contain
-// this number of `row_splits` tensors. This value should equal
-// `indices.shape.ndims + params.ragged_rank - 1`.
-//
-// Returns The `nested_row_splits` tensors that define the row-partitioning for the
-// returned RaggedTensor.The `inner_values` for the returned RaggedTensor.
-func RaggedGather(scope *Scope, params_nested_splits []tf.Output, params_dense_values tf.Output, indices tf.Output, OUTPUT_RAGGED_RANK int64) (output_nested_splits []tf.Output, output_dense_values tf.Output) {
+// Returns One or more tensors that were dequeued as a tuple.
+func QueueDequeueV2(scope *Scope, handle tf.Output, component_types []tf.DataType, optional ...QueueDequeueV2Attr) (components []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"OUTPUT_RAGGED_RANK": OUTPUT_RAGGED_RANK}
+	attrs := map[string]interface{}{"component_types": component_types}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "RaggedGather",
+		Type: "QueueDequeueV2",
 		Input: []tf.Input{
-			tf.OutputList(params_nested_splits), params_dense_values, indices,
+			handle,
 		},
 		Attrs: attrs,
 	}
@@ -11067,422 +10482,363 @@ func RaggedGather(scope *Scope, params_nested_splits []tf.Output, params_dense_v
 	}
 	var idx int
 	var err error
-	if output_nested_splits, idx, err = makeOutputList(op, idx, "output_nested_splits"); err != nil {
-		scope.UpdateErr("RaggedGather", err)
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("QueueDequeueV2", err)
 		return
 	}
-	output_dense_values = op.Output(idx)
-	return output_nested_splits, output_dense_values
+	return components
 }
 
-// Greedily selects a subset of bounding boxes in descending order of score,
-//
-// pruning away boxes that have high intersection-over-union (IOU) overlap
-// with previously selected boxes.  Bounding boxes are supplied as
-// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
-// diagonal pair of box corners and the coordinates can be provided as normalized
-// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
-// is agnostic to where the origin is in the coordinate system.  Note that this
-// algorithm is invariant to orthogonal transformations and translations
-// of the coordinate system; thus translating or reflections of the coordinate
-// system result in the same boxes being selected by the algorithm.
-//
-// The output of this operation is a set of integers indexing into the input
-// collection of bounding boxes representing the selected boxes.  The bounding
-// box coordinates corresponding to the selected indices can then be obtained
-// using the `tf.gather operation`.  For example:
-//
-//   selected_indices = tf.image.non_max_suppression_v2(
-//       boxes, scores, max_output_size, iou_threshold)
-//   selected_boxes = tf.gather(boxes, selected_indices)
-//
-// Arguments:
-//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
-//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
-// score corresponding to each box (each row of boxes).
-//	max_output_size: A scalar integer tensor representing the maximum number of
-// boxes to be selected by non max suppression.
-//	iou_threshold: A 0-D float tensor representing the threshold for deciding whether
-// boxes overlap too much with respect to IOU.
+// ParseSequenceExampleAttr is an optional argument to ParseSequenceExample.
+type ParseSequenceExampleAttr func(optionalAttr)
+
+// ParseSequenceExampleNcontextSparse sets the optional Ncontext_sparse attribute to value.
+// If not specified, defaults to 0
 //
-// Returns A 1-D integer tensor of shape `[M]` representing the selected
-// indices from the boxes tensor, where `M <= max_output_size`.
-func NonMaxSuppressionV2(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, iou_threshold tf.Output) (selected_indices tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "NonMaxSuppressionV2",
-		Input: []tf.Input{
-			boxes, scores, max_output_size, iou_threshold,
-		},
+// REQUIRES: value >= 0
+func ParseSequenceExampleNcontextSparse(value int64) ParseSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["Ncontext_sparse"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Converts a `RaggedTensor` into a `SparseTensor` with the same values.
-//
-// input=ragged.from_nested_row_splits(rt_dense_values, rt_nested_splits)
-// output=SparseTensor(indices=sparse_indices, values=sparse_values,
-//                     dense_shape=sparse_dense_shape)
-//
-// Arguments:
-//	rt_nested_splits: The `row_splits` for the `RaggedTensor`.
-//	rt_dense_values: The `inner_values` for the `RaggedTensor`.
+// ParseSequenceExampleNcontextDense sets the optional Ncontext_dense attribute to value.
+// If not specified, defaults to 0
 //
-// Returns The indices for the `SparseTensor`.The values of the `SparseTensor`.`sparse_dense_shape` is a tight bounding box of the input `RaggedTensor`.
-func RaggedTensorToSparse(scope *Scope, rt_nested_splits []tf.Output, rt_dense_values tf.Output) (sparse_indices tf.Output, sparse_values tf.Output, sparse_dense_shape tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "RaggedTensorToSparse",
-		Input: []tf.Input{
-			tf.OutputList(rt_nested_splits), rt_dense_values,
-		},
+// REQUIRES: value >= 0
+func ParseSequenceExampleNcontextDense(value int64) ParseSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["Ncontext_dense"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Check if the input matches the regex pattern.
-//
-// The input is a string tensor of any shape. The pattern is a scalar
-// string tensor which is applied to every element of the input tensor.
-// The boolean values (True or False) of the output tensor indicate
-// if the input matches the regex pattern provided.
-//
-// The pattern follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)
-//
-// Arguments:
-//	input: A string tensor of the text to be processed.
-//	pattern: A scalar string tensor containing the regular expression to match the input.
+// ParseSequenceExampleNfeatureListSparse sets the optional Nfeature_list_sparse attribute to value.
+// If not specified, defaults to 0
 //
-// Returns A bool tensor with the same shape as `input`.
-func RegexFullMatch(scope *Scope, input tf.Output, pattern tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "RegexFullMatch",
-		Input: []tf.Input{
-			input, pattern,
-		},
+// REQUIRES: value >= 0
+func ParseSequenceExampleNfeatureListSparse(value int64) ParseSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["Nfeature_list_sparse"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Says whether the targets are in the top `K` predictions.
-//
-// This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
-// prediction for the target class is among the top `k` predictions among
-// all predictions for example `i`. Note that the behavior of `InTopK` differs
-// from the `TopK` op in its handling of ties; if multiple classes have the
-// same prediction value and straddle the top-`k` boundary, all of those
-// classes are considered to be in the top `k`.
-//
-// More formally, let
-//
-//   \\(predictions_i\\) be the predictions for all classes for example `i`,
-//   \\(targets_i\\) be the target class for example `i`,
-//   \\(out_i\\) be the output for example `i`,
+// ParseSequenceExampleNfeatureListDense sets the optional Nfeature_list_dense attribute to value.
+// If not specified, defaults to 0
 //
-// $$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
+// REQUIRES: value >= 0
+func ParseSequenceExampleNfeatureListDense(value int64) ParseSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["Nfeature_list_dense"] = value
+	}
+}
+
+// ParseSequenceExampleContextSparseTypes sets the optional context_sparse_types attribute to value.
 //
-// Arguments:
-//	predictions: A `batch_size` x `classes` tensor.
-//	targets: A `batch_size` vector of class ids.
-//	k: Number of top elements to look at for computing precision.
+// value: A list of Ncontext_sparse types; the data types of data in
+// each context Feature given in context_sparse_keys.
+// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
+// DT_INT64 (Int64List), and DT_STRING (BytesList).
+// If not specified, defaults to <>
 //
-// Returns Computed precision at `k` as a `bool Tensor`.
-func InTopKV2(scope *Scope, predictions tf.Output, targets tf.Output, k tf.Output) (precision tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "InTopKV2",
-		Input: []tf.Input{
-			predictions, targets, k,
-		},
+// REQUIRES: len(value) >= 0
+func ParseSequenceExampleContextSparseTypes(value []tf.DataType) ParseSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["context_sparse_types"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// RandomShuffleAttr is an optional argument to RandomShuffle.
-type RandomShuffleAttr func(optionalAttr)
+// ParseSequenceExampleFeatureListDenseTypes sets the optional feature_list_dense_types attribute to value.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func ParseSequenceExampleFeatureListDenseTypes(value []tf.DataType) ParseSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["feature_list_dense_types"] = value
+	}
+}
 
-// RandomShuffleSeed sets the optional seed attribute to value.
+// ParseSequenceExampleContextDenseShapes sets the optional context_dense_shapes attribute to value.
 //
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomShuffleSeed(value int64) RandomShuffleAttr {
+// value: A list of Ncontext_dense shapes; the shapes of data in
+// each context Feature given in context_dense_keys.
+// The number of elements in the Feature corresponding to context_dense_key[j]
+// must always equal context_dense_shapes[j].NumEntries().
+// The shape of context_dense_values[j] will match context_dense_shapes[j].
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func ParseSequenceExampleContextDenseShapes(value []tf.Shape) ParseSequenceExampleAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["context_dense_shapes"] = value
 	}
 }
 
-// RandomShuffleSeed2 sets the optional seed2 attribute to value.
+// ParseSequenceExampleFeatureListSparseTypes sets the optional feature_list_sparse_types attribute to value.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomShuffleSeed2(value int64) RandomShuffleAttr {
+// value: A list of Nfeature_list_sparse types; the data types
+// of data in each FeatureList given in feature_list_sparse_keys.
+// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
+// DT_INT64 (Int64List), and DT_STRING (BytesList).
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func ParseSequenceExampleFeatureListSparseTypes(value []tf.DataType) ParseSequenceExampleAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["feature_list_sparse_types"] = value
 	}
 }
 
-// Randomly shuffles a tensor along its first dimension.
+// ParseSequenceExampleFeatureListDenseShapes sets the optional feature_list_dense_shapes attribute to value.
 //
-//   The tensor is shuffled along dimension 0, such that each `value[j]` is mapped
-//   to one and only one `output[i]`. For example, a mapping that might occur for a
-//   3x2 tensor is:
+// value: A list of Nfeature_list_dense shapes; the shapes of
+// data in each FeatureList given in feature_list_dense_keys.
+// The shape of each Feature in the FeatureList corresponding to
+// feature_list_dense_key[j] must always equal
+// feature_list_dense_shapes[j].NumEntries().
+// If not specified, defaults to <>
 //
-// ```
-// [[1, 2],       [[5, 6],
-//  [3, 4],  ==>   [1, 2],
-//  [5, 6]]        [3, 4]]
-// ```
+// REQUIRES: len(value) >= 0
+func ParseSequenceExampleFeatureListDenseShapes(value []tf.Shape) ParseSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["feature_list_dense_shapes"] = value
+	}
+}
+
+// Transforms a vector of brain.SequenceExample protos (as strings) into typed tensors.
 //
 // Arguments:
-//	value: The tensor to be shuffled.
-//
-// Returns A tensor of same shape and type as `value`, shuffled along its first
-// dimension.
-func RandomShuffle(scope *Scope, value tf.Output, optional ...RandomShuffleAttr) (output tf.Output) {
+//	serialized: A vector containing binary serialized SequenceExample protos.
+//	debug_name: A vector containing the names of the serialized protos.
+// May contain, for example, table key (descriptive) name for the
+// corresponding serialized proto.  This is purely useful for debugging
+// purposes, and the presence of values here has no effect on the output.
+// May also be an empty vector if no name is available.
+//	context_dense_defaults: A list of Ncontext_dense Tensors (some may be empty).
+// context_dense_defaults[j] provides default values
+// when the SequenceExample's context map lacks context_dense_key[j].
+// If an empty Tensor is provided for context_dense_defaults[j],
+// then the Feature context_dense_keys[j] is required.
+// The input type is inferred from context_dense_defaults[j], even when it's
+// empty.  If context_dense_defaults[j] is not empty, its shape must match
+// context_dense_shapes[j].
+//	feature_list_dense_missing_assumed_empty: A vector listing the
+// FeatureList keys which may be missing from the SequenceExamples.  If the
+// associated FeatureList is missing, it is treated as empty.  By default,
+// any FeatureList not listed in this vector must exist in the SequenceExamples.
+//	context_sparse_keys: A list of Ncontext_sparse string Tensors (scalars).
+// The keys expected in the Examples' features associated with context_sparse
+// values.
+//	context_dense_keys: A list of Ncontext_dense string Tensors (scalars).
+// The keys expected in the SequenceExamples' context features associated with
+// dense values.
+//	feature_list_sparse_keys: A list of Nfeature_list_sparse string Tensors
+// (scalars).  The keys expected in the FeatureLists associated with sparse
+// values.
+//	feature_list_dense_keys: A list of Nfeature_list_dense string Tensors (scalars).
+// The keys expected in the SequenceExamples' feature_lists associated
+// with lists of dense values.
+func ParseSequenceExample(scope *Scope, serialized tf.Output, debug_name tf.Output, context_dense_defaults []tf.Output, feature_list_dense_missing_assumed_empty []string, context_sparse_keys []string, context_dense_keys []string, feature_list_sparse_keys []string, feature_list_dense_keys []string, optional ...ParseSequenceExampleAttr) (context_sparse_indices []tf.Output, context_sparse_values []tf.Output, context_sparse_shapes []tf.Output, context_dense_values []tf.Output, feature_list_sparse_indices []tf.Output, feature_list_sparse_values []tf.Output, feature_list_sparse_shapes []tf.Output, feature_list_dense_values []tf.Output, feature_list_dense_lengths []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"feature_list_dense_missing_assumed_empty": feature_list_dense_missing_assumed_empty, "context_sparse_keys": context_sparse_keys, "context_dense_keys": context_dense_keys, "feature_list_sparse_keys": feature_list_sparse_keys, "feature_list_dense_keys": feature_list_dense_keys}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomShuffle",
+		Type: "ParseSequenceExample",
 		Input: []tf.Input{
-			value,
+			serialized, debug_name, tf.OutputList(context_dense_defaults),
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if context_sparse_indices, idx, err = makeOutputList(op, idx, "context_sparse_indices"); err != nil {
+		scope.UpdateErr("ParseSequenceExample", err)
+		return
+	}
+	if context_sparse_values, idx, err = makeOutputList(op, idx, "context_sparse_values"); err != nil {
+		scope.UpdateErr("ParseSequenceExample", err)
+		return
+	}
+	if context_sparse_shapes, idx, err = makeOutputList(op, idx, "context_sparse_shapes"); err != nil {
+		scope.UpdateErr("ParseSequenceExample", err)
+		return
+	}
+	if context_dense_values, idx, err = makeOutputList(op, idx, "context_dense_values"); err != nil {
+		scope.UpdateErr("ParseSequenceExample", err)
+		return
+	}
+	if feature_list_sparse_indices, idx, err = makeOutputList(op, idx, "feature_list_sparse_indices"); err != nil {
+		scope.UpdateErr("ParseSequenceExample", err)
+		return
+	}
+	if feature_list_sparse_values, idx, err = makeOutputList(op, idx, "feature_list_sparse_values"); err != nil {
+		scope.UpdateErr("ParseSequenceExample", err)
+		return
+	}
+	if feature_list_sparse_shapes, idx, err = makeOutputList(op, idx, "feature_list_sparse_shapes"); err != nil {
+		scope.UpdateErr("ParseSequenceExample", err)
+		return
+	}
+	if feature_list_dense_values, idx, err = makeOutputList(op, idx, "feature_list_dense_values"); err != nil {
+		scope.UpdateErr("ParseSequenceExample", err)
+		return
+	}
+	if feature_list_dense_lengths, idx, err = makeOutputList(op, idx, "feature_list_dense_lengths"); err != nil {
+		scope.UpdateErr("ParseSequenceExample", err)
+		return
+	}
+	return context_sparse_indices, context_sparse_values, context_sparse_shapes, context_dense_values, feature_list_sparse_indices, feature_list_sparse_values, feature_list_sparse_shapes, feature_list_dense_values, feature_list_dense_lengths
 }
 
-// Returns the truth value of (x > y) element-wise.
-//
-// *NOTE*: `Greater` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Greater(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Computes the Gauss error function of `x` element-wise.
+func Erf(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Greater",
+		Type: "Erf",
 		Input: []tf.Input{
-			x, y,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceSparseApplyRMSPropAttr is an optional argument to ResourceSparseApplyRMSProp.
-type ResourceSparseApplyRMSPropAttr func(optionalAttr)
-
-// ResourceSparseApplyRMSPropUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var, ms, and mom tensors is protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyRMSPropUseLocking(value bool) ResourceSparseApplyRMSPropAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the RMSProp algorithm.
-//
-// Note that in dense implementation of this algorithm, ms and mom will
-// update even if the grad is zero, but in this sparse implementation, ms
-// and mom will not update in iterations during which the grad is zero.
-//
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
-//
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-// var <- var - mom
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
-//
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var, ms and mom.
-//
-// Returns the created operation.
-func ResourceSparseApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyRMSPropAttr) (o *tf.Operation) {
+// Returns element-wise largest integer not greater than x.
+func Floor(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyRMSProp",
+		Type: "Floor",
 		Input: []tf.Input{
-			var_, ms, mom, lr, rho, momentum, epsilon, grad, indices,
+			x,
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// SampleDistortedBoundingBoxAttr is an optional argument to SampleDistortedBoundingBox.
-type SampleDistortedBoundingBoxAttr func(optionalAttr)
+// OneHotAttr is an optional argument to OneHot.
+type OneHotAttr func(optionalAttr)
 
-// SampleDistortedBoundingBoxSeed sets the optional seed attribute to value.
+// OneHotAxis sets the optional axis attribute to value.
 //
-// value: If either `seed` or `seed2` are set to non-zero, the random number
-// generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
-// seed.
-// If not specified, defaults to 0
-func SampleDistortedBoundingBoxSeed(value int64) SampleDistortedBoundingBoxAttr {
+// value: The axis to fill (default: -1, a new inner-most axis).
+// If not specified, defaults to -1
+func OneHotAxis(value int64) OneHotAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["axis"] = value
 	}
 }
 
-// SampleDistortedBoundingBoxSeed2 sets the optional seed2 attribute to value.
+// Returns a one-hot tensor.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func SampleDistortedBoundingBoxSeed2(value int64) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxMinObjectCovered sets the optional min_object_covered attribute to value.
+// The locations represented by indices in `indices` take value `on_value`,
+// while all other locations take value `off_value`.
 //
-// value: The cropped area of the image must contain at least this
-// fraction of any bounding box supplied. The value of this parameter should be
-// non-negative. In the case of 0, the cropped area does not need to overlap
-// any of the bounding boxes supplied.
-// If not specified, defaults to 0.1
-func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["min_object_covered"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxAspectRatioRange sets the optional aspect_ratio_range attribute to value.
+// If the input `indices` is rank `N`, the output will have rank `N+1`,
+// The new axis is created at dimension `axis` (default: the new axis is
+// appended at the end).
 //
-// value: The cropped area of the image must have an aspect ratio =
-// width / height within this range.
-// If not specified, defaults to <f:0.75 f:1.33 >
-func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["aspect_ratio_range"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxAreaRange sets the optional area_range attribute to value.
+// If `indices` is a scalar the output shape will be a vector of length `depth`.
 //
-// value: The cropped area of the image must contain a fraction of the
-// supplied image within this range.
-// If not specified, defaults to <f:0.05 f:1 >
-func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["area_range"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxMaxAttempts sets the optional max_attempts attribute to value.
+// If `indices` is a vector of length `features`, the output shape will be:
+// ```
+//   features x depth if axis == -1
+//   depth x features if axis == 0
+// ```
 //
-// value: Number of attempts at generating a cropped region of the image
-// of the specified constraints. After `max_attempts` failures, return the entire
-// image.
-// If not specified, defaults to 100
-func SampleDistortedBoundingBoxMaxAttempts(value int64) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["max_attempts"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value.
+// If `indices` is a matrix (batch) with shape `[batch, features]`,
+// the output shape will be:
+// ```
+//   batch x features x depth if axis == -1
+//   batch x depth x features if axis == 1
+//   depth x batch x features if axis == 0
+// ```
 //
-// value: Controls behavior if no bounding boxes supplied.
-// If true, assume an implicit bounding box covering the whole input. If false,
-// raise an error.
-// If not specified, defaults to false
-func SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["use_image_if_no_bounding_boxes"] = value
-	}
-}
-
-// Generate a single randomly distorted bounding box for an image.
 //
-// Bounding box annotations are often supplied in addition to ground-truth labels
-// in image recognition or object localization tasks. A common technique for
-// training such a system is to randomly distort an image while preserving
-// its content, i.e. *data augmentation*. This Op outputs a randomly distorted
-// localization of an object, i.e. bounding box, given an `image_size`,
-// `bounding_boxes` and a series of constraints.
+// Examples
+// =========
+//
+// Suppose that
+//
+// ```
+//   indices = [0, 2, -1, 1]
+//   depth = 3
+//   on_value = 5.0
+//   off_value = 0.0
+//   axis = -1
+// ```
 //
-// The output of this Op is a single bounding box that may be used to crop the
-// original image. The output is returned as 3 tensors: `begin`, `size` and
-// `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
-// image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
-// what the bounding box looks like.
+// Then output is `[4 x 3]`:
 //
-// Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
-// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
-// height of the underlying image.
+//     ```output =
+//       [5.0 0.0 0.0]  // one_hot(0)
+//       [0.0 0.0 5.0]  // one_hot(2)
+//       [0.0 0.0 0.0]  // one_hot(-1)
+//       [0.0 5.0 0.0]  // one_hot(1)
+//     ```
 //
-// For example,
+// Suppose that
 //
-// ```python
-//     # Generate a single distorted bounding box.
-//     begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
-//         tf.shape(image),
-//         bounding_boxes=bounding_boxes)
+// ```
+//   indices = [0, 2, -1, 1]
+//   depth = 3
+//   on_value = 0.0
+//   off_value = 3.0
+//   axis = 0
+// ```
 //
-//     # Draw the bounding box in an image summary.
-//     image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
-//                                                   bbox_for_draw)
-//     tf.summary.image('images_with_box', image_with_box)
+// Then output is `[3 x 4]`:
 //
-//     # Employ the bounding box to distort the image.
-//     distorted_image = tf.slice(image, begin, size)
+//     ```output =
+//       [0.0 3.0 3.0 3.0]
+//       [3.0 3.0 3.0 0.0]
+//       [3.0 3.0 3.0 3.0]
+//       [3.0 0.0 3.0 3.0]
+//     //  ^                one_hot(0)
+//     //      ^            one_hot(2)
+//     //          ^        one_hot(-1)
+//     //              ^    one_hot(1)
+//     ```
+// Suppose that
+//
+// ```
+//   indices = [[0, 2], [1, -1]]
+//   depth = 3
+//   on_value = 1.0
+//   off_value = 0.0
+//   axis = -1
 // ```
 //
-// Note that if no bounding box information is available, setting
-// `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
-// bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
-// false and no bounding boxes are supplied, an error is raised.
+// Then output is `[2 x 2 x 3]`:
+//
+//     ```output =
+//       [
+//         [1.0, 0.0, 0.0]  // one_hot(0)
+//         [0.0, 0.0, 1.0]  // one_hot(2)
+//       ][
+//         [0.0, 1.0, 0.0]  // one_hot(1)
+//         [0.0, 0.0, 0.0]  // one_hot(-1)
+//       ]```
 //
 // Arguments:
-//	image_size: 1-D, containing `[height, width, channels]`.
-//	bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
-// associated with the image.
+//	indices: A tensor of indices.
+//	depth: A scalar defining the depth of the one hot dimension.
+//	on_value: A scalar defining the value to fill in output when `indices[j] = i`.
+//	off_value: A scalar defining the value to fill in output when `indices[j] != i`.
 //
-// Returns 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
-// `tf.slice`.1-D, containing `[target_height, target_width, -1]`. Provide as input to
-// `tf.slice`.3-D with shape `[1, 1, 4]` containing the distorted bounding box.
-// Provide as input to `tf.image.draw_bounding_boxes`.
-func SampleDistortedBoundingBox(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, optional ...SampleDistortedBoundingBoxAttr) (begin tf.Output, size tf.Output, bboxes tf.Output) {
+// Returns The one-hot tensor.
+func OneHot(scope *Scope, indices tf.Output, depth tf.Output, on_value tf.Output, off_value tf.Output, optional ...OneHotAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -11491,25 +10847,23 @@ func SampleDistortedBoundingBox(scope *Scope, image_size tf.Output, bounding_box
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SampleDistortedBoundingBox",
+		Type: "OneHot",
 		Input: []tf.Input{
-			image_size, bounding_boxes,
+			indices, depth, on_value, off_value,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Computes sigmoid of `x` element-wise.
-//
-// Specifically, `y = 1 / (1 + exp(-x))`.
-func Sigmoid(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes exponential of x element-wise.  \\(y = e^x\\).
+func Exp(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Sigmoid",
+		Type: "Exp",
 		Input: []tf.Input{
 			x,
 		},
@@ -11518,296 +10872,358 @@ func Sigmoid(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// FusedBatchNormAttr is an optional argument to FusedBatchNorm.
-type FusedBatchNormAttr func(optionalAttr)
+// NthElementAttr is an optional argument to NthElement.
+type NthElementAttr func(optionalAttr)
 
-// FusedBatchNormEpsilon sets the optional epsilon attribute to value.
+// NthElementReverse sets the optional reverse attribute to value.
 //
-// value: A small float number added to the variance of x.
-// If not specified, defaults to 0.0001
-func FusedBatchNormEpsilon(value float32) FusedBatchNormAttr {
+// value: When set to True, find the nth-largest value in the vector and vice
+// versa.
+// If not specified, defaults to false
+func NthElementReverse(value bool) NthElementAttr {
 	return func(m optionalAttr) {
-		m["epsilon"] = value
+		m["reverse"] = value
 	}
 }
 
-// FusedBatchNormDataFormat sets the optional data_format attribute to value.
+// Finds values of the `n`-th order statistic for the last dimension.
 //
-// value: The data format for x and y. Either "NHWC" (default) or "NCHW".
-// If not specified, defaults to "NHWC"
-func FusedBatchNormDataFormat(value string) FusedBatchNormAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
+// If the input is a vector (rank-1), finds the entries which is the nth-smallest
+// value in the vector and outputs their values as scalar tensor.
+//
+// For matrices (resp. higher rank input), computes the entries which is the
+// nth-smallest value in each row (resp. vector along the last dimension). Thus,
+//
+//     values.shape = input.shape[:-1]
+//
+// Arguments:
+//	input: 1-D or higher with last dimension at least `n+1`.
+//	n: 0-D. Position of sorted vector to select along the last dimension (along
+// each row for matrices). Valid range of n is `[0, input.shape[:-1])`
+//
+// Returns The `n`-th order statistic along each last dimensional slice.
+func NthElement(scope *Scope, input tf.Output, n tf.Output, optional ...NthElementAttr) (values tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "NthElement",
+		Input: []tf.Input{
+			input, n,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// FusedBatchNormIsTraining sets the optional is_training attribute to value.
+// Computes the maximum along segments of a tensor.
 //
-// value: A bool value to indicate the operation is for training (default)
-// or inference.
-// If not specified, defaults to true
-func FusedBatchNormIsTraining(value bool) FusedBatchNormAttr {
-	return func(m optionalAttr) {
-		m["is_training"] = value
+// Read
+// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+// for an explanation of segments.
+//
+// This operator is similar to the unsorted segment sum operator found
+// [(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
+// Instead of computing the sum over segments, it computes the maximum such that:
+//
+// \\(output_i = \max_{j...} data[j...]\\) where max is over tuples `j...` such
+// that `segment_ids[j...] == i`.
+//
+// If the maximum is empty for a given segment ID `i`, it outputs the smallest
+// possible value for the specific numeric type,
+// `output[i] = numeric_limits<T>::lowest()`.
+//
+// If the given segment ID `i` is negative, then the corresponding value is
+// dropped, and will not be included in the result.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentMax.png" alt>
+// </div>
+//
+// Arguments:
+//
+//	segment_ids: A tensor whose shape is a prefix of `data.shape`.END
+//   }
+//   out_arg {
+//     name: "output"
+//     description: <<END
+// Has same shape as data, except for the first `segment_ids.rank`
+// dimensions, which are replaced with a single dimension which has size
+// `num_segments`.
+//
+func UnsortedSegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "UnsortedSegmentMax",
+		Input: []tf.Input{
+			data, segment_ids, num_segments,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Batch normalization.
-//
-// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-// The size of 1D Tensors matches the dimension C of the 4D Tensors.
+// Transforms a vector of brain.Example protos (as strings) into typed tensors.
 //
 // Arguments:
-//	x: A 4D Tensor for input data.
-//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
-//	offset: A 1D Tensor for offset, to shift to the normalized x.
-//	mean: A 1D Tensor for population mean. Used for inference only;
-// must be empty for training.
-//	variance: A 1D Tensor for population variance. Used for inference only;
-// must be empty for training.
-//
-// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow
-// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by
-// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused
-// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance
-// in the cuDNN case), to be reused in the gradient computation.
-func FusedBatchNorm(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormAttr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) {
+//	serialized: A vector containing a batch of binary serialized Example protos.
+//	names: A vector containing the names of the serialized protos.
+// May contain, for example, table key (descriptive) names for the
+// corresponding serialized protos.  These are purely useful for debugging
+// purposes, and the presence of values here has no effect on the output.
+// May also be an empty vector if no names are available.
+// If non-empty, this vector must be the same length as "serialized".
+//	sparse_keys: A list of Nsparse string Tensors (scalars).
+// The keys expected in the Examples' features associated with sparse values.
+//	dense_keys: A list of Ndense string Tensors (scalars).
+// The keys expected in the Examples' features associated with dense values.
+//	dense_defaults: A list of Ndense Tensors (some may be empty).
+// dense_defaults[j] provides default values
+// when the example's feature_map lacks dense_key[j].  If an empty Tensor is
+// provided for dense_defaults[j], then the Feature dense_keys[j] is required.
+// The input type is inferred from dense_defaults[j], even when it's empty.
+// If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
+// then the shape of dense_defaults[j] must match that of dense_shapes[j].
+// If dense_shapes[j] has an undefined major dimension (variable strides dense
+// feature), dense_defaults[j] must contain a single element:
+// the padding element.
+//	sparse_types: A list of Nsparse types; the data types of data in each Feature
+// given in sparse_keys.
+// Currently the ParseExample supports DT_FLOAT (FloatList),
+// DT_INT64 (Int64List), and DT_STRING (BytesList).
+//	dense_shapes: A list of Ndense shapes; the shapes of data in each Feature
+// given in dense_keys.
+// The number of elements in the Feature corresponding to dense_key[j]
+// must always equal dense_shapes[j].NumEntries().
+// If dense_shapes[j] == (D0, D1, ..., DN) then the shape of output
+// Tensor dense_values[j] will be (|serialized|, D0, D1, ..., DN):
+// The dense outputs are just the inputs row-stacked by batch.
+// This works for dense_shapes[j] = (-1, D1, ..., DN).  In this case
+// the shape of the output Tensor dense_values[j] will be
+// (|serialized|, M, D1, .., DN), where M is the maximum number of blocks
+// of elements of length D1 * .... * DN, across all minibatch entries
+// in the input.  Any minibatch entry with less than M blocks of elements of
+// length D1 * ... * DN will be padded with the corresponding default_value
+// scalar element along the second dimension.
+func ParseExample(scope *Scope, serialized tf.Output, names tf.Output, sparse_keys []tf.Output, dense_keys []tf.Output, dense_defaults []tf.Output, sparse_types []tf.DataType, dense_shapes []tf.Shape) (sparse_indices []tf.Output, sparse_values []tf.Output, sparse_shapes []tf.Output, dense_values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"sparse_types": sparse_types, "dense_shapes": dense_shapes}
 	opspec := tf.OpSpec{
-		Type: "FusedBatchNorm",
+		Type: "ParseExample",
 		Input: []tf.Input{
-			x, scale, offset, mean, variance,
+			serialized, names, tf.OutputList(sparse_keys), tf.OutputList(dense_keys), tf.OutputList(dense_defaults),
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
-}
-
-// RandomStandardNormalAttr is an optional argument to RandomStandardNormal.
-type RandomStandardNormalAttr func(optionalAttr)
-
-// RandomStandardNormalSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomStandardNormalSeed(value int64) RandomStandardNormalAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// RandomStandardNormalSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomStandardNormalSeed2(value int64) RandomStandardNormalAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
+	var idx int
+	var err error
+	if sparse_indices, idx, err = makeOutputList(op, idx, "sparse_indices"); err != nil {
+		scope.UpdateErr("ParseExample", err)
+		return
+	}
+	if sparse_values, idx, err = makeOutputList(op, idx, "sparse_values"); err != nil {
+		scope.UpdateErr("ParseExample", err)
+		return
+	}
+	if sparse_shapes, idx, err = makeOutputList(op, idx, "sparse_shapes"); err != nil {
+		scope.UpdateErr("ParseExample", err)
+		return
+	}
+	if dense_values, idx, err = makeOutputList(op, idx, "dense_values"); err != nil {
+		scope.UpdateErr("ParseExample", err)
+		return
 	}
+	return sparse_indices, sparse_values, sparse_shapes, dense_values
 }
 
-// Outputs random values from a normal distribution.
+// Compute the pairwise cross product.
 //
-// The generated values will have mean 0 and standard deviation 1.
+// `a` and `b` must be the same shape; they can either be simple 3-element vectors,
+// or any shape where the innermost dimension is 3. In the latter case, each pair
+// of corresponding 3-element vectors is cross-multiplied independently.
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	dtype: The type of the output.
+//	a: A tensor containing 3-element vectors.
+//	b: Another tensor, of same type and shape as `a`.
 //
-// Returns A tensor of the specified shape filled with random normal values.
-func RandomStandardNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomStandardNormalAttr) (output tf.Output) {
+// Returns Pairwise cross product of the vectors in `a` and `b`.
+func Cross(scope *Scope, a tf.Output, b tf.Output) (product tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "RandomStandardNormal",
+		Type: "Cross",
 		Input: []tf.Input{
-			shape,
+			a, b,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// RandomUniformIntAttr is an optional argument to RandomUniformInt.
-type RandomUniformIntAttr func(optionalAttr)
+// CudnnRNNAttr is an optional argument to CudnnRNN.
+type CudnnRNNAttr func(optionalAttr)
 
-// RandomUniformIntSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomUniformIntSeed(value int64) RandomUniformIntAttr {
+// CudnnRNNRnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNRnnMode(value string) CudnnRNNAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["rnn_mode"] = value
 	}
 }
 
-// RandomUniformIntSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomUniformIntSeed2(value int64) RandomUniformIntAttr {
+// CudnnRNNInputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNInputMode(value string) CudnnRNNAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["input_mode"] = value
 	}
 }
 
-// Outputs random integers from a uniform distribution.
-//
-// The generated values are uniform integers in the range `[minval, maxval)`.
-// The lower bound `minval` is included in the range, while the upper bound
-// `maxval` is excluded.
-//
-// The random integers are slightly biased unless `maxval - minval` is an exact
-// power of two.  The bias is small for values of `maxval - minval` significantly
-// smaller than the range of the output (either `2^32` or `2^64`).
-//
-// Arguments:
-//	shape: The shape of the output tensor.
-//	minval: 0-D.  Inclusive lower bound on the generated integers.
-//	maxval: 0-D.  Exclusive upper bound on the generated integers.
-//
-// Returns A tensor of the specified shape filled with uniform random integers.
-func RandomUniformInt(scope *Scope, shape tf.Output, minval tf.Output, maxval tf.Output, optional ...RandomUniformIntAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
+// CudnnRNNDirection sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNDirection(value string) CudnnRNNAttr {
+	return func(m optionalAttr) {
+		m["direction"] = value
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
+}
+
+// CudnnRNNDropout sets the optional dropout attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNDropout(value float32) CudnnRNNAttr {
+	return func(m optionalAttr) {
+		m["dropout"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "RandomUniformInt",
-		Input: []tf.Input{
-			shape, minval, maxval,
-		},
-		Attrs: attrs,
+}
+
+// CudnnRNNSeed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNSeed(value int64) CudnnRNNAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// FusedResizeAndPadConv2DAttr is an optional argument to FusedResizeAndPadConv2D.
-type FusedResizeAndPadConv2DAttr func(optionalAttr)
+// CudnnRNNSeed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNSeed2(value int64) CudnnRNNAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
 
-// FusedResizeAndPadConv2DResizeAlignCorners sets the optional resize_align_corners attribute to value.
-//
-// value: If true, the centers of the 4 corner pixels of the input and output tensors are
-// aligned, preserving the values at the corner pixels. Defaults to false.
-// If not specified, defaults to false
-func FusedResizeAndPadConv2DResizeAlignCorners(value bool) FusedResizeAndPadConv2DAttr {
+// CudnnRNNIsTraining sets the optional is_training attribute to value.
+// If not specified, defaults to true
+func CudnnRNNIsTraining(value bool) CudnnRNNAttr {
 	return func(m optionalAttr) {
-		m["resize_align_corners"] = value
+		m["is_training"] = value
 	}
 }
 
-// Performs a resize and padding as a preprocess during a convolution.
-//
-// It's often possible to do spatial transformations more efficiently as part of
-// the packing stage of a convolution, so this op allows for an optimized
-// implementation where these stages are fused together. This prevents the need to
-// write out the intermediate results as whole tensors, reducing memory pressure,
-// and we can get some latency gains by merging the transformation calculations.
-// The data_format attribute for Conv2D isn't supported by this op, and defaults to
-// 'NHWC' order.
-// Internally this op uses a single per-graph scratch buffer, which means that it
-// will block if multiple versions are being run in parallel. This is because this
-// operator is primarily an optimization to minimize memory usage.
+// A RNN backed by cuDNN.
 //
-// Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
-//	size: A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
-//	paddings: A two-column matrix specifying the padding sizes. The number of
-// rows must be the same as the rank of `input`.
-//	filter: 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.
+// Computes the RNN from the input and initial states, with respect to the params
+// buffer.
 //
-//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
-// of `input`. Must be in the same order as the dimension specified with format.
-//	padding: The type of padding algorithm to use.
-func FusedResizeAndPadConv2D(scope *Scope, input tf.Output, size tf.Output, paddings tf.Output, filter tf.Output, mode string, strides []int64, padding string, optional ...FusedResizeAndPadConv2DAttr) (output tf.Output) {
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicate whether there is a linear projection between the input and
+//   the actual computation before the first layer. 'skip_input' is only allowed
+//   when input_size == num_units; 'auto_select' implies 'skip_input' when
+//   input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used. Should be
+//   "unidirectional" or "bidirectional".
+// dropout: Dropout probability. When set to 0., dropout is disabled.
+// seed: The 1st part of a seed to initialize dropout.
+// seed2: The 2nd part of a seed to initialize dropout.
+// input: A 3-D tensor with the shape of [seq_length, batch_size, input_size].
+// input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size,
+//     num_units].
+// input_c: For LSTM, a 3-D tensor with the shape of
+//     [num_layer * dir, batch, num_units]. For other models, it is ignored.
+// params: A 1-D tensor that contains the weights and biases in an opaque layout.
+//     The size must be created through CudnnRNNParamsSize, and initialized
+//     separately. Note that they might not be compatible across different
+//     generations. So it is a good idea to save and restore
+// output: A 3-D tensor with the shape of [seq_length, batch_size,
+//     dir * num_units].
+// output_h: The same shape has input_h.
+// output_c: The same shape as input_c for LSTM. An empty tensor for other models.
+// is_training: Indicates whether this operation is used for inferenece or
+//   training.
+// reserve_space: An opaque tensor that can be used in backprop calculation. It
+//   is only produced if is_training is false.
+func CudnnRNN(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, optional ...CudnnRNNAttr) (output tf.Output, output_h tf.Output, output_c tf.Output, reserve_space tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
-	opspec := tf.OpSpec{
-		Type: "FusedResizeAndPadConv2D",
-		Input: []tf.Input{
-			input, size, paddings, filter,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// RandomUniformAttr is an optional argument to RandomUniform.
-type RandomUniformAttr func(optionalAttr)
-
-// RandomUniformSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomUniformSeed(value int64) RandomUniformAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
+	opspec := tf.OpSpec{
+		Type: "CudnnRNN",
+		Input: []tf.Input{
+			input, input_h, input_c, params,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
 }
 
-// RandomUniformSeed2 sets the optional seed2 attribute to value.
+// DecodeCompressedAttr is an optional argument to DecodeCompressed.
+type DecodeCompressedAttr func(optionalAttr)
+
+// DecodeCompressedCompressionType sets the optional compression_type attribute to value.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomUniformSeed2(value int64) RandomUniformAttr {
+// value: A scalar containing either (i) the empty string (no
+// compression), (ii) "ZLIB", or (iii) "GZIP".
+// If not specified, defaults to ""
+func DecodeCompressedCompressionType(value string) DecodeCompressedAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["compression_type"] = value
 	}
 }
 
-// Outputs random values from a uniform distribution.
+// Decompress strings.
 //
-// The generated values follow a uniform distribution in the range `[0, 1)`. The
-// lower bound 0 is included in the range, while the upper bound 1 is excluded.
+// This op decompresses each element of the `bytes` input `Tensor`, which
+// is assumed to be compressed using the given `compression_type`.
+//
+// The `output` is a string `Tensor` of the same shape as `bytes`,
+// each element containing the decompressed data from the corresponding
+// element in `bytes`.
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	dtype: The type of the output.
+//	bytes: A Tensor of string which is compressed.
 //
-// Returns A tensor of the specified shape filled with uniform random values.
-func RandomUniform(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomUniformAttr) (output tf.Output) {
+// Returns A Tensor with the same shape as input `bytes`, uncompressed
+// from bytes.
+func DecodeCompressed(scope *Scope, bytes tf.Output, optional ...DecodeCompressedAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomUniform",
+		Type: "DecodeCompressed",
 		Input: []tf.Input{
-			shape,
+			bytes,
 		},
 		Attrs: attrs,
 	}
@@ -11815,395 +11231,378 @@ func RandomUniform(scope *Scope, shape tf.Output, dtype tf.DataType, optional ..
 	return op.Output(0)
 }
 
-// ResourceApplyFtrlAttr is an optional argument to ResourceApplyFtrl.
-type ResourceApplyFtrlAttr func(optionalAttr)
+// DecodeRawAttr is an optional argument to DecodeRaw.
+type DecodeRawAttr func(optionalAttr)
 
-// ResourceApplyFtrlUseLocking sets the optional use_locking attribute to value.
+// DecodeRawLittleEndian sets the optional little_endian attribute to value.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyFtrlUseLocking(value bool) ResourceApplyFtrlAttr {
+// value: Whether the input `bytes` are in little-endian order.
+// Ignored for `out_type` values that are stored in a single byte like
+// `uint8`.
+// If not specified, defaults to true
+func DecodeRawLittleEndian(value bool) DecodeRawAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["little_endian"] = value
 	}
 }
 
-// Update '*var' according to the Ftrl-proximal scheme.
-//
-// accum_new = accum + grad * grad
-// linear += grad - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-// accum = accum_new
+// Reinterpret the bytes of a string as a vector of numbers.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	linear: Should be from a Variable().
-//	grad: The gradient.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regulariation. Must be a scalar.
-//	l2: L2 regulariation. Must be a scalar.
-//	lr_power: Scaling factor. Must be a scalar.
+//	bytes: All the elements must have the same length.
 //
-// Returns the created operation.
-func ResourceApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlAttr) (o *tf.Operation) {
+//
+// Returns A Tensor with one more dimension than the input `bytes`.  The
+// added dimension will have size equal to the length of the elements
+// of `bytes` divided by the number of bytes to represent `out_type`.
+func DecodeRaw(scope *Scope, bytes tf.Output, out_type tf.DataType, optional ...DecodeRawAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"out_type": out_type}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyFtrl",
+		Type: "DecodeRaw",
 		Input: []tf.Input{
-			var_, accum, linear, grad, lr, l1, l2, lr_power,
+			bytes,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Locks a mutex resource.  The output is the lock.  So long as the lock tensor
-//
-// is alive, any other request to use `MutexLock` with this mutex will wait.
-//
-// This is particularly useful for creating a critical section when used in
-// conjunction with `MutexLockIdentity`:
-//
-// ```python
-//
-// mutex = mutex_v2(
-//   shared_name=handle_name, container=container, name=name)
-//
-// def execute_in_critical_section(fn, *args, **kwargs):
-//   lock = gen_resource_variable_ops.mutex_lock(mutex)
-//
-//   with ops.control_dependencies([lock]):
-//     r = fn(*args, **kwargs)
-//
-//   with ops.control_dependencies(nest.flatten(r)):
-//     with ops.colocate_with(mutex):
-//       ensure_lock_exists = mutex_lock_identity(lock)
-//
-//     # Make sure that if any element of r is accessed, all of
-//     # them are executed together.
-//     r = nest.map_structure(tf.identity, r)
-//
-//   with ops.control_dependencies([ensure_lock_exists]):
-//     return nest.map_structure(tf.identity, r)
-// ```
-//
-// While `fn` is running in the critical section, no other functions which wish to
-// use this critical section may run.
-//
-// Often the use case is that two executions of the same graph, in parallel,
-// wish to run `fn`; and we wish to ensure that only one of them executes
-// at a time.  This is especially important if `fn` modifies one or more
-// variables at a time.
-//
-// It is also useful if two separate functions must share a resource, but we
-// wish to ensure the usage is exclusive.
-//
-// Arguments:
-//	mutex: The mutex resource to lock.
+// Computes natural logarithm of (1 + x) element-wise.
 //
-// Returns A tensor that keeps a shared pointer to a lock on the mutex;
-// when the Tensor is destroyed, the use count on the shared pointer is decreased
-// by 1.  When it reaches 0, the lock is released.
-func MutexLock(scope *Scope, mutex tf.Output) (mutex_lock tf.Output) {
+// I.e., \\(y = \log_e (1 + x)\\).
+func Log1p(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "MutexLock",
+		Type: "Log1p",
 		Input: []tf.Input{
-			mutex,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Transforms a serialized tensorflow.TensorProto proto into a Tensor.
+// Computes rectified linear 6 gradients for a Relu6 operation.
 //
 // Arguments:
-//	serialized: A scalar string containing a serialized TensorProto proto.
-//	out_type: The type of the serialized tensor.  The provided type must match the
-// type of the serialized tensor and no implicit conversion will take place.
+//	gradients: The backpropagated gradients to the corresponding Relu6 operation.
+//	features: The features passed as input to the corresponding Relu6 operation, or
+// its output; using either one produces the same result.
 //
-// Returns A Tensor of type `out_type`.
-func ParseTensor(scope *Scope, serialized tf.Output, out_type tf.DataType) (output tf.Output) {
+// Returns The gradients:
+// `gradients * (features > 0) * (features < 6)`.
+func Relu6Grad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "ParseTensor",
+		Type: "Relu6Grad",
 		Input: []tf.Input{
-			serialized,
+			gradients, features,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MaxPoolWithArgmaxAttr is an optional argument to MaxPoolWithArgmax.
-type MaxPoolWithArgmaxAttr func(optionalAttr)
+// ResizeBicubicAttr is an optional argument to ResizeBicubic.
+type ResizeBicubicAttr func(optionalAttr)
 
-// MaxPoolWithArgmaxTargmax sets the optional Targmax attribute to value.
-// If not specified, defaults to DT_INT64
-func MaxPoolWithArgmaxTargmax(value tf.DataType) MaxPoolWithArgmaxAttr {
+// ResizeBicubicAlignCorners sets the optional align_corners attribute to value.
+//
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
+// If not specified, defaults to false
+func ResizeBicubicAlignCorners(value bool) ResizeBicubicAttr {
 	return func(m optionalAttr) {
-		m["Targmax"] = value
+		m["align_corners"] = value
 	}
 }
 
-// Performs max pooling on the input and outputs both max values and indices.
+// Resize `images` to `size` using bicubic interpolation.
 //
-// The indices in `argmax` are flattened, so that a maximum value at position
-// `[b, y, x, c]` becomes flattened index
-// `((b * height + y) * width + x) * channels + c`.
+// Input images can be of different types but output images are always float.
+//
+// Arguments:
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
+//
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeBicubic(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBicubicAttr) (resized_images tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResizeBicubic",
+		Input: []tf.Input{
+			images, size,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Gather ragged slices from `params` axis `0` according to `indices`.
+//
+// Outputs a `RaggedTensor` output composed from `output_dense_values` and
+// `output_nested_splits`, such that:
+//
+// ```python
+// output.shape = indices.shape + params.shape[1:]
+// output.ragged_rank = indices.shape.ndims + params.ragged_rank
+// output[i...j, d0...dn] = params[indices[i...j], d0...dn]
+// ```
+//
+// where
+//
+// * `params =
+//    ragged.from_nested_row_splits(params_dense_values, params_nested_splits)`
+//    provides the values that should be gathered.
+// * `indices` ia a dense tensor with dtype `int32` or `int64`, indicating which
+//    values should be gathered.
+// * `output =
+//    ragged.from_nested_row_splits(output_dense_values, output_nested_splits)`
+//    is the output tensor.
+//
+// (Note: This c++ op is used to implement the higher-level python
+// `tf.ragged.gather` op, which also supports ragged indices.)
 //
-// The indices returned are always in `[0, height) x [0, width)` before flattening,
-// even if padding is involved and the mathematically correct answer is outside
-// (either negative or too large).  This is a bug, but fixing it is difficult to do
-// in a safe backwards compatible way, especially due to flattening.
 //
 // Arguments:
-//	input: 4-D with shape `[batch, height, width, channels]`.  Input to pool over.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+//	params_nested_splits: The `nested_row_splits` tensors that define the row-partitioning for the
+// `params` RaggedTensor input.
+//	params_dense_values: The `inner_values` for the `params` RaggedTensor. There was a terminology change
+// at the python level from dense_values to inner_values, so dense_values is the
+// deprecated name.
+//	indices: Indices in the outermost dimension of `params` of the values that should be
+// gathered.
+//	OUTPUT_RAGGED_RANK: The ragged rank of the output RaggedTensor. `output_nested_splits` will contain
+// this number of `row_splits` tensors. This value should equal
+// `indices.shape.ndims + params.ragged_rank - 1`.
 //
-// Returns The max pooled output tensor.4-D.  The flattened indices of the max values chosen for each output.
-func MaxPoolWithArgmax(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolWithArgmaxAttr) (output tf.Output, argmax tf.Output) {
+// Returns The `nested_row_splits` tensors that define the row-partitioning for the
+// returned RaggedTensor.The `inner_values` for the returned RaggedTensor.
+func RaggedGather(scope *Scope, params_nested_splits []tf.Output, params_dense_values tf.Output, indices tf.Output, OUTPUT_RAGGED_RANK int64) (output_nested_splits []tf.Output, output_dense_values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"OUTPUT_RAGGED_RANK": OUTPUT_RAGGED_RANK}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolWithArgmax",
+		Type: "RaggedGather",
 		Input: []tf.Input{
-			input,
+			tf.OutputList(params_nested_splits), params_dense_values, indices,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output_nested_splits, idx, err = makeOutputList(op, idx, "output_nested_splits"); err != nil {
+		scope.UpdateErr("RaggedGather", err)
+		return
+	}
+	output_dense_values = op.Output(idx)
+	return output_nested_splits, output_dense_values
 }
 
-// Creates a TensorList which, when stacked, has the value of `tensor`.
+// Greedily selects a subset of bounding boxes in descending order of score,
 //
-// Each tensor in the result list corresponds to one row of the input tensor.
+// pruning away boxes that have high intersection-over-union (IOU) overlap
+// with previously selected boxes.  Bounding boxes are supplied as
+// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
+// diagonal pair of box corners and the coordinates can be provided as normalized
+// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
+// is agnostic to where the origin is in the coordinate system.  Note that this
+// algorithm is invariant to orthogonal transformations and translations
+// of the coordinate system; thus translating or reflections of the coordinate
+// system result in the same boxes being selected by the algorithm.
 //
-// tensor: The input tensor.
-// output_handle: The list.
-func TensorListFromTensor(scope *Scope, tensor tf.Output, element_shape tf.Output) (output_handle tf.Output) {
+// The output of this operation is a set of integers indexing into the input
+// collection of bounding boxes representing the selected boxes.  The bounding
+// box coordinates corresponding to the selected indices can then be obtained
+// using the `tf.gather operation`.  For example:
+//
+//   selected_indices = tf.image.non_max_suppression_v2(
+//       boxes, scores, max_output_size, iou_threshold)
+//   selected_boxes = tf.gather(boxes, selected_indices)
+//
+// Arguments:
+//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
+//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
+// score corresponding to each box (each row of boxes).
+//	max_output_size: A scalar integer tensor representing the maximum number of
+// boxes to be selected by non max suppression.
+//	iou_threshold: A 0-D float tensor representing the threshold for deciding whether
+// boxes overlap too much with respect to IOU.
+//
+// Returns A 1-D integer tensor of shape `[M]` representing the selected
+// indices from the boxes tensor, where `M <= max_output_size`.
+func NonMaxSuppressionV2(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, iou_threshold tf.Output) (selected_indices tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorListFromTensor",
+		Type: "NonMaxSuppressionV2",
 		Input: []tf.Input{
-			tensor, element_shape,
+			boxes, scores, max_output_size, iou_threshold,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Assigns sparse updates to the variable referenced by `resource`.
-//
-// This operation computes
-//
-//     # Scalar indices
-//     ref[indices, ...] = updates[...]
-//
-//     # Vector indices (for each i)
-//     ref[indices[i], ...] = updates[i, ...]
+// Converts a `RaggedTensor` into a `SparseTensor` with the same values.
 //
-//     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] = updates[i, ..., j, ...]
+// input=ragged.from_nested_row_splits(rt_dense_values, rt_nested_splits)
+// output=SparseTensor(indices=sparse_indices, values=sparse_values,
+//                     dense_shape=sparse_dense_shape)
 //
 // Arguments:
-//	resource: Should be from a `Variable` node.
-//	indices: A tensor of indices into the first dimension of `ref`.
-//	updates: A tensor of updated values to add to `ref`.
+//	rt_nested_splits: The `row_splits` for the `RaggedTensor`.
+//	rt_dense_values: The `inner_values` for the `RaggedTensor`.
 //
-// Returns the created operation.
-func ResourceScatterUpdate(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+// Returns The indices for the `SparseTensor`.The values of the `SparseTensor`.`sparse_dense_shape` is a tight bounding box of the input `RaggedTensor`.
+func RaggedTensorToSparse(scope *Scope, rt_nested_splits []tf.Output, rt_dense_values tf.Output) (sparse_indices tf.Output, sparse_values tf.Output, sparse_dense_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterUpdate",
+		Type: "RaggedTensorToSparse",
 		Input: []tf.Input{
-			resource, indices, updates,
+			tf.OutputList(rt_nested_splits), rt_dense_values,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// ResourceSparseApplyFtrlV2Attr is an optional argument to ResourceSparseApplyFtrlV2.
-type ResourceSparseApplyFtrlV2Attr func(optionalAttr)
-
-// ResourceSparseApplyFtrlV2UseLocking sets the optional use_locking attribute to value.
+// Check if the input matches the regex pattern.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyFtrlV2UseLocking(value bool) ResourceSparseApplyFtrlV2Attr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update relevant entries in '*var' according to the Ftrl-proximal scheme.
+// The input is a string tensor of any shape. The pattern is a scalar
+// string tensor which is applied to every element of the input tensor.
+// The boolean values (True or False) of the output tensor indicate
+// if the input matches the regex pattern provided.
 //
-// That is for rows we have grad for, we update var, accum and linear as follows:
-// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
-// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
-// linear += grad_with_shrinkage +
-//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-// accum = accum_new
+// The pattern follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	linear: Should be from a Variable().
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 shrinkage regulariation. Must be a scalar.
-//
-//	lr_power: Scaling factor. Must be a scalar.
+//	input: A string tensor of the text to be processed.
+//	pattern: A scalar string tensor containing the regular expression to match the input.
 //
-// Returns the created operation.
-func ResourceSparseApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlV2Attr) (o *tf.Operation) {
+// Returns A bool tensor with the same shape as `input`.
+func RegexFullMatch(scope *Scope, input tf.Output, pattern tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyFtrlV2",
+		Type: "RegexFullMatch",
 		Input: []tf.Input{
-			var_, accum, linear, grad, indices, lr, l1, l2, l2_shrinkage, lr_power,
+			input, pattern,
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Calculates gains for each feature and returns the best possible split information for the feature.
+// Says whether the targets are in the top `K` predictions.
 //
-// The split information is the best threshold (bucket id), gains and left/right node contributions per node for each feature.
+// This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
+// prediction for the target class is among the top `k` predictions among
+// all predictions for example `i`. Note that the behavior of `InTopK` differs
+// from the `TopK` op in its handling of ties; if multiple classes have the
+// same prediction value and straddle the top-`k` boundary, all of those
+// classes are considered to be in the top `k`.
 //
-// It is possible that not all nodes can be split on each feature. Hence, the list of possible nodes can differ between the features. Therefore, we return `node_ids_list` for each feature, containing the list of nodes that this feature can be used to split.
+// More formally, let
 //
-// In this manner, the output is the best split per features and per node, so that it needs to be combined later to produce the best split for each node (among all possible features).
+//   \\(predictions_i\\) be the predictions for all classes for example `i`,
+//   \\(targets_i\\) be the target class for example `i`,
+//   \\(out_i\\) be the output for example `i`,
 //
-// The length of output lists are all of the same length, `num_features`.
-// The output shapes are compatible in a way that the first dimension of all tensors of all lists are the same and equal to the number of possible split nodes for each feature.
+// $$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
 //
 // Arguments:
-//	node_id_range: A Rank 1 tensor (shape=[2]) to specify the range [first, last) of node ids to process within `stats_summary_list`. The nodes are iterated between the two nodes specified by the tensor, as like `for node_id in range(node_id_range[0], node_id_range[1])` (Note that the last index node_id_range[1] is exclusive).
-//	stats_summary_list: A list of Rank 3 tensor (#shape=[max_splits, bucket, 2]) for accumulated stats summary (gradient/hessian) per node per buckets for each feature. The first dimension of the tensor is the maximum number of splits, and thus not all elements of it will be used, but only the indexes specified by node_ids will be used.
-//	l1: l1 regularization factor on leaf weights, per instance based.
-//	l2: l2 regularization factor on leaf weights, per instance based.
-//	tree_complexity: adjustment to the gain, per leaf based.
-//	min_node_weight: mininum avg of hessians in a node before required for the node to be considered for splitting.
-//	max_splits: the number of nodes that can be split in the whole tree. Used as a dimension of output tensors.
+//	predictions: A `batch_size` x `classes` tensor.
+//	targets: A `batch_size` vector of class ids.
+//	k: Number of top elements to look at for computing precision.
 //
-// Returns An output list of Rank 1 tensors indicating possible split node ids for each feature. The length of the list is num_features, but each tensor has different size as each feature provides different possible nodes. See above for details like shapes and sizes.An output list of Rank 1 tensors indicating the best gains for each feature to split for certain nodes. See above for details like shapes and sizes.An output list of Rank 1 tensors indicating the bucket id to compare with (as a threshold) for split in each node. See above for details like shapes and sizes.A list of Rank 2 tensors indicating the contribution of the left nodes when branching from parent nodes (given by the tensor element in the output node_ids_list) to the left direction by the given threshold for each feature. This value will be used to make the left node value by adding to the parent node value. Second dimension size is 1 for 1-dimensional logits, but would be larger for multi-class problems. See above for details like shapes and sizes.A list of Rank 2 tensors, with the same shape/conditions as left_node_contribs_list, but just that the value is for the right node.
-func BoostedTreesCalculateBestGainsPerFeature(scope *Scope, node_id_range tf.Output, stats_summary_list []tf.Output, l1 tf.Output, l2 tf.Output, tree_complexity tf.Output, min_node_weight tf.Output, max_splits int64) (node_ids_list []tf.Output, gains_list []tf.Output, thresholds_list []tf.Output, left_node_contribs_list []tf.Output, right_node_contribs_list []tf.Output) {
+// Returns Computed precision at `k` as a `bool Tensor`.
+func InTopKV2(scope *Scope, predictions tf.Output, targets tf.Output, k tf.Output) (precision tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"max_splits": max_splits}
 	opspec := tf.OpSpec{
-		Type: "BoostedTreesCalculateBestGainsPerFeature",
+		Type: "InTopKV2",
 		Input: []tf.Input{
-			node_id_range, tf.OutputList(stats_summary_list), l1, l2, tree_complexity, min_node_weight,
+			predictions, targets, k,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if node_ids_list, idx, err = makeOutputList(op, idx, "node_ids_list"); err != nil {
-		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
-		return
-	}
-	if gains_list, idx, err = makeOutputList(op, idx, "gains_list"); err != nil {
-		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
-		return
-	}
-	if thresholds_list, idx, err = makeOutputList(op, idx, "thresholds_list"); err != nil {
-		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
-		return
-	}
-	if left_node_contribs_list, idx, err = makeOutputList(op, idx, "left_node_contribs_list"); err != nil {
-		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
-		return
-	}
-	if right_node_contribs_list, idx, err = makeOutputList(op, idx, "right_node_contribs_list"); err != nil {
-		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
-		return
-	}
-	return node_ids_list, gains_list, thresholds_list, left_node_contribs_list, right_node_contribs_list
+	return op.Output(0)
 }
 
-// EncodePngAttr is an optional argument to EncodePng.
-type EncodePngAttr func(optionalAttr)
+// RandomShuffleAttr is an optional argument to RandomShuffle.
+type RandomShuffleAttr func(optionalAttr)
 
-// EncodePngCompression sets the optional compression attribute to value.
+// RandomShuffleSeed sets the optional seed attribute to value.
 //
-// value: Compression level.
-// If not specified, defaults to -1
-func EncodePngCompression(value int64) EncodePngAttr {
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomShuffleSeed(value int64) RandomShuffleAttr {
 	return func(m optionalAttr) {
-		m["compression"] = value
+		m["seed"] = value
 	}
 }
 
-// PNG-encode an image.
-//
-// `image` is a 3-D uint8 or uint16 Tensor of shape `[height, width, channels]`
-// where `channels` is:
+// RandomShuffleSeed2 sets the optional seed2 attribute to value.
 //
-// *   1: for grayscale.
-// *   2: for grayscale + alpha.
-// *   3: for RGB.
-// *   4: for RGBA.
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomShuffleSeed2(value int64) RandomShuffleAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Randomly shuffles a tensor along its first dimension.
 //
-// The ZLIB compression level, `compression`, can be -1 for the PNG-encoder
-// default or a value from 0 to 9.  9 is the highest compression level, generating
-// the smallest output, but is slower.
+//   The tensor is shuffled along dimension 0, such that each `value[j]` is mapped
+//   to one and only one `output[i]`. For example, a mapping that might occur for a
+//   3x2 tensor is:
+//
+// ```
+// [[1, 2],       [[5, 6],
+//  [3, 4],  ==>   [1, 2],
+//  [5, 6]]        [3, 4]]
+// ```
 //
 // Arguments:
-//	image: 3-D with shape `[height, width, channels]`.
+//	value: The tensor to be shuffled.
 //
-// Returns 0-D. PNG-encoded image.
-func EncodePng(scope *Scope, image tf.Output, optional ...EncodePngAttr) (contents tf.Output) {
+// Returns A tensor of same shape and type as `value`, shuffled along its first
+// dimension.
+func RandomShuffle(scope *Scope, value tf.Output, optional ...RandomShuffleAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -12212,9 +11611,9 @@ func EncodePng(scope *Scope, image tf.Output, optional ...EncodePngAttr) (conten
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "EncodePng",
+		Type: "RandomShuffle",
 		Input: []tf.Input{
-			image,
+			value,
 		},
 		Attrs: attrs,
 	}
@@ -12222,38 +11621,47 @@ func EncodePng(scope *Scope, image tf.Output, optional ...EncodePngAttr) (conten
 	return op.Output(0)
 }
 
-// DataFormatVecPermuteAttr is an optional argument to DataFormatVecPermute.
-type DataFormatVecPermuteAttr func(optionalAttr)
+// ResourceSparseApplyRMSPropAttr is an optional argument to ResourceSparseApplyRMSProp.
+type ResourceSparseApplyRMSPropAttr func(optionalAttr)
 
-// DataFormatVecPermuteSrcFormat sets the optional src_format attribute to value.
+// ResourceSparseApplyRMSPropUseLocking sets the optional use_locking attribute to value.
 //
-// value: source data format.
-// If not specified, defaults to "NHWC"
-func DataFormatVecPermuteSrcFormat(value string) DataFormatVecPermuteAttr {
+// value: If `True`, updating of the var, ms, and mom tensors is protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyRMSPropUseLocking(value bool) ResourceSparseApplyRMSPropAttr {
 	return func(m optionalAttr) {
-		m["src_format"] = value
+		m["use_locking"] = value
 	}
 }
 
-// DataFormatVecPermuteDstFormat sets the optional dst_format attribute to value.
+// Update '*var' according to the RMSProp algorithm.
 //
-// value: destination data format.
-// If not specified, defaults to "NCHW"
-func DataFormatVecPermuteDstFormat(value string) DataFormatVecPermuteAttr {
-	return func(m optionalAttr) {
-		m["dst_format"] = value
-	}
-}
-
-// Returns the permuted vector/tensor in the destination data format given the
+// Note that in dense implementation of this algorithm, ms and mom will
+// update even if the grad is zero, but in this sparse implementation, ms
+// and mom will not update in iterations during which the grad is zero.
 //
-// one in the source data format.
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
+//
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+// var <- var - mom
 //
 // Arguments:
-//	x: Vector of size 4 or Tensor of shape (4, 2) in source data format.
+//	var_: Should be from a Variable().
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
 //
-// Returns Vector of size 4 or Tensor of shape (4, 2) in destination data format.
-func DataFormatVecPermute(scope *Scope, x tf.Output, optional ...DataFormatVecPermuteAttr) (y tf.Output) {
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var, ms and mom.
+//
+// Returns the created operation.
+func ResourceSparseApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyRMSPropAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -12262,291 +11670,239 @@ func DataFormatVecPermute(scope *Scope, x tf.Output, optional ...DataFormatVecPe
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DataFormatVecPermute",
+		Type: "ResourceSparseApplyRMSProp",
 		Input: []tf.Input{
-			x,
+			var_, ms, mom, lr, rho, momentum, epsilon, grad, indices,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Initializes the multi device iterator with the given dataset.
-//
-// Arguments:
-//	dataset: Dataset to be iterated upon.
-//	multi_device_iterator: A MultiDeviceIteratorResource.
-//	max_buffer_size: The maximum size of the host side per device buffer to keep.
+// SampleDistortedBoundingBoxAttr is an optional argument to SampleDistortedBoundingBox.
+type SampleDistortedBoundingBoxAttr func(optionalAttr)
+
+// SampleDistortedBoundingBoxSeed sets the optional seed attribute to value.
 //
-// Returns An int64 indicating which incarnation of the MultiDeviceIterator
-// is running.
-func MultiDeviceIteratorInit(scope *Scope, dataset tf.Output, multi_device_iterator tf.Output, max_buffer_size tf.Output) (incarnation_id tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "MultiDeviceIteratorInit",
-		Input: []tf.Input{
-			dataset, multi_device_iterator, max_buffer_size,
-		},
+// value: If either `seed` or `seed2` are set to non-zero, the random number
+// generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
+// seed.
+// If not specified, defaults to 0
+func SampleDistortedBoundingBoxSeed(value int64) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes the gradient of `igamma(a, x)` wrt `a`.
-func IgammaGradA(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "IgammaGradA",
-		Input: []tf.Input{
-			a, x,
-		},
+// SampleDistortedBoundingBoxSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func SampleDistortedBoundingBoxSeed2(value int64) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Converts each string in the input Tensor to its hash mod by a number of buckets.
-//
-// The hash function is deterministic on the content of the string within the
-// process.
-//
-// Note that the hash function may change from time to time.
-// This functionality will be deprecated and it's recommended to use
-// `tf.string_to_hash_bucket_fast()` or `tf.string_to_hash_bucket_strong()`.
-//
-// Arguments:
+// SampleDistortedBoundingBoxMinObjectCovered sets the optional min_object_covered attribute to value.
 //
-//	num_buckets: The number of buckets.
+// value: The cropped area of the image must contain at least this
+// fraction of any bounding box supplied. The value of this parameter should be
+// non-negative. In the case of 0, the cropped area does not need to overlap
+// any of the bounding boxes supplied.
+// If not specified, defaults to 0.1
+func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["min_object_covered"] = value
+	}
+}
+
+// SampleDistortedBoundingBoxAspectRatioRange sets the optional aspect_ratio_range attribute to value.
 //
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToHashBucket(scope *Scope, string_tensor tf.Output, num_buckets int64) (output tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: The cropped area of the image must have an aspect ratio =
+// width / height within this range.
+// If not specified, defaults to <f:0.75 f:1.33 >
+func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["aspect_ratio_range"] = value
 	}
-	attrs := map[string]interface{}{"num_buckets": num_buckets}
-	opspec := tf.OpSpec{
-		Type: "StringToHashBucket",
-		Input: []tf.Input{
-			string_tensor,
-		},
-		Attrs: attrs,
+}
+
+// SampleDistortedBoundingBoxAreaRange sets the optional area_range attribute to value.
+//
+// value: The cropped area of the image must contain a fraction of the
+// supplied image within this range.
+// If not specified, defaults to <f:0.05 f:1 >
+func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["area_range"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// StaticRegexReplaceAttr is an optional argument to StaticRegexReplace.
-type StaticRegexReplaceAttr func(optionalAttr)
+// SampleDistortedBoundingBoxMaxAttempts sets the optional max_attempts attribute to value.
+//
+// value: Number of attempts at generating a cropped region of the image
+// of the specified constraints. After `max_attempts` failures, return the entire
+// image.
+// If not specified, defaults to 100
+func SampleDistortedBoundingBoxMaxAttempts(value int64) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["max_attempts"] = value
+	}
+}
 
-// StaticRegexReplaceReplaceGlobal sets the optional replace_global attribute to value.
+// SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value.
 //
-// value: If True, the replacement is global, otherwise the replacement
-// is done only on the first match.
-// If not specified, defaults to true
-func StaticRegexReplaceReplaceGlobal(value bool) StaticRegexReplaceAttr {
+// value: Controls behavior if no bounding boxes supplied.
+// If true, assume an implicit bounding box covering the whole input. If false,
+// raise an error.
+// If not specified, defaults to false
+func SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxAttr {
 	return func(m optionalAttr) {
-		m["replace_global"] = value
+		m["use_image_if_no_bounding_boxes"] = value
 	}
 }
 
-// Replaces the match of pattern in input with rewrite.
+// Generate a single randomly distorted bounding box for an image.
 //
-// It follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)
+// Bounding box annotations are often supplied in addition to ground-truth labels
+// in image recognition or object localization tasks. A common technique for
+// training such a system is to randomly distort an image while preserving
+// its content, i.e. *data augmentation*. This Op outputs a randomly distorted
+// localization of an object, i.e. bounding box, given an `image_size`,
+// `bounding_boxes` and a series of constraints.
+//
+// The output of this Op is a single bounding box that may be used to crop the
+// original image. The output is returned as 3 tensors: `begin`, `size` and
+// `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
+// image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
+// what the bounding box looks like.
+//
+// Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
+// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
+// height of the underlying image.
+//
+// For example,
+//
+// ```python
+//     # Generate a single distorted bounding box.
+//     begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
+//         tf.shape(image),
+//         bounding_boxes=bounding_boxes)
+//
+//     # Draw the bounding box in an image summary.
+//     image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
+//                                                   bbox_for_draw)
+//     tf.summary.image('images_with_box', image_with_box)
+//
+//     # Employ the bounding box to distort the image.
+//     distorted_image = tf.slice(image, begin, size)
+// ```
+//
+// Note that if no bounding box information is available, setting
+// `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
+// bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
+// false and no bounding boxes are supplied, an error is raised.
 //
 // Arguments:
-//	input: The text to be processed.
-//	pattern: The regular expression to match the input.
-//	rewrite: The rewrite to be applied to the matched expresion.
+//	image_size: 1-D, containing `[height, width, channels]`.
+//	bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
+// associated with the image.
 //
-// Returns The text after applying pattern and rewrite.
-func StaticRegexReplace(scope *Scope, input tf.Output, pattern string, rewrite string, optional ...StaticRegexReplaceAttr) (output tf.Output) {
+// Returns 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
+// `tf.slice`.1-D, containing `[target_height, target_width, -1]`. Provide as input to
+// `tf.slice`.3-D with shape `[1, 1, 4]` containing the distorted bounding box.
+// Provide as input to `tf.image.draw_bounding_boxes`.
+func SampleDistortedBoundingBox(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, optional ...SampleDistortedBoundingBoxAttr) (begin tf.Output, size tf.Output, bboxes tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"pattern": pattern, "rewrite": rewrite}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StaticRegexReplace",
+		Type: "SampleDistortedBoundingBox",
 		Input: []tf.Input{
-			input,
+			image_size, bounding_boxes,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes gradients for the exponential linear (Elu) operation.
-//
-// Arguments:
-//	gradients: The backpropagated gradients to the corresponding Elu operation.
-//	outputs: The outputs of the corresponding Elu operation.
+// Computes sigmoid of `x` element-wise.
 //
-// Returns The gradients: `gradients * (outputs + 1)` if outputs < 0,
-// `gradients` otherwise.
-func EluGrad(scope *Scope, gradients tf.Output, outputs tf.Output) (backprops tf.Output) {
+// Specifically, `y = 1 / (1 + exp(-x))`.
+func Sigmoid(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "EluGrad",
+		Type: "Sigmoid",
 		Input: []tf.Input{
-			gradients, outputs,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a dataset that contains `count` elements from the `input_dataset`.
-//
-// Arguments:
-//
-//	count: A scalar representing the number of elements from the `input_dataset`
-// that should be taken. A value of `-1` indicates that all of `input_dataset`
-// is taken.
-//
-//
-func TakeDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "TakeDataset",
-		Input: []tf.Input{
-			input_dataset, count,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
+// FusedBatchNormAttr is an optional argument to FusedBatchNorm.
+type FusedBatchNormAttr func(optionalAttr)
 
-// Reads the value of a variable.
-//
-// The tensor returned by this operation is immutable.
-//
-// The value returned by this operation is guaranteed to be influenced by all the
-// writes on which this operation depends directly or indirectly, and to not be
-// influenced by any of the writes which depend directly or indirectly on this
-// operation.
+// FusedBatchNormEpsilon sets the optional epsilon attribute to value.
 //
-// Arguments:
-//	resource: handle to the resource in which to store the variable.
-//	dtype: the dtype of the value.
-func ReadVariableOp(scope *Scope, resource tf.Output, dtype tf.DataType) (value tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	opspec := tf.OpSpec{
-		Type: "ReadVariableOp",
-		Input: []tf.Input{
-			resource,
-		},
-		Attrs: attrs,
+// value: A small float number added to the variance of x.
+// If not specified, defaults to 0.0001
+func FusedBatchNormEpsilon(value float32) FusedBatchNormAttr {
+	return func(m optionalAttr) {
+		m["epsilon"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// This op consumes a lock created by `MutexLock`.
-//
-// This op exists to consume a tensor created by `MutexLock` (other than
-// direct control dependencies).  It should be the only that consumes the tensor,
-// and will raise an error if it is not.  Its only purpose is to keep the
-// mutex lock tensor alive until it is consumed by this op.
-//
-// **NOTE**: This operation must run on the same device as its input.  This may
-// be enforced via the `colocate_with` mechanism.
-//
-// Arguments:
-//	mutex_lock: A tensor returned by `MutexLock`.
+// FusedBatchNormDataFormat sets the optional data_format attribute to value.
 //
-// Returns the created operation.
-func ConsumeMutexLock(scope *Scope, mutex_lock tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ConsumeMutexLock",
-		Input: []tf.Input{
-			mutex_lock,
-		},
+// value: The data format for x and y. Either "NHWC" (default) or "NCHW".
+// If not specified, defaults to "NHWC"
+func FusedBatchNormDataFormat(value string) FusedBatchNormAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// ResourceScatterNdAddAttr is an optional argument to ResourceScatterNdAdd.
-type ResourceScatterNdAddAttr func(optionalAttr)
-
-// ResourceScatterNdAddUseLocking sets the optional use_locking attribute to value.
+// FusedBatchNormIsTraining sets the optional is_training attribute to value.
 //
-// value: An optional bool. Defaults to True. If True, the assignment will
-// be protected by a lock; otherwise the behavior is undefined,
-// but may exhibit less contention.
+// value: A bool value to indicate the operation is for training (default)
+// or inference.
 // If not specified, defaults to true
-func ResourceScatterNdAddUseLocking(value bool) ResourceScatterNdAddAttr {
+func FusedBatchNormIsTraining(value bool) FusedBatchNormAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["is_training"] = value
 	}
 }
 
-// Adds sparse `updates` to individual values or slices within a given
-//
-// variable according to `indices`.
-//
-// `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
-//
-// `indices` must be integer tensor, containing indices into `ref`.
-// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
-//
-// The innermost dimension of `indices` (with length `K`) corresponds to
-// indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
-// dimension of `ref`.
-//
-// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
-//
-// ```
-// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
-// ```
-//
-// For example, say we want to update 4 scattered elements to a rank-1 tensor to
-// 8 elements. In Python, that update would look like this:
-//
-// ```python
-//     ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8], use_resource=True)
-//     indices = tf.constant([[4], [3], [1] ,[7]])
-//     updates = tf.constant([9, 10, 11, 12])
-//     update = tf.scatter_nd_add(ref, indices, updates)
-//     with tf.Session() as sess:
-//       print sess.run(update)
-// ```
-//
-// The resulting update to ref would look like this:
-//
-//     [1, 12, 3, 14, 14, 6, 7, 20]
+// Batch normalization.
 //
-// See `tf.scatter_nd` for more details about how to make updates to
-// slices.
+// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
+// The size of 1D Tensors matches the dimension C of the 4D Tensors.
 //
 // Arguments:
-//	ref: A resource handle. Must be from a VarHandleOp.
-//	indices: A Tensor. Must be one of the following types: int32, int64.
-// A tensor of indices into ref.
-//	updates: A Tensor. Must have the same type as ref. A tensor of
-// values to add to ref.
+//	x: A 4D Tensor for input data.
+//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
+//	offset: A 1D Tensor for offset, to shift to the normalized x.
+//	mean: A 1D Tensor for population mean. Used for inference only;
+// must be empty for training.
+//	variance: A 1D Tensor for population variance. Used for inference only;
+// must be empty for training.
 //
-// Returns the created operation.
-func ResourceScatterNdAdd(scope *Scope, ref tf.Output, indices tf.Output, updates tf.Output, optional ...ResourceScatterNdAddAttr) (o *tf.Operation) {
+// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow
+// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by
+// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused
+// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance
+// in the cuDNN case), to be reused in the gradient computation.
+func FusedBatchNorm(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormAttr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -12555,101 +11911,120 @@ func ResourceScatterNdAdd(scope *Scope, ref tf.Output, indices tf.Output, update
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterNdAdd",
+		Type: "FusedBatchNorm",
 		Input: []tf.Input{
-			ref, indices, updates,
+			x, scale, offset, mean, variance,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// Updates the tree ensemble by either adding a layer to the last tree being grown
-//
-// or by starting a new tree.
-//
-// Arguments:
-//	tree_ensemble_handle: Handle to the ensemble variable.
-//	feature_ids: Rank 1 tensor with ids for each feature. This is the real id of
-// the feature that will be used in the split.
-//	node_ids: List of rank 1 tensors representing the nodes for which this feature
-// has a split.
-//	gains: List of rank 1 tensors representing the gains for each of the feature's
-// split.
-//	thresholds: List of rank 1 tensors representing the thesholds for each of the
-// feature's split.
-//	left_node_contribs: List of rank 2 tensors with left leaf contribs for each of
-// the feature's splits. Will be added to the previous node values to constitute
-// the values of the left nodes.
-//	right_node_contribs: List of rank 2 tensors with right leaf contribs for each
-// of the feature's splits. Will be added to the previous node values to constitute
-// the values of the right nodes.
-//	max_depth: Max depth of the tree to build.
-//	learning_rate: shrinkage const for each new tree.
-//	pruning_mode: 0-No pruning, 1-Pre-pruning, 2-Post-pruning.
+// RandomStandardNormalAttr is an optional argument to RandomStandardNormal.
+type RandomStandardNormalAttr func(optionalAttr)
+
+// RandomStandardNormalSeed sets the optional seed attribute to value.
 //
-// Returns the created operation.
-func BoostedTreesUpdateEnsemble(scope *Scope, tree_ensemble_handle tf.Output, feature_ids tf.Output, node_ids []tf.Output, gains []tf.Output, thresholds []tf.Output, left_node_contribs []tf.Output, right_node_contribs []tf.Output, max_depth tf.Output, learning_rate tf.Output, pruning_mode int64) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomStandardNormalSeed(value int64) RandomStandardNormalAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
 	}
-	attrs := map[string]interface{}{"pruning_mode": pruning_mode}
-	opspec := tf.OpSpec{
-		Type: "BoostedTreesUpdateEnsemble",
-		Input: []tf.Input{
-			tree_ensemble_handle, feature_ids, tf.OutputList(node_ids), tf.OutputList(gains), tf.OutputList(thresholds), tf.OutputList(left_node_contribs), tf.OutputList(right_node_contribs), max_depth, learning_rate,
-		},
-		Attrs: attrs,
+}
+
+// RandomStandardNormalSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomStandardNormalSeed2(value int64) RandomStandardNormalAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// Computes tan of x element-wise.
-func Tan(scope *Scope, x tf.Output) (y tf.Output) {
+// Outputs random values from a normal distribution.
+//
+// The generated values will have mean 0 and standard deviation 1.
+//
+// Arguments:
+//	shape: The shape of the output tensor.
+//	dtype: The type of the output.
+//
+// Returns A tensor of the specified shape filled with random normal values.
+func RandomStandardNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomStandardNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Tan",
+		Type: "RandomStandardNormal",
 		Input: []tf.Input{
-			x,
+			shape,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Bucketizes 'input' based on 'boundaries'.
+// FusedResizeAndPadConv2DAttr is an optional argument to FusedResizeAndPadConv2D.
+type FusedResizeAndPadConv2DAttr func(optionalAttr)
+
+// FusedResizeAndPadConv2DResizeAlignCorners sets the optional resize_align_corners attribute to value.
 //
-// For example, if the inputs are
-//     boundaries = [0, 10, 100]
-//     input = [[-5, 10000]
-//              [150,   10]
-//              [5,    100]]
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
+// If not specified, defaults to false
+func FusedResizeAndPadConv2DResizeAlignCorners(value bool) FusedResizeAndPadConv2DAttr {
+	return func(m optionalAttr) {
+		m["resize_align_corners"] = value
+	}
+}
+
+// Performs a resize and padding as a preprocess during a convolution.
 //
-// then the output will be
-//     output = [[0, 3]
-//               [3, 2]
-//               [1, 3]]
+// It's often possible to do spatial transformations more efficiently as part of
+// the packing stage of a convolution, so this op allows for an optimized
+// implementation where these stages are fused together. This prevents the need to
+// write out the intermediate results as whole tensors, reducing memory pressure,
+// and we can get some latency gains by merging the transformation calculations.
+// The data_format attribute for Conv2D isn't supported by this op, and defaults to
+// 'NHWC' order.
+// Internally this op uses a single per-graph scratch buffer, which means that it
+// will block if multiple versions are being run in parallel. This is because this
+// operator is primarily an optimization to minimize memory usage.
 //
 // Arguments:
-//	input: Any shape of Tensor contains with int or float type.
-//	boundaries: A sorted list of floats gives the boundary of the buckets.
-//
-// Returns Same shape with 'input', each value of input replaced with bucket index.
+//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
+//	size: A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
+//	paddings: A two-column matrix specifying the padding sizes. The number of
+// rows must be the same as the rank of `input`.
+//	filter: 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.
 //
-// @compatibility(numpy)
-// Equivalent to np.digitize.
-// @end_compatibility
-func Bucketize(scope *Scope, input tf.Output, boundaries []float32) (output tf.Output) {
+//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
+// of `input`. Must be in the same order as the dimension specified with format.
+//	padding: The type of padding algorithm to use.
+func FusedResizeAndPadConv2D(scope *Scope, input tf.Output, size tf.Output, paddings tf.Output, filter tf.Output, mode string, strides []int64, padding string, optional ...FusedResizeAndPadConv2DAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"boundaries": boundaries}
+	attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Bucketize",
+		Type: "FusedResizeAndPadConv2D",
 		Input: []tf.Input{
-			input,
+			input, size, paddings, filter,
 		},
 		Attrs: attrs,
 	}
@@ -12657,195 +12032,337 @@ func Bucketize(scope *Scope, input tf.Output, boundaries []float32) (output tf.O
 	return op.Output(0)
 }
 
-// EncodeJpegAttr is an optional argument to EncodeJpeg.
-type EncodeJpegAttr func(optionalAttr)
+// RandomUniformAttr is an optional argument to RandomUniform.
+type RandomUniformAttr func(optionalAttr)
 
-// EncodeJpegFormat sets the optional format attribute to value.
+// RandomUniformSeed sets the optional seed attribute to value.
 //
-// value: Per pixel image format.
-// If not specified, defaults to ""
-func EncodeJpegFormat(value string) EncodeJpegAttr {
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomUniformSeed(value int64) RandomUniformAttr {
 	return func(m optionalAttr) {
-		m["format"] = value
+		m["seed"] = value
 	}
 }
 
-// EncodeJpegQuality sets the optional quality attribute to value.
+// RandomUniformSeed2 sets the optional seed2 attribute to value.
 //
-// value: Quality of the compression from 0 to 100 (higher is better and slower).
-// If not specified, defaults to 95
-func EncodeJpegQuality(value int64) EncodeJpegAttr {
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomUniformSeed2(value int64) RandomUniformAttr {
 	return func(m optionalAttr) {
-		m["quality"] = value
+		m["seed2"] = value
 	}
 }
 
-// EncodeJpegProgressive sets the optional progressive attribute to value.
+// Outputs random values from a uniform distribution.
 //
-// value: If True, create a JPEG that loads progressively (coarse to fine).
-// If not specified, defaults to false
-func EncodeJpegProgressive(value bool) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["progressive"] = value
+// The generated values follow a uniform distribution in the range `[0, 1)`. The
+// lower bound 0 is included in the range, while the upper bound 1 is excluded.
+//
+// Arguments:
+//	shape: The shape of the output tensor.
+//	dtype: The type of the output.
+//
+// Returns A tensor of the specified shape filled with uniform random values.
+func RandomUniform(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomUniformAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RandomUniform",
+		Input: []tf.Input{
+			shape,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// EncodeJpegOptimizeSize sets the optional optimize_size attribute to value.
+// ResourceApplyFtrlAttr is an optional argument to ResourceApplyFtrl.
+type ResourceApplyFtrlAttr func(optionalAttr)
+
+// ResourceApplyFtrlUseLocking sets the optional use_locking attribute to value.
 //
-// value: If True, spend CPU/RAM to reduce size with no quality change.
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
 // If not specified, defaults to false
-func EncodeJpegOptimizeSize(value bool) EncodeJpegAttr {
+func ResourceApplyFtrlUseLocking(value bool) ResourceApplyFtrlAttr {
 	return func(m optionalAttr) {
-		m["optimize_size"] = value
+		m["use_locking"] = value
 	}
 }
 
-// EncodeJpegChromaDownsampling sets the optional chroma_downsampling attribute to value.
+// Update '*var' according to the Ftrl-proximal scheme.
 //
-// value: See http://en.wikipedia.org/wiki/Chroma_subsampling.
-// If not specified, defaults to true
-func EncodeJpegChromaDownsampling(value bool) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["chroma_downsampling"] = value
-	}
-}
-
-// EncodeJpegDensityUnit sets the optional density_unit attribute to value.
+// accum_new = accum + grad * grad
+// linear += grad - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
 //
-// value: Unit used to specify `x_density` and `y_density`:
-// pixels per inch (`'in'`) or centimeter (`'cm'`).
-// If not specified, defaults to "in"
-func EncodeJpegDensityUnit(value string) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["density_unit"] = value
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	linear: Should be from a Variable().
+//	grad: The gradient.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regulariation. Must be a scalar.
+//	l2: L2 regulariation. Must be a scalar.
+//	lr_power: Scaling factor. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceApplyFtrl",
+		Input: []tf.Input{
+			var_, accum, linear, grad, lr, l1, l2, lr_power,
+		},
+		Attrs: attrs,
 	}
+	return scope.AddOperation(opspec)
 }
 
-// EncodeJpegXDensity sets the optional x_density attribute to value.
+// Locks a mutex resource.  The output is the lock.  So long as the lock tensor
 //
-// value: Horizontal pixels per density unit.
-// If not specified, defaults to 300
-func EncodeJpegXDensity(value int64) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["x_density"] = value
+// is alive, any other request to use `MutexLock` with this mutex will wait.
+//
+// This is particularly useful for creating a critical section when used in
+// conjunction with `MutexLockIdentity`:
+//
+// ```python
+//
+// mutex = mutex_v2(
+//   shared_name=handle_name, container=container, name=name)
+//
+// def execute_in_critical_section(fn, *args, **kwargs):
+//   lock = gen_resource_variable_ops.mutex_lock(mutex)
+//
+//   with ops.control_dependencies([lock]):
+//     r = fn(*args, **kwargs)
+//
+//   with ops.control_dependencies(nest.flatten(r)):
+//     with ops.colocate_with(mutex):
+//       ensure_lock_exists = mutex_lock_identity(lock)
+//
+//     # Make sure that if any element of r is accessed, all of
+//     # them are executed together.
+//     r = nest.map_structure(tf.identity, r)
+//
+//   with ops.control_dependencies([ensure_lock_exists]):
+//     return nest.map_structure(tf.identity, r)
+// ```
+//
+// While `fn` is running in the critical section, no other functions which wish to
+// use this critical section may run.
+//
+// Often the use case is that two executions of the same graph, in parallel,
+// wish to run `fn`; and we wish to ensure that only one of them executes
+// at a time.  This is especially important if `fn` modifies one or more
+// variables at a time.
+//
+// It is also useful if two separate functions must share a resource, but we
+// wish to ensure the usage is exclusive.
+//
+// Arguments:
+//	mutex: The mutex resource to lock.
+//
+// Returns A tensor that keeps a shared pointer to a lock on the mutex;
+// when the Tensor is destroyed, the use count on the shared pointer is decreased
+// by 1.  When it reaches 0, the lock is released.
+func MutexLock(scope *Scope, mutex tf.Output) (mutex_lock tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "MutexLock",
+		Input: []tf.Input{
+			mutex,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// EncodeJpegYDensity sets the optional y_density attribute to value.
+// Transforms a serialized tensorflow.TensorProto proto into a Tensor.
 //
-// value: Vertical pixels per density unit.
-// If not specified, defaults to 300
-func EncodeJpegYDensity(value int64) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["y_density"] = value
+// Arguments:
+//	serialized: A scalar string containing a serialized TensorProto proto.
+//	out_type: The type of the serialized tensor.  The provided type must match the
+// type of the serialized tensor and no implicit conversion will take place.
+//
+// Returns A Tensor of type `out_type`.
+func ParseTensor(scope *Scope, serialized tf.Output, out_type tf.DataType) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"out_type": out_type}
+	opspec := tf.OpSpec{
+		Type: "ParseTensor",
+		Input: []tf.Input{
+			serialized,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// EncodeJpegXmpMetadata sets the optional xmp_metadata attribute to value.
-//
-// value: If not empty, embed this XMP metadata in the image header.
-// If not specified, defaults to ""
-func EncodeJpegXmpMetadata(value string) EncodeJpegAttr {
+// MaxPoolWithArgmaxAttr is an optional argument to MaxPoolWithArgmax.
+type MaxPoolWithArgmaxAttr func(optionalAttr)
+
+// MaxPoolWithArgmaxTargmax sets the optional Targmax attribute to value.
+// If not specified, defaults to DT_INT64
+func MaxPoolWithArgmaxTargmax(value tf.DataType) MaxPoolWithArgmaxAttr {
 	return func(m optionalAttr) {
-		m["xmp_metadata"] = value
+		m["Targmax"] = value
 	}
 }
 
-// JPEG-encode an image.
-//
-// `image` is a 3-D uint8 Tensor of shape `[height, width, channels]`.
-//
-// The attr `format` can be used to override the color format of the encoded
-// output.  Values can be:
-//
-// *   `''`: Use a default format based on the number of channels in the image.
-// *   `grayscale`: Output a grayscale JPEG image.  The `channels` dimension
-//     of `image` must be 1.
-// *   `rgb`: Output an RGB JPEG image. The `channels` dimension
-//     of `image` must be 3.
+// Performs max pooling on the input and outputs both max values and indices.
 //
-// If `format` is not specified or is the empty string, a default format is picked
-// in function of the number of channels in `image`:
+// The indices in `argmax` are flattened, so that a maximum value at position
+// `[b, y, x, c]` becomes flattened index
+// `((b * height + y) * width + x) * channels + c`.
 //
-// *   1: Output a grayscale image.
-// *   3: Output an RGB image.
+// The indices returned are always in `[0, height) x [0, width)` before flattening,
+// even if padding is involved and the mathematically correct answer is outside
+// (either negative or too large).  This is a bug, but fixing it is difficult to do
+// in a safe backwards compatible way, especially due to flattening.
 //
 // Arguments:
-//	image: 3-D with shape `[height, width, channels]`.
+//	input: 4-D with shape `[batch, height, width, channels]`.  Input to pool over.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
 //
-// Returns 0-D. JPEG-encoded image.
-func EncodeJpeg(scope *Scope, image tf.Output, optional ...EncodeJpegAttr) (contents tf.Output) {
+// Returns The max pooled output tensor.4-D.  The flattened indices of the max values chosen for each output.
+func MaxPoolWithArgmax(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolWithArgmaxAttr) (output tf.Output, argmax tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "EncodeJpeg",
+		Type: "MaxPoolWithArgmax",
 		Input: []tf.Input{
-			image,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// MultinomialAttr is an optional argument to Multinomial.
-type MultinomialAttr func(optionalAttr)
-
-// MultinomialSeed sets the optional seed attribute to value.
+// Creates a TensorList which, when stacked, has the value of `tensor`.
 //
-// value: If either seed or seed2 is set to be non-zero, the internal random number
-// generator is seeded by the given seed.  Otherwise, a random seed is used.
-// If not specified, defaults to 0
-func MultinomialSeed(value int64) MultinomialAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
+// Each tensor in the result list corresponds to one row of the input tensor.
+//
+// tensor: The input tensor.
+// output_handle: The list.
+func TensorListFromTensor(scope *Scope, tensor tf.Output, element_shape tf.Output) (output_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorListFromTensor",
+		Input: []tf.Input{
+			tensor, element_shape,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MultinomialSeed2 sets the optional seed2 attribute to value.
+// Assigns sparse updates to the variable referenced by `resource`.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func MultinomialSeed2(value int64) MultinomialAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
+// This operation computes
+//
+//     # Scalar indices
+//     ref[indices, ...] = updates[...]
+//
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] = updates[i, ...]
+//
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] = updates[i, ..., j, ...]
+//
+// Arguments:
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
+//
+// Returns the created operation.
+func ResourceScatterUpdate(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceScatterUpdate",
+		Input: []tf.Input{
+			resource, indices, updates,
+		},
 	}
+	return scope.AddOperation(opspec)
 }
 
-// MultinomialOutputDtype sets the optional output_dtype attribute to value.
-// If not specified, defaults to DT_INT64
-func MultinomialOutputDtype(value tf.DataType) MultinomialAttr {
+// MaxPoolAttr is an optional argument to MaxPool.
+type MaxPoolAttr func(optionalAttr)
+
+// MaxPoolDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolDataFormat(value string) MaxPoolAttr {
 	return func(m optionalAttr) {
-		m["output_dtype"] = value
+		m["data_format"] = value
 	}
 }
 
-// Draws samples from a multinomial distribution.
+// Performs max pooling on the input.
 //
 // Arguments:
-//	logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
-// represents the unnormalized log probabilities for all classes.
-//	num_samples: 0-D.  Number of independent samples to draw for each row slice.
+//	input: 4-D input to pool over.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
 //
-// Returns 2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
-// contains the drawn class labels with range `[0, num_classes)`.
-func Multinomial(scope *Scope, logits tf.Output, num_samples tf.Output, optional ...MultinomialAttr) (output tf.Output) {
+// Returns The max pooled output tensor.
+func MaxPool(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Multinomial",
+		Type: "MaxPool",
 		Input: []tf.Input{
-			logits, num_samples,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -12853,62 +12370,139 @@ func Multinomial(scope *Scope, logits tf.Output, num_samples tf.Output, optional
 	return op.Output(0)
 }
 
-// ResourceSparseApplyAdagradDAAttr is an optional argument to ResourceSparseApplyAdagradDA.
-type ResourceSparseApplyAdagradDAAttr func(optionalAttr)
+// Multiplies sparse updates into the variable referenced by `resource`.
+//
+// This operation computes
+//
+//     # Scalar indices
+//     ref[indices, ...] *= updates[...]
+//
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] *= updates[i, ...]
+//
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] *= updates[i, ..., j, ...]
+//
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions multiply.
+//
+// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// </div>
+//
+// Arguments:
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
+//
+// Returns the created operation.
+func ResourceScatterMul(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceScatterMul",
+		Input: []tf.Input{
+			resource, indices, updates,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
 
-// ResourceSparseApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
+// Subtracts sparse updates from the variable referenced by `resource`.
 //
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceSparseApplyAdagradDAUseLocking(value bool) ResourceSparseApplyAdagradDAAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
+// This operation computes
+//
+//     # Scalar indices
+//     ref[indices, ...] -= updates[...]
+//
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] -= updates[i, ...]
+//
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] -= updates[i, ..., j, ...]
+//
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions add.
+//
+// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// </div>
+//
+// Arguments:
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
+//
+// Returns the created operation.
+func ResourceScatterSub(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceScatterSub",
+		Input: []tf.Input{
+			resource, indices, updates,
+		},
 	}
+	return scope.AddOperation(opspec)
 }
 
-// Update entries in '*var' and '*accum' according to the proximal adagrad scheme.
+// Adds sparse updates to the variable referenced by `resource`.
+//
+// This operation computes
+//
+//     # Scalar indices
+//     ref[indices, ...] += updates[...]
+//
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] += updates[i, ...]
+//
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] += updates[i, ..., j, ...]
+//
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions add.
+//
+// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// </div>
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	gradient_accumulator: Should be from a Variable().
-//	gradient_squared_accumulator: Should be from a Variable().
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//	lr: Learning rate. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	global_step: Training step number. Must be a scalar.
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
 //
 // Returns the created operation.
-func ResourceSparseApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceSparseApplyAdagradDAAttr) (o *tf.Operation) {
+func ResourceScatterAdd(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyAdagradDA",
+		Type: "ResourceScatterAdd",
 		Input: []tf.Input{
-			var_, gradient_accumulator, gradient_squared_accumulator, grad, indices, lr, l1, l2, global_step,
+			resource, indices, updates,
 		},
-		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// ResourceSparseApplyFtrlAttr is an optional argument to ResourceSparseApplyFtrl.
-type ResourceSparseApplyFtrlAttr func(optionalAttr)
+// ResourceSparseApplyFtrlV2Attr is an optional argument to ResourceSparseApplyFtrlV2.
+type ResourceSparseApplyFtrlV2Attr func(optionalAttr)
 
-// ResourceSparseApplyFtrlUseLocking sets the optional use_locking attribute to value.
+// ResourceSparseApplyFtrlV2UseLocking sets the optional use_locking attribute to value.
 //
 // value: If `True`, updating of the var and accum tensors will be protected
 // by a lock; otherwise the behavior is undefined, but may exhibit less
 // contention.
 // If not specified, defaults to false
-func ResourceSparseApplyFtrlUseLocking(value bool) ResourceSparseApplyFtrlAttr {
+func ResourceSparseApplyFtrlV2UseLocking(value bool) ResourceSparseApplyFtrlV2Attr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
@@ -12917,8 +12511,10 @@ func ResourceSparseApplyFtrlUseLocking(value bool) ResourceSparseApplyFtrlAttr {
 // Update relevant entries in '*var' according to the Ftrl-proximal scheme.
 //
 // That is for rows we have grad for, we update var, accum and linear as follows:
-// accum_new = accum + grad * grad
-// linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
+// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
+// linear += grad_with_shrinkage +
+//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
 // quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
 // var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
 // accum = accum_new
@@ -12931,11 +12527,12 @@ func ResourceSparseApplyFtrlUseLocking(value bool) ResourceSparseApplyFtrlAttr {
 //	indices: A vector of indices into the first dimension of var and accum.
 //	lr: Scaling factor. Must be a scalar.
 //	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
+//	l2: L2 shrinkage regulariation. Must be a scalar.
+//
 //	lr_power: Scaling factor. Must be a scalar.
 //
 // Returns the created operation.
-func ResourceSparseApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlAttr) (o *tf.Operation) {
+func ResourceSparseApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -12944,83 +12541,120 @@ func ResourceSparseApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, line
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyFtrl",
+		Type: "ResourceSparseApplyFtrlV2",
 		Input: []tf.Input{
-			var_, accum, linear, grad, indices, lr, l1, l2, lr_power,
+			var_, accum, linear, grad, indices, lr, l1, l2, l2_shrinkage, lr_power,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Returns which elements of x are Inf.
+// Calculates gains for each feature and returns the best possible split information for the feature.
 //
-// @compatibility(numpy)
-// Equivalent to np.isinf
-// @end_compatibility
-func IsInf(scope *Scope, x tf.Output) (y tf.Output) {
+// The split information is the best threshold (bucket id), gains and left/right node contributions per node for each feature.
+//
+// It is possible that not all nodes can be split on each feature. Hence, the list of possible nodes can differ between the features. Therefore, we return `node_ids_list` for each feature, containing the list of nodes that this feature can be used to split.
+//
+// In this manner, the output is the best split per features and per node, so that it needs to be combined later to produce the best split for each node (among all possible features).
+//
+// The length of output lists are all of the same length, `num_features`.
+// The output shapes are compatible in a way that the first dimension of all tensors of all lists are the same and equal to the number of possible split nodes for each feature.
+//
+// Arguments:
+//	node_id_range: A Rank 1 tensor (shape=[2]) to specify the range [first, last) of node ids to process within `stats_summary_list`. The nodes are iterated between the two nodes specified by the tensor, as like `for node_id in range(node_id_range[0], node_id_range[1])` (Note that the last index node_id_range[1] is exclusive).
+//	stats_summary_list: A list of Rank 3 tensor (#shape=[max_splits, bucket, 2]) for accumulated stats summary (gradient/hessian) per node per buckets for each feature. The first dimension of the tensor is the maximum number of splits, and thus not all elements of it will be used, but only the indexes specified by node_ids will be used.
+//	l1: l1 regularization factor on leaf weights, per instance based.
+//	l2: l2 regularization factor on leaf weights, per instance based.
+//	tree_complexity: adjustment to the gain, per leaf based.
+//	min_node_weight: mininum avg of hessians in a node before required for the node to be considered for splitting.
+//	max_splits: the number of nodes that can be split in the whole tree. Used as a dimension of output tensors.
+//
+// Returns An output list of Rank 1 tensors indicating possible split node ids for each feature. The length of the list is num_features, but each tensor has different size as each feature provides different possible nodes. See above for details like shapes and sizes.An output list of Rank 1 tensors indicating the best gains for each feature to split for certain nodes. See above for details like shapes and sizes.An output list of Rank 1 tensors indicating the bucket id to compare with (as a threshold) for split in each node. See above for details like shapes and sizes.A list of Rank 2 tensors indicating the contribution of the left nodes when branching from parent nodes (given by the tensor element in the output node_ids_list) to the left direction by the given threshold for each feature. This value will be used to make the left node value by adding to the parent node value. Second dimension size is 1 for 1-dimensional logits, but would be larger for multi-class problems. See above for details like shapes and sizes.A list of Rank 2 tensors, with the same shape/conditions as left_node_contribs_list, but just that the value is for the right node.
+func BoostedTreesCalculateBestGainsPerFeature(scope *Scope, node_id_range tf.Output, stats_summary_list []tf.Output, l1 tf.Output, l2 tf.Output, tree_complexity tf.Output, min_node_weight tf.Output, max_splits int64) (node_ids_list []tf.Output, gains_list []tf.Output, thresholds_list []tf.Output, left_node_contribs_list []tf.Output, right_node_contribs_list []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"max_splits": max_splits}
 	opspec := tf.OpSpec{
-		Type: "IsInf",
+		Type: "BoostedTreesCalculateBestGainsPerFeature",
 		Input: []tf.Input{
-			x,
+			node_id_range, tf.OutputList(stats_summary_list), l1, l2, tree_complexity, min_node_weight,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if node_ids_list, idx, err = makeOutputList(op, idx, "node_ids_list"); err != nil {
+		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
+		return
+	}
+	if gains_list, idx, err = makeOutputList(op, idx, "gains_list"); err != nil {
+		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
+		return
+	}
+	if thresholds_list, idx, err = makeOutputList(op, idx, "thresholds_list"); err != nil {
+		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
+		return
+	}
+	if left_node_contribs_list, idx, err = makeOutputList(op, idx, "left_node_contribs_list"); err != nil {
+		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
+		return
+	}
+	if right_node_contribs_list, idx, err = makeOutputList(op, idx, "right_node_contribs_list"); err != nil {
+		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
+		return
+	}
+	return node_ids_list, gains_list, thresholds_list, left_node_contribs_list, right_node_contribs_list
 }
 
-// TruncatedNormalAttr is an optional argument to TruncatedNormal.
-type TruncatedNormalAttr func(optionalAttr)
+// EncodePngAttr is an optional argument to EncodePng.
+type EncodePngAttr func(optionalAttr)
 
-// TruncatedNormalSeed sets the optional seed attribute to value.
+// EncodePngCompression sets the optional compression attribute to value.
 //
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func TruncatedNormalSeed(value int64) TruncatedNormalAttr {
+// value: Compression level.
+// If not specified, defaults to -1
+func EncodePngCompression(value int64) EncodePngAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["compression"] = value
 	}
 }
 
-// TruncatedNormalSeed2 sets the optional seed2 attribute to value.
+// PNG-encode an image.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func TruncatedNormalSeed2(value int64) TruncatedNormalAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Outputs random values from a truncated normal distribution.
+// `image` is a 3-D uint8 or uint16 Tensor of shape `[height, width, channels]`
+// where `channels` is:
 //
-// The generated values follow a normal distribution with mean 0 and standard
-// deviation 1, except that values whose magnitude is more than 2 standard
-// deviations from the mean are dropped and re-picked.
+// *   1: for grayscale.
+// *   2: for grayscale + alpha.
+// *   3: for RGB.
+// *   4: for RGBA.
+//
+// The ZLIB compression level, `compression`, can be -1 for the PNG-encoder
+// default or a value from 0 to 9.  9 is the highest compression level, generating
+// the smallest output, but is slower.
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	dtype: The type of the output.
+//	image: 3-D with shape `[height, width, channels]`.
 //
-// Returns A tensor of the specified shape filled with random truncated normal
-// values.
-func TruncatedNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...TruncatedNormalAttr) (output tf.Output) {
+// Returns 0-D. PNG-encoded image.
+func EncodePng(scope *Scope, image tf.Output, optional ...EncodePngAttr) (contents tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TruncatedNormal",
+		Type: "EncodePng",
 		Input: []tf.Input{
-			shape,
+			image,
 		},
 		Attrs: attrs,
 	}
@@ -13028,96 +12662,115 @@ func TruncatedNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional
 	return op.Output(0)
 }
 
-// SkipgramAttr is an optional argument to Skipgram.
-type SkipgramAttr func(optionalAttr)
+// DataFormatVecPermuteAttr is an optional argument to DataFormatVecPermute.
+type DataFormatVecPermuteAttr func(optionalAttr)
 
-// SkipgramWindowSize sets the optional window_size attribute to value.
+// DataFormatVecPermuteSrcFormat sets the optional src_format attribute to value.
 //
-// value: The number of words to predict to the left and right of the target.
-// If not specified, defaults to 5
-func SkipgramWindowSize(value int64) SkipgramAttr {
+// value: source data format.
+// If not specified, defaults to "NHWC"
+func DataFormatVecPermuteSrcFormat(value string) DataFormatVecPermuteAttr {
 	return func(m optionalAttr) {
-		m["window_size"] = value
+		m["src_format"] = value
 	}
 }
 
-// SkipgramMinCount sets the optional min_count attribute to value.
+// DataFormatVecPermuteDstFormat sets the optional dst_format attribute to value.
 //
-// value: The minimum number of word occurrences for it to be included in the
-// vocabulary.
-// If not specified, defaults to 5
-func SkipgramMinCount(value int64) SkipgramAttr {
+// value: destination data format.
+// If not specified, defaults to "NCHW"
+func DataFormatVecPermuteDstFormat(value string) DataFormatVecPermuteAttr {
 	return func(m optionalAttr) {
-		m["min_count"] = value
+		m["dst_format"] = value
 	}
 }
 
-// SkipgramSubsample sets the optional subsample attribute to value.
+// Returns the permuted vector/tensor in the destination data format given the
 //
-// value: Threshold for word occurrence. Words that appear with higher
-// frequency will be randomly down-sampled. Set to 0 to disable.
-// If not specified, defaults to 0.001
-func SkipgramSubsample(value float32) SkipgramAttr {
-	return func(m optionalAttr) {
-		m["subsample"] = value
+// one in the source data format.
+//
+// Arguments:
+//	x: Vector of size 4 or Tensor of shape (4, 2) in source data format.
+//
+// Returns Vector of size 4 or Tensor of shape (4, 2) in destination data format.
+func DataFormatVecPermute(scope *Scope, x tf.Output, optional ...DataFormatVecPermuteAttr) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
 	}
+	opspec := tf.OpSpec{
+		Type: "DataFormatVecPermute",
+		Input: []tf.Input{
+			x,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Parses a text file and creates a batch of examples.
-//
-// DEPRECATED at GraphDef version 19: Moving word2vec into tensorflow_models/tutorials and deprecating its ops here as a result
+// Initializes the multi device iterator with the given dataset.
 //
 // Arguments:
-//	filename: The corpus's text file name.
-//	batch_size: The size of produced batch.
+//	dataset: Dataset to be iterated upon.
+//	multi_device_iterator: A MultiDeviceIteratorResource.
+//	max_buffer_size: The maximum size of the host side per device buffer to keep.
 //
-// Returns A vector of words in the corpus.Frequencies of words. Sorted in the non-ascending order.Number of words per epoch in the data file.The current epoch number.The total number of words processed so far.A vector of word ids.A vector of word ids.
-func Skipgram(scope *Scope, filename string, batch_size int64, optional ...SkipgramAttr) (vocab_word tf.Output, vocab_freq tf.Output, words_per_epoch tf.Output, current_epoch tf.Output, total_words_processed tf.Output, examples tf.Output, labels tf.Output) {
+// Returns An int64 indicating which incarnation of the MultiDeviceIterator
+// is running.
+func MultiDeviceIteratorInit(scope *Scope, dataset tf.Output, multi_device_iterator tf.Output, max_buffer_size tf.Output) (incarnation_id tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"filename": filename, "batch_size": batch_size}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Skipgram",
-
-		Attrs: attrs,
+		Type: "MultiDeviceIteratorInit",
+		Input: []tf.Input{
+			dataset, multi_device_iterator, max_buffer_size,
+		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4), op.Output(5), op.Output(6)
+	return op.Output(0)
 }
 
-// StringToNumberAttr is an optional argument to StringToNumber.
-type StringToNumberAttr func(optionalAttr)
-
-// StringToNumberOutType sets the optional out_type attribute to value.
-//
-// value: The numeric type to interpret each string in `string_tensor` as.
-// If not specified, defaults to DT_FLOAT
-func StringToNumberOutType(value tf.DataType) StringToNumberAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
+// Computes the gradient of `igamma(a, x)` wrt `a`.
+func IgammaGradA(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IgammaGradA",
+		Input: []tf.Input{
+			a, x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Converts each string in the input Tensor to the specified numeric type.
+// Converts each string in the input Tensor to its hash mod by a number of buckets.
 //
-// (Note that int32 overflow results in an error while float overflow
-// results in a rounded value.)
+// The hash function is deterministic on the content of the string within the
+// process.
+//
+// Note that the hash function may change from time to time.
+// This functionality will be deprecated and it's recommended to use
+// `tf.string_to_hash_bucket_fast()` or `tf.string_to_hash_bucket_strong()`.
+//
+// Arguments:
+//
+//	num_buckets: The number of buckets.
 //
 // Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToNumber(scope *Scope, string_tensor tf.Output, optional ...StringToNumberAttr) (output tf.Output) {
+func StringToHashBucket(scope *Scope, string_tensor tf.Output, num_buckets int64) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"num_buckets": num_buckets}
 	opspec := tf.OpSpec{
-		Type: "StringToNumber",
+		Type: "StringToHashBucket",
 		Input: []tf.Input{
 			string_tensor,
 		},
@@ -13127,162 +12780,213 @@ func StringToNumber(scope *Scope, string_tensor tf.Output, optional ...StringToN
 	return op.Output(0)
 }
 
-// ResourceApplyFtrlV2Attr is an optional argument to ResourceApplyFtrlV2.
-type ResourceApplyFtrlV2Attr func(optionalAttr)
+// StaticRegexReplaceAttr is an optional argument to StaticRegexReplace.
+type StaticRegexReplaceAttr func(optionalAttr)
 
-// ResourceApplyFtrlV2UseLocking sets the optional use_locking attribute to value.
+// StaticRegexReplaceReplaceGlobal sets the optional replace_global attribute to value.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyFtrlV2UseLocking(value bool) ResourceApplyFtrlV2Attr {
+// value: If True, the replacement is global, otherwise the replacement
+// is done only on the first match.
+// If not specified, defaults to true
+func StaticRegexReplaceReplaceGlobal(value bool) StaticRegexReplaceAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["replace_global"] = value
 	}
 }
 
-// Update '*var' according to the Ftrl-proximal scheme.
+// Replaces the match of pattern in input with rewrite.
 //
-// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
-// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
-// linear += grad_with_shrinkage +
-//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-// accum = accum_new
+// It follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	linear: Should be from a Variable().
-//	grad: The gradient.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regulariation. Must be a scalar.
-//	l2: L2 shrinkage regulariation. Must be a scalar.
-//
-//	lr_power: Scaling factor. Must be a scalar.
+//	input: The text to be processed.
+//	pattern: The regular expression to match the input.
+//	rewrite: The rewrite to be applied to the matched expresion.
 //
-// Returns the created operation.
-func ResourceApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlV2Attr) (o *tf.Operation) {
+// Returns The text after applying pattern and rewrite.
+func StaticRegexReplace(scope *Scope, input tf.Output, pattern string, rewrite string, optional ...StaticRegexReplaceAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"pattern": pattern, "rewrite": rewrite}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyFtrlV2",
+		Type: "StaticRegexReplace",
 		Input: []tf.Input{
-			var_, accum, linear, grad, lr, l1, l2, l2_shrinkage, lr_power,
+			input,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Adds up a `SparseTensor` and a dense `Tensor`, producing a dense `Tensor`.
-//
-// This Op does not require `a_indices` be sorted in standard lexicographic order.
+// Computes gradients for the exponential linear (Elu) operation.
 //
 // Arguments:
-//	a_indices: 2-D.  The `indices` of the `SparseTensor`, with shape `[nnz, ndims]`.
-//	a_values: 1-D.  The `values` of the `SparseTensor`, with shape `[nnz]`.
-//	a_shape: 1-D.  The `shape` of the `SparseTensor`, with shape `[ndims]`.
-//	b: `ndims`-D Tensor.  With shape `a_shape`.
-func SparseTensorDenseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output) (output tf.Output) {
+//	gradients: The backpropagated gradients to the corresponding Elu operation.
+//	outputs: The outputs of the corresponding Elu operation.
+//
+// Returns The gradients: `gradients * (outputs + 1)` if outputs < 0,
+// `gradients` otherwise.
+func EluGrad(scope *Scope, gradients tf.Output, outputs tf.Output) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseTensorDenseAdd",
+		Type: "EluGrad",
 		Input: []tf.Input{
-			a_indices, a_values, a_shape, b,
+			gradients, outputs,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Encode audio data using the WAV file format.
+// Creates a dataset that contains `count` elements from the `input_dataset`.
 //
-// This operation will generate a string suitable to be saved out to create a .wav
-// audio file. It will be encoded in the 16-bit PCM format. It takes in float
-// values in the range -1.0f to 1.0f, and any outside that value will be clamped to
-// that range.
+// Arguments:
 //
-// `audio` is a 2-D float Tensor of shape `[length, channels]`.
-// `sample_rate` is a scalar Tensor holding the rate to use (e.g. 44100).
+//	count: A scalar representing the number of elements from the `input_dataset`
+// that should be taken. A value of `-1` indicates that all of `input_dataset`
+// is taken.
 //
-// Arguments:
-//	audio: 2-D with shape `[length, channels]`.
-//	sample_rate: Scalar containing the sample frequency.
 //
-// Returns 0-D. WAV-encoded file contents.
-func EncodeWav(scope *Scope, audio tf.Output, sample_rate tf.Output) (contents tf.Output) {
+func TakeDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "EncodeWav",
+		Type: "TakeDataset",
 		Input: []tf.Input{
-			audio, sample_rate,
+			input_dataset, count,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes atan of x element-wise.
-func Atan(scope *Scope, x tf.Output) (y tf.Output) {
+// Reads the value of a variable.
+//
+// The tensor returned by this operation is immutable.
+//
+// The value returned by this operation is guaranteed to be influenced by all the
+// writes on which this operation depends directly or indirectly, and to not be
+// influenced by any of the writes which depend directly or indirectly on this
+// operation.
+//
+// Arguments:
+//	resource: handle to the resource in which to store the variable.
+//	dtype: the dtype of the value.
+func ReadVariableOp(scope *Scope, resource tf.Output, dtype tf.DataType) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "Atan",
+		Type: "ReadVariableOp",
 		Input: []tf.Input{
-			x,
+			resource,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceApplyAdaMaxAttr is an optional argument to ResourceApplyAdaMax.
-type ResourceApplyAdaMaxAttr func(optionalAttr)
+// This op consumes a lock created by `MutexLock`.
+//
+// This op exists to consume a tensor created by `MutexLock` (other than
+// direct control dependencies).  It should be the only that consumes the tensor,
+// and will raise an error if it is not.  Its only purpose is to keep the
+// mutex lock tensor alive until it is consumed by this op.
+//
+// **NOTE**: This operation must run on the same device as its input.  This may
+// be enforced via the `colocate_with` mechanism.
+//
+// Arguments:
+//	mutex_lock: A tensor returned by `MutexLock`.
+//
+// Returns the created operation.
+func ConsumeMutexLock(scope *Scope, mutex_lock tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ConsumeMutexLock",
+		Input: []tf.Input{
+			mutex_lock,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
 
-// ResourceApplyAdaMaxUseLocking sets the optional use_locking attribute to value.
+// ResourceScatterNdAddAttr is an optional argument to ResourceScatterNdAdd.
+type ResourceScatterNdAddAttr func(optionalAttr)
+
+// ResourceScatterNdAddUseLocking sets the optional use_locking attribute to value.
 //
-// value: If `True`, updating of the var, m, and v tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyAdaMaxUseLocking(value bool) ResourceApplyAdaMaxAttr {
+// value: An optional bool. Defaults to True. If True, the assignment will
+// be protected by a lock; otherwise the behavior is undefined,
+// but may exhibit less contention.
+// If not specified, defaults to true
+func ResourceScatterNdAddUseLocking(value bool) ResourceScatterNdAddAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Update '*var' according to the AdaMax algorithm.
+// Adds sparse `updates` to individual values or slices within a given
+//
+// variable according to `indices`.
+//
+// `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+//
+// `indices` must be integer tensor, containing indices into `ref`.
+// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+//
+// The innermost dimension of `indices` (with length `K`) corresponds to
+// indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
+// dimension of `ref`.
+//
+// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
+//
+// ```
+// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+// ```
+//
+// For example, say we want to update 4 scattered elements to a rank-1 tensor to
+// 8 elements. In Python, that update would look like this:
+//
+// ```python
+//     ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8], use_resource=True)
+//     indices = tf.constant([[4], [3], [1] ,[7]])
+//     updates = tf.constant([9, 10, 11, 12])
+//     update = tf.scatter_nd_add(ref, indices, updates)
+//     with tf.Session() as sess:
+//       print sess.run(update)
+// ```
+//
+// The resulting update to ref would look like this:
 //
-// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
-// v_t <- max(beta2 * v_{t-1}, abs(g))
-// variable <- variable - learning_rate / (1 - beta1^t) * m_t / (v_t + epsilon)
+//     [1, 12, 3, 14, 14, 6, 7, 20]
+//
+// See `tf.scatter_nd` for more details about how to make updates to
+// slices.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	m: Should be from a Variable().
-//	v: Should be from a Variable().
-//	beta1_power: Must be a scalar.
-//	lr: Scaling factor. Must be a scalar.
-//	beta1: Momentum factor. Must be a scalar.
-//	beta2: Momentum factor. Must be a scalar.
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
+//	ref: A resource handle. Must be from a VarHandleOp.
+//	indices: A Tensor. Must be one of the following types: int32, int64.
+// A tensor of indices into ref.
+//	updates: A Tensor. Must have the same type as ref. A tensor of
+// values to add to ref.
 //
 // Returns the created operation.
-func ResourceApplyAdaMax(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, beta1_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdaMaxAttr) (o *tf.Operation) {
+func ResourceScatterNdAdd(scope *Scope, ref tf.Output, indices tf.Output, updates tf.Output, optional ...ResourceScatterNdAddAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -13291,231 +12995,106 @@ func ResourceApplyAdaMax(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdaMax",
+		Type: "ResourceScatterNdAdd",
 		Input: []tf.Input{
-			var_, m, v, beta1_power, lr, beta1, beta2, epsilon, grad,
+			ref, indices, updates,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// AssertAttr is an optional argument to Assert.
-type AssertAttr func(optionalAttr)
-
-// AssertSummarize sets the optional summarize attribute to value.
-//
-// value: Print this many entries of each tensor.
-// If not specified, defaults to 3
-func AssertSummarize(value int64) AssertAttr {
-	return func(m optionalAttr) {
-		m["summarize"] = value
-	}
-}
-
-// Asserts that the given condition is true.
+// Updates the tree ensemble by either adding a layer to the last tree being grown
 //
-// If `condition` evaluates to false, print the list of tensors in `data`.
-// `summarize` determines how many entries of the tensors to print.
+// or by starting a new tree.
 //
 // Arguments:
-//	condition: The condition to evaluate.
-//	data: The tensors to print out when condition is false.
+//	tree_ensemble_handle: Handle to the ensemble variable.
+//	feature_ids: Rank 1 tensor with ids for each feature. This is the real id of
+// the feature that will be used in the split.
+//	node_ids: List of rank 1 tensors representing the nodes for which this feature
+// has a split.
+//	gains: List of rank 1 tensors representing the gains for each of the feature's
+// split.
+//	thresholds: List of rank 1 tensors representing the thesholds for each of the
+// feature's split.
+//	left_node_contribs: List of rank 2 tensors with left leaf contribs for each of
+// the feature's splits. Will be added to the previous node values to constitute
+// the values of the left nodes.
+//	right_node_contribs: List of rank 2 tensors with right leaf contribs for each
+// of the feature's splits. Will be added to the previous node values to constitute
+// the values of the right nodes.
+//	max_depth: Max depth of the tree to build.
+//	learning_rate: shrinkage const for each new tree.
+//	pruning_mode: 0-No pruning, 1-Pre-pruning, 2-Post-pruning.
 //
 // Returns the created operation.
-func Assert(scope *Scope, condition tf.Output, data []tf.Output, optional ...AssertAttr) (o *tf.Operation) {
+func BoostedTreesUpdateEnsemble(scope *Scope, tree_ensemble_handle tf.Output, feature_ids tf.Output, node_ids []tf.Output, gains []tf.Output, thresholds []tf.Output, left_node_contribs []tf.Output, right_node_contribs []tf.Output, max_depth tf.Output, learning_rate tf.Output, pruning_mode int64) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"pruning_mode": pruning_mode}
 	opspec := tf.OpSpec{
-		Type: "Assert",
+		Type: "BoostedTreesUpdateEnsemble",
 		Input: []tf.Input{
-			condition, tf.OutputList(data),
+			tree_ensemble_handle, feature_ids, tf.OutputList(node_ids), tf.OutputList(gains), tf.OutputList(thresholds), tf.OutputList(left_node_contribs), tf.OutputList(right_node_contribs), max_depth, learning_rate,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// CudnnRNNBackpropAttr is an optional argument to CudnnRNNBackprop.
-type CudnnRNNBackpropAttr func(optionalAttr)
-
-// CudnnRNNBackpropRnnMode sets the optional rnn_mode attribute to value.
-// If not specified, defaults to "lstm"
-func CudnnRNNBackpropRnnMode(value string) CudnnRNNBackpropAttr {
-	return func(m optionalAttr) {
-		m["rnn_mode"] = value
-	}
-}
-
-// CudnnRNNBackpropInputMode sets the optional input_mode attribute to value.
-// If not specified, defaults to "linear_input"
-func CudnnRNNBackpropInputMode(value string) CudnnRNNBackpropAttr {
-	return func(m optionalAttr) {
-		m["input_mode"] = value
-	}
-}
-
-// CudnnRNNBackpropDirection sets the optional direction attribute to value.
-// If not specified, defaults to "unidirectional"
-func CudnnRNNBackpropDirection(value string) CudnnRNNBackpropAttr {
-	return func(m optionalAttr) {
-		m["direction"] = value
-	}
-}
-
-// CudnnRNNBackpropDropout sets the optional dropout attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNBackpropDropout(value float32) CudnnRNNBackpropAttr {
-	return func(m optionalAttr) {
-		m["dropout"] = value
-	}
-}
-
-// CudnnRNNBackpropSeed sets the optional seed attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNBackpropSeed(value int64) CudnnRNNBackpropAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// CudnnRNNBackpropSeed2 sets the optional seed2 attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNBackpropSeed2(value int64) CudnnRNNBackpropAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Backprop step of CudnnRNN.
-//
-// Compute the backprop of both data and weights in a RNN.
-//
-// rnn_mode: Indicates the type of the RNN model.
-// input_mode: Indicate whether there is a linear projection between the input and
-//     the actual computation before the first layer. 'skip_input' is only allowed
-//     when input_size == num_units; 'auto_select' implies 'skip_input' when
-//     input_size == num_units; otherwise, it implies 'linear_input'.
-// direction: Indicates whether a bidirectional model will be used. Should be
-//   "unidirectional" or "bidirectional".
-// dropout: Dropout probability. When set to 0., dropout is disabled.
-// seed: The 1st part of a seed to initialize dropout.
-// seed2: The 2nd part of a seed to initialize dropout.
-// input: A 3-D tensor with the shape of [seq_length, batch_size, input_size].
-// input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size,
-//     num_units].
-// input_c: For LSTM, a 3-D tensor with the shape of
-//     [num_layer * dir, batch, num_units]. For other models, it is ignored.
-// params: A 1-D tensor that contains the weights and biases in an opaque layout.
-//     The size must be created through CudnnRNNParamsSize, and initialized
-//     separately. Note that they might not be compatible across different
-//     generations. So it is a good idea to save and restore
-// output: A 3-D tensor with the shape of [seq_length, batch_size,
-//     dir * num_units].
-// output_h: The same shape has input_h.
-// output_c: The same shape as input_c for LSTM. An empty tensor for other models.
-// output_backprop: A 3-D tensor with the same shape as output in the forward pass.
-// output_h_backprop: A 3-D tensor with the same shape as output_h in the forward
-//     pass.
-// output_c_backprop: A 3-D tensor with the same shape as output_c in the forward
-//     pass.
-// reserve_space: The same reserve_space produced in for forward operation.
-// input_backprop: The backprop to input in the forward pass. Has the same shape
-//     as input.
-// input_h_backprop: The backprop to input_h in the forward pass. Has the same
-//     shape as input_h.
-// input_c_backprop: The backprop to input_c in the forward pass. Has the same
-//     shape as input_c.
-// params_backprop: The backprop to the params buffer in the forward pass. Has the
-//     same shape as params.
-func CudnnRNNBackprop(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, output tf.Output, output_h tf.Output, output_c tf.Output, output_backprop tf.Output, output_h_backprop tf.Output, output_c_backprop tf.Output, reserve_space tf.Output, optional ...CudnnRNNBackpropAttr) (input_backprop tf.Output, input_h_backprop tf.Output, input_c_backprop tf.Output, params_backprop tf.Output) {
+// Computes tan of x element-wise.
+func Tan(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "CudnnRNNBackprop",
+		Type: "Tan",
 		Input: []tf.Input{
-			input, input_h, input_c, params, output, output_h, output_c, output_backprop, output_h_backprop, output_c_backprop, reserve_space,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
+	return op.Output(0)
 }
 
-// Split a `SparseTensor` into `num_split` tensors along one dimension.
-//
-// If the `shape[split_dim]` is not an integer multiple of `num_split`. Slices
-// `[0 : shape[split_dim] % num_split]` gets one extra dimension.
-// For example, if `split_dim = 1` and `num_split = 2` and the input is
-//
-//     input_tensor = shape = [2, 7]
-//     [    a   d e  ]
-//     [b c          ]
-//
-// Graphically the output tensors are:
+// Bucketizes 'input' based on 'boundaries'.
 //
-//     output_tensor[0] = shape = [2, 4]
-//     [    a  ]
-//     [b c    ]
+// For example, if the inputs are
+//     boundaries = [0, 10, 100]
+//     input = [[-5, 10000]
+//              [150,   10]
+//              [5,    100]]
 //
-//     output_tensor[1] = shape = [2, 3]
-//     [ d e  ]
-//     [      ]
+// then the output will be
+//     output = [[0, 3]
+//               [3, 2]
+//               [1, 3]]
 //
 // Arguments:
-//	split_dim: 0-D.  The dimension along which to split.  Must be in the range
-// `[0, rank(shape))`.
-//	indices: 2-D tensor represents the indices of the sparse tensor.
-//	values: 1-D tensor represents the values of the sparse tensor.
-//	shape: 1-D. tensor represents the shape of the sparse tensor.
-// output indices: A list of 1-D tensors represents the indices of the output
-// sparse tensors.
-//	num_split: The number of ways to split.
+//	input: Any shape of Tensor contains with int or float type.
+//	boundaries: A sorted list of floats gives the boundary of the buckets.
 //
-// Returns A list of 1-D tensors represents the values of the output sparse
-// tensors.A list of 1-D tensors represents the shape of the output sparse
-// tensors.
-func SparseSplit(scope *Scope, split_dim tf.Output, indices tf.Output, values tf.Output, shape tf.Output, num_split int64) (output_indices []tf.Output, output_values []tf.Output, output_shape []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_split": num_split}
-	opspec := tf.OpSpec{
-		Type: "SparseSplit",
-		Input: []tf.Input{
-			split_dim, indices, values, shape,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
+// Returns Same shape with 'input', each value of input replaced with bucket index.
+//
+// @compatibility(numpy)
+// Equivalent to np.digitize.
+// @end_compatibility
+func Bucketize(scope *Scope, input tf.Output, boundaries []float32) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	if output_indices, idx, err = makeOutputList(op, idx, "output_indices"); err != nil {
-		scope.UpdateErr("SparseSplit", err)
-		return
-	}
-	if output_values, idx, err = makeOutputList(op, idx, "output_values"); err != nil {
-		scope.UpdateErr("SparseSplit", err)
-		return
-	}
-	if output_shape, idx, err = makeOutputList(op, idx, "output_shape"); err != nil {
-		scope.UpdateErr("SparseSplit", err)
-		return
-	}
-	return output_indices, output_values, output_shape
+	attrs := map[string]interface{}{"boundaries": boundaries}
+	opspec := tf.OpSpec{
+		Type: "Bucketize",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
 // Returns the element-wise sum of a list of tensors.
@@ -13756,6 +13335,39 @@ func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, option
 	return op.Output(0)
 }
 
+// Adds up a SparseTensor and a dense Tensor, using these special rules:
+//
+// (1) Broadcasts the dense side to have the same shape as the sparse side, if
+//     eligible;
+// (2) Then, only the dense values pointed to by the indices of the SparseTensor
+//     participate in the cwise addition.
+//
+// By these rules, the result is a logical SparseTensor with exactly the same
+// indices and shape, but possibly with different non-zero values.  The output of
+// this Op is the resultant non-zero values.
+//
+// Arguments:
+//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//	dense: `R`-D.  The dense Tensor operand.
+//
+// Returns 1-D.  The `N` values that are operated on.
+func SparseDenseCwiseAdd(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseDenseCwiseAdd",
+		Input: []tf.Input{
+			sp_indices, sp_values, sp_shape, dense,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Computes the complementary error function of `x` element-wise.
 func Erfc(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
@@ -14006,90 +13618,164 @@ func ResourceApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.
 	return scope.AddOperation(opspec)
 }
 
-// Exits the current frame to its parent frame.
+// SubstrAttr is an optional argument to Substr.
+type SubstrAttr func(optionalAttr)
+
+// SubstrUnit sets the optional unit attribute to value.
 //
-// Exit makes its input `data` available to the parent frame.
+// value: The unit that is used to create the substring.  One of: `"BYTE"` (for
+// defining position and length by bytes) or `"UTF8_CHAR"` (for the UTF-8
+// encoded Unicode code points).  The default is `"BYTE"`. Results are undefined if
+// `unit=UTF8_CHAR` and the `input` strings do not contain structurally valid
+// UTF-8.
+// If not specified, defaults to "BYTE"
+func SubstrUnit(value string) SubstrAttr {
+	return func(m optionalAttr) {
+		m["unit"] = value
+	}
+}
+
+// Return substrings from `Tensor` of strings.
+//
+// For each string in the input `Tensor`, creates a substring starting at index
+// `pos` with a total length of `len`.
+//
+// If `len` defines a substring that would extend beyond the length of the input
+// string, then as many characters as possible are used.
+//
+// A negative `pos` indicates distance within the string backwards from the end.
+//
+// If `pos` specifies an index which is out of range for any of the input strings,
+// then an `InvalidArgumentError` is thrown.
+//
+// `pos` and `len` must have the same shape, otherwise a `ValueError` is thrown on
+// Op creation.
+//
+// *NOTE*: `Substr` supports broadcasting up to two dimensions. More about
+// broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+//
+// ---
+//
+// Examples
+//
+// Using scalar `pos` and `len`:
+//
+// ```python
+// input = [b'Hello', b'World']
+// position = 1
+// length = 3
+//
+// output = [b'ell', b'orl']
+// ```
+//
+// Using `pos` and `len` with same shape as `input`:
+//
+// ```python
+// input = [[b'ten', b'eleven', b'twelve'],
+//          [b'thirteen', b'fourteen', b'fifteen'],
+//          [b'sixteen', b'seventeen', b'eighteen']]
+// position = [[1, 2, 3],
+//             [1, 2, 3],
+//             [1, 2, 3]]
+// length =   [[2, 3, 4],
+//             [4, 3, 2],
+//             [5, 5, 5]]
+//
+// output = [[b'en', b'eve', b'lve'],
+//           [b'hirt', b'urt', b'te'],
+//           [b'ixtee', b'vente', b'hteen']]
+// ```
+//
+// Broadcasting `pos` and `len` onto `input`:
+//
+// ```
+// input = [[b'ten', b'eleven', b'twelve'],
+//          [b'thirteen', b'fourteen', b'fifteen'],
+//          [b'sixteen', b'seventeen', b'eighteen'],
+//          [b'nineteen', b'twenty', b'twentyone']]
+// position = [1, 2, 3]
+// length =   [1, 2, 3]
+//
+// output = [[b'e', b'ev', b'lve'],
+//           [b'h', b'ur', b'tee'],
+//           [b'i', b've', b'hte'],
+//           [b'i', b'en', b'nty']]
+// ```
+//
+// Broadcasting `input` onto `pos` and `len`:
+//
+// ```
+// input = b'thirteen'
+// position = [1, 5, 7]
+// length =   [3, 2, 1]
+//
+// output = [b'hir', b'ee', b'n']
+// ```
 //
 // Arguments:
-//	data: The tensor to be made available to the parent frame.
+//	input: Tensor of strings
+//	pos: Scalar defining the position of first character in each substring
+//	len: Scalar defining the number of characters to include in each substring
 //
-// Returns The same tensor as `data`.
-func Exit(scope *Scope, data tf.Output) (output tf.Output) {
+// Returns Tensor of substrings
+func Substr(scope *Scope, input tf.Output, pos tf.Output, len tf.Output, optional ...SubstrAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Exit",
+		Type: "Substr",
 		Input: []tf.Input{
-			data,
+			input, pos, len,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Produce a string tensor that encodes the state of a Reader.
+// Exits the current frame to its parent frame.
 //
-// Not all Readers support being serialized, so this can produce an
-// Unimplemented error.
+// Exit makes its input `data` available to the parent frame.
 //
 // Arguments:
-//	reader_handle: Handle to a Reader.
-func ReaderSerializeStateV2(scope *Scope, reader_handle tf.Output) (state tf.Output) {
+//	data: The tensor to be made available to the parent frame.
+//
+// Returns The same tensor as `data`.
+func Exit(scope *Scope, data tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReaderSerializeStateV2",
+		Type: "Exit",
 		Input: []tf.Input{
-			reader_handle,
+			data,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MaxPoolAttr is an optional argument to MaxPool.
-type MaxPoolAttr func(optionalAttr)
-
-// MaxPoolDataFormat sets the optional data_format attribute to value.
+// Produce a string tensor that encodes the state of a Reader.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolDataFormat(value string) MaxPoolAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Performs max pooling on the input.
+// Not all Readers support being serialized, so this can produce an
+// Unimplemented error.
 //
 // Arguments:
-//	input: 4-D input to pool over.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
-//
-// Returns The max pooled output tensor.
-func MaxPool(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolAttr) (output tf.Output) {
+//	reader_handle: Handle to a Reader.
+func ReaderSerializeStateV2(scope *Scope, reader_handle tf.Output) (state tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "MaxPool",
+		Type: "ReaderSerializeStateV2",
 		Input: []tf.Input{
-			input,
+			reader_handle,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
@@ -14188,108 +13874,48 @@ func SparseSlice(scope *Scope, indices tf.Output, values tf.Output, shape tf.Out
 // <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
 // </div>
 //
-// Arguments:
-//	resource: Should be from a `Variable` node.
-//	indices: A tensor of indices into the first dimension of `ref`.
-//	updates: A tensor of updated values to add to `ref`.
-//
-// Returns the created operation.
-func ResourceScatterMin(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceScatterMin",
-		Input: []tf.Input{
-			resource, indices, updates,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Reshapes a quantized tensor as per the Reshape op.
-//
-// ```
-//
-// Arguments:
-//
-//	shape: Defines the shape of the output tensor.
-//	input_min: The minimum value of the input.
-//	input_max: The maximum value of the input.
-//
-// Returns This value is copied from input_min.This value is copied from input_max.
-func QuantizedReshape(scope *Scope, tensor tf.Output, shape tf.Output, input_min tf.Output, input_max tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "QuantizedReshape",
-		Input: []tf.Input{
-			tensor, shape, input_min, input_max,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Returns the truth value of (x != y) element-wise.
-//
-// *NOTE*: `NotEqual` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func NotEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Arguments:
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
+//
+// Returns the created operation.
+func ResourceScatterMin(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "NotEqual",
+		Type: "ResourceScatterMin",
 		Input: []tf.Input{
-			x, y,
+			resource, indices, updates,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Inverse 3D real-valued fast Fourier transform.
-//
-// Computes the inverse 3-dimensional discrete Fourier transform of a real-valued
-// signal over the inner-most 3 dimensions of `input`.
-//
-// The inner-most 3 dimensions of `input` are assumed to be the result of `RFFT3D`:
-// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
-// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
-// from the size of the inner-most 3 dimensions of `input`. If the FFT length used
-// to compute `input` is odd, it should be provided since it cannot be inferred
-// properly.
+// Reshapes a quantized tensor as per the Reshape op.
 //
-// Along each axis `IRFFT3D` is computed on, if `fft_length` (or
-// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
+// ```
 //
 // Arguments:
-//	input: A complex64 tensor.
-//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
 //
-// Returns A float32 tensor of the same rank as `input`. The inner-most 3
-//   dimensions of `input` are replaced with the `fft_length` samples of their
-//   inverse 3D real Fourier transform.
+//	shape: Defines the shape of the output tensor.
+//	input_min: The minimum value of the input.
+//	input_max: The maximum value of the input.
 //
-// @compatibility(numpy)
-// Equivalent to np.irfftn with 3 dimensions.
-// @end_compatibility
-func IRFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// Returns This value is copied from input_min.This value is copied from input_max.
+func QuantizedReshape(scope *Scope, tensor tf.Output, shape tf.Output, input_min tf.Output, input_max tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IRFFT3D",
+		Type: "QuantizedReshape",
 		Input: []tf.Input{
-			input, fft_length,
+			tensor, shape, input_min, input_max,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
 // StringSplitAttr is an optional argument to StringSplit.
@@ -14446,176 +14072,236 @@ func Conj(scope *Scope, input tf.Output) (output tf.Output) {
 	return op.Output(0)
 }
 
-// AvgPoolGradAttr is an optional argument to AvgPoolGrad.
-type AvgPoolGradAttr func(optionalAttr)
+// CudnnRNNBackpropAttr is an optional argument to CudnnRNNBackprop.
+type CudnnRNNBackpropAttr func(optionalAttr)
 
-// AvgPoolGradDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func AvgPoolGradDataFormat(value string) AvgPoolGradAttr {
+// CudnnRNNBackpropRnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNBackpropRnnMode(value string) CudnnRNNBackpropAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["rnn_mode"] = value
 	}
 }
 
-// Computes gradients of the average pooling function.
+// CudnnRNNBackpropInputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNBackpropInputMode(value string) CudnnRNNBackpropAttr {
+	return func(m optionalAttr) {
+		m["input_mode"] = value
+	}
+}
+
+// CudnnRNNBackpropDirection sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNBackpropDirection(value string) CudnnRNNBackpropAttr {
+	return func(m optionalAttr) {
+		m["direction"] = value
+	}
+}
+
+// CudnnRNNBackpropDropout sets the optional dropout attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNBackpropDropout(value float32) CudnnRNNBackpropAttr {
+	return func(m optionalAttr) {
+		m["dropout"] = value
+	}
+}
+
+// CudnnRNNBackpropSeed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNBackpropSeed(value int64) CudnnRNNBackpropAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// CudnnRNNBackpropSeed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNBackpropSeed2(value int64) CudnnRNNBackpropAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Backprop step of CudnnRNN.
 //
-// Arguments:
-//	orig_input_shape: 1-D.  Shape of the original input to `avg_pool`.
-//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t.
-// the output of `avg_pool`.
-//	ksize: The size of the sliding window for each dimension of the input.
-//	strides: The stride of the sliding window for each dimension of the input.
-//	padding: The type of padding algorithm to use.
+// Compute the backprop of both data and weights in a RNN.
 //
-// Returns 4-D.  Gradients w.r.t. the input of `avg_pool`.
-func AvgPoolGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolGradAttr) (output tf.Output) {
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicate whether there is a linear projection between the input and
+//     the actual computation before the first layer. 'skip_input' is only allowed
+//     when input_size == num_units; 'auto_select' implies 'skip_input' when
+//     input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used. Should be
+//   "unidirectional" or "bidirectional".
+// dropout: Dropout probability. When set to 0., dropout is disabled.
+// seed: The 1st part of a seed to initialize dropout.
+// seed2: The 2nd part of a seed to initialize dropout.
+// input: A 3-D tensor with the shape of [seq_length, batch_size, input_size].
+// input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size,
+//     num_units].
+// input_c: For LSTM, a 3-D tensor with the shape of
+//     [num_layer * dir, batch, num_units]. For other models, it is ignored.
+// params: A 1-D tensor that contains the weights and biases in an opaque layout.
+//     The size must be created through CudnnRNNParamsSize, and initialized
+//     separately. Note that they might not be compatible across different
+//     generations. So it is a good idea to save and restore
+// output: A 3-D tensor with the shape of [seq_length, batch_size,
+//     dir * num_units].
+// output_h: The same shape has input_h.
+// output_c: The same shape as input_c for LSTM. An empty tensor for other models.
+// output_backprop: A 3-D tensor with the same shape as output in the forward pass.
+// output_h_backprop: A 3-D tensor with the same shape as output_h in the forward
+//     pass.
+// output_c_backprop: A 3-D tensor with the same shape as output_c in the forward
+//     pass.
+// reserve_space: The same reserve_space produced in for forward operation.
+// input_backprop: The backprop to input in the forward pass. Has the same shape
+//     as input.
+// input_h_backprop: The backprop to input_h in the forward pass. Has the same
+//     shape as input_h.
+// input_c_backprop: The backprop to input_c in the forward pass. Has the same
+//     shape as input_c.
+// params_backprop: The backprop to the params buffer in the forward pass. Has the
+//     same shape as params.
+func CudnnRNNBackprop(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, output tf.Output, output_h tf.Output, output_c tf.Output, output_backprop tf.Output, output_h_backprop tf.Output, output_c_backprop tf.Output, reserve_space tf.Output, optional ...CudnnRNNBackpropAttr) (input_backprop tf.Output, input_h_backprop tf.Output, input_c_backprop tf.Output, params_backprop tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AvgPoolGrad",
+		Type: "CudnnRNNBackprop",
 		Input: []tf.Input{
-			orig_input_shape, grad,
+			input, input_h, input_c, params, output, output_h, output_c, output_backprop, output_h_backprop, output_c_backprop, reserve_space,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
 }
 
-// Greedily selects a subset of bounding boxes in descending order of score,
-//
-// pruning away boxes that have high overlaps
-// with previously selected boxes.  Bounding boxes with score less than
-// `score_threshold` are removed. N-by-n overlap values are supplied as square matrix,
-// which allows for defining a custom overlap criterium (eg. intersection over union,
-// intersection over area, etc.).
+// Encode audio data using the WAV file format.
 //
-// The output of this operation is a set of integers indexing into the input
-// collection of bounding boxes representing the selected boxes.  The bounding
-// box coordinates corresponding to the selected indices can then be obtained
-// using the `tf.gather operation`.  For example:
+// This operation will generate a string suitable to be saved out to create a .wav
+// audio file. It will be encoded in the 16-bit PCM format. It takes in float
+// values in the range -1.0f to 1.0f, and any outside that value will be clamped to
+// that range.
 //
-//   selected_indices = tf.image.non_max_suppression_with_overlaps(
-//       overlaps, scores, max_output_size, overlap_threshold, score_threshold)
-//   selected_boxes = tf.gather(boxes, selected_indices)
+// `audio` is a 2-D float Tensor of shape `[length, channels]`.
+// `sample_rate` is a scalar Tensor holding the rate to use (e.g. 44100).
 //
 // Arguments:
-//	overlaps: A 2-D float tensor of shape `[num_boxes, num_boxes]` representing
-// the n-by-n box overlap values.
-//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
-// score corresponding to each box (each row of boxes).
-//	max_output_size: A scalar integer tensor representing the maximum number of
-// boxes to be selected by non max suppression.
-//	overlap_threshold: A 0-D float tensor representing the threshold for deciding whether
-// boxes overlap too.
-//	score_threshold: A 0-D float tensor representing the threshold for deciding when to remove
-// boxes based on score.
+//	audio: 2-D with shape `[length, channels]`.
+//	sample_rate: Scalar containing the sample frequency.
 //
-// Returns A 1-D integer tensor of shape `[M]` representing the selected
-// indices from the boxes tensor, where `M <= max_output_size`.
-func NonMaxSuppressionWithOverlaps(scope *Scope, overlaps tf.Output, scores tf.Output, max_output_size tf.Output, overlap_threshold tf.Output, score_threshold tf.Output) (selected_indices tf.Output) {
+// Returns 0-D. WAV-encoded file contents.
+func EncodeWav(scope *Scope, audio tf.Output, sample_rate tf.Output) (contents tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "NonMaxSuppressionWithOverlaps",
+		Type: "EncodeWav",
 		Input: []tf.Input{
-			overlaps, scores, max_output_size, overlap_threshold, score_threshold,
+			audio, sample_rate,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// StageClearAttr is an optional argument to StageClear.
-type StageClearAttr func(optionalAttr)
-
-// StageClearCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func StageClearCapacity(value int64) StageClearAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
+// Computes atan of x element-wise.
+func Atan(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// StageClearMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func StageClearMemoryLimit(value int64) StageClearAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
+	opspec := tf.OpSpec{
+		Type: "Atan",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// StageClearContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func StageClearContainer(value string) StageClearAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
+// ResourceApplyAdaMaxAttr is an optional argument to ResourceApplyAdaMax.
+type ResourceApplyAdaMaxAttr func(optionalAttr)
 
-// StageClearSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func StageClearSharedName(value string) StageClearAttr {
+// ResourceApplyAdaMaxUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var, m, and v tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyAdaMaxUseLocking(value bool) ResourceApplyAdaMaxAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Op removes all elements in the underlying container.
+// Update '*var' according to the AdaMax algorithm.
+//
+// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+// v_t <- max(beta2 * v_{t-1}, abs(g))
+// variable <- variable - learning_rate / (1 - beta1^t) * m_t / (v_t + epsilon)
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	m: Should be from a Variable().
+//	v: Should be from a Variable().
+//	beta1_power: Must be a scalar.
+//	lr: Scaling factor. Must be a scalar.
+//	beta1: Momentum factor. Must be a scalar.
+//	beta2: Momentum factor. Must be a scalar.
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
 //
 // Returns the created operation.
-func StageClear(scope *Scope, dtypes []tf.DataType, optional ...StageClearAttr) (o *tf.Operation) {
+func ResourceApplyAdaMax(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, beta1_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdaMaxAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StageClear",
-
+		Type: "ResourceApplyAdaMax",
+		Input: []tf.Input{
+			var_, m, v, beta1_power, lr, beta1, beta2, epsilon, grad,
+		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// QuantizedRelu6Attr is an optional argument to QuantizedRelu6.
-type QuantizedRelu6Attr func(optionalAttr)
+// AssertAttr is an optional argument to Assert.
+type AssertAttr func(optionalAttr)
 
-// QuantizedRelu6OutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QUINT8
-func QuantizedRelu6OutType(value tf.DataType) QuantizedRelu6Attr {
+// AssertSummarize sets the optional summarize attribute to value.
+//
+// value: Print this many entries of each tensor.
+// If not specified, defaults to 3
+func AssertSummarize(value int64) AssertAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["summarize"] = value
 	}
 }
 
-// Computes Quantized Rectified Linear 6: `min(max(features, 0), 6)`
+// Asserts that the given condition is true.
 //
-// Arguments:
+// If `condition` evaluates to false, print the list of tensors in `data`.
+// `summarize` determines how many entries of the tensors to print.
 //
-//	min_features: The float value that the lowest quantized value represents.
-//	max_features: The float value that the highest quantized value represents.
+// Arguments:
+//	condition: The condition to evaluate.
+//	data: The tensors to print out when condition is false.
 //
-// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
-func QuantizedRelu6(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedRelu6Attr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
+// Returns the created operation.
+func Assert(scope *Scope, condition tf.Output, data []tf.Output, optional ...AssertAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -14624,104 +14310,79 @@ func QuantizedRelu6(scope *Scope, features tf.Output, min_features tf.Output, ma
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedRelu6",
+		Type: "Assert",
 		Input: []tf.Input{
-			features, min_features, max_features,
+			condition, tf.OutputList(data),
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// FixedLengthRecordReaderV2Attr is an optional argument to FixedLengthRecordReaderV2.
-type FixedLengthRecordReaderV2Attr func(optionalAttr)
-
-// FixedLengthRecordReaderV2HeaderBytes sets the optional header_bytes attribute to value.
-//
-// value: Number of bytes in the header, defaults to 0.
-// If not specified, defaults to 0
-func FixedLengthRecordReaderV2HeaderBytes(value int64) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["header_bytes"] = value
-	}
+	return scope.AddOperation(opspec)
 }
 
-// FixedLengthRecordReaderV2FooterBytes sets the optional footer_bytes attribute to value.
+// Split a `SparseTensor` into `num_split` tensors along one dimension.
 //
-// value: Number of bytes in the footer, defaults to 0.
-// If not specified, defaults to 0
-func FixedLengthRecordReaderV2FooterBytes(value int64) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["footer_bytes"] = value
-	}
-}
-
-// FixedLengthRecordReaderV2HopBytes sets the optional hop_bytes attribute to value.
+// If the `shape[split_dim]` is not an integer multiple of `num_split`. Slices
+// `[0 : shape[split_dim] % num_split]` gets one extra dimension.
+// For example, if `split_dim = 1` and `num_split = 2` and the input is
 //
-// value: Number of bytes to hop before each read. Default of 0 means using
-// record_bytes.
-// If not specified, defaults to 0
-func FixedLengthRecordReaderV2HopBytes(value int64) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["hop_bytes"] = value
-	}
-}
-
-// FixedLengthRecordReaderV2Container sets the optional container attribute to value.
+//     input_tensor = shape = [2, 7]
+//     [    a   d e  ]
+//     [b c          ]
 //
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func FixedLengthRecordReaderV2Container(value string) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// FixedLengthRecordReaderV2SharedName sets the optional shared_name attribute to value.
+// Graphically the output tensors are:
 //
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func FixedLengthRecordReaderV2SharedName(value string) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// FixedLengthRecordReaderV2Encoding sets the optional encoding attribute to value.
+//     output_tensor[0] = shape = [2, 4]
+//     [    a  ]
+//     [b c    ]
 //
-// value: The type of encoding for the file. Currently ZLIB and GZIP
-// are supported. Defaults to none.
-// If not specified, defaults to ""
-func FixedLengthRecordReaderV2Encoding(value string) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["encoding"] = value
-	}
-}
-
-// A Reader that outputs fixed-length records from a file.
+//     output_tensor[1] = shape = [2, 3]
+//     [ d e  ]
+//     [      ]
 //
 // Arguments:
-//	record_bytes: Number of bytes in the record.
+//	split_dim: 0-D.  The dimension along which to split.  Must be in the range
+// `[0, rank(shape))`.
+//	indices: 2-D tensor represents the indices of the sparse tensor.
+//	values: 1-D tensor represents the values of the sparse tensor.
+//	shape: 1-D. tensor represents the shape of the sparse tensor.
+// output indices: A list of 1-D tensors represents the indices of the output
+// sparse tensors.
+//	num_split: The number of ways to split.
 //
-// Returns The handle to reference the Reader.
-func FixedLengthRecordReaderV2(scope *Scope, record_bytes int64, optional ...FixedLengthRecordReaderV2Attr) (reader_handle tf.Output) {
+// Returns A list of 1-D tensors represents the values of the output sparse
+// tensors.A list of 1-D tensors represents the shape of the output sparse
+// tensors.
+func SparseSplit(scope *Scope, split_dim tf.Output, indices tf.Output, values tf.Output, shape tf.Output, num_split int64) (output_indices []tf.Output, output_values []tf.Output, output_shape []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"record_bytes": record_bytes}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"num_split": num_split}
 	opspec := tf.OpSpec{
-		Type: "FixedLengthRecordReaderV2",
-
+		Type: "SparseSplit",
+		Input: []tf.Input{
+			split_dim, indices, values, shape,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output_indices, idx, err = makeOutputList(op, idx, "output_indices"); err != nil {
+		scope.UpdateErr("SparseSplit", err)
+		return
+	}
+	if output_values, idx, err = makeOutputList(op, idx, "output_values"); err != nil {
+		scope.UpdateErr("SparseSplit", err)
+		return
+	}
+	if output_shape, idx, err = makeOutputList(op, idx, "output_shape"); err != nil {
+		scope.UpdateErr("SparseSplit", err)
+		return
+	}
+	return output_indices, output_values, output_shape
 }
 
 // Computes numerical negative value element-wise.
@@ -15146,116 +14807,48 @@ func DynamicPartition(scope *Scope, data tf.Output, partitions tf.Output, num_pa
 	var idx int
 	var err error
 	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
-		scope.UpdateErr("DynamicPartition", err)
-		return
-	}
-	return outputs
-}
-
-// ResourceApplyAdagradAttr is an optional argument to ResourceApplyAdagrad.
-type ResourceApplyAdagradAttr func(optionalAttr)
-
-// ResourceApplyAdagradUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyAdagradUseLocking(value bool) ResourceApplyAdagradAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// ResourceApplyAdagradUpdateSlots sets the optional update_slots attribute to value.
-// If not specified, defaults to true
-func ResourceApplyAdagradUpdateSlots(value bool) ResourceApplyAdagradAttr {
-	return func(m optionalAttr) {
-		m["update_slots"] = value
-	}
-}
-
-// Update '*var' according to the adagrad scheme.
-//
-// accum += grad * grad
-// var -= lr * grad * (1 / sqrt(accum))
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	grad: The gradient.
-//
-// Returns the created operation.
-func ResourceApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, optional ...ResourceApplyAdagradAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdagrad",
-		Input: []tf.Input{
-			var_, accum, lr, grad,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Return the shape of s0 op s1 with broadcast.
-//
-// Given `s0` and `s1`, tensors that represent shapes, compute `r0`, the
-// broadcasted shape. `s0`, `s1` and `r0` are all integer vectors.
-func BroadcastArgs(scope *Scope, s0 tf.Output, s1 tf.Output) (r0 tf.Output) {
-	if scope.Err() != nil {
+		scope.UpdateErr("DynamicPartition", err)
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "BroadcastArgs",
-		Input: []tf.Input{
-			s0, s1,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return outputs
 }
 
-// DataFormatDimMapAttr is an optional argument to DataFormatDimMap.
-type DataFormatDimMapAttr func(optionalAttr)
+// ResourceApplyAdagradAttr is an optional argument to ResourceApplyAdagrad.
+type ResourceApplyAdagradAttr func(optionalAttr)
 
-// DataFormatDimMapSrcFormat sets the optional src_format attribute to value.
+// ResourceApplyAdagradUseLocking sets the optional use_locking attribute to value.
 //
-// value: source data format.
-// If not specified, defaults to "NHWC"
-func DataFormatDimMapSrcFormat(value string) DataFormatDimMapAttr {
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyAdagradUseLocking(value bool) ResourceApplyAdagradAttr {
 	return func(m optionalAttr) {
-		m["src_format"] = value
+		m["use_locking"] = value
 	}
 }
 
-// DataFormatDimMapDstFormat sets the optional dst_format attribute to value.
-//
-// value: destination data format.
-// If not specified, defaults to "NCHW"
-func DataFormatDimMapDstFormat(value string) DataFormatDimMapAttr {
+// ResourceApplyAdagradUpdateSlots sets the optional update_slots attribute to value.
+// If not specified, defaults to true
+func ResourceApplyAdagradUpdateSlots(value bool) ResourceApplyAdagradAttr {
 	return func(m optionalAttr) {
-		m["dst_format"] = value
+		m["update_slots"] = value
 	}
 }
 
-// Returns the dimension index in the destination data format given the one in
+// Update '*var' according to the adagrad scheme.
 //
-// the source data format.
+// accum += grad * grad
+// var -= lr * grad * (1 / sqrt(accum))
 //
 // Arguments:
-//	x: A Tensor with each element as a dimension index in source data format.
-// Must be in the range [-4, 4).
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	grad: The gradient.
 //
-// Returns A Tensor with each element as a dimension index in destination data format.
-func DataFormatDimMap(scope *Scope, x tf.Output, optional ...DataFormatDimMapAttr) (y tf.Output) {
+// Returns the created operation.
+func ResourceApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, optional ...ResourceApplyAdagradAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -15264,14 +14857,13 @@ func DataFormatDimMap(scope *Scope, x tf.Output, optional ...DataFormatDimMapAtt
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DataFormatDimMap",
+		Type: "ResourceApplyAdagrad",
 		Input: []tf.Input{
-			x,
+			var_, accum, lr, grad,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
 // Retrieves the tree ensemble resource stamp token, number of trees and growing statistics.
@@ -15421,11 +15013,207 @@ func ShapeOutType(value tf.DataType) ShapeAttr {
 //
 // For example:
 //
-// ```
-// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
-// shape(t) ==> [2, 2, 3]
-// ```
-func Shape(scope *Scope, input tf.Output, optional ...ShapeAttr) (output tf.Output) {
+// ```
+// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
+// shape(t) ==> [2, 2, 3]
+// ```
+func Shape(scope *Scope, input tf.Output, optional ...ShapeAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Shape",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the power of one value to another.
+//
+// Given a tensor `x` and a tensor `y`, this operation computes \\(x^y\\) for
+// corresponding elements in `x` and `y`. For example:
+//
+// ```
+// # tensor 'x' is [[2, 2]], [3, 3]]
+// # tensor 'y' is [[8, 16], [2, 3]]
+// tf.pow(x, y) ==> [[256, 65536], [9, 27]]
+// ```
+func Pow(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Pow",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes fingerprints of the input strings.
+//
+// Arguments:
+//	input: vector of strings to compute fingerprints on.
+//
+// Returns a (N,2) shaped matrix where N is the number of elements in the input
+// vector. Each row contains the low and high parts of the fingerprint.
+func SdcaFprint(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SdcaFprint",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// LRNAttr is an optional argument to LRN.
+type LRNAttr func(optionalAttr)
+
+// LRNDepthRadius sets the optional depth_radius attribute to value.
+//
+// value: 0-D.  Half-width of the 1-D normalization window.
+// If not specified, defaults to 5
+func LRNDepthRadius(value int64) LRNAttr {
+	return func(m optionalAttr) {
+		m["depth_radius"] = value
+	}
+}
+
+// LRNBias sets the optional bias attribute to value.
+//
+// value: An offset (usually positive to avoid dividing by 0).
+// If not specified, defaults to 1
+func LRNBias(value float32) LRNAttr {
+	return func(m optionalAttr) {
+		m["bias"] = value
+	}
+}
+
+// LRNAlpha sets the optional alpha attribute to value.
+//
+// value: A scale factor, usually positive.
+// If not specified, defaults to 1
+func LRNAlpha(value float32) LRNAttr {
+	return func(m optionalAttr) {
+		m["alpha"] = value
+	}
+}
+
+// LRNBeta sets the optional beta attribute to value.
+//
+// value: An exponent.
+// If not specified, defaults to 0.5
+func LRNBeta(value float32) LRNAttr {
+	return func(m optionalAttr) {
+		m["beta"] = value
+	}
+}
+
+// Local Response Normalization.
+//
+// The 4-D `input` tensor is treated as a 3-D array of 1-D vectors (along the last
+// dimension), and each vector is normalized independently.  Within a given vector,
+// each component is divided by the weighted, squared sum of inputs within
+// `depth_radius`.  In detail,
+//
+//     sqr_sum[a, b, c, d] =
+//         sum(input[a, b, c, d - depth_radius : d + depth_radius + 1] ** 2)
+//     output = input / (bias + alpha * sqr_sum) ** beta
+//
+// For details, see [Krizhevsky et al., ImageNet classification with deep
+// convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks).
+//
+// Arguments:
+//	input: 4-D.
+func LRN(scope *Scope, input tf.Output, optional ...LRNAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "LRN",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Creates a dataset that zips together `input_datasets`.
+func ZipDataset(scope *Scope, input_datasets []tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ZipDataset",
+		Input: []tf.Input{
+			tf.OutputList(input_datasets),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceSparseApplyAdagradAttr is an optional argument to ResourceSparseApplyAdagrad.
+type ResourceSparseApplyAdagradAttr func(optionalAttr)
+
+// ResourceSparseApplyAdagradUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyAdagradUseLocking(value bool) ResourceSparseApplyAdagradAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// ResourceSparseApplyAdagradUpdateSlots sets the optional update_slots attribute to value.
+// If not specified, defaults to true
+func ResourceSparseApplyAdagradUpdateSlots(value bool) ResourceSparseApplyAdagradAttr {
+	return func(m optionalAttr) {
+		m["update_slots"] = value
+	}
+}
+
+// Update relevant entries in '*var' and '*accum' according to the adagrad scheme.
+//
+// That is for rows we have grad for, we update var and accum as follows:
+// accum += grad * grad
+// var -= lr * grad * (1 / sqrt(accum))
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//
+// Returns the created operation.
+func ResourceSparseApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdagradAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -15434,32 +15222,28 @@ func Shape(scope *Scope, input tf.Output, optional ...ShapeAttr) (output tf.Outp
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Shape",
+		Type: "ResourceSparseApplyAdagrad",
 		Input: []tf.Input{
-			input,
+			var_, accum, lr, grad, indices,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Computes the power of one value to another.
+// Elementwise computes the bitwise right-shift of `x` and `y`.
 //
-// Given a tensor `x` and a tensor `y`, this operation computes \\(x^y\\) for
-// corresponding elements in `x` and `y`. For example:
+// Performs a logical shift for unsigned integer types, and an arithmetic shift
+// for signed integer types.
 //
-// ```
-// # tensor 'x' is [[2, 2]], [3, 3]]
-// # tensor 'y' is [[8, 16], [2, 3]]
-// tf.pow(x, y) ==> [[256, 65536], [9, 27]]
-// ```
-func Pow(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// If `y` is negative, or greater than or equal to than the width of `x` in bits
+// the result is implementation defined.
+func RightShift(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Pow",
+		Type: "RightShift",
 		Input: []tf.Input{
 			x, y,
 		},
@@ -15468,107 +15252,70 @@ func Pow(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
-// Computes fingerprints of the input strings.
-//
-// Arguments:
-//	input: vector of strings to compute fingerprints on.
-//
-// Returns a (N,2) shaped matrix where N is the number of elements in the input
-// vector. Each row contains the low and high parts of the fingerprint.
-func SdcaFprint(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SdcaFprint",
-		Input: []tf.Input{
-			input,
-		},
+// TensorListStackAttr is an optional argument to TensorListStack.
+type TensorListStackAttr func(optionalAttr)
+
+// TensorListStackNumElements sets the optional num_elements attribute to value.
+// If not specified, defaults to -1
+func TensorListStackNumElements(value int64) TensorListStackAttr {
+	return func(m optionalAttr) {
+		m["num_elements"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// The gradient operator for the SparseAdd op.
+// Stacks all tensors in the list.
 //
-// The SparseAdd op calculates A + B, where A, B, and the sum are all represented
-// as `SparseTensor` objects.  This op takes in the upstream gradient w.r.t.
-// non-empty values of the sum, and outputs the gradients w.r.t. the non-empty
-// values of A and B.
+// Requires that all tensors have the same shape.
 //
-// Arguments:
-//	backprop_val_grad: 1-D with shape `[nnz(sum)]`.  The gradient with respect to
-// the non-empty values of the sum.
-//	a_indices: 2-D.  The `indices` of the `SparseTensor` A, size `[nnz(A), ndims]`.
-//	b_indices: 2-D.  The `indices` of the `SparseTensor` B, size `[nnz(B), ndims]`.
-//	sum_indices: 2-D.  The `indices` of the sum `SparseTensor`, size
-// `[nnz(sum), ndims]`.
+// input_handle: the input list
+// tensor: the gathered result
+// num_elements: optional. If not -1, the number of elements in the list.
 //
-// Returns 1-D with shape `[nnz(A)]`. The gradient with respect to the
-// non-empty values of A.1-D with shape `[nnz(B)]`. The gradient with respect to the
-// non-empty values of B.
-func SparseAddGrad(scope *Scope, backprop_val_grad tf.Output, a_indices tf.Output, b_indices tf.Output, sum_indices tf.Output) (a_val_grad tf.Output, b_val_grad tf.Output) {
+func TensorListStack(scope *Scope, input_handle tf.Output, element_dtype tf.DataType, optional ...TensorListStackAttr) (tensor tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseAddGrad",
+		Type: "TensorListStack",
 		Input: []tf.Input{
-			backprop_val_grad, a_indices, b_indices, sum_indices,
+			input_handle,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// ResourceSparseApplyCenteredRMSPropAttr is an optional argument to ResourceSparseApplyCenteredRMSProp.
-type ResourceSparseApplyCenteredRMSPropAttr func(optionalAttr)
+// StatelessRandomUniformAttr is an optional argument to StatelessRandomUniform.
+type StatelessRandomUniformAttr func(optionalAttr)
 
-// ResourceSparseApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
+// StatelessRandomUniformDtype sets the optional dtype attribute to value.
 //
-// value: If `True`, updating of the var, mg, ms, and mom tensors is
-// protected by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyCenteredRMSPropUseLocking(value bool) ResourceSparseApplyCenteredRMSPropAttr {
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatelessRandomUniformDtype(value tf.DataType) StatelessRandomUniformAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["dtype"] = value
 	}
 }
 
-// Update '*var' according to the centered RMSProp algorithm.
-//
-// The centered RMSProp algorithm uses an estimate of the centered second moment
-// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
-// uses the (uncentered) second moment. This often helps with training, but is
-// slightly more expensive in terms of computation and memory.
-//
-// Note that in dense implementation of this algorithm, mg, ms, and mom will
-// update even if the grad is zero, but in this sparse implementation, mg, ms,
-// and mom will not update in iterations during which the grad is zero.
+// Outputs deterministic pseudorandom random values from a uniform distribution.
 //
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// mean_grad = decay * mean_grad + (1-decay) * gradient
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
+// The generated values follow a uniform distribution in the range `[0, 1)`. The
+// lower bound 0 is included in the range, while the upper bound 1 is excluded.
 //
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-// var <- var - mom
+// The outputs are a deterministic function of `shape` and `seed`.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	mg: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
-//
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var, ms and mom.
+//	shape: The shape of the output tensor.
+//	seed: 2 seeds (shape [2]).
 //
-// Returns the created operation.
-func ResourceSparseApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyCenteredRMSPropAttr) (o *tf.Operation) {
+// Returns Random values with specified shape.
+func StatelessRandomUniform(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomUniformAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -15577,210 +15324,156 @@ func ResourceSparseApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Outp
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyCenteredRMSProp",
+		Type: "StatelessRandomUniform",
 		Input: []tf.Input{
-			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad, indices,
+			shape, seed,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Creates a dataset that batches `batch_size` elements from `input_dataset`.
+// Makes its input available to the next iteration.
 //
 // Arguments:
+//	data: The tensor to be made available to the next iteration.
 //
-//	batch_size: A scalar representing the number of elements to accumulate in a
-// batch.
-//
-//
-func BatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns The same tensor as `data`.
+func NextIteration(scope *Scope, data tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "BatchDataset",
+		Type: "NextIteration",
 		Input: []tf.Input{
-			input_dataset, batch_size,
+			data,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// RandomPoissonV2Attr is an optional argument to RandomPoissonV2.
-type RandomPoissonV2Attr func(optionalAttr)
-
-// RandomPoissonV2Seed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomPoissonV2Seed(value int64) RandomPoissonV2Attr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// RandomPoissonV2Seed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomPoissonV2Seed2(value int64) RandomPoissonV2Attr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
+// Output a fact about factorials.
+func Fact(scope *Scope) (fact tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// RandomPoissonV2Dtype sets the optional dtype attribute to value.
-// If not specified, defaults to DT_INT64
-func RandomPoissonV2Dtype(value tf.DataType) RandomPoissonV2Attr {
-	return func(m optionalAttr) {
-		m["dtype"] = value
+	opspec := tf.OpSpec{
+		Type: "Fact",
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Outputs random values from the Poisson distribution(s) described by rate.
+// Deserialize `SparseTensor` objects.
 //
-// This op uses two algorithms, depending on rate. If rate >= 10, then
-// the algorithm by Hormann is used to acquire samples via
-// transformation-rejection.
-// See http://www.sciencedirect.com/science/article/pii/0167668793909974.
+// The input `serialized_sparse` must have the shape `[?, ?, ..., ?, 3]` where
+// the last dimension stores serialized `SparseTensor` objects and the other N
+// dimensions (N >= 0) correspond to a batch. The ranks of the original
+// `SparseTensor` objects must all match. When the final `SparseTensor` is
+// created, its rank is the rank of the incoming `SparseTensor` objects plus N;
+// the sparse tensors have been concatenated along new dimensions, one for each
+// batch.
 //
-// Otherwise, Knuth's algorithm is used to acquire samples via multiplying uniform
-// random variables.
-// See Donald E. Knuth (1969). Seminumerical Algorithms. The Art of Computer
-// Programming, Volume 2. Addison Wesley
+// The output `SparseTensor` object's shape values for the original dimensions
+// are the max across the input `SparseTensor` objects' shape values for the
+// corresponding dimensions. The new dimensions match the size of the batch.
 //
-// Arguments:
-//	shape: 1-D integer tensor. Shape of independent samples to draw from each
-// distribution described by the shape parameters given in rate.
-//	rate: A tensor in which each scalar is a "rate" parameter describing the
-// associated poisson distribution.
+// The input `SparseTensor` objects' indices are assumed ordered in
+// standard lexicographic order.  If this is not the case, after this
+// step run `SparseReorder` to restore index ordering.
+//
+// For example, if the serialized input is a `[2 x 3]` matrix representing two
+// original `SparseTensor` objects:
+//
+//     index = [ 0]
+//             [10]
+//             [20]
+//     values = [1, 2, 3]
+//     shape = [50]
+//
+// and
+//
+//     index = [ 2]
+//             [10]
+//     values = [4, 5]
+//     shape = [30]
+//
+// then the final deserialized `SparseTensor` will be:
+//
+//     index = [0  0]
+//             [0 10]
+//             [0 20]
+//             [1  2]
+//             [1 10]
+//     values = [1, 2, 3, 4, 5]
+//     shape = [2 50]
 //
-// Returns A tensor with shape `shape + shape(rate)`. Each slice
-// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
-// `rate[i0, i1, ...iN]`.
-func RandomPoissonV2(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonV2Attr) (output tf.Output) {
+// Arguments:
+//	serialized_sparse: The serialized `SparseTensor` objects. The last dimension
+// must have 3 columns.
+//	dtype: The `dtype` of the serialized `SparseTensor` objects.
+func DeserializeSparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "RandomPoissonV2",
+		Type: "DeserializeSparse",
 		Input: []tf.Input{
-			shape, rate,
+			serialized_sparse,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// DecodeAndCropJpegAttr is an optional argument to DecodeAndCropJpeg.
-type DecodeAndCropJpegAttr func(optionalAttr)
-
-// DecodeAndCropJpegChannels sets the optional channels attribute to value.
-//
-// value: Number of color channels for the decoded image.
-// If not specified, defaults to 0
-func DecodeAndCropJpegChannels(value int64) DecodeAndCropJpegAttr {
-	return func(m optionalAttr) {
-		m["channels"] = value
-	}
-}
-
-// DecodeAndCropJpegRatio sets the optional ratio attribute to value.
-//
-// value: Downscaling ratio.
-// If not specified, defaults to 1
-func DecodeAndCropJpegRatio(value int64) DecodeAndCropJpegAttr {
-	return func(m optionalAttr) {
-		m["ratio"] = value
-	}
-}
-
-// DecodeAndCropJpegFancyUpscaling sets the optional fancy_upscaling attribute to value.
-//
-// value: If true use a slower but nicer upscaling of the
-// chroma planes (yuv420/422 only).
-// If not specified, defaults to true
-func DecodeAndCropJpegFancyUpscaling(value bool) DecodeAndCropJpegAttr {
-	return func(m optionalAttr) {
-		m["fancy_upscaling"] = value
-	}
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// DecodeAndCropJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value.
-//
-// value: If true try to recover an image from truncated input.
-// If not specified, defaults to false
-func DecodeAndCropJpegTryRecoverTruncated(value bool) DecodeAndCropJpegAttr {
-	return func(m optionalAttr) {
-		m["try_recover_truncated"] = value
-	}
-}
+// SqueezeAttr is an optional argument to Squeeze.
+type SqueezeAttr func(optionalAttr)
 
-// DecodeAndCropJpegAcceptableFraction sets the optional acceptable_fraction attribute to value.
+// SqueezeAxis sets the optional axis attribute to value.
 //
-// value: The minimum required fraction of lines before a truncated
-// input is accepted.
-// If not specified, defaults to 1
-func DecodeAndCropJpegAcceptableFraction(value float32) DecodeAndCropJpegAttr {
-	return func(m optionalAttr) {
-		m["acceptable_fraction"] = value
-	}
-}
-
-// DecodeAndCropJpegDctMethod sets the optional dct_method attribute to value.
+// value: If specified, only squeezes the dimensions listed. The dimension
+// index starts at 0. It is an error to squeeze a dimension that is not 1. Must
+// be in the range `[-rank(input), rank(input))`.
+// If not specified, defaults to <>
 //
-// value: string specifying a hint about the algorithm used for
-// decompression.  Defaults to "" which maps to a system-specific
-// default.  Currently valid values are ["INTEGER_FAST",
-// "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
-// jpeg library changes to a version that does not have that specific
-// option.)
-// If not specified, defaults to ""
-func DecodeAndCropJpegDctMethod(value string) DecodeAndCropJpegAttr {
+// REQUIRES: len(value) >= 0
+func SqueezeAxis(value []int64) SqueezeAttr {
 	return func(m optionalAttr) {
-		m["dct_method"] = value
+		m["squeeze_dims"] = value
 	}
 }
 
-// Decode and Crop a JPEG-encoded image to a uint8 tensor.
-//
-// The attr `channels` indicates the desired number of color channels for the
-// decoded image.
-//
-// Accepted values are:
+// Removes dimensions of size 1 from the shape of a tensor.
 //
-// *   0: Use the number of channels in the JPEG-encoded image.
-// *   1: output a grayscale image.
-// *   3: output an RGB image.
+// Given a tensor `input`, this operation returns a tensor of the same type with
+// all dimensions of size 1 removed. If you don't want to remove all size 1
+// dimensions, you can remove specific size 1 dimensions by specifying
+// `axis`.
 //
-// If needed, the JPEG-encoded image is transformed to match the requested number
-// of color channels.
+// For example:
 //
-// The attr `ratio` allows downscaling the image by an integer factor during
-// decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
-// downscaling the image later.
+// ```
+// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
+// shape(squeeze(t)) ==> [2, 3]
+// ```
 //
+// Or, to remove specific size 1 dimensions:
 //
-// It is equivalent to a combination of decode and crop, but much faster by only
-// decoding partial jpeg image.
+// ```
+// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
+// shape(squeeze(t, [2, 4])) ==> [1, 2, 3, 1]
+// ```
 //
 // Arguments:
-//	contents: 0-D.  The JPEG-encoded image.
-//	crop_window: 1-D.  The crop window: [crop_y, crop_x, crop_height, crop_width].
+//	input: The `input` to squeeze.
 //
-// Returns 3-D with shape `[height, width, channels]`..
-func DecodeAndCropJpeg(scope *Scope, contents tf.Output, crop_window tf.Output, optional ...DecodeAndCropJpegAttr) (image tf.Output) {
+// Returns Contains the same data as `input`, but has one or more dimensions of
+// size 1 removed.
+func Squeeze(scope *Scope, input tf.Output, optional ...SqueezeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -15789,9 +15482,9 @@ func DecodeAndCropJpeg(scope *Scope, contents tf.Output, crop_window tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeAndCropJpeg",
+		Type: "Squeeze",
 		Input: []tf.Input{
-			contents, crop_window,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -15799,105 +15492,98 @@ func DecodeAndCropJpeg(scope *Scope, contents tf.Output, crop_window tf.Output,
 	return op.Output(0)
 }
 
-// Adds two `SparseTensor` objects to produce another `SparseTensor`.
-//
-// The input `SparseTensor` objects' indices are assumed ordered in standard
-// lexicographic order.  If this is not the case, before this step run
-// `SparseReorder` to restore index ordering.
+// ResourceApplyAdadeltaAttr is an optional argument to ResourceApplyAdadelta.
+type ResourceApplyAdadeltaAttr func(optionalAttr)
+
+// ResourceApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
 //
-// By default, if two values sum to zero at some index, the output `SparseTensor`
-// would still include that particular location in its index, storing a zero in the
-// corresponding value slot.  To override this, callers can specify `thresh`,
-// indicating that if the sum has a magnitude strictly smaller than `thresh`, its
-// corresponding value and index would then not be included.  In particular,
-// `thresh == 0` (default) means everything is kept and actual thresholding happens
-// only for a positive value.
+// value: If True, updating of the var, accum and update_accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceApplyAdadeltaUseLocking(value bool) ResourceApplyAdadeltaAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the adadelta scheme.
 //
-// In the following shapes, `nnz` is the count after taking `thresh` into account.
+// accum = rho() * accum + (1 - rho()) * grad.square();
+// update = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad;
+// update_accum = rho() * update_accum + (1 - rho()) * update.square();
+// var -= update;
 //
 // Arguments:
-//	a_indices: 2-D.  The `indices` of the first `SparseTensor`, size `[nnz, ndims]` Matrix.
-//	a_values: 1-D.  The `values` of the first `SparseTensor`, size `[nnz]` Vector.
-//	a_shape: 1-D.  The `shape` of the first `SparseTensor`, size `[ndims]` Vector.
-//	b_indices: 2-D.  The `indices` of the second `SparseTensor`, size `[nnz, ndims]` Matrix.
-//	b_values: 1-D.  The `values` of the second `SparseTensor`, size `[nnz]` Vector.
-//	b_shape: 1-D.  The `shape` of the second `SparseTensor`, size `[ndims]` Vector.
-//	thresh: 0-D.  The magnitude threshold that determines if an output value/index
-// pair takes space.
-func SparseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output, thresh tf.Output) (sum_indices tf.Output, sum_values tf.Output, sum_shape tf.Output) {
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	accum_update: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay factor. Must be a scalar.
+//	epsilon: Constant factor. Must be a scalar.
+//	grad: The gradient.
+//
+// Returns the created operation.
+func ResourceApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdadeltaAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseAdd",
+		Type: "ResourceApplyAdadelta",
 		Input: []tf.Input{
-			a_indices, a_values, a_shape, b_indices, b_values, b_shape, thresh,
+			var_, accum, accum_update, lr, rho, epsilon, grad,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// LRNAttr is an optional argument to LRN.
-type LRNAttr func(optionalAttr)
-
-// LRNDepthRadius sets the optional depth_radius attribute to value.
-//
-// value: 0-D.  Half-width of the 1-D normalization window.
-// If not specified, defaults to 5
-func LRNDepthRadius(value int64) LRNAttr {
-	return func(m optionalAttr) {
-		m["depth_radius"] = value
-	}
-}
-
-// LRNBias sets the optional bias attribute to value.
-//
-// value: An offset (usually positive to avoid dividing by 0).
-// If not specified, defaults to 1
-func LRNBias(value float32) LRNAttr {
-	return func(m optionalAttr) {
-		m["bias"] = value
-	}
+	return scope.AddOperation(opspec)
 }
 
-// LRNAlpha sets the optional alpha attribute to value.
-//
-// value: A scale factor, usually positive.
-// If not specified, defaults to 1
-func LRNAlpha(value float32) LRNAttr {
-	return func(m optionalAttr) {
-		m["alpha"] = value
-	}
-}
+// NonMaxSuppressionAttr is an optional argument to NonMaxSuppression.
+type NonMaxSuppressionAttr func(optionalAttr)
 
-// LRNBeta sets the optional beta attribute to value.
+// NonMaxSuppressionIouThreshold sets the optional iou_threshold attribute to value.
 //
-// value: An exponent.
+// value: A float representing the threshold for deciding whether boxes
+// overlap too much with respect to IOU.
 // If not specified, defaults to 0.5
-func LRNBeta(value float32) LRNAttr {
+func NonMaxSuppressionIouThreshold(value float32) NonMaxSuppressionAttr {
 	return func(m optionalAttr) {
-		m["beta"] = value
+		m["iou_threshold"] = value
 	}
 }
 
-// Local Response Normalization.
-//
-// The 4-D `input` tensor is treated as a 3-D array of 1-D vectors (along the last
-// dimension), and each vector is normalized independently.  Within a given vector,
-// each component is divided by the weighted, squared sum of inputs within
-// `depth_radius`.  In detail,
-//
-//     sqr_sum[a, b, c, d] =
-//         sum(input[a, b, c, d - depth_radius : d + depth_radius + 1] ** 2)
-//     output = input / (bias + alpha * sqr_sum) ** beta
+// Greedily selects a subset of bounding boxes in descending order of score,
 //
-// For details, see [Krizhevsky et al., ImageNet classification with deep
-// convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks).
+// pruning away boxes that have high intersection-over-union (IOU) overlap
+// with previously selected boxes.  Bounding boxes are supplied as
+// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
+// diagonal pair of box corners and the coordinates can be provided as normalized
+// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
+// is agnostic to where the origin is in the coordinate system.  Note that this
+// algorithm is invariant to orthogonal transformations and translations
+// of the coordinate system; thus translating or reflections of the coordinate
+// system result in the same boxes being selected by the algorithm.
+// The output of this operation is a set of integers indexing into the input
+// collection of bounding boxes representing the selected boxes.  The bounding
+// box coordinates corresponding to the selected indices can then be obtained
+// using the `tf.gather operation`.  For example:
+//   selected_indices = tf.image.non_max_suppression(
+//       boxes, scores, max_output_size, iou_threshold)
+//   selected_boxes = tf.gather(boxes, selected_indices)
 //
 // Arguments:
-//	input: 4-D.
-func LRN(scope *Scope, input tf.Output, optional ...LRNAttr) (output tf.Output) {
+//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
+//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
+// score corresponding to each box (each row of boxes).
+//	max_output_size: A scalar integer tensor representing the maximum number of
+// boxes to be selected by non max suppression.
+//
+// Returns A 1-D integer tensor of shape `[M]` representing the selected
+// indices from the boxes tensor, where `M <= max_output_size`.
+func NonMaxSuppression(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, optional ...NonMaxSuppressionAttr) (selected_indices tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -15906,9 +15592,9 @@ func LRN(scope *Scope, input tf.Output, optional ...LRNAttr) (output tf.Output)
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "LRN",
+		Type: "NonMaxSuppression",
 		Input: []tf.Input{
-			input,
+			boxes, scores, max_output_size,
 		},
 		Attrs: attrs,
 	}
@@ -15916,16 +15602,16 @@ func LRN(scope *Scope, input tf.Output, optional ...LRNAttr) (output tf.Output)
 	return op.Output(0)
 }
 
-// Creates a dataset that zips together `input_datasets`.
-func ZipDataset(scope *Scope, input_datasets []tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Creates a dataset that emits `components` as a tuple of tensors once.
+func TensorDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ZipDataset",
+		Type: "TensorDataset",
 		Input: []tf.Input{
-			tf.OutputList(input_datasets),
+			tf.OutputList(components),
 		},
 		Attrs: attrs,
 	}
@@ -15933,146 +15619,174 @@ func ZipDataset(scope *Scope, input_datasets []tf.Output, output_types []tf.Data
 	return op.Output(0)
 }
 
-// ResourceSparseApplyAdagradAttr is an optional argument to ResourceSparseApplyAdagrad.
-type ResourceSparseApplyAdagradAttr func(optionalAttr)
-
-// ResourceSparseApplyAdagradUseLocking sets the optional use_locking attribute to value.
+// Component-wise multiplies a SparseTensor by a dense Tensor.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyAdagradUseLocking(value bool) ResourceSparseApplyAdagradAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
+// The output locations corresponding to the implicitly zero elements in the sparse
+// tensor will be zero (i.e., will not take up storage space), regardless of the
+// contents of the dense tensor (even if it's +/-INF and that INF*0 == NaN).
+//
+// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
+// the other direction.
+//
+// Arguments:
+//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//	dense: `R`-D.  The dense Tensor operand.
+//
+// Returns 1-D.  The `N` values that are operated on.
+func SparseDenseCwiseMul(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// ResourceSparseApplyAdagradUpdateSlots sets the optional update_slots attribute to value.
-// If not specified, defaults to true
-func ResourceSparseApplyAdagradUpdateSlots(value bool) ResourceSparseApplyAdagradAttr {
-	return func(m optionalAttr) {
-		m["update_slots"] = value
+	opspec := tf.OpSpec{
+		Type: "SparseDenseCwiseMul",
+		Input: []tf.Input{
+			sp_indices, sp_values, sp_shape, dense,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Update relevant entries in '*var' and '*accum' according to the adagrad scheme.
+// 2D real-valued fast Fourier transform.
 //
-// That is for rows we have grad for, we update var and accum as follows:
-// accum += grad * grad
-// var -= lr * grad * (1 / sqrt(accum))
+// Computes the 2-dimensional discrete Fourier transform of a real-valued signal
+// over the inner-most 2 dimensions of `input`.
+//
+// Since the DFT of a real signal is Hermitian-symmetric, `RFFT2D` only returns the
+// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
+// of `output`: the zero-frequency term, followed by the `fft_length / 2`
+// positive-frequency terms.
+//
+// Along each axis `RFFT2D` is computed on, if `fft_length` is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
+//	input: A float32 tensor.
+//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
 //
-// Returns the created operation.
-func ResourceSparseApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdagradAttr) (o *tf.Operation) {
+// Returns A complex64 tensor of the same rank as `input`. The inner-most 2
+//   dimensions of `input` are replaced with their 2D Fourier transform. The
+//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
+//   components.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.rfft2
+// @end_compatibility
+func RFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyAdagrad",
+		Type: "RFFT2D",
 		Input: []tf.Input{
-			var_, accum, lr, grad, indices,
+			input, fft_length,
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Elementwise computes the bitwise right-shift of `x` and `y`.
+// Pads a tensor with zeros.
 //
-// Performs a logical shift for unsigned integer types, and an arithmetic shift
-// for signed integer types.
+// This operation pads a `input` with zeros according to the `paddings` you
+// specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the
+// rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
+// how many zeros to add before the contents of `input` in that dimension, and
+// `paddings[D, 1]` indicates how many zeros to add after the contents of `input`
+// in that dimension.
 //
-// If `y` is negative, or greater than or equal to than the width of `x` in bits
-// the result is implementation defined.
-func RightShift(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// The padded size of each dimension D of the output is:
+//
+// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
+//
+// For example:
+//
+// ```
+// # 't' is [[1, 1], [2, 2]]
+// # 'paddings' is [[1, 1], [2, 2]]
+// # rank of 't' is 2
+// pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
+//                       [0, 0, 1, 1, 0, 0]
+//                       [0, 0, 2, 2, 0, 0]
+//                       [0, 0, 0, 0, 0, 0]]
+// ```
+//
+func Pad(scope *Scope, input tf.Output, paddings tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RightShift",
+		Type: "Pad",
 		Input: []tf.Input{
-			x, y,
+			input, paddings,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TensorListStackAttr is an optional argument to TensorListStack.
-type TensorListStackAttr func(optionalAttr)
-
-// TensorListStackNumElements sets the optional num_elements attribute to value.
-// If not specified, defaults to -1
-func TensorListStackNumElements(value int64) TensorListStackAttr {
-	return func(m optionalAttr) {
-		m["num_elements"] = value
-	}
-}
-
-// Stacks all tensors in the list.
-//
-// Requires that all tensors have the same shape.
+// Checks whether a resource handle-based variable has been initialized.
 //
-// input_handle: the input list
-// tensor: the gathered result
-// num_elements: optional. If not -1, the number of elements in the list.
+// Arguments:
+//	resource: the input resource handle.
 //
-func TensorListStack(scope *Scope, input_handle tf.Output, element_dtype tf.DataType, optional ...TensorListStackAttr) (tensor tf.Output) {
+// Returns a scalar boolean which is true if the variable has been
+// initialized.
+func VarIsInitializedOp(scope *Scope, resource tf.Output) (is_initialized tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"element_dtype": element_dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "TensorListStack",
+		Type: "VarIsInitializedOp",
 		Input: []tf.Input{
-			input_handle,
+			resource,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// StatelessRandomUniformAttr is an optional argument to StatelessRandomUniform.
-type StatelessRandomUniformAttr func(optionalAttr)
+// ResourceSparseApplyFtrlAttr is an optional argument to ResourceSparseApplyFtrl.
+type ResourceSparseApplyFtrlAttr func(optionalAttr)
 
-// StatelessRandomUniformDtype sets the optional dtype attribute to value.
+// ResourceSparseApplyFtrlUseLocking sets the optional use_locking attribute to value.
 //
-// value: The type of the output.
-// If not specified, defaults to DT_FLOAT
-func StatelessRandomUniformDtype(value tf.DataType) StatelessRandomUniformAttr {
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyFtrlUseLocking(value bool) ResourceSparseApplyFtrlAttr {
 	return func(m optionalAttr) {
-		m["dtype"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Outputs deterministic pseudorandom random values from a uniform distribution.
-//
-// The generated values follow a uniform distribution in the range `[0, 1)`. The
-// lower bound 0 is included in the range, while the upper bound 1 is excluded.
+// Update relevant entries in '*var' according to the Ftrl-proximal scheme.
 //
-// The outputs are a deterministic function of `shape` and `seed`.
+// That is for rows we have grad for, we update var, accum and linear as follows:
+// accum_new = accum + grad * grad
+// linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	seed: 2 seeds (shape [2]).
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	linear: Should be from a Variable().
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	lr_power: Scaling factor. Must be a scalar.
 //
-// Returns Random values with specified shape.
-func StatelessRandomUniform(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomUniformAttr) (output tf.Output) {
+// Returns the created operation.
+func ResourceSparseApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -16081,206 +15795,171 @@ func StatelessRandomUniform(scope *Scope, shape tf.Output, seed tf.Output, optio
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StatelessRandomUniform",
+		Type: "ResourceSparseApplyFtrl",
 		Input: []tf.Input{
-			shape, seed,
+			var_, accum, linear, grad, indices, lr, l1, l2, lr_power,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Makes its input available to the next iteration.
-//
-// Arguments:
-//	data: The tensor to be made available to the next iteration.
+// Returns which elements of x are Inf.
 //
-// Returns The same tensor as `data`.
-func NextIteration(scope *Scope, data tf.Output) (output tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.isinf
+// @end_compatibility
+func IsInf(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "NextIteration",
+		Type: "IsInf",
 		Input: []tf.Input{
-			data,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Output a fact about factorials.
-func Fact(scope *Scope) (fact tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Fact",
+// TruncatedNormalAttr is an optional argument to TruncatedNormal.
+type TruncatedNormalAttr func(optionalAttr)
+
+// TruncatedNormalSeed sets the optional seed attribute to value.
+//
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func TruncatedNormalSeed(value int64) TruncatedNormalAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Deserialize `SparseTensor` objects.
-//
-// The input `serialized_sparse` must have the shape `[?, ?, ..., ?, 3]` where
-// the last dimension stores serialized `SparseTensor` objects and the other N
-// dimensions (N >= 0) correspond to a batch. The ranks of the original
-// `SparseTensor` objects must all match. When the final `SparseTensor` is
-// created, its rank is the rank of the incoming `SparseTensor` objects plus N;
-// the sparse tensors have been concatenated along new dimensions, one for each
-// batch.
-//
-// The output `SparseTensor` object's shape values for the original dimensions
-// are the max across the input `SparseTensor` objects' shape values for the
-// corresponding dimensions. The new dimensions match the size of the batch.
-//
-// The input `SparseTensor` objects' indices are assumed ordered in
-// standard lexicographic order.  If this is not the case, after this
-// step run `SparseReorder` to restore index ordering.
-//
-// For example, if the serialized input is a `[2 x 3]` matrix representing two
-// original `SparseTensor` objects:
-//
-//     index = [ 0]
-//             [10]
-//             [20]
-//     values = [1, 2, 3]
-//     shape = [50]
-//
-// and
-//
-//     index = [ 2]
-//             [10]
-//     values = [4, 5]
-//     shape = [30]
+// TruncatedNormalSeed2 sets the optional seed2 attribute to value.
 //
-// then the final deserialized `SparseTensor` will be:
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func TruncatedNormalSeed2(value int64) TruncatedNormalAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Outputs random values from a truncated normal distribution.
 //
-//     index = [0  0]
-//             [0 10]
-//             [0 20]
-//             [1  2]
-//             [1 10]
-//     values = [1, 2, 3, 4, 5]
-//     shape = [2 50]
+// The generated values follow a normal distribution with mean 0 and standard
+// deviation 1, except that values whose magnitude is more than 2 standard
+// deviations from the mean are dropped and re-picked.
 //
 // Arguments:
-//	serialized_sparse: The serialized `SparseTensor` objects. The last dimension
-// must have 3 columns.
-//	dtype: The `dtype` of the serialized `SparseTensor` objects.
-func DeserializeSparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
+//	shape: The shape of the output tensor.
+//	dtype: The type of the output.
+//
+// Returns A tensor of the specified shape filled with random truncated normal
+// values.
+func TruncatedNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...TruncatedNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "DeserializeSparse",
+		Type: "TruncatedNormal",
 		Input: []tf.Input{
-			serialized_sparse,
+			shape,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// SqueezeAttr is an optional argument to Squeeze.
-type SqueezeAttr func(optionalAttr)
+// SkipgramAttr is an optional argument to Skipgram.
+type SkipgramAttr func(optionalAttr)
 
-// SqueezeAxis sets the optional axis attribute to value.
-//
-// value: If specified, only squeezes the dimensions listed. The dimension
-// index starts at 0. It is an error to squeeze a dimension that is not 1. Must
-// be in the range `[-rank(input), rank(input))`.
-// If not specified, defaults to <>
+// SkipgramWindowSize sets the optional window_size attribute to value.
 //
-// REQUIRES: len(value) >= 0
-func SqueezeAxis(value []int64) SqueezeAttr {
+// value: The number of words to predict to the left and right of the target.
+// If not specified, defaults to 5
+func SkipgramWindowSize(value int64) SkipgramAttr {
 	return func(m optionalAttr) {
-		m["squeeze_dims"] = value
+		m["window_size"] = value
 	}
 }
 
-// Removes dimensions of size 1 from the shape of a tensor.
-//
-// Given a tensor `input`, this operation returns a tensor of the same type with
-// all dimensions of size 1 removed. If you don't want to remove all size 1
-// dimensions, you can remove specific size 1 dimensions by specifying
-// `axis`.
-//
-// For example:
+// SkipgramMinCount sets the optional min_count attribute to value.
 //
-// ```
-// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
-// shape(squeeze(t)) ==> [2, 3]
-// ```
+// value: The minimum number of word occurrences for it to be included in the
+// vocabulary.
+// If not specified, defaults to 5
+func SkipgramMinCount(value int64) SkipgramAttr {
+	return func(m optionalAttr) {
+		m["min_count"] = value
+	}
+}
+
+// SkipgramSubsample sets the optional subsample attribute to value.
 //
-// Or, to remove specific size 1 dimensions:
+// value: Threshold for word occurrence. Words that appear with higher
+// frequency will be randomly down-sampled. Set to 0 to disable.
+// If not specified, defaults to 0.001
+func SkipgramSubsample(value float32) SkipgramAttr {
+	return func(m optionalAttr) {
+		m["subsample"] = value
+	}
+}
+
+// Parses a text file and creates a batch of examples.
 //
-// ```
-// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
-// shape(squeeze(t, [2, 4])) ==> [1, 2, 3, 1]
-// ```
+// DEPRECATED at GraphDef version 19: Moving word2vec into tensorflow_models/tutorials and deprecating its ops here as a result
 //
 // Arguments:
-//	input: The `input` to squeeze.
+//	filename: The corpus's text file name.
+//	batch_size: The size of produced batch.
 //
-// Returns Contains the same data as `input`, but has one or more dimensions of
-// size 1 removed.
-func Squeeze(scope *Scope, input tf.Output, optional ...SqueezeAttr) (output tf.Output) {
+// Returns A vector of words in the corpus.Frequencies of words. Sorted in the non-ascending order.Number of words per epoch in the data file.The current epoch number.The total number of words processed so far.A vector of word ids.A vector of word ids.
+func Skipgram(scope *Scope, filename string, batch_size int64, optional ...SkipgramAttr) (vocab_word tf.Output, vocab_freq tf.Output, words_per_epoch tf.Output, current_epoch tf.Output, total_words_processed tf.Output, examples tf.Output, labels tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"filename": filename, "batch_size": batch_size}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Squeeze",
-		Input: []tf.Input{
-			input,
-		},
+		Type: "Skipgram",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4), op.Output(5), op.Output(6)
 }
 
-// ResourceApplyAdadeltaAttr is an optional argument to ResourceApplyAdadelta.
-type ResourceApplyAdadeltaAttr func(optionalAttr)
+// StringToNumberAttr is an optional argument to StringToNumber.
+type StringToNumberAttr func(optionalAttr)
 
-// ResourceApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
+// StringToNumberOutType sets the optional out_type attribute to value.
 //
-// value: If True, updating of the var, accum and update_accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceApplyAdadeltaUseLocking(value bool) ResourceApplyAdadeltaAttr {
+// value: The numeric type to interpret each string in `string_tensor` as.
+// If not specified, defaults to DT_FLOAT
+func StringToNumberOutType(value tf.DataType) StringToNumberAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["out_type"] = value
 	}
 }
 
-// Update '*var' according to the adadelta scheme.
-//
-// accum = rho() * accum + (1 - rho()) * grad.square();
-// update = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad;
-// update_accum = rho() * update_accum + (1 - rho()) * update.square();
-// var -= update;
+// Converts each string in the input Tensor to the specified numeric type.
 //
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	accum_update: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay factor. Must be a scalar.
-//	epsilon: Constant factor. Must be a scalar.
-//	grad: The gradient.
+// (Note that int32 overflow results in an error while float overflow
+// results in a rounded value.)
 //
-// Returns the created operation.
-func ResourceApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdadeltaAttr) (o *tf.Operation) {
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToNumber(scope *Scope, string_tensor tf.Output, optional ...StringToNumberAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -16289,58 +15968,54 @@ func ResourceApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdadelta",
+		Type: "StringToNumber",
 		Input: []tf.Input{
-			var_, accum, accum_update, lr, rho, epsilon, grad,
+			string_tensor,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// NonMaxSuppressionAttr is an optional argument to NonMaxSuppression.
-type NonMaxSuppressionAttr func(optionalAttr)
+// ResourceApplyFtrlV2Attr is an optional argument to ResourceApplyFtrlV2.
+type ResourceApplyFtrlV2Attr func(optionalAttr)
 
-// NonMaxSuppressionIouThreshold sets the optional iou_threshold attribute to value.
+// ResourceApplyFtrlV2UseLocking sets the optional use_locking attribute to value.
 //
-// value: A float representing the threshold for deciding whether boxes
-// overlap too much with respect to IOU.
-// If not specified, defaults to 0.5
-func NonMaxSuppressionIouThreshold(value float32) NonMaxSuppressionAttr {
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyFtrlV2UseLocking(value bool) ResourceApplyFtrlV2Attr {
 	return func(m optionalAttr) {
-		m["iou_threshold"] = value
-	}
-}
-
-// Greedily selects a subset of bounding boxes in descending order of score,
-//
-// pruning away boxes that have high intersection-over-union (IOU) overlap
-// with previously selected boxes.  Bounding boxes are supplied as
-// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
-// diagonal pair of box corners and the coordinates can be provided as normalized
-// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
-// is agnostic to where the origin is in the coordinate system.  Note that this
-// algorithm is invariant to orthogonal transformations and translations
-// of the coordinate system; thus translating or reflections of the coordinate
-// system result in the same boxes being selected by the algorithm.
-// The output of this operation is a set of integers indexing into the input
-// collection of bounding boxes representing the selected boxes.  The bounding
-// box coordinates corresponding to the selected indices can then be obtained
-// using the `tf.gather operation`.  For example:
-//   selected_indices = tf.image.non_max_suppression(
-//       boxes, scores, max_output_size, iou_threshold)
-//   selected_boxes = tf.gather(boxes, selected_indices)
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the Ftrl-proximal scheme.
+//
+// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
+// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
+// linear += grad_with_shrinkage +
+//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
 //
 // Arguments:
-//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
-//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
-// score corresponding to each box (each row of boxes).
-//	max_output_size: A scalar integer tensor representing the maximum number of
-// boxes to be selected by non max suppression.
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	linear: Should be from a Variable().
+//	grad: The gradient.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regulariation. Must be a scalar.
+//	l2: L2 shrinkage regulariation. Must be a scalar.
 //
-// Returns A 1-D integer tensor of shape `[M]` representing the selected
-// indices from the boxes tensor, where `M <= max_output_size`.
-func NonMaxSuppression(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, optional ...NonMaxSuppressionAttr) (selected_indices tf.Output) {
+//	lr_power: Scaling factor. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -16349,162 +16024,255 @@ func NonMaxSuppression(scope *Scope, boxes tf.Output, scores tf.Output, max_outp
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "NonMaxSuppression",
+		Type: "ResourceApplyFtrlV2",
 		Input: []tf.Input{
-			boxes, scores, max_output_size,
+			var_, accum, linear, grad, lr, l1, l2, l2_shrinkage, lr_power,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Creates a dataset that emits `components` as a tuple of tensors once.
-func TensorDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
+// EncodeJpegAttr is an optional argument to EncodeJpeg.
+type EncodeJpegAttr func(optionalAttr)
+
+// EncodeJpegFormat sets the optional format attribute to value.
+//
+// value: Per pixel image format.
+// If not specified, defaults to ""
+func EncodeJpegFormat(value string) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["format"] = value
 	}
-	attrs := map[string]interface{}{"output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "TensorDataset",
-		Input: []tf.Input{
-			tf.OutputList(components),
-		},
-		Attrs: attrs,
+}
+
+// EncodeJpegQuality sets the optional quality attribute to value.
+//
+// value: Quality of the compression from 0 to 100 (higher is better and slower).
+// If not specified, defaults to 95
+func EncodeJpegQuality(value int64) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["quality"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Component-wise multiplies a SparseTensor by a dense Tensor.
+// EncodeJpegProgressive sets the optional progressive attribute to value.
 //
-// The output locations corresponding to the implicitly zero elements in the sparse
-// tensor will be zero (i.e., will not take up storage space), regardless of the
-// contents of the dense tensor (even if it's +/-INF and that INF*0 == NaN).
+// value: If True, create a JPEG that loads progressively (coarse to fine).
+// If not specified, defaults to false
+func EncodeJpegProgressive(value bool) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["progressive"] = value
+	}
+}
+
+// EncodeJpegOptimizeSize sets the optional optimize_size attribute to value.
 //
-// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
-// the other direction.
+// value: If True, spend CPU/RAM to reduce size with no quality change.
+// If not specified, defaults to false
+func EncodeJpegOptimizeSize(value bool) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["optimize_size"] = value
+	}
+}
+
+// EncodeJpegChromaDownsampling sets the optional chroma_downsampling attribute to value.
 //
-// Arguments:
-//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
-//	dense: `R`-D.  The dense Tensor operand.
+// value: See http://en.wikipedia.org/wiki/Chroma_subsampling.
+// If not specified, defaults to true
+func EncodeJpegChromaDownsampling(value bool) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["chroma_downsampling"] = value
+	}
+}
+
+// EncodeJpegDensityUnit sets the optional density_unit attribute to value.
 //
-// Returns 1-D.  The `N` values that are operated on.
-func SparseDenseCwiseMul(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: Unit used to specify `x_density` and `y_density`:
+// pixels per inch (`'in'`) or centimeter (`'cm'`).
+// If not specified, defaults to "in"
+func EncodeJpegDensityUnit(value string) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["density_unit"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "SparseDenseCwiseMul",
-		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape, dense,
-		},
+}
+
+// EncodeJpegXDensity sets the optional x_density attribute to value.
+//
+// value: Horizontal pixels per density unit.
+// If not specified, defaults to 300
+func EncodeJpegXDensity(value int64) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["x_density"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// 2D real-valued fast Fourier transform.
+// EncodeJpegYDensity sets the optional y_density attribute to value.
 //
-// Computes the 2-dimensional discrete Fourier transform of a real-valued signal
-// over the inner-most 2 dimensions of `input`.
+// value: Vertical pixels per density unit.
+// If not specified, defaults to 300
+func EncodeJpegYDensity(value int64) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["y_density"] = value
+	}
+}
+
+// EncodeJpegXmpMetadata sets the optional xmp_metadata attribute to value.
 //
-// Since the DFT of a real signal is Hermitian-symmetric, `RFFT2D` only returns the
-// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
-// of `output`: the zero-frequency term, followed by the `fft_length / 2`
-// positive-frequency terms.
+// value: If not empty, embed this XMP metadata in the image header.
+// If not specified, defaults to ""
+func EncodeJpegXmpMetadata(value string) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["xmp_metadata"] = value
+	}
+}
+
+// JPEG-encode an image.
 //
-// Along each axis `RFFT2D` is computed on, if `fft_length` is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
+// `image` is a 3-D uint8 Tensor of shape `[height, width, channels]`.
 //
-// Arguments:
-//	input: A float32 tensor.
-//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
+// The attr `format` can be used to override the color format of the encoded
+// output.  Values can be:
 //
-// Returns A complex64 tensor of the same rank as `input`. The inner-most 2
-//   dimensions of `input` are replaced with their 2D Fourier transform. The
-//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
-//   components.
+// *   `''`: Use a default format based on the number of channels in the image.
+// *   `grayscale`: Output a grayscale JPEG image.  The `channels` dimension
+//     of `image` must be 1.
+// *   `rgb`: Output an RGB JPEG image. The `channels` dimension
+//     of `image` must be 3.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.rfft2
-// @end_compatibility
-func RFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// If `format` is not specified or is the empty string, a default format is picked
+// in function of the number of channels in `image`:
+//
+// *   1: Output a grayscale image.
+// *   3: Output an RGB image.
+//
+// Arguments:
+//	image: 3-D with shape `[height, width, channels]`.
+//
+// Returns 0-D. JPEG-encoded image.
+func EncodeJpeg(scope *Scope, image tf.Output, optional ...EncodeJpegAttr) (contents tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "RFFT2D",
+		Type: "EncodeJpeg",
 		Input: []tf.Input{
-			input, fft_length,
+			image,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Pads a tensor with zeros.
-//
-// This operation pads a `input` with zeros according to the `paddings` you
-// specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the
-// rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
-// how many zeros to add before the contents of `input` in that dimension, and
-// `paddings[D, 1]` indicates how many zeros to add after the contents of `input`
-// in that dimension.
-//
-// The padded size of each dimension D of the output is:
+// MultinomialAttr is an optional argument to Multinomial.
+type MultinomialAttr func(optionalAttr)
+
+// MultinomialSeed sets the optional seed attribute to value.
 //
-// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
+// value: If either seed or seed2 is set to be non-zero, the internal random number
+// generator is seeded by the given seed.  Otherwise, a random seed is used.
+// If not specified, defaults to 0
+func MultinomialSeed(value int64) MultinomialAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// MultinomialSeed2 sets the optional seed2 attribute to value.
 //
-// For example:
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func MultinomialSeed2(value int64) MultinomialAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// MultinomialOutputDtype sets the optional output_dtype attribute to value.
+// If not specified, defaults to DT_INT64
+func MultinomialOutputDtype(value tf.DataType) MultinomialAttr {
+	return func(m optionalAttr) {
+		m["output_dtype"] = value
+	}
+}
+
+// Draws samples from a multinomial distribution.
 //
-// ```
-// # 't' is [[1, 1], [2, 2]]
-// # 'paddings' is [[1, 1], [2, 2]]
-// # rank of 't' is 2
-// pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
-//                       [0, 0, 1, 1, 0, 0]
-//                       [0, 0, 2, 2, 0, 0]
-//                       [0, 0, 0, 0, 0, 0]]
-// ```
+// Arguments:
+//	logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
+// represents the unnormalized log probabilities for all classes.
+//	num_samples: 0-D.  Number of independent samples to draw for each row slice.
 //
-func Pad(scope *Scope, input tf.Output, paddings tf.Output) (output tf.Output) {
+// Returns 2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
+// contains the drawn class labels with range `[0, num_classes)`.
+func Multinomial(scope *Scope, logits tf.Output, num_samples tf.Output, optional ...MultinomialAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Pad",
+		Type: "Multinomial",
 		Input: []tf.Input{
-			input, paddings,
+			logits, num_samples,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Checks whether a resource handle-based variable has been initialized.
+// ResourceSparseApplyAdagradDAAttr is an optional argument to ResourceSparseApplyAdagradDA.
+type ResourceSparseApplyAdagradDAAttr func(optionalAttr)
+
+// ResourceSparseApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
+//
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceSparseApplyAdagradDAUseLocking(value bool) ResourceSparseApplyAdagradDAAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update entries in '*var' and '*accum' according to the proximal adagrad scheme.
 //
 // Arguments:
-//	resource: the input resource handle.
+//	var_: Should be from a Variable().
+//	gradient_accumulator: Should be from a Variable().
+//	gradient_squared_accumulator: Should be from a Variable().
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//	lr: Learning rate. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	global_step: Training step number. Must be a scalar.
 //
-// Returns a scalar boolean which is true if the variable has been
-// initialized.
-func VarIsInitializedOp(scope *Scope, resource tf.Output) (is_initialized tf.Output) {
+// Returns the created operation.
+func ResourceSparseApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceSparseApplyAdagradDAAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "VarIsInitializedOp",
+		Type: "ResourceSparseApplyAdagradDA",
 		Input: []tf.Input{
-			resource,
+			var_, gradient_accumulator, gradient_squared_accumulator, grad, indices, lr, l1, l2, global_step,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
 // Converts each string in the input Tensor to its hash mod by a number of buckets.
@@ -16966,222 +16734,38 @@ type SparseTensorDenseMatMulAttr func(optionalAttr)
 func SparseTensorDenseMatMulAdjointA(value bool) SparseTensorDenseMatMulAttr {
 	return func(m optionalAttr) {
 		m["adjoint_a"] = value
-	}
-}
-
-// SparseTensorDenseMatMulAdjointB sets the optional adjoint_b attribute to value.
-//
-// value: Use the adjoint of B in the matrix multiply.  If B is complex, this
-// is transpose(conj(B)).  Otherwise it's transpose(B).
-// If not specified, defaults to false
-func SparseTensorDenseMatMulAdjointB(value bool) SparseTensorDenseMatMulAttr {
-	return func(m optionalAttr) {
-		m["adjoint_b"] = value
-	}
-}
-
-// Multiply SparseTensor (of rank 2) "A" by dense matrix "B".
-//
-// No validity checking is performed on the indices of A.  However, the following
-// input format is recommended for optimal behavior:
-//
-// if adjoint_a == false:
-//   A should be sorted in lexicographically increasing order.  Use SparseReorder
-//   if you're not sure.
-// if adjoint_a == true:
-//   A should be sorted in order of increasing dimension 1 (i.e., "column major"
-//   order instead of "row major" order).
-//
-// Arguments:
-//	a_indices: 2-D.  The `indices` of the `SparseTensor`, size `[nnz, 2]` Matrix.
-//	a_values: 1-D.  The `values` of the `SparseTensor`, size `[nnz]` Vector.
-//	a_shape: 1-D.  The `shape` of the `SparseTensor`, size `[2]` Vector.
-//	b: 2-D.  A dense Matrix.
-func SparseTensorDenseMatMul(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output, optional ...SparseTensorDenseMatMulAttr) (product tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseTensorDenseMatMul",
-		Input: []tf.Input{
-			a_indices, a_values, a_shape, b,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResourceApplyRMSPropAttr is an optional argument to ResourceApplyRMSProp.
-type ResourceApplyRMSPropAttr func(optionalAttr)
-
-// ResourceApplyRMSPropUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var, ms, and mom tensors is protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyRMSPropUseLocking(value bool) ResourceApplyRMSPropAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the RMSProp algorithm.
-//
-// Note that in dense implementation of this algorithm, ms and mom will
-// update even if the grad is zero, but in this sparse implementation, ms
-// and mom will not update in iterations during which the grad is zero.
-//
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
-//
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-// var <- var - mom
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
-//
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
-//
-// Returns the created operation.
-func ResourceApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyRMSPropAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceApplyRMSProp",
-		Input: []tf.Input{
-			var_, ms, mom, lr, rho, momentum, epsilon, grad,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Returns immutable tensor from memory region.
-//
-// The current implementation memmaps the tensor from a file.
-//
-// Arguments:
-//	dtype: Type of the returned tensor.
-//	shape: Shape of the returned tensor.
-//	memory_region_name: Name of readonly memory region used by the tensor, see
-// NewReadOnlyMemoryRegionFromFile in tensorflow::Env.
-func ImmutableConst(scope *Scope, dtype tf.DataType, shape tf.Shape, memory_region_name string) (tensor tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype, "shape": shape, "memory_region_name": memory_region_name}
-	opspec := tf.OpSpec{
-		Type: "ImmutableConst",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// StringJoinAttr is an optional argument to StringJoin.
-type StringJoinAttr func(optionalAttr)
-
-// StringJoinSeparator sets the optional separator attribute to value.
-//
-// value: string, an optional join separator.
-// If not specified, defaults to ""
-func StringJoinSeparator(value string) StringJoinAttr {
-	return func(m optionalAttr) {
-		m["separator"] = value
-	}
-}
-
-// Joins the strings in the given list of string tensors into one tensor;
-//
-// with the given separator (default is an empty separator).
-//
-// Arguments:
-//	inputs: A list of string tensors.  The tensors must all have the same shape,
-// or be scalars.  Scalars may be mixed in; these will be broadcast to the shape
-// of non-scalar inputs.
-func StringJoin(scope *Scope, inputs []tf.Output, optional ...StringJoinAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "StringJoin",
-		Input: []tf.Input{
-			tf.OutputList(inputs),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Creates and returns an empty tensor list.
-//
-// All list elements must be tensors of dtype element_dtype and shape compatible
-// with element_shape.
-//
-// handle: an empty tensor list.
-// element_dtype: the type of elements in the list.
-// element_shape: a shape compatible with that of elements in the list.
-func EmptyTensorList(scope *Scope, element_shape tf.Output, max_num_elements tf.Output, element_dtype tf.DataType) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"element_dtype": element_dtype}
-	opspec := tf.OpSpec{
-		Type: "EmptyTensorList",
-		Input: []tf.Input{
-			element_shape, max_num_elements,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// VariableShapeAttr is an optional argument to VariableShape.
-type VariableShapeAttr func(optionalAttr)
+	}
+}
 
-// VariableShapeOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_INT32
-func VariableShapeOutType(value tf.DataType) VariableShapeAttr {
+// SparseTensorDenseMatMulAdjointB sets the optional adjoint_b attribute to value.
+//
+// value: Use the adjoint of B in the matrix multiply.  If B is complex, this
+// is transpose(conj(B)).  Otherwise it's transpose(B).
+// If not specified, defaults to false
+func SparseTensorDenseMatMulAdjointB(value bool) SparseTensorDenseMatMulAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["adjoint_b"] = value
 	}
 }
 
-// Returns the shape of the variable pointed to by `resource`.
+// Multiply SparseTensor (of rank 2) "A" by dense matrix "B".
 //
-// This operation returns a 1-D integer tensor representing the shape of `input`.
+// No validity checking is performed on the indices of A.  However, the following
+// input format is recommended for optimal behavior:
 //
-// For example:
+// if adjoint_a == false:
+//   A should be sorted in lexicographically increasing order.  Use SparseReorder
+//   if you're not sure.
+// if adjoint_a == true:
+//   A should be sorted in order of increasing dimension 1 (i.e., "column major"
+//   order instead of "row major" order).
 //
-// ```
-// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
-// shape(t) ==> [2, 2, 3]
-// ```
-func VariableShape(scope *Scope, input tf.Output, optional ...VariableShapeAttr) (output tf.Output) {
+// Arguments:
+//	a_indices: 2-D.  The `indices` of the `SparseTensor`, size `[nnz, 2]` Matrix.
+//	a_values: 1-D.  The `values` of the `SparseTensor`, size `[nnz]` Vector.
+//	a_shape: 1-D.  The `shape` of the `SparseTensor`, size `[2]` Vector.
+//	b: 2-D.  A dense Matrix.
+func SparseTensorDenseMatMul(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output, optional ...SparseTensorDenseMatMulAttr) (product tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -17190,9 +16774,9 @@ func VariableShape(scope *Scope, input tf.Output, optional ...VariableShapeAttr)
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "VariableShape",
+		Type: "SparseTensorDenseMatMul",
 		Input: []tf.Input{
-			input,
+			a_indices, a_values, a_shape, b,
 		},
 		Attrs: attrs,
 	}
@@ -17200,241 +16784,164 @@ func VariableShape(scope *Scope, input tf.Output, optional ...VariableShapeAttr)
 	return op.Output(0)
 }
 
-// SparseToSparseSetOperationAttr is an optional argument to SparseToSparseSetOperation.
-type SparseToSparseSetOperationAttr func(optionalAttr)
+// ResourceApplyRMSPropAttr is an optional argument to ResourceApplyRMSProp.
+type ResourceApplyRMSPropAttr func(optionalAttr)
 
-// SparseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func SparseToSparseSetOperationValidateIndices(value bool) SparseToSparseSetOperationAttr {
+// ResourceApplyRMSPropUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var, ms, and mom tensors is protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyRMSPropUseLocking(value bool) ResourceApplyRMSPropAttr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Applies set operation along last dimension of 2 `SparseTensor` inputs.
-//
-// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
-//
-// If `validate_indices` is `True`, `SparseToSparseSetOperation` validates the
-// order and range of `set1` and `set2` indices.
-//
-// Input `set1` is a `SparseTensor` represented by `set1_indices`, `set1_values`,
-// and `set1_shape`. For `set1` ranked `n`, 1st `n-1` dimensions must be the same
-// as `set2`. Dimension `n` contains values in a set, duplicates are allowed but
-// ignored.
+// Update '*var' according to the RMSProp algorithm.
 //
-// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
-// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
-// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
-// ignored.
+// Note that in dense implementation of this algorithm, ms and mom will
+// update even if the grad is zero, but in this sparse implementation, ms
+// and mom will not update in iterations during which the grad is zero.
 //
-// If `validate_indices` is `True`, this op validates the order and range of `set1`
-// and `set2` indices.
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
 //
-// Output `result` is a `SparseTensor` represented by `result_indices`,
-// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
-// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
-// dimension contains the result of `set_operation` applied to the corresponding
-// `[0...n-1]` dimension of `set`.
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+// var <- var - mom
 //
 // Arguments:
-//	set1_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
-// order.
-//	set1_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
-// order.
-//	set1_shape: 1D `Tensor`, shape of a `SparseTensor`. `set1_shape[0...n-1]` must
-// be the same as `set2_shape[0...n-1]`, `set1_shape[n]` is the
-// max set size across `0...n-1` dimensions.
-//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
-// be the same as `set1_shape[0...n-1]`, `set2_shape[n]` is the
-// max set size across `0...n-1` dimensions.
+//	var_: Should be from a Variable().
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
 //
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
 //
-// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
-// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
-// is the max result set size across all `0...n-1` dimensions.
-func SparseToSparseSetOperation(scope *Scope, set1_indices tf.Output, set1_values tf.Output, set1_shape tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...SparseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
+// Returns the created operation.
+func ResourceApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyRMSPropAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"set_operation": set_operation}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseToSparseSetOperation",
+		Type: "ResourceApplyRMSProp",
 		Input: []tf.Input{
-			set1_indices, set1_values, set1_shape, set2_indices, set2_values, set2_shape,
+			var_, ms, mom, lr, rho, momentum, epsilon, grad,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return scope.AddOperation(opspec)
 }
 
-// Computes softmax cross entropy cost and gradients to backpropagate.
+// SerializeManySparseAttr is an optional argument to SerializeManySparse.
+type SerializeManySparseAttr func(optionalAttr)
+
+// SerializeManySparseOutType sets the optional out_type attribute to value.
 //
-// Unlike `SoftmaxCrossEntropyWithLogits`, this operation does not accept
-// a matrix of label probabilities, but rather a single label per row
-// of features.  This label is considered to have probability 1.0 for the
-// given row.
+// value: The `dtype` to use for serialization; the supported types are `string`
+// (default) and `variant`.
+// If not specified, defaults to DT_STRING
+func SerializeManySparseOutType(value tf.DataType) SerializeManySparseAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Serialize an `N`-minibatch `SparseTensor` into an `[N, 3]` `Tensor` object.
 //
-// Inputs are the logits, not probabilities.
+// The `SparseTensor` must have rank `R` greater than 1, and the first dimension
+// is treated as the minibatch dimension.  Elements of the `SparseTensor`
+// must be sorted in increasing order of this first dimension.  The serialized
+// `SparseTensor` objects going into each row of `serialized_sparse` will have
+// rank `R-1`.
 //
-// Arguments:
-//	features: batch_size x num_classes matrix
-//	labels: batch_size vector with values in [0, num_classes).
-// This is the label for the given minibatch entry.
+// The minibatch size `N` is extracted from `sparse_shape[0]`.
 //
-// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix).
-func SparseSoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) {
+// Arguments:
+//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
+//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
+func SerializeManySparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...SerializeManySparseAttr) (serialized_sparse tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "SparseSoftmaxCrossEntropyWithLogits",
-		Input: []tf.Input{
-			features, labels,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// Returns the truth value of NOT x element-wise.
-func LogicalNot(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "LogicalNot",
+		Type: "SerializeManySparse",
 		Input: []tf.Input{
-			x,
+			sparse_indices, sparse_values, sparse_shape,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// 3D real-valued fast Fourier transform.
-//
-// Computes the 3-dimensional discrete Fourier transform of a real-valued signal
-// over the inner-most 3 dimensions of `input`.
-//
-// Since the DFT of a real signal is Hermitian-symmetric, `RFFT3D` only returns the
-// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
-// of `output`: the zero-frequency term, followed by the `fft_length / 2`
-// positive-frequency terms.
-//
-// Along each axis `RFFT3D` is computed on, if `fft_length` is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
-//
-// Arguments:
-//	input: A float32 tensor.
-//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
-//
-// Returns A complex64 tensor of the same rank as `input`. The inner-most 3
-//   dimensions of `input` are replaced with the their 3D Fourier transform. The
-//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
-//   components.
-//
-// @compatibility(numpy)
-// Equivalent to np.fft.rfftn with 3 dimensions.
-// @end_compatibility
-func RFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// Computes inverse hyperbolic cosine of x element-wise.
+func Acosh(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RFFT3D",
+		Type: "Acosh",
 		Input: []tf.Input{
-			input, fft_length,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TensorArrayV3Attr is an optional argument to TensorArrayV3.
-type TensorArrayV3Attr func(optionalAttr)
+// TensorArrayV2Attr is an optional argument to TensorArrayV2.
+type TensorArrayV2Attr func(optionalAttr)
 
-// TensorArrayV3ElementShape sets the optional element_shape attribute to value.
-//
-// value: The expected shape of an element, if known. Used to
-// validate the shapes of TensorArray elements. If this shape is not
-// fully specified, gathering zero-size TensorArrays is an error.
+// TensorArrayV2ElementShape sets the optional element_shape attribute to value.
 // If not specified, defaults to <unknown_rank:true >
-func TensorArrayV3ElementShape(value tf.Shape) TensorArrayV3Attr {
+func TensorArrayV2ElementShape(value tf.Shape) TensorArrayV2Attr {
 	return func(m optionalAttr) {
 		m["element_shape"] = value
 	}
 }
 
-// TensorArrayV3DynamicSize sets the optional dynamic_size attribute to value.
-//
-// value: A boolean that determines whether writes to the TensorArray
-// are allowed to grow the size.  By default, this is not allowed.
+// TensorArrayV2DynamicSize sets the optional dynamic_size attribute to value.
 // If not specified, defaults to false
-func TensorArrayV3DynamicSize(value bool) TensorArrayV3Attr {
+func TensorArrayV2DynamicSize(value bool) TensorArrayV2Attr {
 	return func(m optionalAttr) {
 		m["dynamic_size"] = value
 	}
 }
 
-// TensorArrayV3ClearAfterRead sets the optional clear_after_read attribute to value.
-//
-// value: If true (default), Tensors in the TensorArray are cleared
-// after being read.  This disables multiple read semantics but allows early
-// release of memory.
-// If not specified, defaults to true
-func TensorArrayV3ClearAfterRead(value bool) TensorArrayV3Attr {
-	return func(m optionalAttr) {
-		m["clear_after_read"] = value
-	}
-}
-
-// TensorArrayV3IdenticalElementShapes sets the optional identical_element_shapes attribute to value.
-//
-// value: If true (default is false), then all
-// elements in the TensorArray will be expected to have have identical shapes.
-// This allows certain behaviors, like dynamically checking for
-// consistent shapes on write, and being able to fill in properly
-// shaped zero tensors on stack -- even if the element_shape attribute
-// is not fully defined.
-// If not specified, defaults to false
-func TensorArrayV3IdenticalElementShapes(value bool) TensorArrayV3Attr {
+// TensorArrayV2ClearAfterRead sets the optional clear_after_read attribute to value.
+// If not specified, defaults to true
+func TensorArrayV2ClearAfterRead(value bool) TensorArrayV2Attr {
 	return func(m optionalAttr) {
-		m["identical_element_shapes"] = value
+		m["clear_after_read"] = value
 	}
 }
 
-// TensorArrayV3TensorArrayName sets the optional tensor_array_name attribute to value.
-//
-// value: Overrides the name used for the temporary tensor_array
-// resource. Default value is the name of the 'TensorArray' op (which
-// is guaranteed unique).
+// TensorArrayV2TensorArrayName sets the optional tensor_array_name attribute to value.
 // If not specified, defaults to ""
-func TensorArrayV3TensorArrayName(value string) TensorArrayV3Attr {
+func TensorArrayV2TensorArrayName(value string) TensorArrayV2Attr {
 	return func(m optionalAttr) {
 		m["tensor_array_name"] = value
 	}
 }
 
-// An array of Tensors of given size.
-//
-// Write data via Write and read via Read or Pack.
-//
-// Arguments:
-//	size: The size of the array.
-//	dtype: The type of the elements on the tensor_array.
+// Deprecated. Use TensorArrayV3
 //
-// Returns The handle to the TensorArray.A scalar used to control gradient flow.
-func TensorArrayV3(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV3Attr) (handle tf.Output, flow tf.Output) {
+// DEPRECATED at GraphDef version 26: Use TensorArrayV3
+func TensorArrayV2(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -17443,223 +16950,308 @@ func TensorArrayV3(scope *Scope, size tf.Output, dtype tf.DataType, optional ...
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayV3",
+		Type: "TensorArrayV2",
 		Input: []tf.Input{
 			size,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Runs multiple additive regression ensemble predictors on input instances and
+// ThreadUnsafeUnigramCandidateSamplerAttr is an optional argument to ThreadUnsafeUnigramCandidateSampler.
+type ThreadUnsafeUnigramCandidateSamplerAttr func(optionalAttr)
+
+// ThreadUnsafeUnigramCandidateSamplerSeed sets the optional seed attribute to value.
 //
-// computes the logits. It is designed to be used during prediction.
-// It traverses all the trees and calculates the final score for each instance.
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func ThreadUnsafeUnigramCandidateSamplerSeed(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// ThreadUnsafeUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
 //
-// Arguments:
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func ThreadUnsafeUnigramCandidateSamplerSeed2(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Generates labels for candidate sampling with a learned unigram distribution.
 //
-//	bucketized_features: A list of rank 1 Tensors containing bucket id for each
-// feature.
-//	logits_dimension: scalar, dimension of the logits, to be used for partial logits
-// shape.
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
 //
-// Returns Output rank 2 Tensor containing logits for each example.
-func BoostedTreesPredict(scope *Scope, tree_ensemble_handle tf.Output, bucketized_features []tf.Output, logits_dimension int64) (logits tf.Output) {
+// For each batch, this op picks a single set of sampled candidate labels.
+//
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
+//
+// Arguments:
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to randomly sample.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
+//	range_max: The sampler will sample integers from the interval [0, range_max).
+//
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func ThreadUnsafeUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...ThreadUnsafeUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"logits_dimension": logits_dimension}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "BoostedTreesPredict",
+		Type: "ThreadUnsafeUnigramCandidateSampler",
 		Input: []tf.Input{
-			tree_ensemble_handle, tf.OutputList(bucketized_features),
+			true_classes,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Elementwise computes the bitwise OR of `x` and `y`.
+// MaxPoolV2Attr is an optional argument to MaxPoolV2.
+type MaxPoolV2Attr func(optionalAttr)
+
+// MaxPoolV2DataFormat sets the optional data_format attribute to value.
 //
-// The result will have those bits set, that are set in `x`, `y` or both. The
-// computation is performed on the underlying representations of `x` and `y`.
-func BitwiseOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolV2DataFormat(value string) MaxPoolV2Attr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Performs max pooling on the input.
+//
+// Arguments:
+//	input: 4-D input to pool over.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The max pooled output tensor.
+func MaxPoolV2(scope *Scope, input tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "BitwiseOr",
+		Type: "MaxPoolV2",
 		Input: []tf.Input{
-			x, y,
+			input, ksize, strides,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MatrixSolveLsAttr is an optional argument to MatrixSolveLs.
-type MatrixSolveLsAttr func(optionalAttr)
+// AddManySparseToTensorsMapAttr is an optional argument to AddManySparseToTensorsMap.
+type AddManySparseToTensorsMapAttr func(optionalAttr)
 
-// MatrixSolveLsFast sets the optional fast attribute to value.
-// If not specified, defaults to true
-func MatrixSolveLsFast(value bool) MatrixSolveLsAttr {
+// AddManySparseToTensorsMapContainer sets the optional container attribute to value.
+//
+// value: The container name for the `SparseTensorsMap` created by this op.
+// If not specified, defaults to ""
+func AddManySparseToTensorsMapContainer(value string) AddManySparseToTensorsMapAttr {
 	return func(m optionalAttr) {
-		m["fast"] = value
+		m["container"] = value
 	}
 }
 
-// Solves one or more linear least-squares problems.
+// AddManySparseToTensorsMapSharedName sets the optional shared_name attribute to value.
 //
-// `matrix` is a tensor of shape `[..., M, N]` whose inner-most 2 dimensions
-// form real or complex matrices of size `[M, N]`. `Rhs` is a tensor of the same
-// type as `matrix` and shape `[..., M, K]`.
-// The output is a tensor shape `[..., N, K]` where each output matrix solves
-// each of the equations
-// `matrix[..., :, :]` * `output[..., :, :]` = `rhs[..., :, :]`
-// in the least squares sense.
+// value: The shared name for the `SparseTensorsMap` created by this op.
+// If blank, the new Operation's unique name is used.
+// If not specified, defaults to ""
+func AddManySparseToTensorsMapSharedName(value string) AddManySparseToTensorsMapAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Add an `N`-minibatch `SparseTensor` to a `SparseTensorsMap`, return `N` handles.
 //
-// We use the following notation for (complex) matrix and right-hand sides
-// in the batch:
+// A `SparseTensor` of rank `R` is represented by three tensors: `sparse_indices`,
+// `sparse_values`, and `sparse_shape`, where
 //
-// `matrix`=\\(A \in \mathbb{C}^{m \times n}\\),
-// `rhs`=\\(B  \in \mathbb{C}^{m \times k}\\),
-// `output`=\\(X  \in \mathbb{C}^{n \times k}\\),
-// `l2_regularizer`=\\(\lambda \in \mathbb{R}\\).
+// ```sparse_indices.shape[1] == sparse_shape.shape[0] == R```
 //
-// If `fast` is `True`, then the solution is computed by solving the normal
-// equations using Cholesky decomposition. Specifically, if \\(m \ge n\\) then
-// \\(X = (A^H A + \lambda I)^{-1} A^H B\\), which solves the least-squares
-// problem \\(X = \mathrm{argmin}_{Z \in \Re^{n \times k} } ||A Z - B||_F^2 + \lambda ||Z||_F^2\\).
-// If \\(m \lt n\\) then `output` is computed as
-// \\(X = A^H (A A^H + \lambda I)^{-1} B\\), which (for \\(\lambda = 0\\)) is the
-// minimum-norm solution to the under-determined linear system, i.e.
-// \\(X = \mathrm{argmin}_{Z \in \mathbb{C}^{n \times k} } ||Z||_F^2 \\),
-// subject to \\(A Z = B\\). Notice that the fast path is only numerically stable
-// when \\(A\\) is numerically full rank and has a condition number
-// \\(\mathrm{cond}(A) \lt \frac{1}{\sqrt{\epsilon_{mach} } }\\) or \\(\lambda\\) is
-// sufficiently large.
+// An `N`-minibatch of `SparseTensor` objects is represented as a `SparseTensor`
+// having a first `sparse_indices` column taking values between `[0, N)`, where
+// the minibatch size `N == sparse_shape[0]`.
 //
-// If `fast` is `False` an algorithm based on the numerically robust complete
-// orthogonal decomposition is used. This computes the minimum-norm
-// least-squares solution, even when \\(A\\) is rank deficient. This path is
-// typically 6-7 times slower than the fast path. If `fast` is `False` then
-// `l2_regularizer` is ignored.
+// The input `SparseTensor` must have rank `R` greater than 1, and the first
+// dimension is treated as the minibatch dimension.  Elements of the `SparseTensor`
+// must be sorted in increasing order of this first dimension.  The stored
+// `SparseTensor` objects pointed to by each row of the output `sparse_handles`
+// will have rank `R-1`.
+//
+// The `SparseTensor` values can then be read out as part of a minibatch by passing
+// the given keys as vector elements to `TakeManySparseFromTensorsMap`.  To ensure
+// the correct `SparseTensorsMap` is accessed, ensure that the same
+// `container` and `shared_name` are passed to that Op.  If no `shared_name`
+// is provided here, instead use the *name* of the Operation created by calling
+// `AddManySparseToTensorsMap` as the `shared_name` passed to
+// `TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
 //
 // Arguments:
-//	matrix: Shape is `[..., M, N]`.
-//	rhs: Shape is `[..., M, K]`.
-//	l2_regularizer: Scalar tensor.
+//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
+// `sparse_indices[:, 0]` must be ordered values in `[0, N)`.
+//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
+// The minibatch size `N == sparse_shape[0]`.
 //
-// @compatibility(numpy)
-// Equivalent to np.linalg.lstsq
-// @end_compatibility
+// Returns 1-D.  The handles of the `SparseTensor` now stored in the
+// `SparseTensorsMap`.  Shape: `[N]`.
+func AddManySparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddManySparseToTensorsMapAttr) (sparse_handles tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "AddManySparseToTensorsMap",
+		Input: []tf.Input{
+			sparse_indices, sparse_values, sparse_shape,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Concatenates tensors along one dimension.
 //
-// Returns Shape is `[..., N, K]`.
-func MatrixSolveLs(scope *Scope, matrix tf.Output, rhs tf.Output, l2_regularizer tf.Output, optional ...MatrixSolveLsAttr) (output tf.Output) {
+// Arguments:
+//	values: List of `N` Tensors to concatenate. Their ranks and types must match,
+// and their sizes must match in all dimensions except `concat_dim`.
+//	axis: 0-D.  The dimension along which to concatenate.  Must be in the
+// range [-rank(values), rank(values)).
+//
+// Returns A `Tensor` with the concatenation of values stacked along the
+// `concat_dim` dimension.  This tensor's shape matches that of `values` except
+// in `concat_dim` where it has the sum of the sizes.
+func ConcatV2(scope *Scope, values []tf.Output, axis tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "MatrixSolveLs",
+		Type: "ConcatV2",
 		Input: []tf.Input{
-			matrix, rhs, l2_regularizer,
+			tf.OutputList(values), axis,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MaxPool3DAttr is an optional argument to MaxPool3D.
-type MaxPool3DAttr func(optionalAttr)
-
-// MaxPool3DDataFormat sets the optional data_format attribute to value.
-//
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func MaxPool3DDataFormat(value string) MaxPool3DAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
+// Reads and outputs the entire contents of the input filename.
+func ReadFile(scope *Scope, filename tf.Output) (contents tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	opspec := tf.OpSpec{
+		Type: "ReadFile",
+		Input: []tf.Input{
+			filename,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Performs 3D max pooling on the input.
+// Returns immutable tensor from memory region.
 //
-// Arguments:
-//	input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
+// The current implementation memmaps the tensor from a file.
 //
-// Returns The max pooled output tensor.
-func MaxPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DAttr) (output tf.Output) {
+// Arguments:
+//	dtype: Type of the returned tensor.
+//	shape: Shape of the returned tensor.
+//	memory_region_name: Name of readonly memory region used by the tensor, see
+// NewReadOnlyMemoryRegionFromFile in tensorflow::Env.
+func ImmutableConst(scope *Scope, dtype tf.DataType, shape tf.Shape, memory_region_name string) (tensor tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"dtype": dtype, "shape": shape, "memory_region_name": memory_region_name}
 	opspec := tf.OpSpec{
-		Type: "MaxPool3D",
-		Input: []tf.Input{
-			input,
-		},
+		Type: "ImmutableConst",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Conv3DBackpropInputAttr is an optional argument to Conv3DBackpropInput.
-type Conv3DBackpropInputAttr func(optionalAttr)
+// StringJoinAttr is an optional argument to StringJoin.
+type StringJoinAttr func(optionalAttr)
 
-// Conv3DBackpropInputDilations sets the optional dilations attribute to value.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
-func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr {
+// StringJoinSeparator sets the optional separator attribute to value.
+//
+// value: string, an optional join separator.
+// If not specified, defaults to ""
+func StringJoinSeparator(value string) StringJoinAttr {
 	return func(m optionalAttr) {
-		m["dilations"] = value
+		m["separator"] = value
 	}
 }
 
-// Computes the gradients of 3-D convolution with respect to the input.
+// Joins the strings in the given list of string tensors into one tensor;
 //
-// DEPRECATED at GraphDef version 10: Use Conv3DBackpropInputV2
+// with the given separator (default is an empty separator).
 //
 // Arguments:
-//	input: Shape `[batch, depth, rows, cols, in_channels]`.
-//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
-// `in_channels` must match between `input` and `filter`.
-//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
-// out_channels]`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropInputAttr) (output tf.Output) {
+//	inputs: A list of string tensors.  The tensors must all have the same shape,
+// or be scalars.  Scalars may be mixed in; these will be broadcast to the shape
+// of non-scalar inputs.
+func StringJoin(scope *Scope, inputs []tf.Output, optional ...StringJoinAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv3DBackpropInput",
+		Type: "StringJoin",
 		Input: []tf.Input{
-			input, filter, out_backprop,
+			tf.OutputList(inputs),
 		},
 		Attrs: attrs,
 	}
@@ -17667,110 +17259,148 @@ func Conv3DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_ba
 	return op.Output(0)
 }
 
-// Subtracts sparse updates from the variable referenced by `resource`.
-//
-// This operation computes
-//
-//     # Scalar indices
-//     ref[indices, ...] -= updates[...]
-//
-//     # Vector indices (for each i)
-//     ref[indices[i], ...] -= updates[i, ...]
-//
-//     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] -= updates[i, ..., j, ...]
-//
-// Duplicate entries are handled correctly: if multiple `indices` reference
-// the same location, their contributions add.
+// Creates and returns an empty tensor list.
 //
-// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+// All list elements must be tensors of dtype element_dtype and shape compatible
+// with element_shape.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
-// </div>
+// handle: an empty tensor list.
+// element_dtype: the type of elements in the list.
+// element_shape: a shape compatible with that of elements in the list.
+func EmptyTensorList(scope *Scope, element_shape tf.Output, max_num_elements tf.Output, element_dtype tf.DataType) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
+	opspec := tf.OpSpec{
+		Type: "EmptyTensorList",
+		Input: []tf.Input{
+			element_shape, max_num_elements,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes softsign gradients for a softsign operation.
 //
 // Arguments:
-//	resource: Should be from a `Variable` node.
-//	indices: A tensor of indices into the first dimension of `ref`.
-//	updates: A tensor of updated values to add to `ref`.
+//	gradients: The backpropagated gradients to the corresponding softsign operation.
+//	features: The features passed as input to the corresponding softsign operation.
 //
-// Returns the created operation.
-func ResourceScatterSub(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+// Returns The gradients: `gradients / (1 + abs(features)) ** 2`.
+func SoftsignGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterSub",
+		Type: "SoftsignGrad",
 		Input: []tf.Input{
-			resource, indices, updates,
+			gradients, features,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// DepthwiseConv2dNativeBackpropFilterAttr is an optional argument to DepthwiseConv2dNativeBackpropFilter.
-type DepthwiseConv2dNativeBackpropFilterAttr func(optionalAttr)
-
-// DepthwiseConv2dNativeBackpropFilterDataFormat sets the optional data_format attribute to value.
+// Provides the time since epoch in seconds.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, height, width, channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, channels, height, width].
-// If not specified, defaults to "NHWC"
-func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2dNativeBackpropFilterAttr {
+// Returns the timestamp as a `float64` for seconds since the Unix epoch.
+//
+// Note: the timestamp is computed when the op is executed, not when it is added
+// to the graph.
+func Timestamp(scope *Scope) (ts tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Timestamp",
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// VariableShapeAttr is an optional argument to VariableShape.
+type VariableShapeAttr func(optionalAttr)
+
+// VariableShapeOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_INT32
+func VariableShapeOutType(value tf.DataType) VariableShapeAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["out_type"] = value
 	}
 }
 
-// DepthwiseConv2dNativeBackpropFilterDilations sets the optional dilations attribute to value.
+// Returns the shape of the variable pointed to by `resource`.
 //
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
-// element on that dimension. The dimension order is determined by the value of
-// `data_format`, see above for details. Dilations in the batch and depth
-// dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
+// This operation returns a 1-D integer tensor representing the shape of `input`.
+//
+// For example:
+//
+// ```
+// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
+// shape(t) ==> [2, 2, 3]
+// ```
+func VariableShape(scope *Scope, input tf.Output, optional ...VariableShapeAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "VariableShape",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes the gradients of depthwise convolution with respect to the filter.
+// AvgPoolGradAttr is an optional argument to AvgPoolGrad.
+type AvgPoolGradAttr func(optionalAttr)
+
+// AvgPoolGradDataFormat sets the optional data_format attribute to value.
 //
-// Arguments:
-//	input: 4-D with shape based on `data_format`.  For example, if
-// `data_format` is 'NHWC' then `input` is a 4-D `[batch, in_height,
-// in_width, in_channels]` tensor.
-//	filter_sizes: An integer vector representing the tensor shape of `filter`,
-// where `filter` is a 4-D
-// `[filter_height, filter_width, in_channels, depthwise_multiplier]` tensor.
-//	out_backprop: 4-D with shape  based on `data_format`.
-// For example, if `data_format` is 'NHWC' then
-// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
-// Gradients w.r.t. the output of the convolution.
-//	strides: The stride of the sliding window for each dimension of the input
-// of the convolution.
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func AvgPoolGradDataFormat(value string) AvgPoolGradAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Computes gradients of the average pooling function.
+//
+// Arguments:
+//	orig_input_shape: 1-D.  Shape of the original input to `avg_pool`.
+//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t.
+// the output of `avg_pool`.
+//	ksize: The size of the sliding window for each dimension of the input.
+//	strides: The stride of the sliding window for each dimension of the input.
 //	padding: The type of padding algorithm to use.
 //
-// Returns 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
-// the `filter` input of the convolution.
-func DepthwiseConv2dNativeBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropFilterAttr) (output tf.Output) {
+// Returns 4-D.  Gradients w.r.t. the input of `avg_pool`.
+func AvgPoolGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DepthwiseConv2dNativeBackpropFilter",
+		Type: "AvgPoolGrad",
 		Input: []tf.Input{
-			input, filter_sizes, out_backprop,
+			orig_input_shape, grad,
 		},
 		Attrs: attrs,
 	}
@@ -17778,185 +17408,250 @@ func DepthwiseConv2dNativeBackpropFilter(scope *Scope, input tf.Output, filter_s
 	return op.Output(0)
 }
 
-// Converts each string in the input Tensor to its hash mod by a number of buckets.
+// Greedily selects a subset of bounding boxes in descending order of score,
 //
-// The hash function is deterministic on the content of the string within the
-// process. The hash function is a keyed hash function, where attribute `key`
-// defines the key of the hash function. `key` is an array of 2 elements.
+// pruning away boxes that have high overlaps
+// with previously selected boxes.  Bounding boxes with score less than
+// `score_threshold` are removed. N-by-n overlap values are supplied as square matrix,
+// which allows for defining a custom overlap criterium (eg. intersection over union,
+// intersection over area, etc.).
 //
-// A strong hash is important when inputs may be malicious, e.g. URLs with
-// additional components. Adversaries could try to make their inputs hash to the
-// same bucket for a denial-of-service attack or to skew the results. A strong
-// hash prevents this by making it difficult, if not infeasible, to compute inputs
-// that hash to the same bucket. This comes at a cost of roughly 4x higher compute
-// time than `tf.string_to_hash_bucket_fast`.
+// The output of this operation is a set of integers indexing into the input
+// collection of bounding boxes representing the selected boxes.  The bounding
+// box coordinates corresponding to the selected indices can then be obtained
+// using the `tf.gather operation`.  For example:
+//
+//   selected_indices = tf.image.non_max_suppression_with_overlaps(
+//       overlaps, scores, max_output_size, overlap_threshold, score_threshold)
+//   selected_boxes = tf.gather(boxes, selected_indices)
 //
 // Arguments:
-//	input: The strings to assign a hash bucket.
-//	num_buckets: The number of buckets.
-//	key: The key for the keyed hash function passed as a list of two uint64
-// elements.
+//	overlaps: A 2-D float tensor of shape `[num_boxes, num_boxes]` representing
+// the n-by-n box overlap values.
+//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
+// score corresponding to each box (each row of boxes).
+//	max_output_size: A scalar integer tensor representing the maximum number of
+// boxes to be selected by non max suppression.
+//	overlap_threshold: A 0-D float tensor representing the threshold for deciding whether
+// boxes overlap too.
+//	score_threshold: A 0-D float tensor representing the threshold for deciding when to remove
+// boxes based on score.
 //
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToHashBucketStrong(scope *Scope, input tf.Output, num_buckets int64, key []int64) (output tf.Output) {
+// Returns A 1-D integer tensor of shape `[M]` representing the selected
+// indices from the boxes tensor, where `M <= max_output_size`.
+func NonMaxSuppressionWithOverlaps(scope *Scope, overlaps tf.Output, scores tf.Output, max_output_size tf.Output, overlap_threshold tf.Output, score_threshold tf.Output) (selected_indices tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_buckets": num_buckets, "key": key}
 	opspec := tf.OpSpec{
-		Type: "StringToHashBucketStrong",
+		Type: "NonMaxSuppressionWithOverlaps",
 		Input: []tf.Input{
-			input,
+			overlaps, scores, max_output_size, overlap_threshold, score_threshold,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// StringLengthAttr is an optional argument to StringLength.
-type StringLengthAttr func(optionalAttr)
-
-// StringLengthUnit sets the optional unit attribute to value.
+// Computes softmax cross entropy cost and gradients to backpropagate.
 //
-// value: The unit that is counted to compute string length.  One of: `"BYTE"` (for
-// the number of bytes in each string) or `"UTF8_CHAR"` (for the number of UTF-8
-// encoded Unicode code points in each string).  Results are undefined
-// if `unit=UTF8_CHAR` and the `input` strings do not contain structurally
-// valid UTF-8.
-// If not specified, defaults to "BYTE"
-func StringLengthUnit(value string) StringLengthAttr {
-	return func(m optionalAttr) {
-		m["unit"] = value
-	}
-}
-
-// String lengths of `input`.
+// Unlike `SoftmaxCrossEntropyWithLogits`, this operation does not accept
+// a matrix of label probabilities, but rather a single label per row
+// of features.  This label is considered to have probability 1.0 for the
+// given row.
 //
-// Computes the length of each string given in the input tensor.
+// Inputs are the logits, not probabilities.
 //
 // Arguments:
-//	input: The string for which to compute the length.
+//	features: batch_size x num_classes matrix
+//	labels: batch_size vector with values in [0, num_classes).
+// This is the label for the given minibatch entry.
 //
-// Returns Integer tensor that has the same shape as `input`. The output contains the
-// element-wise string lengths of `input`.
-func StringLength(scope *Scope, input tf.Output, optional ...StringLengthAttr) (output tf.Output) {
+// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix).
+func SparseSoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "StringLength",
+		Type: "SparseSoftmaxCrossEntropyWithLogits",
 		Input: []tf.Input{
-			input,
+			features, labels,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// ResourceApplyProximalGradientDescentAttr is an optional argument to ResourceApplyProximalGradientDescent.
-type ResourceApplyProximalGradientDescentAttr func(optionalAttr)
-
-// ResourceApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value.
-//
-// value: If True, the subtraction will be protected by a lock;
-// otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceApplyProximalGradientDescentUseLocking(value bool) ResourceApplyProximalGradientDescentAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
+// Returns the truth value of NOT x element-wise.
+func LogicalNot(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	opspec := tf.OpSpec{
+		Type: "LogicalNot",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Update '*var' as FOBOS algorithm with fixed learning rate.
+// 3D real-valued fast Fourier transform.
 //
-// prox_v = var - alpha * delta
-// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
+// Computes the 3-dimensional discrete Fourier transform of a real-valued signal
+// over the inner-most 3 dimensions of `input`.
+//
+// Since the DFT of a real signal is Hermitian-symmetric, `RFFT3D` only returns the
+// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
+// of `output`: the zero-frequency term, followed by the `fft_length / 2`
+// positive-frequency terms.
+//
+// Along each axis `RFFT3D` is computed on, if `fft_length` is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	alpha: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	delta: The change.
+//	input: A float32 tensor.
+//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
 //
-// Returns the created operation.
-func ResourceApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, delta tf.Output, optional ...ResourceApplyProximalGradientDescentAttr) (o *tf.Operation) {
+// Returns A complex64 tensor of the same rank as `input`. The inner-most 3
+//   dimensions of `input` are replaced with the their 3D Fourier transform. The
+//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
+//   components.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.rfftn with 3 dimensions.
+// @end_compatibility
+func RFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyProximalGradientDescent",
+		Type: "RFFT3D",
 		Input: []tf.Input{
-			var_, alpha, l1, l2, delta,
+			input, fft_length,
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Returns 0 if the denominator is zero.
+// TensorArrayV3Attr is an optional argument to TensorArrayV3.
+type TensorArrayV3Attr func(optionalAttr)
+
+// TensorArrayV3ElementShape sets the optional element_shape attribute to value.
 //
+// value: The expected shape of an element, if known. Used to
+// validate the shapes of TensorArray elements. If this shape is not
+// fully specified, gathering zero-size TensorArrays is an error.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayV3ElementShape(value tf.Shape) TensorArrayV3Attr {
+	return func(m optionalAttr) {
+		m["element_shape"] = value
+	}
+}
+
+// TensorArrayV3DynamicSize sets the optional dynamic_size attribute to value.
 //
-// *NOTE*: `DivNoNan` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func DivNoNan(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: A boolean that determines whether writes to the TensorArray
+// are allowed to grow the size.  By default, this is not allowed.
+// If not specified, defaults to false
+func TensorArrayV3DynamicSize(value bool) TensorArrayV3Attr {
+	return func(m optionalAttr) {
+		m["dynamic_size"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "DivNoNan",
-		Input: []tf.Input{
-			x, y,
-		},
+}
+
+// TensorArrayV3ClearAfterRead sets the optional clear_after_read attribute to value.
+//
+// value: If true (default), Tensors in the TensorArray are cleared
+// after being read.  This disables multiple read semantics but allows early
+// release of memory.
+// If not specified, defaults to true
+func TensorArrayV3ClearAfterRead(value bool) TensorArrayV3Attr {
+	return func(m optionalAttr) {
+		m["clear_after_read"] = value
+	}
+}
+
+// TensorArrayV3IdenticalElementShapes sets the optional identical_element_shapes attribute to value.
+//
+// value: If true (default is false), then all
+// elements in the TensorArray will be expected to have have identical shapes.
+// This allows certain behaviors, like dynamically checking for
+// consistent shapes on write, and being able to fill in properly
+// shaped zero tensors on stack -- even if the element_shape attribute
+// is not fully defined.
+// If not specified, defaults to false
+func TensorArrayV3IdenticalElementShapes(value bool) TensorArrayV3Attr {
+	return func(m optionalAttr) {
+		m["identical_element_shapes"] = value
+	}
+}
+
+// TensorArrayV3TensorArrayName sets the optional tensor_array_name attribute to value.
+//
+// value: Overrides the name used for the temporary tensor_array
+// resource. Default value is the name of the 'TensorArray' op (which
+// is guaranteed unique).
+// If not specified, defaults to ""
+func TensorArrayV3TensorArrayName(value string) TensorArrayV3Attr {
+	return func(m optionalAttr) {
+		m["tensor_array_name"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes the gradient for the sqrt of `x` wrt its input.
+// An array of Tensors of given size.
 //
-// Specifically, `grad = dy * 0.5 / y`, where `y = sqrt(x)`, and `dy`
-// is the corresponding input gradient.
-func SqrtGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+// Write data via Write and read via Read or Pack.
+//
+// Arguments:
+//	size: The size of the array.
+//	dtype: The type of the elements on the tensor_array.
+//
+// Returns The handle to the TensorArray.A scalar used to control gradient flow.
+func TensorArrayV3(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV3Attr) (handle tf.Output, flow tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SqrtGrad",
+		Type: "TensorArrayV3",
 		Input: []tf.Input{
-			y, dy,
+			size,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Get the value of the tensor specified by its handle.
+// Runs multiple additive regression ensemble predictors on input instances and
+//
+// computes the logits. It is designed to be used during prediction.
+// It traverses all the trees and calculates the final score for each instance.
 //
 // Arguments:
-//	handle: The handle for a tensor stored in the session state.
-//	dtype: The type of the output value.
 //
-// Returns The tensor for the given handle.
-func GetSessionTensor(scope *Scope, handle tf.Output, dtype tf.DataType) (value tf.Output) {
+//	bucketized_features: A list of rank 1 Tensors containing bucket id for each
+// feature.
+//	logits_dimension: scalar, dimension of the logits, to be used for partial logits
+// shape.
+//
+// Returns Output rank 2 Tensor containing logits for each example.
+func BoostedTreesPredict(scope *Scope, tree_ensemble_handle tf.Output, bucketized_features []tf.Output, logits_dimension int64) (logits tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{"logits_dimension": logits_dimension}
 	opspec := tf.OpSpec{
-		Type: "GetSessionTensor",
+		Type: "BoostedTreesPredict",
 		Input: []tf.Input{
-			handle,
+			tree_ensemble_handle, tf.OutputList(bucketized_features),
 		},
 		Attrs: attrs,
 	}
@@ -17964,16 +17659,16 @@ func GetSessionTensor(scope *Scope, handle tf.Output, dtype tf.DataType) (value
 	return op.Output(0)
 }
 
-// Returns x - y element-wise.
+// Elementwise computes the bitwise OR of `x` and `y`.
 //
-// *NOTE*: `Subtract` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Sub(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// The result will have those bits set, that are set in `x`, `y` or both. The
+// computation is performed on the underlying representations of `x` and `y`.
+func BitwiseOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Sub",
+		Type: "BitwiseOr",
 		Input: []tf.Input{
 			x, y,
 		},
@@ -17982,29 +17677,65 @@ func Sub(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
-// RandomPoissonAttr is an optional argument to RandomPoisson.
-type RandomPoissonAttr func(optionalAttr)
-
-// RandomPoissonSeed sets the optional seed attribute to value.
-// If not specified, defaults to 0
-func RandomPoissonSeed(value int64) RandomPoissonAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
+// MatrixSolveLsAttr is an optional argument to MatrixSolveLs.
+type MatrixSolveLsAttr func(optionalAttr)
 
-// RandomPoissonSeed2 sets the optional seed2 attribute to value.
-// If not specified, defaults to 0
-func RandomPoissonSeed2(value int64) RandomPoissonAttr {
+// MatrixSolveLsFast sets the optional fast attribute to value.
+// If not specified, defaults to true
+func MatrixSolveLsFast(value bool) MatrixSolveLsAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["fast"] = value
 	}
 }
 
-// Use RandomPoissonV2 instead.
+// Solves one or more linear least-squares problems.
 //
-// DEPRECATED at GraphDef version 25: Replaced by RandomPoissonV2
-func RandomPoisson(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonAttr) (output tf.Output) {
+// `matrix` is a tensor of shape `[..., M, N]` whose inner-most 2 dimensions
+// form real or complex matrices of size `[M, N]`. `Rhs` is a tensor of the same
+// type as `matrix` and shape `[..., M, K]`.
+// The output is a tensor shape `[..., N, K]` where each output matrix solves
+// each of the equations
+// `matrix[..., :, :]` * `output[..., :, :]` = `rhs[..., :, :]`
+// in the least squares sense.
+//
+// We use the following notation for (complex) matrix and right-hand sides
+// in the batch:
+//
+// `matrix`=\\(A \in \mathbb{C}^{m \times n}\\),
+// `rhs`=\\(B  \in \mathbb{C}^{m \times k}\\),
+// `output`=\\(X  \in \mathbb{C}^{n \times k}\\),
+// `l2_regularizer`=\\(\lambda \in \mathbb{R}\\).
+//
+// If `fast` is `True`, then the solution is computed by solving the normal
+// equations using Cholesky decomposition. Specifically, if \\(m \ge n\\) then
+// \\(X = (A^H A + \lambda I)^{-1} A^H B\\), which solves the least-squares
+// problem \\(X = \mathrm{argmin}_{Z \in \Re^{n \times k} } ||A Z - B||_F^2 + \lambda ||Z||_F^2\\).
+// If \\(m \lt n\\) then `output` is computed as
+// \\(X = A^H (A A^H + \lambda I)^{-1} B\\), which (for \\(\lambda = 0\\)) is the
+// minimum-norm solution to the under-determined linear system, i.e.
+// \\(X = \mathrm{argmin}_{Z \in \mathbb{C}^{n \times k} } ||Z||_F^2 \\),
+// subject to \\(A Z = B\\). Notice that the fast path is only numerically stable
+// when \\(A\\) is numerically full rank and has a condition number
+// \\(\mathrm{cond}(A) \lt \frac{1}{\sqrt{\epsilon_{mach} } }\\) or \\(\lambda\\) is
+// sufficiently large.
+//
+// If `fast` is `False` an algorithm based on the numerically robust complete
+// orthogonal decomposition is used. This computes the minimum-norm
+// least-squares solution, even when \\(A\\) is rank deficient. This path is
+// typically 6-7 times slower than the fast path. If `fast` is `False` then
+// `l2_regularizer` is ignored.
+//
+// Arguments:
+//	matrix: Shape is `[..., M, N]`.
+//	rhs: Shape is `[..., M, K]`.
+//	l2_regularizer: Scalar tensor.
+//
+// @compatibility(numpy)
+// Equivalent to np.linalg.lstsq
+// @end_compatibility
+//
+// Returns Shape is `[..., N, K]`.
+func MatrixSolveLs(scope *Scope, matrix tf.Output, rhs tf.Output, l2_regularizer tf.Output, optional ...MatrixSolveLsAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -18013,9 +17744,9 @@ func RandomPoisson(scope *Scope, shape tf.Output, rate tf.Output, optional ...Ra
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomPoisson",
+		Type: "MatrixSolveLs",
 		Input: []tf.Input{
-			shape, rate,
+			matrix, rhs, l2_regularizer,
 		},
 		Attrs: attrs,
 	}
@@ -18023,118 +17754,89 @@ func RandomPoisson(scope *Scope, shape tf.Output, rate tf.Output, optional ...Ra
 	return op.Output(0)
 }
 
-// Returns the max of x and y (i.e. x > y ? x : y) element-wise.
+// MaxPool3DAttr is an optional argument to MaxPool3D.
+type MaxPool3DAttr func(optionalAttr)
+
+// MaxPool3DDataFormat sets the optional data_format attribute to value.
 //
-// *NOTE*: `Maximum` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Maximum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Maximum",
-		Input: []tf.Input{
-			x, y,
-		},
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func MaxPool3DDataFormat(value string) MaxPool3DAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes softmax cross entropy cost and gradients to backpropagate.
-//
-// Inputs are the logits, not probabilities.
+// Performs 3D max pooling on the input.
 //
 // Arguments:
-//	features: batch_size x num_classes matrix
-//	labels: batch_size x num_classes matrix
-// The caller must ensure that each batch of labels represents a valid
-// probability distribution.
+//	input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
 //
-// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix).
-func SoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) {
+// Returns The max pooled output tensor.
+func MaxPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SoftmaxCrossEntropyWithLogits",
+		Type: "MaxPool3D",
 		Input: []tf.Input{
-			features, labels,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// ReduceJoinAttr is an optional argument to ReduceJoin.
-type ReduceJoinAttr func(optionalAttr)
-
-// ReduceJoinKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If `True`, retain reduced dimensions with length `1`.
-// If not specified, defaults to false
-func ReduceJoinKeepDims(value bool) ReduceJoinAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
+// Conv3DBackpropInputAttr is an optional argument to Conv3DBackpropInput.
+type Conv3DBackpropInputAttr func(optionalAttr)
 
-// ReduceJoinSeparator sets the optional separator attribute to value.
-//
-// value: The separator to use when joining.
-// If not specified, defaults to ""
-func ReduceJoinSeparator(value string) ReduceJoinAttr {
+// Conv3DBackpropInputDilations sets the optional dilations attribute to value.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
+func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr {
 	return func(m optionalAttr) {
-		m["separator"] = value
+		m["dilations"] = value
 	}
 }
 
-// Joins a string Tensor across the given dimensions.
-//
-// Computes the string join across dimensions in the given string Tensor of shape
-// `[\\(d_0, d_1, ..., d_{n-1}\\)]`.  Returns a new Tensor created by joining the input
-// strings with the given separator (default: empty string).  Negative indices are
-// counted backwards from the end, with `-1` being equivalent to `n - 1`.  If
-// indices are not specified, joins across all dimensions beginning from `n - 1`
-// through `0`.
-//
-// For example:
+// Computes the gradients of 3-D convolution with respect to the input.
 //
-// ```python
-// # tensor `a` is [["a", "b"], ["c", "d"]]
-// tf.reduce_join(a, 0) ==> ["ac", "bd"]
-// tf.reduce_join(a, 1) ==> ["ab", "cd"]
-// tf.reduce_join(a, -2) = tf.reduce_join(a, 0) ==> ["ac", "bd"]
-// tf.reduce_join(a, -1) = tf.reduce_join(a, 1) ==> ["ab", "cd"]
-// tf.reduce_join(a, 0, keep_dims=True) ==> [["ac", "bd"]]
-// tf.reduce_join(a, 1, keep_dims=True) ==> [["ab"], ["cd"]]
-// tf.reduce_join(a, 0, separator=".") ==> ["a.c", "b.d"]
-// tf.reduce_join(a, [0, 1]) ==> "acbd"
-// tf.reduce_join(a, [1, 0]) ==> "abcd"
-// tf.reduce_join(a, []) ==> [["a", "b"], ["c", "d"]]
-// tf.reduce_join(a) = tf.reduce_join(a, [1, 0]) ==> "abcd"
-// ```
+// DEPRECATED at GraphDef version 10: Use Conv3DBackpropInputV2
 //
 // Arguments:
-//	inputs: The input to be joined.  All reduced indices must have non-zero size.
-//	reduction_indices: The dimensions to reduce over.  Dimensions are reduced in the
-// order specified.  Omitting `reduction_indices` is equivalent to passing
-// `[n-1, n-2, ..., 0]`.  Negative indices from `-n` to `-1` are supported.
-//
-// Returns Has shape equal to that of the input with reduced dimensions removed or
-// set to `1` depending on `keep_dims`.
-func ReduceJoin(scope *Scope, inputs tf.Output, reduction_indices tf.Output, optional ...ReduceJoinAttr) (output tf.Output) {
+//	input: Shape `[batch, depth, rows, cols, in_channels]`.
+//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
+// `in_channels` must match between `input` and `filter`.
+//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
+// out_channels]`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropInputAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ReduceJoin",
+		Type: "Conv3DBackpropInput",
 		Input: []tf.Input{
-			inputs, reduction_indices,
+			input, filter, out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -18142,241 +17844,185 @@ func ReduceJoin(scope *Scope, inputs tf.Output, reduction_indices tf.Output, opt
 	return op.Output(0)
 }
 
-// Computes cos of x element-wise.
-func Cos(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Cos",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// FusedBatchNormGradAttr is an optional argument to FusedBatchNormGrad.
-type FusedBatchNormGradAttr func(optionalAttr)
-
-// FusedBatchNormGradEpsilon sets the optional epsilon attribute to value.
-//
-// value: A small float number added to the variance of x.
-// If not specified, defaults to 0.0001
-func FusedBatchNormGradEpsilon(value float32) FusedBatchNormGradAttr {
-	return func(m optionalAttr) {
-		m["epsilon"] = value
-	}
-}
+// DepthwiseConv2dNativeBackpropFilterAttr is an optional argument to DepthwiseConv2dNativeBackpropFilter.
+type DepthwiseConv2dNativeBackpropFilterAttr func(optionalAttr)
 
-// FusedBatchNormGradDataFormat sets the optional data_format attribute to value.
+// DepthwiseConv2dNativeBackpropFilterDataFormat sets the optional data_format attribute to value.
 //
-// value: The data format for y_backprop, x, x_backprop.
-// Either "NHWC" (default) or "NCHW".
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, height, width, channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, channels, height, width].
 // If not specified, defaults to "NHWC"
-func FusedBatchNormGradDataFormat(value string) FusedBatchNormGradAttr {
+func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2dNativeBackpropFilterAttr {
 	return func(m optionalAttr) {
 		m["data_format"] = value
 	}
 }
 
-// FusedBatchNormGradIsTraining sets the optional is_training attribute to value.
+// DepthwiseConv2dNativeBackpropFilterDilations sets the optional dilations attribute to value.
 //
-// value: A bool value to indicate the operation is for training (default)
-// or inference.
-// If not specified, defaults to true
-func FusedBatchNormGradIsTraining(value bool) FusedBatchNormGradAttr {
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr {
 	return func(m optionalAttr) {
-		m["is_training"] = value
+		m["dilations"] = value
 	}
 }
 
-// Gradient for batch normalization.
-//
-// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-// The size of 1D Tensors matches the dimension C of the 4D Tensors.
+// Computes the gradients of depthwise convolution with respect to the filter.
 //
 // Arguments:
-//	y_backprop: A 4D Tensor for the gradient with respect to y.
-//	x: A 4D Tensor for input data.
-//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
-//	reserve_space_1: When is_training is True, a 1D Tensor for the computed batch
-// mean to be reused in gradient computation. When is_training is
-// False, a 1D Tensor for the population mean to be reused in both
-// 1st and 2nd order gradient computation.
-//	reserve_space_2: When is_training is True, a 1D Tensor for the computed batch
-// variance (inverted variance in the cuDNN case) to be reused in
-// gradient computation. When is_training is False, a 1D Tensor
-// for the population variance to be reused in both 1st and 2nd
-// order gradient computation.
+//	input: 4-D with shape based on `data_format`.  For example, if
+// `data_format` is 'NHWC' then `input` is a 4-D `[batch, in_height,
+// in_width, in_channels]` tensor.
+//	filter_sizes: An integer vector representing the tensor shape of `filter`,
+// where `filter` is a 4-D
+// `[filter_height, filter_width, in_channels, depthwise_multiplier]` tensor.
+//	out_backprop: 4-D with shape  based on `data_format`.
+// For example, if `data_format` is 'NHWC' then
+// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
+// Gradients w.r.t. the output of the convolution.
+//	strides: The stride of the sliding window for each dimension of the input
+// of the convolution.
+//	padding: The type of padding algorithm to use.
 //
-// Returns A 4D Tensor for the gradient with respect to x.A 1D Tensor for the gradient with respect to scale.A 1D Tensor for the gradient with respect to offset.Unused placeholder to match the mean input in FusedBatchNorm.Unused placeholder to match the variance input
-// in FusedBatchNorm.
-func FusedBatchNormGrad(scope *Scope, y_backprop tf.Output, x tf.Output, scale tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output, optional ...FusedBatchNormGradAttr) (x_backprop tf.Output, scale_backprop tf.Output, offset_backprop tf.Output, reserve_space_3 tf.Output, reserve_space_4 tf.Output) {
+// Returns 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
+// the `filter` input of the convolution.
+func DepthwiseConv2dNativeBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropFilterAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FusedBatchNormGrad",
+		Type: "DepthwiseConv2dNativeBackpropFilter",
 		Input: []tf.Input{
-			y_backprop, x, scale, reserve_space_1, reserve_space_2,
+			input, filter_sizes, out_backprop,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
-}
-
-// TopKAttr is an optional argument to TopK.
-type TopKAttr func(optionalAttr)
-
-// TopKSorted sets the optional sorted attribute to value.
-//
-// value: If true the resulting `k` elements will be sorted by the values in
-// descending order.
-// If not specified, defaults to true
-func TopKSorted(value bool) TopKAttr {
-	return func(m optionalAttr) {
-		m["sorted"] = value
-	}
+	return op.Output(0)
 }
 
-// Finds values and indices of the `k` largest elements for the last dimension.
-//
-// DEPRECATED at GraphDef version 7: Use TopKV2 instead
-//
-// If the input is a vector (rank-1), finds the `k` largest entries in the vector
-// and outputs their values and indices as vectors.  Thus `values[j]` is the
-// `j`-th largest entry in `input`, and its index is `indices[j]`.
-//
-// For matrices (resp. higher rank input), computes the top `k` entries in each
-// row (resp. vector along the last dimension).  Thus,
-//
-//     values.shape = indices.shape = input.shape[:-1] + [k]
+// Converts each string in the input Tensor to its hash mod by a number of buckets.
 //
-// If two elements are equal, the lower-index element appears first.
+// The hash function is deterministic on the content of the string within the
+// process. The hash function is a keyed hash function, where attribute `key`
+// defines the key of the hash function. `key` is an array of 2 elements.
 //
-// If `k` varies dynamically, use `TopKV2` below.
+// A strong hash is important when inputs may be malicious, e.g. URLs with
+// additional components. Adversaries could try to make their inputs hash to the
+// same bucket for a denial-of-service attack or to skew the results. A strong
+// hash prevents this by making it difficult, if not infeasible, to compute inputs
+// that hash to the same bucket. This comes at a cost of roughly 4x higher compute
+// time than `tf.string_to_hash_bucket_fast`.
 //
 // Arguments:
-//	input: 1-D or higher with last dimension at least `k`.
-//	k: Number of top elements to look for along the last dimension (along each
-// row for matrices).
+//	input: The strings to assign a hash bucket.
+//	num_buckets: The number of buckets.
+//	key: The key for the keyed hash function passed as a list of two uint64
+// elements.
 //
-// Returns The `k` largest elements along each last dimensional slice.The indices of `values` within the last dimension of `input`.
-func TopK(scope *Scope, input tf.Output, k int64, optional ...TopKAttr) (values tf.Output, indices tf.Output) {
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToHashBucketStrong(scope *Scope, input tf.Output, num_buckets int64, key []int64) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"k": k}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"num_buckets": num_buckets, "key": key}
 	opspec := tf.OpSpec{
-		Type: "TopK",
+		Type: "StringToHashBucketStrong",
 		Input: []tf.Input{
 			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Returns a list of tensors with the same shapes and contents as the input
+// StringLengthAttr is an optional argument to StringLength.
+type StringLengthAttr func(optionalAttr)
+
+// StringLengthUnit sets the optional unit attribute to value.
 //
-// tensors.
+// value: The unit that is counted to compute string length.  One of: `"BYTE"` (for
+// the number of bytes in each string) or `"UTF8_CHAR"` (for the number of UTF-8
+// encoded Unicode code points in each string).  Results are undefined
+// if `unit=UTF8_CHAR` and the `input` strings do not contain structurally
+// valid UTF-8.
+// If not specified, defaults to "BYTE"
+func StringLengthUnit(value string) StringLengthAttr {
+	return func(m optionalAttr) {
+		m["unit"] = value
+	}
+}
+
+// String lengths of `input`.
 //
-// This op can be used to override the gradient for complicated functions. For
-// example, suppose y = f(x) and we wish to apply a custom function g for backprop
-// such that dx = g(dy). In Python,
+// Computes the length of each string given in the input tensor.
 //
-// ```python
-// with tf.get_default_graph().gradient_override_map(
-//     {'IdentityN': 'OverrideGradientWithG'}):
-//   y, _ = identity_n([f(x), x])
+// Arguments:
+//	input: The string for which to compute the length.
 //
-// @tf.RegisterGradient('OverrideGradientWithG')
-// def ApplyG(op, dy, _):
-//   return [None, g(dy)]  # Do not backprop to f(x).
-// ```
-func IdentityN(scope *Scope, input []tf.Output) (output []tf.Output) {
+// Returns Integer tensor that has the same shape as `input`. The output contains the
+// element-wise string lengths of `input`.
+func StringLength(scope *Scope, input tf.Output, optional ...StringLengthAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "IdentityN",
+		Type: "StringLength",
 		Input: []tf.Input{
-			tf.OutputList(input),
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("IdentityN", err)
-		return
-	}
-	return output
+	return op.Output(0)
 }
 
-// ResourceApplyCenteredRMSPropAttr is an optional argument to ResourceApplyCenteredRMSProp.
-type ResourceApplyCenteredRMSPropAttr func(optionalAttr)
+// ResourceApplyProximalGradientDescentAttr is an optional argument to ResourceApplyProximalGradientDescent.
+type ResourceApplyProximalGradientDescentAttr func(optionalAttr)
 
-// ResourceApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
+// ResourceApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value.
 //
-// value: If `True`, updating of the var, mg, ms, and mom tensors is
-// protected by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
+// value: If True, the subtraction will be protected by a lock;
+// otherwise the behavior is undefined, but may exhibit less contention.
 // If not specified, defaults to false
-func ResourceApplyCenteredRMSPropUseLocking(value bool) ResourceApplyCenteredRMSPropAttr {
+func ResourceApplyProximalGradientDescentUseLocking(value bool) ResourceApplyProximalGradientDescentAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Update '*var' according to the centered RMSProp algorithm.
-//
-// The centered RMSProp algorithm uses an estimate of the centered second moment
-// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
-// uses the (uncentered) second moment. This often helps with training, but is
-// slightly more expensive in terms of computation and memory.
-//
-// Note that in dense implementation of this algorithm, mg, ms, and mom will
-// update even if the grad is zero, but in this sparse implementation, mg, ms,
-// and mom will not update in iterations during which the grad is zero.
-//
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// mean_grad = decay * mean_grad + (1-decay) * gradient
-//
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
+// Update '*var' as FOBOS algorithm with fixed learning rate.
 //
-// mg <- rho * mg_{t-1} + (1-rho) * grad
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon)
-// var <- var - mom
+// prox_v = var - alpha * delta
+// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
 //
 // Arguments:
 //	var_: Should be from a Variable().
-//	mg: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
-//
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
+//	alpha: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	delta: The change.
 //
 // Returns the created operation.
-func ResourceApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyCenteredRMSPropAttr) (o *tf.Operation) {
+func ResourceApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, delta tf.Output, optional ...ResourceApplyProximalGradientDescentAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -18385,64 +18031,68 @@ func ResourceApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyCenteredRMSProp",
+		Type: "ResourceApplyProximalGradientDescent",
 		Input: []tf.Input{
-			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad,
+			var_, alpha, l1, l2, delta,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// AudioSummaryAttr is an optional argument to AudioSummary.
-type AudioSummaryAttr func(optionalAttr)
-
-// AudioSummaryMaxOutputs sets the optional max_outputs attribute to value.
+// Returns 0 if the denominator is zero.
 //
-// value: Max number of batch elements to generate audio for.
-// If not specified, defaults to 3
 //
-// REQUIRES: value >= 1
-func AudioSummaryMaxOutputs(value int64) AudioSummaryAttr {
-	return func(m optionalAttr) {
-		m["max_outputs"] = value
+// *NOTE*: `DivNoNan` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func DivNoNan(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "DivNoNan",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Outputs a `Summary` protocol buffer with audio.
-//
-// DEPRECATED at GraphDef version 15: Use AudioSummaryV2.
-//
-// The summary has up to `max_outputs` summary values containing audio. The
-// audio is built from `tensor` which must be 3-D with shape `[batch_size,
-// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
-// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
-//
-// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-// build the `tag` of the summary values:
+// Computes the gradient for the sqrt of `x` wrt its input.
 //
-// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
-// *  If `max_outputs` is greater than 1, the summary value tags are
-//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
+// Specifically, `grad = dy * 0.5 / y`, where `y = sqrt(x)`, and `dy`
+// is the corresponding input gradient.
+func SqrtGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SqrtGrad",
+		Input: []tf.Input{
+			y, dy,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Get the value of the tensor specified by its handle.
 //
 // Arguments:
-//	tag: Scalar. Used to build the `tag` attribute of the summary values.
-//	tensor: 2-D of shape `[batch_size, frames]`.
-//	sample_rate: The sample rate of the signal in hertz.
+//	handle: The handle for a tensor stored in the session state.
+//	dtype: The type of the output value.
 //
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func AudioSummary(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate float32, optional ...AudioSummaryAttr) (summary tf.Output) {
+// Returns The tensor for the given handle.
+func GetSessionTensor(scope *Scope, handle tf.Output, dtype tf.DataType) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"sample_rate": sample_rate}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "AudioSummary",
+		Type: "GetSessionTensor",
 		Input: []tf.Input{
-			tag, tensor,
+			handle,
 		},
 		Attrs: attrs,
 	}
@@ -18450,42 +18100,47 @@ func AudioSummary(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate flo
 	return op.Output(0)
 }
 
-// QrAttr is an optional argument to Qr.
-type QrAttr func(optionalAttr)
-
-// QrFullMatrices sets the optional full_matrices attribute to value.
+// Returns x - y element-wise.
 //
-// value: If true, compute full-sized `q` and `r`. If false
-// (the default), compute only the leading `P` columns of `q`.
-// If not specified, defaults to false
-func QrFullMatrices(value bool) QrAttr {
+// *NOTE*: `Subtract` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Sub(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Sub",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// RandomPoissonAttr is an optional argument to RandomPoisson.
+type RandomPoissonAttr func(optionalAttr)
+
+// RandomPoissonSeed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func RandomPoissonSeed(value int64) RandomPoissonAttr {
 	return func(m optionalAttr) {
-		m["full_matrices"] = value
+		m["seed"] = value
 	}
 }
 
-// Computes the QR decompositions of one or more matrices.
-//
-// Computes the QR decomposition of each inner matrix in `tensor` such that
-// `tensor[..., :, :] = q[..., :, :] * r[..., :,:])`
-//
-// ```python
-// # a is a tensor.
-// # q is a tensor of orthonormal matrices.
-// # r is a tensor of upper triangular matrices.
-// q, r = qr(a)
-// q_full, r_full = qr(a, full_matrices=True)
-// ```
-//
-// Arguments:
-//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
-// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
+// RandomPoissonSeed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func RandomPoissonSeed2(value int64) RandomPoissonAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Use RandomPoissonV2 instead.
 //
-// Returns Orthonormal basis for range of `a`. If `full_matrices` is `False` then
-// shape is `[..., M, P]`; if `full_matrices` is `True` then shape is
-// `[..., M, M]`.Triangular factor. If `full_matrices` is `False` then shape is
-// `[..., P, N]`. If `full_matrices` is `True` then shape is `[..., M, N]`.
-func Qr(scope *Scope, input tf.Output, optional ...QrAttr) (q tf.Output, r tf.Output) {
+// DEPRECATED at GraphDef version 25: Replaced by RandomPoissonV2
+func RandomPoisson(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -18494,93 +18149,117 @@ func Qr(scope *Scope, input tf.Output, optional ...QrAttr) (q tf.Output, r tf.Ou
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Qr",
+		Type: "RandomPoisson",
 		Input: []tf.Input{
-			input,
+			shape, rate,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Records the bytes size of each element of `input_dataset` in a StatsAggregator.
-func BytesProducedStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns the max of x and y (i.e. x > y ? x : y) element-wise.
+//
+// *NOTE*: `Maximum` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Maximum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "BytesProducedStatsDataset",
+		Type: "Maximum",
 		Input: []tf.Input{
-			input_dataset, tag,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Check if the input matches the regex pattern.
-//
-// The input is a string tensor of any shape. The pattern is the
-// regular expression to be matched with every element of the input tensor.
-// The boolean values (True or False) of the output tensor indicate
-// if the input matches the regex pattern provided.
+// Computes softmax cross entropy cost and gradients to backpropagate.
 //
-// The pattern follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)
+// Inputs are the logits, not probabilities.
 //
 // Arguments:
-//	input: A string tensor of the text to be processed.
-//	pattern: The regular expression to match the input.
+//	features: batch_size x num_classes matrix
+//	labels: batch_size x num_classes matrix
+// The caller must ensure that each batch of labels represents a valid
+// probability distribution.
 //
-// Returns A bool tensor with the same shape as `input`.
-func StaticRegexFullMatch(scope *Scope, input tf.Output, pattern string) (output tf.Output) {
+// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix).
+func SoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"pattern": pattern}
 	opspec := tf.OpSpec{
-		Type: "StaticRegexFullMatch",
+		Type: "SoftmaxCrossEntropyWithLogits",
 		Input: []tf.Input{
-			input,
+			features, labels,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// ResourceSparseApplyProximalGradientDescentAttr is an optional argument to ResourceSparseApplyProximalGradientDescent.
-type ResourceSparseApplyProximalGradientDescentAttr func(optionalAttr)
+// ReduceJoinAttr is an optional argument to ReduceJoin.
+type ReduceJoinAttr func(optionalAttr)
 
-// ResourceSparseApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value.
+// ReduceJoinKeepDims sets the optional keep_dims attribute to value.
 //
-// value: If True, the subtraction will be protected by a lock;
-// otherwise the behavior is undefined, but may exhibit less contention.
+// value: If `True`, retain reduced dimensions with length `1`.
 // If not specified, defaults to false
-func ResourceSparseApplyProximalGradientDescentUseLocking(value bool) ResourceSparseApplyProximalGradientDescentAttr {
+func ReduceJoinKeepDims(value bool) ReduceJoinAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Sparse update '*var' as FOBOS algorithm with fixed learning rate.
+// ReduceJoinSeparator sets the optional separator attribute to value.
 //
-// That is for rows we have grad for, we update var as follows:
-// prox_v = var - alpha * grad
-// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
+// value: The separator to use when joining.
+// If not specified, defaults to ""
+func ReduceJoinSeparator(value string) ReduceJoinAttr {
+	return func(m optionalAttr) {
+		m["separator"] = value
+	}
+}
+
+// Joins a string Tensor across the given dimensions.
 //
-// Arguments:
-//	var_: Should be from a Variable().
-//	alpha: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
+// Computes the string join across dimensions in the given string Tensor of shape
+// `[\\(d_0, d_1, ..., d_{n-1}\\)]`.  Returns a new Tensor created by joining the input
+// strings with the given separator (default: empty string).  Negative indices are
+// counted backwards from the end, with `-1` being equivalent to `n - 1`.  If
+// indices are not specified, joins across all dimensions beginning from `n - 1`
+// through `0`.
+//
+// For example:
+//
+// ```python
+// # tensor `a` is [["a", "b"], ["c", "d"]]
+// tf.reduce_join(a, 0) ==> ["ac", "bd"]
+// tf.reduce_join(a, 1) ==> ["ab", "cd"]
+// tf.reduce_join(a, -2) = tf.reduce_join(a, 0) ==> ["ac", "bd"]
+// tf.reduce_join(a, -1) = tf.reduce_join(a, 1) ==> ["ab", "cd"]
+// tf.reduce_join(a, 0, keep_dims=True) ==> [["ac", "bd"]]
+// tf.reduce_join(a, 1, keep_dims=True) ==> [["ab"], ["cd"]]
+// tf.reduce_join(a, 0, separator=".") ==> ["a.c", "b.d"]
+// tf.reduce_join(a, [0, 1]) ==> "acbd"
+// tf.reduce_join(a, [1, 0]) ==> "abcd"
+// tf.reduce_join(a, []) ==> [["a", "b"], ["c", "d"]]
+// tf.reduce_join(a) = tf.reduce_join(a, [1, 0]) ==> "abcd"
+// ```
 //
-// Returns the created operation.
-func ResourceSparseApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalGradientDescentAttr) (o *tf.Operation) {
+// Arguments:
+//	inputs: The input to be joined.  All reduced indices must have non-zero size.
+//	reduction_indices: The dimensions to reduce over.  Dimensions are reduced in the
+// order specified.  Omitting `reduction_indices` is equivalent to passing
+// `[n-1, n-2, ..., 0]`.  Negative indices from `-n` to `-1` are supported.
+//
+// Returns Has shape equal to that of the input with reduced dimensions removed or
+// set to `1` depending on `keep_dims`.
+func ReduceJoin(scope *Scope, inputs tf.Output, reduction_indices tf.Output, optional ...ReduceJoinAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -18589,84 +18268,72 @@ func ResourceSparseApplyProximalGradientDescent(scope *Scope, var_ tf.Output, al
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyProximalGradientDescent",
+		Type: "ReduceJoin",
 		Input: []tf.Input{
-			var_, alpha, l1, l2, grad, indices,
+			inputs, reduction_indices,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
-}
-
-// Real-valued fast Fourier transform.
-//
-// Computes the 1-dimensional discrete Fourier transform of a real-valued signal
-// over the inner-most dimension of `input`.
-//
-// Since the DFT of a real signal is Hermitian-symmetric, `RFFT` only returns the
-// `fft_length / 2 + 1` unique components of the FFT: the zero-frequency term,
-// followed by the `fft_length / 2` positive-frequency terms.
-//
-// Along the axis `RFFT` is computed on, if `fft_length` is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
-//
-// Arguments:
-//	input: A float32 tensor.
-//	fft_length: An int32 tensor of shape [1]. The FFT length.
-//
-// Returns A complex64 tensor of the same rank as `input`. The inner-most
-//   dimension of `input` is replaced with the `fft_length / 2 + 1` unique
-//   frequency components of its 1D Fourier transform.
-//
-// @compatibility(numpy)
-// Equivalent to np.fft.rfft
-// @end_compatibility
-func RFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "RFFT",
-		Input: []tf.Input{
-			input, fft_length,
-		},
-	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QuantizedReluAttr is an optional argument to QuantizedRelu.
-type QuantizedReluAttr func(optionalAttr)
+// DenseToSparseSetOperationAttr is an optional argument to DenseToSparseSetOperation.
+type DenseToSparseSetOperationAttr func(optionalAttr)
 
-// QuantizedReluOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QUINT8
-func QuantizedReluOutType(value tf.DataType) QuantizedReluAttr {
+// DenseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func DenseToSparseSetOperationValidateIndices(value bool) DenseToSparseSetOperationAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["validate_indices"] = value
 	}
 }
 
-// Computes Quantized Rectified Linear: `max(features, 0)`
+// Applies set operation along last dimension of `Tensor` and `SparseTensor`.
+//
+// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
+//
+// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
+// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
+// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
+// ignored.
+//
+// If `validate_indices` is `True`, this op validates the order and range of `set2`
+// indices.
+//
+// Output `result` is a `SparseTensor` represented by `result_indices`,
+// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
+// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
+// dimension contains the result of `set_operation` applied to the corresponding
+// `[0...n-1]` dimension of `set`.
 //
 // Arguments:
+//	set1: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.
+// Dimension `n` contains values in a set, duplicates are allowed but ignored.
+//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
+// be the same as the 1st `n-1` dimensions of `set1`, `result_shape[n]` is the
+// max set size across `n-1` dimensions.
 //
-//	min_features: The float value that the lowest quantized value represents.
-//	max_features: The float value that the highest quantized value represents.
 //
-// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
-func QuantizedRelu(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
+// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
+// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
+// is the max result set size across all `0...n-1` dimensions.
+func DenseToSparseSetOperation(scope *Scope, set1 tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...DenseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"set_operation": set_operation}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedRelu",
+		Type: "DenseToSparseSetOperation",
 		Input: []tf.Input{
-			features, min_features, max_features,
+			set1, set2_indices, set2_values, set2_shape,
 		},
 		Attrs: attrs,
 	}
@@ -18674,145 +18341,102 @@ func QuantizedRelu(scope *Scope, features tf.Output, min_features tf.Output, max
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Reshapes a SparseTensor to represent values in a new dense shape.
-//
-// This operation has the same semantics as reshape on the represented dense
-// tensor.  The `input_indices` are recomputed based on the requested `new_shape`.
-//
-// If one component of `new_shape` is the special value -1, the size of that
-// dimension is computed so that the total dense size remains constant.  At
-// most one component of `new_shape` can be -1.  The number of dense elements
-// implied by `new_shape` must be the same as the number of dense elements
-// originally implied by `input_shape`.
+// L2 Loss.
 //
-// Reshaping does not affect the order of values in the SparseTensor.
+// Computes half the L2 norm of a tensor without the `sqrt`:
 //
-// If the input tensor has rank `R_in` and `N` non-empty values, and `new_shape`
-// has length `R_out`, then `input_indices` has shape `[N, R_in]`,
-// `input_shape` has length `R_in`, `output_indices` has shape `[N, R_out]`, and
-// `output_shape` has length `R_out`.
+//     output = sum(t ** 2) / 2
 //
 // Arguments:
-//	input_indices: 2-D.  `N x R_in` matrix with the indices of non-empty values in a
-// SparseTensor.
-//	input_shape: 1-D.  `R_in` vector with the input SparseTensor's dense shape.
-//	new_shape: 1-D.  `R_out` vector with the requested new dense shape.
+//	t: Typically 2-D, but may have any dimensions.
 //
-// Returns 2-D.  `N x R_out` matrix with the updated indices of non-empty
-// values in the output SparseTensor.1-D.  `R_out` vector with the full dense shape of the output
-// SparseTensor.  This is the same as `new_shape` but with any -1 dimensions
-// filled in.
-func SparseReshape(scope *Scope, input_indices tf.Output, input_shape tf.Output, new_shape tf.Output) (output_indices tf.Output, output_shape tf.Output) {
+// Returns 0-D.
+func L2Loss(scope *Scope, t tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseReshape",
+		Type: "L2Loss",
 		Input: []tf.Input{
-			input_indices, input_shape, new_shape,
+			t,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Deprecated. Use TensorArraySplitV3
-//
-// DEPRECATED at GraphDef version 26: Use TensorArraySplitV3
-func TensorArraySplitV2(scope *Scope, handle tf.Output, value tf.Output, lengths tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+// Computes cos of x element-wise.
+func Cos(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArraySplitV2",
+		Type: "Cos",
 		Input: []tf.Input{
-			handle, value, lengths, flow_in,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Reorders a SparseTensor into the canonical, row-major ordering.
-//
-// Note that by convention, all sparse ops preserve the canonical ordering along
-// increasing dimension number. The only time ordering can be violated is during
-// manual manipulation of the indices and values vectors to add entries.
-//
-// Reordering does not affect the shape of the SparseTensor.
-//
-// If the tensor has rank `R` and `N` non-empty values, `input_indices` has
-// shape `[N, R]`, input_values has length `N`, and input_shape has length `R`.
-//
-// Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
+// FusedBatchNormGradAttr is an optional argument to FusedBatchNormGrad.
+type FusedBatchNormGradAttr func(optionalAttr)
+
+// FusedBatchNormGradEpsilon sets the optional epsilon attribute to value.
 //
-// Returns 2-D.  `N x R` matrix with the same indices as input_indices, but
-// in canonical row-major ordering.1-D.  `N` non-empty values corresponding to `output_indices`.
-func SparseReorder(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseReorder",
-		Input: []tf.Input{
-			input_indices, input_values, input_shape,
-		},
+// value: A small float number added to the variance of x.
+// If not specified, defaults to 0.0001
+func FusedBatchNormGradEpsilon(value float32) FusedBatchNormGradAttr {
+	return func(m optionalAttr) {
+		m["epsilon"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
 }
 
-// Computes rectified linear: `max(features, 0)`.
-func Relu(scope *Scope, features tf.Output) (activations tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Relu",
-		Input: []tf.Input{
-			features,
-		},
+// FusedBatchNormGradDataFormat sets the optional data_format attribute to value.
+//
+// value: The data format for y_backprop, x, x_backprop.
+// Either "NHWC" (default) or "NCHW".
+// If not specified, defaults to "NHWC"
+func FusedBatchNormGradDataFormat(value string) FusedBatchNormGradAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// ResourceApplyAddSignAttr is an optional argument to ResourceApplyAddSign.
-type ResourceApplyAddSignAttr func(optionalAttr)
-
-// ResourceApplyAddSignUseLocking sets the optional use_locking attribute to value.
+// FusedBatchNormGradIsTraining sets the optional is_training attribute to value.
 //
-// value: If `True`, updating of the var and m tensors is
-// protected by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyAddSignUseLocking(value bool) ResourceApplyAddSignAttr {
+// value: A bool value to indicate the operation is for training (default)
+// or inference.
+// If not specified, defaults to true
+func FusedBatchNormGradIsTraining(value bool) FusedBatchNormGradAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["is_training"] = value
 	}
 }
 
-// Update '*var' according to the AddSign update.
+// Gradient for batch normalization.
 //
-// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
-// update <- (alpha + sign_decay * sign(g) *sign(m)) * g
-// variable <- variable - lr_t * update
+// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
+// The size of 1D Tensors matches the dimension C of the 4D Tensors.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	m: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	alpha: Must be a scalar.
-//	sign_decay: Must be a scalar.
-//	beta: Must be a scalar.
-//	grad: The gradient.
+//	y_backprop: A 4D Tensor for the gradient with respect to y.
+//	x: A 4D Tensor for input data.
+//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
+//	reserve_space_1: When is_training is True, a 1D Tensor for the computed batch
+// mean to be reused in gradient computation. When is_training is
+// False, a 1D Tensor for the population mean to be reused in both
+// 1st and 2nd order gradient computation.
+//	reserve_space_2: When is_training is True, a 1D Tensor for the computed batch
+// variance (inverted variance in the cuDNN case) to be reused in
+// gradient computation. When is_training is False, a 1D Tensor
+// for the population variance to be reused in both 1st and 2nd
+// order gradient computation.
 //
-// Returns the created operation.
-func ResourceApplyAddSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, alpha tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyAddSignAttr) (o *tf.Operation) {
+// Returns A 4D Tensor for the gradient with respect to x.A 1D Tensor for the gradient with respect to scale.A 1D Tensor for the gradient with respect to offset.Unused placeholder to match the mean input in FusedBatchNorm.Unused placeholder to match the variance input
+// in FusedBatchNorm.
+func FusedBatchNormGrad(scope *Scope, y_backprop tf.Output, x tf.Output, scale tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output, optional ...FusedBatchNormGradAttr) (x_backprop tf.Output, scale_backprop tf.Output, offset_backprop tf.Output, reserve_space_3 tf.Output, reserve_space_4 tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -18821,154 +18445,193 @@ func ResourceApplyAddSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Outpu
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAddSign",
+		Type: "FusedBatchNormGrad",
 		Input: []tf.Input{
-			var_, m, lr, alpha, sign_decay, beta, grad,
+			y_backprop, x, scale, reserve_space_1, reserve_space_2,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// UpperBoundAttr is an optional argument to UpperBound.
-type UpperBoundAttr func(optionalAttr)
+// TopKAttr is an optional argument to TopK.
+type TopKAttr func(optionalAttr)
 
-// UpperBoundOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_INT32
-func UpperBoundOutType(value tf.DataType) UpperBoundAttr {
+// TopKSorted sets the optional sorted attribute to value.
+//
+// value: If true the resulting `k` elements will be sorted by the values in
+// descending order.
+// If not specified, defaults to true
+func TopKSorted(value bool) TopKAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["sorted"] = value
 	}
 }
 
-// Applies upper_bound(sorted_search_values, values) along each row.
+// Finds values and indices of the `k` largest elements for the last dimension.
 //
-// Each set of rows with the same index in (sorted_inputs, values) is treated
-// independently.  The resulting row is the equivalent of calling
-// `np.searchsorted(sorted_inputs, values, side='right')`.
+// DEPRECATED at GraphDef version 7: Use TopKV2 instead
 //
-// The result is not a global index to the entire
-// `Tensor`, but rather just the index in the last dimension.
+// If the input is a vector (rank-1), finds the `k` largest entries in the vector
+// and outputs their values and indices as vectors.  Thus `values[j]` is the
+// `j`-th largest entry in `input`, and its index is `indices[j]`.
 //
-// A 2-D example:
-//   sorted_sequence = [[0, 3, 9, 9, 10],
-//                      [1, 2, 3, 4, 5]]
-//   values = [[2, 4, 9],
-//             [0, 2, 6]]
+// For matrices (resp. higher rank input), computes the top `k` entries in each
+// row (resp. vector along the last dimension).  Thus,
 //
-//   result = UpperBound(sorted_sequence, values)
+//     values.shape = indices.shape = input.shape[:-1] + [k]
 //
-//   result == [[1, 2, 4],
-//              [0, 2, 5]]
+// If two elements are equal, the lower-index element appears first.
+//
+// If `k` varies dynamically, use `TopKV2` below.
 //
 // Arguments:
-//	sorted_inputs: 2-D Tensor where each row is ordered.
-//	values: 2-D Tensor with the same numbers of rows as `sorted_search_values`. Contains
-// the values that will be searched for in `sorted_search_values`.
+//	input: 1-D or higher with last dimension at least `k`.
+//	k: Number of top elements to look for along the last dimension (along each
+// row for matrices).
 //
-// Returns A `Tensor` with the same shape as `values`.  It contains the last scalar index
-// into the last dimension where values can be inserted without changing the
-// ordered property.
-func UpperBound(scope *Scope, sorted_inputs tf.Output, values tf.Output, optional ...UpperBoundAttr) (output tf.Output) {
+// Returns The `k` largest elements along each last dimensional slice.The indices of `values` within the last dimension of `input`.
+func TopK(scope *Scope, input tf.Output, k int64, optional ...TopKAttr) (values tf.Output, indices tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"k": k}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "UpperBound",
+		Type: "TopK",
 		Input: []tf.Input{
-			sorted_inputs, values,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// FractionalMaxPoolGradAttr is an optional argument to FractionalMaxPoolGrad.
-type FractionalMaxPoolGradAttr func(optionalAttr)
-
-// FractionalMaxPoolGradOverlapping sets the optional overlapping attribute to value.
-//
-// value: When set to True, it means when pooling, the values at the boundary
-// of adjacent pooling cells are used by both cells. For example:
+// The gradient operator for the SparseAdd op.
 //
-// `index  0  1  2  3  4`
+// The SparseAdd op calculates A + B, where A, B, and the sum are all represented
+// as `SparseTensor` objects.  This op takes in the upstream gradient w.r.t.
+// non-empty values of the sum, and outputs the gradients w.r.t. the non-empty
+// values of A and B.
 //
-// `value  20 5  16 3  7`
+// Arguments:
+//	backprop_val_grad: 1-D with shape `[nnz(sum)]`.  The gradient with respect to
+// the non-empty values of the sum.
+//	a_indices: 2-D.  The `indices` of the `SparseTensor` A, size `[nnz(A), ndims]`.
+//	b_indices: 2-D.  The `indices` of the `SparseTensor` B, size `[nnz(B), ndims]`.
+//	sum_indices: 2-D.  The `indices` of the sum `SparseTensor`, size
+// `[nnz(sum), ndims]`.
 //
-// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
-// The result would be [20, 16] for fractional max pooling.
-// If not specified, defaults to false
-func FractionalMaxPoolGradOverlapping(value bool) FractionalMaxPoolGradAttr {
-	return func(m optionalAttr) {
-		m["overlapping"] = value
+// Returns 1-D with shape `[nnz(A)]`. The gradient with respect to the
+// non-empty values of A.1-D with shape `[nnz(B)]`. The gradient with respect to the
+// non-empty values of B.
+func SparseAddGrad(scope *Scope, backprop_val_grad tf.Output, a_indices tf.Output, b_indices tf.Output, sum_indices tf.Output) (a_val_grad tf.Output, b_val_grad tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseAddGrad",
+		Input: []tf.Input{
+			backprop_val_grad, a_indices, b_indices, sum_indices,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
 }
 
-// Computes gradient of the FractionalMaxPool function.
+// Returns a list of tensors with the same shapes and contents as the input
 //
-// Arguments:
-//	orig_input: Original input for `fractional_max_pool`
-//	orig_output: Original output for `fractional_max_pool`
-//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
-// w.r.t. the output of `fractional_max_pool`.
-//	row_pooling_sequence: row pooling sequence, form pooling region with
-// col_pooling_sequence.
-//	col_pooling_sequence: column pooling sequence, form pooling region with
-// row_pooling sequence.
+// tensors.
 //
-// Returns 4-D.  Gradients w.r.t. the input of `fractional_max_pool`.
-func FractionalMaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalMaxPoolGradAttr) (output tf.Output) {
+// This op can be used to override the gradient for complicated functions. For
+// example, suppose y = f(x) and we wish to apply a custom function g for backprop
+// such that dx = g(dy). In Python,
+//
+// ```python
+// with tf.get_default_graph().gradient_override_map(
+//     {'IdentityN': 'OverrideGradientWithG'}):
+//   y, _ = identity_n([f(x), x])
+//
+// @tf.RegisterGradient('OverrideGradientWithG')
+// def ApplyG(op, dy, _):
+//   return [None, g(dy)]  # Do not backprop to f(x).
+// ```
+func IdentityN(scope *Scope, input []tf.Output) (output []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "FractionalMaxPoolGrad",
+		Type: "IdentityN",
 		Input: []tf.Input{
-			orig_input, orig_output, out_backprop, row_pooling_sequence, col_pooling_sequence,
+			tf.OutputList(input),
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("IdentityN", err)
+		return
+	}
+	return output
 }
 
-// ResourceApplyAdagradDAAttr is an optional argument to ResourceApplyAdagradDA.
-type ResourceApplyAdagradDAAttr func(optionalAttr)
+// ResourceApplyCenteredRMSPropAttr is an optional argument to ResourceApplyCenteredRMSProp.
+type ResourceApplyCenteredRMSPropAttr func(optionalAttr)
 
-// ResourceApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
+// ResourceApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
 //
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// value: If `True`, updating of the var, mg, ms, and mom tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
 // If not specified, defaults to false
-func ResourceApplyAdagradDAUseLocking(value bool) ResourceApplyAdagradDAAttr {
+func ResourceApplyCenteredRMSPropUseLocking(value bool) ResourceApplyCenteredRMSPropAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Update '*var' according to the proximal adagrad scheme.
+// Update '*var' according to the centered RMSProp algorithm.
+//
+// The centered RMSProp algorithm uses an estimate of the centered second moment
+// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
+// uses the (uncentered) second moment. This often helps with training, but is
+// slightly more expensive in terms of computation and memory.
+//
+// Note that in dense implementation of this algorithm, mg, ms, and mom will
+// update even if the grad is zero, but in this sparse implementation, mg, ms,
+// and mom will not update in iterations during which the grad is zero.
+//
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// mean_grad = decay * mean_grad + (1-decay) * gradient
+//
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
+//
+// mg <- rho * mg_{t-1} + (1-rho) * grad
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon)
+// var <- var - mom
 //
 // Arguments:
 //	var_: Should be from a Variable().
-//	gradient_accumulator: Should be from a Variable().
-//	gradient_squared_accumulator: Should be from a Variable().
-//	grad: The gradient.
+//	mg: Should be from a Variable().
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
 //	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	global_step: Training step number. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
+//
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
 //
 // Returns the created operation.
-func ResourceApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceApplyAdagradDAAttr) (o *tf.Operation) {
+func ResourceApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyCenteredRMSPropAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -18977,50 +18640,63 @@ func ResourceApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator t
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdagradDA",
+		Type: "ResourceApplyCenteredRMSProp",
 		Input: []tf.Input{
-			var_, gradient_accumulator, gradient_squared_accumulator, grad, lr, l1, l2, global_step,
+			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// SparseReduceMaxSparseAttr is an optional argument to SparseReduceMaxSparse.
-type SparseReduceMaxSparseAttr func(optionalAttr)
+// ResourceSparseApplyCenteredRMSPropAttr is an optional argument to ResourceSparseApplyCenteredRMSProp.
+type ResourceSparseApplyCenteredRMSPropAttr func(optionalAttr)
 
-// SparseReduceMaxSparseKeepDims sets the optional keep_dims attribute to value.
+// ResourceSparseApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
 //
-// value: If true, retain reduced dimensions with length 1.
+// value: If `True`, updating of the var, mg, ms, and mom tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
 // If not specified, defaults to false
-func SparseReduceMaxSparseKeepDims(value bool) SparseReduceMaxSparseAttr {
+func ResourceSparseApplyCenteredRMSPropUseLocking(value bool) ResourceSparseApplyCenteredRMSPropAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Computes the max of elements across dimensions of a SparseTensor.
+// Update '*var' according to the centered RMSProp algorithm.
 //
-// This Op takes a SparseTensor and is the sparse counterpart to
-// `tf.reduce_max()`.  In contrast to SparseReduceMax, this Op returns a
-// SparseTensor.
+// The centered RMSProp algorithm uses an estimate of the centered second moment
+// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
+// uses the (uncentered) second moment. This often helps with training, but is
+// slightly more expensive in terms of computation and memory.
 //
-// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-// with length 1.
+// Note that in dense implementation of this algorithm, mg, ms, and mom will
+// update even if the grad is zero, but in this sparse implementation, mg, ms,
+// and mom will not update in iterations during which the grad is zero.
+//
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// mean_grad = decay * mean_grad + (1-decay) * gradient
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
+//
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+// var <- var - mom
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	mg: Should be from a Variable().
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
 //
-// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-// with a single element is returned.  Additionally, the axes can be negative,
-// which are interpreted according to the indexing rules in Python.
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var, ms and mom.
 //
-// Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
-//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
-func SparseReduceMaxSparse(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceMaxSparseAttr) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+// Returns the created operation.
+func ResourceSparseApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyCenteredRMSPropAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -19029,33 +18705,32 @@ func SparseReduceMaxSparse(scope *Scope, input_indices tf.Output, input_values t
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseReduceMaxSparse",
+		Type: "ResourceSparseApplyCenteredRMSProp",
 		Input: []tf.Input{
-			input_indices, input_values, input_shape, reduction_axes,
+			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad, indices,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return scope.AddOperation(opspec)
 }
 
-// Creates a dataset that emits the outputs of `input_dataset` `count` times.
+// Creates a dataset that batches `batch_size` elements from `input_dataset`.
 //
 // Arguments:
 //
-//	count: A scalar representing the number of times that `input_dataset` should
-// be repeated. A value of `-1` indicates that it should be repeated infinitely.
+//	batch_size: A scalar representing the number of elements to accumulate in a
+// batch.
 //
 //
-func RepeatDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+func BatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "RepeatDataset",
+		Type: "BatchDataset",
 		Input: []tf.Input{
-			input_dataset, count,
+			input_dataset, batch_size,
 		},
 		Attrs: attrs,
 	}
@@ -19063,65 +18738,61 @@ func RepeatDataset(scope *Scope, input_dataset tf.Output, count tf.Output, outpu
 	return op.Output(0)
 }
 
-// AddManySparseToTensorsMapAttr is an optional argument to AddManySparseToTensorsMap.
-type AddManySparseToTensorsMapAttr func(optionalAttr)
+// RandomPoissonV2Attr is an optional argument to RandomPoissonV2.
+type RandomPoissonV2Attr func(optionalAttr)
 
-// AddManySparseToTensorsMapContainer sets the optional container attribute to value.
+// RandomPoissonV2Seed sets the optional seed attribute to value.
 //
-// value: The container name for the `SparseTensorsMap` created by this op.
-// If not specified, defaults to ""
-func AddManySparseToTensorsMapContainer(value string) AddManySparseToTensorsMapAttr {
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomPoissonV2Seed(value int64) RandomPoissonV2Attr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["seed"] = value
 	}
 }
 
-// AddManySparseToTensorsMapSharedName sets the optional shared_name attribute to value.
+// RandomPoissonV2Seed2 sets the optional seed2 attribute to value.
 //
-// value: The shared name for the `SparseTensorsMap` created by this op.
-// If blank, the new Operation's unique name is used.
-// If not specified, defaults to ""
-func AddManySparseToTensorsMapSharedName(value string) AddManySparseToTensorsMapAttr {
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomPoissonV2Seed2(value int64) RandomPoissonV2Attr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["seed2"] = value
 	}
 }
 
-// Add an `N`-minibatch `SparseTensor` to a `SparseTensorsMap`, return `N` handles.
-//
-// A `SparseTensor` of rank `R` is represented by three tensors: `sparse_indices`,
-// `sparse_values`, and `sparse_shape`, where
-//
-// ```sparse_indices.shape[1] == sparse_shape.shape[0] == R```
-//
-// An `N`-minibatch of `SparseTensor` objects is represented as a `SparseTensor`
-// having a first `sparse_indices` column taking values between `[0, N)`, where
-// the minibatch size `N == sparse_shape[0]`.
+// RandomPoissonV2Dtype sets the optional dtype attribute to value.
+// If not specified, defaults to DT_INT64
+func RandomPoissonV2Dtype(value tf.DataType) RandomPoissonV2Attr {
+	return func(m optionalAttr) {
+		m["dtype"] = value
+	}
+}
+
+// Outputs random values from the Poisson distribution(s) described by rate.
 //
-// The input `SparseTensor` must have rank `R` greater than 1, and the first
-// dimension is treated as the minibatch dimension.  Elements of the `SparseTensor`
-// must be sorted in increasing order of this first dimension.  The stored
-// `SparseTensor` objects pointed to by each row of the output `sparse_handles`
-// will have rank `R-1`.
+// This op uses two algorithms, depending on rate. If rate >= 10, then
+// the algorithm by Hormann is used to acquire samples via
+// transformation-rejection.
+// See http://www.sciencedirect.com/science/article/pii/0167668793909974.
 //
-// The `SparseTensor` values can then be read out as part of a minibatch by passing
-// the given keys as vector elements to `TakeManySparseFromTensorsMap`.  To ensure
-// the correct `SparseTensorsMap` is accessed, ensure that the same
-// `container` and `shared_name` are passed to that Op.  If no `shared_name`
-// is provided here, instead use the *name* of the Operation created by calling
-// `AddManySparseToTensorsMap` as the `shared_name` passed to
-// `TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
+// Otherwise, Knuth's algorithm is used to acquire samples via multiplying uniform
+// random variables.
+// See Donald E. Knuth (1969). Seminumerical Algorithms. The Art of Computer
+// Programming, Volume 2. Addison Wesley
 //
 // Arguments:
-//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
-// `sparse_indices[:, 0]` must be ordered values in `[0, N)`.
-//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
-// The minibatch size `N == sparse_shape[0]`.
+//	shape: 1-D integer tensor. Shape of independent samples to draw from each
+// distribution described by the shape parameters given in rate.
+//	rate: A tensor in which each scalar is a "rate" parameter describing the
+// associated poisson distribution.
 //
-// Returns 1-D.  The handles of the `SparseTensor` now stored in the
-// `SparseTensorsMap`.  Shape: `[N]`.
-func AddManySparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddManySparseToTensorsMapAttr) (sparse_handles tf.Output) {
+// Returns A tensor with shape `shape + shape(rate)`. Each slice
+// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
+// `rate[i0, i1, ...iN]`.
+func RandomPoissonV2(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -19130,9 +18801,9 @@ func AddManySparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_va
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AddManySparseToTensorsMap",
+		Type: "RandomPoissonV2",
 		Input: []tf.Input{
-			sparse_indices, sparse_values, sparse_shape,
+			shape, rate,
 		},
 		Attrs: attrs,
 	}
@@ -19140,201 +18811,181 @@ func AddManySparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_va
 	return op.Output(0)
 }
 
-// Concatenates tensors along one dimension.
-//
-// Arguments:
-//	values: List of `N` Tensors to concatenate. Their ranks and types must match,
-// and their sizes must match in all dimensions except `concat_dim`.
-//	axis: 0-D.  The dimension along which to concatenate.  Must be in the
-// range [-rank(values), rank(values)).
+// DecodeAndCropJpegAttr is an optional argument to DecodeAndCropJpeg.
+type DecodeAndCropJpegAttr func(optionalAttr)
+
+// DecodeAndCropJpegChannels sets the optional channels attribute to value.
 //
-// Returns A `Tensor` with the concatenation of values stacked along the
-// `concat_dim` dimension.  This tensor's shape matches that of `values` except
-// in `concat_dim` where it has the sum of the sizes.
-func ConcatV2(scope *Scope, values []tf.Output, axis tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ConcatV2",
-		Input: []tf.Input{
-			tf.OutputList(values), axis,
-		},
+// value: Number of color channels for the decoded image.
+// If not specified, defaults to 0
+func DecodeAndCropJpegChannels(value int64) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["channels"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Reads and outputs the entire contents of the input filename.
-func ReadFile(scope *Scope, filename tf.Output) (contents tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ReadFile",
-		Input: []tf.Input{
-			filename,
-		},
+// DecodeAndCropJpegRatio sets the optional ratio attribute to value.
+//
+// value: Downscaling ratio.
+// If not specified, defaults to 1
+func DecodeAndCropJpegRatio(value int64) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["ratio"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Multiplies sparse updates into the variable referenced by `resource`.
+// DecodeAndCropJpegFancyUpscaling sets the optional fancy_upscaling attribute to value.
 //
-// This operation computes
+// value: If true use a slower but nicer upscaling of the
+// chroma planes (yuv420/422 only).
+// If not specified, defaults to true
+func DecodeAndCropJpegFancyUpscaling(value bool) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["fancy_upscaling"] = value
+	}
+}
+
+// DecodeAndCropJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value.
 //
-//     # Scalar indices
-//     ref[indices, ...] *= updates[...]
+// value: If true try to recover an image from truncated input.
+// If not specified, defaults to false
+func DecodeAndCropJpegTryRecoverTruncated(value bool) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["try_recover_truncated"] = value
+	}
+}
+
+// DecodeAndCropJpegAcceptableFraction sets the optional acceptable_fraction attribute to value.
 //
-//     # Vector indices (for each i)
-//     ref[indices[i], ...] *= updates[i, ...]
+// value: The minimum required fraction of lines before a truncated
+// input is accepted.
+// If not specified, defaults to 1
+func DecodeAndCropJpegAcceptableFraction(value float32) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["acceptable_fraction"] = value
+	}
+}
+
+// DecodeAndCropJpegDctMethod sets the optional dct_method attribute to value.
 //
-//     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] *= updates[i, ..., j, ...]
+// value: string specifying a hint about the algorithm used for
+// decompression.  Defaults to "" which maps to a system-specific
+// default.  Currently valid values are ["INTEGER_FAST",
+// "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
+// jpeg library changes to a version that does not have that specific
+// option.)
+// If not specified, defaults to ""
+func DecodeAndCropJpegDctMethod(value string) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["dct_method"] = value
+	}
+}
+
+// Decode and Crop a JPEG-encoded image to a uint8 tensor.
 //
-// Duplicate entries are handled correctly: if multiple `indices` reference
-// the same location, their contributions multiply.
+// The attr `channels` indicates the desired number of color channels for the
+// decoded image.
 //
-// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+// Accepted values are:
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
-// </div>
+// *   0: Use the number of channels in the JPEG-encoded image.
+// *   1: output a grayscale image.
+// *   3: output an RGB image.
 //
-// Arguments:
-//	resource: Should be from a `Variable` node.
-//	indices: A tensor of indices into the first dimension of `ref`.
-//	updates: A tensor of updated values to add to `ref`.
+// If needed, the JPEG-encoded image is transformed to match the requested number
+// of color channels.
 //
-// Returns the created operation.
-func ResourceScatterMul(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceScatterMul",
-		Input: []tf.Input{
-			resource, indices, updates,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Component-wise divides a SparseTensor by a dense Tensor.
+// The attr `ratio` allows downscaling the image by an integer factor during
+// decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
+// downscaling the image later.
 //
-// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
-// the other direction.
+//
+// It is equivalent to a combination of decode and crop, but much faster by only
+// decoding partial jpeg image.
 //
 // Arguments:
-//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
-//	dense: `R`-D.  The dense Tensor operand.
+//	contents: 0-D.  The JPEG-encoded image.
+//	crop_window: 1-D.  The crop window: [crop_y, crop_x, crop_height, crop_width].
 //
-// Returns 1-D.  The `N` values that are operated on.
-func SparseDenseCwiseDiv(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
+// Returns 3-D with shape `[height, width, channels]`..
+func DecodeAndCropJpeg(scope *Scope, contents tf.Output, crop_window tf.Output, optional ...DecodeAndCropJpegAttr) (image tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseDenseCwiseDiv",
+		Type: "DecodeAndCropJpeg",
 		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape, dense,
+			contents, crop_window,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// FractionalAvgPoolGradAttr is an optional argument to FractionalAvgPoolGrad.
-type FractionalAvgPoolGradAttr func(optionalAttr)
-
-// FractionalAvgPoolGradOverlapping sets the optional overlapping attribute to value.
-//
-// value: When set to True, it means when pooling, the values at the boundary
-// of adjacent pooling cells are used by both cells. For example:
-//
-// `index  0  1  2  3  4`
+// Adds two `SparseTensor` objects to produce another `SparseTensor`.
 //
-// `value  20 5  16 3  7`
+// The input `SparseTensor` objects' indices are assumed ordered in standard
+// lexicographic order.  If this is not the case, before this step run
+// `SparseReorder` to restore index ordering.
 //
-// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
-// The result would be [41/3, 26/3] for fractional avg pooling.
-// If not specified, defaults to false
-func FractionalAvgPoolGradOverlapping(value bool) FractionalAvgPoolGradAttr {
-	return func(m optionalAttr) {
-		m["overlapping"] = value
-	}
-}
-
-// Computes gradient of the FractionalAvgPool function.
+// By default, if two values sum to zero at some index, the output `SparseTensor`
+// would still include that particular location in its index, storing a zero in the
+// corresponding value slot.  To override this, callers can specify `thresh`,
+// indicating that if the sum has a magnitude strictly smaller than `thresh`, its
+// corresponding value and index would then not be included.  In particular,
+// `thresh == 0` (default) means everything is kept and actual thresholding happens
+// only for a positive value.
 //
-// Unlike FractionalMaxPoolGrad, we don't need to find arg_max for
-// FractionalAvgPoolGrad, we just need to evenly back-propagate each element of
-// out_backprop to those indices that form the same pooling cell. Therefore, we
-// just need to know the shape of original input tensor, instead of the whole
-// tensor.
+// In the following shapes, `nnz` is the count after taking `thresh` into account.
 //
 // Arguments:
-//	orig_input_tensor_shape: Original input tensor shape for `fractional_avg_pool`
-//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
-// w.r.t. the output of `fractional_avg_pool`.
-//	row_pooling_sequence: row pooling sequence, form pooling region with
-// col_pooling_sequence.
-//	col_pooling_sequence: column pooling sequence, form pooling region with
-// row_pooling sequence.
-//
-// Returns 4-D.  Gradients w.r.t. the input of `fractional_avg_pool`.
-func FractionalAvgPoolGrad(scope *Scope, orig_input_tensor_shape tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalAvgPoolGradAttr) (output tf.Output) {
+//	a_indices: 2-D.  The `indices` of the first `SparseTensor`, size `[nnz, ndims]` Matrix.
+//	a_values: 1-D.  The `values` of the first `SparseTensor`, size `[nnz]` Vector.
+//	a_shape: 1-D.  The `shape` of the first `SparseTensor`, size `[ndims]` Vector.
+//	b_indices: 2-D.  The `indices` of the second `SparseTensor`, size `[nnz, ndims]` Matrix.
+//	b_values: 1-D.  The `values` of the second `SparseTensor`, size `[nnz]` Vector.
+//	b_shape: 1-D.  The `shape` of the second `SparseTensor`, size `[ndims]` Vector.
+//	thresh: 0-D.  The magnitude threshold that determines if an output value/index
+// pair takes space.
+func SparseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output, thresh tf.Output) (sum_indices tf.Output, sum_values tf.Output, sum_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "FractionalAvgPoolGrad",
+		Type: "SparseAdd",
 		Input: []tf.Input{
-			orig_input_tensor_shape, out_backprop, row_pooling_sequence, col_pooling_sequence,
+			a_indices, a_values, a_shape, b_indices, b_values, b_shape, thresh,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// SerializeManySparseAttr is an optional argument to SerializeManySparse.
-type SerializeManySparseAttr func(optionalAttr)
+// QuantizedRelu6Attr is an optional argument to QuantizedRelu6.
+type QuantizedRelu6Attr func(optionalAttr)
 
-// SerializeManySparseOutType sets the optional out_type attribute to value.
-//
-// value: The `dtype` to use for serialization; the supported types are `string`
-// (default) and `variant`.
-// If not specified, defaults to DT_STRING
-func SerializeManySparseOutType(value tf.DataType) SerializeManySparseAttr {
+// QuantizedRelu6OutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QUINT8
+func QuantizedRelu6OutType(value tf.DataType) QuantizedRelu6Attr {
 	return func(m optionalAttr) {
 		m["out_type"] = value
 	}
 }
 
-// Serialize an `N`-minibatch `SparseTensor` into an `[N, 3]` `Tensor` object.
+// Computes Quantized Rectified Linear 6: `min(max(features, 0), 6)`
 //
-// The `SparseTensor` must have rank `R` greater than 1, and the first dimension
-// is treated as the minibatch dimension.  Elements of the `SparseTensor`
-// must be sorted in increasing order of this first dimension.  The serialized
-// `SparseTensor` objects going into each row of `serialized_sparse` will have
-// rank `R-1`.
+// Arguments:
 //
-// The minibatch size `N` is extracted from `sparse_shape[0]`.
+//	min_features: The float value that the lowest quantized value represents.
+//	max_features: The float value that the highest quantized value represents.
 //
-// Arguments:
-//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
-//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
-func SerializeManySparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...SerializeManySparseAttr) (serialized_sparse tf.Output) {
+// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
+func QuantizedRelu6(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedRelu6Attr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -19343,81 +18994,155 @@ func SerializeManySparse(scope *Scope, sparse_indices tf.Output, sparse_values t
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SerializeManySparse",
+		Type: "QuantizedRelu6",
 		Input: []tf.Input{
-			sparse_indices, sparse_values, sparse_shape,
+			features, min_features, max_features,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes inverse hyperbolic cosine of x element-wise.
-func Acosh(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Acosh",
-		Input: []tf.Input{
-			x,
-		},
+// FixedLengthRecordReaderV2Attr is an optional argument to FixedLengthRecordReaderV2.
+type FixedLengthRecordReaderV2Attr func(optionalAttr)
+
+// FixedLengthRecordReaderV2HeaderBytes sets the optional header_bytes attribute to value.
+//
+// value: Number of bytes in the header, defaults to 0.
+// If not specified, defaults to 0
+func FixedLengthRecordReaderV2HeaderBytes(value int64) FixedLengthRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["header_bytes"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// TensorArrayV2Attr is an optional argument to TensorArrayV2.
-type TensorArrayV2Attr func(optionalAttr)
+// FixedLengthRecordReaderV2FooterBytes sets the optional footer_bytes attribute to value.
+//
+// value: Number of bytes in the footer, defaults to 0.
+// If not specified, defaults to 0
+func FixedLengthRecordReaderV2FooterBytes(value int64) FixedLengthRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["footer_bytes"] = value
+	}
+}
 
-// TensorArrayV2ElementShape sets the optional element_shape attribute to value.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayV2ElementShape(value tf.Shape) TensorArrayV2Attr {
+// FixedLengthRecordReaderV2HopBytes sets the optional hop_bytes attribute to value.
+//
+// value: Number of bytes to hop before each read. Default of 0 means using
+// record_bytes.
+// If not specified, defaults to 0
+func FixedLengthRecordReaderV2HopBytes(value int64) FixedLengthRecordReaderV2Attr {
 	return func(m optionalAttr) {
-		m["element_shape"] = value
+		m["hop_bytes"] = value
 	}
 }
 
-// TensorArrayV2DynamicSize sets the optional dynamic_size attribute to value.
-// If not specified, defaults to false
-func TensorArrayV2DynamicSize(value bool) TensorArrayV2Attr {
+// FixedLengthRecordReaderV2Container sets the optional container attribute to value.
+//
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func FixedLengthRecordReaderV2Container(value string) FixedLengthRecordReaderV2Attr {
 	return func(m optionalAttr) {
-		m["dynamic_size"] = value
+		m["container"] = value
 	}
 }
 
-// TensorArrayV2ClearAfterRead sets the optional clear_after_read attribute to value.
-// If not specified, defaults to true
-func TensorArrayV2ClearAfterRead(value bool) TensorArrayV2Attr {
+// FixedLengthRecordReaderV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func FixedLengthRecordReaderV2SharedName(value string) FixedLengthRecordReaderV2Attr {
 	return func(m optionalAttr) {
-		m["clear_after_read"] = value
+		m["shared_name"] = value
 	}
 }
 
-// TensorArrayV2TensorArrayName sets the optional tensor_array_name attribute to value.
+// FixedLengthRecordReaderV2Encoding sets the optional encoding attribute to value.
+//
+// value: The type of encoding for the file. Currently ZLIB and GZIP
+// are supported. Defaults to none.
 // If not specified, defaults to ""
-func TensorArrayV2TensorArrayName(value string) TensorArrayV2Attr {
+func FixedLengthRecordReaderV2Encoding(value string) FixedLengthRecordReaderV2Attr {
 	return func(m optionalAttr) {
-		m["tensor_array_name"] = value
+		m["encoding"] = value
 	}
 }
 
-// Deprecated. Use TensorArrayV3
+// A Reader that outputs fixed-length records from a file.
+//
+// Arguments:
+//	record_bytes: Number of bytes in the record.
+//
+// Returns The handle to reference the Reader.
+func FixedLengthRecordReaderV2(scope *Scope, record_bytes int64, optional ...FixedLengthRecordReaderV2Attr) (reader_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"record_bytes": record_bytes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "FixedLengthRecordReaderV2",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// AudioSummaryAttr is an optional argument to AudioSummary.
+type AudioSummaryAttr func(optionalAttr)
+
+// AudioSummaryMaxOutputs sets the optional max_outputs attribute to value.
+//
+// value: Max number of batch elements to generate audio for.
+// If not specified, defaults to 3
+//
+// REQUIRES: value >= 1
+func AudioSummaryMaxOutputs(value int64) AudioSummaryAttr {
+	return func(m optionalAttr) {
+		m["max_outputs"] = value
+	}
+}
+
+// Outputs a `Summary` protocol buffer with audio.
+//
+// DEPRECATED at GraphDef version 15: Use AudioSummaryV2.
+//
+// The summary has up to `max_outputs` summary values containing audio. The
+// audio is built from `tensor` which must be 3-D with shape `[batch_size,
+// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
+// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
+//
+// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
+// build the `tag` of the summary values:
 //
-// DEPRECATED at GraphDef version 26: Use TensorArrayV3
-func TensorArrayV2(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV2Attr) (handle tf.Output) {
+// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
+// *  If `max_outputs` is greater than 1, the summary value tags are
+//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
+//
+// Arguments:
+//	tag: Scalar. Used to build the `tag` attribute of the summary values.
+//	tensor: 2-D of shape `[batch_size, frames]`.
+//	sample_rate: The sample rate of the signal in hertz.
+//
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func AudioSummary(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate float32, optional ...AudioSummaryAttr) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{"sample_rate": sample_rate}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayV2",
+		Type: "AudioSummary",
 		Input: []tf.Input{
-			size,
+			tag, tensor,
 		},
 		Attrs: attrs,
 	}
@@ -19425,118 +19150,83 @@ func TensorArrayV2(scope *Scope, size tf.Output, dtype tf.DataType, optional ...
 	return op.Output(0)
 }
 
-// ThreadUnsafeUnigramCandidateSamplerAttr is an optional argument to ThreadUnsafeUnigramCandidateSampler.
-type ThreadUnsafeUnigramCandidateSamplerAttr func(optionalAttr)
-
-// ThreadUnsafeUnigramCandidateSamplerSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func ThreadUnsafeUnigramCandidateSamplerSeed(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
+// QrAttr is an optional argument to Qr.
+type QrAttr func(optionalAttr)
 
-// ThreadUnsafeUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+// QrFullMatrices sets the optional full_matrices attribute to value.
 //
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func ThreadUnsafeUnigramCandidateSamplerSeed2(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
+// value: If true, compute full-sized `q` and `r`. If false
+// (the default), compute only the leading `P` columns of `q`.
+// If not specified, defaults to false
+func QrFullMatrices(value bool) QrAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["full_matrices"] = value
 	}
 }
 
-// Generates labels for candidate sampling with a learned unigram distribution.
-//
-// See explanations of candidate sampling and the data formats at
-// go/candidate-sampling.
+// Computes the QR decompositions of one or more matrices.
 //
-// For each batch, this op picks a single set of sampled candidate labels.
+// Computes the QR decomposition of each inner matrix in `tensor` such that
+// `tensor[..., :, :] = q[..., :, :] * r[..., :,:])`
 //
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
+// ```python
+// # a is a tensor.
+// # q is a tensor of orthonormal matrices.
+// # r is a tensor of upper triangular matrices.
+// q, r = qr(a)
+// q_full, r_full = qr(a, full_matrices=True)
+// ```
 //
 // Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to randomly sample.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
-//	range_max: The sampler will sample integers from the interval [0, range_max).
+//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
+// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
 //
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func ThreadUnsafeUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...ThreadUnsafeUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+// Returns Orthonormal basis for range of `a`. If `full_matrices` is `False` then
+// shape is `[..., M, P]`; if `full_matrices` is `True` then shape is
+// `[..., M, M]`.Triangular factor. If `full_matrices` is `False` then shape is
+// `[..., P, N]`. If `full_matrices` is `True` then shape is `[..., M, N]`.
+func Qr(scope *Scope, input tf.Output, optional ...QrAttr) (q tf.Output, r tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ThreadUnsafeUnigramCandidateSampler",
+		Type: "Qr",
 		Input: []tf.Input{
-			true_classes,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0), op.Output(1)
 }
 
-// MaxPoolV2Attr is an optional argument to MaxPoolV2.
-type MaxPoolV2Attr func(optionalAttr)
-
-// MaxPoolV2DataFormat sets the optional data_format attribute to value.
+// Check if the input matches the regex pattern.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolV2DataFormat(value string) MaxPoolV2Attr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Performs max pooling on the input.
+// The input is a string tensor of any shape. The pattern is the
+// regular expression to be matched with every element of the input tensor.
+// The boolean values (True or False) of the output tensor indicate
+// if the input matches the regex pattern provided.
+//
+// The pattern follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)
 //
 // Arguments:
-//	input: 4-D input to pool over.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+//	input: A string tensor of the text to be processed.
+//	pattern: The regular expression to match the input.
 //
-// Returns The max pooled output tensor.
-func MaxPoolV2(scope *Scope, input tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolV2Attr) (output tf.Output) {
+// Returns A bool tensor with the same shape as `input`.
+func StaticRegexFullMatch(scope *Scope, input tf.Output, pattern string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"pattern": pattern}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolV2",
+		Type: "StaticRegexFullMatch",
 		Input: []tf.Input{
-			input, ksize, strides,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -19544,702 +19234,783 @@ func MaxPoolV2(scope *Scope, input tf.Output, ksize tf.Output, strides tf.Output
 	return op.Output(0)
 }
 
-// MutableDenseHashTableV2Attr is an optional argument to MutableDenseHashTableV2.
-type MutableDenseHashTableV2Attr func(optionalAttr)
+// ResourceSparseApplyProximalGradientDescentAttr is an optional argument to ResourceSparseApplyProximalGradientDescent.
+type ResourceSparseApplyProximalGradientDescentAttr func(optionalAttr)
 
-// MutableDenseHashTableV2Container sets the optional container attribute to value.
+// ResourceSparseApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value.
 //
-// value: If non-empty, this table is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func MutableDenseHashTableV2Container(value string) MutableDenseHashTableV2Attr {
+// value: If True, the subtraction will be protected by a lock;
+// otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceSparseApplyProximalGradientDescentUseLocking(value bool) ResourceSparseApplyProximalGradientDescentAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["use_locking"] = value
 	}
 }
 
-// MutableDenseHashTableV2SharedName sets the optional shared_name attribute to value.
+// Sparse update '*var' as FOBOS algorithm with fixed learning rate.
 //
-// value: If non-empty, this table is shared under the given name across
-// multiple sessions.
-// If not specified, defaults to ""
-func MutableDenseHashTableV2SharedName(value string) MutableDenseHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
+// That is for rows we have grad for, we update var as follows:
+// prox_v = var - alpha * grad
+// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	alpha: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//
+// Returns the created operation.
+func ResourceSparseApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalGradientDescentAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// MutableDenseHashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
-// If not specified, defaults to false
-func MutableDenseHashTableV2UseNodeNameSharing(value bool) MutableDenseHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["use_node_name_sharing"] = value
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceSparseApplyProximalGradientDescent",
+		Input: []tf.Input{
+			var_, alpha, l1, l2, grad, indices,
+		},
+		Attrs: attrs,
 	}
+	return scope.AddOperation(opspec)
 }
 
-// MutableDenseHashTableV2ValueShape sets the optional value_shape attribute to value.
+// Real-valued fast Fourier transform.
 //
-// value: The shape of each value.
-// If not specified, defaults to <>
-func MutableDenseHashTableV2ValueShape(value tf.Shape) MutableDenseHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["value_shape"] = value
+// Computes the 1-dimensional discrete Fourier transform of a real-valued signal
+// over the inner-most dimension of `input`.
+//
+// Since the DFT of a real signal is Hermitian-symmetric, `RFFT` only returns the
+// `fft_length / 2 + 1` unique components of the FFT: the zero-frequency term,
+// followed by the `fft_length / 2` positive-frequency terms.
+//
+// Along the axis `RFFT` is computed on, if `fft_length` is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
+//
+// Arguments:
+//	input: A float32 tensor.
+//	fft_length: An int32 tensor of shape [1]. The FFT length.
+//
+// Returns A complex64 tensor of the same rank as `input`. The inner-most
+//   dimension of `input` is replaced with the `fft_length / 2 + 1` unique
+//   frequency components of its 1D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.rfft
+// @end_compatibility
+func RFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "RFFT",
+		Input: []tf.Input{
+			input, fft_length,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MutableDenseHashTableV2InitialNumBuckets sets the optional initial_num_buckets attribute to value.
+// Adds a value to the current value of a variable.
 //
-// value: The initial number of hash table buckets. Must be a power
-// to 2.
-// If not specified, defaults to 131072
-func MutableDenseHashTableV2InitialNumBuckets(value int64) MutableDenseHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["initial_num_buckets"] = value
+// Any ReadVariableOp with a control dependency on this op is guaranteed to
+// see the incremented value or a subsequent newer one.
+//
+// Arguments:
+//	resource: handle to the resource in which to store the variable.
+//	value: the value by which the variable will be incremented.
+//
+// Returns the created operation.
+func AssignAddVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "AssignAddVariableOp",
+		Input: []tf.Input{
+			resource, value,
+		},
 	}
+	return scope.AddOperation(opspec)
 }
 
-// MutableDenseHashTableV2MaxLoadFactor sets the optional max_load_factor attribute to value.
-//
-// value: The maximum ratio between number of entries and number of
-// buckets before growing the table. Must be between 0 and 1.
-// If not specified, defaults to 0.8
-func MutableDenseHashTableV2MaxLoadFactor(value float32) MutableDenseHashTableV2Attr {
+// QuantizedReluAttr is an optional argument to QuantizedRelu.
+type QuantizedReluAttr func(optionalAttr)
+
+// QuantizedReluOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QUINT8
+func QuantizedReluOutType(value tf.DataType) QuantizedReluAttr {
 	return func(m optionalAttr) {
-		m["max_load_factor"] = value
+		m["out_type"] = value
 	}
 }
 
-// Creates an empty hash table that uses tensors as the backing store.
-//
-// It uses "open addressing" with quadratic reprobing to resolve
-// collisions.
-//
-// This op creates a mutable hash table, specifying the type of its keys and
-// values. Each value must be a scalar. Data can be inserted into the table using
-// the insert operations. It does not support the initialization operation.
+// Computes Quantized Rectified Linear: `max(features, 0)`
 //
 // Arguments:
-//	empty_key: The key used to represent empty key buckets internally. Must not
-// be used in insert or lookup operations.
 //
-//	value_dtype: Type of the table values.
+//	min_features: The float value that the lowest quantized value represents.
+//	max_features: The float value that the highest quantized value represents.
 //
-// Returns Handle to a table.
-func MutableDenseHashTableV2(scope *Scope, empty_key tf.Output, deleted_key tf.Output, value_dtype tf.DataType, optional ...MutableDenseHashTableV2Attr) (table_handle tf.Output) {
+// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
+func QuantizedRelu(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"value_dtype": value_dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MutableDenseHashTableV2",
+		Type: "QuantizedRelu",
 		Input: []tf.Input{
-			empty_key, deleted_key,
+			features, min_features, max_features,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes the gradient for the inverse of `x` wrt its input.
+// Reshapes a SparseTensor to represent values in a new dense shape.
 //
-// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
-// is the corresponding input gradient.
-func ReciprocalGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+// This operation has the same semantics as reshape on the represented dense
+// tensor.  The `input_indices` are recomputed based on the requested `new_shape`.
+//
+// If one component of `new_shape` is the special value -1, the size of that
+// dimension is computed so that the total dense size remains constant.  At
+// most one component of `new_shape` can be -1.  The number of dense elements
+// implied by `new_shape` must be the same as the number of dense elements
+// originally implied by `input_shape`.
+//
+// Reshaping does not affect the order of values in the SparseTensor.
+//
+// If the input tensor has rank `R_in` and `N` non-empty values, and `new_shape`
+// has length `R_out`, then `input_indices` has shape `[N, R_in]`,
+// `input_shape` has length `R_in`, `output_indices` has shape `[N, R_out]`, and
+// `output_shape` has length `R_out`.
+//
+// Arguments:
+//	input_indices: 2-D.  `N x R_in` matrix with the indices of non-empty values in a
+// SparseTensor.
+//	input_shape: 1-D.  `R_in` vector with the input SparseTensor's dense shape.
+//	new_shape: 1-D.  `R_out` vector with the requested new dense shape.
+//
+// Returns 2-D.  `N x R_out` matrix with the updated indices of non-empty
+// values in the output SparseTensor.1-D.  `R_out` vector with the full dense shape of the output
+// SparseTensor.  This is the same as `new_shape` but with any -1 dimensions
+// filled in.
+func SparseReshape(scope *Scope, input_indices tf.Output, input_shape tf.Output, new_shape tf.Output) (output_indices tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReciprocalGrad",
+		Type: "SparseReshape",
 		Input: []tf.Input{
-			y, dy,
+			input_indices, input_shape, new_shape,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Returns the min of x and y (i.e. x < y ? x : y) element-wise.
+// Deprecated. Use TensorArraySplitV3
 //
-// *NOTE*: `Minimum` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Minimum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// DEPRECATED at GraphDef version 26: Use TensorArraySplitV3
+func TensorArraySplitV2(scope *Scope, handle tf.Output, value tf.Output, lengths tf.Output, flow_in tf.Output) (flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Minimum",
+		Type: "TensorArraySplitV2",
 		Input: []tf.Input{
-			x, y,
+			handle, value, lengths, flow_in,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MfccAttr is an optional argument to Mfcc.
-type MfccAttr func(optionalAttr)
-
-// MfccUpperFrequencyLimit sets the optional upper_frequency_limit attribute to value.
-//
-// value: The highest frequency to use when calculating the
-// ceptstrum.
-// If not specified, defaults to 4000
-func MfccUpperFrequencyLimit(value float32) MfccAttr {
-	return func(m optionalAttr) {
-		m["upper_frequency_limit"] = value
-	}
-}
-
-// MfccLowerFrequencyLimit sets the optional lower_frequency_limit attribute to value.
-//
-// value: The lowest frequency to use when calculating the
-// ceptstrum.
-// If not specified, defaults to 20
-func MfccLowerFrequencyLimit(value float32) MfccAttr {
-	return func(m optionalAttr) {
-		m["lower_frequency_limit"] = value
-	}
-}
-
-// MfccFilterbankChannelCount sets the optional filterbank_channel_count attribute to value.
+// Reorders a SparseTensor into the canonical, row-major ordering.
 //
-// value: Resolution of the Mel bank used internally.
-// If not specified, defaults to 40
-func MfccFilterbankChannelCount(value int64) MfccAttr {
-	return func(m optionalAttr) {
-		m["filterbank_channel_count"] = value
-	}
-}
-
-// MfccDctCoefficientCount sets the optional dct_coefficient_count attribute to value.
+// Note that by convention, all sparse ops preserve the canonical ordering along
+// increasing dimension number. The only time ordering can be violated is during
+// manual manipulation of the indices and values vectors to add entries.
 //
-// value: How many output channels to produce per time slice.
-// If not specified, defaults to 13
-func MfccDctCoefficientCount(value int64) MfccAttr {
-	return func(m optionalAttr) {
-		m["dct_coefficient_count"] = value
-	}
-}
-
-// Transforms a spectrogram into a form that's useful for speech recognition.
+// Reordering does not affect the shape of the SparseTensor.
 //
-// Mel Frequency Cepstral Coefficients are a way of representing audio data that's
-// been effective as an input feature for machine learning. They are created by
-// taking the spectrum of a spectrogram (a 'cepstrum'), and discarding some of the
-// higher frequencies that are less significant to the human ear. They have a long
-// history in the speech recognition world, and https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
-// is a good resource to learn more.
+// If the tensor has rank `R` and `N` non-empty values, `input_indices` has
+// shape `[N, R]`, input_values has length `N`, and input_shape has length `R`.
 //
 // Arguments:
-//	spectrogram: Typically produced by the Spectrogram op, with magnitude_squared
-// set to true.
-//	sample_rate: How many samples per second the source audio used.
-func Mfcc(scope *Scope, spectrogram tf.Output, sample_rate tf.Output, optional ...MfccAttr) (output tf.Output) {
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//
+// Returns 2-D.  `N x R` matrix with the same indices as input_indices, but
+// in canonical row-major ordering.1-D.  `N` non-empty values corresponding to `output_indices`.
+func SparseReorder(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Mfcc",
+		Type: "SparseReorder",
 		Input: []tf.Input{
-			spectrogram, sample_rate,
+			input_indices, input_values, input_shape,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Compute the Hurwitz zeta function \\(\zeta(x, q)\\).
-//
-// The Hurwitz zeta function is defined as:
-//
-//
-// \\(\zeta(x, q) = \sum_{n=0}^{\infty} (q + n)^{-x}\\)
-func Zeta(scope *Scope, x tf.Output, q tf.Output) (z tf.Output) {
+// Computes rectified linear: `max(features, 0)`.
+func Relu(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Zeta",
+		Type: "Relu",
 		Input: []tf.Input{
-			x, q,
+			features,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Inverse fast Fourier transform.
-//
-// Computes the inverse 1-dimensional discrete Fourier transform over the
-// inner-most dimension of `input`.
-//
-// Arguments:
-//	input: A complex64 tensor.
-//
-// Returns A complex64 tensor of the same shape as `input`. The inner-most
-//   dimension of `input` is replaced with its inverse 1D Fourier transform.
+// ResourceApplyAddSignAttr is an optional argument to ResourceApplyAddSign.
+type ResourceApplyAddSignAttr func(optionalAttr)
+
+// ResourceApplyAddSignUseLocking sets the optional use_locking attribute to value.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.ifft
-// @end_compatibility
-func IFFT(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "IFFT",
-		Input: []tf.Input{
-			input,
-		},
+// value: If `True`, updating of the var and m tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyAddSignUseLocking(value bool) ResourceApplyAddSignAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// 2D fast Fourier transform.
+// Update '*var' according to the AddSign update.
 //
-// Computes the 2-dimensional discrete Fourier transform over the inner-most
-// 2 dimensions of `input`.
+// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+// update <- (alpha + sign_decay * sign(g) *sign(m)) * g
+// variable <- variable - lr_t * update
 //
 // Arguments:
-//	input: A complex64 tensor.
-//
-// Returns A complex64 tensor of the same shape as `input`. The inner-most 2
-//   dimensions of `input` are replaced with their 2D Fourier transform.
+//	var_: Should be from a Variable().
+//	m: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	alpha: Must be a scalar.
+//	sign_decay: Must be a scalar.
+//	beta: Must be a scalar.
+//	grad: The gradient.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.fft2
-// @end_compatibility
-func FFT2D(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns the created operation.
+func ResourceApplyAddSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, alpha tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyAddSignAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "FFT2D",
+		Type: "ResourceApplyAddSign",
 		Input: []tf.Input{
-			input,
+			var_, m, lr, alpha, sign_decay, beta, grad,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Inverse 2D fast Fourier transform.
+// Component-wise divides a SparseTensor by a dense Tensor.
 //
-// Computes the inverse 2-dimensional discrete Fourier transform over the
-// inner-most 2 dimensions of `input`.
+// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
+// the other direction.
 //
 // Arguments:
-//	input: A complex64 tensor.
-//
-// Returns A complex64 tensor of the same shape as `input`. The inner-most 2
-//   dimensions of `input` are replaced with their inverse 2D Fourier transform.
+//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//	dense: `R`-D.  The dense Tensor operand.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.ifft2
-// @end_compatibility
-func IFFT2D(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns 1-D.  The `N` values that are operated on.
+func SparseDenseCwiseDiv(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IFFT2D",
+		Type: "SparseDenseCwiseDiv",
 		Input: []tf.Input{
-			input,
+			sp_indices, sp_values, sp_shape, dense,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns element-wise remainder of division. This emulates C semantics in that
+// FractionalAvgPoolGradAttr is an optional argument to FractionalAvgPoolGrad.
+type FractionalAvgPoolGradAttr func(optionalAttr)
+
+// FractionalAvgPoolGradOverlapping sets the optional overlapping attribute to value.
 //
-// the result here is consistent with a truncating divide. E.g. `truncate(x / y) *
-// y + truncate_mod(x, y) = x`.
+// value: When set to True, it means when pooling, the values at the boundary
+// of adjacent pooling cells are used by both cells. For example:
 //
-// *NOTE*: `TruncateMod` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func TruncateMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// `index  0  1  2  3  4`
+//
+// `value  20 5  16 3  7`
+//
+// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
+// The result would be [41/3, 26/3] for fractional avg pooling.
+// If not specified, defaults to false
+func FractionalAvgPoolGradOverlapping(value bool) FractionalAvgPoolGradAttr {
+	return func(m optionalAttr) {
+		m["overlapping"] = value
+	}
+}
+
+// Computes gradient of the FractionalAvgPool function.
+//
+// Unlike FractionalMaxPoolGrad, we don't need to find arg_max for
+// FractionalAvgPoolGrad, we just need to evenly back-propagate each element of
+// out_backprop to those indices that form the same pooling cell. Therefore, we
+// just need to know the shape of original input tensor, instead of the whole
+// tensor.
+//
+// Arguments:
+//	orig_input_tensor_shape: Original input tensor shape for `fractional_avg_pool`
+//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
+// w.r.t. the output of `fractional_avg_pool`.
+//	row_pooling_sequence: row pooling sequence, form pooling region with
+// col_pooling_sequence.
+//	col_pooling_sequence: column pooling sequence, form pooling region with
+// row_pooling sequence.
+//
+// Returns 4-D.  Gradients w.r.t. the input of `fractional_avg_pool`.
+func FractionalAvgPoolGrad(scope *Scope, orig_input_tensor_shape tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalAvgPoolGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TruncateMod",
+		Type: "FractionalAvgPoolGrad",
 		Input: []tf.Input{
-			x, y,
+			orig_input_tensor_shape, out_backprop, row_pooling_sequence, col_pooling_sequence,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Inverse 2D real-valued fast Fourier transform.
-//
-// Computes the inverse 2-dimensional discrete Fourier transform of a real-valued
-// signal over the inner-most 2 dimensions of `input`.
-//
-// The inner-most 2 dimensions of `input` are assumed to be the result of `RFFT2D`:
-// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
-// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
-// from the size of the inner-most 2 dimensions of `input`. If the FFT length used
-// to compute `input` is odd, it should be provided since it cannot be inferred
-// properly.
+// Adds up a `SparseTensor` and a dense `Tensor`, producing a dense `Tensor`.
 //
-// Along each axis `IRFFT2D` is computed on, if `fft_length` (or
-// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
+// This Op does not require `a_indices` be sorted in standard lexicographic order.
 //
 // Arguments:
-//	input: A complex64 tensor.
-//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
-//
-// Returns A float32 tensor of the same rank as `input`. The inner-most 2
-//   dimensions of `input` are replaced with the `fft_length` samples of their
-//   inverse 2D Fourier transform.
-//
-// @compatibility(numpy)
-// Equivalent to np.fft.irfft2
-// @end_compatibility
-func IRFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+//	a_indices: 2-D.  The `indices` of the `SparseTensor`, with shape `[nnz, ndims]`.
+//	a_values: 1-D.  The `values` of the `SparseTensor`, with shape `[nnz]`.
+//	a_shape: 1-D.  The `shape` of the `SparseTensor`, with shape `[ndims]`.
+//	b: `ndims`-D Tensor.  With shape `a_shape`.
+func SparseTensorDenseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IRFFT2D",
+		Type: "SparseTensorDenseAdd",
 		Input: []tf.Input{
-			input, fft_length,
+			a_indices, a_values, a_shape, b,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DecodeJpegAttr is an optional argument to DecodeJpeg.
-type DecodeJpegAttr func(optionalAttr)
-
-// DecodeJpegChannels sets the optional channels attribute to value.
-//
-// value: Number of color channels for the decoded image.
-// If not specified, defaults to 0
-func DecodeJpegChannels(value int64) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["channels"] = value
-	}
-}
-
-// DecodeJpegRatio sets the optional ratio attribute to value.
-//
-// value: Downscaling ratio.
-// If not specified, defaults to 1
-func DecodeJpegRatio(value int64) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["ratio"] = value
-	}
-}
+// SparseToSparseSetOperationAttr is an optional argument to SparseToSparseSetOperation.
+type SparseToSparseSetOperationAttr func(optionalAttr)
 
-// DecodeJpegFancyUpscaling sets the optional fancy_upscaling attribute to value.
-//
-// value: If true use a slower but nicer upscaling of the
-// chroma planes (yuv420/422 only).
+// SparseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
 // If not specified, defaults to true
-func DecodeJpegFancyUpscaling(value bool) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["fancy_upscaling"] = value
-	}
-}
-
-// DecodeJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value.
-//
-// value: If true try to recover an image from truncated input.
-// If not specified, defaults to false
-func DecodeJpegTryRecoverTruncated(value bool) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["try_recover_truncated"] = value
-	}
-}
-
-// DecodeJpegAcceptableFraction sets the optional acceptable_fraction attribute to value.
-//
-// value: The minimum required fraction of lines before a truncated
-// input is accepted.
-// If not specified, defaults to 1
-func DecodeJpegAcceptableFraction(value float32) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["acceptable_fraction"] = value
-	}
-}
-
-// DecodeJpegDctMethod sets the optional dct_method attribute to value.
-//
-// value: string specifying a hint about the algorithm used for
-// decompression.  Defaults to "" which maps to a system-specific
-// default.  Currently valid values are ["INTEGER_FAST",
-// "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
-// jpeg library changes to a version that does not have that specific
-// option.)
-// If not specified, defaults to ""
-func DecodeJpegDctMethod(value string) DecodeJpegAttr {
+func SparseToSparseSetOperationValidateIndices(value bool) SparseToSparseSetOperationAttr {
 	return func(m optionalAttr) {
-		m["dct_method"] = value
+		m["validate_indices"] = value
 	}
 }
 
-// Decode a JPEG-encoded image to a uint8 tensor.
-//
-// The attr `channels` indicates the desired number of color channels for the
-// decoded image.
+// Applies set operation along last dimension of 2 `SparseTensor` inputs.
 //
-// Accepted values are:
+// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
 //
-// *   0: Use the number of channels in the JPEG-encoded image.
-// *   1: output a grayscale image.
-// *   3: output an RGB image.
+// If `validate_indices` is `True`, `SparseToSparseSetOperation` validates the
+// order and range of `set1` and `set2` indices.
 //
-// If needed, the JPEG-encoded image is transformed to match the requested number
-// of color channels.
+// Input `set1` is a `SparseTensor` represented by `set1_indices`, `set1_values`,
+// and `set1_shape`. For `set1` ranked `n`, 1st `n-1` dimensions must be the same
+// as `set2`. Dimension `n` contains values in a set, duplicates are allowed but
+// ignored.
 //
-// The attr `ratio` allows downscaling the image by an integer factor during
-// decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
-// downscaling the image later.
+// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
+// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
+// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
+// ignored.
 //
+// If `validate_indices` is `True`, this op validates the order and range of `set1`
+// and `set2` indices.
 //
-// This op also supports decoding PNGs and non-animated GIFs since the interface is
-// the same, though it is cleaner to use `tf.image.decode_image`.
+// Output `result` is a `SparseTensor` represented by `result_indices`,
+// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
+// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
+// dimension contains the result of `set_operation` applied to the corresponding
+// `[0...n-1]` dimension of `set`.
 //
 // Arguments:
-//	contents: 0-D.  The JPEG-encoded image.
+//	set1_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
+// order.
+//	set1_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
+// order.
+//	set1_shape: 1D `Tensor`, shape of a `SparseTensor`. `set1_shape[0...n-1]` must
+// be the same as `set2_shape[0...n-1]`, `set1_shape[n]` is the
+// max set size across `0...n-1` dimensions.
+//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
+// be the same as `set1_shape[0...n-1]`, `set2_shape[n]` is the
+// max set size across `0...n-1` dimensions.
 //
-// Returns 3-D with shape `[height, width, channels]`..
-func DecodeJpeg(scope *Scope, contents tf.Output, optional ...DecodeJpegAttr) (image tf.Output) {
+//
+// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
+// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
+// is the max result set size across all `0...n-1` dimensions.
+func SparseToSparseSetOperation(scope *Scope, set1_indices tf.Output, set1_values tf.Output, set1_shape tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...SparseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"set_operation": set_operation}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeJpeg",
+		Type: "SparseToSparseSetOperation",
 		Input: []tf.Input{
-			contents,
+			set1_indices, set1_values, set1_shape, set2_indices, set2_values, set2_shape,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// StageSizeAttr is an optional argument to StageSize.
-type StageSizeAttr func(optionalAttr)
+// MutableDenseHashTableV2Attr is an optional argument to MutableDenseHashTableV2.
+type MutableDenseHashTableV2Attr func(optionalAttr)
 
-// StageSizeCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// MutableDenseHashTableV2Container sets the optional container attribute to value.
 //
-// REQUIRES: value >= 0
-func StageSizeCapacity(value int64) StageSizeAttr {
+// value: If non-empty, this table is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func MutableDenseHashTableV2Container(value string) MutableDenseHashTableV2Attr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["container"] = value
 	}
 }
 
-// StageSizeMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// MutableDenseHashTableV2SharedName sets the optional shared_name attribute to value.
 //
-// REQUIRES: value >= 0
-func StageSizeMemoryLimit(value int64) StageSizeAttr {
+// value: If non-empty, this table is shared under the given name across
+// multiple sessions.
+// If not specified, defaults to ""
+func MutableDenseHashTableV2SharedName(value string) MutableDenseHashTableV2Attr {
 	return func(m optionalAttr) {
-		m["memory_limit"] = value
+		m["shared_name"] = value
 	}
 }
 
-// StageSizeContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func StageSizeContainer(value string) StageSizeAttr {
+// MutableDenseHashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
+// If not specified, defaults to false
+func MutableDenseHashTableV2UseNodeNameSharing(value bool) MutableDenseHashTableV2Attr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["use_node_name_sharing"] = value
 	}
 }
 
-// StageSizeSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func StageSizeSharedName(value string) StageSizeAttr {
+// MutableDenseHashTableV2ValueShape sets the optional value_shape attribute to value.
+//
+// value: The shape of each value.
+// If not specified, defaults to <>
+func MutableDenseHashTableV2ValueShape(value tf.Shape) MutableDenseHashTableV2Attr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["value_shape"] = value
 	}
 }
 
-// Op returns the number of elements in the underlying container.
-func StageSize(scope *Scope, dtypes []tf.DataType, optional ...StageSizeAttr) (size tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
+// MutableDenseHashTableV2InitialNumBuckets sets the optional initial_num_buckets attribute to value.
+//
+// value: The initial number of hash table buckets. Must be a power
+// to 2.
+// If not specified, defaults to 131072
+func MutableDenseHashTableV2InitialNumBuckets(value int64) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["initial_num_buckets"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "StageSize",
+}
 
-		Attrs: attrs,
+// MutableDenseHashTableV2MaxLoadFactor sets the optional max_load_factor attribute to value.
+//
+// value: The maximum ratio between number of entries and number of
+// buckets before growing the table. Must be between 0 and 1.
+// If not specified, defaults to 0.8
+func MutableDenseHashTableV2MaxLoadFactor(value float32) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["max_load_factor"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Produces the max pool of the input tensor for quantized types.
+// Creates an empty hash table that uses tensors as the backing store.
+//
+// It uses "open addressing" with quadratic reprobing to resolve
+// collisions.
+//
+// This op creates a mutable hash table, specifying the type of its keys and
+// values. Each value must be a scalar. Data can be inserted into the table using
+// the insert operations. It does not support the initialization operation.
 //
 // Arguments:
-//	input: The 4D (batch x rows x cols x depth) Tensor to MaxReduce over.
-//	min_input: The float value that the lowest quantized input value represents.
-//	max_input: The float value that the highest quantized input value represents.
-//	ksize: The size of the window for each dimension of the input tensor.
-// The length must be 4 to match the number of dimensions of the input.
-//	strides: The stride of the sliding window for each dimension of the input
-// tensor. The length must be 4 to match the number of dimensions of the input.
-//	padding: The type of padding algorithm to use.
+//	empty_key: The key used to represent empty key buckets internally. Must not
+// be used in insert or lookup operations.
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedMaxPool(scope *Scope, input tf.Output, min_input tf.Output, max_input tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output, min_output tf.Output, max_output tf.Output) {
+//	value_dtype: Type of the table values.
+//
+// Returns Handle to a table.
+func MutableDenseHashTableV2(scope *Scope, empty_key tf.Output, deleted_key tf.Output, value_dtype tf.DataType, optional ...MutableDenseHashTableV2Attr) (table_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"value_dtype": value_dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedMaxPool",
+		Type: "MutableDenseHashTableV2",
 		Input: []tf.Input{
-			input, min_input, max_input,
+			empty_key, deleted_key,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Computes softplus: `log(exp(features) + 1)`.
-func Softplus(scope *Scope, features tf.Output) (activations tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Softplus",
-		Input: []tf.Input{
-			features,
-		},
+// UpperBoundAttr is an optional argument to UpperBound.
+type UpperBoundAttr func(optionalAttr)
+
+// UpperBoundOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_INT32
+func UpperBoundOutType(value tf.DataType) UpperBoundAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes exponential of x - 1 element-wise.
+// Applies upper_bound(sorted_search_values, values) along each row.
 //
-// I.e., \\(y = (\exp x) - 1\\).
-func Expm1(scope *Scope, x tf.Output) (y tf.Output) {
+// Each set of rows with the same index in (sorted_inputs, values) is treated
+// independently.  The resulting row is the equivalent of calling
+// `np.searchsorted(sorted_inputs, values, side='right')`.
+//
+// The result is not a global index to the entire
+// `Tensor`, but rather just the index in the last dimension.
+//
+// A 2-D example:
+//   sorted_sequence = [[0, 3, 9, 9, 10],
+//                      [1, 2, 3, 4, 5]]
+//   values = [[2, 4, 9],
+//             [0, 2, 6]]
+//
+//   result = UpperBound(sorted_sequence, values)
+//
+//   result == [[1, 2, 4],
+//              [0, 2, 5]]
+//
+// Arguments:
+//	sorted_inputs: 2-D Tensor where each row is ordered.
+//	values: 2-D Tensor with the same numbers of rows as `sorted_search_values`. Contains
+// the values that will be searched for in `sorted_search_values`.
+//
+// Returns A `Tensor` with the same shape as `values`.  It contains the last scalar index
+// into the last dimension where values can be inserted without changing the
+// ordered property.
+func UpperBound(scope *Scope, sorted_inputs tf.Output, values tf.Output, optional ...UpperBoundAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Expm1",
+		Type: "UpperBound",
 		Input: []tf.Input{
-			x,
+			sorted_inputs, values,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the number of records this Reader has produced.
+// FractionalMaxPoolGradAttr is an optional argument to FractionalMaxPoolGrad.
+type FractionalMaxPoolGradAttr func(optionalAttr)
+
+// FractionalMaxPoolGradOverlapping sets the optional overlapping attribute to value.
 //
-// This is the same as the number of ReaderRead executions that have
-// succeeded.
+// value: When set to True, it means when pooling, the values at the boundary
+// of adjacent pooling cells are used by both cells. For example:
+//
+// `index  0  1  2  3  4`
+//
+// `value  20 5  16 3  7`
+//
+// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
+// The result would be [20, 16] for fractional max pooling.
+// If not specified, defaults to false
+func FractionalMaxPoolGradOverlapping(value bool) FractionalMaxPoolGradAttr {
+	return func(m optionalAttr) {
+		m["overlapping"] = value
+	}
+}
+
+// Computes gradient of the FractionalMaxPool function.
 //
 // Arguments:
-//	reader_handle: Handle to a Reader.
-func ReaderNumRecordsProducedV2(scope *Scope, reader_handle tf.Output) (records_produced tf.Output) {
+//	orig_input: Original input for `fractional_max_pool`
+//	orig_output: Original output for `fractional_max_pool`
+//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
+// w.r.t. the output of `fractional_max_pool`.
+//	row_pooling_sequence: row pooling sequence, form pooling region with
+// col_pooling_sequence.
+//	col_pooling_sequence: column pooling sequence, form pooling region with
+// row_pooling sequence.
+//
+// Returns 4-D.  Gradients w.r.t. the input of `fractional_max_pool`.
+func FractionalMaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalMaxPoolGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ReaderNumRecordsProducedV2",
+		Type: "FractionalMaxPoolGrad",
 		Input: []tf.Input{
-			reader_handle,
+			orig_input, orig_output, out_backprop, row_pooling_sequence, col_pooling_sequence,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the set of files matching one or more glob patterns.
+// ResourceApplyAdagradDAAttr is an optional argument to ResourceApplyAdagradDA.
+type ResourceApplyAdagradDAAttr func(optionalAttr)
+
+// ResourceApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
 //
-// Note that this routine only supports wildcard characters in the
-// basename portion of the pattern, not in the directory portion.
-// Note also that the order of filenames returned can be non-deterministic.
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceApplyAdagradDAUseLocking(value bool) ResourceApplyAdagradDAAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the proximal adagrad scheme.
 //
 // Arguments:
-//	pattern: Shell wildcard pattern(s). Scalar or vector of type string.
+//	var_: Should be from a Variable().
+//	gradient_accumulator: Should be from a Variable().
+//	gradient_squared_accumulator: Should be from a Variable().
+//	grad: The gradient.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	global_step: Training step number. Must be a scalar.
 //
-// Returns A vector of matching filenames.
-func MatchingFiles(scope *Scope, pattern tf.Output) (filenames tf.Output) {
+// Returns the created operation.
+func ResourceApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceApplyAdagradDAAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "MatchingFiles",
+		Type: "ResourceApplyAdagradDA",
 		Input: []tf.Input{
-			pattern,
+			var_, gradient_accumulator, gradient_squared_accumulator, grad, lr, l1, l2, global_step,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// HistogramFixedWidthAttr is an optional argument to HistogramFixedWidth.
-type HistogramFixedWidthAttr func(optionalAttr)
+// SparseReduceMaxSparseAttr is an optional argument to SparseReduceMaxSparse.
+type SparseReduceMaxSparseAttr func(optionalAttr)
 
-// HistogramFixedWidthDtype sets the optional dtype attribute to value.
-// If not specified, defaults to DT_INT32
-func HistogramFixedWidthDtype(value tf.DataType) HistogramFixedWidthAttr {
+// SparseReduceMaxSparseKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SparseReduceMaxSparseKeepDims(value bool) SparseReduceMaxSparseAttr {
 	return func(m optionalAttr) {
-		m["dtype"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Return histogram of values.
+// Computes the max of elements across dimensions of a SparseTensor.
 //
-// Given the tensor `values`, this operation returns a rank 1 histogram counting
-// the number of entries in `values` that fall into every bin.  The bins are
-// equal width and determined by the arguments `value_range` and `nbins`.
+// This Op takes a SparseTensor and is the sparse counterpart to
+// `tf.reduce_max()`.  In contrast to SparseReduceMax, this Op returns a
+// SparseTensor.
 //
-// ```python
-// # Bins will be:  (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
-// nbins = 5
-// value_range = [0.0, 5.0]
-// new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]
+// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+// with length 1.
 //
-// with tf.get_default_session() as sess:
-//   hist = tf.histogram_fixed_width(new_values, value_range, nbins=5)
-//   variables.global_variables_initializer().run()
-//   sess.run(hist) => [2, 1, 1, 0, 2]
-// ```
+// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+// with a single element is returned.  Additionally, the axes can be negative,
+// which are interpreted according to the indexing rules in Python.
 //
 // Arguments:
-//	values: Numeric `Tensor`.
-//	value_range: Shape [2] `Tensor` of same `dtype` as `values`.
-// values <= value_range[0] will be mapped to hist[0],
-// values >= value_range[1] will be mapped to hist[-1].
-//	nbins: Scalar `int32 Tensor`.  Number of histogram bins.
-//
-// Returns A 1-D `Tensor` holding histogram of values.
-func HistogramFixedWidth(scope *Scope, values tf.Output, value_range tf.Output, nbins tf.Output, optional ...HistogramFixedWidthAttr) (out tf.Output) {
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+func SparseReduceMaxSparse(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceMaxSparseAttr) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -20248,288 +20019,270 @@ func HistogramFixedWidth(scope *Scope, values tf.Output, value_range tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "HistogramFixedWidth",
+		Type: "SparseReduceMaxSparse",
 		Input: []tf.Input{
-			values, value_range, nbins,
+			input_indices, input_values, input_shape, reduction_axes,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Returns the truth value of (x >= y) element-wise.
+// Creates a dataset that emits the outputs of `input_dataset` `count` times.
 //
-// *NOTE*: `GreaterEqual` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func GreaterEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Arguments:
+//
+//	count: A scalar representing the number of times that `input_dataset` should
+// be repeated. A value of `-1` indicates that it should be repeated infinitely.
+//
+//
+func RepeatDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "GreaterEqual",
+		Type: "RepeatDataset",
 		Input: []tf.Input{
-			x, y,
+			input_dataset, count,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Conv3DAttr is an optional argument to Conv3D.
-type Conv3DAttr func(optionalAttr)
-
-// Conv3DDataFormat sets the optional data_format attribute to value.
-//
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func Conv3DDataFormat(value string) Conv3DAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Conv3DDilations sets the optional dilations attribute to value.
-//
-// value: 1-D tensor of length 5.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each
-// filter element on that dimension. The dimension order is determined by the
-// value of `data_format`, see above for details. Dilations in the batch and
-// depth dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
-func Conv3DDilations(value []int64) Conv3DAttr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
-	}
-}
-
-// Computes a 3-D convolution given 5-D `input` and `filter` tensors.
-//
-// In signal processing, cross-correlation is a measure of similarity of
-// two waveforms as a function of a time-lag applied to one of them. This
-// is also known as a sliding dot product or sliding inner-product.
-//
-// Our Conv3D implements a form of cross-correlation.
+// Computes the gradient for the inverse of `x` wrt its input.
 //
-// Arguments:
-//	input: Shape `[batch, in_depth, in_height, in_width, in_channels]`.
-//	filter: Shape `[filter_depth, filter_height, filter_width, in_channels,
-// out_channels]`. `in_channels` must match between `input` and `filter`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...Conv3DAttr) (output tf.Output) {
+// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
+// is the corresponding input gradient.
+func ReciprocalGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Conv3D",
+		Type: "ReciprocalGrad",
 		Input: []tf.Input{
-			input, filter,
+			y, dy,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Adds up a SparseTensor and a dense Tensor, using these special rules:
-//
-// (1) Broadcasts the dense side to have the same shape as the sparse side, if
-//     eligible;
-// (2) Then, only the dense values pointed to by the indices of the SparseTensor
-//     participate in the cwise addition.
-//
-// By these rules, the result is a logical SparseTensor with exactly the same
-// indices and shape, but possibly with different non-zero values.  The output of
-// this Op is the resultant non-zero values.
-//
-// Arguments:
-//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
-//	dense: `R`-D.  The dense Tensor operand.
+// Returns the min of x and y (i.e. x < y ? x : y) element-wise.
 //
-// Returns 1-D.  The `N` values that are operated on.
-func SparseDenseCwiseAdd(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
+// *NOTE*: `Minimum` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Minimum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseDenseCwiseAdd",
+		Type: "Minimum",
 		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape, dense,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QuantizeV2Attr is an optional argument to QuantizeV2.
-type QuantizeV2Attr func(optionalAttr)
+// MfccAttr is an optional argument to Mfcc.
+type MfccAttr func(optionalAttr)
 
-// QuantizeV2Mode sets the optional mode attribute to value.
-// If not specified, defaults to "MIN_COMBINED"
-func QuantizeV2Mode(value string) QuantizeV2Attr {
+// MfccUpperFrequencyLimit sets the optional upper_frequency_limit attribute to value.
+//
+// value: The highest frequency to use when calculating the
+// ceptstrum.
+// If not specified, defaults to 4000
+func MfccUpperFrequencyLimit(value float32) MfccAttr {
 	return func(m optionalAttr) {
-		m["mode"] = value
+		m["upper_frequency_limit"] = value
 	}
 }
 
-// QuantizeV2RoundMode sets the optional round_mode attribute to value.
-// If not specified, defaults to "HALF_AWAY_FROM_ZERO"
-func QuantizeV2RoundMode(value string) QuantizeV2Attr {
+// MfccLowerFrequencyLimit sets the optional lower_frequency_limit attribute to value.
+//
+// value: The lowest frequency to use when calculating the
+// ceptstrum.
+// If not specified, defaults to 20
+func MfccLowerFrequencyLimit(value float32) MfccAttr {
 	return func(m optionalAttr) {
-		m["round_mode"] = value
+		m["lower_frequency_limit"] = value
 	}
 }
 
-// Quantize the 'input' tensor of type float to 'output' tensor of type 'T'.
-//
-// [min_range, max_range] are scalar floats that specify the range for
-// the 'input' data. The 'mode' attribute controls exactly which calculations are
-// used to convert the float values to their quantized equivalents.  The
-// 'round_mode' attribute controls which rounding tie-breaking algorithm is used
-// when rounding float values to their quantized equivalents.
-//
-// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
-//
-// ```
-// out[i] = (in[i] - min_range) * range(T) / (max_range - min_range)
-// if T == qint8: out[i] -= (range(T) + 1) / 2.0
-// ```
-//
-// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
-//
-// *MIN_COMBINED Mode Example*
-//
-// Assume the input is type float and has a possible range of [0.0, 6.0] and the
-// output type is quint8 ([0, 255]). The min_range and max_range values should be
-// specified as 0.0 and 6.0. Quantizing from float to quint8 will multiply each
-// value of the input by 255/6 and cast to quint8.
-//
-// If the output type was qint8 ([-128, 127]), the operation will additionally
-// subtract each value by 128 prior to casting, so that the range of values aligns
-// with the range of qint8.
-//
-// If the mode is 'MIN_FIRST', then this approach is used:
-//
-// ```
-// num_discrete_values = 1 << (# of bits in T)
-// range_adjust = num_discrete_values / (num_discrete_values - 1)
-// range = (range_max - range_min) * range_adjust
-// range_scale = num_discrete_values / range
-// quantized = round(input * range_scale) - round(range_min * range_scale) +
-//   numeric_limits<T>::min()
-// quantized = max(quantized, numeric_limits<T>::min())
-// quantized = min(quantized, numeric_limits<T>::max())
-// ```
-//
-// The biggest difference between this and MIN_COMBINED is that the minimum range
-// is rounded first, before it's subtracted from the rounded value. With
-// MIN_COMBINED, a small bias is introduced where repeated iterations of quantizing
-// and dequantizing will introduce a larger and larger error.
+// MfccFilterbankChannelCount sets the optional filterbank_channel_count attribute to value.
 //
-// *SCALED mode Example*
+// value: Resolution of the Mel bank used internally.
+// If not specified, defaults to 40
+func MfccFilterbankChannelCount(value int64) MfccAttr {
+	return func(m optionalAttr) {
+		m["filterbank_channel_count"] = value
+	}
+}
+
+// MfccDctCoefficientCount sets the optional dct_coefficient_count attribute to value.
 //
-// `SCALED` mode matches the quantization approach used in
-// `QuantizeAndDequantize{V2|V3}`.
+// value: How many output channels to produce per time slice.
+// If not specified, defaults to 13
+func MfccDctCoefficientCount(value int64) MfccAttr {
+	return func(m optionalAttr) {
+		m["dct_coefficient_count"] = value
+	}
+}
+
+// Transforms a spectrogram into a form that's useful for speech recognition.
 //
-// If the mode is `SCALED`, we do not use the full range of the output type,
-// choosing to elide the lowest possible value for symmetry (e.g., output range is
-// -127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to
-// 0.
+// Mel Frequency Cepstral Coefficients are a way of representing audio data that's
+// been effective as an input feature for machine learning. They are created by
+// taking the spectrum of a spectrogram (a 'cepstrum'), and discarding some of the
+// higher frequencies that are less significant to the human ear. They have a long
+// history in the speech recognition world, and https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
+// is a good resource to learn more.
 //
-// We first find the range of values in our tensor. The
-// range we use is always centered on 0, so we find m such that
+// Arguments:
+//	spectrogram: Typically produced by the Spectrogram op, with magnitude_squared
+// set to true.
+//	sample_rate: How many samples per second the source audio used.
+func Mfcc(scope *Scope, spectrogram tf.Output, sample_rate tf.Output, optional ...MfccAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Mfcc",
+		Input: []tf.Input{
+			spectrogram, sample_rate,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Compute the Hurwitz zeta function \\(\zeta(x, q)\\).
 //
-// ```c++
-//   m = max(abs(input_min), abs(input_max))
-// ```
+// The Hurwitz zeta function is defined as:
 //
-// Our input tensor range is then `[-m, m]`.
 //
-// Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
-// If T is signed, this is
+// \\(\zeta(x, q) = \sum_{n=0}^{\infty} (q + n)^{-x}\\)
+func Zeta(scope *Scope, x tf.Output, q tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Zeta",
+		Input: []tf.Input{
+			x, q,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Inverse fast Fourier transform.
 //
-// ```
-//   num_bits = sizeof(T) * 8
-//   [min_fixed, max_fixed] =
-//       [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]
-// ```
+// Computes the inverse 1-dimensional discrete Fourier transform over the
+// inner-most dimension of `input`.
 //
-// Otherwise, if T is unsigned, the fixed-point range is
+// Arguments:
+//	input: A complex64 tensor.
 //
-// ```
-//   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
-// ```
+// Returns A complex64 tensor of the same shape as `input`. The inner-most
+//   dimension of `input` is replaced with its inverse 1D Fourier transform.
 //
-// From this we compute our scaling factor, s:
+// @compatibility(numpy)
+// Equivalent to np.fft.ifft
+// @end_compatibility
+func IFFT(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IFFT",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// 2D fast Fourier transform.
 //
-// ```c++
-//   s = (max_fixed - min_fixed) / (2 * m)
-// ```
+// Computes the 2-dimensional discrete Fourier transform over the inner-most
+// 2 dimensions of `input`.
 //
-// Now we can quantize the elements of our tensor:
+// Arguments:
+//	input: A complex64 tensor.
 //
-// ```c++
-// result = round(input * s)
-// ```
+// Returns A complex64 tensor of the same shape as `input`. The inner-most 2
+//   dimensions of `input` are replaced with their 2D Fourier transform.
 //
-// One thing to watch out for is that the operator may choose to adjust the
-// requested minimum and maximum values slightly during the quantization process,
-// so you should always use the output ports as the range for further calculations.
-// For example, if the requested minimum and maximum values are close to equal,
-// they will be separated by a small epsilon value to prevent ill-formed quantized
-// buffers from being created. Otherwise, you can end up with buffers where all the
-// quantized values map to the same float value, which causes problems for
-// operations that have to perform further calculations on them.
+// @compatibility(numpy)
+// Equivalent to np.fft.fft2
+// @end_compatibility
+func FFT2D(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "FFT2D",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Inverse 2D fast Fourier transform.
 //
-// Arguments:
+// Computes the inverse 2-dimensional discrete Fourier transform over the
+// inner-most 2 dimensions of `input`.
 //
-//	min_range: The minimum scalar value possibly produced for the input.
-//	max_range: The maximum scalar value possibly produced for the input.
+// Arguments:
+//	input: A complex64 tensor.
 //
+// Returns A complex64 tensor of the same shape as `input`. The inner-most 2
+//   dimensions of `input` are replaced with their inverse 2D Fourier transform.
 //
-// Returns The quantized data produced from the float input.The actual minimum scalar value used for the output.The actual maximum scalar value used for the output.
-func QuantizeV2(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, T tf.DataType, optional ...QuantizeV2Attr) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.fft.ifft2
+// @end_compatibility
+func IFFT2D(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"T": T}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "QuantizeV2",
+		Type: "IFFT2D",
 		Input: []tf.Input{
-			input, min_range, max_range,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Returns the truth value of (x < y) element-wise.
+// Returns element-wise remainder of division. This emulates C semantics in that
 //
-// *NOTE*: `Less` supports broadcasting. More about broadcasting
+// the result here is consistent with a truncating divide. E.g. `truncate(x / y) *
+// y + truncate_mod(x, y) = x`.
+//
+// *NOTE*: `TruncateMod` supports broadcasting. More about broadcasting
 // [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Less(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+func TruncateMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Less",
+		Type: "TruncateMod",
 		Input: []tf.Input{
 			x, y,
 		},
@@ -20538,387 +20291,367 @@ func Less(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
-// QuantizedReluXAttr is an optional argument to QuantizedReluX.
-type QuantizedReluXAttr func(optionalAttr)
-
-// QuantizedReluXOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QUINT8
-func QuantizedReluXOutType(value tf.DataType) QuantizedReluXAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
-}
-
-// Computes Quantized Rectified Linear X: `min(max(features, 0), max_value)`
+// Inverse 2D real-valued fast Fourier transform.
 //
-// Arguments:
+// Computes the inverse 2-dimensional discrete Fourier transform of a real-valued
+// signal over the inner-most 2 dimensions of `input`.
 //
+// The inner-most 2 dimensions of `input` are assumed to be the result of `RFFT2D`:
+// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
+// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
+// from the size of the inner-most 2 dimensions of `input`. If the FFT length used
+// to compute `input` is odd, it should be provided since it cannot be inferred
+// properly.
 //
-//	min_features: The float value that the lowest quantized value represents.
-//	max_features: The float value that the highest quantized value represents.
+// Along each axis `IRFFT2D` is computed on, if `fft_length` (or
+// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
 //
-// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
-func QuantizedReluX(scope *Scope, features tf.Output, max_value tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluXAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
+// Arguments:
+//	input: A complex64 tensor.
+//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
+//
+// Returns A float32 tensor of the same rank as `input`. The inner-most 2
+//   dimensions of `input` are replaced with the `fft_length` samples of their
+//   inverse 2D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.irfft2
+// @end_compatibility
+func IRFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedReluX",
+		Type: "IRFFT2D",
 		Input: []tf.Input{
-			features, max_value, min_features, max_features,
+			input, fft_length,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
+}
+
+// DecodeJpegAttr is an optional argument to DecodeJpeg.
+type DecodeJpegAttr func(optionalAttr)
+
+// DecodeJpegChannels sets the optional channels attribute to value.
+//
+// value: Number of color channels for the decoded image.
+// If not specified, defaults to 0
+func DecodeJpegChannels(value int64) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["channels"] = value
+	}
+}
+
+// DecodeJpegRatio sets the optional ratio attribute to value.
+//
+// value: Downscaling ratio.
+// If not specified, defaults to 1
+func DecodeJpegRatio(value int64) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["ratio"] = value
+	}
+}
+
+// DecodeJpegFancyUpscaling sets the optional fancy_upscaling attribute to value.
+//
+// value: If true use a slower but nicer upscaling of the
+// chroma planes (yuv420/422 only).
+// If not specified, defaults to true
+func DecodeJpegFancyUpscaling(value bool) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["fancy_upscaling"] = value
+	}
 }
 
-// Creates a dataset that batches `batch_size` elements from `input_dataset`.
-//
-// Arguments:
-//
-//	batch_size: A scalar representing the number of elements to accumulate in a batch.
-//	drop_remainder: A scalar representing whether the last batch should be dropped in case its size
-// is smaller than desired.
-//
+// DecodeJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value.
 //
-func BatchDatasetV2(scope *Scope, input_dataset tf.Output, batch_size tf.Output, drop_remainder tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "BatchDatasetV2",
-		Input: []tf.Input{
-			input_dataset, batch_size, drop_remainder,
-		},
-		Attrs: attrs,
+// value: If true try to recover an image from truncated input.
+// If not specified, defaults to false
+func DecodeJpegTryRecoverTruncated(value bool) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["try_recover_truncated"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// QuantizedConv2DAttr is an optional argument to QuantizedConv2D.
-type QuantizedConv2DAttr func(optionalAttr)
-
-// QuantizedConv2DOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QINT32
-func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr {
+// DecodeJpegAcceptableFraction sets the optional acceptable_fraction attribute to value.
+//
+// value: The minimum required fraction of lines before a truncated
+// input is accepted.
+// If not specified, defaults to 1
+func DecodeJpegAcceptableFraction(value float32) DecodeJpegAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["acceptable_fraction"] = value
 	}
 }
 
-// QuantizedConv2DDilations sets the optional dilations attribute to value.
+// DecodeJpegDctMethod sets the optional dct_method attribute to value.
 //
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each
-// filter element on that dimension. The dimension order is determined by the
-// value of `data_format`, see above for details. Dilations in the batch and
-// depth dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr {
+// value: string specifying a hint about the algorithm used for
+// decompression.  Defaults to "" which maps to a system-specific
+// default.  Currently valid values are ["INTEGER_FAST",
+// "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
+// jpeg library changes to a version that does not have that specific
+// option.)
+// If not specified, defaults to ""
+func DecodeJpegDctMethod(value string) DecodeJpegAttr {
 	return func(m optionalAttr) {
-		m["dilations"] = value
+		m["dct_method"] = value
 	}
 }
 
-// Computes a 2D convolution given quantized 4D input and filter tensors.
+// Decode a JPEG-encoded image to a uint8 tensor.
 //
-// The inputs are quantized tensors where the lowest value represents the real
-// number of the associated minimum, and the highest represents the maximum.
-// This means that you can only interpret the quantized output in the same way, by
-// taking the returned minimum and maximum values into account.
+// The attr `channels` indicates the desired number of color channels for the
+// decoded image.
 //
-// Arguments:
+// Accepted values are:
 //
-//	filter: filter's input_depth dimension must match input's depth dimensions.
-//	min_input: The float value that the lowest quantized input value represents.
-//	max_input: The float value that the highest quantized input value represents.
-//	min_filter: The float value that the lowest quantized filter value represents.
-//	max_filter: The float value that the highest quantized filter value represents.
-//	strides: The stride of the sliding window for each dimension of the input
-// tensor.
-//	padding: The type of padding algorithm to use.
+// *   0: Use the number of channels in the JPEG-encoded image.
+// *   1: output a grayscale image.
+// *   3: output an RGB image.
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedConv2D(scope *Scope, input tf.Output, filter tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, strides []int64, padding string, optional ...QuantizedConv2DAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) {
+// If needed, the JPEG-encoded image is transformed to match the requested number
+// of color channels.
+//
+// The attr `ratio` allows downscaling the image by an integer factor during
+// decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
+// downscaling the image later.
+//
+//
+// This op also supports decoding PNGs and non-animated GIFs since the interface is
+// the same, though it is cleaner to use `tf.image.decode_image`.
+//
+// Arguments:
+//	contents: 0-D.  The JPEG-encoded image.
+//
+// Returns 3-D with shape `[height, width, channels]`..
+func DecodeJpeg(scope *Scope, contents tf.Output, optional ...DecodeJpegAttr) (image tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedConv2D",
+		Type: "DecodeJpeg",
 		Input: []tf.Input{
-			input, filter, min_input, max_input, min_filter, max_filter,
+			contents,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// StatelessMultinomialAttr is an optional argument to StatelessMultinomial.
-type StatelessMultinomialAttr func(optionalAttr)
-
-// StatelessMultinomialOutputDtype sets the optional output_dtype attribute to value.
-// If not specified, defaults to DT_INT64
-func StatelessMultinomialOutputDtype(value tf.DataType) StatelessMultinomialAttr {
-	return func(m optionalAttr) {
-		m["output_dtype"] = value
-	}
+	return op.Output(0)
 }
 
-// Draws samples from a multinomial distribution.
+// Inverse 3D real-valued fast Fourier transform.
+//
+// Computes the inverse 3-dimensional discrete Fourier transform of a real-valued
+// signal over the inner-most 3 dimensions of `input`.
+//
+// The inner-most 3 dimensions of `input` are assumed to be the result of `RFFT3D`:
+// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
+// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
+// from the size of the inner-most 3 dimensions of `input`. If the FFT length used
+// to compute `input` is odd, it should be provided since it cannot be inferred
+// properly.
+//
+// Along each axis `IRFFT3D` is computed on, if `fft_length` (or
+// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
 //
 // Arguments:
-//	logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
-// represents the unnormalized log probabilities for all classes.
-//	num_samples: 0-D.  Number of independent samples to draw for each row slice.
-//	seed: 2 seeds (shape [2]).
+//	input: A complex64 tensor.
+//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
 //
-// Returns 2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
-// contains the drawn class labels with range `[0, num_classes)`.
-func StatelessMultinomial(scope *Scope, logits tf.Output, num_samples tf.Output, seed tf.Output, optional ...StatelessMultinomialAttr) (output tf.Output) {
+// Returns A float32 tensor of the same rank as `input`. The inner-most 3
+//   dimensions of `input` are replaced with the `fft_length` samples of their
+//   inverse 3D real Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.irfftn with 3 dimensions.
+// @end_compatibility
+func IRFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "StatelessMultinomial",
+		Type: "IRFFT3D",
 		Input: []tf.Input{
-			logits, num_samples, seed,
+			input, fft_length,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceGatherAttr is an optional argument to ResourceGather.
-type ResourceGatherAttr func(optionalAttr)
-
-// ResourceGatherValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func ResourceGatherValidateIndices(value bool) ResourceGatherAttr {
-	return func(m optionalAttr) {
-		m["validate_indices"] = value
-	}
-}
-
-// Gather slices from the variable pointed to by `resource` according to `indices`.
-//
-// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
-// Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
-//
-// ```python
-//     # Scalar indices
-//     output[:, ..., :] = params[indices, :, ... :]
-//
-//     # Vector indices
-//     output[i, :, ..., :] = params[indices[i], :, ... :]
+// Returns the truth value of (x != y) element-wise.
 //
-//     # Higher rank indices
-//     output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]
-// ```
-func ResourceGather(scope *Scope, resource tf.Output, indices tf.Output, dtype tf.DataType, optional ...ResourceGatherAttr) (output tf.Output) {
+// *NOTE*: `NotEqual` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func NotEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceGather",
+		Type: "NotEqual",
 		Input: []tf.Input{
-			resource, indices,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Delete the TensorArray from its resource container.
-//
-// This enables the user to close and release the resource in the middle
-// of a step/run.
+// Produces the max pool of the input tensor for quantized types.
 //
 // Arguments:
-//	handle: The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
+//	input: The 4D (batch x rows x cols x depth) Tensor to MaxReduce over.
+//	min_input: The float value that the lowest quantized input value represents.
+//	max_input: The float value that the highest quantized input value represents.
+//	ksize: The size of the window for each dimension of the input tensor.
+// The length must be 4 to match the number of dimensions of the input.
+//	strides: The stride of the sliding window for each dimension of the input
+// tensor. The length must be 4 to match the number of dimensions of the input.
+//	padding: The type of padding algorithm to use.
 //
-// Returns the created operation.
-func TensorArrayCloseV3(scope *Scope, handle tf.Output) (o *tf.Operation) {
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedMaxPool(scope *Scope, input tf.Output, min_input tf.Output, max_input tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output, min_output tf.Output, max_output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayCloseV3",
+		Type: "QuantizedMaxPool",
 		Input: []tf.Input{
-			handle,
+			input, min_input, max_input,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Saves the input tensors to disk.
-//
-// The size of `tensor_names` must match the number of tensors in `data`. `data[i]`
-// is written to `filename` with name `tensor_names[i]`.
-//
-// See also `SaveSlices`.
-//
-// Arguments:
-//	filename: Must have a single element. The name of the file to which we write
-// the tensor.
-//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
-//	data: `N` tensors to save.
-//
-// Returns the created operation.
-func Save(scope *Scope, filename tf.Output, tensor_names tf.Output, data []tf.Output) (o *tf.Operation) {
+// Computes softplus: `log(exp(features) + 1)`.
+func Softplus(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Save",
+		Type: "Softplus",
 		Input: []tf.Input{
-			filename, tensor_names, tf.OutputList(data),
+			features,
 		},
 	}
-	return scope.AddOperation(opspec)
-}
-
-// Returns element-wise remainder of division. When `x < 0` xor `y < 0` is
-//
-// true, this follows Python semantics in that the result here is consistent
-// with a flooring divide. E.g. `floor(x / y) * y + mod(x, y) = x`.
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes exponential of x - 1 element-wise.
 //
-// *NOTE*: `FloorMod` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func FloorMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// I.e., \\(y = (\exp x) - 1\\).
+func Expm1(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "FloorMod",
+		Type: "Expm1",
 		Input: []tf.Input{
-			x, y,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the gradient of morphological 2-D dilation with respect to the filter.
+// Returns the number of records this Reader has produced.
 //
-// Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
-//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
-//	out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`.
-//	strides: 1-D of length 4. The stride of the sliding window for each dimension of
-// the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
-//	rates: 1-D of length 4. The input stride for atrous morphological dilation.
-// Must be: `[1, rate_height, rate_width, 1]`.
-//	padding: The type of padding algorithm to use.
+// This is the same as the number of ReaderRead executions that have
+// succeeded.
 //
-// Returns 3-D with shape `[filter_height, filter_width, depth]`.
-func Dilation2DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, rates []int64, padding string) (filter_backprop tf.Output) {
+// Arguments:
+//	reader_handle: Handle to a Reader.
+func ReaderNumRecordsProducedV2(scope *Scope, reader_handle tf.Output) (records_produced tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "Dilation2DBackpropFilter",
+		Type: "ReaderNumRecordsProducedV2",
 		Input: []tf.Input{
-			input, filter, out_backprop,
+			reader_handle,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns a list list which has the passed-in `Tensor` as last element and the other elements of the given list in `input_handle`.
+// Returns the set of files matching one or more glob patterns.
 //
-// tensor: The tensor to put on the list.
-// input_handle: The old list.
-// output_handle: A list with the elements of the old list followed by tensor.
-// element_dtype: the type of elements in the list.
-// element_shape: a shape compatible with that of elements in the list.
-func TensorListPushBack(scope *Scope, input_handle tf.Output, tensor tf.Output) (output_handle tf.Output) {
+// Note that this routine only supports wildcard characters in the
+// basename portion of the pattern, not in the directory portion.
+// Note also that the order of filenames returned can be non-deterministic.
+//
+// Arguments:
+//	pattern: Shell wildcard pattern(s). Scalar or vector of type string.
+//
+// Returns A vector of matching filenames.
+func MatchingFiles(scope *Scope, pattern tf.Output) (filenames tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorListPushBack",
+		Type: "MatchingFiles",
 		Input: []tf.Input{
-			input_handle, tensor,
+			pattern,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// AddSparseToTensorsMapAttr is an optional argument to AddSparseToTensorsMap.
-type AddSparseToTensorsMapAttr func(optionalAttr)
-
-// AddSparseToTensorsMapContainer sets the optional container attribute to value.
-//
-// value: The container name for the `SparseTensorsMap` created by this op.
-// If not specified, defaults to ""
-func AddSparseToTensorsMapContainer(value string) AddSparseToTensorsMapAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
+// HistogramFixedWidthAttr is an optional argument to HistogramFixedWidth.
+type HistogramFixedWidthAttr func(optionalAttr)
 
-// AddSparseToTensorsMapSharedName sets the optional shared_name attribute to value.
-//
-// value: The shared name for the `SparseTensorsMap` created by this op.
-// If blank, the new Operation's unique name is used.
-// If not specified, defaults to ""
-func AddSparseToTensorsMapSharedName(value string) AddSparseToTensorsMapAttr {
+// HistogramFixedWidthDtype sets the optional dtype attribute to value.
+// If not specified, defaults to DT_INT32
+func HistogramFixedWidthDtype(value tf.DataType) HistogramFixedWidthAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["dtype"] = value
 	}
 }
 
-// Add a `SparseTensor` to a `SparseTensorsMap` return its handle.
+// Return histogram of values.
 //
-// A `SparseTensor` is represented by three tensors: `sparse_indices`,
-// `sparse_values`, and `sparse_shape`.
+// Given the tensor `values`, this operation returns a rank 1 histogram counting
+// the number of entries in `values` that fall into every bin.  The bins are
+// equal width and determined by the arguments `value_range` and `nbins`.
 //
-// This operator takes the given `SparseTensor` and adds it to a container
-// object (a `SparseTensorsMap`).  A unique key within this container is generated
-// in the form of an `int64`, and this is the value that is returned.
+// ```python
+// # Bins will be:  (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
+// nbins = 5
+// value_range = [0.0, 5.0]
+// new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]
 //
-// The `SparseTensor` can then be read out as part of a minibatch by passing
-// the key as a vector element to `TakeManySparseFromTensorsMap`.  To ensure
-// the correct `SparseTensorsMap` is accessed, ensure that the same
-// `container` and `shared_name` are passed to that Op.  If no `shared_name`
-// is provided here, instead use the *name* of the Operation created by calling
-// `AddSparseToTensorsMap` as the `shared_name` passed to
-// `TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
+// with tf.get_default_session() as sess:
+//   hist = tf.histogram_fixed_width(new_values, value_range, nbins=5)
+//   variables.global_variables_initializer().run()
+//   sess.run(hist) => [2, 1, 1, 0, 2]
+// ```
 //
 // Arguments:
-//	sparse_indices: 2-D.  The `indices` of the `SparseTensor`.
-//	sparse_values: 1-D.  The `values` of the `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the `SparseTensor`.
+//	values: Numeric `Tensor`.
+//	value_range: Shape [2] `Tensor` of same `dtype` as `values`.
+// values <= value_range[0] will be mapped to hist[0],
+// values >= value_range[1] will be mapped to hist[-1].
+//	nbins: Scalar `int32 Tensor`.  Number of histogram bins.
 //
-// Returns 0-D.  The handle of the `SparseTensor` now stored in the
-// `SparseTensorsMap`.
-func AddSparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddSparseToTensorsMapAttr) (sparse_handle tf.Output) {
+// Returns A 1-D `Tensor` holding histogram of values.
+func HistogramFixedWidth(scope *Scope, values tf.Output, value_range tf.Output, nbins tf.Output, optional ...HistogramFixedWidthAttr) (out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -20927,9 +20660,9 @@ func AddSparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AddSparseToTensorsMap",
+		Type: "HistogramFixedWidth",
 		Input: []tf.Input{
-			sparse_indices, sparse_values, sparse_shape,
+			values, value_range, nbins,
 		},
 		Attrs: attrs,
 	}
@@ -20937,335 +20670,267 @@ func AddSparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values
 	return op.Output(0)
 }
 
-// Deserialize and concatenate `SparseTensors` from a serialized minibatch.
-//
-// The input `serialized_sparse` must be a string matrix of shape `[N x 3]` where
-// `N` is the minibatch size and the rows correspond to packed outputs of
-// `SerializeSparse`.  The ranks of the original `SparseTensor` objects
-// must all match.  When the final `SparseTensor` is created, it has rank one
-// higher than the ranks of the incoming `SparseTensor` objects
-// (they have been concatenated along a new row dimension).
-//
-// The output `SparseTensor` object's shape values for all dimensions but the
-// first are the max across the input `SparseTensor` objects' shape values
-// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
-// size.
-//
-// The input `SparseTensor` objects' indices are assumed ordered in
-// standard lexicographic order.  If this is not the case, after this
-// step run `SparseReorder` to restore index ordering.
-//
-// For example, if the serialized input is a `[2 x 3]` matrix representing two
-// original `SparseTensor` objects:
-//
-//     index = [ 0]
-//             [10]
-//             [20]
-//     values = [1, 2, 3]
-//     shape = [50]
+// Conv3DAttr is an optional argument to Conv3D.
+type Conv3DAttr func(optionalAttr)
+
+// Conv3DDataFormat sets the optional data_format attribute to value.
 //
-// and
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func Conv3DDataFormat(value string) Conv3DAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Conv3DDilations sets the optional dilations attribute to value.
 //
-//     index = [ 2]
-//             [10]
-//     values = [4, 5]
-//     shape = [30]
+// value: 1-D tensor of length 5.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
+func Conv3DDilations(value []int64) Conv3DAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes a 3-D convolution given 5-D `input` and `filter` tensors.
 //
-// then the final deserialized `SparseTensor` will be:
+// In signal processing, cross-correlation is a measure of similarity of
+// two waveforms as a function of a time-lag applied to one of them. This
+// is also known as a sliding dot product or sliding inner-product.
 //
-//     index = [0  0]
-//             [0 10]
-//             [0 20]
-//             [1  2]
-//             [1 10]
-//     values = [1, 2, 3, 4, 5]
-//     shape = [2 50]
+// Our Conv3D implements a form of cross-correlation.
 //
 // Arguments:
-//	serialized_sparse: 2-D, The `N` serialized `SparseTensor` objects.
-// Must have 3 columns.
-//	dtype: The `dtype` of the serialized `SparseTensor` objects.
-func DeserializeManySparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
+//	input: Shape `[batch, in_depth, in_height, in_width, in_channels]`.
+//	filter: Shape `[filter_depth, filter_height, filter_width, in_channels,
+// out_channels]`. `in_channels` must match between `input` and `filter`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...Conv3DAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	opspec := tf.OpSpec{
-		Type: "DeserializeManySparse",
-		Input: []tf.Input{
-			serialized_sparse,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Elementwise computes the bitwise AND of `x` and `y`.
-//
-// The result will have those bits set, that are set in both `x` and `y`. The
-// computation is performed on the underlying representations of `x` and `y`.
-func BitwiseAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "BitwiseAnd",
+		Type: "Conv3D",
 		Input: []tf.Input{
-			x, y,
+			input, filter,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SubstrAttr is an optional argument to Substr.
-type SubstrAttr func(optionalAttr)
+// QuantizeV2Attr is an optional argument to QuantizeV2.
+type QuantizeV2Attr func(optionalAttr)
 
-// SubstrUnit sets the optional unit attribute to value.
-//
-// value: The unit that is used to create the substring.  One of: `"BYTE"` (for
-// defining position and length by bytes) or `"UTF8_CHAR"` (for the UTF-8
-// encoded Unicode code points).  The default is `"BYTE"`. Results are undefined if
-// `unit=UTF8_CHAR` and the `input` strings do not contain structurally valid
-// UTF-8.
-// If not specified, defaults to "BYTE"
-func SubstrUnit(value string) SubstrAttr {
+// QuantizeV2Mode sets the optional mode attribute to value.
+// If not specified, defaults to "MIN_COMBINED"
+func QuantizeV2Mode(value string) QuantizeV2Attr {
+	return func(m optionalAttr) {
+		m["mode"] = value
+	}
+}
+
+// QuantizeV2RoundMode sets the optional round_mode attribute to value.
+// If not specified, defaults to "HALF_AWAY_FROM_ZERO"
+func QuantizeV2RoundMode(value string) QuantizeV2Attr {
 	return func(m optionalAttr) {
-		m["unit"] = value
+		m["round_mode"] = value
 	}
 }
 
-// Return substrings from `Tensor` of strings.
+// Quantize the 'input' tensor of type float to 'output' tensor of type 'T'.
 //
-// For each string in the input `Tensor`, creates a substring starting at index
-// `pos` with a total length of `len`.
+// [min_range, max_range] are scalar floats that specify the range for
+// the 'input' data. The 'mode' attribute controls exactly which calculations are
+// used to convert the float values to their quantized equivalents.  The
+// 'round_mode' attribute controls which rounding tie-breaking algorithm is used
+// when rounding float values to their quantized equivalents.
 //
-// If `len` defines a substring that would extend beyond the length of the input
-// string, then as many characters as possible are used.
+// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
 //
-// A negative `pos` indicates distance within the string backwards from the end.
+// ```
+// out[i] = (in[i] - min_range) * range(T) / (max_range - min_range)
+// if T == qint8: out[i] -= (range(T) + 1) / 2.0
+// ```
 //
-// If `pos` specifies an index which is out of range for any of the input strings,
-// then an `InvalidArgumentError` is thrown.
+// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
 //
-// `pos` and `len` must have the same shape, otherwise a `ValueError` is thrown on
-// Op creation.
+// *MIN_COMBINED Mode Example*
 //
-// *NOTE*: `Substr` supports broadcasting up to two dimensions. More about
-// broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+// Assume the input is type float and has a possible range of [0.0, 6.0] and the
+// output type is quint8 ([0, 255]). The min_range and max_range values should be
+// specified as 0.0 and 6.0. Quantizing from float to quint8 will multiply each
+// value of the input by 255/6 and cast to quint8.
 //
-// ---
+// If the output type was qint8 ([-128, 127]), the operation will additionally
+// subtract each value by 128 prior to casting, so that the range of values aligns
+// with the range of qint8.
 //
-// Examples
+// If the mode is 'MIN_FIRST', then this approach is used:
 //
-// Using scalar `pos` and `len`:
+// ```
+// num_discrete_values = 1 << (# of bits in T)
+// range_adjust = num_discrete_values / (num_discrete_values - 1)
+// range = (range_max - range_min) * range_adjust
+// range_scale = num_discrete_values / range
+// quantized = round(input * range_scale) - round(range_min * range_scale) +
+//   numeric_limits<T>::min()
+// quantized = max(quantized, numeric_limits<T>::min())
+// quantized = min(quantized, numeric_limits<T>::max())
+// ```
 //
-// ```python
-// input = [b'Hello', b'World']
-// position = 1
-// length = 3
+// The biggest difference between this and MIN_COMBINED is that the minimum range
+// is rounded first, before it's subtracted from the rounded value. With
+// MIN_COMBINED, a small bias is introduced where repeated iterations of quantizing
+// and dequantizing will introduce a larger and larger error.
 //
-// output = [b'ell', b'orl']
-// ```
+// *SCALED mode Example*
 //
-// Using `pos` and `len` with same shape as `input`:
+// `SCALED` mode matches the quantization approach used in
+// `QuantizeAndDequantize{V2|V3}`.
 //
-// ```python
-// input = [[b'ten', b'eleven', b'twelve'],
-//          [b'thirteen', b'fourteen', b'fifteen'],
-//          [b'sixteen', b'seventeen', b'eighteen']]
-// position = [[1, 2, 3],
-//             [1, 2, 3],
-//             [1, 2, 3]]
-// length =   [[2, 3, 4],
-//             [4, 3, 2],
-//             [5, 5, 5]]
+// If the mode is `SCALED`, we do not use the full range of the output type,
+// choosing to elide the lowest possible value for symmetry (e.g., output range is
+// -127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to
+// 0.
 //
-// output = [[b'en', b'eve', b'lve'],
-//           [b'hirt', b'urt', b'te'],
-//           [b'ixtee', b'vente', b'hteen']]
+// We first find the range of values in our tensor. The
+// range we use is always centered on 0, so we find m such that
+//
+// ```c++
+//   m = max(abs(input_min), abs(input_max))
 // ```
 //
-// Broadcasting `pos` and `len` onto `input`:
+// Our input tensor range is then `[-m, m]`.
+//
+// Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
+// If T is signed, this is
 //
 // ```
-// input = [[b'ten', b'eleven', b'twelve'],
-//          [b'thirteen', b'fourteen', b'fifteen'],
-//          [b'sixteen', b'seventeen', b'eighteen'],
-//          [b'nineteen', b'twenty', b'twentyone']]
-// position = [1, 2, 3]
-// length =   [1, 2, 3]
+//   num_bits = sizeof(T) * 8
+//   [min_fixed, max_fixed] =
+//       [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]
+// ```
+//
+// Otherwise, if T is unsigned, the fixed-point range is
 //
-// output = [[b'e', b'ev', b'lve'],
-//           [b'h', b'ur', b'tee'],
-//           [b'i', b've', b'hte'],
-//           [b'i', b'en', b'nty']]
+// ```
+//   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
 // ```
 //
-// Broadcasting `input` onto `pos` and `len`:
+// From this we compute our scaling factor, s:
 //
+// ```c++
+//   s = (max_fixed - min_fixed) / (2 * m)
 // ```
-// input = b'thirteen'
-// position = [1, 5, 7]
-// length =   [3, 2, 1]
 //
-// output = [b'hir', b'ee', b'n']
+// Now we can quantize the elements of our tensor:
+//
+// ```c++
+// result = round(input * s)
 // ```
 //
+// One thing to watch out for is that the operator may choose to adjust the
+// requested minimum and maximum values slightly during the quantization process,
+// so you should always use the output ports as the range for further calculations.
+// For example, if the requested minimum and maximum values are close to equal,
+// they will be separated by a small epsilon value to prevent ill-formed quantized
+// buffers from being created. Otherwise, you can end up with buffers where all the
+// quantized values map to the same float value, which causes problems for
+// operations that have to perform further calculations on them.
+//
 // Arguments:
-//	input: Tensor of strings
-//	pos: Scalar defining the position of first character in each substring
-//	len: Scalar defining the number of characters to include in each substring
 //
-// Returns Tensor of substrings
-func Substr(scope *Scope, input tf.Output, pos tf.Output, len tf.Output, optional ...SubstrAttr) (output tf.Output) {
+//	min_range: The minimum scalar value possibly produced for the input.
+//	max_range: The maximum scalar value possibly produced for the input.
+//
+//
+// Returns The quantized data produced from the float input.The actual minimum scalar value used for the output.The actual maximum scalar value used for the output.
+func QuantizeV2(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, T tf.DataType, optional ...QuantizeV2Attr) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"T": T}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Substr",
+		Type: "QuantizeV2",
 		Input: []tf.Input{
-			input, pos, len,
+			input, min_range, max_range,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Creates a Dataset that returns pseudorandom numbers.
-//
-// Arguments:
-//	seed: A scalar seed for the random number generator. If either seed or
-// seed2 is set to be non-zero, the random number generator is seeded
-// by the given seed.  Otherwise, a random seed is used.
-//	seed2: A second scalar seed to avoid seed collision.
-//
+// Returns the truth value of (x < y) element-wise.
 //
-func RandomDataset(scope *Scope, seed tf.Output, seed2 tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// *NOTE*: `Less` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Less(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "RandomDataset",
+		Type: "Less",
 		Input: []tf.Input{
-			seed, seed2,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Inverse real-valued fast Fourier transform.
-//
-// Computes the inverse 1-dimensional discrete Fourier transform of a real-valued
-// signal over the inner-most dimension of `input`.
-//
-// The inner-most dimension of `input` is assumed to be the result of `RFFT`: the
-// `fft_length / 2 + 1` unique components of the DFT of a real-valued signal. If
-// `fft_length` is not provided, it is computed from the size of the inner-most
-// dimension of `input` (`fft_length = 2 * (inner - 1)`). If the FFT length used to
-// compute `input` is odd, it should be provided since it cannot be inferred
-// properly.
-//
-// Along the axis `IRFFT` is computed on, if `fft_length / 2 + 1` is smaller
-// than the corresponding dimension of `input`, the dimension is cropped. If it is
-// larger, the dimension is padded with zeros.
-//
-// Arguments:
-//	input: A complex64 tensor.
-//	fft_length: An int32 tensor of shape [1]. The FFT length.
-//
-// Returns A float32 tensor of the same rank as `input`. The inner-most
-//   dimension of `input` is replaced with the `fft_length` samples of its inverse
-//   1D Fourier transform.
-//
-// @compatibility(numpy)
-// Equivalent to np.fft.irfft
-// @end_compatibility
-func IRFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "IRFFT",
-		Input: []tf.Input{
-			input, fft_length,
-		},
+// QuantizedReluXAttr is an optional argument to QuantizedReluX.
+type QuantizedReluXAttr func(optionalAttr)
+
+// QuantizedReluXOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QUINT8
+func QuantizedReluXOutType(value tf.DataType) QuantizedReluXAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Concatenates a list of `SparseTensor` along the specified dimension.
-//
-// Concatenation is with respect to the dense versions of these sparse tensors.
-// It is assumed that each input is a `SparseTensor` whose elements are ordered
-// along increasing dimension number.
-//
-// All inputs' shapes must match, except for the concat dimension.  The
-// `indices`, `values`, and `shapes` lists must have the same length.
-//
-// The output shape is identical to the inputs', except along the concat
-// dimension, where it is the sum of the inputs' sizes along that dimension.
-//
-// The output elements will be resorted to preserve the sort order along
-// increasing dimension number.
-//
-// This op runs in `O(M log M)` time, where `M` is the total number of non-empty
-// values across all inputs. This is due to the need for an internal sort in
-// order to concatenate efficiently across an arbitrary dimension.
-//
-// For example, if `concat_dim = 1` and the inputs are
-//
-//     sp_inputs[0]: shape = [2, 3]
-//     [0, 2]: "a"
-//     [1, 0]: "b"
-//     [1, 1]: "c"
-//
-//     sp_inputs[1]: shape = [2, 4]
-//     [0, 1]: "d"
-//     [0, 2]: "e"
-//
-// then the output will be
-//
-//     shape = [2, 7]
-//     [0, 2]: "a"
-//     [0, 4]: "d"
-//     [0, 5]: "e"
-//     [1, 0]: "b"
-//     [1, 1]: "c"
+// Computes Quantized Rectified Linear X: `min(max(features, 0), max_value)`
 //
-// Graphically this is equivalent to doing
+// Arguments:
 //
-//     [    a] concat [  d e  ] = [    a   d e  ]
-//     [b c  ]        [       ]   [b c          ]
 //
-// Arguments:
-//	indices: 2-D.  Indices of each input `SparseTensor`.
-//	values: 1-D.  Non-empty values of each `SparseTensor`.
-//	shapes: 1-D.  Shapes of each `SparseTensor`.
-//	concat_dim: Dimension to concatenate along. Must be in range [-rank, rank),
-// where rank is the number of dimensions in each input `SparseTensor`.
+//	min_features: The float value that the lowest quantized value represents.
+//	max_features: The float value that the highest quantized value represents.
 //
-// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
-func SparseConcat(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, concat_dim int64) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
+func QuantizedReluX(scope *Scope, features tf.Output, max_value tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluXAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"concat_dim": concat_dim}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseConcat",
+		Type: "QuantizedReluX",
 		Input: []tf.Input{
-			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes),
+			features, max_value, min_features, max_features,
 		},
 		Attrs: attrs,
 	}
@@ -21273,283 +20938,271 @@ func SparseConcat(scope *Scope, indices []tf.Output, values []tf.Output, shapes
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Generates sparse cross from a list of sparse and dense tensors.
-//
-// The op takes two lists, one of 2D `SparseTensor` and one of 2D `Tensor`, each
-// representing features of one feature column. It outputs a 2D `SparseTensor` with
-// the batchwise crosses of these features.
-//
-// For example, if the inputs are
-//
-//     inputs[0]: SparseTensor with shape = [2, 2]
-//     [0, 0]: "a"
-//     [1, 0]: "b"
-//     [1, 1]: "c"
-//
-//     inputs[1]: SparseTensor with shape = [2, 1]
-//     [0, 0]: "d"
-//     [1, 0]: "e"
-//
-//     inputs[2]: Tensor [["f"], ["g"]]
-//
-// then the output will be
-//
-//     shape = [2, 2]
-//     [0, 0]: "a_X_d_X_f"
-//     [1, 0]: "b_X_e_X_g"
-//     [1, 1]: "c_X_e_X_g"
-//
-// if hashed_output=true then the output will be
-//
-//     shape = [2, 2]
-//     [0, 0]: FingerprintCat64(
-//                 Fingerprint64("f"), FingerprintCat64(
-//                     Fingerprint64("d"), Fingerprint64("a")))
-//     [1, 0]: FingerprintCat64(
-//                 Fingerprint64("g"), FingerprintCat64(
-//                     Fingerprint64("e"), Fingerprint64("b")))
-//     [1, 1]: FingerprintCat64(
-//                 Fingerprint64("g"), FingerprintCat64(
-//                     Fingerprint64("e"), Fingerprint64("c")))
+// Creates a dataset that batches `batch_size` elements from `input_dataset`.
 //
 // Arguments:
-//	indices: 2-D.  Indices of each input `SparseTensor`.
-//	values: 1-D.   values of each `SparseTensor`.
-//	shapes: 1-D.   Shapes of each `SparseTensor`.
-//	dense_inputs: 2-D.    Columns represented by dense `Tensor`.
-//	hashed_output: If true, returns the hash of the cross instead of the string.
-// This will allow us avoiding string manipulations.
-//	num_buckets: It is used if hashed_output is true.
-// output = hashed_value%num_buckets if num_buckets > 0 else hashed_value.
-//	hash_key: Specify the hash_key that will be used by the `FingerprintCat64`
-// function to combine the crosses fingerprints.
 //
+//	batch_size: A scalar representing the number of elements to accumulate in a batch.
+//	drop_remainder: A scalar representing whether the last batch should be dropped in case its size
+// is smaller than desired.
 //
 //
-// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated or hashed
-// `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
-func SparseCross(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, dense_inputs []tf.Output, hashed_output bool, num_buckets int64, hash_key int64, out_type tf.DataType, internal_type tf.DataType) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+func BatchDatasetV2(scope *Scope, input_dataset tf.Output, batch_size tf.Output, drop_remainder tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"hashed_output": hashed_output, "num_buckets": num_buckets, "hash_key": hash_key, "out_type": out_type, "internal_type": internal_type}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "SparseCross",
+		Type: "BatchDatasetV2",
 		Input: []tf.Input{
-			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes), tf.OutputList(dense_inputs),
+			input_dataset, batch_size, drop_remainder,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// ResourceApplyProximalAdagradAttr is an optional argument to ResourceApplyProximalAdagrad.
-type ResourceApplyProximalAdagradAttr func(optionalAttr)
+// QuantizedConv2DAttr is an optional argument to QuantizedConv2D.
+type QuantizedConv2DAttr func(optionalAttr)
 
-// ResourceApplyProximalAdagradUseLocking sets the optional use_locking attribute to value.
+// QuantizedConv2DOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QINT32
+func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// QuantizedConv2DDilations sets the optional dilations attribute to value.
 //
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceApplyProximalAdagradUseLocking(value bool) ResourceApplyProximalAdagradAttr {
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["dilations"] = value
 	}
 }
 
-// Update '*var' and '*accum' according to FOBOS with Adagrad learning rate.
+// Computes a 2D convolution given quantized 4D input and filter tensors.
 //
-// accum += grad * grad
-// prox_v = var - lr * grad * (1 / sqrt(accum))
-// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
+// The inputs are quantized tensors where the lowest value represents the real
+// number of the associated minimum, and the highest represents the maximum.
+// This means that you can only interpret the quantized output in the same way, by
+// taking the returned minimum and maximum values into account.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	grad: The gradient.
 //
-// Returns the created operation.
-func ResourceApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, optional ...ResourceApplyProximalAdagradAttr) (o *tf.Operation) {
+//	filter: filter's input_depth dimension must match input's depth dimensions.
+//	min_input: The float value that the lowest quantized input value represents.
+//	max_input: The float value that the highest quantized input value represents.
+//	min_filter: The float value that the lowest quantized filter value represents.
+//	max_filter: The float value that the highest quantized filter value represents.
+//	strides: The stride of the sliding window for each dimension of the input
+// tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedConv2D(scope *Scope, input tf.Output, filter tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, strides []int64, padding string, optional ...QuantizedConv2DAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyProximalAdagrad",
+		Type: "QuantizedConv2D",
 		Input: []tf.Input{
-			var_, accum, lr, l1, l2, grad,
+			input, filter, min_input, max_input, min_filter, max_filter,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// MutableHashTableOfTensorsV2Attr is an optional argument to MutableHashTableOfTensorsV2.
-type MutableHashTableOfTensorsV2Attr func(optionalAttr)
+// StatelessMultinomialAttr is an optional argument to StatelessMultinomial.
+type StatelessMultinomialAttr func(optionalAttr)
 
-// MutableHashTableOfTensorsV2Container sets the optional container attribute to value.
-//
-// value: If non-empty, this table is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func MutableHashTableOfTensorsV2Container(value string) MutableHashTableOfTensorsV2Attr {
+// StatelessMultinomialOutputDtype sets the optional output_dtype attribute to value.
+// If not specified, defaults to DT_INT64
+func StatelessMultinomialOutputDtype(value tf.DataType) StatelessMultinomialAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["output_dtype"] = value
 	}
 }
 
-// MutableHashTableOfTensorsV2SharedName sets the optional shared_name attribute to value.
+// Draws samples from a multinomial distribution.
 //
-// value: If non-empty, this table is shared under the given name across
-// multiple sessions.
-// If not specified, defaults to ""
-func MutableHashTableOfTensorsV2SharedName(value string) MutableHashTableOfTensorsV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
+// Arguments:
+//	logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
+// represents the unnormalized log probabilities for all classes.
+//	num_samples: 0-D.  Number of independent samples to draw for each row slice.
+//	seed: 2 seeds (shape [2]).
+//
+// Returns 2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
+// contains the drawn class labels with range `[0, num_classes)`.
+func StatelessMultinomial(scope *Scope, logits tf.Output, num_samples tf.Output, seed tf.Output, optional ...StatelessMultinomialAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// MutableHashTableOfTensorsV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
-// If not specified, defaults to false
-func MutableHashTableOfTensorsV2UseNodeNameSharing(value bool) MutableHashTableOfTensorsV2Attr {
-	return func(m optionalAttr) {
-		m["use_node_name_sharing"] = value
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StatelessMultinomial",
+		Input: []tf.Input{
+			logits, num_samples, seed,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MutableHashTableOfTensorsV2ValueShape sets the optional value_shape attribute to value.
-// If not specified, defaults to <>
-func MutableHashTableOfTensorsV2ValueShape(value tf.Shape) MutableHashTableOfTensorsV2Attr {
+// ResourceGatherAttr is an optional argument to ResourceGather.
+type ResourceGatherAttr func(optionalAttr)
+
+// ResourceGatherValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func ResourceGatherValidateIndices(value bool) ResourceGatherAttr {
 	return func(m optionalAttr) {
-		m["value_shape"] = value
+		m["validate_indices"] = value
 	}
 }
 
-// Creates an empty hash table.
+// Gather slices from the variable pointed to by `resource` according to `indices`.
 //
-// This op creates a mutable hash table, specifying the type of its keys and
-// values. Each value must be a vector. Data can be inserted into the table using
-// the insert operations. It does not support the initialization operation.
+// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
+// Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
 //
-// Arguments:
-//	key_dtype: Type of the table keys.
-//	value_dtype: Type of the table values.
+// ```python
+//     # Scalar indices
+//     output[:, ..., :] = params[indices, :, ... :]
 //
-// Returns Handle to a table.
-func MutableHashTableOfTensorsV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...MutableHashTableOfTensorsV2Attr) (table_handle tf.Output) {
+//     # Vector indices
+//     output[i, :, ..., :] = params[indices[i], :, ... :]
+//
+//     # Higher rank indices
+//     output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]
+// ```
+func ResourceGather(scope *Scope, resource tf.Output, indices tf.Output, dtype tf.DataType, optional ...ResourceGatherAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MutableHashTableOfTensorsV2",
-
+		Type: "ResourceGather",
+		Input: []tf.Input{
+			resource, indices,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// The gradient operator for the SparseSlice op.
+// Delete the TensorArray from its resource container.
 //
-// This op takes in the upstream gradient w.r.t. non-empty values of
-// the sliced `SparseTensor`, and outputs the gradients w.r.t.
-// the non-empty values of input `SparseTensor`.
+// This enables the user to close and release the resource in the middle
+// of a step/run.
 //
-// Arguments:
-//	backprop_val_grad: 1-D. The gradient with respect to
-// the non-empty values of the sliced `SparseTensor`.
-//	input_indices: 2-D.  The `indices` of the input `SparseTensor`.
-//	input_start: 1-D. tensor represents the start of the slice.
-//	output_indices: 2-D.  The `indices` of the sliced `SparseTensor`.
+// Arguments:
+//	handle: The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
 //
-// Returns 1-D. The gradient with respect to the non-empty values of input `SparseTensor`.
-func SparseSliceGrad(scope *Scope, backprop_val_grad tf.Output, input_indices tf.Output, input_start tf.Output, output_indices tf.Output) (val_grad tf.Output) {
+// Returns the created operation.
+func TensorArrayCloseV3(scope *Scope, handle tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSliceGrad",
+		Type: "TensorArrayCloseV3",
 		Input: []tf.Input{
-			backprop_val_grad, input_indices, input_start, output_indices,
+			handle,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Computes the gradient of the sigmoid of `x` wrt its input.
+// Saves the input tensors to disk.
 //
-// Specifically, `grad = dy * y * (1 - y)`, where `y = sigmoid(x)`, and
-// `dy` is the corresponding input gradient.
-func SigmoidGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+// The size of `tensor_names` must match the number of tensors in `data`. `data[i]`
+// is written to `filename` with name `tensor_names[i]`.
+//
+// See also `SaveSlices`.
+//
+// Arguments:
+//	filename: Must have a single element. The name of the file to which we write
+// the tensor.
+//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
+//	data: `N` tensors to save.
+//
+// Returns the created operation.
+func Save(scope *Scope, filename tf.Output, tensor_names tf.Output, data []tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SigmoidGrad",
+		Type: "Save",
 		Input: []tf.Input{
-			y, dy,
+			filename, tensor_names, tf.OutputList(data),
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Convert one or more images from HSV to RGB.
-//
-// Outputs a tensor of the same shape as the `images` tensor, containing the RGB
-// value of the pixels. The output is only well defined if the value in `images`
-// are in `[0,1]`.
-//
-// See `rgb_to_hsv` for a description of the HSV encoding.
+// Returns element-wise remainder of division. When `x < 0` xor `y < 0` is
 //
-// Arguments:
-//	images: 1-D or higher rank. HSV data to convert. Last dimension must be size 3.
+// true, this follows Python semantics in that the result here is consistent
+// with a flooring divide. E.g. `floor(x / y) * y + mod(x, y) = x`.
 //
-// Returns `images` converted to RGB.
-func HSVToRGB(scope *Scope, images tf.Output) (output tf.Output) {
+// *NOTE*: `FloorMod` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func FloorMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "HSVToRGB",
+		Type: "FloorMod",
 		Input: []tf.Input{
-			images,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a dataset by applying optimizations to `input_dataset`.
-//
-// Creates a dataset by applying optimizations to `input_dataset`.
+// Computes the gradient of morphological 2-D dilation with respect to the filter.
 //
 // Arguments:
-//	input_dataset: A variant tensor representing the input dataset.
-//	optimizations: A `tf.string` vector `tf.Tensor` identifying optimizations to use.
-//
+//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
+//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
+//	out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`.
+//	strides: 1-D of length 4. The stride of the sliding window for each dimension of
+// the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
+//	rates: 1-D of length 4. The input stride for atrous morphological dilation.
+// Must be: `[1, rate_height, rate_width, 1]`.
+//	padding: The type of padding algorithm to use.
 //
-func OptimizeDataset(scope *Scope, input_dataset tf.Output, optimizations tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns 3-D with shape `[filter_height, filter_width, depth]`.
+func Dilation2DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, rates []int64, padding string) (filter_backprop tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "OptimizeDataset",
+		Type: "Dilation2DBackpropFilter",
 		Input: []tf.Input{
-			input_dataset, optimizations,
+			input, filter, out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -21557,68 +21210,102 @@ func OptimizeDataset(scope *Scope, input_dataset tf.Output, optimizations tf.Out
 	return op.Output(0)
 }
 
-// Returns the element-wise min of two SparseTensors.
-//
-// Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
-//
-// Arguments:
-//	a_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, in the canonical lexicographic ordering.
-//	a_values: 1-D.  `N` non-empty values corresponding to `a_indices`.
-//	a_shape: 1-D.  Shape of the input SparseTensor.
-//	b_indices: counterpart to `a_indices` for the other operand.
-//	b_values: counterpart to `a_values` for the other operand; must be of the same dtype.
-//	b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal.
+// Returns a list list which has the passed-in `Tensor` as last element and the other elements of the given list in `input_handle`.
 //
-// Returns 2-D.  The indices of the output SparseTensor.1-D.  The values of the output SparseTensor.
-func SparseSparseMinimum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
+// tensor: The tensor to put on the list.
+// input_handle: The old list.
+// output_handle: A list with the elements of the old list followed by tensor.
+// element_dtype: the type of elements in the list.
+// element_shape: a shape compatible with that of elements in the list.
+func TensorListPushBack(scope *Scope, input_handle tf.Output, tensor tf.Output) (output_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSparseMinimum",
+		Type: "TensorListPushBack",
 		Input: []tf.Input{
-			a_indices, a_values, a_shape, b_indices, b_values, b_shape,
+			input_handle, tensor,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// TakeManySparseFromTensorsMapAttr is an optional argument to TakeManySparseFromTensorsMap.
-type TakeManySparseFromTensorsMapAttr func(optionalAttr)
+// AddSparseToTensorsMapAttr is an optional argument to AddSparseToTensorsMap.
+type AddSparseToTensorsMapAttr func(optionalAttr)
 
-// TakeManySparseFromTensorsMapContainer sets the optional container attribute to value.
+// AddSparseToTensorsMapContainer sets the optional container attribute to value.
 //
-// value: The container name for the `SparseTensorsMap` read by this op.
+// value: The container name for the `SparseTensorsMap` created by this op.
 // If not specified, defaults to ""
-func TakeManySparseFromTensorsMapContainer(value string) TakeManySparseFromTensorsMapAttr {
+func AddSparseToTensorsMapContainer(value string) AddSparseToTensorsMapAttr {
 	return func(m optionalAttr) {
 		m["container"] = value
 	}
 }
 
-// TakeManySparseFromTensorsMapSharedName sets the optional shared_name attribute to value.
+// AddSparseToTensorsMapSharedName sets the optional shared_name attribute to value.
 //
-// value: The shared name for the `SparseTensorsMap` read by this op.
-// It should not be blank; rather the `shared_name` or unique Operation name
-// of the Op that created the original `SparseTensorsMap` should be used.
+// value: The shared name for the `SparseTensorsMap` created by this op.
+// If blank, the new Operation's unique name is used.
 // If not specified, defaults to ""
-func TakeManySparseFromTensorsMapSharedName(value string) TakeManySparseFromTensorsMapAttr {
+func AddSparseToTensorsMapSharedName(value string) AddSparseToTensorsMapAttr {
 	return func(m optionalAttr) {
 		m["shared_name"] = value
 	}
 }
 
-// Read `SparseTensors` from a `SparseTensorsMap` and concatenate them.
+// Add a `SparseTensor` to a `SparseTensorsMap` return its handle.
 //
-// The input `sparse_handles` must be an `int64` matrix of shape `[N, 1]` where
-// `N` is the minibatch size and the rows correspond to the output handles of
-// `AddSparseToTensorsMap` or `AddManySparseToTensorsMap`.  The ranks of the
-// original `SparseTensor` objects that went into the given input ops must all
-// match.  When the final `SparseTensor` is created, it has rank one
+// A `SparseTensor` is represented by three tensors: `sparse_indices`,
+// `sparse_values`, and `sparse_shape`.
+//
+// This operator takes the given `SparseTensor` and adds it to a container
+// object (a `SparseTensorsMap`).  A unique key within this container is generated
+// in the form of an `int64`, and this is the value that is returned.
+//
+// The `SparseTensor` can then be read out as part of a minibatch by passing
+// the key as a vector element to `TakeManySparseFromTensorsMap`.  To ensure
+// the correct `SparseTensorsMap` is accessed, ensure that the same
+// `container` and `shared_name` are passed to that Op.  If no `shared_name`
+// is provided here, instead use the *name* of the Operation created by calling
+// `AddSparseToTensorsMap` as the `shared_name` passed to
+// `TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
+//
+// Arguments:
+//	sparse_indices: 2-D.  The `indices` of the `SparseTensor`.
+//	sparse_values: 1-D.  The `values` of the `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the `SparseTensor`.
+//
+// Returns 0-D.  The handle of the `SparseTensor` now stored in the
+// `SparseTensorsMap`.
+func AddSparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddSparseToTensorsMapAttr) (sparse_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "AddSparseToTensorsMap",
+		Input: []tf.Input{
+			sparse_indices, sparse_values, sparse_shape,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Deserialize and concatenate `SparseTensors` from a serialized minibatch.
+//
+// The input `serialized_sparse` must be a string matrix of shape `[N x 3]` where
+// `N` is the minibatch size and the rows correspond to packed outputs of
+// `SerializeSparse`.  The ranks of the original `SparseTensor` objects
+// must all match.  When the final `SparseTensor` is created, it has rank one
 // higher than the ranks of the incoming `SparseTensor` objects
-// (they have been concatenated along a new row dimension on the left).
+// (they have been concatenated along a new row dimension).
 //
 // The output `SparseTensor` object's shape values for all dimensions but the
 // first are the max across the input `SparseTensor` objects' shape values
@@ -21629,29 +21316,24 @@ func TakeManySparseFromTensorsMapSharedName(value string) TakeManySparseFromTens
 // standard lexicographic order.  If this is not the case, after this
 // step run `SparseReorder` to restore index ordering.
 //
-// For example, if the handles represent an input, which is a `[2, 3]` matrix
-// representing two original `SparseTensor` objects:
+// For example, if the serialized input is a `[2 x 3]` matrix representing two
+// original `SparseTensor` objects:
 //
-// ```
 //     index = [ 0]
 //             [10]
 //             [20]
 //     values = [1, 2, 3]
 //     shape = [50]
-// ```
 //
 // and
 //
-// ```
 //     index = [ 2]
 //             [10]
 //     values = [4, 5]
 //     shape = [30]
-// ```
 //
-// then the final `SparseTensor` will be:
+// then the final deserialized `SparseTensor` will be:
 //
-// ```
 //     index = [0  0]
 //             [0 10]
 //             [0 20]
@@ -21659,27 +21341,20 @@ func TakeManySparseFromTensorsMapSharedName(value string) TakeManySparseFromTens
 //             [1 10]
 //     values = [1, 2, 3, 4, 5]
 //     shape = [2 50]
-// ```
 //
 // Arguments:
-//	sparse_handles: 1-D, The `N` serialized `SparseTensor` objects.
-// Shape: `[N]`.
-//	dtype: The `dtype` of the `SparseTensor` objects stored in the
-// `SparseTensorsMap`.
-//
-// Returns 2-D.  The `indices` of the minibatch `SparseTensor`.1-D.  The `values` of the minibatch `SparseTensor`.1-D.  The `shape` of the minibatch `SparseTensor`.
-func TakeManySparseFromTensorsMap(scope *Scope, sparse_handles tf.Output, dtype tf.DataType, optional ...TakeManySparseFromTensorsMapAttr) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
+//	serialized_sparse: 2-D, The `N` serialized `SparseTensor` objects.
+// Must have 3 columns.
+//	dtype: The `dtype` of the serialized `SparseTensor` objects.
+func DeserializeManySparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "TakeManySparseFromTensorsMap",
+		Type: "DeserializeManySparse",
 		Input: []tf.Input{
-			sparse_handles,
+			serialized_sparse,
 		},
 		Attrs: attrs,
 	}
@@ -21687,151 +21362,234 @@ func TakeManySparseFromTensorsMap(scope *Scope, sparse_handles tf.Output, dtype
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Assigns a new value to a variable.
-//
-// Any ReadVariableOp with a control dependency on this op is guaranteed to return
-// this value or a subsequent newer value of the variable.
-//
-// Arguments:
-//	resource: handle to the resource in which to store the variable.
-//	value: the value to set the new tensor to use.
+// Elementwise computes the bitwise AND of `x` and `y`.
 //
-// Returns the created operation.
-func AssignVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
+// The result will have those bits set, that are set in both `x` and `y`. The
+// computation is performed on the underlying representations of `x` and `y`.
+func BitwiseAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "AssignVariableOp",
+		Type: "BitwiseAnd",
 		Input: []tf.Input{
-			resource, value,
+			x, y,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Strip leading and trailing whitespaces from the Tensor.
+// Inverse real-valued fast Fourier transform.
+//
+// Computes the inverse 1-dimensional discrete Fourier transform of a real-valued
+// signal over the inner-most dimension of `input`.
+//
+// The inner-most dimension of `input` is assumed to be the result of `RFFT`: the
+// `fft_length / 2 + 1` unique components of the DFT of a real-valued signal. If
+// `fft_length` is not provided, it is computed from the size of the inner-most
+// dimension of `input` (`fft_length = 2 * (inner - 1)`). If the FFT length used to
+// compute `input` is odd, it should be provided since it cannot be inferred
+// properly.
+//
+// Along the axis `IRFFT` is computed on, if `fft_length / 2 + 1` is smaller
+// than the corresponding dimension of `input`, the dimension is cropped. If it is
+// larger, the dimension is padded with zeros.
+//
+// Arguments:
+//	input: A complex64 tensor.
+//	fft_length: An int32 tensor of shape [1]. The FFT length.
 //
-// Arguments:
-//	input: A string `Tensor` of any shape.
+// Returns A float32 tensor of the same rank as `input`. The inner-most
+//   dimension of `input` is replaced with the `fft_length` samples of its inverse
+//   1D Fourier transform.
 //
-// Returns A string `Tensor` of the same shape as the input.
-func StringStrip(scope *Scope, input tf.Output) (output tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.fft.irfft
+// @end_compatibility
+func IRFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "StringStrip",
+		Type: "IRFFT",
 		Input: []tf.Input{
-			input,
+			input, fft_length,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns a tensor of ones with the same shape and type as x.
+// Concatenates a list of `SparseTensor` along the specified dimension.
 //
-// Arguments:
-//	x: a tensor of type T.
+// Concatenation is with respect to the dense versions of these sparse tensors.
+// It is assumed that each input is a `SparseTensor` whose elements are ordered
+// along increasing dimension number.
 //
-// Returns a tensor of the same shape and type as x but filled with ones.
-func OnesLike(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "OnesLike",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// The gradient of SparseFillEmptyRows.
+// All inputs' shapes must match, except for the concat dimension.  The
+// `indices`, `values`, and `shapes` lists must have the same length.
 //
-// Takes vectors reverse_index_map, shaped `[N]`, and grad_values,
-// shaped `[N_full]`, where `N_full >= N` and copies data into either
-// `d_values` or `d_default_value`.  Here `d_values` is shaped `[N]` and
-// `d_default_value` is a scalar.
+// The output shape is identical to the inputs', except along the concat
+// dimension, where it is the sum of the inputs' sizes along that dimension.
 //
-//   d_values[j] = grad_values[reverse_index_map[j]]
-//   d_default_value = sum_{k : 0 .. N_full - 1} (
-//      grad_values[k] * 1{k not in reverse_index_map})
+// The output elements will be resorted to preserve the sort order along
+// increasing dimension number.
+//
+// This op runs in `O(M log M)` time, where `M` is the total number of non-empty
+// values across all inputs. This is due to the need for an internal sort in
+// order to concatenate efficiently across an arbitrary dimension.
+//
+// For example, if `concat_dim = 1` and the inputs are
+//
+//     sp_inputs[0]: shape = [2, 3]
+//     [0, 2]: "a"
+//     [1, 0]: "b"
+//     [1, 1]: "c"
+//
+//     sp_inputs[1]: shape = [2, 4]
+//     [0, 1]: "d"
+//     [0, 2]: "e"
+//
+// then the output will be
+//
+//     shape = [2, 7]
+//     [0, 2]: "a"
+//     [0, 4]: "d"
+//     [0, 5]: "e"
+//     [1, 0]: "b"
+//     [1, 1]: "c"
+//
+// Graphically this is equivalent to doing
+//
+//     [    a] concat [  d e  ] = [    a   d e  ]
+//     [b c  ]        [       ]   [b c          ]
 //
 // Arguments:
-//	reverse_index_map: 1-D.  The reverse index map from SparseFillEmptyRows.
-//	grad_values: 1-D.  The gradients from backprop.
+//	indices: 2-D.  Indices of each input `SparseTensor`.
+//	values: 1-D.  Non-empty values of each `SparseTensor`.
+//	shapes: 1-D.  Shapes of each `SparseTensor`.
+//	concat_dim: Dimension to concatenate along. Must be in range [-rank, rank),
+// where rank is the number of dimensions in each input `SparseTensor`.
 //
-// Returns 1-D.  The backprop into values.0-D.  The backprop into default_value.
-func SparseFillEmptyRowsGrad(scope *Scope, reverse_index_map tf.Output, grad_values tf.Output) (d_values tf.Output, d_default_value tf.Output) {
+// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
+func SparseConcat(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, concat_dim int64) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"concat_dim": concat_dim}
 	opspec := tf.OpSpec{
-		Type: "SparseFillEmptyRowsGrad",
+		Type: "SparseConcat",
 		Input: []tf.Input{
-			reverse_index_map, grad_values,
+			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes scaled exponential linear: `scale * alpha * (exp(features) - 1)`
+// Generates sparse cross from a list of sparse and dense tensors.
 //
-// if < 0, `scale * features` otherwise.
+// The op takes two lists, one of 2D `SparseTensor` and one of 2D `Tensor`, each
+// representing features of one feature column. It outputs a 2D `SparseTensor` with
+// the batchwise crosses of these features.
 //
-// To be used together with
-// `initializer = tf.variance_scaling_initializer(factor=1.0, mode='FAN_IN')`.
-// For correct dropout, use `tf.contrib.nn.alpha_dropout`.
+// For example, if the inputs are
 //
-// See [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
-func Selu(scope *Scope, features tf.Output) (activations tf.Output) {
+//     inputs[0]: SparseTensor with shape = [2, 2]
+//     [0, 0]: "a"
+//     [1, 0]: "b"
+//     [1, 1]: "c"
+//
+//     inputs[1]: SparseTensor with shape = [2, 1]
+//     [0, 0]: "d"
+//     [1, 0]: "e"
+//
+//     inputs[2]: Tensor [["f"], ["g"]]
+//
+// then the output will be
+//
+//     shape = [2, 2]
+//     [0, 0]: "a_X_d_X_f"
+//     [1, 0]: "b_X_e_X_g"
+//     [1, 1]: "c_X_e_X_g"
+//
+// if hashed_output=true then the output will be
+//
+//     shape = [2, 2]
+//     [0, 0]: FingerprintCat64(
+//                 Fingerprint64("f"), FingerprintCat64(
+//                     Fingerprint64("d"), Fingerprint64("a")))
+//     [1, 0]: FingerprintCat64(
+//                 Fingerprint64("g"), FingerprintCat64(
+//                     Fingerprint64("e"), Fingerprint64("b")))
+//     [1, 1]: FingerprintCat64(
+//                 Fingerprint64("g"), FingerprintCat64(
+//                     Fingerprint64("e"), Fingerprint64("c")))
+//
+// Arguments:
+//	indices: 2-D.  Indices of each input `SparseTensor`.
+//	values: 1-D.   values of each `SparseTensor`.
+//	shapes: 1-D.   Shapes of each `SparseTensor`.
+//	dense_inputs: 2-D.    Columns represented by dense `Tensor`.
+//	hashed_output: If true, returns the hash of the cross instead of the string.
+// This will allow us avoiding string manipulations.
+//	num_buckets: It is used if hashed_output is true.
+// output = hashed_value%num_buckets if num_buckets > 0 else hashed_value.
+//	hash_key: Specify the hash_key that will be used by the `FingerprintCat64`
+// function to combine the crosses fingerprints.
+//
+//
+//
+// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated or hashed
+// `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
+func SparseCross(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, dense_inputs []tf.Output, hashed_output bool, num_buckets int64, hash_key int64, out_type tf.DataType, internal_type tf.DataType) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"hashed_output": hashed_output, "num_buckets": num_buckets, "hash_key": hash_key, "out_type": out_type, "internal_type": internal_type}
 	opspec := tf.OpSpec{
-		Type: "Selu",
+		Type: "SparseCross",
 		Input: []tf.Input{
-			features,
+			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes), tf.OutputList(dense_inputs),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// SetSizeAttr is an optional argument to SetSize.
-type SetSizeAttr func(optionalAttr)
+// ResourceApplyProximalAdagradAttr is an optional argument to ResourceApplyProximalAdagrad.
+type ResourceApplyProximalAdagradAttr func(optionalAttr)
 
-// SetSizeValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func SetSizeValidateIndices(value bool) SetSizeAttr {
+// ResourceApplyProximalAdagradUseLocking sets the optional use_locking attribute to value.
+//
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceApplyProximalAdagradUseLocking(value bool) ResourceApplyProximalAdagradAttr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Number of unique elements along last dimension of input `set`.
-//
-// Input `set` is a `SparseTensor` represented by `set_indices`, `set_values`,
-// and `set_shape`. The last dimension contains values in a set, duplicates are
-// allowed but ignored.
+// Update '*var' and '*accum' according to FOBOS with Adagrad learning rate.
 //
-// If `validate_indices` is `True`, this op validates the order and range of `set`
-// indices.
+// accum += grad * grad
+// prox_v = var - lr * grad * (1 / sqrt(accum))
+// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
 //
 // Arguments:
-//	set_indices: 2D `Tensor`, indices of a `SparseTensor`.
-//	set_values: 1D `Tensor`, values of a `SparseTensor`.
-//	set_shape: 1D `Tensor`, shape of a `SparseTensor`.
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	grad: The gradient.
 //
-// Returns For `set` ranked `n`, this is a `Tensor` with rank `n-1`, and the same 1st
-// `n-1` dimensions as `set`. Each value is the number of unique elements in
-// the corresponding `[0...n-1]` dimension of `set`.
-func SetSize(scope *Scope, set_indices tf.Output, set_values tf.Output, set_shape tf.Output, optional ...SetSizeAttr) (size tf.Output) {
+// Returns the created operation.
+func ResourceApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, optional ...ResourceApplyProximalAdagradAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -21840,621 +21598,502 @@ func SetSize(scope *Scope, set_indices tf.Output, set_values tf.Output, set_shap
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SetSize",
+		Type: "ResourceApplyProximalAdagrad",
 		Input: []tf.Input{
-			set_indices, set_values, set_shape,
+			var_, accum, lr, l1, l2, grad,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
+}
+
+// MutableHashTableOfTensorsV2Attr is an optional argument to MutableHashTableOfTensorsV2.
+type MutableHashTableOfTensorsV2Attr func(optionalAttr)
+
+// MutableHashTableOfTensorsV2Container sets the optional container attribute to value.
+//
+// value: If non-empty, this table is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func MutableHashTableOfTensorsV2Container(value string) MutableHashTableOfTensorsV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MutableHashTableOfTensorsV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this table is shared under the given name across
+// multiple sessions.
+// If not specified, defaults to ""
+func MutableHashTableOfTensorsV2SharedName(value string) MutableHashTableOfTensorsV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// MutableHashTableOfTensorsV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
+// If not specified, defaults to false
+func MutableHashTableOfTensorsV2UseNodeNameSharing(value bool) MutableHashTableOfTensorsV2Attr {
+	return func(m optionalAttr) {
+		m["use_node_name_sharing"] = value
+	}
+}
+
+// MutableHashTableOfTensorsV2ValueShape sets the optional value_shape attribute to value.
+// If not specified, defaults to <>
+func MutableHashTableOfTensorsV2ValueShape(value tf.Shape) MutableHashTableOfTensorsV2Attr {
+	return func(m optionalAttr) {
+		m["value_shape"] = value
+	}
 }
 
-// Computes the sign and the log of the absolute value of the determinant of
-//
-// one or more square matrices.
+// Creates an empty hash table.
 //
-// The input is a tensor of shape `[N, M, M]` whose inner-most 2 dimensions
-// form square matrices. The outputs are two tensors containing the signs and
-// absolute values of the log determinants for all N input submatrices
-// `[..., :, :]` such that the determinant = sign*exp(log_abs_determinant).
-// The log_abs_determinant is computed as det(P)*sum(log(diag(LU))) where LU
-// is the LU decomposition of the input and P is the corresponding
-// permutation matrix.
+// This op creates a mutable hash table, specifying the type of its keys and
+// values. Each value must be a vector. Data can be inserted into the table using
+// the insert operations. It does not support the initialization operation.
 //
 // Arguments:
-//	input: Shape is `[N, M, M]`.
+//	key_dtype: Type of the table keys.
+//	value_dtype: Type of the table values.
 //
-// Returns The signs of the log determinants of the inputs. Shape is `[N]`.The logs of the absolute values of the determinants
-// of the N input matrices.  Shape is `[N]`.
-func LogMatrixDeterminant(scope *Scope, input tf.Output) (sign tf.Output, log_abs_determinant tf.Output) {
+// Returns Handle to a table.
+func MutableHashTableOfTensorsV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...MutableHashTableOfTensorsV2Attr) (table_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "LogMatrixDeterminant",
-		Input: []tf.Input{
-			input,
-		},
+		Type: "MutableHashTableOfTensorsV2",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Copy a tensor setting everything outside a central band in each innermost matrix
-//
-// to zero.
-//
-// The `band` part is computed as follows:
-// Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a
-// tensor with the same shape where
-//
-// `band[i, j, k, ..., m, n] = in_band(m, n) * input[i, j, k, ..., m, n]`.
-//
-// The indicator function
-//
-// `in_band(m, n) = (num_lower < 0 || (m-n) <= num_lower)) &&
-//                  (num_upper < 0 || (n-m) <= num_upper)`.
-//
-// For example:
-//
-// ```
-// # if 'input' is [[ 0,  1,  2, 3]
-//                  [-1,  0,  1, 2]
-//                  [-2, -1,  0, 1]
-//                  [-3, -2, -1, 0]],
-//
-// tf.matrix_band_part(input, 1, -1) ==> [[ 0,  1,  2, 3]
-//                                        [-1,  0,  1, 2]
-//                                        [ 0, -1,  0, 1]
-//                                        [ 0,  0, -1, 0]],
-//
-// tf.matrix_band_part(input, 2, 1) ==> [[ 0,  1,  0, 0]
-//                                       [-1,  0,  1, 0]
-//                                       [-2, -1,  0, 1]
-//                                       [ 0, -2, -1, 0]]
-// ```
-//
-// Useful special cases:
+// The gradient operator for the SparseSlice op.
 //
-// ```
-//  tf.matrix_band_part(input, 0, -1) ==> Upper triangular part.
-//  tf.matrix_band_part(input, -1, 0) ==> Lower triangular part.
-//  tf.matrix_band_part(input, 0, 0) ==> Diagonal.
-// ```
+// This op takes in the upstream gradient w.r.t. non-empty values of
+// the sliced `SparseTensor`, and outputs the gradients w.r.t.
+// the non-empty values of input `SparseTensor`.
 //
 // Arguments:
-//	input: Rank `k` tensor.
-//	num_lower: 0-D tensor. Number of subdiagonals to keep. If negative, keep entire
-// lower triangle.
-//	num_upper: 0-D tensor. Number of superdiagonals to keep. If negative, keep
-// entire upper triangle.
+//	backprop_val_grad: 1-D. The gradient with respect to
+// the non-empty values of the sliced `SparseTensor`.
+//	input_indices: 2-D.  The `indices` of the input `SparseTensor`.
+//	input_start: 1-D. tensor represents the start of the slice.
+//	output_indices: 2-D.  The `indices` of the sliced `SparseTensor`.
 //
-// Returns Rank `k` tensor of the same shape as input. The extracted banded tensor.
-func MatrixBandPart(scope *Scope, input tf.Output, num_lower tf.Output, num_upper tf.Output) (band tf.Output) {
+// Returns 1-D. The gradient with respect to the non-empty values of input `SparseTensor`.
+func SparseSliceGrad(scope *Scope, backprop_val_grad tf.Output, input_indices tf.Output, input_start tf.Output, output_indices tf.Output) (val_grad tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixBandPart",
+		Type: "SparseSliceGrad",
 		Input: []tf.Input{
-			input, num_lower, num_upper,
+			backprop_val_grad, input_indices, input_start, output_indices,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Delete the tensor specified by its handle in the session.
-//
-// Arguments:
-//	handle: The handle for a tensor stored in the session state.
+// Computes the gradient of the sigmoid of `x` wrt its input.
 //
-// Returns the created operation.
-func DeleteSessionTensor(scope *Scope, handle tf.Output) (o *tf.Operation) {
+// Specifically, `grad = dy * y * (1 - y)`, where `y = sigmoid(x)`, and
+// `dy` is the corresponding input gradient.
+func SigmoidGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "DeleteSessionTensor",
+		Type: "SigmoidGrad",
 		Input: []tf.Input{
-			handle,
+			y, dy,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// L2 Loss.
+// Convert one or more images from HSV to RGB.
 //
-// Computes half the L2 norm of a tensor without the `sqrt`:
+// Outputs a tensor of the same shape as the `images` tensor, containing the RGB
+// value of the pixels. The output is only well defined if the value in `images`
+// are in `[0,1]`.
 //
-//     output = sum(t ** 2) / 2
+// See `rgb_to_hsv` for a description of the HSV encoding.
 //
 // Arguments:
-//	t: Typically 2-D, but may have any dimensions.
+//	images: 1-D or higher rank. HSV data to convert. Last dimension must be size 3.
 //
-// Returns 0-D.
-func L2Loss(scope *Scope, t tf.Output) (output tf.Output) {
+// Returns `images` converted to RGB.
+func HSVToRGB(scope *Scope, images tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "L2Loss",
+		Type: "HSVToRGB",
 		Input: []tf.Input{
-			t,
+			images,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DenseToSparseSetOperationAttr is an optional argument to DenseToSparseSetOperation.
-type DenseToSparseSetOperationAttr func(optionalAttr)
-
-// DenseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func DenseToSparseSetOperationValidateIndices(value bool) DenseToSparseSetOperationAttr {
-	return func(m optionalAttr) {
-		m["validate_indices"] = value
-	}
-}
-
-// Applies set operation along last dimension of `Tensor` and `SparseTensor`.
-//
-// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
-//
-// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
-// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
-// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
-// ignored.
-//
-// If `validate_indices` is `True`, this op validates the order and range of `set2`
-// indices.
+// Creates a dataset by applying optimizations to `input_dataset`.
 //
-// Output `result` is a `SparseTensor` represented by `result_indices`,
-// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
-// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
-// dimension contains the result of `set_operation` applied to the corresponding
-// `[0...n-1]` dimension of `set`.
+// Creates a dataset by applying optimizations to `input_dataset`.
 //
 // Arguments:
-//	set1: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.
-// Dimension `n` contains values in a set, duplicates are allowed but ignored.
-//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
-// be the same as the 1st `n-1` dimensions of `set1`, `result_shape[n]` is the
-// max set size across `n-1` dimensions.
+//	input_dataset: A variant tensor representing the input dataset.
+//	optimizations: A `tf.string` vector `tf.Tensor` identifying optimizations to use.
 //
 //
-// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
-// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
-// is the max result set size across all `0...n-1` dimensions.
-func DenseToSparseSetOperation(scope *Scope, set1 tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...DenseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
+func OptimizeDataset(scope *Scope, input_dataset tf.Output, optimizations tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"set_operation": set_operation}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "DenseToSparseSetOperation",
+		Type: "OptimizeDataset",
 		Input: []tf.Input{
-			set1, set2_indices, set2_values, set2_shape,
+			input_dataset, optimizations,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Subtracts a value from the current value of a variable.
-//
-// Any ReadVariableOp with a control dependency on this op is guaranteed to
-// see the decremented value or a subsequent newer one.
-//
-// Arguments:
-//	resource: handle to the resource in which to store the variable.
-//	value: the value by which the variable will be incremented.
-//
-// Returns the created operation.
-func AssignSubVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "AssignSubVariableOp",
-		Input: []tf.Input{
-			resource, value,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// RestoreAttr is an optional argument to Restore.
-type RestoreAttr func(optionalAttr)
-
-// RestorePreferredShard sets the optional preferred_shard attribute to value.
-//
-// value: Index of file to open first if multiple files match
-// `file_pattern`.
-// If not specified, defaults to -1
-func RestorePreferredShard(value int64) RestoreAttr {
-	return func(m optionalAttr) {
-		m["preferred_shard"] = value
-	}
+	return op.Output(0)
 }
 
-// Restores a tensor from checkpoint files.
-//
-// Reads a tensor stored in one or several files. If there are several files (for
-// instance because a tensor was saved as slices), `file_pattern` may contain
-// wildcard symbols (`*` and `?`) in the filename portion only, not in the
-// directory portion.
-//
-// If a `file_pattern` matches several files, `preferred_shard` can be used to hint
-// in which file the requested tensor is likely to be found. This op will first
-// open the file at index `preferred_shard` in the list of matching files and try
-// to restore tensors from that file.  Only if some tensors or tensor slices are
-// not found in that first file, then the Op opens all the files. Setting
-// `preferred_shard` to match the value passed as the `shard` input
-// of a matching `Save` Op may speed up Restore.  This attribute only affects
-// performance, not correctness.  The default value -1 means files are processed in
-// order.
+// Returns the element-wise min of two SparseTensors.
 //
-// See also `RestoreSlice`.
+// Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
 //
 // Arguments:
-//	file_pattern: Must have a single element. The pattern of the files from
-// which we read the tensor.
-//	tensor_name: Must have a single element. The name of the tensor to be
-// restored.
-//	dt: The type of the tensor to be restored.
+//	a_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, in the canonical lexicographic ordering.
+//	a_values: 1-D.  `N` non-empty values corresponding to `a_indices`.
+//	a_shape: 1-D.  Shape of the input SparseTensor.
+//	b_indices: counterpart to `a_indices` for the other operand.
+//	b_values: counterpart to `a_values` for the other operand; must be of the same dtype.
+//	b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal.
 //
-// Returns The restored tensor.
-func Restore(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, dt tf.DataType, optional ...RestoreAttr) (tensor tf.Output) {
+// Returns 2-D.  The indices of the output SparseTensor.1-D.  The values of the output SparseTensor.
+func SparseSparseMinimum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dt": dt}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Restore",
+		Type: "SparseSparseMinimum",
 		Input: []tf.Input{
-			file_pattern, tensor_name,
+			a_indices, a_values, a_shape, b_indices, b_values, b_shape,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// QuantizedResizeBilinearAttr is an optional argument to QuantizedResizeBilinear.
-type QuantizedResizeBilinearAttr func(optionalAttr)
+// MapUnstageNoKeyAttr is an optional argument to MapUnstageNoKey.
+type MapUnstageNoKeyAttr func(optionalAttr)
 
-// QuantizedResizeBilinearAlignCorners sets the optional align_corners attribute to value.
+// MapUnstageNoKeyCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// value: If true, the centers of the 4 corner pixels of the input and output tensors are
-// aligned, preserving the values at the corner pixels. Defaults to false.
-// If not specified, defaults to false
-func QuantizedResizeBilinearAlignCorners(value bool) QuantizedResizeBilinearAttr {
+// REQUIRES: value >= 0
+func MapUnstageNoKeyCapacity(value int64) MapUnstageNoKeyAttr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["capacity"] = value
 	}
 }
 
-// Resize quantized `images` to `size` using quantized bilinear interpolation.
-//
-// Input images and output images must be quantized types.
-//
-// Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
-//
+// MapUnstageNoKeyMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
+// REQUIRES: value >= 0
+func MapUnstageNoKeyMemoryLimit(value int64) MapUnstageNoKeyAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// MapUnstageNoKeyContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapUnstageNoKeyContainer(value string) MapUnstageNoKeyAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MapUnstageNoKeySharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func MapUnstageNoKeySharedName(value string) MapUnstageNoKeyAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op removes and returns a random (key, value)
 //
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func QuantizedResizeBilinear(scope *Scope, images tf.Output, size tf.Output, min tf.Output, max tf.Output, optional ...QuantizedResizeBilinearAttr) (resized_images tf.Output, out_min tf.Output, out_max tf.Output) {
+// from the underlying container.   If the underlying container
+// does not contain elements, the op will block until it does.
+func MapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataType, optional ...MapUnstageNoKeyAttr) (key tf.Output, values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedResizeBilinear",
+		Type: "MapUnstageNoKey",
 		Input: []tf.Input{
-			images, size, min, max,
+			indices,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	key = op.Output(idx)
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("MapUnstageNoKey", err)
+		return
+	}
+	return key, values
 }
 
-// SdcaOptimizerAttr is an optional argument to SdcaOptimizer.
-type SdcaOptimizerAttr func(optionalAttr)
+// HashTableV2Attr is an optional argument to HashTableV2.
+type HashTableV2Attr func(optionalAttr)
 
-// SdcaOptimizerAdaptative sets the optional adaptative attribute to value.
+// HashTableV2Container sets the optional container attribute to value.
 //
-// value: Whether to use Adaptive SDCA for the inner loop.
-// If not specified, defaults to true
-func SdcaOptimizerAdaptative(value bool) SdcaOptimizerAttr {
+// value: If non-empty, this table is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func HashTableV2Container(value string) HashTableV2Attr {
 	return func(m optionalAttr) {
-		m["adaptative"] = value
+		m["container"] = value
 	}
 }
 
-// Distributed version of Stochastic Dual Coordinate Ascent (SDCA) optimizer for
-//
-// linear models with L1 + L2 regularization. As global optimization objective is
-// strongly-convex, the optimizer optimizes the dual objective at each step. The
-// optimizer applies each update one example at a time. Examples are sampled
-// uniformly, and the optimizer is learning rate free and enjoys linear convergence
-// rate.
-//
-// [Proximal Stochastic Dual Coordinate Ascent](http://arxiv.org/pdf/1211.2717v1.pdf).<br>
-// Shai Shalev-Shwartz, Tong Zhang. 2012
+// HashTableV2SharedName sets the optional shared_name attribute to value.
 //
-// $$Loss Objective = \sum f_{i} (wx_{i}) + (l2 / 2) * |w|^2 + l1 * |w|$$
+// value: If non-empty, this table is shared under the given name across
+// multiple sessions.
+// If not specified, defaults to ""
+func HashTableV2SharedName(value string) HashTableV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// HashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
 //
-// [Adding vs. Averaging in Distributed Primal-Dual Optimization](http://arxiv.org/abs/1502.03508).<br>
-// Chenxin Ma, Virginia Smith, Martin Jaggi, Michael I. Jordan,
-// Peter Richtarik, Martin Takac. 2015
+// value: If true and shared_name is empty, the table is shared
+// using the node name.
+// If not specified, defaults to false
+func HashTableV2UseNodeNameSharing(value bool) HashTableV2Attr {
+	return func(m optionalAttr) {
+		m["use_node_name_sharing"] = value
+	}
+}
+
+// Creates a non-initialized hash table.
 //
-// [Stochastic Dual Coordinate Ascent with Adaptive Probabilities](https://arxiv.org/abs/1502.08053).<br>
-// Dominik Csiba, Zheng Qu, Peter Richtarik. 2015
+// This op creates a hash table, specifying the type of its keys and values.
+// Before using the table you will have to initialize it.  After initialization the
+// table will be immutable.
 //
 // Arguments:
-//	sparse_example_indices: a list of vectors which contain example indices.
-//	sparse_feature_indices: a list of vectors which contain feature indices.
-//	sparse_feature_values: a list of vectors which contains feature value
-// associated with each feature group.
-//	dense_features: a list of matrices which contains the dense feature values.
-//	example_weights: a vector which contains the weight associated with each
-// example.
-//	example_labels: a vector which contains the label/target associated with each
-// example.
-//	sparse_indices: a list of vectors where each value is the indices which has
-// corresponding weights in sparse_weights. This field maybe omitted for the
-// dense approach.
-//	sparse_weights: a list of vectors where each value is the weight associated with
-// a sparse feature group.
-//	dense_weights: a list of vectors where the values are the weights associated
-// with a dense feature group.
-//	example_state_data: a list of vectors containing the example state data.
-//	loss_type: Type of the primal loss. Currently SdcaSolver supports logistic,
-// squared and hinge losses.
-//	l1: Symmetric l1 regularization strength.
-//	l2: Symmetric l2 regularization strength.
-//	num_loss_partitions: Number of partitions of the global loss function.
-//	num_inner_iterations: Number of iterations per mini-batch.
+//	key_dtype: Type of the table keys.
+//	value_dtype: Type of the table values.
 //
-// Returns a list of vectors containing the updated example state
-// data.a list of vectors where each value is the delta
-// weights associated with a sparse feature group.a list of vectors where the values are the delta
-// weights associated with a dense feature group.
-func SdcaOptimizer(scope *Scope, sparse_example_indices []tf.Output, sparse_feature_indices []tf.Output, sparse_feature_values []tf.Output, dense_features []tf.Output, example_weights tf.Output, example_labels tf.Output, sparse_indices []tf.Output, sparse_weights []tf.Output, dense_weights []tf.Output, example_state_data tf.Output, loss_type string, l1 float32, l2 float32, num_loss_partitions int64, num_inner_iterations int64, optional ...SdcaOptimizerAttr) (out_example_state_data tf.Output, out_delta_sparse_weights []tf.Output, out_delta_dense_weights []tf.Output) {
+// Returns Handle to a table.
+func HashTableV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...HashTableV2Attr) (table_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"loss_type": loss_type, "l1": l1, "l2": l2, "num_loss_partitions": num_loss_partitions, "num_inner_iterations": num_inner_iterations}
+	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SdcaOptimizer",
-		Input: []tf.Input{
-			tf.OutputList(sparse_example_indices), tf.OutputList(sparse_feature_indices), tf.OutputList(sparse_feature_values), tf.OutputList(dense_features), example_weights, example_labels, tf.OutputList(sparse_indices), tf.OutputList(sparse_weights), tf.OutputList(dense_weights), example_state_data,
-		},
+		Type: "HashTableV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	out_example_state_data = op.Output(idx)
-	if out_delta_sparse_weights, idx, err = makeOutputList(op, idx, "out_delta_sparse_weights"); err != nil {
-		scope.UpdateErr("SdcaOptimizer", err)
-		return
-	}
-	if out_delta_dense_weights, idx, err = makeOutputList(op, idx, "out_delta_dense_weights"); err != nil {
-		scope.UpdateErr("SdcaOptimizer", err)
-		return
-	}
-	return out_example_state_data, out_delta_sparse_weights, out_delta_dense_weights
+	return op.Output(0)
 }
 
-// MatrixTriangularSolveAttr is an optional argument to MatrixTriangularSolve.
-type MatrixTriangularSolveAttr func(optionalAttr)
+// TakeManySparseFromTensorsMapAttr is an optional argument to TakeManySparseFromTensorsMap.
+type TakeManySparseFromTensorsMapAttr func(optionalAttr)
 
-// MatrixTriangularSolveLower sets the optional lower attribute to value.
+// TakeManySparseFromTensorsMapContainer sets the optional container attribute to value.
 //
-// value: Boolean indicating whether the innermost matrices in `matrix` are
-// lower or upper triangular.
-// If not specified, defaults to true
-func MatrixTriangularSolveLower(value bool) MatrixTriangularSolveAttr {
+// value: The container name for the `SparseTensorsMap` read by this op.
+// If not specified, defaults to ""
+func TakeManySparseFromTensorsMapContainer(value string) TakeManySparseFromTensorsMapAttr {
 	return func(m optionalAttr) {
-		m["lower"] = value
+		m["container"] = value
 	}
 }
 
-// MatrixTriangularSolveAdjoint sets the optional adjoint attribute to value.
-//
-// value: Boolean indicating whether to solve with `matrix` or its (block-wise)
-//          adjoint.
+// TakeManySparseFromTensorsMapSharedName sets the optional shared_name attribute to value.
 //
-// @compatibility(numpy)
-// Equivalent to scipy.linalg.solve_triangular
-// @end_compatibility
-// If not specified, defaults to false
-func MatrixTriangularSolveAdjoint(value bool) MatrixTriangularSolveAttr {
+// value: The shared name for the `SparseTensorsMap` read by this op.
+// It should not be blank; rather the `shared_name` or unique Operation name
+// of the Op that created the original `SparseTensorsMap` should be used.
+// If not specified, defaults to ""
+func TakeManySparseFromTensorsMapSharedName(value string) TakeManySparseFromTensorsMapAttr {
 	return func(m optionalAttr) {
-		m["adjoint"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Solves systems of linear equations with upper or lower triangular matrices by
+// Read `SparseTensors` from a `SparseTensorsMap` and concatenate them.
+//
+// The input `sparse_handles` must be an `int64` matrix of shape `[N, 1]` where
+// `N` is the minibatch size and the rows correspond to the output handles of
+// `AddSparseToTensorsMap` or `AddManySparseToTensorsMap`.  The ranks of the
+// original `SparseTensor` objects that went into the given input ops must all
+// match.  When the final `SparseTensor` is created, it has rank one
+// higher than the ranks of the incoming `SparseTensor` objects
+// (they have been concatenated along a new row dimension on the left).
+//
+// The output `SparseTensor` object's shape values for all dimensions but the
+// first are the max across the input `SparseTensor` objects' shape values
+// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
+// size.
+//
+// The input `SparseTensor` objects' indices are assumed ordered in
+// standard lexicographic order.  If this is not the case, after this
+// step run `SparseReorder` to restore index ordering.
+//
+// For example, if the handles represent an input, which is a `[2, 3]` matrix
+// representing two original `SparseTensor` objects:
+//
+// ```
+//     index = [ 0]
+//             [10]
+//             [20]
+//     values = [1, 2, 3]
+//     shape = [50]
+// ```
 //
-// backsubstitution.
+// and
 //
-// `matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form
-// square matrices. If `lower` is `True` then the strictly upper triangular part
-// of each inner-most matrix is assumed to be zero and not accessed.
-// If `lower` is False then the strictly lower triangular part of each inner-most
-// matrix is assumed to be zero and not accessed.
-// `rhs` is a tensor of shape `[..., M, K]`.
+// ```
+//     index = [ 2]
+//             [10]
+//     values = [4, 5]
+//     shape = [30]
+// ```
 //
-// The output is a tensor of shape `[..., M, K]`. If `adjoint` is
-// `True` then the innermost matrices in `output` satisfy matrix equations
-// `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
-// If `adjoint` is `False` then the strictly then the  innermost matrices in
-// `output` satisfy matrix equations
-// `adjoint(matrix[..., i, k]) * output[..., k, j] = rhs[..., i, j]`.
+// then the final `SparseTensor` will be:
+//
+// ```
+//     index = [0  0]
+//             [0 10]
+//             [0 20]
+//             [1  2]
+//             [1 10]
+//     values = [1, 2, 3, 4, 5]
+//     shape = [2 50]
+// ```
 //
 // Arguments:
-//	matrix: Shape is `[..., M, M]`.
-//	rhs: Shape is `[..., M, K]`.
+//	sparse_handles: 1-D, The `N` serialized `SparseTensor` objects.
+// Shape: `[N]`.
+//	dtype: The `dtype` of the `SparseTensor` objects stored in the
+// `SparseTensorsMap`.
 //
-// Returns Shape is `[..., M, K]`.
-func MatrixTriangularSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixTriangularSolveAttr) (output tf.Output) {
+// Returns 2-D.  The `indices` of the minibatch `SparseTensor`.1-D.  The `values` of the minibatch `SparseTensor`.1-D.  The `shape` of the minibatch `SparseTensor`.
+func TakeManySparseFromTensorsMap(scope *Scope, sparse_handles tf.Output, dtype tf.DataType, optional ...TakeManySparseFromTensorsMapAttr) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixTriangularSolve",
+		Type: "TakeManySparseFromTensorsMap",
 		Input: []tf.Input{
-			matrix, rhs,
+			sparse_handles,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// UnicodeTranscodeAttr is an optional argument to UnicodeTranscode.
-type UnicodeTranscodeAttr func(optionalAttr)
-
-// UnicodeTranscodeErrors sets the optional errors attribute to value.
+// Assigns a new value to a variable.
 //
-// value: Error handling policy when there is invalid formatting found in the input.
-// The value of 'strict' will cause the operation to produce a InvalidArgument
-// error on any invalid input formatting. A value of 'replace' (the default) will
-// cause the operation to replace any invalid formatting in the input with the
-// `replacement_char` codepoint. A value of 'ignore' will cause the operation to
-// skip any invalid formatting in the input and produce no corresponding output
-// character.
-// If not specified, defaults to "replace"
-func UnicodeTranscodeErrors(value string) UnicodeTranscodeAttr {
-	return func(m optionalAttr) {
-		m["errors"] = value
-	}
-}
-
-// UnicodeTranscodeReplacementChar sets the optional replacement_char attribute to value.
+// Any ReadVariableOp with a control dependency on this op is guaranteed to return
+// this value or a subsequent newer value of the variable.
 //
-// value: The replacement character codepoint to be used in place of any invalid
-// formatting in the input when `errors='replace'`. Any valid unicode codepoint may
-// be used. The default value is the default unicode replacement character is
-// 0xFFFD or U+65533.)
+// Arguments:
+//	resource: handle to the resource in which to store the variable.
+//	value: the value to set the new tensor to use.
 //
-// Note that for UTF-8, passing a replacement character expressible in 1 byte, such
-// as ' ', will preserve string alignment to the source since invalid bytes will be
-// replaced with a 1-byte replacement. For UTF-16-BE and UTF-16-LE, any 1 or 2 byte
-// replacement character will preserve byte alignment to the source.
-// If not specified, defaults to 65533
-func UnicodeTranscodeReplacementChar(value int64) UnicodeTranscodeAttr {
-	return func(m optionalAttr) {
-		m["replacement_char"] = value
+// Returns the created operation.
+func AssignVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// UnicodeTranscodeReplaceControlCharacters sets the optional replace_control_characters attribute to value.
-//
-// value: Whether to replace the C0 control characters (00-1F) with the
-// `replacement_char`. Default is false.
-// If not specified, defaults to false
-func UnicodeTranscodeReplaceControlCharacters(value bool) UnicodeTranscodeAttr {
-	return func(m optionalAttr) {
-		m["replace_control_characters"] = value
+	opspec := tf.OpSpec{
+		Type: "AssignVariableOp",
+		Input: []tf.Input{
+			resource, value,
+		},
 	}
+	return scope.AddOperation(opspec)
 }
 
-// Transcode the input text from a source encoding to a destination encoding.
-//
-// The input is a string tensor of any shape. The output is a string tensor of
-// the same shape containing the transcoded strings. Output strings are always
-// valid unicode. If the input contains invalid encoding positions, the
-// `errors` attribute sets the policy for how to deal with them. If the default
-// error-handling policy is used, invalid formatting will be substituted in the
-// output by the `replacement_char`. If the errors policy is to `ignore`, any
-// invalid encoding positions in the input are skipped and not included in the
-// output. If it set to `strict` then any invalid formatting will result in an
-// InvalidArgument error.
-//
-// This operation can be used with `output_encoding = input_encoding` to enforce
-// correct formatting for inputs even if they are already in the desired encoding.
-//
-// If the input is prefixed by a Byte Order Mark needed to determine encoding
-// (e.g. if the encoding is UTF-16 and the BOM indicates big-endian), then that
-// BOM will be consumed and not emitted into the output. If the input encoding
-// is marked with an explicit endianness (e.g. UTF-16-BE), then the BOM is
-// interpreted as a non-breaking-space and is preserved in the output (including
-// always for UTF-8).
-//
-// The end result is that if the input is marked as an explicit endianness the
-// transcoding is faithful to all codepoints in the source. If it is not marked
-// with an explicit endianness, the BOM is not considered part of the string itself
-// but as metadata, and so is not preserved in the output.
+// Strip leading and trailing whitespaces from the Tensor.
 //
 // Arguments:
-//	input: The text to be processed. Can have any shape.
-//	input_encoding: Text encoding of the input strings. This is any of the encodings supported
-// by ICU ucnv algorithmic converters. Examples: `"UTF-16", "US ASCII", "UTF-8"`.
-//	output_encoding: The unicode encoding to use in the output. Must be one of
-// `"UTF-8", "UTF-16-BE", "UTF-32-BE"`. Multi-byte encodings will be big-endian.
+//	input: A string `Tensor` of any shape.
 //
-// Returns A string tensor containing unicode text encoded using `output_encoding`.
-func UnicodeTranscode(scope *Scope, input tf.Output, input_encoding string, output_encoding string, optional ...UnicodeTranscodeAttr) (output tf.Output) {
+// Returns A string `Tensor` of the same shape as the input.
+func StringStrip(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"input_encoding": input_encoding, "output_encoding": output_encoding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "UnicodeTranscode",
+		Type: "StringStrip",
 		Input: []tf.Input{
 			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes inverse hyperbolic sine of x element-wise.
-func Asinh(scope *Scope, x tf.Output) (y tf.Output) {
+// Returns a tensor of ones with the same shape and type as x.
+//
+// Arguments:
+//	x: a tensor of type T.
+//
+// Returns a tensor of the same shape and type as x but filled with ones.
+func OnesLike(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Asinh",
+		Type: "OnesLike",
 		Input: []tf.Input{
 			x,
 		},
@@ -22463,478 +22102,497 @@ func Asinh(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// Creates a dataset with a range of values. Corresponds to python's xrange.
+// The gradient of SparseFillEmptyRows.
+//
+// Takes vectors reverse_index_map, shaped `[N]`, and grad_values,
+// shaped `[N_full]`, where `N_full >= N` and copies data into either
+// `d_values` or `d_default_value`.  Here `d_values` is shaped `[N]` and
+// `d_default_value` is a scalar.
+//
+//   d_values[j] = grad_values[reverse_index_map[j]]
+//   d_default_value = sum_{k : 0 .. N_full - 1} (
+//      grad_values[k] * 1{k not in reverse_index_map})
 //
 // Arguments:
-//	start: corresponds to start in python's xrange().
-//	stop: corresponds to stop in python's xrange().
-//	step: corresponds to step in python's xrange().
+//	reverse_index_map: 1-D.  The reverse index map from SparseFillEmptyRows.
+//	grad_values: 1-D.  The gradients from backprop.
+//
+// Returns 1-D.  The backprop into values.0-D.  The backprop into default_value.
+func SparseFillEmptyRowsGrad(scope *Scope, reverse_index_map tf.Output, grad_values tf.Output) (d_values tf.Output, d_default_value tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseFillEmptyRowsGrad",
+		Input: []tf.Input{
+			reverse_index_map, grad_values,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// Computes scaled exponential linear: `scale * alpha * (exp(features) - 1)`
 //
+// if < 0, `scale * features` otherwise.
 //
-func RangeDataset(scope *Scope, start tf.Output, stop tf.Output, step tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// To be used together with
+// `initializer = tf.variance_scaling_initializer(factor=1.0, mode='FAN_IN')`.
+// For correct dropout, use `tf.contrib.nn.alpha_dropout`.
+//
+// See [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
+func Selu(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "RangeDataset",
+		Type: "Selu",
 		Input: []tf.Input{
-			start, stop, step,
+			features,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Stops gradient computation.
+// SetSizeAttr is an optional argument to SetSize.
+type SetSizeAttr func(optionalAttr)
+
+// SetSizeValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func SetSizeValidateIndices(value bool) SetSizeAttr {
+	return func(m optionalAttr) {
+		m["validate_indices"] = value
+	}
+}
+
+// Number of unique elements along last dimension of input `set`.
 //
-// When executed in a graph, this op outputs its input tensor as-is.
+// Input `set` is a `SparseTensor` represented by `set_indices`, `set_values`,
+// and `set_shape`. The last dimension contains values in a set, duplicates are
+// allowed but ignored.
 //
-// When building ops to compute gradients, this op prevents the contribution of
-// its inputs to be taken into account.  Normally, the gradient generator adds ops
-// to a graph to compute the derivatives of a specified 'loss' by recursively
-// finding out inputs that contributed to its computation.  If you insert this op
-// in the graph it inputs are masked from the gradient generator.  They are not
-// taken into account for computing gradients.
+// If `validate_indices` is `True`, this op validates the order and range of `set`
+// indices.
 //
-// This is useful any time you want to compute a value with TensorFlow but need
-// to pretend that the value was a constant. Some examples include:
+// Arguments:
+//	set_indices: 2D `Tensor`, indices of a `SparseTensor`.
+//	set_values: 1D `Tensor`, values of a `SparseTensor`.
+//	set_shape: 1D `Tensor`, shape of a `SparseTensor`.
 //
-// *  The *EM* algorithm where the *M-step* should not involve backpropagation
-//    through the output of the *E-step*.
-// *  Contrastive divergence training of Boltzmann machines where, when
-//    differentiating the energy function, the training must not backpropagate
-//    through the graph that generated the samples from the model.
-// *  Adversarial training, where no backprop should happen through the adversarial
-//    example generation process.
-func StopGradient(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns For `set` ranked `n`, this is a `Tensor` with rank `n-1`, and the same 1st
+// `n-1` dimensions as `set`. Each value is the number of unique elements in
+// the corresponding `[0...n-1]` dimension of `set`.
+func SetSize(scope *Scope, set_indices tf.Output, set_values tf.Output, set_shape tf.Output, optional ...SetSizeAttr) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "StopGradient",
+		Type: "SetSize",
 		Input: []tf.Input{
-			input,
+			set_indices, set_values, set_shape,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Eagerly executes a python function to compute func(input)->output. The
+// Computes the sign and the log of the absolute value of the determinant of
 //
-// semantics of the input, output, and attributes are the same as those for
-// PyFunc.
-func EagerPyFunc(scope *Scope, input []tf.Output, token string, Tout []tf.DataType) (output []tf.Output) {
+// one or more square matrices.
+//
+// The input is a tensor of shape `[N, M, M]` whose inner-most 2 dimensions
+// form square matrices. The outputs are two tensors containing the signs and
+// absolute values of the log determinants for all N input submatrices
+// `[..., :, :]` such that the determinant = sign*exp(log_abs_determinant).
+// The log_abs_determinant is computed as det(P)*sum(log(diag(LU))) where LU
+// is the LU decomposition of the input and P is the corresponding
+// permutation matrix.
+//
+// Arguments:
+//	input: Shape is `[N, M, M]`.
+//
+// Returns The signs of the log determinants of the inputs. Shape is `[N]`.The logs of the absolute values of the determinants
+// of the N input matrices.  Shape is `[N]`.
+func LogMatrixDeterminant(scope *Scope, input tf.Output) (sign tf.Output, log_abs_determinant tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"token": token, "Tout": Tout}
 	opspec := tf.OpSpec{
-		Type: "EagerPyFunc",
+		Type: "LogMatrixDeterminant",
 		Input: []tf.Input{
-			tf.OutputList(input),
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("EagerPyFunc", err)
-		return
-	}
-	return output
+	return op.Output(0), op.Output(1)
 }
 
-// Adds sparse updates to the variable referenced by `resource`.
-//
-// This operation computes
-//
-//     # Scalar indices
-//     ref[indices, ...] += updates[...]
+// Copy a tensor setting everything outside a central band in each innermost matrix
 //
-//     # Vector indices (for each i)
-//     ref[indices[i], ...] += updates[i, ...]
+// to zero.
 //
-//     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] += updates[i, ..., j, ...]
+// The `band` part is computed as follows:
+// Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a
+// tensor with the same shape where
 //
-// Duplicate entries are handled correctly: if multiple `indices` reference
-// the same location, their contributions add.
+// `band[i, j, k, ..., m, n] = in_band(m, n) * input[i, j, k, ..., m, n]`.
 //
-// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+// The indicator function
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
-// </div>
+// `in_band(m, n) = (num_lower < 0 || (m-n) <= num_lower)) &&
+//                  (num_upper < 0 || (n-m) <= num_upper)`.
 //
-// Arguments:
-//	resource: Should be from a `Variable` node.
-//	indices: A tensor of indices into the first dimension of `ref`.
-//	updates: A tensor of updated values to add to `ref`.
+// For example:
 //
-// Returns the created operation.
-func ResourceScatterAdd(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceScatterAdd",
-		Input: []tf.Input{
-			resource, indices, updates,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Says whether the targets are in the top `K` predictions.
+// ```
+// # if 'input' is [[ 0,  1,  2, 3]
+//                  [-1,  0,  1, 2]
+//                  [-2, -1,  0, 1]
+//                  [-3, -2, -1, 0]],
 //
-// This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
-// prediction for the target class is among the top `k` predictions among
-// all predictions for example `i`. Note that the behavior of `InTopK` differs
-// from the `TopK` op in its handling of ties; if multiple classes have the
-// same prediction value and straddle the top-`k` boundary, all of those
-// classes are considered to be in the top `k`.
+// tf.matrix_band_part(input, 1, -1) ==> [[ 0,  1,  2, 3]
+//                                        [-1,  0,  1, 2]
+//                                        [ 0, -1,  0, 1]
+//                                        [ 0,  0, -1, 0]],
 //
-// More formally, let
+// tf.matrix_band_part(input, 2, 1) ==> [[ 0,  1,  0, 0]
+//                                       [-1,  0,  1, 0]
+//                                       [-2, -1,  0, 1]
+//                                       [ 0, -2, -1, 0]]
+// ```
 //
-//   \\(predictions_i\\) be the predictions for all classes for example `i`,
-//   \\(targets_i\\) be the target class for example `i`,
-//   \\(out_i\\) be the output for example `i`,
+// Useful special cases:
 //
-// $$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
+// ```
+//  tf.matrix_band_part(input, 0, -1) ==> Upper triangular part.
+//  tf.matrix_band_part(input, -1, 0) ==> Lower triangular part.
+//  tf.matrix_band_part(input, 0, 0) ==> Diagonal.
+// ```
 //
 // Arguments:
-//	predictions: A `batch_size` x `classes` tensor.
-//	targets: A `batch_size` vector of class ids.
-//	k: Number of top elements to look at for computing precision.
+//	input: Rank `k` tensor.
+//	num_lower: 0-D tensor. Number of subdiagonals to keep. If negative, keep entire
+// lower triangle.
+//	num_upper: 0-D tensor. Number of superdiagonals to keep. If negative, keep
+// entire upper triangle.
 //
-// Returns Computed Precision at `k` as a `bool Tensor`.
-func InTopK(scope *Scope, predictions tf.Output, targets tf.Output, k int64) (precision tf.Output) {
+// Returns Rank `k` tensor of the same shape as input. The extracted banded tensor.
+func MatrixBandPart(scope *Scope, input tf.Output, num_lower tf.Output, num_upper tf.Output) (band tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"k": k}
 	opspec := tf.OpSpec{
-		Type: "InTopK",
+		Type: "MatrixBandPart",
 		Input: []tf.Input{
-			predictions, targets,
+			input, num_lower, num_upper,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns (x - y)(x - y) element-wise.
+// Delete the tensor specified by its handle in the session.
 //
-// *NOTE*: `SquaredDifference` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func SquaredDifference(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Arguments:
+//	handle: The handle for a tensor stored in the session state.
+//
+// Returns the created operation.
+func DeleteSessionTensor(scope *Scope, handle tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SquaredDifference",
+		Type: "DeleteSessionTensor",
 		Input: []tf.Input{
-			x, y,
+			handle,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// RandomGammaAttr is an optional argument to RandomGamma.
-type RandomGammaAttr func(optionalAttr)
-
-// RandomGammaSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomGammaSeed(value int64) RandomGammaAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// RandomGammaSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomGammaSeed2(value int64) RandomGammaAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
+	return scope.AddOperation(opspec)
 }
 
-// Outputs random values from the Gamma distribution(s) described by alpha.
+// Subtracts a value from the current value of a variable.
 //
-// This op uses the algorithm by Marsaglia et al. to acquire samples via
-// transformation-rejection from pairs of uniform and normal random variables.
-// See http://dl.acm.org/citation.cfm?id=358414
+// Any ReadVariableOp with a control dependency on this op is guaranteed to
+// see the decremented value or a subsequent newer one.
 //
 // Arguments:
-//	shape: 1-D integer tensor. Shape of independent samples to draw from each
-// distribution described by the shape parameters given in alpha.
-//	alpha: A tensor in which each scalar is a "shape" parameter describing the
-// associated gamma distribution.
+//	resource: handle to the resource in which to store the variable.
+//	value: the value by which the variable will be incremented.
 //
-// Returns A tensor with shape `shape + shape(alpha)`. Each slice
-// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
-// `alpha[i0, i1, ...iN]`. The dtype of the output matches the dtype of alpha.
-func RandomGamma(scope *Scope, shape tf.Output, alpha tf.Output, optional ...RandomGammaAttr) (output tf.Output) {
+// Returns the created operation.
+func AssignSubVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "RandomGamma",
+		Type: "AssignSubVariableOp",
 		Input: []tf.Input{
-			shape, alpha,
+			resource, value,
 		},
-		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Convert the quantized 'input' tensor into a lower-precision 'output', using the
+// RestoreAttr is an optional argument to Restore.
+type RestoreAttr func(optionalAttr)
+
+// RestorePreferredShard sets the optional preferred_shard attribute to value.
 //
-// actual distribution of the values to maximize the usage of the lower bit depth
-// and adjusting the output min and max ranges accordingly.
+// value: Index of file to open first if multiple files match
+// `file_pattern`.
+// If not specified, defaults to -1
+func RestorePreferredShard(value int64) RestoreAttr {
+	return func(m optionalAttr) {
+		m["preferred_shard"] = value
+	}
+}
+
+// Restores a tensor from checkpoint files.
 //
-// [input_min, input_max] are scalar floats that specify the range for the float
-// interpretation of the 'input' data. For example, if input_min is -1.0f and
-// input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
-// value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
+// Reads a tensor stored in one or several files. If there are several files (for
+// instance because a tensor was saved as slices), `file_pattern` may contain
+// wildcard symbols (`*` and `?`) in the filename portion only, not in the
+// directory portion.
 //
-// This operator tries to squeeze as much precision as possible into an output with
-// a lower bit depth by calculating the actual min and max values found in the
-// data. For example, maybe that quint16 input has no values lower than 16,384 and
-// none higher than 49,152. That means only half the range is actually needed, all
-// the float interpretations are between -0.5f and 0.5f, so if we want to compress
-// the data into a quint8 output, we can use that range rather than the theoretical
-// -1.0f to 1.0f that is suggested by the input min and max.
+// If a `file_pattern` matches several files, `preferred_shard` can be used to hint
+// in which file the requested tensor is likely to be found. This op will first
+// open the file at index `preferred_shard` in the list of matching files and try
+// to restore tensors from that file.  Only if some tensors or tensor slices are
+// not found in that first file, then the Op opens all the files. Setting
+// `preferred_shard` to match the value passed as the `shard` input
+// of a matching `Save` Op may speed up Restore.  This attribute only affects
+// performance, not correctness.  The default value -1 means files are processed in
+// order.
 //
-// In practice, this is most useful for taking output from operations like
-// QuantizedMatMul that can produce higher bit-depth outputs than their inputs and
-// may have large potential output ranges, but in practice have a distribution of
-// input values that only uses a small fraction of the possible range. By feeding
-// that output into this operator, we can reduce it from 32 bits down to 8 with
-// minimal loss of accuracy.
+// See also `RestoreSlice`.
 //
 // Arguments:
+//	file_pattern: Must have a single element. The pattern of the files from
+// which we read the tensor.
+//	tensor_name: Must have a single element. The name of the tensor to be
+// restored.
+//	dt: The type of the tensor to be restored.
 //
-//	input_min: The float value that the minimum quantized input value represents.
-//	input_max: The float value that the maximum quantized input value represents.
-//	out_type: The type of the output. Should be a lower bit depth than Tinput.
-//
-// Returns The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents.
-func QuantizeDownAndShrinkRange(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, out_type tf.DataType) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+// Returns The restored tensor.
+func Restore(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, dt tf.DataType, optional ...RestoreAttr) (tensor tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type}
+	attrs := map[string]interface{}{"dt": dt}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "QuantizeDownAndShrinkRange",
+		Type: "Restore",
 		Input: []tf.Input{
-			input, input_min, input_max,
+			file_pattern, tensor_name,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Returns the truth value of x OR y element-wise.
+// QuantizedResizeBilinearAttr is an optional argument to QuantizedResizeBilinear.
+type QuantizedResizeBilinearAttr func(optionalAttr)
+
+// QuantizedResizeBilinearAlignCorners sets the optional align_corners attribute to value.
 //
-// *NOTE*: `LogicalOr` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func LogicalOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "LogicalOr",
-		Input: []tf.Input{
-			x, y,
-		},
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
+// If not specified, defaults to false
+func QuantizedResizeBilinearAlignCorners(value bool) QuantizedResizeBilinearAttr {
+	return func(m optionalAttr) {
+		m["align_corners"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Selects elements from `x` or `y`, depending on `condition`.
-//
-// The `x`, and `y` tensors must all have the same shape, and the
-// output will also have that shape.
-//
-// The `condition` tensor must be a scalar if `x` and `y` are scalars.
-// If `x` and `y` are vectors or higher rank, then `condition` must be either a
-// scalar, a vector with size matching the first dimension of `x`, or must have
-// the same shape as `x`.
-//
-// The `condition` tensor acts as a mask that chooses, based on the value at each
-// element, whether the corresponding element / row in the output should be
-// taken from `x` (if true) or `y` (if false).
-//
-// If `condition` is a vector and `x` and `y` are higher rank matrices, then
-// it chooses which row (outer dimension) to copy from `x` and `y`.
-// If `condition` has the same shape as `x` and `y`, then it chooses which
-// element to copy from `x` and `y`.
-//
-// For example:
-//
-// ```python
-// # 'condition' tensor is [[True,  False]
-// #                        [False, True]]
-// # 't' is [[1, 2],
-// #         [3, 4]]
-// # 'e' is [[5, 6],
-// #         [7, 8]]
-// select(condition, t, e)  # => [[1, 6], [7, 4]]
-//
-//
-// # 'condition' tensor is [True, False]
-// # 't' is [[1, 2],
-// #         [3, 4]]
-// # 'e' is [[5, 6],
-// #         [7, 8]]
-// select(condition, t, e) ==> [[1, 2],
-//                              [7, 8]]
+// Resize quantized `images` to `size` using quantized bilinear interpolation.
 //
-// ```
+// Input images and output images must be quantized types.
 //
 // Arguments:
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
 //
-//	x: = A `Tensor` which may have the same shape as `condition`.
-// If `condition` is rank 1, `x` may have higher rank,
-// but its first dimension must match the size of `condition`.
-//	y: = A `Tensor` with the same type and shape as `x`.
 //
-// Returns = A `Tensor` with the same type and shape as `x` and `y`.
-func Select(scope *Scope, condition tf.Output, x tf.Output, y tf.Output) (output tf.Output) {
+//
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func QuantizedResizeBilinear(scope *Scope, images tf.Output, size tf.Output, min tf.Output, max tf.Output, optional ...QuantizedResizeBilinearAttr) (resized_images tf.Output, out_min tf.Output, out_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Select",
+		Type: "QuantizedResizeBilinear",
 		Input: []tf.Input{
-			condition, x, y,
+			images, size, min, max,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// MatMulAttr is an optional argument to MatMul.
-type MatMulAttr func(optionalAttr)
+// SdcaOptimizerAttr is an optional argument to SdcaOptimizer.
+type SdcaOptimizerAttr func(optionalAttr)
 
-// MatMulTransposeA sets the optional transpose_a attribute to value.
+// SdcaOptimizerAdaptative sets the optional adaptative attribute to value.
 //
-// value: If true, "a" is transposed before multiplication.
-// If not specified, defaults to false
-func MatMulTransposeA(value bool) MatMulAttr {
+// value: Whether to use Adaptive SDCA for the inner loop.
+// If not specified, defaults to true
+func SdcaOptimizerAdaptative(value bool) SdcaOptimizerAttr {
 	return func(m optionalAttr) {
-		m["transpose_a"] = value
+		m["adaptative"] = value
 	}
 }
 
-// MatMulTransposeB sets the optional transpose_b attribute to value.
+// Distributed version of Stochastic Dual Coordinate Ascent (SDCA) optimizer for
 //
-// value: If true, "b" is transposed before multiplication.
-// If not specified, defaults to false
-func MatMulTransposeB(value bool) MatMulAttr {
-	return func(m optionalAttr) {
-		m["transpose_b"] = value
-	}
-}
-
-// Multiply the matrix "a" by the matrix "b".
+// linear models with L1 + L2 regularization. As global optimization objective is
+// strongly-convex, the optimizer optimizes the dual objective at each step. The
+// optimizer applies each update one example at a time. Examples are sampled
+// uniformly, and the optimizer is learning rate free and enjoys linear convergence
+// rate.
 //
-// The inputs must be two-dimensional matrices and the inner dimension of
-// "a" (after being transposed if transpose_a is true) must match the
-// outer dimension of "b" (after being transposed if transposed_b is
-// true).
+// [Proximal Stochastic Dual Coordinate Ascent](http://arxiv.org/pdf/1211.2717v1.pdf).<br>
+// Shai Shalev-Shwartz, Tong Zhang. 2012
 //
-// *Note*: The default kernel implementation for MatMul on GPUs uses
-// cublas.
-func MatMul(scope *Scope, a tf.Output, b tf.Output, optional ...MatMulAttr) (product tf.Output) {
+// $$Loss Objective = \sum f_{i} (wx_{i}) + (l2 / 2) * |w|^2 + l1 * |w|$$
+//
+// [Adding vs. Averaging in Distributed Primal-Dual Optimization](http://arxiv.org/abs/1502.03508).<br>
+// Chenxin Ma, Virginia Smith, Martin Jaggi, Michael I. Jordan,
+// Peter Richtarik, Martin Takac. 2015
+//
+// [Stochastic Dual Coordinate Ascent with Adaptive Probabilities](https://arxiv.org/abs/1502.08053).<br>
+// Dominik Csiba, Zheng Qu, Peter Richtarik. 2015
+//
+// Arguments:
+//	sparse_example_indices: a list of vectors which contain example indices.
+//	sparse_feature_indices: a list of vectors which contain feature indices.
+//	sparse_feature_values: a list of vectors which contains feature value
+// associated with each feature group.
+//	dense_features: a list of matrices which contains the dense feature values.
+//	example_weights: a vector which contains the weight associated with each
+// example.
+//	example_labels: a vector which contains the label/target associated with each
+// example.
+//	sparse_indices: a list of vectors where each value is the indices which has
+// corresponding weights in sparse_weights. This field maybe omitted for the
+// dense approach.
+//	sparse_weights: a list of vectors where each value is the weight associated with
+// a sparse feature group.
+//	dense_weights: a list of vectors where the values are the weights associated
+// with a dense feature group.
+//	example_state_data: a list of vectors containing the example state data.
+//	loss_type: Type of the primal loss. Currently SdcaSolver supports logistic,
+// squared and hinge losses.
+//	l1: Symmetric l1 regularization strength.
+//	l2: Symmetric l2 regularization strength.
+//	num_loss_partitions: Number of partitions of the global loss function.
+//	num_inner_iterations: Number of iterations per mini-batch.
+//
+// Returns a list of vectors containing the updated example state
+// data.a list of vectors where each value is the delta
+// weights associated with a sparse feature group.a list of vectors where the values are the delta
+// weights associated with a dense feature group.
+func SdcaOptimizer(scope *Scope, sparse_example_indices []tf.Output, sparse_feature_indices []tf.Output, sparse_feature_values []tf.Output, dense_features []tf.Output, example_weights tf.Output, example_labels tf.Output, sparse_indices []tf.Output, sparse_weights []tf.Output, dense_weights []tf.Output, example_state_data tf.Output, loss_type string, l1 float32, l2 float32, num_loss_partitions int64, num_inner_iterations int64, optional ...SdcaOptimizerAttr) (out_example_state_data tf.Output, out_delta_sparse_weights []tf.Output, out_delta_dense_weights []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"loss_type": loss_type, "l1": l1, "l2": l2, "num_loss_partitions": num_loss_partitions, "num_inner_iterations": num_inner_iterations}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MatMul",
+		Type: "SdcaOptimizer",
 		Input: []tf.Input{
-			a, b,
+			tf.OutputList(sparse_example_indices), tf.OutputList(sparse_feature_indices), tf.OutputList(sparse_feature_values), tf.OutputList(dense_features), example_weights, example_labels, tf.OutputList(sparse_indices), tf.OutputList(sparse_weights), tf.OutputList(dense_weights), example_state_data,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// SparseMatMulAttr is an optional argument to SparseMatMul.
-type SparseMatMulAttr func(optionalAttr)
-
-// SparseMatMulTransposeA sets the optional transpose_a attribute to value.
-// If not specified, defaults to false
-func SparseMatMulTransposeA(value bool) SparseMatMulAttr {
-	return func(m optionalAttr) {
-		m["transpose_a"] = value
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// SparseMatMulTransposeB sets the optional transpose_b attribute to value.
-// If not specified, defaults to false
-func SparseMatMulTransposeB(value bool) SparseMatMulAttr {
-	return func(m optionalAttr) {
-		m["transpose_b"] = value
+	var idx int
+	var err error
+	out_example_state_data = op.Output(idx)
+	if out_delta_sparse_weights, idx, err = makeOutputList(op, idx, "out_delta_sparse_weights"); err != nil {
+		scope.UpdateErr("SdcaOptimizer", err)
+		return
+	}
+	if out_delta_dense_weights, idx, err = makeOutputList(op, idx, "out_delta_dense_weights"); err != nil {
+		scope.UpdateErr("SdcaOptimizer", err)
+		return
 	}
+	return out_example_state_data, out_delta_sparse_weights, out_delta_dense_weights
 }
 
-// SparseMatMulAIsSparse sets the optional a_is_sparse attribute to value.
-// If not specified, defaults to false
-func SparseMatMulAIsSparse(value bool) SparseMatMulAttr {
+// MatrixTriangularSolveAttr is an optional argument to MatrixTriangularSolve.
+type MatrixTriangularSolveAttr func(optionalAttr)
+
+// MatrixTriangularSolveLower sets the optional lower attribute to value.
+//
+// value: Boolean indicating whether the innermost matrices in `matrix` are
+// lower or upper triangular.
+// If not specified, defaults to true
+func MatrixTriangularSolveLower(value bool) MatrixTriangularSolveAttr {
 	return func(m optionalAttr) {
-		m["a_is_sparse"] = value
+		m["lower"] = value
 	}
 }
 
-// SparseMatMulBIsSparse sets the optional b_is_sparse attribute to value.
+// MatrixTriangularSolveAdjoint sets the optional adjoint attribute to value.
+//
+// value: Boolean indicating whether to solve with `matrix` or its (block-wise)
+//          adjoint.
+//
+// @compatibility(numpy)
+// Equivalent to scipy.linalg.solve_triangular
+// @end_compatibility
 // If not specified, defaults to false
-func SparseMatMulBIsSparse(value bool) SparseMatMulAttr {
+func MatrixTriangularSolveAdjoint(value bool) MatrixTriangularSolveAttr {
 	return func(m optionalAttr) {
-		m["b_is_sparse"] = value
+		m["adjoint"] = value
 	}
 }
 
-// Multiply matrix "a" by matrix "b".
+// Solves systems of linear equations with upper or lower triangular matrices by
 //
-// The inputs must be two-dimensional matrices and the inner dimension of "a" must
-// match the outer dimension of "b". Both "a" and "b" must be `Tensor`s not
-// `SparseTensor`s.  This op is optimized for the case where at least one of "a" or
-// "b" is sparse, in the sense that they have a large proportion of zero values.
-// The breakeven for using this versus a dense matrix multiply on one platform was
-// 30% zero values in the sparse matrix.
+// backsubstitution.
 //
-// The gradient computation of this operation will only take advantage of sparsity
-// in the input gradient when that gradient comes from a Relu.
-func SparseMatMul(scope *Scope, a tf.Output, b tf.Output, optional ...SparseMatMulAttr) (product tf.Output) {
+// `matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form
+// square matrices. If `lower` is `True` then the strictly upper triangular part
+// of each inner-most matrix is assumed to be zero and not accessed.
+// If `lower` is False then the strictly lower triangular part of each inner-most
+// matrix is assumed to be zero and not accessed.
+// `rhs` is a tensor of shape `[..., M, K]`.
+//
+// The output is a tensor of shape `[..., M, K]`. If `adjoint` is
+// `True` then the innermost matrices in `output` satisfy matrix equations
+// `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
+// If `adjoint` is `False` then the strictly then the  innermost matrices in
+// `output` satisfy matrix equations
+// `adjoint(matrix[..., i, k]) * output[..., k, j] = rhs[..., i, j]`.
+//
+// Arguments:
+//	matrix: Shape is `[..., M, M]`.
+//	rhs: Shape is `[..., M, K]`.
+//
+// Returns Shape is `[..., M, K]`.
+func MatrixTriangularSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixTriangularSolveAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -22943,9 +22601,9 @@ func SparseMatMul(scope *Scope, a tf.Output, b tf.Output, optional ...SparseMatM
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseMatMul",
+		Type: "MatrixTriangularSolve",
 		Input: []tf.Input{
-			a, b,
+			matrix, rhs,
 		},
 		Attrs: attrs,
 	}
@@ -22953,174 +22611,168 @@ func SparseMatMul(scope *Scope, a tf.Output, b tf.Output, optional ...SparseMatM
 	return op.Output(0)
 }
 
-// ExperimentalThreadPoolHandleAttr is an optional argument to ExperimentalThreadPoolHandle.
-type ExperimentalThreadPoolHandleAttr func(optionalAttr)
-
-// ExperimentalThreadPoolHandleMaxIntraOpParallelism sets the optional max_intra_op_parallelism attribute to value.
+// Saves tensors in V2 checkpoint format.
 //
-// value: The maximum degree of parallelism to use within operations that execute on this
-// threadpool.
-// If not specified, defaults to 1
-func ExperimentalThreadPoolHandleMaxIntraOpParallelism(value int64) ExperimentalThreadPoolHandleAttr {
-	return func(m optionalAttr) {
-		m["max_intra_op_parallelism"] = value
-	}
-}
-
-// ExperimentalThreadPoolHandleContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func ExperimentalThreadPoolHandleContainer(value string) ExperimentalThreadPoolHandleAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// ExperimentalThreadPoolHandleSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func ExperimentalThreadPoolHandleSharedName(value string) ExperimentalThreadPoolHandleAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Creates a dataset that uses a custom thread pool to compute `input_dataset`.
+// By default, saves the named tensors in full.  If the caller wishes to save
+// specific slices of full tensors, "shape_and_slices" should be non-empty strings
+// and correspondingly well-formed.
 //
 // Arguments:
-//	num_threads: The number of threads in the thread pool.
-//	display_name: A human-readable name for the threads that may be visible in some
-// visualizations.
-// threadpool.
+//	prefix: Must have a single element. The prefix of the V2 checkpoint to which we
+// write the tensors.
+//	tensor_names: shape {N}. The names of the tensors to be saved.
+//	shape_and_slices: shape {N}.  The slice specs of the tensors to be saved.
+// Empty strings indicate that they are non-partitioned tensors.
+//	tensors: `N` tensors to save.
 //
-// Returns A resource that can be consumed by one or more ExperimentalThreadPoolDataset
-// ops.
-func ExperimentalThreadPoolHandle(scope *Scope, num_threads int64, display_name string, optional ...ExperimentalThreadPoolHandleAttr) (handle tf.Output) {
+// Returns the created operation.
+func SaveV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, tensors []tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_threads": num_threads, "display_name": display_name}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ExperimentalThreadPoolHandle",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// CudnnRNNCanonicalToParamsAttr is an optional argument to CudnnRNNCanonicalToParams.
-type CudnnRNNCanonicalToParamsAttr func(optionalAttr)
-
-// CudnnRNNCanonicalToParamsRnnMode sets the optional rnn_mode attribute to value.
-// If not specified, defaults to "lstm"
-func CudnnRNNCanonicalToParamsRnnMode(value string) CudnnRNNCanonicalToParamsAttr {
-	return func(m optionalAttr) {
-		m["rnn_mode"] = value
-	}
-}
-
-// CudnnRNNCanonicalToParamsInputMode sets the optional input_mode attribute to value.
-// If not specified, defaults to "linear_input"
-func CudnnRNNCanonicalToParamsInputMode(value string) CudnnRNNCanonicalToParamsAttr {
-	return func(m optionalAttr) {
-		m["input_mode"] = value
+		Type: "SaveV2",
+		Input: []tf.Input{
+			prefix, tensor_names, shape_and_slices, tf.OutputList(tensors),
+		},
 	}
+	return scope.AddOperation(opspec)
 }
 
-// CudnnRNNCanonicalToParamsDirection sets the optional direction attribute to value.
-// If not specified, defaults to "unidirectional"
-func CudnnRNNCanonicalToParamsDirection(value string) CudnnRNNCanonicalToParamsAttr {
-	return func(m optionalAttr) {
-		m["direction"] = value
-	}
-}
+// UnicodeTranscodeAttr is an optional argument to UnicodeTranscode.
+type UnicodeTranscodeAttr func(optionalAttr)
 
-// CudnnRNNCanonicalToParamsDropout sets the optional dropout attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNCanonicalToParamsDropout(value float32) CudnnRNNCanonicalToParamsAttr {
+// UnicodeTranscodeErrors sets the optional errors attribute to value.
+//
+// value: Error handling policy when there is invalid formatting found in the input.
+// The value of 'strict' will cause the operation to produce a InvalidArgument
+// error on any invalid input formatting. A value of 'replace' (the default) will
+// cause the operation to replace any invalid formatting in the input with the
+// `replacement_char` codepoint. A value of 'ignore' will cause the operation to
+// skip any invalid formatting in the input and produce no corresponding output
+// character.
+// If not specified, defaults to "replace"
+func UnicodeTranscodeErrors(value string) UnicodeTranscodeAttr {
 	return func(m optionalAttr) {
-		m["dropout"] = value
+		m["errors"] = value
 	}
 }
 
-// CudnnRNNCanonicalToParamsSeed sets the optional seed attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNCanonicalToParamsSeed(value int64) CudnnRNNCanonicalToParamsAttr {
+// UnicodeTranscodeReplacementChar sets the optional replacement_char attribute to value.
+//
+// value: The replacement character codepoint to be used in place of any invalid
+// formatting in the input when `errors='replace'`. Any valid unicode codepoint may
+// be used. The default value is the default unicode replacement character is
+// 0xFFFD or U+65533.)
+//
+// Note that for UTF-8, passing a replacement character expressible in 1 byte, such
+// as ' ', will preserve string alignment to the source since invalid bytes will be
+// replaced with a 1-byte replacement. For UTF-16-BE and UTF-16-LE, any 1 or 2 byte
+// replacement character will preserve byte alignment to the source.
+// If not specified, defaults to 65533
+func UnicodeTranscodeReplacementChar(value int64) UnicodeTranscodeAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["replacement_char"] = value
 	}
 }
 
-// CudnnRNNCanonicalToParamsSeed2 sets the optional seed2 attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNCanonicalToParamsSeed2(value int64) CudnnRNNCanonicalToParamsAttr {
+// UnicodeTranscodeReplaceControlCharacters sets the optional replace_control_characters attribute to value.
+//
+// value: Whether to replace the C0 control characters (00-1F) with the
+// `replacement_char`. Default is false.
+// If not specified, defaults to false
+func UnicodeTranscodeReplaceControlCharacters(value bool) UnicodeTranscodeAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["replace_control_characters"] = value
 	}
 }
 
-// Converts CudnnRNN params from canonical form to usable form.
+// Transcode the input text from a source encoding to a destination encoding.
 //
-// Writes a set of weights into the opaque params buffer so they can be used in
-// upcoming training or inferences.
+// The input is a string tensor of any shape. The output is a string tensor of
+// the same shape containing the transcoded strings. Output strings are always
+// valid unicode. If the input contains invalid encoding positions, the
+// `errors` attribute sets the policy for how to deal with them. If the default
+// error-handling policy is used, invalid formatting will be substituted in the
+// output by the `replacement_char`. If the errors policy is to `ignore`, any
+// invalid encoding positions in the input are skipped and not included in the
+// output. If it set to `strict` then any invalid formatting will result in an
+// InvalidArgument error.
 //
-// Note that the params buffer may not be compatible across different GPUs. So any
-// save and restoration should be converted to and from the canonical weights and
-// biases.
+// This operation can be used with `output_encoding = input_encoding` to enforce
+// correct formatting for inputs even if they are already in the desired encoding.
 //
-// num_layers: Specifies the number of layers in the RNN model.
-// num_units: Specifies the size of the hidden state.
-// input_size: Specifies the size of the input state.
-// weights: the canonical form of weights that can be used for saving
-//     and restoration. They are more likely to be compatible across different
-//     generations.
-// biases: the canonical form of biases that can be used for saving
-//     and restoration. They are more likely to be compatible across different
-//     generations.
-// num_params: number of parameter sets for all layers.
-//     Each layer may contain multiple parameter sets, with each set consisting of
-//     a weight matrix and a bias vector.
-// rnn_mode: Indicates the type of the RNN model.
-// input_mode: Indicate whether there is a linear projection between the input and
-//     The actual computation before the first layer. 'skip_input' is only allowed
-//     when input_size == num_units; 'auto_select' implies 'skip_input' when
-//     input_size == num_units; otherwise, it implies 'linear_input'.
-// direction: Indicates whether a bidirectional model will be used.
-//     dir = (direction == bidirectional) ? 2 : 1
-// dropout: dropout probability. When set to 0., dropout is disabled.
-// seed: the 1st part of a seed to initialize dropout.
-// seed2: the 2nd part of a seed to initialize dropout.
-func CudnnRNNCanonicalToParams(scope *Scope, num_layers tf.Output, num_units tf.Output, input_size tf.Output, weights []tf.Output, biases []tf.Output, optional ...CudnnRNNCanonicalToParamsAttr) (params tf.Output) {
+// If the input is prefixed by a Byte Order Mark needed to determine encoding
+// (e.g. if the encoding is UTF-16 and the BOM indicates big-endian), then that
+// BOM will be consumed and not emitted into the output. If the input encoding
+// is marked with an explicit endianness (e.g. UTF-16-BE), then the BOM is
+// interpreted as a non-breaking-space and is preserved in the output (including
+// always for UTF-8).
+//
+// The end result is that if the input is marked as an explicit endianness the
+// transcoding is faithful to all codepoints in the source. If it is not marked
+// with an explicit endianness, the BOM is not considered part of the string itself
+// but as metadata, and so is not preserved in the output.
+//
+// Arguments:
+//	input: The text to be processed. Can have any shape.
+//	input_encoding: Text encoding of the input strings. This is any of the encodings supported
+// by ICU ucnv algorithmic converters. Examples: `"UTF-16", "US ASCII", "UTF-8"`.
+//	output_encoding: The unicode encoding to use in the output. Must be one of
+// `"UTF-8", "UTF-16-BE", "UTF-32-BE"`. Multi-byte encodings will be big-endian.
+//
+// Returns A string tensor containing unicode text encoded using `output_encoding`.
+func UnicodeTranscode(scope *Scope, input tf.Output, input_encoding string, output_encoding string, optional ...UnicodeTranscodeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"input_encoding": input_encoding, "output_encoding": output_encoding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "CudnnRNNCanonicalToParams",
+		Type: "UnicodeTranscode",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes inverse hyperbolic sine of x element-wise.
+func Asinh(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Asinh",
 		Input: []tf.Input{
-			num_layers, num_units, input_size, tf.OutputList(weights), tf.OutputList(biases),
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a dataset containing elements of first component of `input_dataset` having true in the last component.
-func FilterByLastComponentDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (output tf.Output) {
+// Creates a dataset with a range of values. Corresponds to python's xrange.
+//
+// Arguments:
+//	start: corresponds to start in python's xrange().
+//	stop: corresponds to stop in python's xrange().
+//	step: corresponds to step in python's xrange().
+//
+//
+func RangeDataset(scope *Scope, start tf.Output, stop tf.Output, step tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "FilterByLastComponentDataset",
+		Type: "RangeDataset",
 		Input: []tf.Input{
-			input_dataset,
+			start, stop, step,
 		},
 		Attrs: attrs,
 	}
@@ -23128,99 +22780,102 @@ func FilterByLastComponentDataset(scope *Scope, input_dataset tf.Output, output_
 	return op.Output(0)
 }
 
-// SumAttr is an optional argument to Sum.
-type SumAttr func(optionalAttr)
-
-// SumKeepDims sets the optional keep_dims attribute to value.
+// Stops gradient computation.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func SumKeepDims(value bool) SumAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
-
-// Computes the sum of elements across dimensions of a tensor.
+// When executed in a graph, this op outputs its input tensor as-is.
 //
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// When building ops to compute gradients, this op prevents the contribution of
+// its inputs to be taken into account.  Normally, the gradient generator adds ops
+// to a graph to compute the derivatives of a specified 'loss' by recursively
+// finding out inputs that contributed to its computation.  If you insert this op
+// in the graph it inputs are masked from the gradient generator.  They are not
+// taken into account for computing gradients.
 //
-// Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
+// This is useful any time you want to compute a value with TensorFlow but need
+// to pretend that the value was a constant. Some examples include:
 //
-// Returns The reduced tensor.
-func Sum(scope *Scope, input tf.Output, axis tf.Output, optional ...SumAttr) (output tf.Output) {
+// *  The *EM* algorithm where the *M-step* should not involve backpropagation
+//    through the output of the *E-step*.
+// *  Contrastive divergence training of Boltzmann machines where, when
+//    differentiating the energy function, the training must not backpropagate
+//    through the graph that generated the samples from the model.
+// *  Adversarial training, where no backprop should happen through the adversarial
+//    example generation process.
+func StopGradient(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Sum",
+		Type: "StopGradient",
 		Input: []tf.Input{
-			input, axis,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// EnterAttr is an optional argument to Enter.
-type EnterAttr func(optionalAttr)
-
-// EnterIsConstant sets the optional is_constant attribute to value.
+// Eagerly executes a python function to compute func(input)->output. The
 //
-// value: If true, the output is constant within the child frame.
-// If not specified, defaults to false
-func EnterIsConstant(value bool) EnterAttr {
-	return func(m optionalAttr) {
-		m["is_constant"] = value
+// semantics of the input, output, and attributes are the same as those for
+// PyFunc.
+func EagerPyFunc(scope *Scope, input []tf.Output, token string, Tout []tf.DataType) (output []tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// EnterParallelIterations sets the optional parallel_iterations attribute to value.
-//
-// value: The number of iterations allowed to run in parallel.
-// If not specified, defaults to 10
-func EnterParallelIterations(value int64) EnterAttr {
-	return func(m optionalAttr) {
-		m["parallel_iterations"] = value
+	attrs := map[string]interface{}{"token": token, "Tout": Tout}
+	opspec := tf.OpSpec{
+		Type: "EagerPyFunc",
+		Input: []tf.Input{
+			tf.OutputList(input),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("EagerPyFunc", err)
+		return
 	}
+	return output
 }
 
-// Creates or finds a child frame, and makes `data` available to the child frame.
+// Says whether the targets are in the top `K` predictions.
 //
-// This op is used together with `Exit` to create loops in the graph.
-// The unique `frame_name` is used by the `Executor` to identify frames. If
-// `is_constant` is true, `output` is a constant in the child frame; otherwise
-// it may be changed in the child frame. At most `parallel_iterations` iterations
-// are run in parallel in the child frame.
+// This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
+// prediction for the target class is among the top `k` predictions among
+// all predictions for example `i`. Note that the behavior of `InTopK` differs
+// from the `TopK` op in its handling of ties; if multiple classes have the
+// same prediction value and straddle the top-`k` boundary, all of those
+// classes are considered to be in the top `k`.
+//
+// More formally, let
+//
+//   \\(predictions_i\\) be the predictions for all classes for example `i`,
+//   \\(targets_i\\) be the target class for example `i`,
+//   \\(out_i\\) be the output for example `i`,
+//
+// $$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
 //
 // Arguments:
-//	data: The tensor to be made available to the child frame.
-//	frame_name: The name of the child frame.
+//	predictions: A `batch_size` x `classes` tensor.
+//	targets: A `batch_size` vector of class ids.
+//	k: Number of top elements to look at for computing precision.
 //
-// Returns The same tensor as `data`.
-func Enter(scope *Scope, data tf.Output, frame_name string, optional ...EnterAttr) (output tf.Output) {
+// Returns Computed Precision at `k` as a `bool Tensor`.
+func InTopK(scope *Scope, predictions tf.Output, targets tf.Output, k int64) (precision tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"frame_name": frame_name}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"k": k}
 	opspec := tf.OpSpec{
-		Type: "Enter",
+		Type: "InTopK",
 		Input: []tf.Input{
-			data,
+			predictions, targets,
 		},
 		Attrs: attrs,
 	}
@@ -23228,128 +22883,65 @@ func Enter(scope *Scope, data tf.Output, frame_name string, optional ...EnterAtt
 	return op.Output(0)
 }
 
-// Add all input tensors element wise.
+// Returns (x - y)(x - y) element-wise.
 //
-// Arguments:
-//	inputs: Must all be the same size and shape.
-func AddN(scope *Scope, inputs []tf.Output) (sum tf.Output) {
+// *NOTE*: `SquaredDifference` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func SquaredDifference(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "AddN",
+		Type: "SquaredDifference",
 		Input: []tf.Input{
-			tf.OutputList(inputs),
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TryRpcAttr is an optional argument to TryRpc.
-type TryRpcAttr func(optionalAttr)
-
-// TryRpcProtocol sets the optional protocol attribute to value.
-//
-// value: RPC protocol to use.  Empty string means use the default protocol.
-// Options include 'grpc'.
-// If not specified, defaults to ""
-func TryRpcProtocol(value string) TryRpcAttr {
-	return func(m optionalAttr) {
-		m["protocol"] = value
-	}
-}
+// RandomGammaAttr is an optional argument to RandomGamma.
+type RandomGammaAttr func(optionalAttr)
 
-// TryRpcFailFast sets the optional fail_fast attribute to value.
+// RandomGammaSeed sets the optional seed attribute to value.
 //
-// value: `boolean`. If `true` (default), then failures to connect
-// (i.e., the server does not immediately respond) cause an RPC failure.
-// If not specified, defaults to true
-func TryRpcFailFast(value bool) TryRpcAttr {
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomGammaSeed(value int64) RandomGammaAttr {
 	return func(m optionalAttr) {
-		m["fail_fast"] = value
+		m["seed"] = value
 	}
 }
 
-// TryRpcTimeoutInMs sets the optional timeout_in_ms attribute to value.
+// RandomGammaSeed2 sets the optional seed2 attribute to value.
 //
-// value: `int`. If `0` (default), then the kernel will run the RPC
-// request and only time out if the RPC deadline passes or the session times out.
-// If this value is greater than `0`, then the op will raise an exception if
-// the RPC takes longer than `timeout_in_ms`.
+// value: A second seed to avoid seed collision.
 // If not specified, defaults to 0
-func TryRpcTimeoutInMs(value int64) TryRpcAttr {
+func RandomGammaSeed2(value int64) RandomGammaAttr {
 	return func(m optionalAttr) {
-		m["timeout_in_ms"] = value
+		m["seed2"] = value
 	}
 }
 
-// Perform batches of RPC requests.
-//
-// This op asynchronously performs either a single RPC request, or a batch
-// of requests.  RPC requests are defined by three main parameters:
-//
-//   - `address` (the host+port or BNS address of the request)
-//   - `method` (the method name for the request)
-//   - `request` (the serialized proto string, or vector of strings,
-//      of the RPC request argument).
-//
-// For example, if you have an RPC service running on port localhost:2345,
-// and its interface is configured with the following proto declaration:
-//
-// ```
-// service MyService {
-//   rpc MyMethod(MyRequestProto) returns (MyResponseProto) {
-//   }
-// };
-// ```
-//
-// then call this op with arguments:
-//
-// ```
-// address = "localhost:2345"
-// method = "MyService/MyMethod"
-// ```
-//
-// The `request` tensor is a string tensor representing serialized `MyRequestProto`
-// strings; and the output string tensor `response` will have the same shape
-// and contain (upon successful completion) corresponding serialized
-// `MyResponseProto` strings.
-//
-// For example, to send a single, empty, `MyRequestProto`, call
-// this op with `request = ""`.  To send 5 **parallel** empty requests,
-// call this op with `request = ["", "", "", "", ""]`.
-//
-// More generally, one can create a batch of `MyRequestProto` serialized protos
-// from regular batched tensors using the `encode_proto` op, and convert
-// the response `MyResponseProto` serialized protos to batched tensors
-// using the `decode_proto` op.
-//
-// **NOTE** Working with serialized proto strings is faster than instantiating
-// actual proto objects in memory, so no performance degradation is expected
-// compared to writing custom kernels for this workflow.
+// Outputs random values from the Gamma distribution(s) described by alpha.
 //
-// Unlike the standard `Rpc` op, if the connection fails or the remote worker
-// returns an error status, this op does **not** reraise the exception.
-// Instead, the `status_code` and `status_message` entry for the corresponding RPC
-// call is set with the error returned from the RPC call.  The `response` tensor
-// will contain valid response values for those minibatch entries whose RPCs did
-// not fail; the rest of the entries will have empty strings.
+// This op uses the algorithm by Marsaglia et al. to acquire samples via
+// transformation-rejection from pairs of uniform and normal random variables.
+// See http://dl.acm.org/citation.cfm?id=358414
 //
 // Arguments:
-//	address: `0-D` or `1-D`.  The address (i.e. host_name:port) of the RPC server.
-// If this tensor has more than 1 element, then multiple parallel rpc requests
-// are sent.  This argument broadcasts with `method` and `request`.
-//	method: `0-D` or `1-D`.  The method address on the RPC server.
-// If this tensor has more than 1 element, then multiple parallel rpc requests
-// are sent.  This argument broadcasts with `address` and `request`.
-//	request: `0-D` or `1-D`.  Serialized proto strings: the rpc request argument.
-// If this tensor has more than 1 element, then multiple parallel rpc requests
-// are sent.  This argument broadcasts with `address` and `method`.
+//	shape: 1-D integer tensor. Shape of independent samples to draw from each
+// distribution described by the shape parameters given in alpha.
+//	alpha: A tensor in which each scalar is a "shape" parameter describing the
+// associated gamma distribution.
 //
-// Returns Same shape as `request`. Serialized proto strings: the rpc responses.Same shape as `request`.  Values correspond to tensorflow Status enum codes.Same shape as `request`.  Values correspond to Status messages
-// returned from the RPC calls.
-func TryRpc(scope *Scope, address tf.Output, method tf.Output, request tf.Output, optional ...TryRpcAttr) (response tf.Output, status_code tf.Output, status_message tf.Output) {
+// Returns A tensor with shape `shape + shape(alpha)`. Each slice
+// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
+// `alpha[i0, i1, ...iN]`. The dtype of the output matches the dtype of alpha.
+func RandomGamma(scope *Scope, shape tf.Output, alpha tf.Output, optional ...RandomGammaAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -23358,107 +22950,95 @@ func TryRpc(scope *Scope, address tf.Output, method tf.Output, request tf.Output
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TryRpc",
+		Type: "RandomGamma",
 		Input: []tf.Input{
-			address, method, request,
+			shape, alpha,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// InitializeTableFromTextFileV2Attr is an optional argument to InitializeTableFromTextFileV2.
-type InitializeTableFromTextFileV2Attr func(optionalAttr)
-
-// InitializeTableFromTextFileV2VocabSize sets the optional vocab_size attribute to value.
-//
-// value: Number of elements of the file, use -1 if unknown.
-// If not specified, defaults to -1
+// Convert the quantized 'input' tensor into a lower-precision 'output', using the
 //
-// REQUIRES: value >= -1
-func InitializeTableFromTextFileV2VocabSize(value int64) InitializeTableFromTextFileV2Attr {
-	return func(m optionalAttr) {
-		m["vocab_size"] = value
-	}
-}
-
-// InitializeTableFromTextFileV2Delimiter sets the optional delimiter attribute to value.
+// actual distribution of the values to maximize the usage of the lower bit depth
+// and adjusting the output min and max ranges accordingly.
 //
-// value: Delimiter to separate fields in a line.
-// If not specified, defaults to "\t"
-func InitializeTableFromTextFileV2Delimiter(value string) InitializeTableFromTextFileV2Attr {
-	return func(m optionalAttr) {
-		m["delimiter"] = value
-	}
-}
-
-// Initializes a table from a text file.
+// [input_min, input_max] are scalar floats that specify the range for the float
+// interpretation of the 'input' data. For example, if input_min is -1.0f and
+// input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
+// value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
 //
-// It inserts one key-value pair into the table for each line of the file.
-// The key and value is extracted from the whole line content, elements from the
-// split line based on `delimiter` or the line number (starting from zero).
-// Where to extract the key and value from a line is specified by `key_index` and
-// `value_index`.
+// This operator tries to squeeze as much precision as possible into an output with
+// a lower bit depth by calculating the actual min and max values found in the
+// data. For example, maybe that quint16 input has no values lower than 16,384 and
+// none higher than 49,152. That means only half the range is actually needed, all
+// the float interpretations are between -0.5f and 0.5f, so if we want to compress
+// the data into a quint8 output, we can use that range rather than the theoretical
+// -1.0f to 1.0f that is suggested by the input min and max.
 //
-// - A value of -1 means use the line number(starting from zero), expects `int64`.
-// - A value of -2 means use the whole line content, expects `string`.
-// - A value >= 0 means use the index (starting at zero) of the split line based
-//   on `delimiter`.
+// In practice, this is most useful for taking output from operations like
+// QuantizedMatMul that can produce higher bit-depth outputs than their inputs and
+// may have large potential output ranges, but in practice have a distribution of
+// input values that only uses a small fraction of the possible range. By feeding
+// that output into this operator, we can reduce it from 32 bits down to 8 with
+// minimal loss of accuracy.
 //
 // Arguments:
-//	table_handle: Handle to a table which will be initialized.
-//	filename: Filename of a vocabulary text file.
-//	key_index: Column index in a line to get the table `key` values from.
-//	value_index: Column index that represents information of a line to get the table
-// `value` values from.
 //
-// Returns the created operation.
-func InitializeTableFromTextFileV2(scope *Scope, table_handle tf.Output, filename tf.Output, key_index int64, value_index int64, optional ...InitializeTableFromTextFileV2Attr) (o *tf.Operation) {
+//	input_min: The float value that the minimum quantized input value represents.
+//	input_max: The float value that the maximum quantized input value represents.
+//	out_type: The type of the output. Should be a lower bit depth than Tinput.
+//
+// Returns The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents.
+func QuantizeDownAndShrinkRange(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, out_type tf.DataType) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"key_index": key_index, "value_index": value_index}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "InitializeTableFromTextFileV2",
+		Type: "QuantizeDownAndShrinkRange",
 		Input: []tf.Input{
-			table_handle, filename,
+			input, input_min, input_max,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// MeanAttr is an optional argument to Mean.
-type MeanAttr func(optionalAttr)
-
-// MeanKeepDims sets the optional keep_dims attribute to value.
+// Returns the truth value of (x >= y) element-wise.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func MeanKeepDims(value bool) MeanAttr {
+// *NOTE*: `GreaterEqual` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func GreaterEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "GreaterEqual",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ApproximateEqualAttr is an optional argument to ApproximateEqual.
+type ApproximateEqualAttr func(optionalAttr)
+
+// ApproximateEqualTolerance sets the optional tolerance attribute to value.
+// If not specified, defaults to 1e-05
+func ApproximateEqualTolerance(value float32) ApproximateEqualAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["tolerance"] = value
 	}
 }
 
-// Computes the mean of elements across dimensions of a tensor.
-//
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
-//
-// Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
-//
-// Returns The reduced tensor.
-func Mean(scope *Scope, input tf.Output, axis tf.Output, optional ...MeanAttr) (output tf.Output) {
+// Returns the truth value of abs(x-y) < tolerance element-wise.
+func ApproximateEqual(scope *Scope, x tf.Output, y tf.Output, optional ...ApproximateEqualAttr) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -23467,9 +23047,9 @@ func Mean(scope *Scope, input tf.Output, axis tf.Output, optional ...MeanAttr) (
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Mean",
+		Type: "ApproximateEqual",
 		Input: []tf.Input{
-			input, axis,
+			x, y,
 		},
 		Attrs: attrs,
 	}
@@ -23477,77 +23057,120 @@ func Mean(scope *Scope, input tf.Output, axis tf.Output, optional ...MeanAttr) (
 	return op.Output(0)
 }
 
-// ProdAttr is an optional argument to Prod.
-type ProdAttr func(optionalAttr)
-
-// ProdKeepDims sets the optional keep_dims attribute to value.
+// Returns the truth value of x OR y element-wise.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func ProdKeepDims(value bool) ProdAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
+// *NOTE*: `LogicalOr` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func LogicalOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "LogicalOr",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes the product of elements across dimensions of a tensor.
+// Selects elements from `x` or `y`, depending on `condition`.
 //
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// The `x`, and `y` tensors must all have the same shape, and the
+// output will also have that shape.
+//
+// The `condition` tensor must be a scalar if `x` and `y` are scalars.
+// If `x` and `y` are vectors or higher rank, then `condition` must be either a
+// scalar, a vector with size matching the first dimension of `x`, or must have
+// the same shape as `x`.
+//
+// The `condition` tensor acts as a mask that chooses, based on the value at each
+// element, whether the corresponding element / row in the output should be
+// taken from `x` (if true) or `y` (if false).
+//
+// If `condition` is a vector and `x` and `y` are higher rank matrices, then
+// it chooses which row (outer dimension) to copy from `x` and `y`.
+// If `condition` has the same shape as `x` and `y`, then it chooses which
+// element to copy from `x` and `y`.
+//
+// For example:
+//
+// ```python
+// # 'condition' tensor is [[True,  False]
+// #                        [False, True]]
+// # 't' is [[1, 2],
+// #         [3, 4]]
+// # 'e' is [[5, 6],
+// #         [7, 8]]
+// select(condition, t, e)  # => [[1, 6], [7, 4]]
+//
+//
+// # 'condition' tensor is [True, False]
+// # 't' is [[1, 2],
+// #         [3, 4]]
+// # 'e' is [[5, 6],
+// #         [7, 8]]
+// select(condition, t, e) ==> [[1, 2],
+//                              [7, 8]]
+//
+// ```
 //
 // Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
 //
-// Returns The reduced tensor.
-func Prod(scope *Scope, input tf.Output, axis tf.Output, optional ...ProdAttr) (output tf.Output) {
+//	x: = A `Tensor` which may have the same shape as `condition`.
+// If `condition` is rank 1, `x` may have higher rank,
+// but its first dimension must match the size of `condition`.
+//	y: = A `Tensor` with the same type and shape as `x`.
+//
+// Returns = A `Tensor` with the same type and shape as `x` and `y`.
+func Select(scope *Scope, condition tf.Output, x tf.Output, y tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Prod",
+		Type: "Select",
 		Input: []tf.Input{
-			input, axis,
+			condition, x, y,
 		},
-		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// MatMulAttr is an optional argument to MatMul.
+type MatMulAttr func(optionalAttr)
+
+// MatMulTransposeA sets the optional transpose_a attribute to value.
+//
+// value: If true, "a" is transposed before multiplication.
+// If not specified, defaults to false
+func MatMulTransposeA(value bool) MatMulAttr {
+	return func(m optionalAttr) {
+		m["transpose_a"] = value
+	}
 }
 
-// ResizeBilinearAttr is an optional argument to ResizeBilinear.
-type ResizeBilinearAttr func(optionalAttr)
-
-// ResizeBilinearAlignCorners sets the optional align_corners attribute to value.
+// MatMulTransposeB sets the optional transpose_b attribute to value.
 //
-// value: If true, the centers of the 4 corner pixels of the input and output tensors are
-// aligned, preserving the values at the corner pixels. Defaults to false.
+// value: If true, "b" is transposed before multiplication.
 // If not specified, defaults to false
-func ResizeBilinearAlignCorners(value bool) ResizeBilinearAttr {
+func MatMulTransposeB(value bool) MatMulAttr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["transpose_b"] = value
 	}
 }
 
-// Resize `images` to `size` using bilinear interpolation.
-//
-// Input images can be of different types but output images are always float.
+// Multiply the matrix "a" by the matrix "b".
 //
-// Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
+// The inputs must be two-dimensional matrices and the inner dimension of
+// "a" (after being transposed if transpose_a is true) must match the
+// outer dimension of "b" (after being transposed if transposed_b is
+// true).
 //
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func ResizeBilinear(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBilinearAttr) (resized_images tf.Output) {
+// *Note*: The default kernel implementation for MatMul on GPUs uses
+// cublas.
+func MatMul(scope *Scope, a tf.Output, b tf.Output, optional ...MatMulAttr) (product tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -23556,9 +23179,9 @@ func ResizeBilinear(scope *Scope, images tf.Output, size tf.Output, optional ...
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResizeBilinear",
+		Type: "MatMul",
 		Input: []tf.Input{
-			images, size,
+			a, b,
 		},
 		Attrs: attrs,
 	}
@@ -23566,33 +23189,53 @@ func ResizeBilinear(scope *Scope, images tf.Output, size tf.Output, optional ...
 	return op.Output(0)
 }
 
-// MaxAttr is an optional argument to Max.
-type MaxAttr func(optionalAttr)
+// SparseMatMulAttr is an optional argument to SparseMatMul.
+type SparseMatMulAttr func(optionalAttr)
 
-// MaxKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If true, retain reduced dimensions with length 1.
+// SparseMatMulTransposeA sets the optional transpose_a attribute to value.
 // If not specified, defaults to false
-func MaxKeepDims(value bool) MaxAttr {
+func SparseMatMulTransposeA(value bool) SparseMatMulAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["transpose_a"] = value
 	}
 }
 
-// Computes the maximum of elements across dimensions of a tensor.
-//
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// SparseMatMulTransposeB sets the optional transpose_b attribute to value.
+// If not specified, defaults to false
+func SparseMatMulTransposeB(value bool) SparseMatMulAttr {
+	return func(m optionalAttr) {
+		m["transpose_b"] = value
+	}
+}
+
+// SparseMatMulAIsSparse sets the optional a_is_sparse attribute to value.
+// If not specified, defaults to false
+func SparseMatMulAIsSparse(value bool) SparseMatMulAttr {
+	return func(m optionalAttr) {
+		m["a_is_sparse"] = value
+	}
+}
+
+// SparseMatMulBIsSparse sets the optional b_is_sparse attribute to value.
+// If not specified, defaults to false
+func SparseMatMulBIsSparse(value bool) SparseMatMulAttr {
+	return func(m optionalAttr) {
+		m["b_is_sparse"] = value
+	}
+}
+
+// Multiply matrix "a" by matrix "b".
 //
-// Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
+// The inputs must be two-dimensional matrices and the inner dimension of "a" must
+// match the outer dimension of "b". Both "a" and "b" must be `Tensor`s not
+// `SparseTensor`s.  This op is optimized for the case where at least one of "a" or
+// "b" is sparse, in the sense that they have a large proportion of zero values.
+// The breakeven for using this versus a dense matrix multiply on one platform was
+// 30% zero values in the sparse matrix.
 //
-// Returns The reduced tensor.
-func Max(scope *Scope, input tf.Output, axis tf.Output, optional ...MaxAttr) (output tf.Output) {
+// The gradient computation of this operation will only take advantage of sparsity
+// in the input gradient when that gradient comes from a Relu.
+func SparseMatMul(scope *Scope, a tf.Output, b tf.Output, optional ...SparseMatMulAttr) (product tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -23601,9 +23244,9 @@ func Max(scope *Scope, input tf.Output, axis tf.Output, optional ...MaxAttr) (ou
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Max",
+		Type: "SparseMatMul",
 		Input: []tf.Input{
-			input, axis,
+			a, b,
 		},
 		Attrs: attrs,
 	}
@@ -23611,493 +23254,512 @@ func Max(scope *Scope, input tf.Output, axis tf.Output, optional ...MaxAttr) (ou
 	return op.Output(0)
 }
 
-// Creates a dataset that contains the unique elements of `input_dataset`.
-func ExperimentalUniqueDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "ExperimentalUniqueDataset",
-		Input: []tf.Input{
-			input_dataset,
-		},
-		Attrs: attrs,
+// ExperimentalThreadPoolHandleAttr is an optional argument to ExperimentalThreadPoolHandle.
+type ExperimentalThreadPoolHandleAttr func(optionalAttr)
+
+// ExperimentalThreadPoolHandleMaxIntraOpParallelism sets the optional max_intra_op_parallelism attribute to value.
+//
+// value: The maximum degree of parallelism to use within operations that execute on this
+// threadpool.
+// If not specified, defaults to 1
+func ExperimentalThreadPoolHandleMaxIntraOpParallelism(value int64) ExperimentalThreadPoolHandleAttr {
+	return func(m optionalAttr) {
+		m["max_intra_op_parallelism"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// ArgMinAttr is an optional argument to ArgMin.
-type ArgMinAttr func(optionalAttr)
+// ExperimentalThreadPoolHandleContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func ExperimentalThreadPoolHandleContainer(value string) ExperimentalThreadPoolHandleAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
 
-// ArgMinOutputType sets the optional output_type attribute to value.
-// If not specified, defaults to DT_INT64
-func ArgMinOutputType(value tf.DataType) ArgMinAttr {
+// ExperimentalThreadPoolHandleSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func ExperimentalThreadPoolHandleSharedName(value string) ExperimentalThreadPoolHandleAttr {
 	return func(m optionalAttr) {
-		m["output_type"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Returns the index with the smallest value across dimensions of a tensor.
-//
-// Note that in case of ties the identity of the return value is not guaranteed.
+// Creates a dataset that uses a custom thread pool to compute `input_dataset`.
 //
 // Arguments:
+//	num_threads: The number of threads in the thread pool.
+//	display_name: A human-readable name for the threads that may be visible in some
+// visualizations.
+// threadpool.
 //
-//	dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
-// Describes which dimension of the input Tensor to reduce across. For vectors,
-// use dimension = 0.
-func ArgMin(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMinAttr) (output tf.Output) {
+// Returns A resource that can be consumed by one or more ExperimentalThreadPoolDataset
+// ops.
+func ExperimentalThreadPoolHandle(scope *Scope, num_threads int64, display_name string, optional ...ExperimentalThreadPoolHandleAttr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_threads": num_threads, "display_name": display_name}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ArgMin",
-		Input: []tf.Input{
-			input, dimension,
-		},
+		Type: "ExperimentalThreadPoolHandle",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Convert the quantized 'input' tensor into a lower-precision 'output', using the
-//
-// output range specified with 'requested_output_min' and 'requested_output_max'.
-//
-// [input_min, input_max] are scalar floats that specify the range for the float
-// interpretation of the 'input' data. For example, if input_min is -1.0f and
-// input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
-// value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
-//
-// Arguments:
-//
-//	input_min: The float value that the minimum quantized input value represents.
-//	input_max: The float value that the maximum quantized input value represents.
-//	requested_output_min: The float value that the minimum quantized output value represents.
-//	requested_output_max: The float value that the maximum quantized output value represents.
-//	out_type: The type of the output. Should be a lower bit depth than Tinput.
-//
-// Returns The requested_output_min value is copied into this output.The requested_output_max value is copied into this output.
-func Requantize(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, requested_output_min tf.Output, requested_output_max tf.Output, out_type tf.DataType) (output tf.Output, output_min tf.Output, output_max tf.Output) {
-	if scope.Err() != nil {
-		return
+// CudnnRNNCanonicalToParamsAttr is an optional argument to CudnnRNNCanonicalToParams.
+type CudnnRNNCanonicalToParamsAttr func(optionalAttr)
+
+// CudnnRNNCanonicalToParamsRnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNCanonicalToParamsRnnMode(value string) CudnnRNNCanonicalToParamsAttr {
+	return func(m optionalAttr) {
+		m["rnn_mode"] = value
 	}
-	attrs := map[string]interface{}{"out_type": out_type}
-	opspec := tf.OpSpec{
-		Type: "Requantize",
-		Input: []tf.Input{
-			input, input_min, input_max, requested_output_min, requested_output_max,
-		},
-		Attrs: attrs,
+}
+
+// CudnnRNNCanonicalToParamsInputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNCanonicalToParamsInputMode(value string) CudnnRNNCanonicalToParamsAttr {
+	return func(m optionalAttr) {
+		m["input_mode"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Creates a dataset that emits the lines of one or more text files.
-//
-// Arguments:
-//	filenames: A scalar or a vector containing the name(s) of the file(s) to be
-// read.
-//	compression_type: A scalar containing either (i) the empty string (no
-// compression), (ii) "ZLIB", or (iii) "GZIP".
-//	buffer_size: A scalar containing the number of bytes to buffer.
-func TextLineDataset(scope *Scope, filenames tf.Output, compression_type tf.Output, buffer_size tf.Output) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
+// CudnnRNNCanonicalToParamsDirection sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNCanonicalToParamsDirection(value string) CudnnRNNCanonicalToParamsAttr {
+	return func(m optionalAttr) {
+		m["direction"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "TextLineDataset",
-		Input: []tf.Input{
-			filenames, compression_type, buffer_size,
-		},
+}
+
+// CudnnRNNCanonicalToParamsDropout sets the optional dropout attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNCanonicalToParamsDropout(value float32) CudnnRNNCanonicalToParamsAttr {
+	return func(m optionalAttr) {
+		m["dropout"] = value
+	}
+}
+
+// CudnnRNNCanonicalToParamsSeed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNCanonicalToParamsSeed(value int64) CudnnRNNCanonicalToParamsAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// CudnnRNNCanonicalToParamsSeed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNCanonicalToParamsSeed2(value int64) CudnnRNNCanonicalToParamsAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes the sum along segments of a tensor.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
-//
-// Computes a tensor such that
-// \\(output_i = \sum_j data_j\\) where sum is over `j` such
-// that `segment_ids[j] == i`.
-//
-// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentSum.png" alt>
-// </div>
+// Converts CudnnRNN params from canonical form to usable form.
 //
-// Arguments:
+// Writes a set of weights into the opaque params buffer so they can be used in
+// upcoming training or inferences.
 //
-//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
+// Note that the params buffer may not be compatible across different GPUs. So any
+// save and restoration should be converted to and from the canonical weights and
+// biases.
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+// num_layers: Specifies the number of layers in the RNN model.
+// num_units: Specifies the size of the hidden state.
+// input_size: Specifies the size of the input state.
+// weights: the canonical form of weights that can be used for saving
+//     and restoration. They are more likely to be compatible across different
+//     generations.
+// biases: the canonical form of biases that can be used for saving
+//     and restoration. They are more likely to be compatible across different
+//     generations.
+// num_params: number of parameter sets for all layers.
+//     Each layer may contain multiple parameter sets, with each set consisting of
+//     a weight matrix and a bias vector.
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicate whether there is a linear projection between the input and
+//     The actual computation before the first layer. 'skip_input' is only allowed
+//     when input_size == num_units; 'auto_select' implies 'skip_input' when
+//     input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used.
+//     dir = (direction == bidirectional) ? 2 : 1
+// dropout: dropout probability. When set to 0., dropout is disabled.
+// seed: the 1st part of a seed to initialize dropout.
+// seed2: the 2nd part of a seed to initialize dropout.
+func CudnnRNNCanonicalToParams(scope *Scope, num_layers tf.Output, num_units tf.Output, input_size tf.Output, weights []tf.Output, biases []tf.Output, optional ...CudnnRNNCanonicalToParamsAttr) (params tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SegmentSum",
+		Type: "CudnnRNNCanonicalToParams",
 		Input: []tf.Input{
-			data, segment_ids,
+			num_layers, num_units, input_size, tf.OutputList(weights), tf.OutputList(biases),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the mean along segments of a tensor.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
-//
-// Computes a tensor such that
-// \\(output_i = \frac{\sum_j data_j}{N}\\) where `mean` is
-// over `j` such that `segment_ids[j] == i` and `N` is the total number of
-// values summed.
-//
-// If the mean is empty for a given segment ID `i`, `output[i] = 0`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMean.png" alt>
-// </div>
-//
-// Arguments:
-//
-//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentMean(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+// Creates a dataset containing elements of first component of `input_dataset` having true in the last component.
+func FilterByLastComponentDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "SegmentMean",
+		Type: "FilterByLastComponentDataset",
 		Input: []tf.Input{
-			data, segment_ids,
+			input_dataset,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the minimum along segments of a tensor.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
-//
-// Computes a tensor such that
-// \\(output_i = \min_j(data_j)\\) where `min` is over `j` such
-// that `segment_ids[j] == i`.
+// SumAttr is an optional argument to Sum.
+type SumAttr func(optionalAttr)
+
+// SumKeepDims sets the optional keep_dims attribute to value.
 //
-// If the min is empty for a given segment ID `i`, `output[i] = 0`.
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SumKeepDims(value bool) SumAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
+	}
+}
+
+// Computes the sum of elements across dimensions of a tensor.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMin.png" alt>
-// </div>
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
 //
-//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentMin(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+// Returns The reduced tensor.
+func Sum(scope *Scope, input tf.Output, axis tf.Output, optional ...SumAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SegmentMin",
+		Type: "Sum",
 		Input: []tf.Input{
-			data, segment_ids,
+			input, axis,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Removes keys and its associated values from a table.
-//
-// The tensor `keys` must of the same type as the keys of the table. Keys not
-// already in the table are silently ignored.
-//
-// Arguments:
-//	table_handle: Handle to the table.
-//	keys: Any shape.  Keys of the elements to remove.
+// EnterAttr is an optional argument to Enter.
+type EnterAttr func(optionalAttr)
+
+// EnterIsConstant sets the optional is_constant attribute to value.
 //
-// Returns the created operation.
-func LookupTableRemoveV2(scope *Scope, table_handle tf.Output, keys tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "LookupTableRemoveV2",
-		Input: []tf.Input{
-			table_handle, keys,
-		},
+// value: If true, the output is constant within the child frame.
+// If not specified, defaults to false
+func EnterIsConstant(value bool) EnterAttr {
+	return func(m optionalAttr) {
+		m["is_constant"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// Computes the sum along segments of a tensor.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
-//
-// Computes a tensor such that
-// \\(output[i] = \sum_{j...} data[j...]\\) where the sum is over tuples `j...` such
-// that `segment_ids[j...] == i`.  Unlike `SegmentSum`, `segment_ids`
-// need not be sorted and need not cover all values in the full
-// range of valid values.
-//
-// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
-// If the given segment ID `i` is negative, the value is dropped and will not be
-// added to the sum of the segment.
-//
-// `num_segments` should equal the number of distinct segment IDs.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentSum.png" alt>
-// </div>
-//
-// Arguments:
-//
-//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
-//
+// EnterParallelIterations sets the optional parallel_iterations attribute to value.
 //
-// Returns Has same shape as data, except for the first `segment_ids.rank`
-// dimensions, which are replaced with a single dimension which has size
-// `num_segments`.
-func UnsortedSegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "UnsortedSegmentSum",
-		Input: []tf.Input{
-			data, segment_ids, num_segments,
-		},
+// value: The number of iterations allowed to run in parallel.
+// If not specified, defaults to 10
+func EnterParallelIterations(value int64) EnterAttr {
+	return func(m optionalAttr) {
+		m["parallel_iterations"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes the product along segments of a tensor.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#segmentation)
-// for an explanation of segments.
-//
-// This operator is similar to the unsorted segment sum operator found
-// [(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
-// Instead of computing the sum over segments, it computes the product of all
-// entries belonging to a segment such that:
-//
-// \\(output_i = \prod_{j...} data[j...]\\) where the product is over tuples
-// `j...` such that `segment_ids[j...] == i`.
-//
-// If there is no entry for a given segment ID `i`, it outputs 1.
+// Creates or finds a child frame, and makes `data` available to the child frame.
 //
-// If the given segment ID `i` is negative, then the corresponding value is
-// dropped, and will not be included in the result.
+// This op is used together with `Exit` to create loops in the graph.
+// The unique `frame_name` is used by the `Executor` to identify frames. If
+// `is_constant` is true, `output` is a constant in the child frame; otherwise
+// it may be changed in the child frame. At most `parallel_iterations` iterations
+// are run in parallel in the child frame.
 //
 // Arguments:
+//	data: The tensor to be made available to the child frame.
+//	frame_name: The name of the child frame.
 //
-//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
-//
-//
-// Returns Has same shape as data, except for the first `segment_ids.rank`
-// dimensions, which are replaced with a single dimension which has size
-// `num_segments`.
-func UnsortedSegmentProd(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+// Returns The same tensor as `data`.
+func Enter(scope *Scope, data tf.Output, frame_name string, optional ...EnterAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"frame_name": frame_name}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "UnsortedSegmentProd",
+		Type: "Enter",
 		Input: []tf.Input{
-			data, segment_ids, num_segments,
+			data,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the mean along sparse segments of a tensor.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
-//
-// Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
-// dimension, selecting a subset of dimension 0, specified by `indices`.
+// Add all input tensors element wise.
 //
 // Arguments:
-//
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SparseSegmentMean(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
+//	inputs: Must all be the same size and shape.
+func AddN(scope *Scope, inputs []tf.Output) (sum tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentMean",
+		Type: "AddN",
 		Input: []tf.Input{
-			data, indices, segment_ids,
+			tf.OutputList(inputs),
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Deserializes a serialized tree ensemble config and replaces current tree
-//
-// ensemble.
+// TryRpcAttr is an optional argument to TryRpc.
+type TryRpcAttr func(optionalAttr)
+
+// TryRpcProtocol sets the optional protocol attribute to value.
 //
-// Arguments:
-//	tree_ensemble_handle: Handle to the tree ensemble.
-//	stamp_token: Token to use as the new value of the resource stamp.
-//	tree_ensemble_serialized: Serialized proto of the ensemble.
+// value: RPC protocol to use.  Empty string means use the default protocol.
+// Options include 'grpc'.
+// If not specified, defaults to ""
+func TryRpcProtocol(value string) TryRpcAttr {
+	return func(m optionalAttr) {
+		m["protocol"] = value
+	}
+}
+
+// TryRpcFailFast sets the optional fail_fast attribute to value.
 //
-// Returns the created operation.
-func BoostedTreesDeserializeEnsemble(scope *Scope, tree_ensemble_handle tf.Output, stamp_token tf.Output, tree_ensemble_serialized tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
+// value: `boolean`. If `true` (default), then failures to connect
+// (i.e., the server does not immediately respond) cause an RPC failure.
+// If not specified, defaults to true
+func TryRpcFailFast(value bool) TryRpcAttr {
+	return func(m optionalAttr) {
+		m["fail_fast"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "BoostedTreesDeserializeEnsemble",
-		Input: []tf.Input{
-			tree_ensemble_handle, stamp_token, tree_ensemble_serialized,
-		},
+}
+
+// TryRpcTimeoutInMs sets the optional timeout_in_ms attribute to value.
+//
+// value: `int`. If `0` (default), then the kernel will run the RPC
+// request and only time out if the RPC deadline passes or the session times out.
+// If this value is greater than `0`, then the op will raise an exception if
+// the RPC takes longer than `timeout_in_ms`.
+// If not specified, defaults to 0
+func TryRpcTimeoutInMs(value int64) TryRpcAttr {
+	return func(m optionalAttr) {
+		m["timeout_in_ms"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// Transforms a tf.Example proto (as a string) into typed tensors.
+// Perform batches of RPC requests.
+//
+// This op asynchronously performs either a single RPC request, or a batch
+// of requests.  RPC requests are defined by three main parameters:
+//
+//   - `address` (the host+port or BNS address of the request)
+//   - `method` (the method name for the request)
+//   - `request` (the serialized proto string, or vector of strings,
+//      of the RPC request argument).
+//
+// For example, if you have an RPC service running on port localhost:2345,
+// and its interface is configured with the following proto declaration:
+//
+// ```
+// service MyService {
+//   rpc MyMethod(MyRequestProto) returns (MyResponseProto) {
+//   }
+// };
+// ```
+//
+// then call this op with arguments:
+//
+// ```
+// address = "localhost:2345"
+// method = "MyService/MyMethod"
+// ```
+//
+// The `request` tensor is a string tensor representing serialized `MyRequestProto`
+// strings; and the output string tensor `response` will have the same shape
+// and contain (upon successful completion) corresponding serialized
+// `MyResponseProto` strings.
+//
+// For example, to send a single, empty, `MyRequestProto`, call
+// this op with `request = ""`.  To send 5 **parallel** empty requests,
+// call this op with `request = ["", "", "", "", ""]`.
+//
+// More generally, one can create a batch of `MyRequestProto` serialized protos
+// from regular batched tensors using the `encode_proto` op, and convert
+// the response `MyResponseProto` serialized protos to batched tensors
+// using the `decode_proto` op.
+//
+// **NOTE** Working with serialized proto strings is faster than instantiating
+// actual proto objects in memory, so no performance degradation is expected
+// compared to writing custom kernels for this workflow.
+//
+// Unlike the standard `Rpc` op, if the connection fails or the remote worker
+// returns an error status, this op does **not** reraise the exception.
+// Instead, the `status_code` and `status_message` entry for the corresponding RPC
+// call is set with the error returned from the RPC call.  The `response` tensor
+// will contain valid response values for those minibatch entries whose RPCs did
+// not fail; the rest of the entries will have empty strings.
 //
 // Arguments:
-//	serialized: A vector containing a batch of binary serialized Example protos.
-//	dense_defaults: A list of Tensors (some may be empty), whose length matches
-// the length of `dense_keys`. dense_defaults[j] provides default values
-// when the example's feature_map lacks dense_key[j].  If an empty Tensor is
-// provided for dense_defaults[j], then the Feature dense_keys[j] is required.
-// The input type is inferred from dense_defaults[j], even when it's empty.
-// If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
-// then the shape of dense_defaults[j] must match that of dense_shapes[j].
-// If dense_shapes[j] has an undefined major dimension (variable strides dense
-// feature), dense_defaults[j] must contain a single element:
-// the padding element.
-//	num_sparse: The number of sparse features to be parsed from the example. This
-// must match the lengths of `sparse_keys` and `sparse_types`.
-//	sparse_keys: A list of `num_sparse` strings.
-// The keys expected in the Examples' features associated with sparse values.
-//	dense_keys: The keys expected in the Examples' features associated with dense
-// values.
-//	sparse_types: A list of `num_sparse` types; the data types of data in each
-// Feature given in sparse_keys.
-// Currently the ParseSingleExample op supports DT_FLOAT (FloatList),
-// DT_INT64 (Int64List), and DT_STRING (BytesList).
-//	dense_shapes: The shapes of data in each Feature given in dense_keys.
-// The length of this list must match the length of `dense_keys`.  The
-// number of elements in the Feature corresponding to dense_key[j] must
-// always equal dense_shapes[j].NumEntries().  If dense_shapes[j] ==
-// (D0, D1, ..., DN) then the shape of output Tensor dense_values[j]
-// will be (D0, D1, ..., DN): In the case dense_shapes[j] = (-1, D1,
-// ..., DN), the shape of the output Tensor dense_values[j] will be (M,
-// D1, .., DN), where M is the number of blocks of elements of length
-// D1 * .... * DN, in the input.
-func ParseSingleExample(scope *Scope, serialized tf.Output, dense_defaults []tf.Output, num_sparse int64, sparse_keys []string, dense_keys []string, sparse_types []tf.DataType, dense_shapes []tf.Shape) (sparse_indices []tf.Output, sparse_values []tf.Output, sparse_shapes []tf.Output, dense_values []tf.Output) {
+//	address: `0-D` or `1-D`.  The address (i.e. host_name:port) of the RPC server.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `method` and `request`.
+//	method: `0-D` or `1-D`.  The method address on the RPC server.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `address` and `request`.
+//	request: `0-D` or `1-D`.  Serialized proto strings: the rpc request argument.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `address` and `method`.
+//
+// Returns Same shape as `request`. Serialized proto strings: the rpc responses.Same shape as `request`.  Values correspond to tensorflow Status enum codes.Same shape as `request`.  Values correspond to Status messages
+// returned from the RPC calls.
+func TryRpc(scope *Scope, address tf.Output, method tf.Output, request tf.Output, optional ...TryRpcAttr) (response tf.Output, status_code tf.Output, status_message tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_sparse": num_sparse, "sparse_keys": sparse_keys, "dense_keys": dense_keys, "sparse_types": sparse_types, "dense_shapes": dense_shapes}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ParseSingleExample",
+		Type: "TryRpc",
 		Input: []tf.Input{
-			serialized, tf.OutputList(dense_defaults),
+			address, method, request,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// InitializeTableFromTextFileV2Attr is an optional argument to InitializeTableFromTextFileV2.
+type InitializeTableFromTextFileV2Attr func(optionalAttr)
+
+// InitializeTableFromTextFileV2VocabSize sets the optional vocab_size attribute to value.
+//
+// value: Number of elements of the file, use -1 if unknown.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func InitializeTableFromTextFileV2VocabSize(value int64) InitializeTableFromTextFileV2Attr {
+	return func(m optionalAttr) {
+		m["vocab_size"] = value
 	}
-	var idx int
-	var err error
-	if sparse_indices, idx, err = makeOutputList(op, idx, "sparse_indices"); err != nil {
-		scope.UpdateErr("ParseSingleExample", err)
-		return
+}
+
+// InitializeTableFromTextFileV2Delimiter sets the optional delimiter attribute to value.
+//
+// value: Delimiter to separate fields in a line.
+// If not specified, defaults to "\t"
+func InitializeTableFromTextFileV2Delimiter(value string) InitializeTableFromTextFileV2Attr {
+	return func(m optionalAttr) {
+		m["delimiter"] = value
 	}
-	if sparse_values, idx, err = makeOutputList(op, idx, "sparse_values"); err != nil {
-		scope.UpdateErr("ParseSingleExample", err)
+}
+
+// Initializes a table from a text file.
+//
+// It inserts one key-value pair into the table for each line of the file.
+// The key and value is extracted from the whole line content, elements from the
+// split line based on `delimiter` or the line number (starting from zero).
+// Where to extract the key and value from a line is specified by `key_index` and
+// `value_index`.
+//
+// - A value of -1 means use the line number(starting from zero), expects `int64`.
+// - A value of -2 means use the whole line content, expects `string`.
+// - A value >= 0 means use the index (starting at zero) of the split line based
+//   on `delimiter`.
+//
+// Arguments:
+//	table_handle: Handle to a table which will be initialized.
+//	filename: Filename of a vocabulary text file.
+//	key_index: Column index in a line to get the table `key` values from.
+//	value_index: Column index that represents information of a line to get the table
+// `value` values from.
+//
+// Returns the created operation.
+func InitializeTableFromTextFileV2(scope *Scope, table_handle tf.Output, filename tf.Output, key_index int64, value_index int64, optional ...InitializeTableFromTextFileV2Attr) (o *tf.Operation) {
+	if scope.Err() != nil {
 		return
 	}
-	if sparse_shapes, idx, err = makeOutputList(op, idx, "sparse_shapes"); err != nil {
-		scope.UpdateErr("ParseSingleExample", err)
-		return
+	attrs := map[string]interface{}{"key_index": key_index, "value_index": value_index}
+	for _, a := range optional {
+		a(attrs)
 	}
-	if dense_values, idx, err = makeOutputList(op, idx, "dense_values"); err != nil {
-		scope.UpdateErr("ParseSingleExample", err)
-		return
+	opspec := tf.OpSpec{
+		Type: "InitializeTableFromTextFileV2",
+		Input: []tf.Input{
+			table_handle, filename,
+		},
+		Attrs: attrs,
 	}
-	return sparse_indices, sparse_values, sparse_shapes, dense_values
+	return scope.AddOperation(opspec)
 }
 
-// WholeFileReaderV2Attr is an optional argument to WholeFileReaderV2.
-type WholeFileReaderV2Attr func(optionalAttr)
+// MeanAttr is an optional argument to Mean.
+type MeanAttr func(optionalAttr)
 
-// WholeFileReaderV2Container sets the optional container attribute to value.
+// MeanKeepDims sets the optional keep_dims attribute to value.
 //
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func WholeFileReaderV2Container(value string) WholeFileReaderV2Attr {
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func MeanKeepDims(value bool) MeanAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// WholeFileReaderV2SharedName sets the optional shared_name attribute to value.
+// Computes the mean of elements across dimensions of a tensor.
 //
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func WholeFileReaderV2SharedName(value string) WholeFileReaderV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// A Reader that outputs the entire contents of a file as a value.
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
-// To use, enqueue filenames in a Queue.  The output of ReaderRead will
-// be a filename (key) and the contents of that file (value).
+// Arguments:
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
 //
-// Returns The handle to reference the Reader.
-func WholeFileReaderV2(scope *Scope, optional ...WholeFileReaderV2Attr) (reader_handle tf.Output) {
+// Returns The reduced tensor.
+func Mean(scope *Scope, input tf.Output, axis tf.Output, optional ...MeanAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -24106,30 +23768,54 @@ func WholeFileReaderV2(scope *Scope, optional ...WholeFileReaderV2Attr) (reader_
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "WholeFileReaderV2",
-
+		Type: "Mean",
+		Input: []tf.Input{
+			input, axis,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Pop the element at the top of the stack.
+// ProdAttr is an optional argument to Prod.
+type ProdAttr func(optionalAttr)
+
+// ProdKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func ProdKeepDims(value bool) ProdAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
+	}
+}
+
+// Computes the product of elements across dimensions of a tensor.
+//
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
-//	handle: The handle to a stack.
-//	elem_type: The type of the elem that is popped.
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
 //
-// Returns The tensor that is popped from the top of the stack.
-func StackPopV2(scope *Scope, handle tf.Output, elem_type tf.DataType) (elem tf.Output) {
+// Returns The reduced tensor.
+func Prod(scope *Scope, input tf.Output, axis tf.Output, optional ...ProdAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"elem_type": elem_type}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "StackPopV2",
+		Type: "Prod",
 		Input: []tf.Input{
-			handle,
+			input, axis,
 		},
 		Attrs: attrs,
 	}
@@ -24137,138 +23823,144 @@ func StackPopV2(scope *Scope, handle tf.Output, elem_type tf.DataType) (elem tf.
 	return op.Output(0)
 }
 
-// Computes hyperbolic cosine of x element-wise.
-func Cosh(scope *Scope, x tf.Output) (y tf.Output) {
+// ResizeBilinearAttr is an optional argument to ResizeBilinear.
+type ResizeBilinearAttr func(optionalAttr)
+
+// ResizeBilinearAlignCorners sets the optional align_corners attribute to value.
+//
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
+// If not specified, defaults to false
+func ResizeBilinearAlignCorners(value bool) ResizeBilinearAttr {
+	return func(m optionalAttr) {
+		m["align_corners"] = value
+	}
+}
+
+// Resize `images` to `size` using bilinear interpolation.
+//
+// Input images can be of different types but output images are always float.
+//
+// Arguments:
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
+//
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeBilinear(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBilinearAttr) (resized_images tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Cosh",
+		Type: "ResizeBilinear",
 		Input: []tf.Input{
-			x,
+			images, size,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the mean along sparse segments of a tensor.
+// MaxAttr is an optional argument to Max.
+type MaxAttr func(optionalAttr)
+
+// MaxKeepDims sets the optional keep_dims attribute to value.
 //
-// Like `SparseSegmentMean`, but allows missing ids in `segment_ids`. If an id is
-// misisng, the `output` tensor at that position will be zeroed.
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func MaxKeepDims(value bool) MaxAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
+	}
+}
+
+// Computes the maximum of elements across dimensions of a tensor.
 //
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
 //
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
-//	num_segments: Should equal the number of distinct segment IDs.
-//
-// Returns Has same shape as data, except for dimension 0 which has size
-// `num_segments`.
-func SparseSegmentMeanWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+// Returns The reduced tensor.
+func Max(scope *Scope, input tf.Output, axis tf.Output, optional ...MaxAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentMeanWithNumSegments",
+		Type: "Max",
 		Input: []tf.Input{
-			data, indices, segment_ids, num_segments,
+			input, axis,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// CudnnRNNParamsSizeAttr is an optional argument to CudnnRNNParamsSize.
-type CudnnRNNParamsSizeAttr func(optionalAttr)
-
-// CudnnRNNParamsSizeRnnMode sets the optional rnn_mode attribute to value.
-// If not specified, defaults to "lstm"
-func CudnnRNNParamsSizeRnnMode(value string) CudnnRNNParamsSizeAttr {
-	return func(m optionalAttr) {
-		m["rnn_mode"] = value
-	}
-}
-
-// CudnnRNNParamsSizeInputMode sets the optional input_mode attribute to value.
-// If not specified, defaults to "linear_input"
-func CudnnRNNParamsSizeInputMode(value string) CudnnRNNParamsSizeAttr {
-	return func(m optionalAttr) {
-		m["input_mode"] = value
-	}
-}
-
-// CudnnRNNParamsSizeDirection sets the optional direction attribute to value.
-// If not specified, defaults to "unidirectional"
-func CudnnRNNParamsSizeDirection(value string) CudnnRNNParamsSizeAttr {
-	return func(m optionalAttr) {
-		m["direction"] = value
+// Creates a dataset that contains the unique elements of `input_dataset`.
+func ExperimentalUniqueDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// CudnnRNNParamsSizeDropout sets the optional dropout attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNParamsSizeDropout(value float32) CudnnRNNParamsSizeAttr {
-	return func(m optionalAttr) {
-		m["dropout"] = value
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalUniqueDataset",
+		Input: []tf.Input{
+			input_dataset,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// CudnnRNNParamsSizeSeed sets the optional seed attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNParamsSizeSeed(value int64) CudnnRNNParamsSizeAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
+// ArgMinAttr is an optional argument to ArgMin.
+type ArgMinAttr func(optionalAttr)
 
-// CudnnRNNParamsSizeSeed2 sets the optional seed2 attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNParamsSizeSeed2(value int64) CudnnRNNParamsSizeAttr {
+// ArgMinOutputType sets the optional output_type attribute to value.
+// If not specified, defaults to DT_INT64
+func ArgMinOutputType(value tf.DataType) ArgMinAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["output_type"] = value
 	}
 }
 
-// Computes size of weights that can be used by a Cudnn RNN model.
+// Returns the index with the smallest value across dimensions of a tensor.
 //
-// Return the params size that can be used by the Cudnn RNN model. Subsequent
-// weight allocation and initialization should use this size.
+// Note that in case of ties the identity of the return value is not guaranteed.
 //
-// num_layers: Specifies the number of layers in the RNN model.
-// num_units: Specifies the size of the hidden state.
-// input_size: Specifies the size of the input state.
-// rnn_mode: Indicates the type of the RNN model.
-// input_mode: Indicate whether there is a linear projection between the input and
-//   The actual computation before the first layer. 'skip_input' is only allowed
-//   when input_size == num_units; 'auto_select' implies 'skip_input' when
-//   input_size == num_units; otherwise, it implies 'linear_input'.
-// direction: Indicates whether a bidirectional model will be used.
-//   dir = (direction == bidirectional) ? 2 : 1
-// dropout: dropout probability. When set to 0., dropout is disabled.
-// seed: the 1st part of a seed to initialize dropout.
-// seed2: the 2nd part of a seed to initialize dropout.
-// params_size: The size of the params buffer that should be allocated and
-//   initialized for this RNN model. Note that this params buffer may not be
-//   compatible across GPUs. Please use CudnnRNNParamsWeights and
-//   CudnnRNNParamsBiases to save and restore them in a way that is compatible
-//   across different runs.
-func CudnnRNNParamsSize(scope *Scope, num_layers tf.Output, num_units tf.Output, input_size tf.Output, T tf.DataType, S tf.DataType, optional ...CudnnRNNParamsSizeAttr) (params_size tf.Output) {
+// Arguments:
+//
+//	dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
+// Describes which dimension of the input Tensor to reduce across. For vectors,
+// use dimension = 0.
+func ArgMin(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMinAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"T": T, "S": S}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "CudnnRNNParamsSize",
+		Type: "ArgMin",
 		Input: []tf.Input{
-			num_layers, num_units, input_size,
+			input, dimension,
 		},
 		Attrs: attrs,
 	}
@@ -24276,360 +23968,437 @@ func CudnnRNNParamsSize(scope *Scope, num_layers tf.Output, num_units tf.Output,
 	return op.Output(0)
 }
 
-// Computes gradients for SparseSegmentMean.
+// Convert the quantized 'input' tensor into a lower-precision 'output', using the
 //
-// Returns tensor "output" with same shape as grad, except for dimension 0 whose
-// value is output_dim0.
+// output range specified with 'requested_output_min' and 'requested_output_max'.
+//
+// [input_min, input_max] are scalar floats that specify the range for the float
+// interpretation of the 'input' data. For example, if input_min is -1.0f and
+// input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
+// value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
 //
 // Arguments:
-//	grad: gradient propagated to the SparseSegmentMean op.
-//	indices: indices passed to the corresponding SparseSegmentMean op.
-//	segment_ids: segment_ids passed to the corresponding SparseSegmentMean op.
-//	output_dim0: dimension 0 of "data" passed to SparseSegmentMean op.
-func SparseSegmentMeanGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
+//
+//	input_min: The float value that the minimum quantized input value represents.
+//	input_max: The float value that the maximum quantized input value represents.
+//	requested_output_min: The float value that the minimum quantized output value represents.
+//	requested_output_max: The float value that the maximum quantized output value represents.
+//	out_type: The type of the output. Should be a lower bit depth than Tinput.
+//
+// Returns The requested_output_min value is copied into this output.The requested_output_max value is copied into this output.
+func Requantize(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, requested_output_min tf.Output, requested_output_max tf.Output, out_type tf.DataType) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentMeanGrad",
+		Type: "Requantize",
 		Input: []tf.Input{
-			grad, indices, segment_ids, output_dim0,
+			input, input_min, input_max, requested_output_min, requested_output_max,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
+// Creates a dataset that emits the lines of one or more text files.
 //
-// N is the size of the segment being reduced.
+// Arguments:
+//	filenames: A scalar or a vector containing the name(s) of the file(s) to be
+// read.
+//	compression_type: A scalar containing either (i) the empty string (no
+// compression), (ii) "ZLIB", or (iii) "GZIP".
+//	buffer_size: A scalar containing the number of bytes to buffer.
+func TextLineDataset(scope *Scope, filenames tf.Output, compression_type tf.Output, buffer_size tf.Output) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TextLineDataset",
+		Input: []tf.Input{
+			filenames, compression_type, buffer_size,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the sum along segments of a tensor.
 //
 // Read
 // [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
 // for an explanation of segments.
 //
+// Computes a tensor such that
+// \\(output_i = \sum_j data_j\\) where sum is over `j` such
+// that `segment_ids[j] == i`.
+//
+// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentSum.png" alt>
+// </div>
+//
 // Arguments:
 //
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
 //
 // Returns Has same shape as data, except for dimension 0 which
 // has size `k`, the number of segments.
-func SparseSegmentSqrtN(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
+func SegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentSqrtN",
+		Type: "SegmentSum",
 		Input: []tf.Input{
-			data, indices, segment_ids,
+			data, segment_ids,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Compute the upper regularized incomplete Gamma function `Q(a, x)`.
+// Computes the mean along segments of a tensor.
 //
-// The upper regularized incomplete Gamma function is defined as:
+// Read
+// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+// for an explanation of segments.
 //
-// \\(Q(a, x) = Gamma(a, x) / Gamma(a) = 1 - P(a, x)\\)
+// Computes a tensor such that
+// \\(output_i = \frac{\sum_j data_j}{N}\\) where `mean` is
+// over `j` such that `segment_ids[j] == i` and `N` is the total number of
+// values summed.
 //
-// where
+// If the mean is empty for a given segment ID `i`, `output[i] = 0`.
 //
-// \\(Gamma(a, x) = int_{x}^{\infty} t^{a-1} exp(-t) dt\\)
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMean.png" alt>
+// </div>
 //
-// is the upper incomplete Gama function.
+// Arguments:
 //
-// Note, above `P(a, x)` (`Igamma`) is the lower regularized complete
-// Gamma function.
-func Igammac(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentMean(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Igammac",
+		Type: "SegmentMean",
 		Input: []tf.Input{
-			a, x,
+			data, segment_ids,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
-//
-// N is the size of the segment being reduced.
-//
-// Like `SparseSegmentSqrtN`, but allows missing ids in `segment_ids`. If an id is
-// misisng, the `output` tensor at that position will be zeroed.
+// Computes the minimum along segments of a tensor.
 //
 // Read
 // [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
 // for an explanation of segments.
 //
+// Computes a tensor such that
+// \\(output_i = \min_j(data_j)\\) where `min` is over `j` such
+// that `segment_ids[j] == i`.
+//
+// If the min is empty for a given segment ID `i`, `output[i] = 0`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMin.png" alt>
+// </div>
+//
 // Arguments:
 //
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
-//	num_segments: Should equal the number of distinct segment IDs.
+//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
 //
 // Returns Has same shape as data, except for dimension 0 which
 // has size `k`, the number of segments.
-func SparseSegmentSqrtNWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+func SegmentMin(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentSqrtNWithNumSegments",
+		Type: "SegmentMin",
 		Input: []tf.Input{
-			data, indices, segment_ids, num_segments,
+			data, segment_ids,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes gradients for SparseSegmentSqrtN.
+// Removes keys and its associated values from a table.
 //
-// Returns tensor "output" with same shape as grad, except for dimension 0 whose
-// value is output_dim0.
+// The tensor `keys` must of the same type as the keys of the table. Keys not
+// already in the table are silently ignored.
 //
 // Arguments:
-//	grad: gradient propagated to the SparseSegmentSqrtN op.
-//	indices: indices passed to the corresponding SparseSegmentSqrtN op.
-//	segment_ids: segment_ids passed to the corresponding SparseSegmentSqrtN op.
-//	output_dim0: dimension 0 of "data" passed to SparseSegmentSqrtN op.
-func SparseSegmentSqrtNGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
+//	table_handle: Handle to the table.
+//	keys: Any shape.  Keys of the elements to remove.
+//
+// Returns the created operation.
+func LookupTableRemoveV2(scope *Scope, table_handle tf.Output, keys tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentSqrtNGrad",
+		Type: "LookupTableRemoveV2",
 		Input: []tf.Input{
-			grad, indices, segment_ids, output_dim0,
+			table_handle, keys,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// LRNGradAttr is an optional argument to LRNGrad.
-type LRNGradAttr func(optionalAttr)
-
-// LRNGradDepthRadius sets the optional depth_radius attribute to value.
+// Computes the sum along segments of a tensor.
 //
-// value: A depth radius.
-// If not specified, defaults to 5
-func LRNGradDepthRadius(value int64) LRNGradAttr {
-	return func(m optionalAttr) {
-		m["depth_radius"] = value
-	}
-}
-
-// LRNGradBias sets the optional bias attribute to value.
+// Read
+// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+// for an explanation of segments.
 //
-// value: An offset (usually > 0 to avoid dividing by 0).
-// If not specified, defaults to 1
-func LRNGradBias(value float32) LRNGradAttr {
-	return func(m optionalAttr) {
-		m["bias"] = value
-	}
-}
-
-// LRNGradAlpha sets the optional alpha attribute to value.
+// Computes a tensor such that
+// \\(output[i] = \sum_{j...} data[j...]\\) where the sum is over tuples `j...` such
+// that `segment_ids[j...] == i`.  Unlike `SegmentSum`, `segment_ids`
+// need not be sorted and need not cover all values in the full
+// range of valid values.
 //
-// value: A scale factor, usually positive.
-// If not specified, defaults to 1
-func LRNGradAlpha(value float32) LRNGradAttr {
-	return func(m optionalAttr) {
-		m["alpha"] = value
-	}
-}
-
-// LRNGradBeta sets the optional beta attribute to value.
+// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
+// If the given segment ID `i` is negative, the value is dropped and will not be
+// added to the sum of the segment.
 //
-// value: An exponent.
-// If not specified, defaults to 0.5
-func LRNGradBeta(value float32) LRNGradAttr {
-	return func(m optionalAttr) {
-		m["beta"] = value
+// `num_segments` should equal the number of distinct segment IDs.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentSum.png" alt>
+// </div>
+//
+// Arguments:
+//
+//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
+//
+//
+// Returns Has same shape as data, except for the first `segment_ids.rank`
+// dimensions, which are replaced with a single dimension which has size
+// `num_segments`.
+func UnsortedSegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	opspec := tf.OpSpec{
+		Type: "UnsortedSegmentSum",
+		Input: []tf.Input{
+			data, segment_ids, num_segments,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Gradients for Local Response Normalization.
+// Computes the product along segments of a tensor.
+//
+// Read
+// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#segmentation)
+// for an explanation of segments.
+//
+// This operator is similar to the unsorted segment sum operator found
+// [(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
+// Instead of computing the sum over segments, it computes the product of all
+// entries belonging to a segment such that:
+//
+// \\(output_i = \prod_{j...} data[j...]\\) where the product is over tuples
+// `j...` such that `segment_ids[j...] == i`.
+//
+// If there is no entry for a given segment ID `i`, it outputs 1.
+//
+// If the given segment ID `i` is negative, then the corresponding value is
+// dropped, and will not be included in the result.
 //
 // Arguments:
-//	input_grads: 4-D with shape `[batch, height, width, channels]`.
-//	input_image: 4-D with shape `[batch, height, width, channels]`.
-//	output_image: 4-D with shape `[batch, height, width, channels]`.
 //
-// Returns The gradients for LRN.
-func LRNGrad(scope *Scope, input_grads tf.Output, input_image tf.Output, output_image tf.Output, optional ...LRNGradAttr) (output tf.Output) {
+//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
+//
+//
+// Returns Has same shape as data, except for the first `segment_ids.rank`
+// dimensions, which are replaced with a single dimension which has size
+// `num_segments`.
+func UnsortedSegmentProd(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "LRNGrad",
+		Type: "UnsortedSegmentProd",
 		Input: []tf.Input{
-			input_grads, input_image, output_image,
+			data, segment_ids, num_segments,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// AnyAttr is an optional argument to Any.
-type AnyAttr func(optionalAttr)
-
-// AnyKeepDims sets the optional keep_dims attribute to value.
+// Computes the mean along sparse segments of a tensor.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func AnyKeepDims(value bool) AnyAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
-
-// Computes the "logical or" of elements across dimensions of a tensor.
+// Read
+// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+// for an explanation of segments.
 //
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
+// dimension, selecting a subset of dimension 0, specified by `indices`.
 //
 // Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
 //
-// Returns The reduced tensor.
-func Any(scope *Scope, input tf.Output, axis tf.Output, optional ...AnyAttr) (output tf.Output) {
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentMean(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Any",
+		Type: "SparseSegmentMean",
 		Input: []tf.Input{
-			input, axis,
+			data, indices, segment_ids,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DestroyResourceOpAttr is an optional argument to DestroyResourceOp.
-type DestroyResourceOpAttr func(optionalAttr)
-
-// DestroyResourceOpIgnoreLookupError sets the optional ignore_lookup_error attribute to value.
-//
-// value: whether to ignore the error when the resource
-// doesn't exist.
-// If not specified, defaults to true
-func DestroyResourceOpIgnoreLookupError(value bool) DestroyResourceOpAttr {
-	return func(m optionalAttr) {
-		m["ignore_lookup_error"] = value
-	}
-}
-
-// Deletes the resource specified by the handle.
+// Deserializes a serialized tree ensemble config and replaces current tree
 //
-// All subsequent operations using the resource will result in a NotFound
-// error status.
+// ensemble.
 //
 // Arguments:
-//	resource: handle to the resource to delete.
+//	tree_ensemble_handle: Handle to the tree ensemble.
+//	stamp_token: Token to use as the new value of the resource stamp.
+//	tree_ensemble_serialized: Serialized proto of the ensemble.
 //
 // Returns the created operation.
-func DestroyResourceOp(scope *Scope, resource tf.Output, optional ...DestroyResourceOpAttr) (o *tf.Operation) {
+func BoostedTreesDeserializeEnsemble(scope *Scope, tree_ensemble_handle tf.Output, stamp_token tf.Output, tree_ensemble_serialized tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "DestroyResourceOp",
+		Type: "BoostedTreesDeserializeEnsemble",
 		Input: []tf.Input{
-			resource,
+			tree_ensemble_handle, stamp_token, tree_ensemble_serialized,
 		},
-		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Generates values in an interval.
-//
-// A sequence of `num` evenly-spaced values are generated beginning at `start`.
-// If `num > 1`, the values in the sequence increase by `stop - start / num - 1`,
-// so that the last one is exactly `stop`.
-//
-// For example:
-//
-// ```
-// tf.linspace(10.0, 12.0, 3, name="linspace") => [ 10.0  11.0  12.0]
-// ```
+// Transforms a tf.Example proto (as a string) into typed tensors.
 //
 // Arguments:
-//	start: 0-D tensor. First entry in the range.
-//	stop: 0-D tensor. Last entry in the range.
-//	num: 0-D tensor. Number of values to generate.
-//
-// Returns 1-D. The generated values.
-func LinSpace(scope *Scope, start tf.Output, stop tf.Output, num tf.Output) (output tf.Output) {
+//	serialized: A vector containing a batch of binary serialized Example protos.
+//	dense_defaults: A list of Tensors (some may be empty), whose length matches
+// the length of `dense_keys`. dense_defaults[j] provides default values
+// when the example's feature_map lacks dense_key[j].  If an empty Tensor is
+// provided for dense_defaults[j], then the Feature dense_keys[j] is required.
+// The input type is inferred from dense_defaults[j], even when it's empty.
+// If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
+// then the shape of dense_defaults[j] must match that of dense_shapes[j].
+// If dense_shapes[j] has an undefined major dimension (variable strides dense
+// feature), dense_defaults[j] must contain a single element:
+// the padding element.
+//	num_sparse: The number of sparse features to be parsed from the example. This
+// must match the lengths of `sparse_keys` and `sparse_types`.
+//	sparse_keys: A list of `num_sparse` strings.
+// The keys expected in the Examples' features associated with sparse values.
+//	dense_keys: The keys expected in the Examples' features associated with dense
+// values.
+//	sparse_types: A list of `num_sparse` types; the data types of data in each
+// Feature given in sparse_keys.
+// Currently the ParseSingleExample op supports DT_FLOAT (FloatList),
+// DT_INT64 (Int64List), and DT_STRING (BytesList).
+//	dense_shapes: The shapes of data in each Feature given in dense_keys.
+// The length of this list must match the length of `dense_keys`.  The
+// number of elements in the Feature corresponding to dense_key[j] must
+// always equal dense_shapes[j].NumEntries().  If dense_shapes[j] ==
+// (D0, D1, ..., DN) then the shape of output Tensor dense_values[j]
+// will be (D0, D1, ..., DN): In the case dense_shapes[j] = (-1, D1,
+// ..., DN), the shape of the output Tensor dense_values[j] will be (M,
+// D1, .., DN), where M is the number of blocks of elements of length
+// D1 * .... * DN, in the input.
+func ParseSingleExample(scope *Scope, serialized tf.Output, dense_defaults []tf.Output, num_sparse int64, sparse_keys []string, dense_keys []string, sparse_types []tf.DataType, dense_shapes []tf.Shape) (sparse_indices []tf.Output, sparse_values []tf.Output, sparse_shapes []tf.Output, dense_values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_sparse": num_sparse, "sparse_keys": sparse_keys, "dense_keys": dense_keys, "sparse_types": sparse_types, "dense_shapes": dense_shapes}
 	opspec := tf.OpSpec{
-		Type: "LinSpace",
+		Type: "ParseSingleExample",
 		Input: []tf.Input{
-			start, stop, num,
+			serialized, tf.OutputList(dense_defaults),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if sparse_indices, idx, err = makeOutputList(op, idx, "sparse_indices"); err != nil {
+		scope.UpdateErr("ParseSingleExample", err)
+		return
+	}
+	if sparse_values, idx, err = makeOutputList(op, idx, "sparse_values"); err != nil {
+		scope.UpdateErr("ParseSingleExample", err)
+		return
+	}
+	if sparse_shapes, idx, err = makeOutputList(op, idx, "sparse_shapes"); err != nil {
+		scope.UpdateErr("ParseSingleExample", err)
+		return
+	}
+	if dense_values, idx, err = makeOutputList(op, idx, "dense_values"); err != nil {
+		scope.UpdateErr("ParseSingleExample", err)
+		return
+	}
+	return sparse_indices, sparse_values, sparse_shapes, dense_values
 }
 
-// ComplexAttr is an optional argument to Complex.
-type ComplexAttr func(optionalAttr)
+// WholeFileReaderV2Attr is an optional argument to WholeFileReaderV2.
+type WholeFileReaderV2Attr func(optionalAttr)
 
-// ComplexTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_COMPLEX64
-func ComplexTout(value tf.DataType) ComplexAttr {
+// WholeFileReaderV2Container sets the optional container attribute to value.
+//
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func WholeFileReaderV2Container(value string) WholeFileReaderV2Attr {
 	return func(m optionalAttr) {
-		m["Tout"] = value
+		m["container"] = value
 	}
 }
 
-// Converts two real numbers to a complex number.
-//
-// Given a tensor `real` representing the real part of a complex number, and a
-// tensor `imag` representing the imaginary part of a complex number, this
-// operation returns complex numbers elementwise of the form \\(a + bj\\), where
-// *a* represents the `real` part and *b* represents the `imag` part.
+// WholeFileReaderV2SharedName sets the optional shared_name attribute to value.
 //
-// The input tensors `real` and `imag` must have the same shape.
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func WholeFileReaderV2SharedName(value string) WholeFileReaderV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// A Reader that outputs the entire contents of a file as a value.
 //
-// For example:
+// To use, enqueue filenames in a Queue.  The output of ReaderRead will
+// be a filename (key) and the contents of that file (value).
 //
-// ```
-// # tensor 'real' is [2.25, 3.25]
-// # tensor `imag` is [4.75, 5.75]
-// tf.complex(real, imag) ==> [[2.25 + 4.75j], [3.25 + 5.75j]]
-// ```
-func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAttr) (out tf.Output) {
+// Returns The handle to reference the Reader.
+func WholeFileReaderV2(scope *Scope, optional ...WholeFileReaderV2Attr) (reader_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -24638,52 +24407,30 @@ func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAt
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Complex",
-		Input: []tf.Input{
-			real, imag,
-		},
+		Type: "WholeFileReaderV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ImagAttr is an optional argument to Imag.
-type ImagAttr func(optionalAttr)
-
-// ImagTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func ImagTout(value tf.DataType) ImagAttr {
-	return func(m optionalAttr) {
-		m["Tout"] = value
-	}
-}
-
-// Returns the imaginary part of a complex number.
-//
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// type `float` that is the imaginary part of each element in `input`. All
-// elements in `input` must be complex numbers of the form \\(a + bj\\), where *a*
-// is the real part and *b* is the imaginary part returned by this operation.
+// Pop the element at the top of the stack.
 //
-// For example:
+// Arguments:
+//	handle: The handle to a stack.
+//	elem_type: The type of the elem that is popped.
 //
-// ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.imag(input) ==> [4.75, 5.75]
-// ```
-func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output) {
+// Returns The tensor that is popped from the top of the stack.
+func StackPopV2(scope *Scope, handle tf.Output, elem_type tf.DataType) (elem tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"elem_type": elem_type}
 	opspec := tf.OpSpec{
-		Type: "Imag",
+		Type: "StackPopV2",
 		Input: []tf.Input{
-			input,
+			handle,
 		},
 		Attrs: attrs,
 	}
@@ -24691,13 +24438,13 @@ func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output
 	return op.Output(0)
 }
 
-// Computes hyperbolic tangent of `x` element-wise.
-func Tanh(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes hyperbolic cosine of x element-wise.
+func Cosh(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Tanh",
+		Type: "Cosh",
 		Input: []tf.Input{
 			x,
 		},
@@ -24706,60 +24453,123 @@ func Tanh(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// Computes the maximum along segments of a tensor.
+// Computes the mean along sparse segments of a tensor.
+//
+// Like `SparseSegmentMean`, but allows missing ids in `segment_ids`. If an id is
+// misisng, the `output` tensor at that position will be zeroed.
 //
 // Read
 // [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
 // for an explanation of segments.
 //
-// Computes a tensor such that
-// \\(output_i = \max_j(data_j)\\) where `max` is over `j` such
-// that `segment_ids[j] == i`.
-//
-// If the max is empty for a given segment ID `i`, `output[i] = 0`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMax.png" alt>
-// </div>
-//
 // Arguments:
 //
-//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//	num_segments: Should equal the number of distinct segment IDs.
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+// Returns Has same shape as data, except for dimension 0 which has size
+// `num_segments`.
+func SparseSegmentMeanWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SegmentMax",
+		Type: "SparseSegmentMeanWithNumSegments",
 		Input: []tf.Input{
-			data, segment_ids,
+			data, indices, segment_ids, num_segments,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a dataset that skips `count` elements from the `input_dataset`.
-//
-// Arguments:
-//
-//	count: A scalar representing the number of elements from the `input_dataset`
-// that should be skipped.  If count is -1, skips everything.
+// CudnnRNNParamsSizeAttr is an optional argument to CudnnRNNParamsSize.
+type CudnnRNNParamsSizeAttr func(optionalAttr)
+
+// CudnnRNNParamsSizeRnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNParamsSizeRnnMode(value string) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["rnn_mode"] = value
+	}
+}
+
+// CudnnRNNParamsSizeInputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNParamsSizeInputMode(value string) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["input_mode"] = value
+	}
+}
+
+// CudnnRNNParamsSizeDirection sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNParamsSizeDirection(value string) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["direction"] = value
+	}
+}
+
+// CudnnRNNParamsSizeDropout sets the optional dropout attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNParamsSizeDropout(value float32) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["dropout"] = value
+	}
+}
+
+// CudnnRNNParamsSizeSeed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNParamsSizeSeed(value int64) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// CudnnRNNParamsSizeSeed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNParamsSizeSeed2(value int64) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Computes size of weights that can be used by a Cudnn RNN model.
 //
+// Return the params size that can be used by the Cudnn RNN model. Subsequent
+// weight allocation and initialization should use this size.
 //
-func SkipDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// num_layers: Specifies the number of layers in the RNN model.
+// num_units: Specifies the size of the hidden state.
+// input_size: Specifies the size of the input state.
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicate whether there is a linear projection between the input and
+//   The actual computation before the first layer. 'skip_input' is only allowed
+//   when input_size == num_units; 'auto_select' implies 'skip_input' when
+//   input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used.
+//   dir = (direction == bidirectional) ? 2 : 1
+// dropout: dropout probability. When set to 0., dropout is disabled.
+// seed: the 1st part of a seed to initialize dropout.
+// seed2: the 2nd part of a seed to initialize dropout.
+// params_size: The size of the params buffer that should be allocated and
+//   initialized for this RNN model. Note that this params buffer may not be
+//   compatible across GPUs. Please use CudnnRNNParamsWeights and
+//   CudnnRNNParamsBiases to save and restore them in a way that is compatible
+//   across different runs.
+func CudnnRNNParamsSize(scope *Scope, num_layers tf.Output, num_units tf.Output, input_size tf.Output, T tf.DataType, S tf.DataType, optional ...CudnnRNNParamsSizeAttr) (params_size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"T": T, "S": S}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SkipDataset",
+		Type: "CudnnRNNParamsSize",
 		Input: []tf.Input{
-			input_dataset, count,
+			num_layers, num_units, input_size,
 		},
 		Attrs: attrs,
 	}
@@ -24767,207 +24577,196 @@ func SkipDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_
 	return op.Output(0)
 }
 
-// RealAttr is an optional argument to Real.
-type RealAttr func(optionalAttr)
-
-// RealTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func RealTout(value tf.DataType) RealAttr {
-	return func(m optionalAttr) {
-		m["Tout"] = value
+// Computes gradients for SparseSegmentMean.
+//
+// Returns tensor "output" with same shape as grad, except for dimension 0 whose
+// value is output_dim0.
+//
+// Arguments:
+//	grad: gradient propagated to the SparseSegmentMean op.
+//	indices: indices passed to the corresponding SparseSegmentMean op.
+//	segment_ids: segment_ids passed to the corresponding SparseSegmentMean op.
+//	output_dim0: dimension 0 of "data" passed to SparseSegmentMean op.
+func SparseSegmentMeanGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseSegmentMeanGrad",
+		Input: []tf.Input{
+			grad, indices, segment_ids, output_dim0,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Returns the real part of a complex number.
+// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
 //
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// type `float` that is the real part of each element in `input`. All elements in
-// `input` must be complex numbers of the form \\(a + bj\\), where *a* is the real
-//  part returned by this operation and *b* is the imaginary part.
+// N is the size of the segment being reduced.
 //
-// For example:
+// Read
+// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+// for an explanation of segments.
 //
-// ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.real(input) ==> [-2.25, 3.25]
-// ```
-func Real(scope *Scope, input tf.Output, optional ...RealAttr) (output tf.Output) {
+// Arguments:
+//
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentSqrtN(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Real",
+		Type: "SparseSegmentSqrtN",
 		Input: []tf.Input{
-			input,
+			data, indices, segment_ids,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Sends `input` to all devices that are connected to the output.
+// Compute the upper regularized incomplete Gamma function `Q(a, x)`.
 //
-// Sends `input` to all devices that are connected to the output.
+// The upper regularized incomplete Gamma function is defined as:
 //
-// The graph should be constructed so that all ops connected to the output have a
-// valid device assignment, and the op itself is assigned one of these devices.
+// \\(Q(a, x) = Gamma(a, x) / Gamma(a) = 1 - P(a, x)\\)
 //
-// input: The input to the broadcast.
-// output: The same as input.
-// shape: The shape of the input tensor.
+// where
 //
-func NcclBroadcast(scope *Scope, input tf.Output, shape tf.Shape) (output tf.Output) {
+// \\(Gamma(a, x) = int_{x}^{\infty} t^{a-1} exp(-t) dt\\)
+//
+// is the upper incomplete Gama function.
+//
+// Note, above `P(a, x)` (`Igamma`) is the lower regularized complete
+// Gamma function.
+func Igammac(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"shape": shape}
 	opspec := tf.OpSpec{
-		Type: "NcclBroadcast",
+		Type: "Igammac",
 		Input: []tf.Input{
-			input,
+			a, x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResizeAreaAttr is an optional argument to ResizeArea.
-type ResizeAreaAttr func(optionalAttr)
-
-// ResizeAreaAlignCorners sets the optional align_corners attribute to value.
+// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
 //
-// value: If true, the centers of the 4 corner pixels of the input and output tensors are
-// aligned, preserving the values at the corner pixels. Defaults to false.
-// If not specified, defaults to false
-func ResizeAreaAlignCorners(value bool) ResizeAreaAttr {
-	return func(m optionalAttr) {
-		m["align_corners"] = value
+// N is the size of the segment being reduced.
+//
+// Like `SparseSegmentSqrtN`, but allows missing ids in `segment_ids`. If an id is
+// misisng, the `output` tensor at that position will be zeroed.
+//
+// Read
+// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+// for an explanation of segments.
+//
+// Arguments:
+//
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//	num_segments: Should equal the number of distinct segment IDs.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentSqrtNWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseSegmentSqrtNWithNumSegments",
+		Input: []tf.Input{
+			data, indices, segment_ids, num_segments,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Resize `images` to `size` using area interpolation.
-//
-// Input images can be of different types but output images are always float.
-//
-// The range of pixel values for the output image might be slightly different
-// from the range for the input image because of limited numerical precision.
-// To guarantee an output range, for example `[0.0, 1.0]`, apply
-// `tf.clip_by_value` to the output.
+// Computes gradients for SparseSegmentSqrtN.
 //
-// Each output pixel is computed by first transforming the pixel's footprint into
-// the input tensor and then averaging the pixels that intersect the footprint. An
-// input pixel's contribution to the average is weighted by the fraction of its
-// area that intersects the footprint.  This is the same as OpenCV's INTER_AREA.
+// Returns tensor "output" with same shape as grad, except for dimension 0 whose
+// value is output_dim0.
 //
 // Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
-//
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func ResizeArea(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeAreaAttr) (resized_images tf.Output) {
+//	grad: gradient propagated to the SparseSegmentSqrtN op.
+//	indices: indices passed to the corresponding SparseSegmentSqrtN op.
+//	segment_ids: segment_ids passed to the corresponding SparseSegmentSqrtN op.
+//	output_dim0: dimension 0 of "data" passed to SparseSegmentSqrtN op.
+func SparseSegmentSqrtNGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResizeArea",
+		Type: "SparseSegmentSqrtNGrad",
 		Input: []tf.Input{
-			images, size,
+			grad, indices, segment_ids, output_dim0,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// VarHandleOpAttr is an optional argument to VarHandleOp.
-type VarHandleOpAttr func(optionalAttr)
+// LRNGradAttr is an optional argument to LRNGrad.
+type LRNGradAttr func(optionalAttr)
 
-// VarHandleOpContainer sets the optional container attribute to value.
+// LRNGradDepthRadius sets the optional depth_radius attribute to value.
 //
-// value: the container this variable is placed in.
-// If not specified, defaults to ""
-func VarHandleOpContainer(value string) VarHandleOpAttr {
+// value: A depth radius.
+// If not specified, defaults to 5
+func LRNGradDepthRadius(value int64) LRNGradAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["depth_radius"] = value
 	}
 }
 
-// VarHandleOpSharedName sets the optional shared_name attribute to value.
+// LRNGradBias sets the optional bias attribute to value.
 //
-// value: the name by which this variable is referred to.
-// If not specified, defaults to ""
-func VarHandleOpSharedName(value string) VarHandleOpAttr {
+// value: An offset (usually > 0 to avoid dividing by 0).
+// If not specified, defaults to 1
+func LRNGradBias(value float32) LRNGradAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["bias"] = value
 	}
 }
 
-// Creates a handle to a Variable resource.
+// LRNGradAlpha sets the optional alpha attribute to value.
 //
-// Arguments:
-//	dtype: the type of this variable. Must agree with the dtypes
-// of all ops using this variable.
-//	shape: The (possibly partially specified) shape of this variable.
-func VarHandleOp(scope *Scope, dtype tf.DataType, shape tf.Shape, optional ...VarHandleOpAttr) (resource tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "VarHandleOp",
-
-		Attrs: attrs,
+// value: A scale factor, usually positive.
+// If not specified, defaults to 1
+func LRNGradAlpha(value float32) LRNGradAttr {
+	return func(m optionalAttr) {
+		m["alpha"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// AngleAttr is an optional argument to Angle.
-type AngleAttr func(optionalAttr)
-
-// AngleTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func AngleTout(value tf.DataType) AngleAttr {
+// LRNGradBeta sets the optional beta attribute to value.
+//
+// value: An exponent.
+// If not specified, defaults to 0.5
+func LRNGradBeta(value float32) LRNGradAttr {
 	return func(m optionalAttr) {
-		m["Tout"] = value
+		m["beta"] = value
 	}
 }
 
-// Returns the argument of a complex number.
-//
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// type `float` that is the argument of each element in `input`. All elements in
-// `input` must be complex numbers of the form \\(a + bj\\), where *a*
-// is the real part and *b* is the imaginary part.
-//
-// The argument returned by this operation is of the form \\(atan2(b, a)\\).
-//
-// For example:
+// Gradients for Local Response Normalization.
 //
-// ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.angle(input) ==> [2.0132, 1.056]
-// ```
+// Arguments:
+//	input_grads: 4-D with shape `[batch, height, width, channels]`.
+//	input_image: 4-D with shape `[batch, height, width, channels]`.
+//	output_image: 4-D with shape `[batch, height, width, channels]`.
 //
-// @compatibility(numpy)
-// Equivalent to np.angle.
-// @end_compatibility
-func Angle(scope *Scope, input tf.Output, optional ...AngleAttr) (output tf.Output) {
+// Returns The gradients for LRN.
+func LRNGrad(scope *Scope, input_grads tf.Output, input_image tf.Output, output_image tf.Output, optional ...LRNGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -24976,9 +24775,9 @@ func Angle(scope *Scope, input tf.Output, optional ...AngleAttr) (output tf.Outp
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Angle",
+		Type: "LRNGrad",
 		Input: []tf.Input{
-			input,
+			input_grads, input_image, output_image,
 		},
 		Attrs: attrs,
 	}
@@ -24986,129 +24785,75 @@ func Angle(scope *Scope, input tf.Output, optional ...AngleAttr) (output tf.Outp
 	return op.Output(0)
 }
 
-// Clips tensor values to a specified min and max.
-//
-// Given a tensor `t`, this operation returns a tensor of the same type and
-// shape as `t` with its values clipped to `clip_value_min` and `clip_value_max`.
-// Any values less than `clip_value_min` are set to `clip_value_min`. Any values
-// greater than `clip_value_max` are set to `clip_value_max`.
-//
-// Arguments:
-//	t: A `Tensor`.
-//	clip_value_min: A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
-// as `t`. The minimum value to clip by.
-//	clip_value_max: A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
-// as `t`. The maximum value to clip by.
+// AnyAttr is an optional argument to Any.
+type AnyAttr func(optionalAttr)
+
+// AnyKeepDims sets the optional keep_dims attribute to value.
 //
-// Returns A clipped `Tensor` with the same shape as input 't'.
-func ClipByValue(scope *Scope, t tf.Output, clip_value_min tf.Output, clip_value_max tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ClipByValue",
-		Input: []tf.Input{
-			t, clip_value_min, clip_value_max,
-		},
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func AnyKeepDims(value bool) AnyAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Counts the number of occurrences of each value in an integer array.
-//
-// Outputs a vector with length `size` and the same dtype as `weights`. If
-// `weights` are empty, then index `i` stores the number of times the value `i` is
-// counted in `arr`. If `weights` are non-empty, then index `i` stores the sum of
-// the value in `weights` at each index where the corresponding value in `arr` is
-// `i`.
+// Computes the "logical or" of elements across dimensions of a tensor.
 //
-// Values in `arr` outside of the range [0, size) are ignored.
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
-//	arr: int32 `Tensor`.
-//	size: non-negative int32 scalar `Tensor`.
-//	weights: is an int32, int64, float32, or float64 `Tensor` with the same
-// shape as `arr`, or a length-0 `Tensor`, in which case it acts as all weights
-// equal to 1.
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
 //
-// Returns 1D `Tensor` with length equal to `size`. The counts or summed weights for
-// each value in the range [0, size).
-func Bincount(scope *Scope, arr tf.Output, size tf.Output, weights tf.Output) (bins tf.Output) {
+// Returns The reduced tensor.
+func Any(scope *Scope, input tf.Output, axis tf.Output, optional ...AnyAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Bincount",
+		Type: "Any",
 		Input: []tf.Input{
-			arr, size, weights,
+			input, axis,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// CumsumAttr is an optional argument to Cumsum.
-type CumsumAttr func(optionalAttr)
-
-// CumsumExclusive sets the optional exclusive attribute to value.
-//
-// value: If `True`, perform exclusive cumsum.
-// If not specified, defaults to false
-func CumsumExclusive(value bool) CumsumAttr {
-	return func(m optionalAttr) {
-		m["exclusive"] = value
-	}
-}
+// DestroyResourceOpAttr is an optional argument to DestroyResourceOp.
+type DestroyResourceOpAttr func(optionalAttr)
 
-// CumsumReverse sets the optional reverse attribute to value.
+// DestroyResourceOpIgnoreLookupError sets the optional ignore_lookup_error attribute to value.
 //
-// value: A `bool` (default: False).
-// If not specified, defaults to false
-func CumsumReverse(value bool) CumsumAttr {
+// value: whether to ignore the error when the resource
+// doesn't exist.
+// If not specified, defaults to true
+func DestroyResourceOpIgnoreLookupError(value bool) DestroyResourceOpAttr {
 	return func(m optionalAttr) {
-		m["reverse"] = value
-	}
-}
-
-// Compute the cumulative sum of the tensor `x` along `axis`.
-//
-// By default, this op performs an inclusive cumsum, which means that the first
-// element of the input is identical to the first element of the output:
-//
-// ```python
-// tf.cumsum([a, b, c])  # => [a, a + b, a + b + c]
-// ```
-//
-// By setting the `exclusive` kwarg to `True`, an exclusive cumsum is
-// performed instead:
-//
-// ```python
-// tf.cumsum([a, b, c], exclusive=True)  # => [0, a, a + b]
-// ```
-//
-// By setting the `reverse` kwarg to `True`, the cumsum is performed in the
-// opposite direction:
-//
-// ```python
-// tf.cumsum([a, b, c], reverse=True)  # => [a + b + c, b + c, c]
-// ```
-//
-// This is more efficient than using separate `tf.reverse` ops.
-//
-// The `reverse` and `exclusive` kwargs can also be combined:
+		m["ignore_lookup_error"] = value
+	}
+}
+
+// Deletes the resource specified by the handle.
 //
-// ```python
-// tf.cumsum([a, b, c], exclusive=True, reverse=True)  # => [b + c, c, 0]
-// ```
+// All subsequent operations using the resource will result in a NotFound
+// error status.
 //
 // Arguments:
-//	x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
-// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
-// `complex128`, `qint8`, `quint8`, `qint32`, `half`.
-//	axis: A `Tensor` of type `int32` (default: 0). Must be in the range
-// `[-rank(x), rank(x))`.
-func Cumsum(scope *Scope, x tf.Output, axis tf.Output, optional ...CumsumAttr) (out tf.Output) {
+//	resource: handle to the resource to delete.
+//
+// Returns the created operation.
+func DestroyResourceOp(scope *Scope, resource tf.Output, optional ...DestroyResourceOpAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -25117,77 +24862,75 @@ func Cumsum(scope *Scope, x tf.Output, axis tf.Output, optional ...CumsumAttr) (
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Cumsum",
+		Type: "DestroyResourceOp",
 		Input: []tf.Input{
-			x, axis,
+			resource,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// CumprodAttr is an optional argument to Cumprod.
-type CumprodAttr func(optionalAttr)
-
-// CumprodExclusive sets the optional exclusive attribute to value.
+// Generates values in an interval.
 //
-// value: If `True`, perform exclusive cumprod.
-// If not specified, defaults to false
-func CumprodExclusive(value bool) CumprodAttr {
-	return func(m optionalAttr) {
-		m["exclusive"] = value
+// A sequence of `num` evenly-spaced values are generated beginning at `start`.
+// If `num > 1`, the values in the sequence increase by `stop - start / num - 1`,
+// so that the last one is exactly `stop`.
+//
+// For example:
+//
+// ```
+// tf.linspace(10.0, 12.0, 3, name="linspace") => [ 10.0  11.0  12.0]
+// ```
+//
+// Arguments:
+//	start: 0-D tensor. First entry in the range.
+//	stop: 0-D tensor. Last entry in the range.
+//	num: 0-D tensor. Number of values to generate.
+//
+// Returns 1-D. The generated values.
+func LinSpace(scope *Scope, start tf.Output, stop tf.Output, num tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "LinSpace",
+		Input: []tf.Input{
+			start, stop, num,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// CumprodReverse sets the optional reverse attribute to value.
-//
-// value: A `bool` (default: False).
-// If not specified, defaults to false
-func CumprodReverse(value bool) CumprodAttr {
+// ComplexAttr is an optional argument to Complex.
+type ComplexAttr func(optionalAttr)
+
+// ComplexTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_COMPLEX64
+func ComplexTout(value tf.DataType) ComplexAttr {
 	return func(m optionalAttr) {
-		m["reverse"] = value
+		m["Tout"] = value
 	}
 }
 
-// Compute the cumulative product of the tensor `x` along `axis`.
-//
-// By default, this op performs an inclusive cumprod, which means that the first
-// element of the input is identical to the first element of the output:
-//
-// ```python
-// tf.cumprod([a, b, c])  # => [a, a * b, a * b * c]
-// ```
+// Converts two real numbers to a complex number.
 //
-// By setting the `exclusive` kwarg to `True`, an exclusive cumprod is
-// performed instead:
+// Given a tensor `real` representing the real part of a complex number, and a
+// tensor `imag` representing the imaginary part of a complex number, this
+// operation returns complex numbers elementwise of the form \\(a + bj\\), where
+// *a* represents the `real` part and *b* represents the `imag` part.
 //
-// ```python
-// tf.cumprod([a, b, c], exclusive=True)  # => [1, a, a * b]
-// ```
+// The input tensors `real` and `imag` must have the same shape.
 //
-// By setting the `reverse` kwarg to `True`, the cumprod is performed in the
-// opposite direction:
+// For example:
 //
-// ```python
-// tf.cumprod([a, b, c], reverse=True)  # => [a * b * c, b * c, c]
 // ```
-//
-// This is more efficient than using separate `tf.reverse` ops.
-//
-// The `reverse` and `exclusive` kwargs can also be combined:
-//
-// ```python
-// tf.cumprod([a, b, c], exclusive=True, reverse=True)  # => [b * c, c, 1]
+// # tensor 'real' is [2.25, 3.25]
+// # tensor `imag` is [4.75, 5.75]
+// tf.complex(real, imag) ==> [[2.25 + 4.75j], [3.25 + 5.75j]]
 // ```
-//
-// Arguments:
-//	x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
-// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
-// `complex128`, `qint8`, `quint8`, `qint32`, `half`.
-//	axis: A `Tensor` of type `int32` (default: 0). Must be in the range
-// `[-rank(x), rank(x))`.
-func Cumprod(scope *Scope, x tf.Output, axis tf.Output, optional ...CumprodAttr) (out tf.Output) {
+func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAttr) (out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -25196,9 +24939,9 @@ func Cumprod(scope *Scope, x tf.Output, axis tf.Output, optional ...CumprodAttr)
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Cumprod",
+		Type: "Complex",
 		Input: []tf.Input{
-			x, axis,
+			real, imag,
 		},
 		Attrs: attrs,
 	}
@@ -25206,65 +24949,31 @@ func Cumprod(scope *Scope, x tf.Output, axis tf.Output, optional ...CumprodAttr)
 	return op.Output(0)
 }
 
-// QuantizedMatMulAttr is an optional argument to QuantizedMatMul.
-type QuantizedMatMulAttr func(optionalAttr)
-
-// QuantizedMatMulToutput sets the optional Toutput attribute to value.
-// If not specified, defaults to DT_QINT32
-func QuantizedMatMulToutput(value tf.DataType) QuantizedMatMulAttr {
-	return func(m optionalAttr) {
-		m["Toutput"] = value
-	}
-}
-
-// QuantizedMatMulTransposeA sets the optional transpose_a attribute to value.
-//
-// value: If true, `a` is transposed before multiplication.
-// If not specified, defaults to false
-func QuantizedMatMulTransposeA(value bool) QuantizedMatMulAttr {
-	return func(m optionalAttr) {
-		m["transpose_a"] = value
-	}
-}
-
-// QuantizedMatMulTransposeB sets the optional transpose_b attribute to value.
-//
-// value: If true, `b` is transposed before multiplication.
-// If not specified, defaults to false
-func QuantizedMatMulTransposeB(value bool) QuantizedMatMulAttr {
-	return func(m optionalAttr) {
-		m["transpose_b"] = value
-	}
-}
+// ImagAttr is an optional argument to Imag.
+type ImagAttr func(optionalAttr)
 
-// QuantizedMatMulTactivation sets the optional Tactivation attribute to value.
-//
-// value: The type of output produced by activation function
-// following this operation.
-// If not specified, defaults to DT_QUINT8
-func QuantizedMatMulTactivation(value tf.DataType) QuantizedMatMulAttr {
+// ImagTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func ImagTout(value tf.DataType) ImagAttr {
 	return func(m optionalAttr) {
-		m["Tactivation"] = value
+		m["Tout"] = value
 	}
 }
 
-// Perform a quantized matrix multiplication of  `a` by the matrix `b`.
+// Returns the imaginary part of a complex number.
 //
-// The inputs must be two-dimensional matrices and the inner dimension of
-// `a` (after being transposed if `transpose_a` is non-zero) must match the
-// outer dimension of `b` (after being transposed if `transposed_b` is
-// non-zero).
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// type `float` that is the imaginary part of each element in `input`. All
+// elements in `input` must be complex numbers of the form \\(a + bj\\), where *a*
+// is the real part and *b* is the imaginary part returned by this operation.
 //
-// Arguments:
-//	a: Must be a two-dimensional tensor.
-//	b: Must be a two-dimensional tensor.
-//	min_a: The float value that the lowest quantized `a` value represents.
-//	max_a: The float value that the highest quantized `a` value represents.
-//	min_b: The float value that the lowest quantized `b` value represents.
-//	max_b: The float value that the highest quantized `b` value represents.
+// For example:
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedMatMul(scope *Scope, a tf.Output, b tf.Output, min_a tf.Output, max_a tf.Output, min_b tf.Output, max_b tf.Output, optional ...QuantizedMatMulAttr) (out tf.Output, min_out tf.Output, max_out tf.Output) {
+// ```
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.imag(input) ==> [4.75, 5.75]
+// ```
+func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -25273,158 +24982,117 @@ func QuantizedMatMul(scope *Scope, a tf.Output, b tf.Output, min_a tf.Output, ma
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedMatMul",
+		Type: "Imag",
 		Input: []tf.Input{
-			a, b, min_a, max_a, min_b, max_b,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Does nothing. Serves as a control trigger for scheduling.
-//
-// Only useful as a placeholder for control edges.
-//
-// Returns the created operation.
-func ControlTrigger(scope *Scope) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ControlTrigger",
-	}
-	return scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Batch normalization.
-//
-// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
-//
-// This op is deprecated. Prefer `tf.nn.batch_normalization`.
-//
-// Arguments:
-//	t: A 4D input Tensor.
-//	m: A 1D mean Tensor with size matching the last dimension of t.
-// This is the first output from tf.nn.moments,
-// or a saved moving average thereof.
-//	v: A 1D variance Tensor with size matching the last dimension of t.
-// This is the second output from tf.nn.moments,
-// or a saved moving average thereof.
-//	beta: A 1D beta Tensor with size matching the last dimension of t.
-// An offset to be added to the normalized tensor.
-//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
-// If "scale_after_normalization" is true, this tensor will be multiplied
-// with the normalized tensor.
-//	variance_epsilon: A small float number to avoid dividing by 0.
-//	scale_after_normalization: A bool indicating whether the resulted tensor
-// needs to be multiplied with gamma.
-func BatchNormWithGlobalNormalization(scope *Scope, t tf.Output, m tf.Output, v tf.Output, beta tf.Output, gamma tf.Output, variance_epsilon float32, scale_after_normalization bool) (result tf.Output) {
+// Computes hyperbolic tangent of `x` element-wise.
+func Tanh(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
 	opspec := tf.OpSpec{
-		Type: "BatchNormWithGlobalNormalization",
+		Type: "Tanh",
 		Input: []tf.Input{
-			t, m, v, beta, gamma,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Deprecated. Use TensorArrayReadV3
+// Computes the maximum along segments of a tensor.
+//
+// Read
+// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+// for an explanation of segments.
+//
+// Computes a tensor such that
+// \\(output_i = \max_j(data_j)\\) where `max` is over `j` such
+// that `segment_ids[j] == i`.
+//
+// If the max is empty for a given segment ID `i`, `output[i] = 0`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMax.png" alt>
+// </div>
 //
-// DEPRECATED at GraphDef version 26: Use TensorArrayReadV3
-func TensorArrayReadV2(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) {
+// Arguments:
+//
+//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayReadV2",
+		Type: "SegmentMax",
 		Input: []tf.Input{
-			handle, index, flow_in,
+			data, segment_ids,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QuantizedMulAttr is an optional argument to QuantizedMul.
-type QuantizedMulAttr func(optionalAttr)
-
-// QuantizedMulToutput sets the optional Toutput attribute to value.
-// If not specified, defaults to DT_QINT32
-func QuantizedMulToutput(value tf.DataType) QuantizedMulAttr {
-	return func(m optionalAttr) {
-		m["Toutput"] = value
-	}
-}
-
-// Returns x * y element-wise, working on quantized buffers.
+// Creates a dataset that skips `count` elements from the `input_dataset`.
 //
 // Arguments:
 //
+//	count: A scalar representing the number of elements from the `input_dataset`
+// that should be skipped.  If count is -1, skips everything.
 //
-//	min_x: The float value that the lowest quantized `x` value represents.
-//	max_x: The float value that the highest quantized `x` value represents.
-//	min_y: The float value that the lowest quantized `y` value represents.
-//	max_y: The float value that the highest quantized `y` value represents.
-//
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
 //
-// *NOTE*: `QuantizedMul` supports limited forms of broadcasting. More about
-// broadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func QuantizedMul(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x tf.Output, min_y tf.Output, max_y tf.Output, optional ...QuantizedMulAttr) (z tf.Output, min_z tf.Output, max_z tf.Output) {
+func SkipDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "QuantizedMul",
+		Type: "SkipDataset",
 		Input: []tf.Input{
-			x, y, min_x, max_x, min_y, max_y,
+			input_dataset, count,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// QuantizedAddAttr is an optional argument to QuantizedAdd.
-type QuantizedAddAttr func(optionalAttr)
+// RealAttr is an optional argument to Real.
+type RealAttr func(optionalAttr)
 
-// QuantizedAddToutput sets the optional Toutput attribute to value.
-// If not specified, defaults to DT_QINT32
-func QuantizedAddToutput(value tf.DataType) QuantizedAddAttr {
+// RealTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func RealTout(value tf.DataType) RealAttr {
 	return func(m optionalAttr) {
-		m["Toutput"] = value
+		m["Tout"] = value
 	}
 }
 
-// Returns x + y element-wise, working on quantized buffers.
-//
-// Arguments:
-//
+// Returns the real part of a complex number.
 //
-//	min_x: The float value that the lowest quantized `x` value represents.
-//	max_x: The float value that the highest quantized `x` value represents.
-//	min_y: The float value that the lowest quantized `y` value represents.
-//	max_y: The float value that the highest quantized `y` value represents.
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// type `float` that is the real part of each element in `input`. All elements in
+// `input` must be complex numbers of the form \\(a + bj\\), where *a* is the real
+//  part returned by this operation and *b* is the imaginary part.
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+// For example:
 //
-// *NOTE*: `QuantizedAdd` supports limited forms of broadcasting. More about
-// broadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func QuantizedAdd(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x tf.Output, min_y tf.Output, max_y tf.Output, optional ...QuantizedAddAttr) (z tf.Output, min_z tf.Output, max_z tf.Output) {
+// ```
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.real(input) ==> [-2.25, 3.25]
+// ```
+func Real(scope *Scope, input tf.Output, optional ...RealAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -25433,241 +25101,185 @@ func QuantizedAdd(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedAdd",
+		Type: "Real",
 		Input: []tf.Input{
-			x, y, min_x, max_x, min_y, max_y,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Given a quantized tensor described by (input, input_min, input_max), outputs a
+// Sends `input` to all devices that are connected to the output.
 //
-// range that covers the actual values present in that tensor.  This op is
-// typically used to produce the requested_output_min and requested_output_max for
-// Requantize.
+// Sends `input` to all devices that are connected to the output.
 //
-// Arguments:
+// The graph should be constructed so that all ops connected to the output have a
+// valid device assignment, and the op itself is assigned one of these devices.
 //
-//	input_min: The float value that the minimum quantized input value represents.
-//	input_max: The float value that the maximum quantized input value represents.
+// input: The input to the broadcast.
+// output: The same as input.
+// shape: The shape of the input tensor.
 //
-// Returns The computed min output.the computed max output.
-func RequantizationRange(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output) (output_min tf.Output, output_max tf.Output) {
+func NcclBroadcast(scope *Scope, input tf.Output, shape tf.Shape) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"shape": shape}
 	opspec := tf.OpSpec{
-		Type: "RequantizationRange",
+		Type: "NcclBroadcast",
 		Input: []tf.Input{
-			input, input_min, input_max,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Rolls the elements of a tensor along an axis.
-//
-// The elements are shifted positively (towards larger indices) by the offset of
-// `shift` along the dimension of `axis`. Negative `shift` values will shift
-// elements in the opposite direction. Elements that roll passed the last position
-// will wrap around to the first and vice versa. Multiple shifts along multiple
-// axes may be specified.
+// ResizeAreaAttr is an optional argument to ResizeArea.
+type ResizeAreaAttr func(optionalAttr)
+
+// ResizeAreaAlignCorners sets the optional align_corners attribute to value.
 //
-// For example:
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
+// If not specified, defaults to false
+func ResizeAreaAlignCorners(value bool) ResizeAreaAttr {
+	return func(m optionalAttr) {
+		m["align_corners"] = value
+	}
+}
+
+// Resize `images` to `size` using area interpolation.
 //
-// ```
-// # 't' is [0, 1, 2, 3, 4]
-// roll(t, shift=2, axis=0) ==> [3, 4, 0, 1, 2]
+// Input images can be of different types but output images are always float.
 //
-// # shifting along multiple dimensions
-// # 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
-// roll(t, shift=[1, -2], axis=[0, 1]) ==> [[7, 8, 9, 5, 6], [2, 3, 4, 0, 1]]
+// The range of pixel values for the output image might be slightly different
+// from the range for the input image because of limited numerical precision.
+// To guarantee an output range, for example `[0.0, 1.0]`, apply
+// `tf.clip_by_value` to the output.
 //
-// # shifting along the same axis multiple times
-// # 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
-// roll(t, shift=[2, -3], axis=[1, 1]) ==> [[1, 2, 3, 4, 0], [6, 7, 8, 9, 5]]
-// ```
+// Each output pixel is computed by first transforming the pixel's footprint into
+// the input tensor and then averaging the pixels that intersect the footprint. An
+// input pixel's contribution to the average is weighted by the fraction of its
+// area that intersects the footprint.  This is the same as OpenCV's INTER_AREA.
 //
 // Arguments:
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
 //
-//	shift: Dimension must be 0-D or 1-D. `shift[i]` specifies the number of places by which
-// elements are shifted positively (towards larger indices) along the dimension
-// specified by `axis[i]`. Negative shifts will roll the elements in the opposite
-// direction.
-//	axis: Dimension must be 0-D or 1-D. `axis[i]` specifies the dimension that the shift
-// `shift[i]` should occur. If the same axis is referenced more than once, the
-// total shift for that axis will be the sum of all the shifts that belong to that
-// axis.
-//
-// Returns Has the same shape and size as the input. The elements are shifted
-// positively (towards larger indices) by the offsets of `shift` along the
-// dimensions of `axis`.
-func Roll(scope *Scope, input tf.Output, shift tf.Output, axis tf.Output) (output tf.Output) {
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeArea(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeAreaAttr) (resized_images tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Roll",
+		Type: "ResizeArea",
 		Input: []tf.Input{
-			input, shift, axis,
+			images, size,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MapPeekAttr is an optional argument to MapPeek.
-type MapPeekAttr func(optionalAttr)
-
-// MapPeekCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapPeekCapacity(value int64) MapPeekAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
+// VarHandleOpAttr is an optional argument to VarHandleOp.
+type VarHandleOpAttr func(optionalAttr)
 
-// MapPeekMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// VarHandleOpContainer sets the optional container attribute to value.
 //
-// REQUIRES: value >= 0
-func MapPeekMemoryLimit(value int64) MapPeekAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// MapPeekContainer sets the optional container attribute to value.
+// value: the container this variable is placed in.
 // If not specified, defaults to ""
-func MapPeekContainer(value string) MapPeekAttr {
+func VarHandleOpContainer(value string) VarHandleOpAttr {
 	return func(m optionalAttr) {
 		m["container"] = value
 	}
 }
 
-// MapPeekSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapPeekSharedName(value string) MapPeekAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op peeks at the values at the specified key.  If the
+// VarHandleOpSharedName sets the optional shared_name attribute to value.
 //
-// underlying container does not contain this key
-// this op will block until it does.
-func MapPeek(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...MapPeekAttr) (values []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MapPeek",
-		Input: []tf.Input{
-			key, indices,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("MapPeek", err)
-		return
+// value: the name by which this variable is referred to.
+// If not specified, defaults to ""
+func VarHandleOpSharedName(value string) VarHandleOpAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
 	}
-	return values
 }
 
-// Looks up keys in a table, outputs the corresponding values.
-//
-// The tensor `keys` must of the same type as the keys of the table.
-// The output `values` is of the type of the table values.
-//
-// The scalar `default_value` is the value output for keys not present in the
-// table. It must also be of the same type as the table values.
+// Creates a handle to a Variable resource.
 //
 // Arguments:
-//	table_handle: Handle to the table.
-//	keys: Any shape.  Keys to look up.
-//
-//
-// Returns Same shape as `keys`.  Values found in the table, or `default_values`
-// for missing keys.
-func LookupTableFindV2(scope *Scope, table_handle tf.Output, keys tf.Output, default_value tf.Output) (values tf.Output) {
+//	dtype: the type of this variable. Must agree with the dtypes
+// of all ops using this variable.
+//	shape: The (possibly partially specified) shape of this variable.
+func VarHandleOp(scope *Scope, dtype tf.DataType, shape tf.Shape, optional ...VarHandleOpAttr) (resource tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "LookupTableFindV2",
-		Input: []tf.Input{
-			table_handle, keys, default_value,
-		},
+		Type: "VarHandleOp",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Updates the table to associates keys with values.
-//
-// The tensor `keys` must be of the same type as the keys of the table.
-// The tensor `values` must be of the type of the table values.
-//
-// Arguments:
-//	table_handle: Handle to the table.
-//	keys: Any shape.  Keys to look up.
-//	values: Values to associate with keys.
-//
-// Returns the created operation.
-func LookupTableInsertV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "LookupTableInsertV2",
-		Input: []tf.Input{
-			table_handle, keys, values,
-		},
+// AngleAttr is an optional argument to Angle.
+type AngleAttr func(optionalAttr)
+
+// AngleTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func AngleTout(value tf.DataType) AngleAttr {
+	return func(m optionalAttr) {
+		m["Tout"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// Creates a dataset that batches and pads `batch_size` elements from the input.
+// Returns the argument of a complex number.
 //
-// Arguments:
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// type `float` that is the argument of each element in `input`. All elements in
+// `input` must be complex numbers of the form \\(a + bj\\), where *a*
+// is the real part and *b* is the imaginary part.
 //
-//	batch_size: A scalar representing the number of elements to accumulate in a
-// batch.
-//	padded_shapes: A list of int64 tensors representing the desired padded shapes
-// of the corresponding output components. These shapes may be partially
-// specified, using `-1` to indicate that a particular dimension should be
-// padded to the maximum size of all batch elements.
-//	padding_values: A list of scalars containing the padding value to use for
-// each of the outputs.
-//	drop_remainder: A scalar representing whether the last batch should be dropped in case its size
-// is smaller than desired.
+// The argument returned by this operation is of the form \\(atan2(b, a)\\).
 //
-func PaddedBatchDatasetV2(scope *Scope, input_dataset tf.Output, batch_size tf.Output, padded_shapes []tf.Output, padding_values []tf.Output, drop_remainder tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
+// For example:
+//
+// ```
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.angle(input) ==> [2.0132, 1.056]
+// ```
+//
+// @compatibility(numpy)
+// Equivalent to np.angle.
+// @end_compatibility
+func Angle(scope *Scope, input tf.Output, optional ...AngleAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_shapes": output_shapes}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "PaddedBatchDatasetV2",
+		Type: "Angle",
 		Input: []tf.Input{
-			input_dataset, batch_size, tf.OutputList(padded_shapes), tf.OutputList(padding_values), drop_remainder,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -25675,66 +25287,129 @@ func PaddedBatchDatasetV2(scope *Scope, input_dataset tf.Output, batch_size tf.O
 	return op.Output(0)
 }
 
-// Returns element-wise smallest integer not less than x.
-func Ceil(scope *Scope, x tf.Output) (y tf.Output) {
+// Clips tensor values to a specified min and max.
+//
+// Given a tensor `t`, this operation returns a tensor of the same type and
+// shape as `t` with its values clipped to `clip_value_min` and `clip_value_max`.
+// Any values less than `clip_value_min` are set to `clip_value_min`. Any values
+// greater than `clip_value_max` are set to `clip_value_max`.
+//
+// Arguments:
+//	t: A `Tensor`.
+//	clip_value_min: A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
+// as `t`. The minimum value to clip by.
+//	clip_value_max: A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
+// as `t`. The maximum value to clip by.
+//
+// Returns A clipped `Tensor` with the same shape as input 't'.
+func ClipByValue(scope *Scope, t tf.Output, clip_value_min tf.Output, clip_value_max tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Ceil",
+		Type: "ClipByValue",
 		Input: []tf.Input{
-			x,
+			t, clip_value_min, clip_value_max,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the number of elements in the given table.
+// Counts the number of occurrences of each value in an integer array.
+//
+// Outputs a vector with length `size` and the same dtype as `weights`. If
+// `weights` are empty, then index `i` stores the number of times the value `i` is
+// counted in `arr`. If `weights` are non-empty, then index `i` stores the sum of
+// the value in `weights` at each index where the corresponding value in `arr` is
+// `i`.
+//
+// Values in `arr` outside of the range [0, size) are ignored.
 //
 // Arguments:
-//	table_handle: Handle to the table.
+//	arr: int32 `Tensor`.
+//	size: non-negative int32 scalar `Tensor`.
+//	weights: is an int32, int64, float32, or float64 `Tensor` with the same
+// shape as `arr`, or a length-0 `Tensor`, in which case it acts as all weights
+// equal to 1.
 //
-// Returns Scalar that contains number of elements in the table.
-func LookupTableSizeV2(scope *Scope, table_handle tf.Output) (size tf.Output) {
+// Returns 1D `Tensor` with length equal to `size`. The counts or summed weights for
+// each value in the range [0, size).
+func Bincount(scope *Scope, arr tf.Output, size tf.Output, weights tf.Output) (bins tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LookupTableSizeV2",
+		Type: "Bincount",
 		Input: []tf.Input{
-			table_handle,
+			arr, size, weights,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResizeBilinearGradAttr is an optional argument to ResizeBilinearGrad.
-type ResizeBilinearGradAttr func(optionalAttr)
+// CumsumAttr is an optional argument to Cumsum.
+type CumsumAttr func(optionalAttr)
 
-// ResizeBilinearGradAlignCorners sets the optional align_corners attribute to value.
+// CumsumExclusive sets the optional exclusive attribute to value.
 //
-// value: If true, the centers of the 4 corner pixels of the input and grad tensors are
-// aligned. Defaults to false.
+// value: If `True`, perform exclusive cumsum.
 // If not specified, defaults to false
-func ResizeBilinearGradAlignCorners(value bool) ResizeBilinearGradAttr {
+func CumsumExclusive(value bool) CumsumAttr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["exclusive"] = value
 	}
 }
 
-// Computes the gradient of bilinear interpolation.
+// CumsumReverse sets the optional reverse attribute to value.
 //
-// Arguments:
-//	grads: 4-D with shape `[batch, height, width, channels]`.
-//	original_image: 4-D with shape `[batch, orig_height, orig_width, channels]`,
-// The image tensor that was resized.
+// value: A `bool` (default: False).
+// If not specified, defaults to false
+func CumsumReverse(value bool) CumsumAttr {
+	return func(m optionalAttr) {
+		m["reverse"] = value
+	}
+}
+
+// Compute the cumulative sum of the tensor `x` along `axis`.
 //
-// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`.
-// Gradients with respect to the input image. Input image must have been
-// float or double.
-func ResizeBilinearGrad(scope *Scope, grads tf.Output, original_image tf.Output, optional ...ResizeBilinearGradAttr) (output tf.Output) {
+// By default, this op performs an inclusive cumsum, which means that the first
+// element of the input is identical to the first element of the output:
+//
+// ```python
+// tf.cumsum([a, b, c])  # => [a, a + b, a + b + c]
+// ```
+//
+// By setting the `exclusive` kwarg to `True`, an exclusive cumsum is
+// performed instead:
+//
+// ```python
+// tf.cumsum([a, b, c], exclusive=True)  # => [0, a, a + b]
+// ```
+//
+// By setting the `reverse` kwarg to `True`, the cumsum is performed in the
+// opposite direction:
+//
+// ```python
+// tf.cumsum([a, b, c], reverse=True)  # => [a + b + c, b + c, c]
+// ```
+//
+// This is more efficient than using separate `tf.reverse` ops.
+//
+// The `reverse` and `exclusive` kwargs can also be combined:
+//
+// ```python
+// tf.cumsum([a, b, c], exclusive=True, reverse=True)  # => [b + c, c, 0]
+// ```
+//
+// Arguments:
+//	x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
+// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
+// `complex128`, `qint8`, `quint8`, `qint32`, `half`.
+//	axis: A `Tensor` of type `int32` (default: 0). Must be in the range
+// `[-rank(x), rank(x))`.
+func Cumsum(scope *Scope, x tf.Output, axis tf.Output, optional ...CumsumAttr) (out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -25743,9 +25418,9 @@ func ResizeBilinearGrad(scope *Scope, grads tf.Output, original_image tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResizeBilinearGrad",
+		Type: "Cumsum",
 		Input: []tf.Input{
-			grads, original_image,
+			x, axis,
 		},
 		Attrs: attrs,
 	}
@@ -25753,235 +25428,296 @@ func ResizeBilinearGrad(scope *Scope, grads tf.Output, original_image tf.Output,
 	return op.Output(0)
 }
 
-// Outputs all keys and values in the table.
-//
-// Arguments:
-//	table_handle: Handle to the table.
-//
-//
+// Return the shape of s0 op s1 with broadcast.
 //
-// Returns Vector of all keys present in the table.Tensor of all values in the table. Indexed in parallel with `keys`.
-func LookupTableExportV2(scope *Scope, table_handle tf.Output, Tkeys tf.DataType, Tvalues tf.DataType) (keys tf.Output, values tf.Output) {
+// Given `s0` and `s1`, tensors that represent shapes, compute `r0`, the
+// broadcasted shape. `s0`, `s1` and `r0` are all integer vectors.
+func BroadcastArgs(scope *Scope, s0 tf.Output, s1 tf.Output) (r0 tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"Tkeys": Tkeys, "Tvalues": Tvalues}
 	opspec := tf.OpSpec{
-		Type: "LookupTableExportV2",
+		Type: "BroadcastArgs",
 		Input: []tf.Input{
-			table_handle,
+			s0, s1,
 		},
-		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// DataFormatDimMapAttr is an optional argument to DataFormatDimMap.
+type DataFormatDimMapAttr func(optionalAttr)
+
+// DataFormatDimMapSrcFormat sets the optional src_format attribute to value.
+//
+// value: source data format.
+// If not specified, defaults to "NHWC"
+func DataFormatDimMapSrcFormat(value string) DataFormatDimMapAttr {
+	return func(m optionalAttr) {
+		m["src_format"] = value
+	}
+}
+
+// DataFormatDimMapDstFormat sets the optional dst_format attribute to value.
+//
+// value: destination data format.
+// If not specified, defaults to "NCHW"
+func DataFormatDimMapDstFormat(value string) DataFormatDimMapAttr {
+	return func(m optionalAttr) {
+		m["dst_format"] = value
+	}
 }
 
-// Replaces the contents of the table with the specified keys and values.
+// Returns the dimension index in the destination data format given the one in
 //
-// The tensor `keys` must be of the same type as the keys of the table.
-// The tensor `values` must be of the type of the table values.
+// the source data format.
 //
 // Arguments:
-//	table_handle: Handle to the table.
-//	keys: Any shape.  Keys to look up.
-//	values: Values to associate with keys.
+//	x: A Tensor with each element as a dimension index in source data format.
+// Must be in the range [-4, 4).
 //
-// Returns the created operation.
-func LookupTableImportV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
+// Returns A Tensor with each element as a dimension index in destination data format.
+func DataFormatDimMap(scope *Scope, x tf.Output, optional ...DataFormatDimMapAttr) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "LookupTableImportV2",
+		Type: "DataFormatDimMap",
 		Input: []tf.Input{
-			table_handle, keys, values,
+			x,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MapUnstageNoKeyAttr is an optional argument to MapUnstageNoKey.
-type MapUnstageNoKeyAttr func(optionalAttr)
+// CumprodAttr is an optional argument to Cumprod.
+type CumprodAttr func(optionalAttr)
 
-// MapUnstageNoKeyCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// CumprodExclusive sets the optional exclusive attribute to value.
 //
-// REQUIRES: value >= 0
-func MapUnstageNoKeyCapacity(value int64) MapUnstageNoKeyAttr {
+// value: If `True`, perform exclusive cumprod.
+// If not specified, defaults to false
+func CumprodExclusive(value bool) CumprodAttr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["exclusive"] = value
 	}
 }
 
-// MapUnstageNoKeyMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// CumprodReverse sets the optional reverse attribute to value.
 //
-// REQUIRES: value >= 0
-func MapUnstageNoKeyMemoryLimit(value int64) MapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// MapUnstageNoKeyContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func MapUnstageNoKeyContainer(value string) MapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// MapUnstageNoKeySharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapUnstageNoKeySharedName(value string) MapUnstageNoKeyAttr {
+// value: A `bool` (default: False).
+// If not specified, defaults to false
+func CumprodReverse(value bool) CumprodAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["reverse"] = value
 	}
 }
 
-// Op removes and returns a random (key, value)
+// Compute the cumulative product of the tensor `x` along `axis`.
 //
-// from the underlying container.   If the underlying container
-// does not contain elements, the op will block until it does.
-func MapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataType, optional ...MapUnstageNoKeyAttr) (key tf.Output, values []tf.Output) {
+// By default, this op performs an inclusive cumprod, which means that the first
+// element of the input is identical to the first element of the output:
+//
+// ```python
+// tf.cumprod([a, b, c])  # => [a, a * b, a * b * c]
+// ```
+//
+// By setting the `exclusive` kwarg to `True`, an exclusive cumprod is
+// performed instead:
+//
+// ```python
+// tf.cumprod([a, b, c], exclusive=True)  # => [1, a, a * b]
+// ```
+//
+// By setting the `reverse` kwarg to `True`, the cumprod is performed in the
+// opposite direction:
+//
+// ```python
+// tf.cumprod([a, b, c], reverse=True)  # => [a * b * c, b * c, c]
+// ```
+//
+// This is more efficient than using separate `tf.reverse` ops.
+//
+// The `reverse` and `exclusive` kwargs can also be combined:
+//
+// ```python
+// tf.cumprod([a, b, c], exclusive=True, reverse=True)  # => [b * c, c, 1]
+// ```
+//
+// Arguments:
+//	x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
+// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
+// `complex128`, `qint8`, `quint8`, `qint32`, `half`.
+//	axis: A `Tensor` of type `int32` (default: 0). Must be in the range
+// `[-rank(x), rank(x))`.
+func Cumprod(scope *Scope, x tf.Output, axis tf.Output, optional ...CumprodAttr) (out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MapUnstageNoKey",
+		Type: "Cumprod",
 		Input: []tf.Input{
-			indices,
+			x, axis,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	key = op.Output(idx)
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("MapUnstageNoKey", err)
-		return
-	}
-	return key, values
+	return op.Output(0)
 }
 
-// HashTableV2Attr is an optional argument to HashTableV2.
-type HashTableV2Attr func(optionalAttr)
+// QuantizedMatMulAttr is an optional argument to QuantizedMatMul.
+type QuantizedMatMulAttr func(optionalAttr)
 
-// HashTableV2Container sets the optional container attribute to value.
-//
-// value: If non-empty, this table is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func HashTableV2Container(value string) HashTableV2Attr {
+// QuantizedMatMulToutput sets the optional Toutput attribute to value.
+// If not specified, defaults to DT_QINT32
+func QuantizedMatMulToutput(value tf.DataType) QuantizedMatMulAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["Toutput"] = value
 	}
 }
 
-// HashTableV2SharedName sets the optional shared_name attribute to value.
+// QuantizedMatMulTransposeA sets the optional transpose_a attribute to value.
 //
-// value: If non-empty, this table is shared under the given name across
-// multiple sessions.
-// If not specified, defaults to ""
-func HashTableV2SharedName(value string) HashTableV2Attr {
+// value: If true, `a` is transposed before multiplication.
+// If not specified, defaults to false
+func QuantizedMatMulTransposeA(value bool) QuantizedMatMulAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["transpose_a"] = value
 	}
 }
 
-// HashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
+// QuantizedMatMulTransposeB sets the optional transpose_b attribute to value.
 //
-// value: If true and shared_name is empty, the table is shared
-// using the node name.
+// value: If true, `b` is transposed before multiplication.
 // If not specified, defaults to false
-func HashTableV2UseNodeNameSharing(value bool) HashTableV2Attr {
+func QuantizedMatMulTransposeB(value bool) QuantizedMatMulAttr {
 	return func(m optionalAttr) {
-		m["use_node_name_sharing"] = value
+		m["transpose_b"] = value
 	}
 }
 
-// Creates a non-initialized hash table.
+// QuantizedMatMulTactivation sets the optional Tactivation attribute to value.
 //
-// This op creates a hash table, specifying the type of its keys and values.
-// Before using the table you will have to initialize it.  After initialization the
-// table will be immutable.
+// value: The type of output produced by activation function
+// following this operation.
+// If not specified, defaults to DT_QUINT8
+func QuantizedMatMulTactivation(value tf.DataType) QuantizedMatMulAttr {
+	return func(m optionalAttr) {
+		m["Tactivation"] = value
+	}
+}
+
+// Perform a quantized matrix multiplication of  `a` by the matrix `b`.
+//
+// The inputs must be two-dimensional matrices and the inner dimension of
+// `a` (after being transposed if `transpose_a` is non-zero) must match the
+// outer dimension of `b` (after being transposed if `transposed_b` is
+// non-zero).
 //
 // Arguments:
-//	key_dtype: Type of the table keys.
-//	value_dtype: Type of the table values.
+//	a: Must be a two-dimensional tensor.
+//	b: Must be a two-dimensional tensor.
+//	min_a: The float value that the lowest quantized `a` value represents.
+//	max_a: The float value that the highest quantized `a` value represents.
+//	min_b: The float value that the lowest quantized `b` value represents.
+//	max_b: The float value that the highest quantized `b` value represents.
 //
-// Returns Handle to a table.
-func HashTableV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...HashTableV2Attr) (table_handle tf.Output) {
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedMatMul(scope *Scope, a tf.Output, b tf.Output, min_a tf.Output, max_a tf.Output, min_b tf.Output, max_b tf.Output, optional ...QuantizedMatMulAttr) (out tf.Output, min_out tf.Output, max_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "HashTableV2",
-
+		Type: "QuantizedMatMul",
+		Input: []tf.Input{
+			a, b, min_a, max_a, min_b, max_b,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// MultiDeviceIteratorFromStringHandleAttr is an optional argument to MultiDeviceIteratorFromStringHandle.
-type MultiDeviceIteratorFromStringHandleAttr func(optionalAttr)
-
-// MultiDeviceIteratorFromStringHandleOutputTypes sets the optional output_types attribute to value.
+// Does nothing. Serves as a control trigger for scheduling.
 //
-// value: The type list for the return values.
-// If not specified, defaults to <>
+// Only useful as a placeholder for control edges.
 //
-// REQUIRES: len(value) >= 0
-func MultiDeviceIteratorFromStringHandleOutputTypes(value []tf.DataType) MultiDeviceIteratorFromStringHandleAttr {
-	return func(m optionalAttr) {
-		m["output_types"] = value
+// Returns the created operation.
+func ControlTrigger(scope *Scope) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
 	}
+	opspec := tf.OpSpec{
+		Type: "ControlTrigger",
+	}
+	return scope.AddOperation(opspec)
 }
 
-// MultiDeviceIteratorFromStringHandleOutputShapes sets the optional output_shapes attribute to value.
+// Batch normalization.
 //
-// value: The list of shapes being produced.
-// If not specified, defaults to <>
+// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
 //
-// REQUIRES: len(value) >= 0
-func MultiDeviceIteratorFromStringHandleOutputShapes(value []tf.Shape) MultiDeviceIteratorFromStringHandleAttr {
-	return func(m optionalAttr) {
-		m["output_shapes"] = value
+// This op is deprecated. Prefer `tf.nn.batch_normalization`.
+//
+// Arguments:
+//	t: A 4D input Tensor.
+//	m: A 1D mean Tensor with size matching the last dimension of t.
+// This is the first output from tf.nn.moments,
+// or a saved moving average thereof.
+//	v: A 1D variance Tensor with size matching the last dimension of t.
+// This is the second output from tf.nn.moments,
+// or a saved moving average thereof.
+//	beta: A 1D beta Tensor with size matching the last dimension of t.
+// An offset to be added to the normalized tensor.
+//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
+// If "scale_after_normalization" is true, this tensor will be multiplied
+// with the normalized tensor.
+//	variance_epsilon: A small float number to avoid dividing by 0.
+//	scale_after_normalization: A bool indicating whether the resulted tensor
+// needs to be multiplied with gamma.
+func BatchNormWithGlobalNormalization(scope *Scope, t tf.Output, m tf.Output, v tf.Output, beta tf.Output, gamma tf.Output, variance_epsilon float32, scale_after_normalization bool) (result tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
+	opspec := tf.OpSpec{
+		Type: "BatchNormWithGlobalNormalization",
+		Input: []tf.Input{
+			t, m, v, beta, gamma,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Generates a MultiDeviceIterator resource from its provided string handle.
-//
-// Arguments:
-//	string_handle: String representing the resource.
+// Deprecated. Use TensorArrayReadV3
 //
-// Returns A MultiDeviceIterator resource.
-func MultiDeviceIteratorFromStringHandle(scope *Scope, string_handle tf.Output, optional ...MultiDeviceIteratorFromStringHandleAttr) (multi_device_iterator tf.Output) {
+// DEPRECATED at GraphDef version 26: Use TensorArrayReadV3
+func TensorArrayReadV2(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "MultiDeviceIteratorFromStringHandle",
+		Type: "TensorArrayReadV2",
 		Input: []tf.Input{
-			string_handle,
+			handle, index, flow_in,
 		},
 		Attrs: attrs,
 	}
@@ -25989,162 +25725,76 @@ func MultiDeviceIteratorFromStringHandle(scope *Scope, string_handle tf.Output,
 	return op.Output(0)
 }
 
-// MutableHashTableV2Attr is an optional argument to MutableHashTableV2.
-type MutableHashTableV2Attr func(optionalAttr)
+// QuantizedMulAttr is an optional argument to QuantizedMul.
+type QuantizedMulAttr func(optionalAttr)
 
-// MutableHashTableV2Container sets the optional container attribute to value.
-//
-// value: If non-empty, this table is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func MutableHashTableV2Container(value string) MutableHashTableV2Attr {
+// QuantizedMulToutput sets the optional Toutput attribute to value.
+// If not specified, defaults to DT_QINT32
+func QuantizedMulToutput(value tf.DataType) QuantizedMulAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["Toutput"] = value
 	}
 }
 
-// MutableHashTableV2SharedName sets the optional shared_name attribute to value.
+// Returns x * y element-wise, working on quantized buffers.
 //
-// value: If non-empty, this table is shared under the given name across
-// multiple sessions.
-// If not specified, defaults to ""
-func MutableHashTableV2SharedName(value string) MutableHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// MutableHashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
+// Arguments:
 //
-// value: If true and shared_name is empty, the table is shared
-// using the node name.
-// If not specified, defaults to false
-func MutableHashTableV2UseNodeNameSharing(value bool) MutableHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["use_node_name_sharing"] = value
-	}
-}
-
-// Creates an empty hash table.
 //
-// This op creates a mutable hash table, specifying the type of its keys and
-// values. Each value must be a scalar. Data can be inserted into the table using
-// the insert operations. It does not support the initialization operation.
+//	min_x: The float value that the lowest quantized `x` value represents.
+//	max_x: The float value that the highest quantized `x` value represents.
+//	min_y: The float value that the lowest quantized `y` value represents.
+//	max_y: The float value that the highest quantized `y` value represents.
 //
-// Arguments:
-//	key_dtype: Type of the table keys.
-//	value_dtype: Type of the table values.
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
 //
-// Returns Handle to a table.
-func MutableHashTableV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...MutableHashTableV2Attr) (table_handle tf.Output) {
+// *NOTE*: `QuantizedMul` supports limited forms of broadcasting. More about
+// broadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func QuantizedMul(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x tf.Output, min_y tf.Output, max_y tf.Output, optional ...QuantizedMulAttr) (z tf.Output, min_z tf.Output, max_z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MutableHashTableV2",
-
+		Type: "QuantizedMul",
+		Input: []tf.Input{
+			x, y, min_x, max_x, min_y, max_y,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// DequantizeAttr is an optional argument to Dequantize.
-type DequantizeAttr func(optionalAttr)
+// QuantizedAddAttr is an optional argument to QuantizedAdd.
+type QuantizedAddAttr func(optionalAttr)
 
-// DequantizeMode sets the optional mode attribute to value.
-// If not specified, defaults to "MIN_COMBINED"
-func DequantizeMode(value string) DequantizeAttr {
+// QuantizedAddToutput sets the optional Toutput attribute to value.
+// If not specified, defaults to DT_QINT32
+func QuantizedAddToutput(value tf.DataType) QuantizedAddAttr {
 	return func(m optionalAttr) {
-		m["mode"] = value
+		m["Toutput"] = value
 	}
 }
 
-// Dequantize the 'input' tensor into a float Tensor.
-//
-// [min_range, max_range] are scalar floats that specify the range for
-// the 'input' data. The 'mode' attribute controls exactly which calculations are
-// used to convert the float values to their quantized equivalents.
-//
-// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
-//
-// ```
-// if T == qint8: in[i] += (range(T) + 1)/ 2.0
-// out[i] = min_range + (in[i]* (max_range - min_range) / range(T))
-// ```
-// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
-//
-// *MIN_COMBINED Mode Example*
-//
-// If the input comes from a QuantizedRelu6, the output type is
-// quint8 (range of 0-255) but the possible range of QuantizedRelu6 is
-// 0-6.  The min_range and max_range values are therefore 0.0 and 6.0.
-// Dequantize on quint8 will take each value, cast to float, and multiply
-// by 6 / 255.
-// Note that if quantizedtype is qint8, the operation will additionally add
-// each value by 128 prior to casting.
-//
-// If the mode is 'MIN_FIRST', then this approach is used:
-//
-// ```c++
-// num_discrete_values = 1 << (# of bits in T)
-// range_adjust = num_discrete_values / (num_discrete_values - 1)
-// range = (range_max - range_min) * range_adjust
-// range_scale = range / num_discrete_values
-// const double offset_input = static_cast<double>(input) - lowest_quantized;
-// result = range_min + ((input - numeric_limits<T>::min()) * range_scale)
-// ```
-//
-// *SCALED mode Example*
-//
-// `SCALED` mode matches the quantization approach used in
-// `QuantizeAndDequantize{V2|V3}`.
-//
-// If the mode is `SCALED`, we do not use the full range of the output type,
-// choosing to elide the lowest possible value for symmetry (e.g., output range is
-// -127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to
-// 0.
-//
-// We first find the range of values in our tensor. The
-// range we use is always centered on 0, so we find m such that
-// ```c++
-//   m = max(abs(input_min), abs(input_max))
-// ```
-//
-// Our input tensor range is then `[-m, m]`.
-//
-// Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
-// If T is signed, this is
-// ```
-//   num_bits = sizeof(T) * 8
-//   [min_fixed, max_fixed] =
-//       [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]
-// ```
+// Returns x + y element-wise, working on quantized buffers.
 //
-// Otherwise, if T is unsigned, the fixed-point range is
-// ```
-//   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
-// ```
+// Arguments:
 //
-// From this we compute our scaling factor, s:
-// ```c++
-//   s = (2 * m) / (max_fixed - min_fixed)
-// ```
 //
-// Now we can dequantize the elements of our tensor:
-// ```c++
-// result = input * s
-// ```
+//	min_x: The float value that the lowest quantized `x` value represents.
+//	max_x: The float value that the highest quantized `x` value represents.
+//	min_y: The float value that the lowest quantized `y` value represents.
+//	max_y: The float value that the highest quantized `y` value represents.
 //
-// Arguments:
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
 //
-//	min_range: The minimum scalar value possibly produced for the input.
-//	max_range: The maximum scalar value possibly produced for the input.
-func Dequantize(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, optional ...DequantizeAttr) (output tf.Output) {
+// *NOTE*: `QuantizedAdd` supports limited forms of broadcasting. More about
+// broadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func QuantizedAdd(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x tf.Output, min_y tf.Output, max_y tf.Output, optional ...QuantizedAddAttr) (z tf.Output, min_z tf.Output, max_z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -26153,93 +25803,139 @@ func Dequantize(scope *Scope, input tf.Output, min_range tf.Output, max_range tf
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Dequantize",
+		Type: "QuantizedAdd",
 		Input: []tf.Input{
-			input, min_range, max_range,
+			x, y, min_x, max_x, min_y, max_y,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Flips all bits elementwise.
+// Given a quantized tensor described by (input, input_min, input_max), outputs a
 //
-// The result will have exactly those bits set, that are not set in `x`. The
-// computation is performed on the underlying representation of x.
-func Invert(scope *Scope, x tf.Output) (y tf.Output) {
+// range that covers the actual values present in that tensor.  This op is
+// typically used to produce the requested_output_min and requested_output_max for
+// Requantize.
+//
+// Arguments:
+//
+//	input_min: The float value that the minimum quantized input value represents.
+//	input_max: The float value that the maximum quantized input value represents.
+//
+// Returns The computed min output.the computed max output.
+func RequantizationRange(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output) (output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Invert",
+		Type: "RequantizationRange",
 		Input: []tf.Input{
-			x,
+			input, input_min, input_max,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Inverse 3D fast Fourier transform.
+// Rolls the elements of a tensor along an axis.
 //
-// Computes the inverse 3-dimensional discrete Fourier transform over the
-// inner-most 3 dimensions of `input`.
+// The elements are shifted positively (towards larger indices) by the offset of
+// `shift` along the dimension of `axis`. Negative `shift` values will shift
+// elements in the opposite direction. Elements that roll passed the last position
+// will wrap around to the first and vice versa. Multiple shifts along multiple
+// axes may be specified.
+//
+// For example:
+//
+// ```
+// # 't' is [0, 1, 2, 3, 4]
+// roll(t, shift=2, axis=0) ==> [3, 4, 0, 1, 2]
+//
+// # shifting along multiple dimensions
+// # 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
+// roll(t, shift=[1, -2], axis=[0, 1]) ==> [[7, 8, 9, 5, 6], [2, 3, 4, 0, 1]]
+//
+// # shifting along the same axis multiple times
+// # 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
+// roll(t, shift=[2, -3], axis=[1, 1]) ==> [[1, 2, 3, 4, 0], [6, 7, 8, 9, 5]]
+// ```
 //
 // Arguments:
-//	input: A complex64 tensor.
 //
-// Returns A complex64 tensor of the same shape as `input`. The inner-most 3
-//   dimensions of `input` are replaced with their inverse 3D Fourier transform.
+//	shift: Dimension must be 0-D or 1-D. `shift[i]` specifies the number of places by which
+// elements are shifted positively (towards larger indices) along the dimension
+// specified by `axis[i]`. Negative shifts will roll the elements in the opposite
+// direction.
+//	axis: Dimension must be 0-D or 1-D. `axis[i]` specifies the dimension that the shift
+// `shift[i]` should occur. If the same axis is referenced more than once, the
+// total shift for that axis will be the sum of all the shifts that belong to that
+// axis.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.ifftn with 3 dimensions.
-// @end_compatibility
-func IFFT3D(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns Has the same shape and size as the input. The elements are shifted
+// positively (towards larger indices) by the offsets of `shift` along the
+// dimensions of `axis`.
+func Roll(scope *Scope, input tf.Output, shift tf.Output, axis tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IFFT3D",
+		Type: "Roll",
 		Input: []tf.Input{
-			input,
+			input, shift, axis,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Deprecated. Disallowed in GraphDef version >= 2.
+// Looks up keys in a table, outputs the corresponding values.
 //
-// DEPRECATED at GraphDef version 2: Use AdjustContrastv2 instead
-func AdjustContrast(scope *Scope, images tf.Output, contrast_factor tf.Output, min_value tf.Output, max_value tf.Output) (output tf.Output) {
+// The tensor `keys` must of the same type as the keys of the table.
+// The output `values` is of the type of the table values.
+//
+// The scalar `default_value` is the value output for keys not present in the
+// table. It must also be of the same type as the table values.
+//
+// Arguments:
+//	table_handle: Handle to the table.
+//	keys: Any shape.  Keys to look up.
+//
+//
+// Returns Same shape as `keys`.  Values found in the table, or `default_values`
+// for missing keys.
+func LookupTableFindV2(scope *Scope, table_handle tf.Output, keys tf.Output, default_value tf.Output) (values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "AdjustContrast",
+		Type: "LookupTableFindV2",
 		Input: []tf.Input{
-			images, contrast_factor, min_value, max_value,
+			table_handle, keys, default_value,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Table initializer that takes two tensors for keys and values respectively.
+// Updates the table to associates keys with values.
+//
+// The tensor `keys` must be of the same type as the keys of the table.
+// The tensor `values` must be of the type of the table values.
 //
 // Arguments:
-//	table_handle: Handle to a table which will be initialized.
-//	keys: Keys of type Tkey.
-//	values: Values of type Tval.
+//	table_handle: Handle to the table.
+//	keys: Any shape.  Keys to look up.
+//	values: Values to associate with keys.
 //
 // Returns the created operation.
-func InitializeTableV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
+func LookupTableInsertV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "InitializeTableV2",
+		Type: "LookupTableInsertV2",
 		Input: []tf.Input{
 			table_handle, keys, values,
 		},
@@ -26247,60 +25943,30 @@ func InitializeTableV2(scope *Scope, table_handle tf.Output, keys tf.Output, val
 	return scope.AddOperation(opspec)
 }
 
-// PrintAttr is an optional argument to Print.
-type PrintAttr func(optionalAttr)
-
-// PrintMessage sets the optional message attribute to value.
-//
-// value: A string, prefix of the error message.
-// If not specified, defaults to ""
-func PrintMessage(value string) PrintAttr {
-	return func(m optionalAttr) {
-		m["message"] = value
-	}
-}
-
-// PrintFirstN sets the optional first_n attribute to value.
-//
-// value: Only log `first_n` number of times. -1 disables logging.
-// If not specified, defaults to -1
-func PrintFirstN(value int64) PrintAttr {
-	return func(m optionalAttr) {
-		m["first_n"] = value
-	}
-}
-
-// PrintSummarize sets the optional summarize attribute to value.
-//
-// value: Only print this many entries of each tensor.
-// If not specified, defaults to 3
-func PrintSummarize(value int64) PrintAttr {
-	return func(m optionalAttr) {
-		m["summarize"] = value
-	}
-}
-
-// Prints a list of tensors.
-//
-// Passes `input` through to `output` and prints `data` when evaluating.
+// Creates a dataset that batches and pads `batch_size` elements from the input.
 //
 // Arguments:
-//	input: The tensor passed to `output`
-//	data: A list of tensors to print out when op is evaluated.
 //
-// Returns = The unmodified `input` tensor
-func Print(scope *Scope, input tf.Output, data []tf.Output, optional ...PrintAttr) (output tf.Output) {
+//	batch_size: A scalar representing the number of elements to accumulate in a
+// batch.
+//	padded_shapes: A list of int64 tensors representing the desired padded shapes
+// of the corresponding output components. These shapes may be partially
+// specified, using `-1` to indicate that a particular dimension should be
+// padded to the maximum size of all batch elements.
+//	padding_values: A list of scalars containing the padding value to use for
+// each of the outputs.
+//	drop_remainder: A scalar representing whether the last batch should be dropped in case its size
+// is smaller than desired.
+//
+func PaddedBatchDatasetV2(scope *Scope, input_dataset tf.Output, batch_size tf.Output, padded_shapes []tf.Output, padding_values []tf.Output, drop_remainder tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "Print",
+		Type: "PaddedBatchDatasetV2",
 		Input: []tf.Input{
-			input, tf.OutputList(data),
+			input_dataset, batch_size, tf.OutputList(padded_shapes), tf.OutputList(padding_values), drop_remainder,
 		},
 		Attrs: attrs,
 	}
@@ -26308,334 +25974,340 @@ func Print(scope *Scope, input tf.Output, data []tf.Output, optional ...PrintAtt
 	return op.Output(0)
 }
 
-// Outputs a `Summary` protocol buffer with a tensor and per-plugin data.
-//
-// Arguments:
-//	tag: A string attached to this summary. Used for organization in TensorBoard.
-//	tensor: A tensor to serialize.
-//	serialized_summary_metadata: A serialized SummaryMetadata proto. Contains plugin
-// data.
-func TensorSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, serialized_summary_metadata tf.Output) (summary tf.Output) {
+// Returns element-wise smallest integer not less than x.
+func Ceil(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorSummaryV2",
+		Type: "Ceil",
 		Input: []tf.Input{
-			tag, tensor, serialized_summary_metadata,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a dataset that asynchronously prefetches elements from `input_dataset`.
+// Computes the number of elements in the given table.
 //
 // Arguments:
+//	table_handle: Handle to the table.
 //
-//	buffer_size: The maximum number of elements to buffer in an iterator over
-// this dataset.
-//
-//
-func PrefetchDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns Scalar that contains number of elements in the table.
+func LookupTableSizeV2(scope *Scope, table_handle tf.Output) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "PrefetchDataset",
+		Type: "LookupTableSizeV2",
 		Input: []tf.Input{
-			input_dataset, buffer_size,
+			table_handle,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TensorSummaryAttr is an optional argument to TensorSummary.
-type TensorSummaryAttr func(optionalAttr)
-
-// TensorSummaryDescription sets the optional description attribute to value.
-//
-// value: A json-encoded SummaryDescription proto.
-// If not specified, defaults to ""
-func TensorSummaryDescription(value string) TensorSummaryAttr {
-	return func(m optionalAttr) {
-		m["description"] = value
-	}
-}
-
-// TensorSummaryLabels sets the optional labels attribute to value.
-//
-// value: An unused list of strings.
-// If not specified, defaults to <>
-func TensorSummaryLabels(value []string) TensorSummaryAttr {
-	return func(m optionalAttr) {
-		m["labels"] = value
-	}
-}
+// ResizeBilinearGradAttr is an optional argument to ResizeBilinearGrad.
+type ResizeBilinearGradAttr func(optionalAttr)
 
-// TensorSummaryDisplayName sets the optional display_name attribute to value.
+// ResizeBilinearGradAlignCorners sets the optional align_corners attribute to value.
 //
-// value: An unused string.
-// If not specified, defaults to ""
-func TensorSummaryDisplayName(value string) TensorSummaryAttr {
+// value: If true, the centers of the 4 corner pixels of the input and grad tensors are
+// aligned. Defaults to false.
+// If not specified, defaults to false
+func ResizeBilinearGradAlignCorners(value bool) ResizeBilinearGradAttr {
 	return func(m optionalAttr) {
-		m["display_name"] = value
-	}
-}
-
-// Outputs a `Summary` protocol buffer with a tensor.
-//
-// This op is being phased out in favor of TensorSummaryV2, which lets callers pass
-// a tag as well as a serialized SummaryMetadata proto string that contains
-// plugin-specific data. We will keep this op to maintain backwards compatibility.
-//
-// Arguments:
-//	tensor: A tensor to serialize.
-func TensorSummary(scope *Scope, tensor tf.Output, optional ...TensorSummaryAttr) (summary tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorSummary",
-		Input: []tf.Input{
-			tensor,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Read an element from the TensorArray into output `value`.
-//
-// Arguments:
-//	handle: The handle to a TensorArray.
-//
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//	dtype: The type of the elem that is returned.
-//
-// Returns The tensor that is read from the TensorArray.
-func TensorArrayReadV3(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	opspec := tf.OpSpec{
-		Type: "TensorArrayReadV3",
-		Input: []tf.Input{
-			handle, index, flow_in,
-		},
-		Attrs: attrs,
+		m["align_corners"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Reduces sparse updates into the variable referenced by `resource` using the `max` operation.
-//
-// This operation computes
-//
-//     # Scalar indices
-//     ref[indices, ...] = max(ref[indices, ...], updates[...])
-//
-//     # Vector indices (for each i)
-//     ref[indices[i], ...] = max(ref[indices[i], ...], updates[i, ...])
-//
-//     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] = max(ref[indices[i, ..., j], ...], updates[i, ..., j, ...])
-//
-// Duplicate entries are handled correctly: if multiple `indices` reference
-// the same location, their contributions are combined.
-//
-// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
-// </div>
+// Computes the gradient of bilinear interpolation.
 //
 // Arguments:
-//	resource: Should be from a `Variable` node.
-//	indices: A tensor of indices into the first dimension of `ref`.
-//	updates: A tensor of updated values to add to `ref`.
+//	grads: 4-D with shape `[batch, height, width, channels]`.
+//	original_image: 4-D with shape `[batch, orig_height, orig_width, channels]`,
+// The image tensor that was resized.
 //
-// Returns the created operation.
-func ResourceScatterMax(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`.
+// Gradients with respect to the input image. Input image must have been
+// float or double.
+func ResizeBilinearGrad(scope *Scope, grads tf.Output, original_image tf.Output, optional ...ResizeBilinearGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterMax",
+		Type: "ResizeBilinearGrad",
 		Input: []tf.Input{
-			resource, indices, updates,
+			grads, original_image,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes the gradient for the tanh of `x` wrt its input.
+// Outputs all keys and values in the table.
 //
-// Specifically, `grad = dy * (1 - y*y)`, where `y = tanh(x)`, and `dy`
-// is the corresponding input gradient.
-func TanhGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+// Arguments:
+//	table_handle: Handle to the table.
+//
+//
+//
+// Returns Vector of all keys present in the table.Tensor of all values in the table. Indexed in parallel with `keys`.
+func LookupTableExportV2(scope *Scope, table_handle tf.Output, Tkeys tf.DataType, Tvalues tf.DataType) (keys tf.Output, values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"Tkeys": Tkeys, "Tvalues": Tvalues}
 	opspec := tf.OpSpec{
-		Type: "TanhGrad",
+		Type: "LookupTableExportV2",
 		Input: []tf.Input{
-			y, dy,
+			table_handle,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Outputs a `Summary` protocol buffer with scalar values.
+// Replaces the contents of the table with the specified keys and values.
 //
-// The input `tags` and `values` must have the same shape.  The generated summary
-// has a summary value for each tag-value pair in `tags` and `values`.
+// The tensor `keys` must be of the same type as the keys of the table.
+// The tensor `values` must be of the type of the table values.
 //
 // Arguments:
-//	tags: Tags for the summary.
-//	values: Same shape as `tags.  Values for the summary.
+//	table_handle: Handle to the table.
+//	keys: Any shape.  Keys to look up.
+//	values: Values to associate with keys.
 //
-// Returns Scalar.  Serialized `Summary` protocol buffer.
-func ScalarSummary(scope *Scope, tags tf.Output, values tf.Output) (summary tf.Output) {
+// Returns the created operation.
+func LookupTableImportV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ScalarSummary",
+		Type: "LookupTableImportV2",
 		Input: []tf.Input{
-			tags, values,
+			table_handle, keys, values,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Outputs a `Summary` protocol buffer with a histogram.
+// MultiDeviceIteratorFromStringHandleAttr is an optional argument to MultiDeviceIteratorFromStringHandle.
+type MultiDeviceIteratorFromStringHandleAttr func(optionalAttr)
+
+// MultiDeviceIteratorFromStringHandleOutputTypes sets the optional output_types attribute to value.
 //
-// The generated
-// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
-// has one summary value containing a histogram for `values`.
+// value: The type list for the return values.
+// If not specified, defaults to <>
 //
-// This op reports an `InvalidArgument` error if any value is not finite.
+// REQUIRES: len(value) >= 0
+func MultiDeviceIteratorFromStringHandleOutputTypes(value []tf.DataType) MultiDeviceIteratorFromStringHandleAttr {
+	return func(m optionalAttr) {
+		m["output_types"] = value
+	}
+}
+
+// MultiDeviceIteratorFromStringHandleOutputShapes sets the optional output_shapes attribute to value.
+//
+// value: The list of shapes being produced.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func MultiDeviceIteratorFromStringHandleOutputShapes(value []tf.Shape) MultiDeviceIteratorFromStringHandleAttr {
+	return func(m optionalAttr) {
+		m["output_shapes"] = value
+	}
+}
+
+// Generates a MultiDeviceIterator resource from its provided string handle.
 //
 // Arguments:
-//	tag: Scalar.  Tag to use for the `Summary.Value`.
-//	values: Any shape. Values to use to build the histogram.
+//	string_handle: String representing the resource.
 //
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func HistogramSummary(scope *Scope, tag tf.Output, values tf.Output) (summary tf.Output) {
+// Returns A MultiDeviceIterator resource.
+func MultiDeviceIteratorFromStringHandle(scope *Scope, string_handle tf.Output, optional ...MultiDeviceIteratorFromStringHandleAttr) (multi_device_iterator tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "HistogramSummary",
+		Type: "MultiDeviceIteratorFromStringHandle",
 		Input: []tf.Input{
-			tag, values,
+			string_handle,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the number of elements in the given queue.
+// MutableHashTableV2Attr is an optional argument to MutableHashTableV2.
+type MutableHashTableV2Attr func(optionalAttr)
+
+// MutableHashTableV2Container sets the optional container attribute to value.
+//
+// value: If non-empty, this table is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func MutableHashTableV2Container(value string) MutableHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MutableHashTableV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this table is shared under the given name across
+// multiple sessions.
+// If not specified, defaults to ""
+func MutableHashTableV2SharedName(value string) MutableHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// MutableHashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
+//
+// value: If true and shared_name is empty, the table is shared
+// using the node name.
+// If not specified, defaults to false
+func MutableHashTableV2UseNodeNameSharing(value bool) MutableHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["use_node_name_sharing"] = value
+	}
+}
+
+// Creates an empty hash table.
+//
+// This op creates a mutable hash table, specifying the type of its keys and
+// values. Each value must be a scalar. Data can be inserted into the table using
+// the insert operations. It does not support the initialization operation.
 //
 // Arguments:
-//	handle: The handle to a queue.
+//	key_dtype: Type of the table keys.
+//	value_dtype: Type of the table values.
 //
-// Returns The number of elements in the given queue.
-func QueueSizeV2(scope *Scope, handle tf.Output) (size tf.Output) {
+// Returns Handle to a table.
+func MutableHashTableV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...MutableHashTableV2Attr) (table_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "QueueSizeV2",
-		Input: []tf.Input{
-			handle,
-		},
+		Type: "MutableHashTableV2",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ImageSummaryAttr is an optional argument to ImageSummary.
-type ImageSummaryAttr func(optionalAttr)
+// DequantizeAttr is an optional argument to Dequantize.
+type DequantizeAttr func(optionalAttr)
 
-// ImageSummaryMaxImages sets the optional max_images attribute to value.
-//
-// value: Max number of batch elements to generate images for.
-// If not specified, defaults to 3
-//
-// REQUIRES: value >= 1
-func ImageSummaryMaxImages(value int64) ImageSummaryAttr {
+// DequantizeMode sets the optional mode attribute to value.
+// If not specified, defaults to "MIN_COMBINED"
+func DequantizeMode(value string) DequantizeAttr {
 	return func(m optionalAttr) {
-		m["max_images"] = value
+		m["mode"] = value
 	}
 }
 
-// ImageSummaryBadColor sets the optional bad_color attribute to value.
+// Dequantize the 'input' tensor into a float Tensor.
 //
-// value: Color to use for pixels with non-finite values.
-// If not specified, defaults to <dtype:DT_UINT8 tensor_shape:<dim:<size:4 > > int_val:255 int_val:0 int_val:0 int_val:255 >
-func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr {
-	return func(m optionalAttr) {
-		m["bad_color"] = value
-	}
-}
-
-// Outputs a `Summary` protocol buffer with images.
+// [min_range, max_range] are scalar floats that specify the range for
+// the 'input' data. The 'mode' attribute controls exactly which calculations are
+// used to convert the float values to their quantized equivalents.
 //
-// The summary has up to `max_images` summary values containing images. The
-// images are built from `tensor` which must be 4-D with shape `[batch_size,
-// height, width, channels]` and where `channels` can be:
+// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
 //
-// *  1: `tensor` is interpreted as Grayscale.
-// *  3: `tensor` is interpreted as RGB.
-// *  4: `tensor` is interpreted as RGBA.
+// ```
+// if T == qint8: in[i] += (range(T) + 1)/ 2.0
+// out[i] = min_range + (in[i]* (max_range - min_range) / range(T))
+// ```
+// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
 //
-// The images have the same number of channels as the input tensor. For float
-// input, the values are normalized one image at a time to fit in the range
-// `[0, 255]`.  `uint8` values are unchanged.  The op uses two different
-// normalization algorithms:
+// *MIN_COMBINED Mode Example*
 //
-// *  If the input values are all positive, they are rescaled so the largest one
-//    is 255.
+// If the input comes from a QuantizedRelu6, the output type is
+// quint8 (range of 0-255) but the possible range of QuantizedRelu6 is
+// 0-6.  The min_range and max_range values are therefore 0.0 and 6.0.
+// Dequantize on quint8 will take each value, cast to float, and multiply
+// by 6 / 255.
+// Note that if quantizedtype is qint8, the operation will additionally add
+// each value by 128 prior to casting.
 //
-// *  If any input value is negative, the values are shifted so input value 0.0
-//    is at 127.  They are then rescaled so that either the smallest value is 0,
-//    or the largest one is 255.
+// If the mode is 'MIN_FIRST', then this approach is used:
+//
+// ```c++
+// num_discrete_values = 1 << (# of bits in T)
+// range_adjust = num_discrete_values / (num_discrete_values - 1)
+// range = (range_max - range_min) * range_adjust
+// range_scale = range / num_discrete_values
+// const double offset_input = static_cast<double>(input) - lowest_quantized;
+// result = range_min + ((input - numeric_limits<T>::min()) * range_scale)
+// ```
+//
+// *SCALED mode Example*
+//
+// `SCALED` mode matches the quantization approach used in
+// `QuantizeAndDequantize{V2|V3}`.
+//
+// If the mode is `SCALED`, we do not use the full range of the output type,
+// choosing to elide the lowest possible value for symmetry (e.g., output range is
+// -127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to
+// 0.
+//
+// We first find the range of values in our tensor. The
+// range we use is always centered on 0, so we find m such that
+// ```c++
+//   m = max(abs(input_min), abs(input_max))
+// ```
+//
+// Our input tensor range is then `[-m, m]`.
 //
-// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-// build the `tag` of the summary values:
+// Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
+// If T is signed, this is
+// ```
+//   num_bits = sizeof(T) * 8
+//   [min_fixed, max_fixed] =
+//       [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]
+// ```
 //
-// *  If `max_images` is 1, the summary value tag is '*tag*/image'.
-// *  If `max_images` is greater than 1, the summary value tags are
-//    generated sequentially as '*tag*/image/0', '*tag*/image/1', etc.
+// Otherwise, if T is unsigned, the fixed-point range is
+// ```
+//   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
+// ```
 //
-// The `bad_color` argument is the color to use in the generated images for
-// non-finite input values.  It is a `uint8` 1-D tensor of length `channels`.
-// Each element must be in the range `[0, 255]` (It represents the value of a
-// pixel in the output image).  Non-finite values in the input tensor are
-// replaced by this tensor in the output image.  The default value is the color
-// red.
+// From this we compute our scaling factor, s:
+// ```c++
+//   s = (2 * m) / (max_fixed - min_fixed)
+// ```
+//
+// Now we can dequantize the elements of our tensor:
+// ```c++
+// result = input * s
+// ```
 //
 // Arguments:
-//	tag: Scalar. Used to build the `tag` attribute of the summary values.
-//	tensor: 4-D of shape `[batch_size, height, width, channels]` where
-// `channels` is 1, 3, or 4.
 //
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func ImageSummary(scope *Scope, tag tf.Output, tensor tf.Output, optional ...ImageSummaryAttr) (summary tf.Output) {
+//	min_range: The minimum scalar value possibly produced for the input.
+//	max_range: The maximum scalar value possibly produced for the input.
+func Dequantize(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, optional ...DequantizeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -26644,9 +26316,9 @@ func ImageSummary(scope *Scope, tag tf.Output, tensor tf.Output, optional ...Ima
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ImageSummary",
+		Type: "Dequantize",
 		Input: []tf.Input{
-			tag, tensor,
+			input, min_range, max_range,
 		},
 		Attrs: attrs,
 	}
@@ -26654,150 +26326,144 @@ func ImageSummary(scope *Scope, tag tf.Output, tensor tf.Output, optional ...Ima
 	return op.Output(0)
 }
 
-// AudioSummaryV2Attr is an optional argument to AudioSummaryV2.
-type AudioSummaryV2Attr func(optionalAttr)
-
-// AudioSummaryV2MaxOutputs sets the optional max_outputs attribute to value.
-//
-// value: Max number of batch elements to generate audio for.
-// If not specified, defaults to 3
+// Flips all bits elementwise.
 //
-// REQUIRES: value >= 1
-func AudioSummaryV2MaxOutputs(value int64) AudioSummaryV2Attr {
-	return func(m optionalAttr) {
-		m["max_outputs"] = value
+// The result will have exactly those bits set, that are not set in `x`. The
+// computation is performed on the underlying representation of x.
+func Invert(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Invert",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Outputs a `Summary` protocol buffer with audio.
-//
-// The summary has up to `max_outputs` summary values containing audio. The
-// audio is built from `tensor` which must be 3-D with shape `[batch_size,
-// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
-// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
-//
-// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-// build the `tag` of the summary values:
+// Inverse 3D fast Fourier transform.
 //
-// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
-// *  If `max_outputs` is greater than 1, the summary value tags are
-//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
+// Computes the inverse 3-dimensional discrete Fourier transform over the
+// inner-most 3 dimensions of `input`.
 //
 // Arguments:
-//	tag: Scalar. Used to build the `tag` attribute of the summary values.
-//	tensor: 2-D of shape `[batch_size, frames]`.
-//	sample_rate: The sample rate of the signal in hertz.
+//	input: A complex64 tensor.
 //
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func AudioSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate tf.Output, optional ...AudioSummaryV2Attr) (summary tf.Output) {
+// Returns A complex64 tensor of the same shape as `input`. The inner-most 3
+//   dimensions of `input` are replaced with their inverse 3D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.ifftn with 3 dimensions.
+// @end_compatibility
+func IFFT3D(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "AudioSummaryV2",
+		Type: "IFFT3D",
 		Input: []tf.Input{
-			tag, tensor, sample_rate,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// AvgPoolAttr is an optional argument to AvgPool.
-type AvgPoolAttr func(optionalAttr)
-
-// AvgPoolDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func AvgPoolDataFormat(value string) AvgPoolAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Performs average pooling on the input.
-//
-// Each entry in `output` is the mean of the corresponding size `ksize`
-// window in `value`.
-//
-// Arguments:
-//	value: 4-D with shape `[batch, height, width, channels]`.
-//	ksize: The size of the sliding window for each dimension of `value`.
-//	strides: The stride of the sliding window for each dimension of `value`.
-//	padding: The type of padding algorithm to use.
+// Deprecated. Disallowed in GraphDef version >= 2.
 //
-// Returns The average pooled output tensor.
-func AvgPool(scope *Scope, value tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolAttr) (output tf.Output) {
+// DEPRECATED at GraphDef version 2: Use AdjustContrastv2 instead
+func AdjustContrast(scope *Scope, images tf.Output, contrast_factor tf.Output, min_value tf.Output, max_value tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "AvgPool",
+		Type: "AdjustContrast",
 		Input: []tf.Input{
-			value,
+			images, contrast_factor, min_value, max_value,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Merges summaries.
-//
-// This op creates a
-// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
-// protocol buffer that contains the union of all the values in the input
-// summaries.
-//
-// When the Op is run, it reports an `InvalidArgument` error if multiple values
-// in the summaries to merge use the same tag.
+// Table initializer that takes two tensors for keys and values respectively.
 //
 // Arguments:
-//	inputs: Can be of any shape.  Each must contain serialized `Summary` protocol
-// buffers.
+//	table_handle: Handle to a table which will be initialized.
+//	keys: Keys of type Tkey.
+//	values: Values of type Tval.
 //
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func MergeSummary(scope *Scope, inputs []tf.Output) (summary tf.Output) {
+// Returns the created operation.
+func InitializeTableV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "MergeSummary",
+		Type: "InitializeTableV2",
 		Input: []tf.Input{
-			tf.OutputList(inputs),
+			table_handle, keys, values,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// The shape of the elements of the given list, as a tensor.
+// PrintAttr is an optional argument to Print.
+type PrintAttr func(optionalAttr)
+
+// PrintMessage sets the optional message attribute to value.
 //
-//   input_handle: the list
-//   element_shape: the shape of elements of the list
-func TensorListElementShape(scope *Scope, input_handle tf.Output, shape_type tf.DataType) (element_shape tf.Output) {
+// value: A string, prefix of the error message.
+// If not specified, defaults to ""
+func PrintMessage(value string) PrintAttr {
+	return func(m optionalAttr) {
+		m["message"] = value
+	}
+}
+
+// PrintFirstN sets the optional first_n attribute to value.
+//
+// value: Only log `first_n` number of times. -1 disables logging.
+// If not specified, defaults to -1
+func PrintFirstN(value int64) PrintAttr {
+	return func(m optionalAttr) {
+		m["first_n"] = value
+	}
+}
+
+// PrintSummarize sets the optional summarize attribute to value.
+//
+// value: Only print this many entries of each tensor.
+// If not specified, defaults to 3
+func PrintSummarize(value int64) PrintAttr {
+	return func(m optionalAttr) {
+		m["summarize"] = value
+	}
+}
+
+// Prints a list of tensors.
+//
+// Passes `input` through to `output` and prints `data` when evaluating.
+//
+// Arguments:
+//	input: The tensor passed to `output`
+//	data: A list of tensors to print out when op is evaluated.
+//
+// Returns = The unmodified `input` tensor
+func Print(scope *Scope, input tf.Output, data []tf.Output, optional ...PrintAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"shape_type": shape_type}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TensorListElementShape",
+		Type: "Print",
 		Input: []tf.Input{
-			input_handle,
+			input, tf.OutputList(data),
 		},
 		Attrs: attrs,
 	}
@@ -26805,106 +26471,93 @@ func TensorListElementShape(scope *Scope, input_handle tf.Output, shape_type tf.
 	return op.Output(0)
 }
 
-// Returns the item in the list with the given index.
-//
-// input_handle: the list
-// index: the position in the list from which an element will be retrieved
-// item: the element at that position
-//
+// Outputs a `Summary` protocol buffer with a tensor and per-plugin data.
 //
-func TensorListGetItem(scope *Scope, input_handle tf.Output, index tf.Output, element_dtype tf.DataType) (item tf.Output) {
+// Arguments:
+//	tag: A string attached to this summary. Used for organization in TensorBoard.
+//	tensor: A tensor to serialize.
+//	serialized_summary_metadata: A serialized SummaryMetadata proto. Contains plugin
+// data.
+func TensorSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, serialized_summary_metadata tf.Output) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"element_dtype": element_dtype}
 	opspec := tf.OpSpec{
-		Type: "TensorListGetItem",
+		Type: "TensorSummaryV2",
 		Input: []tf.Input{
-			input_handle, index,
+			tag, tensor, serialized_summary_metadata,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns a diagonal tensor with a given diagonal values.
-//
-// Given a `diagonal`, this operation returns a tensor with the `diagonal` and
-// everything else padded with zeros. The diagonal is computed as follows:
-//
-// Assume `diagonal` has dimensions [D1,..., Dk], then the output is a tensor of
-// rank 2k with dimensions [D1,..., Dk, D1,..., Dk] where:
+// Creates a dataset that asynchronously prefetches elements from `input_dataset`.
 //
-// `output[i1,..., ik, i1,..., ik] = diagonal[i1, ..., ik]` and 0 everywhere else.
+// Arguments:
 //
-// For example:
+//	buffer_size: The maximum number of elements to buffer in an iterator over
+// this dataset.
 //
-// ```
-// # 'diagonal' is [1, 2, 3, 4]
-// tf.diag(diagonal) ==> [[1, 0, 0, 0]
-//                        [0, 2, 0, 0]
-//                        [0, 0, 3, 0]
-//                        [0, 0, 0, 4]]
-// ```
 //
-// Arguments:
-//	diagonal: Rank k tensor where k is at most 1.
-func Diag(scope *Scope, diagonal tf.Output) (output tf.Output) {
+func PrefetchDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "Diag",
+		Type: "PrefetchDataset",
 		Input: []tf.Input{
-			diagonal,
+			input_dataset, buffer_size,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ParameterizedTruncatedNormalAttr is an optional argument to ParameterizedTruncatedNormal.
-type ParameterizedTruncatedNormalAttr func(optionalAttr)
+// TensorSummaryAttr is an optional argument to TensorSummary.
+type TensorSummaryAttr func(optionalAttr)
 
-// ParameterizedTruncatedNormalSeed sets the optional seed attribute to value.
+// TensorSummaryDescription sets the optional description attribute to value.
 //
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func ParameterizedTruncatedNormalSeed(value int64) ParameterizedTruncatedNormalAttr {
+// value: A json-encoded SummaryDescription proto.
+// If not specified, defaults to ""
+func TensorSummaryDescription(value string) TensorSummaryAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["description"] = value
 	}
 }
 
-// ParameterizedTruncatedNormalSeed2 sets the optional seed2 attribute to value.
+// TensorSummaryLabels sets the optional labels attribute to value.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func ParameterizedTruncatedNormalSeed2(value int64) ParameterizedTruncatedNormalAttr {
+// value: An unused list of strings.
+// If not specified, defaults to <>
+func TensorSummaryLabels(value []string) TensorSummaryAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["labels"] = value
 	}
 }
 
-// Outputs random values from a normal distribution. The parameters may each be a
+// TensorSummaryDisplayName sets the optional display_name attribute to value.
 //
-// scalar which applies to the entire output, or a vector of length shape[0] which
-// stores the parameters for each batch.
+// value: An unused string.
+// If not specified, defaults to ""
+func TensorSummaryDisplayName(value string) TensorSummaryAttr {
+	return func(m optionalAttr) {
+		m["display_name"] = value
+	}
+}
+
+// Outputs a `Summary` protocol buffer with a tensor.
 //
-// Arguments:
-//	shape: The shape of the output tensor. Batches are indexed by the 0th dimension.
-//	means: The mean parameter of each batch.
-//	stdevs: The standard deviation parameter of each batch. Must be greater than 0.
-//	minvals: The minimum cutoff. May be -infinity.
-//	maxvals: The maximum cutoff. May be +infinity, and must be more than the minval
-// for each batch.
+// This op is being phased out in favor of TensorSummaryV2, which lets callers pass
+// a tag as well as a serialized SummaryMetadata proto string that contains
+// plugin-specific data. We will keep this op to maintain backwards compatibility.
 //
-// Returns A matrix of shape num_batches x samples_per_batch, filled with random
-// truncated normal values using the parameters for each row.
-func ParameterizedTruncatedNormal(scope *Scope, shape tf.Output, means tf.Output, stdevs tf.Output, minvals tf.Output, maxvals tf.Output, optional ...ParameterizedTruncatedNormalAttr) (output tf.Output) {
+// Arguments:
+//	tensor: A tensor to serialize.
+func TensorSummary(scope *Scope, tensor tf.Output, optional ...TensorSummaryAttr) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -26913,9 +26566,9 @@ func ParameterizedTruncatedNormal(scope *Scope, shape tf.Output, means tf.Output
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ParameterizedTruncatedNormal",
+		Type: "TensorSummary",
 		Input: []tf.Input{
-			shape, means, stdevs, minvals, maxvals,
+			tensor,
 		},
 		Attrs: attrs,
 	}
@@ -26923,296 +26576,342 @@ func ParameterizedTruncatedNormal(scope *Scope, shape tf.Output, means tf.Output
 	return op.Output(0)
 }
 
-// Sets the index-th position of the list to contain the given tensor.
+// Read an element from the TensorArray into output `value`.
 //
-// input_handle: the list
-// index: the position in the list to which the tensor will be assigned
-// item: the element to be assigned to that position
-// output_handle: the new list, with the element in the proper position
+// Arguments:
+//	handle: The handle to a TensorArray.
 //
-func TensorListSetItem(scope *Scope, input_handle tf.Output, index tf.Output, item tf.Output) (output_handle tf.Output) {
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//	dtype: The type of the elem that is returned.
+//
+// Returns The tensor that is read from the TensorArray.
+func TensorArrayReadV3(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "TensorListSetItem",
+		Type: "TensorArrayReadV3",
 		Input: []tf.Input{
-			input_handle, index, item,
+			handle, index, flow_in,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a Tensor by indexing into the TensorList.
+// Reduces sparse updates into the variable referenced by `resource` using the `max` operation.
 //
-// Each row in the produced Tensor corresponds to the element in the TensorList
-// specified by the given index (see `tf.gather`).
+// This operation computes
 //
-// input_handle: The input tensor list.
-// indices: The indices used to index into the list.
-// values: The tensor.
-func TensorListGather(scope *Scope, input_handle tf.Output, indices tf.Output, element_dtype tf.DataType) (values tf.Output) {
+//     # Scalar indices
+//     ref[indices, ...] = max(ref[indices, ...], updates[...])
+//
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] = max(ref[indices[i], ...], updates[i, ...])
+//
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] = max(ref[indices[i, ..., j], ...], updates[i, ..., j, ...])
+//
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions are combined.
+//
+// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// </div>
+//
+// Arguments:
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
+//
+// Returns the created operation.
+func ResourceScatterMax(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"element_dtype": element_dtype}
 	opspec := tf.OpSpec{
-		Type: "TensorListGather",
+		Type: "ResourceScatterMax",
 		Input: []tf.Input{
-			input_handle, indices,
+			resource, indices, updates,
 		},
-		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Creates a TensorList by indexing into a Tensor.
-//
-// Each member of the TensorList corresponds to one row of the input tensor,
-// specified by the given index (see `tf.gather`).
+// Computes the gradient for the tanh of `x` wrt its input.
 //
-// tensor: The input tensor.
-// indices: The indices used to index into the list.
-// element_shape: The shape of the elements in the list (can be less specified than
-//   the shape of the tensor).
-// output_handle: The TensorList.
-func TensorListScatter(scope *Scope, tensor tf.Output, indices tf.Output, element_shape tf.Output) (output_handle tf.Output) {
+// Specifically, `grad = dy * (1 - y*y)`, where `y = tanh(x)`, and `dy`
+// is the corresponding input gradient.
+func TanhGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorListScatter",
+		Type: "TanhGrad",
 		Input: []tf.Input{
-			tensor, indices, element_shape,
+			y, dy,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns a `RaggedTensor` containing the specified sequences of numbers.
-//
-//
-// Returns a `RaggedTensor` `result` composed from `rt_dense_values` and
-// `rt_nested_splits`, such that
-// `result[i] = range(starts[i], limits[i], deltas[i])`.
-//
-// ```python
-// >>> (rt_nested_splits, rt_dense_values) = gen_ragged_ops.ragged_range(
-// ...     starts=[2, 5, 8], limits=[3, 5, 12], deltas=1)
-// >>> result = ragged.from_nested_row_splits(rt_dense_values, rt_nested_splits)
-// >>> print result.eval().tolist()
-// [[2],               # result[0] = range(2, 3)
-//  [],                # result[1] = range(5, 5)
-//  [8, 9, 10, 11]]    # result[2] = range(8, 12)
-// ```
+// Outputs a `Summary` protocol buffer with scalar values.
 //
-// The input tensors `starts`, `limits`, and `deltas` may be scalars or vectors.
-// The vector inputs must all have the same size.  Scalar inputs are broadcast
-// to match the size of the vector inputs.
+// The input `tags` and `values` must have the same shape.  The generated summary
+// has a summary value for each tag-value pair in `tags` and `values`.
 //
 // Arguments:
-//	starts: The starts of each range.
-//	limits: The limits of each range.
-//	deltas: The deltas of each range.
+//	tags: Tags for the summary.
+//	values: Same shape as `tags.  Values for the summary.
 //
-// Returns The `row_splits` for the returned `RaggedTensor`.The `inner_values` for the returned `RaggedTensor`.
-func RaggedRange(scope *Scope, starts tf.Output, limits tf.Output, deltas tf.Output) (rt_nested_splits tf.Output, rt_dense_values tf.Output) {
+// Returns Scalar.  Serialized `Summary` protocol buffer.
+func ScalarSummary(scope *Scope, tags tf.Output, values tf.Output) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RaggedRange",
+		Type: "ScalarSummary",
 		Input: []tf.Input{
-			starts, limits, deltas,
+			tags, values,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Deprecated, use python implementation tf.linalg.matrix_exponential.
+// Outputs a `Summary` protocol buffer with a histogram.
 //
-// DEPRECATED at GraphDef version 27: Use Python implementation tf.linalg.matrix_exponential instead.
-func MatrixExponential(scope *Scope, input tf.Output) (output tf.Output) {
+// The generated
+// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
+// has one summary value containing a histogram for `values`.
+//
+// This op reports an `InvalidArgument` error if any value is not finite.
+//
+// Arguments:
+//	tag: Scalar.  Tag to use for the `Summary.Value`.
+//	values: Any shape. Values to use to build the histogram.
+//
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func HistogramSummary(scope *Scope, tag tf.Output, values tf.Output) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixExponential",
+		Type: "HistogramSummary",
 		Input: []tf.Input{
-			input,
+			tag, values,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QueueDequeueUpToV2Attr is an optional argument to QueueDequeueUpToV2.
-type QueueDequeueUpToV2Attr func(optionalAttr)
-
-// QueueDequeueUpToV2TimeoutMs sets the optional timeout_ms attribute to value.
-//
-// value: If the queue has fewer than n elements, this operation
-// will block for up to timeout_ms milliseconds.
-// Note: This option is not supported yet.
-// If not specified, defaults to -1
-func QueueDequeueUpToV2TimeoutMs(value int64) QueueDequeueUpToV2Attr {
-	return func(m optionalAttr) {
-		m["timeout_ms"] = value
-	}
-}
-
-// Dequeues `n` tuples of one or more tensors from the given queue.
-//
-// This operation is not supported by all queues.  If a queue does not support
-// DequeueUpTo, then an Unimplemented error is returned.
-//
-// If the queue is closed and there are more than 0 but less than `n`
-// elements remaining, then instead of returning an OutOfRange error like
-// QueueDequeueMany, less than `n` elements are returned immediately.  If
-// the queue is closed and there are 0 elements left in the queue, then
-// an OutOfRange error is returned just like in QueueDequeueMany.
-// Otherwise the behavior is identical to QueueDequeueMany:
-//
-// This operation concatenates queue-element component tensors along the
-// 0th dimension to make a single component tensor.  All of the components
-// in the dequeued tuple will have size n in the 0th dimension.
-//
-// This operation has `k` outputs, where `k` is the number of components in
-// the tuples stored in the given queue, and output `i` is the ith
-// component of the dequeued tuple.
+// Computes the number of elements in the given queue.
 //
 // Arguments:
 //	handle: The handle to a queue.
-//	n: The number of tuples to dequeue.
-//	component_types: The type of each component in a tuple.
 //
-// Returns One or more tensors that were dequeued as a tuple.
-func QueueDequeueUpToV2(scope *Scope, handle tf.Output, n tf.Output, component_types []tf.DataType, optional ...QueueDequeueUpToV2Attr) (components []tf.Output) {
+// Returns The number of elements in the given queue.
+func QueueSizeV2(scope *Scope, handle tf.Output) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"component_types": component_types}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "QueueDequeueUpToV2",
+		Type: "QueueSizeV2",
 		Input: []tf.Input{
-			handle, n,
+			handle,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
+	return op.Output(0)
+}
+
+// ImageSummaryAttr is an optional argument to ImageSummary.
+type ImageSummaryAttr func(optionalAttr)
+
+// ImageSummaryMaxImages sets the optional max_images attribute to value.
+//
+// value: Max number of batch elements to generate images for.
+// If not specified, defaults to 3
+//
+// REQUIRES: value >= 1
+func ImageSummaryMaxImages(value int64) ImageSummaryAttr {
+	return func(m optionalAttr) {
+		m["max_images"] = value
 	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("QueueDequeueUpToV2", err)
-		return
+}
+
+// ImageSummaryBadColor sets the optional bad_color attribute to value.
+//
+// value: Color to use for pixels with non-finite values.
+// If not specified, defaults to <dtype:DT_UINT8 tensor_shape:<dim:<size:4 > > int_val:255 int_val:0 int_val:0 int_val:255 >
+func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr {
+	return func(m optionalAttr) {
+		m["bad_color"] = value
 	}
-	return components
 }
 
-// Computes the Cholesky decomposition of one or more square matrices.
+// Outputs a `Summary` protocol buffer with images.
 //
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices.
+// The summary has up to `max_images` summary values containing images. The
+// images are built from `tensor` which must be 4-D with shape `[batch_size,
+// height, width, channels]` and where `channels` can be:
 //
-// The input has to be symmetric and positive definite. Only the lower-triangular
-// part of the input will be used for this operation. The upper-triangular part
-// will not be read.
+// *  1: `tensor` is interpreted as Grayscale.
+// *  3: `tensor` is interpreted as RGB.
+// *  4: `tensor` is interpreted as RGBA.
 //
-// The output is a tensor of the same shape as the input
-// containing the Cholesky decompositions for all input submatrices `[..., :, :]`.
+// The images have the same number of channels as the input tensor. For float
+// input, the values are normalized one image at a time to fit in the range
+// `[0, 255]`.  `uint8` values are unchanged.  The op uses two different
+// normalization algorithms:
 //
-// **Note**: The gradient computation on GPU is faster for large matrices but
-// not for large batch dimensions when the submatrices are small. In this
-// case it might be faster to use the CPU.
+// *  If the input values are all positive, they are rescaled so the largest one
+//    is 255.
+//
+// *  If any input value is negative, the values are shifted so input value 0.0
+//    is at 127.  They are then rescaled so that either the smallest value is 0,
+//    or the largest one is 255.
+//
+// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
+// build the `tag` of the summary values:
+//
+// *  If `max_images` is 1, the summary value tag is '*tag*/image'.
+// *  If `max_images` is greater than 1, the summary value tags are
+//    generated sequentially as '*tag*/image/0', '*tag*/image/1', etc.
+//
+// The `bad_color` argument is the color to use in the generated images for
+// non-finite input values.  It is a `uint8` 1-D tensor of length `channels`.
+// Each element must be in the range `[0, 255]` (It represents the value of a
+// pixel in the output image).  Non-finite values in the input tensor are
+// replaced by this tensor in the output image.  The default value is the color
+// red.
 //
 // Arguments:
-//	input: Shape is `[..., M, M]`.
+//	tag: Scalar. Used to build the `tag` attribute of the summary values.
+//	tensor: 4-D of shape `[batch_size, height, width, channels]` where
+// `channels` is 1, 3, or 4.
 //
-// Returns Shape is `[..., M, M]`.
-func Cholesky(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func ImageSummary(scope *Scope, tag tf.Output, tensor tf.Output, optional ...ImageSummaryAttr) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Cholesky",
+		Type: "ImageSummary",
 		Input: []tf.Input{
-			input,
+			tag, tensor,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Writes contents to the file at input filename. Creates file and recursively
+// AudioSummaryV2Attr is an optional argument to AudioSummaryV2.
+type AudioSummaryV2Attr func(optionalAttr)
+
+// AudioSummaryV2MaxOutputs sets the optional max_outputs attribute to value.
 //
-// creates directory if not existing.
+// value: Max number of batch elements to generate audio for.
+// If not specified, defaults to 3
+//
+// REQUIRES: value >= 1
+func AudioSummaryV2MaxOutputs(value int64) AudioSummaryV2Attr {
+	return func(m optionalAttr) {
+		m["max_outputs"] = value
+	}
+}
+
+// Outputs a `Summary` protocol buffer with audio.
+//
+// The summary has up to `max_outputs` summary values containing audio. The
+// audio is built from `tensor` which must be 3-D with shape `[batch_size,
+// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
+// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
+//
+// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
+// build the `tag` of the summary values:
+//
+// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
+// *  If `max_outputs` is greater than 1, the summary value tags are
+//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
 //
 // Arguments:
-//	filename: scalar. The name of the file to which we write the contents.
-//	contents: scalar. The content to be written to the output file.
+//	tag: Scalar. Used to build the `tag` attribute of the summary values.
+//	tensor: 2-D of shape `[batch_size, frames]`.
+//	sample_rate: The sample rate of the signal in hertz.
 //
-// Returns the created operation.
-func WriteFile(scope *Scope, filename tf.Output, contents tf.Output) (o *tf.Operation) {
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func AudioSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate tf.Output, optional ...AudioSummaryV2Attr) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "WriteFile",
+		Type: "AudioSummaryV2",
 		Input: []tf.Input{
-			filename, contents,
+			tag, tensor, sample_rate,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// AllAttr is an optional argument to All.
-type AllAttr func(optionalAttr)
+// AvgPoolAttr is an optional argument to AvgPool.
+type AvgPoolAttr func(optionalAttr)
 
-// AllKeepDims sets the optional keep_dims attribute to value.
+// AvgPoolDataFormat sets the optional data_format attribute to value.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func AllKeepDims(value bool) AllAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func AvgPoolDataFormat(value string) AvgPoolAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["data_format"] = value
 	}
 }
 
-// Computes the "logical and" of elements across dimensions of a tensor.
+// Performs average pooling on the input.
 //
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// Each entry in `output` is the mean of the corresponding size `ksize`
+// window in `value`.
 //
 // Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
+//	value: 4-D with shape `[batch, height, width, channels]`.
+//	ksize: The size of the sliding window for each dimension of `value`.
+//	strides: The stride of the sliding window for each dimension of `value`.
+//	padding: The type of padding algorithm to use.
 //
-// Returns The reduced tensor.
-func All(scope *Scope, input tf.Output, axis tf.Output, optional ...AllAttr) (output tf.Output) {
+// Returns The average pooled output tensor.
+func AvgPool(scope *Scope, value tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "All",
+		Type: "AvgPool",
 		Input: []tf.Input{
-			input, axis,
+			value,
 		},
 		Attrs: attrs,
 	}
@@ -27220,165 +26919,155 @@ func All(scope *Scope, input tf.Output, axis tf.Output, optional ...AllAttr) (ou
 	return op.Output(0)
 }
 
-// Computes the Eigen Decomposition of a batch of square self-adjoint matrices.
-//
-// DEPRECATED at GraphDef version 11: Use SelfAdjointEigV2 instead.
+// Merges summaries.
 //
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices, with the same constraints as the single matrix
-// SelfAdjointEig.
+// This op creates a
+// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
+// protocol buffer that contains the union of all the values in the input
+// summaries.
 //
-// The result is a [..., M+1, M] matrix with [..., 0,:] containing the
-// eigenvalues, and subsequent [...,1:, :] containing the eigenvectors. The eigenvalues
-// are sorted in non-decreasing order.
+// When the Op is run, it reports an `InvalidArgument` error if multiple values
+// in the summaries to merge use the same tag.
 //
 // Arguments:
-//	input: Shape is `[..., M, M]`.
+//	inputs: Can be of any shape.  Each must contain serialized `Summary` protocol
+// buffers.
 //
-// Returns Shape is `[..., M+1, M]`.
-func SelfAdjointEig(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func MergeSummary(scope *Scope, inputs []tf.Output) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SelfAdjointEig",
+		Type: "MergeSummary",
 		Input: []tf.Input{
-			input,
+			tf.OutputList(inputs),
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes softplus gradients for a softplus operation.
-//
-// Arguments:
-//	gradients: The backpropagated gradients to the corresponding softplus operation.
-//	features: The features passed as input to the corresponding softplus operation.
+// The shape of the elements of the given list, as a tensor.
 //
-// Returns The gradients: `gradients / (1 + exp(-features))`.
-func SoftplusGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
+//   input_handle: the list
+//   element_shape: the shape of elements of the list
+func TensorListElementShape(scope *Scope, input_handle tf.Output, shape_type tf.DataType) (element_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"shape_type": shape_type}
 	opspec := tf.OpSpec{
-		Type: "SoftplusGrad",
+		Type: "TensorListElementShape",
 		Input: []tf.Input{
-			gradients, features,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// SelfAdjointEigV2Attr is an optional argument to SelfAdjointEigV2.
-type SelfAdjointEigV2Attr func(optionalAttr)
-
-// SelfAdjointEigV2ComputeV sets the optional compute_v attribute to value.
-//
-// value: If `True` then eigenvectors will be computed and returned in `v`.
-// Otherwise, only the eigenvalues will be computed.
-// If not specified, defaults to true
-func SelfAdjointEigV2ComputeV(value bool) SelfAdjointEigV2Attr {
-	return func(m optionalAttr) {
-		m["compute_v"] = value
+			input_handle,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes the eigen decomposition of one or more square self-adjoint matrices.
-//
-// Computes the eigenvalues and (optionally) eigenvectors of each inner matrix in
-// `input` such that `input[..., :, :] = v[..., :, :] * diag(e[..., :])`. The eigenvalues
-// are sorted in non-decreasing order.
+// Returns the item in the list with the given index.
 //
-// ```python
-// # a is a tensor.
-// # e is a tensor of eigenvalues.
-// # v is a tensor of eigenvectors.
-// e, v = self_adjoint_eig(a)
-// e = self_adjoint_eig(a, compute_v=False)
-// ```
+// input_handle: the list
+// index: the position in the list from which an element will be retrieved
+// item: the element at that position
 //
-// Arguments:
-//	input: `Tensor` input of shape `[N, N]`.
 //
-// Returns Eigenvalues. Shape is `[N]`.Eigenvectors. Shape is `[N, N]`.
-func SelfAdjointEigV2(scope *Scope, input tf.Output, optional ...SelfAdjointEigV2Attr) (e tf.Output, v tf.Output) {
+func TensorListGetItem(scope *Scope, input_handle tf.Output, index tf.Output, element_dtype tf.DataType) (item tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
 	opspec := tf.OpSpec{
-		Type: "SelfAdjointEigV2",
+		Type: "TensorListGetItem",
 		Input: []tf.Input{
-			input,
+			input_handle, index,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Adjust the saturation of one or more images.
+// Returns a diagonal tensor with a given diagonal values.
 //
-// `images` is a tensor of at least 3 dimensions.  The last dimension is
-// interpretted as channels, and must be three.
+// Given a `diagonal`, this operation returns a tensor with the `diagonal` and
+// everything else padded with zeros. The diagonal is computed as follows:
 //
-// The input image is considered in the RGB colorspace. Conceptually, the RGB
-// colors are first mapped into HSV. A scale is then applied all the saturation
-// values, and then remapped back to RGB colorspace.
+// Assume `diagonal` has dimensions [D1,..., Dk], then the output is a tensor of
+// rank 2k with dimensions [D1,..., Dk, D1,..., Dk] where:
 //
-// Arguments:
-//	images: Images to adjust.  At least 3-D.
-//	scale: A float scale to add to the saturation.
+// `output[i1,..., ik, i1,..., ik] = diagonal[i1, ..., ik]` and 0 everywhere else.
 //
-// Returns The hue-adjusted image or images.
-func AdjustSaturation(scope *Scope, images tf.Output, scale tf.Output) (output tf.Output) {
+// For example:
+//
+// ```
+// # 'diagonal' is [1, 2, 3, 4]
+// tf.diag(diagonal) ==> [[1, 0, 0, 0]
+//                        [0, 2, 0, 0]
+//                        [0, 0, 3, 0]
+//                        [0, 0, 0, 4]]
+// ```
+//
+// Arguments:
+//	diagonal: Rank k tensor where k is at most 1.
+func Diag(scope *Scope, diagonal tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "AdjustSaturation",
+		Type: "Diag",
 		Input: []tf.Input{
-			images, scale,
+			diagonal,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MatrixSolveAttr is an optional argument to MatrixSolve.
-type MatrixSolveAttr func(optionalAttr)
+// ParameterizedTruncatedNormalAttr is an optional argument to ParameterizedTruncatedNormal.
+type ParameterizedTruncatedNormalAttr func(optionalAttr)
 
-// MatrixSolveAdjoint sets the optional adjoint attribute to value.
+// ParameterizedTruncatedNormalSeed sets the optional seed attribute to value.
 //
-// value: Boolean indicating whether to solve with `matrix` or its (block-wise)
-// adjoint.
-// If not specified, defaults to false
-func MatrixSolveAdjoint(value bool) MatrixSolveAttr {
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func ParameterizedTruncatedNormalSeed(value int64) ParameterizedTruncatedNormalAttr {
 	return func(m optionalAttr) {
-		m["adjoint"] = value
+		m["seed"] = value
 	}
 }
 
-// Solves systems of linear equations.
+// ParameterizedTruncatedNormalSeed2 sets the optional seed2 attribute to value.
 //
-// `Matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. `Rhs` is a tensor of shape `[..., M, K]`. The `output` is
-// a tensor shape `[..., M, K]`.  If `adjoint` is `False` then each output matrix
-// satisfies `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
-// If `adjoint` is `True` then each output matrix satisfies
-// `adjoint(matrix[..., :, :]) * output[..., :, :] = rhs[..., :, :]`.
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func ParameterizedTruncatedNormalSeed2(value int64) ParameterizedTruncatedNormalAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Outputs random values from a normal distribution. The parameters may each be a
+//
+// scalar which applies to the entire output, or a vector of length shape[0] which
+// stores the parameters for each batch.
 //
 // Arguments:
-//	matrix: Shape is `[..., M, M]`.
-//	rhs: Shape is `[..., M, K]`.
+//	shape: The shape of the output tensor. Batches are indexed by the 0th dimension.
+//	means: The mean parameter of each batch.
+//	stdevs: The standard deviation parameter of each batch. Must be greater than 0.
+//	minvals: The minimum cutoff. May be -infinity.
+//	maxvals: The maximum cutoff. May be +infinity, and must be more than the minval
+// for each batch.
 //
-// Returns Shape is `[..., M, K]`.
-func MatrixSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixSolveAttr) (output tf.Output) {
+// Returns A matrix of shape num_batches x samples_per_batch, filled with random
+// truncated normal values using the parameters for each row.
+func ParameterizedTruncatedNormal(scope *Scope, shape tf.Output, means tf.Output, stdevs tf.Output, minvals tf.Output, maxvals tf.Output, optional ...ParameterizedTruncatedNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -27387,9 +27076,9 @@ func MatrixSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...Matr
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixSolve",
+		Type: "ParameterizedTruncatedNormal",
 		Input: []tf.Input{
-			matrix, rhs,
+			shape, means, stdevs, minvals, maxvals,
 		},
 		Attrs: attrs,
 	}
@@ -27397,365 +27086,285 @@ func MatrixSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...Matr
 	return op.Output(0)
 }
 
-// Returns a serialized GraphDef representing `input_dataset`.
-//
-// Returns a graph representation for `input_dataset`.
+// Sets the index-th position of the list to contain the given tensor.
 //
-// Arguments:
-//	input_dataset: A variant tensor representing the dataset to return the graph representation for.
+// input_handle: the list
+// index: the position in the list to which the tensor will be assigned
+// item: the element to be assigned to that position
+// output_handle: the new list, with the element in the proper position
 //
-// Returns The graph representation of the dataset (as serialized GraphDef).
-func DatasetToGraph(scope *Scope, input_dataset tf.Output) (graph tf.Output) {
+func TensorListSetItem(scope *Scope, input_handle tf.Output, index tf.Output, item tf.Output) (output_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "DatasetToGraph",
+		Type: "TensorListSetItem",
 		Input: []tf.Input{
-			input_dataset,
+			input_handle, index, item,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the matrix square root of one or more square matrices:
-//
-// matmul(sqrtm(A), sqrtm(A)) = A
-//
-// The input matrix should be invertible. If the input matrix is real, it should
-// have no eigenvalues which are real and negative (pairs of complex conjugate
-// eigenvalues are allowed).
-//
-// The matrix square root is computed by first reducing the matrix to
-// quasi-triangular form with the real Schur decomposition. The square root
-// of the quasi-triangular matrix is then computed directly. Details of
-// the algorithm can be found in: Nicholas J. Higham, "Computing real
-// square roots of a real matrix", Linear Algebra Appl., 1987.
-//
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. The output is a tensor of the same shape as the input
-// containing the matrix square root for all input submatrices `[..., :, :]`.
-//
-// Arguments:
-//	input: Shape is `[..., M, M]`.
+// Creates a Tensor by indexing into the TensorList.
 //
-// Returns Shape is `[..., M, M]`.
+// Each row in the produced Tensor corresponds to the element in the TensorList
+// specified by the given index (see `tf.gather`).
 //
-// @compatibility(scipy)
-// Equivalent to scipy.linalg.sqrtm
-// @end_compatibility
-func MatrixSquareRoot(scope *Scope, input tf.Output) (output tf.Output) {
+// input_handle: The input tensor list.
+// indices: The indices used to index into the list.
+// values: The tensor.
+func TensorListGather(scope *Scope, input_handle tf.Output, indices tf.Output, element_dtype tf.DataType) (values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
 	opspec := tf.OpSpec{
-		Type: "MatrixSquareRoot",
+		Type: "TensorListGather",
 		Input: []tf.Input{
-			input,
+			input_handle, indices,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Convert JSON-encoded Example records to binary protocol buffer strings.
-//
-// This op translates a tensor containing Example records, encoded using
-// the [standard JSON
-// mapping](https://developers.google.com/protocol-buffers/docs/proto3#json),
-// into a tensor containing the same records encoded as binary protocol
-// buffers. The resulting tensor can then be fed to any of the other
-// Example-parsing ops.
+// Creates a TensorList by indexing into a Tensor.
 //
-// Arguments:
-//	json_examples: Each string is a JSON object serialized according to the JSON
-// mapping of the Example proto.
+// Each member of the TensorList corresponds to one row of the input tensor,
+// specified by the given index (see `tf.gather`).
 //
-// Returns Each string is a binary Example protocol buffer corresponding
-// to the respective element of `json_examples`.
-func DecodeJSONExample(scope *Scope, json_examples tf.Output) (binary_examples tf.Output) {
+// tensor: The input tensor.
+// indices: The indices used to index into the list.
+// element_shape: The shape of the elements in the list (can be less specified than
+//   the shape of the tensor).
+// output_handle: The TensorList.
+func TensorListScatter(scope *Scope, tensor tf.Output, indices tf.Output, element_shape tf.Output) (output_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeJSONExample",
+		Type: "TensorListScatter",
 		Input: []tf.Input{
-			json_examples,
+			tensor, indices, element_shape,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SvdAttr is an optional argument to Svd.
-type SvdAttr func(optionalAttr)
-
-// SvdComputeUv sets the optional compute_uv attribute to value.
-//
-// value: If true, left and right singular vectors will be
-// computed and returned in `u` and `v`, respectively.
-// If false, `u` and `v` are not set and should never referenced.
-// If not specified, defaults to true
-func SvdComputeUv(value bool) SvdAttr {
-	return func(m optionalAttr) {
-		m["compute_uv"] = value
-	}
-}
-
-// SvdFullMatrices sets the optional full_matrices attribute to value.
+// Returns a `RaggedTensor` containing the specified sequences of numbers.
 //
-// value: If true, compute full-sized `u` and `v`. If false
-// (the default), compute only the leading `P` singular vectors.
-// Ignored if `compute_uv` is `False`.
-// If not specified, defaults to false
-func SvdFullMatrices(value bool) SvdAttr {
-	return func(m optionalAttr) {
-		m["full_matrices"] = value
-	}
-}
-
-// Computes the singular value decompositions of one or more matrices.
 //
-// Computes the SVD of each inner matrix in `input` such that
-// `input[..., :, :] = u[..., :, :] * diag(s[..., :, :]) * transpose(v[..., :, :])`
+// Returns a `RaggedTensor` `result` composed from `rt_dense_values` and
+// `rt_nested_splits`, such that
+// `result[i] = range(starts[i], limits[i], deltas[i])`.
 //
 // ```python
-// # a is a tensor containing a batch of matrices.
-// # s is a tensor of singular values for each matrix.
-// # u is the tensor containing of left singular vectors for each matrix.
-// # v is the tensor containing of right singular vectors for each matrix.
-// s, u, v = svd(a)
-// s, _, _ = svd(a, compute_uv=False)
+// >>> (rt_nested_splits, rt_dense_values) = gen_ragged_ops.ragged_range(
+// ...     starts=[2, 5, 8], limits=[3, 5, 12], deltas=1)
+// >>> result = ragged.from_nested_row_splits(rt_dense_values, rt_nested_splits)
+// >>> print result.eval().tolist()
+// [[2],               # result[0] = range(2, 3)
+//  [],                # result[1] = range(5, 5)
+//  [8, 9, 10, 11]]    # result[2] = range(8, 12)
 // ```
 //
+// The input tensors `starts`, `limits`, and `deltas` may be scalars or vectors.
+// The vector inputs must all have the same size.  Scalar inputs are broadcast
+// to match the size of the vector inputs.
+//
 // Arguments:
-//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
-// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
+//	starts: The starts of each range.
+//	limits: The limits of each range.
+//	deltas: The deltas of each range.
 //
-// Returns Singular values. Shape is `[..., P]`.Left singular vectors. If `full_matrices` is `False` then shape is
-// `[..., M, P]`; if `full_matrices` is `True` then shape is
-// `[..., M, M]`. Undefined if `compute_uv` is `False`.Left singular vectors. If `full_matrices` is `False` then shape is
-// `[..., N, P]`. If `full_matrices` is `True` then shape is `[..., N, N]`.
-// Undefined if `compute_uv` is false.
-func Svd(scope *Scope, input tf.Output, optional ...SvdAttr) (s tf.Output, u tf.Output, v tf.Output) {
+// Returns The `row_splits` for the returned `RaggedTensor`.The `inner_values` for the returned `RaggedTensor`.
+func RaggedRange(scope *Scope, starts tf.Output, limits tf.Output, deltas tf.Output) (rt_nested_splits tf.Output, rt_dense_values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Svd",
+		Type: "RaggedRange",
 		Input: []tf.Input{
-			input,
+			starts, limits, deltas,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// PrintV2Attr is an optional argument to PrintV2.
-type PrintV2Attr func(optionalAttr)
-
-// PrintV2OutputStream sets the optional output_stream attribute to value.
-//
-// value: A string specifying the output stream or logging level to print to.
-// If not specified, defaults to "stderr"
-func PrintV2OutputStream(value string) PrintV2Attr {
-	return func(m optionalAttr) {
-		m["output_stream"] = value
-	}
+	return op.Output(0), op.Output(1)
 }
 
-// Prints a string scalar.
-//
-// Prints a string scalar to the desired output_stream.
-//
-// Arguments:
-//	input: The string scalar to print.
+// Deprecated, use python implementation tf.linalg.matrix_exponential.
 //
-// Returns the created operation.
-func PrintV2(scope *Scope, input tf.Output, optional ...PrintV2Attr) (o *tf.Operation) {
+// DEPRECATED at GraphDef version 27: Use Python implementation tf.linalg.matrix_exponential instead.
+func MatrixExponential(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "PrintV2",
+		Type: "MatrixExponential",
 		Input: []tf.Input{
 			input,
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// QueueEnqueueManyV2Attr is an optional argument to QueueEnqueueManyV2.
-type QueueEnqueueManyV2Attr func(optionalAttr)
+// QueueDequeueUpToV2Attr is an optional argument to QueueDequeueUpToV2.
+type QueueDequeueUpToV2Attr func(optionalAttr)
 
-// QueueEnqueueManyV2TimeoutMs sets the optional timeout_ms attribute to value.
+// QueueDequeueUpToV2TimeoutMs sets the optional timeout_ms attribute to value.
 //
-// value: If the queue is too full, this operation will block for up
-// to timeout_ms milliseconds.
+// value: If the queue has fewer than n elements, this operation
+// will block for up to timeout_ms milliseconds.
 // Note: This option is not supported yet.
 // If not specified, defaults to -1
-func QueueEnqueueManyV2TimeoutMs(value int64) QueueEnqueueManyV2Attr {
+func QueueDequeueUpToV2TimeoutMs(value int64) QueueDequeueUpToV2Attr {
 	return func(m optionalAttr) {
 		m["timeout_ms"] = value
 	}
 }
 
-// Enqueues zero or more tuples of one or more tensors in the given queue.
+// Dequeues `n` tuples of one or more tensors from the given queue.
 //
-// This operation slices each component tensor along the 0th dimension to
-// make multiple queue elements. All of the tuple components must have the
-// same size in the 0th dimension.
+// This operation is not supported by all queues.  If a queue does not support
+// DequeueUpTo, then an Unimplemented error is returned.
 //
-// The components input has k elements, which correspond to the components of
-// tuples stored in the given queue.
+// If the queue is closed and there are more than 0 but less than `n`
+// elements remaining, then instead of returning an OutOfRange error like
+// QueueDequeueMany, less than `n` elements are returned immediately.  If
+// the queue is closed and there are 0 elements left in the queue, then
+// an OutOfRange error is returned just like in QueueDequeueMany.
+// Otherwise the behavior is identical to QueueDequeueMany:
 //
-// N.B. If the queue is full, this operation will block until the given
-// elements have been enqueued (or 'timeout_ms' elapses, if specified).
+// This operation concatenates queue-element component tensors along the
+// 0th dimension to make a single component tensor.  All of the components
+// in the dequeued tuple will have size n in the 0th dimension.
+//
+// This operation has `k` outputs, where `k` is the number of components in
+// the tuples stored in the given queue, and output `i` is the ith
+// component of the dequeued tuple.
 //
 // Arguments:
 //	handle: The handle to a queue.
-//	components: One or more tensors from which the enqueued tensors should
-// be taken.
+//	n: The number of tuples to dequeue.
+//	component_types: The type of each component in a tuple.
 //
-// Returns the created operation.
-func QueueEnqueueManyV2(scope *Scope, handle tf.Output, components []tf.Output, optional ...QueueEnqueueManyV2Attr) (o *tf.Operation) {
+// Returns One or more tensors that were dequeued as a tuple.
+func QueueDequeueUpToV2(scope *Scope, handle tf.Output, n tf.Output, component_types []tf.DataType, optional ...QueueDequeueUpToV2Attr) (components []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"component_types": component_types}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QueueEnqueueManyV2",
+		Type: "QueueDequeueUpToV2",
 		Input: []tf.Input{
-			handle, tf.OutputList(components),
+			handle, n,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("QueueDequeueUpToV2", err)
+		return
+	}
+	return components
 }
 
-// Computes the product along segments of a tensor.
+// Computes the Cholesky decomposition of one or more square matrices.
 //
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices.
 //
-// Computes a tensor such that
-// \\(output_i = \prod_j data_j\\) where the product is over `j` such
-// that `segment_ids[j] == i`.
+// The input has to be symmetric and positive definite. Only the lower-triangular
+// part of the input will be used for this operation. The upper-triangular part
+// will not be read.
 //
-// If the product is empty for a given segment ID `i`, `output[i] = 1`.
+// The output is a tensor of the same shape as the input
+// containing the Cholesky decompositions for all input submatrices `[..., :, :]`.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentProd.png" alt>
-// </div>
+// **Note**: The gradient computation on GPU is faster for large matrices but
+// not for large batch dimensions when the submatrices are small. In this
+// case it might be faster to use the CPU.
 //
 // Arguments:
+//	input: Shape is `[..., M, M]`.
 //
-//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentProd(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+// Returns Shape is `[..., M, M]`.
+func Cholesky(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SegmentProd",
+		Type: "Cholesky",
 		Input: []tf.Input{
-			data, segment_ids,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Converts one or more images from RGB to HSV.
-//
-// Outputs a tensor of the same shape as the `images` tensor, containing the HSV
-// value of the pixels. The output is only well defined if the value in `images`
-// are in `[0,1]`.
+// Writes contents to the file at input filename. Creates file and recursively
 //
-// `output[..., 0]` contains hue, `output[..., 1]` contains saturation, and
-// `output[..., 2]` contains value. All HSV values are in `[0,1]`. A hue of 0
-// corresponds to pure red, hue 1/3 is pure green, and 2/3 is pure blue.
+// creates directory if not existing.
 //
 // Arguments:
-//	images: 1-D or higher rank. RGB data to convert. Last dimension must be size 3.
+//	filename: scalar. The name of the file to which we write the contents.
+//	contents: scalar. The content to be written to the output file.
 //
-// Returns `images` converted to HSV.
-func RGBToHSV(scope *Scope, images tf.Output) (output tf.Output) {
+// Returns the created operation.
+func WriteFile(scope *Scope, filename tf.Output, contents tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RGBToHSV",
+		Type: "WriteFile",
 		Input: []tf.Input{
-			images,
+			filename, contents,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Does nothing. Only useful as a placeholder for control edges.
-//
-// Returns the created operation.
-func NoOp(scope *Scope) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "NoOp",
-	}
 	return scope.AddOperation(opspec)
 }
 
-// MergeV2CheckpointsAttr is an optional argument to MergeV2Checkpoints.
-type MergeV2CheckpointsAttr func(optionalAttr)
+// AllAttr is an optional argument to All.
+type AllAttr func(optionalAttr)
 
-// MergeV2CheckpointsDeleteOldDirs sets the optional delete_old_dirs attribute to value.
+// AllKeepDims sets the optional keep_dims attribute to value.
 //
-// value: see above.
-// If not specified, defaults to true
-func MergeV2CheckpointsDeleteOldDirs(value bool) MergeV2CheckpointsAttr {
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func AllKeepDims(value bool) AllAttr {
 	return func(m optionalAttr) {
-		m["delete_old_dirs"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// V2 format specific: merges the metadata files of sharded checkpoints.  The
-//
-// result is one logical checkpoint, with one physical metadata file and renamed
-// data files.
-//
-// Intended for "grouping" multiple checkpoints in a sharded checkpoint setup.
+// Computes the "logical and" of elements across dimensions of a tensor.
 //
-// If delete_old_dirs is true, attempts to delete recursively the dirname of each
-// path in the input checkpoint_prefixes.  This is useful when those paths are non
-// user-facing temporary locations.
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
-//	checkpoint_prefixes: prefixes of V2 checkpoints to merge.
-//	destination_prefix: scalar.  The desired final prefix.  Allowed to be the same
-// as one of the checkpoint_prefixes.
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
 //
-// Returns the created operation.
-func MergeV2Checkpoints(scope *Scope, checkpoint_prefixes tf.Output, destination_prefix tf.Output, optional ...MergeV2CheckpointsAttr) (o *tf.Operation) {
+// Returns The reduced tensor.
+func All(scope *Scope, input tf.Output, axis tf.Output, optional ...AllAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -27764,425 +27373,306 @@ func MergeV2Checkpoints(scope *Scope, checkpoint_prefixes tf.Output, destination
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MergeV2Checkpoints",
+		Type: "All",
 		Input: []tf.Input{
-			checkpoint_prefixes, destination_prefix,
+			input, axis,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Saves input tensors slices to disk.
-//
-// This is like `Save` except that tensors can be listed in the saved file as being
-// a slice of a larger tensor.  `shapes_and_slices` specifies the shape of the
-// larger tensor and the slice that this tensor covers. `shapes_and_slices` must
-// have as many elements as `tensor_names`.
+// Computes the Eigen Decomposition of a batch of square self-adjoint matrices.
 //
-// Elements of the `shapes_and_slices` input must either be:
+// DEPRECATED at GraphDef version 11: Use SelfAdjointEigV2 instead.
 //
-// *  The empty string, in which case the corresponding tensor is
-//    saved normally.
-// *  A string of the form `dim0 dim1 ... dimN-1 slice-spec` where the
-//    `dimI` are the dimensions of the larger tensor and `slice-spec`
-//    specifies what part is covered by the tensor to save.
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices, with the same constraints as the single matrix
+// SelfAdjointEig.
 //
-// `slice-spec` itself is a `:`-separated list: `slice0:slice1:...:sliceN-1`
-// where each `sliceI` is either:
+// The result is a [..., M+1, M] matrix with [..., 0,:] containing the
+// eigenvalues, and subsequent [...,1:, :] containing the eigenvectors. The eigenvalues
+// are sorted in non-decreasing order.
 //
-// *  The string `-` meaning that the slice covers all indices of this dimension
-// *  `start,length` where `start` and `length` are integers.  In that
-//    case the slice covers `length` indices starting at `start`.
+// Arguments:
+//	input: Shape is `[..., M, M]`.
 //
-// See also `Save`.
+// Returns Shape is `[..., M+1, M]`.
+func SelfAdjointEig(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SelfAdjointEig",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes softplus gradients for a softplus operation.
 //
 // Arguments:
-//	filename: Must have a single element. The name of the file to which we write the
-// tensor.
-//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
-//	shapes_and_slices: Shape `[N]`.  The shapes and slice specifications to use when
-// saving the tensors.
-//	data: `N` tensors to save.
+//	gradients: The backpropagated gradients to the corresponding softplus operation.
+//	features: The features passed as input to the corresponding softplus operation.
 //
-// Returns the created operation.
-func SaveSlices(scope *Scope, filename tf.Output, tensor_names tf.Output, shapes_and_slices tf.Output, data []tf.Output) (o *tf.Operation) {
+// Returns The gradients: `gradients / (1 + exp(-features))`.
+func SoftplusGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SaveSlices",
+		Type: "SoftplusGrad",
 		Input: []tf.Input{
-			filename, tensor_names, shapes_and_slices, tf.OutputList(data),
+			gradients, features,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// DenseToDenseSetOperationAttr is an optional argument to DenseToDenseSetOperation.
-type DenseToDenseSetOperationAttr func(optionalAttr)
+// SelfAdjointEigV2Attr is an optional argument to SelfAdjointEigV2.
+type SelfAdjointEigV2Attr func(optionalAttr)
 
-// DenseToDenseSetOperationValidateIndices sets the optional validate_indices attribute to value.
+// SelfAdjointEigV2ComputeV sets the optional compute_v attribute to value.
+//
+// value: If `True` then eigenvectors will be computed and returned in `v`.
+// Otherwise, only the eigenvalues will be computed.
 // If not specified, defaults to true
-func DenseToDenseSetOperationValidateIndices(value bool) DenseToDenseSetOperationAttr {
+func SelfAdjointEigV2ComputeV(value bool) SelfAdjointEigV2Attr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["compute_v"] = value
 	}
 }
 
-// Applies set operation along last dimension of 2 `Tensor` inputs.
+// Computes the eigen decomposition of one or more square self-adjoint matrices.
 //
-// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
+// Computes the eigenvalues and (optionally) eigenvectors of each inner matrix in
+// `input` such that `input[..., :, :] = v[..., :, :] * diag(e[..., :])`. The eigenvalues
+// are sorted in non-decreasing order.
 //
-// Output `result` is a `SparseTensor` represented by `result_indices`,
-// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
-// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
-// dimension contains the result of `set_operation` applied to the corresponding
-// `[0...n-1]` dimension of `set`.
+// ```python
+// # a is a tensor.
+// # e is a tensor of eigenvalues.
+// # v is a tensor of eigenvectors.
+// e, v = self_adjoint_eig(a)
+// e = self_adjoint_eig(a, compute_v=False)
+// ```
 //
 // Arguments:
-//	set1: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.
-// Dimension `n` contains values in a set, duplicates are allowed but ignored.
-//	set2: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set1`.
-// Dimension `n` contains values in a set, duplicates are allowed but ignored.
-//
+//	input: `Tensor` input of shape `[N, N]`.
 //
-// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
-// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
-// is the max result set size across all `0...n-1` dimensions.
-func DenseToDenseSetOperation(scope *Scope, set1 tf.Output, set2 tf.Output, set_operation string, optional ...DenseToDenseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
+// Returns Eigenvalues. Shape is `[N]`.Eigenvectors. Shape is `[N, N]`.
+func SelfAdjointEigV2(scope *Scope, input tf.Output, optional ...SelfAdjointEigV2Attr) (e tf.Output, v tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"set_operation": set_operation}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DenseToDenseSetOperation",
+		Type: "SelfAdjointEigV2",
 		Input: []tf.Input{
-			set1, set2,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0), op.Output(1)
 }
 
-// Generate a sharded filename. The filename is printf formatted as
+// Adjust the saturation of one or more images.
 //
-//    %s-%05d-of-%05d, basename, shard, num_shards.
-func ShardedFilename(scope *Scope, basename tf.Output, shard tf.Output, num_shards tf.Output) (filename tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ShardedFilename",
-		Input: []tf.Input{
-			basename, shard, num_shards,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// BatchToSpace for N-D tensors of type T.
+// `images` is a tensor of at least 3 dimensions.  The last dimension is
+// interpretted as channels, and must be three.
 //
-// This operation reshapes the "batch" dimension 0 into `M + 1` dimensions of shape
-// `block_shape + [batch]`, interleaves these blocks back into the grid defined by
-// the spatial dimensions `[1, ..., M]`, to obtain a result with the same rank as
-// the input.  The spatial dimensions of this intermediate result are then
-// optionally cropped according to `crops` to produce the output.  This is the
-// reverse of SpaceToBatch.  See below for a precise description.
+// The input image is considered in the RGB colorspace. Conceptually, the RGB
+// colors are first mapped into HSV. A scale is then applied all the saturation
+// values, and then remapped back to RGB colorspace.
 //
 // Arguments:
-//	input: N-D with shape `input_shape = [batch] + spatial_shape + remaining_shape`,
-// where spatial_shape has M dimensions.
-//	block_shape: 1-D with shape `[M]`, all values must be >= 1.
-//	crops: 2-D with shape `[M, 2]`, all values must be >= 0.
-//   `crops[i] = [crop_start, crop_end]` specifies the amount to crop from input
-//   dimension `i + 1`, which corresponds to spatial dimension `i`.  It is
-//   required that
-//   `crop_start[i] + crop_end[i] <= block_shape[i] * input_shape[i + 1]`.
-//
-// This operation is equivalent to the following steps:
-//
-// 1. Reshape `input` to `reshaped` of shape:
-//      [block_shape[0], ..., block_shape[M-1],
-//       batch / prod(block_shape),
-//       input_shape[1], ..., input_shape[N-1]]
-//
-// 2. Permute dimensions of `reshaped` to produce `permuted` of shape
-//      [batch / prod(block_shape),
-//
-//       input_shape[1], block_shape[0],
-//       ...,
-//       input_shape[M], block_shape[M-1],
-//
-//       input_shape[M+1], ..., input_shape[N-1]]
-//
-// 3. Reshape `permuted` to produce `reshaped_permuted` of shape
-//      [batch / prod(block_shape),
-//
-//       input_shape[1] * block_shape[0],
-//       ...,
-//       input_shape[M] * block_shape[M-1],
-//
-//       input_shape[M+1],
-//       ...,
-//       input_shape[N-1]]
-//
-// 4. Crop the start and end of dimensions `[1, ..., M]` of
-//    `reshaped_permuted` according to `crops` to produce the output of shape:
-//      [batch / prod(block_shape),
-//
-//       input_shape[1] * block_shape[0] - crops[0,0] - crops[0,1],
-//       ...,
-//       input_shape[M] * block_shape[M-1] - crops[M-1,0] - crops[M-1,1],
-//
-//       input_shape[M+1], ..., input_shape[N-1]]
-//
-// Some examples:
-//
-// (1) For the following input of shape `[4, 1, 1, 1]`, `block_shape = [2, 2]`, and
-//     `crops = [[0, 0], [0, 0]]`:
-//
-// ```
-// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
-// ```
-//
-// The output tensor has shape `[1, 2, 2, 1]` and value:
-//
-// ```
-// x = [[[[1], [2]], [[3], [4]]]]
-// ```
-//
-// (2) For the following input of shape `[4, 1, 1, 3]`, `block_shape = [2, 2]`, and
-//     `crops = [[0, 0], [0, 0]]`:
-//
-// ```
-// [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
-// ```
-//
-// The output tensor has shape `[1, 2, 2, 3]` and value:
-//
-// ```
-// x = [[[[1, 2, 3], [4, 5, 6]],
-//       [[7, 8, 9], [10, 11, 12]]]]
-// ```
-//
-// (3) For the following input of shape `[4, 2, 2, 1]`, `block_shape = [2, 2]`, and
-//     `crops = [[0, 0], [0, 0]]`:
-//
-// ```
-// x = [[[[1], [3]], [[9], [11]]],
-//      [[[2], [4]], [[10], [12]]],
-//      [[[5], [7]], [[13], [15]]],
-//      [[[6], [8]], [[14], [16]]]]
-// ```
-//
-// The output tensor has shape `[1, 4, 4, 1]` and value:
-//
-// ```
-// x = [[[1],   [2],  [3],  [4]],
-//      [[5],   [6],  [7],  [8]],
-//      [[9],  [10], [11],  [12]],
-//      [[13], [14], [15],  [16]]]
-// ```
-//
-// (4) For the following input of shape `[8, 1, 3, 1]`, `block_shape = [2, 2]`, and
-//     `crops = [[0, 0], [2, 0]]`:
-//
-// ```
-// x = [[[[0], [1], [3]]], [[[0], [9], [11]]],
-//      [[[0], [2], [4]]], [[[0], [10], [12]]],
-//      [[[0], [5], [7]]], [[[0], [13], [15]]],
-//      [[[0], [6], [8]]], [[[0], [14], [16]]]]
-// ```
-//
-// The output tensor has shape `[2, 2, 4, 1]` and value:
+//	images: Images to adjust.  At least 3-D.
+//	scale: A float scale to add to the saturation.
 //
-// ```
-// x = [[[[1],   [2],  [3],  [4]],
-//       [[5],   [6],  [7],  [8]]],
-//      [[[9],  [10], [11],  [12]],
-//       [[13], [14], [15],  [16]]]]
-// ```
-func BatchToSpaceND(scope *Scope, input tf.Output, block_shape tf.Output, crops tf.Output) (output tf.Output) {
+// Returns The hue-adjusted image or images.
+func AdjustSaturation(scope *Scope, images tf.Output, scale tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "BatchToSpaceND",
+		Type: "AdjustSaturation",
 		Input: []tf.Input{
-			input, block_shape, crops,
+			images, scale,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// UnpackAttr is an optional argument to Unpack.
-type UnpackAttr func(optionalAttr)
+// MatrixSolveAttr is an optional argument to MatrixSolve.
+type MatrixSolveAttr func(optionalAttr)
 
-// UnpackAxis sets the optional axis attribute to value.
+// MatrixSolveAdjoint sets the optional adjoint attribute to value.
 //
-// value: Dimension along which to unpack.  Negative values wrap around, so the
-// valid range is `[-R, R)`.
-// If not specified, defaults to 0
-func UnpackAxis(value int64) UnpackAttr {
+// value: Boolean indicating whether to solve with `matrix` or its (block-wise)
+// adjoint.
+// If not specified, defaults to false
+func MatrixSolveAdjoint(value bool) MatrixSolveAttr {
 	return func(m optionalAttr) {
-		m["axis"] = value
+		m["adjoint"] = value
 	}
 }
 
-// Unpacks a given dimension of a rank-`R` tensor into `num` rank-`(R-1)` tensors.
-//
-// Unpacks `num` tensors from `value` by chipping it along the `axis` dimension.
-// For example, given a tensor of shape `(A, B, C, D)`;
-//
-// If `axis == 0` then the i'th tensor in `output` is the slice `value[i, :, :, :]`
-//   and each tensor in `output` will have shape `(B, C, D)`. (Note that the
-//   dimension unpacked along is gone, unlike `split`).
-//
-// If `axis == 1` then the i'th tensor in `output` is the slice `value[:, i, :, :]`
-//   and each tensor in `output` will have shape `(A, C, D)`.
-// Etc.
+// Solves systems of linear equations.
 //
-// This is the opposite of `pack`.
+// `Matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. `Rhs` is a tensor of shape `[..., M, K]`. The `output` is
+// a tensor shape `[..., M, K]`.  If `adjoint` is `False` then each output matrix
+// satisfies `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
+// If `adjoint` is `True` then each output matrix satisfies
+// `adjoint(matrix[..., :, :]) * output[..., :, :] = rhs[..., :, :]`.
 //
 // Arguments:
-//	value: 1-D or higher, with `axis` dimension size equal to `num`.
-//
+//	matrix: Shape is `[..., M, M]`.
+//	rhs: Shape is `[..., M, K]`.
 //
-// Returns The list of tensors unpacked from `value`.
-func Unpack(scope *Scope, value tf.Output, num int64, optional ...UnpackAttr) (output []tf.Output) {
+// Returns Shape is `[..., M, K]`.
+func MatrixSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixSolveAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num": num}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Unpack",
+		Type: "MatrixSolve",
 		Input: []tf.Input{
-			value,
+			matrix, rhs,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("Unpack", err)
-		return
-	}
-	return output
+	return op.Output(0)
 }
 
-// Increments variable pointed to by 'resource' until it reaches 'limit'.
+// Returns a serialized GraphDef representing `input_dataset`.
 //
-// Arguments:
-//	resource: Should be from a scalar `Variable` node.
-//	limit: If incrementing ref would bring it above limit, instead generates an
-// 'OutOfRange' error.
+// Returns a graph representation for `input_dataset`.
 //
+// Arguments:
+//	input_dataset: A variant tensor representing the dataset to return the graph representation for.
 //
-// Returns A copy of the input before increment. If nothing else modifies the
-// input, the values produced will all be distinct.
-func ResourceCountUpTo(scope *Scope, resource tf.Output, limit int64, T tf.DataType) (output tf.Output) {
+// Returns The graph representation of the dataset (as serialized GraphDef).
+func DatasetToGraph(scope *Scope, input_dataset tf.Output) (graph tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"limit": limit, "T": T}
 	opspec := tf.OpSpec{
-		Type: "ResourceCountUpTo",
+		Type: "DatasetToGraph",
 		Input: []tf.Input{
-			resource,
+			input_dataset,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Delete the stack from its resource container.
+// Computes the matrix square root of one or more square matrices:
+//
+// matmul(sqrtm(A), sqrtm(A)) = A
+//
+// The input matrix should be invertible. If the input matrix is real, it should
+// have no eigenvalues which are real and negative (pairs of complex conjugate
+// eigenvalues are allowed).
+//
+// The matrix square root is computed by first reducing the matrix to
+// quasi-triangular form with the real Schur decomposition. The square root
+// of the quasi-triangular matrix is then computed directly. Details of
+// the algorithm can be found in: Nicholas J. Higham, "Computing real
+// square roots of a real matrix", Linear Algebra Appl., 1987.
+//
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. The output is a tensor of the same shape as the input
+// containing the matrix square root for all input submatrices `[..., :, :]`.
 //
 // Arguments:
-//	handle: The handle to a stack.
+//	input: Shape is `[..., M, M]`.
 //
-// Returns the created operation.
-func StackCloseV2(scope *Scope, handle tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "StackCloseV2",
-		Input: []tf.Input{
-			handle,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Generate a glob pattern matching all sharded file names.
-func ShardedFilespec(scope *Scope, basename tf.Output, num_shards tf.Output) (filename tf.Output) {
+// Returns Shape is `[..., M, M]`.
+//
+// @compatibility(scipy)
+// Equivalent to scipy.linalg.sqrtm
+// @end_compatibility
+func MatrixSquareRoot(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ShardedFilespec",
+		Type: "MatrixSquareRoot",
 		Input: []tf.Input{
-			basename, num_shards,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TextLineReaderV2Attr is an optional argument to TextLineReaderV2.
-type TextLineReaderV2Attr func(optionalAttr)
+// SvdAttr is an optional argument to Svd.
+type SvdAttr func(optionalAttr)
 
-// TextLineReaderV2SkipHeaderLines sets the optional skip_header_lines attribute to value.
+// SvdComputeUv sets the optional compute_uv attribute to value.
 //
-// value: Number of lines to skip from the beginning of every file.
-// If not specified, defaults to 0
-func TextLineReaderV2SkipHeaderLines(value int64) TextLineReaderV2Attr {
+// value: If true, left and right singular vectors will be
+// computed and returned in `u` and `v`, respectively.
+// If false, `u` and `v` are not set and should never referenced.
+// If not specified, defaults to true
+func SvdComputeUv(value bool) SvdAttr {
 	return func(m optionalAttr) {
-		m["skip_header_lines"] = value
+		m["compute_uv"] = value
 	}
 }
 
-// TextLineReaderV2Container sets the optional container attribute to value.
+// SvdFullMatrices sets the optional full_matrices attribute to value.
 //
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func TextLineReaderV2Container(value string) TextLineReaderV2Attr {
+// value: If true, compute full-sized `u` and `v`. If false
+// (the default), compute only the leading `P` singular vectors.
+// Ignored if `compute_uv` is `False`.
+// If not specified, defaults to false
+func SvdFullMatrices(value bool) SvdAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["full_matrices"] = value
 	}
 }
 
-// TextLineReaderV2SharedName sets the optional shared_name attribute to value.
+// Computes the singular value decompositions of one or more matrices.
 //
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func TextLineReaderV2SharedName(value string) TextLineReaderV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// A Reader that outputs the lines of a file delimited by '\n'.
+// Computes the SVD of each inner matrix in `input` such that
+// `input[..., :, :] = u[..., :, :] * diag(s[..., :, :]) * transpose(v[..., :, :])`
 //
-// Returns The handle to reference the Reader.
-func TextLineReaderV2(scope *Scope, optional ...TextLineReaderV2Attr) (reader_handle tf.Output) {
+// ```python
+// # a is a tensor containing a batch of matrices.
+// # s is a tensor of singular values for each matrix.
+// # u is the tensor containing of left singular vectors for each matrix.
+// # v is the tensor containing of right singular vectors for each matrix.
+// s, u, v = svd(a)
+// s, _, _ = svd(a, compute_uv=False)
+// ```
+//
+// Arguments:
+//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
+// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
+//
+// Returns Singular values. Shape is `[..., P]`.Left singular vectors. If `full_matrices` is `False` then shape is
+// `[..., M, P]`; if `full_matrices` is `True` then shape is
+// `[..., M, M]`. Undefined if `compute_uv` is `False`.Left singular vectors. If `full_matrices` is `False` then shape is
+// `[..., N, P]`. If `full_matrices` is `True` then shape is `[..., N, N]`.
+// Undefined if `compute_uv` is false.
+func Svd(scope *Scope, input tf.Output, optional ...SvdAttr) (s tf.Output, u tf.Output, v tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -28191,141 +27681,89 @@ func TextLineReaderV2(scope *Scope, optional ...TextLineReaderV2Attr) (reader_ha
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TextLineReaderV2",
-
+		Type: "Svd",
+		Input: []tf.Input{
+			input,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// LoadAndRemapMatrixAttr is an optional argument to LoadAndRemapMatrix.
-type LoadAndRemapMatrixAttr func(optionalAttr)
+// PrintV2Attr is an optional argument to PrintV2.
+type PrintV2Attr func(optionalAttr)
 
-// LoadAndRemapMatrixMaxRowsInMemory sets the optional max_rows_in_memory attribute to value.
+// PrintV2OutputStream sets the optional output_stream attribute to value.
 //
-// value: The maximum number of rows to load from the checkpoint at
-// once. If less than or equal to 0, the entire matrix will be loaded into
-// memory. Setting this arg trades increased disk reads for lower memory usage.
-// If not specified, defaults to -1
-func LoadAndRemapMatrixMaxRowsInMemory(value int64) LoadAndRemapMatrixAttr {
+// value: A string specifying the output stream or logging level to print to.
+// If not specified, defaults to "stderr"
+func PrintV2OutputStream(value string) PrintV2Attr {
 	return func(m optionalAttr) {
-		m["max_rows_in_memory"] = value
+		m["output_stream"] = value
 	}
 }
 
-// Loads a 2-D (matrix) `Tensor` with name `old_tensor_name` from the checkpoint
-//
-// at `ckpt_path` and potentially reorders its rows and columns using the
-// specified remappings.
-//
-// Most users should use one of the wrapper initializers (such as
-// `tf.contrib.framework.load_and_remap_matrix_initializer`) instead of this
-// function directly.
-//
-// The remappings are 1-D tensors with the following properties:
-//
-// * `row_remapping` must have exactly `num_rows` entries. Row `i` of the output
-//   matrix will be initialized from the row corresponding to index
-//   `row_remapping[i]` in the old `Tensor` from the checkpoint.
-// * `col_remapping` must have either 0 entries (indicating that no column
-//   reordering is needed) or `num_cols` entries. If specified, column `j` of the
-//   output matrix will be initialized from the column corresponding to index
-//   `col_remapping[j]` in the old `Tensor` from the checkpoint.
-// * A value of -1 in either of the remappings signifies a "missing" entry. In that
-//   case, values from the `initializing_values` tensor will be used to fill that
-//   missing row or column. If `row_remapping` has `r` missing entries and
-//   `col_remapping` has `c` missing entries, then the following condition must be
-//   true:
-//
-// `(r * num_cols) + (c * num_rows) - (r * c) == len(initializing_values)`
-//
-// The remapping tensors can be generated using the GenerateVocabRemapping op.
-//
-// As an example, with row_remapping = [1, 0, -1], col_remapping = [0, 2, -1],
-// initializing_values = [0.5, -0.5, 0.25, -0.25, 42], and w(i, j) representing
-// the value from row i, column j of the old tensor in the checkpoint, the output
-// matrix will look like the following:
-//
-// [[w(1, 0),  w(1, 2),  0.5],
-//  [w(0, 0),  w(0, 2), -0.5],
-//  [0.25,    -0.25,      42]]
-//
-// Arguments:
-//	ckpt_path: Path to the TensorFlow checkpoint (version 2, `TensorBundle`) from
-// which the old matrix `Tensor` will be loaded.
-//	old_tensor_name: Name of the 2-D `Tensor` to load from checkpoint.
-//	row_remapping: An int `Tensor` of row remappings (generally created by
-// `generate_vocab_remapping`).  Even if no row remapping is needed, this must
-// still be an index-valued Tensor (e.g. [0, 1, 2, ...]), or a shifted
-// index-valued `Tensor` (e.g. [8, 9, 10, ...], for partitioned `Variables`).
-//	col_remapping: An int `Tensor` of column remappings (generally created by
-// `generate_vocab_remapping`).  May be a size-0 `Tensor` if only row remapping
-// is to be done (e.g. column ordering is the same).
-//	initializing_values: A float `Tensor` containing  values to fill in for cells
-// in the output matrix that are not loaded from the checkpoint. Length must be
-// exactly the same as the number of missing / new cells.
-//	num_rows: Number of rows (length of the 1st dimension) in the output matrix.
-//	num_cols: Number of columns (length of the 2nd dimension) in the output matrix.
+// Prints a string scalar.
 //
-// Returns Output matrix containing existing values loaded from the
-// checkpoint, and with any missing values filled in from initializing_values.
-func LoadAndRemapMatrix(scope *Scope, ckpt_path tf.Output, old_tensor_name tf.Output, row_remapping tf.Output, col_remapping tf.Output, initializing_values tf.Output, num_rows int64, num_cols int64, optional ...LoadAndRemapMatrixAttr) (output_matrix tf.Output) {
+// Prints a string scalar to the desired output_stream.
+//
+// Arguments:
+//	input: The string scalar to print.
+//
+// Returns the created operation.
+func PrintV2(scope *Scope, input tf.Output, optional ...PrintV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_rows": num_rows, "num_cols": num_cols}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "LoadAndRemapMatrix",
+		Type: "PrintV2",
 		Input: []tf.Input{
-			ckpt_path, old_tensor_name, row_remapping, col_remapping, initializing_values,
+			input,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// TFRecordReaderV2Attr is an optional argument to TFRecordReaderV2.
-type TFRecordReaderV2Attr func(optionalAttr)
+// QueueEnqueueManyV2Attr is an optional argument to QueueEnqueueManyV2.
+type QueueEnqueueManyV2Attr func(optionalAttr)
 
-// TFRecordReaderV2Container sets the optional container attribute to value.
+// QueueEnqueueManyV2TimeoutMs sets the optional timeout_ms attribute to value.
 //
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func TFRecordReaderV2Container(value string) TFRecordReaderV2Attr {
+// value: If the queue is too full, this operation will block for up
+// to timeout_ms milliseconds.
+// Note: This option is not supported yet.
+// If not specified, defaults to -1
+func QueueEnqueueManyV2TimeoutMs(value int64) QueueEnqueueManyV2Attr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["timeout_ms"] = value
 	}
 }
 
-// TFRecordReaderV2SharedName sets the optional shared_name attribute to value.
+// Enqueues zero or more tuples of one or more tensors in the given queue.
 //
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func TFRecordReaderV2SharedName(value string) TFRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// TFRecordReaderV2CompressionType sets the optional compression_type attribute to value.
-// If not specified, defaults to ""
-func TFRecordReaderV2CompressionType(value string) TFRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["compression_type"] = value
-	}
-}
-
-// A Reader that outputs the records from a TensorFlow Records file.
+// This operation slices each component tensor along the 0th dimension to
+// make multiple queue elements. All of the tuple components must have the
+// same size in the 0th dimension.
 //
-// Returns The handle to reference the Reader.
-func TFRecordReaderV2(scope *Scope, optional ...TFRecordReaderV2Attr) (reader_handle tf.Output) {
+// The components input has k elements, which correspond to the components of
+// tuples stored in the given queue.
+//
+// N.B. If the queue is full, this operation will block until the given
+// elements have been enqueued (or 'timeout_ms' elapses, if specified).
+//
+// Arguments:
+//	handle: The handle to a queue.
+//	components: One or more tensors from which the enqueued tensors should
+// be taken.
+//
+// Returns the created operation.
+func QueueEnqueueManyV2(scope *Scope, handle tf.Output, components []tf.Output, optional ...QueueEnqueueManyV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -28334,88 +27772,124 @@ func TFRecordReaderV2(scope *Scope, optional ...TFRecordReaderV2Attr) (reader_ha
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TFRecordReaderV2",
-
+		Type: "QueueEnqueueManyV2",
+		Input: []tf.Input{
+			handle, tf.OutputList(components),
+		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// QuantizeAndDequantizeV3Attr is an optional argument to QuantizeAndDequantizeV3.
-type QuantizeAndDequantizeV3Attr func(optionalAttr)
-
-// QuantizeAndDequantizeV3SignedInput sets the optional signed_input attribute to value.
-// If not specified, defaults to true
-func QuantizeAndDequantizeV3SignedInput(value bool) QuantizeAndDequantizeV3Attr {
-	return func(m optionalAttr) {
-		m["signed_input"] = value
+// Computes the product along segments of a tensor.
+//
+// Read
+// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+// for an explanation of segments.
+//
+// Computes a tensor such that
+// \\(output_i = \prod_j data_j\\) where the product is over `j` such
+// that `segment_ids[j] == i`.
+//
+// If the product is empty for a given segment ID `i`, `output[i] = 1`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentProd.png" alt>
+// </div>
+//
+// Arguments:
+//
+//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentProd(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// QuantizeAndDequantizeV3RangeGiven sets the optional range_given attribute to value.
-// If not specified, defaults to true
-func QuantizeAndDequantizeV3RangeGiven(value bool) QuantizeAndDequantizeV3Attr {
-	return func(m optionalAttr) {
-		m["range_given"] = value
+	opspec := tf.OpSpec{
+		Type: "SegmentProd",
+		Input: []tf.Input{
+			data, segment_ids,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Quantizes then dequantizes a tensor.
+// Converts one or more images from RGB to HSV.
 //
-// This is almost identical to QuantizeAndDequantizeV2, except that num_bits is a
-// tensor, so its value can change during training.
-func QuantizeAndDequantizeV3(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, num_bits tf.Output, optional ...QuantizeAndDequantizeV3Attr) (output tf.Output) {
+// Outputs a tensor of the same shape as the `images` tensor, containing the HSV
+// value of the pixels. The output is only well defined if the value in `images`
+// are in `[0,1]`.
+//
+// `output[..., 0]` contains hue, `output[..., 1]` contains saturation, and
+// `output[..., 2]` contains value. All HSV values are in `[0,1]`. A hue of 0
+// corresponds to pure red, hue 1/3 is pure green, and 2/3 is pure blue.
+//
+// Arguments:
+//	images: 1-D or higher rank. RGB data to convert. Last dimension must be size 3.
+//
+// Returns `images` converted to HSV.
+func RGBToHSV(scope *Scope, images tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "QuantizeAndDequantizeV3",
+		Type: "RGBToHSV",
 		Input: []tf.Input{
-			input, input_min, input_max, num_bits,
+			images,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// IdentityReaderV2Attr is an optional argument to IdentityReaderV2.
-type IdentityReaderV2Attr func(optionalAttr)
-
-// IdentityReaderV2Container sets the optional container attribute to value.
+// Does nothing. Only useful as a placeholder for control edges.
 //
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func IdentityReaderV2Container(value string) IdentityReaderV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
+// Returns the created operation.
+func NoOp(scope *Scope) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "NoOp",
 	}
+	return scope.AddOperation(opspec)
 }
 
-// IdentityReaderV2SharedName sets the optional shared_name attribute to value.
+// MergeV2CheckpointsAttr is an optional argument to MergeV2Checkpoints.
+type MergeV2CheckpointsAttr func(optionalAttr)
+
+// MergeV2CheckpointsDeleteOldDirs sets the optional delete_old_dirs attribute to value.
 //
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func IdentityReaderV2SharedName(value string) IdentityReaderV2Attr {
+// value: see above.
+// If not specified, defaults to true
+func MergeV2CheckpointsDeleteOldDirs(value bool) MergeV2CheckpointsAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["delete_old_dirs"] = value
 	}
 }
 
-// A Reader that outputs the queued work as both the key and value.
+// V2 format specific: merges the metadata files of sharded checkpoints.  The
 //
-// To use, enqueue strings in a Queue.  ReaderRead will take the front
-// work string and output (work, work).
+// result is one logical checkpoint, with one physical metadata file and renamed
+// data files.
 //
-// Returns The handle to reference the Reader.
-func IdentityReaderV2(scope *Scope, optional ...IdentityReaderV2Attr) (reader_handle tf.Output) {
+// Intended for "grouping" multiple checkpoints in a sharded checkpoint setup.
+//
+// If delete_old_dirs is true, attempts to delete recursively the dirname of each
+// path in the input checkpoint_prefixes.  This is useful when those paths are non
+// user-facing temporary locations.
+//
+// Arguments:
+//	checkpoint_prefixes: prefixes of V2 checkpoints to merge.
+//	destination_prefix: scalar.  The desired final prefix.  Allowed to be the same
+// as one of the checkpoint_prefixes.
+//
+// Returns the created operation.
+func MergeV2Checkpoints(scope *Scope, checkpoint_prefixes tf.Output, destination_prefix tf.Output, optional ...MergeV2CheckpointsAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -28424,199 +27898,310 @@ func IdentityReaderV2(scope *Scope, optional ...IdentityReaderV2Attr) (reader_ha
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "IdentityReaderV2",
-
-		Attrs: attrs,
+		Type: "MergeV2Checkpoints",
+		Input: []tf.Input{
+			checkpoint_prefixes, destination_prefix,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Saves input tensors slices to disk.
+//
+// This is like `Save` except that tensors can be listed in the saved file as being
+// a slice of a larger tensor.  `shapes_and_slices` specifies the shape of the
+// larger tensor and the slice that this tensor covers. `shapes_and_slices` must
+// have as many elements as `tensor_names`.
+//
+// Elements of the `shapes_and_slices` input must either be:
+//
+// *  The empty string, in which case the corresponding tensor is
+//    saved normally.
+// *  A string of the form `dim0 dim1 ... dimN-1 slice-spec` where the
+//    `dimI` are the dimensions of the larger tensor and `slice-spec`
+//    specifies what part is covered by the tensor to save.
+//
+// `slice-spec` itself is a `:`-separated list: `slice0:slice1:...:sliceN-1`
+// where each `sliceI` is either:
+//
+// *  The string `-` meaning that the slice covers all indices of this dimension
+// *  `start,length` where `start` and `length` are integers.  In that
+//    case the slice covers `length` indices starting at `start`.
+//
+// See also `Save`.
+//
+// Arguments:
+//	filename: Must have a single element. The name of the file to which we write the
+// tensor.
+//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
+//	shapes_and_slices: Shape `[N]`.  The shapes and slice specifications to use when
+// saving the tensors.
+//	data: `N` tensors to save.
+//
+// Returns the created operation.
+func SaveSlices(scope *Scope, filename tf.Output, tensor_names tf.Output, shapes_and_slices tf.Output, data []tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SaveSlices",
+		Input: []tf.Input{
+			filename, tensor_names, shapes_and_slices, tf.OutputList(data),
+		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// ResourceApplyGradientDescentAttr is an optional argument to ResourceApplyGradientDescent.
-type ResourceApplyGradientDescentAttr func(optionalAttr)
+// DenseToDenseSetOperationAttr is an optional argument to DenseToDenseSetOperation.
+type DenseToDenseSetOperationAttr func(optionalAttr)
 
-// ResourceApplyGradientDescentUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, the subtraction will be protected by a lock;
-// otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceApplyGradientDescentUseLocking(value bool) ResourceApplyGradientDescentAttr {
+// DenseToDenseSetOperationValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func DenseToDenseSetOperationValidateIndices(value bool) DenseToDenseSetOperationAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["validate_indices"] = value
 	}
 }
 
-// Update '*var' by subtracting 'alpha' * 'delta' from it.
+// Applies set operation along last dimension of 2 `Tensor` inputs.
+//
+// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
+//
+// Output `result` is a `SparseTensor` represented by `result_indices`,
+// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
+// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
+// dimension contains the result of `set_operation` applied to the corresponding
+// `[0...n-1]` dimension of `set`.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	alpha: Scaling factor. Must be a scalar.
-//	delta: The change.
+//	set1: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.
+// Dimension `n` contains values in a set, duplicates are allowed but ignored.
+//	set2: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set1`.
+// Dimension `n` contains values in a set, duplicates are allowed but ignored.
 //
-// Returns the created operation.
-func ResourceApplyGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, delta tf.Output, optional ...ResourceApplyGradientDescentAttr) (o *tf.Operation) {
+//
+// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
+// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
+// is the max result set size across all `0...n-1` dimensions.
+func DenseToDenseSetOperation(scope *Scope, set1 tf.Output, set2 tf.Output, set_operation string, optional ...DenseToDenseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"set_operation": set_operation}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyGradientDescent",
+		Type: "DenseToDenseSetOperation",
 		Input: []tf.Input{
-			var_, alpha, delta,
+			set1, set2,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Returns the next record (key, value pair) produced by a Reader.
-//
-// Will dequeue from the input queue if necessary (e.g. when the
-// Reader needs to start reading from a new file since it has finished
-// with the previous file).
-//
-// Arguments:
-//	reader_handle: Handle to a Reader.
-//	queue_handle: Handle to a Queue, with string work items.
+// Generate a sharded filename. The filename is printf formatted as
 //
-// Returns A scalar.A scalar.
-func ReaderReadV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output) (key tf.Output, value tf.Output) {
+//    %s-%05d-of-%05d, basename, shard, num_shards.
+func ShardedFilename(scope *Scope, basename tf.Output, shard tf.Output, num_shards tf.Output) (filename tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReaderReadV2",
+		Type: "ShardedFilename",
 		Input: []tf.Input{
-			reader_handle, queue_handle,
+			basename, shard, num_shards,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Returns up to `num_records` (key, value) pairs produced by a Reader.
+// BatchToSpace for N-D tensors of type T.
 //
-// Will dequeue from the input queue if necessary (e.g. when the
-// Reader needs to start reading from a new file since it has finished
-// with the previous file).
-// It may return less than `num_records` even before the last batch.
+// This operation reshapes the "batch" dimension 0 into `M + 1` dimensions of shape
+// `block_shape + [batch]`, interleaves these blocks back into the grid defined by
+// the spatial dimensions `[1, ..., M]`, to obtain a result with the same rank as
+// the input.  The spatial dimensions of this intermediate result are then
+// optionally cropped according to `crops` to produce the output.  This is the
+// reverse of SpaceToBatch.  See below for a precise description.
 //
 // Arguments:
-//	reader_handle: Handle to a `Reader`.
-//	queue_handle: Handle to a `Queue`, with string work items.
-//	num_records: number of records to read from `Reader`.
+//	input: N-D with shape `input_shape = [batch] + spatial_shape + remaining_shape`,
+// where spatial_shape has M dimensions.
+//	block_shape: 1-D with shape `[M]`, all values must be >= 1.
+//	crops: 2-D with shape `[M, 2]`, all values must be >= 0.
+//   `crops[i] = [crop_start, crop_end]` specifies the amount to crop from input
+//   dimension `i + 1`, which corresponds to spatial dimension `i`.  It is
+//   required that
+//   `crop_start[i] + crop_end[i] <= block_shape[i] * input_shape[i + 1]`.
 //
-// Returns A 1-D tensor.A 1-D tensor.
-func ReaderReadUpToV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output, num_records tf.Output) (keys tf.Output, values tf.Output) {
+// This operation is equivalent to the following steps:
+//
+// 1. Reshape `input` to `reshaped` of shape:
+//      [block_shape[0], ..., block_shape[M-1],
+//       batch / prod(block_shape),
+//       input_shape[1], ..., input_shape[N-1]]
+//
+// 2. Permute dimensions of `reshaped` to produce `permuted` of shape
+//      [batch / prod(block_shape),
+//
+//       input_shape[1], block_shape[0],
+//       ...,
+//       input_shape[M], block_shape[M-1],
+//
+//       input_shape[M+1], ..., input_shape[N-1]]
+//
+// 3. Reshape `permuted` to produce `reshaped_permuted` of shape
+//      [batch / prod(block_shape),
+//
+//       input_shape[1] * block_shape[0],
+//       ...,
+//       input_shape[M] * block_shape[M-1],
+//
+//       input_shape[M+1],
+//       ...,
+//       input_shape[N-1]]
+//
+// 4. Crop the start and end of dimensions `[1, ..., M]` of
+//    `reshaped_permuted` according to `crops` to produce the output of shape:
+//      [batch / prod(block_shape),
+//
+//       input_shape[1] * block_shape[0] - crops[0,0] - crops[0,1],
+//       ...,
+//       input_shape[M] * block_shape[M-1] - crops[M-1,0] - crops[M-1,1],
+//
+//       input_shape[M+1], ..., input_shape[N-1]]
+//
+// Some examples:
+//
+// (1) For the following input of shape `[4, 1, 1, 1]`, `block_shape = [2, 2]`, and
+//     `crops = [[0, 0], [0, 0]]`:
+//
+// ```
+// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
+// ```
+//
+// The output tensor has shape `[1, 2, 2, 1]` and value:
+//
+// ```
+// x = [[[[1], [2]], [[3], [4]]]]
+// ```
+//
+// (2) For the following input of shape `[4, 1, 1, 3]`, `block_shape = [2, 2]`, and
+//     `crops = [[0, 0], [0, 0]]`:
+//
+// ```
+// [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
+// ```
+//
+// The output tensor has shape `[1, 2, 2, 3]` and value:
+//
+// ```
+// x = [[[[1, 2, 3], [4, 5, 6]],
+//       [[7, 8, 9], [10, 11, 12]]]]
+// ```
+//
+// (3) For the following input of shape `[4, 2, 2, 1]`, `block_shape = [2, 2]`, and
+//     `crops = [[0, 0], [0, 0]]`:
+//
+// ```
+// x = [[[[1], [3]], [[9], [11]]],
+//      [[[2], [4]], [[10], [12]]],
+//      [[[5], [7]], [[13], [15]]],
+//      [[[6], [8]], [[14], [16]]]]
+// ```
+//
+// The output tensor has shape `[1, 4, 4, 1]` and value:
+//
+// ```
+// x = [[[1],   [2],  [3],  [4]],
+//      [[5],   [6],  [7],  [8]],
+//      [[9],  [10], [11],  [12]],
+//      [[13], [14], [15],  [16]]]
+// ```
+//
+// (4) For the following input of shape `[8, 1, 3, 1]`, `block_shape = [2, 2]`, and
+//     `crops = [[0, 0], [2, 0]]`:
+//
+// ```
+// x = [[[[0], [1], [3]]], [[[0], [9], [11]]],
+//      [[[0], [2], [4]]], [[[0], [10], [12]]],
+//      [[[0], [5], [7]]], [[[0], [13], [15]]],
+//      [[[0], [6], [8]]], [[[0], [14], [16]]]]
+// ```
+//
+// The output tensor has shape `[2, 2, 4, 1]` and value:
+//
+// ```
+// x = [[[[1],   [2],  [3],  [4]],
+//       [[5],   [6],  [7],  [8]]],
+//      [[[9],  [10], [11],  [12]],
+//       [[13], [14], [15],  [16]]]]
+// ```
+func BatchToSpaceND(scope *Scope, input tf.Output, block_shape tf.Output, crops tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReaderReadUpToV2",
+		Type: "BatchToSpaceND",
 		Input: []tf.Input{
-			reader_handle, queue_handle, num_records,
+			input, block_shape, crops,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// BatchAttr is an optional argument to Batch.
-type BatchAttr func(optionalAttr)
-
-// BatchMaxEnqueuedBatches sets the optional max_enqueued_batches attribute to value.
-// If not specified, defaults to 10
-func BatchMaxEnqueuedBatches(value int64) BatchAttr {
-	return func(m optionalAttr) {
-		m["max_enqueued_batches"] = value
-	}
-}
-
-// BatchAllowedBatchSizes sets the optional allowed_batch_sizes attribute to value.
-// If not specified, defaults to <>
-func BatchAllowedBatchSizes(value []int64) BatchAttr {
-	return func(m optionalAttr) {
-		m["allowed_batch_sizes"] = value
-	}
-}
-
-// BatchContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func BatchContainer(value string) BatchAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// BatchSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func BatchSharedName(value string) BatchAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
+	return op.Output(0)
 }
 
-// BatchBatchingQueue sets the optional batching_queue attribute to value.
-// If not specified, defaults to ""
-func BatchBatchingQueue(value string) BatchAttr {
+// UnpackAttr is an optional argument to Unpack.
+type UnpackAttr func(optionalAttr)
+
+// UnpackAxis sets the optional axis attribute to value.
+//
+// value: Dimension along which to unpack.  Negative values wrap around, so the
+// valid range is `[-R, R)`.
+// If not specified, defaults to 0
+func UnpackAxis(value int64) UnpackAttr {
 	return func(m optionalAttr) {
-		m["batching_queue"] = value
+		m["axis"] = value
 	}
 }
 
-// Batches all input tensors nondeterministically.
+// Unpacks a given dimension of a rank-`R` tensor into `num` rank-`(R-1)` tensors.
 //
-// When many instances of this Op are being run concurrently with the same
-// container/shared_name in the same device, some will output zero-shaped Tensors
-// and others will output Tensors of size up to max_batch_size.
+// Unpacks `num` tensors from `value` by chipping it along the `axis` dimension.
+// For example, given a tensor of shape `(A, B, C, D)`;
 //
-// All Tensors in in_tensors are batched together (so, for example, labels and
-// features should be batched with a single instance of this operation.
+// If `axis == 0` then the i'th tensor in `output` is the slice `value[i, :, :, :]`
+//   and each tensor in `output` will have shape `(B, C, D)`. (Note that the
+//   dimension unpacked along is gone, unlike `split`).
 //
-// Each invocation of batch emits an `id` scalar which will be used to identify
-// this particular invocation when doing unbatch or its gradient.
+// If `axis == 1` then the i'th tensor in `output` is the slice `value[:, i, :, :]`
+//   and each tensor in `output` will have shape `(A, C, D)`.
+// Etc.
 //
-// Each op which emits a non-empty batch will also emit a non-empty batch_index
-// Tensor, which, is a [K, 3] matrix where each row contains the invocation's id,
-// start, and length of elements of each set of Tensors present in batched_tensors.
+// This is the opposite of `pack`.
 //
-// Batched tensors are concatenated along the first dimension, and all tensors in
-// in_tensors must have the first dimension of the same size.
+// Arguments:
+//	value: 1-D or higher, with `axis` dimension size equal to `num`.
 //
-// in_tensors: The tensors to be batched.
-// num_batch_threads: Number of scheduling threads for processing batches of work.
-//  Determines the number of batches processed in parallel.
-// max_batch_size: Batch sizes will never be bigger than this.
-// batch_timeout_micros: Maximum number of microseconds to wait before outputting
-//  an incomplete batch.
-// allowed_batch_sizes: Optional list of allowed batch sizes. If left empty, does
-//  nothing. Otherwise, supplies a list of batch sizes, causing the op to pad
-//  batches up to one of those sizes. The entries must increase monotonically, and
-//  the final entry must equal max_batch_size.
-// grad_timeout_micros: The timeout to use for the gradient. See Unbatch.
-// batched_tensors: Either empty tensors or a batch of concatenated Tensors.
-// batch_index: If out_tensors is non-empty, has information to invert it.
-// container: Controls the scope of sharing of this batch.
-// id: always contains a scalar with a unique ID for this invocation of Batch.
-// shared_name: Concurrently running instances of batch in the same device with the
-//  same container and shared_name will batch their elements together. If left
-//  empty, the op name will be used as the shared name.
-// T: the types of tensors to be batched.
-func Batch(scope *Scope, in_tensors []tf.Output, num_batch_threads int64, max_batch_size int64, batch_timeout_micros int64, grad_timeout_micros int64, optional ...BatchAttr) (batched_tensors []tf.Output, batch_index tf.Output, id tf.Output) {
+//
+// Returns The list of tensors unpacked from `value`.
+func Unpack(scope *Scope, value tf.Output, num int64, optional ...UnpackAttr) (output []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_batch_threads": num_batch_threads, "max_batch_size": max_batch_size, "batch_timeout_micros": batch_timeout_micros, "grad_timeout_micros": grad_timeout_micros}
+	attrs := map[string]interface{}{"num": num}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Batch",
+		Type: "Unpack",
 		Input: []tf.Input{
-			tf.OutputList(in_tensors),
+			value,
 		},
 		Attrs: attrs,
 	}
@@ -28626,89 +28211,112 @@ func Batch(scope *Scope, in_tensors []tf.Output, num_batch_threads int64, max_ba
 	}
 	var idx int
 	var err error
-	if batched_tensors, idx, err = makeOutputList(op, idx, "batched_tensors"); err != nil {
-		scope.UpdateErr("Batch", err)
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("Unpack", err)
 		return
 	}
-	batch_index = op.Output(idx)
-	id = op.Output(idx)
-	return batched_tensors, batch_index, id
+	return output
 }
 
-// Adjust the hue of one or more images.
+// Increments variable pointed to by 'resource' until it reaches 'limit'.
 //
-// `images` is a tensor of at least 3 dimensions.  The last dimension is
-// interpretted as channels, and must be three.
+// Arguments:
+//	resource: Should be from a scalar `Variable` node.
+//	limit: If incrementing ref would bring it above limit, instead generates an
+// 'OutOfRange' error.
 //
-// The input image is considered in the RGB colorspace. Conceptually, the RGB
-// colors are first mapped into HSV. A delta is then applied all the hue values,
-// and then remapped back to RGB colorspace.
+//
+// Returns A copy of the input before increment. If nothing else modifies the
+// input, the values produced will all be distinct.
+func ResourceCountUpTo(scope *Scope, resource tf.Output, limit int64, T tf.DataType) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"limit": limit, "T": T}
+	opspec := tf.OpSpec{
+		Type: "ResourceCountUpTo",
+		Input: []tf.Input{
+			resource,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Delete the stack from its resource container.
 //
 // Arguments:
-//	images: Images to adjust.  At least 3-D.
-//	delta: A float delta to add to the hue.
+//	handle: The handle to a stack.
 //
-// Returns The hue-adjusted image or images.
-func AdjustHue(scope *Scope, images tf.Output, delta tf.Output) (output tf.Output) {
+// Returns the created operation.
+func StackCloseV2(scope *Scope, handle tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "AdjustHue",
+		Type: "StackCloseV2",
 		Input: []tf.Input{
-			images, delta,
+			handle,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Generate a glob pattern matching all sharded file names.
+func ShardedFilespec(scope *Scope, basename tf.Output, num_shards tf.Output) (filename tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ShardedFilespec",
+		Input: []tf.Input{
+			basename, num_shards,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceApplyAdamAttr is an optional argument to ResourceApplyAdam.
-type ResourceApplyAdamAttr func(optionalAttr)
+// TextLineReaderV2Attr is an optional argument to TextLineReaderV2.
+type TextLineReaderV2Attr func(optionalAttr)
 
-// ResourceApplyAdamUseLocking sets the optional use_locking attribute to value.
+// TextLineReaderV2SkipHeaderLines sets the optional skip_header_lines attribute to value.
 //
-// value: If `True`, updating of the var, m, and v tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyAdamUseLocking(value bool) ResourceApplyAdamAttr {
+// value: Number of lines to skip from the beginning of every file.
+// If not specified, defaults to 0
+func TextLineReaderV2SkipHeaderLines(value int64) TextLineReaderV2Attr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["skip_header_lines"] = value
 	}
 }
 
-// ResourceApplyAdamUseNesterov sets the optional use_nesterov attribute to value.
+// TextLineReaderV2Container sets the optional container attribute to value.
 //
-// value: If `True`, uses the nesterov update.
-// If not specified, defaults to false
-func ResourceApplyAdamUseNesterov(value bool) ResourceApplyAdamAttr {
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func TextLineReaderV2Container(value string) TextLineReaderV2Attr {
 	return func(m optionalAttr) {
-		m["use_nesterov"] = value
+		m["container"] = value
 	}
 }
 
-// Update '*var' according to the Adam algorithm.
-//
-// $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
-// $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
-// $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
-// $$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
+// TextLineReaderV2SharedName sets the optional shared_name attribute to value.
 //
-// Arguments:
-//	var_: Should be from a Variable().
-//	m: Should be from a Variable().
-//	v: Should be from a Variable().
-//	beta1_power: Must be a scalar.
-//	beta2_power: Must be a scalar.
-//	lr: Scaling factor. Must be a scalar.
-//	beta1: Momentum factor. Must be a scalar.
-//	beta2: Momentum factor. Must be a scalar.
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func TextLineReaderV2SharedName(value string) TextLineReaderV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// A Reader that outputs the lines of a file delimited by '\n'.
 //
-// Returns the created operation.
-func ResourceApplyAdam(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, beta1_power tf.Output, beta2_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdamAttr) (o *tf.Operation) {
+// Returns The handle to reference the Reader.
+func TextLineReaderV2(scope *Scope, optional ...TextLineReaderV2Attr) (reader_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -28717,61 +28325,141 @@ func ResourceApplyAdam(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, b
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdam",
-		Input: []tf.Input{
-			var_, m, v, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad,
-		},
+		Type: "TextLineReaderV2",
+
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Store the input tensor in the state of the current session.
+// LoadAndRemapMatrixAttr is an optional argument to LoadAndRemapMatrix.
+type LoadAndRemapMatrixAttr func(optionalAttr)
+
+// LoadAndRemapMatrixMaxRowsInMemory sets the optional max_rows_in_memory attribute to value.
+//
+// value: The maximum number of rows to load from the checkpoint at
+// once. If less than or equal to 0, the entire matrix will be loaded into
+// memory. Setting this arg trades increased disk reads for lower memory usage.
+// If not specified, defaults to -1
+func LoadAndRemapMatrixMaxRowsInMemory(value int64) LoadAndRemapMatrixAttr {
+	return func(m optionalAttr) {
+		m["max_rows_in_memory"] = value
+	}
+}
+
+// Loads a 2-D (matrix) `Tensor` with name `old_tensor_name` from the checkpoint
+//
+// at `ckpt_path` and potentially reorders its rows and columns using the
+// specified remappings.
+//
+// Most users should use one of the wrapper initializers (such as
+// `tf.contrib.framework.load_and_remap_matrix_initializer`) instead of this
+// function directly.
+//
+// The remappings are 1-D tensors with the following properties:
+//
+// * `row_remapping` must have exactly `num_rows` entries. Row `i` of the output
+//   matrix will be initialized from the row corresponding to index
+//   `row_remapping[i]` in the old `Tensor` from the checkpoint.
+// * `col_remapping` must have either 0 entries (indicating that no column
+//   reordering is needed) or `num_cols` entries. If specified, column `j` of the
+//   output matrix will be initialized from the column corresponding to index
+//   `col_remapping[j]` in the old `Tensor` from the checkpoint.
+// * A value of -1 in either of the remappings signifies a "missing" entry. In that
+//   case, values from the `initializing_values` tensor will be used to fill that
+//   missing row or column. If `row_remapping` has `r` missing entries and
+//   `col_remapping` has `c` missing entries, then the following condition must be
+//   true:
+//
+// `(r * num_cols) + (c * num_rows) - (r * c) == len(initializing_values)`
+//
+// The remapping tensors can be generated using the GenerateVocabRemapping op.
+//
+// As an example, with row_remapping = [1, 0, -1], col_remapping = [0, 2, -1],
+// initializing_values = [0.5, -0.5, 0.25, -0.25, 42], and w(i, j) representing
+// the value from row i, column j of the old tensor in the checkpoint, the output
+// matrix will look like the following:
+//
+// [[w(1, 0),  w(1, 2),  0.5],
+//  [w(0, 0),  w(0, 2), -0.5],
+//  [0.25,    -0.25,      42]]
 //
 // Arguments:
-//	value: The tensor to be stored.
+//	ckpt_path: Path to the TensorFlow checkpoint (version 2, `TensorBundle`) from
+// which the old matrix `Tensor` will be loaded.
+//	old_tensor_name: Name of the 2-D `Tensor` to load from checkpoint.
+//	row_remapping: An int `Tensor` of row remappings (generally created by
+// `generate_vocab_remapping`).  Even if no row remapping is needed, this must
+// still be an index-valued Tensor (e.g. [0, 1, 2, ...]), or a shifted
+// index-valued `Tensor` (e.g. [8, 9, 10, ...], for partitioned `Variables`).
+//	col_remapping: An int `Tensor` of column remappings (generally created by
+// `generate_vocab_remapping`).  May be a size-0 `Tensor` if only row remapping
+// is to be done (e.g. column ordering is the same).
+//	initializing_values: A float `Tensor` containing  values to fill in for cells
+// in the output matrix that are not loaded from the checkpoint. Length must be
+// exactly the same as the number of missing / new cells.
+//	num_rows: Number of rows (length of the 1st dimension) in the output matrix.
+//	num_cols: Number of columns (length of the 2nd dimension) in the output matrix.
 //
-// Returns The handle for the tensor stored in the session state, represented
-// as a ResourceHandle object.
-func GetSessionHandleV2(scope *Scope, value tf.Output) (handle tf.Output) {
+// Returns Output matrix containing existing values loaded from the
+// checkpoint, and with any missing values filled in from initializing_values.
+func LoadAndRemapMatrix(scope *Scope, ckpt_path tf.Output, old_tensor_name tf.Output, row_remapping tf.Output, col_remapping tf.Output, initializing_values tf.Output, num_rows int64, num_cols int64, optional ...LoadAndRemapMatrixAttr) (output_matrix tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_rows": num_rows, "num_cols": num_cols}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "GetSessionHandleV2",
+		Type: "LoadAndRemapMatrix",
 		Input: []tf.Input{
-			value,
+			ckpt_path, old_tensor_name, row_remapping, col_remapping, initializing_values,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResizeBicubicGradAttr is an optional argument to ResizeBicubicGrad.
-type ResizeBicubicGradAttr func(optionalAttr)
+// TFRecordReaderV2Attr is an optional argument to TFRecordReaderV2.
+type TFRecordReaderV2Attr func(optionalAttr)
 
-// ResizeBicubicGradAlignCorners sets the optional align_corners attribute to value.
+// TFRecordReaderV2Container sets the optional container attribute to value.
 //
-// value: If true, the centers of the 4 corner pixels of the input and grad tensors are
-// aligned. Defaults to false.
-// If not specified, defaults to false
-func ResizeBicubicGradAlignCorners(value bool) ResizeBicubicGradAttr {
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func TFRecordReaderV2Container(value string) TFRecordReaderV2Attr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["container"] = value
 	}
 }
 
-// Computes the gradient of bicubic interpolation.
+// TFRecordReaderV2SharedName sets the optional shared_name attribute to value.
 //
-// Arguments:
-//	grads: 4-D with shape `[batch, height, width, channels]`.
-//	original_image: 4-D with shape `[batch, orig_height, orig_width, channels]`,
-// The image tensor that was resized.
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func TFRecordReaderV2SharedName(value string) TFRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// TFRecordReaderV2CompressionType sets the optional compression_type attribute to value.
+// If not specified, defaults to ""
+func TFRecordReaderV2CompressionType(value string) TFRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["compression_type"] = value
+	}
+}
+
+// A Reader that outputs the records from a TensorFlow Records file.
 //
-// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`.
-// Gradients with respect to the input image. Input image must have been
-// float or double.
-func ResizeBicubicGrad(scope *Scope, grads tf.Output, original_image tf.Output, optional ...ResizeBicubicGradAttr) (output tf.Output) {
+// Returns The handle to reference the Reader.
+func TFRecordReaderV2(scope *Scope, optional ...TFRecordReaderV2Attr) (reader_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -28780,40 +28468,38 @@ func ResizeBicubicGrad(scope *Scope, grads tf.Output, original_image tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResizeBicubicGrad",
-		Input: []tf.Input{
-			grads, original_image,
-		},
+		Type: "TFRecordReaderV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResizeNearestNeighborAttr is an optional argument to ResizeNearestNeighbor.
-type ResizeNearestNeighborAttr func(optionalAttr)
+// QuantizeAndDequantizeV3Attr is an optional argument to QuantizeAndDequantizeV3.
+type QuantizeAndDequantizeV3Attr func(optionalAttr)
 
-// ResizeNearestNeighborAlignCorners sets the optional align_corners attribute to value.
-//
-// value: If true, the centers of the 4 corner pixels of the input and output tensors are
-// aligned, preserving the values at the corner pixels. Defaults to false.
-// If not specified, defaults to false
-func ResizeNearestNeighborAlignCorners(value bool) ResizeNearestNeighborAttr {
+// QuantizeAndDequantizeV3SignedInput sets the optional signed_input attribute to value.
+// If not specified, defaults to true
+func QuantizeAndDequantizeV3SignedInput(value bool) QuantizeAndDequantizeV3Attr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["signed_input"] = value
 	}
 }
 
-// Resize `images` to `size` using nearest neighbor interpolation.
-//
-// Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
+// QuantizeAndDequantizeV3RangeGiven sets the optional range_given attribute to value.
+// If not specified, defaults to true
+func QuantizeAndDequantizeV3RangeGiven(value bool) QuantizeAndDequantizeV3Attr {
+	return func(m optionalAttr) {
+		m["range_given"] = value
+	}
+}
+
+// Quantizes then dequantizes a tensor.
 //
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func ResizeNearestNeighbor(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeNearestNeighborAttr) (resized_images tf.Output) {
+// This is almost identical to QuantizeAndDequantizeV2, except that num_bits is a
+// tensor, so its value can change during training.
+func QuantizeAndDequantizeV3(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, num_bits tf.Output, optional ...QuantizeAndDequantizeV3Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -28822,9 +28508,9 @@ func ResizeNearestNeighbor(scope *Scope, images tf.Output, size tf.Output, optio
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResizeNearestNeighbor",
+		Type: "QuantizeAndDequantizeV3",
 		Input: []tf.Input{
-			images, size,
+			input, input_min, input_max, num_bits,
 		},
 		Attrs: attrs,
 	}
@@ -28832,30 +28518,38 @@ func ResizeNearestNeighbor(scope *Scope, images tf.Output, size tf.Output, optio
 	return op.Output(0)
 }
 
-// ResizeNearestNeighborGradAttr is an optional argument to ResizeNearestNeighborGrad.
-type ResizeNearestNeighborGradAttr func(optionalAttr)
+// IdentityReaderV2Attr is an optional argument to IdentityReaderV2.
+type IdentityReaderV2Attr func(optionalAttr)
 
-// ResizeNearestNeighborGradAlignCorners sets the optional align_corners attribute to value.
+// IdentityReaderV2Container sets the optional container attribute to value.
 //
-// value: If true, the centers of the 4 corner pixels of the input and grad tensors are
-// aligned. Defaults to false.
-// If not specified, defaults to false
-func ResizeNearestNeighborGradAlignCorners(value bool) ResizeNearestNeighborGradAttr {
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func IdentityReaderV2Container(value string) IdentityReaderV2Attr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["container"] = value
 	}
 }
 
-// Computes the gradient of nearest neighbor interpolation.
+// IdentityReaderV2SharedName sets the optional shared_name attribute to value.
 //
-// Arguments:
-//	grads: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `orig_height, orig_width`. The
-// original input size.
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func IdentityReaderV2SharedName(value string) IdentityReaderV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// A Reader that outputs the queued work as both the key and value.
 //
-// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`. Gradients
-// with respect to the input image.
-func ResizeNearestNeighborGrad(scope *Scope, grads tf.Output, size tf.Output, optional ...ResizeNearestNeighborGradAttr) (output tf.Output) {
+// To use, enqueue strings in a Queue.  ReaderRead will take the front
+// work string and output (work, work).
+//
+// Returns The handle to reference the Reader.
+func IdentityReaderV2(scope *Scope, optional ...IdentityReaderV2Attr) (reader_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -28864,39 +28558,37 @@ func ResizeNearestNeighborGrad(scope *Scope, grads tf.Output, size tf.Output, op
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResizeNearestNeighborGrad",
-		Input: []tf.Input{
-			grads, size,
-		},
+		Type: "IdentityReaderV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ExtractJpegShapeAttr is an optional argument to ExtractJpegShape.
-type ExtractJpegShapeAttr func(optionalAttr)
+// ResourceApplyGradientDescentAttr is an optional argument to ResourceApplyGradientDescent.
+type ResourceApplyGradientDescentAttr func(optionalAttr)
 
-// ExtractJpegShapeOutputType sets the optional output_type attribute to value.
+// ResourceApplyGradientDescentUseLocking sets the optional use_locking attribute to value.
 //
-// value: (Optional) The output type of the operation (int32 or int64).
-// Defaults to int32.
-// If not specified, defaults to DT_INT32
-func ExtractJpegShapeOutputType(value tf.DataType) ExtractJpegShapeAttr {
+// value: If `True`, the subtraction will be protected by a lock;
+// otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceApplyGradientDescentUseLocking(value bool) ResourceApplyGradientDescentAttr {
 	return func(m optionalAttr) {
-		m["output_type"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Extract the shape information of a JPEG-encoded image.
-//
-// This op only parses the image header, so it is much faster than DecodeJpeg.
+// Update '*var' by subtracting 'alpha' * 'delta' from it.
 //
 // Arguments:
-//	contents: 0-D. The JPEG-encoded image.
+//	var_: Should be from a Variable().
+//	alpha: Scaling factor. Must be a scalar.
+//	delta: The change.
 //
-// Returns 1-D. The image shape with format [height, width, channels].
-func ExtractJpegShape(scope *Scope, contents tf.Output, optional ...ExtractJpegShapeAttr) (image_shape tf.Output) {
+// Returns the created operation.
+func ResourceApplyGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, delta tf.Output, optional ...ResourceApplyGradientDescentAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -28905,689 +28597,627 @@ func ExtractJpegShape(scope *Scope, contents tf.Output, optional ...ExtractJpegS
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ExtractJpegShape",
+		Type: "ResourceApplyGradientDescent",
 		Input: []tf.Input{
-			contents,
+			var_, alpha, delta,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// PaddingFIFOQueueV2Attr is an optional argument to PaddingFIFOQueueV2.
-type PaddingFIFOQueueV2Attr func(optionalAttr)
-
-// PaddingFIFOQueueV2Shapes sets the optional shapes attribute to value.
-//
-// value: The shape of each component in a value. The length of this attr must
-// be either 0 or the same as the length of component_types.
-// Shapes of fixed rank but variable size are allowed by setting
-// any shape dimension to -1.  In this case, the inputs' shape may vary along
-// the given dimension, and DequeueMany will pad the given dimension with
-// zeros up to the maximum shape of all elements in the given batch.
-// If the length of this attr is 0, different queue elements may have
-// different ranks and shapes, but only one element may be dequeued at a time.
-// If not specified, defaults to <>
+// Returns the next record (key, value pair) produced by a Reader.
 //
-// REQUIRES: len(value) >= 0
-func PaddingFIFOQueueV2Shapes(value []tf.Shape) PaddingFIFOQueueV2Attr {
-	return func(m optionalAttr) {
-		m["shapes"] = value
-	}
-}
-
-// PaddingFIFOQueueV2Capacity sets the optional capacity attribute to value.
+// Will dequeue from the input queue if necessary (e.g. when the
+// Reader needs to start reading from a new file since it has finished
+// with the previous file).
 //
-// value: The upper bound on the number of elements in this queue.
-// Negative numbers mean no limit.
-// If not specified, defaults to -1
-func PaddingFIFOQueueV2Capacity(value int64) PaddingFIFOQueueV2Attr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// PaddingFIFOQueueV2Container sets the optional container attribute to value.
+// Arguments:
+//	reader_handle: Handle to a Reader.
+//	queue_handle: Handle to a Queue, with string work items.
 //
-// value: If non-empty, this queue is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func PaddingFIFOQueueV2Container(value string) PaddingFIFOQueueV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
+// Returns A scalar.A scalar.
+func ReaderReadV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output) (key tf.Output, value tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// PaddingFIFOQueueV2SharedName sets the optional shared_name attribute to value.
-//
-// value: If non-empty, this queue will be shared under the given name
-// across multiple sessions.
-// If not specified, defaults to ""
-func PaddingFIFOQueueV2SharedName(value string) PaddingFIFOQueueV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
+	opspec := tf.OpSpec{
+		Type: "ReaderReadV2",
+		Input: []tf.Input{
+			reader_handle, queue_handle,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
 }
 
-// A queue that produces elements in first-in first-out order.
+// Returns up to `num_records` (key, value) pairs produced by a Reader.
 //
-// Variable-size shapes are allowed by setting the corresponding shape dimensions
-// to 0 in the shape attr.  In this case DequeueMany will pad up to the maximum
-// size of any given element in the minibatch.  See below for details.
+// Will dequeue from the input queue if necessary (e.g. when the
+// Reader needs to start reading from a new file since it has finished
+// with the previous file).
+// It may return less than `num_records` even before the last batch.
 //
 // Arguments:
-//	component_types: The type of each component in a value.
+//	reader_handle: Handle to a `Reader`.
+//	queue_handle: Handle to a `Queue`, with string work items.
+//	num_records: number of records to read from `Reader`.
 //
-// Returns The handle to the queue.
-func PaddingFIFOQueueV2(scope *Scope, component_types []tf.DataType, optional ...PaddingFIFOQueueV2Attr) (handle tf.Output) {
+// Returns A 1-D tensor.A 1-D tensor.
+func ReaderReadUpToV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output, num_records tf.Output) (keys tf.Output, values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"component_types": component_types}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "PaddingFIFOQueueV2",
-
-		Attrs: attrs,
+		Type: "ReaderReadUpToV2",
+		Input: []tf.Input{
+			reader_handle, queue_handle, num_records,
+		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// DecodePngAttr is an optional argument to DecodePng.
-type DecodePngAttr func(optionalAttr)
-
-// DecodePngChannels sets the optional channels attribute to value.
-//
-// value: Number of color channels for the decoded image.
-// If not specified, defaults to 0
-func DecodePngChannels(value int64) DecodePngAttr {
-	return func(m optionalAttr) {
-		m["channels"] = value
-	}
-}
-
-// DecodePngDtype sets the optional dtype attribute to value.
-// If not specified, defaults to DT_UINT8
-func DecodePngDtype(value tf.DataType) DecodePngAttr {
-	return func(m optionalAttr) {
-		m["dtype"] = value
-	}
+	return op.Output(0), op.Output(1)
 }
 
-// Decode a PNG-encoded image to a uint8 or uint16 tensor.
-//
-// The attr `channels` indicates the desired number of color channels for the
-// decoded image.
-//
-// Accepted values are:
-//
-// *   0: Use the number of channels in the PNG-encoded image.
-// *   1: output a grayscale image.
-// *   3: output an RGB image.
-// *   4: output an RGBA image.
-//
-// If needed, the PNG-encoded image is transformed to match the requested number
-// of color channels.
+//     Adds v into specified rows of x.
 //
-// This op also supports decoding JPEGs and non-animated GIFs since the interface
-// is the same, though it is cleaner to use `tf.image.decode_image`.
+//     Computes y = x; y[i, :] += v; return y.
 //
 // Arguments:
-//	contents: 0-D.  The PNG-encoded image.
+//	x: A `Tensor` of type T.
+//	i: A vector. Indices into the left-most dimension of `x`.
+//	v: A `Tensor` of type T. Same dimension sizes as x except the first dimension, which must be the same as i's size.
 //
-// Returns 3-D with shape `[height, width, channels]`.
-func DecodePng(scope *Scope, contents tf.Output, optional ...DecodePngAttr) (image tf.Output) {
+// Returns A `Tensor` of type T. An alias of `x`. The content of `y` is undefined if there are duplicates in `i`.
+func InplaceAdd(scope *Scope, x tf.Output, i tf.Output, v tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "DecodePng",
+		Type: "InplaceAdd",
 		Input: []tf.Input{
-			contents,
+			x, i, v,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Decode the first frame of a GIF-encoded image to a uint8 tensor.
-//
-// GIF with frame or transparency compression are not supported
-// convert animated GIF from compressed to uncompressed by:
-//
-//     convert $src.gif -coalesce $dst.gif
-//
-// This op also supports decoding JPEGs and PNGs, though it is cleaner to use
-// `tf.image.decode_image`.
+// Restore a Reader to its initial clean state.
 //
 // Arguments:
-//	contents: 0-D.  The GIF-encoded image.
+//	reader_handle: Handle to a Reader.
 //
-// Returns 4-D with shape `[num_frames, height, width, 3]`. RGB order
-func DecodeGif(scope *Scope, contents tf.Output) (image tf.Output) {
+// Returns the created operation.
+func ReaderResetV2(scope *Scope, reader_handle tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeGif",
+		Type: "ReaderResetV2",
 		Input: []tf.Input{
-			contents,
+			reader_handle,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// LearnedUnigramCandidateSamplerAttr is an optional argument to LearnedUnigramCandidateSampler.
-type LearnedUnigramCandidateSamplerAttr func(optionalAttr)
+// BatchAttr is an optional argument to Batch.
+type BatchAttr func(optionalAttr)
 
-// LearnedUnigramCandidateSamplerSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func LearnedUnigramCandidateSamplerSeed(value int64) LearnedUnigramCandidateSamplerAttr {
+// BatchMaxEnqueuedBatches sets the optional max_enqueued_batches attribute to value.
+// If not specified, defaults to 10
+func BatchMaxEnqueuedBatches(value int64) BatchAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["max_enqueued_batches"] = value
 	}
 }
 
-// LearnedUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
-//
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func LearnedUnigramCandidateSamplerSeed2(value int64) LearnedUnigramCandidateSamplerAttr {
+// BatchAllowedBatchSizes sets the optional allowed_batch_sizes attribute to value.
+// If not specified, defaults to <>
+func BatchAllowedBatchSizes(value []int64) BatchAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["allowed_batch_sizes"] = value
 	}
 }
 
-// Generates labels for candidate sampling with a learned unigram distribution.
+// BatchContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func BatchContainer(value string) BatchAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// BatchSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func BatchSharedName(value string) BatchAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// BatchBatchingQueue sets the optional batching_queue attribute to value.
+// If not specified, defaults to ""
+func BatchBatchingQueue(value string) BatchAttr {
+	return func(m optionalAttr) {
+		m["batching_queue"] = value
+	}
+}
+
+// Batches all input tensors nondeterministically.
 //
-// See explanations of candidate sampling and the data formats at
-// go/candidate-sampling.
+// When many instances of this Op are being run concurrently with the same
+// container/shared_name in the same device, some will output zero-shaped Tensors
+// and others will output Tensors of size up to max_batch_size.
 //
-// For each batch, this op picks a single set of sampled candidate labels.
+// All Tensors in in_tensors are batched together (so, for example, labels and
+// features should be batched with a single instance of this operation.
 //
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
+// Each invocation of batch emits an `id` scalar which will be used to identify
+// this particular invocation when doing unbatch or its gradient.
 //
-// Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to randomly sample.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
-//	range_max: The sampler will sample integers from the interval [0, range_max).
+// Each op which emits a non-empty batch will also emit a non-empty batch_index
+// Tensor, which, is a [K, 3] matrix where each row contains the invocation's id,
+// start, and length of elements of each set of Tensors present in batched_tensors.
 //
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func LearnedUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...LearnedUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+// Batched tensors are concatenated along the first dimension, and all tensors in
+// in_tensors must have the first dimension of the same size.
+//
+// in_tensors: The tensors to be batched.
+// num_batch_threads: Number of scheduling threads for processing batches of work.
+//  Determines the number of batches processed in parallel.
+// max_batch_size: Batch sizes will never be bigger than this.
+// batch_timeout_micros: Maximum number of microseconds to wait before outputting
+//  an incomplete batch.
+// allowed_batch_sizes: Optional list of allowed batch sizes. If left empty, does
+//  nothing. Otherwise, supplies a list of batch sizes, causing the op to pad
+//  batches up to one of those sizes. The entries must increase monotonically, and
+//  the final entry must equal max_batch_size.
+// grad_timeout_micros: The timeout to use for the gradient. See Unbatch.
+// batched_tensors: Either empty tensors or a batch of concatenated Tensors.
+// batch_index: If out_tensors is non-empty, has information to invert it.
+// container: Controls the scope of sharing of this batch.
+// id: always contains a scalar with a unique ID for this invocation of Batch.
+// shared_name: Concurrently running instances of batch in the same device with the
+//  same container and shared_name will batch their elements together. If left
+//  empty, the op name will be used as the shared name.
+// T: the types of tensors to be batched.
+func Batch(scope *Scope, in_tensors []tf.Output, num_batch_threads int64, max_batch_size int64, batch_timeout_micros int64, grad_timeout_micros int64, optional ...BatchAttr) (batched_tensors []tf.Output, batch_index tf.Output, id tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	attrs := map[string]interface{}{"num_batch_threads": num_batch_threads, "max_batch_size": max_batch_size, "batch_timeout_micros": batch_timeout_micros, "grad_timeout_micros": grad_timeout_micros}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "LearnedUnigramCandidateSampler",
+		Type: "Batch",
 		Input: []tf.Input{
-			true_classes,
+			tf.OutputList(in_tensors),
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// SerializeSparseAttr is an optional argument to SerializeSparse.
-type SerializeSparseAttr func(optionalAttr)
-
-// SerializeSparseOutType sets the optional out_type attribute to value.
-//
-// value: The `dtype` to use for serialization; the supported types are `string`
-// (default) and `variant`.
-// If not specified, defaults to DT_STRING
-func SerializeSparseOutType(value tf.DataType) SerializeSparseAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if batched_tensors, idx, err = makeOutputList(op, idx, "batched_tensors"); err != nil {
+		scope.UpdateErr("Batch", err)
+		return
 	}
+	batch_index = op.Output(idx)
+	id = op.Output(idx)
+	return batched_tensors, batch_index, id
 }
 
-// Serialize a `SparseTensor` into a `[3]` `Tensor` object.
+// Adjust the hue of one or more images.
+//
+// `images` is a tensor of at least 3 dimensions.  The last dimension is
+// interpretted as channels, and must be three.
+//
+// The input image is considered in the RGB colorspace. Conceptually, the RGB
+// colors are first mapped into HSV. A delta is then applied all the hue values,
+// and then remapped back to RGB colorspace.
 //
 // Arguments:
-//	sparse_indices: 2-D.  The `indices` of the `SparseTensor`.
-//	sparse_values: 1-D.  The `values` of the `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the `SparseTensor`.
-func SerializeSparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...SerializeSparseAttr) (serialized_sparse tf.Output) {
+//	images: Images to adjust.  At least 3-D.
+//	delta: A float delta to add to the hue.
+//
+// Returns The hue-adjusted image or images.
+func AdjustHue(scope *Scope, images tf.Output, delta tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "SerializeSparse",
+		Type: "AdjustHue",
 		Input: []tf.Input{
-			sparse_indices, sparse_values, sparse_shape,
+			images, delta,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// RandomShuffleQueueV2Attr is an optional argument to RandomShuffleQueueV2.
-type RandomShuffleQueueV2Attr func(optionalAttr)
+// ResourceApplyAdamAttr is an optional argument to ResourceApplyAdam.
+type ResourceApplyAdamAttr func(optionalAttr)
 
-// RandomShuffleQueueV2Shapes sets the optional shapes attribute to value.
-//
-// value: The shape of each component in a value. The length of this attr must
-// be either 0 or the same as the length of component_types. If the length of
-// this attr is 0, the shapes of queue elements are not constrained, and
-// only one element may be dequeued at a time.
-// If not specified, defaults to <>
+// ResourceApplyAdamUseLocking sets the optional use_locking attribute to value.
 //
-// REQUIRES: len(value) >= 0
-func RandomShuffleQueueV2Shapes(value []tf.Shape) RandomShuffleQueueV2Attr {
+// value: If `True`, updating of the var, m, and v tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyAdamUseLocking(value bool) ResourceApplyAdamAttr {
 	return func(m optionalAttr) {
-		m["shapes"] = value
+		m["use_locking"] = value
 	}
 }
 
-// RandomShuffleQueueV2Capacity sets the optional capacity attribute to value.
+// ResourceApplyAdamUseNesterov sets the optional use_nesterov attribute to value.
 //
-// value: The upper bound on the number of elements in this queue.
-// Negative numbers mean no limit.
-// If not specified, defaults to -1
-func RandomShuffleQueueV2Capacity(value int64) RandomShuffleQueueV2Attr {
+// value: If `True`, uses the nesterov update.
+// If not specified, defaults to false
+func ResourceApplyAdamUseNesterov(value bool) ResourceApplyAdamAttr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["use_nesterov"] = value
 	}
 }
 
-// RandomShuffleQueueV2MinAfterDequeue sets the optional min_after_dequeue attribute to value.
+// Update '*var' according to the Adam algorithm.
 //
-// value: Dequeue will block unless there would be this
-// many elements after the dequeue or the queue is closed. This
-// ensures a minimum level of mixing of elements.
-// If not specified, defaults to 0
-func RandomShuffleQueueV2MinAfterDequeue(value int64) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["min_after_dequeue"] = value
-	}
-}
-
-// RandomShuffleQueueV2Seed sets the optional seed attribute to value.
+// $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
+// $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
+// $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+// $$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
 //
-// value: If either seed or seed2 is set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, a random seed is used.
-// If not specified, defaults to 0
-func RandomShuffleQueueV2Seed(value int64) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// RandomShuffleQueueV2Seed2 sets the optional seed2 attribute to value.
+// Arguments:
+//	var_: Should be from a Variable().
+//	m: Should be from a Variable().
+//	v: Should be from a Variable().
+//	beta1_power: Must be a scalar.
+//	beta2_power: Must be a scalar.
+//	lr: Scaling factor. Must be a scalar.
+//	beta1: Momentum factor. Must be a scalar.
+//	beta2: Momentum factor. Must be a scalar.
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomShuffleQueueV2Seed2(value int64) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
+// Returns the created operation.
+func ResourceApplyAdam(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, beta1_power tf.Output, beta2_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdamAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceApplyAdam",
+		Input: []tf.Input{
+			var_, m, v, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad,
+		},
+		Attrs: attrs,
 	}
+	return scope.AddOperation(opspec)
 }
 
-// RandomShuffleQueueV2Container sets the optional container attribute to value.
+// Store the input tensor in the state of the current session.
 //
-// value: If non-empty, this queue is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func RandomShuffleQueueV2Container(value string) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
+// Arguments:
+//	value: The tensor to be stored.
+//
+// Returns The handle for the tensor stored in the session state, represented
+// as a ResourceHandle object.
+func GetSessionHandleV2(scope *Scope, value tf.Output) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "GetSessionHandleV2",
+		Input: []tf.Input{
+			value,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// RandomShuffleQueueV2SharedName sets the optional shared_name attribute to value.
+// ResizeBicubicGradAttr is an optional argument to ResizeBicubicGrad.
+type ResizeBicubicGradAttr func(optionalAttr)
+
+// ResizeBicubicGradAlignCorners sets the optional align_corners attribute to value.
 //
-// value: If non-empty, this queue will be shared under the given name
-// across multiple sessions.
-// If not specified, defaults to ""
-func RandomShuffleQueueV2SharedName(value string) RandomShuffleQueueV2Attr {
+// value: If true, the centers of the 4 corner pixels of the input and grad tensors are
+// aligned. Defaults to false.
+// If not specified, defaults to false
+func ResizeBicubicGradAlignCorners(value bool) ResizeBicubicGradAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["align_corners"] = value
 	}
 }
 
-// A queue that randomizes the order of elements.
+// Computes the gradient of bicubic interpolation.
 //
 // Arguments:
-//	component_types: The type of each component in a value.
+//	grads: 4-D with shape `[batch, height, width, channels]`.
+//	original_image: 4-D with shape `[batch, orig_height, orig_width, channels]`,
+// The image tensor that was resized.
 //
-// Returns The handle to the queue.
-func RandomShuffleQueueV2(scope *Scope, component_types []tf.DataType, optional ...RandomShuffleQueueV2Attr) (handle tf.Output) {
+// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`.
+// Gradients with respect to the input image. Input image must have been
+// float or double.
+func ResizeBicubicGrad(scope *Scope, grads tf.Output, original_image tf.Output, optional ...ResizeBicubicGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"component_types": component_types}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomShuffleQueueV2",
-
+		Type: "ResizeBicubicGrad",
+		Input: []tf.Input{
+			grads, original_image,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Draw bounding boxes on a batch of images.
-//
-// Outputs a copy of `images` but draws on top of the pixels zero or more bounding
-// boxes specified by the locations in `boxes`. The coordinates of the each
-// bounding box in `boxes` are encoded as `[y_min, x_min, y_max, x_max]`. The
-// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
-// height of the underlying image.
-//
-// For example, if an image is 100 x 200 pixels (height x width) and the bounding
-// box is `[0.1, 0.2, 0.5, 0.9]`, the upper-left and bottom-right coordinates of
-// the bounding box will be `(40, 10)` to `(180, 50)` (in (x,y) coordinates).
+// ResizeNearestNeighborAttr is an optional argument to ResizeNearestNeighbor.
+type ResizeNearestNeighborAttr func(optionalAttr)
+
+// ResizeNearestNeighborAlignCorners sets the optional align_corners attribute to value.
 //
-// Parts of the bounding box may fall outside the image.
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
+// If not specified, defaults to false
+func ResizeNearestNeighborAlignCorners(value bool) ResizeNearestNeighborAttr {
+	return func(m optionalAttr) {
+		m["align_corners"] = value
+	}
+}
+
+// Resize `images` to `size` using nearest neighbor interpolation.
 //
 // Arguments:
-//	images: 4-D with shape `[batch, height, width, depth]`. A batch of images.
-//	boxes: 3-D with shape `[batch, num_bounding_boxes, 4]` containing bounding
-// boxes.
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
 //
-// Returns 4-D with the same shape as `images`. The batch of input images with
-// bounding boxes drawn on the images.
-func DrawBoundingBoxes(scope *Scope, images tf.Output, boxes tf.Output) (output tf.Output) {
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeNearestNeighbor(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeNearestNeighborAttr) (resized_images tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "DrawBoundingBoxes",
+		Type: "ResizeNearestNeighbor",
 		Input: []tf.Input{
-			images, boxes,
+			images, size,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Gets the next output from the given iterator.
+// ResizeNearestNeighborGradAttr is an optional argument to ResizeNearestNeighborGrad.
+type ResizeNearestNeighborGradAttr func(optionalAttr)
+
+// ResizeNearestNeighborGradAlignCorners sets the optional align_corners attribute to value.
 //
-// This operation is a synchronous version IteratorGetNext. It should only be used
-// in situations where the iterator does not block the calling thread, or where
-// the calling thread is not a member of the thread pool used to execute parallel
-// operations (e.g. in eager mode).
-func IteratorGetNextSync(scope *Scope, iterator tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
+// value: If true, the centers of the 4 corner pixels of the input and grad tensors are
+// aligned. Defaults to false.
+// If not specified, defaults to false
+func ResizeNearestNeighborGradAlignCorners(value bool) ResizeNearestNeighborGradAttr {
+	return func(m optionalAttr) {
+		m["align_corners"] = value
+	}
+}
+
+// Computes the gradient of nearest neighbor interpolation.
+//
+// Arguments:
+//	grads: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `orig_height, orig_width`. The
+// original input size.
+//
+// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`. Gradients
+// with respect to the input image.
+func ResizeNearestNeighborGrad(scope *Scope, grads tf.Output, size tf.Output, optional ...ResizeNearestNeighborGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "IteratorGetNextSync",
+		Type: "ResizeNearestNeighborGrad",
 		Input: []tf.Input{
-			iterator,
+			grads, size,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("IteratorGetNextSync", err)
-		return
-	}
-	return components
+	return op.Output(0)
 }
 
-// SampleDistortedBoundingBoxV2Attr is an optional argument to SampleDistortedBoundingBoxV2.
-type SampleDistortedBoundingBoxV2Attr func(optionalAttr)
+// ExtractJpegShapeAttr is an optional argument to ExtractJpegShape.
+type ExtractJpegShapeAttr func(optionalAttr)
 
-// SampleDistortedBoundingBoxV2Seed sets the optional seed attribute to value.
+// ExtractJpegShapeOutputType sets the optional output_type attribute to value.
 //
-// value: If either `seed` or `seed2` are set to non-zero, the random number
-// generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
-// seed.
-// If not specified, defaults to 0
-func SampleDistortedBoundingBoxV2Seed(value int64) SampleDistortedBoundingBoxV2Attr {
+// value: (Optional) The output type of the operation (int32 or int64).
+// Defaults to int32.
+// If not specified, defaults to DT_INT32
+func ExtractJpegShapeOutputType(value tf.DataType) ExtractJpegShapeAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["output_type"] = value
 	}
 }
 
-// SampleDistortedBoundingBoxV2Seed2 sets the optional seed2 attribute to value.
+// Extract the shape information of a JPEG-encoded image.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2Attr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
+// This op only parses the image header, so it is much faster than DecodeJpeg.
+//
+// Arguments:
+//	contents: 0-D. The JPEG-encoded image.
+//
+// Returns 1-D. The image shape with format [height, width, channels].
+func ExtractJpegShape(scope *Scope, contents tf.Output, optional ...ExtractJpegShapeAttr) (image_shape tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ExtractJpegShape",
+		Input: []tf.Input{
+			contents,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// SampleDistortedBoundingBoxV2AspectRatioRange sets the optional aspect_ratio_range attribute to value.
+// PaddingFIFOQueueV2Attr is an optional argument to PaddingFIFOQueueV2.
+type PaddingFIFOQueueV2Attr func(optionalAttr)
+
+// PaddingFIFOQueueV2Shapes sets the optional shapes attribute to value.
 //
-// value: The cropped area of the image must have an aspect ratio =
-// width / height within this range.
-// If not specified, defaults to <f:0.75 f:1.33 >
-func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr {
+// value: The shape of each component in a value. The length of this attr must
+// be either 0 or the same as the length of component_types.
+// Shapes of fixed rank but variable size are allowed by setting
+// any shape dimension to -1.  In this case, the inputs' shape may vary along
+// the given dimension, and DequeueMany will pad the given dimension with
+// zeros up to the maximum shape of all elements in the given batch.
+// If the length of this attr is 0, different queue elements may have
+// different ranks and shapes, but only one element may be dequeued at a time.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func PaddingFIFOQueueV2Shapes(value []tf.Shape) PaddingFIFOQueueV2Attr {
 	return func(m optionalAttr) {
-		m["aspect_ratio_range"] = value
+		m["shapes"] = value
 	}
 }
 
-// SampleDistortedBoundingBoxV2AreaRange sets the optional area_range attribute to value.
+// PaddingFIFOQueueV2Capacity sets the optional capacity attribute to value.
 //
-// value: The cropped area of the image must contain a fraction of the
-// supplied image within this range.
-// If not specified, defaults to <f:0.05 f:1 >
-func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr {
+// value: The upper bound on the number of elements in this queue.
+// Negative numbers mean no limit.
+// If not specified, defaults to -1
+func PaddingFIFOQueueV2Capacity(value int64) PaddingFIFOQueueV2Attr {
 	return func(m optionalAttr) {
-		m["area_range"] = value
+		m["capacity"] = value
 	}
 }
 
-// SampleDistortedBoundingBoxV2MaxAttempts sets the optional max_attempts attribute to value.
+// PaddingFIFOQueueV2Container sets the optional container attribute to value.
 //
-// value: Number of attempts at generating a cropped region of the image
-// of the specified constraints. After `max_attempts` failures, return the entire
-// image.
-// If not specified, defaults to 100
-func SampleDistortedBoundingBoxV2MaxAttempts(value int64) SampleDistortedBoundingBoxV2Attr {
+// value: If non-empty, this queue is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func PaddingFIFOQueueV2Container(value string) PaddingFIFOQueueV2Attr {
 	return func(m optionalAttr) {
-		m["max_attempts"] = value
+		m["container"] = value
 	}
 }
 
-// SampleDistortedBoundingBoxV2UseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value.
+// PaddingFIFOQueueV2SharedName sets the optional shared_name attribute to value.
 //
-// value: Controls behavior if no bounding boxes supplied.
-// If true, assume an implicit bounding box covering the whole input. If false,
-// raise an error.
-// If not specified, defaults to false
-func SampleDistortedBoundingBoxV2UseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxV2Attr {
+// value: If non-empty, this queue will be shared under the given name
+// across multiple sessions.
+// If not specified, defaults to ""
+func PaddingFIFOQueueV2SharedName(value string) PaddingFIFOQueueV2Attr {
 	return func(m optionalAttr) {
-		m["use_image_if_no_bounding_boxes"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Generate a single randomly distorted bounding box for an image.
-//
-// Bounding box annotations are often supplied in addition to ground-truth labels
-// in image recognition or object localization tasks. A common technique for
-// training such a system is to randomly distort an image while preserving
-// its content, i.e. *data augmentation*. This Op outputs a randomly distorted
-// localization of an object, i.e. bounding box, given an `image_size`,
-// `bounding_boxes` and a series of constraints.
-//
-// The output of this Op is a single bounding box that may be used to crop the
-// original image. The output is returned as 3 tensors: `begin`, `size` and
-// `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
-// image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
-// what the bounding box looks like.
-//
-// Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
-// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
-// height of the underlying image.
-//
-// For example,
-//
-// ```python
-//     # Generate a single distorted bounding box.
-//     begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
-//         tf.shape(image),
-//         bounding_boxes=bounding_boxes)
-//
-//     # Draw the bounding box in an image summary.
-//     image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
-//                                                   bbox_for_draw)
-//     tf.summary.image('images_with_box', image_with_box)
-//
-//     # Employ the bounding box to distort the image.
-//     distorted_image = tf.slice(image, begin, size)
-// ```
+// A queue that produces elements in first-in first-out order.
 //
-// Note that if no bounding box information is available, setting
-// `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
-// bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
-// false and no bounding boxes are supplied, an error is raised.
+// Variable-size shapes are allowed by setting the corresponding shape dimensions
+// to 0 in the shape attr.  In this case DequeueMany will pad up to the maximum
+// size of any given element in the minibatch.  See below for details.
 //
 // Arguments:
-//	image_size: 1-D, containing `[height, width, channels]`.
-//	bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
-// associated with the image.
-//	min_object_covered: The cropped area of the image must contain at least this
-// fraction of any bounding box supplied. The value of this parameter should be
-// non-negative. In the case of 0, the cropped area does not need to overlap
-// any of the bounding boxes supplied.
+//	component_types: The type of each component in a value.
 //
-// Returns 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
-// `tf.slice`.1-D, containing `[target_height, target_width, -1]`. Provide as input to
-// `tf.slice`.3-D with shape `[1, 1, 4]` containing the distorted bounding box.
-// Provide as input to `tf.image.draw_bounding_boxes`.
-func SampleDistortedBoundingBoxV2(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, min_object_covered tf.Output, optional ...SampleDistortedBoundingBoxV2Attr) (begin tf.Output, size tf.Output, bboxes tf.Output) {
+// Returns The handle to the queue.
+func PaddingFIFOQueueV2(scope *Scope, component_types []tf.DataType, optional ...PaddingFIFOQueueV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"component_types": component_types}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SampleDistortedBoundingBoxV2",
-		Input: []tf.Input{
-			image_size, bounding_boxes, min_object_covered,
-		},
+		Type: "PaddingFIFOQueueV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// ExtractGlimpseAttr is an optional argument to ExtractGlimpse.
-type ExtractGlimpseAttr func(optionalAttr)
+// DecodePngAttr is an optional argument to DecodePng.
+type DecodePngAttr func(optionalAttr)
 
-// ExtractGlimpseCentered sets the optional centered attribute to value.
+// DecodePngChannels sets the optional channels attribute to value.
 //
-// value: indicates if the offset coordinates are centered relative to
-// the image, in which case the (0, 0) offset is relative to the center
-// of the input images. If false, the (0,0) offset corresponds to the
-// upper left corner of the input images.
-// If not specified, defaults to true
-func ExtractGlimpseCentered(value bool) ExtractGlimpseAttr {
+// value: Number of color channels for the decoded image.
+// If not specified, defaults to 0
+func DecodePngChannels(value int64) DecodePngAttr {
 	return func(m optionalAttr) {
-		m["centered"] = value
+		m["channels"] = value
 	}
 }
 
-// ExtractGlimpseNormalized sets the optional normalized attribute to value.
-//
-// value: indicates if the offset coordinates are normalized.
-// If not specified, defaults to true
-func ExtractGlimpseNormalized(value bool) ExtractGlimpseAttr {
+// DecodePngDtype sets the optional dtype attribute to value.
+// If not specified, defaults to DT_UINT8
+func DecodePngDtype(value tf.DataType) DecodePngAttr {
 	return func(m optionalAttr) {
-		m["normalized"] = value
+		m["dtype"] = value
 	}
 }
 
-// ExtractGlimpseUniformNoise sets the optional uniform_noise attribute to value.
+// Decode a PNG-encoded image to a uint8 or uint16 tensor.
 //
-// value: indicates if the noise should be generated using a
-// uniform distribution or a Gaussian distribution.
-// If not specified, defaults to true
-func ExtractGlimpseUniformNoise(value bool) ExtractGlimpseAttr {
-	return func(m optionalAttr) {
-		m["uniform_noise"] = value
-	}
-}
-
-// Extracts a glimpse from the input tensor.
+// The attr `channels` indicates the desired number of color channels for the
+// decoded image.
 //
-// Returns a set of windows called glimpses extracted at location
-// `offsets` from the input tensor. If the windows only partially
-// overlaps the inputs, the non overlapping areas will be filled with
-// random noise.
+// Accepted values are:
 //
-// The result is a 4-D tensor of shape `[batch_size, glimpse_height,
-// glimpse_width, channels]`. The channels and batch dimensions are the
-// same as that of the input tensor. The height and width of the output
-// windows are specified in the `size` parameter.
+// *   0: Use the number of channels in the PNG-encoded image.
+// *   1: output a grayscale image.
+// *   3: output an RGB image.
+// *   4: output an RGBA image.
 //
-// The argument `normalized` and `centered` controls how the windows are built:
+// If needed, the PNG-encoded image is transformed to match the requested number
+// of color channels.
 //
-// * If the coordinates are normalized but not centered, 0.0 and 1.0
-//   correspond to the minimum and maximum of each height and width
-//   dimension.
-// * If the coordinates are both normalized and centered, they range from
-//   -1.0 to 1.0. The coordinates (-1.0, -1.0) correspond to the upper
-//   left corner, the lower right corner is located at (1.0, 1.0) and the
-//   center is at (0, 0).
-// * If the coordinates are not normalized they are interpreted as
-//   numbers of pixels.
+// This op also supports decoding JPEGs and non-animated GIFs since the interface
+// is the same, though it is cleaner to use `tf.image.decode_image`.
 //
 // Arguments:
-//	input: A 4-D float tensor of shape `[batch_size, height, width, channels]`.
-//	size: A 1-D tensor of 2 elements containing the size of the glimpses
-// to extract.  The glimpse height must be specified first, following
-// by the glimpse width.
-//	offsets: A 2-D integer tensor of shape `[batch_size, 2]` containing
-// the y, x locations of the center of each window.
+//	contents: 0-D.  The PNG-encoded image.
 //
-// Returns A tensor representing the glimpses `[batch_size,
-// glimpse_height, glimpse_width, channels]`.
-func ExtractGlimpse(scope *Scope, input tf.Output, size tf.Output, offsets tf.Output, optional ...ExtractGlimpseAttr) (glimpse tf.Output) {
+// Returns 3-D with shape `[height, width, channels]`.
+func DecodePng(scope *Scope, contents tf.Output, optional ...DecodePngAttr) (image tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -29596,9 +29226,9 @@ func ExtractGlimpse(scope *Scope, input tf.Output, size tf.Output, offsets tf.Ou
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ExtractGlimpse",
+		Type: "DecodePng",
 		Input: []tf.Input{
-			input, size, offsets,
+			contents,
 		},
 		Attrs: attrs,
 	}
@@ -29606,121 +29236,139 @@ func ExtractGlimpse(scope *Scope, input tf.Output, size tf.Output, offsets tf.Ou
 	return op.Output(0)
 }
 
-// A container for an iterator resource.
+// Decode the first frame of a GIF-encoded image to a uint8 tensor.
 //
-// Returns A handle to the iterator that can be passed to a "MakeIterator"
-// or "IteratorGetNext" op.
-func Iterator(scope *Scope, shared_name string, container string, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// GIF with frame or transparency compression are not supported
+// convert animated GIF from compressed to uncompressed by:
+//
+//     convert $src.gif -coalesce $dst.gif
+//
+// This op also supports decoding JPEGs and PNGs, though it is cleaner to use
+// `tf.image.decode_image`.
+//
+// Arguments:
+//	contents: 0-D.  The GIF-encoded image.
+//
+// Returns 4-D with shape `[num_frames, height, width, 3]`. RGB order
+func DecodeGif(scope *Scope, contents tf.Output) (image tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"shared_name": shared_name, "container": container, "output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "Iterator",
-
-		Attrs: attrs,
+		Type: "DecodeGif",
+		Input: []tf.Input{
+			contents,
+		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// CropAndResizeGradImageAttr is an optional argument to CropAndResizeGradImage.
-type CropAndResizeGradImageAttr func(optionalAttr)
+// LearnedUnigramCandidateSamplerAttr is an optional argument to LearnedUnigramCandidateSampler.
+type LearnedUnigramCandidateSamplerAttr func(optionalAttr)
 
-// CropAndResizeGradImageMethod sets the optional method attribute to value.
+// LearnedUnigramCandidateSamplerSeed sets the optional seed attribute to value.
 //
-// value: A string specifying the interpolation method. Only 'bilinear' is
-// supported for now.
-// If not specified, defaults to "bilinear"
-func CropAndResizeGradImageMethod(value string) CropAndResizeGradImageAttr {
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func LearnedUnigramCandidateSamplerSeed(value int64) LearnedUnigramCandidateSamplerAttr {
 	return func(m optionalAttr) {
-		m["method"] = value
+		m["seed"] = value
 	}
 }
 
-// Computes the gradient of the crop_and_resize op wrt the input image tensor.
+// LearnedUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
 //
-// Arguments:
-//	grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
-//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
-// specifies the coordinates of a box in the `box_ind[i]` image and is specified
-// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
-// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
-// `[0, 1]` interval of normalized image height is mapped to
-// `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in
-// which case the sampled crop is an up-down flipped version of the original
-// image. The width dimension is treated similarly. Normalized coordinates
-// outside the `[0, 1]` range are allowed, in which case we use
-// `extrapolation_value` to extrapolate the input image values.
-//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
-// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
-//	image_size: A 1-D tensor with value `[batch, image_height, image_width, depth]`
-// containing the original image size. Both `image_height` and `image_width` need
-// to be positive.
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func LearnedUnigramCandidateSamplerSeed2(value int64) LearnedUnigramCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Generates labels for candidate sampling with a learned unigram distribution.
 //
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
 //
-// Returns A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
-func CropAndResizeGradImage(scope *Scope, grads tf.Output, boxes tf.Output, box_ind tf.Output, image_size tf.Output, T tf.DataType, optional ...CropAndResizeGradImageAttr) (output tf.Output) {
+// For each batch, this op picks a single set of sampled candidate labels.
+//
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
+//
+// Arguments:
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to randomly sample.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
+//	range_max: The sampler will sample integers from the interval [0, range_max).
+//
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func LearnedUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...LearnedUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"T": T}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "CropAndResizeGradImage",
+		Type: "LearnedUnigramCandidateSampler",
 		Input: []tf.Input{
-			grads, boxes, box_ind, image_size,
+			true_classes,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// ShuffleDatasetAttr is an optional argument to ShuffleDataset.
-type ShuffleDatasetAttr func(optionalAttr)
+// SerializeSparseAttr is an optional argument to SerializeSparse.
+type SerializeSparseAttr func(optionalAttr)
 
-// ShuffleDatasetReshuffleEachIteration sets the optional reshuffle_each_iteration attribute to value.
+// SerializeSparseOutType sets the optional out_type attribute to value.
 //
-// value: If true, each iterator over this dataset will be given
-// a different pseudorandomly generated seed, based on a sequence seeded by the
-// `seed` and `seed2` inputs. If false, each iterator will be given the same
-// seed, and repeated iteration over this dataset will yield the exact same
-// sequence of results.
-// If not specified, defaults to true
-func ShuffleDatasetReshuffleEachIteration(value bool) ShuffleDatasetAttr {
+// value: The `dtype` to use for serialization; the supported types are `string`
+// (default) and `variant`.
+// If not specified, defaults to DT_STRING
+func SerializeSparseOutType(value tf.DataType) SerializeSparseAttr {
 	return func(m optionalAttr) {
-		m["reshuffle_each_iteration"] = value
+		m["out_type"] = value
 	}
 }
 
-// Creates a dataset that shuffles elements from `input_dataset` pseudorandomly.
+// Serialize a `SparseTensor` into a `[3]` `Tensor` object.
 //
 // Arguments:
-//
-//	buffer_size: The number of output elements to buffer in an iterator over
-// this dataset. Compare with the `min_after_dequeue` attr when creating a
-// `RandomShuffleQueue`.
-//	seed: A scalar seed for the random number generator. If either `seed` or
-// `seed2` is set to be non-zero, the random number generator is seeded
-// by the given seed.  Otherwise, a random seed is used.
-//	seed2: A second scalar seed to avoid seed collision.
-//
-//
-func ShuffleDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, seed tf.Output, seed2 tf.Output, output_types []tf.DataType, output_shapes []tf.Shape, optional ...ShuffleDatasetAttr) (handle tf.Output) {
+//	sparse_indices: 2-D.  The `indices` of the `SparseTensor`.
+//	sparse_values: 1-D.  The `values` of the `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the `SparseTensor`.
+func SerializeSparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...SerializeSparseAttr) (serialized_sparse tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ShuffleDataset",
+		Type: "SerializeSparse",
 		Input: []tf.Input{
-			input_dataset, buffer_size, seed, seed2,
+			sparse_indices, sparse_values, sparse_shape,
 		},
 		Attrs: attrs,
 	}
@@ -29728,145 +29376,106 @@ func ShuffleDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output
 	return op.Output(0)
 }
 
-// 3D fast Fourier transform.
-//
-// Computes the 3-dimensional discrete Fourier transform over the inner-most 3
-// dimensions of `input`.
-//
-// Arguments:
-//	input: A complex64 tensor.
+// RandomShuffleQueueV2Attr is an optional argument to RandomShuffleQueueV2.
+type RandomShuffleQueueV2Attr func(optionalAttr)
+
+// RandomShuffleQueueV2Shapes sets the optional shapes attribute to value.
 //
-// Returns A complex64 tensor of the same shape as `input`. The inner-most 3
-//   dimensions of `input` are replaced with their 3D Fourier transform.
+// value: The shape of each component in a value. The length of this attr must
+// be either 0 or the same as the length of component_types. If the length of
+// this attr is 0, the shapes of queue elements are not constrained, and
+// only one element may be dequeued at a time.
+// If not specified, defaults to <>
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.fftn with 3 dimensions.
-// @end_compatibility
-func FFT3D(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "FFT3D",
-		Input: []tf.Input{
-			input,
-		},
+// REQUIRES: len(value) >= 0
+func RandomShuffleQueueV2Shapes(value []tf.Shape) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["shapes"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// CropAndResizeGradBoxesAttr is an optional argument to CropAndResizeGradBoxes.
-type CropAndResizeGradBoxesAttr func(optionalAttr)
-
-// CropAndResizeGradBoxesMethod sets the optional method attribute to value.
+// RandomShuffleQueueV2Capacity sets the optional capacity attribute to value.
 //
-// value: A string specifying the interpolation method. Only 'bilinear' is
-// supported for now.
-// If not specified, defaults to "bilinear"
-func CropAndResizeGradBoxesMethod(value string) CropAndResizeGradBoxesAttr {
+// value: The upper bound on the number of elements in this queue.
+// Negative numbers mean no limit.
+// If not specified, defaults to -1
+func RandomShuffleQueueV2Capacity(value int64) RandomShuffleQueueV2Attr {
 	return func(m optionalAttr) {
-		m["method"] = value
+		m["capacity"] = value
 	}
 }
 
-// Computes the gradient of the crop_and_resize op wrt the input boxes tensor.
-//
-// Arguments:
-//	grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
-//	image: A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
-// Both `image_height` and `image_width` need to be positive.
-//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
-// specifies the coordinates of a box in the `box_ind[i]` image and is specified
-// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
-// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
-// `[0, 1]` interval of normalized image height is mapped to
-// `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in
-// which case the sampled crop is an up-down flipped version of the original
-// image. The width dimension is treated similarly. Normalized coordinates
-// outside the `[0, 1]` range are allowed, in which case we use
-// `extrapolation_value` to extrapolate the input image values.
-//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
-// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
+// RandomShuffleQueueV2MinAfterDequeue sets the optional min_after_dequeue attribute to value.
 //
-// Returns A 2-D tensor of shape `[num_boxes, 4]`.
-func CropAndResizeGradBoxes(scope *Scope, grads tf.Output, image tf.Output, boxes tf.Output, box_ind tf.Output, optional ...CropAndResizeGradBoxesAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "CropAndResizeGradBoxes",
-		Input: []tf.Input{
-			grads, image, boxes, box_ind,
-		},
-		Attrs: attrs,
+// value: Dequeue will block unless there would be this
+// many elements after the dequeue or the queue is closed. This
+// ensures a minimum level of mixing of elements.
+// If not specified, defaults to 0
+func RandomShuffleQueueV2MinAfterDequeue(value int64) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["min_after_dequeue"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Saves tensors in V2 checkpoint format.
-//
-// By default, saves the named tensors in full.  If the caller wishes to save
-// specific slices of full tensors, "shape_and_slices" should be non-empty strings
-// and correspondingly well-formed.
-//
-// Arguments:
-//	prefix: Must have a single element. The prefix of the V2 checkpoint to which we
-// write the tensors.
-//	tensor_names: shape {N}. The names of the tensors to be saved.
-//	shape_and_slices: shape {N}.  The slice specs of the tensors to be saved.
-// Empty strings indicate that they are non-partitioned tensors.
-//	tensors: `N` tensors to save.
+// RandomShuffleQueueV2Seed sets the optional seed attribute to value.
 //
-// Returns the created operation.
-func SaveV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, tensors []tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SaveV2",
-		Input: []tf.Input{
-			prefix, tensor_names, shape_and_slices, tf.OutputList(tensors),
-		},
+// value: If either seed or seed2 is set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, a random seed is used.
+// If not specified, defaults to 0
+func RandomShuffleQueueV2Seed(value int64) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["seed"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// StatsAggregatorHandleAttr is an optional argument to StatsAggregatorHandle.
-type StatsAggregatorHandleAttr func(optionalAttr)
+// RandomShuffleQueueV2Seed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomShuffleQueueV2Seed2(value int64) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
 
-// StatsAggregatorHandleContainer sets the optional container attribute to value.
+// RandomShuffleQueueV2Container sets the optional container attribute to value.
+//
+// value: If non-empty, this queue is placed in the given container.
+// Otherwise, a default container is used.
 // If not specified, defaults to ""
-func StatsAggregatorHandleContainer(value string) StatsAggregatorHandleAttr {
+func RandomShuffleQueueV2Container(value string) RandomShuffleQueueV2Attr {
 	return func(m optionalAttr) {
 		m["container"] = value
 	}
 }
 
-// StatsAggregatorHandleSharedName sets the optional shared_name attribute to value.
+// RandomShuffleQueueV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this queue will be shared under the given name
+// across multiple sessions.
 // If not specified, defaults to ""
-func StatsAggregatorHandleSharedName(value string) StatsAggregatorHandleAttr {
+func RandomShuffleQueueV2SharedName(value string) RandomShuffleQueueV2Attr {
 	return func(m optionalAttr) {
 		m["shared_name"] = value
 	}
 }
 
-// Creates a statistics manager resource.
-func StatsAggregatorHandle(scope *Scope, optional ...StatsAggregatorHandleAttr) (handle tf.Output) {
+// A queue that randomizes the order of elements.
+//
+// Arguments:
+//	component_types: The type of each component in a value.
+//
+// Returns The handle to the queue.
+func RandomShuffleQueueV2(scope *Scope, component_types []tf.DataType, optional ...RandomShuffleQueueV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"component_types": component_types}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StatsAggregatorHandle",
+		Type: "RandomShuffleQueueV2",
 
 		Attrs: attrs,
 	}
@@ -29874,203 +29483,299 @@ func StatsAggregatorHandle(scope *Scope, optional ...StatsAggregatorHandleAttr)
 	return op.Output(0)
 }
 
-// Greedily selects a subset of bounding boxes in descending order of score,
+// Draw bounding boxes on a batch of images.
 //
-// pruning away boxes that have high intersection-over-union (IOU) overlap
-// with previously selected boxes.  Bounding boxes with score less than
-// `score_threshold` are removed.  Bounding boxes are supplied as
-// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
-// diagonal pair of box corners and the coordinates can be provided as normalized
-// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
-// is agnostic to where the origin is in the coordinate system and more
-// generally is invariant to orthogonal transformations and translations
-// of the coordinate system; thus translating or reflections of the coordinate
-// system result in the same boxes being selected by the algorithm.
-// The output of this operation is a set of integers indexing into the input
-// collection of bounding boxes representing the selected boxes.  The bounding
-// box coordinates corresponding to the selected indices can then be obtained
-// using the `tf.gather operation`.  For example:
-//   selected_indices = tf.image.non_max_suppression_v2(
-//       boxes, scores, max_output_size, iou_threshold, score_threshold)
-//   selected_boxes = tf.gather(boxes, selected_indices)
+// Outputs a copy of `images` but draws on top of the pixels zero or more bounding
+// boxes specified by the locations in `boxes`. The coordinates of the each
+// bounding box in `boxes` are encoded as `[y_min, x_min, y_max, x_max]`. The
+// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
+// height of the underlying image.
+//
+// For example, if an image is 100 x 200 pixels (height x width) and the bounding
+// box is `[0.1, 0.2, 0.5, 0.9]`, the upper-left and bottom-right coordinates of
+// the bounding box will be `(40, 10)` to `(180, 50)` (in (x,y) coordinates).
+//
+// Parts of the bounding box may fall outside the image.
 //
 // Arguments:
-//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
-//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
-// score corresponding to each box (each row of boxes).
-//	max_output_size: A scalar integer tensor representing the maximum number of
-// boxes to be selected by non max suppression.
-//	iou_threshold: A 0-D float tensor representing the threshold for deciding whether
-// boxes overlap too much with respect to IOU.
-//	score_threshold: A 0-D float tensor representing the threshold for deciding when to remove
-// boxes based on score.
+//	images: 4-D with shape `[batch, height, width, depth]`. A batch of images.
+//	boxes: 3-D with shape `[batch, num_bounding_boxes, 4]` containing bounding
+// boxes.
 //
-// Returns A 1-D integer tensor of shape `[M]` representing the selected
-// indices from the boxes tensor, where `M <= max_output_size`.
-func NonMaxSuppressionV3(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, iou_threshold tf.Output, score_threshold tf.Output) (selected_indices tf.Output) {
+// Returns 4-D with the same shape as `images`. The batch of input images with
+// bounding boxes drawn on the images.
+func DrawBoundingBoxes(scope *Scope, images tf.Output, boxes tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "NonMaxSuppressionV3",
+		Type: "DrawBoundingBoxes",
 		Input: []tf.Input{
-			boxes, scores, max_output_size, iou_threshold, score_threshold,
+			images, boxes,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// NonMaxSuppressionV4Attr is an optional argument to NonMaxSuppressionV4.
-type NonMaxSuppressionV4Attr func(optionalAttr)
+// Gets the next output from the given iterator.
+//
+// This operation is a synchronous version IteratorGetNext. It should only be used
+// in situations where the iterator does not block the calling thread, or where
+// the calling thread is not a member of the thread pool used to execute parallel
+// operations (e.g. in eager mode).
+func IteratorGetNextSync(scope *Scope, iterator tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "IteratorGetNextSync",
+		Input: []tf.Input{
+			iterator,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("IteratorGetNextSync", err)
+		return
+	}
+	return components
+}
 
-// NonMaxSuppressionV4PadToMaxOutputSize sets the optional pad_to_max_output_size attribute to value.
+// SampleDistortedBoundingBoxV2Attr is an optional argument to SampleDistortedBoundingBoxV2.
+type SampleDistortedBoundingBoxV2Attr func(optionalAttr)
+
+// SampleDistortedBoundingBoxV2Seed sets the optional seed attribute to value.
 //
-// value: If true, the output `selected_indices` is padded to be of length
-// `max_output_size`. Defaults to false.
-// If not specified, defaults to false
-func NonMaxSuppressionV4PadToMaxOutputSize(value bool) NonMaxSuppressionV4Attr {
+// value: If either `seed` or `seed2` are set to non-zero, the random number
+// generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
+// seed.
+// If not specified, defaults to 0
+func SampleDistortedBoundingBoxV2Seed(value int64) SampleDistortedBoundingBoxV2Attr {
 	return func(m optionalAttr) {
-		m["pad_to_max_output_size"] = value
+		m["seed"] = value
 	}
 }
 
-// Greedily selects a subset of bounding boxes in descending order of score,
+// SampleDistortedBoundingBoxV2Seed2 sets the optional seed2 attribute to value.
 //
-// pruning away boxes that have high intersection-over-union (IOU) overlap
-// with previously selected boxes.  Bounding boxes with score less than
-// `score_threshold` are removed.  Bounding boxes are supplied as
-// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
-// diagonal pair of box corners and the coordinates can be provided as normalized
-// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
-// is agnostic to where the origin is in the coordinate system and more
-// generally is invariant to orthogonal transformations and translations
-// of the coordinate system; thus translating or reflections of the coordinate
-// system result in the same boxes being selected by the algorithm.
-// The output of this operation is a set of integers indexing into the input
-// collection of bounding boxes representing the selected boxes.  The bounding
-// box coordinates corresponding to the selected indices can then be obtained
-// using the `tf.gather operation`.  For example:
-//   selected_indices = tf.image.non_max_suppression_v2(
-//       boxes, scores, max_output_size, iou_threshold, score_threshold)
-//   selected_boxes = tf.gather(boxes, selected_indices)
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2Attr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// SampleDistortedBoundingBoxV2AspectRatioRange sets the optional aspect_ratio_range attribute to value.
 //
-// Arguments:
-//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
-//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
-// score corresponding to each box (each row of boxes).
-//	max_output_size: A scalar integer tensor representing the maximum number of
-// boxes to be selected by non max suppression.
-//	iou_threshold: A 0-D float tensor representing the threshold for deciding whether
-// boxes overlap too much with respect to IOU.
-//	score_threshold: A 0-D float tensor representing the threshold for deciding when to remove
-// boxes based on score.
+// value: The cropped area of the image must have an aspect ratio =
+// width / height within this range.
+// If not specified, defaults to <f:0.75 f:1.33 >
+func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr {
+	return func(m optionalAttr) {
+		m["aspect_ratio_range"] = value
+	}
+}
+
+// SampleDistortedBoundingBoxV2AreaRange sets the optional area_range attribute to value.
 //
-// Returns A 1-D integer tensor of shape `[M]` representing the selected
-// indices from the boxes tensor, where `M <= max_output_size`.A 0-D integer tensor representing the number of valid elements in
-// `selected_indices`, with the valid elements appearing first.
-func NonMaxSuppressionV4(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, iou_threshold tf.Output, score_threshold tf.Output, optional ...NonMaxSuppressionV4Attr) (selected_indices tf.Output, valid_outputs tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: The cropped area of the image must contain a fraction of the
+// supplied image within this range.
+// If not specified, defaults to <f:0.05 f:1 >
+func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr {
+	return func(m optionalAttr) {
+		m["area_range"] = value
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
+}
+
+// SampleDistortedBoundingBoxV2MaxAttempts sets the optional max_attempts attribute to value.
+//
+// value: Number of attempts at generating a cropped region of the image
+// of the specified constraints. After `max_attempts` failures, return the entire
+// image.
+// If not specified, defaults to 100
+func SampleDistortedBoundingBoxV2MaxAttempts(value int64) SampleDistortedBoundingBoxV2Attr {
+	return func(m optionalAttr) {
+		m["max_attempts"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "NonMaxSuppressionV4",
-		Input: []tf.Input{
-			boxes, scores, max_output_size, iou_threshold, score_threshold,
-		},
-		Attrs: attrs,
+}
+
+// SampleDistortedBoundingBoxV2UseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value.
+//
+// value: Controls behavior if no bounding boxes supplied.
+// If true, assume an implicit bounding box covering the whole input. If false,
+// raise an error.
+// If not specified, defaults to false
+func SampleDistortedBoundingBoxV2UseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxV2Attr {
+	return func(m optionalAttr) {
+		m["use_image_if_no_bounding_boxes"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
 }
 
-// Computes the matrix logarithm of one or more square matrices:
+// Generate a single randomly distorted bounding box for an image.
 //
+// Bounding box annotations are often supplied in addition to ground-truth labels
+// in image recognition or object localization tasks. A common technique for
+// training such a system is to randomly distort an image while preserving
+// its content, i.e. *data augmentation*. This Op outputs a randomly distorted
+// localization of an object, i.e. bounding box, given an `image_size`,
+// `bounding_boxes` and a series of constraints.
 //
-// \\(log(exp(A)) = A\\)
+// The output of this Op is a single bounding box that may be used to crop the
+// original image. The output is returned as 3 tensors: `begin`, `size` and
+// `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
+// image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
+// what the bounding box looks like.
 //
-// This op is only defined for complex matrices. If A is positive-definite and
-// real, then casting to a complex matrix, taking the logarithm and casting back
-// to a real matrix will give the correct result.
+// Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
+// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
+// height of the underlying image.
 //
-// This function computes the matrix logarithm using the Schur-Parlett algorithm.
-// Details of the algorithm can be found in Section 11.6.2 of:
-// Nicholas J. Higham, Functions of Matrices: Theory and Computation, SIAM 2008.
-// ISBN 978-0-898716-46-7.
+// For example,
 //
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. The output is a tensor of the same shape as the input
-// containing the exponential for all input submatrices `[..., :, :]`.
+// ```python
+//     # Generate a single distorted bounding box.
+//     begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
+//         tf.shape(image),
+//         bounding_boxes=bounding_boxes)
 //
-// Arguments:
-//	input: Shape is `[..., M, M]`.
+//     # Draw the bounding box in an image summary.
+//     image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
+//                                                   bbox_for_draw)
+//     tf.summary.image('images_with_box', image_with_box)
 //
-// Returns Shape is `[..., M, M]`.
+//     # Employ the bounding box to distort the image.
+//     distorted_image = tf.slice(image, begin, size)
+// ```
 //
-// @compatibility(scipy)
-// Equivalent to scipy.linalg.logm
-// @end_compatibility
-func MatrixLogarithm(scope *Scope, input tf.Output) (output tf.Output) {
+// Note that if no bounding box information is available, setting
+// `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
+// bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
+// false and no bounding boxes are supplied, an error is raised.
+//
+// Arguments:
+//	image_size: 1-D, containing `[height, width, channels]`.
+//	bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
+// associated with the image.
+//	min_object_covered: The cropped area of the image must contain at least this
+// fraction of any bounding box supplied. The value of this parameter should be
+// non-negative. In the case of 0, the cropped area does not need to overlap
+// any of the bounding boxes supplied.
+//
+// Returns 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
+// `tf.slice`.1-D, containing `[target_height, target_width, -1]`. Provide as input to
+// `tf.slice`.3-D with shape `[1, 1, 4]` containing the distorted bounding box.
+// Provide as input to `tf.image.draw_bounding_boxes`.
+func SampleDistortedBoundingBoxV2(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, min_object_covered tf.Output, optional ...SampleDistortedBoundingBoxV2Attr) (begin tf.Output, size tf.Output, bboxes tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "MatrixLogarithm",
+		Type: "SampleDistortedBoundingBoxV2",
 		Input: []tf.Input{
-			input,
+			image_size, bounding_boxes, min_object_covered,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-//   This op is used as a placeholder in If branch functions. It doesn't provide a
-//   valid output when run, so must either be removed (e.g. replaced with a
-//   function input) or guaranteed not to be used (e.g. if mirroring an
-//   intermediate output needed for the gradient computation of the other branch).
+// ExtractGlimpseAttr is an optional argument to ExtractGlimpse.
+type ExtractGlimpseAttr func(optionalAttr)
+
+// ExtractGlimpseCentered sets the optional centered attribute to value.
 //
-// Arguments:
-//	dtype: The type of the output.
-//	shape:     The purported shape of the output. This is only used for shape inference;
-//     the output will not necessarily have this shape. Can be a partial shape.
+// value: indicates if the offset coordinates are centered relative to
+// the image, in which case the (0, 0) offset is relative to the center
+// of the input images. If false, the (0,0) offset corresponds to the
+// upper left corner of the input images.
+// If not specified, defaults to true
+func ExtractGlimpseCentered(value bool) ExtractGlimpseAttr {
+	return func(m optionalAttr) {
+		m["centered"] = value
+	}
+}
+
+// ExtractGlimpseNormalized sets the optional normalized attribute to value.
 //
-// Returns     \"Fake\" output value. This should not be consumed by another op.
-func FakeParam(scope *Scope, dtype tf.DataType, shape tf.Shape) (output tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: indicates if the offset coordinates are normalized.
+// If not specified, defaults to true
+func ExtractGlimpseNormalized(value bool) ExtractGlimpseAttr {
+	return func(m optionalAttr) {
+		m["normalized"] = value
 	}
-	attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
-	opspec := tf.OpSpec{
-		Type: "FakeParam",
+}
 
-		Attrs: attrs,
+// ExtractGlimpseUniformNoise sets the optional uniform_noise attribute to value.
+//
+// value: indicates if the noise should be generated using a
+// uniform distribution or a Gaussian distribution.
+// If not specified, defaults to true
+func ExtractGlimpseUniformNoise(value bool) ExtractGlimpseAttr {
+	return func(m optionalAttr) {
+		m["uniform_noise"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// A substitute for `InterleaveDataset` on a fixed list of `N` datasets.
+// Extracts a glimpse from the input tensor.
 //
-// Arguments:
-//	selector_input_dataset: A dataset of scalar `DT_INT64` elements that determines which of the
-// `N` data inputs should produce the next output element.
-//	data_input_datasets: `N` datasets with the same type that will be interleaved according to
-// the values of `selector_input_dataset`.
+// Returns a set of windows called glimpses extracted at location
+// `offsets` from the input tensor. If the windows only partially
+// overlaps the inputs, the non overlapping areas will be filled with
+// random noise.
 //
+// The result is a 4-D tensor of shape `[batch_size, glimpse_height,
+// glimpse_width, channels]`. The channels and batch dimensions are the
+// same as that of the input tensor. The height and width of the output
+// windows are specified in the `size` parameter.
 //
-func ExperimentalDirectedInterleaveDataset(scope *Scope, selector_input_dataset tf.Output, data_input_datasets []tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// The argument `normalized` and `centered` controls how the windows are built:
+//
+// * If the coordinates are normalized but not centered, 0.0 and 1.0
+//   correspond to the minimum and maximum of each height and width
+//   dimension.
+// * If the coordinates are both normalized and centered, they range from
+//   -1.0 to 1.0. The coordinates (-1.0, -1.0) correspond to the upper
+//   left corner, the lower right corner is located at (1.0, 1.0) and the
+//   center is at (0, 0).
+// * If the coordinates are not normalized they are interpreted as
+//   numbers of pixels.
+//
+// Arguments:
+//	input: A 4-D float tensor of shape `[batch_size, height, width, channels]`.
+//	size: A 1-D tensor of 2 elements containing the size of the glimpses
+// to extract.  The glimpse height must be specified first, following
+// by the glimpse width.
+//	offsets: A 2-D integer tensor of shape `[batch_size, 2]` containing
+// the y, x locations of the center of each window.
+//
+// Returns A tensor representing the glimpses `[batch_size,
+// glimpse_height, glimpse_width, channels]`.
+func ExtractGlimpse(scope *Scope, input tf.Output, size tf.Output, offsets tf.Output, optional ...ExtractGlimpseAttr) (glimpse tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ExperimentalDirectedInterleaveDataset",
+		Type: "ExtractGlimpse",
 		Input: []tf.Input{
-			selector_input_dataset, tf.OutputList(data_input_datasets),
+			input, size, offsets,
 		},
 		Attrs: attrs,
 	}
@@ -30078,108 +29783,121 @@ func ExperimentalDirectedInterleaveDataset(scope *Scope, selector_input_dataset
 	return op.Output(0)
 }
 
-// Add the quantile summaries to each quantile stream resource.
-//
-// An op that adds a list of quantile summaries to a quantile stream resource. Each
-// summary Tensor is rank 2, containing summaries (value, weight, min_rank, max_rank)
-// for a single feature.
-//
-// Arguments:
-//	quantile_stream_resource_handle: resource handle referring to a QuantileStreamResource.
-//	summaries: string; List of Rank 2 Tensor each containing the summaries for a single feature.
+// A container for an iterator resource.
 //
-// Returns the created operation.
-func BoostedTreesQuantileStreamResourceAddSummaries(scope *Scope, quantile_stream_resource_handle tf.Output, summaries []tf.Output) (o *tf.Operation) {
+// Returns A handle to the iterator that can be passed to a "MakeIterator"
+// or "IteratorGetNext" op.
+func Iterator(scope *Scope, shared_name string, container string, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"shared_name": shared_name, "container": container, "output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "BoostedTreesQuantileStreamResourceAddSummaries",
-		Input: []tf.Input{
-			quantile_stream_resource_handle, tf.OutputList(summaries),
-		},
+		Type: "Iterator",
+
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// StringSplitV2Attr is an optional argument to StringSplitV2.
-type StringSplitV2Attr func(optionalAttr)
+// CropAndResizeGradImageAttr is an optional argument to CropAndResizeGradImage.
+type CropAndResizeGradImageAttr func(optionalAttr)
 
-// StringSplitV2Maxsplit sets the optional maxsplit attribute to value.
+// CropAndResizeGradImageMethod sets the optional method attribute to value.
 //
-// value: An `int`. If `maxsplit > 0`, limit of the split of the result.
-// If not specified, defaults to -1
-func StringSplitV2Maxsplit(value int64) StringSplitV2Attr {
+// value: A string specifying the interpolation method. Only 'bilinear' is
+// supported for now.
+// If not specified, defaults to "bilinear"
+func CropAndResizeGradImageMethod(value string) CropAndResizeGradImageAttr {
 	return func(m optionalAttr) {
-		m["maxsplit"] = value
+		m["method"] = value
 	}
 }
 
-// Split elements of `source` based on `sep` into a `SparseTensor`.
-//
-// Let N be the size of source (typically N will be the batch size). Split each
-// element of `source` based on `sep` and return a `SparseTensor`
-// containing the split tokens. Empty tokens are ignored.
-//
-// For example, N = 2, source[0] is 'hello world' and source[1] is 'a b c',
-// then the output will be
-// ```
-// st.indices = [0, 0;
-//               0, 1;
-//               1, 0;
-//               1, 1;
-//               1, 2]
-// st.shape = [2, 3]
-// st.values = ['hello', 'world', 'a', 'b', 'c']
-// ```
+// Computes the gradient of the crop_and_resize op wrt the input image tensor.
 //
-// If `sep` is given, consecutive delimiters are not grouped together and are
-// deemed to delimit empty strings. For example, source of `"1<>2<><>3"` and
-// sep of `"<>"` returns `["1", "2", "", "3"]`. If `sep` is None or an empty
-// string, consecutive whitespace are regarded as a single separator, and the
-// result will contain no empty strings at the startor end if the string has
-// leading or trailing whitespace.
+// Arguments:
+//	grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
+//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
+// specifies the coordinates of a box in the `box_ind[i]` image and is specified
+// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
+// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
+// `[0, 1]` interval of normalized image height is mapped to
+// `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in
+// which case the sampled crop is an up-down flipped version of the original
+// image. The width dimension is treated similarly. Normalized coordinates
+// outside the `[0, 1]` range are allowed, in which case we use
+// `extrapolation_value` to extrapolate the input image values.
+//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
+// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
+//	image_size: A 1-D tensor with value `[batch, image_height, image_width, depth]`
+// containing the original image size. Both `image_height` and `image_width` need
+// to be positive.
 //
-// Note that the above mentioned behavior matches python's str.split.
 //
-// Arguments:
-//	input: `1-D` string `Tensor`, the strings to split.
-//	sep: `0-D` string `Tensor`, the delimiter character.
-func StringSplitV2(scope *Scope, input tf.Output, sep tf.Output, optional ...StringSplitV2Attr) (indices tf.Output, values tf.Output, shape tf.Output) {
+// Returns A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
+func CropAndResizeGradImage(scope *Scope, grads tf.Output, boxes tf.Output, box_ind tf.Output, image_size tf.Output, T tf.DataType, optional ...CropAndResizeGradImageAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"T": T}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StringSplitV2",
+		Type: "CropAndResizeGradImage",
 		Input: []tf.Input{
-			input, sep,
+			grads, boxes, box_ind, image_size,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Creates a dataset that uses a custom thread pool to compute `input_dataset`.
+// ShuffleDatasetAttr is an optional argument to ShuffleDataset.
+type ShuffleDatasetAttr func(optionalAttr)
+
+// ShuffleDatasetReshuffleEachIteration sets the optional reshuffle_each_iteration attribute to value.
+//
+// value: If true, each iterator over this dataset will be given
+// a different pseudorandomly generated seed, based on a sequence seeded by the
+// `seed` and `seed2` inputs. If false, each iterator will be given the same
+// seed, and repeated iteration over this dataset will yield the exact same
+// sequence of results.
+// If not specified, defaults to true
+func ShuffleDatasetReshuffleEachIteration(value bool) ShuffleDatasetAttr {
+	return func(m optionalAttr) {
+		m["reshuffle_each_iteration"] = value
+	}
+}
+
+// Creates a dataset that shuffles elements from `input_dataset` pseudorandomly.
 //
 // Arguments:
 //
-//	thread_pool: A resource produced by the ThreadPoolHandle op.
+//	buffer_size: The number of output elements to buffer in an iterator over
+// this dataset. Compare with the `min_after_dequeue` attr when creating a
+// `RandomShuffleQueue`.
+//	seed: A scalar seed for the random number generator. If either `seed` or
+// `seed2` is set to be non-zero, the random number generator is seeded
+// by the given seed.  Otherwise, a random seed is used.
+//	seed2: A second scalar seed to avoid seed collision.
 //
 //
-func ExperimentalThreadPoolDataset(scope *Scope, input_dataset tf.Output, thread_pool tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+func ShuffleDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, seed tf.Output, seed2 tf.Output, output_types []tf.DataType, output_shapes []tf.Shape, optional ...ShuffleDatasetAttr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ExperimentalThreadPoolDataset",
+		Type: "ShuffleDataset",
 		Input: []tf.Input{
-			input_dataset, thread_pool,
+			input_dataset, buffer_size, seed, seed2,
 		},
 		Attrs: attrs,
 	}
@@ -30187,91 +29905,80 @@ func ExperimentalThreadPoolDataset(scope *Scope, input_dataset tf.Output, thread
 	return op.Output(0)
 }
 
-// Computes softsign: `features / (abs(features) + 1)`.
-func Softsign(scope *Scope, features tf.Output) (activations tf.Output) {
+// 3D fast Fourier transform.
+//
+// Computes the 3-dimensional discrete Fourier transform over the inner-most 3
+// dimensions of `input`.
+//
+// Arguments:
+//	input: A complex64 tensor.
+//
+// Returns A complex64 tensor of the same shape as `input`. The inner-most 3
+//   dimensions of `input` are replaced with their 3D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.fftn with 3 dimensions.
+// @end_compatibility
+func FFT3D(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Softsign",
+		Type: "FFT3D",
 		Input: []tf.Input{
-			features,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// EncodeProtoAttr is an optional argument to EncodeProto.
-type EncodeProtoAttr func(optionalAttr)
+// CropAndResizeGradBoxesAttr is an optional argument to CropAndResizeGradBoxes.
+type CropAndResizeGradBoxesAttr func(optionalAttr)
 
-// EncodeProtoDescriptorSource sets the optional descriptor_source attribute to value.
-// If not specified, defaults to "local://"
-func EncodeProtoDescriptorSource(value string) EncodeProtoAttr {
+// CropAndResizeGradBoxesMethod sets the optional method attribute to value.
+//
+// value: A string specifying the interpolation method. Only 'bilinear' is
+// supported for now.
+// If not specified, defaults to "bilinear"
+func CropAndResizeGradBoxesMethod(value string) CropAndResizeGradBoxesAttr {
 	return func(m optionalAttr) {
-		m["descriptor_source"] = value
+		m["method"] = value
 	}
 }
 
-// The op serializes protobuf messages provided in the input tensors.
-//
-// The types of the tensors in `values` must match the schema for the
-// fields specified in `field_names`. All the tensors in `values` must
-// have a common shape prefix, *batch_shape*.
-//
-// The `sizes` tensor specifies repeat counts for each field.  The repeat
-// count (last dimension) of a each tensor in `values` must be greater
-// than or equal to corresponding repeat count in `sizes`.
-//
-// A `message_type` name must be provided to give context for the field
-// names. The actual message descriptor can be looked up either in the
-// linked-in descriptor pool or a filename provided by the caller using
-// the `descriptor_source` attribute.
-//
-// The `descriptor_source` attribute selects a source of protocol
-// descriptors to consult when looking up `message_type`. This may be a
-// filename containing a serialized `FileDescriptorSet` message,
-// or the special value `local://`, in which case only descriptors linked
-// into the code will be searched; the filename can be on any filesystem
-// accessible to TensorFlow.
-//
-// You can build a `descriptor_source` file using the `--descriptor_set_out`
-// and `--include_imports` options to the protocol compiler `protoc`.
-//
-// The `local://` database only covers descriptors linked into the
-// code via C++ libraries, not Python imports. You can link in a proto descriptor
-// by creating a cc_library target with alwayslink=1.
-//
-// There are a few special cases in the value mapping:
-//
-// Submessage and group fields must be pre-serialized as TensorFlow strings.
-//
-// TensorFlow lacks support for unsigned int64s, so they must be
-// represented as `tf.int64` with the same twos-complement bit pattern
-// (the obvious way).
-//
-// Unsigned int32 values can be represented exactly with `tf.int64`, or
-// with sign wrapping if the input is of type `tf.int32`.
+// Computes the gradient of the crop_and_resize op wrt the input boxes tensor.
 //
 // Arguments:
-//	sizes: Tensor of int32 with shape `[batch_shape, len(field_names)]`.
-//	values: List of tensors containing values for the corresponding field.
-//	field_names: List of strings containing proto field names.
-//	message_type: Name of the proto message type to decode.
+//	grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
+//	image: A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
+// Both `image_height` and `image_width` need to be positive.
+//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
+// specifies the coordinates of a box in the `box_ind[i]` image and is specified
+// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
+// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
+// `[0, 1]` interval of normalized image height is mapped to
+// `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in
+// which case the sampled crop is an up-down flipped version of the original
+// image. The width dimension is treated similarly. Normalized coordinates
+// outside the `[0, 1]` range are allowed, in which case we use
+// `extrapolation_value` to extrapolate the input image values.
+//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
+// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
 //
-// Returns Tensor of serialized protos with shape `batch_shape`.
-func EncodeProto(scope *Scope, sizes tf.Output, values []tf.Output, field_names []string, message_type string, optional ...EncodeProtoAttr) (bytes tf.Output) {
+// Returns A 2-D tensor of shape `[num_boxes, 4]`.
+func CropAndResizeGradBoxes(scope *Scope, grads tf.Output, image tf.Output, boxes tf.Output, box_ind tf.Output, optional ...CropAndResizeGradBoxesAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"field_names": field_names, "message_type": message_type}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "EncodeProto",
+		Type: "CropAndResizeGradBoxes",
 		Input: []tf.Input{
-			sizes, tf.OutputList(values),
+			grads, image, boxes, box_ind,
 		},
 		Attrs: attrs,
 	}
@@ -30279,59 +29986,113 @@ func EncodeProto(scope *Scope, sizes tf.Output, values []tf.Output, field_names
 	return op.Output(0)
 }
 
-// Creates a TensorArray for storing the gradients of values in the given handle.
-//
-// If the given TensorArray gradient already exists, returns a reference to it.
-//
-// Locks the size of the original TensorArray by disabling its dynamic size flag.
-//
-// **A note about the input flow_in:**
-//
-// The handle flow_in forces the execution of the gradient lookup to occur
-// only after certain other operations have occurred.  For example, when
-// the forward TensorArray is dynamically sized, writes to this TensorArray
-// may resize the object.  The gradient TensorArray is statically sized based
-// on the size of the forward TensorArray when this operation executes.
-// Furthermore, the size of the forward TensorArray is frozen by this call.
-// As a result, the flow is used to ensure that the call to generate the gradient
-// TensorArray only happens after all writes are executed.
+// Greedily selects a subset of bounding boxes in descending order of score,
 //
-// In the case of dynamically sized TensorArrays, gradient computation should
-// only be performed on read operations that have themselves been chained via
-// flow to occur only after all writes have executed. That way the final size
-// of the forward TensorArray is known when this operation is called.
+// pruning away boxes that have high intersection-over-union (IOU) overlap
+// with previously selected boxes.  Bounding boxes with score less than
+// `score_threshold` are removed.  Bounding boxes are supplied as
+// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
+// diagonal pair of box corners and the coordinates can be provided as normalized
+// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
+// is agnostic to where the origin is in the coordinate system and more
+// generally is invariant to orthogonal transformations and translations
+// of the coordinate system; thus translating or reflections of the coordinate
+// system result in the same boxes being selected by the algorithm.
+// The output of this operation is a set of integers indexing into the input
+// collection of bounding boxes representing the selected boxes.  The bounding
+// box coordinates corresponding to the selected indices can then be obtained
+// using the `tf.gather operation`.  For example:
+//   selected_indices = tf.image.non_max_suppression_v2(
+//       boxes, scores, max_output_size, iou_threshold, score_threshold)
+//   selected_boxes = tf.gather(boxes, selected_indices)
 //
-// **A note about the source attribute:**
+// Arguments:
+//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
+//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
+// score corresponding to each box (each row of boxes).
+//	max_output_size: A scalar integer tensor representing the maximum number of
+// boxes to be selected by non max suppression.
+//	iou_threshold: A 0-D float tensor representing the threshold for deciding whether
+// boxes overlap too much with respect to IOU.
+//	score_threshold: A 0-D float tensor representing the threshold for deciding when to remove
+// boxes based on score.
 //
-// TensorArray gradient calls use an accumulator TensorArray object.  If
-// multiple gradients are calculated and run in the same session, the multiple
-// gradient nodes may accidentally flow through the same accumulator TensorArray.
-// This double counts and generally breaks the TensorArray gradient flow.
+// Returns A 1-D integer tensor of shape `[M]` representing the selected
+// indices from the boxes tensor, where `M <= max_output_size`.
+func NonMaxSuppressionV3(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, iou_threshold tf.Output, score_threshold tf.Output) (selected_indices tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "NonMaxSuppressionV3",
+		Input: []tf.Input{
+			boxes, scores, max_output_size, iou_threshold, score_threshold,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// NonMaxSuppressionV4Attr is an optional argument to NonMaxSuppressionV4.
+type NonMaxSuppressionV4Attr func(optionalAttr)
+
+// NonMaxSuppressionV4PadToMaxOutputSize sets the optional pad_to_max_output_size attribute to value.
 //
-// The solution is to identify which gradient call this particular
-// TensorArray gradient is being called in.  This is performed by identifying
-// a unique string (e.g. "gradients", "gradients_1", ...) from the input
-// gradient Tensor's name.  This string is used as a suffix when creating
-// the TensorArray gradient object here (the attribute `source`).
+// value: If true, the output `selected_indices` is padded to be of length
+// `max_output_size`. Defaults to false.
+// If not specified, defaults to false
+func NonMaxSuppressionV4PadToMaxOutputSize(value bool) NonMaxSuppressionV4Attr {
+	return func(m optionalAttr) {
+		m["pad_to_max_output_size"] = value
+	}
+}
+
+// Greedily selects a subset of bounding boxes in descending order of score,
 //
-// The attribute `source` is added as a suffix to the forward TensorArray's
-// name when performing the creation / lookup, so that each separate gradient
-// calculation gets its own TensorArray accumulator.
+// pruning away boxes that have high intersection-over-union (IOU) overlap
+// with previously selected boxes.  Bounding boxes with score less than
+// `score_threshold` are removed.  Bounding boxes are supplied as
+// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
+// diagonal pair of box corners and the coordinates can be provided as normalized
+// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
+// is agnostic to where the origin is in the coordinate system and more
+// generally is invariant to orthogonal transformations and translations
+// of the coordinate system; thus translating or reflections of the coordinate
+// system result in the same boxes being selected by the algorithm.
+// The output of this operation is a set of integers indexing into the input
+// collection of bounding boxes representing the selected boxes.  The bounding
+// box coordinates corresponding to the selected indices can then be obtained
+// using the `tf.gather operation`.  For example:
+//   selected_indices = tf.image.non_max_suppression_v2(
+//       boxes, scores, max_output_size, iou_threshold, score_threshold)
+//   selected_boxes = tf.gather(boxes, selected_indices)
 //
 // Arguments:
-//	handle: The handle to the forward TensorArray.
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//	source: The gradient source string, used to decide which gradient TensorArray
-// to return.
-func TensorArrayGradV3(scope *Scope, handle tf.Output, flow_in tf.Output, source string) (grad_handle tf.Output, flow_out tf.Output) {
+//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
+//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
+// score corresponding to each box (each row of boxes).
+//	max_output_size: A scalar integer tensor representing the maximum number of
+// boxes to be selected by non max suppression.
+//	iou_threshold: A 0-D float tensor representing the threshold for deciding whether
+// boxes overlap too much with respect to IOU.
+//	score_threshold: A 0-D float tensor representing the threshold for deciding when to remove
+// boxes based on score.
+//
+// Returns A 1-D integer tensor of shape `[M]` representing the selected
+// indices from the boxes tensor, where `M <= max_output_size`.A 0-D integer tensor representing the number of valid elements in
+// `selected_indices`, with the valid elements appearing first.
+func NonMaxSuppressionV4(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, iou_threshold tf.Output, score_threshold tf.Output, optional ...NonMaxSuppressionV4Attr) (selected_indices tf.Output, valid_outputs tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"source": source}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayGradV3",
+		Type: "NonMaxSuppressionV4",
 		Input: []tf.Input{
-			handle, flow_in,
+			boxes, scores, max_output_size, iou_threshold, score_threshold,
 		},
 		Attrs: attrs,
 	}
@@ -30339,94 +30100,129 @@ func TensorArrayGradV3(scope *Scope, handle tf.Output, flow_in tf.Output, source
 	return op.Output(0), op.Output(1)
 }
 
-// Creates a dataset that splits a SparseTensor into elements row-wise.
-func SparseTensorSliceDataset(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output) (handle tf.Output) {
+// Computes the matrix logarithm of one or more square matrices:
+//
+//
+// \\(log(exp(A)) = A\\)
+//
+// This op is only defined for complex matrices. If A is positive-definite and
+// real, then casting to a complex matrix, taking the logarithm and casting back
+// to a real matrix will give the correct result.
+//
+// This function computes the matrix logarithm using the Schur-Parlett algorithm.
+// Details of the algorithm can be found in Section 11.6.2 of:
+// Nicholas J. Higham, Functions of Matrices: Theory and Computation, SIAM 2008.
+// ISBN 978-0-898716-46-7.
+//
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. The output is a tensor of the same shape as the input
+// containing the exponential for all input submatrices `[..., :, :]`.
+//
+// Arguments:
+//	input: Shape is `[..., M, M]`.
+//
+// Returns Shape is `[..., M, M]`.
+//
+// @compatibility(scipy)
+// Equivalent to scipy.linalg.logm
+// @end_compatibility
+func MatrixLogarithm(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseTensorSliceDataset",
+		Type: "MatrixLogarithm",
 		Input: []tf.Input{
-			indices, values, dense_shape,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns x / y element-wise for real types.
+//   This op is used as a placeholder in If branch functions. It doesn't provide a
+//   valid output when run, so must either be removed (e.g. replaced with a
+//   function input) or guaranteed not to be used (e.g. if mirroring an
+//   intermediate output needed for the gradient computation of the other branch).
 //
-// If `x` and `y` are reals, this will return the floating-point division.
+// Arguments:
+//	dtype: The type of the output.
+//	shape:     The purported shape of the output. This is only used for shape inference;
+//     the output will not necessarily have this shape. Can be a partial shape.
 //
-// *NOTE*: `Div` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func RealDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Returns     \"Fake\" output value. This should not be consumed by another op.
+func FakeParam(scope *Scope, dtype tf.DataType, shape tf.Shape) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
 	opspec := tf.OpSpec{
-		Type: "RealDiv",
-		Input: []tf.Input{
-			x, y,
-		},
+		Type: "FakeParam",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-//     Adds v into specified rows of x.
-//
-//     Computes y = x; y[i, :] += v; return y.
-//
-// Arguments:
-//	x: A `Tensor` of type T.
-//	i: A vector. Indices into the left-most dimension of `x`.
-//	v: A `Tensor` of type T. Same dimension sizes as x except the first dimension, which must be the same as i's size.
+// Computes the gradient for the inverse of `x` wrt its input.
 //
-// Returns A `Tensor` of type T. An alias of `x`. The content of `y` is undefined if there are duplicates in `i`.
-func InplaceAdd(scope *Scope, x tf.Output, i tf.Output, v tf.Output) (y tf.Output) {
+// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
+// is the corresponding input gradient.
+func InvGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "InplaceAdd",
+		Type: "InvGrad",
 		Input: []tf.Input{
-			x, i, v,
+			y, dy,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Restore a Reader to its initial clean state.
-//
-// Arguments:
-//	reader_handle: Handle to a Reader.
+// List of the given size with empty elements.
 //
-// Returns the created operation.
-func ReaderResetV2(scope *Scope, reader_handle tf.Output) (o *tf.Operation) {
+// element_shape: the shape of the future elements of the list
+// num_elements: the number of elements to reserve
+// handle: the output list
+// element_dtype: the desired type of elements in the list.
+func TensorListReserve(scope *Scope, element_shape tf.Output, num_elements tf.Output, element_dtype tf.DataType) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
 	opspec := tf.OpSpec{
-		Type: "ReaderResetV2",
+		Type: "TensorListReserve",
 		Input: []tf.Input{
-			reader_handle,
+			element_shape, num_elements,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// A dataset that splits the elements of its input into multiple elements.
-func UnbatchDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// A substitute for `InterleaveDataset` on a fixed list of `N` datasets.
+//
+// Arguments:
+//	selector_input_dataset: A dataset of scalar `DT_INT64` elements that determines which of the
+// `N` data inputs should produce the next output element.
+//	data_input_datasets: `N` datasets with the same type that will be interleaved according to
+// the values of `selector_input_dataset`.
+//
+//
+func ExperimentalDirectedInterleaveDataset(scope *Scope, selector_input_dataset tf.Output, data_input_datasets []tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "UnbatchDataset",
+		Type: "ExperimentalDirectedInterleaveDataset",
 		Input: []tf.Input{
-			input_dataset,
+			selector_input_dataset, tf.OutputList(data_input_datasets),
 		},
 		Attrs: attrs,
 	}
@@ -30434,195 +30230,260 @@ func UnbatchDataset(scope *Scope, input_dataset tf.Output, output_types []tf.Dat
 	return op.Output(0)
 }
 
-// OrderedMapStageAttr is an optional argument to OrderedMapStage.
-type OrderedMapStageAttr func(optionalAttr)
+// RandomUniformIntAttr is an optional argument to RandomUniformInt.
+type RandomUniformIntAttr func(optionalAttr)
 
-// OrderedMapStageCapacity sets the optional capacity attribute to value.
+// RandomUniformIntSeed sets the optional seed attribute to value.
 //
-// value: Maximum number of elements in the Staging Area. If > 0, inserts
-// on the container will block when the capacity is reached.
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
 // If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func OrderedMapStageCapacity(value int64) OrderedMapStageAttr {
+func RandomUniformIntSeed(value int64) RandomUniformIntAttr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["seed"] = value
 	}
 }
 
-// OrderedMapStageMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// RandomUniformIntSeed2 sets the optional seed2 attribute to value.
 //
-// REQUIRES: value >= 0
-func OrderedMapStageMemoryLimit(value int64) OrderedMapStageAttr {
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomUniformIntSeed2(value int64) RandomUniformIntAttr {
 	return func(m optionalAttr) {
-		m["memory_limit"] = value
+		m["seed2"] = value
 	}
 }
 
-// OrderedMapStageContainer sets the optional container attribute to value.
+// Outputs random integers from a uniform distribution.
 //
-// value: If non-empty, this queue is placed in the given container. Otherwise,
-// a default container is used.
-// If not specified, defaults to ""
-func OrderedMapStageContainer(value string) OrderedMapStageAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
+// The generated values are uniform integers in the range `[minval, maxval)`.
+// The lower bound `minval` is included in the range, while the upper bound
+// `maxval` is excluded.
+//
+// The random integers are slightly biased unless `maxval - minval` is an exact
+// power of two.  The bias is small for values of `maxval - minval` significantly
+// smaller than the range of the output (either `2^32` or `2^64`).
+//
+// Arguments:
+//	shape: The shape of the output tensor.
+//	minval: 0-D.  Inclusive lower bound on the generated integers.
+//	maxval: 0-D.  Exclusive upper bound on the generated integers.
+//
+// Returns A tensor of the specified shape filled with uniform random integers.
+func RandomUniformInt(scope *Scope, shape tf.Output, minval tf.Output, maxval tf.Output, optional ...RandomUniformIntAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RandomUniformInt",
+		Input: []tf.Input{
+			shape, minval, maxval,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// OrderedMapStageSharedName sets the optional shared_name attribute to value.
+// Add the quantile summaries to each quantile stream resource.
 //
-// value: It is necessary to match this name to the matching Unstage Op.
-// If not specified, defaults to ""
-func OrderedMapStageSharedName(value string) OrderedMapStageAttr {
+// An op that adds a list of quantile summaries to a quantile stream resource. Each
+// summary Tensor is rank 2, containing summaries (value, weight, min_rank, max_rank)
+// for a single feature.
+//
+// Arguments:
+//	quantile_stream_resource_handle: resource handle referring to a QuantileStreamResource.
+//	summaries: string; List of Rank 2 Tensor each containing the summaries for a single feature.
+//
+// Returns the created operation.
+func BoostedTreesQuantileStreamResourceAddSummaries(scope *Scope, quantile_stream_resource_handle tf.Output, summaries []tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesQuantileStreamResourceAddSummaries",
+		Input: []tf.Input{
+			quantile_stream_resource_handle, tf.OutputList(summaries),
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// StringSplitV2Attr is an optional argument to StringSplitV2.
+type StringSplitV2Attr func(optionalAttr)
+
+// StringSplitV2Maxsplit sets the optional maxsplit attribute to value.
+//
+// value: An `int`. If `maxsplit > 0`, limit of the split of the result.
+// If not specified, defaults to -1
+func StringSplitV2Maxsplit(value int64) StringSplitV2Attr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["maxsplit"] = value
 	}
 }
 
-// Stage (key, values) in the underlying container which behaves like a ordered
+// Split elements of `source` based on `sep` into a `SparseTensor`.
 //
-// associative container.   Elements are ordered by key.
+// Let N be the size of source (typically N will be the batch size). Split each
+// element of `source` based on `sep` and return a `SparseTensor`
+// containing the split tokens. Empty tokens are ignored.
 //
-// Arguments:
-//	key: int64
+// For example, N = 2, source[0] is 'hello world' and source[1] is 'a b c',
+// then the output will be
+// ```
+// st.indices = [0, 0;
+//               0, 1;
+//               1, 0;
+//               1, 1;
+//               1, 2]
+// st.shape = [2, 3]
+// st.values = ['hello', 'world', 'a', 'b', 'c']
+// ```
 //
-//	values: a list of tensors
-// dtypes A list of data types that inserted values should adhere to.
+// If `sep` is given, consecutive delimiters are not grouped together and are
+// deemed to delimit empty strings. For example, source of `"1<>2<><>3"` and
+// sep of `"<>"` returns `["1", "2", "", "3"]`. If `sep` is None or an empty
+// string, consecutive whitespace are regarded as a single separator, and the
+// result will contain no empty strings at the startor end if the string has
+// leading or trailing whitespace.
 //
+// Note that the above mentioned behavior matches python's str.split.
 //
-// Returns the created operation.
-func OrderedMapStage(scope *Scope, key tf.Output, indices tf.Output, values []tf.Output, dtypes []tf.DataType, optional ...OrderedMapStageAttr) (o *tf.Operation) {
+// Arguments:
+//	input: `1-D` string `Tensor`, the strings to split.
+//	sep: `0-D` string `Tensor`, the delimiter character.
+func StringSplitV2(scope *Scope, input tf.Output, sep tf.Output, optional ...StringSplitV2Attr) (indices tf.Output, values tf.Output, shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "OrderedMapStage",
+		Type: "StringSplitV2",
 		Input: []tf.Input{
-			key, indices, tf.OutputList(values),
+			input, sep,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// RpcAttr is an optional argument to Rpc.
-type RpcAttr func(optionalAttr)
-
-// RpcProtocol sets the optional protocol attribute to value.
+// Creates a dataset that uses a custom thread pool to compute `input_dataset`.
 //
-// value: RPC protocol to use.  Empty string means use the default protocol.
-// Options include 'grpc'.
-// If not specified, defaults to ""
-func RpcProtocol(value string) RpcAttr {
-	return func(m optionalAttr) {
-		m["protocol"] = value
+// Arguments:
+//
+//	thread_pool: A resource produced by the ThreadPoolHandle op.
+//
+//
+func ExperimentalThreadPoolDataset(scope *Scope, input_dataset tf.Output, thread_pool tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalThreadPoolDataset",
+		Input: []tf.Input{
+			input_dataset, thread_pool,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// RpcFailFast sets the optional fail_fast attribute to value.
-//
-// value: `boolean`. If `true` (default), then failures to connect
-// (i.e., the server does not immediately respond) cause an RPC failure.
-// If not specified, defaults to true
-func RpcFailFast(value bool) RpcAttr {
-	return func(m optionalAttr) {
-		m["fail_fast"] = value
+// Computes softsign: `features / (abs(features) + 1)`.
+func Softsign(scope *Scope, features tf.Output) (activations tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Softsign",
+		Input: []tf.Input{
+			features,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// RpcTimeoutInMs sets the optional timeout_in_ms attribute to value.
-//
-// value: `int`. If `0` (default), then the kernel will run the RPC
-// request and only time out if the RPC deadline passes or the session times out.
-// If this value is greater than `0`, then the op will raise an exception if
-// the RPC takes longer than `timeout_in_ms`.
-// If not specified, defaults to 0
-func RpcTimeoutInMs(value int64) RpcAttr {
+// EncodeProtoAttr is an optional argument to EncodeProto.
+type EncodeProtoAttr func(optionalAttr)
+
+// EncodeProtoDescriptorSource sets the optional descriptor_source attribute to value.
+// If not specified, defaults to "local://"
+func EncodeProtoDescriptorSource(value string) EncodeProtoAttr {
 	return func(m optionalAttr) {
-		m["timeout_in_ms"] = value
+		m["descriptor_source"] = value
 	}
 }
 
-// Perform batches of RPC requests.
-//
-// This op asynchronously performs either a single RPC request, or a batch
-// of requests.  RPC requests are defined by three main parameters:
-//
-//   - `address` (the host+port or BNS address of the request)
-//   - `method` (the RPC method name for the request)
-//   - `request` (the serialized proto string, or vector of strings,
-//      of the RPC request argument).
+// The op serializes protobuf messages provided in the input tensors.
 //
-// For example, if you have an RPC service running on port localhost:2345,
-// and its interface is configured with the following proto declaration:
+// The types of the tensors in `values` must match the schema for the
+// fields specified in `field_names`. All the tensors in `values` must
+// have a common shape prefix, *batch_shape*.
 //
-// ```
-// service MyService {
-//   rpc MyMethod(MyRequestProto) returns (MyResponseProto) {
-//   }
-// };
-// ```
+// The `sizes` tensor specifies repeat counts for each field.  The repeat
+// count (last dimension) of a each tensor in `values` must be greater
+// than or equal to corresponding repeat count in `sizes`.
 //
-// then call this op with arguments:
+// A `message_type` name must be provided to give context for the field
+// names. The actual message descriptor can be looked up either in the
+// linked-in descriptor pool or a filename provided by the caller using
+// the `descriptor_source` attribute.
 //
-// ```
-// address = "localhost:2345"
-// method = "MyService/MyMethod"
-// ```
+// The `descriptor_source` attribute selects a source of protocol
+// descriptors to consult when looking up `message_type`. This may be a
+// filename containing a serialized `FileDescriptorSet` message,
+// or the special value `local://`, in which case only descriptors linked
+// into the code will be searched; the filename can be on any filesystem
+// accessible to TensorFlow.
 //
-// The `request` tensor is a string tensor representing serialized `MyRequestProto`
-// strings; and the output string tensor `response` will have the same shape
-// and contain (upon successful completion) corresponding serialized
-// `MyResponseProto` strings.
+// You can build a `descriptor_source` file using the `--descriptor_set_out`
+// and `--include_imports` options to the protocol compiler `protoc`.
 //
-// For example, to send a single, empty, `MyRequestProto`, call
-// this op with `request = ""`.  To send 5 **parallel** empty requests,
-// call this op with `request = ["", "", "", "", ""]`.
+// The `local://` database only covers descriptors linked into the
+// code via C++ libraries, not Python imports. You can link in a proto descriptor
+// by creating a cc_library target with alwayslink=1.
 //
-// More generally, one can create a batch of `MyRequestProto` serialized protos
-// from regular batched tensors using the `encode_proto` op, and convert
-// the response `MyResponseProto` serialized protos to batched tensors
-// using the `decode_proto` op.
+// There are a few special cases in the value mapping:
 //
-// **NOTE** Working with serialized proto strings is faster than instantiating
-// actual proto objects in memory, so no performance degradation is expected
-// compared to writing custom kernels for this workflow.
+// Submessage and group fields must be pre-serialized as TensorFlow strings.
 //
-// If the connection fails or the remote worker returns an error
-// status, the op reraises this exception locally.
+// TensorFlow lacks support for unsigned int64s, so they must be
+// represented as `tf.int64` with the same twos-complement bit pattern
+// (the obvious way).
 //
-// See the `TryRpc` op if you prefer to handle RPC failures manually in the graph.
+// Unsigned int32 values can be represented exactly with `tf.int64`, or
+// with sign wrapping if the input is of type `tf.int32`.
 //
 // Arguments:
-//	address: `0-D` or `1-D`.  The address (i.e. host_name:port) of the RPC server.
-// If this tensor has more than 1 element, then multiple parallel rpc requests
-// are sent.  This argument broadcasts with `method` and `request`.
-//	method: `0-D` or `1-D`.  The method address on the RPC server.
-// If this tensor has more than 1 element, then multiple parallel rpc requests
-// are sent.  This argument broadcasts with `address` and `request`.
-//	request: `0-D` or `1-D`.  Serialized proto strings: the rpc request argument.
-// If this tensor has more than 1 element, then multiple parallel rpc requests
-// are sent.  This argument broadcasts with `address` and `method`.
+//	sizes: Tensor of int32 with shape `[batch_shape, len(field_names)]`.
+//	values: List of tensors containing values for the corresponding field.
+//	field_names: List of strings containing proto field names.
+//	message_type: Name of the proto message type to decode.
 //
-// Returns Same shape as `request`. Serialized proto strings: the rpc responses.
-func Rpc(scope *Scope, address tf.Output, method tf.Output, request tf.Output, optional ...RpcAttr) (response tf.Output) {
+// Returns Tensor of serialized protos with shape `batch_shape`.
+func EncodeProto(scope *Scope, sizes tf.Output, values []tf.Output, field_names []string, message_type string, optional ...EncodeProtoAttr) (bytes tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"field_names": field_names, "message_type": message_type}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Rpc",
+		Type: "EncodeProto",
 		Input: []tf.Input{
-			address, method, request,
+			sizes, tf.OutputList(values),
 		},
 		Attrs: attrs,
 	}
@@ -30630,160 +30491,111 @@ func Rpc(scope *Scope, address tf.Output, method tf.Output, request tf.Output, o
 	return op.Output(0)
 }
 
-// StackPushV2Attr is an optional argument to StackPushV2.
-type StackPushV2Attr func(optionalAttr)
-
-// StackPushV2SwapMemory sets the optional swap_memory attribute to value.
+// Creates a TensorArray for storing the gradients of values in the given handle.
 //
-// value: Swap `elem` to CPU. Default to false.
-// If not specified, defaults to false
-func StackPushV2SwapMemory(value bool) StackPushV2Attr {
-	return func(m optionalAttr) {
-		m["swap_memory"] = value
-	}
-}
-
-// Push an element onto the stack.
+// If the given TensorArray gradient already exists, returns a reference to it.
 //
-// Arguments:
-//	handle: The handle to a stack.
-//	elem: The tensor to be pushed onto the stack.
+// Locks the size of the original TensorArray by disabling its dynamic size flag.
 //
-// Returns The same tensor as the input 'elem'.
-func StackPushV2(scope *Scope, handle tf.Output, elem tf.Output, optional ...StackPushV2Attr) (output tf.Output) {
+// **A note about the input flow_in:**
+//
+// The handle flow_in forces the execution of the gradient lookup to occur
+// only after certain other operations have occurred.  For example, when
+// the forward TensorArray is dynamically sized, writes to this TensorArray
+// may resize the object.  The gradient TensorArray is statically sized based
+// on the size of the forward TensorArray when this operation executes.
+// Furthermore, the size of the forward TensorArray is frozen by this call.
+// As a result, the flow is used to ensure that the call to generate the gradient
+// TensorArray only happens after all writes are executed.
+//
+// In the case of dynamically sized TensorArrays, gradient computation should
+// only be performed on read operations that have themselves been chained via
+// flow to occur only after all writes have executed. That way the final size
+// of the forward TensorArray is known when this operation is called.
+//
+// **A note about the source attribute:**
+//
+// TensorArray gradient calls use an accumulator TensorArray object.  If
+// multiple gradients are calculated and run in the same session, the multiple
+// gradient nodes may accidentally flow through the same accumulator TensorArray.
+// This double counts and generally breaks the TensorArray gradient flow.
+//
+// The solution is to identify which gradient call this particular
+// TensorArray gradient is being called in.  This is performed by identifying
+// a unique string (e.g. "gradients", "gradients_1", ...) from the input
+// gradient Tensor's name.  This string is used as a suffix when creating
+// the TensorArray gradient object here (the attribute `source`).
+//
+// The attribute `source` is added as a suffix to the forward TensorArray's
+// name when performing the creation / lookup, so that each separate gradient
+// calculation gets its own TensorArray accumulator.
+//
+// Arguments:
+//	handle: The handle to the forward TensorArray.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//	source: The gradient source string, used to decide which gradient TensorArray
+// to return.
+func TensorArrayGradV3(scope *Scope, handle tf.Output, flow_in tf.Output, source string) (grad_handle tf.Output, flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"source": source}
 	opspec := tf.OpSpec{
-		Type: "StackPushV2",
+		Type: "TensorArrayGradV3",
 		Input: []tf.Input{
-			handle, elem,
+			handle, flow_in,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Creates a dataset that concatenates `input_dataset` with `another_dataset`.
-func ConcatenateDataset(scope *Scope, input_dataset tf.Output, another_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Creates a dataset that splits a SparseTensor into elements row-wise.
+func SparseTensorSliceDataset(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ConcatenateDataset",
+		Type: "SparseTensorSliceDataset",
 		Input: []tf.Input{
-			input_dataset, another_dataset,
+			indices, values, dense_shape,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Adds a value to the current value of a variable.
-//
-// Any ReadVariableOp with a control dependency on this op is guaranteed to
-// see the incremented value or a subsequent newer one.
+// Returns x / y element-wise for real types.
 //
-// Arguments:
-//	resource: handle to the resource in which to store the variable.
-//	value: the value by which the variable will be incremented.
+// If `x` and `y` are reals, this will return the floating-point division.
 //
-// Returns the created operation.
-func AssignAddVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "AssignAddVariableOp",
-		Input: []tf.Input{
-			resource, value,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Records the latency of producing `input_dataset` elements in a StatsAggregator.
-func LatencyStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// *NOTE*: `Div` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func RealDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "LatencyStatsDataset",
+		Type: "RealDiv",
 		Input: []tf.Input{
-			input_dataset, tag,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SparseToDenseAttr is an optional argument to SparseToDense.
-type SparseToDenseAttr func(optionalAttr)
-
-// SparseToDenseValidateIndices sets the optional validate_indices attribute to value.
-//
-// value: If true, indices are checked to make sure they are sorted in
-// lexicographic order and that there are no repeats.
-// If not specified, defaults to true
-func SparseToDenseValidateIndices(value bool) SparseToDenseAttr {
-	return func(m optionalAttr) {
-		m["validate_indices"] = value
-	}
-}
-
-// Converts a sparse representation into a dense tensor.
-//
-// Builds an array `dense` with shape `output_shape` such that
-//
-// ```
-// # If sparse_indices is scalar
-// dense[i] = (i == sparse_indices ? sparse_values : default_value)
-//
-// # If sparse_indices is a vector, then for each i
-// dense[sparse_indices[i]] = sparse_values[i]
-//
-// # If sparse_indices is an n by d matrix, then for each i in [0, n)
-// dense[sparse_indices[i][0], ..., sparse_indices[i][d-1]] = sparse_values[i]
-// ```
-//
-// All other values in `dense` are set to `default_value`.  If `sparse_values` is a
-// scalar, all sparse indices are set to this single value.
-//
-// Indices should be sorted in lexicographic order, and indices must not
-// contain any repeats. If `validate_indices` is true, these properties
-// are checked during execution.
-//
-// Arguments:
-//	sparse_indices: 0-D, 1-D, or 2-D.  `sparse_indices[i]` contains the complete
-// index where `sparse_values[i]` will be placed.
-//	output_shape: 1-D.  Shape of the dense output tensor.
-//	sparse_values: 1-D.  Values corresponding to each row of `sparse_indices`,
-// or a scalar value to be used for all sparse indices.
-//	default_value: Scalar value to set for indices not specified in
-// `sparse_indices`.
-//
-// Returns Dense output tensor of shape `output_shape`.
-func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Output, sparse_values tf.Output, default_value tf.Output, optional ...SparseToDenseAttr) (dense tf.Output) {
+// Creates a dataset that concatenates `input_dataset` with `another_dataset`.
+func ConcatenateDataset(scope *Scope, input_dataset tf.Output, another_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "SparseToDense",
+		Type: "ConcatenateDataset",
 		Input: []tf.Input{
-			sparse_indices, output_shape, sparse_values, default_value,
+			input_dataset, another_dataset,
 		},
 		Attrs: attrs,
 	}
@@ -30924,52 +30736,6 @@ func PaddedBatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Out
 	return op.Output(0)
 }
 
-// Creates a dataset that batches input elements into a SparseTensor.
-//
-// Arguments:
-//	input_dataset: A handle to an input dataset. Must have a single component.
-//	batch_size: A scalar representing the number of elements to accumulate in a
-// batch.
-//	row_shape: A vector representing the dense shape of each row in the produced
-// SparseTensor. The shape may be partially specified, using `-1` to indicate
-// that a particular dimension should use the maximum size of all batch elements.
-//
-//
-func DenseToSparseBatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, row_shape tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "DenseToSparseBatchDataset",
-		Input: []tf.Input{
-			input_dataset, batch_size, row_shape,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Deprecated. Use TensorArrayGradV3
-//
-// DEPRECATED at GraphDef version 26: Use TensorArrayGradV3
-func TensorArrayGradV2(scope *Scope, handle tf.Output, flow_in tf.Output, source string) (grad_handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"source": source}
-	opspec := tf.OpSpec{
-		Type: "TensorArrayGradV2",
-		Input: []tf.Input{
-			handle, flow_in,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Creates a dataset that shuffles and repeats elements from `input_dataset`
 //
 // pseudorandomly.
@@ -31575,46 +31341,6 @@ func FIFOQueueV2(scope *Scope, component_types []tf.DataType, optional ...FIFOQu
 	return op.Output(0)
 }
 
-// Produces a summary of any statistics recorded by the given statistics manager.
-func StatsAggregatorSummary(scope *Scope, iterator tf.Output) (summary tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "StatsAggregatorSummary",
-		Input: []tf.Input{
-			iterator,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Compute the pairwise cross product.
-//
-// `a` and `b` must be the same shape; they can either be simple 3-element vectors,
-// or any shape where the innermost dimension is 3. In the latter case, each pair
-// of corresponding 3-element vectors is cross-multiplied independently.
-//
-// Arguments:
-//	a: A tensor containing 3-element vectors.
-//	b: Another tensor, of same type and shape as `a`.
-//
-// Returns Pairwise cross product of the vectors in `a` and `b`.
-func Cross(scope *Scope, a tf.Output, b tf.Output) (product tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Cross",
-		Input: []tf.Input{
-			a, b,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Constructs an Optional variant from a tuple of tensors.
 func OptionalFromValue(scope *Scope, components []tf.Output) (optional tf.Output) {
 	if scope.Err() != nil {
@@ -31783,30 +31509,6 @@ func OptionalHasValue(scope *Scope, optional tf.Output) (has_value tf.Output) {
 	return op.Output(0)
 }
 
-// Creates a dataset that executes a SQL query and emits rows of the result set.
-//
-// Arguments:
-//	driver_name: The database type. Currently, the only supported type is 'sqlite'.
-//	data_source_name: A connection string to connect to the database.
-//	query: A SQL query to execute.
-//
-//
-func SqlDataset(scope *Scope, driver_name tf.Output, data_source_name tf.Output, query tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "SqlDataset",
-		Input: []tf.Input{
-			driver_name, data_source_name, query,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Returns the value stored in an Optional variant or raises an error if none exists.
 func OptionalGetValue(scope *Scope, optional tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
 	if scope.Err() != nil {
@@ -31902,6 +31604,24 @@ func ModelDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataT
 	return op.Output(0)
 }
 
+// Returns the truth value of (x > y) element-wise.
+//
+// *NOTE*: `Greater` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Greater(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Greater",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Performs a padding as a preprocess during a convolution.
 //
 // Similar to FusedResizeAndPadConv2d, this op allows for an optimized
@@ -32199,64 +31919,24 @@ func TensorArrayGatherV2(scope *Scope, handle tf.Output, indices tf.Output, flow
 //         x, tf.cast(condition_mask, tf.int32) , 2)
 //     partitioned_data[1] = partitioned_data[1] + 1.0
 //     condition_indices = tf.dynamic_partition(
-//         tf.range(tf.shape(x)[0]), tf.cast(condition_mask, tf.int32) , 2)
-//     x = tf.dynamic_stitch(condition_indices, partitioned_data)
-//     # Here x=[1.1, -1., 6.2, 5.3, -1, 8.4], the -1. values remain
-//     # unchanged.
-// ```
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicStitch.png" alt>
-// </div>
-func ParallelDynamicStitch(scope *Scope, indices []tf.Output, data []tf.Output) (merged tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ParallelDynamicStitch",
-		Input: []tf.Input{
-			tf.OutputList(indices), tf.OutputList(data),
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the gradient for the inverse of `x` wrt its input.
-//
-// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
-// is the corresponding input gradient.
-func InvGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "InvGrad",
-		Input: []tf.Input{
-			y, dy,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// List of the given size with empty elements.
+//         tf.range(tf.shape(x)[0]), tf.cast(condition_mask, tf.int32) , 2)
+//     x = tf.dynamic_stitch(condition_indices, partitioned_data)
+//     # Here x=[1.1, -1., 6.2, 5.3, -1, 8.4], the -1. values remain
+//     # unchanged.
+// ```
 //
-// element_shape: the shape of the future elements of the list
-// num_elements: the number of elements to reserve
-// handle: the output list
-// element_dtype: the desired type of elements in the list.
-func TensorListReserve(scope *Scope, element_shape tf.Output, num_elements tf.Output, element_dtype tf.DataType) (handle tf.Output) {
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicStitch.png" alt>
+// </div>
+func ParallelDynamicStitch(scope *Scope, indices []tf.Output, data []tf.Output) (merged tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"element_dtype": element_dtype}
 	opspec := tf.OpSpec{
-		Type: "TensorListReserve",
+		Type: "ParallelDynamicStitch",
 		Input: []tf.Input{
-			element_shape, num_elements,
+			tf.OutputList(indices), tf.OutputList(data),
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
@@ -32769,30 +32449,265 @@ type StackV2Attr func(optionalAttr)
 // If not specified, defaults to ""
 func StackV2StackName(value string) StackV2Attr {
 	return func(m optionalAttr) {
-		m["stack_name"] = value
+		m["stack_name"] = value
+	}
+}
+
+// A stack that produces elements in first-in last-out order.
+//
+// Arguments:
+//	max_size: The maximum size of the stack if non-negative. If negative, the stack
+// size is unlimited.
+//	elem_type: The type of the elements on the stack.
+//
+// Returns The handle to the stack.
+func StackV2(scope *Scope, max_size tf.Output, elem_type tf.DataType, optional ...StackV2Attr) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"elem_type": elem_type}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StackV2",
+		Input: []tf.Input{
+			max_size,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// OrderedMapStageAttr is an optional argument to OrderedMapStage.
+type OrderedMapStageAttr func(optionalAttr)
+
+// OrderedMapStageCapacity sets the optional capacity attribute to value.
+//
+// value: Maximum number of elements in the Staging Area. If > 0, inserts
+// on the container will block when the capacity is reached.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func OrderedMapStageCapacity(value int64) OrderedMapStageAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// OrderedMapStageMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func OrderedMapStageMemoryLimit(value int64) OrderedMapStageAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// OrderedMapStageContainer sets the optional container attribute to value.
+//
+// value: If non-empty, this queue is placed in the given container. Otherwise,
+// a default container is used.
+// If not specified, defaults to ""
+func OrderedMapStageContainer(value string) OrderedMapStageAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// OrderedMapStageSharedName sets the optional shared_name attribute to value.
+//
+// value: It is necessary to match this name to the matching Unstage Op.
+// If not specified, defaults to ""
+func OrderedMapStageSharedName(value string) OrderedMapStageAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Stage (key, values) in the underlying container which behaves like a ordered
+//
+// associative container.   Elements are ordered by key.
+//
+// Arguments:
+//	key: int64
+//
+//	values: a list of tensors
+// dtypes A list of data types that inserted values should adhere to.
+//
+//
+// Returns the created operation.
+func OrderedMapStage(scope *Scope, key tf.Output, indices tf.Output, values []tf.Output, dtypes []tf.DataType, optional ...OrderedMapStageAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "OrderedMapStage",
+		Input: []tf.Input{
+			key, indices, tf.OutputList(values),
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// RpcAttr is an optional argument to Rpc.
+type RpcAttr func(optionalAttr)
+
+// RpcProtocol sets the optional protocol attribute to value.
+//
+// value: RPC protocol to use.  Empty string means use the default protocol.
+// Options include 'grpc'.
+// If not specified, defaults to ""
+func RpcProtocol(value string) RpcAttr {
+	return func(m optionalAttr) {
+		m["protocol"] = value
+	}
+}
+
+// RpcFailFast sets the optional fail_fast attribute to value.
+//
+// value: `boolean`. If `true` (default), then failures to connect
+// (i.e., the server does not immediately respond) cause an RPC failure.
+// If not specified, defaults to true
+func RpcFailFast(value bool) RpcAttr {
+	return func(m optionalAttr) {
+		m["fail_fast"] = value
+	}
+}
+
+// RpcTimeoutInMs sets the optional timeout_in_ms attribute to value.
+//
+// value: `int`. If `0` (default), then the kernel will run the RPC
+// request and only time out if the RPC deadline passes or the session times out.
+// If this value is greater than `0`, then the op will raise an exception if
+// the RPC takes longer than `timeout_in_ms`.
+// If not specified, defaults to 0
+func RpcTimeoutInMs(value int64) RpcAttr {
+	return func(m optionalAttr) {
+		m["timeout_in_ms"] = value
+	}
+}
+
+// Perform batches of RPC requests.
+//
+// This op asynchronously performs either a single RPC request, or a batch
+// of requests.  RPC requests are defined by three main parameters:
+//
+//   - `address` (the host+port or BNS address of the request)
+//   - `method` (the RPC method name for the request)
+//   - `request` (the serialized proto string, or vector of strings,
+//      of the RPC request argument).
+//
+// For example, if you have an RPC service running on port localhost:2345,
+// and its interface is configured with the following proto declaration:
+//
+// ```
+// service MyService {
+//   rpc MyMethod(MyRequestProto) returns (MyResponseProto) {
+//   }
+// };
+// ```
+//
+// then call this op with arguments:
+//
+// ```
+// address = "localhost:2345"
+// method = "MyService/MyMethod"
+// ```
+//
+// The `request` tensor is a string tensor representing serialized `MyRequestProto`
+// strings; and the output string tensor `response` will have the same shape
+// and contain (upon successful completion) corresponding serialized
+// `MyResponseProto` strings.
+//
+// For example, to send a single, empty, `MyRequestProto`, call
+// this op with `request = ""`.  To send 5 **parallel** empty requests,
+// call this op with `request = ["", "", "", "", ""]`.
+//
+// More generally, one can create a batch of `MyRequestProto` serialized protos
+// from regular batched tensors using the `encode_proto` op, and convert
+// the response `MyResponseProto` serialized protos to batched tensors
+// using the `decode_proto` op.
+//
+// **NOTE** Working with serialized proto strings is faster than instantiating
+// actual proto objects in memory, so no performance degradation is expected
+// compared to writing custom kernels for this workflow.
+//
+// If the connection fails or the remote worker returns an error
+// status, the op reraises this exception locally.
+//
+// See the `TryRpc` op if you prefer to handle RPC failures manually in the graph.
+//
+// Arguments:
+//	address: `0-D` or `1-D`.  The address (i.e. host_name:port) of the RPC server.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `method` and `request`.
+//	method: `0-D` or `1-D`.  The method address on the RPC server.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `address` and `request`.
+//	request: `0-D` or `1-D`.  Serialized proto strings: the rpc request argument.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `address` and `method`.
+//
+// Returns Same shape as `request`. Serialized proto strings: the rpc responses.
+func Rpc(scope *Scope, address tf.Output, method tf.Output, request tf.Output, optional ...RpcAttr) (response tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Rpc",
+		Input: []tf.Input{
+			address, method, request,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// StackPushV2Attr is an optional argument to StackPushV2.
+type StackPushV2Attr func(optionalAttr)
+
+// StackPushV2SwapMemory sets the optional swap_memory attribute to value.
+//
+// value: Swap `elem` to CPU. Default to false.
+// If not specified, defaults to false
+func StackPushV2SwapMemory(value bool) StackPushV2Attr {
+	return func(m optionalAttr) {
+		m["swap_memory"] = value
 	}
 }
 
-// A stack that produces elements in first-in last-out order.
+// Push an element onto the stack.
 //
 // Arguments:
-//	max_size: The maximum size of the stack if non-negative. If negative, the stack
-// size is unlimited.
-//	elem_type: The type of the elements on the stack.
+//	handle: The handle to a stack.
+//	elem: The tensor to be pushed onto the stack.
 //
-// Returns The handle to the stack.
-func StackV2(scope *Scope, max_size tf.Output, elem_type tf.DataType, optional ...StackV2Attr) (handle tf.Output) {
+// Returns The same tensor as the input 'elem'.
+func StackPushV2(scope *Scope, handle tf.Output, elem tf.Output, optional ...StackPushV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"elem_type": elem_type}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StackV2",
+		Type: "StackPushV2",
 		Input: []tf.Input{
-			max_size,
+			handle, elem,
 		},
 		Attrs: attrs,
 	}
@@ -33183,6 +33098,25 @@ func TensorArraySizeV3(scope *Scope, handle tf.Output, flow_in tf.Output) (size
 	return op.Output(0)
 }
 
+// Deprecated. Use TensorArrayGradV3
+//
+// DEPRECATED at GraphDef version 26: Use TensorArrayGradV3
+func TensorArrayGradV2(scope *Scope, handle tf.Output, flow_in tf.Output, source string) (grad_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"source": source}
+	opspec := tf.OpSpec{
+		Type: "TensorArrayGradV2",
+		Input: []tf.Input{
+			handle, flow_in,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // SparseReduceMaxAttr is an optional argument to SparseReduceMax.
 type SparseReduceMaxAttr func(optionalAttr)
 
@@ -33736,221 +33670,3 @@ func Stage(scope *Scope, values []tf.Output, optional ...StageAttr) (o *tf.Opera
 	}
 	return scope.AddOperation(opspec)
 }
-
-// StagePeekAttr is an optional argument to StagePeek.
-type StagePeekAttr func(optionalAttr)
-
-// StagePeekCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func StagePeekCapacity(value int64) StagePeekAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// StagePeekMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func StagePeekMemoryLimit(value int64) StagePeekAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// StagePeekContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func StagePeekContainer(value string) StagePeekAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// StagePeekSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func StagePeekSharedName(value string) StagePeekAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op peeks at the values at the specified index.  If the
-//
-// underlying container does not contain sufficient elements
-// this op will block until it does.   This Op is optimized for
-// performance.
-func StagePeek(scope *Scope, index tf.Output, dtypes []tf.DataType, optional ...StagePeekAttr) (values []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "StagePeek",
-		Input: []tf.Input{
-			index,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("StagePeek", err)
-		return
-	}
-	return values
-}
-
-// MapStageAttr is an optional argument to MapStage.
-type MapStageAttr func(optionalAttr)
-
-// MapStageCapacity sets the optional capacity attribute to value.
-//
-// value: Maximum number of elements in the Staging Area. If > 0, inserts
-// on the container will block when the capacity is reached.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapStageCapacity(value int64) MapStageAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// MapStageMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapStageMemoryLimit(value int64) MapStageAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// MapStageContainer sets the optional container attribute to value.
-//
-// value: If non-empty, this queue is placed in the given container. Otherwise,
-// a default container is used.
-// If not specified, defaults to ""
-func MapStageContainer(value string) MapStageAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// MapStageSharedName sets the optional shared_name attribute to value.
-//
-// value: It is necessary to match this name to the matching Unstage Op.
-// If not specified, defaults to ""
-func MapStageSharedName(value string) MapStageAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Stage (key, values) in the underlying container which behaves like a hashtable.
-//
-// Arguments:
-//	key: int64
-//
-//	values: a list of tensors
-// dtypes A list of data types that inserted values should adhere to.
-//
-//
-// Returns the created operation.
-func MapStage(scope *Scope, key tf.Output, indices tf.Output, values []tf.Output, dtypes []tf.DataType, optional ...MapStageAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MapStage",
-		Input: []tf.Input{
-			key, indices, tf.OutputList(values),
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// MapUnstageAttr is an optional argument to MapUnstage.
-type MapUnstageAttr func(optionalAttr)
-
-// MapUnstageCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapUnstageCapacity(value int64) MapUnstageAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// MapUnstageMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapUnstageMemoryLimit(value int64) MapUnstageAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// MapUnstageContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func MapUnstageContainer(value string) MapUnstageAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// MapUnstageSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapUnstageSharedName(value string) MapUnstageAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op removes and returns the values associated with the key
-//
-// from the underlying container.   If the underlying container
-// does not contain this key, the op will block until it does.
-func MapUnstage(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...MapUnstageAttr) (values []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MapUnstage",
-		Input: []tf.Input{
-			key, indices,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("MapUnstage", err)
-		return
-	}
-	return values
-}
diff --git a/tensorflow/go/session.go b/tensorflow/go/session.go
index db6ae4f26cd92dcf5e542052e4bae561bbefe999..48909ffe39e0260096d9ec4513976a2d49c34a98 100644
--- a/tensorflow/go/session.go
+++ b/tensorflow/go/session.go
@@ -71,37 +71,39 @@ type Device struct {
 	MemoryLimitBytes int64
 }
 
-// Return list of devices associated with a Session
-func (s *Session) ListDevices() ([]Device, error) {
-	var devices []Device
+// String describes d and implements fmt.Stringer.
+func (d Device) String() string {
+	memStr := "no memory limit"
+	if d.MemoryLimitBytes >= 0 {
+		memStr = fmt.Sprintf("memory limit %d bytes", d.MemoryLimitBytes)
+	}
+	return fmt.Sprintf("(Device: name \"%s\", type %s, %s)", d.Name, d.Type, memStr)
+}
 
+func deviceSliceFromDeviceList(list *C.TF_DeviceList) ([]Device, error) {
+	var devices []Device
 	status := newStatus()
-	devices_list := C.TF_SessionListDevices(s.c, status.c)
-	if err := status.Err(); err != nil {
-		return nil, fmt.Errorf("SessionListDevices() failed: %v", err)
-	}
-	defer C.TF_DeleteDeviceList(devices_list)
 
-	for i := 0; i < int(C.TF_DeviceListCount(devices_list)); i++ {
-		device_name := C.TF_DeviceListName(devices_list, C.int(i), status.c)
+	for i := 0; i < int(C.TF_DeviceListCount(list)); i++ {
+		name := C.TF_DeviceListName(list, C.int(i), status.c)
 		if err := status.Err(); err != nil {
 			return nil, fmt.Errorf("DeviceListName(index=%d) failed: %v", i, err)
 		}
 
-		device_type := C.TF_DeviceListType(devices_list, C.int(i), status.c)
+		deviceType := C.TF_DeviceListType(list, C.int(i), status.c)
 		if err := status.Err(); err != nil {
 			return nil, fmt.Errorf("DeviceListType(index=%d) failed: %v", i, err)
 		}
 
-		memory_limit_bytes := C.TF_DeviceListMemoryBytes(devices_list, C.int(i), status.c)
+		memoryLimitBytes := C.TF_DeviceListMemoryBytes(list, C.int(i), status.c)
 		if err := status.Err(); err != nil {
 			return nil, fmt.Errorf("DeviceListMemoryBytes(index=%d) failed: %v", i, err)
 		}
 
 		device := Device{
-			Name:             C.GoString(device_name),
-			Type:             C.GoString(device_type),
-			MemoryLimitBytes: int64(memory_limit_bytes),
+			Name:             C.GoString(name),
+			Type:             C.GoString(deviceType),
+			MemoryLimitBytes: int64(memoryLimitBytes),
 		}
 
 		devices = append(devices, device)
@@ -110,6 +112,17 @@ func (s *Session) ListDevices() ([]Device, error) {
 	return devices, nil
 }
 
+// ListDevices returns the list of devices associated with a Session.
+func (s *Session) ListDevices() ([]Device, error) {
+	status := newStatus()
+	devicesList := C.TF_SessionListDevices(s.c, status.c)
+	if err := status.Err(); err != nil {
+		return nil, fmt.Errorf("SessionListDevices() failed: %v", err)
+	}
+	defer C.TF_DeleteDeviceList(devicesList)
+	return deviceSliceFromDeviceList(devicesList)
+}
+
 // Run the graph with the associated session starting with the supplied feeds
 // to compute the value of the requested fetches. Runs, but does not return
 // Tensors for operations specified in targets.
diff --git a/tensorflow/go/session_test.go b/tensorflow/go/session_test.go
index 05ace99a2387c6884832427187525f2fb7d5aba2..c9bda00167171179dac7ced108d928c9e7bb5f86 100644
--- a/tensorflow/go/session_test.go
+++ b/tensorflow/go/session_test.go
@@ -299,3 +299,21 @@ func TestListDevices(t *testing.T) {
 		t.Fatalf("no devices detected")
 	}
 }
+
+func TestDeviceString(t *testing.T) {
+	d := Device{Name: "foo", Type: "bar", MemoryLimitBytes: 12345}
+	got := d.String()
+	want := "(Device: name \"foo\", type bar, memory limit 12345 bytes)"
+	if got != want {
+		t.Errorf("Got \"%s\", want \"%s\"", got, want)
+	}
+}
+
+func TestDeviceStringNoMemoryLimit(t *testing.T) {
+	d := Device{Name: "foo", Type: "bar", MemoryLimitBytes: -1}
+	got := d.String()
+	want := "(Device: name \"foo\", type bar, no memory limit)"
+	if got != want {
+		t.Errorf("Got \"%s\", want \"%s\"", got, want)
+	}
+}
diff --git a/tensorflow/java/src/main/java/org/tensorflow/Session.java b/tensorflow/java/src/main/java/org/tensorflow/Session.java
index a660d25f98ec961ac2ba1a48bced13803c00096b..8cc23e2991b301448b319313f111a48349e1b15f 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/Session.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/Session.java
@@ -149,10 +149,10 @@ public final class Session implements AutoCloseable {
 
     /**
      * Use {@code t} instead of the Tensor referred to by executing the operation referred to by
-     * {@code output}.
+     * {@code operand}.
      */
-    public Runner feed(Output<?> o, Tensor<?> t) {
-      inputs.add(o);
+    public Runner feed(Operand<?> operand, Tensor<?> t) {
+      inputs.add(operand.asOutput());
       inputTensors.add(t);
       return this;
     }
diff --git a/tensorflow/java/src/main/java/org/tensorflow/op/PrimitiveOp.java b/tensorflow/java/src/main/java/org/tensorflow/op/PrimitiveOp.java
index 8e56f970416ef35737d6763fcc6bb46bc7a157c5..006ae99dc46265aede6991e2cea99119113de165 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/op/PrimitiveOp.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/op/PrimitiveOp.java
@@ -25,6 +25,11 @@ import org.tensorflow.Operation;
  */
 public abstract class PrimitiveOp implements Op {
 
+  /** Returns the underlying {@link Operation} */
+  public Operation op() {
+    return operation;
+  }
+
   @Override
   public final int hashCode() {
     return operation.hashCode();
@@ -48,10 +53,6 @@ public abstract class PrimitiveOp implements Op {
     return String.format("<%s '%s'>", operation.type(), operation.name());
   }
 
-  /**
-   * Underlying operation. It is deliberately not exposed by a getter method to avoid any name
-   * conflict with generated methods of the subclasses.
-   */
   protected final Operation operation;
 
   /**
diff --git a/tensorflow/java/src/test/java/org/tensorflow/TensorTest.java b/tensorflow/java/src/test/java/org/tensorflow/TensorTest.java
index 1bd00a763ddff2f067183f57cfa80fdcbed84fd2..3229cce2776dd305a67d5936c37db5b1d9626402 100644
--- a/tensorflow/java/src/test/java/org/tensorflow/TensorTest.java
+++ b/tensorflow/java/src/test/java/org/tensorflow/TensorTest.java
@@ -21,6 +21,7 @@ import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
 
+import java.nio.Buffer;
 import java.nio.ByteBuffer;
 import java.nio.ByteOrder;
 import java.nio.DoubleBuffer;
@@ -100,7 +101,7 @@ public class TensorTest {
                     : ByteOrder.LITTLE_ENDIAN)
             .asDoubleBuffer()
             .put(doubles);
-    buf.flip();
+    flipBuffer(buf);
     try (Tensor<Double> t = Tensor.create(new long[] {doubles.length}, buf)) {
       double[] actual = new double[doubles.length];
       assertArrayEquals(doubles, t.copyTo(actual), EPSILON);
@@ -179,30 +180,30 @@ public class TensorTest {
       {
         ByteBuffer bbuf = ByteBuffer.allocate(1024).order(ByteOrder.nativeOrder());
 
-        bbuf.clear(); // FLOAT
+        clearBuffer(bbuf); // FLOAT
         tfloats.writeTo(bbuf);
         assertEquals(tfloats.numBytes(), bbuf.position());
-        bbuf.flip();
+        flipBuffer(bbuf);
         assertEquals(floats[0], bbuf.asFloatBuffer().get(0), EPSILON);
-        bbuf.clear(); // DOUBLE
+        clearBuffer(bbuf); // DOUBLE
         tdoubles.writeTo(bbuf);
         assertEquals(tdoubles.numBytes(), bbuf.position());
-        bbuf.flip();
+        flipBuffer(bbuf);
         assertEquals(doubles[0], bbuf.asDoubleBuffer().get(0), EPSILON);
-        bbuf.clear(); // INT32
+        clearBuffer(bbuf); // INT32
         tints.writeTo(bbuf);
         assertEquals(tints.numBytes(), bbuf.position());
-        bbuf.flip();
+        flipBuffer(bbuf);
         assertEquals(ints[0], bbuf.asIntBuffer().get(0));
-        bbuf.clear(); // INT64
+        clearBuffer(bbuf); // INT64
         tlongs.writeTo(bbuf);
         assertEquals(tlongs.numBytes(), bbuf.position());
-        bbuf.flip();
+        flipBuffer(bbuf);
         assertEquals(longs[0], bbuf.asLongBuffer().get(0));
-        bbuf.clear(); // BOOL
+        clearBuffer(bbuf); // BOOL
         tbools.writeTo(bbuf);
         assertEquals(tbools.numBytes(), bbuf.position());
-        bbuf.flip();
+        flipBuffer(bbuf);
         assertEquals(bools[0], bbuf.get(0) != 0);
       }
 
@@ -254,7 +255,7 @@ public class TensorTest {
                         : ByteOrder.LITTLE_ENDIAN)
                 .asDoubleBuffer();
         tdoubles.writeTo(foreignBuf);
-        foreignBuf.flip();
+        flipBuffer(foreignBuf);
         double[] actual = new double[foreignBuf.remaining()];
         foreignBuf.get(actual);
         assertArrayEquals(doubles, actual, EPSILON);
@@ -547,4 +548,25 @@ public class TensorTest {
       // expected.
     }
   }
+
+  // Workaround for cross compiliation
+  // (e.g., javac -source 1.9 -target 1.8).
+  //
+  // In Java 8 and prior, subclasses of java.nio.Buffer (e.g., java.nio.DoubleBuffer) inherited the
+  // "flip()" and "clear()" methods from java.nio.Buffer resulting in the signature:
+  //   Buffer flip();
+  // In Java 9 these subclasses had their own methods like:
+  //   DoubleBuffer flip();
+  // As a result, compiling for 1.9 source for a target of JDK 1.8 would result in errors at runtime
+  // like:
+  //
+  // java.lang.NoSuchMethodError: java.nio.DoubleBuffer.flip()Ljava/nio/DoubleBuffer
+  private static void flipBuffer(Buffer buf) {
+    buf.flip();
+  }
+
+  // See comment for flipBuffer()
+  private static void clearBuffer(Buffer buf) {
+    buf.clear();
+  }
 }
diff --git a/tensorflow/lite/BUILD b/tensorflow/lite/BUILD
index bb2c53b8c9e4300f67fa8badbdbaaf73532005fe..8fca01624cfa2c21cd428e63ed1eadf7b853f107 100644
--- a/tensorflow/lite/BUILD
+++ b/tensorflow/lite/BUILD
@@ -1,11 +1,12 @@
-package(default_visibility = [
-    "//visibility:public",
-])
+package(
+    default_visibility = ["//visibility:public"],
+)
 
 licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
-load("//tensorflow/lite:build_def.bzl", "tflite_copts", "gen_selected_ops")
+load("//tensorflow/lite:build_def.bzl", "tflite_copts")
+load("//tensorflow:tensorflow.bzl", "if_not_windows")
 
 exports_files(glob([
     "testdata/*.bin",
@@ -35,15 +36,22 @@ config_setting(
     visibility = ["//visibility:public"],
 )
 
+TFLITE_DEFAULT_COPTS = if_not_windows([
+    "-Wall",
+    "-Wno-comment",
+])
+
 cc_library(
     name = "schema_fbs_version",
     hdrs = ["version.h"],
+    copts = TFLITE_DEFAULT_COPTS,
 )
 
 cc_library(
     name = "arena_planner",
     srcs = ["arena_planner.cc"],
     hdrs = ["arena_planner.h"],
+    copts = TFLITE_DEFAULT_COPTS,
     deps = [
         ":graph_info",
         ":memory_planner",
@@ -57,12 +65,10 @@ cc_test(
     size = "small",
     srcs = ["arena_planner_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable",
     ],
     deps = [
         ":arena_planner",
-        "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/lite/testing:util",
         "@com_google_googletest//:gtest",
@@ -74,18 +80,21 @@ cc_test(
 cc_library(
     name = "context",
     hdrs = ["context.h"],
+    copts = TFLITE_DEFAULT_COPTS,
     deps = ["//tensorflow/lite/c:c_api_internal"],
 )
 
 cc_library(
     name = "graph_info",
     hdrs = ["graph_info.h"],
+    copts = TFLITE_DEFAULT_COPTS,
     deps = ["//tensorflow/lite/c:c_api_internal"],
 )
 
 cc_library(
     name = "memory_planner",
     hdrs = ["memory_planner.h"],
+    copts = TFLITE_DEFAULT_COPTS,
     deps = ["//tensorflow/lite/c:c_api_internal"],
 )
 
@@ -93,6 +102,7 @@ cc_library(
     name = "simple_memory_arena",
     srcs = ["simple_memory_arena.cc"],
     hdrs = ["simple_memory_arena.h"],
+    copts = TFLITE_DEFAULT_COPTS,
     deps = ["//tensorflow/lite/c:c_api_internal"],
 )
 
@@ -109,9 +119,9 @@ cc_library(
     hdrs = [
         "builtin_op_data.h",
         "builtin_ops.h",
-        "context.h",
         "context_util.h",
     ],
+    deps = ["//tensorflow/lite/c:c_api_internal"],
 )
 
 exports_files(["builtin_ops.h"])
@@ -121,9 +131,7 @@ cc_library(
     hdrs = [
         "string.h",
     ],
-    deps = [
-        "//tensorflow/core:lib_platform",
-    ],
+    copts = TFLITE_DEFAULT_COPTS,
 )
 
 # TODO(ahentz): investigate dependency on gemm_support requiring usage of tf_copts.
@@ -167,7 +175,7 @@ cc_library(
         "optional_debug_tools.h",
         "stderr_reporter.h",
     ],
-    copts = tflite_copts(),
+    copts = tflite_copts() + TFLITE_DEFAULT_COPTS,
     linkopts = [
     ] + select({
         "//tensorflow:android": [
@@ -185,7 +193,7 @@ cc_library(
         ":string",
         ":util",
         "//tensorflow/lite/c:c_api_internal",
-        "//tensorflow/lite/core/api",
+        "//tensorflow/lite/core/api:api",
         "//tensorflow/lite/kernels:eigen_support",
         "//tensorflow/lite/kernels:gemm_support",
         "//tensorflow/lite/nnapi:nnapi_lib",
@@ -203,10 +211,10 @@ cc_library(
     name = "string_util",
     srcs = ["string_util.cc"],
     hdrs = ["string_util.h"],
-    copts = tflite_copts(),
+    copts = TFLITE_DEFAULT_COPTS,
     deps = [
-        ":framework",
         ":string",
+        "//tensorflow/lite/c:c_api_internal",
     ],
 )
 
@@ -217,6 +225,7 @@ cc_test(
     deps = [
         ":framework",
         ":string_util",
+        "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/testing:util",
         "@com_google_googletest//:gtest",
     ],
@@ -246,10 +255,8 @@ cc_test(
     name = "graph_info_test",
     size = "small",
     srcs = ["graph_info_test.cc"],
-    tags = ["no_oss"],
     deps = [
         ":framework",
-        ":string_util",
         "//tensorflow/lite/testing:util",
         "@com_google_googletest//:gtest",
     ],
@@ -298,7 +305,12 @@ tf_cc_test(
     data = [
         "testdata/multi_add_flex.bin",
     ],
-    tags = ["no_windows"],  # TODO(b/116667551): No weak symbols with MSVC.
+    tags = [
+        "no_gpu",  # GPU + flex is not officially supported.
+        "no_windows",  # TODO(b/116667551): No weak symbols with MSVC.
+        "tflite_not_portable_android",
+        "tflite_not_portable_ios",
+    ],
     deps = [
         ":framework",
         "//tensorflow/lite/core/api",
@@ -314,7 +326,6 @@ cc_test(
     name = "mutable_op_resolver_test",
     size = "small",
     srcs = ["mutable_op_resolver_test.cc"],
-    tags = ["no_oss"],
     deps = [
         ":framework",
         "//tensorflow/lite/testing:util",
@@ -326,7 +337,7 @@ cc_library(
     name = "util",
     srcs = ["util.cc"],
     hdrs = ["util.h"],
-    copts = tflite_copts(),
+    copts = TFLITE_DEFAULT_COPTS + tflite_copts(),
     deps = [
         "//tensorflow/lite/c:c_api_internal",
     ],
@@ -336,27 +347,9 @@ cc_test(
     name = "util_test",
     size = "small",
     srcs = ["util_test.cc"],
-    tags = ["no_oss"],
     deps = [
         ":util",
-        "//tensorflow/lite/testing:util",
+        "//tensorflow/lite/c:c_api_internal",
         "@com_google_googletest//:gtest",
     ],
 )
-
-# Test the serialization of a model with optional tensors.
-
-# Model tests
-
-#cc_library(
-#    name = "models_test_utils",
-#    testonly = 1,
-#    hdrs = ["models/test_utils.h"],
-#    deps = select({
-#        "//tensorflow:android": [],
-#        "//conditions:default": [
-#            "@com_google_absl//absl/strings",
-#            "//tensorflow/core:test",
-#        ],
-#    }),
-#)
diff --git a/tensorflow/lite/build_def.bzl b/tensorflow/lite/build_def.bzl
index 33cee1ab201cfbfaba01c64f3fa744a66baa8d83..c17eddf47bc86c9537364117c302df38e390c8da 100644
--- a/tensorflow/lite/build_def.bzl
+++ b/tensorflow/lite/build_def.bzl
@@ -112,7 +112,8 @@ def tflite_jni_binary(
         linkshared = 1,
         linkstatic = 1,
         testonly = 0,
-        deps = []):
+        deps = [],
+        srcs = []):
     """Builds a jni binary for TFLite."""
     linkopts = linkopts + [
         "-Wl,--version-script",  # Export only jni functions & classes.
@@ -124,6 +125,7 @@ def tflite_jni_binary(
         linkshared = linkshared,
         linkstatic = linkstatic,
         deps = deps + [linkscript],
+        srcs = srcs,
         linkopts = linkopts,
         testonly = testonly,
     )
@@ -237,13 +239,14 @@ def generated_test_models():
         "equal",
         "exp",
         "expand_dims",
+        "fill",
         "floor",
         "floor_div",
         "floor_mod",
         "fully_connected",
         "fused_batch_norm",
         "gather",
-        "gather_buggy",
+        "gather_with_constant",
         "global_batch_norm",
         "greater",
         "greater_equal",
@@ -264,6 +267,7 @@ def generated_test_models():
         "maximum",
         "mean",
         "minimum",
+        "mirror_pad",
         "mul",
         "neg",
         "not_equal",
@@ -455,6 +459,7 @@ def gen_model_coverage_test(model_name, data, failure_type, tags):
         native.py_test(
             name = "model_coverage_test_%s_%s" % (model_name, target_op_sets.lower().replace(",", "_")),
             srcs = ["model_coverage_test.py"],
+            size = "large",
             main = "model_coverage_test.py",
             args = [
                 "--model_name=%s" % model_name,
@@ -465,7 +470,6 @@ def gen_model_coverage_test(model_name, data, failure_type, tags):
             tags = [
                 "no_oss",
                 "no_windows",
-                "notap",
             ] + tags,
             deps = [
                 "//tensorflow/lite/testing/model_coverage:model_coverage_lib",
diff --git a/tensorflow/lite/c/c_api_internal.c b/tensorflow/lite/c/c_api_internal.c
index 598d74be8468bc4b3c4d9e7aa086ed1745c297da..2923dbad4ef285c497ca2c84d86168954fe8ec99 100644
--- a/tensorflow/lite/c/c_api_internal.c
+++ b/tensorflow/lite/c/c_api_internal.c
@@ -59,7 +59,7 @@ void TfLiteIntArrayPrint(const char* s, TfLiteIntArray* a) {
   printf("]\n");
 }
 
-TfLiteIntArray* TfLiteIntArrayCopy(TfLiteIntArray* src) {
+TfLiteIntArray* TfLiteIntArrayCopy(const TfLiteIntArray* src) {
   if (!src) return NULL;
   TfLiteIntArray* ret = TfLiteIntArrayCreate(src->size);
   if (ret) {
diff --git a/tensorflow/lite/c/c_api_internal.h b/tensorflow/lite/c/c_api_internal.h
index 6280bf825d389dd405e3071c3285a7427df7ff61..1cd84eff5c436abb781c74d1ac287b709558133f 100644
--- a/tensorflow/lite/c/c_api_internal.h
+++ b/tensorflow/lite/c/c_api_internal.h
@@ -96,7 +96,7 @@ int TfLiteIntArrayEqualsArray(TfLiteIntArray* a, int b_size, int b_data[]);
 
 // Create a copy of an array passed as `src`.
 // You are expected to free memory with TfLiteIntArrayFree
-TfLiteIntArray* TfLiteIntArrayCopy(TfLiteIntArray* src);
+TfLiteIntArray* TfLiteIntArrayCopy(const TfLiteIntArray* src);
 
 // Free memory of array `v`.
 void TfLiteIntArrayFree(TfLiteIntArray* v);
@@ -496,12 +496,12 @@ typedef struct _TfLiteDelegate {
                                        TfLiteBufferHandle buffer_handle,
                                        TfLiteTensor* tensor);
 
-  // Copy the data from raw memory to delegate buffer handle.
-  // This can be null if the delegate doesn't use its own buffer.
+  // Copy the data from raw memory of the given 'tensor' to delegate buffer
+  // handle. This can be null if the delegate doesn't use its own buffer.
   TfLiteStatus (*CopyToBufferHandle)(TfLiteContext* context,
                                      TfLiteDelegate* delegate,
                                      TfLiteBufferHandle buffer_handle,
-                                     void* data, size_t size);
+                                     TfLiteTensor* tensor);
 
   // Free the Delegate Buffer Handle. Note: This only frees the handle, but
   // this doesn't release the underlying resource (e.g. textures). The
diff --git a/tensorflow/lite/core/subgraph.h b/tensorflow/lite/core/subgraph.h
index d8542f951307e65eb006157bee579eca112c0902..2a7c3a7c322e55500d9edb7d7c1b9763e9a76e88 100644
--- a/tensorflow/lite/core/subgraph.h
+++ b/tensorflow/lite/core/subgraph.h
@@ -219,8 +219,9 @@ class Subgraph {
       TF_LITE_ENSURE(context_, t->delegate != nullptr);
       TF_LITE_ENSURE(context_, t->buffer_handle != kTfLiteNullBufferHandle);
       TF_LITE_ENSURE(context_, t->delegate->CopyFromBufferHandle != nullptr);
-      TF_LITE_ENSURE(context_, t->delegate->CopyFromBufferHandle(
-                                   context_, t->delegate, t->buffer_handle, t));
+      // TODO(b/120420546): we must add a test that exercise this code.
+      TF_LITE_ENSURE_STATUS(t->delegate->CopyFromBufferHandle(
+          context_, t->delegate, t->buffer_handle, t));
       t->data_is_stale = false;
     }
     return kTfLiteOk;
diff --git a/tensorflow/lite/delegates/flex/BUILD b/tensorflow/lite/delegates/flex/BUILD
index 63e86899da6d2e884dee3663f5cc4346104ce8ed..75083bf95a126fe7a8d1ca92af2cfa0c5a85f371 100644
--- a/tensorflow/lite/delegates/flex/BUILD
+++ b/tensorflow/lite/delegates/flex/BUILD
@@ -83,8 +83,10 @@ cc_library(
         ":delegate_data",
         ":kernel",
         ":util",
+        "@com_google_absl//absl/strings:strings",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite:kernel_api",
+        "//tensorflow/lite:string_util",
         "//tensorflow/lite:util",
     ] + select({
         "//tensorflow:android": [
diff --git a/tensorflow/lite/delegates/flex/buffer_map.cc b/tensorflow/lite/delegates/flex/buffer_map.cc
index 9a6c5e74a7b8d71a04c20bbcb969cfe0b0ce3478..0d0c953636672e33130a991b1a302f410e42f381 100644
--- a/tensorflow/lite/delegates/flex/buffer_map.cc
+++ b/tensorflow/lite/delegates/flex/buffer_map.cc
@@ -26,6 +26,8 @@ namespace flex {
 namespace {
 // A tensor buffer that is allocated, deallocated and populated by TF Lite.
 class BaseTfLiteTensorBuffer : public tensorflow::TensorBuffer {
+  using tensorflow::TensorBuffer::TensorBuffer;
+
   TensorBuffer* root_buffer() override { return this; }
   void FillAllocationDescription(
       tensorflow::AllocationDescription* proto) const override {
@@ -60,31 +62,29 @@ class BaseTfLiteTensorBuffer : public tensorflow::TensorBuffer {
 // representation in TFLITE and TF, so we just need use memcpy().
 class TfLiteTensorBuffer : public BaseTfLiteTensorBuffer {
  public:
-  explicit TfLiteTensorBuffer(const TfLiteTensor* tensor) {
+  explicit TfLiteTensorBuffer(const TfLiteTensor* tensor)
+      : BaseTfLiteTensorBuffer(tensorflow::cpu_allocator()->AllocateRaw(
+            EIGEN_MAX_ALIGN_BYTES, tensor->bytes)) {
     // TODO(ahentz): if we can guarantee that TF Lite allocated tensors with
     // the same alignment as TensorFlow (EIGEN_MAX_ALIGN_BYTES), then we can
     // potentially eliminate the copy below.
     len_ = tensor->bytes;
-    data_ =
-        tensorflow::cpu_allocator()->AllocateRaw(EIGEN_MAX_ALIGN_BYTES, len_);
 
     LogAllocation();
 
-    if (data_) {
-      std::memcpy(data_, tensor->data.raw, tensor->bytes);
+    if (data()) {
+      std::memcpy(data(), tensor->data.raw, tensor->bytes);
     }
   }
 
   ~TfLiteTensorBuffer() override {
     LogDeallocation();
-    tensorflow::cpu_allocator()->DeallocateRaw(data_);
+    tensorflow::cpu_allocator()->DeallocateRaw(data());
   }
 
-  void* data() const override { return data_; }
   size_t size() const override { return len_; }
 
  private:
-  void* data_;
   size_t len_;
 };
 
@@ -92,14 +92,30 @@ class TfLiteTensorBuffer : public BaseTfLiteTensorBuffer {
 // TF's so we need perform the conversion here.
 class StringTfLiteTensorBuffer : public BaseTfLiteTensorBuffer {
  public:
-  explicit StringTfLiteTensorBuffer(const TfLiteTensor* tensor) {
-    num_strings_ = GetStringCount(tensor->data.raw);
-    data_ = tensorflow::cpu_allocator()->Allocate<string>(num_strings_);
+  explicit StringTfLiteTensorBuffer(const TfLiteTensor* tensor)
+      : StringTfLiteTensorBuffer(tensor, tensor->data.raw != nullptr
+                                             ? GetStringCount(tensor->data.raw)
+                                             : 0) {}
+
+  ~StringTfLiteTensorBuffer() override {
+    LogDeallocation();
+    tensorflow::cpu_allocator()->Deallocate<string>(
+        static_cast<string*>(data()), num_strings_);
+  }
+
+  size_t size() const override { return num_strings_ * sizeof(string); }
 
+ private:
+  StringTfLiteTensorBuffer(const TfLiteTensor* tensor, int num_strings)
+      : BaseTfLiteTensorBuffer(
+            num_strings != 0
+                ? tensorflow::cpu_allocator()->Allocate<string>(num_strings)
+                : nullptr),
+        num_strings_(num_strings) {
     LogAllocation();
 
-    if (data_) {
-      string* p = data_;
+    if (data()) {
+      string* p = static_cast<string*>(data());
       for (size_t i = 0; i < num_strings_; ++p, ++i) {
         auto ref = GetString(tensor->data.raw, i);
         p->assign(ref.str, ref.len);
@@ -107,16 +123,6 @@ class StringTfLiteTensorBuffer : public BaseTfLiteTensorBuffer {
     }
   }
 
-  ~StringTfLiteTensorBuffer() override {
-    LogDeallocation();
-    tensorflow::cpu_allocator()->Deallocate<string>(data_, num_strings_);
-  }
-
-  void* data() const override { return data_; }
-  size_t size() const override { return num_strings_ * sizeof(string); }
-
- private:
-  string* data_;
   int num_strings_;
 };
 
diff --git a/tensorflow/lite/delegates/flex/delegate.cc b/tensorflow/lite/delegates/flex/delegate.cc
index 12dcaa11d335ea8b1fd902f59259eefa6ec27686..ca7314fbaee6644cf9385a1d7b0b2964d6a2762f 100644
--- a/tensorflow/lite/delegates/flex/delegate.cc
+++ b/tensorflow/lite/delegates/flex/delegate.cc
@@ -16,12 +16,14 @@ limitations under the License.
 
 #include <vector>
 
+#include "absl/strings/str_cat.h"
+#include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/lite/context_util.h"
 #include "tensorflow/lite/delegates/flex/buffer_map.h"
 #include "tensorflow/lite/delegates/flex/kernel.h"
 #include "tensorflow/lite/delegates/flex/util.h"
+#include "tensorflow/lite/string_util.h"
 #include "tensorflow/lite/util.h"
-#include "tensorflow/core/lib/core/status.h"
 
 namespace tflite {
 namespace flex {
@@ -68,11 +70,34 @@ TfLiteStatus CopyFromBufferHandle(TfLiteContext* context,
   }
 
   tensorflow::Tensor t = buffer_map->GetTensor(buffer_handle);
+
+  if (output->type == kTfLiteString) {
+    if (t.dtype() != tensorflow::DT_STRING) {
+      context->ReportError(context,
+                           "Inconsistent type for TF string tensor index %d.",
+                           buffer_handle);
+      return kTfLiteError;
+    }
+    DynamicBuffer dynamic_buffer;
+
+    auto tf_data = t.flat<string>();
+    for (int i = 0; i < t.NumElements(); ++i) {
+      dynamic_buffer.AddString(tf_data(i).data(), tf_data(i).size());
+    }
+
+    dynamic_buffer.WriteToTensor(output, /*new_shape=*/nullptr);
+    return kTfLiteOk;
+  }
+
   tensorflow::StringPiece t_data = t.tensor_data();
 
   if (output->bytes != t_data.size()) {
-    context->ReportError(
-        context, "Not enough space to store TensorFlow's aligned buffer.");
+    context->ReportError(context,
+                         absl::StrCat("The given ", output->bytes,
+                                      " bytes are not enough to store "
+                                      "TensorFlow's aligned buffer of size ",
+                                      t_data.size(), " bytes.")
+                             .c_str());
     return kTfLiteError;
   }
 
diff --git a/tensorflow/lite/delegates/flex/delegate_test.cc b/tensorflow/lite/delegates/flex/delegate_test.cc
index e13029d9a514e7207c69a530713d2dcb6ec11ad5..ee37090d94eaadca2a767a0ea9a2ad105618da97 100644
--- a/tensorflow/lite/delegates/flex/delegate_test.cc
+++ b/tensorflow/lite/delegates/flex/delegate_test.cc
@@ -22,7 +22,6 @@ namespace tflite {
 namespace flex {
 namespace {
 
-using ::testing::ContainsRegex;
 using ::testing::ElementsAre;
 
 class DelegateTest : public testing::FlexModelTest {
@@ -93,6 +92,25 @@ TEST_F(DelegateTest, NonFloatTypeInference) {
   ASSERT_EQ(GetType(2), kTfLiteInt32);
 }
 
+TEST_F(DelegateTest, StringInference) {
+  AddTensors(3, {0, 1}, {2}, kTfLiteString, {2});
+
+  AddTfOp(testing::kAdd, {0, 1}, {2});
+
+  ConfigureDelegate();
+
+  SetShape(0, {2, 2});
+  SetStringValues(0, {"1", "2", "3", "4"});
+  SetShape(1, {2, 2});
+  SetStringValues(1, {"4", "3", "2", "1"});
+
+  ASSERT_TRUE(Invoke());
+
+  ASSERT_THAT(GetShape(2), ElementsAre(2, 2));
+  ASSERT_THAT(GetStringValues(2), ElementsAre("14", "23", "32", "41"));
+  ASSERT_EQ(GetType(2), kTfLiteString);
+}
+
 TEST_F(DelegateTest, MixedGraph) {
   AddTensors(9, {0, 3}, {8}, kTfLiteFloat32, {3});
 
diff --git a/tensorflow/lite/delegates/flex/test_util.cc b/tensorflow/lite/delegates/flex/test_util.cc
index 08feb349e6dbf15dc908c7c4d4fd5694814c8594..aa24675a7b1beab8632435debc8dd1fc04f347e7 100644
--- a/tensorflow/lite/delegates/flex/test_util.cc
+++ b/tensorflow/lite/delegates/flex/test_util.cc
@@ -25,6 +25,29 @@ namespace testing {
 
 bool FlexModelTest::Invoke() { return interpreter_->Invoke() == kTfLiteOk; }
 
+void FlexModelTest::SetStringValues(int tensor_index,
+                                    const std::vector<string>& values) {
+  DynamicBuffer dynamic_buffer;
+  for (const string& s : values) {
+    dynamic_buffer.AddString(s.data(), s.size());
+  }
+  dynamic_buffer.WriteToTensor(interpreter_->tensor(tensor_index),
+                               /*new_shape=*/nullptr);
+}
+
+std::vector<string> FlexModelTest::GetStringValues(int tensor_index) const {
+  std::vector<string> result;
+
+  TfLiteTensor* tensor = interpreter_->tensor(tensor_index);
+  auto num_strings = GetStringCount(tensor->data.raw);
+  for (size_t i = 0; i < num_strings; ++i) {
+    auto ref = GetString(tensor->data.raw, i);
+    result.push_back(string(ref.str, ref.len));
+  }
+
+  return result;
+}
+
 void FlexModelTest::SetShape(int tensor_index, const std::vector<int>& values) {
   ASSERT_EQ(interpreter_->ResizeInputTensor(tensor_index, values), kTfLiteOk);
   ASSERT_EQ(interpreter_->AllocateTensors(), kTfLiteOk);
@@ -95,12 +118,22 @@ void FlexModelTest::AddTfOp(TfOpType op, const std::vector<int>& inputs,
     return " attr{ key: '" + key + "' value {" + value + "}}";
   };
 
-  // Crude type attribution, will need fleshing out as more tests are added.
-  // TODO(b/113613439): Use nodedef string utilities to properly handle
-  // all types.
-  string type_attribute = attr("T", "type: DT_FLOAT");
-  if (interpreter_->tensor(inputs[0])->type == kTfLiteInt32) {
-    type_attribute = attr("T", "type: DT_INT32");
+  string type_attribute;
+  switch (interpreter_->tensor(inputs[0])->type) {
+    case kTfLiteInt32:
+      type_attribute = attr("T", "type: DT_INT32");
+      break;
+    case kTfLiteFloat32:
+      type_attribute = attr("T", "type: DT_FLOAT");
+      break;
+    case kTfLiteString:
+      type_attribute = attr("T", "type: DT_STRING");
+      break;
+    default:
+      // TODO(b/113613439): Use nodedef string utilities to properly handle all
+      // types.
+      LOG(FATAL) << "Type not supported";
+      break;
   }
 
   if (op == kUnpack) {
diff --git a/tensorflow/lite/delegates/flex/test_util.h b/tensorflow/lite/delegates/flex/test_util.h
index 4d3f5ad0968ad34ef9ee673ffacd7d9b2c83cb7f..2cc2dc30e92586535687187105057d41ab5c0350 100644
--- a/tensorflow/lite/delegates/flex/test_util.h
+++ b/tensorflow/lite/delegates/flex/test_util.h
@@ -63,11 +63,13 @@ class FlexModelTest : public ::testing::Test {
   void SetValues(int tensor_index, const std::vector<float>& values) {
     SetTypedValues<float>(tensor_index, values);
   }
+  void SetStringValues(int tensor_index, const std::vector<string>& values);
 
   // Returns the tensor's values at the given index.
   std::vector<float> GetValues(int tensor_index) {
     return GetTypedValues<float>(tensor_index);
   }
+  std::vector<string> GetStringValues(int tensor_index) const;
 
   // Sets the tensor's shape at the given index.
   void SetShape(int tensor_index, const std::vector<int>& values);
diff --git a/tensorflow/lite/delegates/nnapi/BUILD b/tensorflow/lite/delegates/nnapi/BUILD
index c24f0f71ac4edde456fc67a926ef120da6a50931..fd954ba222627ab0457711b87baf9c3f7573e129 100644
--- a/tensorflow/lite/delegates/nnapi/BUILD
+++ b/tensorflow/lite/delegates/nnapi/BUILD
@@ -23,10 +23,7 @@ tf_cc_test(
     name = "nnapi_delegate_test",
     size = "small",
     srcs = ["nnapi_delegate_test.cc"],
-    tags = [
-        "no_oss",
-        "noasan",  # TODO(b/112326936): re-enable for asan once fixed.
-    ],
+    tags = ["no_oss"],
     deps = [
         ":nnapi_delegate",
         "//tensorflow/lite:framework",
diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc
index 4fe07004a82ff30228d866bcc7a90067e5940aca..7908bbf1641fcf07408b9380fb1587768d9f233c 100644
--- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc
+++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc
@@ -37,11 +37,15 @@ namespace {
 
 // TODO(b/80621585): Consider printing error string, but don't for now to
 // minimize binary size.
-#define CHECK_NN(context, code)                                           \
-  if (code != ANEURALNETWORKS_NO_ERROR) {                                 \
-    context->ReportError(context, "NN API returned error (%d).\n", code); \
-    return kTfLiteError;                                                  \
-  }
+#define CHECK_NN(context, code)                                               \
+  do {                                                                        \
+    const auto _code = (code);                                                \
+    if (_code != ANEURALNETWORKS_NO_ERROR) {                                  \
+      context->ReportError(context, "NN API returned error (%d, line %d).\n", \
+                           _code, __LINE__);                                  \
+      return kTfLiteError;                                                    \
+    }                                                                         \
+  } while (0)
 
 namespace {
 int32_t GetAndroidSdkVersion() {
@@ -349,19 +353,18 @@ class NNAPIOpBuilder {
     return kTfLiteOk;
   }
 
-  // TfLiteContext for error handling. Must be named context for macros to
-  // work.
-  TfLiteContext* context_;
+  // TfLiteContext for error handling.
+  TfLiteContext* const context_;
 
-  // Tracks relationship between indices
+  // Tracks relationship between indices.
   OperandMapping* operand_mapping_;
 
-  // The model
-  ANeuralNetworksModel* nn_model_;
+  // The NNAPI model.
+  ANeuralNetworksModel* const nn_model_;
 
   // Inputs and outputs for the current op. These are augmented in the sense
   // that NN API uses operands for all arguments, not just tensors, unlike
-  // TensorFlow lite.
+  // TensorFlow Lite.
   std::vector<uint32_t> augmented_inputs_;
   std::vector<uint32_t> augmented_outputs_;
 };
@@ -374,6 +377,14 @@ struct NNAPIOpMappingArgs {
   std::vector<int>* model_state_tfl_inputs;
 };
 
+// Mapping function simply returning the operation type without adding any
+// additional parameter.
+template <ANeuralNetworksOperationType OperationType>
+ANeuralNetworksOperationType BasicMappingFn(
+    const NNAPIOpMappingArgs& mapping_args) {
+  return OperationType;
+}
+
 // The kernel that represents the node sub set of TF Lite being run on NN API.
 class NNAPIDelegateKernel {
  public:
@@ -385,8 +396,8 @@ class NNAPIDelegateKernel {
   // Return a function that knows how to translate a node into its operands
   // when called. You can use this function to see if a node is supported
   // (i.e. that MappingFn is not nullptr).
-  MappingFn Map(TfLiteContext* context, int builtin_code, int version,
-                TfLiteNode* node) {
+  static MappingFn Map(TfLiteContext* context, int builtin_code, int version,
+                       TfLiteNode* node) {
     switch (builtin_code) {
       case kTfLiteBuiltinAdd:
         if (version == 1) {
@@ -397,8 +408,6 @@ class NNAPIDelegateKernel {
             mapping_args.builder->AddScalarInt32Operand(builtin->activation);
             return ANEURALNETWORKS_ADD;
           };
-        } else {
-          return nullptr;
         }
         break;
       case kTfLiteBuiltinMul:
@@ -410,8 +419,6 @@ class NNAPIDelegateKernel {
             mapping_args.builder->AddScalarInt32Operand(builtin->activation);
             return ANEURALNETWORKS_MUL;
           };
-        } else {
-          return nullptr;
         }
         break;
       case kTfLiteBuiltinAveragePool2d:
@@ -422,8 +429,6 @@ class NNAPIDelegateKernel {
                 mapping_args.node->builtin_data);
             return ANEURALNETWORKS_AVERAGE_POOL_2D;
           };
-        } else {
-          return nullptr;
         }
         break;
       case kTfLiteBuiltinMaxPool2d:
@@ -434,8 +439,6 @@ class NNAPIDelegateKernel {
                 mapping_args.node->builtin_data);
             return ANEURALNETWORKS_MAX_POOL_2D;
           };
-        } else {
-          return nullptr;
         }
         break;
       case kTfLiteBuiltinL2Pool2d:
@@ -446,8 +449,6 @@ class NNAPIDelegateKernel {
                 mapping_args.node->builtin_data);
             return ANEURALNETWORKS_L2_POOL_2D;
           };
-        } else {
-          return nullptr;
         }
         break;
       case kTfLiteBuiltinConv2d:
@@ -469,8 +470,6 @@ class NNAPIDelegateKernel {
             mapping_args.builder->AddScalarInt32Operand(builtin->activation);
             return ANEURALNETWORKS_CONV_2D;
           };
-        } else {
-          return nullptr;
         }
         break;
       case kTfLiteBuiltinDepthwiseConv2d:
@@ -487,8 +486,6 @@ class NNAPIDelegateKernel {
             mapping_args.builder->AddScalarInt32Operand(builtin->activation);
             return ANEURALNETWORKS_DEPTHWISE_CONV_2D;
           };
-        } else {
-          return nullptr;
         }
         break;
       case kTfLiteBuiltinFullyConnected:
@@ -500,8 +497,6 @@ class NNAPIDelegateKernel {
             mapping_args.builder->AddScalarInt32Operand(builtin->activation);
             return ANEURALNETWORKS_FULLY_CONNECTED;
           };
-        } else {
-          return nullptr;
         }
         break;
       case kTfLiteBuiltinSoftmax:
@@ -513,18 +508,11 @@ class NNAPIDelegateKernel {
             mapping_args.builder->AddScalarFloat32Operand(builtin->beta);
             return ANEURALNETWORKS_SOFTMAX;
           };
-        } else {
-          return nullptr;
         }
         break;
       case kTfLiteBuiltinReshape:
         if (version == 1 && node->inputs->size == 2) {
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            return ANEURALNETWORKS_RESHAPE;
-          };
-        } else {
-          return nullptr;
+          return BasicMappingFn<ANEURALNETWORKS_RESHAPE>;
         }
         break;
       case kTfLiteBuiltinSqueeze:
@@ -540,20 +528,15 @@ class NNAPIDelegateKernel {
                 static_cast<uint32_t>(builtin->num_squeeze_dims));
             return ANEURALNETWORKS_SQUEEZE;
           };
-        } else {
-          return nullptr;
         }
+        break;
       case kTfLiteBuiltinL2Normalization: {
         auto builtin =
             reinterpret_cast<TfLiteL2NormParams*>(node->builtin_data);
-        if (builtin->activation != kTfLiteActNone) {
-          // NNAPI does not support activations
-          return nullptr;
+        if (builtin->activation == kTfLiteActNone) {
+          return BasicMappingFn<ANEURALNETWORKS_L2_NORMALIZATION>;
         }
-        return [](const NNAPIOpMappingArgs& mapping_args)
-                   -> ANeuralNetworksOperationType {
-          return ANEURALNETWORKS_L2_NORMALIZATION;
-        };
+        break;
       }
       case kTfLiteBuiltinLocalResponseNormalization:
         if (version == 1) {
@@ -567,10 +550,6 @@ class NNAPIDelegateKernel {
             mapping_args.builder->AddScalarFloat32Operand(builtin->beta);
             return ANEURALNETWORKS_LOCAL_RESPONSE_NORMALIZATION;
           };
-        } else {
-          // TODO(miaowang): clean-up code and return early in the unsupported
-          // case.
-          return nullptr;
         }
         break;
       case kTfLiteBuiltinLshProjection:
@@ -587,8 +566,6 @@ class NNAPIDelegateKernel {
             mapping_args.builder->AddScalarInt32Operand(builtin->type);
             return ANEURALNETWORKS_LSH_PROJECTION;
           };
-        } else {
-          return nullptr;
         }
         break;
       case kTfLiteBuiltinConcatenation:
@@ -599,7 +576,7 @@ class NNAPIDelegateKernel {
             // NNAPI only support concatenating quantized tensor of the same
             // scale and offset.
             auto first_param = context->tensors[node->inputs->data[0]].params;
-            for (int i = 0; i < node->inputs->size; i++) {
+            for (int i = 1; i < node->inputs->size; i++) {
               auto curr_param = context->tensors[node->inputs->data[i]].params;
               if (curr_param.scale != first_param.scale ||
                   curr_param.zero_point != first_param.zero_point) {
@@ -614,68 +591,36 @@ class NNAPIDelegateKernel {
             mapping_args.builder->AddScalarInt32Operand(builtin->axis);
             return ANEURALNETWORKS_CONCATENATION;
           };
-        } else {
-          return nullptr;
         }
         break;
       case kTfLiteBuiltinDequantize:
         if (version == 1) {
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            return ANEURALNETWORKS_DEQUANTIZE;
-          };
-        } else {
-          return nullptr;
+          return BasicMappingFn<ANEURALNETWORKS_DEQUANTIZE>;
         }
         break;
       case kTfLiteBuiltinFloor:
         if (version == 1) {
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            return ANEURALNETWORKS_FLOOR;
-          };
-        } else {
-          return nullptr;
+          return BasicMappingFn<ANEURALNETWORKS_FLOOR>;
         }
         break;
       case kTfLiteBuiltinRelu:
         if (version == 1) {
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            return ANEURALNETWORKS_RELU;
-          };
-        } else {
-          return nullptr;
+          return BasicMappingFn<ANEURALNETWORKS_RELU>;
         }
         break;
       case kTfLiteBuiltinReluN1To1:
         if (version == 1) {
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            return ANEURALNETWORKS_RELU1;
-          };
-        } else {
-          return nullptr;
+          return BasicMappingFn<ANEURALNETWORKS_RELU1>;
         }
         break;
       case kTfLiteBuiltinRelu6:
         if (version == 1) {
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            return ANEURALNETWORKS_RELU6;
-          };
-        } else {
-          return nullptr;
+          return BasicMappingFn<ANEURALNETWORKS_RELU6>;
         }
         break;
       case kTfLiteBuiltinLogistic:
         if (version == 1) {
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            return ANEURALNETWORKS_LOGISTIC;
-          };
-        } else {
-          return nullptr;
+          return BasicMappingFn<ANEURALNETWORKS_LOGISTIC>;
         }
         break;
       case kTfLiteBuiltinTanh:
@@ -683,12 +628,7 @@ class NNAPIDelegateKernel {
         if (version == 1 &&
             context->tensors[node->inputs->data[0]].type == kTfLiteFloat32) {
           // NNAPI only support float tanh.
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            return ANEURALNETWORKS_TANH;
-          };
-        } else {
-          return nullptr;
+          return BasicMappingFn<ANEURALNETWORKS_TANH>;
         }
         break;
       case kTfLiteBuiltinSub:
@@ -702,8 +642,6 @@ class NNAPIDelegateKernel {
             mapping_args.builder->AddScalarInt32Operand(builtin->activation);
             return ANEURALNETWORKS_SUB;
           };
-        } else {
-          return nullptr;
         }
         break;
       case kTfLiteBuiltinDiv:
@@ -717,8 +655,6 @@ class NNAPIDelegateKernel {
             mapping_args.builder->AddScalarInt32Operand(builtin->activation);
             return ANEURALNETWORKS_DIV;
           };
-        } else {
-          return nullptr;
         }
         break;
       case kTfLiteBuiltinPad:
@@ -728,22 +664,12 @@ class NNAPIDelegateKernel {
           // NNAPI does not support specifying the padding value.
           // NNAPI pads physical zero for quantized tensors, so only delegate
           // float pad to NNAPI.
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            return ANEURALNETWORKS_PAD;
-          };
-        } else {
-          return nullptr;
+          return BasicMappingFn<ANEURALNETWORKS_PAD>;
         }
         break;
       case kTfLiteBuiltinSpaceToBatchNd:
         if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11) {
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            return ANEURALNETWORKS_SPACE_TO_BATCH_ND;
-          };
-        } else {
-          return nullptr;
+          return BasicMappingFn<ANEURALNETWORKS_SPACE_TO_BATCH_ND>;
         }
         break;
       case kTfLiteBuiltinStridedSlice:
@@ -758,8 +684,6 @@ class NNAPIDelegateKernel {
                 builtin->shrink_axis_mask);
             return ANEURALNETWORKS_STRIDED_SLICE;
           };
-        } else {
-          return nullptr;
         }
         break;
       case kTfLiteBuiltinTranspose:
@@ -771,12 +695,7 @@ class NNAPIDelegateKernel {
             (node->inputs->size > 1) &&
             (context->tensors[node->inputs->data[1]].allocation_type ==
              kTfLiteMmapRo)) {
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            return ANEURALNETWORKS_TRANSPOSE;
-          };
-        } else {
-          return nullptr;
+          return BasicMappingFn<ANEURALNETWORKS_TRANSPOSE>;
         }
         break;
       case kTfLiteBuiltinRnn:
@@ -799,8 +718,6 @@ class NNAPIDelegateKernel {
             mapping_args.builder->AddScalarInt32Operand(builtin->activation);
             return ANEURALNETWORKS_RNN;
           };
-        } else {
-          return nullptr;
         }
         break;
       case kTfLiteBuiltinSvdf:
@@ -827,8 +744,6 @@ class NNAPIDelegateKernel {
             mapping_args.builder->AddScalarInt32Operand(builtin->activation);
             return ANEURALNETWORKS_SVDF;
           };
-        } else {
-          return nullptr;
         }
         break;
       case kTfLiteBuiltinLstm:
@@ -870,8 +785,6 @@ class NNAPIDelegateKernel {
 
             return ANEURALNETWORKS_LSTM;
           };
-        } else {
-          return nullptr;
         }
         break;
       case kTfLiteBuiltinMean:
@@ -888,36 +801,27 @@ class NNAPIDelegateKernel {
             mapping_args.builder->AddScalarInt32Operand(keep_dims);
             return ANEURALNETWORKS_MEAN;
           };
-        } else {
-          return nullptr;
         }
+        break;
       case kTfLiteBuiltinEmbeddingLookup:
         // NNAPI only support float32 values.
         if (version == 1 &&
             context->tensors[node->inputs->data[1]].type == kTfLiteFloat32) {
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            return ANEURALNETWORKS_EMBEDDING_LOOKUP;
-          };
-        } else {
-          return nullptr;
+          return BasicMappingFn<ANEURALNETWORKS_EMBEDDING_LOOKUP>;
         }
         break;
       case kTfLiteBuiltinHashtableLookup:
         // NNAPI only support float32 output.
         if (version == 1 &&
             context->tensors[node->outputs->data[0]].type == kTfLiteFloat32) {
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            return ANEURALNETWORKS_HASHTABLE_LOOKUP;
-          };
-        } else {
-          return nullptr;
+          return BasicMappingFn<ANEURALNETWORKS_HASHTABLE_LOOKUP>;
         }
         break;
       default:
+        // All other operators are not mapped.
         return nullptr;
     }
+    return nullptr;
   }
 
   // Initialize the kernel (a NN model).
@@ -1090,7 +994,7 @@ class NNAPIDelegateKernel {
     outputs.reserve(output_tensors->size);
 
     size_t total_input_byte_size = 0;
-    // Make the TensorFlow lite inputs and outputs to ann_indices.
+    // Make the TensorFlow Lite inputs and outputs to ann_indices.
     for (int i : TfLiteIntArrayView(input_tensors)) {
       // Constant tensors are not NNAPI inputs.
       if (i != kOptionalTensor &&
@@ -1149,12 +1053,14 @@ TfLiteDelegate* NnApiDelegate() {
           return kTfLiteOk;
         }
 
+        // Allocate one element in vector already since TensorFlow Lite uses
+        // the first value as the number of nodes. The actual value will be set
+        // later, after the vector has been filled.
         std::vector<int> supported_nodes(1);
         // We don't care about all nodes_, we only care about ones in the
         // current plan.
         TfLiteIntArray* plan;
         TF_LITE_ENSURE_STATUS(context->GetExecutionPlan(context, &plan));
-        int total_supported_nodes = 0;
 
         // Check for every node if it is supported
         // TODO(b/80625235): Fix this to do more careful checking of versioning.
@@ -1163,14 +1069,12 @@ TfLiteDelegate* NnApiDelegate() {
           TfLiteRegistration* registration;
           TF_LITE_ENSURE_STATUS(context->GetNodeAndRegistration(
               context, node_index, &node, &registration));
-          NNAPIDelegateKernel dummy_kernel;
-          if (dummy_kernel.Map(context, registration->builtin_code,
-                               registration->version, node)) {
+          if (NNAPIDelegateKernel::Map(context, registration->builtin_code,
+                                       registration->version, node)) {
             supported_nodes.push_back(node_index);
           }
-          total_supported_nodes += 1;
         }
-        // Put the size at the beginning of the array.
+        // First element in vector must be the number of actual nodes.
         supported_nodes[0] = supported_nodes.size() - 1;
 
         // NN API Delegate Registration (the pseudo kernel that will invoke NN
@@ -1208,11 +1112,10 @@ TfLiteDelegate* NnApiDelegate() {
 
         // Request TFLite to partition the graph and make kernels
         // for each independent node sub set a new nnapi_delegate_kernel.
-        context->ReplaceNodeSubsetsWithDelegateKernels(
+        return context->ReplaceNodeSubsetsWithDelegateKernels(
             context, nnapi_delegate_kernel,
             reinterpret_cast<TfLiteIntArray*>(supported_nodes.data()),
             delegate);
-        return kTfLiteOk;
       }};
 
   return &delegate;
diff --git a/tensorflow/lite/examples/android/BUILD b/tensorflow/lite/examples/android/BUILD
index 761a60314e8fb663d9a60af4116bd96a7e5839e2..80cefd415a579ad053c9f4cfcd59f63a64566931 100644
--- a/tensorflow/lite/examples/android/BUILD
+++ b/tensorflow/lite/examples/android/BUILD
@@ -34,7 +34,7 @@ android_binary(
     # to reduce APK size.
     assets = [
         "//tensorflow/lite/examples/android/app/src/main/assets:labels_mobilenet_quant_v1_224.txt",
-        "@tflite_mobilenet//:mobilenet_quant_v1_224.tflite",
+        "@tflite_mobilenet_quant//:mobilenet_v1_1.0_224_quant.tflite",
         "@tflite_conv_actions_frozen//:conv_actions_frozen.tflite",
         "//tensorflow/lite/examples/android/app/src/main/assets:conv_actions_labels.txt",
         "@tflite_mobilenet_ssd//:mobilenet_ssd.tflite",
diff --git a/tensorflow/lite/examples/android/app/download-models.gradle b/tensorflow/lite/examples/android/app/download-models.gradle
index d2f03db5f6373b8f679d55464dbfbf01ab8bd0c0..36bd177a1fd6bb21a27edd6d2b6e82fa7aa5d57b 100644
--- a/tensorflow/lite/examples/android/app/download-models.gradle
+++ b/tensorflow/lite/examples/android/app/download-models.gradle
@@ -8,13 +8,12 @@
  *     3 model files will be downloaded into given folder of ext.ASSET_DIR
  */
 // hard coded model files
-// LINT.IfChange
 
-def models = ['conv_actions_tflite.zip',
-              'mobilenet_ssd_tflite_v1.zip',
-              'mobilenet_v1_224_android_quant_2017_11_08.zip',
-              'coco_ssd_mobilenet_v1_1.0_quant_2018_06_29.zip']
-// LINT.ThenChange(//tensorflow/lite/examples/android/BUILD)
+def models = ['https://storage.googleapis.com/download.tensorflow.org/models/tflite/conv_actions_tflite.zip',
+              'https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_ssd_tflite_v1.zip',
+              'https://storage.googleapis.com/download.tensorflow.org/models/tflite/coco_ssd_mobilenet_v1_1.0_quant_2018_06_29.zip',
+              'http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_224.tgz',
+              'http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz']
 
 // Root URL for model archives
 def MODEL_URL = 'https://storage.googleapis.com/download.tensorflow.org/models/tflite'
@@ -30,9 +29,9 @@ buildscript {
 
 import de.undercouch.gradle.tasks.download.Download
 task downloadFile(type: Download){
-    for (f in models) {
-        def modelUrl = MODEL_URL + "/" + f
-        println "Downloading ${f} from ${modelUrl}"
+    for (modelUrl in models) {
+        def localFile = modelUrl.split("/")[-1]
+        println "Downloading ${localFile} from ${modelUrl}"
         src modelUrl
     }
 
@@ -43,7 +42,12 @@ task downloadFile(type: Download){
 task extractModels(type: Copy) {
     for (f in models) {
         def localFile = f.split("/")[-1]
-        from zipTree(project.ext.TMP_DIR + '/' + localFile)
+        def localExt = localFile.split("[.]")[-1]
+        if (localExt == "tgz") {
+            from tarTree(project.ext.TMP_DIR + '/' + localFile)
+        } else {
+            from zipTree(project.ext.TMP_DIR + '/' + localFile)
+        }
     }
 
     into file(project.ext.ASSET_DIR)
@@ -63,6 +67,9 @@ task extractModels(type: Copy) {
     }
 }
 
+
+
+
 tasks.whenTaskAdded { task ->
     if (task.name == 'assembleDebug') {
         task.dependsOn 'extractModels'
diff --git a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/ClassifierActivity.java b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/ClassifierActivity.java
index dcbbefbeab6627b37579902cd25841c0ae257dda..698251d8b4aff3423808126ff490fe277a7ed283 100644
--- a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/ClassifierActivity.java
+++ b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/ClassifierActivity.java
@@ -65,7 +65,7 @@ public class ClassifierActivity extends CameraActivity implements OnImageAvailab
   // --input_binary=true
   private static final int INPUT_SIZE = 224;
 
-  private static final String MODEL_FILE = "mobilenet_quant_v1_224.tflite";
+  private static final String MODEL_FILE = "mobilenet_v1_1.0_224_quant.tflite";
   private static final String LABEL_FILE = "labels_mobilenet_quant_v1_224.txt";
 
   private static final boolean MAINTAIN_ASPECT = true;
diff --git a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/DetectorActivity.java b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/DetectorActivity.java
index 87160f6b3fb8c0d24e5df131d9becbb3eb6e2980..2feca79e888b4cd20b0416edd4a5c114b60c5369 100644
--- a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/DetectorActivity.java
+++ b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/DetectorActivity.java
@@ -52,8 +52,8 @@ public class DetectorActivity extends CameraActivity implements OnImageAvailable
   private static final int TF_OD_API_INPUT_SIZE = 300;
   private static final boolean TF_OD_API_IS_QUANTIZED = true;
   private static final String TF_OD_API_MODEL_FILE = "detect.tflite";
-  private static final String TF_OD_API_LABELS_FILE = "file:///android_asset/coco_labels_list.txt";
-  
+  private static final String TF_OD_API_LABELS_FILE = "coco_labels_list.txt";
+
   // Which detection model to use: by default uses Tensorflow Object Detection API frozen
   // checkpoints.
   private enum DetectorMode {
diff --git a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/TFLiteObjectDetectionAPIModel.java b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/TFLiteObjectDetectionAPIModel.java
index 9eb21de9d03e387d3c25b38171e154a358dc81ce..afbf3178314897a9c1b7681b0b1a0de27577f3e3 100644
--- a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/TFLiteObjectDetectionAPIModel.java
+++ b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/TFLiteObjectDetectionAPIModel.java
@@ -105,8 +105,7 @@ public class TFLiteObjectDetectionAPIModel implements Classifier {
     final TFLiteObjectDetectionAPIModel d = new TFLiteObjectDetectionAPIModel();
 
     InputStream labelsInput = null;
-    String actualFilename = labelFilename.split("file:///android_asset/")[1];
-    labelsInput = assetManager.open(actualFilename);
+    labelsInput = assetManager.open(labelFilename);
     BufferedReader br = null;
     br = new BufferedReader(new InputStreamReader(labelsInput));
     String line;
diff --git a/tensorflow/lite/examples/ios/camera/CameraExampleViewController.h b/tensorflow/lite/examples/ios/camera/CameraExampleViewController.h
index fb5800e86d365b56f1b52147c3f9cc8d7211f8c3..438e6adc79a2eb6ca0ed9a61d278eef79546ce8d 100644
--- a/tensorflow/lite/examples/ios/camera/CameraExampleViewController.h
+++ b/tensorflow/lite/examples/ios/camera/CameraExampleViewController.h
@@ -17,8 +17,26 @@
 
 #include <vector>
 
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/model.h"
+// TensorFlow Lite was migrated out of `contrib/` directory. The change
+// wasn't reflected in newest CocoaPod release yet (1.12.0).
+// Change this to 0 when using a TFLite version which is newer than 1.12.0.
+// TODO(ycling): Remove the macro when we release the next version.
+#ifndef TFLITE_USE_CONTRIB_LITE
+#define TFLITE_USE_CONTRIB_LITE 1
+#endif
+
+// Set TFLITE_USE_GPU_DELEGATE to 1 to use TFLite GPU Delegate.
+// Note: TFLite GPU Delegate binary isn't releast yet, and we're working
+// on it.
+#ifndef TFLITE_USE_GPU_DELEGATE
+#define TFLITE_USE_GPU_DELEGATE 0
+#endif
+
+#if TFLITE_USE_GPU_DELEGATE && TFLITE_USE_CONTRIB_LITE
+// Sanity check.
+#error "GPU Delegate only works with newer TFLite " \
+    "after migrating out of contrib"
+#endif
 
 @interface CameraExampleViewController
     : UIViewController<UIGestureRecognizerDelegate, AVCaptureVideoDataOutputSampleBufferDelegate> {
@@ -33,10 +51,6 @@
   AVCaptureSession* session;
 
   std::vector<std::string> labels;
-  std::unique_ptr<tflite::FlatBufferModel> model;
-  tflite::ops::builtin::BuiltinOpResolver resolver;
-  std::unique_ptr<tflite::Interpreter> interpreter;
-
   double total_latency;
   int total_count;
 }
diff --git a/tensorflow/lite/examples/ios/camera/CameraExampleViewController.mm b/tensorflow/lite/examples/ios/camera/CameraExampleViewController.mm
index a3e6e110958dc1b0e5ff7a8033f2082cd4fe3864..48cd313c9d7a94328d990e45243e2b84c9dc7a62 100644
--- a/tensorflow/lite/examples/ios/camera/CameraExampleViewController.mm
+++ b/tensorflow/lite/examples/ios/camera/CameraExampleViewController.mm
@@ -23,10 +23,20 @@
 #include <iostream>
 #include <queue>
 
+#if TFLITE_USE_CONTRIB_LITE
 #include "tensorflow/contrib/lite/kernels/register.h"
 #include "tensorflow/contrib/lite/model.h"
 #include "tensorflow/contrib/lite/op_resolver.h"
 #include "tensorflow/contrib/lite/string_util.h"
+#else
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/op_resolver.h"
+#include "tensorflow/lite/string_util.h"
+#if TFLITE_USE_GPU_DELEGATE
+#include "tensorflow/lite/delegates/gpu/metal_delegate.h"
+#endif
+#endif
 
 #define LOG(x) std::cerr
 
@@ -34,7 +44,12 @@ namespace {
 
 // If you have your own model, modify this to the file name, and make sure
 // you've added the file to your app resources too.
+#if TFLITE_USE_GPU_DELEGATE
+// GPU Delegate only supports float model now.
 NSString* model_file_name = @"mobilenet_v1_1.0_224";
+#else
+NSString* model_file_name = @"mobilenet_quant_v1_224.tflite";
+#endif
 NSString* model_file_type = @"tflite";
 // If you have your own model, point this to the labels file.
 NSString* labels_file_name = @"labels";
@@ -151,7 +166,12 @@ void ProcessInputWithQuantizedModel(
 - (void)teardownAVCapture;
 @end
 
-@implementation CameraExampleViewController
+@implementation CameraExampleViewController {
+  std::unique_ptr<tflite::FlatBufferModel> model;
+  tflite::ops::builtin::BuiltinOpResolver resolver;
+  std::unique_ptr<tflite::Interpreter> interpreter;
+  TfLiteDelegate* delegate;
+}
 
 - (void)setupAVCapture {
   NSError* error = nil;
@@ -363,6 +383,11 @@ void ProcessInputWithQuantizedModel(
 }
 
 - (void)dealloc {
+#if TFLITE_USE_GPU_DELEGATE
+  if (delegate) {
+    DeleteGpuDelegate(delegate);
+  }
+#endif
   [self teardownAVCapture];
 }
 
@@ -388,6 +413,15 @@ void ProcessInputWithQuantizedModel(
   LoadLabels(labels_file_name, labels_file_type, &labels);
 
   tflite::InterpreterBuilder(*model, resolver)(&interpreter);
+
+#if TFLITE_USE_GPU_DELEGATE
+  GpuDelegateOptions options;
+  options.allow_precision_loss = true;
+  options.wait_type = GpuDelegateOptions::WaitType::kActive;
+  delegate = NewGpuDelegate(&options);
+  interpreter->ModifyGraphWithDelegate(delegate);
+#endif
+
   // Explicitly resize the input tensor.
   {
     int input = interpreter->inputs()[0];
diff --git a/tensorflow/lite/examples/ios/camera/Podfile b/tensorflow/lite/examples/ios/camera/Podfile
index 96a0d234265dac00f4bfe3b484fb95b5e1e103eb..2e15cc63decb30eb2b8c9bffab3b5d1bff10e9b3 100644
--- a/tensorflow/lite/examples/ios/camera/Podfile
+++ b/tensorflow/lite/examples/ios/camera/Podfile
@@ -1,5 +1,13 @@
 platform :ios, '8.0'
 inhibit_all_warnings!
 
+project 'tflite_camera_example.xcodeproj'
+
 target 'tflite_camera_example'
-       pod 'TensorFlowLite', '1.12.0'
+  # Comment 'TensorFlowLite' pod and un-comment 'TensorFlowLiteGpuExperimental'
+  # to use TFLite GPU Delegate.
+  # Note: TFLite GPU Delegate binary isn't releast yet, and we're working
+  # on it.
+
+  pod 'TensorFlowLite', '1.12.0'
+  # pod 'TensorFlowLiteGpuExperimental', '0.0.1'
diff --git a/tensorflow/lite/examples/ios/camera/tflite_camera_example.xcodeproj/project.pbxproj b/tensorflow/lite/examples/ios/camera/tflite_camera_example.xcodeproj/project.pbxproj
index 9522c41dea0e6609e1b8e1462d9abec8874e3999..9b5c2b32a8f176e58a2d28d11ee3e41ef875e722 100644
--- a/tensorflow/lite/examples/ios/camera/tflite_camera_example.xcodeproj/project.pbxproj
+++ b/tensorflow/lite/examples/ios/camera/tflite_camera_example.xcodeproj/project.pbxproj
@@ -15,6 +15,7 @@
 		1CDB2D4A1ED3A9CD007929E9 /* CameraExampleViewController.mm in Sources */ = {isa = PBXBuildFile; fileRef = 1CDB2D451ED3A9CD007929E9 /* CameraExampleViewController.mm */; };
 		54DC6C3C5F734F3A58069F0C /* libPods-tflite_camera_example.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 3BA8BF92C84895BFE59D8236 /* libPods-tflite_camera_example.a */; };
 		AC1F82661FBA3CBD0052BA77 /* labels.txt in Resources */ = {isa = PBXBuildFile; fileRef = AC1F82641FBA3CBD0052BA77 /* labels.txt */; };
+		AC31178921BB3FF900AFF1D2 /* mobilenet_quant_v1_224.tflite in Resources */ = {isa = PBXBuildFile; fileRef = AC31178821BB3FF900AFF1D2 /* mobilenet_quant_v1_224.tflite */; };
 		AC3BB41720114C400084552C /* mobilenet_v1_1.0_224.tflite in Resources */ = {isa = PBXBuildFile; fileRef = AC3BB41620114C400084552C /* mobilenet_v1_1.0_224.tflite */; };
 /* End PBXBuildFile section */
 
@@ -36,6 +37,7 @@
 		3BC5BE4BBD09374D3E98F082 /* Pods-tflite_camera_example.debug.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-tflite_camera_example.debug.xcconfig"; path = "Pods/Target Support Files/Pods-tflite_camera_example/Pods-tflite_camera_example.debug.xcconfig"; sourceTree = "<group>"; };
 		55ED318E8D29C8AFEF03DF1E /* Pods-tflite_camera_example.release.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-tflite_camera_example.release.xcconfig"; path = "Pods/Target Support Files/Pods-tflite_camera_example/Pods-tflite_camera_example.release.xcconfig"; sourceTree = "<group>"; };
 		AC1F82641FBA3CBD0052BA77 /* labels.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = labels.txt; sourceTree = "<group>"; };
+		AC31178821BB3FF900AFF1D2 /* mobilenet_quant_v1_224.tflite */ = {isa = PBXFileReference; lastKnownFileType = file; path = mobilenet_quant_v1_224.tflite; sourceTree = "<group>"; };
 		AC3BB41620114C400084552C /* mobilenet_v1_1.0_224.tflite */ = {isa = PBXFileReference; lastKnownFileType = file; path = mobilenet_v1_1.0_224.tflite; sourceTree = "<group>"; };
 /* End PBXFileReference section */
 
@@ -103,6 +105,7 @@
 		59A3CFF31CF4E68100C4259F /* data */ = {
 			isa = PBXGroup;
 			children = (
+				AC31178821BB3FF900AFF1D2 /* mobilenet_quant_v1_224.tflite */,
 				AC3BB41620114C400084552C /* mobilenet_v1_1.0_224.tflite */,
 				AC1F82641FBA3CBD0052BA77 /* labels.txt */,
 			);
@@ -120,8 +123,6 @@
 				1C564C091ED3A92E00087306 /* Sources */,
 				1C564C0A1ED3A92E00087306 /* Frameworks */,
 				1C564C0B1ED3A92E00087306 /* Resources */,
-				00E875C3B066535AE6B77101 /* [CP] Embed Pods Frameworks */,
-				5C2D02120E3E5E09567AA946 /* [CP] Copy Pods Resources */,
 			);
 			buildRules = (
 			);
@@ -175,42 +176,13 @@
 				AC3BB41720114C400084552C /* mobilenet_v1_1.0_224.tflite in Resources */,
 				1C99111C1ED3B0E600A6BFB9 /* MainStoryboard_iPhone.storyboard in Resources */,
 				AC1F82661FBA3CBD0052BA77 /* labels.txt in Resources */,
+				AC31178921BB3FF900AFF1D2 /* mobilenet_quant_v1_224.tflite in Resources */,
 			);
 			runOnlyForDeploymentPostprocessing = 0;
 		};
 /* End PBXResourcesBuildPhase section */
 
 /* Begin PBXShellScriptBuildPhase section */
-		00E875C3B066535AE6B77101 /* [CP] Embed Pods Frameworks */ = {
-			isa = PBXShellScriptBuildPhase;
-			buildActionMask = 2147483647;
-			files = (
-			);
-			inputPaths = (
-			);
-			name = "[CP] Embed Pods Frameworks";
-			outputPaths = (
-			);
-			runOnlyForDeploymentPostprocessing = 0;
-			shellPath = /bin/sh;
-			shellScript = "\"${SRCROOT}/Pods/Target Support Files/Pods-tflite_camera_example/Pods-tflite_camera_example-frameworks.sh\"\n";
-			showEnvVarsInLog = 0;
-		};
-		5C2D02120E3E5E09567AA946 /* [CP] Copy Pods Resources */ = {
-			isa = PBXShellScriptBuildPhase;
-			buildActionMask = 2147483647;
-			files = (
-			);
-			inputPaths = (
-			);
-			name = "[CP] Copy Pods Resources";
-			outputPaths = (
-			);
-			runOnlyForDeploymentPostprocessing = 0;
-			shellPath = /bin/sh;
-			shellScript = "\"${SRCROOT}/Pods/Target Support Files/Pods-tflite_camera_example/Pods-tflite_camera_example-resources.sh\"\n";
-			showEnvVarsInLog = 0;
-		};
 		66DAEAAEE9EF6550C3A061E0 /* [CP] Check Pods Manifest.lock */ = {
 			isa = PBXShellScriptBuildPhase;
 			buildActionMask = 2147483647;
@@ -322,9 +294,7 @@
 				GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
 				GCC_WARN_UNUSED_FUNCTION = YES;
 				GCC_WARN_UNUSED_VARIABLE = YES;
-				HEADER_SEARCH_PATHS = (
-					"$(inherited)",
-				);
+				HEADER_SEARCH_PATHS = "$(inherited)";
 				IPHONEOS_DEPLOYMENT_TARGET = 8.0;
 				MTL_ENABLE_DEBUG_INFO = YES;
 				ONLY_ACTIVE_ARCH = YES;
@@ -365,9 +335,7 @@
 				GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
 				GCC_WARN_UNUSED_FUNCTION = YES;
 				GCC_WARN_UNUSED_VARIABLE = YES;
-				HEADER_SEARCH_PATHS = (
-					"$(inherited)",
-				);
+				HEADER_SEARCH_PATHS = "$(inherited)";
 				IPHONEOS_DEPLOYMENT_TARGET = 8.0;
 				MTL_ENABLE_DEBUG_INFO = NO;
 				SDKROOT = iphoneos;
diff --git a/tensorflow/lite/examples/ios/download_models.sh b/tensorflow/lite/examples/ios/download_models.sh
index ad6ccd1b0ad89aadd8035d5c952164f63f29ccaf..4828617d95e94c1b6ad811e04d3b94b659bd8f74 100755
--- a/tensorflow/lite/examples/ios/download_models.sh
+++ b/tensorflow/lite/examples/ios/download_models.sh
@@ -53,6 +53,6 @@ download_and_extract "${QUANTIZED_MODELS_URL}" "${DOWNLOADS_DIR}/quantized_model
 file ${DOWNLOADS_DIR}/models
 
 cp ${DOWNLOADS_DIR}/models/models/* simple/data/
-cp "${DOWNLOADS_DIR}/quantized_models/labels.txt" camera/data/
+cp ${DOWNLOADS_DIR}/models/models/* camera/data/
 cp "${DOWNLOADS_DIR}/quantized_models/mobilenet_quant_v1_224.tflite" \
-   'camera/data/mobilenet_v1_1.0_224.tflite'
+   'camera/data/mobilenet_quant_v1_224.tflite'
diff --git a/tensorflow/lite/examples/label_image/BUILD b/tensorflow/lite/examples/label_image/BUILD
index de1bfd7053256538e4516b681d78b040bf9aec0d..4fc8648d46c4bdefe3865381a23f4d73c87c284b 100644
--- a/tensorflow/lite/examples/label_image/BUILD
+++ b/tensorflow/lite/examples/label_image/BUILD
@@ -63,7 +63,6 @@ cc_test(
     data = [
         "testdata/grace_hopper.bmp",
     ],
-    tags = ["no_oss"],
     deps = [
         ":bitmap_helpers",
         "@com_google_googletest//:gtest",
diff --git a/tensorflow/lite/examples/label_image/label_image_test.cc b/tensorflow/lite/examples/label_image/label_image_test.cc
index 6b4ec2a9374ca58a227506ec312c9374c1a7fee3..4db139f048d44a263fa1bbe38099b55ee45fd593 100644
--- a/tensorflow/lite/examples/label_image/label_image_test.cc
+++ b/tensorflow/lite/examples/label_image/label_image_test.cc
@@ -20,8 +20,6 @@ limitations under the License.
 #include "tensorflow/lite/examples/label_image/get_top_n.h"
 #include "tensorflow/lite/examples/label_image/label_image.h"
 
-using ::testing::ElementsAreArray;
-
 namespace tflite {
 namespace label_image {
 
diff --git a/tensorflow/lite/examples/minimal/minimal.cc b/tensorflow/lite/examples/minimal/minimal.cc
index 46f8b09df6cee12cfd7a3767be1e8f501cc5ee4f..9bbfee60851e0d9a1cd1e7549338341b634f0aa6 100644
--- a/tensorflow/lite/examples/minimal/minimal.cc
+++ b/tensorflow/lite/examples/minimal/minimal.cc
@@ -50,7 +50,7 @@ int main(int argc, char* argv[]) {
 
   // Build the interpreter
   tflite::ops::builtin::BuiltinOpResolver resolver;
-  InterpreterBuilder builder(*model.get(), resolver);
+  InterpreterBuilder builder(*model, resolver);
   std::unique_ptr<Interpreter> interpreter;
   builder(&interpreter);
   TFLITE_MINIMAL_CHECK(interpreter != nullptr);
diff --git a/tensorflow/lite/experimental/examples/lstm/unidirectional_sequence_lstm_test.py b/tensorflow/lite/experimental/examples/lstm/unidirectional_sequence_lstm_test.py
index 9c00d0501ab272fc1d7a909457f02a4388f8e02c..eeb48d123113c5924a74286ad1e0851eb484cdb8 100644
--- a/tensorflow/lite/experimental/examples/lstm/unidirectional_sequence_lstm_test.py
+++ b/tensorflow/lite/experimental/examples/lstm/unidirectional_sequence_lstm_test.py
@@ -111,7 +111,7 @@ class UnidirectionalSequenceLstmTest(test_util.TensorFlowTestCase):
 
     # Initialize variables
     init = tf.global_variables_initializer()
-    self.evaluate(init)
+    sess.run(init)
     for _ in range(TRAIN_STEPS):
       batch_x, batch_y = self.mnist.train.next_batch(
           batch_size=self.batch_size, shuffle=False)
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/Makefile.inc b/tensorflow/lite/experimental/micro/examples/micro_speech/Makefile.inc
new file mode 100644
index 0000000000000000000000000000000000000000..0e42329cade2e4b49b8000412c593f9a442af4ca
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/Makefile.inc
@@ -0,0 +1,153 @@
+
+# Tests loading and running a speech model.
+MICRO_SPEECH_TEST_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_speech_test.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.cc
+ALL_SRCS += $(MICRO_SPEECH_TEST_SRCS)
+MICRO_SPEECH_TEST_OBJS := $(addprefix $(OBJDIR), \
+$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(MICRO_SPEECH_TEST_SRCS))))
+MICRO_SPEECH_TEST_BINARY := $(BINDIR)micro_speech_test
+ALL_BINARIES += $(MICRO_SPEECH_TEST_BINARY)
+$(MICRO_SPEECH_TEST_BINARY): $(MICRO_SPEECH_TEST_OBJS) $(MICROLITE_LIB_PATH)
+	@mkdir -p $(dir $@)
+	$(CXX) $(CXXFLAGS) $(INCLUDES) \
+	-o $(MICRO_SPEECH_TEST_BINARY) $(MICRO_SPEECH_TEST_OBJS) \
+	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
+micro_speech_test: $(MICRO_SPEECH_TEST_BINARY)
+micro_speech_test_bin: $(MICRO_SPEECH_TEST_BINARY).bin
+test_micro_speech: $(MICRO_SPEECH_TEST_BINARY)
+	$(TEST_SCRIPT) $(MICRO_SPEECH_TEST_BINARY) '~~~ALL TESTS PASSED~~~'
+
+# Source files that are used by multiple preprocessor tests.
+PREPROCESSOR_TEST_SHARED_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor_test.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/no_30ms_sample_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/yes_30ms_sample_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/no_power_spectrum_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/yes_power_spectrum_data.cc
+
+# Test the float reference code for feature generation.
+PREPROCESSOR_REFERENCE_TEST_SRCS = \
+$(PREPROCESSOR_TEST_SHARED_SRCS) \
+tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.cc
+ALL_SRCS += $(PREPROCESSOR_REFERENCE_TEST_SRCS)
+PREPROCESSOR_REFERENCE_TEST_OBJS := $(addprefix $(OBJDIR), \
+$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(PREPROCESSOR_REFERENCE_TEST_SRCS))))
+PREPROCESSOR_REFERENCE_TEST_BINARY := $(BINDIR)preprocessor_reference_test
+ALL_BINARIES += $(PREPROCESSOR_REFERENCE_TEST_BINARY)
+$(PREPROCESSOR_REFERENCE_TEST_BINARY): $(PREPROCESSOR_REFERENCE_TEST_OBJS) $(MICROLITE_LIB_PATH)
+	@mkdir -p $(dir $@)
+	$(CXX) $(CXXFLAGS) $(INCLUDES) \
+	-o $(PREPROCESSOR_REFERENCE_TEST_BINARY) $(PREPROCESSOR_REFERENCE_TEST_OBJS) \
+	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
+preprocessor_reference_test: $(PREPROCESSOR_REFERENCE_TEST_BINARY)
+preprocessor_reference_test_bin: $(PREPROCESSOR_REFERENCE_TEST_BINARY).bin
+test_preprocessor_reference: $(PREPROCESSOR_REFERENCE_TEST_BINARY)
+	$(TEST_SCRIPT) $(PREPROCESSOR_REFERENCE_TEST_BINARY) '~~~ALL TESTS PASSED~~~'
+
+# Test the fixed point reference code for feature generation.
+PREPROCESSOR_FIXED_TEST_SRCS = \
+$(PREPROCESSOR_TEST_SHARED_SRCS) \
+tensorflow/lite/experimental/micro/examples/micro_speech/fixed_point/preprocessor.cc
+ALL_SRCS += $(PREPROCESSOR_FIXED_TEST_SRCS)
+PREPROCESSOR_FIXED_TEST_OBJS := $(addprefix $(OBJDIR), \
+$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(PREPROCESSOR_FIXED_TEST_SRCS))))
+PREPROCESSOR_FIXED_TEST_BINARY := $(BINDIR)preprocessor_fixed_test
+ALL_BINARIES += $(PREPROCESSOR_FIXED_TEST_BINARY)
+$(PREPROCESSOR_FIXED_TEST_BINARY): $(PREPROCESSOR_FIXED_TEST_OBJS) $(MICROLITE_LIB_PATH)
+	@mkdir -p $(dir $@)
+	$(CXX) $(CXXFLAGS) $(INCLUDES) \
+	-o $(PREPROCESSOR_FIXED_TEST_BINARY) $(PREPROCESSOR_FIXED_TEST_OBJS) \
+	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
+preprocessor_fixed_test: $(PREPROCESSOR_FIXED_TEST_BINARY)
+preprocessor_fixed_test_bin: $(PREPROCESSOR_FIXED_TEST_BINARY).bin
+test_preprocessor_fixed: $(PREPROCESSOR_FIXED_TEST_BINARY)
+	$(TEST_SCRIPT) $(PREPROCESSOR_FIXED_TEST_BINARY) '~~~ALL TESTS PASSED~~~'
+
+# Tests the audio provider module.
+AUDIO_PROVIDER_TEST_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_test.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.cc
+ALL_SRCS += $(AUDIO_PROVIDER_TEST_SRCS)
+AUDIO_PROVIDER_TEST_OBJS := $(addprefix $(OBJDIR), \
+$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(AUDIO_PROVIDER_TEST_SRCS))))
+AUDIO_PROVIDER_TEST_BINARY := $(BINDIR)audio_provider_test
+ALL_BINARIES += $(AUDIO_PROVIDER_TEST_BINARY)
+$(AUDIO_PROVIDER_TEST_BINARY): $(AUDIO_PROVIDER_TEST_OBJS) $(MICROLITE_LIB_PATH)
+	@mkdir -p $(dir $@)
+	$(CXX) $(CXXFLAGS) $(INCLUDES) \
+	-o $(AUDIO_PROVIDER_TEST_BINARY) $(AUDIO_PROVIDER_TEST_OBJS) \
+	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
+audio_provider_test: $(AUDIO_PROVIDER_TEST_BINARY)
+audio_provider_test_bin: $(AUDIO_PROVIDER_TEST_BINARY).bin
+test_audio_provider: $(AUDIO_PROVIDER_TEST_BINARY)
+	$(TEST_SCRIPT) $(AUDIO_PROVIDER_TEST_BINARY) '~~~ALL TESTS PASSED~~~'
+
+# Tests the feature provider module.
+FEATURE_PROVIDER_TEST_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider_test.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/timer.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.cc
+ALL_SRCS += $(FEATURE_PROVIDER_TEST_SRCS)
+FEATURE_PROVIDER_TEST_OBJS := $(addprefix $(OBJDIR), \
+$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(FEATURE_PROVIDER_TEST_SRCS))))
+FEATURE_PROVIDER_TEST_BINARY := $(BINDIR)feature_provider_test
+ALL_BINARIES += $(FEATURE_PROVIDER_TEST_BINARY)
+$(FEATURE_PROVIDER_TEST_BINARY): $(FEATURE_PROVIDER_TEST_OBJS) $(MICROLITE_LIB_PATH)
+	@mkdir -p $(dir $@)
+	$(CXX) $(CXXFLAGS) $(INCLUDES) \
+	-o $(FEATURE_PROVIDER_TEST_BINARY) $(FEATURE_PROVIDER_TEST_OBJS) \
+	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
+feature_provider_test: $(FEATURE_PROVIDER_TEST_BINARY)
+feature_provider_test_bin: $(FEATURE_PROVIDER_TEST_BINARY).bin
+test_feature_provider: $(FEATURE_PROVIDER_TEST_BINARY)
+	$(TEST_SCRIPT) $(FEATURE_PROVIDER_TEST_BINARY) '~~~ALL TESTS PASSED~~~'
+
+# Tests the timer module.
+TIMER_TEST_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/timer_test.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/timer.cc
+ALL_SRCS += $(TIMER_TEST_SRCS)
+TIMER_TEST_OBJS := $(addprefix $(OBJDIR), \
+$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(TIMER_TEST_SRCS))))
+TIMER_TEST_BINARY := $(BINDIR)timer_test
+ALL_BINARIES += $(TIMER_TEST_BINARY)
+$(TIMER_TEST_BINARY): $(TIMER_TEST_OBJS) $(MICROLITE_LIB_PATH)
+	@mkdir -p $(dir $@)
+	$(CXX) $(CXXFLAGS) $(INCLUDES) \
+	-o $(TIMER_TEST_BINARY) $(TIMER_TEST_OBJS) \
+	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
+timer_test: $(TIMER_TEST_BINARY)
+timer_test_bin: $(TIMER_TEST_BINARY).bin
+test_timer: $(TIMER_TEST_BINARY)
+	$(TEST_SCRIPT) $(TIMER_TEST_BINARY) '~~~ALL TESTS PASSED~~~'
+
+# Builds a standalone speech command recognizer binary.
+MICRO_SPEECH_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/main.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/timer.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.cc
+ALL_SRCS += $(MICRO_SPEECH_SRCS)
+MICRO_SPEECH_OBJS := $(addprefix $(OBJDIR), \
+$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(MICRO_SPEECH_SRCS))))
+MICRO_SPEECH_BINARY := $(BINDIR)micro_speech
+ALL_BINARIES += $(MICRO_SPEECH_BINARY)
+$(MICRO_SPEECH_BINARY): $(MICRO_SPEECH_OBJS) $(MICROLITE_LIB_PATH)
+	@mkdir -p $(dir $@)
+	$(CXX) $(CXXFLAGS) $(INCLUDES) \
+	-o $(MICRO_SPEECH_BINARY) $(MICRO_SPEECH_OBJS) \
+	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
+micro_speech: $(MICRO_SPEECH_BINARY)
+micro_speech_bin: $(MICRO_SPEECH_BINARY).bin
diff --git a/tensorflow/lite/experimental/micro/tools/make/Makefile b/tensorflow/lite/experimental/micro/tools/make/Makefile
index 0caf0ca099e0520f90530b02f9a95efbe6e3d299..20307e2b211f451997216f760c218b4daae6a201 100644
--- a/tensorflow/lite/experimental/micro/tools/make/Makefile
+++ b/tensorflow/lite/experimental/micro/tools/make/Makefile
@@ -52,29 +52,6 @@ CC_PREFIX :=
 # runtime that can be linked in to other programs.
 MICROLITE_LIB_NAME := libtensorflow-microlite.a
 
-# Test binary for the microcontroller speech model.
-MICRO_SPEECH_TEST_SRCS := \
-tensorflow/lite/experimental/micro/examples/micro_speech/micro_speech_test.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.cc
-
-# Test binary for the microcontroller speech model.
-PREPROCESSOR_TEST_SRCS := \
-tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor_test.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/no_30ms_sample_data.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/yes_30ms_sample_data.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/no_power_spectrum_data.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/yes_power_spectrum_data.cc
-
-PREPROCESSOR_REFERENCE_TEST_SRCS = \
-$(PREPROCESSOR_TEST_SRCS) \
-tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.cc
-
-PREPROCESSOR_FIXED_TEST_SRCS += \
-$(PREPROCESSOR_TEST_SRCS) \
-tensorflow/lite/experimental/micro/examples/micro_speech/fixed_point/preprocessor.cc
-
 MICROLITE_TEST_SRCS := \
 $(wildcard tensorflow/lite/experimental/micro/*test.cc) \
 $(wildcard tensorflow/lite/experimental/micro/kernels/*test.cc)
@@ -97,9 +74,6 @@ MICROLITE_CC_SRCS := $(filter-out $(MICROLITE_TEST_SRCS), $(MICROLITE_CC_BASE_SR
 include $(wildcard $(MAKEFILE_DIR)/targets/*_makefile.inc)
 
 ALL_SRCS := \
-	$(MICRO_SPEECH_TEST_SRCS) \
-	$(PREPROCESSOR_REFERENCE_TEST_SRCS) \
-	$(PREPROCESSOR_FIXED_TEST_SRCS) \
 	$(MICROLITE_CC_SRCS) \
 	$(MICROLITE_TEST_SRCS)
 
@@ -111,22 +85,12 @@ LIBDIR := $(GENDIR)lib/
 
 MICROLITE_LIB_PATH := $(LIBDIR)$(MICROLITE_LIB_NAME)
 
-MICRO_SPEECH_TEST_BINARY := $(BINDIR)micro_speech_test
-PREPROCESSOR_REFERENCE_TEST_BINARY := $(BINDIR)preprocessor_reference_test
-PREPROCESSOR_FIXED_TEST_BINARY := $(BINDIR)preprocessor_fixed_test
-
 CXX := $(CC_PREFIX)${TARGET_TOOLCHAIN_PREFIX}g++
 CC := $(CC_PREFIX)${TARGET_TOOLCHAIN_PREFIX}gcc
 AR := $(CC_PREFIX)${TARGET_TOOLCHAIN_PREFIX}ar
 
-MICRO_SPEECH_TEST_OBJS := $(addprefix $(OBJDIR), \
-$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(MICRO_SPEECH_TEST_SRCS))))
-
-PREPROCESSOR_REFERENCE_TEST_OBJS := $(addprefix $(OBJDIR), \
-$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(PREPROCESSOR_REFERENCE_TEST_SRCS))))
-
-PREPROCESSOR_FIXED_TEST_OBJS := $(addprefix $(OBJDIR), \
-$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(PREPROCESSOR_FIXED_TEST_SRCS))))
+# Load the examples.
+include $(wildcard tensorflow/lite/experimental/micro/examples/*/Makefile.inc)
 
 MICROLITE_LIB_OBJS := $(addprefix $(OBJDIR), \
 $(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(MICROLITE_CC_SRCS))))
@@ -145,7 +109,7 @@ $(OBJDIR)%.o: %.c
 	$(CC) $(CCFLAGS) $(INCLUDES) -c $< -o $@
 
 # The target that's compiled if there's no command-line arguments.
-all: $(MICROLITE_LIB_PATH) $(MICRO_SPEECH_TEST_BINARY) $(PREPROCESSOR_TEST_BINARY)
+all: $(MICROLITE_LIB_PATH) $(ALL_BINARIES)
 
 microlite: $(MICROLITE_LIB_PATH)
 
@@ -158,42 +122,6 @@ $(MICROLITE_LIB_PATH): tensorflow/lite/schema/schema_generated.h $(MICROLITE_LIB
 	@mkdir -p $(dir $@)
 	$(AR) $(ARFLAGS) $(MICROLITE_LIB_PATH) $(MICROLITE_LIB_OBJS)
 
-$(MICRO_SPEECH_TEST_BINARY): $(MICRO_SPEECH_TEST_OBJS) $(MICROLITE_LIB_PATH)
-	@mkdir -p $(dir $@)
-	$(CXX) $(CXXFLAGS) $(INCLUDES) \
-	-o $(MICRO_SPEECH_TEST_BINARY) $(MICRO_SPEECH_TEST_OBJS) \
-	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
-
-micro_speech_test: $(MICRO_SPEECH_TEST_BINARY)
-micro_speech_test_bin: $(MICRO_SPEECH_TEST_BINARY).bin
-
-test_micro_speech: $(MICRO_SPEECH_TEST_BINARY)
-	$(TEST_SCRIPT) $(MICRO_SPEECH_TEST_BINARY) '~~~ALL TESTS PASSED~~~'
-
-$(PREPROCESSOR_REFERENCE_TEST_BINARY): $(PREPROCESSOR_REFERENCE_TEST_OBJS) $(MICROLITE_LIB_PATH)
-	@mkdir -p $(dir $@)
-	$(CXX) $(CXXFLAGS) $(INCLUDES) \
-	-o $(PREPROCESSOR_REFERENCE_TEST_BINARY) $(PREPROCESSOR_REFERENCE_TEST_OBJS) \
-	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
-
-preprocessor_reference_test: $(PREPROCESSOR_REFERENCE_TEST_BINARY)
-preprocessor_reference_test_bin: $(PREPROCESSOR_REFERENCE_TEST_BINARY).bin
-
-test_preprocessor_reference: $(PREPROCESSOR_REFERENCE_TEST_BINARY)
-	$(TEST_SCRIPT) $(PREPROCESSOR_REFERENCE_TEST_BINARY) '~~~ALL TESTS PASSED~~~'
-
-$(PREPROCESSOR_FIXED_TEST_BINARY): $(PREPROCESSOR_FIXED_TEST_OBJS) $(MICROLITE_LIB_PATH)
-	@mkdir -p $(dir $@)
-	$(CXX) $(CXXFLAGS) $(INCLUDES) \
-	-o $(PREPROCESSOR_FIXED_TEST_BINARY) $(PREPROCESSOR_FIXED_TEST_OBJS) \
-	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
-
-preprocessor_fixed_test: $(PREPROCESSOR_FIXED_TEST_BINARY)
-preprocessor_fixed_test_bin: $(PREPROCESSOR_FIXED_TEST_BINARY).bin
-
-test_preprocessor_fixed: $(PREPROCESSOR_FIXED_TEST_BINARY)
-	$(TEST_SCRIPT) $(PREPROCESSOR_FIXED_TEST_BINARY) '~~~ALL TESTS PASSED~~~'
-
 $(BINDIR)%_test : $(OBJDIR)%_test.o $(MICROLITE_LIB_PATH)
 	@mkdir -p $(dir $@)
 	$(CXX) $(CXXFLAGS) $(INCLUDES) \
@@ -203,8 +131,6 @@ $(BINDIR)%_test : $(OBJDIR)%_test.o $(MICROLITE_LIB_PATH)
 $(BINDIR)%.test_target: $(BINDIR)%_test
 	$(TEST_SCRIPT) $< '~~~ALL TESTS PASSED~~~'
 
-$(info $(MICROLITE_TEST_TARGETS))
-
 test: test_micro_speech $(MICROLITE_TEST_TARGETS)
 
 # Gets rid of all generated files.
diff --git a/tensorflow/lite/experimental/microfrontend/audio_microfrontend.cc b/tensorflow/lite/experimental/microfrontend/audio_microfrontend.cc
index 4367fe74a484445289f15c83860ca08ca4e144db..84ab164d2c08623d41ed9468fe42e1e7d2fbf354 100644
--- a/tensorflow/lite/experimental/microfrontend/audio_microfrontend.cc
+++ b/tensorflow/lite/experimental/microfrontend/audio_microfrontend.cc
@@ -142,7 +142,8 @@ void GenerateFeatures(TfLiteAudioMicrofrontendParams* data,
 
     if (output.values != nullptr) {
       frame_buffer[frame_index].reserve(output.size);
-      for (int i = 0; i < output.size; ++i) {
+      int i;
+      for (i = 0; i < output.size; ++i) {
         frame_buffer[frame_index].push_back(static_cast<T>(output.values[i]) /
                                             data->out_scale);
       }
@@ -152,9 +153,10 @@ void GenerateFeatures(TfLiteAudioMicrofrontendParams* data,
 
   int index = 0;
   std::vector<T> pad(data->state->filterbank.num_channels, 0);
-  for (int anchor = 0; anchor < frame_buffer.size();
-       anchor += data->frame_stride) {
-    for (int frame = anchor - data->left_context;
+  int anchor;
+  for (anchor = 0; anchor < frame_buffer.size(); anchor += data->frame_stride) {
+    int frame;
+    for (frame = anchor - data->left_context;
          frame <= anchor + data->right_context; ++frame) {
       std::vector<T>* feature;
       if (data->zero_padding && (frame < 0 || frame >= frame_buffer.size())) {
diff --git a/tensorflow/lite/experimental/microfrontend/audio_microfrontend_test.cc b/tensorflow/lite/experimental/microfrontend/audio_microfrontend_test.cc
index a9119d01831f6892dbf887930f3626445fc8a8e3..e3a0e06f7b0faf07c9188b4b77957358c0e84d9c 100644
--- a/tensorflow/lite/experimental/microfrontend/audio_microfrontend_test.cc
+++ b/tensorflow/lite/experimental/microfrontend/audio_microfrontend_test.cc
@@ -140,13 +140,16 @@ class BaseMicroFrontendTest : public ::testing::Test {
 
     // Mimic padding behaviour with zero_padding = true.
     std::vector<int> output_flattened;
-    for (int anchor = 0; anchor < output.size();
+    int anchor;
+    for (anchor = 0; anchor < output.size();
          anchor += micro_frontend->num_frame_stride()) {
-      for (int frame = anchor - micro_frontend->num_left_context();
+      int frame;
+      for (frame = anchor - micro_frontend->num_left_context();
            frame <= anchor + micro_frontend->num_right_context(); ++frame) {
         if (frame < 0 || frame >= output.size()) {
           // Padding with zeros.
-          for (int j = 0; j < num_frequency_per_frame; ++j) {
+          int j;
+          for (j = 0; j < num_frequency_per_frame; ++j) {
             output_flattened.push_back(0.0);
           }
         } else {
diff --git a/tensorflow/lite/experimental/microfrontend/lib/fft_test.cc b/tensorflow/lite/experimental/microfrontend/lib/fft_test.cc
index 7c1ee2d852201cc52a53ae07bf6e00ebf6f1ab47..1b754c1b4c244edf1b091a581e5ae9399c2ac2e3 100644
--- a/tensorflow/lite/experimental/microfrontend/lib/fft_test.cc
+++ b/tensorflow/lite/experimental/microfrontend/lib/fft_test.cc
@@ -38,7 +38,8 @@ TEST(FftTest, CheckOutputValues) {
       {-887, 0}, {3000, 3000}, {0, -6401}, {-3000, 3000}, {886, 0}, {118, 119},
       {0, 25},   {9, -10},     {19, 0},    {9, 9},        {0, 0}};
   ASSERT_EQ(state.fft_size / 2 + 1, sizeof(expected) / sizeof(expected[0]));
-  for (int i = 0; i <= state.fft_size / 2; ++i) {
+  int i;
+  for (i = 0; i <= state.fft_size / 2; ++i) {
     EXPECT_EQ(state.output[i].real, expected[i].real);
     EXPECT_EQ(state.output[i].imag, expected[i].imag);
   }
diff --git a/tensorflow/lite/experimental/microfrontend/lib/filterbank_io.c b/tensorflow/lite/experimental/microfrontend/lib/filterbank_io.c
index 2dbb4b3bf09654df3be0165f14c6f3da742268f1..6ce4c7c79646485477a3067f96c7fe8526836ee6 100644
--- a/tensorflow/lite/experimental/microfrontend/lib/filterbank_io.c
+++ b/tensorflow/lite/experimental/microfrontend/lib/filterbank_io.c
@@ -17,7 +17,8 @@ limitations under the License.
 static void PrintArray(FILE* fp, const char* name, const int16_t* values,
                        size_t size) {
   fprintf(fp, "static int16_t filterbank_%s[] = {", name);
-  for (int i = 0; i < size; ++i) {
+  int i;
+  for (i = 0; i < size; ++i) {
     fprintf(fp, "%d", values[i]);
     if (i < size - 1) {
       fprintf(fp, ", ");
diff --git a/tensorflow/lite/experimental/microfrontend/lib/filterbank_test.cc b/tensorflow/lite/experimental/microfrontend/lib/filterbank_test.cc
index 808d527186eaa920a9eb5319b328b96de6047174..41f0064d4f1674471fa731e72464b1d40fce4216 100644
--- a/tensorflow/lite/experimental/microfrontend/lib/filterbank_test.cc
+++ b/tensorflow/lite/experimental/microfrontend/lib/filterbank_test.cc
@@ -71,7 +71,8 @@ TEST_F(FilterbankTest, CheckChannelFrequencyStarts) {
 
   const int16_t expected[] = {0, 4, 8};
   ASSERT_EQ(state.num_channels + 1, sizeof(expected) / sizeof(expected[0]));
-  for (int i = 0; i <= state.num_channels; ++i) {
+  int i;
+  for (i = 0; i <= state.num_channels; ++i) {
     EXPECT_EQ(state.channel_frequency_starts[i], expected[i]);
   }
 
@@ -85,7 +86,8 @@ TEST_F(FilterbankTest, CheckChannelWeightStarts) {
 
   const int16_t expected[] = {0, 8, 16};
   ASSERT_EQ(state.num_channels + 1, sizeof(expected) / sizeof(expected[0]));
-  for (int i = 0; i <= state.num_channels; ++i) {
+  int i;
+  for (i = 0; i <= state.num_channels; ++i) {
     EXPECT_EQ(state.channel_weight_starts[i], expected[i]);
   }
 
@@ -99,7 +101,8 @@ TEST_F(FilterbankTest, CheckChannelWidths) {
 
   const int16_t expected[] = {8, 8, 8};
   ASSERT_EQ(state.num_channels + 1, sizeof(expected) / sizeof(expected[0]));
-  for (int i = 0; i <= state.num_channels; ++i) {
+  int i;
+  for (i = 0; i <= state.num_channels; ++i) {
     EXPECT_EQ(state.channel_widths[i], expected[i]);
   }
 
@@ -117,7 +120,8 @@ TEST_F(FilterbankTest, CheckWeights) {
   ASSERT_EQ(state.channel_weight_starts[state.num_channels] +
                 state.channel_widths[state.num_channels],
             sizeof(expected) / sizeof(expected[0]));
-  for (int i = 0; i < sizeof(expected) / sizeof(expected[0]); ++i) {
+  int i;
+  for (i = 0; i < sizeof(expected) / sizeof(expected[0]); ++i) {
     EXPECT_EQ(state.weights[i], expected[i]);
   }
 
@@ -135,7 +139,8 @@ TEST_F(FilterbankTest, CheckUnweights) {
   ASSERT_EQ(state.channel_weight_starts[state.num_channels] +
                 state.channel_widths[state.num_channels],
             sizeof(expected) / sizeof(expected[0]));
-  for (int i = 0; i < sizeof(expected) / sizeof(expected[0]); ++i) {
+  int i;
+  for (i = 0; i < sizeof(expected) / sizeof(expected[0]); ++i) {
     EXPECT_EQ(state.unweights[i], expected[i]);
   }
 
@@ -154,7 +159,8 @@ TEST_F(FilterbankTest, CheckConvertFftComplexToEnergy) {
   int32_t* energy = reinterpret_cast<int32_t*>(fake_fft);
   FilterbankConvertFftComplexToEnergy(&state, fake_fft, energy);
 
-  for (int i = state.start_index; i < state.end_index; ++i) {
+  int i;
+  for (i = state.start_index; i < state.end_index; ++i) {
     EXPECT_EQ(energy[i], kEnergy[i]);
   }
 }
@@ -167,7 +173,8 @@ TEST_F(FilterbankTest, CheckAccumulateChannels) {
   FilterbankAccumulateChannels(&state, kEnergy);
 
   ASSERT_EQ(state.num_channels + 1, sizeof(kWork) / sizeof(kWork[0]));
-  for (int i = 0; i <= state.num_channels; ++i) {
+  int i;
+  for (i = 0; i <= state.num_channels; ++i) {
     EXPECT_EQ(state.work[i], kWork[i]);
   }
 
@@ -184,7 +191,8 @@ TEST_F(FilterbankTest, CheckSqrt) {
 
   const uint32_t expected[] = {247311, 508620};
   ASSERT_EQ(state.num_channels, sizeof(expected) / sizeof(expected[0]));
-  for (int i = 0; i < state.num_channels; ++i) {
+  int i;
+  for (i = 0; i < state.num_channels; ++i) {
     EXPECT_EQ(scaled_filterbank[i], expected[i]);
   }
 
diff --git a/tensorflow/lite/experimental/microfrontend/lib/frontend_test.cc b/tensorflow/lite/experimental/microfrontend/lib/frontend_test.cc
index 993e866cc08850cdfea129278783420e827d67f2..a6faa1fc1f51360e295253fb2b3cfdf01ada74ad 100644
--- a/tensorflow/lite/experimental/microfrontend/lib/frontend_test.cc
+++ b/tensorflow/lite/experimental/microfrontend/lib/frontend_test.cc
@@ -64,7 +64,8 @@ TEST_F(FrontendTest, CheckOutputValues) {
 
   const uint16_t expected[] = {479, 425};
   ASSERT_EQ(output.size, sizeof(expected) / sizeof(expected[0]));
-  for (int i = 0; i < output.size; ++i) {
+  int i;
+  for (i = 0; i < output.size; ++i) {
     EXPECT_EQ(output.values[i], expected[i]);
   }
 
@@ -86,7 +87,8 @@ TEST_F(FrontendTest, CheckConsecutiveWindow) {
 
   const int16_t expected[] = {436, 378};
   ASSERT_EQ(output.size, sizeof(expected) / sizeof(expected[0]));
-  for (int i = 0; i < output.size; ++i) {
+  int i;
+  for (i = 0; i < output.size; ++i) {
     EXPECT_EQ(output.values[i], expected[i]);
   }
 
diff --git a/tensorflow/lite/experimental/microfrontend/lib/log_scale.c b/tensorflow/lite/experimental/microfrontend/lib/log_scale.c
index 54f370e7d9f55250279cd6c9a81b9a17e0d6e071..149ec7cfba0a7891da320f92507fc06171363e70 100644
--- a/tensorflow/lite/experimental/microfrontend/lib/log_scale.c
+++ b/tensorflow/lite/experimental/microfrontend/lib/log_scale.c
@@ -63,7 +63,8 @@ uint16_t* LogScaleApply(struct LogScaleState* state, uint32_t* signal,
   const int scale_shift = state->scale_shift;
   uint16_t* output = (uint16_t*) signal;
   uint16_t* ret = output;
-  for (int i = 0; i < signal_size; ++i) {
+  int i;
+  for (i = 0; i < signal_size; ++i) {
     uint32_t value = *signal++;
     if (state->enable_log) {
       if (correction_bits < 0) {
diff --git a/tensorflow/lite/experimental/microfrontend/lib/log_scale_test.cc b/tensorflow/lite/experimental/microfrontend/lib/log_scale_test.cc
index 91ca657e543d2a5f89a55483df8bdfbee1365951..1ea0842ec2ad1065782198b635bf8b4858d6bf3a 100644
--- a/tensorflow/lite/experimental/microfrontend/lib/log_scale_test.cc
+++ b/tensorflow/lite/experimental/microfrontend/lib/log_scale_test.cc
@@ -34,7 +34,8 @@ TEST(LogScaleTest, CheckOutputValues) {
                                    kCorrectionBits);
 
   const uint16_t expected[] = {479, 425};
-  for (int i = 0; i < sizeof(expected) / sizeof(expected[0]); ++i) {
+  int i;
+  for (i = 0; i < sizeof(expected) / sizeof(expected[0]); ++i) {
     EXPECT_EQ(output[i], expected[i]);
   }
 }
@@ -50,7 +51,8 @@ TEST(LogScaleTest, CheckOutputValuesNoLog) {
                                    kCorrectionBits);
 
   const uint16_t expected[] = {65535, 45998};
-  for (int i = 0; i < sizeof(expected) / sizeof(expected[0]); ++i) {
+  int i;
+  for (i = 0; i < sizeof(expected) / sizeof(expected[0]); ++i) {
     EXPECT_EQ(output[i], expected[i]);
   }
 }
diff --git a/tensorflow/lite/experimental/microfrontend/lib/noise_reduction_test.cc b/tensorflow/lite/experimental/microfrontend/lib/noise_reduction_test.cc
index 16140564879305de86947044f8b8efd055a4793c..13d58b2476762d89ee79be554be12a9b7a897ad5 100644
--- a/tensorflow/lite/experimental/microfrontend/lib/noise_reduction_test.cc
+++ b/tensorflow/lite/experimental/microfrontend/lib/noise_reduction_test.cc
@@ -44,7 +44,8 @@ TEST_F(NoiseReductionTest, TestNoiseReductionEstimate) {
 
   const uint32_t expected[] = {6321887, 31248341};
   ASSERT_EQ(state.num_channels, sizeof(expected) / sizeof(expected[0]));
-  for (int i = 0; i < state.num_channels; ++i) {
+  int i;
+  for (i = 0; i < state.num_channels; ++i) {
     EXPECT_EQ(state.estimate[i], expected[i]);
   }
 
@@ -60,7 +61,8 @@ TEST_F(NoiseReductionTest, TestNoiseReduction) {
 
   const uint32_t expected[] = {241137, 478104};
   ASSERT_EQ(state.num_channels, sizeof(expected) / sizeof(expected[0]));
-  for (int i = 0; i < state.num_channels; ++i) {
+  int i;
+  for (i = 0; i < state.num_channels; ++i) {
     EXPECT_EQ(signal[i], expected[i]);
   }
 
diff --git a/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control.c b/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control.c
index b49eb301370a7e95497478625a97333225a83341..8ccc2fde98c810bdf238edbf2f7a8d61b9e4f495 100644
--- a/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control.c
+++ b/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control.c
@@ -47,7 +47,8 @@ uint32_t PcanShrink(const uint32_t x) {
 
 void PcanGainControlApply(struct PcanGainControlState* state,
                           uint32_t* signal) {
-  for (int i = 0; i < state->num_channels; ++i) {
+  int i;
+  for (i = 0; i < state->num_channels; ++i) {
     const uint32_t gain = WideDynamicFunction(state->noise_estimate[i],
                                               state->gain_lut);
     const uint32_t snr = ((uint64_t) signal[i] * gain) >> state->snr_shift;
diff --git a/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_test.cc b/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_test.cc
index 830db89edd8eb39fc68d24bfa4a61fe82ef3eace..7c92d2d29d0e41d5e378a596c5a06e8418edfa8d 100644
--- a/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_test.cc
+++ b/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_test.cc
@@ -49,7 +49,8 @@ TEST_F(PcanGainControlTest, TestPcanGainControl) {
 
   const uint32_t expected[] = {3578, 1533};
   ASSERT_EQ(state.num_channels, sizeof(expected) / sizeof(expected[0]));
-  for (int i = 0; i < state.num_channels; ++i) {
+  int i;
+  for (i = 0; i < state.num_channels; ++i) {
     EXPECT_EQ(signal[i], expected[i]);
   }
 
diff --git a/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_util.c b/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_util.c
index dbe44c494ae07fb8c356723287cb32bf63381d27..5201cf045b4d43738968cc27d34ec4b5fc896d4e 100644
--- a/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_util.c
+++ b/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_util.c
@@ -62,7 +62,8 @@ int PcanGainControlPopulateState(const struct PcanGainControlConfig* config,
   state->gain_lut[0] = PcanGainLookupFunction(config, input_bits, 0);
   state->gain_lut[1] = PcanGainLookupFunction(config, input_bits, 1);
   state->gain_lut -= 6;
-  for (int interval = 2; interval <= kWideDynamicFunctionBits; ++interval) {
+  int interval;
+  for (interval = 2; interval <= kWideDynamicFunctionBits; ++interval) {
     const uint32_t x0 = (uint32_t) 1 << (interval - 1);
     const uint32_t x1 = x0 + (x0 >> 1);
     const uint32_t x2 = (interval == kWideDynamicFunctionBits)
diff --git a/tensorflow/lite/experimental/microfrontend/lib/window_io.c b/tensorflow/lite/experimental/microfrontend/lib/window_io.c
index ed4ac5eb110c0f1358656ca9e1b79d6b37052258..d12cac2c85374f3a2465d59211d7ef44958d26af 100644
--- a/tensorflow/lite/experimental/microfrontend/lib/window_io.c
+++ b/tensorflow/lite/experimental/microfrontend/lib/window_io.c
@@ -16,7 +16,8 @@ limitations under the License.
 
 void WindowWriteMemmapPreamble(FILE* fp, const struct WindowState* state) {
   fprintf(fp, "static int16_t window_coefficients[] = {\n");
-  for (int i = 0; i < state->size; ++i) {
+  int i;
+  for (i = 0; i < state->size; ++i) {
     fprintf(fp, "%d", state->coefficients[i]);
     if (i < state->size - 1) {
       fprintf(fp, ", ");
diff --git a/tensorflow/lite/experimental/microfrontend/lib/window_test.cc b/tensorflow/lite/experimental/microfrontend/lib/window_test.cc
index 8c6c19188d3e128e7bb3b1d007fff10ec271da95..60f11440f56ea39a25a5aa2beb23eb25a83048b3 100644
--- a/tensorflow/lite/experimental/microfrontend/lib/window_test.cc
+++ b/tensorflow/lite/experimental/microfrontend/lib/window_test.cc
@@ -48,7 +48,8 @@ TEST_F(WindowTest, CheckCoefficients) {
                               3843, 3541, 3145, 2681, 2177, 1664, 1176,
                               743,  391,  144,  16};
   ASSERT_EQ(state.size, sizeof(expected) / sizeof(expected[0]));
-  for (int i = 0; i < state.size; ++i) {
+  int i;
+  for (i = 0; i < state.size; ++i) {
     EXPECT_EQ(state.coefficients[i], expected[i]);
   }
 
@@ -64,7 +65,8 @@ TEST_F(WindowTest, CheckResidualInput) {
       &state, kFakeAudioData,
       sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]), &num_samples_read));
 
-  for (int i = kStepSamples; i < kWindowSamples; ++i) {
+  int i;
+  for (i = kStepSamples; i < kWindowSamples; ++i) {
     EXPECT_EQ(state.input[i - kStepSamples], kFakeAudioData[i]);
   }
 
@@ -84,7 +86,8 @@ TEST_F(WindowTest, CheckOutputValues) {
       0, 1151,   0, -5944, 0, 13311,  0, -21448, 0, 28327, 0, -32256, 0, 32255,
       0, -28328, 0, 21447, 0, -13312, 0, 5943,   0, -1152, 0};
   ASSERT_EQ(state.size, sizeof(expected) / sizeof(expected[0]));
-  for (int i = 0; i < state.size; ++i) {
+  int i;
+  for (i = 0; i < state.size; ++i) {
     EXPECT_EQ(state.output[i], expected[i]);
   }
 
@@ -122,7 +125,8 @@ TEST_F(WindowTest, CheckConsecutiveWindow) {
       0, -1152, 0, 5943,   0, -13312, 0, 21447, 0, -28328, 0, 32255, 0, -32256,
       0, 28327, 0, -21448, 0, 13311,  0, -5944, 0, 1151,   0};
   ASSERT_EQ(state.size, sizeof(expected) / sizeof(expected[0]));
-  for (int i = 0; i < state.size; ++i) {
+  int i;
+  for (i = 0; i < state.size; ++i) {
     EXPECT_EQ(state.output[i], expected[i]);
   }
 
diff --git a/tensorflow/lite/experimental/microfrontend/ops/audio_microfrontend_op.cc b/tensorflow/lite/experimental/microfrontend/ops/audio_microfrontend_op.cc
index 51094a976d297af8e807ae4f828702ace9a9306a..9f2ea7eee638285edd7c70fb1f91d868a1811790 100644
--- a/tensorflow/lite/experimental/microfrontend/ops/audio_microfrontend_op.cc
+++ b/tensorflow/lite/experimental/microfrontend/ops/audio_microfrontend_op.cc
@@ -250,7 +250,8 @@ class AudioMicrofrontendOp : public OpKernel {
 
       if (output.values != nullptr) {
         frame_buffer[frame_index].reserve(output.size);
-        for (int i = 0; i < output.size; ++i) {
+        int i;
+        for (i = 0; i < output.size; ++i) {
           frame_buffer[frame_index].push_back(static_cast<T>(output.values[i]) /
                                               out_scale_);
         }
@@ -261,9 +262,10 @@ class AudioMicrofrontendOp : public OpKernel {
 
     int index = 0;
     std::vector<T> pad(config_.filterbank.num_channels, 0);
-    for (int anchor = 0; anchor < frame_buffer.size();
-         anchor += frame_stride_) {
-      for (int frame = anchor - left_context_; frame <= anchor + right_context_;
+    int anchor;
+    for (anchor = 0; anchor < frame_buffer.size(); anchor += frame_stride_) {
+      int frame;
+      for (frame = anchor - left_context_; frame <= anchor + right_context_;
            ++frame) {
         std::vector<T>* feature;
         if (zero_padding_ && (frame < 0 || frame >= frame_buffer.size())) {
diff --git a/tensorflow/lite/experimental/microfrontend/python/kernel_tests/audio_microfrontend_op_test.py b/tensorflow/lite/experimental/microfrontend/python/kernel_tests/audio_microfrontend_op_test.py
index 561f5f7a50e0207ab64fd06211e94e406208e894..3ce861707fda767a3ec1c6e2d23e6a70c6131f24 100644
--- a/tensorflow/lite/experimental/microfrontend/python/kernel_tests/audio_microfrontend_op_test.py
+++ b/tensorflow/lite/experimental/microfrontend/python/kernel_tests/audio_microfrontend_op_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import tensorflow as tf
 
 from tensorflow.lite.experimental.microfrontend.python.ops import audio_microfrontend_op as frontend_op
+from tensorflow.python.framework import test_util
 
 SAMPLE_RATE = 1000
 WINDOW_SIZE = 25
@@ -33,6 +34,7 @@ SMOOTHING_BITS = 10
 
 class AudioFeatureGenerationTest(tf.test.TestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testSimple(self):
     with self.test_session():
       audio = tf.constant(
@@ -51,6 +53,7 @@ class AudioFeatureGenerationTest(tf.test.TestCase):
       self.assertAllEqual(filterbanks.eval(),
                           [[479, 425], [436, 378], [410, 350], [391, 325]])
 
+  @test_util.run_v1_only("b/120545219")
   def testSimpleFloatScaled(self):
     with self.test_session():
       audio = tf.constant(
@@ -72,6 +75,7 @@ class AudioFeatureGenerationTest(tf.test.TestCase):
                           [[7.484375, 6.640625], [6.8125, 5.90625],
                            [6.40625, 5.46875], [6.109375, 5.078125]])
 
+  @test_util.run_v1_only("b/120545219")
   def testStacking(self):
     with self.test_session():
       audio = tf.constant(
@@ -114,6 +118,7 @@ class AudioFeatureGenerationTest(tf.test.TestCase):
           [[479, 425, 479, 425, 436, 378], [479, 425, 436, 378, 410, 350],
            [436, 378, 410, 350, 391, 325], [410, 350, 391, 325, 391, 325]])
 
+  @test_util.run_v1_only("b/120545219")
   def testStackingDropFrame(self):
     with self.test_session():
       audio = tf.constant(
diff --git a/tensorflow/lite/g3doc/_book.yaml b/tensorflow/lite/g3doc/_book.yaml
index a51c7a667f355f04e272d9868f996225444557fb..0c79e79fddbd43b5a7340ea334ba4011a8c540ac 100644
--- a/tensorflow/lite/g3doc/_book.yaml
+++ b/tensorflow/lite/g3doc/_book.yaml
@@ -77,8 +77,36 @@ upper_tabs:
         - title: Optimizing for mobile
           path: /lite/tfmobile/optimizing
 
+    # - name: Models
+    #   contents:
+    #   - title: Overview
+    #     path: /lite/models/
+    #   - heading: Beginner
+    #     style: divider
+    #   - title: Image labeling
+    #     section:
+    #     - title: Overview
+    #       path: /lite/models/image/label/overview
+    #     - title: Android
+    #       path: /lite/models/image/label/android
+    #     - title: iOS
+    #       path: /lite/models/image/label/ios
+    #   - heading: Advanced
+    #     style: divider
+    #   - heading: Image
+    #   - title: Image classification
+    #     path: /lite/models/image/classification/
+    #   - heading: Audio
+    #   - title: Hot word detection
+    #     path: /lite/models/audio/hot_word/
+    #   - heading: Text
+    #   - title: Text classification
+    #     path: /lite/models/text/classification/
+
     - name: API
       skip_translation: true
       contents:
       - title: API
         path: /api_docs/python/tf/lite
+
+- include: /_upper_tabs_right.yaml
diff --git a/tensorflow/lite/g3doc/apis.md b/tensorflow/lite/g3doc/apis.md
index e9fa24bff1d1a3d2b8e6a62f061245289afabcd1..b15159ce4145727863c335126557e06402f8dbd3 100644
--- a/tensorflow/lite/g3doc/apis.md
+++ b/tensorflow/lite/g3doc/apis.md
@@ -304,6 +304,13 @@ one of the following primitive types:
 *   `long`
 *   `byte`
 
+`String` types are also supported, but they are encoded differently than the
+primitive types. In particular, the shape of a string Tensor dictates the number
+and arrangement of strings in the Tensor, with each element itself being a
+variable length string. In this sense, the (byte) size of the Tensor cannot be
+computed from the shape and type alone, and consequently strings cannot be
+provided as a single, flat `ByteBuffer` argument.
+
 If other data types, including boxed types like `Integer` and `Float`, are used,
 an `IllegalArgumentException` will be thrown.
 
@@ -345,13 +352,12 @@ interpreter.runForMultipleInputsOutputs(inputs, map_of_indices_to_outputs);
 ```
 
 where each entry in `inputs` corresponds to an input tensor and
-`map_of_indices_to_outputs` maps indices of output tensors to the
-corresponding output data. In both cases the tensor indices should correspond to
-the values given to the [TensorFlow Lite Optimized Converter](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/toco/g3doc/cmdline_examples.md)
+`map_of_indices_to_outputs` maps indices of output tensors to the corresponding
+output data. In both cases the tensor indices should correspond to the values
+given to the [TensorFlow Lite Optimized Converter](convert/cmdline_examples.md)
 when the model was created. Be aware that the order of tensors in `input` must
 match the order given to the `TensorFlow Lite Optimized Converter`.
 
-
 The Java API also provides convenient functions for app developers to get the
 index of any model input or output using a tensor name:
 
diff --git a/tensorflow/lite/g3doc/convert/cmdline_examples.md b/tensorflow/lite/g3doc/convert/cmdline_examples.md
index 59f26b35051ce2ec410e25a5c877344ffe96dc45..169f2d91d8a72278ff61f170f0b450885e4c2c93 100644
--- a/tensorflow/lite/g3doc/convert/cmdline_examples.md
+++ b/tensorflow/lite/g3doc/convert/cmdline_examples.md
@@ -94,9 +94,10 @@ tflite_convert \
 ### Convert a TensorFlow GraphDef for quantized inference <a name="graphdef_quant"></a>
 
 The TensorFlow Lite Converter is compatible with fixed point quantization models
-described [here](https://www.tensorflow.org/performance/quantization). These are
-float models with `FakeQuant*` ops inserted at the boundaries of fused layers
-to record min-max range information. This generates a quantized inference
+described
+[here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/quantize/README.md).
+These are float models with `FakeQuant*` ops inserted at the boundaries of fused
+layers to record min-max range information. This generates a quantized inference
 workload that reproduces the quantization behavior that was used during
 training.
 
diff --git a/tensorflow/lite/g3doc/convert/python_api.md b/tensorflow/lite/g3doc/convert/python_api.md
index b914a34fa87a38c87b30dc38cde74d8d94eccbce..4d2c7361c9f399848c161ccc706c71894625725d 100644
--- a/tensorflow/lite/g3doc/convert/python_api.md
+++ b/tensorflow/lite/g3doc/convert/python_api.md
@@ -19,9 +19,9 @@ be targeted to devices with mobile.
 
 ## API
 
-The API for converting TensorFlow models to TensorFlow Lite as of TensorFlow 1.9
-is `tf.lite.TFLiteConverter`. The API for calling the Python intepreter
-is `tf.lite.Interpreter`.
+The API for converting TensorFlow models to TensorFlow Lite is
+`tf.lite.TFLiteConverter`. The API for calling the Python interpreter is
+`tf.lite.Interpreter`.
 
 `TFLiteConverter` provides class methods based on the original format of the
 model. `TFLiteConverter.from_session()` is available for GraphDefs.
diff --git a/tensorflow/lite/g3doc/models.md b/tensorflow/lite/g3doc/models.md
index 537e285490f905730d9aa5fc61faefae6556b7d9..62b3f17c79aa3688011a1452da18e098008f414e 100644
--- a/tensorflow/lite/g3doc/models.md
+++ b/tensorflow/lite/g3doc/models.md
@@ -76,8 +76,11 @@ Mobilenet_V1_1.0_128_quant  | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tf
 Mobilenet_V1_1.0_160_quant  | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_160_quant.tgz)  | 4.3 Mb     | 66.9%          | 86.7%          | 37.4 ms
 Mobilenet_V1_1.0_192_quant  | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_192_quant.tgz)  | 4.3 Mb     | 69.1%          | 88.1%          | 51.9 ms
 Mobilenet_V1_1.0_224_quant  | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz)  | 4.3 Mb     | 70.0%          | 89.0%          | 70.2 ms
-Mobilenet_v2_1.0_224_quant  | [paper](https://arxiv.org/abs/1806.08342), [tflite&pb](http://download.tensorflow.org/models/tflite_11_05_08/mobilenet_v2_1.0_224_quant.tgz)              | 3.4 Mb     | 70.8%          | 89.9%          | 80.3 ms
-Inception_v3_quant          | [paper](https://arxiv.org/abs/1806.08342),[tflite&pb](http://download.tensorflow.org/models/tflite_11_05_08/inception_v3_quant.tgz)                       | 23 Mb      | 77.5%          | 93.7%          | 637 ms
+Mobilenet_V2_1.0_224_quant  | [paper](https://arxiv.org/abs/1806.08342), [tflite&pb](http://download.tensorflow.org/models/tflite_11_05_08/mobilenet_v2_1.0_224_quant.tgz)              | 3.4 Mb     | 70.8%          | 89.9%          | 80.3 ms
+Inception_V1_quant          | [paper](https://arxiv.org/abs/1409.4842), [tflite&pb](http://download.tensorflow.org/models/inception_v1_224_quant_20181026.tgz)                          | 6.4 Mb     | 70.1%          | 89.8%          | 154.5 ms
+Inception_V2_quant          | [paper](https://arxiv.org/abs/1512.00567), [tflite&pb](http://download.tensorflow.org/models/inception_v2_224_quant_20181026.tgz)                         | 11 Mb      | 73.5%          | 91.4%          | 235.0 ms
+Inception_V3_quant          | [paper](https://arxiv.org/abs/1806.08342),[tflite&pb](http://download.tensorflow.org/models/tflite_11_05_08/inception_v3_quant.tgz)                       | 23 Mb      | 77.5%          | 93.7%          | 637 ms
+Inception_V4_quant          | [paper](https://arxiv.org/abs/1602.07261), [tflite&pb](http://download.tensorflow.org/models/inception_v4_299_quant_20181026.tgz)                         | 41 Mb      | 79.5%          | 93.9%          | 1250.8 ms
 
 ## Other models
 
diff --git a/tensorflow/lite/g3doc/models/_index.yaml b/tensorflow/lite/g3doc/models/_index.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..f4d8bc40a9325b12734022e005996e13dba0a0d6
--- /dev/null
+++ b/tensorflow/lite/g3doc/models/_index.yaml
@@ -0,0 +1,125 @@
+project_path: /lite/_project.yaml
+book_path: /lite/_book.yaml
+description: <!--no description-->
+landing_page:
+  body_class: tfo-hide-page-nav
+  custom_css_path: /site-assets/css/style.css
+  show_side_navs: true
+  rows:
+
+  # Hero
+  - classname: >
+      devsite-landing-row-50
+      devsite-landing-row-large-headings
+      devsite-landing-row-no-image-background
+    foreground: theme
+    items:
+    - heading: Models marketplace
+      description: >
+        The TensorFlow Lite models marketplace, your neighborhood model shoppe.
+      image_path: /resources/images/tflite-card-16x9.png
+
+  # Features
+  - background: grey
+    items:
+    - heading: Optimized for mobile
+      description: >
+        Machine learning can make your apps more engaging, personalized, and
+        helpful, and provides solutions that are optimized to run on-device.
+    - heading: Built with Google expertise
+      description: >
+        Models offer the technologies that have long powered Google's own
+        experiences on mobile.
+    - heading: Approachable and comprehensive
+      description: >
+        Use out-of-the-box solutions (base APIs) or custom models, running
+        on-device or in the Cloud, depending on your specific needs.
+
+  # Beginner models
+  - classname: devsite-landing-row-100
+    heading: "Build machine learning into your apps"
+    items:
+    - heading: >
+        Image labeling
+      description: >
+        Identify objects, locations, activities, animal species, products, and
+        more
+      icon:
+        path: ../images/landing-page/assistant_logo.png
+      path: /lite/image/labeling/
+    - heading: >
+        Text recognition (OCR)
+      description: >
+        Recognize and extract text from images
+      icon:
+        path: ../images/landing-page/assistant_logo.png
+      path: /lite/image/labeling/
+    - heading: >
+        Face detection
+      description: >
+        Detect faces and facial landmarks
+      icon:
+        path: ../images/landing-page/assistant_logo.png
+      path: /lite/image/labeling/
+
+  - items:
+    - heading: >
+        Barcode scanning
+      description: >
+        Scan and process barcodes
+      icon:
+        path: ../images/landing-page/assistant_logo.png
+      path: /lite/image/labeling/
+    - heading: >
+        Landmark detection
+      description: >
+        Identify popular landmarks in an image
+      icon:
+        path: ../images/landing-page/assistant_logo.png
+      path: /lite/image/labeling/
+    - heading: >
+        Smart reply
+      description: >
+        Provide suggested text snippet that fits context
+      icon:
+        path: ../images/landing-page/assistant_logo.png
+      path: /lite/image/labeling/
+
+  # Custom models
+  - classname: >
+      devsite-landing-row-no-image-background
+      devsite-landing-row-50
+      devsite-landing-row-large-headings
+    foreground: theme
+    background: grey
+    items:
+    - heading: Custom models
+      description: >
+        <p>If models don’t cover your use cases, you can always
+        bring your own existing TensorFlow Lite models. Just upload your model,
+        and we’ll take care of hosting and serving it to your app.</p>
+
+        <p>Models acts as an API layer to your custom model, making it easy to
+        run and use. In addition to deploying your models, we are releasing an
+        experimental model compression flow that aims to reduce model size (up
+        to orders of magnitudes) while maintaining similar accuracy. Sign up at
+        <a href="https://g.co/firebase/signup">g.co/firebase/signup</a></p>
+
+        <p>And if you’re new to machine learning and want more information on
+        custom models for mobile, you can <a
+        href="//www.tensorflow.org/lite/">learn more about TensorFlow
+        Lite.</a></p>
+      image_path: /resources/images/tflite-card-16x9.png
+      image_left: true
+  - classname: devsite-landing-row-large-headings
+    foreground: theme
+    items:
+    - heading: Just the beginning
+      description: >
+        Our ultimate goal is to reduce idea–to–implementation cycles and make AI
+        an essential and intuitive part of a developer's toolkit. We will do so
+        by continuing to add new Base APIs that leverage Google’s machine
+        learning expertise. Base APIs will ultimately cover significantly more
+        use cases in the vision, speech, and text fields. We will also continue
+        to simplify use of custom models, adding tools to deploy, compress, and
+        create them.
diff --git a/tensorflow/lite/g3doc/models/image/label/android.md b/tensorflow/lite/g3doc/models/image/label/android.md
new file mode 100644
index 0000000000000000000000000000000000000000..9cd54aad1e933823eab169b313fdd6232dd16aa1
--- /dev/null
+++ b/tensorflow/lite/g3doc/models/image/label/android.md
@@ -0,0 +1,3 @@
+# Android
+
+lorem
diff --git a/tensorflow/lite/g3doc/models/image/label/ios.md b/tensorflow/lite/g3doc/models/image/label/ios.md
new file mode 100644
index 0000000000000000000000000000000000000000..904c6450ac7272e67c1982b56099b608b91e2237
--- /dev/null
+++ b/tensorflow/lite/g3doc/models/image/label/ios.md
@@ -0,0 +1,3 @@
+# iOS
+
+lorem
diff --git a/tensorflow/lite/g3doc/models/image/label/overview.md b/tensorflow/lite/g3doc/models/image/label/overview.md
new file mode 100644
index 0000000000000000000000000000000000000000..b3d9133bb2123012f2ddd2db768347305d224744
--- /dev/null
+++ b/tensorflow/lite/g3doc/models/image/label/overview.md
@@ -0,0 +1,8 @@
+# Overview
+
+Image labeling gives you insight into the content of images. When you use the
+API, you get a list of the entities that were recognized: people, things,
+places, activities, and so on. Each label found comes with a score that
+indicates the confidence the ML model has in its relevance. With this
+information, you can perform tasks such as automatic metadata generation
+and content moderation.
diff --git a/tensorflow/lite/g3doc/tf_ops_compatibility.md b/tensorflow/lite/g3doc/tf_ops_compatibility.md
index 6976f058d1346a13b2a2ac8f98089dfa68d8475b..dcfda72137cafbc676dec2fb5dbf5da8ab8cb45a 100644
--- a/tensorflow/lite/g3doc/tf_ops_compatibility.md
+++ b/tensorflow/lite/g3doc/tf_ops_compatibility.md
@@ -989,6 +989,18 @@ Outputs {
 }
 ```
 
+**FILL**
+
+```
+Inputs {
+  0: A Tensor. Must be one of the following types: int32, int64. 1-D. Represents the shape of the output tensor.
+  1: A Tensor. 0-D (scalar). Value to fill the returned tensor.
+}
+Outputs {
+  0: A tensor of the same type as value (input1).
+}
+```
+
 And these are TensorFlow Lite operations that are present but not ready for
 custom models yet:
 
diff --git a/tensorflow/lite/interpreter_test.cc b/tensorflow/lite/interpreter_test.cc
index 09b8832b4d2a3b84e06a1206e5fb2a39e2fc3c47..78b5d1b8873b8b3558b098031ffa33c7857a31e5 100644
--- a/tensorflow/lite/interpreter_test.cc
+++ b/tensorflow/lite/interpreter_test.cc
@@ -566,7 +566,7 @@ TEST(BasicInterpreter, ThreeStepAllocate) {
     DynamicBuffer buf;
     StringRef str_ref = GetString(input, 0);
     buf.AddString(str_ref);
-    buf.WriteToTensor(output);
+    buf.WriteToTensorAsVector(output);
     return kTfLiteOk;
   };
 
@@ -1090,10 +1090,10 @@ class TestDelegate : public ::testing::Test {
         TfLiteIntArrayFree(nodes_to_separate);
         return kTfLiteOk;
       };
-      delegate_.CopyToBufferHandle =
-          [](TfLiteContext* context, TfLiteDelegate* delegate,
-             TfLiteBufferHandle buffer_handle, void* data,
-             size_t size) -> TfLiteStatus {
+      delegate_.CopyToBufferHandle = [](TfLiteContext* context,
+                                        TfLiteDelegate* delegate,
+                                        TfLiteBufferHandle buffer_handle,
+                                        TfLiteTensor* tensor) -> TfLiteStatus {
         // TODO(ycling): Implement tests to test buffer copying logic.
         return kTfLiteOk;
       };
diff --git a/tensorflow/lite/java/demo/app/build.gradle b/tensorflow/lite/java/demo/app/build.gradle
index 05301ebf88c12cc95f71d5efd74062d76e598e1d..b8fc282cb1dfe8a9c80692759e985bf369fc163d 100644
--- a/tensorflow/lite/java/demo/app/build.gradle
+++ b/tensorflow/lite/java/demo/app/build.gradle
@@ -40,6 +40,15 @@ repositories {
         url 'https://google.bintray.com/tensorflow'
     }
 }
+allprojects {
+    repositories {
+        // Uncomment if you want to use a local repo.
+        // mavenLocal()
+        jcenter()
+    }
+}
+
+
 
 dependencies {
     compile fileTree(dir: 'libs', include: ['*.jar'])
@@ -49,31 +58,66 @@ dependencies {
     compile 'com.android.support:support-annotations:25.3.1'
     compile 'com.android.support:support-v13:25.2.0'
 
+    // Build off of nightly TensorFlow Lite
     compile 'org.tensorflow:tensorflow-lite:0.0.0-nightly'
+    // Use local TensorFlow library
+    // compile 'org.tensorflow:tensorflow-lite-local:0.0.0'
 }
 
-def modelDownloadUrl = "https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip"
-def localCache = "build/intermediates/mobilenet_v1_224_android_quant_2017_11_08.zip"
 def targetFolder = "src/main/assets"
+def modelFloatDownloadUrl = "http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_224.tgz"
+def modelQuantDownloadUrl = "http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz"
+def localCacheFloat = "build/intermediates/mobilenet_v1_1.0_224.tgz"
+def localCacheQuant = "build/intermediates/mmobilenet_v1_1.0_224_quant.tgz"
+
 
-task downloadModel(type: DownloadUrlTask) {
+task downloadModelFloat(type: DownloadUrlTask) {
     doFirst {
-        println "Downloading ${modelDownloadUrl}"
+        println "Downloading ${modelFloatDownloadUrl}"
     }
-    sourceUrl = "${modelDownloadUrl}"
-    target = file("${localCache}")
+    sourceUrl = "${modelFloatDownloadUrl}"
+    target = file("${localCacheFloat}")
 }
 
-task unzipModel(type: Copy, dependsOn: 'downloadModel') {
+task downloadModelQuant(type: DownloadUrlTask) {
     doFirst {
-        println "Unzipping ${localCache}"
+        println "Downloading ${modelQuantDownloadUrl}"
     }
-    from zipTree("${localCache}")
+    sourceUrl = "${modelQuantDownloadUrl}"
+    target = file("${localCacheQuant}")
+}
+
+task unzipModelFloat(type: Copy, dependsOn: 'downloadModelFloat') {
+    doFirst {
+        println "Unzipping ${localCacheFloat}"
+    }
+    from tarTree("${localCacheFloat}")
     into "${targetFolder}"
 }
 
+task unzipModelQuant(type: Copy, dependsOn: 'downloadModelQuant') {
+    doFirst {
+        println "Unzipping ${localCacheQuant}"
+    }
+    from tarTree("${localCacheQuant}")
+    into "${targetFolder}"
+}
+
+task cleanUnusedFiles(type: Delete, dependsOn: ['unzipModelFloat', 'unzipModelQuant']) {
+    delete fileTree("${targetFolder}").matching {
+        include "*.pb"
+        include "*.ckpt.*"
+        include "*.pbtxt.*"
+        include "*.quant_info.*"
+        include "*.meta"
+    }
+}
+
+
 // Ensure the model file is downloaded and extracted before every build
-preBuild.dependsOn unzipModel
+preBuild.dependsOn unzipModelFloat
+preBuild.dependsOn unzipModelQuant
+preBuild.dependsOn cleanUnusedFiles
 
 class DownloadUrlTask extends DefaultTask {
     @Input
@@ -87,3 +131,4 @@ class DownloadUrlTask extends DefaultTask {
         ant.get(src: sourceUrl, dest: target)
     }
 }
+
diff --git a/tensorflow/lite/java/demo/app/src/main/BUILD b/tensorflow/lite/java/demo/app/src/main/BUILD
index df8a024a570fe071c808bcd70167221f8c8fd8cc..9a7c1d0b61192c61896813f41b2db1e03ff65ecb 100644
--- a/tensorflow/lite/java/demo/app/src/main/BUILD
+++ b/tensorflow/lite/java/demo/app/src/main/BUILD
@@ -10,7 +10,8 @@ android_binary(
     aapt_version = "aapt",
     assets = [
         "//tensorflow/lite/java/demo/app/src/main/assets:labels_mobilenet_quant_v1_224.txt",
-        "@tflite_mobilenet//:mobilenet_quant_v1_224.tflite",
+        "@tflite_mobilenet_quant//:mobilenet_v1_1.0_224_quant.tflite",
+        "@tflite_mobilenet_float//:mobilenet_v1_1.0_224.tflite",
     ],
     assets_dir = "",
     custom_package = "com.example.android.tflitecamerademo",
diff --git a/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java b/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java
index 3596e4201150abaecc1cd8fdd736510a0afc97bb..165d33510131ac9c9fc08070f0a4d08653188fae 100644
--- a/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java
+++ b/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java
@@ -56,11 +56,12 @@ import android.view.Surface;
 import android.view.TextureView;
 import android.view.View;
 import android.view.ViewGroup;
-import android.widget.CompoundButton;
+import android.widget.AdapterView;
+import android.widget.ArrayAdapter;
+import android.widget.ListView;
 import android.widget.NumberPicker;
 import android.widget.TextView;
 import android.widget.Toast;
-import android.widget.ToggleButton;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
@@ -70,6 +71,7 @@ import java.util.List;
 import java.util.concurrent.Semaphore;
 import java.util.concurrent.TimeUnit;
 
+
 /** Basic fragments for the Camera. */
 public class Camera2BasicFragment extends Fragment
     implements FragmentCompat.OnRequestPermissionsResultCallback {
@@ -87,9 +89,11 @@ public class Camera2BasicFragment extends Fragment
   private boolean runClassifier = false;
   private boolean checkedPermissions = false;
   private TextView textView;
-  private ToggleButton toggle;
   private NumberPicker np;
   private ImageClassifier classifier;
+  private ListView deviceView;
+  private ListView modelView;
+
 
   /** Max preview width that is guaranteed by Camera2 API */
   private static final int MAX_PREVIEW_WIDTH = 1920;
@@ -123,6 +127,15 @@ public class Camera2BasicFragment extends Fragment
         public void onSurfaceTextureUpdated(SurfaceTexture texture) {}
       };
 
+  // Model parameter constants.
+  private String gpu;
+  private String cpu;
+  private String nnApi;
+  private String mobilenetV1Quant;
+  private String mobilenetV1Float;
+
+
+
   /** ID of the current {@link CameraDevice}. */
   private String cameraId;
 
@@ -169,6 +182,14 @@ public class Camera2BasicFragment extends Fragment
         }
       };
 
+  private ArrayList<String> deviceStrings = new ArrayList<String>();
+  private ArrayList<String> modelStrings = new ArrayList<String>();
+
+  /** Current indices of device and model. */
+  int currentDevice = -1;
+
+  int currentModel = -1;
+
   /** An additional thread for running tasks that shouldn't block the UI. */
   private HandlerThread backgroundThread;
 
@@ -298,17 +319,113 @@ public class Camera2BasicFragment extends Fragment
     return inflater.inflate(R.layout.fragment_camera2_basic, container, false);
   }
 
+  private void updateActiveModel() {
+    // Get UI information before delegating to background
+    final int modelIndex = modelView.getCheckedItemPosition();
+    final int deviceIndex = deviceView.getCheckedItemPosition();
+
+    backgroundHandler.post(() -> {
+      if (modelIndex == currentModel && deviceIndex == currentDevice) {
+        return;
+      }
+      currentModel = modelIndex;
+      currentDevice = deviceIndex;
+
+      // Disable classifier while updating
+      if (classifier != null) {
+        classifier.close();
+        classifier = null;
+      }
+
+      // Lookup names of parameters.
+      String model = modelStrings.get(modelIndex);
+      String device = deviceStrings.get(deviceIndex);
+
+      Log.i(TAG, "Changing model to " + model + " device " + device);
+
+      // Try to load model.
+      try {
+        if (model.equals(mobilenetV1Quant)) {
+          classifier = new ImageClassifierQuantizedMobileNet(getActivity());
+        } else if (model.equals(mobilenetV1Float)) {
+          classifier = new ImageClassifierFloatMobileNet(getActivity());
+        } else {
+          showToast("Failed to load model");
+        }
+      } catch (IOException e) {
+        Log.d(TAG, "Failed to load", e);
+        classifier = null;
+      }
+
+      // Customzie the interpreter to the type of device we want to use.
+      if (device.equals(cpu)) {
+      } else if (device.equals(gpu)) {
+        if (!GpuDelegateHelper.isGpuDelegateAvailable()) {
+          showToast("gpu not in this build.");
+          classifier = null;
+        } else if (model.equals(mobilenetV1Quant)) {
+          showToast("gpu requires float model.");
+          classifier = null;
+        } else {
+          classifier.useGpu();
+        }
+      } else if (device.equals(nnApi)) {
+        classifier.useNNAPI();
+      }
+    });
+  }
+
   /** Connect the buttons to their event handler. */
   @Override
   public void onViewCreated(final View view, Bundle savedInstanceState) {
+    gpu = getString(R.string.gpu);
+    cpu = getString(R.string.cpu);
+    nnApi = getString(R.string.nnapi);
+    mobilenetV1Quant = getString(R.string.mobilenetV1Quant);
+    mobilenetV1Float = getString(R.string.mobilenetV1Float);
+
+    // Get references to widgets.
     textureView = (AutoFitTextureView) view.findViewById(R.id.texture);
     textView = (TextView) view.findViewById(R.id.text);
-    toggle = (ToggleButton) view.findViewById(R.id.button);
-
-    toggle.setOnCheckedChangeListener(
-        new CompoundButton.OnCheckedChangeListener() {
-          public void onCheckedChanged(CompoundButton buttonView, boolean isChecked) {
-            backgroundHandler.post(() -> classifier.setUseNNAPI(isChecked));
+    deviceView = (ListView) view.findViewById(R.id.device);
+    modelView = (ListView) view.findViewById(R.id.model);
+
+    // Build list of models
+    modelStrings.add(mobilenetV1Quant);
+    modelStrings.add(mobilenetV1Float);
+
+    // Build list of devices
+    int defaultModelIndex = 0;
+    deviceStrings.add(cpu);
+    if (GpuDelegateHelper.isGpuDelegateAvailable()) {
+      deviceStrings.add(gpu);
+    }
+    deviceStrings.add(nnApi);
+
+    deviceView.setAdapter(
+        new ArrayAdapter<String>(
+            getContext(), R.layout.listview_row, R.id.listview_row_text, deviceStrings));
+    deviceView.setChoiceMode(ListView.CHOICE_MODE_SINGLE);
+    deviceView.setOnItemClickListener(
+        new AdapterView.OnItemClickListener() {
+          @Override
+          public void onItemClick(AdapterView<?> parent, View view, int position, long id) {
+            updateActiveModel();
+          }
+        });
+    deviceView.setItemChecked(0, true);
+
+    modelView.setChoiceMode(ListView.CHOICE_MODE_SINGLE);
+    ArrayAdapter<String> modelAdapter =
+        new ArrayAdapter<>(
+            getContext(), R.layout.listview_row, R.id.listview_row_text, modelStrings);
+    modelView.setAdapter(modelAdapter);
+    modelView.setItemChecked(defaultModelIndex, true);
+    modelView.setOnItemClickListener(
+        new AdapterView.OnItemClickListener() {
+          @Override
+          public void onItemClick(AdapterView<?> parent, View view, int position, long id) {
+            updateActiveModel();
           }
         });
 
@@ -323,18 +440,14 @@ public class Camera2BasicFragment extends Fragment
             backgroundHandler.post(() -> classifier.setNumThreads(newVal));
           }
         });
+
+    // Start initial model.
   }
 
   /** Load the model and labels. */
   @Override
   public void onActivityCreated(Bundle savedInstanceState) {
     super.onActivityCreated(savedInstanceState);
-    try {
-      // create either a new ImageClassifierQuantizedMobileNet or an ImageClassifierFloatInception
-      classifier = new ImageClassifierQuantizedMobileNet(getActivity());
-    } catch (IOException e) {
-      Log.e(TAG, "Failed to initialize an image classifier.", e);
-    }
     startBackgroundThread();
   }
 
@@ -562,10 +675,12 @@ public class Camera2BasicFragment extends Fragment
     backgroundThread = new HandlerThread(HANDLE_THREAD_NAME);
     backgroundThread.start();
     backgroundHandler = new Handler(backgroundThread.getLooper());
+    // Start the classification train & load an initial model.
     synchronized (lock) {
       runClassifier = true;
     }
     backgroundHandler.post(periodicClassify);
+    updateActiveModel();
   }
 
   /** Stops the background thread and its {@link Handler}. */
diff --git a/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/GpuDelegateHelper.java b/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/GpuDelegateHelper.java
new file mode 100644
index 0000000000000000000000000000000000000000..8dca17744eb7a3d1e69612abf61deafb6370e4ff
--- /dev/null
+++ b/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/GpuDelegateHelper.java
@@ -0,0 +1,49 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package com.example.android.tflitecamerademo;
+
+import org.tensorflow.lite.Delegate;
+
+/**
+ * Helper class for {@code GpuDelegate}.
+ *
+ * <p>WARNING: This is an experimental API and subject to change.
+ */
+public class GpuDelegateHelper {
+  private GpuDelegateHelper() {}
+
+  /** Checks whether {@code GpuDelegate} is available. */
+  public static boolean isGpuDelegateAvailable() {
+    try {
+      Class.forName("org.tensorflow.lite.experimental.GpuDelegate");
+      return true;
+    } catch (Exception e) {
+      return false;
+    }
+  }
+
+  /** Returns an instance of {@code GpuDelegate} if available. */
+  public static Delegate createGpuDelegate() {
+    try {
+      return Class.forName("org.tensorflow.lite.experimental.GpuDelegate")
+          .asSubclass(Delegate.class)
+          .getDeclaredConstructor()
+          .newInstance();
+    } catch (Exception e) {
+      throw new IllegalStateException(e);
+    }
+  }
+}
diff --git a/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java b/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java
index 39057aa7768c54fb0f7b48211823730dc6217a70..512f8b64db1637385e7be56db6d0889c44abb2fb 100644
--- a/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java
+++ b/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java
@@ -38,6 +38,7 @@ import java.util.Comparator;
 import java.util.List;
 import java.util.Map;
 import java.util.PriorityQueue;
+import org.tensorflow.lite.Delegate;
 import org.tensorflow.lite.Interpreter;
 
 /**
@@ -93,6 +94,9 @@ public abstract class ImageClassifier {
             }
           });
 
+  /** holds a gpu delegate */
+  Delegate gpuDelegate = null;
+
   /** Initializes an {@code ImageClassifier}. */
   ImageClassifier(Activity activity) throws IOException {
     tfliteModel = loadModelFile(activity);
@@ -159,12 +163,27 @@ public abstract class ImageClassifier {
   private void recreateInterpreter() {
     if (tflite != null) {
       tflite.close();
+      // TODO(b/120679982)
+      // gpuDelegate.close();
       tflite = new Interpreter(tfliteModel, tfliteOptions);
     }
   }
 
-  public void setUseNNAPI(Boolean nnapi) {
-    tfliteOptions.setUseNNAPI(nnapi);
+  public void useGpu() {
+    if (gpuDelegate == null && GpuDelegateHelper.isGpuDelegateAvailable()) {
+      gpuDelegate = GpuDelegateHelper.createGpuDelegate();
+      tfliteOptions.addDelegate(gpuDelegate);
+      recreateInterpreter();
+    }
+  }
+
+  public void useCPU() {
+    tfliteOptions.setUseNNAPI(false);
+    recreateInterpreter();
+  }
+
+  public void useNNAPI() {
+    tfliteOptions.setUseNNAPI(true);
     recreateInterpreter();
   }
 
diff --git a/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierFloatMobileNet.java b/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierFloatMobileNet.java
new file mode 100644
index 0000000000000000000000000000000000000000..c87ffff8f6c39dc1d87c2cf0c09b5602edd9329c
--- /dev/null
+++ b/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierFloatMobileNet.java
@@ -0,0 +1,94 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package com.example.android.tflitecamerademo;
+
+import android.app.Activity;
+import java.io.IOException;
+
+/** This classifier works with the float MobileNet model. */
+public class ImageClassifierFloatMobileNet extends ImageClassifier {
+
+  /**
+   * An array to hold inference results, to be feed into Tensorflow Lite as outputs. This isn't part
+   * of the super class, because we need a primitive array here.
+   */
+  private float[][] labelProbArray = null;
+
+  /**
+   * Initializes an {@code ImageClassifierFloatMobileNet}.
+   *
+   * @param activity
+   */
+  ImageClassifierFloatMobileNet(Activity activity) throws IOException {
+    super(activity);
+    labelProbArray = new float[1][getNumLabels()];
+  }
+
+  @Override
+  protected String getModelPath() {
+    // you can download this file from
+    // see build.gradle for where to obtain this file. It should be auto
+    // downloaded into assets.
+    return "mobilenet_v1_1.0_224.tflite";
+  }
+
+  @Override
+  protected String getLabelPath() {
+    return "labels_mobilenet_quant_v1_224.txt";
+  }
+
+  @Override
+  protected int getImageSizeX() {
+    return 224;
+  }
+
+  @Override
+  protected int getImageSizeY() {
+    return 224;
+  }
+
+  @Override
+  protected int getNumBytesPerChannel() {
+    return 4; // Float.SIZE / Byte.SIZE;
+  }
+
+  @Override
+  protected void addPixelValue(int pixelValue) {
+    imgData.putFloat(((pixelValue >> 16) & 0xFF) / 255.f);
+    imgData.putFloat(((pixelValue >> 8) & 0xFF) / 255.f);
+    imgData.putFloat((pixelValue & 0xFF) / 255.f);
+  }
+
+  @Override
+  protected float getProbability(int labelIndex) {
+    return labelProbArray[0][labelIndex];
+  }
+
+  @Override
+  protected void setProbability(int labelIndex, Number value) {
+    labelProbArray[0][labelIndex] = value.floatValue();
+  }
+
+  @Override
+  protected float getNormalizedProbability(int labelIndex) {
+    return labelProbArray[0][labelIndex];
+  }
+
+  @Override
+  protected void runInference() {
+    tflite.run(imgData, labelProbArray);
+  }
+}
diff --git a/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierQuantizedMobileNet.java b/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierQuantizedMobileNet.java
index e164ac75543ebab12e6b1c057c4ed487eb9accdf..6310a5616838ac6b4258ec05028efa12e8cadab5 100644
--- a/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierQuantizedMobileNet.java
+++ b/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierQuantizedMobileNet.java
@@ -42,8 +42,9 @@ public class ImageClassifierQuantizedMobileNet extends ImageClassifier {
   @Override
   protected String getModelPath() {
     // you can download this file from
-    // https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip
-    return "mobilenet_quant_v1_224.tflite";
+    // see build.gradle for where to obtain this file. It should be auto
+    // downloaded into assets.
+    return "mobilenet_v1_1.0_224_quant.tflite";
   }
 
   @Override
diff --git a/tensorflow/lite/java/demo/app/src/main/res/drawable/item_selector.xml b/tensorflow/lite/java/demo/app/src/main/res/drawable/item_selector.xml
new file mode 100644
index 0000000000000000000000000000000000000000..202c900769fdd3be15d6b1252d5c2c4f7f728d8c
--- /dev/null
+++ b/tensorflow/lite/java/demo/app/src/main/res/drawable/item_selector.xml
@@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="utf-8"?>
+<selector xmlns:android="http://schemas.android.com/apk/res/android">
+
+    <!-- pressed -->
+    <item android:drawable="@color/selection_highlight" android:state_pressed="true" />
+    <!-- focused -->
+    <item android:drawable="@color/selection_focus" android:state_activated="true" />
+    <!-- default -->
+    <item android:drawable="@color/item_normal" />
+
+</selector>
diff --git a/tensorflow/lite/java/demo/app/src/main/res/layout-land/fragment_camera2_basic.xml b/tensorflow/lite/java/demo/app/src/main/res/layout-land/fragment_camera2_basic.xml
index ef8a9e08450d72e392815756606f5ef8301cdd58..ee71ab808f4810ac092b37b0d996331072f44652 100644
--- a/tensorflow/lite/java/demo/app/src/main/res/layout-land/fragment_camera2_basic.xml
+++ b/tensorflow/lite/java/demo/app/src/main/res/layout-land/fragment_camera2_basic.xml
@@ -40,12 +40,27 @@
         android:scaleType="centerInside"
         android:src="@drawable/logo"/>
 
-    <ToggleButton
-        android:id="@+id/button"
+    <RadioGroup
+        android:gravity="center"
         android:layout_width="match_parent"
-        android:layout_height="wrap_content"
-        android:textOff="@string/tflite"
-        android:textOn="@string/nnapi"/>
+        android:layout_height="match_parent"
+        android:orientation="horizontal">
+        <RadioButton
+            android:id="@+id/radio_cpu"
+            android:background="#0000000f"
+            android:layout_width="wrap_content"
+            android:layout_height="wrap_content"
+            android:text="@string/cpu"
+            android:textColor="@android:color/white" />
+        <RadioButton
+            android:id="@+id/radio_nnapi"
+            android:background="#0000000f"
+            android:layout_width="wrap_content"
+            android:layout_height="wrap_content"
+            android:text="@string/nnapi"
+            android:textColor="@android:color/white" />
+        </RadioGroup>
+
     <NumberPicker
         android:id="@+id/np"
         android:layout_width="wrap_content"
diff --git a/tensorflow/lite/java/demo/app/src/main/res/layout-v26/fragment_camera2_basic.xml b/tensorflow/lite/java/demo/app/src/main/res/layout-v26/fragment_camera2_basic.xml
index ddb099a950c2f83d7b2867f8f35d96885229536d..70eedfdd02ad3ac03f6d413c0d5e2357a320751f 100644
--- a/tensorflow/lite/java/demo/app/src/main/res/layout-v26/fragment_camera2_basic.xml
+++ b/tensorflow/lite/java/demo/app/src/main/res/layout-v26/fragment_camera2_basic.xml
@@ -57,38 +57,83 @@
             android:textStyle="bold" />
 
     </LinearLayout>
-    <LinearLayout
-        android:orientation="horizontal"
-        android:background="#513400"
-        android:layout_alignParentBottom="true"
 
-        android:layout_width="match_parent"
+    <LinearLayout
         android:id="@+id/bottom_info_view"
+        android:layout_width="match_parent"
+        android:layout_height="200dp"
+
+        android:layout_alignParentBottom="true"
         android:layout_marginBottom="10dp"
-        android:layout_height="50dp">
-        <TextView
+        android:background="#513400"
+        android:orientation="horizontal">
+
+        <LinearLayout
             android:layout_width="wrap_content"
             android:layout_height="match_parent"
-            android:textColor="@android:color/white"
-            android:textAlignment="center"
-            android:gravity="center"
-            android:text="Threads:"/>
-        <NumberPicker
-            android:id="@+id/np"
-            android:layout_width="wrap_content"
-            android:layout_height="wrap_content"
-            android:layout_marginLeft="10dp"
-            android:theme="@style/AppTheme.Picker"
-            android:visibility="visible" />
-        <ToggleButton
-            android:id="@+id/button"
-            android:textOff="@string/tflite"
-            android:textOn="@string/nnapi"
-            android:layout_width="wrap_content"
-            android:layout_height="wrap_content"
-            android:layout_marginLeft="10dp"
-            android:background="#0000000f"
-            android:textColor="@android:color/white" />
+            android:orientation="vertical">
+
+            <TextView
+                android:layout_width="wrap_content"
+                android:layout_height="wrap_content"
+                android:gravity="center"
+                android:text="Threads"
+                android:textAlignment="center"
+                android:textColor="@android:color/white" />
+
+            <NumberPicker
+                android:id="@+id/np"
+                android:layout_width="wrap_content"
+                android:layout_height="wrap_content"
+                android:layout_marginLeft="10dp"
+                android:theme="@style/AppTheme.Picker"
+                android:visibility="visible" />
+
+        </LinearLayout>
+
+        <LinearLayout
+            android:id="@+id/modelLayout"
+            android:layout_width="150dp"
+            android:layout_height="match_parent"
+            android:orientation="vertical">
+
+            <TextView
+                android:id="@+id/textView"
+                android:layout_width="match_parent"
+                android:layout_height="20dp"
+                android:text="@string/modelLabel"
+                android:textAlignment="center"
+                android:textColor="@android:color/white" />
+
+            <ListView
+                android:id="@+id/model"
+                android:layout_width="match_parent"
+                android:layout_height="180dp">
+
+            </ListView>
+        </LinearLayout>
+
+        <LinearLayout
+            android:id="@+id/deviceLayout"
+            android:layout_width="140dp"
+            android:layout_height="match_parent"
+            android:orientation="vertical">
+
+            <TextView
+                android:id="@+id/textView2"
+                android:layout_width="match_parent"
+                android:layout_height="20dp"
+                android:text="@string/deviceLabel"
+                android:textAlignment="center"
+                android:textColor="@android:color/white" />
+
+            <ListView
+                android:id="@+id/device"
+                android:layout_width="match_parent"
+                android:layout_height="180dp" />
+
+        </LinearLayout>
+
     </LinearLayout>
 
 
diff --git a/tensorflow/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml b/tensorflow/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml
index e567009a424ed77384bee193c47d4f4d253f5767..f8312cc0f7567a5298e5b0a851f011e4d0d6c0bb 100644
--- a/tensorflow/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml
+++ b/tensorflow/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml
@@ -57,22 +57,30 @@
             android:textStyle="bold" />
 
     </LinearLayout>
-    <LinearLayout
-        android:orientation="horizontal"
-        android:background="#aa7700"
-        android:layout_alignParentBottom="true"
 
-        android:layout_width="match_parent"
+    <LinearLayout
         android:id="@+id/bottom_info_view"
+        android:layout_width="match_parent"
+        android:layout_height="200dp"
+
+        android:layout_alignParentBottom="true"
         android:layout_marginBottom="10dp"
-        android:layout_height="50dp">
-        <TextView
+        android:background="#513400"
+        android:orientation="horizontal">
+
+      <LinearLayout
             android:layout_width="wrap_content"
             android:layout_height="match_parent"
-            android:textColor="@android:color/white"
-            android:textAlignment="center"
+            android:orientation="vertical">
+
+        <TextView
+            android:layout_width="wrap_content"
+                android:layout_height="wrap_content"
             android:gravity="center"
-            android:text="@string/threads" />
+                android:text="Threads"
+            android:textAlignment="center"
+            android:textColor="@android:color/white" />
+
         <NumberPicker
             android:id="@+id/np"
             android:layout_width="wrap_content"
@@ -80,15 +88,51 @@
             android:layout_marginLeft="10dp"
             android:theme="@style/AppTheme.Picker"
             android:visibility="visible" />
-        <ToggleButton
-            android:id="@+id/button"
-            android:textOff="@string/tflite"
-            android:textOn="@string/nnapi"
-            android:layout_width="wrap_content"
-            android:layout_height="wrap_content"
-            android:layout_marginLeft="10dp"
-            android:background="#0000000f"
-            android:textColor="@android:color/white" />
+
+        </LinearLayout>
+
+        <LinearLayout
+            android:id="@+id/modelLayout"
+            android:layout_width="150dp"
+            android:layout_height="match_parent"
+            android:orientation="vertical">
+
+            <TextView
+                android:id="@+id/textView"
+                android:layout_width="match_parent"
+                android:layout_height="20dp"
+                android:text="@string/modelLabel"
+                android:textAlignment="center"
+                android:textColor="@android:color/white" />
+
+            <ListView
+                android:id="@+id/model"
+                android:layout_width="match_parent"
+                android:layout_height="180dp">
+
+            </ListView>
+        </LinearLayout>
+
+        <LinearLayout
+            android:id="@+id/deviceLayout"
+            android:layout_width="140dp"
+            android:layout_height="match_parent"
+            android:orientation="vertical">
+
+            <TextView
+                android:id="@+id/textView2"
+                android:layout_width="match_parent"
+                android:layout_height="20dp"
+                android:text="@string/deviceLabel"
+                android:textAlignment="center"
+                android:textColor="@android:color/white" />
+
+            <ListView
+                android:id="@+id/device"
+                android:layout_width="match_parent"
+                android:layout_height="180dp" />
+
+        </LinearLayout>
 
     </LinearLayout>
 </RelativeLayout>
diff --git a/tensorflow/lite/java/demo/app/src/main/res/layout/listview_row.xml b/tensorflow/lite/java/demo/app/src/main/res/layout/listview_row.xml
new file mode 100644
index 0000000000000000000000000000000000000000..349b0f63b4dbae11d21dbb0a58c3cda47299cbf0
--- /dev/null
+++ b/tensorflow/lite/java/demo/app/src/main/res/layout/listview_row.xml
@@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="utf-8"?>
+<LinearLayout xmlns:android="http://schemas.android.com/apk/res/android"
+    android:layout_width="match_parent"
+    android:layout_height="match_parent">
+
+
+    <TextView
+        android:id="@+id/listview_row_text"
+        android:layout_width="match_parent"
+        android:layout_height="match_parent"
+        android:layout_marginRight="2dp"
+        android:background="@drawable/item_selector"
+        android:padding="10dp"
+        android:textSize="18sp"
+        android:textStyle="bold" />
+
+</LinearLayout>
\ No newline at end of file
diff --git a/tensorflow/lite/java/demo/app/src/main/res/values/colors.xml b/tensorflow/lite/java/demo/app/src/main/res/values/colors.xml
index 4b75d2b2bda0f95166d0442ebae19cedcad162d8..c30f1dc3ac79a7ef33908a625710f7ac96bfc858 100644
--- a/tensorflow/lite/java/demo/app/src/main/res/values/colors.xml
+++ b/tensorflow/lite/java/demo/app/src/main/res/values/colors.xml
@@ -16,4 +16,7 @@
 -->
 <resources>
     <color name="control_background">#cc4285f4</color>
+    <color name="selection_highlight">#aaaaaa</color>
+    <color name="selection_focus">#eeaa55</color>
+    <color name="item_normal">#eeeeee</color>
 </resources>
diff --git a/tensorflow/lite/java/demo/app/src/main/res/values/strings.xml b/tensorflow/lite/java/demo/app/src/main/res/values/strings.xml
index 29a033bcd437c951ef6e8ba78f4fc3a0fcafac96..8cc88f25652256665acbab2855c60ee1a10293c4 100644
--- a/tensorflow/lite/java/demo/app/src/main/res/values/strings.xml
+++ b/tensorflow/lite/java/demo/app/src/main/res/values/strings.xml
@@ -23,4 +23,11 @@
     <string name="toggle">Use NNAPI</string>
     <string name="tflite">tflite</string>
     <string name="nnapi">NNAPI</string>
+    <string name="gpu">GPU</string>
+    <string name="cpu">CPU</string>
+    <string name="modelLabel">Model</string>
+    <string name="deviceLabel">Device</string>
+    <string name="mobilenetV1Quant">mobilenet v1 quant</string>;
+    <string name="mobilenetV1Float">mobilenet v1 float</string>;;
+
 </resources>
diff --git a/tensorflow/lite/java/jni/BUILD b/tensorflow/lite/java/jni/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..ce17ac4fa0d37cb0b790617c4258ea469d14a664
--- /dev/null
+++ b/tensorflow/lite/java/jni/BUILD
@@ -0,0 +1,47 @@
+package(default_visibility = ["//tensorflow/lite:__subpackages__"])
+
+licenses(["notice"])  # Apache 2.0
+
+# Helper target for exposing JNI headers across multiple platforms.
+cc_library(
+    name = "jni",
+    hdrs = select({
+        # The Android toolchain makes "jni.h" available in the include path.
+        # For non-Android toolchains, generate jni.h and jni_md.h.
+        "//tensorflow:android": [],
+        "//conditions:default": [
+            ":jni.h",
+            ":jni_md.h",
+        ],
+    }),
+    includes = select({
+        "//tensorflow:android": [],
+        "//conditions:default": ["."],
+    }),
+)
+
+# Silly rules to make
+# #include <jni.h>
+# in the source headers work
+# (in combination with the "includes" attribute of the tf_cuda_library rule
+# above. Not needed when using the Android toolchain).
+#
+# Inspired from:
+# https://github.com/bazelbuild/bazel/blob/f99a0543f8d97339d32075c7176b79f35be84606/src/main/native/BUILD
+# but hopefully there is a simpler alternative to this.
+genrule(
+    name = "copy_jni_h",
+    srcs = ["@bazel_tools//tools/jdk:jni_header"],
+    outs = ["jni.h"],
+    cmd = "cp -f $< $@",
+)
+
+genrule(
+    name = "copy_jni_md_h",
+    srcs = select({
+        "//tensorflow:darwin": ["@bazel_tools//tools/jdk:jni_md_header-darwin"],
+        "//conditions:default": ["@bazel_tools//tools/jdk:jni_md_header-linux"],
+    }),
+    outs = ["jni_md.h"],
+    cmd = "cp -f $< $@",
+)
diff --git a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
index a03d7b567637e306f55b2e161cef162def3550c6..2203d5fbdb260aaf2bf826343343426a5015e889 100644
--- a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
+++ b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
@@ -43,15 +43,34 @@ import org.checkerframework.checker.nullness.qual.NonNull;
  * <pre>{@code
  * Object[] inputs = {input0, input1, ...};
  * Map<Integer, Object> map_of_indices_to_outputs = new HashMap<>();
- * float[][][] ith_output = new float[3][2][4];
+ * ByteBuffer ith_output = ByteBuffer.allocateDirect(3 * 2 * 4 * 4);  // Float tensor, shape 3x2x4.
+ * ith_output.order(ByteOrder.nativeOrder());
  * map_of_indices_to_outputs.put(i, ith_output);
  * try (Interpreter interpreter = new Interpreter(file_of_a_tensorflowlite_model)) {
  *   interpreter.runForMultipleInputsOutputs(inputs, map_of_indices_to_outputs);
  * }
  * }</pre>
  *
+ * <p>If a model takes or produces string tensors:
+ *
+ * <pre>{@code
+ * String[] input = {"foo", "bar"};  // Input tensor shape is [2].
+ * String[] output = new String[3][2];  // Output tensor shape is [3, 2].
+ * try (Interpreter interpreter = new Interpreter(file_of_a_tensorflowlite_model)) {
+ *   interpreter.runForMultipleInputsOutputs(input, output);
+ * }
+ * }</pre>
+ *
  * <p>Orders of inputs and outputs are determined when converting TensorFlow model to TensorFlowLite
- * model with Toco.
+ * model with Toco, as are the default shapes of the inputs.
+ *
+ * <p>When inputs are provided as (multi-dimensional) arrays, the corresponding input tensor(s) will
+ * be implicitly resized according to that array's shape. When inputs are provided as {@link
+ * ByteBuffer} types, no implicit resizing is done; the caller must ensure that the {@link
+ * ByteBuffer} byte size either matches that of the corresponding tensor, or that they first resize
+ * the tensor via {@link #resizeInput()}. Tensor shape and type information can be obtained via the
+ * {@link Tensor} class, available via {@link #getInputTensor(int)} and {@link
+ * #getOutputTensor(int)}.
  *
  * <p><b>WARNING:</b>Instances of a {@code Interpreter} is <b>not</b> thread-safe. A {@code
  * Interpreter} owns resources that <b>must</b> be explicitly freed by invoking {@link #close()}
@@ -192,12 +211,13 @@ public final class Interpreter implements AutoCloseable {
    * Runs model inference if the model takes only one input, and provides only one output.
    *
    * <p>Warning: The API runs much faster if {@link ByteBuffer} is used as input data type. Please
-   * consider using {@link ByteBuffer} to feed input data for better performance.
+   * consider using {@link ByteBuffer} to feed primitive input data for better performance.
    *
    * @param input an array or multidimensional array, or a {@link ByteBuffer} of primitive types
    *     including int, float, long, and byte. {@link ByteBuffer} is the preferred way to pass large
-   *     input data. When {@link ByteBuffer} is used, its content should remain unchanged until
-   *     model inference is done.
+   *     input data for primitive types, whereas string types require using the (multi-dimensional)
+   *     array input path. When {@link ByteBuffer} is used, its content should remain unchanged
+   *     until model inference is done.
    * @param output a multidimensional array of output data, or a {@link ByteBuffer} of primitive
    *     types including int, float, long, and byte.
    */
@@ -212,13 +232,14 @@ public final class Interpreter implements AutoCloseable {
    * Runs model inference if the model takes multiple inputs, or returns multiple outputs.
    *
    * <p>Warning: The API runs much faster if {@link ByteBuffer} is used as input data type. Please
-   * consider using {@link ByteBuffer} to feed input data for better performance.
+   * consider using {@link ByteBuffer} to feed primitive input data for better performance.
    *
    * @param inputs an array of input data. The inputs should be in the same order as inputs of the
    *     model. Each input can be an array or multidimensional array, or a {@link ByteBuffer} of
    *     primitive types including int, float, long, and byte. {@link ByteBuffer} is the preferred
-   *     way to pass large input data. When {@link ByteBuffer} is used, its content should remain
-   *     unchanged until model inference is done.
+   *     way to pass large input data, whereas string types require using the (multi-dimensional)
+   *     array input path. When {@link ByteBuffer} is used, its content should remain unchanged
+   *     until model inference is done.
    * @param outputs a map mapping output indices to multidimensional arrays of output data or {@link
    *     ByteBuffer}s of primitive types including int, float, long, and byte. It only needs to keep
    *     entries for the outputs to be used.
diff --git a/tensorflow/lite/java/src/main/native/BUILD b/tensorflow/lite/java/src/main/native/BUILD
index 8f95f14518af5b4b4d07afef19c9cfb7270af0eb..52194e86db32a259ca1fe640ca72d42010ba1a44 100644
--- a/tensorflow/lite/java/src/main/native/BUILD
+++ b/tensorflow/lite/java/src/main/native/BUILD
@@ -15,15 +15,7 @@ cc_library(
         "nativeinterpreterwrapper_jni.cc",
         "tensor_jni.cc",
         "tensorflow_lite_jni.cc",
-    ] + select({
-        # The Android toolchain makes "jni.h" available in the include path.
-        # For non-Android toolchains, generate jni.h and jni_md.h.
-        "//tensorflow:android": [],
-        "//conditions:default": [
-            ":jni.h",
-            ":jni_md.h",
-        ],
-    }),
+    ],
     hdrs = [
         "exception_jni.h",
         "nativeinterpreterwrapper_jni.h",
@@ -31,75 +23,31 @@ cc_library(
         "tensorflow_lite_jni.h",
     ],
     copts = tflite_copts(),
-    includes = select({
-        "//tensorflow:android": [],
-        "//conditions:default": ["."],
-    }),
     linkopts = [
         "-lm",
         "-ldl",
     ],
     deps = [
-        "//tensorflow/lite:context",
         "//tensorflow/lite:framework",
         "//tensorflow/lite:schema_fbs_version",
         "//tensorflow/lite:string_util",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/java/jni",
     ],
     alwayslink = 1,
 )
 
-# Silly rules to make
-# #include <jni.h>
-# in the source headers work
-# (in combination with the "includes" attribute of the tf_cuda_library rule
-# above. Not needed when using the Android toolchain).
-#
-# Inspired from:
-# https://github.com/bazelbuild/bazel/blob/f99a0543f8d97339d32075c7176b79f35be84606/src/main/native/BUILD
-# but hopefully there is a simpler alternative to this.
-genrule(
-    name = "copy_jni_h",
-    srcs = ["@bazel_tools//tools/jdk:jni_header"],
-    outs = ["jni.h"],
-    cmd = "cp -f $< $@",
-)
-
-genrule(
-    name = "copy_jni_md_h",
-    srcs = select({
-        "//tensorflow:darwin": ["@bazel_tools//tools/jdk:jni_md_header-darwin"],
-        "//conditions:default": ["@bazel_tools//tools/jdk:jni_md_header-linux"],
-    }),
-    outs = ["jni_md.h"],
-    cmd = "cp -f $< $@",
-)
-
 cc_library(
     name = "init_tensorflow",
     srcs = [
         "init_tensorflow_jni.cc",
-    ] + select({
-        # The Android toolchain makes "jni.h" available in the include path.
-        # For non-Android toolchains, generate jni.h and jni_md.h.
-        "//tensorflow:android": [],
-        "//conditions:default": [
-            ":jni.h",
-            ":jni_md.h",
-        ],
-    }),
+    ],
     hdrs = [
         "init_tensorflow_jni.h",
     ],
     copts = tflite_copts(),
-    includes = select({
-        "//tensorflow:android": [],
-        "//conditions:default": ["."],
-    }),
-    linkopts = [
-        "-lm",
-        "-ldl",
-    ],
     deps = [
+        "//tensorflow/lite/java/jni",
         "//tensorflow/lite/testing:init_tensorflow",
     ],
     alwayslink = 1,
diff --git a/tensorflow/lite/java/src/main/native/tensor_jni.cc b/tensorflow/lite/java/src/main/native/tensor_jni.cc
index 035bec6a440d3ae22ef25f3e48ecbaff5bf3b8ee..82d2679de9c868694668bca23ce6c8a6fb55dbe8 100644
--- a/tensorflow/lite/java/src/main/native/tensor_jni.cc
+++ b/tensorflow/lite/java/src/main/native/tensor_jni.cc
@@ -278,7 +278,7 @@ void WriteMultiDimensionalStringArray(JNIEnv* env, jobject src,
   tflite::DynamicBuffer dst_buffer;
   PopulateStringDynamicBuffer(env, src, &dst_buffer, tensor->dims->size);
   if (!env->ExceptionCheck()) {
-    dst_buffer.WriteToTensor(tensor);
+    dst_buffer.WriteToTensor(tensor, /*new_shape=*/nullptr);
   }
 }
 
diff --git a/tensorflow/lite/java/src/test/native/BUILD b/tensorflow/lite/java/src/test/native/BUILD
index 4d3e82b1ac14990be13aaba1d917e26dcc00b961..481aea7ecd5dd8f9c26307e3b00992e21e6c2501 100644
--- a/tensorflow/lite/java/src/test/native/BUILD
+++ b/tensorflow/lite/java/src/test/native/BUILD
@@ -12,20 +12,11 @@ cc_library(
     testonly = 1,
     srcs = [
         "interpreter_test_jni.cc",
-    ] + select({
-        # The Android toolchain makes "jni.h" available in the include path.
-        # For non-Android toolchains, generate jni.h and jni_md.h.
-        "//tensorflow:android": [],
-        "//conditions:default": [
-            "//tensorflow/lite/java/src/main/native:jni.h",
-            "//tensorflow/lite/java/src/main/native:jni_md.h",
-        ],
-    }),
-    includes = select({
-        "//tensorflow:android": [],
-        "//conditions:default": ["../../main/native/."],
-    }),
-    deps = ["//tensorflow/lite/c:c_api_internal"],
+    ],
+    deps = [
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/java/jni",
+    ],
 )
 
 tflite_jni_binary(
diff --git a/tensorflow/lite/kernels/BUILD b/tensorflow/lite/kernels/BUILD
index c1b005b5809f0e046714c154017a0f65db3743b4..bad1c4aebf1e9d9c7c6d33f87a6e7ea9cab8d700 100644
--- a/tensorflow/lite/kernels/BUILD
+++ b/tensorflow/lite/kernels/BUILD
@@ -26,7 +26,6 @@ tf_cc_test(
     size = "small",
     srcs = ["optional_tensor_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -124,7 +123,6 @@ tf_cc_test(
     size = "small",
     srcs = ["kernel_util_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -139,7 +137,6 @@ tf_cc_test(
     size = "small",
     srcs = ["test_util_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",  # TODO(b/117786830)
     ],
     deps = [
@@ -183,6 +180,7 @@ cc_library(
         "exp.cc",
         "expand_dims.cc",
         "fake_quant.cc",
+        "fill.cc",
         "floor.cc",
         "floor_div.cc",
         "floor_mod.cc",
@@ -197,6 +195,7 @@ cc_library(
         "lstm.cc",
         "maximum_minimum.cc",
         "mfcc.cc",
+        "mirror_pad.cc",
         "mul.cc",
         "neg.cc",
         "one_hot.cc",
@@ -291,7 +290,6 @@ tf_cc_test(
     size = "small",
     srcs = ["audio_spectrogram_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -308,7 +306,6 @@ tf_cc_test(
     size = "small",
     srcs = ["mfcc_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -325,7 +322,6 @@ tf_cc_test(
     size = "small",
     srcs = ["detection_postprocess_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -342,7 +338,6 @@ tf_cc_test(
     size = "small",
     srcs = ["relu1_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -359,7 +354,6 @@ tf_cc_test(
     size = "small",
     srcs = ["sparse_output_fully_connected_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -402,7 +396,6 @@ tf_cc_test(
     size = "small",
     srcs = ["arg_min_max_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -418,7 +411,6 @@ tf_cc_test(
     size = "small",
     srcs = ["div_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -434,7 +426,6 @@ tf_cc_test(
     size = "small",
     srcs = ["sub_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -450,7 +441,6 @@ tf_cc_test(
     size = "small",
     srcs = ["transpose_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -468,7 +458,6 @@ tf_cc_test(
     size = "small",
     srcs = ["space_to_batch_nd_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -484,7 +473,6 @@ tf_cc_test(
     size = "small",
     srcs = ["batch_to_space_nd_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -500,7 +488,6 @@ tf_cc_test(
     size = "small",
     srcs = ["cast_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -557,7 +544,6 @@ tf_cc_test(
     size = "small",
     srcs = ["dequantize_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -587,7 +573,6 @@ tf_cc_test(
     size = "small",
     srcs = ["bidirectional_sequence_lstm_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -604,7 +589,6 @@ tf_cc_test(
     size = "small",
     srcs = ["floor_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -620,7 +604,6 @@ tf_cc_test(
     size = "small",
     srcs = ["elementwise_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -636,7 +619,6 @@ tf_cc_test(
     size = "small",
     srcs = ["unidirectional_sequence_lstm_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -652,7 +634,6 @@ tf_cc_test(
     size = "small",
     srcs = ["bidirectional_sequence_rnn_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable",
     ],
     deps = [
@@ -668,7 +649,6 @@ tf_cc_test(
     size = "small",
     srcs = ["unidirectional_sequence_rnn_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -697,7 +677,6 @@ tf_cc_test(
     size = "small",
     srcs = ["exp_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -713,7 +692,6 @@ tf_cc_test(
     size = "small",
     srcs = ["fake_quant_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -729,7 +707,6 @@ tf_cc_test(
     size = "small",
     srcs = ["maximum_minimum_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -745,7 +722,6 @@ tf_cc_test(
     size = "small",
     srcs = ["reduce_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -774,7 +750,6 @@ tf_cc_test(
     size = "small",
     srcs = ["pad_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -803,7 +778,6 @@ tf_cc_test(
     size = "small",
     srcs = ["gather_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -820,7 +794,6 @@ tf_cc_test(
     size = "small",
     srcs = ["topk_v2_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -957,7 +930,6 @@ tf_cc_test(
     size = "small",
     srcs = ["log_softmax_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -1055,7 +1027,6 @@ tf_cc_test(
     size = "small",
     srcs = ["split_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -1071,7 +1042,6 @@ tf_cc_test(
     size = "small",
     srcs = ["split_v_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -1087,7 +1057,6 @@ tf_cc_test(
     size = "small",
     srcs = ["squeeze_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -1103,7 +1072,6 @@ tf_cc_test(
     size = "small",
     srcs = ["strided_slice_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -1119,7 +1087,6 @@ tf_cc_test(
     size = "small",
     srcs = ["tile_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -1138,7 +1105,6 @@ tf_cc_test(
         "comparisons_test.cc",
     ],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -1154,7 +1120,6 @@ tf_cc_test(
     size = "small",
     srcs = ["neg_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -1172,7 +1137,6 @@ tf_cc_test(
         "select_test.cc",
     ],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -1190,7 +1154,6 @@ tf_cc_test(
         "slice_test.cc",
     ],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -1206,7 +1169,6 @@ tf_cc_test(
     size = "small",
     srcs = ["transpose_conv_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -1223,7 +1185,6 @@ tf_cc_test(
     size = "small",
     srcs = ["expand_dims_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -1240,7 +1201,6 @@ tf_cc_test(
     size = "small",
     srcs = ["sparse_to_dense_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -1257,7 +1217,6 @@ tf_cc_test(
     size = "small",
     srcs = ["shape_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -1274,7 +1233,6 @@ tf_cc_test(
     size = "small",
     srcs = ["pow_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -1410,6 +1368,19 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "fill_test",
+    size = "small",
+    srcs = ["fill_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
@@ -1423,3 +1394,14 @@ filegroup(
 )
 
 tflite_portable_test_suite()
+
+tf_cc_test(
+    name = "mirror_pad_test",
+    srcs = ["mirror_pad_test.cc"],
+    deps = [
+        ":builtin_ops",
+        ":test_util",
+        "//tensorflow/lite:framework",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
diff --git a/tensorflow/lite/kernels/activations.cc b/tensorflow/lite/kernels/activations.cc
index 82072bccb243b240cecbb5e9377e18c18e18d782..ab09cf7196a951ded20f22e404570254be6ed233 100644
--- a/tensorflow/lite/kernels/activations.cc
+++ b/tensorflow/lite/kernels/activations.cc
@@ -345,9 +345,24 @@ TfLiteStatus Relu6Eval(TfLiteContext* context, TfLiteNode* node) {
       for (; in < in_end; in++, out++) *out = std::min(std::max(0.f, *in), 6.f);
       return kTfLiteOk;
     } break;
+    case kTfLiteUInt8: {
+      ActivationParams params;
+      params.activation_type = FusedActivationFunctionType::kRelu6;
+      params.quantized_activation_min = std::max(
+          0, output->params.zero_point +
+                 static_cast<int32>(roundf(0.f / output->params.scale)));
+      params.quantized_activation_max = std::min(
+          255, output->params.zero_point +
+                   static_cast<int32>(roundf(6.f / output->params.scale)));
+      optimized_ops::ReluX(params, GetTensorShape(input),
+                           GetTensorData<uint8>(input), GetTensorShape(output),
+                           GetTensorData<uint8>(output));
+      return kTfLiteOk;
+    } break;
     default:
-      context->ReportError(context, "Only float32 supported currently, got %s.",
-                           TfLiteTypeGetName(input->type));
+      context->ReportError(
+          context, "Only float32 and uint8 supported currently, got %s.",
+          TfLiteTypeGetName(input->type));
       return kTfLiteError;
   }
 }
@@ -358,11 +373,8 @@ TfLiteStatus TanhEval(TfLiteContext* context, TfLiteNode* node) {
   TfLiteTensor* output = GetOutput(context, node, 0);
   switch (input->type) {
     case kTfLiteFloat32: {
-      size_t elements = input->bytes / sizeof(float);
-      float* in = input->data.f;
-      float* in_end = in + elements;
-      float* out = output->data.f;
-      for (; in < in_end; in++, out++) *out = std::tanh(*in);
+      optimized_ops::Tanh(GetTensorShape(input), GetTensorData<float>(input),
+                          GetTensorShape(output), GetTensorData<float>(output));
       return kTfLiteOk;
     } break;
     case kTfLiteInt16: {
diff --git a/tensorflow/lite/kernels/activations_test.cc b/tensorflow/lite/kernels/activations_test.cc
index 1de3dbc44f89065489063676dc07cf2fe530c30e..67f137baff29808d7a03571e1880901e44c34712 100644
--- a/tensorflow/lite/kernels/activations_test.cc
+++ b/tensorflow/lite/kernels/activations_test.cc
@@ -170,6 +170,29 @@ TEST(FloatActivationsOpTest, Tanh) {
                              })));
 }
 
+TEST(QuantizedActivationsOpTest, Relu6) {
+  const float kMin = -1;
+  const float kMax = 127.f / 128.f;
+  QuantizedActivationsOpModel m(
+      BuiltinOperator_RELU6,
+      /*input=*/{TensorType_UINT8, {1, 2, 4, 1}, 8 * kMin, 8 * kMax},
+      /*output=*/{TensorType_UINT8, {1, 2, 4, 1}, 8 * kMin, 8 * kMax});
+  m.SetInput<uint8_t>({
+      0, -6, 2, 4,   //
+      3, -2, 10, 1,  //
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      0, 0, 2, 4,  //
+                      3, 0, 6, 1,  //
+                  },
+                  kQuantizedTolerance)));
+  EXPECT_THAT(m.GetOutput<uint8_t>(),
+              ElementsAreArray({128, 128, 160, 192, 176, 128, 224, 144}));
+}
+
 TEST(QuantizedActivationsOpTest, Tanh) {
   const float kMin = -1;
   const float kMax = 127.f / 128.f;
diff --git a/tensorflow/lite/kernels/add.cc b/tensorflow/lite/kernels/add.cc
index 32a7c100ce53101063d81345bcb052e680e64a28..9867cc53b342d9fddda81db270c223de2ecda14f 100644
--- a/tensorflow/lite/kernels/add.cc
+++ b/tensorflow/lite/kernels/add.cc
@@ -247,7 +247,10 @@ TfLiteStatus EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
         TF_LITE_ADD(reference_ops, Add);
       }
     } else {
-      if (need_broadcast) {
+      if (op_params.broadcast_category ==
+          BroadcastableOpCategory::kGenericBroadcast) {
+        TF_LITE_ADD(optimized_ops, BroadcastAdd4DSlow);
+      } else if (need_broadcast) {
         TF_LITE_ADD(optimized_ops, BroadcastAddFivefold);
       } else {
         TF_LITE_ADD(optimized_ops, Add);
diff --git a/tensorflow/lite/kernels/add_test.cc b/tensorflow/lite/kernels/add_test.cc
index 1d33adf1999ecde581badf041276ec15b4370689..16045d457238e482bd7aad1077d0344632a7550b 100644
--- a/tensorflow/lite/kernels/add_test.cc
+++ b/tensorflow/lite/kernels/add_test.cc
@@ -279,21 +279,92 @@ TEST(QuantizedAddOpModel, QuantizedVariousInputShapes) {
   }
 }
 
-TEST(QuantizedAddOpModel, QuantizedWithBroadcast) {
-  float kQuantizedTolerance = GetTolerance(-3.0, 3.0);
+TEST(QuantizedAddOpModel, QuantizedWithScalarBroadcast) {
+  float kQuantizedTolerance = GetTolerance(-3.f, 3.f);
   std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
-    QuantizedAddOpModel m({TensorType_UINT8, test_shapes[i], -3.0, 3.0},
-                          {TensorType_UINT8, {}, -3.0, 3.0},
-                          {TensorType_UINT8, {}, -3.0, 3.0},
-                          ActivationFunctionType_NONE);
-    m.QuantizeAndPopulate<uint8_t>(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
-    m.QuantizeAndPopulate<uint8_t>(m.input2(), {0.1});
-    m.Invoke();
-    EXPECT_THAT(m.GetDequantizedOutput(),
-                ElementsAreArray(ArrayFloatNear({-1.9, 0.3, 0.8, 0.9, 1.2, 2.1},
-                                                kQuantizedTolerance)))
+    QuantizedAddOpModel model_fixture(
+        {TensorType_UINT8, test_shapes[i], -3.f, 3.f},
+        {TensorType_UINT8, {}, -3.f, 3.f}, {TensorType_UINT8, {}, -3.f, 3.f},
+        ActivationFunctionType_NONE);
+    model_fixture.QuantizeAndPopulate<uint8_t>(
+        model_fixture.input1(), {-2.0f, 0.2f, 0.7f, 0.8f, 1.1f, 2.0f});
+    model_fixture.QuantizeAndPopulate<uint8_t>(model_fixture.input2(), {0.1f});
+    model_fixture.Invoke();
+    EXPECT_THAT(
+        model_fixture.GetDequantizedOutput(),
+        ElementsAreArray(ArrayFloatNear({-1.9f, 0.3f, 0.8f, 0.9f, 1.2f, 2.1f},
+                                        kQuantizedTolerance)))
+        << "With shape number " << i;
+  }
+  // Re-run with exchanged inputs.
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    QuantizedAddOpModel model_fixture(
+        {TensorType_UINT8, {}, -3.f, 3.f},
+        {TensorType_UINT8, test_shapes[i], -3.f, 3.f},
+        {TensorType_UINT8, {}, -3.f, 3.f}, ActivationFunctionType_NONE);
+    model_fixture.QuantizeAndPopulate<uint8_t>(model_fixture.input1(), {0.1f});
+    model_fixture.QuantizeAndPopulate<uint8_t>(
+        model_fixture.input2(), {-2.0f, 0.2f, 0.7f, 0.8f, 1.1f, 2.0f});
+    model_fixture.Invoke();
+    EXPECT_THAT(
+        model_fixture.GetDequantizedOutput(),
+        ElementsAreArray(ArrayFloatNear({-1.9f, 0.3f, 0.8f, 0.9f, 1.2f, 2.1f},
+                                        kQuantizedTolerance)))
+        << "With shape number " << i;
+  }
+}
+
+TEST(QuantizedAddOpModel, QuantizedWithMixedBroadcast) {
+  float kQuantizedTolerance = GetTolerance(-3.f, 3.f);
+  const std::vector<int> base_shape = {2, 3, 1, 2};
+  std::vector<std::vector<int>> test_shapes = {
+      {1, 1, 3, 2}, {1, 3, 1, 2}, {2, 1, 3, 1}, {2, 3, 1, 1}};
+  std::vector<std::vector<float>> test_outputs = {
+      {-0.1f, 2.6f,  -0.7f, 2.8f, 0.7f,  3.0f, 1.1f,  0.8f, 0.5f,
+       1.0f,  1.9f,  1.4f,  1.0f, -0.8f, 0.4f, -0.6f, 1.8f, -0.2f,
+       1.4f,  3.0f,  0.8f,  3.0f, 2.2f,  3.0f, -1.4f, 0.3f, -2.0f,
+       0.5f,  -0.6f, 0.9f,  0.9f, -1.9f, 0.3f, -1.7f, 1.7f, -1.3f},
+      {-0.1f, 2.6f, 0.5f, 1.0f, 1.8f, -0.2f, 1.4f, 3.0f, -2.0f, 0.5f, 1.7f,
+       -1.3f},
+      {-0.1f, 2.5f,  0.0f, 2.6f, -0.7f, 1.9f, 1.1f,  0.7f, 1.2f,
+       0.8f,  0.5f,  0.1f, 1.0f, -0.9f, 1.1f, -0.8f, 0.4f, -1.5f,
+       1.7f,  3.0f,  2.2f, 3.0f, 2.1f,  3.0f, -1.1f, 0.5f, -0.6f,
+       1.0f,  -0.7f, 0.9f, 1.2f, -1.7f, 1.7f, -1.2f, 1.6f, -1.3f},
+      {-0.1f, 2.5f, 1.2f, 0.8f, 0.4f, -1.5f, 1.7f, 3.0f, -0.6f, 1.0f, 1.6f,
+       -1.3f}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    QuantizedAddOpModel model_fixture(
+        {TensorType_UINT8, base_shape, -3.f, 3.f},
+        {TensorType_UINT8, test_shapes[i], -3.f, 3.f},
+        {TensorType_UINT8, {}, -3.f, 3.f}, ActivationFunctionType_NONE);
+    model_fixture.QuantizeAndPopulate<uint8_t>(
+        model_fixture.input1(), {-0.3f, 2.3f, 0.9f, 0.5f, 0.8f, -1.1f, 1.2f,
+                                 2.8f, -1.6f, 0.0f, 0.7f, -2.2f});
+    model_fixture.QuantizeAndPopulate<uint8_t>(
+        model_fixture.input2(), {0.2f, 0.3f, -0.4f, 0.5f, 1.0f, 0.9f});
+    model_fixture.Invoke();
+    EXPECT_THAT(
+        model_fixture.GetDequantizedOutput(),
+        ElementsAreArray(ArrayFloatNear(test_outputs[i], kQuantizedTolerance)))
+        << "With shape number " << i;
+  }
+  // Re-run with exchanged inputs.
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    QuantizedAddOpModel model_fixture(
+        {TensorType_UINT8, test_shapes[i], -3.f, 3.f},
+        {TensorType_UINT8, base_shape, -3.f, 3.f},
+        {TensorType_UINT8, {}, -3.f, 3.f}, ActivationFunctionType_NONE);
+    model_fixture.QuantizeAndPopulate<uint8_t>(
+        model_fixture.input1(), {0.2f, 0.3f, -0.4f, 0.5f, 1.0f, 0.9f});
+    model_fixture.QuantizeAndPopulate<uint8_t>(
+        model_fixture.input2(), {-0.3f, 2.3f, 0.9f, 0.5f, 0.8f, -1.1f, 1.2f,
+                                 2.8f, -1.6f, 0.0f, 0.7f, -2.2f});
+    model_fixture.Invoke();
+    EXPECT_THAT(
+        model_fixture.GetDequantizedOutput(),
+        ElementsAreArray(ArrayFloatNear(test_outputs[i], kQuantizedTolerance)))
         << "With shape number " << i;
   }
 }
diff --git a/tensorflow/lite/kernels/dequantize.cc b/tensorflow/lite/kernels/dequantize.cc
index b2825bb9ea5a57789bf6f3aa312b09c43f07bbf7..7f03c73c9c960e3c134e33bf78a572f100405b7a 100644
--- a/tensorflow/lite/kernels/dequantize.cc
+++ b/tensorflow/lite/kernels/dequantize.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/dequantize.h"
 #include "tensorflow/lite/kernels/internal/tensor.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
 #include "tensorflow/lite/kernels/op_macros.h"
@@ -57,7 +58,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 
   OpContext op_context(context, node);
 
-  TF_LITE_ENSURE(context, op_context.input->type == kTfLiteUInt8);
+  TF_LITE_ENSURE(context, op_context.input->type == kTfLiteUInt8 ||
+                              op_context.input->type == kTfLiteInt8);
 
   op_context.output->type = kTfLiteFloat32;
   // If the input tensor is constant, we can persist the dequantized value in
@@ -80,10 +82,25 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   tflite::DequantizationParams op_params;
   op_params.zero_point = op_context.input->params.zero_point;
   op_params.scale = op_context.input->params.scale;
-  optimized_ops::Dequantize(op_params, GetTensorShape(op_context.input),
-                            GetTensorData<uint8_t>(op_context.input),
-                            GetTensorShape(op_context.output),
-                            GetTensorData<float>(op_context.output));
+  switch (op_context.input->type) {
+    case kTfLiteUInt8:
+      optimized_ops::Dequantize(op_params, GetTensorShape(op_context.input),
+                                GetTensorData<uint8_t>(op_context.input),
+                                GetTensorShape(op_context.output),
+                                GetTensorData<float>(op_context.output));
+      break;
+    case kTfLiteInt8:
+      reference_integer_ops::Dequantize(
+          op_params, GetTensorShape(op_context.input),
+          GetTensorData<int8_t>(op_context.input),
+          GetTensorShape(op_context.output),
+          GetTensorData<float>(op_context.output));
+      break;
+    default:
+      context->ReportError(context, "Type %d not supported.",
+                           op_context.input->type);
+      return kTfLiteError;
+  }
 
   if (IsConstantTensor(op_context.input)) {
     op_data->float_dequantized_weights_initialized = true;
diff --git a/tensorflow/lite/kernels/dequantize_test.cc b/tensorflow/lite/kernels/dequantize_test.cc
index 55265d93e527fdf69d8958c14ab9e347d57b3ce0..bb5f1e74a8b0174209043e14af9c35db32bf14b5 100644
--- a/tensorflow/lite/kernels/dequantize_test.cc
+++ b/tensorflow/lite/kernels/dequantize_test.cc
@@ -25,8 +25,16 @@ using ::testing::ElementsAreArray;
 
 class DequantizeOpModel : public SingleOpModel {
  public:
-  DequantizeOpModel(std::initializer_list<int> shape, float min, float max) {
-    input_ = AddInput({TensorType_UINT8, shape, min, max});
+  DequantizeOpModel(TensorType type, std::initializer_list<int> shape,
+                    float scale, int32_t zero_point) {
+    TensorData input_tensor_data;
+    input_tensor_data.type = type;
+    input_tensor_data.shape = shape;
+    input_tensor_data.min = 0;
+    input_tensor_data.max = 0;
+    input_tensor_data.scale = scale;
+    input_tensor_data.zero_point = zero_point;
+    input_ = AddInput(input_tensor_data);
     output_ = AddOutput({TensorType_FLOAT32, shape});
     SetBuiltinOp(BuiltinOperator_DEQUANTIZE, BuiltinOptions_DequantizeOptions,
                  CreateDequantizeOptions(builder_).Union());
@@ -34,7 +42,8 @@ class DequantizeOpModel : public SingleOpModel {
     BuildInterpreter({GetShape(input_)});
   }
 
-  void SetInput(std::initializer_list<uint8_t> data) {
+  template <typename T>
+  void SetInput(std::initializer_list<T> data) {
     PopulateTensor(input_, data);
   }
 
@@ -45,10 +54,22 @@ class DequantizeOpModel : public SingleOpModel {
   int output_;
 };
 
-TEST(SplitOpTest, FourDimensional) {
-  DequantizeOpModel m({2, 5}, -63.5, 64);
+TEST(DequantizeOpTest, UINT8) {
+  // [-63.5, 64] -> scale=0.5 zero_point=127 for UINT8
+  DequantizeOpModel m(TensorType_UINT8, {2, 5}, 0.5, 127);
 
-  m.SetInput({0, 1, 2, 3, 4, 251, 252, 253, 254, 255});
+  m.SetInput<uint8>({0, 1, 2, 3, 4, 251, 252, 253, 254, 255});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray(ArrayFloatNear(
+                  {-63.5, -63, -62.5, -62, -61.5, 62, 62.5, 63, 63.5, 64})));
+}
+
+TEST(DequantizeOpTest, INT8) {
+  // [-63.5, 64] -> scale=0.5, zero_point=1 for INT8
+  DequantizeOpModel m(TensorType_INT8, {2, 5}, 0.5, -1);
+
+  m.SetInput<int8>({-128, -127, -126, -125, -124, 123, 124, 125, 126, 127});
   m.Invoke();
   EXPECT_THAT(m.GetOutput(),
               ElementsAreArray(ArrayFloatNear(
diff --git a/tensorflow/lite/kernels/eigen_support.cc b/tensorflow/lite/kernels/eigen_support.cc
index 44e0086ad88303a5214161e533313923f9aed301..bad5975a7c187cc4bdcd65721d397897ff2cf09d 100644
--- a/tensorflow/lite/kernels/eigen_support.cc
+++ b/tensorflow/lite/kernels/eigen_support.cc
@@ -34,6 +34,15 @@ static_assert(
     "kDefaultArenaAlignment doesn't comply with Eigen alignment requirement.");
 #endif  // EIGEN_DONT_ALIGN
 
+// Helper routine for updating the global Eigen thread count used for OpenMP.
+void SetEigenNbThreads(int threads) {
+#if defined(EIGEN_HAS_OPENMP)
+  // The global Eigen thread count is only used when OpenMP is enabled. As this
+  // call causes problems with tsan, make it only when OpenMP is available.
+  Eigen::setNbThreads(context->recommended_num_threads);
+#endif  // defined(EIGEN_HAS_OPENMP)
+}
+
 // We have a single global threadpool for all convolution operations. This means
 // that inferences started from different threads may block each other, but
 // since the underlying resource of CPU cores should be consumed by the
@@ -78,7 +87,7 @@ void InitDevice(TfLiteContext* context, RefCountedEigenContext* ptr) {
 }
 
 TfLiteStatus Refresh(TfLiteContext* context) {
-  Eigen::setNbThreads(context->recommended_num_threads);
+  SetEigenNbThreads(context->recommended_num_threads);
 
   auto* ptr = GetEigenContext(context);
   if (ptr != nullptr) {
@@ -94,7 +103,7 @@ void IncrementUsageCounter(TfLiteContext* context) {
   auto* ptr = GetEigenContext(context);
   if (ptr == nullptr) {
     if (context->recommended_num_threads != -1) {
-      Eigen::setNbThreads(context->recommended_num_threads);
+      SetEigenNbThreads(context->recommended_num_threads);
     }
     ptr = new RefCountedEigenContext;
     ptr->type = kTfLiteEigenContext;
diff --git a/tensorflow/lite/kernels/fill.cc b/tensorflow/lite/kernels/fill.cc
new file mode 100644
index 0000000000000000000000000000000000000000..079ee44f3719f9fa283bf617ee3917eb4c377aff
--- /dev/null
+++ b/tensorflow/lite/kernels/fill.cc
@@ -0,0 +1,141 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace fill {
+
+namespace {
+
+constexpr int kDimsTensor = 0;
+constexpr int kValueTensor = 1;
+constexpr int kOutputTensor = 0;
+
+template <typename T>
+TfLiteStatus ResizeOutputImpl(TfLiteContext* context, const TfLiteTensor* dims,
+                              TfLiteTensor* output) {
+  TfLiteIntArray* output_shape = TfLiteIntArrayCreate(dims->dims->data[0]);
+  for (int i = 0; i < output_shape->size; ++i) {
+    T data = GetTensorData<T>(dims)[i];
+    if (data < 0) {
+      context->ReportError(context, "Fill dimensions must be >= 0", dims->type);
+      return kTfLiteError;
+    }
+    output_shape->data[i] = data;
+  }
+  return context->ResizeTensor(context, output, output_shape);
+}
+
+TfLiteStatus ResizeOutput(TfLiteContext* context, const TfLiteTensor* dims,
+                          TfLiteTensor* output) {
+  switch (dims->type) {
+    case kTfLiteInt32:
+      return ResizeOutputImpl<int32_t>(context, dims, output);
+    case kTfLiteInt64:
+      return ResizeOutputImpl<int64_t>(context, dims, output);
+    default:
+      context->ReportError(
+          context,
+          "Fill only currently supports int32, int64 for input 0, "
+          "got %d.",
+          dims->type);
+      return kTfLiteError;
+  }
+}
+
+}  // namespace
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  const TfLiteTensor* dims = GetInput(context, node, kDimsTensor);
+  const TfLiteTensor* value = GetInput(context, node, kValueTensor);
+
+  // Make sure the 1st input tensor is 1-D.
+  TF_LITE_ENSURE_EQ(context, NumDimensions(dims), 1);
+
+  // Make sure the 1st input tensor is int32 or int64.
+  const auto dtype = dims->type;
+  TF_LITE_ENSURE(context, dtype == kTfLiteInt32 || dtype == kTfLiteInt64);
+
+  // Make sure the 2nd input tensor is a scalar.
+  TF_LITE_ENSURE_EQ(context, NumDimensions(value), 0);
+
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  output->type = value->type;
+
+  if (IsConstantTensor(dims)) {
+    TF_LITE_ENSURE_OK(context, ResizeOutput(context, dims, output));
+  } else {
+    SetTensorToDynamic(output);
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* value = GetInput(context, node, kValueTensor);
+
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  if (IsDynamicTensor(output)) {
+    const TfLiteTensor* dims = GetInput(context, node, kDimsTensor);
+    TF_LITE_ENSURE_OK(context, ResizeOutput(context, dims, output));
+  }
+#define TF_LITE_FILL(data_type)                                               \
+  reference_ops::Fill(GetTensorShape(value), GetTensorData<data_type>(value), \
+                      GetTensorShape(output),                                 \
+                      GetTensorData<data_type>(output))
+  switch (output->type) {
+    case kTfLiteInt32:
+      TF_LITE_FILL(int32_t);
+      break;
+    case kTfLiteInt64:
+      TF_LITE_FILL(int64_t);
+      break;
+    case kTfLiteFloat32:
+      TF_LITE_FILL(float);
+      break;
+    default:
+      context->ReportError(
+          context,
+          "Fill only currently supports int32, int64, float32 for input 1,"
+          "got %d.",
+          value->type);
+      return kTfLiteError;
+  }
+#undef TF_LITE_FILL
+  return kTfLiteOk;
+}
+
+}  // namespace fill
+
+TfLiteRegistration* Register_FILL() {
+  static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
+                                 fill::Prepare, fill::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/fill_test.cc b/tensorflow/lite/kernels/fill_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..08044d76f9d95774fa1b0e37ebb6a9716e9809cb
--- /dev/null
+++ b/tensorflow/lite/kernels/fill_test.cc
@@ -0,0 +1,94 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+using ::testing::IsEmpty;
+
+class FillOpModel : public SingleOpModel {
+ public:
+  explicit FillOpModel(const TensorData& input1, const TensorData& input2) {
+    input1_ = AddInput(input1);
+    input2_ = AddInput(input2);
+    output_ = AddOutput(input1);
+    SetBuiltinOp(BuiltinOperator_FILL, BuiltinOptions_FillOptions,
+                 CreateFillOptions(builder_).Union());
+    BuildInterpreter({GetShape(input1_), GetShape(input2_)});
+  }
+
+  int input1() { return input1_; }
+  int input2() { return input2_; }
+  int output() { return output_; }
+
+ protected:
+  int input1_;
+  int input2_;
+  int output_;
+};
+
+TEST(FillOpModel, FillInt32) {
+  FillOpModel m({TensorType_INT32, {2}}, {TensorType_INT32});
+  m.PopulateTensor<int32_t>(m.input1(), {2, 3});
+  m.PopulateTensor<int32_t>(m.input2(), {-11});
+  m.Invoke();
+  EXPECT_THAT(m.ExtractVector<int32_t>(m.output()),
+              ElementsAreArray({-11, -11, -11, -11, -11, -11}));
+  EXPECT_THAT(m.GetTensorShape(m.output()), ElementsAreArray({2, 3}));
+}
+
+TEST(FillOpModel, FillInt64) {
+  FillOpModel m({TensorType_INT32, {2}}, {TensorType_INT64});
+  m.PopulateTensor<int32_t>(m.input1(), {2, 4});
+  m.PopulateTensor<int64_t>(m.input2(), {2 ^ 45});
+  m.Invoke();
+  EXPECT_THAT(m.ExtractVector<int64_t>(m.output()),
+              ElementsAreArray({2 ^ 45, 2 ^ 45, 2 ^ 45, 2 ^ 45, 2 ^ 45, 2 ^ 45,
+                                2 ^ 45, 2 ^ 45}));
+  EXPECT_THAT(m.GetTensorShape(m.output()), ElementsAreArray({2, 4}));
+}
+
+TEST(FillOpModel, FillFloat) {
+  FillOpModel m({TensorType_INT64, {3}}, {TensorType_FLOAT32});
+  m.PopulateTensor<int64_t>(m.input1(), {2, 2, 2});
+  m.PopulateTensor<float>(m.input2(), {4.0});
+  m.Invoke();
+  EXPECT_THAT(m.ExtractVector<float>(m.output()),
+              ElementsAreArray({4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0}));
+  EXPECT_THAT(m.GetTensorShape(m.output()), ElementsAreArray({2, 2, 2}));
+}
+
+TEST(FillOpModel, FillOutputScalar) {
+  FillOpModel m({TensorType_INT64, {0}}, {TensorType_FLOAT32});
+  m.PopulateTensor<float>(m.input2(), {4.0});
+  m.Invoke();
+  EXPECT_THAT(m.ExtractVector<float>(m.output()), ElementsAreArray({4.0}));
+  EXPECT_THAT(m.GetTensorShape(m.output()), IsEmpty());
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/kernels/gather.cc b/tensorflow/lite/kernels/gather.cc
index 61884d6a12c3e150d910244108a357dd34fe8783..f205daae1343cb0abecc95e7d1b280c10f55d897 100644
--- a/tensorflow/lite/kernels/gather.cc
+++ b/tensorflow/lite/kernels/gather.cc
@@ -118,7 +118,7 @@ TfLiteStatus GatherStrings(TfLiteContext* context, const TfLiteTensor* input,
     const auto string_ref = GetString(input, pos);
     buffer.AddString(string_ref.str, string_ref.len);
   }
-  buffer.WriteToTensor(output);
+  buffer.WriteToTensorAsVector(output);
   return kTfLiteOk;
 }
 
diff --git a/tensorflow/lite/kernels/hashtable_lookup.cc b/tensorflow/lite/kernels/hashtable_lookup.cc
index b6ae7a3d1a5479e8ac6996815de9cb02b472acaf..da1116cf858667b1fc35f3f88269b66f81afcdb7 100644
--- a/tensorflow/lite/kernels/hashtable_lookup.cc
+++ b/tensorflow/lite/kernels/hashtable_lookup.cc
@@ -137,7 +137,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
     }
   }
   if (output->type == kTfLiteString) {
-    buf.WriteToTensor(output);
+    buf.WriteToTensorAsVector(output);
   }
 
   return kTfLiteOk;
diff --git a/tensorflow/lite/kernels/internal/BUILD b/tensorflow/lite/kernels/internal/BUILD
index 7d2653f0a1dc96515687e6f57b43b6bc361289ec..69816583f5020843aeff76890f51c6c306f11a4f 100644
--- a/tensorflow/lite/kernels/internal/BUILD
+++ b/tensorflow/lite/kernels/internal/BUILD
@@ -254,7 +254,6 @@ cc_test(
     name = "tensor_test",
     srcs = ["tensor_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",  # TODO(b/117786830)
     ],
     deps = [
@@ -287,7 +286,6 @@ cc_test(
     name = "quantization_util_test",
     srcs = ["quantization_util_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",  # TODO(b/117786830)
     ],
     deps = [
@@ -315,6 +313,7 @@ cc_library(
         "reference/depthwiseconv_float.h",
         "reference/depthwiseconv_uint8.h",
         "reference/fully_connected.h",
+        "reference/integer_ops/dequantize.h",
         "reference/reference_ops.h",
         "reference/softmax.h",
     ],
@@ -562,7 +561,6 @@ cc_test(
     }),
     linkstatic = 1,
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -576,7 +574,6 @@ cc_test(
 cc_test(
     name = "depthwiseconv_float_test",
     srcs = ["depthwiseconv_float_test.cc"],
-    tags = ["no_oss"],
     deps = [
         ":optimized_base",
         ":reference_base",
@@ -591,7 +588,6 @@ cc_test(
     srcs = ["depthwiseconv_quantized_test.cc"],
     shard_count = 2,
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -607,7 +603,6 @@ cc_test(
     name = "resize_bilinear_test",
     srcs = ["resize_bilinear_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable",
     ],
     deps = [
@@ -623,7 +618,6 @@ cc_test(
     name = "resize_nearest_neighbor_test",
     srcs = ["resize_nearest_neighbor_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable",
     ],
     deps = [
@@ -641,7 +635,6 @@ cc_test(
     srcs = [
         "softmax_quantized_test.cc",
     ],
-    tags = ["no_oss"],
     deps = [
         ":optimized_base",
         ":quantization_util",
@@ -659,7 +652,6 @@ cc_test(
         "logsoftmax_quantized_test.cc",
     ],
     tags = [
-        "no_oss",
         "tflite_not_portable",
     ],
     deps = [
@@ -675,7 +667,6 @@ cc_test(
 cc_test(
     name = "log_quantized_test",
     srcs = ["log_quantized_test.cc"],
-    tags = ["no_oss"],
     deps = [
         ":optimized_base",
         ":reference_base",
@@ -703,7 +694,6 @@ cc_library(
 cc_test(
     name = "batch_to_space_nd_test",
     srcs = ["batch_to_space_nd_test.cc"],
-    tags = ["no_oss"],
     deps = [
         ":optimized_base",
         "@com_google_googletest//:gtest_main",
diff --git a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_float.h b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_float.h
index 25b66d4b5537f58e9a8795e05128e4cb4b3d2890..c77715de57990666b362b08dae7c21b9707d942c 100644
--- a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_float.h
+++ b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_float.h
@@ -793,22 +793,26 @@ void FloatDepthwiseConvAccumRow(int stride, int dilation_factor,
     int out_x_loop_end_unclampled = 0;
     if (kAllowStrided) {
       if (stride == 2) {
-        out_x_loop_start_unclampled = (pad_width - filter_x + 1) / 2;
+        out_x_loop_start_unclampled =
+            (pad_width - dilation_factor * filter_x + 1) / 2;
         out_x_loop_end_unclampled =
-            (pad_width + input_width - filter_x + 1) / 2;
+            (pad_width + input_width - dilation_factor * filter_x + 1) / 2;
       } else if (stride == 4) {
-        out_x_loop_start_unclampled = (pad_width - filter_x + 3) / 4;
+        out_x_loop_start_unclampled =
+            (pad_width - dilation_factor * filter_x + 3) / 4;
         out_x_loop_end_unclampled =
-            (pad_width + input_width - filter_x + 3) / 4;
+            (pad_width + input_width - dilation_factor * filter_x + 3) / 4;
       } else {
         out_x_loop_start_unclampled =
-            (pad_width - filter_x + stride - 1) / stride;
-        out_x_loop_end_unclampled =
-            (pad_width + input_width - filter_x + stride - 1) / stride;
+            (pad_width - dilation_factor * filter_x + stride - 1) / stride;
+        out_x_loop_end_unclampled = (pad_width + input_width -
+                                     dilation_factor * filter_x + stride - 1) /
+                                    stride;
       }
     } else {
-      out_x_loop_start_unclampled = pad_width - filter_x;
-      out_x_loop_end_unclampled = pad_width + input_width - filter_x;
+      out_x_loop_start_unclampled = pad_width - dilation_factor * filter_x;
+      out_x_loop_end_unclampled =
+          pad_width + input_width - dilation_factor * filter_x;
     }
     // The kernel will have to iterate on the segment of the
     // output row that starts at out_x_loop_start and out_x_loop_end.
@@ -819,7 +823,8 @@ void FloatDepthwiseConvAccumRow(int stride, int dilation_factor,
 
     float* acc_buffer_ptr =
         acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
-    const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x;
+    const int in_x_origin =
+        (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x;
     const float* input_ptr = input_data + in_x_origin * input_depth;
     const int num_output_pixels = out_x_loop_end - out_x_loop_start;
     FloatDepthwiseConvKernel<kAllowStrided, kFixedInputDepth,
@@ -936,8 +941,7 @@ inline void DepthwiseConv(
                                         FIXED_DEPTH_MULTIPLIER)           \
   if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) &&          \
       (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) &&     \
-      depth_multiplier == FIXED_DEPTH_MULTIPLIER &&                       \
-      dilation_height_factor == 1 && dilation_width_factor == 1) {        \
+      depth_multiplier == FIXED_DEPTH_MULTIPLIER) {                       \
     row_accum_func =                                                      \
         FloatDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH,      \
                                    FIXED_DEPTH_MULTIPLIER>;               \
diff --git a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
index df335e9e929e0b91be373cdb5933152392576ce0..bf3902ec31f98a6a1b388d10689b6167742b7bb9 100644
--- a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
@@ -64,6 +64,7 @@ using reference_ops::DepthConcatenation;
 using reference_ops::Dequantize;
 using reference_ops::Div;
 using reference_ops::FakeQuant;
+using reference_ops::Fill;
 using reference_ops::Gather;
 using reference_ops::Greater;
 using reference_ops::GreaterEqual;
@@ -2346,36 +2347,37 @@ inline void Add(const ArithmeticParams& params,
 inline void AddElementwise(int size, const ArithmeticParams& params,
                            const uint8* input1_data, const uint8* input2_data,
                            uint8* output_data) {
+  gemmlowp::ScopedProfilingLabel label("AddElementwise/8bit");
   int i = 0;
   TFLITE_DCHECK_GT(params.input1_offset, -256);
   TFLITE_DCHECK_GT(params.input2_offset, -256);
   TFLITE_DCHECK_LT(params.input1_offset, 256);
   TFLITE_DCHECK_LT(params.input2_offset, 256);
 #ifdef USE_NEON
-  const auto output_activation_min_vector =
+  const uint8x8_t output_activation_min_vector =
       vdup_n_u8(params.quantized_activation_min);
-  const auto output_activation_max_vector =
+  const uint8x8_t output_activation_max_vector =
       vdup_n_u8(params.quantized_activation_max);
   for (; i <= size - 8; i += 8) {
-    const auto input1_val_original = vld1_u8(input1_data + i);
-    const auto input2_val_original = vld1_u8(input2_data + i);
-    const auto input1_val_s16 =
+    const uint8x8_t input1_val_original = vld1_u8(input1_data + i);
+    const uint8x8_t input2_val_original = vld1_u8(input2_data + i);
+    const int16x8_t input1_val_s16 =
         vreinterpretq_s16_u16(vmovl_u8(input1_val_original));
-    const auto input2_val_s16 =
+    const int16x8_t input2_val_s16 =
         vreinterpretq_s16_u16(vmovl_u8(input2_val_original));
-    const auto input1_val =
+    const int16x8_t input1_val =
         vaddq_s16(input1_val_s16, vdupq_n_s16(params.input1_offset));
-    const auto input2_val =
+    const int16x8_t input2_val =
         vaddq_s16(input2_val_s16, vdupq_n_s16(params.input2_offset));
-    const auto input1_val_high = vget_high_s16(input1_val);
-    const auto input1_val_low = vget_low_s16(input1_val);
-    const auto input2_val_high = vget_high_s16(input2_val);
-    const auto input2_val_low = vget_low_s16(input2_val);
-    auto x11 = vmovl_s16(input1_val_low);
-    auto x12 = vmovl_s16(input1_val_high);
-    auto x21 = vmovl_s16(input2_val_low);
-    auto x22 = vmovl_s16(input2_val_high);
-    const auto left_shift_dup = vdupq_n_s32(params.left_shift);
+    const int16x4_t input1_val_high = vget_high_s16(input1_val);
+    const int16x4_t input1_val_low = vget_low_s16(input1_val);
+    const int16x4_t input2_val_high = vget_high_s16(input2_val);
+    const int16x4_t input2_val_low = vget_low_s16(input2_val);
+    int32x4_t x11 = vmovl_s16(input1_val_low);
+    int32x4_t x12 = vmovl_s16(input1_val_high);
+    int32x4_t x21 = vmovl_s16(input2_val_low);
+    int32x4_t x22 = vmovl_s16(input2_val_high);
+    const int32x4_t left_shift_dup = vdupq_n_s32(params.left_shift);
     x11 = vshlq_s32(x11, left_shift_dup);
     x12 = vshlq_s32(x12, left_shift_dup);
     x21 = vshlq_s32(x21, left_shift_dup);
@@ -2384,24 +2386,24 @@ inline void AddElementwise(int size, const ArithmeticParams& params,
     x12 = vqrdmulhq_n_s32(x12, params.input1_multiplier);
     x21 = vqrdmulhq_n_s32(x21, params.input2_multiplier);
     x22 = vqrdmulhq_n_s32(x22, params.input2_multiplier);
-    const auto input1_shift_dup = vdupq_n_s32(params.input1_shift);
-    const auto input2_shift_dup = vdupq_n_s32(params.input2_shift);
+    const int32x4_t input1_shift_dup = vdupq_n_s32(params.input1_shift);
+    const int32x4_t input2_shift_dup = vdupq_n_s32(params.input2_shift);
     x11 = vshlq_s32(x11, input1_shift_dup);
     x12 = vshlq_s32(x12, input1_shift_dup);
     x21 = vshlq_s32(x21, input2_shift_dup);
     x22 = vshlq_s32(x22, input2_shift_dup);
-    auto s1 = vaddq_s32(x11, x21);
-    auto s2 = vaddq_s32(x12, x22);
+    int32x4_t s1 = vaddq_s32(x11, x21);
+    int32x4_t s2 = vaddq_s32(x12, x22);
     s1 = vqrdmulhq_n_s32(s1, params.output_multiplier);
     s2 = vqrdmulhq_n_s32(s2, params.output_multiplier);
     using gemmlowp::RoundingDivideByPOT;
     s1 = RoundingDivideByPOT(s1, -params.output_shift);
     s2 = RoundingDivideByPOT(s2, -params.output_shift);
-    const auto s1_narrowed = vmovn_s32(s1);
-    const auto s2_narrowed = vmovn_s32(s2);
-    const auto s = vaddq_s16(vcombine_s16(s1_narrowed, s2_narrowed),
-                             vdupq_n_s16(params.output_offset));
-    const auto clamped =
+    const int16x4_t s1_narrowed = vmovn_s32(s1);
+    const int16x4_t s2_narrowed = vmovn_s32(s2);
+    const int16x8_t s = vaddq_s16(vcombine_s16(s1_narrowed, s2_narrowed),
+                                  vdupq_n_s16(params.output_offset));
+    const uint8x8_t clamped =
         vmax_u8(output_activation_min_vector,
                 vmin_u8(output_activation_max_vector, vqmovun_s16(s)));
     vst1_u8(output_data + i, clamped);
@@ -2431,6 +2433,109 @@ inline void AddElementwise(int size, const ArithmeticParams& params,
   }
 }
 
+// Scalar-broadcast add that can be used for inner loop of more general
+// broadcast add, so that, for example, scalar-broadcast with batch will still
+// be fast.
+inline void AddScalarBroadcast(int size, const ArithmeticParams& params,
+                               uint8 input1_data, const uint8* input2_data,
+                               uint8* output_data) {
+  using gemmlowp::RoundingDivideByPOT;
+
+  gemmlowp::ScopedProfilingLabel label("AddScalarBroadcast/8bit");
+  TFLITE_DCHECK_GT(params.input1_offset, -256);
+  TFLITE_DCHECK_GT(params.input2_offset, -256);
+  TFLITE_DCHECK_LT(params.input1_offset, 256);
+  TFLITE_DCHECK_LT(params.input2_offset, 256);
+
+  int i = 0;
+
+#ifdef USE_NEON
+  const int32x4_t left_shift_dup = vdupq_n_s32(params.left_shift);
+  const uint8x8_t output_activation_min_vector =
+      vdup_n_u8(params.quantized_activation_min);
+  const uint8x8_t output_activation_max_vector =
+      vdup_n_u8(params.quantized_activation_max);
+
+  // Process broadcast scalar.
+  const uint8x8_t input1_val_original = vdup_n_u8(input1_data);
+  const int16x8_t input1_val_s16 =
+      vreinterpretq_s16_u16(vmovl_u8(input1_val_original));
+  const int16x8_t input1_val =
+      vaddq_s16(input1_val_s16, vdupq_n_s16(params.input1_offset));
+  const int16x4_t input1_val_high = vget_high_s16(input1_val);
+  const int16x4_t input1_val_low = vget_low_s16(input1_val);
+  int32x4_t x11 = vmovl_s16(input1_val_low);
+  int32x4_t x12 = vmovl_s16(input1_val_high);
+  x11 = vshlq_s32(x11, left_shift_dup);
+  x12 = vshlq_s32(x12, left_shift_dup);
+  x11 = vqrdmulhq_n_s32(x11, params.input1_multiplier);
+  x12 = vqrdmulhq_n_s32(x12, params.input1_multiplier);
+  const int32x4_t input1_shift_dup = vdupq_n_s32(params.input1_shift);
+  x11 = vshlq_s32(x11, input1_shift_dup);
+  x12 = vshlq_s32(x12, input1_shift_dup);
+
+  for (; i <= size - 8; i += 8) {
+    const uint8x8_t input2_val_original = vld1_u8(input2_data + i);
+    const int16x8_t input2_val_s16 =
+        vreinterpretq_s16_u16(vmovl_u8(input2_val_original));
+    const int16x8_t input2_val =
+        vaddq_s16(input2_val_s16, vdupq_n_s16(params.input2_offset));
+    const int16x4_t input2_val_high = vget_high_s16(input2_val);
+    const int16x4_t input2_val_low = vget_low_s16(input2_val);
+    int32x4_t x21 = vmovl_s16(input2_val_low);
+    int32x4_t x22 = vmovl_s16(input2_val_high);
+    x21 = vshlq_s32(x21, left_shift_dup);
+    x22 = vshlq_s32(x22, left_shift_dup);
+    x21 = vqrdmulhq_n_s32(x21, params.input2_multiplier);
+    x22 = vqrdmulhq_n_s32(x22, params.input2_multiplier);
+    const int32x4_t input2_shift_dup = vdupq_n_s32(params.input2_shift);
+    x21 = vshlq_s32(x21, input2_shift_dup);
+    x22 = vshlq_s32(x22, input2_shift_dup);
+    int32x4_t s1 = vaddq_s32(x11, x21);
+    int32x4_t s2 = vaddq_s32(x12, x22);
+    s1 = vqrdmulhq_n_s32(s1, params.output_multiplier);
+    s2 = vqrdmulhq_n_s32(s2, params.output_multiplier);
+    s1 = RoundingDivideByPOT(s1, -params.output_shift);
+    s2 = RoundingDivideByPOT(s2, -params.output_shift);
+    const int16x4_t s1_narrowed = vmovn_s32(s1);
+    const int16x4_t s2_narrowed = vmovn_s32(s2);
+    const int16x8_t s = vaddq_s16(vcombine_s16(s1_narrowed, s2_narrowed),
+                                  vdupq_n_s16(params.output_offset));
+    const uint8x8_t clamped =
+        vmax_u8(output_activation_min_vector,
+                vmin_u8(output_activation_max_vector, vqmovun_s16(s)));
+    vst1_u8(output_data + i, clamped);
+  }
+#endif  // NEON
+
+  if (i < size) {
+    // Process broadcast scalar.
+    const int32 input1_val = params.input1_offset + input1_data;
+    const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
+    const int32 scaled_input1_val =
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(
+            shifted_input1_val, params.input1_multiplier, params.input1_shift);
+
+    for (; i < size; ++i) {
+      const int32 input2_val = params.input2_offset + input2_data[i];
+      const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
+      const int32 scaled_input2_val =
+          MultiplyByQuantizedMultiplierSmallerThanOneExp(
+              shifted_input2_val, params.input2_multiplier,
+              params.input2_shift);
+      const int32 raw_sum = scaled_input1_val + scaled_input2_val;
+      const int32 raw_output =
+          MultiplyByQuantizedMultiplierSmallerThanOneExp(
+              raw_sum, params.output_multiplier, params.output_shift) +
+          params.output_offset;
+      const int32 clamped_output =
+          std::min(params.quantized_activation_max,
+                   std::max(params.quantized_activation_min, raw_output));
+      output_data[i] = static_cast<uint8>(clamped_output);
+    }
+  }
+}
+
 inline void Add(const ArithmeticParams& params,
                 const RuntimeShape& input1_shape, const uint8* input1_data,
                 const RuntimeShape& input2_shape, const uint8* input2_data,
@@ -2545,26 +2650,63 @@ inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params,
   uint8* output_data_ptr = output_data;
   const uint8* input1_data_ptr = input1_data;
   const uint8* input2_data_reset = input2_data;
+  // In the fivefold pattern, y0, y2 and y4 are not broadcast, and so shared
+  // between input shapes. y3 for input 1 is always broadcast, and so the
+  // dimension there is 1, whereas optionally y1 might be broadcast for input 2.
+  // Put another way,
+  // input1.shape.FlatSize = y0 * y1 * y2 * y4,
+  // input2.shape.FlatSize = y0 * y2 * y3 * y4.
   int y0 = params.broadcast_shape[0];
   int y1 = params.broadcast_shape[1];
   int y2 = params.broadcast_shape[2];
   int y3 = params.broadcast_shape[3];
   int y4 = params.broadcast_shape[4];
-  for (int i0 = 0; i0 < y0; ++i0) {
-    const uint8* input2_data_ptr;
-    for (int i1 = 0; i1 < y1; ++i1) {
-      input2_data_ptr = input2_data_reset;
-      for (int i2 = 0; i2 < y2; ++i2) {
-        for (int i3 = 0; i3 < y3; ++i3) {
-          AddElementwise(y4, params, input1_data_ptr, input2_data_ptr,
-                         output_data_ptr);
-          input2_data_ptr += y4;
-          output_data_ptr += y4;
+  if (y4 > 1) {
+    // General fivefold pattern, with y4 > 1 so there is a non-broadcast inner
+    // dimension.
+    for (int i0 = 0; i0 < y0; ++i0) {
+      const uint8* input2_data_ptr;
+      for (int i1 = 0; i1 < y1; ++i1) {
+        input2_data_ptr = input2_data_reset;
+        for (int i2 = 0; i2 < y2; ++i2) {
+          for (int i3 = 0; i3 < y3; ++i3) {
+            AddElementwise(y4, params, input1_data_ptr, input2_data_ptr,
+                           output_data_ptr);
+            input2_data_ptr += y4;
+            output_data_ptr += y4;
+          }
+          // We have broadcast y4 of input1 data y3 times, and now move on.
+          input1_data_ptr += y4;
+        }
+      }
+      // We have broadcast y2*y3*y4 of input2 data y1 times, and now move on.
+      input2_data_reset = input2_data_ptr;
+    }
+  } else {
+    // Special case of y4 == 1, in which the innermost loop is a single element
+    // and can be combined with the next (y3) as an inner broadcast.
+    //
+    // Note that this handles the case of pure scalar broadcast when
+    // y0 == y1 == y2 == 1. With low overhead it handles cases such as scalar
+    // broadcast with batch (as y2 > 1).
+    //
+    // NOTE The process is the same as the above general case except simplified
+    // for y4 == 1 and the loop over y3 is contained within the
+    // AddScalarBroadcast function.
+    for (int i0 = 0; i0 < y0; ++i0) {
+      const uint8* input2_data_ptr;
+      for (int i1 = 0; i1 < y1; ++i1) {
+        input2_data_ptr = input2_data_reset;
+        for (int i2 = 0; i2 < y2; ++i2) {
+          AddScalarBroadcast(y3, params, *input1_data_ptr, input2_data_ptr,
+                             output_data_ptr);
+          input2_data_ptr += y3;
+          output_data_ptr += y3;
+          input1_data_ptr += 1;
         }
-        input1_data_ptr += y4;
       }
+      input2_data_reset = input2_data_ptr;
     }
-    input2_data_reset = input2_data_ptr;
   }
 }
 
@@ -3581,8 +3723,8 @@ inline void AveragePool(const PoolParams& params,
             std::min(params.filter_height, input_height - in_y_origin);
         const int filter_count =
             (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start);
-        // 1280 required by Inception v3
-        static constexpr int kAccBufferMaxSize = 2048;
+        // 2560 is required by MobileNetV2 with depth multiplier 2.
+        static constexpr int kAccBufferMaxSize = 4096;
         TFLITE_DCHECK_LE(depth, kAccBufferMaxSize);
         uint16 acc[kAccBufferMaxSize];
         memset(acc, 0, depth * sizeof(acc[0]));
@@ -3747,8 +3889,8 @@ inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
         const int filter_y_start = std::max(0, -in_y_origin);
         const int filter_y_end =
             std::min(params.filter_height, input_height - in_y_origin);
-        // 2048 required by Inception v3
-        static constexpr int kAccBufferMaxSize = 2048;
+        // 2560 is required by MobileNetV2 with depth multiplier 2.
+        static constexpr int kAccBufferMaxSize = 4096;
         TFLITE_DCHECK_LE(depth, kAccBufferMaxSize);
         uint8 acc[kAccBufferMaxSize];
         memset(acc, 0, depth * sizeof(acc[0]));
diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/README.md b/tensorflow/lite/kernels/internal/reference/integer_ops/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4b1d3c91d50a4c77865ec25fa9961f745a489aea
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/README.md
@@ -0,0 +1,8 @@
+This directory contains reference implementations for int8 fully integer kernels.
+
+Weight filters of convs are expected to be symmetric per-channel quantized in
+the range [-127, 127].
+Inputs/activations are expected to be asymmetric per-layer quantized in the
+range [-128, 127].
+
+THESE ARE EXPERIMENTAL AND PRONE TO CHANGE.
diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/dequantize.h b/tensorflow/lite/kernels/internal/reference/integer_ops/dequantize.h
new file mode 100644
index 0000000000000000000000000000000000000000..03dcb6c220d3fcbbd219df3a1a1ea5f3b2b29c81
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/dequantize.h
@@ -0,0 +1,42 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEQUANTIZE_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEQUANTIZE_H_
+
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace reference_integer_ops {
+
+inline void Dequantize(const tflite::DequantizationParams& op_params,
+                       const RuntimeShape& input_shape, const int8* input_data,
+                       const RuntimeShape& output_shape, float* output_data) {
+  const int32 zero_point = op_params.zero_point;
+  const double scale = op_params.scale;
+  const int flat_size = MatchingFlatSize(input_shape, output_shape);
+
+  for (int i = 0; i < flat_size; i++) {
+    const int32 val = input_data[i];
+    const float result = static_cast<float>(scale * (val - zero_point));
+    output_data[i] = result;
+  }
+}
+
+}  // namespace reference_integer_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEQUANTIZE_H_
diff --git a/tensorflow/lite/kernels/internal/reference/reference_ops.h b/tensorflow/lite/kernels/internal/reference/reference_ops.h
index be766ea452700f5109380730def4ff2ff2878032..b7b9139428919f9210f1bcf34dd78f696d044e31 100644
--- a/tensorflow/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/lite/kernels/internal/reference/reference_ops.h
@@ -735,6 +735,40 @@ inline void AddElementwise(int size, const ArithmeticParams& params,
   }
 }
 
+// Scalar-broadcast add that can be used for inner loop of more general
+// broadcast add, so that, for example, scalar-broadcast with batch will still
+// be fast.
+inline void AddScalarBroadcast(int size, const ArithmeticParams& params,
+                               uint8 input1_data, const uint8* input2_data,
+                               uint8* output_data) {
+  TFLITE_DCHECK_GT(params.input1_offset, -256);
+  TFLITE_DCHECK_GT(params.input2_offset, -256);
+  TFLITE_DCHECK_LT(params.input1_offset, 256);
+  TFLITE_DCHECK_LT(params.input2_offset, 256);
+
+  const int32 input1_val = params.input1_offset + input1_data;
+  const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
+  const int32 scaled_input1_val =
+      MultiplyByQuantizedMultiplierSmallerThanOneExp(
+          shifted_input1_val, params.input1_multiplier, params.input1_shift);
+  for (int i = 0; i < size; ++i) {
+    const int32 input2_val = params.input2_offset + input2_data[i];
+    const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
+    const int32 scaled_input2_val =
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(
+            shifted_input2_val, params.input2_multiplier, params.input2_shift);
+    const int32 raw_sum = scaled_input1_val + scaled_input2_val;
+    const int32 raw_output =
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(
+            raw_sum, params.output_multiplier, params.output_shift) +
+        params.output_offset;
+    const int32 clamped_output =
+        std::min(params.quantized_activation_max,
+                 std::max(params.quantized_activation_min, raw_output));
+    output_data[i] = static_cast<uint8>(clamped_output);
+  }
+}
+
 inline void Add(const ArithmeticParams& params,
                 const RuntimeShape& input1_shape, const uint8* input1_data,
                 const RuntimeShape& input2_shape, const uint8* input2_data,
@@ -975,26 +1009,63 @@ inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params,
   uint8* output_data_ptr = output_data;
   const uint8* input1_data_ptr = input1_data;
   const uint8* input2_data_reset = input2_data;
+  // In the fivefold pattern, y0, y2 and y4 are not broadcast, and so shared
+  // between input shapes. y3 for input 1 is always broadcast, and so the
+  // dimension there is 1, whereas optionally y1 might be broadcast for input 2.
+  // Put another way,
+  // input1.shape.FlatSize = y0 * y1 * y2 * y4,
+  // input2.shape.FlatSize = y0 * y2 * y3 * y4.
   int y0 = params.broadcast_shape[0];
   int y1 = params.broadcast_shape[1];
   int y2 = params.broadcast_shape[2];
   int y3 = params.broadcast_shape[3];
   int y4 = params.broadcast_shape[4];
-  for (int i0 = 0; i0 < y0; ++i0) {
-    const uint8* input2_data_ptr;
-    for (int i1 = 0; i1 < y1; ++i1) {
-      input2_data_ptr = input2_data_reset;
-      for (int i2 = 0; i2 < y2; ++i2) {
-        for (int i3 = 0; i3 < y3; ++i3) {
-          AddElementwise(y4, params, input1_data_ptr, input2_data_ptr,
-                         output_data_ptr);
-          input2_data_ptr += y4;
-          output_data_ptr += y4;
+  if (y4 > 1) {
+    // General fivefold pattern, with y4 > 1 so there is a non-broadcast inner
+    // dimension.
+    for (int i0 = 0; i0 < y0; ++i0) {
+      const uint8* input2_data_ptr;
+      for (int i1 = 0; i1 < y1; ++i1) {
+        input2_data_ptr = input2_data_reset;
+        for (int i2 = 0; i2 < y2; ++i2) {
+          for (int i3 = 0; i3 < y3; ++i3) {
+            AddElementwise(y4, params, input1_data_ptr, input2_data_ptr,
+                           output_data_ptr);
+            input2_data_ptr += y4;
+            output_data_ptr += y4;
+          }
+          // We have broadcast y4 of input1 data y3 times, and now move on.
+          input1_data_ptr += y4;
         }
-        input1_data_ptr += y4;
       }
+      // We have broadcast y2*y3*y4 of input2 data y1 times, and now move on.
+      input2_data_reset = input2_data_ptr;
+    }
+  } else {
+    // Special case of y4 == 1, in which the innermost loop is a single element
+    // and can be combined with the next (y3) as an inner broadcast.
+    //
+    // Note that this handles the case of pure scalar broadcast when
+    // y0 == y1 == y2 == 1. With low overhead it handles cases such as scalar
+    // broadcast with batch (as y2 > 1).
+    //
+    // NOTE The process is the same as the above general case except simplified
+    // for y4 == 1 and the loop over y3 is contained within the
+    // AddScalarBroadcast function.
+    for (int i0 = 0; i0 < y0; ++i0) {
+      const uint8* input2_data_ptr;
+      for (int i1 = 0; i1 < y1; ++i1) {
+        input2_data_ptr = input2_data_reset;
+        for (int i2 = 0; i2 < y2; ++i2) {
+          AddScalarBroadcast(y3, params, *input1_data_ptr, input2_data_ptr,
+                             output_data_ptr);
+          input2_data_ptr += y3;
+          output_data_ptr += y3;
+          input1_data_ptr += 1;
+        }
+      }
+      input2_data_reset = input2_data_ptr;
     }
-    input2_data_reset = input2_data_ptr;
   }
 }
 
@@ -4613,6 +4684,16 @@ inline void BroadcastPrelu4DSlow(const PreluParams& params,
   }
 }
 
+template <typename T>
+void Fill(const RuntimeShape& value_shape, const T* value_data,
+          const RuntimeShape& output_shape, T* output_data) {
+  TFLITE_DCHECK_EQ(value_shape.DimensionsCount(), 0);
+  const int flat_size = output_shape.FlatSize();
+  for (int i = 0; i < flat_size; ++i) {
+    output_data[i] = *value_data;
+  }
+}
+
 }  // namespace reference_ops
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/internal/tensor_ctypes.h b/tensorflow/lite/kernels/internal/tensor_ctypes.h
index b4822d57019b508f7e3d53403ff427f461ed263f..4a94b703f8b299e503305aaa897a2ebc65e50d3b 100644
--- a/tensorflow/lite/kernels/internal/tensor_ctypes.h
+++ b/tensorflow/lite/kernels/internal/tensor_ctypes.h
@@ -53,6 +53,11 @@ inline bool* GetTensorData(TfLiteTensor* tensor) {
   return tensor != nullptr ? tensor->data.b : nullptr;
 }
 
+template <>
+inline int8_t* GetTensorData(TfLiteTensor* tensor) {
+  return tensor != nullptr ? tensor->data.int8 : nullptr;
+}
+
 template <typename T>
 inline const T* GetTensorData(const TfLiteTensor* tensor);
 
diff --git a/tensorflow/lite/kernels/layer_norm_lstm.cc b/tensorflow/lite/kernels/layer_norm_lstm.cc
index 5b0046a7b31c9c2e805c6de48572776cf8d3883c..49e8a53c829a0c4a8ae355f8e7a6b97e3bbb81e1 100644
--- a/tensorflow/lite/kernels/layer_norm_lstm.cc
+++ b/tensorflow/lite/kernels/layer_norm_lstm.cc
@@ -55,7 +55,7 @@ constexpr int kCellToForgetWeightsTensor = 10;  // Optional
 constexpr int kCellToOutputWeightsTensor = 11;  // Optional
 
 // Layer norm weights tensors of size {n_cell}, representing a diagonal matrix.
-constexpr int kInputLayerNormWeightsTensor = 12;
+constexpr int kInputLayerNormWeightsTensor = 12;  // Optional
 constexpr int kForgetLayerNormWeightsTensor = 13;
 constexpr int kCellLayerNormWeightsTensor = 14;
 constexpr int kOutputLayerNormWeightsTensor = 15;
@@ -118,7 +118,8 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
 
   const TfLiteTensor* input_to_input_weights =
       GetOptionalInputTensor(context, node, kInputToInputWeightsTensor);
-  if (input_to_input_weights != nullptr) {
+  const bool use_cifg = (input_to_input_weights == nullptr);
+  if (!use_cifg) {
     TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->size, 2);
     TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[0], n_cell);
     TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[1], n_input);
@@ -138,7 +139,9 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
 
   const TfLiteTensor* recurrent_to_input_weights =
       GetOptionalInputTensor(context, node, kRecurrentToInputWeightsTensor);
-  if (recurrent_to_input_weights != nullptr) {
+  if (use_cifg) {
+    TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights, nullptr);
+  } else {
     TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->size, 2);
     TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->data[0],
                       n_cell);
@@ -161,15 +164,6 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
   TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->data[1],
                     n_output);
 
-  // We make sure the input-gate's parameters are either both present (regular
-  // LSTM) or not at all (CIFG-LSTM).
-  const bool cifg_weights_all_or_none =
-      ((input_to_input_weights != nullptr) &&
-       (recurrent_to_input_weights != nullptr)) ||
-      ((input_to_input_weights == nullptr) &&
-       (recurrent_to_input_weights == nullptr));
-  TF_LITE_ENSURE(context, cifg_weights_all_or_none == true);
-
   const TfLiteTensor* cell_to_input_weights =
       GetOptionalInputTensor(context, node, kCellToInputWeightsTensor);
   if (cell_to_input_weights) {
@@ -192,7 +186,6 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
   }
 
   // Making sure the peephole weights are there all or none.
-  const bool use_cifg = (input_to_input_weights == nullptr);
   const bool peephole_weights_all_or_none =
       ((cell_to_input_weights != nullptr || use_cifg) &&
        (cell_to_forget_weights != nullptr) &&
@@ -204,10 +197,14 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
 
   // Making sure layer norm weights are not null and have the right dimension.
   const TfLiteTensor* input_layer_norm_weights =
-      GetInput(context, node, kInputLayerNormWeightsTensor);
-  TF_LITE_ENSURE(context, input_layer_norm_weights != nullptr);
-  TF_LITE_ENSURE_EQ(context, input_layer_norm_weights->dims->size, 1);
-  TF_LITE_ENSURE_EQ(context, input_layer_norm_weights->dims->data[0], n_cell);
+      GetOptionalInputTensor(context, node, kInputLayerNormWeightsTensor);
+  if (use_cifg) {
+    TF_LITE_ENSURE_EQ(context, input_layer_norm_weights, nullptr);
+  } else {
+    TF_LITE_ENSURE(context, input_layer_norm_weights != nullptr);
+    TF_LITE_ENSURE_EQ(context, input_layer_norm_weights->dims->size, 1);
+    TF_LITE_ENSURE_EQ(context, input_layer_norm_weights->dims->data[0], n_cell);
+  }
 
   const TfLiteTensor* forget_layer_norm_weights =
       GetInput(context, node, kForgetLayerNormWeightsTensor);
@@ -978,6 +975,9 @@ TfLiteStatus EvalFloat(
       (projection_weights == nullptr) ? nullptr : projection_weights->data.f;
   const float* projection_bias_ptr =
       (projection_bias == nullptr) ? nullptr : projection_bias->data.f;
+  const float* input_layer_norm_weight_ptr =
+      (input_layer_norm_weights == nullptr) ? nullptr
+                                            : input_layer_norm_weights->data.f;
 
   // Required tensors, pointers are non-null.
   const float* input_ptr_batch = input->data.f;
@@ -990,7 +990,6 @@ TfLiteStatus EvalFloat(
       recurrent_to_cell_weights->data.f;
   const float* recurrent_to_output_weights_ptr =
       recurrent_to_output_weights->data.f;
-  const float* input_layer_norm_weight_ptr = input_layer_norm_weights->data.f;
   const float* forget_layer_norm_weight_ptr = forget_layer_norm_weights->data.f;
   const float* cell_layer_norm_weight_ptr = cell_layer_norm_weights->data.f;
   const float* output_layer_norm_weight_ptr = output_layer_norm_weights->data.f;
@@ -1115,6 +1114,9 @@ TfLiteStatus EvalHybrid(
       (projection_weights == nullptr) ? 1.0f : projection_weights->params.scale;
   const float* projection_bias_ptr =
       (projection_bias == nullptr) ? nullptr : projection_bias->data.f;
+  const float* input_layer_norm_weight_ptr =
+      (input_layer_norm_weights == nullptr) ? nullptr
+                                            : input_layer_norm_weights->data.f;
 
   // Required tensors, pointers are non-null.
   const float* input_ptr_batch = input->data.f;
@@ -1141,7 +1143,6 @@ TfLiteStatus EvalHybrid(
       reinterpret_cast<int8_t*>(recurrent_to_output_weights->data.uint8);
   const float recurrent_to_output_weights_scale =
       recurrent_to_output_weights->params.scale;
-  const float* input_layer_norm_weight_ptr = input_layer_norm_weights->data.f;
   const float* forget_layer_norm_weight_ptr = forget_layer_norm_weights->data.f;
   const float* cell_layer_norm_weight_ptr = cell_layer_norm_weights->data.f;
   const float* output_layer_norm_weight_ptr = output_layer_norm_weights->data.f;
@@ -1221,7 +1222,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       GetOptionalInputTensor(context, node, kCellToOutputWeightsTensor);
 
   const TfLiteTensor* input_layer_norm_weights =
-      GetInput(context, node, kInputLayerNormWeightsTensor);
+      GetOptionalInputTensor(context, node, kInputLayerNormWeightsTensor);
   const TfLiteTensor* forget_layer_norm_weights =
       GetInput(context, node, kForgetLayerNormWeightsTensor);
   const TfLiteTensor* cell_layer_norm_weights =
diff --git a/tensorflow/lite/kernels/layer_norm_lstm_test.cc b/tensorflow/lite/kernels/layer_norm_lstm_test.cc
index e89bce50c311eb0bf685a7da487c18704e831c91..1c13cee1c3f66ed2a3459cd2bcc32211c3b1f00e 100644
--- a/tensorflow/lite/kernels/layer_norm_lstm_test.cc
+++ b/tensorflow/lite/kernels/layer_norm_lstm_test.cc
@@ -83,7 +83,11 @@ class LayerNormLSTMOpModel : public SingleOpModel {
       cell_to_output_weights_ = AddNullInput();
     }
 
-    input_layer_norm_weights_ = AddInput(TensorType_FLOAT32);
+    if (use_cifg) {
+      input_layer_norm_weights_ = AddNullInput();
+    } else {
+      input_layer_norm_weights_ = AddInput(TensorType_FLOAT32);
+    }
     forget_layer_norm_weights_ = AddInput(TensorType_FLOAT32);
     cell_layer_norm_weights_ = AddInput(TensorType_FLOAT32);
     output_layer_norm_weights_ = AddInput(TensorType_FLOAT32);
@@ -650,6 +654,223 @@ TEST_F(NoCifgPeepholeProjectionNoClippingLayerNormLstmTest,
                 &layer_norm_lstm);
 }
 
+class CifgPeepholeProjectionNoClippingLayerNormLstmTest
+    : public BaseLayerNormLstmTest {
+  void SetUp() override {
+    input_to_forget_weights_ = {-0.6, -0.1, 0.3,  0.2,  0.9,  -0.5, -0.2,
+                                -0.4, 0.3,  -0.8, -0.4, 0.3,  -0.5, -0.4,
+                                -0.6, 0.3,  -0.4, -0.6, -0.5, -0.5};
+    input_to_cell_weights_ = {-0.4, -0.3, -0.2, -0.1, -0.5, 0.5,  -0.2,
+                              -0.3, -0.2, -0.6, 0.6,  -0.1, -0.4, -0.3,
+                              -0.7, 0.7,  -0.9, -0.5, 0.8,  0.6};
+    input_to_output_weights_ = {-0.8, -0.4, -0.2, -0.9, -0.1, -0.7, 0.3,
+                                -0.3, -0.8, -0.2, 0.6,  -0.2, 0.4,  -0.7,
+                                -0.3, -0.5, 0.1,  0.5,  -0.6, -0.4};
+
+    forget_gate_bias_ = {0.1, -0.3, -0.2, 0.1};
+    cell_gate_bias_ = {-0.05, 0.72, 0.25, 0.08};
+    output_gate_bias_ = {0.05, -0.01, 0.2, 0.1};
+
+    recurrent_to_cell_weights_ = {-0.3, 0.2, 0.1, -0.3, 0.8,  -0.08,
+                                  -0.2, 0.3, 0.8, -0.6, -0.1, 0.2};
+    recurrent_to_forget_weights_ = {-0.5, -0.3, -0.5, -0.2, 0.6, 0.4,
+                                    0.9,  0.3,  -0.1, 0.2,  0.5, 0.2};
+    recurrent_to_output_weights_ = {0.3,  -0.1, 0.1,  -0.2, -0.5, -0.7,
+                                    -0.2, -0.6, -0.1, -0.4, -0.7, -0.2};
+
+    cell_to_forget_weights_ = {-0.02, -0.15, -0.25, -0.03};
+    cell_to_output_weights_ = {0.1, -0.1, -0.5, 0.05};
+
+    forget_layer_norm_weights_ = {0.2, 0.2, 0.4, 0.3};
+    cell_layer_norm_weights_ = {0.7, 0.2, 0.3, 0.8};
+    output_layer_norm_weights_ = {0.6, 0.2, 0.2, 0.5};
+    projection_weights_ = {-0.1, 0.2,  0.01, -0.2, 0.1,  0.5,
+                           0.3,  0.08, 0.07, 0.2,  -0.4, 0.2};
+
+    layer_norm_lstm_input_ = {
+        {// Batch0: 3 (input_sequence_size) * 5 (n_input)
+         0.7, 0.8, 0.1, 0.2, 0.3,   // seq 0
+         0.8, 0.1, 0.2, 0.4, 0.5,   // seq 1
+         0.2, 0.7, 0.7, 0.1, 0.7},  // seq 2
+
+        {// Batch1: 3 (input_sequence_size) * 5 (n_input)
+         0.3, 0.2, 0.9, 0.8, 0.1,   // seq 0
+         0.1, 0.5, 0.2, 0.4, 0.2,   // seq 1
+         0.6, 0.9, 0.2, 0.5, 0.7},  // seq 2
+    };
+  }
+};
+
+TEST_F(CifgPeepholeProjectionNoClippingLayerNormLstmTest,
+       LayerNormLstmBlackBoxTest) {
+  const int n_batch = 2;
+  const int n_input = 5;
+  const int n_cell = 4;
+  const int n_output = 3;
+  const float ceil_clip = 0.0;
+  const float proj_clip = 0.0;
+
+  LayerNormLSTMOpModel layer_norm_lstm(
+      n_batch, n_input, n_cell, n_output,
+      /*use_cifg=*/true, /*use_peephole=*/true,
+      /*use_projection_weights=*/true,
+      /*use_projection_bias=*/false, ceil_clip, proj_clip,
+      {
+          {n_batch, n_input},  // input tensor
+
+          {0, 0},             // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {0, 0},              // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {0},       // cell_to_input_weight tensor
+          {n_cell},  // cell_to_forget_weight tensor
+          {n_cell},  // cell_to_output_weight tensor
+
+          {0},       // input_layer_norm_weight tensor
+          {n_cell},  // forget_layer_norm_weight tensor
+          {n_cell},  // cell_layer_norm_weight tensor
+          {n_cell},  // output_layer_norm_weight tensor
+
+          {0},       // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {n_output, n_cell},  // projection_weight tensor
+          {0},                 // projection_bias tensor
+      });
+
+  layer_norm_lstm.SetInputToCellWeights(input_to_cell_weights_);
+  layer_norm_lstm.SetInputToForgetWeights(input_to_forget_weights_);
+  layer_norm_lstm.SetInputToOutputWeights(input_to_output_weights_);
+
+  layer_norm_lstm.SetCellBias(cell_gate_bias_);
+  layer_norm_lstm.SetForgetGateBias(forget_gate_bias_);
+  layer_norm_lstm.SetOutputGateBias(output_gate_bias_);
+
+  layer_norm_lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_);
+  layer_norm_lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_);
+  layer_norm_lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_);
+
+  layer_norm_lstm.SetCellToForgetWeights(cell_to_forget_weights_);
+  layer_norm_lstm.SetCellToOutputWeights(cell_to_output_weights_);
+
+  layer_norm_lstm.SetForgetLayerNormWeights(forget_layer_norm_weights_);
+  layer_norm_lstm.SetCellLayerNormWeights(cell_layer_norm_weights_);
+  layer_norm_lstm.SetOutputLayerNormWeights(output_layer_norm_weights_);
+
+  layer_norm_lstm.SetProjectionWeights(projection_weights_);
+
+  // Verify the final output.
+  const std::vector<std::vector<float>> layer_norm_lstm_golden_output = {
+      {
+          // Batch0: 3 (input_sequence_size) * 3 (n_output)
+          0.02129706, 0.140816242, 0.0112733059,     // seq 0
+          0.0132302344, 0.152308047, 0.0346313119,   // seq 1
+          -0.0123688057, 0.165790111, 0.0893077999,  // seq 2
+      },
+      {
+          // Batch1: 3 (input_sequence_size) * 3 (n_output)
+          -0.0226350538, 0.0916948169, 0.0769175813,  // seq 0
+          -0.0269966982, 0.149707705, 0.094149217,    // seq 1
+          -0.0103429332, 0.173016444, 0.0720508844,   // seq 2
+      }};
+
+  VerifyGoldens(layer_norm_lstm_input_, layer_norm_lstm_golden_output,
+                &layer_norm_lstm);
+}
+
+TEST_F(CifgPeepholeProjectionNoClippingLayerNormLstmTest,
+       HybridLayerNormLstmBlackBoxTest) {
+  const int n_batch = 2;
+  const int n_input = 5;
+  const int n_cell = 4;
+  const int n_output = 3;
+  const float ceil_clip = 0.0;
+  const float proj_clip = 0.0;
+
+  HybridLayerNormLSTMOpModel layer_norm_lstm(
+      n_batch, n_input, n_cell, n_output,
+      /*use_cifg=*/true, /*use_peephole=*/true,
+      /*use_projection_weights=*/true,
+      /*use_projection_bias=*/false, ceil_clip, proj_clip,
+      {
+          {n_batch, n_input},  // input tensor
+
+          {0, 0},             // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {0, 0},              // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {0},       // cell_to_input_weight tensor
+          {n_cell},  // cell_to_forget_weight tensor
+          {n_cell},  // cell_to_output_weight tensor
+
+          {0},       // input_layer_norm_weight tensor
+          {n_cell},  // forget_layer_norm_weight tensor
+          {n_cell},  // cell_layer_norm_weight tensor
+          {n_cell},  // output_layer_norm_weight tensor
+
+          {0},       // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {n_output, n_cell},  // projection_weight tensor
+          {0},                 // projection_bias tensor
+      });
+
+  layer_norm_lstm.SetInputToCellWeights(input_to_cell_weights_);
+  layer_norm_lstm.SetInputToForgetWeights(input_to_forget_weights_);
+  layer_norm_lstm.SetInputToOutputWeights(input_to_output_weights_);
+
+  layer_norm_lstm.SetCellBias(cell_gate_bias_);
+  layer_norm_lstm.SetForgetGateBias(forget_gate_bias_);
+  layer_norm_lstm.SetOutputGateBias(output_gate_bias_);
+
+  layer_norm_lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_);
+  layer_norm_lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_);
+  layer_norm_lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_);
+
+  layer_norm_lstm.SetCellToForgetWeights(cell_to_forget_weights_);
+  layer_norm_lstm.SetCellToOutputWeights(cell_to_output_weights_);
+
+  layer_norm_lstm.SetForgetLayerNormWeights(forget_layer_norm_weights_);
+  layer_norm_lstm.SetCellLayerNormWeights(cell_layer_norm_weights_);
+  layer_norm_lstm.SetOutputLayerNormWeights(output_layer_norm_weights_);
+
+  layer_norm_lstm.SetProjectionWeights(projection_weights_);
+
+  // Verify the final output.
+  const std::vector<std::vector<float>> layer_norm_lstm_golden_output = {
+      {
+          // Batch0: 3 (input_sequence_size) * 3 (n_output)
+          0.0212250091, 0.140474007, 0.0115012666,   // seq 0
+          0.0130806509, 0.152660668, 0.0347516984,   // seq 1
+          -0.0124010444, 0.166042402, 0.0898982584,  // seq 2
+      },
+      {
+          // Batch1: 3 (input_sequence_size) * 3 (n_output)
+          -0.0228835996, 0.0917588323, 0.0778886303,  // seq 0
+          -0.0275101066, 0.148769245, 0.0938384682,   // seq 1
+          -0.0103605557, 0.172605693, 0.0728750974,   // seq 2
+      }};
+
+  VerifyGoldens(layer_norm_lstm_input_, layer_norm_lstm_golden_output,
+                &layer_norm_lstm);
+}
+
 }  // namespace
 }  // namespace custom
 }  // namespace ops
diff --git a/tensorflow/lite/kernels/lstm_eval.cc b/tensorflow/lite/kernels/lstm_eval.cc
index f179ecb195e4dd999cb6e3ed0582e6385a3436b0..0c6a462d291bd2fb478d98fa3597bacde580c59c 100644
--- a/tensorflow/lite/kernels/lstm_eval.cc
+++ b/tensorflow/lite/kernels/lstm_eval.cc
@@ -1118,7 +1118,7 @@ TfLiteStatus EvalHybrid(
             cell_to_output_weights_scale, input_gate_bias_ptr,
             forget_gate_bias_ptr, cell_bias_ptr, output_gate_bias_ptr,
             projection_weights_ptr, projection_weights_scale,
-            projection_bias_ptr, params, n_batch, n_cell, n_input,
+            projection_bias_ptr, params, /*n_batch=*/1, n_cell, n_input,
             aux_input_size, n_output, output_batch_leading_dim,
             input_gate_scratch, forget_gate_scratch, cell_scratch,
             output_gate_scratch, scaling_factors_ptr, prod_scaling_factors_ptr,
diff --git a/tensorflow/lite/kernels/mirror_pad.cc b/tensorflow/lite/kernels/mirror_pad.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e74e47f7a37b0f449fb2a63237e95066bb452de6
--- /dev/null
+++ b/tensorflow/lite/kernels/mirror_pad.cc
@@ -0,0 +1,374 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace mirror_pad {
+namespace {
+
+// Simple class that represents a mirror padded tensor - which is the output
+// from the Op.
+struct PaddedTensor {
+  // If not null that means this is a scalar value.
+  // Note: This is not owned by default. It will point to the value
+  // in the input tensor.
+  const void* value = nullptr;
+  // If this tensor is not one value, then this vector will have
+  // all the tensors that belongs to this tensor.
+  // Pointers are owned.
+  std::vector<std::unique_ptr<PaddedTensor>> values;
+  // Pointers to PaddedTensors that are padded on the left of the current
+  // tensor.
+  std::vector<PaddedTensor*> left_pad_ptrs;
+  // Pointers to PaddedTensors that are padded on the right of the current
+  // tensor.
+  std::vector<PaddedTensor*> right_pad_ptrs;
+
+  // Returns mutable pointer to the tensor identified by 'indices'.
+  PaddedTensor* GetMutable(const std::vector<int>& indices) {
+    auto* result = this;
+    for (int i = 0; i < indices.size(); ++i) {
+      if (indices[i] >= result->values.size()) {
+        return nullptr;
+      }
+      result = result->values[indices[i]].get();
+      if (result == nullptr) break;
+    }
+    return result;
+  }
+};
+
+// Util method to initialize the memory of the padded tensor.
+void InitializeTensorMemory(const TfLiteIntArray* const dims, int dim_index,
+                            int dims_size, PaddedTensor* padded_tensor) {
+  if (dim_index >= dims_size) {
+    return;
+  }
+  padded_tensor->values.reserve(dims->data[dim_index]);
+  for (int i = 0; i < dims->data[dim_index]; ++i) {
+    padded_tensor->values.emplace_back(new PaddedTensor());
+    InitializeTensorMemory(dims, dim_index + 1, dims_size,
+                           padded_tensor->values.back().get());
+  }
+}
+
+// Returns pointer to the value at the specified index in 'data'.
+inline const void* GetValuePointerAtIndex(const void* data, int index,
+                                          const TfLiteType data_type) {
+  switch (data_type) {
+    case kTfLiteFloat32:
+      return static_cast<const float*>(data) + index;
+    case kTfLiteInt32:
+      return static_cast<const int32_t*>(data) + index;
+    case kTfLiteUInt8:
+      return static_cast<const uint8_t*>(data) + index;
+    case kTfLiteInt64:
+      return static_cast<const int64_t*>(data) + index;
+    case kTfLiteBool:
+      return static_cast<const bool*>(data) + index;
+    case kTfLiteInt16:
+      return static_cast<const int16_t*>(data) + index;
+    case kTfLiteInt8:
+      return static_cast<const int8_t*>(data) + index;
+    // Unsupported types ?
+    default:
+      return nullptr;
+  }
+  return nullptr;
+}
+
+// Util method that increment index in the N-d array.
+void IncrementTensorIndex(const TfLiteIntArray* dims,
+                          std::vector<int>* tensor_index_ptr) {
+  int dimension_index = dims->size - 1;
+  auto& tensor_index = *tensor_index_ptr;
+  tensor_index[dimension_index]++;
+  while (dimension_index >= 0 &&
+         tensor_index[dimension_index] == dims->data[dimension_index]) {
+    tensor_index[dimension_index] = 0;
+    dimension_index--;
+    if (dimension_index >= 0) tensor_index[dimension_index]++;
+  }
+}
+
+// Fills the 'padded_tensor' with data from 'input_tensor'.
+TfLiteStatus InitFromInputTensor(const TfLiteTensor* input_tensor,
+                                 PaddedTensor* padded_tensor) {
+  const auto* dims = input_tensor->dims;
+  const auto data_type = input_tensor->type;
+  const void* data = static_cast<const void*>(input_tensor->data.raw_const);
+  // Either invalid input or unsupported type.+
+  if (data == nullptr) {
+    return kTfLiteError;
+  }
+  // Index of current processing tensor.
+  std::vector<int> tensor_index(dims->size, 0);
+  int flat_index = 0;
+  const int num_elements = NumElements(input_tensor);
+  while (flat_index < num_elements) {
+    auto* tensor = padded_tensor->GetMutable(tensor_index);
+    if (tensor == nullptr) {
+      return kTfLiteError;
+    }
+    tensor->value = GetValuePointerAtIndex(data, flat_index, data_type);
+    IncrementTensorIndex(dims, &tensor_index);
+    ++flat_index;
+  }
+
+  return kTfLiteOk;
+}
+
+template <typename T>
+inline void GetPadding(const T* data, int offset, int64_t* left_pad,
+                       int64_t* right_pad) {
+  *left_pad = static_cast<int64_t>(*(data + offset * 2));
+  *right_pad = static_cast<int64_t>(*(data + offset * 2 + 1));
+}
+
+inline TfLiteStatus GetPadding(const TfLiteTensor* padding_matrix,
+                               int dimension, int64_t* left_pad,
+                               int64_t* right_pad) {
+  switch (padding_matrix->type) {
+    case kTfLiteInt32:
+      GetPadding(padding_matrix->data.i32, dimension, left_pad, right_pad);
+      break;
+    case kTfLiteInt64:
+      GetPadding(padding_matrix->data.i64, dimension, left_pad, right_pad);
+      break;
+    default:
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus ValidateTensor(const TfLiteTensor* padding_matrix, int offset,
+                            int dimension_index, PaddedTensor* padded_tensor,
+                            TfLiteContext* context) {
+  if (dimension_index >= padding_matrix->dims->data[0]) {
+    return kTfLiteOk;
+  }
+
+  int64_t left_pad = 0, right_pad = 0;
+  TF_LITE_ENSURE_STATUS(
+      GetPadding(padding_matrix, dimension_index, &left_pad, &right_pad));
+  // If we are not going to include border we must have enough values
+  // to use.
+  if (left_pad + offset > padded_tensor->values.size()) {
+    context->ReportError(
+        context, "Not enough values for Mirror Pad, required %d, available %d.",
+        left_pad + offset, padded_tensor->values.size());
+    return kTfLiteError;
+  }
+  if (right_pad + offset > padded_tensor->values.size()) {
+    context->ReportError(
+        context, "Not enough values for Mirror Pad, required %d, available %d.",
+        right_pad + offset, padded_tensor->values.size());
+    return kTfLiteError;
+  }
+  if (!padded_tensor->values.empty()) {
+    ValidateTensor(padding_matrix, offset, dimension_index + 1,
+                   padded_tensor->values[0].get(), context);
+  }
+  return kTfLiteOk;
+}
+
+// Fills 'padded_tensor' with the padding information based on
+// 'padding_matrix'.
+// 'dimension_index' represents which dimension the function is operating on.
+TfLiteStatus PadTensor(const TfLiteTensor* padding_matrix, int offset,
+                       int dimension_index, PaddedTensor* padded_tensor,
+                       TfLiteContext* context) {
+  if (dimension_index >= padding_matrix->dims->data[0]) return kTfLiteOk;
+
+  int64_t left_pad = 0, right_pad = 0;
+  TF_LITE_ENSURE_STATUS(
+      GetPadding(padding_matrix, dimension_index, &left_pad, &right_pad));
+
+  for (int i = left_pad + offset - 1; i >= offset && left_pad > 0;
+       --i, --left_pad) {
+    padded_tensor->left_pad_ptrs.push_back(padded_tensor->values[i].get());
+  }
+  for (int i = padded_tensor->values.size() - (1 + offset);
+       i >= 0 && right_pad > 0; --i, --right_pad) {
+    padded_tensor->right_pad_ptrs.push_back(padded_tensor->values[i].get());
+  }
+
+  for (auto& tensor : padded_tensor->values) {
+    TF_LITE_ENSURE_STATUS(PadTensor(padding_matrix, offset, dimension_index + 1,
+                                    tensor.get(), context));
+  }
+  return kTfLiteOk;
+}
+
+// Fills 'output_data' with data from 'padded_tensor'.
+// The function does this recursively by setting left padding first then
+// original data, followed by the right padding.
+template <typename T>
+int FillOutput(const PaddedTensor* padded_tensor, T* output_data,
+               int index_in_output) {
+  if (padded_tensor == nullptr || output_data == nullptr) {
+    return -1;
+  }
+  if (padded_tensor->value != nullptr) {
+    output_data[index_in_output] = *static_cast<const T*>(padded_tensor->value);
+    return index_in_output + 1;
+  }
+  for (const auto* tensor : padded_tensor->left_pad_ptrs) {
+    index_in_output = FillOutput(tensor, output_data, index_in_output);
+  }
+  for (const auto& tensor : padded_tensor->values) {
+    index_in_output = FillOutput(tensor.get(), output_data, index_in_output);
+  }
+  for (const auto* tensor : padded_tensor->right_pad_ptrs) {
+    index_in_output = FillOutput(tensor, output_data, index_in_output);
+  }
+  return index_in_output;
+}
+
+// Returns the shape of the final output after padding.
+std::unique_ptr<TfLiteIntArray, void (*)(TfLiteIntArray*)> GetPaddedOutputShape(
+    const TfLiteTensor* input, const TfLiteTensor* padding_matrix) {
+  const int input_dims = NumDimensions(input);
+  std::unique_ptr<TfLiteIntArray, void (*)(TfLiteIntArray*)> shape(
+      TfLiteIntArrayCreate(input_dims), TfLiteIntArrayFree);
+
+  int64_t left_pad = 0, right_pad = 0;
+  for (int i = 0; i < input_dims; ++i) {
+    GetPadding(padding_matrix, i, &left_pad, &right_pad);
+    shape->data[i] = SizeOfDimension(input, i) + left_pad + right_pad;
+  }
+  return shape;
+}
+
+}  // namespace
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* input_tensor = GetInput(context, node, 0);
+  const TfLiteTensor* padding_matrix = GetInput(context, node, 1);
+  auto* params =
+      reinterpret_cast<TfLiteMirrorPaddingParams*>(node->builtin_data);
+
+  if (params == nullptr) {
+    return kTfLiteError;
+  }
+  const int input_dims = NumDimensions(input_tensor);
+
+  TfLiteTensor* output_tensor = GetOutput(context, node, 0);
+  if (IsDynamicTensor(output_tensor)) {
+    auto output_size = GetPaddedOutputShape(input_tensor, padding_matrix);
+    if (output_size == nullptr) {
+      return kTfLiteError;
+    }
+    TF_LITE_ENSURE_STATUS(
+        context->ResizeTensor(context, output_tensor, output_size.release()));
+  }
+
+  PaddedTensor padded_tensor;
+  // Initialize memory.
+  InitializeTensorMemory(input_tensor->dims, 0, input_dims, &padded_tensor);
+  // Set the values from the input_tensor.
+  TF_LITE_ENSURE_STATUS(InitFromInputTensor(input_tensor, &padded_tensor));
+
+  const int offset =
+      params->mode != TfLiteMirrorPaddingMode::kTfLiteMirrorPaddingReflect ? 0
+                                                                           : 1;
+  // Make sure padding values are sufficient and valid to use.
+  TF_LITE_ENSURE_STATUS(
+      ValidateTensor(padding_matrix, offset, 0, &padded_tensor, context));
+  // Apply padding.
+  TF_LITE_ENSURE_STATUS(
+      PadTensor(padding_matrix, offset, 0, &padded_tensor, context));
+
+  // Fill the output tensor from the padded tensor.
+  TfLiteStatus status = kTfLiteOk;
+
+#define TF_LITE_MIRROR_PAD(type) \
+  FillOutput(&padded_tensor, GetTensorData<type>(output_tensor), 0);
+
+  switch (output_tensor->type) {
+    case kTfLiteFloat32: {
+      TF_LITE_MIRROR_PAD(float);
+      break;
+    }
+    case kTfLiteInt32: {
+      TF_LITE_MIRROR_PAD(int32_t);
+      break;
+    }
+    case kTfLiteUInt8: {
+      TF_LITE_MIRROR_PAD(uint8_t);
+      break;
+    }
+    case kTfLiteInt64: {
+      TF_LITE_MIRROR_PAD(int64_t);
+      break;
+    }
+    default:
+      status = kTfLiteError;
+      break;
+  }
+#undef TF_LITE_MIRROR_PAD
+  return status;
+}
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  return nullptr;
+}
+
+void Free(TfLiteContext* context, void* buffer) {}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* input_tensor = GetInput(context, node, 0);
+  const TfLiteTensor* padding_matrix = GetInput(context, node, 1);
+  TfLiteTensor* output_tensor = GetOutput(context, node, 0);
+
+  TF_LITE_ENSURE_EQ(context, NumDimensions(padding_matrix), 2);
+  TF_LITE_ENSURE_EQ(context, SizeOfDimension(padding_matrix, 0),
+                    NumDimensions(input_tensor));
+
+  if (!IsConstantTensor(padding_matrix)) {
+    SetTensorToDynamic(output_tensor);
+    return kTfLiteOk;
+  }
+  // We have constant padding, so we can infer output size.
+
+  auto output_size = GetPaddedOutputShape(input_tensor, padding_matrix);
+  if (output_size == nullptr) {
+    return kTfLiteError;
+  }
+  return context->ResizeTensor(context, output_tensor, output_size.release());
+}
+
+}  // namespace mirror_pad
+TfLiteRegistration* Register_MIRROR_PAD() {
+  static TfLiteRegistration r = {mirror_pad::Init, mirror_pad::Free,
+                                 mirror_pad::Prepare, mirror_pad::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/mirror_pad_test.cc b/tensorflow/lite/kernels/mirror_pad_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fd09e6e4493d3a29bffecfcd4a4d1946840a4e5e
--- /dev/null
+++ b/tensorflow/lite/kernels/mirror_pad_test.cc
@@ -0,0 +1,189 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+template <typename T>
+class BaseMirrorPadOpModel : public SingleOpModel {
+ public:
+  BaseMirrorPadOpModel(const TensorData& input,
+                       const TensorData& padding_matrix,
+                       const TensorData& output,
+                       const tflite::MirrorPadMode mode) {
+    input_id_ = AddInput(input);
+    padding_matrix_id_ = AddInput(padding_matrix);
+    output_id_ = AddOutput(output);
+    SetBuiltinOp(BuiltinOperator_MIRROR_PAD, BuiltinOptions_MirrorPadOptions,
+                 CreateMirrorPadOptions(builder_, mode).Union());
+    BuildInterpreter({GetShape(input_id_), GetShape(padding_matrix_id_)});
+  }
+
+  int input_tensor_id() { return input_id_; }
+  int padding_matrix_tensor_id() { return padding_matrix_id_; }
+
+  std::vector<T> GetOutput() { return ExtractVector<T>(output_id_); }
+
+ protected:
+  int input_id_;
+  int padding_matrix_id_;
+  int output_id_;
+};
+
+TEST(MirrorPadTest, EmptyPad) {
+  BaseMirrorPadOpModel<int> model(
+      {TensorType_INT32, {2, 3}}, {TensorType_INT32, {2, 2}},
+      {TensorType_INT32, {}}, tflite::MirrorPadMode_REFLECT);
+  model.PopulateTensor<int>(model.input_tensor_id(), {1, 2, 3, 4, 5, 6});
+  model.PopulateTensor<int>(model.padding_matrix_tensor_id(), {0, 0, 0, 0});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6}));
+}
+
+TEST(MirrorPadTest, PadOneSide_right_Reflect) {
+  BaseMirrorPadOpModel<int> model(
+      {TensorType_INT32, {2, 3}}, {TensorType_INT32, {2, 2}},
+      {TensorType_INT32, {}}, tflite::MirrorPadMode_REFLECT);
+  model.PopulateTensor<int>(model.input_tensor_id(), {1, 2, 3, 4, 5, 6});
+  model.PopulateTensor<int>(model.padding_matrix_tensor_id(), {0, 1, 0, 1});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({1, 2, 3, 2, 4, 5, 6, 5, 1, 2, 3, 2}));
+}
+
+TEST(MirrorPadTest, PadOneSide_left_Reflect) {
+  BaseMirrorPadOpModel<int> model(
+      {TensorType_INT32, {2, 3}}, {TensorType_INT32, {2, 2}},
+      {TensorType_INT32, {}}, tflite::MirrorPadMode_REFLECT);
+  model.PopulateTensor<int>(model.input_tensor_id(), {1, 2, 3, 4, 5, 6});
+  model.PopulateTensor<int>(model.padding_matrix_tensor_id(), {1, 0, 1, 0});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({5, 4, 5, 6, 2, 1, 2, 3, 5, 4, 5, 6}));
+}
+
+TEST(MirrorPadTest, PadOneSide_right_Symmetric) {
+  BaseMirrorPadOpModel<int> model(
+      {TensorType_INT32, {2, 3}}, {TensorType_INT32, {2, 2}},
+      {TensorType_INT32, {}}, tflite::MirrorPadMode_SYMMETRIC);
+  model.PopulateTensor<int>(model.input_tensor_id(), {1, 2, 3, 4, 5, 6});
+  model.PopulateTensor<int>(model.padding_matrix_tensor_id(), {0, 1, 0, 1});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({1, 2, 3, 3, 4, 5, 6, 6, 4, 5, 6, 6}));
+}
+
+TEST(MirrorPadTest, PadOneSide_left_Symmetric) {
+  BaseMirrorPadOpModel<int> model(
+      {TensorType_INT32, {2, 3}}, {TensorType_INT32, {2, 2}},
+      {TensorType_INT32, {}}, tflite::MirrorPadMode_SYMMETRIC);
+  model.PopulateTensor<int>(model.input_tensor_id(), {1, 2, 3, 4, 5, 6});
+  model.PopulateTensor<int>(model.padding_matrix_tensor_id(), {1, 0, 1, 0});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({1, 1, 2, 3, 1, 1, 2, 3, 4, 4, 5, 6}));
+}
+
+TEST(MirrorPadTest, PadBothSides_Symmetric) {
+  BaseMirrorPadOpModel<int> model(
+      {TensorType_INT32, {2, 3}}, {TensorType_INT32, {2, 2}},
+      {TensorType_INT32, {}}, tflite::MirrorPadMode_SYMMETRIC);
+  model.PopulateTensor<int>(model.input_tensor_id(), {1, 2, 3, 4, 5, 6});
+  model.PopulateTensor<int>(model.padding_matrix_tensor_id(), {1, 1, 1, 1});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({1, 1, 2, 3, 3, 1, 1, 2, 3, 3,
+                                4, 4, 5, 6, 6, 4, 4, 5, 6, 6}));
+}
+
+TEST(MirrorPadTest, PadBothSides_Reflect) {
+  BaseMirrorPadOpModel<int> model(
+      {TensorType_INT32, {2, 3}}, {TensorType_INT32, {2, 2}},
+      {TensorType_INT32, {}}, tflite::MirrorPadMode_REFLECT);
+  model.PopulateTensor<int>(model.input_tensor_id(), {1, 2, 3, 4, 5, 6});
+  model.PopulateTensor<int>(model.padding_matrix_tensor_id(), {1, 1, 1, 1});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({5, 4, 5, 6, 5, 2, 1, 2, 3, 2,
+                                5, 4, 5, 6, 5, 2, 1, 2, 3, 2}));
+}
+
+TEST(MirrorPadTest, PadBothSides_Symmetric_Whole) {
+  BaseMirrorPadOpModel<int> model(
+      {TensorType_INT32, {2, 3}}, {TensorType_INT32, {2, 2}},
+      {TensorType_INT32, {}}, tflite::MirrorPadMode_SYMMETRIC);
+  model.PopulateTensor<int>(model.input_tensor_id(), {1, 2, 3, 4, 5, 6});
+  model.PopulateTensor<int>(model.padding_matrix_tensor_id(), {2, 2, 3, 3});
+  model.Invoke();
+  EXPECT_THAT(
+      model.GetOutput(),
+      ElementsAreArray({6, 5, 4, 4, 5, 6, 6, 5, 4, 3, 2, 1, 1, 2, 3, 3, 2, 1,
+                        3, 2, 1, 1, 2, 3, 3, 2, 1, 6, 5, 4, 4, 5, 6, 6, 5, 4,
+                        6, 5, 4, 4, 5, 6, 6, 5, 4, 3, 2, 1, 1, 2, 3, 3, 2, 1}));
+}
+
+TEST(MirrorPadTest, PadBothSides_Reflect_Whole) {
+  BaseMirrorPadOpModel<int> model(
+      {TensorType_INT32, {2, 3}}, {TensorType_INT32, {2, 2}},
+      {TensorType_INT32, {}}, tflite::MirrorPadMode_REFLECT);
+  model.PopulateTensor<int>(model.input_tensor_id(), {1, 2, 3, 4, 5, 6});
+  model.PopulateTensor<int>(model.padding_matrix_tensor_id(), {1, 1, 2, 2});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({6, 5, 4, 5, 6, 5, 4, 3, 2, 1, 2, 3, 2, 1,
+                                6, 5, 4, 5, 6, 5, 4, 3, 2, 1, 2, 3, 2, 1}));
+}
+
+TEST(MirrorPadTest, Pad_Symmetric) {
+  BaseMirrorPadOpModel<int> model(
+      {TensorType_INT32, {2, 3}}, {TensorType_INT32, {2, 2}},
+      {TensorType_INT32, {}}, tflite::MirrorPadMode_SYMMETRIC);
+  model.PopulateTensor<int>(model.input_tensor_id(), {1, 2, 3, 4, 5, 6});
+  model.PopulateTensor<int>(model.padding_matrix_tensor_id(), {1, 1, 2, 2});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({2, 1, 1, 2, 3, 3, 2, 2, 1, 1, 2, 3, 3, 2,
+                                5, 4, 4, 5, 6, 6, 5, 5, 4, 4, 5, 6, 6, 5}));
+}
+
+TEST(MirrorPadTest, Pad_1D_Reflect) {
+  BaseMirrorPadOpModel<int> model(
+      {TensorType_INT32, {3}}, {TensorType_INT32, {1, 2}},
+      {TensorType_INT32, {}}, tflite::MirrorPadMode_REFLECT);
+  model.PopulateTensor<int>(model.input_tensor_id(), {1, 2, 3});
+  model.PopulateTensor<int>(model.padding_matrix_tensor_id(), {0, 2});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 2, 3, 2, 1}));
+}
+
+TEST(MirrorPadTest, Pad_1D_Symmetric) {
+  BaseMirrorPadOpModel<int> model(
+      {TensorType_INT32, {3}}, {TensorType_INT32, {1, 2}},
+      {TensorType_INT32, {}}, tflite::MirrorPadMode_SYMMETRIC);
+  model.PopulateTensor<int>(model.input_tensor_id(), {1, 2, 3});
+  model.PopulateTensor<int>(model.padding_matrix_tensor_id(), {0, 2});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 2, 3, 3, 2}));
+}
+
+}  // namespace
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/pack.cc b/tensorflow/lite/kernels/pack.cc
index 479495c875dac5d4e827864548c6b4a188e284ee..d15a5a08af38672cbdaef76ff8a37c42d6e6f226 100644
--- a/tensorflow/lite/kernels/pack.cc
+++ b/tensorflow/lite/kernels/pack.cc
@@ -35,7 +35,6 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
 
   const TfLiteTensor* input0 = GetInput(context, node, 0);
-  TF_LITE_ENSURE(context, NumDimensions(input0) < 4);
   TF_LITE_ENSURE(context, NumDimensions(input0) >= data->axis);
   // TODO(renjieliu): Support negative axis.
   TF_LITE_ENSURE(context, data->axis >= 0);
diff --git a/tensorflow/lite/kernels/pack_test.cc b/tensorflow/lite/kernels/pack_test.cc
index 4f58debc5c872ea640ed97cd51884a39b412ff2f..530cc2e50f0fe640cc5b120b8bbb1bade7e996fc 100644
--- a/tensorflow/lite/kernels/pack_test.cc
+++ b/tensorflow/lite/kernels/pack_test.cc
@@ -82,6 +82,19 @@ TEST(PackOpTest, FloatMultilDimensions) {
               ElementsAreArray({1, 2, 3, 7, 8, 9, 4, 5, 6, 10, 11, 12}));
 }
 
+TEST(PackOpTest, FloatFiveDimensions) {
+  PackOpModel<float> model({TensorType_FLOAT32, {2, 2, 2, 2}}, 1, 2);
+  model.SetInput(0, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+  model.SetInput(
+      1, {17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(2, 2, 2, 2, 2));
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({1,  2,  3,  4,  5,  6,  7,  8,  17, 18, 19,
+                                20, 21, 22, 23, 24, 9,  10, 11, 12, 13, 14,
+                                15, 16, 25, 26, 27, 28, 29, 30, 31, 32}));
+}
+
 // int32 tests.
 TEST(PackOpTest, Int32ThreeInputs) {
   PackOpModel<int32_t> model({TensorType_INT32, {2}}, 0, 3);
diff --git a/tensorflow/lite/kernels/register.cc b/tensorflow/lite/kernels/register.cc
index 7f0b9239ee19eb719a15be10fa6e8578a6ac8474..c0e6f6994fd2334917b178d4d3b16d73c27121c4 100644
--- a/tensorflow/lite/kernels/register.cc
+++ b/tensorflow/lite/kernels/register.cc
@@ -127,6 +127,8 @@ TfLiteRegistration* Register_FLOOR_MOD();
 TfLiteRegistration* Register_RANGE();
 TfLiteRegistration* Register_LEAKY_RELU();
 TfLiteRegistration* Register_SQUARED_DIFFERENCE();
+TfLiteRegistration* Register_FILL();
+TfLiteRegistration* Register_MIRROR_PAD();
 
 TfLiteStatus UnsupportedTensorFlowOp(TfLiteContext* context, TfLiteNode* node) {
   context->ReportError(
@@ -220,7 +222,9 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_LOG, Register_LOG());
   AddBuiltin(BuiltinOperator_LOG_SOFTMAX, Register_LOG_SOFTMAX());
   AddBuiltin(BuiltinOperator_CAST, Register_CAST());
-  AddBuiltin(BuiltinOperator_DEQUANTIZE, Register_DEQUANTIZE());
+  AddBuiltin(BuiltinOperator_DEQUANTIZE, Register_DEQUANTIZE(),
+             /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_PRELU, Register_PRELU());
   AddBuiltin(BuiltinOperator_MAXIMUM, Register_MAXIMUM());
   AddBuiltin(BuiltinOperator_MINIMUM, Register_MINIMUM());
@@ -264,6 +268,8 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_RANGE, Register_RANGE());
   AddBuiltin(BuiltinOperator_LEAKY_RELU, Register_LEAKY_RELU());
   AddBuiltin(BuiltinOperator_SQUARED_DIFFERENCE, Register_SQUARED_DIFFERENCE());
+  AddBuiltin(BuiltinOperator_FILL, Register_FILL());
+  AddBuiltin(BuiltinOperator_MIRROR_PAD, Register_MIRROR_PAD());
 
   // TODO(andrewharp, ahentz): Move these somewhere more appropriate so that
   // custom ops aren't always included by default.
diff --git a/tensorflow/lite/kernels/resize_bilinear_test.cc b/tensorflow/lite/kernels/resize_bilinear_test.cc
index 530bb32b946f07acf60f3ccbeab0248c7c2b5747..d3f4837a287accd93c23e17fa3a361efd4120101 100644
--- a/tensorflow/lite/kernels/resize_bilinear_test.cc
+++ b/tensorflow/lite/kernels/resize_bilinear_test.cc
@@ -26,8 +26,8 @@ using uint8 = std::uint8_t;
 
 class ResizeBilinearOpModel : public SingleOpModel {
  public:
-  ResizeBilinearOpModel(const TensorData& input,
-                        std::initializer_list<int> size_data = {}) {
+  explicit ResizeBilinearOpModel(const TensorData& input,
+                                 std::initializer_list<int> size_data = {}) {
     bool const_size = size_data.size() != 0;
     input_ = AddInput(input);
     if (const_size) {
diff --git a/tensorflow/lite/kernels/skip_gram.cc b/tensorflow/lite/kernels/skip_gram.cc
index f20719ecaf6eda023f9a2826d7a995c1708e9577..265ba18a3e39d3316fef2d41306540e7a170e675 100644
--- a/tensorflow/lite/kernels/skip_gram.cc
+++ b/tensorflow/lite/kernels/skip_gram.cc
@@ -107,7 +107,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   // Generate n-grams recursively.
   tflite::DynamicBuffer buf;
   if (words.size() < params->ngram_size) {
-    buf.WriteToTensor(GetOutput(context, node, 0));
+    buf.WriteToTensorAsVector(GetOutput(context, node, 0));
     return kTfLiteOk;
   }
 
@@ -145,7 +145,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
     }
   }
 
-  buf.WriteToTensor(GetOutput(context, node, 0));
+  buf.WriteToTensorAsVector(GetOutput(context, node, 0));
   return kTfLiteOk;
 }
 }  // namespace
diff --git a/tensorflow/lite/kernels/test_util.cc b/tensorflow/lite/kernels/test_util.cc
index 6b2a1f89c37dd3dcccdf5aade53ed0f984263e3a..549ea78f5b45b20139b023552a98c3dcb0d75610 100644
--- a/tensorflow/lite/kernels/test_util.cc
+++ b/tensorflow/lite/kernels/test_util.cc
@@ -129,14 +129,14 @@ void SingleOpModel::BuildInterpreter(std::vector<std::vector<int>> input_shapes,
 
   interpreter_->SetAllowFp16PrecisionForFp32(allow_fp32_relax_to_fp16);
 
+  CHECK(interpreter_->AllocateTensors() == kTfLiteOk)
+      << "Cannot allocate tensors";
+  interpreter_->ResetVariableTensors();
+
   // Modify delegate with function.
   if (apply_delegate_fn_) {
     apply_delegate_fn_(interpreter_.get());
   }
-
-  CHECK(interpreter_->AllocateTensors() == kTfLiteOk)
-      << "Cannot allocate tensors";
-  interpreter_->ResetVariableTensors();
 }
 
 void SingleOpModel::Invoke() { CHECK(interpreter_->Invoke() == kTfLiteOk); }
diff --git a/tensorflow/lite/kernels/test_util.h b/tensorflow/lite/kernels/test_util.h
index a49d6c2cae2b0e3423c69e2894405979e57d870b..f5c67c3e9cf239a4ce98ce564a5274507239bf58 100644
--- a/tensorflow/lite/kernels/test_util.h
+++ b/tensorflow/lite/kernels/test_util.h
@@ -199,7 +199,7 @@ class SingleOpModel {
     for (const string& s : content) {
       buf.AddString(s.data(), s.length());
     }
-    buf.WriteToTensor(tensor);
+    buf.WriteToTensor(tensor, /*new_shape=*/nullptr);
   }
 
   // Populate the tensor given its index.
@@ -307,10 +307,12 @@ class SingleOpModel {
 
     if (is_quantized) {
       if (t.min != 0 || t.max != 0) {
-        // TODO(b/119422369): Handle signed int8 here.
         if (t.type == TensorType_UINT8) {
           std::tie(t.scale, t.zero_point) =
               QuantizationParams<uint8_t>(t.min, t.max);
+        } else if (t.type == TensorType_INT8) {
+          std::tie(t.scale, t.zero_point) =
+              QuantizationParams<int8_t>(t.min, t.max);
         } else if (t.type == TensorType_INT32) {
           std::tie(t.scale, t.zero_point) =
               QuantizationParams<int32_t>(t.min, t.max);
diff --git a/tensorflow/lite/lib_package/create_ios_frameworks.sh b/tensorflow/lite/lib_package/create_ios_frameworks.sh
index 7901655b7c6926a38dc30009a8b95185fdc2d8cc..abf40e7dec6c3f14ba38cb3491be5d2d0acc7caa 100755
--- a/tensorflow/lite/lib_package/create_ios_frameworks.sh
+++ b/tensorflow/lite/lib_package/create_ios_frameworks.sh
@@ -1,4 +1,4 @@
-#!/bin/bash -x
+#!/bin/bash
 # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -20,10 +20,41 @@ set -e
 echo "Starting"
 TFLITE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/.."
 
+usage() {
+  echo "Usage: $(basename "$0") [-a]"
+  echo "-g build with GPU delegate"
+  exit 1
+}
+
+USE_GPU_DELEGATE="false"
+FRAMEWORK_NAME="tensorflow_lite"
+while getopts "g" opt_name; do
+  case "$opt_name" in
+    g)
+        USE_GPU_DELEGATE="true"
+        FRAMEWORK_NAME="tensorflow_lite_gpu"
+        ;;
+    *) usage;;
+  esac
+done
+shift $((OPTIND - 1))
+readonly USE_GPU_DELEGATE
+readonly FRAMEWORK_NAME
+
+if [ $USE_GPU_DELEGATE == "true" ] ; then
+  for filename in metal_delegate.h libmetal_delegate.a ; do
+    if [[ ! -f "${TFLITE_DIR}/delegates/gpu/${filename}" ]] ; then
+      echo "File ${TFLITE_DIR}/delegates/gpu/${filename} doesn't exist."
+      echo "It's requried for building TFLite Framework with GPU. Aborting."
+      exit 1
+    fi
+  done
+fi
+
 TMP_DIR=$(mktemp -d)
 echo "Package dir: " $TMP_DIR
 FW_DIR=$TMP_DIR/tensorflow_lite_ios_frameworks
-FW_DIR_TFLITE=$FW_DIR/tensorflow_lite.framework
+FW_DIR_TFLITE=$FW_DIR/$FRAMEWORK_NAME.framework
 FW_DIR_TFLITE_HDRS=$FW_DIR_TFLITE/Headers
 
 echo "Creating target Headers directories"
@@ -58,8 +89,14 @@ cp $TFLITE_DIR/../../bazel-genfiles/tensorflow/tools/lib_package/include/tensorf
    $FW_DIR_TFLITE
 
 echo "Copying static libraries"
+# Note: There must be a static library with the same name
+# as the framework name.
 cp $TFLITE_DIR/tools/make/gen/lib/libtensorflow-lite.a \
-   $FW_DIR_TFLITE/tensorflow_lite
+    $FW_DIR_TFLITE/$FRAMEWORK_NAME
+if [ $USE_GPU_DELEGATE == "true" ] ; then
+  cp "${TFLITE_DIR}/delegates/gpu/libmetal_delegate.a" \
+      $FW_DIR_TFLITE/libmetal_delegate.a
+fi
 
 # This is required, otherwise they interfere with the documentation of the
 # pod at cocoapods.org.
@@ -71,10 +108,10 @@ find . -type f -name readme\* -exec rm -f {} \;
 TARGET_GEN_LOCATION="$TFLITE_DIR/gen/ios_frameworks"
 echo "Moving results to target: " $TARGET_GEN_LOCATION
 cd $FW_DIR
-zip -q -r tensorflow_lite.framework.zip tensorflow_lite.framework -x .DS_Store
+zip -q -r $FRAMEWORK_NAME.framework.zip $FRAMEWORK_NAME.framework -x .DS_Store
 rm -rf $TARGET_GEN_LOCATION
 mkdir -p $TARGET_GEN_LOCATION
-cp -r tensorflow_lite.framework.zip $TARGET_GEN_LOCATION
+cp -r $FRAMEWORK_NAME.framework.zip $TARGET_GEN_LOCATION
 
 echo "Cleaning up"
 rm -rf $TMP_DIR
diff --git a/tensorflow/lite/model.cc b/tensorflow/lite/model.cc
index 5ac0532afeffc0801a207c385be9816fa459b416..bfadf2d6a0c8e20db461f991a41abf1326bf0516 100644
--- a/tensorflow/lite/model.cc
+++ b/tensorflow/lite/model.cc
@@ -91,16 +91,25 @@ std::unique_ptr<FlatBufferModel> FlatBufferModel::BuildFromFile(
 }
 
 std::unique_ptr<FlatBufferModel> FlatBufferModel::VerifyAndBuildFromFile(
-    const char* filename, TfLiteVerifier* verifier,
+    const char* filename, TfLiteVerifier* extra_verifier,
     ErrorReporter* error_reporter) {
   error_reporter = ValidateErrorReporter(error_reporter);
 
   std::unique_ptr<FlatBufferModel> model;
   auto allocation = GetAllocationFromFile(filename, /*mmap_file=*/true,
                                           error_reporter, /*use_nnapi=*/true);
-  if (verifier &&
-      !verifier->Verify(static_cast<const char*>(allocation->base()),
-                        allocation->bytes(), error_reporter)) {
+
+  flatbuffers::Verifier base_verifier(
+      reinterpret_cast<const uint8_t*>(allocation->base()),
+      allocation->bytes());
+  if (!VerifyModelBuffer(base_verifier)) {
+    error_reporter->Report("The model is not a valid Flatbuffer file");
+    return nullptr;
+  }
+
+  if (extra_verifier &&
+      !extra_verifier->Verify(static_cast<const char*>(allocation->base()),
+                              allocation->bytes(), error_reporter)) {
     return model;
   }
   model.reset(new FlatBufferModel(allocation.release(), error_reporter));
@@ -121,6 +130,26 @@ std::unique_ptr<FlatBufferModel> FlatBufferModel::BuildFromBuffer(
   return model;
 }
 
+std::unique_ptr<FlatBufferModel> FlatBufferModel::VerifyAndBuildFromBuffer(
+    const char* buffer, size_t buffer_size, TfLiteVerifier* extra_verifier,
+    ErrorReporter* error_reporter) {
+  error_reporter = ValidateErrorReporter(error_reporter);
+
+  flatbuffers::Verifier base_verifier(reinterpret_cast<const uint8_t*>(buffer),
+                                      buffer_size);
+  if (!VerifyModelBuffer(base_verifier)) {
+    error_reporter->Report("The model is not a valid Flatbuffer buffer");
+    return nullptr;
+  }
+
+  if (extra_verifier &&
+      !extra_verifier->Verify(buffer, buffer_size, error_reporter)) {
+    return nullptr;
+  }
+
+  return BuildFromBuffer(buffer, buffer_size, error_reporter);
+}
+
 std::unique_ptr<FlatBufferModel> FlatBufferModel::BuildFromModel(
     const tflite::Model* model_spec, ErrorReporter* error_reporter) {
   error_reporter = ValidateErrorReporter(error_reporter);
diff --git a/tensorflow/lite/model.h b/tensorflow/lite/model.h
index 01e7c682056b2b14155394f978545470c7748c2d..bd0f4baef6e2ea9330ee11e33eb4e51bb4ec4fe4 100644
--- a/tensorflow/lite/model.h
+++ b/tensorflow/lite/model.h
@@ -68,11 +68,15 @@ class FlatBufferModel {
 
   // Verifies whether the content of the file is legit, then builds a model
   // based on the file.
+  // The extra_verifier argument is an additional optional verifier for the file
+  // contents. By default, we always check with tflite::VerifyModelBuffer. If
+  // extra_verifier is supplied, the file contents is also checked against the
+  // extra_verifier after the check against tflite::VerifyModelBuilder.
   // Caller retains ownership of `error_reporter` and must ensure its lifetime
   // is longer than the FlatBufferModel instance.
   // Returns a nullptr in case of failure.
   static std::unique_ptr<FlatBufferModel> VerifyAndBuildFromFile(
-      const char* filename, TfLiteVerifier* verifier = nullptr,
+      const char* filename, TfLiteVerifier* extra_verifier = nullptr,
       ErrorReporter* error_reporter = DefaultErrorReporter());
 
   // Builds a model based on a pre-loaded flatbuffer. The caller retains
@@ -80,10 +84,27 @@ class FlatBufferModel {
   // is destroyed. Caller retains ownership of `error_reporter` and must ensure
   // its lifetime is longer than the FlatBufferModel instance.
   // Returns a nullptr in case of failure.
+  // NOTE: this does NOT validate the buffer so it should NOT be called on
+  // invalid/untrusted input. Use VerifyAndBuildFromBuffer in that case
   static std::unique_ptr<FlatBufferModel> BuildFromBuffer(
       const char* buffer, size_t buffer_size,
       ErrorReporter* error_reporter = DefaultErrorReporter());
 
+  // Verifies whether the content of the buffer is legit, then builds a model
+  // based on the pre-loaded flatbuffer.
+  // The extra_verifier argument is an additional optional verifier for the
+  // buffer. By default, we always check with tflite::VerifyModelBuffer. If
+  // extra_verifier is supplied, the buffer is checked against the
+  // extra_verifier after the check against tflite::VerifyModelBuilder. The
+  // caller retains ownership of the buffer and should keep it alive until the
+  // returned object is destroyed. Caller retains ownership of `error_reporter`
+  // and must ensure its lifetime is longer than the FlatBufferModel instance.
+  // Returns a nullptr in case of failure.
+  static std::unique_ptr<FlatBufferModel> VerifyAndBuildFromBuffer(
+      const char* buffer, size_t buffer_size,
+      TfLiteVerifier* extra_verifier = nullptr,
+      ErrorReporter* error_reporter = DefaultErrorReporter());
+
   // Builds a model directly from a flatbuffer pointer. The caller retains
   // ownership of the buffer and should keep it alive until the returned object
   // is destroyed. Caller retains ownership of `error_reporter` and must ensure
diff --git a/tensorflow/lite/models/smartreply/demo/app/src/main/BUILD b/tensorflow/lite/models/smartreply/demo/app/src/main/BUILD
index b14af4cb20b893f49a0b6145f63b889115f8dbf6..73326e994bcd1bcbbea13e438b7be3ff26d378e6 100644
--- a/tensorflow/lite/models/smartreply/demo/app/src/main/BUILD
+++ b/tensorflow/lite/models/smartreply/demo/app/src/main/BUILD
@@ -62,6 +62,7 @@ cc_library(
     ],
     deps = [
         "//tensorflow/lite:framework",
+        "//tensorflow/lite/java/jni",
         "//tensorflow/lite/models/smartreply:predictor_lib",
     ],
     alwayslink = 1,
diff --git a/tensorflow/lite/models/smartreply/ops/normalize.cc b/tensorflow/lite/models/smartreply/ops/normalize.cc
index 8480260f279c0072d09fb883fbd711cac3ea875f..3cb11cc055b269a6230a593617a86055e9d34139 100644
--- a/tensorflow/lite/models/smartreply/ops/normalize.cc
+++ b/tensorflow/lite/models/smartreply/ops/normalize.cc
@@ -92,7 +92,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 
   tflite::DynamicBuffer buf;
   buf.AddString(result.data(), result.length());
-  buf.WriteToTensor(GetOutput(context, node, 0));
+  buf.WriteToTensorAsVector(GetOutput(context, node, 0));
   return kTfLiteOk;
 }
 
diff --git a/tensorflow/lite/models/smartreply/predictor.cc b/tensorflow/lite/models/smartreply/predictor.cc
index 7db2502977707d66f8b45c91d4191b92b39b75e0..59bf4a3cf1ed964e58a3b3dc9c6fb62139fcd56e 100644
--- a/tensorflow/lite/models/smartreply/predictor.cc
+++ b/tensorflow/lite/models/smartreply/predictor.cc
@@ -49,7 +49,7 @@ void ExecuteTfLite(const std::string& sentence,
     TfLiteTensor* input = interpreter->tensor(interpreter->inputs()[0]);
     tflite::DynamicBuffer buf;
     buf.AddString(sentence.data(), sentence.length());
-    buf.WriteToTensor(input);
+    buf.WriteToTensorAsVector(input);
     interpreter->AllocateTensors();
 
     interpreter->Invoke();
diff --git a/tensorflow/lite/profiling/BUILD b/tensorflow/lite/profiling/BUILD
index c7a8e4f06ae19057e6b869d840233613a04a95d3..52ea6fe636247ec0a4d5fedb41c56fc095e6ac61 100644
--- a/tensorflow/lite/profiling/BUILD
+++ b/tensorflow/lite/profiling/BUILD
@@ -58,7 +58,6 @@ cc_test(
     name = "profile_summarizer_test",
     srcs = ["profile_summarizer_test.cc"],
     copts = common_copts,
-    tags = ["no_oss"],
     deps = [
         ":profile_summarizer",
         "//tensorflow/lite:framework",
diff --git a/tensorflow/lite/profiling/profiler_test.cc b/tensorflow/lite/profiling/profiler_test.cc
index 82d053729c900fbb536c59658357f3a5a550646b..addebabe1b1556e3853eb0a2bec65132f743d012 100644
--- a/tensorflow/lite/profiling/profiler_test.cc
+++ b/tensorflow/lite/profiling/profiler_test.cc
@@ -27,11 +27,8 @@ namespace tflite {
 namespace profiling {
 namespace {
 
-void AssertDurationOfEventAroundMs(const ProfileEvent* event,
-                                   double expected_ms, double eps_ms) {
-  double duration_ms =
-      (event->end_timestamp_us - event->begin_timestamp_us) / 1e3;
-  EXPECT_NEAR(expected_ms, duration_ms, eps_ms);
+double GetDurationOfEventMs(const ProfileEvent* event) {
+  return (event->end_timestamp_us - event->begin_timestamp_us) / 1e3;
 }
 
 void SleepForQuarterSecond(Profiler* profiler) {
@@ -84,12 +81,17 @@ TEST(ProfilingTest, ProfilesAreCollected) {
 
 #ifndef ADDRESS_SANITIZER
   // ASAN build is sometimes very slow. Set a large epsilon to avoid flakiness.
+  // Due to flakiness, just verify relative values match.
   const int eps_ms = 50;
-  AssertDurationOfEventAroundMs(profile_events[0], /*expected_ms*/ 500, eps_ms);
-  AssertDurationOfEventAroundMs(profile_events[1], /*expected_ms*/ 250, eps_ms);
-  AssertDurationOfEventAroundMs(profile_events[2], /*expected_ms*/ 250, eps_ms);
-  AssertDurationOfEventAroundMs(profile_events[3], /*expected_ms*/ 250, eps_ms);
-  AssertDurationOfEventAroundMs(profile_events[4], /*expected_ms*/ 250, eps_ms);
+  auto parent_ms = GetDurationOfEventMs(profile_events[0]);
+  double child_ms[2], sleep_for_quarter_ms[2];
+  child_ms[0] = GetDurationOfEventMs(profile_events[1]);
+  child_ms[1] = GetDurationOfEventMs(profile_events[3]);
+  sleep_for_quarter_ms[0] = GetDurationOfEventMs(profile_events[2]);
+  sleep_for_quarter_ms[1] = GetDurationOfEventMs(profile_events[4]);
+  EXPECT_NEAR(parent_ms, child_ms[0] + child_ms[1], eps_ms);
+  EXPECT_NEAR(child_ms[0], sleep_for_quarter_ms[0], eps_ms);
+  EXPECT_NEAR(child_ms[1], sleep_for_quarter_ms[1], eps_ms);
 #endif
 }
 
diff --git a/tensorflow/lite/python/convert.py b/tensorflow/lite/python/convert.py
index 198612f6fec7961ee34d7a16f65f079f83811f79..9c603998717019ac8624868b16d720e300a30efd 100644
--- a/tensorflow/lite/python/convert.py
+++ b/tensorflow/lite/python/convert.py
@@ -97,6 +97,7 @@ def convert_dtype_to_tflite_type(tf_dtype):
   return result
 
 
+@_tf_export("lite.OpsSet")
 class OpsSet(enum.Enum):
   """Enum class defining the sets of ops available to generate TFLite models.
 
@@ -301,7 +302,9 @@ def build_toco_convert_protos(input_tensors,
     process.
 
   Raises:
-    ValueError: If the input tensor type is unknown
+    ValueError:
+      If the input tensor type is unknown
+      Missing mean_values or std_dev_values
     RuntimeError: If TOCO fails to convert (in which case the runtime error's
       error text will contain the TOCO error log)
   """
@@ -335,9 +338,14 @@ def build_toco_convert_protos(input_tensors,
   model.change_concat_input_ranges = change_concat_input_ranges
   for idx, input_tensor in enumerate(input_tensors):
     input_array = model.input_arrays.add()
+    input_array.name = tensor_name(input_tensor)
+    input_array.data_type = convert_dtype_to_tflite_type(input_tensor.dtype)
+
     if toco.inference_input_type == _types_pb2.QUANTIZED_UINT8:
+      if not quantized_input_stats:
+        raise ValueError("std_dev and mean must be defined when "
+                         "inference_input_type is QUANTIZED_UINT8.")
       input_array.mean_value, input_array.std_value = quantized_input_stats[idx]
-    input_array.name = tensor_name(input_tensor)
     if input_shapes is None:
       shape = input_tensor.get_shape()
     else:
@@ -385,7 +393,11 @@ def toco_convert_graph_def(input_data, input_arrays_with_shape, output_arrays,
 
   for idx, (name, shape) in enumerate(input_arrays_with_shape):
     input_array = model_flags.input_arrays.add()
-    if kwargs["inference_type"] == lite_constants.QUANTIZED_UINT8:
+    if toco_flags.inference_input_type == _types_pb2.QUANTIZED_UINT8:
+      if (("quantized_input_stats" not in kwargs) or
+          (not kwargs["quantized_input_stats"])):
+        raise ValueError("std_dev and mean must be defined when "
+                         "inference_input_type is QUANTIZED_UINT8.")
       input_array.mean_value, input_array.std_value = kwargs[
           "quantized_input_stats"][idx]
     input_array.name = name
diff --git a/tensorflow/lite/python/convert_saved_model_test.py b/tensorflow/lite/python/convert_saved_model_test.py
index 76113853ca9cb47bfbfebe10b7c1c1df80186216..fdcbc79ee9cfd4ccad15d59a0df4a7e520471b6c 100644
--- a/tensorflow/lite/python/convert_saved_model_test.py
+++ b/tensorflow/lite/python/convert_saved_model_test.py
@@ -39,6 +39,7 @@ from tensorflow.python.saved_model import tag_constants
 
 class TensorFunctionsTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testGetTensorsValid(self):
     in_tensor = array_ops.placeholder(
         shape=[1, 16, 16, 3], dtype=dtypes.float32)
@@ -49,6 +50,7 @@ class TensorFunctionsTest(test_util.TensorFlowTestCase):
         sess.graph, ["Placeholder"])
     self.assertEqual("Placeholder:0", tensors[0].name)
 
+  @test_util.run_v1_only("b/120545219")
   def testGetTensorsInvalid(self):
     in_tensor = array_ops.placeholder(
         shape=[1, 16, 16, 3], dtype=dtypes.float32)
@@ -61,6 +63,7 @@ class TensorFunctionsTest(test_util.TensorFlowTestCase):
     self.assertEqual("Invalid tensors 'invalid-input' were found.",
                      str(error.exception))
 
+  @test_util.run_v1_only("b/120545219")
   def testSetTensorShapeValid(self):
     tensor = array_ops.placeholder(shape=[None, 3, 5], dtype=dtypes.float32)
     self.assertEqual([None, 3, 5], tensor.shape.as_list())
@@ -68,6 +71,7 @@ class TensorFunctionsTest(test_util.TensorFlowTestCase):
     convert_saved_model.set_tensor_shapes([tensor], {"Placeholder": [5, 3, 5]})
     self.assertEqual([5, 3, 5], tensor.shape.as_list())
 
+  @test_util.run_v1_only("b/120545219")
   def testSetTensorShapeNoneValid(self):
     tensor = array_ops.placeholder(dtype=dtypes.float32)
     self.assertEqual(None, tensor.shape)
@@ -75,6 +79,7 @@ class TensorFunctionsTest(test_util.TensorFlowTestCase):
     convert_saved_model.set_tensor_shapes([tensor], {"Placeholder": [1, 3, 5]})
     self.assertEqual([1, 3, 5], tensor.shape.as_list())
 
+  @test_util.run_v1_only("b/120545219")
   def testSetTensorShapeArrayInvalid(self):
     # Tests set_tensor_shape where the tensor name passed in doesn't exist.
     tensor = array_ops.placeholder(shape=[None, 3, 5], dtype=dtypes.float32)
@@ -88,6 +93,7 @@ class TensorFunctionsTest(test_util.TensorFlowTestCase):
         str(error.exception))
     self.assertEqual([None, 3, 5], tensor.shape.as_list())
 
+  @test_util.run_deprecated_v1
   def testSetTensorShapeDimensionInvalid(self):
     # Tests set_tensor_shape where the shape passed in is incompatiable.
     tensor = array_ops.placeholder(shape=[None, 3, 5], dtype=dtypes.float32)
@@ -96,11 +102,11 @@ class TensorFunctionsTest(test_util.TensorFlowTestCase):
     with self.assertRaises(ValueError) as error:
       convert_saved_model.set_tensor_shapes([tensor],
                                             {"Placeholder": [1, 5, 5]})
-    self.assertIn(
-        "The shape of tensor 'Placeholder' cannot be changed from "
-        "(?, 3, 5) to [1, 5, 5].", str(error.exception))
+    self.assertIn("The shape of tensor 'Placeholder' cannot be changed",
+                  str(error.exception))
     self.assertEqual([None, 3, 5], tensor.shape.as_list())
 
+  @test_util.run_v1_only("b/120545219")
   def testSetTensorShapeEmpty(self):
     tensor = array_ops.placeholder(shape=[None, 3, 5], dtype=dtypes.float32)
     self.assertEqual([None, 3, 5], tensor.shape.as_list())
diff --git a/tensorflow/lite/python/convert_test.py b/tensorflow/lite/python/convert_test.py
index 40576e16dbea06ec56a8a2999a1fddb145dbfad2..cf49ee2b472d2c6617811cde0978eb8ae3a16f8e 100644
--- a/tensorflow/lite/python/convert_test.py
+++ b/tensorflow/lite/python/convert_test.py
@@ -34,6 +34,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
+@test_util.run_v1_only("b/120545219")
 class ConvertTest(test_util.TensorFlowTestCase):
 
   def testBasic(self):
@@ -66,6 +67,21 @@ class ConvertTest(test_util.TensorFlowTestCase):
         quantized_input_stats=[(0., 1.)])
     self.assertTrue(tflite_model)
 
+  def testQuantizationInvalid(self):
+    in_tensor = array_ops.placeholder(
+        shape=[1, 16, 16, 3], dtype=dtypes.float32)
+    out_tensor = array_ops.fake_quant_with_min_max_args(
+        in_tensor + in_tensor, min=0., max=1.)
+    sess = session.Session()
+
+    with self.assertRaises(ValueError) as error:
+      convert.toco_convert(
+          sess.graph_def, [in_tensor], [out_tensor],
+          inference_type=lite_constants.QUANTIZED_UINT8)
+    self.assertEqual(
+        "std_dev and mean must be defined when inference_input_type is "
+        "QUANTIZED_UINT8.", str(error.exception))
+
   def testGraphDefBasic(self):
     in_tensor = array_ops.placeholder(
         shape=[1, 16, 16, 3], dtype=dtypes.float32, name="input")
@@ -139,7 +155,29 @@ class ConvertTest(test_util.TensorFlowTestCase):
     self.assertTrue(([1, 16, 16, 3] == output_details[0]["shape"]).all())
     self.assertTrue(output_details[0]["quantization"][0] > 0)  # scale
 
+  def testGraphDefQuantizationInvalid(self):
+    in_tensor_1 = array_ops.placeholder(
+        shape=[1, 16, 16, 3], dtype=dtypes.float32, name="inputA")
+    in_tensor_2 = array_ops.placeholder(
+        shape=[1, 16, 16, 3], dtype=dtypes.float32, name="inputB")
+    _ = array_ops.fake_quant_with_min_max_args(
+        in_tensor_1 + in_tensor_2, min=0., max=1., name="output")
+    sess = session.Session()
+
+    input_arrays_map = [("inputA", [1, 16, 16, 3]), ("inputB", [1, 16, 16, 3])]
+    output_arrays = ["output"]
+    with self.assertRaises(ValueError) as error:
+      convert.toco_convert_graph_def(
+          sess.graph_def,
+          input_arrays_map,
+          output_arrays,
+          inference_type=lite_constants.QUANTIZED_UINT8)
+    self.assertEqual(
+        "std_dev and mean must be defined when inference_input_type is "
+        "QUANTIZED_UINT8.", str(error.exception))
+
 
+@test_util.run_v1_only("b/120545219")
 class ConvertTestOpHint(test_util.TensorFlowTestCase):
   """Test the hint to stub functionality."""
 
diff --git a/tensorflow/lite/python/lite_test.py b/tensorflow/lite/python/lite_test.py
index 1ae0d3c3ed054e9b6732125e5330209ff15373be..1f9c768b4441cc1385d93285d26eeee9b651ca83 100644
--- a/tensorflow/lite/python/lite_test.py
+++ b/tensorflow/lite/python/lite_test.py
@@ -80,6 +80,7 @@ class FromConstructor(test_util.TensorFlowTestCase):
     self.assertTrue(converter._has_valid_tensors())
 
 
+@test_util.run_v1_only('b/120545219')
 class FromSessionTest(test_util.TensorFlowTestCase):
 
   def testFloat(self):
@@ -177,6 +178,38 @@ class FromSessionTest(test_util.TensorFlowTestCase):
         'Quantization input stats are not available for input tensors '
         '\'inputB\'.', str(error.exception))
 
+  def testIntermediateInputArray(self):
+    """Convert a model from an intermediate input array."""
+    in_tensor_init = array_ops.placeholder(
+        shape=[1, 16, 16, 3], dtype=dtypes.float32)
+    in_tensor_final = in_tensor_init + in_tensor_init
+    out_tensor = in_tensor_final + in_tensor_final
+    sess = session.Session()
+
+    # Convert model and ensure model is not None.
+    converter = lite.TFLiteConverter.from_session(sess, [in_tensor_final],
+                                                  [out_tensor])
+    tflite_model = converter.convert()
+    self.assertTrue(tflite_model)
+
+    # Check values from converted model.
+    interpreter = Interpreter(model_content=tflite_model)
+    interpreter.allocate_tensors()
+
+    input_details = interpreter.get_input_details()
+    self.assertEqual(1, len(input_details))
+    self.assertEqual('add', input_details[0]['name'])
+    self.assertEqual(np.float32, input_details[0]['dtype'])
+    self.assertTrue(([1, 16, 16, 3] == input_details[0]['shape']).all())
+    self.assertEqual((0., 0.), input_details[0]['quantization'])
+
+    output_details = interpreter.get_output_details()
+    self.assertEqual(1, len(output_details))
+    self.assertEqual('add_1', output_details[0]['name'])
+    self.assertEqual(np.float32, output_details[0]['dtype'])
+    self.assertTrue(([1, 16, 16, 3] == output_details[0]['shape']).all())
+    self.assertEqual((0., 0.), output_details[0]['quantization'])
+
   def testSizeNoneInvalid(self):
     in_tensor = array_ops.placeholder(dtype=dtypes.float32)
     out_tensor = in_tensor + in_tensor
@@ -465,6 +498,7 @@ class FromSessionTest(test_util.TensorFlowTestCase):
     interpreter.allocate_tensors()
 
 
+@test_util.run_v1_only('b/120545219')
 class FromFrozenGraphFile(test_util.TensorFlowTestCase):
 
   def testFloat(self):
@@ -712,6 +746,7 @@ class FromFrozenGraphFile(test_util.TensorFlowTestCase):
     interpreter.allocate_tensors()
 
 
+@test_util.run_v1_only('b/120545219')
 class FromSavedModelTest(test_util.TensorFlowTestCase):
 
   def _createSavedModel(self, shape):
@@ -856,6 +891,7 @@ class FromSavedModelTest(test_util.TensorFlowTestCase):
     interpreter.allocate_tensors()
 
 
+@test_util.run_v1_only('b/120545219')
 class FromKerasFile(test_util.TensorFlowTestCase):
 
   def setUp(self):
diff --git a/tensorflow/lite/schema/builtin_ops_header/BUILD b/tensorflow/lite/schema/builtin_ops_header/BUILD
index 8a01541d575e288b94f8bb049caa288a777d61d8..52cbd052d6aa8cafcf562eb483638915be297cf7 100644
--- a/tensorflow/lite/schema/builtin_ops_header/BUILD
+++ b/tensorflow/lite/schema/builtin_ops_header/BUILD
@@ -24,7 +24,6 @@ cc_binary(
 cc_test(
     name = "generator_test",
     srcs = ["generator_test.cc"],
-    tags = ["no_oss"],
     deps = [
         ":generator",
         "@com_google_googletest//:gtest",
@@ -37,7 +36,6 @@ cc_test(
     data = [
         "//tensorflow/lite:builtin_ops.h",
     ],
-    tags = ["no_oss"],
     deps = [
         ":generator",
         "@com_google_googletest//:gtest",
diff --git a/tensorflow/lite/schema/schema.fbs b/tensorflow/lite/schema/schema.fbs
index 6436167303bb8350a7865a90a31fc2a5ec7356da..980f13b19b4f6a32fe8b693c560be2b4f4f95fd9 100644
--- a/tensorflow/lite/schema/schema.fbs
+++ b/tensorflow/lite/schema/schema.fbs
@@ -45,7 +45,7 @@ enum TensorType : byte {
 // Custom quantization parameters for experimenting with new quantization
 // techniques.
 table CustomQuantization {
-  custom:[byte];
+  custom:[ubyte] (force_align: 16);
 }
 
 // Represents a specific quantization technique's parameters.
diff --git a/tensorflow/lite/schema/schema_generated.h b/tensorflow/lite/schema/schema_generated.h
index af8b143364e25a7091cd7a44f4e44c4d67285c0e..637cbafabdad47892b1e3f4a93837b44d50a5b46 100755
--- a/tensorflow/lite/schema/schema_generated.h
+++ b/tensorflow/lite/schema/schema_generated.h
@@ -2247,7 +2247,7 @@ inline const char *EnumNameCustomOptionsFormat(CustomOptionsFormat e) {
 
 struct CustomQuantizationT : public flatbuffers::NativeTable {
   typedef CustomQuantization TableType;
-  std::vector<int8_t> custom;
+  std::vector<uint8_t> custom;
   CustomQuantizationT() {
   }
 };
@@ -2257,8 +2257,8 @@ struct CustomQuantization FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   enum {
     VT_CUSTOM = 4
   };
-  const flatbuffers::Vector<int8_t> *custom() const {
-    return GetPointer<const flatbuffers::Vector<int8_t> *>(VT_CUSTOM);
+  const flatbuffers::Vector<uint8_t> *custom() const {
+    return GetPointer<const flatbuffers::Vector<uint8_t> *>(VT_CUSTOM);
   }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
@@ -2274,7 +2274,7 @@ struct CustomQuantization FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
 struct CustomQuantizationBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
-  void add_custom(flatbuffers::Offset<flatbuffers::Vector<int8_t>> custom) {
+  void add_custom(flatbuffers::Offset<flatbuffers::Vector<uint8_t>> custom) {
     fbb_.AddOffset(CustomQuantization::VT_CUSTOM, custom);
   }
   explicit CustomQuantizationBuilder(flatbuffers::FlatBufferBuilder &_fbb)
@@ -2291,7 +2291,7 @@ struct CustomQuantizationBuilder {
 
 inline flatbuffers::Offset<CustomQuantization> CreateCustomQuantization(
     flatbuffers::FlatBufferBuilder &_fbb,
-    flatbuffers::Offset<flatbuffers::Vector<int8_t>> custom = 0) {
+    flatbuffers::Offset<flatbuffers::Vector<uint8_t>> custom = 0) {
   CustomQuantizationBuilder builder_(_fbb);
   builder_.add_custom(custom);
   return builder_.Finish();
@@ -2299,10 +2299,10 @@ inline flatbuffers::Offset<CustomQuantization> CreateCustomQuantization(
 
 inline flatbuffers::Offset<CustomQuantization> CreateCustomQuantizationDirect(
     flatbuffers::FlatBufferBuilder &_fbb,
-    const std::vector<int8_t> *custom = nullptr) {
+    const std::vector<uint8_t> *custom = nullptr) {
   return tflite::CreateCustomQuantization(
       _fbb,
-      custom ? _fbb.CreateVector<int8_t>(*custom) : 0);
+      custom ? _fbb.CreateVector<uint8_t>(*custom) : 0);
 }
 
 flatbuffers::Offset<CustomQuantization> CreateCustomQuantization(flatbuffers::FlatBufferBuilder &_fbb, const CustomQuantizationT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
diff --git a/tensorflow/lite/string_util.cc b/tensorflow/lite/string_util.cc
index 1b33f5bcba01bf32d366436812df014c3fbc1390..6efa11d60c55540c099fadc33c7756ae8f77b97f 100644
--- a/tensorflow/lite/string_util.cc
+++ b/tensorflow/lite/string_util.cc
@@ -96,8 +96,7 @@ int DynamicBuffer::WriteToBuffer(char** buffer) {
   return bytes;
 }
 
-void DynamicBuffer::WriteToTensor(TfLiteTensor* tensor) {
-  // Set tensor content pointer to tensor_buffer, and release original data.
+void DynamicBuffer::WriteToTensorAsVector(TfLiteTensor* tensor) {
   auto dims = TfLiteIntArrayCreate(1);
   dims->data[0] = offset_.size() - 1;  // Store number of strings.
   WriteToTensor(tensor, dims);
@@ -108,6 +107,10 @@ void DynamicBuffer::WriteToTensor(TfLiteTensor* tensor,
   char* tensor_buffer;
   int bytes = WriteToBuffer(&tensor_buffer);
 
+  if (new_shape == nullptr) {
+    new_shape = TfLiteIntArrayCopy(tensor->dims);
+  }
+
   // Set tensor content pointer to tensor_buffer, and release original data.
   TfLiteTensorReset(tensor->type, tensor->name, new_shape, tensor->params,
                     tensor_buffer, bytes, kTfLiteDynamic, tensor->allocation,
diff --git a/tensorflow/lite/string_util.h b/tensorflow/lite/string_util.h
index c9b74482f7d04b8cd667c18fc0a2aadc2f5f6490..f076db76f2d4ef416e5f7ec98ac2ec0aa94d95c2 100644
--- a/tensorflow/lite/string_util.h
+++ b/tensorflow/lite/string_util.h
@@ -74,12 +74,18 @@ class DynamicBuffer {
   // The function allocates space for the buffer but does NOT take ownership.
   int WriteToBuffer(char** buffer);
 
-  // Fill content into a string tensor, with the given new_shape. The new
-  // shape must match the number of strings in this object.
+  // Fill content into a string tensor, with the given new_shape. The new shape
+  // must match the number of strings in this object. Caller relinquishes
+  // ownership of new_shape. If 'new_shape' is nullptr, keep the tensor's
+  // existing shape.
   void WriteToTensor(TfLiteTensor* tensor, TfLiteIntArray* new_shape);
 
   // Fill content into a string tensor. Set shape to {num_strings}.
-  void WriteToTensor(TfLiteTensor* tensor);
+  void WriteToTensorAsVector(TfLiteTensor* tensor);
+
+  // Deprecated. Use WriteToTensorAsVector() or pass in the new shpe.
+  // TODO(b/120230709): remove when people migrate away.
+  void WriteToTensor(TfLiteTensor* tensor) { WriteToTensorAsVector(tensor); }
 
  private:
   // Data buffer to store contents of strings, not including headers.
diff --git a/tensorflow/lite/string_util_test.cc b/tensorflow/lite/string_util_test.cc
index 377cdd77eb4651bb057055cc4f7a4ab33cbb5297..cbf1d7b226af20251d5f70a354a21f1eb40ae1c6 100644
--- a/tensorflow/lite/string_util_test.cc
+++ b/tensorflow/lite/string_util_test.cc
@@ -55,7 +55,7 @@ TEST(StringUtil, TestStringUtil) {
   new_shape->data[0] = 2;
   new_shape->data[1] = 1;
   buf0.WriteToTensor(t0, new_shape);
-  buf1.WriteToTensor(t1);
+  buf1.WriteToTensorAsVector(t1);
 
   // Check tensor shapes.
   EXPECT_EQ(t0->dims->size, 2);
@@ -99,7 +99,7 @@ TEST(StringUtil, TestAddJoinedString) {
 
   DynamicBuffer buf;
   buf.AddJoinedString({{s0, 3}, {s1, 4}, {s2, 0}, {s3, 3}}, ' ');
-  buf.WriteToTensor(t0);
+  buf.WriteToTensorAsVector(t0);
 
   ASSERT_EQ(GetStringCount(t0), 1);
   StringRef str_ref;
@@ -115,12 +115,43 @@ TEST(StringUtil, TestEmptyList) {
   t0->type = kTfLiteString;
   t0->allocation_type = kTfLiteDynamic;
   DynamicBuffer buf;
-  buf.WriteToTensor(t0);
+  buf.WriteToTensorAsVector(t0);
 
   ASSERT_EQ(GetStringCount(t0), 0);
   ASSERT_EQ(t0->bytes, 8);
 }
 
+TEST(StringUtil, TestShapes) {
+  Interpreter interpreter;
+  interpreter.AddTensors(1);
+  TfLiteTensor* t0 = interpreter.tensor(0);
+  t0->type = kTfLiteString;
+  t0->allocation_type = kTfLiteDynamic;
+  t0->dims = TfLiteIntArrayCreate(2);
+  t0->dims->data[0] = 2;
+  t0->dims->data[1] = 1;
+
+  // Not setting a new shape: number of strings must match
+  DynamicBuffer buf;
+  buf.AddString("ABC", 3);
+  buf.AddString("X", 1);
+  buf.WriteToTensor(t0, nullptr);
+
+  ASSERT_EQ(t0->dims->size, 2);
+  EXPECT_EQ(t0->dims->data[0], 2);
+  EXPECT_EQ(t0->dims->data[1], 1);
+
+  auto new_shape = TfLiteIntArrayCreate(2);
+  new_shape->data[0] = 1;
+  new_shape->data[1] = 2;
+
+  buf.WriteToTensor(t0, new_shape);
+
+  ASSERT_EQ(t0->dims->size, 2);
+  EXPECT_EQ(t0->dims->data[0], 1);
+  EXPECT_EQ(t0->dims->data[1], 2);
+}
+
 }  // namespace tflite
 
 int main(int argc, char** argv) {
diff --git a/tensorflow/lite/testing/generate_examples.py b/tensorflow/lite/testing/generate_examples.py
index 55796846b3cce0280316ece2e1c6a27d53cc4f31..dd7b3d07456fbd9943e9f45b815e6015f4973a94 100644
--- a/tensorflow/lite/testing/generate_examples.py
+++ b/tensorflow/lite/testing/generate_examples.py
@@ -103,8 +103,6 @@ KNOWN_BUGS = {
     r"batch_to_space_nd.*input_shape=\[8,2,2,2,1,1\]": "70594733",
     # Div will use floordiv.
     r"div.*int32": "72051395",
-    # Constant 1D gather crashes toco.
-    r"gather_buggy.*input_shape=\[3\].*": "120029508",
 }
 
 
@@ -815,6 +813,7 @@ def make_constant_tests(zip_path):
   test_parameters = [{
       "dtype": [tf.float32, tf.int32],
       "input_shape": [[], [1], [2], [1, 1, 1, 1], [2, 2, 2, 2]],
+      "constant_is_also_output": [True, False],
   }]
 
   def build_graph(parameters):
@@ -824,17 +823,19 @@ def make_constant_tests(zip_path):
         shape=parameters["input_shape"])
     constant = tf.constant(
         create_tensor_data(parameters["dtype"], parameters["input_shape"]))
-    # This maximum node is here to avoid the situation where a graph output is
-    # a constant, which is an error in toco.
-    out = tf.maximum(dummy_input, constant)
-    return [dummy_input], [out]
+    out = [tf.maximum(dummy_input, constant)]
+    if parameters["constant_is_also_output"]:
+      out.append(constant)
+
+    return [dummy_input], out
 
   def build_inputs(parameters, sess, inputs, outputs):
     dummy_input = np.zeros(
         parameters["input_shape"], dtype=_TF_TYPE_INFO[parameters["dtype"]][0])
     return [dummy_input], sess.run(outputs, feed_dict={inputs[0]: dummy_input})
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs,
+                    expected_tf_success=20)
 
 
 def make_binary_op_tests(zip_path, binary_operator):
@@ -1257,8 +1258,8 @@ def make_gather_tests(zip_path):
       expected_tf_success=60)
 
 
-def make_gather_buggy_tests(zip_path):
-  """Make a set of tests to show gather crashes toco."""
+def make_gather_with_constant_tests(zip_path):
+  """Make a set of test which feed a constant to gather toco."""
 
   test_parameters = [{
       "input_shape": [[3]],
@@ -1452,23 +1453,27 @@ def make_conv_with_shared_weights_tests(zip_path):
     input_shape, filter_shape = get_tensor_shapes(parameters)
     input_tensor = tf.placeholder(
         dtype=tf.float32, name="input", shape=input_shape)
+    input_tensors = [input_tensor]
 
     # Construct a constant weights tensor which will be used by both Conv2D.
     filter_tensor = tf.constant(
         create_tensor_data(np.float32, filter_shape), dtype=tf.float32)
-    input_tensors = [input_tensor]
+
+    # Ensure that FuseBinaryIntoFollowingAffine works with an input which
+    # is shared by multiple affine ops.
+    conv_input = input_tensor + 0.1
 
     # Construct 2 Conv2D operations which use exactly the same input and
     # weights.
     result1 = tf.nn.conv2d(
-        input_tensor,
+        conv_input,
         filter_tensor,
         strides=parameters["strides"],
         dilations=parameters["dilations"],
         padding=parameters["padding"],
         data_format=parameters["data_format"])
     result2 = tf.nn.conv2d(
-        input_tensor,
+        conv_input,
         filter_tensor,
         strides=parameters["strides"],
         dilations=parameters["dilations"],
@@ -3528,6 +3533,36 @@ def make_range_tests(zip_path):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
+def make_fill_tests(zip_path):
+  """Make a set of tests to do fill."""
+
+  test_parameters = [{
+      "dims_dtype": [tf.int32, tf.int64],
+      "dims_shape": [[], [1], [3], [3, 3]],
+      "value_dtype": [tf.int32, tf.int64, tf.float32],
+  }]
+
+  def build_graph(parameters):
+    """Build the fill op testing graph."""
+    input1 = tf.placeholder(
+        dtype=parameters["dims_dtype"],
+        name="dims",
+        shape=parameters["dims_shape"])
+    input2 = tf.placeholder(
+        dtype=parameters["value_dtype"], name="value", shape=[])
+    out = tf.fill(input1, input2)
+    return [input1, input2], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input1 = create_tensor_data(parameters["dims_dtype"],
+                                parameters["dims_shape"], 1)
+    input2 = create_scalar_data(parameters["value_dtype"])
+    return [input1, input2], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input1, input2])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
 def _make_logical_tests(op):
   """Make a set of tests to do logical operations."""
 
@@ -3579,6 +3614,88 @@ def make_logical_xor_tests(zip_path):
   return _make_logical_tests(tf.logical_xor)(zip_path)
 
 
+def make_mirror_pad_tests(zip_path):
+  """Make a set of tests to do mirror_pad."""
+
+  test_parameters = [
+      {
+          "input_shape": [[2, 3]],
+          "padding_matrix": [[[1, 1], [2, 1]]],
+          "mode": ["REFLECT"],
+          "type": ["const"]
+      },
+      {
+          "input_shape": [[2, 3]],
+          "padding_matrix": [[[1, 1], [1, 1]]],
+          "mode": ["REFLECT"],
+          "type": ["const"]
+      },
+      {
+          "input_shape": [[2, 3]],
+          "padding_matrix": [[[1, 1], [2, 1]]],
+          "mode": ["SYMMETRIC"],
+          "type": ["placeholder"]
+      },
+      {
+          "input_shape": [[2, 3]],
+          "padding_matrix": [[[1, 1], [2, 1]]],
+          "mode": ["REFLECT"],
+          "type": ["placeholder"]
+      },
+      {
+          "input_shape": [[3]],
+          "padding_matrix": [[[0, 2]]],
+          "mode": ["SYMMETRIC"],
+          "type": ["placeholder"]
+      },
+      {
+          "input_shape": [[3]],
+          "padding_matrix": [[[0, 2]]],
+          "mode": ["SYMMETRIC"],
+          "type": ["const"]
+      },
+      {
+          "input_shape": [[3]],
+          "padding_matrix": [[[0, 2]]],
+          "mode": ["REFLECT"],
+          "type": ["const"]
+      },
+  ]
+
+  def build_graph(parameters):
+    """Build the graph for the test case."""
+
+    input_tensor = tf.placeholder(
+        dtype=tf.int32, name="input", shape=parameters["input_shape"])
+    if parameters["type"] != "const":
+      padding_matrix = tf.placeholder(
+          dtype=tf.int32,
+          name="padding",
+          shape=[len(parameters["input_shape"]), 2])
+      input_tensors = [input_tensor, padding_matrix]
+    else:
+      padding_matrix = tf.constant(np.array(parameters["padding_matrix"]))
+      input_tensors = [input_tensor]
+    output = tf.pad(
+        input_tensor, paddings=padding_matrix, mode=parameters["mode"])
+
+    return input_tensors, [output]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_values = [create_tensor_data(tf.int32, parameters["input_shape"])]
+    if parameters["type"] != "const":
+      input_values.append(np.array(parameters["padding_matrix"]))
+    return input_values, sess.run(
+        outputs, feed_dict=dict(zip(inputs, input_values)))
+
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_success=7)
+
+
 def make_unroll_batch_matmul_tests(zip_path):
   """Make a set of tests to test unroll_batch_matmul."""
 
diff --git a/tensorflow/lite/testing/generated_examples_zip_test.cc b/tensorflow/lite/testing/generated_examples_zip_test.cc
index 91a4851fb0251d4e5f7e8fcd7146ee43fdfe99f2..a9a31ad088e6f4b0297ba313c585abbe6189728b 100644
--- a/tensorflow/lite/testing/generated_examples_zip_test.cc
+++ b/tensorflow/lite/testing/generated_examples_zip_test.cc
@@ -205,7 +205,7 @@ tensorflow::Status ReadManifest(const string& original_file, const string& dir,
   }
   if (!added) {
     string message = "Test had no examples: " + original_file;
-    return tensorflow::Status(tensorflow::error::UNKNOWN, message.c_str());
+    return tensorflow::Status(tensorflow::error::UNKNOWN, message);
   }
   return tensorflow::Status::OK();
 }
diff --git a/tensorflow/lite/testing/join.h b/tensorflow/lite/testing/join.h
index 7d0040c488a4ce4bf38f35948efb9c0b80777079..d1c314608687f045b346cc5526ea46c8149c2755 100644
--- a/tensorflow/lite/testing/join.h
+++ b/tensorflow/lite/testing/join.h
@@ -16,6 +16,7 @@ limitations under the License.
 #define TENSORFLOW_LITE_TESTING_JOIN_H_
 
 #include <cstdlib>
+#include <iomanip>
 #include <sstream>
 
 #include "tensorflow/lite/string.h"
@@ -30,9 +31,9 @@ string Join(T* data, size_t len, const string& delimiter) {
     return "";
   }
   std::stringstream result;
-  result << data[0];
+  result << std::setprecision(9) << data[0];
   for (int i = 1; i < len; i++) {
-    result << delimiter << data[i];
+    result << std::setprecision(9) << delimiter << data[i];
   }
   return result.str();
 }
diff --git a/tensorflow/lite/testing/join_test.cc b/tensorflow/lite/testing/join_test.cc
index a8d036c547ded369618bf62544dafffcc27bbf0a..0b3c07f37e14e3815ac1eb4acd0aefac3515064c 100644
--- a/tensorflow/lite/testing/join_test.cc
+++ b/tensorflow/lite/testing/join_test.cc
@@ -28,7 +28,7 @@ TEST(JoinTest, JoinInt) {
 
 TEST(JoinTest, JoinFloat) {
   float data[] = {1.0, -3, 2.3, 1e-5};
-  EXPECT_EQ(Join(data, 4, " "), "1 -3 2.3 1e-05");
+  EXPECT_EQ(Join(data, 4, " "), "1 -3 2.29999995 9.99999975e-06");
 }
 
 TEST(JoinTest, JoinNullData) { EXPECT_THAT(Join<int>(nullptr, 3, ","), ""); }
diff --git a/tensorflow/lite/testing/model_coverage/model_coverage_lib_test.py b/tensorflow/lite/testing/model_coverage/model_coverage_lib_test.py
index 6b4e7427ed9c69b702d37ccc1b6de0b0c414fe5d..4e329ac97d7358edf068329b21f0194c94c57cb0 100644
--- a/tensorflow/lite/testing/model_coverage/model_coverage_lib_test.py
+++ b/tensorflow/lite/testing/model_coverage/model_coverage_lib_test.py
@@ -28,6 +28,7 @@ from tensorflow.python import keras
 from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
@@ -42,6 +43,7 @@ class EvaluateFrozenGraph(test.TestCase):
     write_graph(sess.graph_def, '', graph_def_file, False)
     return graph_def_file
 
+  @test_util.run_v1_only('b/120545219')
   def testFloat(self):
     with session.Session().as_default() as sess:
       in_tensor = array_ops.placeholder(
@@ -51,6 +53,7 @@ class EvaluateFrozenGraph(test.TestCase):
 
     model_coverage.test_frozen_graph(filename, ['Placeholder'], ['add'])
 
+  @test_util.run_v1_only('b/120545219')
   def testMultipleOutputs(self):
     with session.Session().as_default() as sess:
       in_tensor_1 = array_ops.placeholder(
@@ -84,15 +87,18 @@ class EvaluateFrozenGraph(test.TestCase):
     filename = self._saveFrozenGraph(sess)
     return filename
 
+  @test_util.run_v1_only('b/120545219')
   def testQuantized(self):
     filename = self._getQuantizedModel()
     model_coverage.test_frozen_graph_quant(filename, ['inputA'], ['output'])
 
+  @test_util.run_v1_only('b/120545219')
   def testQuantizedInputShapes(self):
     filename = self._getQuantizedModel()
     model_coverage.test_frozen_graph_quant(
         filename, ['inputA'], ['output'], input_shapes={'inputA': [33, 33]})
 
+  @test_util.run_v1_only('b/120545219')
   def testQuantizedFlexAll(self):
     filename = self._getQuantizedModel()
     model_coverage.test_frozen_graph_quant(
@@ -102,6 +108,7 @@ class EvaluateFrozenGraph(test.TestCase):
 
 class EvaluateSavedModel(test.TestCase):
 
+  @test_util.run_v1_only('b/120545219')
   def testFloat(self):
     saved_model_dir = os.path.join(self.get_temp_dir(), 'simple_savedmodel')
     with session.Session().as_default() as sess:
@@ -139,18 +146,21 @@ class EvaluateKerasModel(test.TestCase):
       os.close(fd)
     return keras_file
 
+  @test_util.run_v1_only('b/120545219')
   def testFloat(self):
     model = self._getSingleInputKerasModel()
     keras_file = self._saveKerasModel(model)
 
     model_coverage.test_keras_model(keras_file)
 
+  @test_util.run_v1_only('b/120545219')
   def testPostTrainingQuantize(self):
     model = self._getSingleInputKerasModel()
     keras_file = self._saveKerasModel(model)
 
     model_coverage.test_keras_model(keras_file, post_training_quantize=True)
 
+  @test_util.run_v1_only('b/120545219')
   def testTargetOps(self):
     model = self._getSingleInputKerasModel()
     keras_file = self._saveKerasModel(model)
diff --git a/tensorflow/lite/testing/tf_driver_test.cc b/tensorflow/lite/testing/tf_driver_test.cc
index 4381fe4c19dc2240ed2335495276e3a5dab91022..363d162d56a1670821d29768bc36411bf22d61e9 100644
--- a/tensorflow/lite/testing/tf_driver_test.cc
+++ b/tensorflow/lite/testing/tf_driver_test.cc
@@ -46,7 +46,7 @@ TEST(TfDriverTest, ReadingAndWrintingValues) {
   TestDriver driver;
   ASSERT_EQ(driver.WriteAndReadBack(tensorflow::DT_FLOAT, {1, 2, 2},
                                     "0.10,0.20,0.30,0.40"),
-            "0.1,0.2,0.3,0.4");
+            "0.100000001,0.200000003,0.300000012,0.400000006");
   ASSERT_EQ(driver.WriteAndReadBack(tensorflow::DT_INT32, {1, 2, 2},
                                     "10,40,100,-100"),
             "10,40,100,-100");
@@ -111,8 +111,10 @@ TEST(TfDriverTest, SimpleTest) {
   runner->ResetTensor(2);
   runner->Invoke();
 
-  ASSERT_EQ(runner->ReadOutput(0), "0.101,0.202,0.303,0.404");
-  ASSERT_EQ(runner->ReadOutput(1), "0.011,0.022,0.033,0.044");
+  ASSERT_EQ(runner->ReadOutput(0),
+            "0.101000004,0.202000007,0.303000003,0.404000014");
+  ASSERT_EQ(runner->ReadOutput(1),
+            "0.0109999999,0.0219999999,0.0329999998,0.0439999998");
 }
 
 }  // namespace
diff --git a/tensorflow/lite/testing/tflite_driver.cc b/tensorflow/lite/testing/tflite_driver.cc
index 27e3a3770bb4ceb49c039d258df261ba7e265162..4e11d49f252818f9f7024b8bbafa8b17ad77ad48 100644
--- a/tensorflow/lite/testing/tflite_driver.cc
+++ b/tensorflow/lite/testing/tflite_driver.cc
@@ -77,6 +77,13 @@ class TfLiteDriver::Expectation {
     SetTensorData(values, &data_);
   }
 
+  template <>
+  void SetData<string>(const string& csv_values) {
+    string s = absl::HexStringToBytes(csv_values);
+    data_.raw = new char[s.size()];
+    memcpy(data_.raw, s.data(), s.size());
+  }
+
   bool Check(bool verbose, const TfLiteTensor& tensor) {
     switch (tensor.type) {
       case kTfLiteFloat32:
@@ -89,6 +96,8 @@ class TfLiteDriver::Expectation {
         return TypedCheck<uint8_t>(verbose, tensor);
       case kTfLiteBool:
         return TypedCheck<bool>(verbose, tensor);
+      case kTfLiteString:
+        return TypedCheck<string>(verbose, tensor);
       default:
         fprintf(stderr, "Unsupported type %d in Check\n", tensor.type);
         return false;
@@ -135,6 +144,46 @@ class TfLiteDriver::Expectation {
     return good_output;
   }
 
+  template <>
+  bool TypedCheck<string>(bool verbose, const TfLiteTensor& tensor) {
+    if (tensor.data.raw == nullptr) {
+      if (verbose) {
+        std::cerr << "  got empty string" << std::endl;
+      }
+      return false;
+    }
+    int expected_num_strings = GetStringCount(data_.raw);
+    int returned_num_strings = GetStringCount(tensor.data.raw);
+    if (expected_num_strings != returned_num_strings) {
+      if (verbose) {
+        std::cerr << "  string count differ: got " << returned_num_strings
+                  << ", but expected " << expected_num_strings << std::endl;
+      }
+      return false;
+    }
+    for (int i = 0; i < returned_num_strings; ++i) {
+      auto expected_ref = GetString(data_.raw, i);
+      auto returned_ref = GetString(tensor.data.raw, i);
+      if (expected_ref.len != returned_ref.len) {
+        if (verbose) {
+          std::cerr << "  index " << i << ": got string of size "
+                    << returned_ref.len << ", but expected size "
+                    << expected_ref.len << std::endl;
+        }
+        return false;
+      }
+      if (strncmp(expected_ref.str, returned_ref.str, returned_ref.len) != 0) {
+        if (verbose) {
+          std::cerr << "  index " << i << ": strings are different"
+                    << std::endl;
+        }
+        return false;
+      }
+    }
+
+    return true;
+  }
+
   TfLitePtrUnion data_;
   size_t num_elements_;
 };
@@ -250,8 +299,9 @@ void TfLiteDriver::SetInput(int id, const string& csv_values) {
       break;
     }
     default:
-      fprintf(stderr, "Unsupported type %d in SetInput\n", tensor->type);
-      Invalidate("Unsupported tensor data type");
+      Invalidate(absl::StrCat("Unsupported tensor type ",
+                              TfLiteTypeGetName(tensor->type),
+                              " in TfLiteDriver::SetInput"));
       return;
   }
 }
@@ -260,8 +310,7 @@ void TfLiteDriver::SetExpectation(int id, const string& csv_values) {
   if (!IsValid()) return;
   auto* tensor = interpreter_->tensor(id);
   if (expected_output_.count(id) != 0) {
-    fprintf(stderr, "Overridden expectation for tensor %d\n", id);
-    Invalidate("Overridden expectation");
+    Invalidate(absl::StrCat("Overridden expectation for tensor '", id, "'"));
   }
   expected_output_[id].reset(new Expectation);
   switch (tensor->type) {
@@ -280,9 +329,13 @@ void TfLiteDriver::SetExpectation(int id, const string& csv_values) {
     case kTfLiteBool:
       expected_output_[id]->SetData<bool>(csv_values);
       break;
+    case kTfLiteString:
+      expected_output_[id]->SetData<string>(csv_values);
+      break;
     default:
-      fprintf(stderr, "Unsupported type %d in SetExpectation\n", tensor->type);
-      Invalidate("Unsupported tensor data type");
+      Invalidate(absl::StrCat("Unsupported tensor type ",
+                              TfLiteTypeGetName(tensor->type),
+                              " in TfLiteDriver::SetExpectation"));
       return;
   }
 }
diff --git a/tensorflow/lite/toco/BUILD b/tensorflow/lite/toco/BUILD
index 82aa1f557efec04a7af3ef5e4d8b2ceb51f42a62..93d41fcae14c8130de87471bdce64edad131c11f 100644
--- a/tensorflow/lite/toco/BUILD
+++ b/tensorflow/lite/toco/BUILD
@@ -341,7 +341,6 @@ cc_library(
 tf_cc_test(
     name = "import_tensorflow_test",
     srcs = ["import_tensorflow_test.cc"],
-    tags = ["no_oss"],
     deps = [
         ":toco_tooling",
         "//tensorflow/core:framework",
@@ -384,7 +383,6 @@ cc_library(
 tf_cc_test(
     name = "tooling_util_test",
     srcs = ["tooling_util_test.cc"],
-    tags = ["no_oss"],
     deps = [
         ":model",
         ":tooling_util",
@@ -468,7 +466,6 @@ tf_cc_test(
     data = [
         "toco_port_test.cc",
     ],
-    tags = ["no_oss"],
     deps = [
         ":toco_port",
         "@com_google_googletest//:gtest_main",
diff --git a/tensorflow/lite/toco/graph_transformations/fuse_binary_into_following_affine.cc b/tensorflow/lite/toco/graph_transformations/fuse_binary_into_following_affine.cc
index 6b4765b23c47d0a8af4ef3995d6e27c978387593..436b639253f2e190fcaab895cd077b06796c1ca1 100644
--- a/tensorflow/lite/toco/graph_transformations/fuse_binary_into_following_affine.cc
+++ b/tensorflow/lite/toco/graph_transformations/fuse_binary_into_following_affine.cc
@@ -221,9 +221,8 @@ void FuseMulOrDivParamsIntoFollowingAffine(Model* model, Operator* following_op,
   Operator* following_op = GetOpWithInput(*model, binary_op->outputs[0]);
 
   if (!following_op) {
-    AddMessageF(
-        "Not fusing %s because it is not consumed by exactly one other op",
-        LogName(*binary_op));
+    AddMessageF("Not fusing %s because it is not consumed by any op",
+                LogName(*binary_op));
     return ::tensorflow::Status::OK();
   }
 
@@ -288,7 +287,10 @@ void FuseMulOrDivParamsIntoFollowingAffine(Model* model, Operator* following_op,
   AddMessageF("Fusing %s into the following %s", LogName(*binary_op),
               LogName(*following_op));
 
-  model->EraseArray(binary_op->outputs[0]);
+  if (CountOpsWithInput(*model, binary_op->outputs[0]) == 1) {
+    model->EraseArray(binary_op->outputs[0]);
+  }
+
   following_op->inputs[0] = binary_op->inputs[index_of_variable_input];
   const auto& old_constant_param_name =
       binary_op->inputs[index_of_constant_input];
diff --git a/tensorflow/lite/toco/graph_transformations/graph_transformations.h b/tensorflow/lite/toco/graph_transformations/graph_transformations.h
index 73a90c8239b2a24de8bb4d63e711225b4127f19a..187b584b6989cc55894160fc5508c13474a1d2d3 100644
--- a/tensorflow/lite/toco/graph_transformations/graph_transformations.h
+++ b/tensorflow/lite/toco/graph_transformations/graph_transformations.h
@@ -139,7 +139,7 @@ DECLARE_GRAPH_TRANSFORMATION(MakeInitialDequantizeOperator)
 DECLARE_GRAPH_TRANSFORMATION(MoveBinaryOperatorBeforeReshape)
 DECLARE_GRAPH_TRANSFORMATION(PropagateActivationFunctionIntoConstants)
 DECLARE_GRAPH_TRANSFORMATION(PropagateArrayDataTypes)
-DECLARE_GRAPH_TRANSFORMATION(PropagateFakeQuantNumBits);
+DECLARE_GRAPH_TRANSFORMATION(PropagateFakeQuantNumBits)
 DECLARE_GRAPH_TRANSFORMATION(PropagateFixedSizes)
 DECLARE_GRAPH_TRANSFORMATION(HardcodeMinMax)
 DECLARE_GRAPH_TRANSFORMATION(Quantize)
diff --git a/tensorflow/lite/toco/graph_transformations/hardcode_min_max.cc b/tensorflow/lite/toco/graph_transformations/hardcode_min_max.cc
index df50f31de88cd8114ee66ce417354e33a12a5d8b..2e41767095fb3cde09a7fb5d690ac57b1cfcd762 100644
--- a/tensorflow/lite/toco/graph_transformations/hardcode_min_max.cc
+++ b/tensorflow/lite/toco/graph_transformations/hardcode_min_max.cc
@@ -208,12 +208,32 @@ bool HardcodeMinMaxForSelect(Model* model, Operator* op) {
   if (output_array.minmax) {
     return false;
   }
-  const auto& input_array_1 = model->GetArray(op->inputs[1]);
-  if (!input_array_1.minmax) {
+
+  auto& input_array_1 = model->GetArray(op->inputs[1]);
+  auto& input_array_2 = model->GetArray(op->inputs[2]);
+
+  if (!input_array_1.minmax && !input_array_2.minmax) {
     return false;
   }
-  const auto& input_array_2 = model->GetArray(op->inputs[2]);
-  if (!input_array_2.minmax) {
+
+  // Propagate up if one input is quantized and the other is constant.
+  if (!input_array_1.minmax &&
+      IsConstantParameterArray(*model, op->inputs[1])) {
+    auto& minmax_1 = input_array_1.GetOrCreateMinMax();
+    const auto& minmax_2 = input_array_2.GetMinMax();
+    minmax_1.min = minmax_2.min;
+    minmax_1.max = minmax_2.max;
+  }
+
+  if (!input_array_2.minmax &&
+      IsConstantParameterArray(*model, op->inputs[2])) {
+    auto& minmax_2 = input_array_2.GetOrCreateMinMax();
+    const auto& minmax_1 = input_array_1.GetMinMax();
+    minmax_2.min = minmax_1.min;
+    minmax_2.max = minmax_1.max;
+  }
+
+  if (!input_array_1.minmax || !input_array_2.minmax) {
     return false;
   }
 
diff --git a/tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc
index fc2ed07aa0b4fceb79cb8c08a534d15ac7471d62..0e653f08a04f237c861038639a1469eb62f35dfa 100644
--- a/tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc
+++ b/tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc
@@ -1783,6 +1783,51 @@ void ProcessUnpackOperator(Model* model, UnpackOperator* op) {
   }
 }
 
+void ProcessMirrorPadOperator(Model* model, MirrorPadOperator* op) {
+  CHECK_EQ(op->inputs.size(), 2);
+  const auto& input_array = model->GetArray(op->inputs[0]);
+  const auto& padding_matrix = model->GetArray(op->inputs[1]);
+
+  // Yield until input dims have been resolved.
+  if (!input_array.has_shape()) {
+    return;
+  }
+
+  auto& output_array = model->GetArray(op->outputs[0]);
+  // If output already computed or padding matrix is non
+  // const then return.
+  if (output_array.has_shape() ||
+      !IsConstantParameterArray(*model, op->inputs[1])) {
+    return;
+  }
+  Shape output_shape = input_array.shape();
+  std::vector<int>& dims = *output_shape.mutable_dims();
+
+  std::vector<int64_t> padding;
+  if (padding_matrix.data_type == ArrayDataType::kInt32) {
+    const auto& data = padding_matrix.GetBuffer<ArrayDataType::kInt32>().data;
+    for (auto elem : data) {
+      padding.push_back(static_cast<int64_t>(elem));
+    }
+  } else if (padding_matrix.data_type == ArrayDataType::kInt64) {
+    const auto& data = padding_matrix.GetBuffer<ArrayDataType::kInt64>().data;
+    for (auto elem : data) {
+      padding.push_back(elem);
+    }
+  } else {
+    CHECK(padding_matrix.data_type == ArrayDataType::kInt64 ||
+          padding_matrix.data_type == ArrayDataType::kInt32);
+  }
+  CHECK_EQ(padding_matrix.shape().dimensions_count(), 2);
+  CHECK_EQ(input_array.shape().dimensions_count(),
+           padding_matrix.shape().dims(0));
+  for (int i = 0; i < input_array.shape().dimensions_count(); ++i) {
+    dims[i] += padding[i * 2] + padding[i * 2 + 1];
+  }
+
+  output_array.copy_shape(output_shape);
+}
+
 }  // namespace
 
 ::tensorflow::Status PropagateFixedSizes::Run(Model* model,
@@ -2055,6 +2100,9 @@ void ProcessUnpackOperator(Model* model, UnpackOperator* op) {
     case OperatorType::kUnpack:
       ProcessUnpackOperator(model, static_cast<UnpackOperator*>(op));
       break;
+    case OperatorType::kMirrorPad:
+      ProcessMirrorPadOperator(model, static_cast<MirrorPadOperator*>(op));
+      break;
     default:
       // Unimplemented, another graph transformation should drop it.
       LOG(FATAL) << "Unhandled operator type " << OperatorTypeName(op->type);
diff --git a/tensorflow/lite/toco/graph_transformations/quantize.cc b/tensorflow/lite/toco/graph_transformations/quantize.cc
index 1146078c301fd1b880c99da23e5be8223efe31e3..2fa80f2edac2bc4e1c6a9147afca20798fca372b 100644
--- a/tensorflow/lite/toco/graph_transformations/quantize.cc
+++ b/tensorflow/lite/toco/graph_transformations/quantize.cc
@@ -64,6 +64,7 @@ bool SupportsQuantization(const Operator& op) {
          type == OperatorType::kRelu1 || type == OperatorType::kRelu6 ||
          type == OperatorType::kShape || type == OperatorType::kExpandDims ||
          type == OperatorType::kPack || type == OperatorType::kTopK_V2 ||
+         type == OperatorType::kRandomUniform ||
          type == OperatorType::kResizeNearestNeighbor ||
          type == OperatorType::kPRelu;
 }
diff --git a/tensorflow/lite/toco/graph_transformations/resolve_constant_gather.cc b/tensorflow/lite/toco/graph_transformations/resolve_constant_gather.cc
index c72135923e5dd111a608a52947baa370e2c1b4d9..27836efb0b2ff77d72811205617b721cc7106cf1 100644
--- a/tensorflow/lite/toco/graph_transformations/resolve_constant_gather.cc
+++ b/tensorflow/lite/toco/graph_transformations/resolve_constant_gather.cc
@@ -25,8 +25,8 @@ namespace {
 
 // Gathers data from axis 0.
 template <ArrayDataType Type>
-inline void Gather(const Array& input_array, int input_rank,
-                   const Array& coords_array, Array* output_array) {
+inline void Gather(const Array& input_array, const Array& coords_array,
+                   Array* output_array) {
   const Shape& input_shape = input_array.shape();
   const std::vector<DataType<Type>>& input_data =
       input_array.GetBuffer<Type>().data;
@@ -39,21 +39,20 @@ inline void Gather(const Array& input_array, int input_rank,
       output_array->GetMutableBuffer<Type>().data;
   output_data.resize(RequiredBufferSizeForShape(output_shape));
 
-  int rev_input_rank = input_shape.dimensions_count() - 1 - (input_rank - 1);
-  CHECK_EQ(coords_shape.dims(0), output_array->shape().dims(rev_input_rank));
+  CHECK_EQ(coords_shape.dims(0), output_array->shape().dims(0));
 
   int stride = 1;
-  for (int i = input_shape.dimensions_count() - 1; i >= input_rank - 1; --i) {
+  for (int i = 1; i < input_shape.dimensions_count(); ++i) {
     stride *= input_shape.dims(i);
   }
 
   // Let's make sure we have enough space for all element in the memcpy()
-  // below, which writes 'stride' elements startng at 'i * stride'.
+  // below, which writes 'stride' elements starting at 'i * stride'.
   CHECK_EQ(stride * coords_shape.dims(0), output_data.size());
 
   for (int i = 0; i < coords_shape.dims(0); ++i) {
     DCHECK_GE(coords_data[i], 0);
-    DCHECK_LT(coords_data[i], input_shape.dims(rev_input_rank));
+    DCHECK_LT(coords_data[i], input_shape.dims(0));
     DataType<Type>* out = output_data.data() + i * stride;
     const DataType<Type>* in = input_data.data() + coords_data[i] * stride;
     memcpy(out, in, sizeof(DataType<Type>) * stride);
@@ -122,24 +121,20 @@ inline void Gather(const Array& input_array, int input_rank,
   CHECK(!output_array.buffer);
   switch (output_array.data_type) {
     case ArrayDataType::kFloat:
-      Gather<ArrayDataType::kFloat>(input_array, op->input_rank, coords_array,
-                                    &output_array);
+      Gather<ArrayDataType::kFloat>(input_array, coords_array, &output_array);
       break;
     case ArrayDataType::kUint8:
-      Gather<ArrayDataType::kUint8>(input_array, op->input_rank, coords_array,
-                                    &output_array);
+      Gather<ArrayDataType::kUint8>(input_array, coords_array, &output_array);
       break;
     case ArrayDataType::kInt32:
-      Gather<ArrayDataType::kInt32>(input_array, op->input_rank, coords_array,
-                                    &output_array);
+      Gather<ArrayDataType::kInt32>(input_array, coords_array, &output_array);
       break;
     case ArrayDataType::kInt64:
-      Gather<ArrayDataType::kInt64>(input_array, op->input_rank, coords_array,
-                                    &output_array);
+      Gather<ArrayDataType::kInt64>(input_array, coords_array, &output_array);
       break;
     case ArrayDataType::kComplex64:
-      Gather<ArrayDataType::kComplex64>(input_array, op->input_rank,
-                                        coords_array, &output_array);
+      Gather<ArrayDataType::kComplex64>(input_array, coords_array,
+                                        &output_array);
       break;
     default:
       LOG(FATAL) << "Unsupported data type given to Gather op with output \""
diff --git a/tensorflow/lite/toco/graph_transformations/tests/BUILD b/tensorflow/lite/toco/graph_transformations/tests/BUILD
index 2e9b213d0018f547740673e26f2ffe7aac010777..bbbedbe3a93065e3a7007073aad7f6e7600e2651 100644
--- a/tensorflow/lite/toco/graph_transformations/tests/BUILD
+++ b/tensorflow/lite/toco/graph_transformations/tests/BUILD
@@ -10,7 +10,6 @@ load(
 tf_cc_test(
     name = "lstm_utils_test",
     srcs = ["lstm_utils_test.cc"],
-    tags = ["no_oss"],
     deps = [
         "//tensorflow/lite/toco:graph_transformations",
         "//tensorflow/lite/toco:model",
@@ -22,7 +21,6 @@ tf_cc_test(
 tf_cc_test(
     name = "resolve_constant_concatenation_test",
     srcs = ["resolve_constant_concatenation_test.cc"],
-    tags = ["no_oss"],
     deps = [
         "//tensorflow/lite/toco:graph_transformations",
         "//tensorflow/lite/toco:model",
@@ -34,7 +32,6 @@ tf_cc_test(
 tf_cc_test(
     name = "resolve_constant_unary_test",
     srcs = ["resolve_constant_unary_test.cc"],
-    tags = ["no_oss"],
     deps = [
         "//tensorflow/lite/toco:graph_transformations",
         "//tensorflow/lite/toco:model",
diff --git a/tensorflow/lite/toco/import_tensorflow.cc b/tensorflow/lite/toco/import_tensorflow.cc
index aa6b4a3bc58ea9810230d4905e8558f56945e122..0b2f810394311a33899b9242e73131e109a2b4c0 100644
--- a/tensorflow/lite/toco/import_tensorflow.cc
+++ b/tensorflow/lite/toco/import_tensorflow.cc
@@ -1153,6 +1153,31 @@ tensorflow::Status ConvertConcatOperator(
   return tensorflow::Status::OK();
 }
 
+tensorflow::Status ConvertMirrorPadOperator(
+    const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
+    Model* model) {
+  if (node.op() != "MirrorPad") {
+    LOG(FATAL) << "Expected MirrorPad.";
+  }
+  const int num_inputs = GetInputsCount(node, tf_import_flags);
+  CHECK_EQ(num_inputs, 2);
+  auto* op = new MirrorPadOperator;
+  for (int i = 0; i < num_inputs; ++i) {
+    op->inputs.push_back(node.input(i));
+  }
+  op->outputs.push_back(node.name());
+  const auto mode = GetStringAttr(node, "mode");
+  if (mode == "REFLECT") {
+    op->mode = toco::MirrorPadMode::kReflect;
+  } else if (mode == "SYMMETRIC") {
+    op->mode = toco::MirrorPadMode::kSymmetric;
+  }
+
+  model->operators.emplace_back(op);
+
+  return tensorflow::Status::OK();
+}
+
 static constexpr int kAnyNumInputs = -1;
 
 enum FlexSupport { kFlexOk, kFlexNotOk };
@@ -2389,6 +2414,7 @@ ConverterMapType GetTensorFlowNodeConverterMap() {
       {"Unpack", ConvertUnpackOperator},
       {"ZerosLike", ConvertSimpleOperator<TensorFlowZerosLikeOperator, 1>},
       {"UnidirectionalSequenceLstm", ConvertUnidirectionalSequenceLstm},
+      {"MirrorPad", ConvertMirrorPadOperator},
   });
 }
 
diff --git a/tensorflow/lite/toco/model.h b/tensorflow/lite/toco/model.h
index d4fe62ac756e0c3dda34309eedf1cad989e5d288..d392535f5c98cdd3532299064f2c6d9305214e71 100644
--- a/tensorflow/lite/toco/model.h
+++ b/tensorflow/lite/toco/model.h
@@ -156,7 +156,8 @@ enum class OperatorType : uint8 {
   kZerosLike,
   kResizeNearestNeighbor,
   kLeakyRelu,
-  kAbs
+  kAbs,
+  kMirrorPad
 };
 
 // Helper to deal with TensorFlow arrays using a different ordering of
@@ -1672,6 +1673,9 @@ struct GatherOperator : Operator {
   // ResolveGatherAttributes. An empty axis indicates that the axis has not yet
   // be resolved.
   absl::optional<int> axis;
+
+  // This field is not used by the standard TF Lite export but it is still need
+  // for legacy Gather implementations.
   int input_rank = 0;
 };
 
@@ -1932,6 +1936,23 @@ struct TensorFlowZerosLikeOperator : Operator {
   TensorFlowZerosLikeOperator() : Operator(OperatorType::kZerosLike) {}
 };
 
+enum class MirrorPadMode { kNone, kSymmetric, kReflect };
+
+// MirrorPad Operator:
+//
+// Inputs:
+// Inputs[0]: required: input tensor to be padded.
+// Inputs[1]: required: 2 Column matrix specifying padding sizes. The number of
+// rows must be the same as the rank of the input.
+// Inputs[2]: required: REFLECT or SYMMETRIC.
+//
+// TensorFlow equivalent: MirrorPad.
+struct MirrorPadOperator : Operator {
+  MirrorPadOperator() : Operator(OperatorType::kMirrorPad) {}
+  // mode is either SYMMETRIC or REFLECT.
+  MirrorPadMode mode;
+};
+
 // Alloc's are used for transient arrays only. An Alloc specifies which interval
 // of the "transient_data" workspace buffer passed to inference functions, is to
 // be used for the transient array at hand. The 'start' and 'end' values are
diff --git a/tensorflow/lite/toco/python/BUILD b/tensorflow/lite/toco/python/BUILD
index 07056f66c35536e82b8f1fdd7938161e216b850a..8a6e82ec46445b5ec5440de129177eae836f8db8 100644
--- a/tensorflow/lite/toco/python/BUILD
+++ b/tensorflow/lite/toco/python/BUILD
@@ -1,4 +1,8 @@
-package(default_visibility = ["//visibility:public"])
+package(default_visibility = [
+    "//tensorflow/contrib/lite:__subpackages__",
+    "//tensorflow/lite:__subpackages__",
+    "//tensorflow/tools/pip_package:__subpackages__",
+])
 
 licenses(["notice"])  # Apache 2.0
 
@@ -9,7 +13,10 @@ load("//tensorflow:tensorflow.bzl", "py_binary")
 config_setting(
     name = "tflite_convert_with_select_tf_ops",
     define_values = {"tflite_convert_with_select_tf_ops": "true"},
-    visibility = ["//visibility:public"],
+    visibility = [
+        "//tensorflow/contrib/lite:__subpackages__",
+        "//tensorflow/lite:__subpackages__",
+    ],
 )
 
 cc_library(
@@ -37,6 +44,12 @@ cc_library(
 tf_py_wrap_cc(
     name = "tensorflow_wrap_toco",
     srcs = ["toco.i"],
+    visibility = [
+        "//learning/expander/pod/deep_pod/utils:__subpackages__",
+        "//research/handwriting/converters/tflite:__subpackages__",
+        "//tensorflow/contrib/lite:__subpackages__",
+        "//tensorflow/lite:__subpackages__",
+    ],
     deps = [
         ":toco_python_api",
         "//tensorflow/lite/toco:model_flags_proto_cc",
diff --git a/tensorflow/lite/toco/tensorflow_graph_matching/BUILD b/tensorflow/lite/toco/tensorflow_graph_matching/BUILD
index 56acc284cc06d6bb8a277adb15aacfee5b1e781c..ae361bf212daeae5cede941111329b2265962ce6 100644
--- a/tensorflow/lite/toco/tensorflow_graph_matching/BUILD
+++ b/tensorflow/lite/toco/tensorflow_graph_matching/BUILD
@@ -60,7 +60,6 @@ cc_library(
 tf_cc_test(
     name = "resolve_svdf_test",
     srcs = ["resolve_svdf_test.cc"],
-    tags = ["no_oss"],
     deps = [
         ":cluster",
         ":cluster_utils",
diff --git a/tensorflow/lite/toco/tflite/BUILD b/tensorflow/lite/toco/tflite/BUILD
index 99c4f8edebe5186400253d01689520230594b885..36ca638ee8c83f6cc1d887a0efaf2b0676f95bd8 100644
--- a/tensorflow/lite/toco/tflite/BUILD
+++ b/tensorflow/lite/toco/tflite/BUILD
@@ -41,7 +41,6 @@ tf_cc_test(
     srcs = [
         "operator_test.cc",
     ],
-    tags = ["no_oss"],
     deps = [
         ":operator",
         "//tensorflow/core:ops",
@@ -72,7 +71,6 @@ tf_cc_test(
     srcs = [
         "types_test.cc",
     ],
-    tags = ["no_oss"],
     deps = [
         ":types",
         "//tensorflow/core:ops",
@@ -107,7 +105,6 @@ tf_cc_test(
     srcs = [
         "export_test.cc",
     ],
-    tags = ["no_oss"],
     deps = [
         ":export",
         "//tensorflow/core:ops",
@@ -142,7 +139,6 @@ tf_cc_test(
     srcs = [
         "import_test.cc",
     ],
-    tags = ["no_oss"],
     deps = [
         ":import",
         "//tensorflow/core:ops",
diff --git a/tensorflow/lite/toco/tflite/import.cc b/tensorflow/lite/toco/tflite/import.cc
index 88028aa144f2dcf090153252157a1b9b46e13279..1692f721256090f5a03c4e46dabdbe65be497d16 100644
--- a/tensorflow/lite/toco/tflite/import.cc
+++ b/tensorflow/lite/toco/tflite/import.cc
@@ -165,21 +165,28 @@ void ImportOperators(
   }
 }
 
-void ImportIOTensors(const ::tflite::Model& input_model,
+void ImportIOTensors(const ModelFlags& model_flags,
+                     const ::tflite::Model& input_model,
                      const details::TensorsTable& tensors_table, Model* model) {
-  auto inputs = (*input_model.subgraphs())[0]->inputs();
-  if (inputs) {
-    for (int input : *inputs) {
-      const string& input_name = tensors_table.at(input);
-      model->flags.add_input_arrays()->set_name(input_name);
+  // Import from the first subgraph if input arrays have not been specified.
+  if (model_flags.input_arrays().empty()) {
+    auto inputs = (*input_model.subgraphs())[0]->inputs();
+    if (inputs) {
+      for (int input : *inputs) {
+        const string& input_name = tensors_table.at(input);
+        model->flags.add_input_arrays()->set_name(input_name);
+      }
     }
   }
 
-  auto outputs = (*input_model.subgraphs())[0]->outputs();
-  if (outputs) {
-    for (int output : *outputs) {
-      const string& output_name = tensors_table.at(output);
-      model->flags.add_output_arrays(output_name);
+  // Import from the first subgraph if output arrays have not been specified.
+  if (model_flags.output_arrays().empty()) {
+    auto outputs = (*input_model.subgraphs())[0]->outputs();
+    if (outputs) {
+      for (int output : *outputs) {
+        const string& output_name = tensors_table.at(output);
+        model->flags.add_output_arrays(output_name);
+      }
     }
   }
 }
@@ -219,7 +226,8 @@ std::unique_ptr<Model> Import(const ModelFlags& model_flags,
   ImportTensors(*input_model, model.get());
   ImportOperators(*input_model, ops_by_name, tensors_table, operators_table,
                   model.get());
-  ImportIOTensors(*input_model, tensors_table, model.get());
+
+  ImportIOTensors(model_flags, *input_model, tensors_table, model.get());
 
   UndoWeightsShuffling(model.get());
 
diff --git a/tensorflow/lite/toco/tflite/operator.cc b/tensorflow/lite/toco/tflite/operator.cc
index 1c3bee3c5a38f1f939fc68d04204c174a3fbe5eb..205af23da57b08c8c62367df1c154bea5e50cc57 100644
--- a/tensorflow/lite/toco/tflite/operator.cc
+++ b/tensorflow/lite/toco/tflite/operator.cc
@@ -1275,6 +1275,29 @@ class SquaredDifference
   int GetVersion(const Operator& op) const override { return 1; }
 };
 
+class MirrorPad
+    : public BuiltinOperator<MirrorPadOperator, ::tflite::MirrorPadOptions,
+                             ::tflite::BuiltinOptions_MirrorPadOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateMirrorPadOptions(
+        *builder, op.mode == MirrorPadMode::kReflect
+                      ? ::tflite::MirrorPadMode::MirrorPadMode_REFLECT
+                      : ::tflite::MirrorPadMode::MirrorPadMode_SYMMETRIC);
+  }
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->mode = options.mode() == ::tflite::MirrorPadMode::MirrorPadMode_REFLECT
+                   ? MirrorPadMode::kReflect
+                   : MirrorPadMode::kSymmetric;
+  }
+
+  int GetVersion(const Operator& op) const override { return 1; }
+};
+
 std::unique_ptr<flexbuffers::Builder> WriteFlexOpOptions(
     const string& tensorflow_node_def) {
   auto fbb = absl::make_unique<flexbuffers::Builder>();
@@ -1459,6 +1482,30 @@ class TensorFlowUnsupported : public BaseOperator {
   const bool enable_select_tf_ops_;
 };
 
+class Dequantize
+    : public BuiltinOperator<DequantizeOperator, ::tflite::DequantizeOptions,
+                             ::tflite::BuiltinOptions_DequantizeOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateDequantizeOptions(*builder);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {}
+
+  int GetVersion(const Operator& op) const override {
+    // TODO(suharshs): Dequantize now supports INT8 in addition to
+    // QUANTIZED_UINT8. When TOCO can create models with INT8, we need
+    // to find a way to see the type here and return version 2. Right now
+    // version 2 will only be added by post training quantization tools.
+    return 1;
+  }
+};
+
 namespace {
 // Build a vector containing all the known operators.
 std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList(
@@ -1581,6 +1628,8 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList(
   ops.push_back(MakeUnique<SquaredDifference>(
       ::tflite::BuiltinOperator_SQUARED_DIFFERENCE,
       OperatorType::kSquaredDifference));
+  ops.push_back(MakeUnique<MirrorPad>(::tflite::BuiltinOperator_MIRROR_PAD,
+                                      OperatorType::kMirrorPad));
 
   // Custom Operators.
   ops.push_back(
@@ -1667,6 +1716,8 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList(
       "ZEROS_LIKE", OperatorType::kZerosLike));
   ops.push_back(
       MakeUnique<SimpleOperator<AbsOperator>>("ABS", OperatorType::kAbs));
+  ops.push_back(
+      MakeUnique<SimpleOperator<FillOperator>>("FILL", OperatorType::kFill));
   return ops;
 }
 }  // namespace
diff --git a/tensorflow/lite/toco/tflite/operator_test.cc b/tensorflow/lite/toco/tflite/operator_test.cc
index 09e4435a1814fce3cb29473aeea39f6a86a15554..14ec89cd73f19fcd141640bda7bfba6435f59ac7 100644
--- a/tensorflow/lite/toco/tflite/operator_test.cc
+++ b/tensorflow/lite/toco/tflite/operator_test.cc
@@ -151,6 +151,7 @@ TEST_F(OperatorTest, SimpleOperators) {
                                                    OperatorType::kZerosLike);
   CheckSimpleOperator<FloorModOperator>("FLOOR_MOD", OperatorType::kFloorMod);
   CheckSimpleOperator<RangeOperator>("RANGE", OperatorType::kRange);
+  CheckSimpleOperator<FillOperator>("FILL", OperatorType::kFill);
 }
 
 TEST_F(OperatorTest, BuiltinAdd) {
@@ -615,6 +616,14 @@ TEST_F(OperatorTest, TestShouldExportAsFlexOp) {
   EXPECT_FALSE(ShouldExportAsFlexOp(true, "RFFT"));
 }
 
+TEST_F(OperatorTest, BuiltinMirrorPad) {
+  MirrorPadOperator op;
+  op.mode = MirrorPadMode::kReflect;
+  auto output_toco_op = SerializeAndDeserialize(
+      GetOperator("MIRROR_PAD", OperatorType::kMirrorPad), op);
+  EXPECT_EQ(op.mode, output_toco_op->mode);
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/toco/toco_tooling.cc b/tensorflow/lite/toco/toco_tooling.cc
index d8b111d03792721eb0d4d60f122cfe5c5cc7d3de..55a454e66de4d0afce18421450d875911bea01f4 100644
--- a/tensorflow/lite/toco/toco_tooling.cc
+++ b/tensorflow/lite/toco/toco_tooling.cc
@@ -309,6 +309,7 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
   // Fix any issues with IO edges. This must happen after any transform that
   // may modify the structure of the edges.
   FixEdgeArrays(model);
+  FixOperatorOrdering(model);
 
   if (quantize_output) {
     // If the user specified default min/max ranges we need to set all arrays
diff --git a/tensorflow/lite/toco/tooling_util.cc b/tensorflow/lite/toco/tooling_util.cc
index d0cc7994245bf17f1e1c0ad365586eaedadad7b9..af4cd386a209d82cb56a877410abe6fbdbf99c7b 100644
--- a/tensorflow/lite/toco/tooling_util.cc
+++ b/tensorflow/lite/toco/tooling_util.cc
@@ -415,6 +415,7 @@ const char* OperatorTypeName(OperatorType type) {
     HANDLE_OPERATORTYPENAME_CASE(ResizeNearestNeighbor)
     HANDLE_OPERATORTYPENAME_CASE(LeakyRelu)
     HANDLE_OPERATORTYPENAME_CASE(SquaredDifference)
+    HANDLE_OPERATORTYPENAME_CASE(MirrorPad)
     default:
       LOG(FATAL) << "Unhandled op type";
 #undef HANDLE_OPERATORTYPENAME_CASE
@@ -898,6 +899,9 @@ void CheckNonExistentIOArrays(const Model& model) {
         << "\" is not consumed by any op in this graph. " << general_comment;
   }
   for (const string& output_array : model.flags.output_arrays()) {
+    if (IsConstantParameterArray(model, output_array)) {
+      continue;  // It is OK to request that a constant be an output.
+    }
     QCHECK(GetOpWithOutput(model, output_array))
         << "Specified output array \"" << output_array
         << "\" is not produced by any op in this graph. " << general_comment;
diff --git a/tensorflow/lite/tools/BUILD b/tensorflow/lite/tools/BUILD
index 93725b5de473e43c4f7c398a2ac0bf1a52e0b3f2..1d141b5dd01a4a03c65d0c8a119ad62eea224d52 100644
--- a/tensorflow/lite/tools/BUILD
+++ b/tensorflow/lite/tools/BUILD
@@ -53,7 +53,6 @@ cc_test(
         "//tensorflow/lite:testdata/test_model_broken.bin",
     ],
     tags = [
-        "no_oss",
         "tflite_not_portable_android",
         "tflite_not_portable_ios",
     ],
@@ -80,7 +79,6 @@ cc_test(
     size = "small",
     srcs = ["verifier_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable",
     ],
     deps = [
diff --git a/tensorflow/lite/tools/benchmark/BUILD b/tensorflow/lite/tools/benchmark/BUILD
index 583046ad73d67ba9fba76570299fc1331aef07e4..bc47406cd92d406a0900743986ea67a4ba39240e 100644
--- a/tensorflow/lite/tools/benchmark/BUILD
+++ b/tensorflow/lite/tools/benchmark/BUILD
@@ -70,6 +70,7 @@ cc_test(
     deps = [
         ":benchmark_tflite_model_lib",
         ":command_line_flags",
+        "//tensorflow/lite:framework",
         "//tensorflow/lite/testing:util",
         "@com_google_googletest//:gtest",
     ],
diff --git a/tensorflow/lite/tools/benchmark/README.md b/tensorflow/lite/tools/benchmark/README.md
index a71a2fa1c0ec3c17b49c6acd62feacfb029c43d2..a4d9c879eb645019a7626502207e9a3f4e89b1c1 100644
--- a/tensorflow/lite/tools/benchmark/README.md
+++ b/tensorflow/lite/tools/benchmark/README.md
@@ -11,6 +11,11 @@ The instructions below are for running the binary on Desktop and Android,
 for iOS please use the
 [iOS benchmark app](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/tools/benchmark/ios).
 
+An experimental Android APK wrapper for the benchmark model utility offers more
+faithful execution behavior on Android (via a foreground Activity). It is
+located
+[here](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/tools/benchmark/android).
+
 ## Parameters
 
 The binary takes the following required parameters:
diff --git a/tensorflow/lite/tools/benchmark/android/AndroidManifest.xml b/tensorflow/lite/tools/benchmark/android/AndroidManifest.xml
new file mode 100644
index 0000000000000000000000000000000000000000..7cdca2885ddabe89bc846f3099dc055d471874b3
--- /dev/null
+++ b/tensorflow/lite/tools/benchmark/android/AndroidManifest.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<manifest xmlns:android="http://schemas.android.com/apk/res/android"
+    package="org.tensorflow.lite.benchmark">
+
+    <!-- Necessary for loading custom models from disk. -->
+    <uses-permission android:name="android.permission.READ_EXTERNAL_STORAGE"/>
+
+    <!-- Target SDK 21 (<23) to avoid the need for requesting storage
+         permissions. This APK will almost always be used from the command-line
+         anyway, and be expicitly installed by the developer. -->
+    <uses-sdk
+        android:minSdkVersion="21"
+        android:targetSdkVersion="21" />
+
+    <application>
+        <!-- This Activity runs the TensorFlow Lite benchmark at creation, using
+             a provided set of arguments, then immediately terminates. -->
+        <activity android:name="org.tensorflow.lite.benchmark.BenchmarkModelActivity"
+                  android:screenOrientation="portrait"
+                  android:label="TFLite Benchmark"
+                  android:theme="@android:style/Theme.NoDisplay"
+                  android:exported="true"
+                  android:noHistory="true" />
+    </application>
+
+</manifest>
diff --git a/tensorflow/lite/tools/benchmark/android/BUILD b/tensorflow/lite/tools/benchmark/android/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..a291effddc91d2abd153e9e8422ec7cbf5725c4b
--- /dev/null
+++ b/tensorflow/lite/tools/benchmark/android/BUILD
@@ -0,0 +1,44 @@
+# Description:
+#   BenchmarkModel Android harness for TensorFlow Lite benchmarks.
+package(default_visibility = ["//visibility:private"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow/lite:build_def.bzl", "tflite_jni_binary")
+load("@build_bazel_rules_android//android:rules.bzl", "android_binary")
+
+# See README.md for details about building and executing this benchmark.
+android_binary(
+    name = "benchmark_model",
+    srcs = glob([
+        "src/**/*.java",
+    ]),
+    custom_package = "org.tensorflow.lite.benchmark",
+    manifest = "AndroidManifest.xml",
+    # In some platforms we don't have an Android SDK/NDK and this target
+    # can't be built. We need to prevent the build system from trying to
+    # use the target in that case.
+    tags = ["manual"],
+    deps = [":tensorflowlite_benchmark_native"],
+)
+
+tflite_jni_binary(
+    name = "libtensorflowlite_benchmark.so",
+    srcs = glob([
+        "jni/**/*.cc",
+        "jni/**/*.h",
+    ]),
+    deps = [
+        "//tensorflow/lite/java/jni",
+        "//tensorflow/lite/tools/benchmark:benchmark_tflite_model_lib",
+        "//tensorflow/lite/tools/benchmark:logging",
+    ],
+)
+
+cc_library(
+    name = "tensorflowlite_benchmark_native",
+    srcs = ["libtensorflowlite_benchmark.so"],
+    visibility = ["//visibility:private"],
+)
diff --git a/tensorflow/lite/tools/benchmark/android/README.md b/tensorflow/lite/tools/benchmark/android/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..f5b67e3f79aa669c5424d46c23f053213ad3a101
--- /dev/null
+++ b/tensorflow/lite/tools/benchmark/android/README.md
@@ -0,0 +1,65 @@
+# TFLite Android Model Benchmark Tool
+
+## Description
+
+This Android benchmark app is a simple wrapper around the TensorFlow Lite
+[command-line benchmark utility](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/tools/benchmark).
+
+Pushing and executing binaries directly on Android is a valid approach to
+benchmarking, but it can result in subtle (but observable) differences in
+performance relative to execution within an actual Android app. In particular,
+Android's scheduler tailors behavior based on thread and process priorities,
+which differ between a foreground Activity/Application and a regular background
+binary executed via `adb shell ...`. This tailored behavior is most evident when
+enabling multi-threaded CPU execution with TensorFlow Lite.
+
+To that end, this app offers perhaps a more faithful view of runtime performance
+that developers can expected when deploying TensorFlow Lite with their
+application.
+
+## To build/install/run
+
+(0) Refer to
+https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android
+to edit the `WORKSPACE` to configure the android NDK/SDK.
+
+(1) Build for your specific platform, e.g.:
+
+```
+bazel build -c opt \
+  --config=android_arm64 \
+  --cxxopt='--std=c++11' \
+  tensorflow/lite/tools/benchmark/android:benchmark_model
+```
+
+(2) Connect your phone. Install the benchmark APK to your phone with adb:
+
+```
+adb install -r -d bazel-bin/tensorflow/lite/tools/benchmark/android/benchmark_model.apk
+```
+
+(3) Push the compute graph that you need to test.
+
+```
+adb push mobilenet_quant_v1_224.tflite /data/local/tmp
+```
+
+(4) Run the benchmark. Additional command-line flags are documented
+[here](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/tools/benchmark/README.md)
+and can be appended to the `args` string alongside the required `--graph` flag
+(note that all args must be nested in the single quoted string that follows the
+args key).
+
+```
+adb shell am start -S -n
+  org.tensorflow.lite.benchmark/org.tensorflow.lite.benchmark.BenchmarkModelActivity \
+  --es args '"--graph=/data/local/tmp/mobilenet_quant_v1_224.tflite --num_threads=4"'
+```
+
+(5) The results will be available in Android's logcat, e.g.:
+
+```
+adb logcat | grep "Average inference"
+
+... tflite  : Average inference timings in us: Warmup: 91471, Init: 4108, Inference: 80660.1
+```
diff --git a/tensorflow/lite/tools/benchmark/android/jni/benchmark_model_jni.cc b/tensorflow/lite/tools/benchmark/android/jni/benchmark_model_jni.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ee67bdafb0d3dd84ca1eaba8062e385887f3eb74
--- /dev/null
+++ b/tensorflow/lite/tools/benchmark/android/jni/benchmark_model_jni.cc
@@ -0,0 +1,92 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <jni.h>
+
+#include <sstream>
+#include <string>
+
+#include "tensorflow/lite/tools/benchmark/benchmark_tflite_model.h"
+#include "tensorflow/lite/tools/benchmark/logging.h"
+
+#ifdef __ANDROID__
+#include <android/log.h>
+#endif
+
+namespace tflite {
+namespace benchmark {
+namespace {
+
+class AndroidBenchmarkLoggingListener : public BenchmarkListener {
+  void OnBenchmarkEnd(const BenchmarkResults& results) override {
+    auto inference_us = results.inference_time_us();
+    auto init_us = results.startup_latency_us();
+    auto warmup_us = results.warmup_time_us();
+    std::stringstream results_output;
+    results_output << "Average inference timings in us: "
+                   << "Warmup: " << warmup_us.avg() << ", "
+                   << "Init: " << init_us << ", "
+                   << "Inference: " << inference_us.avg();
+#ifdef __ANDROID__
+    __android_log_print(ANDROID_LOG_ERROR, "tflite", "%s",
+                        results_output.str().c_str());
+#else
+    fprintf(stderr, "%s", results_output.str().c_str());
+#endif
+  }
+};
+
+void Run(int argc, char** argv) {
+  BenchmarkTfLiteModel benchmark;
+  AndroidBenchmarkLoggingListener listener;
+  benchmark.AddListener(&listener);
+  benchmark.Run(argc, argv);
+}
+
+}  // namespace
+}  // namespace benchmark
+}  // namespace tflite
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+JNIEXPORT void JNICALL
+Java_org_tensorflow_lite_benchmark_BenchmarkModel_nativeRun(JNIEnv* env,
+                                                            jclass clazz,
+                                                            jstring args_obj) {
+  const char* args_chars = env->GetStringUTFChars(args_obj, nullptr);
+
+  // Split the args string into individual arg tokens.
+  std::istringstream iss(args_chars);
+  std::vector<std::string> args_split{std::istream_iterator<std::string>(iss),
+                                      {}};
+
+  // Construct a fake argv command-line object for the benchmark.
+  std::vector<char*> argv;
+  std::string arg0 = "(BenchmarkModelAndroid)";
+  argv.push_back(const_cast<char*>(arg0.data()));
+  for (auto& arg : args_split) {
+    argv.push_back(const_cast<char*>(arg.data()));
+  }
+
+  tflite::benchmark::Run(static_cast<int>(argv.size()), argv.data());
+
+  env->ReleaseStringUTFChars(args_obj, args_chars);
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
diff --git a/tensorflow/core/platform/cuda_libdevice_path.cc b/tensorflow/lite/tools/benchmark/android/src/org/tensorflow/lite/benchmark/BenchmarkModel.java
similarity index 53%
rename from tensorflow/core/platform/cuda_libdevice_path.cc
rename to tensorflow/lite/tools/benchmark/android/src/org/tensorflow/lite/benchmark/BenchmarkModel.java
index 4d6532b983d52e7882ab540da31fb0b57183eb6f..a6cf8d78d5703300b3576ab3221326a2335e602e 100644
--- a/tensorflow/core/platform/cuda_libdevice_path.cc
+++ b/tensorflow/lite/tools/benchmark/android/src/org/tensorflow/lite/benchmark/BenchmarkModel.java
@@ -1,4 +1,4 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,14 +13,20 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/platform/cuda_libdevice_path.h"
+package org.tensorflow.lite.benchmark;
 
-#include "tensorflow/core/lib/io/path.h"
+/** Helper class for running a native TensorFlow Lite benchmark. */
+class BenchmarkModel {
+  static {
+    System.loadLibrary("tensorflowlite_benchmark");
+  }
 
-namespace tensorflow {
+  // Executes a standard TensorFlow Lite benchmark according to the provided args.
+  //
+  // Note that {@code args} will be split by the native execution code.
+  public static void run(String args) {
+    nativeRun(args);
+  }
 
-string LibdeviceRoot() {
-  return tensorflow::io::JoinPath(tensorflow::CudaRoot(), "nvvm/libdevice");
+  private static native void nativeRun(String args);
 }
-
-}  // namespace tensorflow
diff --git a/tensorflow/lite/tools/benchmark/android/src/org/tensorflow/lite/benchmark/BenchmarkModelActivity.java b/tensorflow/lite/tools/benchmark/android/src/org/tensorflow/lite/benchmark/BenchmarkModelActivity.java
new file mode 100644
index 0000000000000000000000000000000000000000..12410adf3d6687ffa514c6ba21981fb19286fe62
--- /dev/null
+++ b/tensorflow/lite/tools/benchmark/android/src/org/tensorflow/lite/benchmark/BenchmarkModelActivity.java
@@ -0,0 +1,44 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.lite.benchmark;
+
+import android.app.Activity;
+import android.content.Intent;
+import android.os.Bundle;
+import android.util.Log;
+
+/** Main {@code Activity} class for the benchmark app. */
+public class BenchmarkModelActivity extends Activity {
+
+  private static final String TAG = "tflite_BenchmarkModelActivity";
+
+  private static final String ARGS_INTENT_KEY_0 = "args";
+  private static final String ARGS_INTENT_KEY_1 = "--args";
+
+  @Override
+  public void onCreate(Bundle savedInstanceState) {
+    super.onCreate(savedInstanceState);
+
+    Intent intent = getIntent();
+    Bundle bundle = intent.getExtras();
+    String args = bundle.getString(ARGS_INTENT_KEY_0, bundle.getString(ARGS_INTENT_KEY_1));
+    Log.i(TAG, "Running TensorFlow Lite benchmark with args: " + args);
+
+    BenchmarkModel.run(args);
+
+    finish();
+  }
+}
diff --git a/tensorflow/lite/tools/benchmark/benchmark_test.cc b/tensorflow/lite/tools/benchmark/benchmark_test.cc
index 8191fbcd7356ced765e088d9609118489226dcc3..a4f830122f65bcacb0eae4783998cf8bb5611fb9 100644
--- a/tensorflow/lite/tools/benchmark/benchmark_test.cc
+++ b/tensorflow/lite/tools/benchmark/benchmark_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "tensorflow/lite/interpreter.h"
 #include "tensorflow/lite/testing/util.h"
 #include "tensorflow/lite/tools/benchmark/benchmark_tflite_model.h"
 #include "tensorflow/lite/tools/benchmark/command_line_flags.h"
@@ -47,6 +48,15 @@ BenchmarkParams CreateParams() {
   return params;
 }
 
+class TestBenchmark : public BenchmarkTfLiteModel {
+ public:
+  explicit TestBenchmark(BenchmarkParams params)
+      : BenchmarkTfLiteModel(std::move(params)) {}
+  const tflite::Interpreter* GetInterpreter() { return interpreter.get(); }
+
+  void Prepare() { PrepareInputsAndOutputs(); }
+};
+
 TEST(BenchmarkTest, DoesntCrash) {
   ASSERT_THAT(g_model_path, testing::NotNull());
 
@@ -54,6 +64,37 @@ TEST(BenchmarkTest, DoesntCrash) {
   benchmark.Run();
 }
 
+TEST(BenchmarkTest, ParametersArePopulatedWhenInputShapeIsNotSpecified) {
+  ASSERT_THAT(g_model_path, testing::NotNull());
+
+  TestBenchmark benchmark(CreateParams());
+  benchmark.Init();
+  benchmark.Prepare();
+
+  auto interpreter = benchmark.GetInterpreter();
+  auto inputs = interpreter->inputs();
+  ASSERT_GE(inputs.size(), 1);
+  auto input_tensor = interpreter->tensor(inputs[0]);
+
+  std::vector<char> input_bytes;
+  input_bytes.reserve(input_tensor->bytes);
+  for (size_t i = 0; i < input_tensor->bytes; i++) {
+    input_bytes.push_back(input_tensor->data.raw_const[i]);
+  }
+  benchmark.Prepare();
+
+  // Expect data is not the same.
+  EXPECT_EQ(input_bytes.size(), input_tensor->bytes);
+  bool is_same = true;
+  for (size_t i = 0; i < input_tensor->bytes; i++) {
+    if (input_bytes[i] != input_tensor->data.raw_const[i]) {
+      is_same = false;
+      break;
+    }
+  }
+  EXPECT_FALSE(is_same);
+}
+
 }  // namespace
 }  // namespace benchmark
 }  // namespace tflite
diff --git a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc
index 7768b75f769c12d6603154a35fe650b550542faf..32cf4e4292a57ebb73abfaeb3d73d5c1e5717f43 100644
--- a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc
+++ b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc
@@ -30,7 +30,7 @@ limitations under the License.
 #include "tensorflow/lite/tools/benchmark/logging.h"
 
 #ifdef GEMMLOWP_PROFILING
-#include "third_party/gemmlowp/profiling/profiler.h"
+#include "gemmlowp/profiling/profiler.h"
 #endif
 
 #ifdef TFLITE_CUSTOM_OPS_HEADER
@@ -181,6 +181,15 @@ bool PopulateInputLayerInfo(
   return true;
 }
 
+std::vector<int> TfLiteIntArrayToVector(const TfLiteIntArray* int_array) {
+  std::vector<int> values;
+  values.reserve(int_array->size);
+  for (size_t i = 0; i < int_array->size; i++) {
+    values.push_back(int_array->data[i]);
+  }
+  return values;
+}
+
 }  // namespace
 
 BenchmarkParams BenchmarkTfLiteModel::DefaultParams() {
@@ -250,12 +259,10 @@ uint64_t BenchmarkTfLiteModel::ComputeInputBytes() {
 void BenchmarkTfLiteModel::PrepareInputsAndOutputs() {
   auto interpreter_inputs = interpreter->inputs();
   // Set the values of the input tensors.
-  for (int j = 0; j < inputs.size(); ++j) {
-    const InputLayerInfo& input = inputs[j];
+  for (int j = 0; j < interpreter_inputs.size(); ++j) {
     int i = interpreter_inputs[j];
     TfLiteTensor* t = interpreter->tensor(i);
-    std::vector<int> sizes = input.shape;
-
+    std::vector<int> sizes = TfLiteIntArrayToVector(t->dims);
     // TODO(ahentz): below we ignore the O-th dimension (number of batches).
     if (t->type == kTfLiteFloat32) {
       FillRandomValue<float>(
@@ -274,12 +281,17 @@ void BenchmarkTfLiteModel::PrepareInputsAndOutputs() {
           interpreter->typed_tensor<uint8_t>(i),
           std::vector<int>(sizes.begin() + 1, sizes.end()),
           []() { return static_cast<uint8_t>(rand()) % 255; });
+    } else if (t->type == kTfLiteInt8) {
+      FillRandomValue<int8_t>(
+          interpreter->typed_tensor<int8_t>(i),
+          std::vector<int>(sizes.begin() + 1, sizes.end()),
+          []() { return static_cast<int8_t>(rand()) % 255 - 127; });
     } else if (t->type == kTfLiteString) {
       tflite::DynamicBuffer buffer;
       FillRandomString(&buffer, sizes, []() {
         return "we're have some friends over saturday to hang out in the yard";
       });
-      buffer.WriteToTensor(interpreter->tensor(i));
+      buffer.WriteToTensor(interpreter->tensor(i), /*new_shape=*/nullptr);
     } else {
       TFLITE_LOG(FATAL) << "Don't know how to populate tensor " << t->name
                         << " of type " << t->type;
diff --git a/tensorflow/lite/tools/benchmark/ios/README.md b/tensorflow/lite/tools/benchmark/ios/README.md
index 3dc29d9b94119564344e2060665daf585a3acd2d..fed9e7ea7e8633e00413118fa3e9e4f12d5188a4 100644
--- a/tensorflow/lite/tools/benchmark/ios/README.md
+++ b/tensorflow/lite/tools/benchmark/ios/README.md
@@ -41,3 +41,14 @@ resources that need to be copied.
 
 - Now try running the app. The app has a single button that runs the benchmark
   on the model and displays results in a text view below.
+
+## Profiling
+
+If you want detailed profiling, use the following command:
+
+```bash
+tensorflow/lite/build_ios_universal_lib.sh -p
+```
+
+Then following the same steps above and run the benchmark app. You will see the
+detailed profiling results in the outputs.
diff --git a/tensorflow/lite/tools/make/Makefile b/tensorflow/lite/tools/make/Makefile
index 363a069d5e2e521b88c8053be1ce8bf48b476561..994f660dba7742de162525dcf6a8c6a288ee71c6 100644
--- a/tensorflow/lite/tools/make/Makefile
+++ b/tensorflow/lite/tools/make/Makefile
@@ -52,6 +52,7 @@ LIBS := \
 # generate things like the protobuf compiler that require that), so all of
 # these settings are for the target compiler.
 CXXFLAGS := -O3 -DNDEBUG
+CXXFLAGS += $(EXTRA_CXXFLAGS)
 CCFLAGS := ${CXXFLAGS}
 CXXFLAGS += --std=c++11
 CFLAGS :=
@@ -116,8 +117,8 @@ tensorflow/lite/mmap_allocation.cc \
 tensorflow/lite/nnapi_delegate.cc
 else
 CORE_CC_EXCLUDE_SRCS += \
-tensorflow/contrib/lite/mmap_allocation_disabled.cc \
-tensorflow/contrib/lite/nnapi_delegate_disabled.cc
+tensorflow/lite/mmap_allocation_disabled.cc \
+tensorflow/lite/nnapi_delegate_disabled.cc
 endif
 # Filter out all the excluded files.
 TF_LITE_CC_SRCS := $(filter-out $(CORE_CC_EXCLUDE_SRCS), $(CORE_CC_ALL_SRCS))
diff --git a/tensorflow/lite/tools/make/build_ios_universal_lib.sh b/tensorflow/lite/tools/make/build_ios_universal_lib.sh
index 6e0d262827f0944918580d073f082d20e0e1803b..8b617ef5937a062261ee23bed3cfd1f40e6a3995 100755
--- a/tensorflow/lite/tools/make/build_ios_universal_lib.sh
+++ b/tensorflow/lite/tools/make/build_ios_universal_lib.sh
@@ -19,20 +19,36 @@ set -e
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 cd "$SCRIPT_DIR/../../../.."
 
+usage() {
+  echo "Usage: $(basename "$0") [-a]"
+  echo "-a [build_arch] build for specified arch comma separate for multiple archs (eg: x86_64 arm64)"
+  echo "  default is [x86_64 armv7 armv7s arm64]"
+  echo "-p enable profiling"
+  exit 1
+}
+
+profiling_args=""
+BUILD_ARCHS="x86_64 armv7 armv7s arm64"
+while getopts "a:p" opt_name; do
+  case "$opt_name" in
+    a) BUILD_ARCHS="${OPTARG}";;
+    p) profiling_args='-DGEMMLOWP_PROFILING,-DTFLITE_PROFILING_ENABLED';;
+    *) usage;;
+  esac
+done
+shift $(($OPTIND - 1))
+
 # Build library for supported architectures and packs them in a fat binary.
 make_library() {
-    for arch in x86_64 armv7 armv7s arm64
+    LIBS=""
+    for arch in $BUILD_ARCHS
     do
         make -f tensorflow/lite/tools/make/Makefile TARGET=ios TARGET_ARCH=${arch} \
-        -j 8
+            EXTRA_CXXFLAGS=$profiling_args -j 8
+        LIBS="${LIBS} tensorflow/lite/tools/make/gen/ios_${arch}/lib/${1}"
     done
     mkdir -p tensorflow/lite/tools/make/gen/lib
-    lipo \
-    tensorflow/lite/tools/make/gen/ios_x86_64/lib/${1} \
-    tensorflow/lite/tools/make/gen/ios_armv7/lib/${1} \
-    tensorflow/lite/tools/make/gen/ios_armv7s/lib/${1} \
-    tensorflow/lite/tools/make/gen/ios_arm64/lib/${1} \
-    -create \
+    lipo $LIBS -create \
     -output tensorflow/lite/tools/make/gen/lib/${1}
 }
 
diff --git a/tensorflow/lite/tutorials/mnist_tflite.py b/tensorflow/lite/tutorials/mnist_tflite.py
index 002365717fce9e98dad6bacaaff6cdc4e6f5280a..6cc5846163594d74cfcbd95ab99ddb6a7b67bdf1 100644
--- a/tensorflow/lite/tutorials/mnist_tflite.py
+++ b/tensorflow/lite/tutorials/mnist_tflite.py
@@ -34,8 +34,8 @@ flags = flags.FLAGS
 def test_image_generator():
   # Generates an iterator over images
   with tf.Session() as sess:
-    input_data = dataset.test(
-        flags.data_dir).make_one_shot_iterator().get_next()
+    input_data = tf.compat.v1.data.make_one_shot_iterator(dataset.test(
+        flags.data_dir)).get_next()
     try:
       while True:
         yield sess.run(input_data)
diff --git a/tensorflow/lite/tutorials/post_training_quant.ipynb b/tensorflow/lite/tutorials/post_training_quant.ipynb
index 3ff145d9ce9291ad4fbc2f49b423d78632019059..394ab0760b5672978e0638c0ff01a8f00442302c 100644
--- a/tensorflow/lite/tutorials/post_training_quant.ipynb
+++ b/tensorflow/lite/tutorials/post_training_quant.ipynb
@@ -235,9 +235,9 @@
         "id": "AT8BgkKmljOy"
       },
       "source": [
-        "Using the python `TocoConverter`, the saved model can be converted into a TFLite model.\n",
+        "Using the python `TFLiteConverter`, the saved model can be converted into a TFLite model.\n",
         "\n",
-        "First load the model using the `TocoConverter`:"
+        "First load the model using the `TFLiteConverter`:"
       ]
     },
     {
@@ -252,7 +252,7 @@
       "source": [
         "import tensorflow as tf\n",
         "tf.enable_eager_execution()\n",
-        "converter = tf.lite.TocoConverter.from_saved_model(saved_model_dir)\n",
+        "converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)\n",
         "tflite_model = converter.convert()"
       ]
     },
@@ -648,7 +648,7 @@
         "graph_def_file = pathlib.Path(archive_path).parent/\"resnet_v2_101_299_frozen.pb\"\n",
         "input_arrays = [\"input\"] \n",
         "output_arrays = [\"output\"]\n",
-        "converter = tf.lite.TocoConverter.from_frozen_graph(\n",
+        "converter = tf.lite.TFLiteConverter.from_frozen_graph(\n",
         "  str(graph_def_file), input_arrays, output_arrays, input_shapes={\"input\":[1,299,299,3]})\n",
         "converter.post_training_quantize = True\n",
         "resnet_tflite_file = graph_def_file.parent/\"resnet_v2_101_quantized.tflite\"\n",
diff --git a/tensorflow/opensource_only.files b/tensorflow/opensource_only.files
new file mode 100644
index 0000000000000000000000000000000000000000..88800c295124cbb7e1f292c6970b81e3b0594ab3
--- /dev/null
+++ b/tensorflow/opensource_only.files
@@ -0,0 +1,246 @@
+tensorflow/contrib/tpu/profiler/pip_package/BUILD
+tensorflow/contrib/tpu/profiler/pip_package/setup.py
+tensorflow/contrib/tpu/profiler/pip_package/README
+tensorflow/contrib/tpu/profiler/pip_package/build_pip_package.sh
+tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py
+tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/__init__.py
+tensorflow/contrib/mpi/BUILD
+tensorflow/tools/ci_build/remote/BUILD
+tensorflow/tools/pip_package/README
+tensorflow/tools/pip_package/MANIFEST.in
+tensorflow/tools/pip_package/simple_console.py
+tensorflow/tools/pip_package/build_pip_package.sh
+tensorflow/tools/pip_package/check_load_py_test.py
+tensorflow/tools/pip_package/pip_smoke_test.py
+tensorflow/tools/pip_package/simple_console_for_windows.py
+tensorflow/tools/pip_package/setup.py
+tensorflow/tools/pip_package/BUILD
+tensorflow/tools/lib_package/concat_licenses.sh
+tensorflow/tools/lib_package/libtensorflow_test.c
+tensorflow/tools/lib_package/LibTensorFlowTest.java
+tensorflow/tools/lib_package/BUILD
+tensorflow/tools/lib_package/libtensorflow_test.sh
+tensorflow/tools/lib_package/README.md
+tensorflow/tools/lib_package/libtensorflow_java_test.sh
+tensorflow/tools/def_file_filter/def_file_filter_configure.bzl
+tensorflow/tools/def_file_filter/BUILD
+tensorflow/tools/def_file_filter/BUILD.tpl
+tensorflow/tools/def_file_filter/def_file_filter.py.tpl
+tensorflow/third_party/mkl/MKL_LICENSE
+tensorflow/third_party/mkl/LICENSE
+tensorflow/third_party/mkl/BUILD
+tensorflow/third_party/mkl/mkl.BUILD
+tensorflow/third_party/mkl/build_defs.bzl
+tensorflow/third_party/backports_weakref.BUILD
+tensorflow/third_party/toolchains/clang6/BUILD
+tensorflow/third_party/toolchains/clang6/README.md
+tensorflow/third_party/toolchains/clang6/repo.bzl
+tensorflow/third_party/toolchains/clang6/CROSSTOOL.tpl
+tensorflow/third_party/toolchains/clang6/clang.BUILD
+tensorflow/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/BUILD
+tensorflow/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/build_defs.bzl
+tensorflow/third_party/toolchains/preconfig/ubuntu14.04/py3/BUILD
+tensorflow/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/BUILD
+tensorflow/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc/BUILD
+tensorflow/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/build_defs.bzl
+tensorflow/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/BUILD
+tensorflow/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/BUILD
+tensorflow/third_party/toolchains/preconfig/ubuntu14.04/cuda9.0-cudnn7/cuda/build_defs.bzl
+tensorflow/third_party/toolchains/preconfig/ubuntu14.04/cuda9.0-cudnn7/cuda/BUILD
+tensorflow/third_party/toolchains/preconfig/ubuntu14.04/nccl2/BUILD
+tensorflow/third_party/toolchains/preconfig/generate/workspace.bzl
+tensorflow/third_party/toolchains/preconfig/generate/containers.bzl
+tensorflow/third_party/toolchains/preconfig/generate/generate.bzl
+tensorflow/third_party/toolchains/preconfig/generate/archives.bzl
+tensorflow/third_party/toolchains/preconfig/generate/BUILD
+tensorflow/third_party/toolchains/preconfig/win_1803/bazel_018/BUILD
+tensorflow/third_party/toolchains/preconfig/win_1803/bazel_018/dummy_toolchain.bzl
+tensorflow/third_party/toolchains/preconfig/win_1803/py36/BUILD
+tensorflow/third_party/toolchains/preconfig/win_1803/BUILD
+tensorflow/third_party/toolchains/gpus/cuda/build_defs.bzl
+tensorflow/third_party/toolchains/gpus/cuda/BUILD
+tensorflow/third_party/toolchains/gpus/cuda/cuda/cuda_config.h
+tensorflow/third_party/toolchains/gpus/crosstool/BUILD
+tensorflow/third_party/toolchains/gpus/crosstool/CROSSTOOL
+tensorflow/third_party/toolchains/gpus/py/BUILD
+tensorflow/third_party/toolchains/cpus/arm/arm_compiler_configure.bzl
+tensorflow/third_party/toolchains/cpus/arm/CROSSTOOL.tpl
+tensorflow/third_party/toolchains/cpus/arm/BUILD
+tensorflow/third_party/toolchains/cpus/py3/BUILD
+tensorflow/third_party/toolchains/cpus/py/BUILD
+tensorflow/third_party/toolchains/BUILD
+tensorflow/third_party/nccl/remote.BUILD.tpl
+tensorflow/third_party/nccl/archive.BUILD
+tensorflow/third_party/nccl/LICENSE
+tensorflow/third_party/nccl/system.BUILD.tpl
+tensorflow/third_party/nccl/nccl_configure.bzl
+tensorflow/third_party/nccl/build_defs.bzl.tpl
+tensorflow/third_party/nccl/BUILD
+tensorflow/third_party/gpus/BUILD
+tensorflow/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl
+tensorflow/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl
+tensorflow/third_party/gpus/crosstool/CROSSTOOL.tpl
+tensorflow/third_party/gpus/crosstool/CROSSTOOL_hipcc.tpl
+tensorflow/third_party/gpus/crosstool/LICENSE
+tensorflow/third_party/gpus/crosstool/remote.BUILD.tpl
+tensorflow/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl
+tensorflow/third_party/gpus/crosstool/BUILD.tpl
+tensorflow/third_party/gpus/crosstool/BUILD
+tensorflow/third_party/gpus/cuda/LICENSE
+tensorflow/third_party/gpus/cuda/BUILD.tpl
+tensorflow/third_party/gpus/cuda/BUILD.windows.tpl
+tensorflow/third_party/gpus/cuda/cuda_config.h.tpl
+tensorflow/third_party/gpus/cuda/remote.BUILD.tpl
+tensorflow/third_party/gpus/cuda/BUILD
+tensorflow/third_party/gpus/cuda/build_defs.bzl.tpl
+tensorflow/third_party/gpus/rocm/rocm_config.h.tpl
+tensorflow/third_party/gpus/rocm/BUILD
+tensorflow/third_party/gpus/rocm/BUILD.tpl
+tensorflow/third_party/gpus/rocm/build_defs.bzl.tpl
+tensorflow/third_party/gpus/cuda_configure.bzl
+tensorflow/third_party/gpus/rocm_configure.bzl
+tensorflow/third_party/snappy.BUILD
+tensorflow/third_party/cython.BUILD
+tensorflow/third_party/farmhash.BUILD
+tensorflow/third_party/eigen3/Eigen/Cholesky
+tensorflow/third_party/eigen3/Eigen/QR
+tensorflow/third_party/eigen3/Eigen/LU
+tensorflow/third_party/eigen3/Eigen/Core
+tensorflow/third_party/eigen3/Eigen/SVD
+tensorflow/third_party/eigen3/Eigen/Eigenvalues
+tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/FixedPoint
+tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/Tensor
+tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h
+tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX512.h
+tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h
+tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProduct.h
+tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/FixedPointTypes.h
+tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatVecProduct.h
+tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductNEON.h
+tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX2.h
+tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductAVX2.h
+tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/ThreadPool
+tensorflow/third_party/eigen3/unsupported/Eigen/SpecialFunctions
+tensorflow/third_party/eigen3/unsupported/Eigen/MatrixFunctions
+tensorflow/third_party/eigen3/LICENSE
+tensorflow/third_party/eigen3/BUILD
+tensorflow/third_party/systemlibs/build_defs.bzl.tpl
+tensorflow/third_party/systemlibs/absl_py.BUILD
+tensorflow/third_party/systemlibs/curl.BUILD
+tensorflow/third_party/systemlibs/termcolor.BUILD
+tensorflow/third_party/systemlibs/absl_py.absl.flags.BUILD
+tensorflow/third_party/systemlibs/grpc.BUILD
+tensorflow/third_party/systemlibs/swig.BUILD
+tensorflow/third_party/systemlibs/protobuf.bzl
+tensorflow/third_party/systemlibs/protobuf.BUILD
+tensorflow/third_party/systemlibs/BUILD
+tensorflow/third_party/systemlibs/google_cloud_cpp.BUILD
+tensorflow/third_party/systemlibs/astor.BUILD
+tensorflow/third_party/systemlibs/six.BUILD
+tensorflow/third_party/systemlibs/absl_py.absl.testing.BUILD
+tensorflow/third_party/systemlibs/boringssl.BUILD
+tensorflow/third_party/systemlibs/nsync.BUILD
+tensorflow/third_party/systemlibs/google_cloud_cpp.google.cloud.bigtable.BUILD
+tensorflow/third_party/systemlibs/gif.BUILD
+tensorflow/third_party/systemlibs/pcre.BUILD
+tensorflow/third_party/systemlibs/BUILD.tpl
+tensorflow/third_party/systemlibs/snappy.BUILD
+tensorflow/third_party/systemlibs/gast.BUILD
+tensorflow/third_party/systemlibs/cython.BUILD
+tensorflow/third_party/systemlibs/double_conversion.BUILD
+tensorflow/third_party/systemlibs/zlib.BUILD
+tensorflow/third_party/systemlibs/jsoncpp.BUILD
+tensorflow/third_party/systemlibs/re2.BUILD
+tensorflow/third_party/systemlibs/lmdb.BUILD
+tensorflow/third_party/systemlibs/googleapis.BUILD
+tensorflow/third_party/systemlibs/png.BUILD
+tensorflow/third_party/systemlibs/syslibs_configure.bzl
+tensorflow/third_party/systemlibs/sqlite.BUILD
+tensorflow/third_party/python_runtime/BUILD
+tensorflow/third_party/sycl/crosstool/BUILD
+tensorflow/third_party/ngraph/LICENSE
+tensorflow/third_party/ngraph/tbb.BUILD
+tensorflow/third_party/ngraph/BUILD
+tensorflow/third_party/ngraph/ngraph.BUILD
+tensorflow/third_party/ngraph/build_defs.bzl
+tensorflow/third_party/ngraph/NGRAPH_LICENSE
+tensorflow/third_party/ngraph/ngraph_tf.BUILD
+tensorflow/third_party/ngraph/nlohmann_json.BUILD
+tensorflow/third_party/clang_toolchain/download_clang.bzl
+tensorflow/third_party/clang_toolchain/BUILD
+tensorflow/third_party/clang_toolchain/cc_configure_clang.bzl
+tensorflow/third_party/gast.BUILD
+tensorflow/third_party/llvm/BUILD
+tensorflow/third_party/llvm/expand_cmake_vars.py
+tensorflow/third_party/llvm/llvm.autogenerated.BUILD
+tensorflow/third_party/llvm/llvm.bzl
+tensorflow/third_party/icu/udata.patch
+tensorflow/third_party/fft2d/BUILD
+tensorflow/third_party/fft2d/fft.h
+tensorflow/third_party/fft2d/LICENSE
+tensorflow/third_party/fft2d/fft2d.BUILD
+tensorflow/third_party/boringssl/BUILD
+tensorflow/third_party/mpi/.gitignore
+tensorflow/third_party/mpi/BUILD
+tensorflow/third_party/tensorrt/LICENSE
+tensorflow/third_party/tensorrt/BUILD
+tensorflow/third_party/tensorrt/build_defs.bzl.tpl
+tensorflow/third_party/tensorrt/BUILD.tpl
+tensorflow/third_party/tensorrt/tensorrt_configure.bzl
+tensorflow/third_party/tensorrt/remote.BUILD.tpl
+tensorflow/third_party/kafka/config.patch
+tensorflow/third_party/kafka/BUILD
+tensorflow/third_party/android/BUILD
+tensorflow/third_party/android/android.bzl.tpl
+tensorflow/third_party/android/android_configure.bzl
+tensorflow/third_party/android/android_configure.BUILD.tpl
+tensorflow/third_party/tflite_smartreply.BUILD
+tensorflow/third_party/mkl_dnn/LICENSE
+tensorflow/third_party/mkl_dnn/mkldnn.BUILD
+tensorflow/third_party/pcre.BUILD
+tensorflow/third_party/linenoise.BUILD
+tensorflow/third_party/sqlite.BUILD
+tensorflow/third_party/common.bzl
+tensorflow/third_party/com_google_absl.BUILD
+tensorflow/third_party/pprof.BUILD
+tensorflow/third_party/BUILD
+tensorflow/third_party/tflite_mobilenet_quant.BUILD
+tensorflow/third_party/lmdb.BUILD
+tensorflow/third_party/git/BUILD.tpl
+tensorflow/third_party/git/BUILD
+tensorflow/third_party/git/git_configure.bzl
+tensorflow/third_party/protobuf/BUILD
+tensorflow/third_party/tflite_mobilenet.BUILD
+tensorflow/third_party/py/BUILD
+tensorflow/third_party/py/BUILD.tpl
+tensorflow/third_party/py/remote.BUILD.tpl
+tensorflow/third_party/py/numpy/BUILD
+tensorflow/third_party/py/python_configure.bzl
+tensorflow/third_party/termcolor.BUILD
+tensorflow/third_party/png_fix_rpi.patch
+tensorflow/third_party/swig.BUILD
+tensorflow/third_party/astor.BUILD
+tensorflow/third_party/grpc/BUILD
+tensorflow/third_party/curl.BUILD
+tensorflow/third_party/arm_neon_2_x86_sse.BUILD
+tensorflow/third_party/png.BUILD
+tensorflow/third_party/googleapis.BUILD
+tensorflow/third_party/mpi_collectives/BUILD
+tensorflow/third_party/nanopb.BUILD
+tensorflow/third_party/gif.BUILD
+tensorflow/third_party/double_conversion.BUILD
+tensorflow/third_party/six.BUILD
+tensorflow/third_party/tflite_mobilenet_float.BUILD
+tensorflow/third_party/repo.bzl
+tensorflow/third_party/codegen.BUILD
+tensorflow/third_party/cub.BUILD
+tensorflow/third_party/jsoncpp.BUILD
+tensorflow/third_party/tflite_ovic_testdata.BUILD
+tensorflow/third_party/libxsmm.BUILD
+tensorflow/third_party/zlib.BUILD
+tensorflow/third_party/eigen.BUILD
+tensorflow/stream_executor/BUILD
+tensorflow/api_template_v1.__init__.py
+tensorflow/compat_template_v1.__init__.py
+tensorflow/api_template.__init__.py
+tensorflow/__init__.py
\ No newline at end of file
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 92aac5fc0ae7f4b89027dc62f48f82326879777b..d851c229ac90c60d8c411bb8c3af05020cc45bbd 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -6,6 +6,7 @@
 
 visibility = [
     "//engedu/ml/tf_from_scratch:__pkg__",
+    "//third_party/cloud_tpu/convergence_tools:__subpackages__",
     "//tensorflow:internal",
     "//tensorflow/lite/toco/python:__pkg__",
     "//tensorflow_models:__subpackages__",
@@ -102,6 +103,7 @@ py_library(
         ":framework_for_generated_wrappers",
         ":functional_ops",
         ":gradient_checker",
+        ":gradient_checker_v2",
         ":graph_util",
         ":histogram_ops",
         ":image_ops",
@@ -114,6 +116,7 @@ py_library(
         ":manip_ops",
         ":math_ops",
         ":metrics",
+        ":mode_keys",
         ":nccl_ops",
         ":nn",
         ":ops",
@@ -144,6 +147,7 @@ py_library(
         "//tensorflow/lite/python:lite",
         "//tensorflow/python/compat",
         "//tensorflow/python/data",
+        "//tensorflow/python/distribute",
         "//tensorflow/python/distribute:estimator_training",
         "//tensorflow/python/eager:def_function",
         "//tensorflow/python/feature_column:feature_column_py",
@@ -524,6 +528,17 @@ py_test(
     ],
 )
 
+py_test(
+    name = "dispatch_test",
+    srcs = ["util/dispatch_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":client_testlib",
+        ":platform",
+        ":util",
+    ],
+)
+
 py_test(
     name = "keyword_args_test",
     srcs = ["util/keyword_args_test.py"],
@@ -1086,6 +1101,7 @@ py_library(
         ":cond_v2",
         ":framework_test_lib",
         ":gradient_checker",
+        ":gradient_checker_v2",
         ":platform_test",
         ":util",
         ":while_v2",
@@ -2181,7 +2197,6 @@ py_library(
         ":control_flow_util_v2",
         ":dtypes",
         ":framework_ops",
-        ":framework_test_lib",
         ":function_def_to_graph",
         ":functional_ops_gen",
         ":gradients_impl",
@@ -2857,6 +2872,7 @@ py_test(
         ":framework_test_lib",
         ":sparse_ops",
         ":sparse_tensor",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -3018,6 +3034,7 @@ py_library(
         "//tensorflow/python/eager:wrap_function",
         "//tensorflow/python/ops/distributions",
         "//tensorflow/python/ops/linalg",
+        "//tensorflow/python/ops/ragged",
     ],
 )
 
@@ -3178,6 +3195,19 @@ py_library(
     ],
 )
 
+py_library(
+    name = "gradient_checker_v2",
+    srcs = ["ops/gradient_checker_v2.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":array_ops",
+        ":framework_for_generated_wrappers",
+        ":gradients",
+        ":platform",
+        "//third_party/py/numpy",
+    ],
+)
+
 # This target is deprecated.
 py_library(
     name = "ops",
@@ -3209,6 +3239,7 @@ cuda_py_test(
     srcs = ["ops/control_flow_ops_test.py"],
     additional_deps = [
         ":array_ops",
+        ":cond_v2",
         ":control_flow_ops",
         ":embedding_ops",
         ":framework_for_generated_wrappers",
@@ -3224,6 +3255,8 @@ cuda_py_test(
         ":util",
         ":variable_scope",
         ":variables",
+        ":while_v2",
+        "//tensorflow/python/eager:def_function",
     ],
 )
 
@@ -3243,6 +3276,22 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "gradient_checker_v2_test",
+    size = "medium",
+    srcs = ["ops/gradient_checker_v2_test.py"],
+    additional_deps = [
+        ":array_ops",
+        ":client_testlib",
+        ":framework_for_generated_wrappers",
+        ":math_ops",
+        ":nn_grad",
+        ":nn_ops",
+        ":platform",
+        "//third_party/py/numpy",
+    ],
+)
+
 cuda_py_test(
     name = "gradients_test",
     size = "medium",
@@ -3351,6 +3400,9 @@ cuda_py_test(
         ":client_testlib",
         ":framework_for_generated_wrappers",
         ":math_ops",
+        "//tensorflow/python/eager:backprop",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:execution_callbacks",
         "//third_party/py/numpy",
     ],
     tags = ["no_windows_gpu"],
@@ -3466,13 +3518,13 @@ py_library(
         exclude = [
             "**/*test*",
             "training/checkpointable/**/*.py",
+            "training/saving/**/*.py",
             # The following targets have their own build rules (same name as the
             # file):
             "training/basic_session_run_hooks.py",
             "training/checkpoint_management.py",
             "training/distribute.py",
             "training/distribution_strategy_context.py",
-            "training/saveable_object.py",
             "training/saver.py",
             "training/session_run_hook.py",
             "training/training_util.py",
@@ -3547,12 +3599,6 @@ py_library(
     ],
 )
 
-py_library(
-    name = "saveable_object",
-    srcs = ["training/saveable_object.py"],
-    srcs_version = "PY2AND3",
-)
-
 py_library(
     name = "checkpoint_management",
     srcs = ["training/checkpoint_management.py"],
@@ -3606,7 +3652,6 @@ py_library(
         ":platform",
         ":pywrap_tensorflow",
         ":resource_variable_ops",
-        ":saveable_object",
         ":session",
         ":state_ops",
         ":string_ops",
@@ -3616,6 +3661,8 @@ py_library(
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/training/checkpointable:base",
+        "//tensorflow/python/training/saving:saveable_object",
+        "//tensorflow/python/training/saving:saveable_object_util",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -3789,6 +3836,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":util",
+        "//tensorflow/python:framework_ops",
         "//tensorflow/python/eager:context",
         "@six_archive//:six",
     ],
@@ -4136,11 +4184,24 @@ genrule(
 
 # Get the import library of  _pywrap_tensorflow_internal.dll
 filegroup(
-    name = "pywrap_tensorflow_import_lib_file",
+    name = "get_pywrap_tensorflow_import_lib_file",
     srcs = [":_pywrap_tensorflow_internal.so"],
     output_group = "interface_library",
 )
 
+# Rename the import library for _pywrap_tensorflow_internal.pyd to _pywrap_tensorflow_internal.lib
+# (It was _pywrap_tensorflow_internal.so.if.lib).
+genrule(
+    name = "pywrap_tensorflow_import_lib_file",
+    srcs = [":get_pywrap_tensorflow_import_lib_file"],
+    outs = ["_pywrap_tensorflow_internal.lib"],
+    cmd = select({
+        "//tensorflow:windows": "cp -f $< $@",
+        "//conditions:default": "touch $@",  # Just a placeholder for Unix platforms
+    }),
+    visibility = ["//visibility:public"],
+)
+
 # Create a cc_import rule for the import library of _pywrap_tensorflow_internal.dll
 # so that custom ops' dynamic libraries can link against it.
 cc_import(
@@ -5261,6 +5322,20 @@ py_test(
     ],
 )
 
+py_test(
+    name = "quantized_ops_test",
+    size = "small",
+    srcs = ["ops/quantized_ops_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_windows"],
+    deps = [
+        ":array_ops",
+        ":client_testlib",
+        ":framework_for_generated_wrappers",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_test(
     name = "quantized_conv_ops_test",
     size = "small",
@@ -5842,6 +5917,30 @@ py_binary(
     ],
 )
 
+py_library(
+    name = "mode_keys",
+    srcs = [
+        "training/mode_keys.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":util",
+    ],
+)
+
+py_test(
+    name = "mode_keys_test",
+    size = "small",
+    srcs = [
+        "training/mode_keys_test.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":client_testlib",
+        ":mode_keys",
+    ],
+)
+
 pyx_library(
     name = "framework_fast_tensor_util",
     srcs = ["framework/fast_tensor_util.pyx"],
diff --git a/tensorflow/python/__init__.py b/tensorflow/python/__init__.py
index 547043030b169280e8add09b3e71c06208e564fd..b2cc63bd1320700801d4aaf0a9b33c8da7821412 100644
--- a/tensorflow/python/__init__.py
+++ b/tensorflow/python/__init__.py
@@ -78,6 +78,7 @@ from tensorflow.python.ops import initializers_ns as initializers
 
 # Bring in subpackages.
 from tensorflow.python import data
+from tensorflow.python import distribute
 from tensorflow.python import keras
 from tensorflow.python.feature_column import feature_column_lib as feature_column
 from tensorflow.python.layers import layers
@@ -144,21 +145,21 @@ nn.rnn_cell = rnn_cell
 
 # Export protos
 # pylint: disable=undefined-variable
-tf_export('AttrValue')(AttrValue)
-tf_export('ConfigProto')(ConfigProto)
+tf_export(v1=['AttrValue'])(AttrValue)
+tf_export(v1=['ConfigProto'])(ConfigProto)
 tf_export('Event', 'summary.Event')(Event)
-tf_export('GPUOptions')(GPUOptions)
-tf_export('GraphDef')(GraphDef)
-tf_export('GraphOptions')(GraphOptions)
-tf_export('HistogramProto')(HistogramProto)
-tf_export('LogMessage')(LogMessage)
-tf_export('MetaGraphDef')(MetaGraphDef)
-tf_export('NameAttrList')(NameAttrList)
-tf_export('NodeDef')(NodeDef)
-tf_export('OptimizerOptions')(OptimizerOptions)
-tf_export('RunMetadata')(RunMetadata)
-tf_export('RunOptions')(RunOptions)
-tf_export('SessionLog', 'summary.SessionLog')(SessionLog)
+tf_export(v1=['GPUOptions'])(GPUOptions)
+tf_export(v1=['GraphDef'])(GraphDef)
+tf_export(v1=['GraphOptions'])(GraphOptions)
+tf_export(v1=['HistogramProto'])(HistogramProto)
+tf_export(v1=['LogMessage'])(LogMessage)
+tf_export(v1=['MetaGraphDef'])(MetaGraphDef)
+tf_export(v1=['NameAttrList'])(NameAttrList)
+tf_export(v1=['NodeDef'])(NodeDef)
+tf_export(v1=['OptimizerOptions'])(OptimizerOptions)
+tf_export(v1=['RunMetadata'])(RunMetadata)
+tf_export(v1=['RunOptions'])(RunOptions)
+tf_export(v1=['SessionLog', 'summary.SessionLog'])(SessionLog)
 tf_export('Summary', 'summary.Summary')(Summary)
 tf_export('summary.SummaryDescription')(SummaryDescription)
 tf_export('SummaryMetadata')(SummaryMetadata)
diff --git a/tensorflow/python/autograph/__init__.py b/tensorflow/python/autograph/__init__.py
index 7252e0d9bf92e430e224fe00d9a9a5ff4254b46f..6faeb016072479ab7e860b6520515edb4c88fab9 100644
--- a/tensorflow/python/autograph/__init__.py
+++ b/tensorflow/python/autograph/__init__.py
@@ -12,10 +12,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Autograph compiles Python code into equivalent TensorFlow code.
+"""Conversion of plain Python into TensorFlow graph code.
 
-Equivalent here means that they have the same effect when executed.
+NOTE: In TensorFlow 2.0, AutoGraph is automatically applied when using
+`tf.function`. This module contains lower-level APIs for advanced use.
+
+For more information, see the
+[AutoGraph guide](https://www.tensorflow.org/guide/autograph).
+
+By equivalent graph code we mean code that generates a TensorFlow graph when
+run. The generated graph has the same effects as the original code when executed
+(for example with `tf.function` or `tf.compat.v1.Session.run`). In other words,
+using AutoGraph can be thought of as running Python in TensorFlow.
 """
+# TODO(b/119833526): Link to the new tf.function + autograph tutorial.
 
 from __future__ import absolute_import
 from __future__ import division
@@ -43,6 +53,7 @@ from tensorflow.python.autograph.lang.special_functions import tensor_list
 from tensorflow.python.autograph.pyct.transformer import AutographParseError
 from tensorflow.python.util.all_util import remove_undocumented
 
+# TODO(mdan): Revisit this list once we finalize the generated code mechanism.
 _allowed_symbols = [
     # Main API
     'ConversionOptions',
diff --git a/tensorflow/python/autograph/converters/call_trees.py b/tensorflow/python/autograph/converters/call_trees.py
index 4a8af42e6970d0cc8f37baebf1fbd73989cb672f..d4eb17e976f6fdf321903a878326e668aeb6ea49 100644
--- a/tensorflow/python/autograph/converters/call_trees.py
+++ b/tensorflow/python/autograph/converters/call_trees.py
@@ -22,7 +22,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from collections import namedtuple
+import collections
 
 import gast
 
@@ -35,7 +35,7 @@ from tensorflow.python.autograph.pyct import templates
 from tensorflow.python.util import tf_inspect
 
 
-class FunctionInfo(namedtuple('FunctionInfo', ('dtype',))):
+class FunctionInfo(collections.namedtuple('FunctionInfo', ('dtype',))):
   pass
 
 
@@ -116,12 +116,19 @@ class CallTreeTransformer(converter.Base):
   def _function_is_compilable(self, target_entity):
     """Determines whether an entity can be compiled at all."""
     # TODO(mdan): Expand.
+
     if target_entity.__module__ is None:
       # Functions like builtins and NumPy don't expose a module.
       # Those in general should not be compiled.
       return False
+
     if inspect_utils.isbuiltin(target_entity):
       return False
+
+    if inspect_utils.isnamedtuple(target_entity):
+      # namedtuple doesn't expose its source code, making it uncompilable.
+      return False
+
     return True
 
   def _should_compile(self, node, fqn):
@@ -176,7 +183,7 @@ class CallTreeTransformer(converter.Base):
       for dec in target_node.decorator_list:
         decorator_fn = self._resolve_decorator_name(dec)
         if (decorator_fn is not None and
-            decorator_fn in self.ctx.program.options.strip_decorators):
+            self.ctx.program.options.should_strip(decorator_fn)):
           return False
 
     return True
@@ -254,7 +261,7 @@ class CallTreeTransformer(converter.Base):
         func=func,
         owner=owner,
         options=self.ctx.program.options.to_ast(
-            self.ctx.info.namespace,
+            self.ctx,
             internal_convert_user_code=self.ctx.program.options.recursive),
         args=node.args)
     # TODO(mdan): Improve the template mechanism to better support this.
@@ -301,7 +308,13 @@ class CallTreeTransformer(converter.Base):
         # safe for graph mode.
         return node
 
+      elif inspect_utils.isnamedtuple(target_entity):
+        # Although not compilable, we assume they are safe for graph mode.
+        node = self.generic_visit(node)
+        return node
+
       else:
+        # TODO(mdan): Instert dynamic conversion here instead.
         raise NotImplementedError(
             'py_func with return values (unknown function)')
     else:
@@ -310,12 +323,12 @@ class CallTreeTransformer(converter.Base):
 
       # 1. super() calls - these are preserved. The class conversion mechanism
       # will ensure that they return the correct value.
-      if ast_util.matches(node, 'super(_)'):
+      if ast_util.matches(node, parser.parse_expression('super(_)')):
         return node
 
       # 2. super().method calls - these are preserved as well, when the
       # conversion processes the entire class.
-      if (ast_util.matches(node, 'super(_)._(_)') and
+      if (ast_util.matches(node, parser.parse_expression('super(_)._(_)')) and
           self.ctx.info.owner_type is not None):
         return node
 
diff --git a/tensorflow/python/autograph/converters/call_trees_test.py b/tensorflow/python/autograph/converters/call_trees_test.py
index b2a740020ce78596c01dcb0455bbeb6df44e0ff1..454d75d755c7273d11e1f89e4138cd997eb6e49a 100644
--- a/tensorflow/python/autograph/converters/call_trees_test.py
+++ b/tensorflow/python/autograph/converters/call_trees_test.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
+
 import numpy as np
 
 from tensorflow.python.autograph.converters import call_trees
@@ -99,6 +101,20 @@ class CallTreesTest(converter_testing.TestCase):
     with self.compiled(node, ns) as result:
       self.assertEquals(1, result.test_fn(1))
 
+  def test_known_called_namedtuple(self):
+
+    nt = collections.namedtuple('TestNamedTuple', ['a'])
+
+    def test_fn(a):
+      return nt(a)
+
+    ns = {'nt': nt}
+    node, ctx = self.prepare(test_fn, ns)
+    node = call_trees.transform(node, ctx)
+
+    with self.compiled(node, ns) as result:
+      self.assertEquals(nt(1), result.test_fn(1))
+
   def test_py_func_known_function(self):
 
     def test_fn():
diff --git a/tensorflow/python/autograph/converters/continue_statements.py b/tensorflow/python/autograph/converters/continue_statements.py
index 584cdc1efd4b2e4327be34e1d9d51de3635fccd5..05e19e59fc6701db618e925e1d305f299b270e33 100644
--- a/tensorflow/python/autograph/converters/continue_statements.py
+++ b/tensorflow/python/autograph/converters/continue_statements.py
@@ -24,94 +24,93 @@ from tensorflow.python.autograph.pyct import templates
 from tensorflow.python.autograph.pyct.static_analysis.annos import NodeAnno
 
 
-# Tags for local state.
-CONTROL_VAR_NAME = 'control_var_name'
-CONTINUE_USED = 'continue_used'
-GUARD_CREATED = 'guard_created'
-CREATE_GUARD_NEXT = 'create_guard_next'
+class _Continue(object):
+
+  def __init__(self):
+    self.used = False
+    self.control_var_name = None
+    self.create_guard = False
+    self.guard_created = False
+
+  def __repr__(self):
+    return 'used: %s, var: %s' % (self.used, self.control_var_name)
 
 
 class ContinueCanonicalizationTransformer(converter.Base):
   """Canonicalizes continue statements into additional conditionals."""
 
   def visit_Continue(self, node):
-    self.set_local(CONTINUE_USED, True)
+    self.state[_Continue].used = True
     template = """
-      var_name = tf.constant(True)
+      var_name = True
     """
     return templates.replace(
-        template, var_name=self.get_local(CONTROL_VAR_NAME))
+        template, var_name=self.state[_Continue].control_var_name)
 
   def _postprocess_statement(self, node):
     # Example of how the state machine below works:
     #
-    #   1| stmt           # State: CONTINUE_USED = False
+    #   1| stmt           # State: Continue_.used = False
     #    |                # Action: none
     #   2| if cond:
-    #   3|   continue     # State: CONTINUE_USED = True,
-    #    |                #        GUARD_CREATED = False,
-    #    |                #        CREATE_GUARD_NEXT = False
-    #    |                # Action: set CREATE_GUARD_NEXT = True
-    #   4| stmt           # State: CONTINUE_USED = True,
-    #    |                #        GUARD_CREATED = False,
-    #    |                #        CREATE_GUARD_NEXT = True
+    #   3|   continue     # State: Continue_.used = True,
+    #    |                #        Continue_.guard_created = False,
+    #    |                #        Continue_.create_guard = False
+    #    |                # Action: Continue_.create_guard = True
+    #   4| stmt           # State: Continue_.used = True,
+    #    |                #        Continue_.guard_created = False,
+    #    |                #        Continue_.create_guard = True
     #    |                # Action: create `if not continue_used`,
-    #    |                #         set GUARD_CREATED = True
-    #   5| stmt           # State: CONTINUE_USED = True, GUARD_CREATED = True
+    #    |                #         set Continue_.guard_created = True
+    #   5| stmt           # State: Continue_.used = True,
+    #    |                #        Continue_.guard_created = True
     #    |                # Action: none (will be wrapped under previously
     #    |                #         created if node)
 
-    if self.get_local(CONTINUE_USED, False):
-      if self.get_local(GUARD_CREATED, False):
+    if self.state[_Continue].used:
+      if self.state[_Continue].guard_created:
         return node, None
 
-      elif not self.get_local(CREATE_GUARD_NEXT, False):
-        self.set_local(CREATE_GUARD_NEXT, True)
+      elif not self.state[_Continue].create_guard:
+        self.state[_Continue].create_guard = True
         return node, None
 
       else:
-        self.set_local(GUARD_CREATED, True)
+        self.state[_Continue].guard_created = True
         template = """
           if not var_name:
             original_node
         """
         cond, = templates.replace(
             template,
-            var_name=self.get_local(CONTROL_VAR_NAME),
+            var_name=self.state[_Continue].control_var_name,
             original_node=node)
         return cond, cond.body
     return node, None
 
   def _visit_loop_body(self, node, nodes):
-    self.enter_local_scope()
+    self.state[_Continue].enter()
     scope = anno.getanno(node, NodeAnno.BODY_SCOPE)
     continue_var = self.ctx.namer.new_symbol('continue_', scope.referenced)
-    self.set_local(CONTROL_VAR_NAME, continue_var)
+    self.state[_Continue].control_var_name = continue_var
 
     nodes = self.visit_block(nodes, after_visit=self._postprocess_statement)
 
-    if self.get_local(CONTINUE_USED, False):
+    if self.state[_Continue].used:
       template = """
-        var_name = tf.constant(False)
+        var_name = False
       """
       control_var_init = templates.replace(template, var_name=continue_var)
       nodes = control_var_init + nodes
 
-    self.exit_local_scope()
+    self.state[_Continue].exit()
     return nodes
 
-  def _visit_non_loop_body(self, nodes):
-    self.enter_local_scope(inherit=(CONTROL_VAR_NAME,))
-    nodes = self.visit_block(nodes, after_visit=self._postprocess_statement)
-    continue_used = self.get_local(CONTINUE_USED, False)
-    self.exit_local_scope(keep=(CONTINUE_USED,))
-    return nodes, continue_used
-
   def visit_While(self, node):
     node.test = self.visit(node.test)
     node.body = self._visit_loop_body(node, node.body)
     # A continue in the else clause applies to the containing scope.
-    node.orelse, _ = self._visit_non_loop_body(node.orelse)
+    node.orelse = self.visit_block(node.orelse)
     return node
 
   def visit_For(self, node):
@@ -119,21 +118,11 @@ class ContinueCanonicalizationTransformer(converter.Base):
     node.iter = self.generic_visit(node.iter)
     node.body = self._visit_loop_body(node, node.body)
     # A continue in the else clause applies to the containing scope.
-    node.orelse, _ = self._visit_non_loop_body(node.orelse)
-    return node
-
-  def visit_If(self, node):
-    node.test = self.generic_visit(node.test)
-    node.body, continue_used_body = self._visit_non_loop_body(node.body)
-    node.orelse, continue_used_orelse = self._visit_non_loop_body(node.orelse)
-    self.set_local(CONTINUE_USED, continue_used_body or continue_used_orelse)
-    return node
-
-  def visit_With(self, node):
-    node.items = self.visit_block(node.items)
-    node.body, _ = self._visit_non_loop_body(node.body)
+    node.orelse = self.visit_block(node.orelse)
     return node
 
 
 def transform(node, ctx):
-  return ContinueCanonicalizationTransformer(ctx).visit(node)
+  transformer = ContinueCanonicalizationTransformer(ctx)
+  node = transformer.visit(node)
+  return node
diff --git a/tensorflow/python/autograph/converters/control_flow.py b/tensorflow/python/autograph/converters/control_flow.py
index 5853e044c532d24c3327f06da790f85fddcd5700..a39a0b0cdb16280312b830c9c9bbe78c06ab77b0 100644
--- a/tensorflow/python/autograph/converters/control_flow.py
+++ b/tensorflow/python/autograph/converters/control_flow.py
@@ -49,7 +49,13 @@ class ControlFlowTransformer(converter.Base):
 
   def _create_cond_branch(self, body_name, aliased_orig_names,
                           aliased_new_names, body, returns):
-    if len(returns) == 1:
+    if not returns:
+      # TODO(b/110167197): Replace with a plain return.
+      template = """
+        return 1
+      """
+      return_stmt = templates.replace(template)
+    elif len(returns) == 1:
       template = """
         return retval
       """
@@ -106,14 +112,49 @@ class ControlFlowTransformer(converter.Base):
       return 'no variables'
     return ', '.join(map(str, symbol_set))
 
-  def visit_If(self, node):
-    node = self.generic_visit(node)
+  def _determine_aliased_symbols(self, scope, node_defined_in, block):
+    if block:
+      block_live_in = set(anno.getanno(block[0], anno.Static.LIVE_VARS_IN))
+    else:
+      block_live_in = set()
+
+    # For the purpose of aliasing, composite symbols with live owners are live
+    # as well. Otherwise this would leak tensors from the conditional's body.
+    #
+    # For example:
+    #
+    #   obj = some_obj
+    #   if cond:
+    #     obj.a = val
+    #
+    # Thanslating to the code below would be incorrect:
+    #
+    #   def true_fn():
+    #     obj.a = val()  # Wrong! leaks ops owned by true_fn
+    #     return obj.a
+    for s in scope.modified:
+      if s.is_composite():
+        live_parents = block_live_in & s.owner_set
+        if live_parents:
+          block_live_in.add(s)
+    return scope.modified & node_defined_in & block_live_in
 
+  def visit_If(self, node):
     body_scope = anno.getanno(node, annos.NodeAnno.BODY_SCOPE)
     orelse_scope = anno.getanno(node, annos.NodeAnno.ORELSE_SCOPE)
     defined_in = anno.getanno(node, anno.Static.DEFINED_VARS_IN)
     live_out = anno.getanno(node, anno.Static.LIVE_VARS_OUT)
 
+    # Note: this information needs to be extracted before the body conversion
+    # that happens in the call to generic_visit below, because the conversion
+    # generates nodes that lack static analysis annotations.
+    need_alias_in_body = self._determine_aliased_symbols(
+        body_scope, defined_in, node.body)
+    need_alias_in_orelse = self._determine_aliased_symbols(
+        orelse_scope, defined_in, node.orelse)
+
+    node = self.generic_visit(node)
+
     modified_in_cond = body_scope.modified | orelse_scope.modified
     returned_from_cond = set()
     for s in modified_in_cond:
@@ -125,9 +166,6 @@ class ControlFlowTransformer(converter.Base):
         if live_out & s.owner_set:
           returned_from_cond.add(s)
 
-    need_alias_in_body = body_scope.modified & defined_in
-    need_alias_in_orelse = orelse_scope.modified & defined_in
-
     created_in_body = body_scope.modified & returned_from_cond - defined_in
     created_in_orelse = orelse_scope.modified & returned_from_cond - defined_in
 
@@ -188,7 +226,7 @@ class ControlFlowTransformer(converter.Base):
       # branch functions will return a dummy value that ensures cond
       # actually has some return value as well.
       cond_results = None
-      # TODO(mdan): This doesn't belong here; it's specific to the operator.
+      # TODO(mdan): Replace with None once side_effect_guards is retired.
       returned_from_body = (templates.replace_as_expression(
           'ag__.match_staging_level(1, cond_var_name)',
           cond_var_name=cond_var_name),)
@@ -246,14 +284,6 @@ class ControlFlowTransformer(converter.Base):
           ' these symbols before the loop'.format(
               self._fmt_symbols(live_defs_in_loop)))
 
-    if not loop_state:
-      # TODO(mdan): Implement this properly.
-      # We need to check whether any variable created inside the body scope
-      # is used before being modified outside the scope. This should be done
-      # during activity analysis, and in general should cover the case where
-      # variables may not be initialized.
-      raise ValueError('cannot convert loop: no outputs')
-
     return loop_state, reserved_symbols
 
   def _state_constructs(self, loop_state, reserved_symbols):
@@ -305,26 +335,44 @@ class ControlFlowTransformer(converter.Base):
     node_body = ast_util.rename_symbols(node.body, ssf_map)
     test = ast_util.rename_symbols(node.test, ssf_map)
 
-    template = """
-      def test_name(state_ssf):
-        return test
-      def body_name(state_ssf):
-        body
-        return state_ssf,
-      state_ast_tuple = ag__.while_stmt(
-          test_name, body_name, (state,), (extra_deps,))
-    """
-    node = templates.replace(
-        template,
-        state=loop_state,
-        state_ssf=state_ssf,
-        state_ast_tuple=state_ast_tuple,
-        test_name=self.ctx.namer.new_symbol('loop_test', reserved_symbols),
-        test=test,
-        body_name=self.ctx.namer.new_symbol('loop_body', reserved_symbols),
-        body=node_body,
-        extra_deps=tuple(s.ast() for s in cond_closure),
-    )
+    if loop_state:
+      template = """
+        def test_name(state_ssf):
+          return test
+        def body_name(state_ssf):
+          body
+          return state_ssf,
+        state_ast_tuple = ag__.while_stmt(
+            test_name, body_name, (state,), (extra_deps,))
+      """
+      node = templates.replace(
+          template,
+          state=loop_state,
+          state_ssf=state_ssf,
+          state_ast_tuple=state_ast_tuple,
+          test_name=self.ctx.namer.new_symbol('loop_test', reserved_symbols),
+          test=test,
+          body_name=self.ctx.namer.new_symbol('loop_body', reserved_symbols),
+          body=node_body,
+          extra_deps=tuple(s.ast() for s in cond_closure),
+      )
+    else:
+      template = """
+        def test_name():
+          return test
+        def body_name():
+          body
+          return ()
+        ag__.while_stmt(test_name, body_name, (), (extra_deps,))
+      """
+      node = templates.replace(
+          template,
+          test_name=self.ctx.namer.new_symbol('loop_test', reserved_symbols),
+          test=test,
+          body_name=self.ctx.namer.new_symbol('loop_body', reserved_symbols),
+          body=node_body,
+          extra_deps=tuple(s.ast() for s in cond_closure),
+      )
 
     return node
 
@@ -341,29 +389,50 @@ class ControlFlowTransformer(converter.Base):
     else:
       extra_test = parser.parse_expression('True')
 
-    template = """
-      def extra_test_name(state_ssf):
-        return extra_test_expr
-      def body_name(loop_vars, state_ssf):
-        # Workaround for PEP-3113
-        iterate = loop_vars
-        body
-        return state_ssf,
-      state_ast_tuple = ag__.for_stmt(
-          iter_, extra_test_name, body_name, (state,))
-    """
-    node = templates.replace(
-        template,
-        state=loop_state,
-        state_ssf=state_ssf,
-        state_ast_tuple=state_ast_tuple,
-        iter_=node.iter,
-        iterate=node.target,
-        extra_test_name=self.ctx.namer.new_symbol('extra_test',
-                                                  reserved_symbols),
-        extra_test_expr=extra_test,
-        body_name=self.ctx.namer.new_symbol('loop_body', reserved_symbols),
-        body=node_body)
+    if loop_state:
+      template = """
+        def extra_test_name(state_ssf):
+          return extra_test_expr
+        def body_name(loop_vars, state_ssf):
+          # Workaround for PEP-3113
+          iterate = loop_vars
+          body
+          return state_ssf,
+        state_ast_tuple = ag__.for_stmt(
+            iter_, extra_test_name, body_name, (state,))
+      """
+      node = templates.replace(
+          template,
+          state=loop_state,
+          state_ssf=state_ssf,
+          state_ast_tuple=state_ast_tuple,
+          iter_=node.iter,
+          iterate=node.target,
+          extra_test_name=self.ctx.namer.new_symbol('extra_test',
+                                                    reserved_symbols),
+          extra_test_expr=extra_test,
+          body_name=self.ctx.namer.new_symbol('loop_body', reserved_symbols),
+          body=node_body)
+    else:
+      template = """
+        def extra_test_name():
+          return extra_test_expr
+        def body_name(loop_vars):
+          # Workaround for PEP-3113
+          iterate = loop_vars
+          body
+          return ()
+        ag__.for_stmt(iter_, extra_test_name, body_name, ())
+      """
+      node = templates.replace(
+          template,
+          iter_=node.iter,
+          iterate=node.target,
+          extra_test_name=self.ctx.namer.new_symbol('extra_test',
+                                                    reserved_symbols),
+          extra_test_expr=extra_test,
+          body_name=self.ctx.namer.new_symbol('loop_body', reserved_symbols),
+          body=node_body)
 
     return node
 
diff --git a/tensorflow/python/autograph/converters/side_effect_guards.py b/tensorflow/python/autograph/converters/side_effect_guards.py
index 98e29ec8e1b27061371f0328402d8cb45a0f69e7..d7c0951fcc68318ff82e4873deef8707e7018f73 100644
--- a/tensorflow/python/autograph/converters/side_effect_guards.py
+++ b/tensorflow/python/autograph/converters/side_effect_guards.py
@@ -85,11 +85,26 @@ class SideEffectGuardTransformer(converter.Base):
         new_alias_map.update(alias_map)
         alias_map = new_alias_map
         current_dest = new_dest
-    if reindent_requested and not current_dest:
-      # TODO(mdan): There may still be something that could be done.
-      raise ValueError('Unable to insert statement into the computation flow: '
-                       'it is not followed by any computation which '
-                       'the statement could gate.')
+
+    if reindent_requested:
+      no_controls_to_gate = False
+      if not current_dest:
+        no_controls_to_gate = True
+      if len(current_dest) == 1:
+        if ast_util.matches(current_dest[0], 'return'):
+          no_controls_to_gate = True
+        if ast_util.matches(current_dest[0], 'return ()'):
+          no_controls_to_gate = True
+        if ast_util.matches(current_dest[0], 'return []'):
+          no_controls_to_gate = True
+        if ast_util.matches(current_dest[0], 'return {}'):
+          no_controls_to_gate = True
+      if no_controls_to_gate:
+        # TODO(mdan): There may still be something that could be done.
+        raise ValueError(
+            'Unable to insert statement into the computation flow: it is not'
+            ' followed by any computation which the statement could gate.')
+
     return new_nodes
 
   def visit_FunctionDef(self, node):
diff --git a/tensorflow/python/autograph/core/converter.py b/tensorflow/python/autograph/core/converter.py
index e88c4674ee24867dec32d62589afdc2e48dfcace..4543b113983f56e8a987a4dbce3bba9db47da517 100644
--- a/tensorflow/python/autograph/core/converter.py
+++ b/tensorflow/python/autograph/core/converter.py
@@ -63,8 +63,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from enum import Enum
-from enum import IntEnum
+import weakref
+
+import enum
 
 from tensorflow.python.autograph.core import config
 from tensorflow.python.autograph.core import naming
@@ -83,6 +84,7 @@ from tensorflow.python.autograph.pyct.static_analysis import liveness
 from tensorflow.python.autograph.pyct.static_analysis import reaching_definitions
 from tensorflow.python.autograph.pyct.static_analysis import type_info
 from tensorflow.python.eager import function
+from tensorflow.python.util.tf_export import tf_export
 
 # TODO(mdan): These contexts can be refactored into first class objects.
 # For example, we could define Program and Entity abstractions that hold on
@@ -91,37 +93,42 @@ from tensorflow.python.eager import function
 # TODO(mdan): Add a test specific to this converter.
 
 
-class Verbosity(IntEnum):
-  """Different levels of verbosity for printing errors.
+@tf_export('autograph.experimental.Verbosity')
+class Verbosity(enum.IntEnum):
+  """Represents conversion verbosity levels.
 
   Attributes:
-   * BRIEF: No logging, minimal error messages.
-   * VERBOSE: Detailed logging of generated code, detailed error messages.
+    BRIEF: No logging, minimal error messages.
+    VERBOSE: Detailed logging of generated code, detailed error messages.
   """
+
   BRIEF = 0
   VERBOSE = 1
 
 
-class Feature(Enum):
-  """Constants to use when selecting AutoGraph features."""
+@tf_export('autograph.experimental.Feature')
+class Feature(enum.Enum):
+  """Represents conversion options that can be toggled on or off.
 
-  ALL = 'Enable all features.'
+  Attributes:
+    ALL: Enable all features.
+    AUTO_CONTROL_DEPS: Insert of control dependencies in the generated code.
+    DECORATORS: Allow decorators in local functions. Note that special
+      decorators, like `tf.function`, are allowed regardless of this toggle.
+    ERROR_REWRITING: Rewrite errors that occur in the generated code to
+      indicate the source code to which the failing code corresponds.
+    LISTS: Convert list idioms, like initializers, slices, append, etc.
+    NAME_SCOPES: Insert name scopes that name ops according to context, like the
+      function they were defined in.
+  """
 
-  AUTO_CONTROL_DEPS = (
-      'Insert of control dependencies in the generated code.')
-  DECORATORS = (
-      'Allow decorators in local functions. Note that special decorators,'
-      ' like ag.convert or tf.function are allowed regardless of this toggle.')
-  ERROR_REWRITING = (
-      'Rewrite errors that occur in the generated code to indicate the source'
-      ' code to which the failing code corresponds.')
-  LISTS = 'Convert list idioms, like initializers, slices, append, etc.'
-  NAME_SCOPES = (
-      'Insert name scopes that name ops according to context, like the'
-      ' function they were defined in.')
+  ALL = 'ALL'
 
-  def __repr__(self):
-    return self.name
+  AUTO_CONTROL_DEPS = 'AUTO_CONTROL_DEPS'
+  DECORATORS = 'DECORATORS'
+  ERROR_REWRITING = 'ERROR_REWRITING'
+  LISTS = 'LISTS'
+  NAME_SCOPES = 'NAME_SCOPES'
 
 
 class ConversionOptions(object):
@@ -157,7 +164,9 @@ class ConversionOptions(object):
     # TODO(mdan): Rename to conversion_recursion_depth?
     self.internal_convert_user_code = internal_convert_user_code
 
-    if isinstance(optional_features, Feature):
+    if optional_features is None:
+      optional_features = ()
+    elif isinstance(optional_features, Feature):
       optional_features = (optional_features,)
     optional_features = frozenset(optional_features)
     self.optional_features = optional_features
@@ -168,19 +177,28 @@ class ConversionOptions(object):
     # TODO(mdan): Revert if function.defun becomes a public symbol.
     return self._strip_decorators + (function.defun,)
 
+  def should_strip(self, decorator):
+    for blacklisted in self.strip_decorators:
+      if blacklisted is decorator:
+        return True
+      if isinstance(blacklisted, weakref.ref):
+        blacklisted_deref = blacklisted()
+        if (blacklisted_deref is not None and blacklisted_deref is decorator):
+          return True
+    return False
+
   def uses(self, feature):
     return (Feature.ALL in self.optional_features or
             feature in self.optional_features)
 
-  def to_ast(self, namespace, internal_convert_user_code=None):
+  def to_ast(self, ctx, internal_convert_user_code=None):
     """Returns a representation of this object as an AST node.
 
     The AST node encodes a constructor that would create an object with the
     same contents.
 
     Args:
-      namespace: Dict[str, Any], the namespace to use when serializing values to
-        names.
+      ctx: EntityContext, the entity with which this AST needs to be consistent.
       internal_convert_user_code: Optional[bool], allows ovrriding the
         corresponding value.
 
@@ -198,10 +216,11 @@ class ConversionOptions(object):
     """
 
     def as_qualified_name(o):
-      name = inspect_utils.getqualifiedname(namespace, o)
+      name = inspect_utils.getqualifiedname(ctx.info.namespace, o, max_depth=1)
       if not name:
-        raise ValueError('Could not locate entity {} in {}'.format(
-            o, namespace))
+        # TODO(mdan): This needs to account for the symbols defined locally.
+        name = ctx.namer.new_symbol(o.__name__, ())
+        ctx.program.add_symbol(name, weakref.ref(o))
       return name
 
     def list_of_names(values):
@@ -272,6 +291,7 @@ class ProgramContext(object):
     self.dependency_cache = {}
     self.additional_imports = set()
     self.name_map = {}
+    self.additional_symbols = {}
 
   @property
   def required_imports(self):
@@ -314,6 +334,11 @@ class ProgramContext(object):
       else:
         self.name_map[o] = name
 
+  def add_symbol(self, name, value):
+    if name in self.additional_symbols:
+      assert self.additional_symbols[name] is value
+    self.additional_symbols[name] = value
+
   def add_to_cache(self, original_entity, converted_ast):
     self.conversion_order.append(original_entity)
     self.dependency_cache[original_entity] = converted_ast
@@ -419,7 +444,7 @@ class AnnotatedDef(reaching_definitions.Definition):
     self.directives = {}
 
 
-class AgAnno(Enum):
+class AgAnno(enum.Enum):
   """Annotation labels specific to AutoGraph. See anno.py."""
 
   DIRECTIVES = 'User directives associated with the annotated statement.'
diff --git a/tensorflow/python/autograph/core/converter_test.py b/tensorflow/python/autograph/core/converter_test.py
index b73c67e337748e1f9f2729842c309e6263b444df..864ea6c7d2b891cd1f21f4b1c83f66949cd6ab9b 100644
--- a/tensorflow/python/autograph/core/converter_test.py
+++ b/tensorflow/python/autograph/core/converter_test.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import weakref
+
 from tensorflow.python.autograph.core import converter
 from tensorflow.python.autograph.core import converter_testing
 from tensorflow.python.autograph.pyct import anno
@@ -29,6 +31,36 @@ class TestConverter(converter.Base):
   pass
 
 
+class ConversionOptionsTest(test.TestCase):
+
+  def test_should_strip_weakrefs(self):
+    def test_fn():
+      pass
+
+    def weak_test_fn_a():
+      pass
+
+    def weak_test_fn_b():
+      pass
+
+    def weak_test_fn_c():
+      pass
+
+    wr_a = weakref.ref(weak_test_fn_a)
+    # Create an extra weakref to check whether the existence of multiple weak
+    # references influences the process.
+    _ = weakref.ref(weak_test_fn_b)
+    wr_b = weakref.ref(weak_test_fn_b)
+    _ = weakref.ref(weak_test_fn_c)
+
+    opts = converter.ConversionOptions(strip_decorators=(test_fn, wr_a, wr_b))
+
+    self.assertTrue(opts.should_strip(test_fn))
+    self.assertTrue(opts.should_strip(weak_test_fn_a))
+    self.assertTrue(opts.should_strip(weak_test_fn_b))
+    self.assertFalse(opts.should_strip(weak_test_fn_c))
+
+
 class ConverterBaseTest(converter_testing.TestCase):
 
   def test_get_definition_directive_basic(self):
diff --git a/tensorflow/python/autograph/impl/api.py b/tensorflow/python/autograph/impl/api.py
index 19a472064ae8334af7cba53efb8d386e7bde21e1..a98c1dfe9a3e3887d70e23cb2d89f3ed911f6327 100644
--- a/tensorflow/python/autograph/impl/api.py
+++ b/tensorflow/python/autograph/impl/api.py
@@ -40,6 +40,7 @@ from tensorflow.python.framework import tensor_util
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_inspect
+from tensorflow.python.util.tf_export import tf_export
 
 # TODO(mdan): Properly document the type hints.
 # TODO(mdan): Reduce the type hint information to (module, type).
@@ -49,7 +50,10 @@ from tensorflow.python.util import tf_inspect
 # TODO(mdan): This should behave like to_graph (e.g. convert statically).
 # TODO(znado): Make an alias so can write Verbosity directly without needing
 # to write converter.
-def convert(recursive=False, verbose=converter.Verbosity.VERBOSE):
+def convert(
+    recursive=False,
+    verbose=converter.Verbosity.BRIEF,
+    optional_features=converter.Feature.ALL):
   """Decorator that compiles a function to use TensorFlow ops.
 
   The decorator is dynamic - it recompiles the target whenever the decorated
@@ -61,6 +65,9 @@ def convert(recursive=False, verbose=converter.Verbosity.VERBOSE):
     recursive: bool, whether to recursively convert any functions or classes
       that the converted function may use.
     verbose: converter.Verbosity, the level of verbosity.
+    optional_features: converted.Feature, allows toggling optional or
+      experimental features. When set to None, only the core features are
+      enabled.
 
   Returns:
     Callable, a decorator that converts the given function into an equivalent
@@ -78,7 +85,7 @@ def convert(recursive=False, verbose=converter.Verbosity.VERBOSE):
               recursive=recursive,
               verbose=verbose,
               force_conversion=True,
-              optional_features=converter.Feature.ALL,
+              optional_features=optional_features,
           ), *args, **kwargs)
 
     wrapper = tf_decorator.make_decorator(f, wrapper)
@@ -151,11 +158,9 @@ def do_not_convert(run_as=RunMode.GRAPH, return_dtypes=None):
   return decorator
 
 
-# TODO(mdan): Move to a private, undocumented module.
 def converted_call(f, owner, options, *args, **kwargs):
   """Compiles a function call inline. For internal use only."""
-  if options.verbose >= converter.Verbosity.VERBOSE:
-    logging.info('Converted call: {}; owner: {}'.format(f, owner))
+  logging.vlog(logging.DEBUG, 'Converted call: %s; owner: %s', f, owner)
 
   if owner is not None:
     if not isinstance(f, str):
@@ -182,8 +187,8 @@ def converted_call(f, owner, options, *args, **kwargs):
     # When conversion is skipped, `self` is not necessary, because the
     # original bound method is being executed. This code removes it.
     if tf_inspect.ismethod(f) and args:
-      f_class = inspect_utils.getmethodclass(f)
-      if args[0] is f_class:
+      f_self = inspect_utils.getmethodself(f)
+      if args[0] is f_self:
         args = args[1:]
 
     return f(*args, **kwargs)
@@ -196,7 +201,7 @@ def converted_call(f, owner, options, *args, **kwargs):
     return f(*args, **kwargs)
 
   # Unwrap functools.partial objects
-  # TODO(allenl, mdan): Consider sharing unwrapping logic with tf_inspect.
+  # TODO(mdan): Consider sharing unwrapping logic with tf_inspect.
   while isinstance(f, functools.partial):
     args = f.args + args
     new_kwargs = {}
@@ -210,10 +215,10 @@ def converted_call(f, owner, options, *args, **kwargs):
     # Regular functions
     target_entity = f
     arg_map_target = f
-    f_class = inspect_utils.getmethodclass(f)
+    f_self = inspect_utils.getmethodself(f)
 
     # TODO(b/119246461): This may be more elegantly handled using __get__?
-    if f_class is not None:
+    if f_self is not None:
       # If this is a method call, it may or may not include self.
       #
       # Example when self is included:
@@ -228,11 +233,11 @@ def converted_call(f, owner, options, *args, **kwargs):
         # When the owner is not specified, use the result of
         # inspect_utils.getmethodclass.
         # TODO(b/119246461): Make sure an owner is always specified.
-        if not args or args[0] is not f_class:
-          effective_args = (f_class,) + args
+        if not args or args[0] is not f_self:
+          effective_args = (f_self,) + args
         else:
-          effective_args = (f_class,) + args[1:]
-      partial_types = (f_class,)
+          effective_args = (f_self,) + args[1:]
+      partial_types = (f_self,)
     else:
       effective_args = args
       partial_types = ()
@@ -274,12 +279,12 @@ def converted_call(f, owner, options, *args, **kwargs):
   converted_f = to_graph(
       target_entity,
       recursive=options.recursive,
-      verbose=options.verbose,
       arg_values=arg_values,
       arg_types=arg_types,
-      partial_types=partial_types,
-      strip_decorators=options.strip_decorators,
-      optional_features=options.optional_features)
+      experimental_optional_features=options.optional_features,
+      experimental_strip_decorators=options.strip_decorators,
+      experimental_verbose=options.verbose,
+      experimental_partial_types=partial_types)
 
   result = converted_f(*effective_args, **kwargs)
 
@@ -308,63 +313,100 @@ def _is_not_callable(obj):
   return False
 
 
-# TODO(mdan): Rename: to_ops?
-# TODO(mdan): Look into overloading as function and decorator, like tfe.defun?
-# TODO(mdan): Remove partial_types.
-def to_graph(e,
+@tf_export('autograph.to_graph')
+def to_graph(entity,
              recursive=True,
-             verbose=converter.Verbosity.VERBOSE,
              arg_values=None,
              arg_types=None,
-             partial_types=None,
-             strip_decorators=None,
-             optional_features=converter.Feature.ALL):
-  """Converts a Python entity into equivalent code that uses TensorFlow ops.
+             experimental_optional_features=converter.Feature.ALL,
+             experimental_strip_decorators=None,
+             experimental_verbose=converter.Verbosity.BRIEF,
+             experimental_partial_types=None):
+  """Converts a Python entity into a TensorFlow graph.
+
+  Also see: `tf.autograph.to_code`, `tf.function`.
+
+  Unlike `tf.function`, `to_graph` is a low-level transpiler that converts
+  Python code to TensorFlow graph code. It does not implement any caching,
+  variable management or create any actual ops, and is best used where greater
+  control over the generated TensorFlow graph is desired. Another difference
+  from `tf.function` is that `to_graph` will not wrap the graph into a
+  TensorFlow function or a Python callable. Internally, `tf.function` uses
+  `to_graph`.
+
+  _Example Usage_
+
+  ```python
+    def foo(x):
+      if x > 0:
+        y = x * x
+      else:
+        y = -x
+      return y
+
+    converted_foo = to_graph(foo)
+
+    x = tf.constant(1)
+    y = converted_foo(x)  # converted_foo is a TensorFlow Op-like.
+    assert is_tensor(y)
+  ```
 
   Supported Python entities include:
     * functions
     * classes
+    * object methods
+
+  Functions are converted into new functions with converted code.
 
-  Classes are converted by converting all their methods into a new class.
+  Classes are converted by generating a new class whose methods use converted
+  code.
+
+  Methods are converted into unbound function that have an additional first
+  argument called `self`.
 
   Args:
-    e: Union[Callable, Type], the Python entity to convert.
-    recursive: bool, whether to recursively convert any functions that the
+    entity: Python callable or class to convert.
+    recursive: Whether to recursively convert any functions that the
       converted function may call.
-    verbose: converter.Verbosity, the level of printing verbosity to use.
-    arg_values: Optional[Dict[Text, Any]], value hints for symbols including
-      function arguments.
-    arg_types: Optional[Dict[Text, Type]], type hints for symbols including
-      function arguments.
-    partial_types: Set[Type], reserved for internal use.
-    strip_decorators: Tuple[Callable], same as
-      ConversionOptions.strip_decorators.
-    optional_features: Union[Feature, Set[Feature]], same as
-      ConversionOptions.optional_features.
+    arg_values: Optional dict of value hints for symbols including
+      function arguments mapping string names to actual values. For example,
+      `arg_values={'a': 1}` will map the variable `a` to the value `1`.
+    arg_types: Optional dict of type hints for symbols including function
+      arguments. Type hints allow specifying just the type of a variable, rather
+      than a specific value.
+    experimental_optional_features: `None`, a tuple of, or a single
+      `tf.autograph.experimental.Feature` value. Controls the use of
+      optional features in the conversion process.
+    experimental_strip_decorators: A tuple specifying decorators that should be
+      excluded from the compiled output. By default, when converting a function
+      before the decorators are applied, the compiled output will include those
+      decorators.
+    experimental_verbose: The level of printing verbosity to use, as a
+      `tf.autograph.experimental.Verbosity` value.
+    experimental_partial_types: A `set` of `type` values, reserved for internal
+      use.
 
   Returns:
-    Union[Callable, Type], the converted entity, which is the same kind as e
-    (that is, a function is e is a function, a class if e is a class, etc.) but
-    its code has been converted to use TF ops.
+    Same as `entity`, the converted Python function or class.
 
   Raises:
     ValueError: If the entity could not be converted.
   """
-  if strip_decorators is None:
-    strip_decorators = ()
-  strip_decorators += (convert, do_not_convert, converted_call)
+  if experimental_strip_decorators is None:
+    experimental_strip_decorators = ()
+  experimental_strip_decorators += (convert, do_not_convert, converted_call)
 
   program_ctx = converter.ProgramContext(
       options=converter.ConversionOptions(
           recursive=recursive,
-          verbose=verbose,
-          strip_decorators=strip_decorators,
-          optional_features=optional_features),
-      partial_types=partial_types,
+          verbose=experimental_verbose,
+          strip_decorators=experimental_strip_decorators,
+          optional_features=experimental_optional_features),
+      partial_types=experimental_partial_types,
       autograph_module=tf_inspect.getmodule(to_graph),
       uncompiled_modules=config.DEFAULT_UNCOMPILED_MODULES)
-  _, name, namespace = conversion.entity_to_graph(e, program_ctx, arg_values,
-                                                  arg_types)
+  _, name, namespace = conversion.entity_to_graph(entity, program_ctx,
+                                                  arg_values, arg_types)
 
   nodes = []
   for dep in reversed(program_ctx.conversion_order):
@@ -381,10 +423,13 @@ def to_graph(e,
     # Avoid overwriting entities that have been transformed.
     if key not in compiled_module.__dict__:
       compiled_module.__dict__[key] = val
+  for key, val in program_ctx.additional_symbols.items():
+    if key not in compiled_module.__dict__:
+      compiled_module.__dict__[key] = val
   compiled = getattr(compiled_module, name)
 
-  if tf_inspect.isfunction(e):
-    compiled.__defaults__ = e.__defaults__
+  if tf_inspect.isfunction(entity):
+    compiled.__defaults__ = entity.__defaults__
 
   if hasattr(compiled, '__globals__'):
     # Remove self to avoid circular references. This will probably only work
@@ -409,38 +454,52 @@ def to_graph(e,
   return compiled
 
 
-def to_code(e,
+@tf_export('autograph.to_code')
+def to_code(entity,
             recursive=True,
             arg_values=None,
             arg_types=None,
-            partial_types=None,
-            indentation='  '):
-  """Returns the equivalent code that uses TensorFlow ops.
+            indentation='  ',
+            experimental_optional_features=converter.Feature.ALL,
+            experimental_partial_types=None):
+  """Similar to `to_graph`, but returns Python source code as a string.
+
+  Also see: `tf.autograph.to_graph`.
 
-  Also see: `to_graph`, `convert`
+  `to_graph` returns the Python source code that can be used to generate a
+  TensorFlow graph that is functionally identical to the input Python code.
 
   Args:
-    e: Union[Callable, Type], the Python entity to convert.
-    recursive: bool, whether to recursively convert any functions that the
+    entity: Python callable or class to convert.
+    recursive: Whether to recursively convert any functions that the
       converted function may call.
-    arg_values: Optional[Dict[Text, Any]], value hints for symbols including
-      function arguments.
-    arg_types: Optional[Dict[Text, Type]], type hints for symbols including
-      function arguments.
-    partial_types: Set[Type], reserved for internal use.
-    indentation: Text, when to use for each level of indentation.
+    arg_values: Optional dict of value hints for symbols including
+      function arguments mapping string names to actual values. For example,
+      `arg_values={'a': 1}` will map the variable `a` to the value `1`.
+    arg_types: Optional dict of type hints for symbols including function
+      arguments. Type hints allow specifying just the type of a variable, rather
+      than a specific value.
+    indentation: The string to use for indenting. Typically two or four spaces,
+      or just the tab character.
+    experimental_optional_features: `None`, a tuple of, or a single
+      `tf.autograph.experimental.Feature` value. Controls the use of
+      optional features in the conversion process.
+    experimental_partial_types: A `set` of `type` values, reserved for internal
+      use.
 
   Returns:
-    Text, the converted code.
+    The converted code as string.
   """
   program_ctx = converter.ProgramContext(
       options=converter.ConversionOptions(
           recursive=recursive,
-          strip_decorators=(convert, do_not_convert, converted_call)),
-      partial_types=partial_types,
+          verbose=converter.Verbosity.BRIEF,
+          strip_decorators=(convert, do_not_convert, converted_call),
+          optional_features=experimental_optional_features),
+      partial_types=experimental_partial_types,
       autograph_module=tf_inspect.getmodule(to_graph),
       uncompiled_modules=config.DEFAULT_UNCOMPILED_MODULES)
-  conversion.entity_to_graph(e, program_ctx, arg_values, arg_types)
+  conversion.entity_to_graph(entity, program_ctx, arg_values, arg_types)
 
   code = '\n'.join(
       compiler.ast_to_source(program_ctx.dependency_cache[dep], indentation)
diff --git a/tensorflow/python/autograph/impl/api_test.py b/tensorflow/python/autograph/impl/api_test.py
index 66edda5119324bbdcc32e0bf4914b99b7ea647ca..d5561ba8249f539e720fa1ecb5800b76c61a8c2f 100644
--- a/tensorflow/python/autograph/impl/api_test.py
+++ b/tensorflow/python/autograph/impl/api_test.py
@@ -218,6 +218,7 @@ class ApiTest(test.TestCase):
                              constant_op.constant(-1))
       self.assertEqual(1, self.evaluate(x))
 
+  @test_util.run_v1_only('b/120545219')
   def test_converted_call_functools_partial(self):
 
     def test_fn(x, y, z):
diff --git a/tensorflow/python/autograph/impl/conversion.py b/tensorflow/python/autograph/impl/conversion.py
index 055769c73ad9e946232ecd1e6c8a95e5f93f37a0..733d4f1c717c86cd65425fb1c66c5fd271f2bacb 100644
--- a/tensorflow/python/autograph/impl/conversion.py
+++ b/tensorflow/python/autograph/impl/conversion.py
@@ -73,17 +73,69 @@ def is_whitelisted_for_graph(o):
   Returns:
     Boolean
   """
+  # TODO(b/120224672): Fix this.
   if isinstance(o, functools.partial):
     # tf_inspect.getmodule(functools.partial(...)) otherwise returns None since
     # functools.partial objects do not have a __module__ attribute.
     m = functools
   else:
     m = tf_inspect.getmodule(o)
+  if not hasattr(m, '__name__'):
+    logging.vlog(1, '%s is NOT whitelisted for graph: unknown module name', o)
+    return False
+
   for prefix, in config.DEFAULT_UNCOMPILED_MODULES:
     if m.__name__.startswith(prefix):
+      logging.vlog(1, '%s is whitelisted: name starts with "%s"', o, prefix)
       return True
+
   if hasattr(o, 'autograph_info__'):
     return True
+
+  if (not inspect_utils.isweakrefself(o) and not tf_inspect.isclass(o) and
+      hasattr(o, '__call__') and hasattr(o, '__class__')):
+    # Callable objects: whitelisted if their __call__ method is.
+    retval = is_whitelisted_for_graph(o.__call__)
+    logging.vlog(1, '%s is whitelisted: object __call__ whitelisted', o)
+    return retval
+
+  if tf_inspect.ismethod(o):
+    # Methods of whitelisted classes are also whitelisted, even if they are
+    # bound via user subclasses.
+    #
+    # For example, suppose `tf.Foo` has a method called `bar`, and `baz` is
+    # defined as below. `tf.Foo` is whitelisted. Then `baz.bar` is also
+    # whitelisted.
+    #
+    #   class Custom(tf.Foo):
+    #     pass
+    #
+    #   baz = Custom()
+    #
+    # For the example above, if `Custom` did overload `bar`, then it would no
+    # longer be whitelisted.
+
+    owner_class = inspect_utils.getmethodclass(o)
+    if owner_class is not None:
+      owner_class = inspect_utils.getdefiningclass(o, owner_class)
+      if is_whitelisted_for_graph(owner_class):
+        logging.vlog(1, '%s is whitelisted: owner is whitelisted %s', o,
+                     owner_class)
+        return True
+
+  if inspect_utils.isnamedtuple(o):
+    # Due to the way they're constructed, namedtuple types cannot be converted
+    # because they don't expose source code. But we assume they are safe for
+    # graph mode since they are just containers.
+    if tf_inspect.isclass(o) and len(o.__bases__) > 1:
+      logging.log_first_n(
+          logging.level_warning(),
+          'Entity {} looks like a namedtuple subclass. If it has any custom'
+          ' methods, they will not be converted by AutoGraph.'.format(o), 1)
+    logging.vlog(1, '%s is whitelisted: named tuple', o)
+    return True
+
+  logging.vlog(1, '%s is NOT whitelisted for graph', o)
   return False
 
 
@@ -115,8 +167,7 @@ def entity_to_graph(o, program_ctx, arg_values, arg_types):
   Raises:
     ValueError: if the entity type is not supported.
   """
-  if program_ctx.options.verbose == converter.Verbosity.VERBOSE:
-    logging.info('Converting {}'.format(o))
+  logging.vlog(logging.DEBUG, 'Converting %s', o)
 
   if tf_inspect.isclass(o):
     node, name, ns = class_to_graph(o, program_ctx)
@@ -150,9 +201,9 @@ def entity_to_graph(o, program_ctx, arg_values, arg_types):
 
   program_ctx.add_to_cache(o, node)
 
-  if program_ctx.options.verbose == converter.Verbosity.VERBOSE:
-    logging.info('Compiled output of {}:\n\n{}\n'.format(
-        o, compiler.ast_to_source(node)))
+  if logging.get_verbosity() <= logging.DEBUG:
+    logging.vlog(logging.DEBUG, 'Compiled output of %s:\n\n%s\n', o,
+                 compiler.ast_to_source(node))
 
   if program_ctx.options.recursive:
     while True:
diff --git a/tensorflow/python/autograph/operators/control_flow.py b/tensorflow/python/autograph/operators/control_flow.py
index 1a35efedfaa4fea5cdcef6e27c36fecbf5ebdfc6..afa3787d4277985285d5dc8b3e1531a00460076b 100644
--- a/tensorflow/python/autograph/operators/control_flow.py
+++ b/tensorflow/python/autograph/operators/control_flow.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 
 from tensorflow.python.autograph.operators import py_builtins
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_math_ops
@@ -88,7 +87,10 @@ def _known_len_for_stmt(iter_, extra_test, body, init_state):
   def while_body(iterate_index, *state):
     iterate = iter_[iterate_index]
     new_state = body(iterate, *state)
-    return (iterate_index + 1,) + new_state
+    if new_state:
+      return (iterate_index + 1,) + new_state
+    else:
+      return iterate_index + 1
 
   def while_cond(iterate_index, *state):
     return gen_math_ops.logical_and(iterate_index < n, extra_test(*state))
@@ -99,51 +101,33 @@ def _known_len_for_stmt(iter_, extra_test, body, init_state):
       init_state=(0,) + init_state,
       extra_deps=(iter_,),
       opts=dict(maximum_iterations=n))
+
   # Dropping the iteration index because it's not syntactically visible.
-  results = results[1:]
+  # TODO(mdan): Don't.
+  if isinstance(results, (tuple, list)):
+    assert len(results) >= 1  # Has at least the iterate.
+    if len(results) > 1:
+      results = results[1:]
+    if len(results) == 1:
+      # TODO(mdan): Remove this special case.
+      results, = results
+  else:
+    results = ()
 
-  # TODO(mdan): Remove this special case.
-  if len(results) == 1:
-    return results[0]
   return results
 
 
 def _dataset_for_stmt(ds, extra_test, body, init_state):
   """Overload of for_stmt that iterates over TF Datasets."""
-  # Because Datsets only expose get_next, in the style of Python iterators,
-  # we are forced to unpack the loop as:
-  #
-  # epoch_number, iterate = ds.get_next()
-  # while epoch_number < 2:
-  #   <body>
-  #   epoch_number, iterate = ds.get_next()
-  epoch_numbers = dataset_ops.Dataset.range(2)
-  def tag_with(ds, tag):
-    return dataset_ops.Dataset.zip(
-        (dataset_ops.Dataset.from_tensors(tag).repeat(), ds))
-  ds_with_epoch = epoch_numbers.flat_map(lambda i: tag_with(ds, i))
-
-  iterator = ds_with_epoch.make_initializable_iterator()
-  with ops.control_dependencies((iterator.initializer,)):
-    epoch_number, iterate = iterator.get_next()
-
-    def while_body(epoch_number, iterate, *state):
-      new_state = body(iterate, *state)
-      epoch_number, iterate = iterator.get_next()
-      return (epoch_number, iterate) + new_state
-
-    def while_cond(epoch_number, iterate, *state):
-      del iterate
-      return gen_math_ops.logical_and(epoch_number < 1, extra_test(*state))
-
-    results = while_stmt(
-        while_cond,
-        while_body,
-        init_state=(epoch_number, iterate) + init_state,
-        extra_deps=())
-  # Dropping the epoch number and iterate because they are not syntactically
-  # visible.
-  results = results[2:]
+  if extra_test(*init_state) is not True:
+    raise NotImplementedError(
+        'break statements are not yet supported in for/Dataset loops')
+
+  def reduce_body(state, iterate):
+    new_state = body(iterate, *state)
+    return new_state
+
+  results = ds.reduce(init_state, reduce_body)
 
   # TODO(mdan): Remove this special case.
   if len(results) == 1:
diff --git a/tensorflow/python/autograph/operators/data_structures_test.py b/tensorflow/python/autograph/operators/data_structures_test.py
index 9397b9acb821501f1b0c4f07d5078c961bbbeeab..c5a3a3d1cac998a0fc59163d73288317bd4a3e30 100644
--- a/tensorflow/python/autograph/operators/data_structures_test.py
+++ b/tensorflow/python/autograph/operators/data_structures_test.py
@@ -106,7 +106,7 @@ class ListTest(test.TestCase):
     with self.cached_session() as sess:
       self.assertAllEqual(self.evaluate(t), [[1, 2, 3]])
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/117943489")
   def test_append_tensorarray(self):
     l = tensor_array_ops.TensorArray(dtypes.int32, size=0, dynamic_size=True)
     l1 = data_structures.list_append(l, 1)
diff --git a/tensorflow/python/autograph/operators/py_builtins.py b/tensorflow/python/autograph/operators/py_builtins.py
index 2f55d538924609f4ad2549acccbc15a57ac13c19..ddf05f73f37821c6ff7e246051cd82a560f370e3 100644
--- a/tensorflow/python/autograph/operators/py_builtins.py
+++ b/tensorflow/python/autograph/operators/py_builtins.py
@@ -174,6 +174,7 @@ def _tf_py_func_print(objects, kwargs):
     override_kwargs['flush'] = True
 
   def print_wrapper(*vals):
+    vals = tuple(v.numpy() if tensor_util.is_tensor(v) else v for v in vals)
     if six.PY3:
       # TensorFlow doesn't seem to generate Unicode when passing strings to
       # py_func. This causes the print to add a "b'" wrapper to the output,
@@ -193,6 +194,7 @@ def range_(start_or_stop, stop=UNDEFINED, step=UNDEFINED):
 
 
 def _tf_range(start_or_stop, stop, step):
+  """Overload of range_ that generates a TF range tensor."""
   # Note: for static inputs (e.g. constants), tf.range errors out at graph
   # construction time, instead of returning an empty tensor. Preventing the
   # graph construction error aligns the semantics with Python.
diff --git a/tensorflow/python/autograph/pyct/ast_util.py b/tensorflow/python/autograph/pyct/ast_util.py
index ea7eca6463a17d43f1a3536ebdd1770cfcf265f7..3dc10cf3492d4485f901e7048571fa936a570967 100644
--- a/tensorflow/python/autograph/pyct/ast_util.py
+++ b/tensorflow/python/autograph/pyct/ast_util.py
@@ -200,7 +200,8 @@ def matches(node, pattern):
     bool
   """
   if isinstance(pattern, str):
-    pattern = parser.parse_expression(pattern)
+    pattern, = parser.parse_str(pattern).body
+
   matcher = PatternMatcher(pattern)
   matcher.visit(node)
   return matcher.matches
diff --git a/tensorflow/python/autograph/pyct/compiler.py b/tensorflow/python/autograph/pyct/compiler.py
index 06e66c5b5871d5528bccfcc9fe47268207594ea6..420f3bb22388801c54f27e8bf1701febb90ad34a 100644
--- a/tensorflow/python/autograph/pyct/compiler.py
+++ b/tensorflow/python/autograph/pyct/compiler.py
@@ -67,6 +67,13 @@ def ast_to_source(node, indentation='  '):
       trimmed_code_lines.append(l)
   code = '\n'.join(trimmed_code_lines)
 
+  # Work around the reference cycle generated by astor.
+  # See https://github.com/berkerpeksag/astor/blob/55dd323f7d8d696610c703c0296763c567685c31/astor/code_gen.py#L162  # pylint:disable=line-too-long
+  # Reference cycles are quite disliked by TensorFlow's tests.
+  if hasattr(generator, 'write'):
+    generator.write = None
+  del generator
+
   return code
 
 
diff --git a/tensorflow/python/autograph/pyct/inspect_utils.py b/tensorflow/python/autograph/pyct/inspect_utils.py
index 2319430d09bcc8156839f7dd408fc1dc20db477a..6d9bc43d34652f2fd67b74faf4bff77afad54119 100644
--- a/tensorflow/python/autograph/pyct/inspect_utils.py
+++ b/tensorflow/python/autograph/pyct/inspect_utils.py
@@ -31,15 +31,18 @@ from tensorflow.python.util import tf_inspect
 
 # These functions test negative for isinstance(*, types.BuiltinFunctionType)
 # and inspect.isbuiltin, and are generally not visible in globals().
+# TODO(mdan): Find a more generic way to test this - just enumerate __builtin__?
 SPECIAL_BUILTINS = {
     'dict': dict,
+    'enumerate': enumerate,
     'float': float,
     'int': int,
     'len': len,
     'list': list,
     'print': print,
     'range': range,
-    'tuple': tuple
+    'tuple': tuple,
+    'zip': zip
 }
 
 if six.PY2:
@@ -54,6 +57,20 @@ def islambda(f):
   return f.__name__ == '<lambda>'
 
 
+def isnamedtuple(f):
+  """Returns True if the argument is a namedtuple-like."""
+  if not (tf_inspect.isclass(f) and issubclass(f, tuple)):
+    return False
+  if not hasattr(f, '_fields'):
+    return False
+  fields = getattr(f, '_fields')
+  if not isinstance(fields, tuple):
+    return False
+  if not all(isinstance(f, str) for f in fields):
+    return False
+  return True
+
+
 def isbuiltin(f):
   """Returns True if the argument is a built-in function."""
   if f in SPECIAL_BUILTINS.values():
@@ -87,7 +104,7 @@ def getnamespace(f):
   return namespace
 
 
-def getqualifiedname(namespace, object_, max_depth=2):
+def getqualifiedname(namespace, object_, max_depth=5, visited=None):
   """Returns the name by which a value can be referred to in a given namespace.
 
   If the object defines a parent module, the function attempts to use it to
@@ -101,16 +118,20 @@ def getqualifiedname(namespace, object_, max_depth=2):
     object_: Any, the value to search.
     max_depth: Optional[int], a limit to the recursion depth when searching
         inside modules.
+    visited: Optional[Set[int]], ID of modules to avoid visiting.
   Returns: Union[str, None], the fully-qualified name that resolves to the value
       o, or None if it couldn't be found.
   """
-  for name, value in namespace.items():
+  if visited is None:
+    visited = set()
+
+  for name in namespace:
     # The value may be referenced by more than one symbol, case in which
     # any symbol will be fine. If the program contains symbol aliases that
     # change over time, this may capture a symbol that will later point to
     # something else.
     # TODO(mdan): Prefer the symbol that matches the value type name.
-    if object_ is value:
+    if object_ is namespace[name]:
       return name
 
   # If an object is not found, try to search its parent modules.
@@ -118,22 +139,25 @@ def getqualifiedname(namespace, object_, max_depth=2):
   if (parent is not None and parent is not object_ and
       parent is not namespace):
     # No limit to recursion depth because of the guard above.
-    parent_name = getqualifiedname(namespace, parent, max_depth=0)
+    parent_name = getqualifiedname(
+        namespace, parent, max_depth=0, visited=visited)
     if parent_name is not None:
-      name_in_parent = getqualifiedname(parent.__dict__, object_, max_depth=0)
+      name_in_parent = getqualifiedname(
+          parent.__dict__, object_, max_depth=0, visited=visited)
       assert name_in_parent is not None, (
           'An object should always be found in its owner module')
       return '{}.{}'.format(parent_name, name_in_parent)
 
-  # TODO(mdan): Use breadth-first search and avoid visiting modules twice.
   if max_depth:
     # Iterating over a copy prevents "changed size due to iteration" errors.
     # It's unclear why those occur - suspecting new modules may load during
     # iteration.
-    for name, value in namespace.copy().items():
-      if tf_inspect.ismodule(value):
+    for name in namespace.keys():
+      value = namespace[name]
+      if tf_inspect.ismodule(value) and id(value) not in visited:
+        visited.add(id(value))
         name_in_module = getqualifiedname(value.__dict__, object_,
-                                          max_depth - 1)
+                                          max_depth - 1, visited)
         if name_in_module is not None:
           return '{}.{}'.format(name, name_in_module)
   return None
@@ -162,6 +186,27 @@ def getdefiningclass(m, owner_class):
   return owner_class
 
 
+def isweakrefself(m):
+  """Tests whether an object is a "weakref self" wrapper, see getmethodself."""
+  return hasattr(m, '__self__') and hasattr(m.__self__, 'ag_self_weakref__')
+
+
+def getmethodself(m):
+  """An extended version of inspect.getmethodclass."""
+  if not hasattr(m, '__self__'):
+    return None
+  if m.__self__ is None:
+    return None
+
+  # A fallback allowing methods to be actually bound to a type different
+  # than __self__. This is useful when a strong reference from the method
+  # to the object is not desired, for example when caching is involved.
+  if isweakrefself(m):
+    return m.__self__.ag_self_weakref__()
+
+  return m.__self__
+
+
 def getmethodclass(m):
   """Resolves a function's owner, e.g. a method's class.
 
@@ -192,16 +237,12 @@ def getmethodclass(m):
     if isinstance(m.__class__, six.class_types):
       return m.__class__
 
-  # Instance method and class methods: should be bound to a non-null "self".
-  if hasattr(m, '__self__'):
-    if m.__self__ is not None:
-      # A fallback allowing methods to be actually bound to a type different
-      # than __self__. This is useful when a strong reference from the method
-      # to the object is not desired, for example when caching is involved.
-      if hasattr(m.__self__, 'ag_self_weakref__'):
-        return m.__self__.ag_self_weakref__()
-
-      return m.__self__
+  # Instance method and class methods: return the class of "self".
+  m_self = getmethodself(m)
+  if m_self is not None:
+    if tf_inspect.isclass(m_self):
+      return m_self
+    return m_self.__class__
 
   # Class, static and unbound methods: search all defined classes in any
   # namespace. This is inefficient but more robust method.
diff --git a/tensorflow/python/autograph/pyct/inspect_utils_test.py b/tensorflow/python/autograph/pyct/inspect_utils_test.py
index 7e8466d58ee06555b7306785cf8454cfb24c7456..4c4c0977b0fef2fdfee69d2e7c608ad1a412aa21 100644
--- a/tensorflow/python/autograph/pyct/inspect_utils_test.py
+++ b/tensorflow/python/autograph/pyct/inspect_utils_test.py
@@ -18,7 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from functools import wraps
+import collections
+import functools
 import imp
 import types
 import weakref
@@ -46,7 +47,7 @@ def wrapping_decorator():
     def replacement(*_):
       return None
 
-    @wraps(f)
+    @functools.wraps(f)
     def wrapper(*args, **kwargs):
       return replacement(*args, **kwargs)
     return wrapper
@@ -102,6 +103,31 @@ class InspectUtilsTest(test.TestCase):
     self.assertTrue(inspect_utils.islambda(lambda x: x))
     self.assertFalse(inspect_utils.islambda(test_fn))
 
+  def test_isnamedtuple(self):
+    nt = collections.namedtuple('TestNamedTuple', ['a', 'b'])
+
+    class NotANamedTuple(tuple):
+      pass
+
+    self.assertTrue(inspect_utils.isnamedtuple(nt))
+    self.assertFalse(inspect_utils.isnamedtuple(NotANamedTuple))
+
+  def test_isnamedtuple_confounder(self):
+    """This test highlights false positives when detecting named tuples."""
+
+    class NamedTupleLike(tuple):
+      _fields = ('a', 'b')
+
+    self.assertTrue(inspect_utils.isnamedtuple(NamedTupleLike))
+
+  def test_isnamedtuple_subclass(self):
+    """This test highlights false positives when detecting named tuples."""
+
+    class NamedTupleSubclass(collections.namedtuple('Test', ['a', 'b'])):
+      pass
+
+    self.assertTrue(inspect_utils.isnamedtuple(NamedTupleSubclass))
+
   def test_getnamespace_globals(self):
     ns = inspect_utils.getnamespace(factory)
     self.assertEqual(ns['free_function'], free_function)
@@ -157,6 +183,63 @@ class InspectUtilsTest(test.TestCase):
     self.assertEqual(inspect_utils.getqualifiedname(ns, bar), 'bar')
     self.assertEqual(inspect_utils.getqualifiedname(ns, baz), 'bar.baz')
 
+  def test_getqualifiedname_efficiency(self):
+    foo = object()
+
+    # We create a densely connected graph consisting of a relatively small
+    # number of modules and hide our symbol in one of them. The path to the
+    # symbol is at least 10, and each node has about 10 neighbors. However,
+    # by skipping visited modules, the search should take much less.
+    ns = {}
+    prev_level = []
+    for i in range(10):
+      current_level = []
+      for j in range(10):
+        mod_name = 'mod_{}_{}'.format(i, j)
+        mod = imp.new_module(mod_name)
+        current_level.append(mod)
+        if i == 9 and j == 9:
+          mod.foo = foo
+      if prev_level:
+        # All modules at level i refer to all modules at level i+1
+        for prev in prev_level:
+          for mod in current_level:
+            prev.__dict__[mod.__name__] = mod
+      else:
+        for mod in current_level:
+          ns[mod.__name__] = mod
+      prev_level = current_level
+
+    self.assertIsNone(inspect_utils.getqualifiedname(ns, inspect_utils))
+    self.assertIsNotNone(
+        inspect_utils.getqualifiedname(ns, foo, max_depth=10000000000))
+
+  def test_getqualifiedname_cycles(self):
+    foo = object()
+
+    # We create a graph of modules that contains circular references. The
+    # search process should avoid them. The searched object is hidden at the
+    # bottom of a path of length roughly 10.
+    ns = {}
+    mods = []
+    for i in range(10):
+      mod = imp.new_module('mod_{}'.format(i))
+      if i == 9:
+        mod.foo = foo
+      # Module i refers to module i+1
+      if mods:
+        mods[-1].__dict__[mod.__name__] = mod
+      else:
+        ns[mod.__name__] = mod
+      # Module i refers to all modules j < i.
+      for prev in mods:
+        mod.__dict__[prev.__name__] = prev
+      mods.append(mod)
+
+    self.assertIsNone(inspect_utils.getqualifiedname(ns, inspect_utils))
+    self.assertIsNotNone(
+        inspect_utils.getqualifiedname(ns, foo, max_depth=10000000000))
+
   def test_getqualifiedname_finds_via_parent_module(self):
     # TODO(mdan): This test is vulnerable to change in the lib module.
     # A better way to forge modules should be found.
@@ -194,16 +277,16 @@ class InspectUtilsTest(test.TestCase):
     test_obj = TestClass()
     self.assertEqual(
         inspect_utils.getmethodclass(test_obj.member_function),
-        test_obj)
+        TestClass)
     self.assertEqual(
         inspect_utils.getmethodclass(test_obj.decorated_member),
-        test_obj)
+        TestClass)
     self.assertEqual(
         inspect_utils.getmethodclass(test_obj.fn_decorated_member),
-        test_obj)
+        TestClass)
     self.assertEqual(
         inspect_utils.getmethodclass(test_obj.wrap_decorated_member),
-        test_obj)
+        TestClass)
     self.assertEqual(
         inspect_utils.getmethodclass(test_obj.static_method),
         TestClass)
@@ -252,16 +335,16 @@ class InspectUtilsTest(test.TestCase):
     test_obj = LocalClass()
     self.assertEqual(
         inspect_utils.getmethodclass(test_obj.member_function),
-        test_obj)
+        LocalClass)
     self.assertEqual(
         inspect_utils.getmethodclass(test_obj.decorated_member),
-        test_obj)
+        LocalClass)
     self.assertEqual(
         inspect_utils.getmethodclass(test_obj.fn_decorated_member),
-        test_obj)
+        LocalClass)
     self.assertEqual(
         inspect_utils.getmethodclass(test_obj.wrap_decorated_member),
-        test_obj)
+        LocalClass)
 
   def test_getmethodclass_callables(self):
     class TestCallable(object):
@@ -284,12 +367,13 @@ class InspectUtilsTest(test.TestCase):
       return self
 
     bound_method = types.MethodType(test_fn, WeakrefWrapper())
-    self.assertEqual(inspect_utils.getmethodclass(bound_method), test_obj)
+    self.assertEqual(inspect_utils.getmethodclass(bound_method), TestClass)
 
   def test_getmethodclass_no_bool_conversion(self):
 
     tensor = constant_op.constant([1])
-    self.assertEqual(inspect_utils.getmethodclass(tensor.get_shape), tensor)
+    self.assertEqual(
+        inspect_utils.getmethodclass(tensor.get_shape), type(tensor))
 
   def test_getdefiningclass(self):
     class Superclass(object):
@@ -323,10 +407,12 @@ class InspectUtilsTest(test.TestCase):
         Superclass)
 
   def test_isbuiltin(self):
-    self.assertTrue(inspect_utils.isbuiltin(range))
+    self.assertTrue(inspect_utils.isbuiltin(enumerate))
     self.assertTrue(inspect_utils.isbuiltin(float))
     self.assertTrue(inspect_utils.isbuiltin(int))
     self.assertTrue(inspect_utils.isbuiltin(len))
+    self.assertTrue(inspect_utils.isbuiltin(range))
+    self.assertTrue(inspect_utils.isbuiltin(zip))
     self.assertFalse(inspect_utils.isbuiltin(function_decorator))
 
   def test_super_wrapper_for_dynamic_attrs(self):
diff --git a/tensorflow/python/autograph/pyct/parser.py b/tensorflow/python/autograph/pyct/parser.py
index 8f4037c5e286accc600dbac97acd7b5fe045b582..d04a40157e7ef59c887b2e3af0870ab087fd93d0 100644
--- a/tensorflow/python/autograph/pyct/parser.py
+++ b/tensorflow/python/autograph/pyct/parser.py
@@ -24,6 +24,7 @@ from __future__ import print_function
 import textwrap
 
 import gast
+import six
 
 from tensorflow.python.util import tf_inspect
 
@@ -91,7 +92,17 @@ def parse_entity(entity):
 def parse_str(src):
   """Returns the AST of given piece of code."""
   # TODO(mdan): This should exclude the module things are autowrapped in.
-  return gast.parse(src)
+
+  if six.PY2 and '.print(' in src:
+    # This special treatment is required because gast.parse is not aware of
+    # whether print_function was present in the original context.
+    src = 'from __future__ import print_function\n' + src
+    parsed_module = gast.parse(src)
+    parsed_module.body = parsed_module.body[1:]
+  else:
+    parsed_module = gast.parse(src)
+
+  return parsed_module
 
 
 def parse_expression(src):
@@ -106,7 +117,7 @@ def parse_expression(src):
   """
   node = parse_str(src)
   assert isinstance(node, gast.Module)
-  if len(node.body) != 1 and not isinstance(node.body[0], gast.Expr):
+  if len(node.body) != 1 or not isinstance(node.body[0], gast.Expr):
     raise ValueError(
         'Expected a single expression, found instead %s' % node.body)
   return node.body[0].value
diff --git a/tensorflow/python/autograph/pyct/static_analysis/liveness.py b/tensorflow/python/autograph/pyct/static_analysis/liveness.py
index 451398f1b70abf56d6c141305930c8a4e1a66a07..f8b8d7fa77c167e0ebf96dd533e3c42b0c30b8e5 100644
--- a/tensorflow/python/autograph/pyct/static_analysis/liveness.py
+++ b/tensorflow/python/autograph/pyct/static_analysis/liveness.py
@@ -161,6 +161,16 @@ class Annotator(transformer.Base):
     self.cross_function_analyzer = cross_function_analyzer
     self.current_analyzer = None
 
+  def visit(self, node):
+    node = super(Annotator, self).visit(node)
+    if (self.current_analyzer is not None and
+        isinstance(node, gast.stmt) and
+        node in self.current_analyzer.graph.index):
+      cfg_node = self.current_analyzer.graph.index[node]
+      anno.setanno(node, anno.Static.LIVE_VARS_IN,
+                   frozenset(self.current_analyzer.in_[cfg_node]))
+    return node
+
   def visit_FunctionDef(self, node):
     parent_analyzer = self.current_analyzer
     self.current_analyzer = self.cross_function_analyzer.analyzers[node]
@@ -198,6 +208,10 @@ class Annotator(transformer.Base):
     node = self._block_statement_live_out(node)
     return self._block_statement_live_in(node, node.test)
 
+  def visit_With(self, node):
+    node = self.generic_visit(node)
+    return self._block_statement_live_in(node, node.items[0])
+
   def visit_Expr(self, node):
     node = self.generic_visit(node)
     cfg_node = self.current_analyzer.graph.index[node]
diff --git a/tensorflow/python/autograph/pyct/templates.py b/tensorflow/python/autograph/pyct/templates.py
index 9901295445c7a77c78ee1c0de9c27724948741c0..43279b3ca0111b8ea3860f1c467df1c602b3de74 100644
--- a/tensorflow/python/autograph/pyct/templates.py
+++ b/tensorflow/python/autograph/pyct/templates.py
@@ -32,6 +32,66 @@ from tensorflow.python.autograph.pyct import parser
 from tensorflow.python.autograph.pyct import qual_names
 
 
+class ContextAdjuster(gast.NodeTransformer):
+  """Adjusts the ctx field of nodes to ensure consistency.
+
+  This transformer can change the ctx fields of a variable, tuple and other
+  AST elements that allow one, based on whether the element is being read or
+  written.
+  """
+
+  def __init__(self, override_value):
+    self._ctx_override = override_value
+
+  def visit(self, node):
+    original_override = self._ctx_override
+    node = super(ContextAdjuster, self).visit(node)
+    if hasattr(node, 'ctx'):
+      assert node.ctx is not None, 'node {} has ctx unset'.format(node)
+    self._ctx_override = original_override
+    return node
+
+  def _apply_override(self, node):
+    if self._ctx_override is not None:
+      node.ctx = self._ctx_override()
+
+  def visit_Attribute(self, node):
+    self._apply_override(node)
+    self._ctx_override = gast.Load
+    node = self.generic_visit(node)
+    return node
+
+  def visit_Tuple(self, node):
+    self._apply_override(node)
+    return self.generic_visit(node)
+
+  def visit_List(self, node):
+    self._apply_override(node)
+    return self.generic_visit(node)
+
+  def visit_Name(self, node):
+    self._apply_override(node)
+    return self.generic_visit(node)
+
+  def visit_Call(self, node):
+    self._apply_override(node)
+    # We may be able to override these to Load(), but for now it's simpler
+    # to just assert that they're set.
+    self._ctx_override = None
+    return self.generic_visit(node)
+
+  def visit_Dict(self, node):
+    # We may be able to override these to Load(), but for now it's simpler
+    # to just assert that they're set.
+    self._ctx_override = None
+    return self.generic_visit(node)
+
+  def visit_Subscript(self, node):
+    node.value = self.visit(node.value)
+    self._ctx_override = None
+    return self.generic_visit(node)
+
+
 class ReplaceTransformer(gast.NodeTransformer):
   """Replace AST nodes."""
 
@@ -106,91 +166,6 @@ class ReplaceTransformer(gast.NodeTransformer):
     node.name = repl.id
     return node
 
-  def _check_has_context(self, node):
-    if not node.ctx:
-      raise ValueError('node %s is missing ctx value' % node)
-
-  # TODO(mdan): Rewrite _check and _set using a separate transformer.
-  def _check_inner_children_have_context(self, node):
-    if isinstance(node, gast.Attribute):
-      self._check_inner_children_have_context(node.value)
-      self._check_has_context(node)
-    elif isinstance(node, (gast.Tuple, gast.List)):
-      for e in node.elts:
-        self._check_inner_children_have_context(e)
-      self._check_has_context(node)
-    elif isinstance(node, gast.Dict):
-      for e in node.keys:
-        self._check_inner_children_have_context(e)
-      for e in node.values:
-        self._check_inner_children_have_context(e)
-    elif isinstance(node, gast.Index):
-      self._check_inner_children_have_context(node.value)
-    elif isinstance(node, gast.Subscript):
-      self._check_inner_children_have_context(node.value)
-      self._check_inner_children_have_context(node.slice)
-    elif isinstance(node, gast.Slice):
-      self._check_inner_children_have_context(node.lower)
-      if node.upper:
-        self._check_inner_children_have_context(node.upper)
-      if node.step:
-        self._check_inner_children_have_context(node.step)
-    elif isinstance(node, gast.BinOp):
-      self._check_inner_children_have_context(node.left)
-      self._check_inner_children_have_context(node.right)
-    elif isinstance(node, gast.UnaryOp):
-      self._check_inner_children_have_context(node.operand)
-    elif isinstance(node, gast.Name):
-      self._check_has_context(node)
-    elif isinstance(node, (gast.Str, gast.Num)):
-      pass
-    elif isinstance(node, gast.Call):
-      self._check_inner_children_have_context(node.func)
-      for a in node.args:
-        self._check_inner_children_have_context(a)
-      for k in node.keywords:
-        self._check_inner_children_have_context(k.value)
-    else:
-      raise ValueError('unexpected node type "%s"' % node)
-
-  def _set_inner_child_context(self, node, ctx):
-    if isinstance(node, gast.Attribute):
-      self._set_inner_child_context(node.value, gast.Load())
-      node.ctx = ctx
-    elif isinstance(node, (gast.Tuple, gast.List)):
-      for e in node.elts:
-        self._set_inner_child_context(e, ctx)
-      node.ctx = ctx
-    elif isinstance(node, gast.Name):
-      node.ctx = ctx
-    elif isinstance(node, gast.Call):
-      self._set_inner_child_context(node.func, ctx)
-      # We may be able to override these to Load(), but for now it's simpler
-      # to just assert that they're set.
-      for a in node.args:
-        self._check_inner_children_have_context(a)
-      for k in node.keywords:
-        self._check_inner_children_have_context(k.value)
-    elif isinstance(node, gast.Dict):
-      # We may be able to override these to Load(), but for now it's simpler
-      # to just assert that they're set.
-      for e in node.keys:
-        self._check_inner_children_have_context(e)
-      for e in node.values:
-        self._check_inner_children_have_context(e)
-    elif isinstance(node, gast.Subscript):
-      self._set_inner_child_context(node.value, ctx)
-      self._check_inner_children_have_context(node.slice)
-    elif isinstance(node, gast.BinOp):
-      self._check_inner_children_have_context(node.left)
-      self._check_inner_children_have_context(node.right)
-    elif isinstance(node, gast.UnaryOp):
-      self._check_inner_children_have_context(node.operand)
-    elif isinstance(node, (gast.Str, gast.Num)):
-      pass
-    else:
-      raise ValueError('unexpected node type "%s"' % node)
-
   def visit_Attribute(self, node):
     node = self.generic_visit(node)
     if node.attr not in self.replacements:
@@ -209,17 +184,14 @@ class ReplaceTransformer(gast.NodeTransformer):
 
     new_nodes = self._prepare_replacement(node, node.id)
 
+    if not new_nodes:
+      return new_nodes
+
     # Preserve the target context.
+    adjuster = ContextAdjuster(type(node.ctx))
     for n in new_nodes:
-      if isinstance(n, (gast.Tuple, gast.List)):
-        for e in n.elts:
-          self._set_inner_child_context(e, node.ctx)
-      if isinstance(n, gast.Attribute):
-        # For attributes, the inner Name node receives the context, while the
-        # outer ones have it set to Load.
-        self._set_inner_child_context(n, node.ctx)
-      else:
-        n.ctx = node.ctx
+      if hasattr(n, 'ctx'):
+        adjuster.visit(n)
 
     if len(new_nodes) == 1:
       new_nodes, = new_nodes
diff --git a/tensorflow/python/autograph/pyct/templates_test.py b/tensorflow/python/autograph/pyct/templates_test.py
index 54019ef5f4a20ed4a4d69d9c57c8addd12ee3c75..cdb44b822e84ad5822c78d50c2f958b1fba9ec18 100644
--- a/tensorflow/python/autograph/pyct/templates_test.py
+++ b/tensorflow/python/autograph/pyct/templates_test.py
@@ -134,19 +134,18 @@ class TemplatesTest(test.TestCase):
 
   def test_replace_expression_context(self):
     template = """
-      def test_fn(foo):
+      def test_fn():
         foo
     """
 
     node = templates.replace(
         template, foo=parser.parse_expression('a + 2 * b / -c'))[0]
-    self.assertIsInstance(node.body[0].ctx, gast.Load)
     self.assertIsInstance(node.body[0].left.ctx, gast.Load)
     self.assertIsInstance(node.body[0].right.left.right.ctx, gast.Load)
 
   def test_replace_complex_context(self):
     template = """
-      def test_fn(foo):
+      def test_fn():
         foo = 0
     """
 
@@ -160,7 +159,7 @@ class TemplatesTest(test.TestCase):
 
   def test_replace_index(self):
     template = """
-      def test_fn(foo):
+      def test_fn():
         foo = 0
     """
 
diff --git a/tensorflow/python/autograph/utils/py_func.py b/tensorflow/python/autograph/utils/py_func.py
index 11ebfb2e49f0e762b56ae2cde2b76d2e24032d72..ee8b46b52061f28eacdf2f980cccb07c889e7274 100644
--- a/tensorflow/python/autograph/utils/py_func.py
+++ b/tensorflow/python/autograph/utils/py_func.py
@@ -127,5 +127,6 @@ def wrap_py_func(f, return_dtypes, args, kwargs=None, use_dummy_return=False):
     retval = f(*f_args, **f_kwargs)
     return 1 if use_dummy_return else retval
 
-  return script_ops.py_func(f_wrapper, tensor_args, dtypes.int64
-                            if use_dummy_return else return_dtypes)
+  if use_dummy_return:
+    return_dtypes = dtypes.int32
+  return script_ops.eager_py_func(f_wrapper, tensor_args, return_dtypes)
diff --git a/tensorflow/python/autograph/utils/py_func_test.py b/tensorflow/python/autograph/utils/py_func_test.py
index 28cefd8c3edb343aa10d458b9e3a3cd55e3418c4..d17ede77142483208a0954244579b3249f0ffba5 100644
--- a/tensorflow/python/autograph/utils/py_func_test.py
+++ b/tensorflow/python/autograph/utils/py_func_test.py
@@ -32,13 +32,13 @@ class PyFuncTest(test.TestCase):
       return a + b + c
 
     with self.cached_session() as sess:
-      result = py_func.wrap_py_func(test_fn, dtypes.int64,
+      result = py_func.wrap_py_func(test_fn, dtypes.int32,
                                     (1, constant_op.constant(1), 1))
       self.assertEqual(3, self.evaluate(result))
-      result = py_func.wrap_py_func(test_fn, dtypes.int64, (1, 1, 1))
+      result = py_func.wrap_py_func(test_fn, dtypes.int32, (1, 1, 1))
       self.assertEqual(3, self.evaluate(result))
       result = py_func.wrap_py_func(
-          test_fn, dtypes.int64,
+          test_fn, dtypes.int32,
           (constant_op.constant(1), 1, constant_op.constant(1)))
       self.assertEqual(3, self.evaluate(result))
 
@@ -53,9 +53,9 @@ class PyFuncTest(test.TestCase):
       return a * b.foo
 
     with self.cached_session() as sess:
-      result = py_func.wrap_py_func(test_fn, dtypes.int64, (7, TestClass()))
+      result = py_func.wrap_py_func(test_fn, dtypes.int32, (7, TestClass()))
       self.assertEqual(35, self.evaluate(result))
-      result = py_func.wrap_py_func(test_fn, dtypes.int64,
+      result = py_func.wrap_py_func(test_fn, dtypes.int32,
                                     (constant_op.constant(7), TestClass()))
       self.assertEqual(35, self.evaluate(result))
 
@@ -70,12 +70,12 @@ class PyFuncTest(test.TestCase):
       return a * b.foo + c * d.foo
 
     with self.cached_session() as sess:
-      result = py_func.wrap_py_func(test_fn, dtypes.int64, (7, TestClass(5)), {
+      result = py_func.wrap_py_func(test_fn, dtypes.int32, (7, TestClass(5)), {
           'c': 11,
           'd': TestClass(13)
       })
       self.assertEqual(178, self.evaluate(result))
-      result = py_func.wrap_py_func(test_fn, dtypes.int64,
+      result = py_func.wrap_py_func(test_fn, dtypes.int32,
                                     (constant_op.constant(7), TestClass(5)), {
                                         'c': constant_op.constant(11),
                                         'd': TestClass(13)
diff --git a/tensorflow/python/autograph/utils/tensor_list_test.py b/tensorflow/python/autograph/utils/tensor_list_test.py
index 0bd300724972513b6b498913c77b03c48e573b5a..bbbc3bf691818d292d53999c563bcc1112d0703f 100644
--- a/tensorflow/python/autograph/utils/tensor_list_test.py
+++ b/tensorflow/python/autograph/utils/tensor_list_test.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.autograph.utils import tensor_list as tl
-from tensorflow.python.client.session import Session
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -35,7 +34,7 @@ class TensorListTest(test.TestCase):
   def _shape(self, shape_tuple):
     return constant(shape_tuple, dtypes.int32)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/117943489")
   def test_dynamic_list_append(self):
     l = []
     l = tl.dynamic_list_append(l, 1)
@@ -44,19 +43,16 @@ class TensorListTest(test.TestCase):
     l = list_ops.empty_tensor_list(self._shape(()), dtypes.int32)
     l = tl.dynamic_list_append(l, 1)
     s = list_ops.tensor_list_stack(l, element_dtype=dtypes.int32)
-    with self.cached_session() as sess:
-      self.assertAllEqual(self.evaluate(s), [1])
+    self.assertAllEqual(s, [1])
 
     l = tensor_array_ops.TensorArray(dtypes.int32, size=0, dynamic_size=True)
     l = tl.dynamic_list_append(l, 1)
     s = l.stack()
-    with self.cached_session() as sess:
-      self.assertAllEqual(self.evaluate(s), [1])
+    self.assertAllEqual(s, [1])
 
     l = tl.TensorList(self._shape(()), dtypes.int32)
     l = tl.dynamic_list_append(l, 1)
-    with self.cached_session() as sess:
-      self.assertAllEqual(sess.run(l[0]), 1)
+    self.assertAllEqual(l[0], 1)
 
   def test_list_append_python(self):
     with context.eager_mode():
@@ -94,13 +90,12 @@ class TensorListTest(test.TestCase):
     c3 = l.count()
     a2 = l.pop()
     c4 = l.count()
-    with Session() as sess:
-      c1, c2, c3, c4, a, a2 = self.evaluate([c1, c2, c3, c4, a, a2])
-      self.assertEqual(c1, 1)
-      self.assertEqual(c2, 2)
-      self.assertEqual(c3, 1)
-      self.assertEqual(c4, 0)
-      self.assertEqual(a, a2)
+    c1, c2, c3, c4, a, a2 = self.evaluate([c1, c2, c3, c4, a, a2])
+    self.assertEqual(c1, 1)
+    self.assertEqual(c2, 2)
+    self.assertEqual(c3, 1)
+    self.assertEqual(c4, 0)
+    self.assertEqual(a, a2)
 
   def test_list_index_tf(self):
     a = constant(3.0)
@@ -110,10 +105,9 @@ class TensorListTest(test.TestCase):
     l0 = l[0]
     l[0] = b
     l1 = l[0]
-    with self.cached_session() as sess:
-      l0, l1, a, b = self.evaluate([l0, l1, a, b])
-      self.assertEqual(l0, a)
-      self.assertEqual(l1, b)
+    l0, l1, a, b = self.evaluate([l0, l1, a, b])
+    self.assertEqual(l0, a)
+    self.assertEqual(l1, b)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/client/session_test.py b/tensorflow/python/client/session_test.py
index 347833ce8fd095eb4acdef4a8a7e09046b554ba3..c4a118a41406afc52586553b1d3f0b446005c46d 100644
--- a/tensorflow/python/client/session_test.py
+++ b/tensorflow/python/client/session_test.py
@@ -312,6 +312,7 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertEqual(None, res[2])
       self.assertEqual(44.0, res[1])
 
+  @test_util.run_v1_only('b/120545219')
   def testFetchAttrs(self):
     if attr is None:
       self.skipTest('attr module is unavailable.')
@@ -340,6 +341,7 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(val3, result.field1)
       self.assertAllEqual(val2, result.field2)
 
+  @test_util.run_v1_only('b/120545219')
   def testFetchNestedAttrs(self):
     if attr is None:
       self.skipTest('attr module is unavailable.')
@@ -1024,6 +1026,7 @@ class SessionTest(test_util.TensorFlowTestCase):
       fed_c_val = c.eval(feed_dict={a.name: [[4.0, 4.0]]})
       self.assertAllEqual([[16.0, 16.0, 16.0]], fed_c_val)
 
+  @test_util.run_v1_only('b/120545219')
   def testOperationRunMethod(self):
     with session.Session():
       a = constant_op.constant(1.0, shape=[1, 2])
@@ -1154,6 +1157,7 @@ class SessionTest(test_util.TensorFlowTestCase):
         else:
           importer.import_graph_def(gdef, name='import')
 
+  @test_util.run_v1_only('b/120545219')
   def testParallelRunAndSingleBuild(self):
     with session.Session() as sess:
       c = constant_op.constant(5.0)
@@ -1174,6 +1178,7 @@ class SessionTest(test_util.TensorFlowTestCase):
       for t in threads:
         t.join()
 
+  @test_util.run_v1_only('b/120545219')
   def testParallelRunAndParallelBuild(self):
     with session.Session() as sess:
       c = constant_op.constant(5.0)
@@ -1274,6 +1279,7 @@ class SessionTest(test_util.TensorFlowTestCase):
       with self.assertRaisesRegexp(RuntimeError, 'The Session graph is empty.'):
         sess.run({})
 
+  @test_util.run_v1_only('b/120545219')
   def testNotEntered(self):
     # pylint: disable=protected-access
     self.assertEqual(ops._default_session_stack.get_default(), None)
@@ -1289,6 +1295,7 @@ class SessionTest(test_util.TensorFlowTestCase):
           ValueError, lambda e: 'No default session is registered.' in str(e)):
         c_2.eval()
 
+  @test_util.run_v1_only('b/120545219')
   def testInteractive(self):
     with ops.device('/cpu:0'):
       sess = session.InteractiveSession()
@@ -1301,6 +1308,7 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertAllEqual([[24.0]], e.eval())
       sess.close()
 
+  @test_util.run_v1_only('b/120545219')
   def testMultipleInteractiveSessionsWarning(self):
     # Reinitialize the global state to ensure that the expected warnings will
     # be emitted.
@@ -1328,6 +1336,7 @@ class SessionTest(test_util.TensorFlowTestCase):
     sess2.close()
     sess.close()
 
+  @test_util.run_v1_only('b/120545219')
   def testInteractivePlacePrunedGraph(self):
     sess = session.InteractiveSession()
 
@@ -1349,6 +1358,7 @@ class SessionTest(test_util.TensorFlowTestCase):
       a.eval()
     sess.close()
 
+  @test_util.run_v1_only('b/120545219')
   def testDefaultSessionPlacePrunedGraph(self):
     sess = session.Session()
 
@@ -1769,9 +1779,11 @@ class SessionTest(test_util.TensorFlowTestCase):
     sess.run(a, run_metadata=run_metadata)
     self.assertEqual(len(run_metadata.partition_graphs), 0)
 
+  @test_util.run_v1_only('b/120545219')
   def testOutputPartitionGraphsDirect(self):
     self.runTestOutputPartitionGraphs(session.Session())
 
+  @test_util.run_v1_only('b/120545219')
   def testOutputPartitionGraphsDistributed(self):
     server = server_lib.Server.create_local_server()
     self.runTestOutputPartitionGraphs(session.Session(server.target))
@@ -1796,6 +1808,7 @@ class SessionTest(test_util.TensorFlowTestCase):
     del sess1
     del sess2
 
+  @test_util.run_v1_only('b/120545219')
   def testAsDefault(self):
     c = constant_op.constant(37)
     sess = session.Session()
@@ -1821,6 +1834,7 @@ class SessionTest(test_util.TensorFlowTestCase):
     with self.assertRaisesRegexp(TypeError, 'graph must be a tf.Graph'):
       session.Session(graph=37)
 
+  @test_util.run_v1_only('b/120545219')
   def testTimeoutWithShortOperations(self):
     num_epochs = 5
     q = data_flow_ops.FIFOQueue(capacity=50, dtypes=[dtypes.int32], shapes=[()])
@@ -1834,6 +1848,7 @@ class SessionTest(test_util.TensorFlowTestCase):
         sess.run(enqueue_op)
       self.assertEqual(sess.run(q.size()), num_epochs * 2)
 
+  @test_util.run_v1_only('b/120545219')
   def testRegisterFetchAndFeedConversionFunctions(self):
 
     class SquaredTensor(object):
@@ -1865,6 +1880,7 @@ class SessionTest(test_util.TensorFlowTestCase):
       squared_eval = sess.partial_run(partial_run, squared_tensor)
       self.assertAllClose(np2 * np2, squared_eval)
 
+  @test_util.run_v1_only('b/120545219')
   def testDefaultLogDevicePlacement(self):
 
     class CaptureStderr(str):
@@ -1914,6 +1930,7 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertTrue('/job:local/replica:0/task:0/device:CPU:0' in str(log),
                       str(log))
 
+  @test_util.run_v1_only('b/120545219')
   def testLocalMasterSessionTimeout(self):
     # Test that the timeout passed in a config to the session works correctly.
     config = config_pb2.ConfigProto(operation_timeout_in_ms=1000)
@@ -1927,6 +1944,7 @@ class SessionTest(test_util.TensorFlowTestCase):
       with self.assertRaises(errors.DeadlineExceededError):
         sess.run(dequeued_t)
 
+  @test_util.run_v1_only('b/120545219')
   def testDefaultServerTimeout(self):
     # Test that the default server config timeout gets used when no Session
     # config is provided.
@@ -1952,9 +1970,11 @@ class SessionTest(test_util.TensorFlowTestCase):
     with self.assertRaisesOpError('has inputs from different frames'):
       sess.run(res, feed_dict={data: 1.0})
 
+  @test_util.run_v1_only('b/120545219')
   def testBuildGraphErrorDirect(self):
     self.runTestBuildGraphError(session.Session())
 
+  @test_util.run_v1_only('b/120545219')
   def testBuildGraphErrorDist(self):
     server = server_lib.Server.create_local_server()
     self.runTestBuildGraphError(session.Session(server.target))
@@ -1993,9 +2013,11 @@ class SessionTest(test_util.TensorFlowTestCase):
       result = sess.run(f)
       self.assertEqual(result, 2.0)
 
+  @test_util.run_v1_only('b/120545219')
   def testAddFunctionToSession(self):
     self.runTestAddFunctionToSession()
 
+  @test_util.run_v1_only('b/120545219')
   def testAddFunctionToGrpcSession(self):
     server = server_lib.Server.create_local_server()
     self.runTestAddFunctionToSession(server.target)
@@ -2009,6 +2031,7 @@ class SessionTest(test_util.TensorFlowTestCase):
     with session.Session():
       pass
 
+  @test_util.run_v1_only('b/120545219')
   def testAutoConvertAndCheckData(self):
     with self.cached_session() as sess:
       a = array_ops.placeholder(dtype=dtypes.string)
diff --git a/tensorflow/python/compat/BUILD b/tensorflow/python/compat/BUILD
index e0a1c8e0571879e9661cdb0714cc6a794b7ea455..9f2ce8c676e77480106c525bdc9c6440c599acec 100644
--- a/tensorflow/python/compat/BUILD
+++ b/tensorflow/python/compat/BUILD
@@ -9,7 +9,10 @@ py_library(
     srcs = ["compat.py"],
     srcs_version = "PY2AND3",
     visibility = ["//tensorflow:internal"],
-    deps = ["//tensorflow/python:util"],
+    deps = [
+        "//tensorflow/python:tf2",
+        "//tensorflow/python:util",
+    ],
 )
 
 tf_py_test(
diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index f83af0a583390282ea76fa79c42234bd8d99362a..dadf72b9ab0a8250f419b1e36065ae5dae09d0a7 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -23,10 +23,16 @@ from __future__ import division
 from __future__ import print_function
 
 import datetime
+
+from tensorflow.python import tf2
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import variable_scope
+
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util.tf_export import tf_export
 
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 11, 30)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 12, 15)
 
 
 @tf_export("compat.forward_compatible")
@@ -132,3 +138,40 @@ def forward_compatibility_horizon(year, month, day):
     yield
   finally:
     _FORWARD_COMPATIBILITY_HORIZON = old_compat_date
+
+
+@tf_export(v1=["enable_v2_behavior"])
+def enable_v2_behavior():
+  """Enables TensorFlow 2.x behaviors.
+
+  This function can be called at the beginning of the program (before `Tensors`,
+  `Graphs` or other structures have been created, and before devices have been
+  initialized. It switches all global behaviors that are different between
+  TensorFlow 1.x and 2.x to behave as intended for 2.x.
+
+  This function is called in the main TensorFlow `__init__.py` file, user should
+  not need to call it, except during complex migrations.
+  """
+  tf2.enable()  # Switches TensorArrayV2 and control flow V2
+  ops.enable_eager_execution()
+  tensor_shape.enable_v2_tensorshape()  # Also switched by tf2
+  variable_scope.enable_resource_variables()
+
+
+@tf_export(v1=["disable_v2_behavior"])
+def disable_v2_behavior():
+  """Disables TensorFlow 2.x behaviors.
+
+  This function can be called at the beginning of the program (before `Tensors`,
+  `Graphs` or other structures have been created, and before devices have been
+  initialized. It switches all global behaviors that are different between
+  TensorFlow 1.x and 2.x to behave as intended for 1.x.
+
+  User can call this function to disable 2.x behavior during complex migrations.
+  """
+  tf2.disable()  # Switches TensorArrayV2 and control flow V2
+  ops.disable_eager_execution()
+  tensor_shape.disable_v2_tensorshape()  # Also switched by tf2
+  variable_scope.disable_resource_variables()
+
+
diff --git a/tensorflow/python/data/__init__.py b/tensorflow/python/data/__init__.py
index 7536ba668abf5f3aa62fb73921d14e7ffe5b8c19..75ba88f3034632bd925c7736fe7af42cd3aa274f 100644
--- a/tensorflow/python/data/__init__.py
+++ b/tensorflow/python/data/__init__.py
@@ -24,6 +24,8 @@ from __future__ import print_function
 # pylint: disable=unused-import
 from tensorflow.python.data import experimental
 from tensorflow.python.data.ops.dataset_ops import Dataset
+from tensorflow.python.data.ops.dataset_ops import make_initializable_iterator
+from tensorflow.python.data.ops.dataset_ops import make_one_shot_iterator
 from tensorflow.python.data.ops.iterator_ops import Iterator
 from tensorflow.python.data.ops.readers import FixedLengthRecordDataset
 from tensorflow.python.data.ops.readers import TextLineDataset
diff --git a/tensorflow/python/data/benchmarks/BUILD b/tensorflow/python/data/benchmarks/BUILD
index 5b0500eae1970b4f183737d4fc0cd4171dd1ea15..fd0eca9dd7012ce44435dbbf6749121022c7ba29 100644
--- a/tensorflow/python/data/benchmarks/BUILD
+++ b/tensorflow/python/data/benchmarks/BUILD
@@ -48,6 +48,20 @@ py_test(
     ],
 )
 
+py_test(
+    name = "list_files_benchmark",
+    srcs = ["list_files_benchmark.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_test(
     name = "map_benchmark",
     srcs = ["map_benchmark.py"],
diff --git a/tensorflow/python/data/benchmarks/batch_benchmark.py b/tensorflow/python/data/benchmarks/batch_benchmark.py
index b61ac86eb52a7f9318232a9073cfb0c0da73fc8d..e063849f70381b8244a8a916353a3cc3be15c230 100644
--- a/tensorflow/python/data/benchmarks/batch_benchmark.py
+++ b/tensorflow/python/data/benchmarks/batch_benchmark.py
@@ -42,7 +42,7 @@ class BatchBenchmark(test.Benchmark):
 
     dataset = dataset_ops.Dataset.from_tensors(sparse_placeholder).repeat(
         ).batch(batch_size_placeholder)
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     next_element = iterator.get_next()
 
     for non_zeros_per_row in non_zeros_per_row_values:
diff --git a/tensorflow/python/data/benchmarks/filter_benchmark.py b/tensorflow/python/data/benchmarks/filter_benchmark.py
index b9acdc72273762ee34b4fc0f4b7050eda467c081..a6d86fe2218aec835e4f09f0c8c708596cf511f8 100644
--- a/tensorflow/python/data/benchmarks/filter_benchmark.py
+++ b/tensorflow/python/data/benchmarks/filter_benchmark.py
@@ -36,7 +36,7 @@ class FilterBenchmark(test.Benchmark):
     with ops.Graph().as_default():
       dataset = (
           dataset_ops.Dataset.from_tensors(True).repeat(None).filter(predicate))
-      iterator = dataset.make_one_shot_iterator()
+      iterator = dataset_ops.make_one_shot_iterator(dataset)
       next_element = iterator.get_next()
 
       with session.Session() as sess:
diff --git a/tensorflow/python/data/benchmarks/from_tensor_slices_benchmark.py b/tensorflow/python/data/benchmarks/from_tensor_slices_benchmark.py
index 74a2d271addd444954cd5bcc1c7b05b928b780cb..d7f1a4e7af5b00569e71900df8f2a7486d7c813b 100644
--- a/tensorflow/python/data/benchmarks/from_tensor_slices_benchmark.py
+++ b/tensorflow/python/data/benchmarks/from_tensor_slices_benchmark.py
@@ -41,7 +41,7 @@ class FromTensorSlicesBenchmark(test.Benchmark):
     dataset = (
         dataset_ops.Dataset.from_tensor_slices(input_data)
         .repeat(num_epochs + 1).batch(batch_size))
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     next_element = iterator.get_next()
 
     with session.Session() as sess:
@@ -77,7 +77,7 @@ class FromTensorSlicesBenchmark(test.Benchmark):
     dataset = (
         dataset_ops.Dataset.from_tensor_slices(input_data)
         .repeat(num_epochs + 1).batch(batch_size))
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     next_element = iterator.get_next()
 
     with session.Session() as sess:
@@ -116,7 +116,7 @@ class FromTensorSlicesBenchmark(test.Benchmark):
     dataset = (
         dataset_ops.Dataset.from_tensor_slices(input_data.reshape(100, 100))
         .repeat(num_epochs + 1))
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     next_element = iterator.get_next()
 
     with session.Session() as sess:
@@ -154,7 +154,7 @@ class FromTensorSlicesBenchmark(test.Benchmark):
     dataset = (
         dataset_ops.Dataset.from_tensor_slices(input_data).batch(batch_size)
         .cache().repeat(num_epochs + 1))
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     next_element = iterator.get_next()
 
     with session.Session() as sess:
diff --git a/tensorflow/python/data/benchmarks/list_files_benchmark.py b/tensorflow/python/data/benchmarks/list_files_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..0dc21471129d5ca288a68c957e424035fea9dd66
--- /dev/null
+++ b/tensorflow/python/data/benchmarks/list_files_benchmark.py
@@ -0,0 +1,96 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Benchmarks for `tf.data.Dataset.list_files()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from os import path
+from os import makedirs
+import shutil
+import time
+import tempfile
+
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import test
+
+
+class ListFilesBenchmark(test.Benchmark):
+  """Benchmarks for `tf.data.Dataset.list_files()`."""
+
+  def benchmarkNestedDirectories(self):
+    tmp_dir = tempfile.mkdtemp()
+    width = 1024
+    depth = 16
+    for i in range(width):
+      for j in range(depth):
+        new_base = path.join(tmp_dir, str(i),
+                             *[str(dir_name) for dir_name in range(j)])
+        makedirs(new_base)
+        child_files = ['a.py', 'b.pyc'] if j < depth - 1 else ['c.txt', 'd.log']
+        for f in child_files:
+          filename = path.join(new_base, f)
+          open(filename, 'w').close()
+    patterns = [
+        path.join(tmp_dir, path.join(*['**'
+                                       for _ in range(depth)]), suffix)
+        for suffix in ['*.txt', '*.log']
+    ]
+    deltas = []
+    iters = 3
+    for _ in range(iters):
+      with ops.Graph().as_default():
+        dataset = dataset_ops.Dataset.list_files(patterns)
+        next_element = dataset.make_one_shot_iterator().get_next()
+        with session.Session() as sess:
+          sub_deltas = []
+          while True:
+            try:
+              start = time.time()
+              sess.run(next_element)
+              end = time.time()
+              sub_deltas.append(end - start)
+            except errors.OutOfRangeError:
+              break
+          deltas.append(sub_deltas)
+    median_deltas = np.median(deltas, axis=0)
+    print('Nested directory size (width*depth): %d*%d Median wall time: '
+          '%fs (read first filename), %fs (read second filename), avg %fs'
+          ' (read %d more filenames)' %
+          (width, depth, median_deltas[0], median_deltas[1],
+           np.average(median_deltas[2:]), len(median_deltas) - 2))
+    self.report_benchmark(
+        iters=iters,
+        wall_time=np.sum(median_deltas),
+        extras={
+            'read first file:':
+                median_deltas[0],
+            'read second file:':
+                median_deltas[1],
+            'avg time for reading %d more filenames:' %
+            (len(median_deltas) - 2):
+                np.average(median_deltas[2:])
+        },
+        name='nested_directory(%d*%d)' % (width, depth))
+    shutil.rmtree(tmp_dir, ignore_errors=True)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/data/benchmarks/map_benchmark.py b/tensorflow/python/data/benchmarks/map_benchmark.py
index 48294eeb89af61ad48bfa7983dcbb5b17b004a4b..65d945cdae87aedad55351cfb63ad06e3521d570 100644
--- a/tensorflow/python/data/benchmarks/map_benchmark.py
+++ b/tensorflow/python/data/benchmarks/map_benchmark.py
@@ -58,7 +58,7 @@ class MapBenchmark(test.Benchmark):
                 dataset,
                 map_fn,
                 use_inter_op_parallelism=use_inter_op_parallelism)
-          iterator = dataset.make_one_shot_iterator()
+          iterator = dataset_ops.make_one_shot_iterator(dataset)
           next_element = iterator.get_next()
 
           with session.Session() as sess:
@@ -108,7 +108,7 @@ class MapBenchmark(test.Benchmark):
               dataset,
               map_fn,
               use_inter_op_parallelism=use_inter_op_parallelism)
-          iterator = dataset.make_one_shot_iterator()
+          iterator = dataset_ops.make_one_shot_iterator(dataset)
           next_element = iterator.get_next()
 
           with session.Session() as sess:
diff --git a/tensorflow/python/data/benchmarks/range_benchmark.py b/tensorflow/python/data/benchmarks/range_benchmark.py
index 25f63b79a26e37bd381df7c1f3c0ae91667a70bf..a5020e2873063ea8b01801c0889a23cb60601ec3 100644
--- a/tensorflow/python/data/benchmarks/range_benchmark.py
+++ b/tensorflow/python/data/benchmarks/range_benchmark.py
@@ -39,7 +39,7 @@ class RangeBenchmark(test.Benchmark):
     # costs).
     dataset = dataset_ops.Dataset.range(num_elements).skip(
         num_elements - 1).take(1).with_options(options)
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     next_element = iterator.get_next()
 
     with session.Session() as sess:
diff --git a/tensorflow/python/data/experimental/__init__.py b/tensorflow/python/data/experimental/__init__.py
index 8a1048513af379a73162d6a0629274c30a86f8c0..ffc2e5ef5fa239beada67687ec700437b2fc44ba 100644
--- a/tensorflow/python/data/experimental/__init__.py
+++ b/tensorflow/python/data/experimental/__init__.py
@@ -25,17 +25,24 @@ See [Importing Data](https://tensorflow.org/guide/datasets) for an overview.
 @@Counter
 @@CheckpointInputPipelineHook
 @@CsvDataset
+@@DatasetStructure
+@@NestedStructure
 @@OptimizationOptions
 @@Optional
+@@OptionalStructure
 @@RandomDataset
 @@Reducer
+@@SparseTensorStructure
 @@SqlDataset
 @@StatsAggregator
 @@StatsOptions
+@@Structure
 @@TFRecordWriter
+@@TensorStructure
 @@ThreadingOptions
 
 @@bucket_by_sequence_length
+@@cardinality
 @@choose_from_datasets
 @@copy_to_device
 @@dense_to_sparse_batch
@@ -62,6 +69,8 @@ See [Importing Data](https://tensorflow.org/guide/datasets) for an overview.
 @@unique
 
 @@AUTOTUNE
+@@INFINITE_CARDINALITY
+@@UNKNOWN_CARDINALITY
 """
 
 from __future__ import absolute_import
@@ -73,6 +82,9 @@ from __future__ import print_function
 from tensorflow.python.data.experimental.ops.batching import dense_to_sparse_batch
 from tensorflow.python.data.experimental.ops.batching import map_and_batch
 from tensorflow.python.data.experimental.ops.batching import unbatch
+from tensorflow.python.data.experimental.ops.cardinality import cardinality
+from tensorflow.python.data.experimental.ops.cardinality import INFINITE as INFINITE_CARDINALITY
+from tensorflow.python.data.experimental.ops.cardinality import UNKNOWN as UNKNOWN_CARDINALITY
 from tensorflow.python.data.experimental.ops.counter import Counter
 from tensorflow.python.data.experimental.ops.enumerate_ops import enumerate_dataset
 from tensorflow.python.data.experimental.ops.error_ops import ignore_errors
@@ -106,8 +118,14 @@ from tensorflow.python.data.experimental.ops.stats_options import StatsOptions
 from tensorflow.python.data.experimental.ops.threading_options import ThreadingOptions
 from tensorflow.python.data.experimental.ops.unique import unique
 from tensorflow.python.data.experimental.ops.writers import TFRecordWriter
+from tensorflow.python.data.ops.dataset_ops import DatasetStructure
 from tensorflow.python.data.ops.iterator_ops import get_next_as_optional
 from tensorflow.python.data.ops.optional_ops import Optional
+from tensorflow.python.data.ops.optional_ops import OptionalStructure
+from tensorflow.python.data.util.structure import NestedStructure
+from tensorflow.python.data.util.structure import SparseTensorStructure
+from tensorflow.python.data.util.structure import Structure
+from tensorflow.python.data.util.structure import TensorStructure
 # pylint: enable=unused-import
 
 from tensorflow.python.util.all_util import remove_undocumented
diff --git a/tensorflow/python/data/experimental/benchmarks/BUILD b/tensorflow/python/data/experimental/benchmarks/BUILD
index 8175116c6eddf4a754202a2fbb22499c79a3f5b8..651dfd6857af319135c3ba594a48b824bc9f3b46 100644
--- a/tensorflow/python/data/experimental/benchmarks/BUILD
+++ b/tensorflow/python/data/experimental/benchmarks/BUILD
@@ -58,6 +58,22 @@ py_test(
     ],
 )
 
+py_test(
+    name = "map_defun_benchmark",
+    srcs = ["map_defun_benchmark.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:functional_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:tensor_spec",
+        "//tensorflow/python/data/experimental/ops:map_defun",
+        "//tensorflow/python/eager:function",
+    ],
+)
+
 py_test(
     name = "map_vectorization_benchmark",
     srcs = ["map_vectorization_benchmark.py"],
@@ -108,6 +124,20 @@ py_test(
     ],
 )
 
+py_test(
+    name = "rejection_resample_benchmark",
+    srcs = ["rejection_resample_benchmark.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python/data/experimental/ops:resampling",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+    ],
+)
+
 py_test(
     name = "unbatch_benchmark",
     srcs = ["unbatch_benchmark.py"],
diff --git a/tensorflow/python/data/experimental/benchmarks/autotune_benchmark.py b/tensorflow/python/data/experimental/benchmarks/autotune_benchmark.py
index b00e918338339493d88ce648ed33b435ab2da692..e713494b526320f2c18774c7198406521c373033 100644
--- a/tensorflow/python/data/experimental/benchmarks/autotune_benchmark.py
+++ b/tensorflow/python/data/experimental/benchmarks/autotune_benchmark.py
@@ -39,7 +39,7 @@ class AutotuneBenchmark(test.Benchmark):
                                                                1))).repeat()
     dataset = dataset.map(
         math_ops.matmul, num_parallel_calls=optimization.AUTOTUNE)
-    iterator = dataset.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
     get_next = iterator.get_next()
 
     deltas = []
@@ -76,7 +76,7 @@ class AutotuneBenchmark(test.Benchmark):
     options = dataset_ops.Options()
     options.experimental_numa_aware = numa_aware
     dataset = dataset.with_options(options)
-    iterator = dataset.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
     get_next = iterator.get_next()
 
     deltas = []
@@ -108,7 +108,7 @@ class AutotuneBenchmark(test.Benchmark):
         lambda _: dataset,
         cycle_length=10,
         num_parallel_calls=optimization.AUTOTUNE)
-    iterator = dataset.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
     get_next = iterator.get_next()
 
     deltas = []
@@ -134,34 +134,34 @@ class AutotuneBenchmark(test.Benchmark):
     a = (np.random.rand(1, 8 * k), np.random.rand(8 * k, 1))
     b = (np.random.rand(1, 4 * k), np.random.rand(4 * k, 1))
     c = (np.random.rand(1, 2 * k), np.random.rand(2 * k, 1))
-    dataset = dataset_ops.Dataset.from_tensors((a, b, c)).repeat()
+    dataset_a = dataset_ops.Dataset.from_tensors(a).repeat()
+    dataset_b = dataset_ops.Dataset.from_tensors(b).repeat()
+    dataset_c = dataset_ops.Dataset.from_tensors(c).repeat()
 
-    def f1(a, b, c):
-      x, y = a
-      return math_ops.matmul(x, y), b, c
+    def f1(x, y):
+      return math_ops.matmul(x, y)
 
-    def f2(a, b, c):
+    def f2(a, b):
       x, y = b
-      return a, math_ops.matmul(x, y), c
-
-    def f3(a, b, c):
-      x, y = c
-      return a, b, math_ops.matmul(x, y)
+      return a, math_ops.matmul(x, y)
 
+    dataset = dataset_a
     dataset = dataset.map(f1, num_parallel_calls=optimization.AUTOTUNE)
     dataset = dataset_ops.Dataset.range(1).repeat().interleave(
         lambda _: dataset,
         num_parallel_calls=optimization.AUTOTUNE,
         cycle_length=2)
 
+    dataset = dataset_ops.Dataset.zip((dataset, dataset_b))
     dataset = dataset.map(f2, num_parallel_calls=optimization.AUTOTUNE)
     dataset = dataset_ops.Dataset.range(1).repeat().interleave(
         lambda _: dataset,
         num_parallel_calls=optimization.AUTOTUNE,
         cycle_length=2)
 
-    dataset = dataset.map(f3, num_parallel_calls=optimization.AUTOTUNE)
-    iterator = dataset.make_one_shot_iterator()
+    dataset = dataset_ops.Dataset.zip((dataset, dataset_c))
+    dataset = dataset.map(f2, num_parallel_calls=optimization.AUTOTUNE)
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
     get_next = iterator.get_next()
 
     deltas = []
diff --git a/tensorflow/python/data/experimental/benchmarks/csv_dataset_benchmark.py b/tensorflow/python/data/experimental/benchmarks/csv_dataset_benchmark.py
index 7eebf49c38247c750541a261c98ea6fd74f850cf..03345ce4e6648fecf47348806c55adba10aeed5a 100644
--- a/tensorflow/python/data/experimental/benchmarks/csv_dataset_benchmark.py
+++ b/tensorflow/python/data/experimental/benchmarks/csv_dataset_benchmark.py
@@ -27,6 +27,7 @@ import numpy as np
 
 from tensorflow.python.client import session
 from tensorflow.python.data.experimental.ops import readers
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import readers as core_readers
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.platform import gfile
@@ -64,7 +65,7 @@ class CsvDatasetBenchmark(test.Benchmark):
     dataset = dataset.skip(self._num_per_iter - 1)
     deltas = []
     for _ in range(10):
-      next_element = dataset.make_one_shot_iterator().get_next()
+      next_element = dataset_ops.make_one_shot_iterator(dataset).get_next()
       with session.Session() as sess:
         start = time.time()
         # NOTE: This depends on the underlying implementation of skip, to have
diff --git a/tensorflow/python/data/experimental/benchmarks/map_and_batch_benchmark.py b/tensorflow/python/data/experimental/benchmarks/map_and_batch_benchmark.py
index 1e8dd0f63df22448c642b58168c0193b6af7f110..b17f2bcd12b2b78c97e7c390d919331ac4ef5386 100644
--- a/tensorflow/python/data/experimental/benchmarks/map_and_batch_benchmark.py
+++ b/tensorflow/python/data/experimental/benchmarks/map_and_batch_benchmark.py
@@ -26,6 +26,7 @@ import numpy as np
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.data.experimental.ops import batching
+from tensorflow.python.data.experimental.ops.optimization_options import OptimizationOptions
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -54,7 +55,7 @@ class MapAndBatchBenchmark(test.Benchmark):
 
     dataset = dataset.apply(batching.map_and_batch(
         lambda _: dense_value, batch_size_placeholder))
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     next_element = iterator.get_next()
 
     for shape in shapes:
@@ -139,47 +140,49 @@ class MapAndBatchBenchmark(test.Benchmark):
 
         num_iters = 1024 // (
             (element_size * batch_size) // min(num_calls, inter_op))
-        dataset = make_base_dataset(element_size)
-        chained_dataset = dataset.map(
+        fused_dataset = make_base_dataset(element_size)
+        fused_dataset = fused_dataset.map(
             math_ops.matmul,
             num_parallel_calls=num_calls).batch(batch_size=batch_size)
-        chained_iterator = chained_dataset.make_one_shot_iterator()
-        chained_get_next = chained_iterator.get_next()
 
-        chained_deltas = []
+        fused_iterator = dataset_ops.make_one_shot_iterator(fused_dataset)
+        fused_get_next = fused_iterator.get_next()
+
+        fused_deltas = []
         with session.Session(
             config=config_pb2.ConfigProto(
                 inter_op_parallelism_threads=inter_op,
                 use_per_session_threads=True)) as sess:
+
           for _ in range(5):
-            sess.run(chained_get_next.op)
+            sess.run(fused_get_next.op)
           for _ in range(num_iters):
             start = time.time()
-            sess.run(chained_get_next.op)
+            sess.run(fused_get_next.op)
             end = time.time()
-            chained_deltas.append(end - start)
+            fused_deltas.append(end - start)
 
-        fused_dataset = dataset.apply(
-            batching.map_and_batch(
-                math_ops.matmul,
-                num_parallel_calls=num_calls,
-                batch_size=batch_size))
-        fused_iterator = fused_dataset.make_one_shot_iterator()
-        fused_get_next = fused_iterator.get_next()
+        # `map_and_batch_fusion` is optimized by default. To get the chained
+        # dataset, with have to disable it.
+        options = dataset_ops.Options()
+        options.experimental_optimization = OptimizationOptions()
+        options.experimental_optimization.map_and_batch_fusion = False
+        chained_dataset = fused_dataset.with_options(options)
+        chained_iterator = dataset_ops.make_one_shot_iterator(chained_dataset)
+        chained_get_next = chained_iterator.get_next()
 
-        fused_deltas = []
+        chained_deltas = []
         with session.Session(
             config=config_pb2.ConfigProto(
                 inter_op_parallelism_threads=inter_op,
                 use_per_session_threads=True)) as sess:
-
           for _ in range(5):
-            sess.run(fused_get_next.op)
+            sess.run(chained_get_next.op)
           for _ in range(num_iters):
             start = time.time()
-            sess.run(fused_get_next.op)
+            sess.run(chained_get_next.op)
             end = time.time()
-            fused_deltas.append(end - start)
+            chained_deltas.append(end - start)
 
         print(
             "batch size: %d, num parallel calls: %d, inter-op parallelism: %d, "
diff --git a/tensorflow/python/data/experimental/benchmarks/map_defun_benchmark.py b/tensorflow/python/data/experimental/benchmarks/map_defun_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..49297ca7c58f4ce3127e6e64944a09d0837cea3f
--- /dev/null
+++ b/tensorflow/python/data/experimental/benchmarks/map_defun_benchmark.py
@@ -0,0 +1,75 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Benchmarks for MapDefunOp."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+
+from tensorflow.python.client import session
+from tensorflow.python.data.experimental.ops import map_defun
+from tensorflow.python.eager import function
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import tensor_spec
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import functional_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+# TODO(b/119837791): Add eager benchmarks too.
+class MapDefunBenchmark(test.Benchmark):
+  """Benchmarks for MapDefunOp."""
+
+  def _run(self, op, name=None, num_iters=3000):
+    with session.Session() as sess:
+      for _ in range(5):
+        sess.run(op)
+      start = time.time()
+      for _ in range(num_iters):
+        sess.run(op)
+      end = time.time()
+      mean_us = (end - start) * 1e6 / num_iters
+      self.report_benchmark(
+          name=name,
+          iters=num_iters,
+          wall_time=mean_us,
+          extras={"examples_per_sec": num_iters / (end - start)})
+
+  def benchmarkDefunVsMapFn(self):
+    """Benchmarks to compare the performance of MapDefun vs tf.map_fn."""
+
+    @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.int32)])
+    def defun(x):
+      return array_ops.identity(x)
+
+    def map_fn(x):
+      return array_ops.identity(x)
+
+    base = math_ops.range(100)
+    for input_size in [10, 100, 1000, 10000]:
+      num_iters = 100000 // input_size
+      map_defun_op = map_defun.map_defun(defun, [base], [dtypes.int32], [()])
+      map_fn_op = functional_ops.map_fn(map_fn, base)
+
+      self._run(
+          map_defun_op, "with_defun_size_%d" % input_size, num_iters=num_iters)
+      self._run(
+          map_fn_op, "without_defun_size_%d" % input_size, num_iters=num_iters)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/benchmarks/map_vectorization_benchmark.py b/tensorflow/python/data/experimental/benchmarks/map_vectorization_benchmark.py
index 0c3ac8b37154dd878929852d8d6875aa791fa49d..a60ba0a857ee18e88e912fc25000a479e4a86e72 100644
--- a/tensorflow/python/data/experimental/benchmarks/map_vectorization_benchmark.py
+++ b/tensorflow/python/data/experimental/benchmarks/map_vectorization_benchmark.py
@@ -24,6 +24,7 @@ import numpy as np
 from tensorflow.core.example import example_pb2
 from tensorflow.core.example import feature_pb2
 from tensorflow.python.client import session
+from tensorflow.python.data.experimental.ops import optimization_options
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
 from tensorflow.python.framework import constant_op
@@ -115,14 +116,24 @@ class MapVectorizationBenchmark(test.Benchmark):
   def _compare(self, input_dataset, map_fn, batch_size, input_size, str_id):
     num_elems = int(np.sum([np.prod(x) for x in input_size]))
     name_template = "{}__batch_size_{}_input_element_size_{}_{}"
-    unoptimized = input_dataset.map(map_fn).batch(batch_size)
-    unoptimized_op = unoptimized.make_one_shot_iterator().get_next()
 
-    optimized = input_dataset.map(map_fn).batch(batch_size)
+    base_dataset = input_dataset.map(map_fn).batch(batch_size)
+
     options = dataset_ops.Options()
-    options.experimental_map_vectorization = True
-    optimized = optimized.with_options(options)
-    optimized_op = optimized.make_one_shot_iterator().get_next()
+    opt_options = optimization_options.OptimizationOptions()
+    # Disable default map_and_batch_fusion optimization
+    opt_options.map_and_batch_fusion = False
+    options.experimental_optimization = opt_options
+    base_dataset = base_dataset.with_options(options)
+
+    unoptimized_op = dataset_ops.make_one_shot_iterator(base_dataset).get_next()
+
+    optimized_options = dataset_ops.Options()
+    opt_options = optimization_options.OptimizationOptions()
+    opt_options.map_vectorization = True
+    optimized_options.experimental_optimization = opt_options
+    optimized = base_dataset.with_options(optimized_options)
+    optimized_op = dataset_ops.make_one_shot_iterator(optimized).get_next()
 
     unoptimized_time = self._run(
         unoptimized_op,
diff --git a/tensorflow/python/data/experimental/benchmarks/matching_files_benchmark.py b/tensorflow/python/data/experimental/benchmarks/matching_files_benchmark.py
index 2eb5561b11a450b80ce7e3efa75ecd2d82cb477e..c53f8dd7c537fecbfcd551e2a4809aaf5447ff46 100644
--- a/tensorflow/python/data/experimental/benchmarks/matching_files_benchmark.py
+++ b/tensorflow/python/data/experimental/benchmarks/matching_files_benchmark.py
@@ -26,6 +26,7 @@ import numpy as np
 
 from tensorflow.python.client import session
 from tensorflow.python.data.experimental.ops import matching_files
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import test
@@ -59,7 +60,7 @@ class MatchingFilesBenchmark(test.Benchmark):
     for _ in range(iters):
       with ops.Graph().as_default():
         dataset = matching_files.MatchingFilesDataset(patterns)
-        next_element = dataset.make_one_shot_iterator().get_next()
+        next_element = dataset_ops.make_one_shot_iterator(dataset).get_next()
 
         with session.Session() as sess:
           sub_deltas = []
diff --git a/tensorflow/python/data/experimental/benchmarks/optimize_benchmark.py b/tensorflow/python/data/experimental/benchmarks/optimize_benchmark.py
index 0eca97d26dea0ab36b3db9cd830696ecda84eb59..1bbee5e7a3ff61a2d7c8d418cc6bdd360595dbe7 100644
--- a/tensorflow/python/data/experimental/benchmarks/optimize_benchmark.py
+++ b/tensorflow/python/data/experimental/benchmarks/optimize_benchmark.py
@@ -28,6 +28,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
+# TODO(b/119837791): Add eager benchmarks too.
 class OptimizationBenchmark(test.Benchmark):
   """Benchmarks for static optimizations."""
 
@@ -46,10 +47,10 @@ class OptimizationBenchmark(test.Benchmark):
         dataset = dataset.map(lambda x: x)
       if optimize_dataset:
         options = dataset_ops.Options()
-        options.experimental_map_fusion = True
+        options.experimental_optimization.map_fusion = True
         dataset = dataset.with_options(options)
 
-      iterator = dataset.make_one_shot_iterator()
+      iterator = dataset_ops.make_one_shot_iterator(dataset)
       next_element = iterator.get_next()
 
       with session.Session() as sess:
@@ -89,9 +90,9 @@ class OptimizationBenchmark(test.Benchmark):
             lambda x: math_ops.greater_equal(x - 5, 0))
       if optimize_dataset:
         options = dataset_ops.Options()
-        options.experimental_map_and_filter_fusion = True
+        options.experimental_optimization.map_and_filter_fusion = True
         dataset = dataset.with_options(options)
-      iterator = dataset.make_one_shot_iterator()
+      iterator = dataset_ops.make_one_shot_iterator(dataset)
       next_element = iterator.get_next()
 
       with session.Session() as sess:
@@ -115,6 +116,47 @@ class OptimizationBenchmark(test.Benchmark):
             name="map_and_filter_fusion_{}_chain_length_{}".format(
                 opt_mark, chain_length))
 
+  # This benchmark compares the performance of pipeline with multiple chained
+  # filter with and without filter fusion.
+  def benchmarkFilterFusion(self):
+    chain_lengths = [0, 1, 2, 5, 10, 20, 50]
+    for chain_length in chain_lengths:
+      self._benchmarkFilterFusion(chain_length, False)
+      self._benchmarkFilterFusion(chain_length, True)
+
+  def _benchmarkFilterFusion(self, chain_length, optimize_dataset):
+    with ops.Graph().as_default():
+      dataset = dataset_ops.Dataset.from_tensors(5).repeat(None)
+      for _ in range(chain_length):
+        dataset = dataset.filter(lambda x: math_ops.greater_equal(x - 5, 0))
+      if optimize_dataset:
+        options = dataset_ops.Options()
+        options.experimental_optimization.filter_fusion = True
+        dataset = dataset.with_options(options)
+
+      iterator = dataset_ops.make_one_shot_iterator(dataset)
+      next_element = iterator.get_next()
+
+      with session.Session() as sess:
+        for _ in range(10):
+          sess.run(next_element.op)
+        deltas = []
+        for _ in range(100):
+          start = time.time()
+          for _ in range(100):
+            sess.run(next_element.op)
+          end = time.time()
+          deltas.append(end - start)
+
+        median_wall_time = np.median(deltas) / 100
+        opt_mark = "opt" if optimize_dataset else "no-opt"
+        print("Filter dataset {} chain length: {} Median wall time: {}".format(
+            opt_mark, chain_length, median_wall_time))
+        self.report_benchmark(
+            iters=1000,
+            wall_time=median_wall_time,
+            name="chain_length_{}_{}".format(opt_mark, chain_length))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/benchmarks/rejection_resample_benchmark.py b/tensorflow/python/data/experimental/benchmarks/rejection_resample_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..a64f7ecb00b4c2c02b1a579562cbf0afcf50f10e
--- /dev/null
+++ b/tensorflow/python/data/experimental/benchmarks/rejection_resample_benchmark.py
@@ -0,0 +1,71 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Benchmarks for `tf.data.experimental.rejection_resample()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+
+import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.python.client import session
+from tensorflow.python.data.experimental.ops import resampling
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.platform import test
+
+
+def _time_resampling(data_np, target_dist, init_dist, num_to_sample):  # pylint: disable=missing-docstring
+  dataset = dataset_ops.Dataset.from_tensor_slices(data_np).repeat()
+
+  # Reshape distribution via rejection sampling.
+  dataset = dataset.apply(
+      resampling.rejection_resample(
+          class_func=lambda x: x,
+          target_dist=target_dist,
+          initial_dist=init_dist,
+          seed=142))
+
+  get_next = dataset_ops.make_one_shot_iterator(dataset).get_next()
+
+  with session.Session() as sess:
+    start_time = time.time()
+    for _ in xrange(num_to_sample):
+      sess.run(get_next)
+    end_time = time.time()
+
+  return end_time - start_time
+
+
+class RejectionResampleBenchmark(test.Benchmark):
+  """Benchmarks for `tf.data.experimental.rejection_resample()`."""
+
+  def benchmarkResamplePerformance(self):
+    init_dist = [0.25, 0.25, 0.25, 0.25]
+    target_dist = [0.0, 0.0, 0.0, 1.0]
+    num_classes = len(init_dist)
+    # We don't need many samples to test a dirac-delta target distribution
+    num_samples = 1000
+    data_np = np.random.choice(num_classes, num_samples, p=init_dist)
+
+    resample_time = _time_resampling(
+        data_np, target_dist, init_dist, num_to_sample=1000)
+
+    self.report_benchmark(iters=1000, wall_time=resample_time, name="resample")
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/benchmarks/unbatch_benchmark.py b/tensorflow/python/data/experimental/benchmarks/unbatch_benchmark.py
index c40d4798238032cdb5ec02377aaa5001133378a0..6f80df50b847c4e93c16603061b63399a1a4ff2d 100644
--- a/tensorflow/python/data/experimental/benchmarks/unbatch_benchmark.py
+++ b/tensorflow/python/data/experimental/benchmarks/unbatch_benchmark.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for `tf.data.experimental.unbatch()`."""
+"""Benchmarks for `tf.data.experimental.unbatch()`."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -42,7 +42,7 @@ class UnbatchBenchmark(test.Benchmark):
       dataset = dataset.batch(batch_size_placeholder)
       dataset = dataset.apply(batching.unbatch())
       dataset = dataset.skip(elems_per_trial)
-      iterator = dataset.make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(dataset)
       next_element = iterator.get_next()
 
       with session.Session() as sess:
@@ -78,7 +78,7 @@ class UnbatchBenchmark(test.Benchmark):
       dataset = dataset.batch(batch_size_placeholder)
       dataset = dataset.flat_map(dataset_ops.Dataset.from_tensor_slices)
       dataset = dataset.skip(elems_per_trial)
-      iterator = dataset.make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(dataset)
       next_element = iterator.get_next()
 
       with session.Session() as sess:
diff --git a/tensorflow/python/data/experimental/kernel_tests/BUILD b/tensorflow/python/data/experimental/kernel_tests/BUILD
index 897e949b0fbc1f0ed0b144c2f706c59cef287d3a..36478785c9155243e092bb498f332f031a2d0e46 100644
--- a/tensorflow/python/data/experimental/kernel_tests/BUILD
+++ b/tensorflow/python/data/experimental/kernel_tests/BUILD
@@ -1,12 +1,12 @@
+load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
+
 package(default_visibility = ["//tensorflow:internal"])
 
 licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-load("//tensorflow:tensorflow.bzl", "cuda_py_test")
-load("//tensorflow:tensorflow.bzl", "py_test")
-
 py_test(
     name = "bucket_by_sequence_length_test",
     size = "medium",
@@ -130,26 +130,6 @@ py_test(
     ],
 )
 
-py_test(
-    name = "filter_dataset_op_test",
-    size = "medium",
-    srcs = ["filter_dataset_op_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:io_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:util",
-        "//tensorflow/python/data/experimental/ops:optimization",
-        "//tensorflow/python/data/kernel_tests:test_base",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//third_party/py/numpy",
-    ],
-)
-
 py_test(
     name = "get_single_element_test",
     size = "small",
@@ -366,6 +346,18 @@ py_test(
     ],
 )
 
+py_test(
+    name = "cardinality_test",
+    srcs = ["cardinality_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python/data/experimental/ops:cardinality",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
 py_test(
     name = "override_threadpool_test",
     size = "small",
@@ -534,6 +526,7 @@ py_test(
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:script_ops",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python/data/experimental/ops:scan_ops",
         "//tensorflow/python/data/kernel_tests:test_base",
@@ -610,7 +603,7 @@ py_test(
 
 py_test(
     name = "stats_dataset_ops_test",
-    size = "medium",
+    size = "large",
     srcs = ["stats_dataset_ops_test.py"],
     srcs_version = "PY2AND3",
     tags = [
@@ -704,3 +697,14 @@ py_test(
         "//tensorflow/python/data/ops:dataset_ops",
     ],
 )
+
+cuda_py_test(
+    name = "wrap_unwrap_test",
+    size = "small",
+    srcs = ["wrap_unwrap_test.py"],
+    additional_deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
diff --git a/tensorflow/python/data/experimental/kernel_tests/bucket_by_sequence_length_test.py b/tensorflow/python/data/experimental/kernel_tests/bucket_by_sequence_length_test.py
index bcb7ef94962801a1f3db5d2845eb0a0ce7888e42..e0978676fd5f1fa6143af041f16d0b6c9927611f 100644
--- a/tensorflow/python/data/experimental/kernel_tests/bucket_by_sequence_length_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/bucket_by_sequence_length_test.py
@@ -24,10 +24,12 @@ from absl.testing import parameterized
 from tensorflow.python.data.experimental.ops import grouping
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
@@ -71,6 +73,7 @@ def _get_record_shape(sparse):
   return tensor_shape.TensorShape([None])
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class BucketBySequenceLengthTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   @parameterized.named_parameters(
@@ -200,11 +203,12 @@ class BucketBySequenceLengthTest(test_base.DatasetTestBase, parameterized.TestCa
 
     _test_bucket_by_padding(param_no_padding)
 
+  # TODO(b/117581999): add eager coverage.
   @parameterized.named_parameters(
       ("WithoutPadding", True),
       ("WithPadding", False),
   )
-  def testBucket(self, param_no_padding):
+  def testSkipEagerBucket(self, param_no_padding):
 
     boundaries = [10, 20, 30]
     batch_sizes = [10, 8, 4, 2]
@@ -238,14 +242,14 @@ class BucketBySequenceLengthTest(test_base.DatasetTestBase, parameterized.TestCa
               boundaries,
               batch_sizes,
               no_padding=no_padding))
-      batch, = dataset.make_one_shot_iterator().get_next()
+      get_next = self.getNext(dataset)
+      batches = []
+      for _ in range(4):
+        batch, = self.evaluate(get_next())
+        batches.append(batch)
+      with self.assertRaises(errors.OutOfRangeError):
+        self.evaluate(get_next())
 
-      with self.cached_session() as sess:
-        batches = []
-        for _ in range(4):
-          batches.append(self.evaluate(batch))
-        with self.assertRaises(errors.OutOfRangeError):
-          self.evaluate(batch)
       batch_sizes_val = []
       lengths_val = []
       for batch in batches:
@@ -254,8 +258,9 @@ class BucketBySequenceLengthTest(test_base.DatasetTestBase, parameterized.TestCa
         length = shape[1]
         batch_sizes_val.append(batch_size)
         lengths_val.append(length)
-        sum_check = batch.values.sum() if no_padding else batch.sum()
-        self.assertEqual(sum_check, batch_size * length - 1)
+        if not context.executing_eagerly():
+          sum_check = batch.values.sum() if no_padding else batch.sum()
+          self.assertEqual(sum_check, batch_size * length - 1)
       self.assertEqual(sum(batch_sizes_val), sum(batch_sizes))
       self.assertEqual(sorted(batch_sizes), sorted(batch_sizes_val))
       self.assertEqual(sorted(lengths), sorted(lengths_val))
@@ -287,14 +292,15 @@ class BucketBySequenceLengthTest(test_base.DatasetTestBase, parameterized.TestCa
             grouping.bucket_by_sequence_length(
                 element_len, boundaries, batch_sizes,
                 pad_to_bucket_boundary=True))
-    batch, = dataset.make_one_shot_iterator().get_next()
+    get_next = self.getNext(dataset)
+
+    batches = []
+    for _ in range(3):
+      batch, = self.evaluate(get_next())
+      batches.append(batch)
+    with self.assertRaisesOpError("bucket_boundaries"):
+      self.evaluate(get_next())
 
-    with self.cached_session() as sess:
-      batches = []
-      for _ in range(3):
-        batches.append(self.evaluate(batch))
-      with self.assertRaisesOpError("bucket_boundaries"):
-        self.evaluate(batch)
     batch_sizes_val = []
     lengths_val = []
     for batch in batches:
@@ -324,14 +330,14 @@ class BucketBySequenceLengthTest(test_base.DatasetTestBase, parameterized.TestCa
             grouping.bucket_by_sequence_length(
                 element_len, boundaries, batch_sizes,
                 pad_to_bucket_boundary=True))
-    batch, = dataset.make_one_shot_iterator().get_next()
+    get_next = self.getNext(dataset)
 
-    with self.cached_session() as sess:
-      batches = []
-      for _ in range(5):
-        batches.append(self.evaluate(batch))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(batch)
+    batches = []
+    for _ in range(5):
+      batch, = self.evaluate(get_next())
+      batches.append(batch)
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
     self.assertAllEqual(batches[0], [[1, 0],
                                      [1, 1]])
@@ -378,11 +384,12 @@ class BucketBySequenceLengthTest(test_base.DatasetTestBase, parameterized.TestCa
 
     _test_tuple_elements_by_padding(param_no_padding)
 
+  # TODO(b/117581999): add eager coverage
   @parameterized.named_parameters(
       ("DoDropRemainder", True),
       ("DoNotDropRemainder", False),
   )
-  def testBucketSparse(self, param_drop_remainder):
+  def testSkipEagerBucketSparse(self, param_drop_remainder):
     """Tests bucketing of sparse tensors (case where `no_padding` == True).
 
     Test runs on following dataset:
@@ -439,17 +446,16 @@ class BucketBySequenceLengthTest(test_base.DatasetTestBase, parameterized.TestCa
 
     def _compute_batches(dataset):
       """Computes actual batch outputs of dataset and stores in a set."""
-      batch = dataset.make_one_shot_iterator().get_next()
+      batch = self.getNext(dataset)
       all_sparse_tensors = set()
-      with self.cached_session() as sess:
-        with self.assertRaises(errors.OutOfRangeError):
-          while True:
-            output = self.evaluate(batch)
-            sprs_tensor = (tuple([tuple(idx) for idx in output.indices]),
-                           tuple(output.values))
-            all_sparse_tensors.add(sprs_tensor)
-      return all_sparse_tensors
+      with self.assertRaises(errors.OutOfRangeError):
+        while True:
+          output = self.evaluate(batch())
+          sprs_tensor = (tuple([tuple(idx) for idx in output.indices]),
+                         tuple(output.values))
+          all_sparse_tensors.add(sprs_tensor)
 
+      return all_sparse_tensors
     dataset = _build_dataset()
     boundaries = range(min_len + bucket_size + 1, max_len, bucket_size)
     dataset = dataset.apply(grouping.bucket_by_sequence_length(
diff --git a/tensorflow/python/data/experimental/kernel_tests/cardinality_test.py b/tensorflow/python/data/experimental/kernel_tests/cardinality_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a8296d08482d4d800eb3bb0b94bbae940264da6
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/cardinality_test.py
@@ -0,0 +1,160 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.experimental.cardinality()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.python.data.experimental.ops import cardinality
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class NumElementsTest(test_base.DatasetTestBase, parameterized.TestCase):
+  """Tests for `tf.data.experimental.cardinality()`."""
+
+  @parameterized.named_parameters(
+      # pylint: disable=g-long-lambda
+      ("Batch1",
+       lambda: dataset_ops.Dataset.range(5).batch(2, drop_remainder=True), 2),
+      ("Batch2",
+       lambda: dataset_ops.Dataset.range(5).batch(2, drop_remainder=False), 3),
+      ("Batch3",
+       lambda: dataset_ops.Dataset.range(5).filter(lambda _: True).batch(2),
+       cardinality.UNKNOWN),
+      ("Batch4", lambda: dataset_ops.Dataset.range(5).repeat().batch(2),
+       cardinality.INFINITE),
+      ("Cache1", lambda: dataset_ops.Dataset.range(5).cache(), 5),
+      ("Cache2", lambda: dataset_ops.Dataset.range(5).cache("foo"), 5),
+      ("Concatenate1", lambda: dataset_ops.Dataset.range(5).concatenate(
+          dataset_ops.Dataset.range(5)), 10),
+      ("Concatenate2",
+       lambda: dataset_ops.Dataset.range(5).filter(lambda _: True).concatenate(
+           dataset_ops.Dataset.range(5)), cardinality.UNKNOWN),
+      ("Concatenate3", lambda: dataset_ops.Dataset.range(5).repeat().
+       concatenate(dataset_ops.Dataset.range(5)),
+       cardinality.INFINITE),
+      ("Concatenate4", lambda: dataset_ops.Dataset.range(5).concatenate(
+          dataset_ops.Dataset.range(5).filter(lambda _: True)),
+       cardinality.UNKNOWN),
+      ("Concatenate5",
+       lambda: dataset_ops.Dataset.range(5).filter(lambda _: True).concatenate(
+           dataset_ops.Dataset.range(5).filter(lambda _: True)),
+       cardinality.UNKNOWN),
+      ("Concatenate6", lambda: dataset_ops.Dataset.range(5).repeat().
+       concatenate(dataset_ops.Dataset.range(5).filter(lambda _: True)),
+       cardinality.INFINITE),
+      ("Concatenate7", lambda: dataset_ops.Dataset.range(5).concatenate(
+          dataset_ops.Dataset.range(5).repeat()), cardinality.INFINITE),
+      ("Concatenate8",
+       lambda: dataset_ops.Dataset.range(5).filter(lambda _: True).concatenate(
+           dataset_ops.Dataset.range(5).repeat()), cardinality.INFINITE),
+      ("Concatenate9",
+       lambda: dataset_ops.Dataset.range(5).repeat().concatenate(
+           dataset_ops.Dataset.range(5).repeat()), cardinality.INFINITE),
+      ("FlatMap", lambda: dataset_ops.Dataset.range(5).flat_map(
+          lambda _: dataset_ops.Dataset.from_tensors(0)),
+       cardinality.UNKNOWN),
+      ("Filter", lambda: dataset_ops.Dataset.range(5).filter(lambda _: True),
+       cardinality.UNKNOWN),
+      ("FromTensors1", lambda: dataset_ops.Dataset.from_tensors(0), 1),
+      ("FromTensors2", lambda: dataset_ops.Dataset.from_tensors((0, 1)), 1),
+      ("FromTensorSlices1",
+       lambda: dataset_ops.Dataset.from_tensor_slices([0, 0, 0]), 3),
+      ("FromTensorSlices2",
+       lambda: dataset_ops.Dataset.from_tensor_slices(([0, 0, 0], [1, 1, 1])),
+       3),
+      ("Interleave1", lambda: dataset_ops.Dataset.range(5).interleave(
+          lambda _: dataset_ops.Dataset.from_tensors(0), cycle_length=1),
+       cardinality.UNKNOWN),
+      ("Interleave2", lambda: dataset_ops.Dataset.range(5).interleave(
+          lambda _: dataset_ops.Dataset.from_tensors(0),
+          cycle_length=1,
+          num_parallel_calls=1), cardinality.UNKNOWN),
+      ("Map1", lambda: dataset_ops.Dataset.range(5).map(lambda x: x), 5),
+      ("Map2", lambda: dataset_ops.Dataset.range(5).map(
+          lambda x: x, num_parallel_calls=1), 5),
+      ("PaddedBatch1", lambda: dataset_ops.Dataset.range(5).padded_batch(
+          2, [], drop_remainder=True), 2),
+      ("PaddedBatch2", lambda: dataset_ops.Dataset.range(5).padded_batch(
+          2, [], drop_remainder=False), 3),
+      ("PaddedBatch3", lambda: dataset_ops.Dataset.range(5).filter(
+          lambda _: True).padded_batch(2, []), cardinality.UNKNOWN),
+      ("PaddedBatch4",
+       lambda: dataset_ops.Dataset.range(5).repeat().padded_batch(2, []),
+       cardinality.INFINITE),
+      ("Prefetch", lambda: dataset_ops.Dataset.range(5).prefetch(buffer_size=1),
+       5),
+      ("Range1", lambda: dataset_ops.Dataset.range(0), 0),
+      ("Range2", lambda: dataset_ops.Dataset.range(5), 5),
+      ("Range3", lambda: dataset_ops.Dataset.range(5, 10), 5),
+      ("Range4", lambda: dataset_ops.Dataset.range(10, 5), 0),
+      ("Range5", lambda: dataset_ops.Dataset.range(5, 10, 2), 3),
+      ("Range6", lambda: dataset_ops.Dataset.range(10, 5, -2), 3),
+      ("Repeat1", lambda: dataset_ops.Dataset.range(0).repeat(0), 0),
+      ("Repeat2", lambda: dataset_ops.Dataset.range(1).repeat(0), 0),
+      ("Repeat3", lambda: dataset_ops.Dataset.range(0).repeat(5), 0),
+      ("Repeat4", lambda: dataset_ops.Dataset.range(1).repeat(5), 5),
+      ("Repeat5", lambda: dataset_ops.Dataset.range(0).repeat(), 0),
+      ("Repeat6", lambda: dataset_ops.Dataset.range(1).repeat(),
+       cardinality.INFINITE),
+      ("Shuffle", lambda: dataset_ops.Dataset.range(5).shuffle(buffer_size=1),
+       5),
+      ("Skip1", lambda: dataset_ops.Dataset.range(5).skip(2), 3),
+      ("Skip2", lambda: dataset_ops.Dataset.range(5).skip(8), 0),
+      ("Skip3",
+       lambda: dataset_ops.Dataset.range(5).filter(lambda _: True).skip(2),
+       cardinality.UNKNOWN),
+      ("Skip4", lambda: dataset_ops.Dataset.range(5).repeat().skip(2),
+       cardinality.INFINITE),
+      ("Take1", lambda: dataset_ops.Dataset.range(5).take(2), 2),
+      ("Take2", lambda: dataset_ops.Dataset.range(5).take(8), 5),
+      ("Take3",
+       lambda: dataset_ops.Dataset.range(5).filter(lambda _: True).take(2),
+       cardinality.UNKNOWN),
+      ("Take4", lambda: dataset_ops.Dataset.range(5).repeat().take(2), 2),
+      ("Window1", lambda: dataset_ops.Dataset.range(5).window(
+          size=2, shift=2, drop_remainder=True), 2),
+      ("Window2", lambda: dataset_ops.Dataset.range(5).window(
+          size=2, shift=2, drop_remainder=False), 3),
+      ("Zip1", lambda: dataset_ops.Dataset.zip(dataset_ops.Dataset.range(5)),
+       5),
+      ("Zip2", lambda: dataset_ops.Dataset.zip(
+          (dataset_ops.Dataset.range(5), dataset_ops.Dataset.range(3))), 3),
+      ("Zip3", lambda: dataset_ops.Dataset.zip(
+          (dataset_ops.Dataset.range(5),
+           dataset_ops.Dataset.range(3).repeat())), 5),
+      ("Zip4", lambda: dataset_ops.Dataset.zip(
+          (dataset_ops.Dataset.range(5).repeat(),
+           dataset_ops.Dataset.range(3).repeat())), cardinality.INFINITE),
+      ("Zip5", lambda: dataset_ops.Dataset.zip(
+          (dataset_ops.Dataset.range(5),
+           dataset_ops.Dataset.range(3).filter(lambda _: True))),
+       cardinality.UNKNOWN),
+      # pylint: enable=g-long-lambda
+  )
+  def testNumElements(self, dataset_fn, expected_result):
+    with self.cached_session() as sess:
+      self.assertEqual(
+          sess.run(cardinality.cardinality(dataset_fn())), expected_result)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/copy_to_device_test.py b/tensorflow/python/data/experimental/kernel_tests/copy_to_device_test.py
index b0516573f5f13a543ad75efc00cf6085486c5569..d9fbe9e0e18c526e7e0bf88b9c3b477bf0917fe5 100644
--- a/tensorflow/python/data/experimental/kernel_tests/copy_to_device_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/copy_to_device_test.py
@@ -33,6 +33,7 @@ from tensorflow.python.platform import test
 from tensorflow.python.util import compat as util_compat
 
 
+# TODO(b/117581999): add eager coverage when supported.
 class CopyToDeviceTest(test_base.DatasetTestBase):
 
   @test_util.run_deprecated_v1
@@ -42,7 +43,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
         prefetching_ops.copy_to_device("/cpu:1"))
 
     with ops.device("/cpu:1"):
-      iterator = device_dataset.make_one_shot_iterator()
+      iterator = dataset_ops.make_one_shot_iterator(device_dataset)
       next_element = iterator.get_next()
 
     self.assertEqual(host_dataset.output_types, device_dataset.output_types)
@@ -56,7 +57,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
     self.assertEqual([], next_element.shape)
 
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
-    with self.test_session(config=worker_config) as sess:
+    with self.test_session(config=worker_config):
       for i in range(10):
         self.assertEqual(i, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
@@ -69,7 +70,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
         prefetching_ops.copy_to_device("/cpu:1"))
 
     with ops.device("/cpu:1"):
-      iterator = device_dataset.make_one_shot_iterator()
+      iterator = dataset_ops.make_one_shot_iterator(device_dataset)
       next_element = iterator.get_next()
 
     self.assertEqual(host_dataset.output_types, device_dataset.output_types)
@@ -83,7 +84,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
     self.assertEqual((4,), next_element.shape)
 
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
-    with self.test_session(config=worker_config) as sess:
+    with self.test_session(config=worker_config):
       self.assertAllEqual([0, 1, 2, 3], self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
         self.evaluate(next_element)
@@ -95,7 +96,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
         prefetching_ops.copy_to_device("/cpu:0"))
 
     with ops.device("/cpu:0"):
-      iterator = device_dataset.make_one_shot_iterator()
+      iterator = dataset_ops.make_one_shot_iterator(device_dataset)
       next_element = iterator.get_next()
 
     self.assertEqual(host_dataset.output_types, device_dataset.output_types)
@@ -109,7 +110,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
     self.assertEqual([], next_element.shape)
 
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
-    with self.test_session(config=worker_config) as sess:
+    with self.test_session(config=worker_config):
       for i in range(10):
         self.assertEqual(i, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
@@ -122,7 +123,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
         prefetching_ops.copy_to_device("/cpu:1")).prefetch(1)
 
     with ops.device("/cpu:1"):
-      iterator = device_dataset.make_one_shot_iterator()
+      iterator = dataset_ops.make_one_shot_iterator(device_dataset)
       next_element = iterator.get_next()
 
     self.assertEqual(host_dataset.output_types, device_dataset.output_types)
@@ -136,7 +137,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
     self.assertEqual([], next_element.shape)
 
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
-    with self.test_session(config=worker_config) as sess:
+    with self.test_session(config=worker_config):
       for i in range(10):
         self.assertEqual(i, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
@@ -149,7 +150,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
         prefetching_ops.copy_to_device("/cpu:1"))
 
     with ops.device("/cpu:1"):
-      iterator = device_dataset.make_one_shot_iterator()
+      iterator = dataset_ops.make_one_shot_iterator(device_dataset)
       next_element = iterator.get_next()
 
     self.assertEqual(host_dataset.output_types, device_dataset.output_types)
@@ -163,7 +164,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
     self.assertEqual([], next_element["a"].shape)
 
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
-    with self.test_session(config=worker_config) as sess:
+    with self.test_session(config=worker_config):
       for i in range(10):
         self.assertEqual({"a": i}, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
@@ -176,7 +177,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
         prefetching_ops.copy_to_device("/cpu:1")).prefetch(1)
 
     with ops.device("/cpu:1"):
-      iterator = device_dataset.make_one_shot_iterator()
+      iterator = dataset_ops.make_one_shot_iterator(device_dataset)
       next_element = iterator.get_next()
 
     self.assertEqual(host_dataset.output_types, device_dataset.output_types)
@@ -190,7 +191,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
     self.assertEqual([], next_element["a"].shape)
 
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
-    with self.test_session(config=worker_config) as sess:
+    with self.test_session(config=worker_config):
       for i in range(10):
         self.assertEqual({"a": i}, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
@@ -209,7 +210,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
         prefetching_ops.copy_to_device("/cpu:1"))
 
     with ops.device("/cpu:1"):
-      iterator = device_dataset.make_one_shot_iterator()
+      iterator = dataset_ops.make_one_shot_iterator(device_dataset)
       next_element = iterator.get_next()
 
     self.assertEqual(host_dataset.output_types, device_dataset.output_types)
@@ -222,7 +223,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
     self.assertEqual(dtypes.int64, next_element.dtype)
 
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
-    with self.test_session(config=worker_config) as sess:
+    with self.test_session(config=worker_config):
       for i in range(10):
         actual = self.evaluate(next_element)
         self.assertAllEqual([i], actual.values)
@@ -244,7 +245,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
         prefetching_ops.copy_to_device("/cpu:1")).prefetch(1)
 
     with ops.device("/cpu:1"):
-      iterator = device_dataset.make_one_shot_iterator()
+      iterator = dataset_ops.make_one_shot_iterator(device_dataset)
       next_element = iterator.get_next()
 
     self.assertEqual(host_dataset.output_types, device_dataset.output_types)
@@ -257,7 +258,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
     self.assertEqual(dtypes.int64, next_element.dtype)
 
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
-    with self.test_session(config=worker_config) as sess:
+    with self.test_session(config=worker_config):
       for i in range(10):
         actual = self.evaluate(next_element)
         self.assertAllEqual([i], actual.values)
@@ -275,10 +276,11 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
         prefetching_ops.copy_to_device("/gpu:0"))
 
     with ops.device("/gpu:0"):
-      iterator = device_dataset.make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(device_dataset)
       next_element = iterator.get_next()
 
-    with self.cached_session() as sess:
+    with self.cached_session(
+        config=config_pb2.ConfigProto(allow_soft_placement=False)):
       self.evaluate(iterator.initializer)
       for i in range(10):
         self.assertEqual(i, self.evaluate(next_element))
@@ -294,10 +296,11 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
         prefetching_ops.copy_to_device("/gpu:0")).prefetch(1)
 
     with ops.device("/gpu:0"):
-      iterator = device_dataset.make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(device_dataset)
       next_element = iterator.get_next()
 
-    with self.cached_session() as sess:
+    with self.cached_session(
+        config=config_pb2.ConfigProto(allow_soft_placement=False)):
       self.evaluate(iterator.initializer)
       for i in range(10):
         self.assertEqual(i, self.evaluate(next_element))
@@ -327,10 +330,11 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
     device_dataset = device_dataset.with_options(options)
 
     with ops.device("/gpu:0"):
-      iterator = device_dataset.make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(device_dataset)
       next_element = iterator.get_next()
 
-    with self.cached_session() as sess:
+    with self.cached_session(
+        config=config_pb2.ConfigProto(allow_soft_placement=False)):
       self.evaluate(iterator.initializer)
       for i in range(10):
         x, y, z = self.evaluate(next_element)
@@ -349,10 +353,11 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
         prefetching_ops.copy_to_device("/gpu:0"))
 
     with ops.device("/gpu:0"):
-      iterator = device_dataset.make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(device_dataset)
       next_element = iterator.get_next()
 
-    with self.cached_session() as sess:
+    with self.cached_session(
+        config=config_pb2.ConfigProto(allow_soft_placement=False)):
       self.evaluate(iterator.initializer)
       self.assertAllEqual([0, 1, 2, 3], self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
@@ -367,10 +372,11 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
         prefetching_ops.copy_to_device("/gpu:0")).prefetch(1)
 
     with ops.device("/gpu:0"):
-      iterator = device_dataset.make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(device_dataset)
       next_element = iterator.get_next()
 
-    with self.cached_session() as sess:
+    with self.cached_session(
+        config=config_pb2.ConfigProto(allow_soft_placement=False)):
       self.evaluate(iterator.initializer)
       self.assertAllEqual([0, 1, 2, 3], self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
@@ -385,10 +391,11 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
         prefetching_ops.copy_to_device("/gpu:0"))
 
     with ops.device("/gpu:0"):
-      iterator = device_dataset.make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(device_dataset)
       next_element = iterator.get_next()
 
-    with self.cached_session() as sess:
+    with self.cached_session(
+        config=config_pb2.ConfigProto(allow_soft_placement=False)):
       self.evaluate(iterator.initializer)
       self.assertAllEqual([b"a", b"b", b"c"], self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
@@ -403,10 +410,11 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
         prefetching_ops.copy_to_device("/gpu:0"))
 
     with ops.device("/gpu:0"):
-      iterator = device_dataset.make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(device_dataset)
       next_element = iterator.get_next()
 
-    with self.cached_session() as sess:
+    with self.cached_session(
+        config=config_pb2.ConfigProto(allow_soft_placement=False)):
       self.evaluate(iterator.initializer)
       self.assertAllEqual([b"a", b"b", b"c"], self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
@@ -424,10 +432,11 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
           prefetching_ops.copy_to_device("/cpu:0", source_device="/gpu:0"))
 
       with ops.device("/cpu:0"):
-        iterator = back_to_cpu_dataset.make_initializable_iterator()
+        iterator = dataset_ops.make_initializable_iterator(back_to_cpu_dataset)
         next_element = iterator.get_next()
 
-      with self.cached_session() as sess:
+      with self.cached_session(
+          config=config_pb2.ConfigProto(allow_soft_placement=False)):
         self.evaluate(iterator.initializer)
         for i in range(10):
           self.assertEqual(i, self.evaluate(next_element))
@@ -441,7 +450,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
         prefetching_ops.copy_to_device("/cpu:1"))
 
     with ops.device("/cpu:1"):
-      iterator = device_dataset.make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(device_dataset)
       next_element = iterator.get_next()
 
     self.assertEqual(host_dataset.output_types, device_dataset.output_types)
@@ -455,7 +464,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
     self.assertEqual([], next_element.shape)
 
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
-    with self.test_session(config=worker_config) as sess:
+    with self.test_session(config=worker_config):
       self.evaluate(iterator.initializer)
       for i in range(5):
         self.assertEqual(i, self.evaluate(next_element))
@@ -472,7 +481,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
         prefetching_ops.copy_to_device("/cpu:1")).prefetch(1)
 
     with ops.device("/cpu:1"):
-      iterator = device_dataset.make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(device_dataset)
       next_element = iterator.get_next()
 
     self.assertEqual(host_dataset.output_types, device_dataset.output_types)
@@ -486,7 +495,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
     self.assertEqual([], next_element.shape)
 
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
-    with self.test_session(config=worker_config) as sess:
+    with self.test_session(config=worker_config):
       self.evaluate(iterator.initializer)
       for i in range(5):
         self.assertEqual(i, self.evaluate(next_element))
@@ -505,10 +514,11 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
         prefetching_ops.copy_to_device("/gpu:0"))
 
     with ops.device("/gpu:0"):
-      iterator = device_dataset.make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(device_dataset)
       next_element = iterator.get_next()
 
-    with self.cached_session() as sess:
+    with self.cached_session(
+        config=config_pb2.ConfigProto(allow_soft_placement=False)):
       self.evaluate(iterator.initializer)
       for i in range(5):
         self.assertEqual(i, self.evaluate(next_element))
@@ -527,10 +537,11 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
         prefetching_ops.copy_to_device("/gpu:0")).prefetch(1)
 
     with ops.device("/gpu:0"):
-      iterator = device_dataset.make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(device_dataset)
       next_element = iterator.get_next()
 
-    with self.cached_session() as sess:
+    with self.cached_session(
+        config=config_pb2.ConfigProto(allow_soft_placement=False)):
       self.evaluate(iterator.initializer)
       for i in range(5):
         self.assertEqual(i, self.evaluate(next_element))
@@ -548,12 +559,13 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
     device_dataset = host_dataset.apply(
         prefetching_ops.copy_to_device("/gpu:0"))
     with ops.device("/gpu:0"):
-      iterator = device_dataset.make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(device_dataset)
       next_elem = iterator_ops.get_next_as_optional(iterator)
       elem_has_value_t = next_elem.has_value()
       elem_value_t = next_elem.get_value()
 
-    with self.cached_session() as sess:
+    with self.cached_session(
+        config=config_pb2.ConfigProto(allow_soft_placement=False)):
       # Before initializing the iterator, evaluating the optional fails with
       # a FailedPreconditionError.
       with self.assertRaises(errors.FailedPreconditionError):
diff --git a/tensorflow/python/data/experimental/kernel_tests/counter_test.py b/tensorflow/python/data/experimental/kernel_tests/counter_test.py
index b370e0029e94ef739ba732c1993ac25d975b6200..436fa506c419dd73bf1836b9ba5486f9d435105b 100644
--- a/tensorflow/python/data/experimental/kernel_tests/counter_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/counter_test.py
@@ -24,29 +24,26 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class CounterTest(test_base.DatasetTestBase):
 
-  @test_util.run_deprecated_v1
   def testCounter(self):
     """Test dataset construction using `count`."""
-    iterator = (counter.Counter(start=3, step=4)
-                .make_one_shot_iterator())
-    get_next = iterator.get_next()
-    self.assertEqual([], get_next.shape.as_list())
-    self.assertEqual(dtypes.int64, get_next.dtype)
-
-    negative_iterator = (counter.Counter(start=0, step=-1)
-                         .make_one_shot_iterator())
-    negative_get_next = negative_iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.assertEqual(3, self.evaluate(get_next))
-      self.assertEqual(3 + 4, self.evaluate(get_next))
-      self.assertEqual(3 + 2 * 4, self.evaluate(get_next))
-
-      self.assertEqual(0, self.evaluate(negative_get_next))
-      self.assertEqual(-1, self.evaluate(negative_get_next))
-      self.assertEqual(-2, self.evaluate(negative_get_next))
+    dataset = counter.Counter(start=3, step=4)
+    self.assertEqual([], dataset.output_shapes.as_list())
+    self.assertEqual(dtypes.int64, dataset.output_types)
+    get_next = self.getNext(dataset)
+
+    negative_dataset = counter.Counter(start=0, step=-1)
+    negative_get_next = self.getNext(negative_dataset)
+
+    self.assertEqual(3, self.evaluate(get_next()))
+    self.assertEqual(3 + 4, self.evaluate(get_next()))
+    self.assertEqual(3 + 2 * 4, self.evaluate(get_next()))
+
+    self.assertEqual(0, self.evaluate(negative_get_next()))
+    self.assertEqual(-1, self.evaluate(negative_get_next()))
+    self.assertEqual(-2, self.evaluate(negative_get_next()))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/dense_to_sparse_batch_test.py b/tensorflow/python/data/experimental/kernel_tests/dense_to_sparse_batch_test.py
index 4b84446be8aa26d4df6b02bcc117669bc3dba8a8..cca7ae073ee07124715725c5913036cb41a37950 100644
--- a/tensorflow/python/data/experimental/kernel_tests/dense_to_sparse_batch_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/dense_to_sparse_batch_test.py
@@ -22,107 +22,87 @@ import numpy as np
 from tensorflow.python.data.experimental.ops import batching
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class DenseToSparseBatchTest(test_base.DatasetTestBase):
 
-  @test_util.run_deprecated_v1
   def testDenseToSparseBatchDataset(self):
     components = np.random.randint(12, size=(100,)).astype(np.int32)
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components)
-        .map(lambda x: array_ops.fill([x], x)).apply(
+    dataset = dataset_ops.Dataset.from_tensor_slices(
+        components).map(lambda x: array_ops.fill([x], x)).apply(
             batching.dense_to_sparse_batch(4, [12]))
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.evaluate(init_op)
-
-      for start in range(0, len(components), 4):
-        results = self.evaluate(get_next)
-        self.assertAllEqual([[i, j]
-                             for i, c in enumerate(components[start:start + 4])
-                             for j in range(c)], results.indices)
-        self.assertAllEqual(
-            [c for c in components[start:start + 4] for _ in range(c)],
-            results.values)
-        self.assertAllEqual([min(4,
-                                 len(components) - start), 12],
-                            results.dense_shape)
-
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
-
-  @test_util.run_deprecated_v1
+    get_next = self.getNext(dataset)
+
+    for start in range(0, len(components), 4):
+      results = self.evaluate(get_next())
+      self.assertAllEqual([[i, j]
+                           for i, c in enumerate(components[start:start + 4])
+                           for j in range(c)], results.indices)
+      self.assertAllEqual(
+          [c for c in components[start:start + 4] for _ in range(c)],
+          results.values)
+      self.assertAllEqual([min(4,
+                               len(components) - start), 12],
+                          results.dense_shape)
+
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
   def testDenseToSparseBatchDatasetWithUnknownShape(self):
     components = np.random.randint(5, size=(40,)).astype(np.int32)
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components)
-        .map(lambda x: array_ops.fill([x, x], x)).apply(
-            batching.dense_to_sparse_batch(
-                4, [5, None])).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.evaluate(init_op)
-
-      for start in range(0, len(components), 4):
-        results = self.evaluate(get_next)
-        self.assertAllEqual([[i, j, z]
-                             for i, c in enumerate(components[start:start + 4])
-                             for j in range(c)
-                             for z in range(c)], results.indices)
-        self.assertAllEqual([
-            c
-            for c in components[start:start + 4] for _ in range(c)
-            for _ in range(c)
-        ], results.values)
-        self.assertAllEqual([
-            min(4,
-                len(components) - start), 5,
-            np.max(components[start:start + 4])
-        ], results.dense_shape)
-
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
-
-  @test_util.run_deprecated_v1
+    dataset = dataset_ops.Dataset.from_tensor_slices(
+        components).map(lambda x: array_ops.fill([x, x], x)).apply(
+            batching.dense_to_sparse_batch(4, [5, None]))
+
+    get_next = self.getNext(dataset)
+
+    for start in range(0, len(components), 4):
+      results = self.evaluate(get_next())
+      self.assertAllEqual([[i, j, z]
+                           for i, c in enumerate(components[start:start + 4])
+                           for j in range(c)
+                           for z in range(c)], results.indices)
+      self.assertAllEqual([
+          c for c in components[start:start + 4] for _ in range(c)
+          for _ in range(c)
+      ], results.values)
+      self.assertAllEqual([
+          min(4,
+              len(components) - start), 5,
+          np.max(components[start:start + 4])
+      ], results.dense_shape)
+
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
   def testDenseToSparseBatchDatasetWithInvalidShape(self):
     input_tensor = array_ops.constant([[1]])
     with self.assertRaisesRegexp(ValueError, "Dimension -2 must be >= 0"):
       dataset_ops.Dataset.from_tensors(input_tensor).apply(
-          batching.dense_to_sparse_batch(4, [-2])).make_initializable_iterator()
+          batching.dense_to_sparse_batch(4, [-2]))
 
-  @test_util.run_deprecated_v1
   def testDenseToSparseBatchDatasetShapeErrors(self):
-    input_tensor = array_ops.placeholder(dtypes.int32)
-    iterator = (
-        dataset_ops.Dataset.from_tensors(input_tensor).apply(
-            batching.dense_to_sparse_batch(4, [12]))
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      # Initialize with an input tensor of incompatible rank.
-      sess.run(init_op, feed_dict={input_tensor: [[1]]})
-      with self.assertRaisesRegexp(errors.InvalidArgumentError,
-                                   "incompatible with the row shape"):
-        self.evaluate(get_next)
-
-      # Initialize with an input tensor that is larger than `row_shape`.
-      sess.run(init_op, feed_dict={input_tensor: range(13)})
-      with self.assertRaisesRegexp(errors.DataLossError,
-                                   "larger than the row shape"):
-        self.evaluate(get_next)
+
+    def dataset_fn(input_tensor):
+      return dataset_ops.Dataset.from_tensors(input_tensor).apply(
+          batching.dense_to_sparse_batch(4, [12]))
+
+    # Initialize with an input tensor of incompatible rank.
+    get_next = self.getNext(dataset_fn([[1]]))
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 "incompatible with the row shape"):
+      self.evaluate(get_next())
+
+    # Initialize with an input tensor that is larger than `row_shape`.
+    get_next = self.getNext(dataset_fn(np.int32(range(13))))
+    with self.assertRaisesRegexp(errors.DataLossError,
+                                 "larger than the row shape"):
+      self.evaluate(get_next())
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/directed_interleave_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/directed_interleave_dataset_test.py
index 269c27dde29bd2d5db492ebdc9a7fb211109425e..df69a9dbb01b6f7049f76a83df682232d4012ead 100644
--- a/tensorflow/python/data/experimental/kernel_tests/directed_interleave_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/directed_interleave_dataset_test.py
@@ -28,9 +28,9 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class DirectedInterleaveDatasetTest(test_base.DatasetTestBase):
 
-  @test_util.run_deprecated_v1
   def testBasic(self):
     selector_dataset = dataset_ops.Dataset.range(10).repeat(100)
     input_datasets = [
@@ -38,16 +38,13 @@ class DirectedInterleaveDatasetTest(test_base.DatasetTestBase):
     ]
     dataset = interleave_ops._DirectedInterleaveDataset(selector_dataset,
                                                         input_datasets)
-    iterator = dataset.make_initializable_iterator()
-    next_element = iterator.get_next()
+    next_element = self.getNext(dataset)
 
-    with self.cached_session() as sess:
-      self.evaluate(iterator.initializer)
-      for _ in range(100):
-        for i in range(10):
-          self.assertEqual(i, self.evaluate(next_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(next_element)
+    for _ in range(100):
+      for i in range(10):
+        self.assertEqual(i, self.evaluate(next_element()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
 
   def _normalize(self, vec):
     return vec / vec.sum()
@@ -67,19 +64,16 @@ class DirectedInterleaveDatasetTest(test_base.DatasetTestBase):
         for i in range(num_datasets)
     ], weights)
     dataset = dataset.take(num_samples)
-    iterator = dataset.make_one_shot_iterator()
-    next_element = iterator.get_next()
 
-    with self.cached_session() as sess:
-      freqs = np.zeros([num_datasets])
-      for _ in range(num_samples):
-        freqs[self.evaluate(next_element)] += 1
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(next_element)
+    next_element = self.getNext(dataset)
+    freqs = np.zeros([num_datasets])
+    for _ in range(num_samples):
+      freqs[self.evaluate(next_element())] += 1
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
 
     return freqs
 
-  @test_util.run_deprecated_v1
   def testSampleFromDatasets(self):
     random_seed.set_random_seed(1619)
     num_samples = 5000
@@ -99,21 +93,17 @@ class DirectedInterleaveDatasetTest(test_base.DatasetTestBase):
       freqs = self._testSampleFromDatasetsHelper(probs_ds, classes, num_samples)
       self.assertLess(self._chi2(probs, freqs / num_samples), 1e-2)
 
-  @test_util.run_deprecated_v1
   def testSelectFromDatasets(self):
     words = [b"foo", b"bar", b"baz"]
     datasets = [dataset_ops.Dataset.from_tensors(w).repeat() for w in words]
     choice_array = np.random.randint(3, size=(15,), dtype=np.int64)
     choice_dataset = dataset_ops.Dataset.from_tensor_slices(choice_array)
     dataset = interleave_ops.choose_from_datasets(datasets, choice_dataset)
-    iterator = dataset.make_one_shot_iterator()
-    next_element = iterator.get_next()
-
-    with self.cached_session() as sess:
-      for i in choice_array:
-        self.assertEqual(words[i], self.evaluate(next_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(next_element)
+    next_element = self.getNext(dataset)
+    for i in choice_array:
+      self.assertEqual(words[i], self.evaluate(next_element()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
 
   def testErrors(self):
     with self.assertRaisesRegexp(ValueError,
diff --git a/tensorflow/python/data/experimental/kernel_tests/enumerate_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/enumerate_dataset_test.py
index 3c2e1bb7f3d5fabdba3217d2222064cb3254e0c0..cbc048e3ab460c9bc3bf4efa63221f814075f4ac 100644
--- a/tensorflow/python/data/experimental/kernel_tests/enumerate_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/enumerate_dataset_test.py
@@ -22,36 +22,28 @@ from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class EnumerateDatasetTest(test_base.DatasetTestBase):
 
-  @test_util.run_deprecated_v1
   def testEnumerateDataset(self):
     components = (["a", "b"], [1, 2], [37.0, 38])
     start = constant_op.constant(20, dtype=dtypes.int64)
 
-    iterator = (dataset_ops.Dataset.from_tensor_slices(components).apply(
-        enumerate_ops.enumerate_dataset(start)).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
+    dataset = dataset_ops.Dataset.from_tensor_slices(components).apply(
+        enumerate_ops.enumerate_dataset(start))
 
-    self.assertEqual(dtypes.int64, get_next[0].dtype)
-    self.assertEqual((), get_next[0].shape)
+    self.assertEqual(dtypes.int64, dataset.output_types[0])
+    self.assertEqual((), dataset.output_shapes[0])
     self.assertEqual([tensor_shape.TensorShape([])] * 3,
-                     [t.shape for t in get_next[1]])
+                     [shape for shape in dataset.output_shapes[1]])
 
-    with self.cached_session() as sess:
-      self.evaluate(init_op)
-      self.assertEqual((20, (b"a", 1, 37.0)), self.evaluate(get_next))
-      self.assertEqual((21, (b"b", 2, 38.0)), self.evaluate(get_next))
-
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
+    self.assertDatasetProduces(dataset, [(20, (b"a", 1, 37.0)),
+                                         (21, (b"b", 2, 38.0))])
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/filter_dataset_op_test.py b/tensorflow/python/data/experimental/kernel_tests/filter_dataset_op_test.py
deleted file mode 100644
index 4f8cb1246f3daf2b577dcffc826d06141b4ef64f..0000000000000000000000000000000000000000
--- a/tensorflow/python/data/experimental/kernel_tests/filter_dataset_op_test.py
+++ /dev/null
@@ -1,76 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Benchmarks FilterDataset input pipeline op."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import time
-
-import numpy as np
-
-from tensorflow.python.client import session
-from tensorflow.python.data.experimental.ops import optimization
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.platform import test
-
-
-class FilterBenchmark(test.Benchmark):
-
-  # This benchmark compares the performance of pipeline with multiple chained
-  # filter with and without filter fusion.
-  def benchmarkFilters(self):
-    chain_lengths = [0, 1, 2, 5, 10, 20, 50]
-    for chain_length in chain_lengths:
-      self._benchmarkFilters(chain_length, False)
-      self._benchmarkFilters(chain_length, True)
-
-  def _benchmarkFilters(self, chain_length, optimize_dataset):
-    with ops.Graph().as_default():
-      dataset = dataset_ops.Dataset.from_tensors(5).repeat(None)
-      for _ in range(chain_length):
-        dataset = dataset.filter(lambda x: math_ops.greater_equal(x - 5, 0))
-      if optimize_dataset:
-        dataset = dataset.apply(optimization.optimize(["filter_fusion"]))
-
-      iterator = dataset.make_one_shot_iterator()
-      next_element = iterator.get_next()
-
-      with session.Session() as sess:
-        for _ in range(10):
-          self.evaluate(next_element.op)
-        deltas = []
-        for _ in range(100):
-          start = time.time()
-          for _ in range(100):
-            self.evaluate(next_element.op)
-          end = time.time()
-          deltas.append(end - start)
-
-        median_wall_time = np.median(deltas) / 100
-        opt_mark = "opt" if optimize_dataset else "no-opt"
-        print("Filter dataset {} chain length: {} Median wall time: {}".format(
-            opt_mark, chain_length, median_wall_time))
-        self.report_benchmark(
-            iters=1000,
-            wall_time=median_wall_time,
-            name="benchmark_filter_dataset_chain_latency_{}_{}".format(
-                opt_mark, chain_length))
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/get_single_element_test.py b/tensorflow/python/data/experimental/kernel_tests/get_single_element_test.py
index ef576563a15a7385d450e4f254e1cb579f79ce8c..3e2cf779a3f9d138e83986abcf5b8387d7c19412 100644
--- a/tensorflow/python/data/experimental/kernel_tests/get_single_element_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/get_single_element_test.py
@@ -22,7 +22,6 @@ from absl.testing import parameterized
 from tensorflow.python.data.experimental.ops import get_single_element
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
@@ -30,6 +29,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class GetSingleElementTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   @parameterized.named_parameters(
@@ -40,34 +40,25 @@ class GetSingleElementTest(test_base.DatasetTestBase, parameterized.TestCase):
       ("MoreThanOne", 0, 2, errors.InvalidArgumentError,
        "Dataset had more than one element."),
   )
-  @test_util.run_deprecated_v1
   def testGetSingleElement(self, skip, take, error=None, error_msg=None):
-    skip_t = array_ops.placeholder(dtypes.int64, shape=[])
-    take_t = array_ops.placeholder(dtypes.int64, shape=[])
 
     def make_sparse(x):
       x_1d = array_ops.reshape(x, [1])
       x_2d = array_ops.reshape(x, [1, 1])
       return sparse_tensor.SparseTensor(x_2d, x_1d, x_1d)
 
-    dataset = dataset_ops.Dataset.range(100).skip(skip_t).map(
-        lambda x: (x * x, make_sparse(x))).take(take_t)
-    element = get_single_element.get_single_element(dataset)
-
-    with self.cached_session() as sess:
-      if error is None:
-        dense_val, sparse_val = sess.run(
-            element, feed_dict={
-                skip_t: skip,
-                take_t: take
-            })
-        self.assertEqual(skip * skip, dense_val)
-        self.assertAllEqual([[skip]], sparse_val.indices)
-        self.assertAllEqual([skip], sparse_val.values)
-        self.assertAllEqual([skip], sparse_val.dense_shape)
-      else:
-        with self.assertRaisesRegexp(error, error_msg):
-          sess.run(element, feed_dict={skip_t: skip, take_t: take})
+    dataset = dataset_ops.Dataset.range(100).skip(
+        skip).map(lambda x: (x * x, make_sparse(x))).take(take)
+    if error is None:
+      dense_val, sparse_val = self.evaluate(
+          get_single_element.get_single_element(dataset))
+      self.assertEqual(skip * skip, dense_val)
+      self.assertAllEqual([[skip]], sparse_val.indices)
+      self.assertAllEqual([skip], sparse_val.values)
+      self.assertAllEqual([skip], sparse_val.dense_shape)
+    else:
+      with self.assertRaisesRegexp(error, error_msg):
+        self.evaluate(get_single_element.get_single_element(dataset))
 
   def testWindow(self):
     """Test that `get_single_element()` can consume a nested dataset."""
diff --git a/tensorflow/python/data/experimental/kernel_tests/group_by_reducer_test.py b/tensorflow/python/data/experimental/kernel_tests/group_by_reducer_test.py
index c7366f66411896e95cc059d5241bdd376fc827b2..4194f06a34a8008ac2ed835b5300959bda9e3f78 100644
--- a/tensorflow/python/data/experimental/kernel_tests/group_by_reducer_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/group_by_reducer_test.py
@@ -33,19 +33,9 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class GroupByReducerTest(test_base.DatasetTestBase):
 
-  def checkResults(self, dataset, shapes, values):
-    self.assertEqual(shapes, dataset.output_shapes)
-    get_next = dataset.make_one_shot_iterator().get_next()
-    with self.cached_session() as sess:
-      for expected in values:
-        got = self.evaluate(get_next)
-        self.assertEqual(got, expected)
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
-
-  @test_util.run_deprecated_v1
   def testSum(self):
     reducer = grouping.Reducer(
         init_func=lambda _: np.int64(0),
@@ -54,10 +44,11 @@ class GroupByReducerTest(test_base.DatasetTestBase):
     for i in range(1, 11):
       dataset = dataset_ops.Dataset.range(2 * i).apply(
           grouping.group_by_reducer(lambda x: x % 2, reducer))
-      self.checkResults(
-          dataset, shapes=tensor_shape.scalar(), values=[(i - 1) * i, i * i])
+      self.assertDatasetProduces(
+          dataset,
+          expected_shapes=tensor_shape.scalar(),
+          expected_output=[(i - 1) * i, i * i])
 
-  @test_util.run_deprecated_v1
   def testAverage(self):
 
     def reduce_fn(x, y):
@@ -72,10 +63,11 @@ class GroupByReducerTest(test_base.DatasetTestBase):
       dataset = dataset_ops.Dataset.range(2 * i).apply(
           grouping.group_by_reducer(
               lambda x: math_ops.cast(x, dtypes.int64) % 2, reducer))
-      self.checkResults(
-          dataset, shapes=tensor_shape.scalar(), values=[i - 1, i])
+      self.assertDatasetProduces(
+          dataset,
+          expected_shapes=tensor_shape.scalar(),
+          expected_output=[i - 1, i])
 
-  @test_util.run_deprecated_v1
   def testConcat(self):
     components = np.array(list("abcdefghijklmnopqrst")).view(np.chararray)
     reducer = grouping.Reducer(
@@ -87,12 +79,11 @@ class GroupByReducerTest(test_base.DatasetTestBase):
           (dataset_ops.Dataset.from_tensor_slices(components),
            dataset_ops.Dataset.range(2 * i))).apply(
                grouping.group_by_reducer(lambda x, y: y % 2, reducer))
-      self.checkResults(
+      self.assertDatasetProduces(
           dataset,
-          shapes=tensor_shape.scalar(),
-          values=[b"acegikmoqs" [:i], b"bdfhjlnprt" [:i]])
+          expected_shapes=tensor_shape.scalar(),
+          expected_output=[b"acegikmoqs" [:i], b"bdfhjlnprt" [:i]])
 
-  @test_util.run_deprecated_v1
   def testSparseSum(self):
     def _sparse(i):
       return sparse_tensor.SparseTensorValue(
@@ -107,10 +98,11 @@ class GroupByReducerTest(test_base.DatasetTestBase):
     for i in range(1, 11):
       dataset = dataset_ops.Dataset.range(2 * i).map(_sparse).apply(
           grouping.group_by_reducer(lambda x: x.values[0] % 2, reducer))
-      self.checkResults(
-          dataset, shapes=tensor_shape.scalar(), values=[(i - 1) * i, i * i])
+      self.assertDatasetProduces(
+          dataset,
+          expected_shapes=tensor_shape.scalar(),
+          expected_output=[(i - 1) * i, i * i])
 
-  @test_util.run_deprecated_v1
   def testChangingStateShape(self):
 
     def reduce_fn(x, _):
@@ -130,14 +122,12 @@ class GroupByReducerTest(test_base.DatasetTestBase):
           grouping.group_by_reducer(lambda x: x, reducer))
       self.assertEqual([None], dataset.output_shapes[0].as_list())
       self.assertIs(None, dataset.output_shapes[1].ndims)
-      iterator = dataset.make_one_shot_iterator()
-      get_next = iterator.get_next()
-      with self.cached_session() as sess:
-        x, y = self.evaluate(get_next)
-        self.assertAllEqual([0] * (2**i), x)
-        self.assertAllEqual(np.array(1, ndmin=i), y)
-        with self.assertRaises(errors.OutOfRangeError):
-          self.evaluate(get_next)
+      get_next = self.getNext(dataset)
+      x, y = self.evaluate(get_next())
+      self.assertAllEqual([0] * (2**i), x)
+      self.assertAllEqual(np.array(1, ndmin=i), y)
+      with self.assertRaises(errors.OutOfRangeError):
+        self.evaluate(get_next())
 
   def testTypeMismatch(self):
     reducer = grouping.Reducer(
@@ -194,11 +184,10 @@ class GroupByReducerTest(test_base.DatasetTestBase):
     dataset = dataset_ops.Dataset.zip(
         (dataset_ops.Dataset.range(10), dataset_ops.Dataset.range(10))).apply(
             grouping.group_by_reducer(lambda x, y: np.int64(0), reducer))
-    get_next = dataset.make_one_shot_iterator().get_next()
-    with self.cached_session() as sess:
-      x, y = self.evaluate(get_next)
-      self.assertAllEqual(x, np.asarray([x for x in range(10)]))
-      self.assertEqual(y, 45)
+    get_next = self.getNext(dataset)
+    x, y = self.evaluate(get_next())
+    self.assertAllEqual(x, np.asarray([x for x in range(10)]))
+    self.assertEqual(y, 45)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/group_by_window_test.py b/tensorflow/python/data/experimental/kernel_tests/group_by_window_test.py
index 1e54091c7dbd8fafbbe1c77a0974af7b9b8fe0b1..d1270703c56138ca8546b04ce0e16b6c5da41fe9 100644
--- a/tensorflow/python/data/experimental/kernel_tests/group_by_window_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/group_by_window_test.py
@@ -37,6 +37,7 @@ from tensorflow.python.platform import test
 # NOTE(mrry): These tests are based on the tests in bucket_ops_test.py.
 # Currently, they use a constant batch size, though should be made to use a
 # different batch size per key.
+@test_util.run_all_in_graph_and_eager_modes
 class GroupByWindowTest(test_base.DatasetTestBase):
 
   def _dynamicPad(self, bucket, window, window_size):
@@ -50,101 +51,87 @@ class GroupByWindowTest(test_base.DatasetTestBase):
              32, (tensor_shape.TensorShape([]), tensor_shape.TensorShape(
                  [None]), tensor_shape.TensorShape([3])))))
 
-  @test_util.run_deprecated_v1
   def testSingleBucket(self):
 
     def _map_fn(v):
       return (v, array_ops.fill([v], v),
               array_ops.fill([3], string_ops.as_string(v)))
 
-    input_dataset = (
-        dataset_ops.Dataset.from_tensor_slices(math_ops.range(32)).map(_map_fn))
+    input_dataset = dataset_ops.Dataset.from_tensor_slices(
+        math_ops.range(32)).map(_map_fn)
 
     bucketed_dataset = input_dataset.apply(
         grouping.group_by_window(
             lambda x, y, z: 0,
             lambda k, bucket: self._dynamicPad(k, bucket, 32), 32))
+    get_next = self.getNext(bucketed_dataset)
 
-    iterator = bucketed_dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
+    which_bucket, bucketed_values = self.evaluate(get_next())
 
-    with self.cached_session() as sess:
-      self.evaluate(init_op)
+    self.assertEqual(0, which_bucket)
 
-      which_bucket, bucketed_values = self.evaluate(get_next)
+    expected_scalar_int = np.arange(32, dtype=np.int64)
+    expected_unk_int64 = np.zeros((32, 31)).astype(np.int64)
+    for i in range(32):
+      expected_unk_int64[i, :i] = i
+    expected_vec3_str = np.vstack(3 * [np.arange(32).astype(bytes)]).T
 
-      self.assertEqual(0, which_bucket)
+    self.assertAllEqual(expected_scalar_int, bucketed_values[0])
+    self.assertAllEqual(expected_unk_int64, bucketed_values[1])
+    self.assertAllEqual(expected_vec3_str, bucketed_values[2])
 
-      expected_scalar_int = np.arange(32, dtype=np.int64)
-      expected_unk_int64 = np.zeros((32, 31)).astype(np.int64)
-      for i in range(32):
-        expected_unk_int64[i, :i] = i
-      expected_vec3_str = np.vstack(3 * [np.arange(32).astype(bytes)]).T
-
-      self.assertAllEqual(expected_scalar_int, bucketed_values[0])
-      self.assertAllEqual(expected_unk_int64, bucketed_values[1])
-      self.assertAllEqual(expected_vec3_str, bucketed_values[2])
-
-  @test_util.run_deprecated_v1
   def testEvenOddBuckets(self):
 
     def _map_fn(v):
       return (v, array_ops.fill([v], v),
               array_ops.fill([3], string_ops.as_string(v)))
 
-    input_dataset = (
-        dataset_ops.Dataset.from_tensor_slices(math_ops.range(64)).map(_map_fn))
+    input_dataset = dataset_ops.Dataset.from_tensor_slices(
+        math_ops.range(64)).map(_map_fn)
 
     bucketed_dataset = input_dataset.apply(
         grouping.group_by_window(
             lambda x, y, z: math_ops.cast(x % 2, dtypes.int64),
             lambda k, bucket: self._dynamicPad(k, bucket, 32), 32))
 
-    iterator = bucketed_dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.evaluate(init_op)
-
-      # Get two minibatches (one containing even values, one containing odds)
-      which_bucket_even, bucketed_values_even = self.evaluate(get_next)
-      which_bucket_odd, bucketed_values_odd = self.evaluate(get_next)
-
-      # Count number of bucket_tensors.
-      self.assertEqual(3, len(bucketed_values_even))
-      self.assertEqual(3, len(bucketed_values_odd))
-
-      # Ensure bucket 0 was used for all minibatch entries.
-      self.assertAllEqual(0, which_bucket_even)
-      self.assertAllEqual(1, which_bucket_odd)
-
-      # Test the first bucket outputted, the events starting at 0
-      expected_scalar_int = np.arange(0, 32 * 2, 2, dtype=np.int64)
-      expected_unk_int64 = np.zeros((32, 31 * 2)).astype(np.int64)
-      for i in range(0, 32):
-        expected_unk_int64[i, :2 * i] = 2 * i
-        expected_vec3_str = np.vstack(
-            3 * [np.arange(0, 32 * 2, 2).astype(bytes)]).T
-
-      self.assertAllEqual(expected_scalar_int, bucketed_values_even[0])
-      self.assertAllEqual(expected_unk_int64, bucketed_values_even[1])
-      self.assertAllEqual(expected_vec3_str, bucketed_values_even[2])
-
-      # Test the second bucket outputted, the odds starting at 1
-      expected_scalar_int = np.arange(1, 32 * 2 + 1, 2, dtype=np.int64)
-      expected_unk_int64 = np.zeros((32, 31 * 2 + 1)).astype(np.int64)
-      for i in range(0, 32):
-        expected_unk_int64[i, :2 * i + 1] = 2 * i + 1
-        expected_vec3_str = np.vstack(
-            3 * [np.arange(1, 32 * 2 + 1, 2).astype(bytes)]).T
-
-      self.assertAllEqual(expected_scalar_int, bucketed_values_odd[0])
-      self.assertAllEqual(expected_unk_int64, bucketed_values_odd[1])
-      self.assertAllEqual(expected_vec3_str, bucketed_values_odd[2])
-
-  @test_util.run_deprecated_v1
+    get_next = self.getNext(bucketed_dataset)
+
+    # Get two minibatches (one containing even values, one containing odds)
+    which_bucket_even, bucketed_values_even = self.evaluate(get_next())
+    which_bucket_odd, bucketed_values_odd = self.evaluate(get_next())
+
+    # Count number of bucket_tensors.
+    self.assertEqual(3, len(bucketed_values_even))
+    self.assertEqual(3, len(bucketed_values_odd))
+
+    # Ensure bucket 0 was used for all minibatch entries.
+    self.assertAllEqual(0, which_bucket_even)
+    self.assertAllEqual(1, which_bucket_odd)
+
+    # Test the first bucket outputted, the events starting at 0
+    expected_scalar_int = np.arange(0, 32 * 2, 2, dtype=np.int64)
+    expected_unk_int64 = np.zeros((32, 31 * 2)).astype(np.int64)
+    for i in range(0, 32):
+      expected_unk_int64[i, :2 * i] = 2 * i
+      expected_vec3_str = np.vstack(
+          3 * [np.arange(0, 32 * 2, 2).astype(bytes)]).T
+
+    self.assertAllEqual(expected_scalar_int, bucketed_values_even[0])
+    self.assertAllEqual(expected_unk_int64, bucketed_values_even[1])
+    self.assertAllEqual(expected_vec3_str, bucketed_values_even[2])
+
+    # Test the second bucket outputted, the odds starting at 1
+    expected_scalar_int = np.arange(1, 32 * 2 + 1, 2, dtype=np.int64)
+    expected_unk_int64 = np.zeros((32, 31 * 2 + 1)).astype(np.int64)
+    for i in range(0, 32):
+      expected_unk_int64[i, :2 * i + 1] = 2 * i + 1
+      expected_vec3_str = np.vstack(
+          3 * [np.arange(1, 32 * 2 + 1, 2).astype(bytes)]).T
+
+    self.assertAllEqual(expected_scalar_int, bucketed_values_odd[0])
+    self.assertAllEqual(expected_unk_int64, bucketed_values_odd[1])
+    self.assertAllEqual(expected_vec3_str, bucketed_values_odd[2])
+
   def testEvenOddBucketsFilterOutAllOdd(self):
 
     def _map_fn(v):
@@ -164,35 +151,28 @@ class GroupByWindowTest(test_base.DatasetTestBase):
                    "z": tensor_shape.TensorShape([3])
                })))
 
-    input_dataset = (
-        dataset_ops.Dataset.from_tensor_slices(math_ops.range(128)).map(_map_fn)
-        .filter(lambda d: math_ops.equal(d["x"] % 2, 0)))
+    input_dataset = dataset_ops.Dataset.from_tensor_slices(math_ops.range(
+        128)).map(_map_fn).filter(lambda d: math_ops.equal(d["x"] % 2, 0))
 
     bucketed_dataset = input_dataset.apply(
         grouping.group_by_window(
             lambda d: math_ops.cast(d["x"] % 2, dtypes.int64),
             lambda k, bucket: _dynamic_pad_fn(k, bucket, 32), 32))
 
-    iterator = bucketed_dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.evaluate(init_op)
+    get_next = self.getNext(bucketed_dataset)
 
-      # Get two minibatches ([0, 2, ...] and [64, 66, ...])
-      which_bucket0, bucketed_values_even0 = self.evaluate(get_next)
-      which_bucket1, bucketed_values_even1 = self.evaluate(get_next)
+    # Get two minibatches ([0, 2, ...] and [64, 66, ...])
+    which_bucket0, bucketed_values_even0 = self.evaluate(get_next())
+    which_bucket1, bucketed_values_even1 = self.evaluate(get_next())
 
-      # Ensure that bucket 1 was completely filtered out
-      self.assertAllEqual(0, which_bucket0)
-      self.assertAllEqual(0, which_bucket1)
-      self.assertAllEqual(
-          np.arange(0, 64, 2, dtype=np.int64), bucketed_values_even0["x"])
-      self.assertAllEqual(
-          np.arange(64, 128, 2, dtype=np.int64), bucketed_values_even1["x"])
+    # Ensure that bucket 1 was completely filtered out
+    self.assertAllEqual(0, which_bucket0)
+    self.assertAllEqual(0, which_bucket1)
+    self.assertAllEqual(
+        np.arange(0, 64, 2, dtype=np.int64), bucketed_values_even0["x"])
+    self.assertAllEqual(
+        np.arange(64, 128, 2, dtype=np.int64), bucketed_values_even1["x"])
 
-  @test_util.run_deprecated_v1
   def testDynamicWindowSize(self):
     components = np.arange(100).astype(np.int64)
 
@@ -207,112 +187,81 @@ class GroupByWindowTest(test_base.DatasetTestBase):
     dataset = dataset_ops.Dataset.from_tensor_slices(components).apply(
         grouping.group_by_window(lambda x: x % 2, lambda _, xs: xs.batch(20),
                                  None, window_size_func))
-    iterator = dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.evaluate(init_op)
-      with self.assertRaises(errors.OutOfRangeError):
-        batches = 0
-        while True:
-          result = self.evaluate(get_next)
-          is_even = all(x % 2 == 0 for x in result)
-          is_odd = all(x % 2 == 1 for x in result)
-          self.assertTrue(is_even or is_odd)
-          expected_batch_size = 5 if is_even else 10
-          self.assertEqual(expected_batch_size, result.shape[0])
-          batches += 1
-
-      self.assertEqual(batches, 15)
-
-  @test_util.run_deprecated_v1
+
+    get_next = self.getNext(dataset)
+    with self.assertRaises(errors.OutOfRangeError):
+      batches = 0
+      while True:
+        result = self.evaluate(get_next())
+        is_even = all(x % 2 == 0 for x in result)
+        is_odd = all(x % 2 == 1 for x in result)
+        self.assertTrue(is_even or is_odd)
+        expected_batch_size = 5 if is_even else 10
+        self.assertEqual(expected_batch_size, result.shape[0])
+        batches += 1
+
+    self.assertEqual(batches, 15)
+
   def testSimple(self):
     components = np.random.randint(100, size=(200,)).astype(np.int64)
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components).map(lambda x: x * x)
-        .apply(
+    dataset = dataset_ops.Dataset.from_tensor_slices(
+        components).map(lambda x: x * x).apply(
             grouping.group_by_window(lambda x: x % 2, lambda _, xs: xs.batch(4),
-                                     4)).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.evaluate(init_op)
-      counts = []
-      with self.assertRaises(errors.OutOfRangeError):
-        while True:
-          result = self.evaluate(get_next)
-          self.assertTrue(
-              all(x % 2 == 0
-                  for x in result) or all(x % 2 == 1)
-              for x in result)
-          counts.append(result.shape[0])
-
-      self.assertEqual(len(components), sum(counts))
-      num_full_batches = len([c for c in counts if c == 4])
-      self.assertGreaterEqual(num_full_batches, 24)
-      self.assertTrue(all(c == 4 for c in counts[:num_full_batches]))
-
-  @test_util.run_deprecated_v1
+                                     4))
+    get_next = self.getNext(dataset)
+    counts = []
+    with self.assertRaises(errors.OutOfRangeError):
+      while True:
+        result = self.evaluate(get_next())
+        self.assertTrue(
+            all(x % 2 == 0 for x in result) or all(x % 2 == 1) for x in result)
+        counts.append(result.shape[0])
+
+    self.assertEqual(len(components), sum(counts))
+    num_full_batches = len([c for c in counts if c == 4])
+    self.assertGreaterEqual(num_full_batches, 24)
+    self.assertTrue(all(c == 4 for c in counts[:num_full_batches]))
+
   def testImmediateOutput(self):
     components = np.array(
         [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 0, 0, 2, 2, 0, 0], dtype=np.int64)
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components).repeat(-1).apply(
+    dataset = dataset_ops.Dataset.from_tensor_slices(components).repeat(
+        -1).apply(
             grouping.group_by_window(lambda x: x % 3, lambda _, xs: xs.batch(4),
-                                     4)).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.evaluate(init_op)
-      # The input is infinite, so this test demonstrates that:
-      # 1. We produce output without having to consume the entire input,
-      # 2. Different buckets can produce output at different rates, and
-      # 3. For deterministic input, the output is deterministic.
-      for _ in range(3):
-        self.assertAllEqual([0, 0, 0, 0], self.evaluate(get_next))
-        self.assertAllEqual([1, 1, 1, 1], self.evaluate(get_next))
-        self.assertAllEqual([2, 2, 2, 2], self.evaluate(get_next))
-        self.assertAllEqual([0, 0, 0, 0], self.evaluate(get_next))
-
-  @test_util.run_deprecated_v1
+                                     4))
+    get_next = self.getNext(dataset)
+    # The input is infinite, so this test demonstrates that:
+    # 1. We produce output without having to consume the entire input,
+    # 2. Different buckets can produce output at different rates, and
+    # 3. For deterministic input, the output is deterministic.
+    for _ in range(3):
+      self.assertAllEqual([0, 0, 0, 0], self.evaluate(get_next()))
+      self.assertAllEqual([1, 1, 1, 1], self.evaluate(get_next()))
+      self.assertAllEqual([2, 2, 2, 2], self.evaluate(get_next()))
+      self.assertAllEqual([0, 0, 0, 0], self.evaluate(get_next()))
+
   def testSmallGroups(self):
     components = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0], dtype=np.int64)
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components).apply(
-            grouping.group_by_window(lambda x: x % 2, lambda _, xs: xs.batch(4),
-                                     4)).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.evaluate(init_op)
-      self.assertAllEqual([0, 0, 0, 0], self.evaluate(get_next))
-      self.assertAllEqual([1, 1, 1, 1], self.evaluate(get_next))
-      # The small outputs at the end are deterministically produced in key
-      # order.
-      self.assertAllEqual([0, 0, 0], self.evaluate(get_next))
-      self.assertAllEqual([1], self.evaluate(get_next))
-
-  @test_util.run_deprecated_v1
+    dataset = dataset_ops.Dataset.from_tensor_slices(components).apply(
+        grouping.group_by_window(lambda x: x % 2, lambda _, xs: xs.batch(4), 4))
+    get_next = self.getNext(dataset)
+    self.assertAllEqual([0, 0, 0, 0], self.evaluate(get_next()))
+    self.assertAllEqual([1, 1, 1, 1], self.evaluate(get_next()))
+    # The small outputs at the end are deterministically produced in key
+    # order.
+    self.assertAllEqual([0, 0, 0], self.evaluate(get_next()))
+    self.assertAllEqual([1], self.evaluate(get_next()))
+
   def testEmpty(self):
-    iterator = (
-        dataset_ops.Dataset.range(4).apply(
-            grouping.group_by_window(lambda _: 0, lambda _, xs: xs, 0))
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.evaluate(init_op)
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          "Window size must be greater than zero, but got 0."):
-        print(self.evaluate(get_next))
-
-  @test_util.run_deprecated_v1
+    dataset = dataset_ops.Dataset.range(4).apply(
+        grouping.group_by_window(lambda _: 0, lambda _, xs: xs, 0))
+
+    get_next = self.getNext(dataset)
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        "Window size must be greater than zero, but got 0."):
+      print(self.evaluate(get_next()))
+
   def testReduceFuncError(self):
     components = np.random.randint(100, size=(200,)).astype(np.int64)
 
@@ -324,20 +273,13 @@ class GroupByWindowTest(test_base.DatasetTestBase):
           padded_shapes=(tensor_shape.TensorShape([]),
                          constant_op.constant([5], dtype=dtypes.int64) * -1))
 
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components)
-        .map(lambda x: (x, ops.convert_to_tensor([x * x]))).apply(
-            grouping.group_by_window(lambda x, _: x % 2, reduce_func,
-                                     32)).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
+    dataset = dataset_ops.Dataset.from_tensor_slices(
+        components).map(lambda x: (x, ops.convert_to_tensor([x * x]))).apply(
+            grouping.group_by_window(lambda x, _: x % 2, reduce_func, 32))
+    get_next = self.getNext(dataset)
+    with self.assertRaises(errors.InvalidArgumentError):
+      self.evaluate(get_next())
 
-    with self.cached_session() as sess:
-      self.evaluate(init_op)
-      with self.assertRaises(errors.InvalidArgumentError):
-        self.evaluate(get_next)
-
-  @test_util.run_deprecated_v1
   def testConsumeWindowDatasetMoreThanOnce(self):
     components = np.random.randint(50, size=(200,)).astype(np.int64)
 
@@ -351,27 +293,23 @@ class GroupByWindowTest(test_base.DatasetTestBase):
               4, padded_shapes=ops.convert_to_tensor([(key + 1) * 10])),
       ))
 
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components)
-        .map(lambda x: array_ops.fill([math_ops.cast(x, dtypes.int32)], x))
-        .apply(grouping.group_by_window(
+    dataset = dataset_ops.Dataset.from_tensor_slices(
+        components
+    ).map(lambda x: array_ops.fill([math_ops.cast(x, dtypes.int32)], x)).apply(
+        grouping.group_by_window(
             lambda x: math_ops.cast(array_ops.shape(x)[0] // 10, dtypes.int64),
             reduce_func, 4))
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.evaluate(init_op)
-      counts = []
-      with self.assertRaises(errors.OutOfRangeError):
-        while True:
-          tight_result, multiple_of_10_result = self.evaluate(get_next)
-          self.assertEqual(0, multiple_of_10_result.shape[1] % 10)
-          self.assertAllEqual(tight_result,
-                              multiple_of_10_result[:, :tight_result.shape[1]])
-          counts.append(tight_result.shape[0])
-      self.assertEqual(len(components), sum(counts))
+
+    get_next = self.getNext(dataset)
+    counts = []
+    with self.assertRaises(errors.OutOfRangeError):
+      while True:
+        tight_result, multiple_of_10_result = self.evaluate(get_next())
+        self.assertEqual(0, multiple_of_10_result.shape[1] % 10)
+        self.assertAllEqual(tight_result,
+                            multiple_of_10_result[:, :tight_result.shape[1]])
+        counts.append(tight_result.shape[0])
+    self.assertEqual(len(components), sum(counts))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/ignore_errors_test.py b/tensorflow/python/data/experimental/kernel_tests/ignore_errors_test.py
index bd323592e442727cdc25c13cf5329cc674580a37..1d02f4fb773537de3800d4039d10112e465df285 100644
--- a/tensorflow/python/data/experimental/kernel_tests/ignore_errors_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/ignore_errors_test.py
@@ -34,9 +34,9 @@ from tensorflow.python.util import compat
 _NUMPY_RANDOM_SEED = 42
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class IgnoreErrorsTest(test_base.DatasetTestBase):
 
-  @test_util.run_deprecated_v1
   def testMapIgnoreError(self):
     components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)
 
@@ -44,18 +44,13 @@ class IgnoreErrorsTest(test_base.DatasetTestBase):
         dataset_ops.Dataset.from_tensor_slices(components)
         .map(lambda x: array_ops.check_numerics(x, "message")).apply(
             error_ops.ignore_errors()))
-    iterator = dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.evaluate(init_op)
-      for x in [1., 2., 3., 5.]:
-        self.assertEqual(x, self.evaluate(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
-
-  @test_util.run_deprecated_v1
+    get_next = self.getNext(dataset)
+
+    for x in [1., 2., 3., 5.]:
+      self.assertEqual(x, self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
   def testParallelMapIgnoreError(self):
     components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)
 
@@ -63,18 +58,13 @@ class IgnoreErrorsTest(test_base.DatasetTestBase):
         dataset_ops.Dataset.from_tensor_slices(components).map(
             lambda x: array_ops.check_numerics(x, "message"),
             num_parallel_calls=2).prefetch(2).apply(error_ops.ignore_errors()))
-    iterator = dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.evaluate(init_op)
-      for x in [1., 2., 3., 5.]:
-        self.assertEqual(x, self.evaluate(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
-
-  @test_util.run_deprecated_v1
+    get_next = self.getNext(dataset)
+
+    for x in [1., 2., 3., 5.]:
+      self.assertEqual(x, self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
   def testReadFileIgnoreError(self):
 
     def write_string_to_file(value, filename):
@@ -91,28 +81,24 @@ class IgnoreErrorsTest(test_base.DatasetTestBase):
         dataset_ops.Dataset.from_tensor_slices(filenames).map(
             io_ops.read_file,
             num_parallel_calls=2).prefetch(2).apply(error_ops.ignore_errors()))
-    iterator = dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      # All of the files are present.
-      self.evaluate(init_op)
-      for filename in filenames:
-        self.assertEqual(compat.as_bytes(filename), self.evaluate(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
-
-      # Delete one of the files.
-      os.remove(filenames[0])
-
-      # Attempting to read filenames[0] will fail, but ignore_errors()
-      # will catch the error.
-      self.evaluate(init_op)
-      for filename in filenames[1:]:
-        self.assertEqual(compat.as_bytes(filename), self.evaluate(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
+    get_next = self.getNext(dataset)
+
+    # All of the files are present.
+    for filename in filenames:
+      self.assertEqual(compat.as_bytes(filename), self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
+    # Delete one of the files.
+    os.remove(filenames[0])
+
+    # Attempting to read filenames[0] will fail, but ignore_errors()
+    # will catch the error.
+    get_next = self.getNext(dataset)
+    for filename in filenames[1:]:
+      self.assertEqual(compat.as_bytes(filename), self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/indexed_dataset_ops_test.py b/tensorflow/python/data/experimental/kernel_tests/indexed_dataset_ops_test.py
index c3c4ccd07708d2c7cfdc57c2a6fcbf320f1dfb36..79b8c492c1f09d6ef6df49c2c1d27569b095b9a7 100644
--- a/tensorflow/python/data/experimental/kernel_tests/indexed_dataset_ops_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/indexed_dataset_ops_test.py
@@ -25,14 +25,13 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
-from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class IndexedDatasetOpsTest(test_base.DatasetTestBase):
 
-  @test_util.run_deprecated_v1
   def testLowLevelIndexedDatasetOps(self):
     identity = ged_ops.experimental_identity_indexed_dataset(
         ops.convert_to_tensor(16, dtype=dtypes.uint64))
@@ -43,40 +42,34 @@ class IndexedDatasetOpsTest(test_base.DatasetTestBase):
         output_shapes=[[]])
     materialize = ged_ops.experimental_indexed_dataset_materialize(
         identity, handle)
-    index = array_ops.placeholder(dtypes.uint64)
     get_op = ged_ops.experimental_indexed_dataset_get(
-        handle, index, output_types=[dtypes.uint64], output_shapes=[[]])
+        handle, 3, output_types=[dtypes.uint64], output_shapes=[[]])
 
-    with self.cached_session() as sess:
-      self.evaluate(materialize)
-      self.assertEqual([3], sess.run(get_op, feed_dict={index: 3}))
+    self.evaluate(materialize)
+    self.assertEqual([3], self.evaluate(get_op))
 
+  # TODO(b/117581999): Eager mode not supported.
   @test_util.run_deprecated_v1
-  def testIdentityIndexedDataset(self):
+  def testSkipEagerIdentityIndexedDataset(self):
     ds = indexed_dataset_ops.IdentityIndexedDataset(16)
     materialized = ds.materialize()
-    with self.cached_session() as sess:
-      self.evaluate(materialized.initializer)
-      placeholder = array_ops.placeholder(dtypes.uint64, shape=[])
-      for i in range(16):
-        output = sess.run(
-            materialized.get(placeholder), feed_dict={placeholder: i})
-        self.assertEqual([i], output)
-      with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(materialized.get(placeholder), feed_dict={placeholder: 16})
+    self.evaluate(materialized.initializer)
+    for i in range(16):
+      output = self.evaluate(materialized.get(i))
+      self.assertEqual([i], output)
+    with self.assertRaises(errors.InvalidArgumentError):
+      self.evaluate(materialized.get(16))
 
   @unittest.skip("Requisite functionality currently unimplemented.")
   def testIdentityIndexedDatasetIterator(self):
     ds = indexed_dataset_ops.IdentityIndexedDataset(16)
-    itr = ds.make_initializable_iterator()
-    n = itr.get_next()
-    with self.cached_session() as sess:
-      self.evaluate(itr.initializer)
-      for i in range(16):
-        output = self.evaluate(n)
-        self.assertEqual(i, output)
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(n)
+    n = self.getNext(ds)
+
+    for i in range(16):
+      output = self.evaluate(n())
+      self.assertEqual(i, output)
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(n())
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/make_batched_features_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/make_batched_features_dataset_test.py
index 48916471b35fef4f5367c2daca76f1dc7e88b410..1fb6971ecdec90964a6f860a797d7bf8ddf8bfb8 100644
--- a/tensorflow/python/data/experimental/kernel_tests/make_batched_features_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/make_batched_features_dataset_test.py
@@ -32,74 +32,58 @@ from tensorflow.python.ops import parsing_ops
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class MakeBatchedFeaturesDatasetTest(
     reader_dataset_ops_test_base.MakeBatchedFeaturesDatasetTestBase):
 
   def testRead(self):
     for batch_size in [1, 2]:
       for num_epochs in [1, 10]:
-        with ops.Graph().as_default() as g:
-          with self.session(graph=g) as sess:
-            # Basic test: read from file 0.
-            self.outputs = self.make_batch_feature(
+        # Basic test: read from file 0.
+        self.outputs = self.getNext(
+            self.make_batch_feature(
                 filenames=self.test_filenames[0],
                 label_key="label",
                 num_epochs=num_epochs,
-                batch_size=batch_size).make_one_shot_iterator().get_next()
-            self.verify_records(
-                sess,
-                batch_size,
-                0,
-                num_epochs=num_epochs,
-                label_key_provided=True)
-            with self.assertRaises(errors.OutOfRangeError):
-              self._next_actual_batch(sess, label_key_provided=True)
-
-        with ops.Graph().as_default() as g:
-          with self.session(graph=g) as sess:
-            # Basic test: read from file 1.
-            self.outputs = self.make_batch_feature(
+                batch_size=batch_size))
+        self.verify_records(
+            batch_size, 0, num_epochs=num_epochs, label_key_provided=True)
+        with self.assertRaises(errors.OutOfRangeError):
+          self._next_actual_batch(label_key_provided=True)
+
+          # Basic test: read from file 1.
+        self.outputs = self.getNext(
+            self.make_batch_feature(
                 filenames=self.test_filenames[1],
                 label_key="label",
                 num_epochs=num_epochs,
-                batch_size=batch_size).make_one_shot_iterator().get_next()
-            self.verify_records(
-                sess,
-                batch_size,
-                1,
-                num_epochs=num_epochs,
-                label_key_provided=True)
-            with self.assertRaises(errors.OutOfRangeError):
-              self._next_actual_batch(sess, label_key_provided=True)
-
-        with ops.Graph().as_default() as g:
-          with self.session(graph=g) as sess:
-            # Basic test: read from both files.
-            self.outputs = self.make_batch_feature(
+                batch_size=batch_size))
+        self.verify_records(
+            batch_size, 1, num_epochs=num_epochs, label_key_provided=True)
+        with self.assertRaises(errors.OutOfRangeError):
+          self._next_actual_batch(label_key_provided=True)
+
+        # Basic test: read from both files.
+        self.outputs = self.getNext(
+            self.make_batch_feature(
                 filenames=self.test_filenames,
                 label_key="label",
                 num_epochs=num_epochs,
-                batch_size=batch_size).make_one_shot_iterator().get_next()
-            self.verify_records(
-                sess,
-                batch_size,
-                num_epochs=num_epochs,
-                label_key_provided=True)
-            with self.assertRaises(errors.OutOfRangeError):
-              self._next_actual_batch(sess, label_key_provided=True)
-
-        with ops.Graph().as_default() as g:
-          with self.session(graph=g) as sess:
-            # Basic test: read from both files.
-            self.outputs = self.make_batch_feature(
+                batch_size=batch_size))
+        self.verify_records(
+            batch_size, num_epochs=num_epochs, label_key_provided=True)
+        with self.assertRaises(errors.OutOfRangeError):
+          self._next_actual_batch(label_key_provided=True)
+        # Basic test: read from both files.
+        self.outputs = self.getNext(
+            self.make_batch_feature(
                 filenames=self.test_filenames,
                 num_epochs=num_epochs,
-                batch_size=batch_size).make_one_shot_iterator().get_next()
-            self.verify_records(sess, batch_size, num_epochs=num_epochs)
-            with self.assertRaises(errors.OutOfRangeError):
-              self._next_actual_batch(sess)
+                batch_size=batch_size))
+        self.verify_records(batch_size, num_epochs=num_epochs)
+        with self.assertRaises(errors.OutOfRangeError):
+          self._next_actual_batch()
 
-  @test_util.run_deprecated_v1
   def testReadWithEquivalentDataset(self):
     features = {
         "file": parsing_ops.FixedLenFeature([], dtypes.int64),
@@ -109,108 +93,97 @@ class MakeBatchedFeaturesDatasetTest(
         core_readers.TFRecordDataset(self.test_filenames)
         .map(lambda x: parsing_ops.parse_single_example(x, features))
         .repeat(10).batch(2))
-    iterator = dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    next_element = iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.evaluate(init_op)
-      for file_batch, _, _, _, record_batch, _ in self._next_expected_batch(
-          range(self._num_files), 2, 10):
-        actual_batch = self.evaluate(next_element)
-        self.assertAllEqual(file_batch, actual_batch["file"])
-        self.assertAllEqual(record_batch, actual_batch["record"])
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(next_element)
+    next_element = self.getNext(dataset)
+    for file_batch, _, _, _, record_batch, _ in self._next_expected_batch(
+        range(self._num_files), 2, 10):
+      actual_batch = self.evaluate(next_element())
+      self.assertAllEqual(file_batch, actual_batch["file"])
+      self.assertAllEqual(record_batch, actual_batch["record"])
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
 
   def testReadWithFusedShuffleRepeatDataset(self):
     num_epochs = 5
     total_records = num_epochs * self._num_records
     for batch_size in [1, 2]:
       # Test that shuffling with same seed produces the same result.
-      with ops.Graph().as_default() as g:
-        with self.session(graph=g) as sess:
-          outputs1 = self.make_batch_feature(
+      outputs1 = self.getNext(
+          self.make_batch_feature(
               filenames=self.test_filenames[0],
               num_epochs=num_epochs,
               batch_size=batch_size,
               shuffle=True,
-              shuffle_seed=5).make_one_shot_iterator().get_next()
-          outputs2 = self.make_batch_feature(
+              shuffle_seed=5))
+      outputs2 = self.getNext(
+          self.make_batch_feature(
               filenames=self.test_filenames[0],
               num_epochs=num_epochs,
               batch_size=batch_size,
               shuffle=True,
-              shuffle_seed=5).make_one_shot_iterator().get_next()
-          for _ in range(total_records // batch_size):
-            batch1 = self._run_actual_batch(outputs1, sess)
-            batch2 = self._run_actual_batch(outputs2, sess)
-            for i in range(len(batch1)):
-              self.assertAllEqual(batch1[i], batch2[i])
+              shuffle_seed=5))
+      for _ in range(total_records // batch_size):
+        batch1 = self._run_actual_batch(outputs1)
+        batch2 = self._run_actual_batch(outputs2)
+        for i in range(len(batch1)):
+          self.assertAllEqual(batch1[i], batch2[i])
 
       # Test that shuffling with different seeds produces a different order.
-      with ops.Graph().as_default() as g:
-        with self.session(graph=g) as sess:
-          outputs1 = self.make_batch_feature(
+      outputs1 = self.getNext(
+          self.make_batch_feature(
               filenames=self.test_filenames[0],
               num_epochs=num_epochs,
               batch_size=batch_size,
               shuffle=True,
-              shuffle_seed=5).make_one_shot_iterator().get_next()
-          outputs2 = self.make_batch_feature(
+              shuffle_seed=5))
+      outputs2 = self.getNext(
+          self.make_batch_feature(
               filenames=self.test_filenames[0],
               num_epochs=num_epochs,
               batch_size=batch_size,
               shuffle=True,
-              shuffle_seed=15).make_one_shot_iterator().get_next()
-          all_equal = True
-          for _ in range(total_records // batch_size):
-            batch1 = self._run_actual_batch(outputs1, sess)
-            batch2 = self._run_actual_batch(outputs2, sess)
-            for i in range(len(batch1)):
-              all_equal = all_equal and np.array_equal(batch1[i], batch2[i])
-          self.assertFalse(all_equal)
+              shuffle_seed=15))
+      all_equal = True
+      for _ in range(total_records // batch_size):
+        batch1 = self._run_actual_batch(outputs1)
+        batch2 = self._run_actual_batch(outputs2)
+        for i in range(len(batch1)):
+          all_equal = all_equal and np.array_equal(batch1[i], batch2[i])
+      self.assertFalse(all_equal)
 
   def testParallelReadersAndParsers(self):
     num_epochs = 5
     for batch_size in [1, 2]:
       for reader_num_threads in [2, 4]:
         for parser_num_threads in [2, 4]:
-          with ops.Graph().as_default() as g:
-            with self.session(graph=g) as sess:
-              self.outputs = self.make_batch_feature(
+          self.outputs = self.getNext(
+              self.make_batch_feature(
                   filenames=self.test_filenames,
                   label_key="label",
                   num_epochs=num_epochs,
                   batch_size=batch_size,
                   reader_num_threads=reader_num_threads,
-                  parser_num_threads=parser_num_threads).make_one_shot_iterator(
-                  ).get_next()
-              self.verify_records(
-                  sess,
-                  batch_size,
-                  num_epochs=num_epochs,
-                  label_key_provided=True,
-                  interleave_cycle_length=reader_num_threads)
-              with self.assertRaises(errors.OutOfRangeError):
-                self._next_actual_batch(sess, label_key_provided=True)
+                  parser_num_threads=parser_num_threads))
+          self.verify_records(
+              batch_size,
+              num_epochs=num_epochs,
+              label_key_provided=True,
+              interleave_cycle_length=reader_num_threads)
+          with self.assertRaises(errors.OutOfRangeError):
+            self._next_actual_batch(label_key_provided=True)
 
-          with ops.Graph().as_default() as g:
-            with self.session(graph=g) as sess:
-              self.outputs = self.make_batch_feature(
+          self.outputs = self.getNext(
+              self.make_batch_feature(
                   filenames=self.test_filenames,
                   num_epochs=num_epochs,
                   batch_size=batch_size,
                   reader_num_threads=reader_num_threads,
-                  parser_num_threads=parser_num_threads).make_one_shot_iterator(
-                  ).get_next()
-              self.verify_records(
-                  sess,
-                  batch_size,
-                  num_epochs=num_epochs,
-                  interleave_cycle_length=reader_num_threads)
-              with self.assertRaises(errors.OutOfRangeError):
-                self._next_actual_batch(sess)
+                  parser_num_threads=parser_num_threads))
+          self.verify_records(
+              batch_size,
+              num_epochs=num_epochs,
+              interleave_cycle_length=reader_num_threads)
+          with self.assertRaises(errors.OutOfRangeError):
+            self._next_actual_batch()
 
   def testDropFinalBatch(self):
     for batch_size in [1, 2]:
@@ -222,7 +195,7 @@ class MakeBatchedFeaturesDatasetTest(
               label_key="label",
               num_epochs=num_epochs,
               batch_size=batch_size,
-              drop_final_batch=True).make_one_shot_iterator().get_next()
+              drop_final_batch=True)
           for tensor in nest.flatten(outputs):
             if isinstance(tensor, ops.Tensor):  # Guard against SparseTensor.
               self.assertEqual(tensor.shape[0], batch_size)
diff --git a/tensorflow/python/data/experimental/kernel_tests/make_csv_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/make_csv_dataset_test.py
index bcbaf1a7c49ae7e4fccdc2217eb8eb031ea6e9b7..3b7b335e7066175fba6ef190b977362bc461ca1d 100644
--- a/tensorflow/python/data/experimental/kernel_tests/make_csv_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/make_csv_dataset_test.py
@@ -29,11 +29,11 @@ from tensorflow.python.data.util import nest
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class MakeCsvDatasetTest(test_base.DatasetTestBase):
 
   def _make_csv_dataset(self, filenames, batch_size, num_epochs=1, **kwargs):
@@ -75,7 +75,6 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase):
 
   def _verify_output(
       self,
-      sess,
       dataset,
       batch_size,
       num_epochs,
@@ -83,7 +82,7 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase):
       expected_output,
       expected_keys,
   ):
-    nxt = dataset.make_one_shot_iterator().get_next()
+    get_next = self.getNext(dataset)
 
     for expected_features in self._next_expected_batch(
         expected_output,
@@ -91,7 +90,7 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase):
         batch_size,
         num_epochs,
     ):
-      actual_features = self.evaluate(nxt)
+      actual_features = self.evaluate(get_next())
 
       if label_name is not None:
         expected_labels = expected_features.pop(label_name)
@@ -103,7 +102,7 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase):
         self.assertAllEqual(expected_features[k], actual_features[k])
 
     with self.assertRaises(errors.OutOfRangeError):
-      self.evaluate(nxt)
+      self.evaluate(get_next())
 
   def _test_dataset(self,
                     inputs,
@@ -117,18 +116,15 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase):
     # Convert str type because py3 tf strings are bytestrings
     filenames = self._setup_files(
         inputs, compression_type=kwargs.get("compression_type", None))
-    with ops.Graph().as_default() as g:
-      with self.session(graph=g) as sess:
-        dataset = self._make_csv_dataset(
-            filenames,
-            batch_size=batch_size,
-            num_epochs=num_epochs,
-            label_name=label_name,
-            **kwargs)
-        self._verify_output(sess, dataset, batch_size, num_epochs, label_name,
-                            expected_output, expected_keys)
-
-  @test_util.run_deprecated_v1
+    dataset = self._make_csv_dataset(
+        filenames,
+        batch_size=batch_size,
+        num_epochs=num_epochs,
+        label_name=label_name,
+        **kwargs)
+    self._verify_output(dataset, batch_size, num_epochs, label_name,
+                        expected_output, expected_keys)
+
   def testMakeCSVDataset(self):
     """Tests making a CSV dataset with keys and defaults provided."""
     record_defaults = [
@@ -160,7 +156,6 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase):
         column_defaults=record_defaults,
     )
 
-  @test_util.run_deprecated_v1
   def testMakeCSVDataset_withBatchSizeAndEpochs(self):
     """Tests making a CSV dataset with keys and defaults provided."""
     record_defaults = [
@@ -192,7 +187,6 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase):
         column_defaults=record_defaults,
     )
 
-  @test_util.run_deprecated_v1
   def testMakeCSVDataset_withCompressionType(self):
     """Tests `compression_type` argument."""
     record_defaults = [
@@ -261,7 +255,6 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase):
           label_name="not_a_real_label",
           column_names=column_names)
 
-  @test_util.run_deprecated_v1
   def testMakeCSVDataset_withNoLabel(self):
     """Tests making a CSV dataset with no label provided."""
     record_defaults = [
@@ -291,7 +284,6 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase):
         column_defaults=record_defaults,
     )
 
-  @test_util.run_deprecated_v1
   def testMakeCSVDataset_withNoHeader(self):
     """Tests that datasets can be created from CSV files with no header line.
     """
@@ -353,7 +345,6 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase):
         column_defaults=record_defaults,
     )
 
-  @test_util.run_deprecated_v1
   def testMakeCSVDataset_withNoColNames(self):
     """Tests that datasets can be created when column names are not specified.
 
@@ -458,7 +449,6 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase):
         header=True,
     )
 
-  @test_util.run_deprecated_v1
   def testMakeCSVDataset_withSelectCols(self):
     record_defaults = [
         constant_op.constant([], dtypes.int32),
@@ -565,7 +555,6 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase):
           label_name=None,
           select_columns=["invalid_col_name"])
 
-  @test_util.run_deprecated_v1
   def testMakeCSVDataset_withShuffle(self):
     record_defaults = [
         constant_op.constant([], dtypes.int32),
@@ -590,69 +579,65 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase):
 
     total_records = 20
     for batch_size in [1, 2]:
-      with ops.Graph().as_default() as g:
-        with self.session(graph=g) as sess:
-          # Test that shuffling with the same seed produces the same result
-          dataset1 = self._make_csv_dataset(
-              filenames,
-              column_defaults=record_defaults,
-              column_names=column_names,
-              batch_size=batch_size,
-              header=True,
-              shuffle=True,
-              shuffle_seed=5,
-              num_epochs=2,
-          )
-          dataset2 = self._make_csv_dataset(
-              filenames,
-              column_defaults=record_defaults,
-              column_names=column_names,
-              batch_size=batch_size,
-              header=True,
-              shuffle=True,
-              shuffle_seed=5,
-              num_epochs=2,
-          )
-          outputs1 = dataset1.make_one_shot_iterator().get_next()
-          outputs2 = dataset2.make_one_shot_iterator().get_next()
-          for _ in range(total_records // batch_size):
-            batch1 = nest.flatten(self.evaluate(outputs1))
-            batch2 = nest.flatten(self.evaluate(outputs2))
-            for i in range(len(batch1)):
-              self.assertAllEqual(batch1[i], batch2[i])
-
-      with ops.Graph().as_default() as g:
-        with self.session(graph=g) as sess:
-          # Test that shuffling with a different seed produces different results
-          dataset1 = self._make_csv_dataset(
-              filenames,
-              column_defaults=record_defaults,
-              column_names=column_names,
-              batch_size=batch_size,
-              header=True,
-              shuffle=True,
-              shuffle_seed=5,
-              num_epochs=2,
-          )
-          dataset2 = self._make_csv_dataset(
-              filenames,
-              column_defaults=record_defaults,
-              column_names=column_names,
-              batch_size=batch_size,
-              header=True,
-              shuffle=True,
-              shuffle_seed=6,
-              num_epochs=2,
-          )
-          outputs1 = dataset1.make_one_shot_iterator().get_next()
-          outputs2 = dataset2.make_one_shot_iterator().get_next()
-          all_equal = False
-          for _ in range(total_records // batch_size):
-            batch1 = nest.flatten(self.evaluate(outputs1))
-            batch2 = nest.flatten(self.evaluate(outputs2))
-            for i in range(len(batch1)):
-              all_equal = all_equal and np.array_equal(batch1[i], batch2[i])
-          self.assertFalse(all_equal)
+      # Test that shuffling with the same seed produces the same result
+      dataset1 = self._make_csv_dataset(
+          filenames,
+          column_defaults=record_defaults,
+          column_names=column_names,
+          batch_size=batch_size,
+          header=True,
+          shuffle=True,
+          shuffle_seed=5,
+          num_epochs=2,
+      )
+      dataset2 = self._make_csv_dataset(
+          filenames,
+          column_defaults=record_defaults,
+          column_names=column_names,
+          batch_size=batch_size,
+          header=True,
+          shuffle=True,
+          shuffle_seed=5,
+          num_epochs=2,
+      )
+      next1 = self.getNext(dataset1)
+      next2 = self.getNext(dataset2)
+      for _ in range(total_records // batch_size):
+        batch1 = nest.flatten(self.evaluate(next1()))
+        batch2 = nest.flatten(self.evaluate(next2()))
+        for i in range(len(batch1)):
+          self.assertAllEqual(batch1[i], batch2[i])
+
+      # Test that shuffling with a different seed produces different results
+      dataset1 = self._make_csv_dataset(
+          filenames,
+          column_defaults=record_defaults,
+          column_names=column_names,
+          batch_size=batch_size,
+          header=True,
+          shuffle=True,
+          shuffle_seed=5,
+          num_epochs=2,
+      )
+      dataset2 = self._make_csv_dataset(
+          filenames,
+          column_defaults=record_defaults,
+          column_names=column_names,
+          batch_size=batch_size,
+          header=True,
+          shuffle=True,
+          shuffle_seed=6,
+          num_epochs=2,
+      )
+      next1 = self.getNext(dataset1)
+      next2 = self.getNext(dataset2)
+      all_equal = False
+      for _ in range(total_records // batch_size):
+        batch1 = nest.flatten(self.evaluate(next1()))
+        batch2 = nest.flatten(self.evaluate(next2()))
+        for i in range(len(batch1)):
+          all_equal = all_equal and np.array_equal(batch1[i], batch2[i])
+      self.assertFalse(all_equal)
 
   def testIndefiniteRepeatShapeInference(self):
     column_names = ["col%d" % i for i in range(5)]
diff --git a/tensorflow/python/data/experimental/kernel_tests/make_tf_record_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/make_tf_record_dataset_test.py
index 0bb7b7c5f353ecf589921a0b0a781f2a2f433ad8..9f35aa69a834dc82d50550a99665d5d248e02e0f 100644
--- a/tensorflow/python/data/experimental/kernel_tests/make_tf_record_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/make_tf_record_dataset_test.py
@@ -21,11 +21,12 @@ from tensorflow.python.data.experimental.kernel_tests import reader_dataset_ops_
 from tensorflow.python.data.experimental.ops import readers
 from tensorflow.python.data.util import nest
 from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class MakeTFRecordDatasetTest(
     reader_dataset_ops_test_base.TFRecordDatasetTestBase):
 
@@ -89,7 +90,6 @@ class MakeTFRecordDatasetTest(
       yield record_batch
 
   def _verify_records(self,
-                      sess,
                       outputs,
                       batch_size,
                       file_index,
@@ -105,7 +105,7 @@ class MakeTFRecordDatasetTest(
     for expected_batch in self._next_expected_batch(
         file_indices, batch_size, num_epochs, interleave_cycle_length,
         drop_final_batch, use_parser_fn):
-      actual_batch = self.evaluate(outputs)
+      actual_batch = self.evaluate(outputs())
       self.assertAllEqual(expected_batch, actual_batch)
 
   def _read_test(self, batch_size, num_epochs, file_index=None,
@@ -120,22 +120,25 @@ class MakeTFRecordDatasetTest(
     else:
       fn = None
 
-    with ops.Graph().as_default() as g:
-      with self.session(graph=g) as sess:
-        outputs = readers.make_tf_record_dataset(
+    outputs = self.getNext(
+        readers.make_tf_record_dataset(
             file_pattern=file_pattern,
             num_epochs=num_epochs,
             batch_size=batch_size,
             parser_fn=fn,
             num_parallel_reads=num_parallel_reads,
             drop_final_batch=drop_final_batch,
-            shuffle=False).make_one_shot_iterator().get_next()
-        self._verify_records(
-            sess, outputs, batch_size, file_index, num_epochs=num_epochs,
-            interleave_cycle_length=num_parallel_reads,
-            drop_final_batch=drop_final_batch, use_parser_fn=parser_fn)
-        with self.assertRaises(errors.OutOfRangeError):
-          self.evaluate(outputs)
+            shuffle=False))
+    self._verify_records(
+        outputs,
+        batch_size,
+        file_index,
+        num_epochs=num_epochs,
+        interleave_cycle_length=num_parallel_reads,
+        drop_final_batch=drop_final_batch,
+        use_parser_fn=parser_fn)
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(outputs())
 
   def testRead(self):
     for batch_size in [1, 2]:
@@ -176,50 +179,46 @@ class MakeTFRecordDatasetTest(
 
   def _shuffle_test(self, batch_size, num_epochs, num_parallel_reads=1,
                     seed=None):
-    with ops.Graph().as_default() as g:
-      with self.session(graph=g) as sess:
-        dataset = readers.make_tf_record_dataset(
-            file_pattern=self.test_filenames,
-            num_epochs=num_epochs,
-            batch_size=batch_size,
-            num_parallel_reads=num_parallel_reads,
-            shuffle=True,
-            shuffle_seed=seed)
-        iterator = dataset.make_initializable_iterator()
-        next_element = iterator.get_next()
-
-        self.evaluate(iterator.initializer)
-        first_batches = []
-        try:
-          while True:
-            first_batches.append(self.evaluate(next_element))
-        except errors.OutOfRangeError:
-          pass
-
-        self.evaluate(iterator.initializer)
-        second_batches = []
-        try:
-          while True:
-            second_batches.append(self.evaluate(next_element))
-        except errors.OutOfRangeError:
-          pass
-
-        self.assertEqual(len(first_batches), len(second_batches))
-        if seed is not None:
-          # if you set a seed, should get the same results
-          for i in range(len(first_batches)):
-            self.assertAllEqual(first_batches[i], second_batches[i])
-
-        expected = []
-        for f in range(self._num_files):
-          for r in range(self._num_records):
-            expected.extend([self._record(f, r)] * num_epochs)
-
-        for batches in (first_batches, second_batches):
-          actual = []
-          for b in batches:
-            actual.extend(b)
-          self.assertAllEqual(sorted(expected), sorted(actual))
+    dataset = readers.make_tf_record_dataset(
+        file_pattern=self.test_filenames,
+        num_epochs=num_epochs,
+        batch_size=batch_size,
+        num_parallel_reads=num_parallel_reads,
+        shuffle=True,
+        shuffle_seed=seed)
+
+    next_element = self.getNext(dataset)
+    first_batches = []
+    try:
+      while True:
+        first_batches.append(self.evaluate(next_element()))
+    except errors.OutOfRangeError:
+      pass
+
+    next_element = self.getNext(dataset)
+    second_batches = []
+    try:
+      while True:
+        second_batches.append(self.evaluate(next_element()))
+    except errors.OutOfRangeError:
+      pass
+
+    self.assertEqual(len(first_batches), len(second_batches))
+    if seed is not None:
+      # if you set a seed, should get the same results
+      for i in range(len(first_batches)):
+        self.assertAllEqual(first_batches[i], second_batches[i])
+
+    expected = []
+    for f in range(self._num_files):
+      for r in range(self._num_records):
+        expected.extend([self._record(f, r)] * num_epochs)
+
+    for batches in (first_batches, second_batches):
+      actual = []
+      for b in batches:
+        actual.extend(b)
+      self.assertAllEqual(sorted(expected), sorted(actual))
 
   def testShuffle(self):
     for batch_size in [1, 2]:
diff --git a/tensorflow/python/data/experimental/kernel_tests/map_and_batch_test.py b/tensorflow/python/data/experimental/kernel_tests/map_and_batch_test.py
index fc97f63358ae8d9a66a914ecc198844d7f616d92..ceadebc5411aeeafdbda0ed3c2a6c5cba78ce1c8 100644
--- a/tensorflow/python/data/experimental/kernel_tests/map_and_batch_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/map_and_batch_test.py
@@ -25,6 +25,7 @@ import numpy as np
 from tensorflow.python.data.experimental.ops import batching
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -32,11 +33,14 @@ from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import control_flow_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import script_ops
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   @parameterized.named_parameters(
@@ -49,7 +53,6 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       ("ParallelCallsNUMA", 2, None, True),
       ("ParallelBatchesNUMA", None, 10, True),
   )
-  @test_util.run_deprecated_v1
   def testMapAndBatch(self, num_parallel_calls, num_parallel_batches,
                       numa_aware):
     """Test a dataset that maps a TF function across its input elements."""
@@ -59,74 +62,66 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
                   np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
                   np.array(37.0) * np.arange(7))
 
-    count = array_ops.placeholder(dtypes.int64, shape=[])
-    batch_size = array_ops.placeholder(dtypes.int64, shape=[])
-
     def _map_fn(x, y, z):
       return math_ops.square(x), math_ops.square(y), math_ops.square(z)
 
-    dataset = (
-        dataset_ops.Dataset.from_tensor_slices(components).repeat(count).apply(
-            batching.map_and_batch(
-                map_func=_map_fn,
-                batch_size=batch_size,
-                num_parallel_calls=num_parallel_calls,
-                num_parallel_batches=num_parallel_batches)))
-
-    if numa_aware:
-      options = dataset_ops.Options()
-      options.experimental_numa_aware = True
-      dataset = dataset.with_options(options)
-
-    iterator = dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
+    def dataset_fn(batch_size, count, numa_aware=numa_aware):
+      dataset = dataset_ops.Dataset.from_tensor_slices(components).repeat(
+          count).apply(
+              batching.map_and_batch(
+                  map_func=_map_fn,
+                  batch_size=batch_size,
+                  num_parallel_calls=num_parallel_calls,
+                  num_parallel_batches=num_parallel_batches))
+      if numa_aware:
+        options = dataset_ops.Options()
+        options.experimental_numa_aware = True
+        dataset = dataset.with_options(options)
+      return dataset
+
+    # Batch of a finite input, where the batch_size divides the
+    # total number of elements.
+    dataset = dataset_fn(14, 28)
+    get_next = self.getNext(dataset)
     self.assertEqual([[None] + list(c.shape[1:]) for c in components],
-                     [t.shape.as_list() for t in get_next])
+                     [shape.as_list() for shape in dataset.output_shapes])
+    num_batches = (28 * 7) // 14
+    for i in range(num_batches):
+      result = self.evaluate(get_next())
+      for component, result_component in zip(components, result):
+        for j in range(14):
+          self.assertAllEqual(component[(i * 14 + j) % 7]**2,
+                              result_component[j])
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
-    with self.cached_session() as sess:
-      # Batch of a finite input, where the batch_size divides the
-      # total number of elements.
-      sess.run(init_op, feed_dict={count: 28, batch_size: 14})
-      num_batches = (28 * 7) // 14
-      for i in range(num_batches):
-        result = self.evaluate(get_next)
-        for component, result_component in zip(components, result):
-          for j in range(14):
-            self.assertAllEqual(component[(i * 14 + j) % 7]**2,
-                                result_component[j])
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
-
-      # Batch of a finite input, where the batch_size does not
-      # divide the total number of elements.
-      sess.run(init_op, feed_dict={count: 14, batch_size: 8})
-
-      # We expect (num_batches - 1) full-sized batches.
-      num_batches = int(math.ceil((14 * 7) / 8))
-      for i in range(num_batches - 1):
-        result = self.evaluate(get_next)
-        for component, result_component in zip(components, result):
-          for j in range(8):
-            self.assertAllEqual(component[(i * 8 + j) % 7]**2,
-                                result_component[j])
-      result = self.evaluate(get_next)
+    # Batch of a finite input, where the batch_size does not
+    # divide the total number of elements.
+    get_next = self.getNext(dataset_fn(8, 14))
+
+    # We expect (num_batches - 1) full-sized batches.
+    num_batches = int(math.ceil((14 * 7) / 8))
+    for i in range(num_batches - 1):
+      result = self.evaluate(get_next())
       for component, result_component in zip(components, result):
-        for j in range((14 * 7) % 8):
-          self.assertAllEqual(component[((num_batches - 1) * 8 + j) % 7]**2,
+        for j in range(8):
+          self.assertAllEqual(component[(i * 8 + j) % 7]**2,
                               result_component[j])
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
 
-      # Batch of an empty input should fail straight away.
-      sess.run(init_op, feed_dict={count: 0, batch_size: 8})
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
+    result = self.evaluate(get_next())
+    for component, result_component in zip(components, result):
+      for j in range((14 * 7) % 8):
+        self.assertAllEqual(component[((num_batches - 1) * 8 + j) % 7]**2,
+                            result_component[j])
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
+    # Batch of an empty input should fail straight away.
+    self.assertDatasetProduces(dataset_fn(8, 0), expected_output=[])
 
-      # Empty batch should be an initialization time error.
-      with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(init_op, feed_dict={count: 14, batch_size: 0})
+    # Empty batch should be an initialization time error.
+    self.assertDatasetProduces(
+        dataset_fn(0, 14), expected_error=(errors.InvalidArgumentError, ""))
 
   @parameterized.named_parameters(
       ("Even", False, False),
@@ -134,7 +129,6 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       ("EvenNUMA", False, True),
       ("UnevenNUMA", True, True),
   )
-  @test_util.run_deprecated_v1
   def testMapAndBatchPartialBatch(self, drop_remainder, numa_aware):
     dataset = (
         dataset_ops.Dataset.range(10).apply(
@@ -147,26 +141,20 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       options = dataset_ops.Options()
       options.experimental_numa_aware = True
       dataset = dataset.with_options(options)
-    iterator = dataset.make_one_shot_iterator()
 
     if drop_remainder:
-      self.assertEqual([4, 1], iterator.output_shapes.as_list())
+      self.assertEqual([4, 1], dataset.output_shapes.as_list())
     else:
-      self.assertEqual([None, 1], iterator.output_shapes.as_list())
-    next_element = iterator.get_next()
-    with self.cached_session() as sess:
-      self.assertAllEqual([[0], [1], [4], [9]], self.evaluate(next_element))
-      self.assertAllEqual([[16], [25], [36], [49]], self.evaluate(next_element))
-      if not drop_remainder:
-        self.assertAllEqual([[64], [81]], self.evaluate(next_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(next_element)
+      self.assertEqual([None, 1], dataset.output_shapes.as_list())
+    expected_output = [[[0], [1], [4], [9]], [[16], [25], [36], [49]]]
+    if not drop_remainder:
+      expected_output.append([[64], [81]])
+    self.assertDatasetProduces(dataset, expected_output=expected_output)
 
   @parameterized.named_parameters(
       ("Normal", False),
       ("NUMA", True),
   )
-  @test_util.run_deprecated_v1
   def testMapAndBatchYieldsPartialBatch(self, numa_aware):
     dataset = (
         dataset_ops.Dataset.range(10).apply(
@@ -176,16 +164,12 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       options = dataset_ops.Options()
       options.experimental_numa_aware = True
       dataset = dataset.with_options(options)
+    self.assertEqual([None, 1], dataset.output_shapes.as_list())
+    expected_output = [[[0], [1], [4], [9]], [[16], [25], [36], [49]],
+                       [[64], [81]]]
+    self.assertDatasetProduces(dataset, expected_output=expected_output)
 
-    iterator = dataset.make_one_shot_iterator()
-    self.assertEqual([None, 1], iterator.output_shapes.as_list())
-    next_element = iterator.get_next()
-    with self.cached_session() as sess:
-      self.assertAllEqual([[0], [1], [4], [9]], self.evaluate(next_element))
-      self.assertAllEqual([[16], [25], [36], [49]], self.evaluate(next_element))
-      self.assertAllEqual([[64], [81]], self.evaluate(next_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(next_element)
+# TODO(b/117581999): eager expected not same as actual, debug.
 
   @parameterized.named_parameters(
       ("Normal", False),
@@ -199,27 +183,32 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       options = dataset_ops.Options()
       options.experimental_numa_aware = True
       dataset = dataset.with_options(options)
-    iterator = dataset.make_one_shot_iterator()
+
+    if context.executing_eagerly():
+      iterator = iter(dataset)
+      get_next = iterator._next_internal  # pylint: disable=protected-access
+    else:
+      iterator = dataset_ops.make_one_shot_iterator(dataset)
+      get_next = iterator.get_next
 
     elements = []
     for _ in range(100):
-      elements.append(iterator.get_next())
-    with self.cached_session() as sess:
-      for i in range(5):
-        got = self.evaluate(elements)
-        got.sort(key=lambda x: x[0])
-        expected = []
-        for j in range(100):
-          expected.append(range(i * 10000 + j * 100, i * 10000 + (j + 1) * 100))
-        self.assertAllEqual(got, expected)
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(elements)
+      elements.append(get_next)
+
+    for i in range(5):
+      got = self.evaluate([element() for element in elements])
+      got.sort(key=lambda x: x[0])
+      expected = []
+      for j in range(100):
+        expected.append(range(i * 10000 + j * 100, i * 10000 + (j + 1) * 100))
+      self.assertAllEqual(got, expected)
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate([element() for element in elements])
 
   @parameterized.named_parameters(
       ("Normal", False),
       ("NUMA", True),
   )
-  @test_util.run_deprecated_v1
   def testMapAndBatchParallelGetNextDropRemainder(self, numa_aware):
     dataset = dataset_ops.Dataset.range(49999).apply(
         batching.map_and_batch(
@@ -229,27 +218,32 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       options = dataset_ops.Options()
       options.experimental_numa_aware = True
       dataset = dataset.with_options(options)
-    iterator = dataset.make_one_shot_iterator()
+
+    if context.executing_eagerly():
+      iterator = iter(dataset)
+      get_next = iterator._next_internal  # pylint: disable=protected-access
+    else:
+      iterator = dataset_ops.make_one_shot_iterator(dataset)
+      get_next = iterator.get_next
 
     elements = []
     for _ in range(100):
-      elements.append(iterator.get_next())
-    with self.cached_session() as sess:
-      for i in range(4):
-        got = self.evaluate(elements)
-        got.sort(key=lambda x: x[0])
-        expected = []
-        for j in range(100):
-          expected.append(range(i * 10000 + j * 100, i * 10000 + (j + 1) * 100))
-        self.assertAllEqual(got, expected)
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(elements)
+      elements.append(get_next)
+
+    for i in range(4):
+      got = self.evaluate([element() for element in elements])
+      got.sort(key=lambda x: x[0])
+      expected = []
+      for j in range(100):
+        expected.append(range(i * 10000 + j * 100, i * 10000 + (j + 1) * 100))
+      self.assertAllEqual(got, expected)
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate([element() for element in elements])
 
   @parameterized.named_parameters(
       ("Normal", False),
       ("NUMA", True),
   )
-  @test_util.run_deprecated_v1
   def testMapAndBatchSparse(self, numa_aware):
 
     def _sparse(i):
@@ -262,52 +256,39 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       options = dataset_ops.Options()
       options.experimental_numa_aware = True
       dataset = dataset.with_options(options)
-    iterator = dataset.make_initializable_iterator()
 
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.evaluate(init_op)
-      for i in range(2):
-        actual = self.evaluate(get_next)
-        expected = sparse_tensor.SparseTensorValue(
-            indices=[[0, 0], [1, 0], [2, 0], [3, 0], [4, 0]],
-            values=[i * 5, i * 5 + 1, i * 5 + 2, i * 5 + 3, i * 5 + 4],
-            dense_shape=[5, 1])
-        self.assertTrue(sparse_tensor.is_sparse(actual))
-        self.assertSparseValuesEqual(actual, expected)
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
+    self.assertDatasetProduces(
+        dataset,
+        expected_output=[
+            sparse_tensor.SparseTensorValue(
+                indices=[[0, 0], [1, 0], [2, 0], [3, 0], [4, 0]],
+                values=[i * 5, i * 5 + 1, i * 5 + 2, i * 5 + 3, i * 5 + 4],
+                dense_shape=[5, 1]) for i in range(2)
+        ])
 
   @parameterized.named_parameters(
       ("Normal", False),
       ("NUMA", True),
   )
-  @test_util.run_deprecated_v1
   def testMapAndBatchFails(self, numa_aware):
     """Test a dataset that maps a TF function across its input elements."""
-    dataset = dataset_ops.Dataset.from_tensors(
-        array_ops.check_numerics(
-            constant_op.constant(1.0) / constant_op.constant(0.0), "oops"))
-    batch_size = array_ops.placeholder(dtypes.int64, shape=[])
-    dataset = dataset.apply(batching.map_and_batch(lambda x: x, batch_size))
-    if numa_aware:
-      options = dataset_ops.Options()
-      options.experimental_numa_aware = True
-      dataset = dataset.with_options(options)
-    iterator = dataset.make_initializable_iterator()
 
-    init_op = iterator.initializer
-    with self.cached_session() as sess:
-      with self.assertRaisesRegexp(errors.InvalidArgumentError, "oops"):
-        sess.run(init_op, feed_dict={batch_size: 14})
+    with self.assertRaisesRegexp(errors.InvalidArgumentError, "oops"):
+      dataset = dataset_ops.Dataset.from_tensors(
+          array_ops.check_numerics(
+              constant_op.constant(1.0) / constant_op.constant(0.0), "oops"))
+      dataset = dataset.apply(batching.map_and_batch(lambda x: x, 14))
+      if numa_aware:
+        options = dataset_ops.Options()
+        options.experimental_numa_aware = True
+        dataset = dataset.with_options(options)
+      get_next = self.getNext(dataset)
+      self.evaluate(get_next())
 
   @parameterized.named_parameters(
       ("Normal", False),
       ("NUMA", True),
   )
-  @test_util.run_deprecated_v1
   def testMapAndBatchShapeMismatch(self, numa_aware):
     """Test a dataset that maps a TF function across its input elements."""
 
@@ -325,15 +306,10 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       options = dataset_ops.Options()
       options.experimental_numa_aware = True
       dataset = dataset.with_options(options)
-    iterator = dataset.make_initializable_iterator()
-
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-    with self.cached_session() as sess:
-      self.evaluate(init_op)
-      with self.assertRaisesRegexp(errors.InvalidArgumentError,
-                                   "number of elements does not match"):
-        self.evaluate(get_next)
+    self.assertDatasetProduces(
+        dataset,
+        expected_error=(errors.InvalidArgumentError,
+                        "number of elements does not match"))
 
   @parameterized.named_parameters(
       ("Normal", False),
@@ -358,12 +334,9 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       options = dataset_ops.Options()
       options.experimental_numa_aware = True
       dataset = dataset.with_options(options)
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      for _ in range(3):
-        self.evaluate(get_next)
+    get_next = self.getNext(dataset)
+    for _ in range(3):
+      self.evaluate(get_next())
 
   @parameterized.named_parameters(
       ("1", 0, False),
@@ -379,14 +352,11 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       ("5NUMA", 95, True),
       ("6NUMA", 99, True),
   )
-  @test_util.run_deprecated_v1
-  def testMapAndBatchOutOfRangeError(self, threshold, numa_aware):
+  def testMapAndBatchMapError(self, threshold, numa_aware):
 
     def raising_py_fn(i):
-      if i == threshold:
+      if i >= threshold:
         raise StopIteration()
-      elif i > threshold:
-        raise RuntimeError("Alternate error; you shouldn't see me! (i: %s)" % i)
       else:
         return i
 
@@ -398,19 +368,22 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       options = dataset_ops.Options()
       options.experimental_numa_aware = True
       dataset = dataset.with_options(options)
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
 
-    with self.cached_session() as sess:
-      for i in range(threshold // 10):
-        self.assertAllEqual([i * 10 + j for j in range(10)],
-                            self.evaluate(get_next))
+    get_next = self.getNext(dataset)
+    for i in range(threshold // 10):
+      self.assertAllEqual([i * 10 + j for j in range(10)],
+                          self.evaluate(get_next()))
+    if numa_aware:
       if threshold % 10 != 0:
         self.assertAllEqual(
             [threshold // 10 * 10 + j for j in range(threshold % 10)],
-            self.evaluate(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
+            self.evaluate(get_next()))
+    else:
+      for i in range(threshold // 10, 10):
+        with self.assertRaises(errors.InvalidArgumentError):
+          self.evaluate(get_next())
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
   @parameterized.named_parameters(
       ("1", False, dtypes.bool, False),
@@ -449,13 +422,12 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       options.experimental_numa_aware = True
       dataset = dataset.with_options(options)
 
-    get_next = dataset.make_one_shot_iterator().get_next()
-
-    with self.cached_session() as sess:
-      for _ in range(10):
-        self.assertAllEqual([element for _ in range(10)],
-                            self.evaluate(get_next))
+    get_next = self.getNext(dataset)
+    for _ in range(10):
+      self.assertAllEqual([element for _ in range(10)],
+                          self.evaluate(get_next()))
 
+  # TODO(b/117581999): add eager coverage.
   @parameterized.named_parameters(
       ("Identity", None, lambda x: x, None),
       ("Replicate", None, lambda x: (x, x), None),
@@ -463,10 +435,10 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       ("Project", (None, None), lambda x, y: x, None),
   )
   @test_util.run_deprecated_v1
-  def testShortCircuit(self, structure, map_fn, num_parallel_calls):
+  def testSkipEagerShortCircuit(self, structure, map_fn, num_parallel_calls):
     dataset = self.structuredDataset(structure).repeat().apply(
         batching.map_and_batch(map_fn, batch_size=10))
-    get_next = dataset.make_one_shot_iterator().get_next()
+    get_next = dataset_ops.make_one_shot_iterator(dataset).get_next()
 
     with self.cached_session() as sess:
       if isinstance(structure, tuple):
@@ -477,30 +449,25 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
             sess.run(self.structuredElement(structure, shape=[10])))
       self.assertAllEqual(expected, self.evaluate(get_next))
 
-  @test_util.run_deprecated_v1
   def testShortCircuitCapturedInput(self):
-    captured_t = array_ops.placeholder(dtypes.int64, shape=[])
+    captured_t = variables.Variable(42)
     dataset = self.structuredDataset(None).repeat().apply(
         batching.map_and_batch(lambda x: captured_t, batch_size=10))
-    iterator = dataset.make_initializable_iterator()
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(iterator.initializer, feed_dict={captured_t: 42})
-      self.assertAllEqual([42] * 10, self.evaluate(get_next))
+    self.evaluate(variables.global_variables_initializer())
+    get_next = self.getNext(dataset, requires_initialization=True)
+    self.assertAllEqual([42] * 10, self.evaluate(get_next()))
 
   @parameterized.named_parameters(
       ("Normal", False),
       ("NUMA", True),
   )
-  @test_util.run_deprecated_v1
   def testMapAndBatchControlFlow(self, numa_aware):
 
     def map_fn(x):
-      previous_cond_v2_value = control_flow_ops.ENABLE_COND_V2
-      control_flow_ops.ENABLE_COND_V2 = True
+      previous_control_flow_v2_value = control_flow_util.ENABLE_CONTROL_FLOW_V2
+      control_flow_util.ENABLE_CONTROL_FLOW_V2 = True
       return_value = control_flow_ops.cond(x < 50, lambda: x + 1, lambda: x * x)
-      control_flow_ops.ENABLE_COND_V2 = previous_cond_v2_value
+      control_flow_util.ENABLE_CONTROL_FLOW_V2 = previous_control_flow_v2_value
       return return_value
 
     dataset = dataset_ops.Dataset.range(100).apply(
@@ -509,20 +476,17 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       options = dataset_ops.Options()
       options.experimental_numa_aware = True
       dataset = dataset.with_options(options)
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-    with self.cached_session() as sess:
-      for i in range(10):
-        print("Case %d" % i)
-        if i < 5:
-          self.assertAllEqual([i * 10 + j + 1 for j in range(10)],
-                              self.evaluate(get_next))
-        else:
-          self.assertAllEqual(
-              [((i * 10) + j) * ((i * 10) + j) for j in range(10)],
-              self.evaluate(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
+    get_next = self.getNext(dataset)
+    for i in range(10):
+      if i < 5:
+        self.assertAllEqual([i * 10 + j + 1 for j in range(10)],
+                            self.evaluate(get_next()))
+      else:
+        self.assertAllEqual(
+            [((i * 10) + j) * ((i * 10) + j) for j in range(10)],
+            self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/map_defun_op_test.py b/tensorflow/python/data/experimental/kernel_tests/map_defun_op_test.py
index 6042ca1c63f561a20e58e63e7864e13e847d3b35..19830a23bb2ea7ace55a458351d4eda556ba3bf8 100644
--- a/tensorflow/python/data/experimental/kernel_tests/map_defun_op_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/map_defun_op_test.py
@@ -31,11 +31,11 @@ from tensorflow.python.framework import tensor_spec
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import data_flow_ops
-from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
+# TODO(b/117581999): add eager coverage.
 class MapDefunTest(test_base.DatasetTestBase):
 
   def testMapDefunSimple(self):
@@ -237,7 +237,7 @@ class MapDefunTest(test_base.DatasetTestBase):
       thread = self.checkedThread(
           self._assert_op_cancelled, args=(sess, map_defun_op))
       thread.start()
-      time.sleep(0.1)
+      time.sleep(0.2)
       sess.close()
       thread.join()
 
@@ -254,46 +254,5 @@ class MapDefunTest(test_base.DatasetTestBase):
     self.assertAllEqual(self.evaluate(expected), self.evaluate(map_defun_op))
 
 
-class MapDefunBenchmark(test.Benchmark):
-
-  def _run(self, op, name=None, num_iters=3000):
-    with session.Session() as sess:
-      # Warm up the session
-      for _ in range(5):
-        self.evaluate(op)
-      start = time.time()
-      for _ in range(num_iters):
-        self.evaluate(op)
-      end = time.time()
-      mean_us = (end - start) * 1e6 / num_iters
-      self.report_benchmark(
-          name=name,
-          iters=num_iters,
-          wall_time=mean_us,
-          extras={"examples_per_sec": num_iters / (end - start)})
-
-  def benchmarkDefunVsMapFn(self):
-    """Benchmarks to compare the performance of MapDefun vs tf.map_fn."""
-
-    @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.int32)])
-    def defun(x):
-      return array_ops.identity(x)
-
-    def map_fn(x):
-      return array_ops.identity(x)
-
-    base = math_ops.range(100)
-    for input_size in [10, 100, 1000, 10000]:
-      num_iters = 100000 // input_size
-      map_defun_op = map_defun.map_defun(defun, [base], [dtypes.int32], [()])
-      map_fn_op = functional_ops.map_fn(map_fn, base)
-
-      self._run(
-          map_defun_op,
-          "benchmarkMapDefun_size_%d" % input_size,
-          num_iters=num_iters)
-      self._run(
-          map_fn_op, "benchmarkMapFn_size_%d" % input_size, num_iters=num_iters)
-
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/matching_files_test.py b/tensorflow/python/data/experimental/kernel_tests/matching_files_test.py
index 787a7a91a4b19a2ec2d6294c27c4bd0005ddefa4..fe83b4c66ec06fe5cd13caceb7c399036c4c4f5e 100644
--- a/tensorflow/python/data/experimental/kernel_tests/matching_files_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/matching_files_test.py
@@ -29,7 +29,8 @@ from tensorflow.python.platform import test
 from tensorflow.python.util import compat
 
 
-class MatchingFilesTest(test_base.DatasetTestBase):
+@test_util.run_all_in_graph_and_eager_modes
+class MatchingFilesDatasetTest(test_base.DatasetTestBase):
 
   def setUp(self):
     self.tmp_dir = tempfile.mkdtemp()
@@ -41,30 +42,23 @@ class MatchingFilesTest(test_base.DatasetTestBase):
     for filename in filenames:
       open(os.path.join(self.tmp_dir, filename), 'a').close()
 
-  @test_util.run_deprecated_v1
   def testNonExistingDirectory(self):
     """Test the MatchingFiles dataset with a non-existing directory."""
 
     self.tmp_dir = os.path.join(self.tmp_dir, 'nonexistingdir')
     dataset = matching_files.MatchingFilesDataset(
         os.path.join(self.tmp_dir, '*'))
-    with self.cached_session() as sess:
-      next_element = dataset.make_one_shot_iterator().get_next()
-      with self.assertRaises(errors.NotFoundError):
-        sess.run(next_element)
+    self.assertDatasetProduces(
+        dataset, expected_error=(errors.NotFoundError, ''))
 
-  @test_util.run_deprecated_v1
   def testEmptyDirectory(self):
     """Test the MatchingFiles dataset with an empty directory."""
 
     dataset = matching_files.MatchingFilesDataset(
         os.path.join(self.tmp_dir, '*'))
-    with self.cached_session() as sess:
-      next_element = dataset.make_one_shot_iterator().get_next()
-      with self.assertRaises(errors.NotFoundError):
-        sess.run(next_element)
+    self.assertDatasetProduces(
+        dataset, expected_error=(errors.NotFoundError, ''))
 
-  @test_util.run_deprecated_v1
   def testSimpleDirectory(self):
     """Test the MatchingFiles dataset with a simple directory."""
 
@@ -73,21 +67,14 @@ class MatchingFilesTest(test_base.DatasetTestBase):
 
     dataset = matching_files.MatchingFilesDataset(
         os.path.join(self.tmp_dir, '*'))
-    with self.cached_session() as sess:
-      next_element = dataset.make_one_shot_iterator().get_next()
+    self.assertDatasetProduces(
+        dataset,
+        expected_output=[
+            compat.as_bytes(os.path.join(self.tmp_dir, filename))
+            for filename in filenames
+        ],
+        assert_items_equal=True)
 
-      expected_filenames = []
-      actual_filenames = []
-      for filename in filenames:
-        expected_filenames.append(
-            compat.as_bytes(os.path.join(self.tmp_dir, filename)))
-        actual_filenames.append(compat.as_bytes(sess.run(next_element)))
-
-      self.assertItemsEqual(expected_filenames, actual_filenames)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-  @test_util.run_deprecated_v1
   def testFileSuffixes(self):
     """Test the MatchingFiles dataset using the suffixes of filename."""
 
@@ -96,20 +83,14 @@ class MatchingFilesTest(test_base.DatasetTestBase):
 
     dataset = matching_files.MatchingFilesDataset(
         os.path.join(self.tmp_dir, '*.py'))
-    with self.cached_session() as sess:
-      next_element = dataset.make_one_shot_iterator().get_next()
-      expected_filenames = []
-      actual_filenames = []
-      for filename in filenames[1:-1]:
-        expected_filenames.append(
-            compat.as_bytes(os.path.join(self.tmp_dir, filename)))
-        actual_filenames.append(compat.as_bytes(sess.run(next_element)))
-
-      self.assertItemsEqual(expected_filenames, actual_filenames)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-  @test_util.run_deprecated_v1
+    self.assertDatasetProduces(
+        dataset,
+        expected_output=[
+            compat.as_bytes(os.path.join(self.tmp_dir, filename))
+            for filename in filenames[1:-1]
+        ],
+        assert_items_equal=True)
+
   def testFileMiddles(self):
     """Test the MatchingFiles dataset using the middles of filename."""
 
@@ -118,20 +99,14 @@ class MatchingFilesTest(test_base.DatasetTestBase):
 
     dataset = matching_files.MatchingFilesDataset(
         os.path.join(self.tmp_dir, 'b*.py*'))
-    with self.cached_session() as sess:
-      next_element = dataset.make_one_shot_iterator().get_next()
-      expected_filenames = []
-      actual_filenames = []
-      for filename in filenames[1:3]:
-        expected_filenames.append(
-            compat.as_bytes(os.path.join(self.tmp_dir, filename)))
-        actual_filenames.append(compat.as_bytes(sess.run(next_element)))
-
-      self.assertItemsEqual(expected_filenames, actual_filenames)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-  @test_util.run_deprecated_v1
+    self.assertDatasetProduces(
+        dataset,
+        expected_output=[
+            compat.as_bytes(os.path.join(self.tmp_dir, filename))
+            for filename in filenames[1:3]
+        ],
+        assert_items_equal=True)
+
   def testNestedDirectories(self):
     """Test the MatchingFiles dataset with nested directories."""
 
@@ -155,21 +130,20 @@ class MatchingFilesTest(test_base.DatasetTestBase):
     ]
 
     dataset = matching_files.MatchingFilesDataset(patterns)
-    with self.cached_session() as sess:
-      next_element = dataset.make_one_shot_iterator().get_next()
-      expected_filenames = [
-          compat.as_bytes(filename)
-          for filename in filenames
-          if filename.endswith('.txt') or filename.endswith('.log')
-      ]
-      actual_filenames = []
-      while True:
-        try:
-          actual_filenames.append(compat.as_bytes(sess.run(next_element)))
-        except errors.OutOfRangeError:
-          break
-
-      self.assertItemsEqual(expected_filenames, actual_filenames)
+    next_element = self.getNext(dataset)
+    expected_filenames = [
+        compat.as_bytes(filename)
+        for filename in filenames
+        if filename.endswith('.txt') or filename.endswith('.log')
+    ]
+    actual_filenames = []
+    while True:
+      try:
+        actual_filenames.append(compat.as_bytes(self.evaluate(next_element())))
+      except errors.OutOfRangeError:
+        break
+
+    self.assertItemsEqual(expected_filenames, actual_filenames)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/BUILD b/tensorflow/python/data/experimental/kernel_tests/optimization/BUILD
index e05f382171836bb624145ba6cd4b4c91488a714e..bf868ebe79339e3c36473711ece064210db5f47f 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/BUILD
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/BUILD
@@ -279,7 +279,6 @@ py_test(
         "//tensorflow/python:errors",
         "//tensorflow/python:math_ops",
         "//tensorflow/python/data/experimental/ops:optimization",
-        "//tensorflow/python/data/experimental/ops:optimization_options",
         "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
     ],
@@ -287,7 +286,7 @@ py_test(
 
 py_test(
     name = "optimize_dataset_test",
-    size = "small",
+    size = "medium",
     srcs = ["optimize_dataset_test.py"],
     srcs_version = "PY2AND3",
     tags = [
@@ -301,10 +300,17 @@ py_test(
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:random_ops",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/data/experimental/ops:batching",
+        "//tensorflow/python/data/experimental/ops:grouping",
         "//tensorflow/python/data/experimental/ops:optimization",
+        "//tensorflow/python/data/experimental/ops:optimization_options",
+        "//tensorflow/python/data/experimental/ops:scan_ops",
         "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/eager:context",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/filter_fusion_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/filter_fusion_test.py
index 7371cf31dff33a5de18f3268ecdfc91c6a08b29c..3ce921b5efe9e870fe1c5fb6406736f8bbb9c09f 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/filter_fusion_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/filter_fusion_test.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 from absl.testing import parameterized
 
 from tensorflow.python.data.experimental.ops import optimization
-from tensorflow.python.data.experimental.ops.optimization_options import OptimizationOptions
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
@@ -72,7 +71,6 @@ class FilterFusionTest(test_base.DatasetTestBase, parameterized.TestCase):
 
     dataset = dataset.cache()
     options = dataset_ops.Options()
-    options.experimental_optimization = OptimizationOptions()
     options.experimental_optimization.filter_fusion = True
     dataset = dataset.with_options(options)
     expected_output = []
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/hoist_random_uniform_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/hoist_random_uniform_test.py
index 0aacf8bb07cb5cb67fff6eaa9c4184678b45aa95..f080891f2e783f0cbe2f6f6f8fb4bfa1ff726745 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/hoist_random_uniform_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/hoist_random_uniform_test.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 from absl.testing import parameterized
 
 from tensorflow.python.data.experimental.ops import optimization
-from tensorflow.python.data.experimental.ops.optimization_options import OptimizationOptions
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import context
@@ -70,7 +69,7 @@ class HoistRandomUniformTest(test_base.DatasetTestBase, parameterized.TestCase):
       iterator = dataset.__iter__()
       get_next = iterator._next_internal  # pylint: disable=protected-access
     else:
-      iterator = dataset.make_one_shot_iterator()
+      iterator = dataset_ops.make_one_shot_iterator(dataset)
       get_next = iterator.get_next
     for _ in range(5):
       result = self.evaluate(get_next())
@@ -92,7 +91,6 @@ class HoistRandomUniformTest(test_base.DatasetTestBase, parameterized.TestCase):
             ["Zip[0]", "Map"] if will_optimize else ["Map"])).map(function)
 
     options = dataset_ops.Options()
-    options.experimental_optimization = OptimizationOptions()
     options.experimental_optimization.hoist_random_uniform = True
     dataset = dataset.with_options(options)
     self._testDataset(dataset)
@@ -109,7 +107,6 @@ class HoistRandomUniformTest(test_base.DatasetTestBase, parameterized.TestCase):
     dataset = dataset_ops.Dataset.range(5).apply(
         optimization.assert_next(["Zip[0]", "Map"])).map(random_with_capture)
     options = dataset_ops.Options()
-    options.experimental_optimization = OptimizationOptions()
     options.experimental_optimization.hoist_random_uniform = True
     dataset = dataset.with_options(options)
     self._testDataset(dataset)
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/latency_all_edges_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/latency_all_edges_test.py
index fc65f52704c3389a24e9f304cfa1cadd5686c7d6..8af86da852169eae992c0bad92ae8acbbdff5bb6 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/latency_all_edges_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/latency_all_edges_test.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 from tensorflow.python.data.experimental.kernel_tests import stats_dataset_test_base
 from tensorflow.python.data.experimental.ops import optimization
 from tensorflow.python.data.experimental.ops import stats_aggregator
-from tensorflow.python.data.experimental.ops import stats_options
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
@@ -36,7 +35,6 @@ class LatencyAllEdgesTest(stats_dataset_test_base.StatsDatasetTestBase):
             ["LatencyStats", "Map", "LatencyStats", "Prefetch",
              "LatencyStats"])).map(lambda x: x * x).prefetch(1)
     options = dataset_ops.Options()
-    options.experimental_stats = stats_options.StatsOptions()
     options.experimental_stats.latency_all_edges = True
     options.experimental_stats.aggregator = aggregator
     dataset = dataset.with_options(options)
@@ -53,29 +51,6 @@ class LatencyAllEdgesTest(stats_dataset_test_base.StatsDatasetTestBase):
     self._assertSummaryHasCount(summary_str,
                                 "record_latency_PrefetchDataset/_6", 1)
 
-  def testLatencyStatsOptimizationV2(self):
-    aggregator = stats_aggregator.StatsAggregator()
-    dataset = dataset_ops.Dataset.from_tensors(1).apply(
-        optimization.assert_next(
-            ["LatencyStats", "Map", "LatencyStats", "Prefetch",
-             "LatencyStats"])).map(lambda x: x * x).prefetch(1)
-    options = dataset_ops.Options()
-    options.experimental_stats = stats_options.StatsOptions()
-    options.experimental_stats.aggregator = aggregator
-    dataset = dataset.with_options(options)
-    self.assertDatasetProduces(
-        dataset,
-        expected_output=[1],
-        requires_initialization=True,
-        num_test_iterations=1)
-    summary_t = aggregator.get_summary()
-    summary_str = self.evaluate(summary_t)
-    self._assertSummaryHasCount(summary_str, "record_latency_TensorDataset/_1",
-                                1)
-    self._assertSummaryHasCount(summary_str, "record_latency_MapDataset/_4", 1)
-    self._assertSummaryHasCount(summary_str,
-                                "record_latency_PrefetchDataset/_6", 1)
-
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_batch_fusion_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_batch_fusion_test.py
index 801f664f09c7f2f7008090f356a246ca530ddcd5..e2ff3116eccf2ccfb7ed72085f4727a1e0262164 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_batch_fusion_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_batch_fusion_test.py
@@ -18,7 +18,6 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.data.experimental.ops import optimization
-from tensorflow.python.data.experimental.ops.optimization_options import OptimizationOptions
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import test_util
@@ -32,10 +31,6 @@ class MapAndBatchFusionTest(test_base.DatasetTestBase):
     dataset = dataset_ops.Dataset.range(10).apply(
         optimization.assert_next(
             ["MapAndBatch"])).map(lambda x: x * x).batch(10)
-    options = dataset_ops.Options()
-    options.experimental_optimization = OptimizationOptions()
-    options.experimental_optimization.map_and_batch_fusion = True
-    dataset = dataset.with_options(options)
     self.assertDatasetProduces(
         dataset, expected_output=[[x * x for x in range(10)]])
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_filter_fusion_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_filter_fusion_test.py
index db8f214fbfca1389af70df55518c885610984031..fa1d673065d6b5e8e473fd72680a92f0f07e7d65 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_filter_fusion_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_filter_fusion_test.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 from absl.testing import parameterized
 
 from tensorflow.python.data.experimental.ops import optimization
-from tensorflow.python.data.experimental.ops.optimization_options import OptimizationOptions
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
@@ -84,7 +83,6 @@ class MapAndFilterFusionTest(test_base.DatasetTestBase, parameterized.TestCase):
         optimization.assert_next(
             ["Map", "FilterByLastComponent"])).map(function).filter(predicate)
     options = dataset_ops.Options()
-    options.experimental_optimization = OptimizationOptions()
     options.experimental_optimization.map_and_filter_fusion = True
     dataset = dataset.with_options(options)
     self._testMapAndFilter(dataset, function, predicate)
@@ -103,7 +101,6 @@ class MapAndFilterFusionTest(test_base.DatasetTestBase, parameterized.TestCase):
         optimization.assert_next(["Map",
                                   "Filter"])).map(function).filter(predicate)
     options = dataset_ops.Options()
-    options.experimental_optimization = OptimizationOptions()
     options.experimental_optimization.map_and_filter_fusion = True
     dataset = dataset.with_options(options)
     self._testMapAndFilter(dataset, function, predicate)
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/map_fusion_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/map_fusion_test.py
index d8d63903749d13b80f662c996ebf5c95f934a0b1..defdaf044001ec4b6129987c82c0c626825fce95 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/map_fusion_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/map_fusion_test.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 from absl.testing import parameterized
 
 from tensorflow.python.data.experimental.ops import optimization
-from tensorflow.python.data.experimental.ops.optimization_options import OptimizationOptions
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import test_util
@@ -75,7 +74,6 @@ class MapFusionTest(test_base.DatasetTestBase, parameterized.TestCase):
 
     dataset = dataset.cache()
     options = dataset_ops.Options()
-    options.experimental_optimization = OptimizationOptions()
     options.experimental_optimization.map_fusion = True
     dataset = dataset.with_options(options)
     expected_output = []
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/map_parallelization_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/map_parallelization_test.py
index 0ff3fff4f8550a4221e54ab2b01ddcaf6c340145..d8dd31fee8b0bc66bcaf92dffe6b0a89d29d668f 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/map_parallelization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/map_parallelization_test.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 from absl.testing import parameterized
 
 from tensorflow.python.data.experimental.ops import optimization
-from tensorflow.python.data.experimental.ops.optimization_options import OptimizationOptions
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
@@ -68,7 +67,6 @@ class MapParallelizationTest(test_base.DatasetTestBase, parameterized.TestCase):
     dataset = dataset_ops.Dataset.range(5).apply(
         optimization.assert_next(next_nodes)).map(function)
     options = dataset_ops.Options()
-    options.experimental_optimization = OptimizationOptions()
     options.experimental_optimization.map_parallelization = True
     dataset = dataset.with_options(options)
     if should_optimize:
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/map_vectorization_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/map_vectorization_test.py
index d979aaa5a068754a1f685e6fadc6b9d5a67fe5f5..65fa2bac171e87eba0f5c61bb1c7d11966572e11 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/map_vectorization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/map_vectorization_test.py
@@ -23,7 +23,6 @@ import numpy as np
 from tensorflow.core.example import example_pb2
 from tensorflow.core.example import feature_pb2
 from tensorflow.python.data.experimental.ops import optimization
-from tensorflow.python.data.experimental.ops.optimization_options import OptimizationOptions
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
@@ -344,17 +343,20 @@ class MapVectorizationTest(test_base.DatasetTestBase, parameterized.TestCase):
       Tuple of (unoptimized dataset, optimized dataset).
     """
     map_node_name = "Map" if num_parallel_calls is None else "ParallelMap"
-    batch_size = 100
 
     def _make_dataset(node_names):
-      return base_dataset.apply(optimization.assert_next(node_names)).map(
-          map_fn, num_parallel_calls=num_parallel_calls).batch(batch_size)
+      dataset = base_dataset.apply(optimization.assert_next(node_names))
+      dataset = dataset.map(map_fn, num_parallel_calls)
+      dataset = dataset.batch(100)
+      options = dataset_ops.Options()
+      options.experimental_optimization.map_and_batch_fusion = False
+      dataset = dataset.with_options(options)
+      return dataset
 
     unoptimized = _make_dataset([map_node_name, "Batch"])
     optimized = _make_dataset(["Batch", map_node_name]
                               if expect_optimized else [map_node_name, "Batch"])
     options = dataset_ops.Options()
-    options.experimental_optimization = OptimizationOptions()
     options.experimental_optimization.map_vectorization = True
     optimized = optimized.with_options(options)
     return unoptimized, optimized
@@ -376,7 +378,7 @@ class MapVectorizationTest(test_base.DatasetTestBase, parameterized.TestCase):
     base_dataset = dataset_ops.Dataset.range(5).repeat(5).batch(
         5, drop_remainder=True)
     _, optimized = self._get_test_datasets(base_dataset, map_fn)
-    nxt = optimized.make_one_shot_iterator().get_next()
+    nxt = dataset_ops.make_one_shot_iterator(optimized).get_next()
     with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                  r"indices = 10 is not in \[0, 5\)"):
       self.evaluate(nxt)
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/model_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/model_dataset_test.py
index d3c121491aeebfa10920f65d780e5cec0d9bc3ce..0f0274b41f2da1add8b2361b54e5c32a5974da41 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/model_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/model_dataset_test.py
@@ -37,7 +37,7 @@ class ModelDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     options.experimental_autotune = True
     dataset = dataset.with_options(options)
 
-    iterator = dataset.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/noop_elimination_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/noop_elimination_test.py
index ce86bfa4e0f8f953722cbb772705ae866ef33e0e..8058f53eea240831545444286fb2c6aa404e240a 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/noop_elimination_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/noop_elimination_test.py
@@ -18,7 +18,6 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.data.experimental.ops import optimization
-from tensorflow.python.data.experimental.ops.optimization_options import OptimizationOptions
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
@@ -42,10 +41,6 @@ class NoopEliminationTest(test_base.DatasetTestBase):
             ["FiniteRepeat", "FiniteSkip", "Prefetch", "MemoryCacheImpl"]))
     dataset = dataset.repeat(some_tensor).skip(5).take(-1).skip(0).repeat(
         1).prefetch(0).prefetch(1).cache()
-    options = dataset_ops.Options()
-    options.experimental_optimization = OptimizationOptions()
-    options.experimental_optimization.noop_elimination = True
-    dataset = dataset.with_options(options)
     self.assertDatasetProduces(dataset, expected_output=range(5))
 
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/optimize_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/optimize_dataset_test.py
index df26a2c0cdfc8a454b4cba4d94aa3cc60ae16f02..dd432b8c15d3c41d0016e41700dbd44776c81ddd 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/optimize_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/optimize_dataset_test.py
@@ -17,25 +17,95 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import warnings
+
+from absl.testing import parameterized
 import numpy as np
 
+from tensorflow.python.data.experimental.ops import batching
+from tensorflow.python.data.experimental.ops import grouping
 from tensorflow.python.data.experimental.ops import optimization
+from tensorflow.python.data.experimental.ops import scan_ops
 from tensorflow.python.data.experimental.ops import threadpool
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import test
 
 
+def _generate_captured_refvar_test_cases():
+  """Generates testcases.
+
+  Returns:
+    A list of tuples of (testcase_name, make_dataset_fn). make_dataset_fn takes
+    a tf.Variable as input and creates a test dataset that uses that variable.
+  """
+
+  def make_map_dataset(var):
+    return dataset_ops.Dataset.from_tensors(0).map(lambda x: x + var)
+
+  def make_flat_map_dataset(var):
+    return dataset_ops.Dataset.from_tensors(
+        0).flat_map(lambda _: dataset_ops.Dataset.from_tensors(var))
+
+  def make_filter_dataset(var):
+    return dataset_ops.Dataset.from_tensors(0).filter(lambda x: x < var)
+
+  def make_map_and_batch_dataset(var):
+
+    def map_fn(x):
+      return x + var
+
+    return dataset_ops.Dataset.from_tensors(0).apply(
+        batching.map_and_batch(map_fn, 1))
+
+  def make_group_by_reducer_dataset(var):
+    reducer = grouping.Reducer(
+        init_func=lambda _: 0,
+        reduce_func=lambda x, y: x,
+        finalize_func=lambda _: var)
+    return dataset_ops.Dataset.range(5).apply(
+        grouping.group_by_reducer(lambda x: x % 2, reducer))
+
+  def make_group_by_window_dataset(var):
+
+    def reduce_fn(key, bucket):
+      del key, bucket
+      return dataset_ops.Dataset.from_tensors(var)
+
+    return dataset_ops.Dataset.from_tensors(0).repeat(10).apply(
+        grouping.group_by_window(lambda _: 0, reduce_fn, 10))
+
+  def make_scan_dataset(var):
+    return dataset_ops.Dataset.from_tensors(0).apply(
+        scan_ops.scan(
+            0, lambda old_state, elem: (old_state + 1, elem + old_state + var)))
+
+  return [
+      # Core datasets
+      ("Map", make_map_dataset),
+      ("FlatMap", make_flat_map_dataset),
+      ("Filter", make_filter_dataset),
+      # Experimental datasets
+      ("MapAndBatch", make_map_and_batch_dataset),
+      ("GroupByReducer", make_group_by_reducer_dataset),
+      ("GroupByWindow", make_group_by_window_dataset),
+      ("Scan", make_scan_dataset)
+  ]
+
+
 @test_util.run_all_in_graph_and_eager_modes
-class OptimizeDatasetTest(test_base.DatasetTestBase):
+class OptimizeDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   def testOptimizationStatefulFunction(self):
-    dataset = dataset_ops.Dataset.range(10).map(
-        lambda _: random_ops.random_uniform([])).batch(10)
+    dataset = dataset_ops.Dataset.range(
+        10).map(lambda _: random_ops.random_uniform([])).batch(10)
     dataset = dataset_ops._OptimizeDataset(dataset, [])
     get_next = self.getNext(dataset)
     self.evaluate(get_next())
@@ -45,7 +115,7 @@ class OptimizeDatasetTest(test_base.DatasetTestBase):
     input_t = array_ops.placeholder(dtypes.int32, (None, None, None))
     dataset = dataset_ops.Dataset.from_tensors(input_t)
     dataset = dataset_ops._OptimizeDataset(dataset, [])
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -58,7 +128,7 @@ class OptimizeDatasetTest(test_base.DatasetTestBase):
     input_t = array_ops.placeholder(dtypes.int32, (None, None, None, None))
     dataset = dataset_ops.Dataset.from_tensor_slices(input_t)
     dataset = dataset_ops._OptimizeDataset(dataset, [])
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -80,6 +150,27 @@ class OptimizeDatasetTest(test_base.DatasetTestBase):
     dataset = dataset_ops._OptimizeDataset(dataset, ["noop_elimination"])
     self.assertDatasetProduces(dataset, expected_output=[0])
 
+  def testOptimizationNestedDatasetWithModifiedRetval(self):
+
+    def flat_map_fn(_):
+      dataset = dataset_ops.Dataset.from_tensors(0)
+      dataset = dataset.apply(optimization.assert_next(["MapAndBatch"]))
+      # Should be fused by map and batch fusion
+      dataset = dataset.map(lambda x: x)
+      dataset = dataset.batch(1)
+      return dataset
+
+    dataset = dataset_ops.Dataset.range(1)
+    dataset = dataset.flat_map(flat_map_fn)
+
+    # TODO(b/120558523): We use Options instead of _OptimizeDataset directly
+    # here because of a bug with chaining _OptimizeDatasets when there are
+    # nested dataset functions
+    options = dataset_ops.Options()
+    options.experimental_optimization.map_and_batch_fusion = True
+    dataset = dataset.with_options(options)
+    self.assertDatasetProduces(dataset, expected_output=[[0]])
+
   def testOptimizationThreadPoolDataset(self):
     dataset = dataset_ops.Dataset.range(10).batch(10)
 
@@ -106,13 +197,80 @@ class OptimizeDatasetTest(test_base.DatasetTestBase):
     self.assertDatasetProduces(dataset, expected_output=[0])
 
   def testOptimizationNonSerializableAsDirectInput(self):
-    """Tests that non-serializable dataset can be OptimizeDataset's input.
-    """
+    """Tests that non-serializable dataset can be OptimizeDataset's input."""
     dataset = dataset_ops.Dataset.from_tensors(0)
     dataset = dataset.apply(optimization.non_serializable())
     dataset = dataset_ops._OptimizeDataset(dataset, ["noop_elimination"])
     self.assertDatasetProduces(dataset, expected_output=[0])
 
+  @parameterized.named_parameters(_generate_captured_refvar_test_cases())
+  # Skip eager because RefVariables are not supported in eager mode.
+  def testSkipEagerOptimizationWithCapturedRefVar(self, dataset_fn):
+    """Tests that default optimizations are disabled with ref variables."""
+    variable = variable_scope.get_variable(
+        "v", initializer=0, use_resource=False)
+    assign_op = variable.assign_add(1)
+
+    unoptimized_dataset = dataset_fn(variable)
+
+    options = dataset_ops.Options()
+    options.experimental_optimization.noop_elimination = True
+    options.experimental_optimization.map_and_batch_fusion = True
+    optimized_dataset = unoptimized_dataset.with_options(options)
+
+    # Check that warning is logged.
+    warnings.simplefilter("always")
+    with warnings.catch_warnings(record=True) as w:
+      optimized_it = optimized_dataset.make_initializable_iterator()
+
+    self.assertGreaterEqual(len(w), 1)
+    expected = ("tf.data static optimizations are not compatible with "
+                "tf.Variable. The following optimizations will be disabled: %s."
+                " To enable optimizations, use resource variables instead by "
+                "calling `tf.enable_resource_variables()` at the start of the "
+                "program." % (", ".join(options._static_optimizations())))
+    self.assertTrue(any([expected in str(warning) for warning in w]))
+
+    # Check that outputs are the same in the optimized and unoptimized cases,
+    # when the variable value is changing.
+    unoptimized_it = unoptimized_dataset.make_initializable_iterator()
+    with ops.control_dependencies([assign_op]):
+      unoptimized_output = unoptimized_it.get_next()
+      optimized_output = optimized_it.get_next()
+
+    self.evaluate(variable.initializer)
+    self.evaluate((unoptimized_it.initializer, optimized_it.initializer))
+    while True:
+      try:
+        unoptimized, optimized = self.evaluate((unoptimized_output,
+                                                optimized_output))
+        self.assertEqual(unoptimized, optimized)
+      except errors.OutOfRangeError:
+        break
+
+  def testOptimizationEnabledByDefault(self):
+    """Tests that some optimizations are applied to datasets by default."""
+    options = dataset_ops.Options()
+    expected_optimizations = [
+        "map_and_batch_fusion",
+        "noop_elimination",
+        "shuffle_and_repeat_fusion",
+    ]
+    self.assertEqual(
+        set(options._static_optimizations()), set(expected_optimizations))
+
+  def testOptimizationDisableDefault(self):
+    """Tests that we can disable all static optimizations enabled by default.
+
+    If the `apply_default_optimizations` optimization options flag is False,
+    only explicitly enabled optimizations will be applied.
+    """
+    options = dataset_ops.Options()
+    options.experimental_optimization.hoist_random_uniform = True
+    options.experimental_optimization.apply_default_optimizations = False
+    expected_optimizations = ["hoist_random_uniform"]
+    self.assertEqual(options._static_optimizations(), expected_optimizations)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/shuffle_and_repeat_fusion_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/shuffle_and_repeat_fusion_test.py
index 5f746ec63ac8d68d614044e809e7f31178ea8874..594b59375febbba6c939dc5429ff59fe9c971a5f 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/shuffle_and_repeat_fusion_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/shuffle_and_repeat_fusion_test.py
@@ -18,7 +18,6 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.data.experimental.ops import optimization
-from tensorflow.python.data.experimental.ops.optimization_options import OptimizationOptions
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
@@ -32,10 +31,6 @@ class ShuffleAndRepeatFusionTest(test_base.DatasetTestBase):
   def testShuffleAndRepeatFusion(self):
     dataset = dataset_ops.Dataset.range(10).apply(
         optimization.assert_next(["ShuffleAndRepeat"])).shuffle(10).repeat(2)
-    options = dataset_ops.Options()
-    options.experimental_optimization = OptimizationOptions()
-    options.experimental_optimization.shuffle_and_repeat_fusion = True
-    dataset = dataset.with_options(options)
     get_next = self.getNext(dataset)
 
     for _ in range(2):
diff --git a/tensorflow/python/data/experimental/kernel_tests/override_threadpool_test.py b/tensorflow/python/data/experimental/kernel_tests/override_threadpool_test.py
index 7116e7549f9998431410e5215dcc16cbe844f3f4..811a58262efe6500784700518ac2bb1a20b03c63 100644
--- a/tensorflow/python/data/experimental/kernel_tests/override_threadpool_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/override_threadpool_test.py
@@ -22,6 +22,7 @@ import threading
 from absl.testing import parameterized
 import numpy as np
 
+from tensorflow.core.framework import graph_pb2
 from tensorflow.python.data.experimental.ops import threading_options
 from tensorflow.python.data.experimental.ops import threadpool
 from tensorflow.python.data.experimental.ops import unique
@@ -34,6 +35,7 @@ from tensorflow.python.ops import script_ops
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class OverrideThreadpoolTest(test_base.DatasetTestBase,
                              parameterized.TestCase):
 
@@ -52,24 +54,21 @@ class OverrideThreadpoolTest(test_base.DatasetTestBase,
             lambda x: script_ops.py_func(get_thread_id, [x], dtypes.int64),
             num_parallel_calls=32).apply(unique.unique()))
     dataset = override_threadpool_fn(dataset)
-    iterator = dataset.make_initializable_iterator()
-    next_element = iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.evaluate(iterator.initializer)
-      thread_ids = []
-      try:
-        while True:
-          thread_ids.append(self.evaluate(next_element))
-      except errors.OutOfRangeError:
-        pass
-      self.assertLen(thread_ids, len(set(thread_ids)))
-      self.assertNotEmpty(thread_ids)
-      if num_threads:
-        # NOTE(mrry): We don't control the thread pool scheduling, and
-        # so cannot guarantee that all of the threads in the pool will
-        # perform work.
-        self.assertLessEqual(len(thread_ids), num_threads)
+    next_element = self.getNext(dataset, requires_initialization=True)
+
+    thread_ids = []
+    try:
+      while True:
+        thread_ids.append(self.evaluate(next_element()))
+    except errors.OutOfRangeError:
+      pass
+    self.assertLen(thread_ids, len(set(thread_ids)))
+    self.assertNotEmpty(thread_ids)
+    if num_threads:
+      # NOTE(mrry): We don't control the thread pool scheduling, and
+      # so cannot guarantee that all of the threads in the pool will
+      # perform work.
+      self.assertLessEqual(len(thread_ids), num_threads)
 
   @parameterized.named_parameters(
       ("1", 1, None),
@@ -82,7 +81,6 @@ class OverrideThreadpoolTest(test_base.DatasetTestBase,
       ("8", 4, 1),
       ("9", 4, 4),
   )
-  @test_util.run_deprecated_v1
   def testNumThreadsDeprecated(self, num_threads, max_intra_op_parallelism):
 
     def override_threadpool_fn(dataset):
@@ -109,7 +107,6 @@ class OverrideThreadpoolTest(test_base.DatasetTestBase,
       ("11", 4, 4),
       ("12", None, None),
   )
-  @test_util.run_deprecated_v1
   def testNumThreads(self, num_threads, max_intra_op_parallelism):
 
     def override_threadpool_fn(dataset):
@@ -124,6 +121,14 @@ class OverrideThreadpoolTest(test_base.DatasetTestBase,
 
     self._testNumThreadsHelper(num_threads, override_threadpool_fn)
 
+  def testMaxIntraOpParallelismAsGraphDefInternal(self):
+    dataset = dataset_ops.Dataset.from_tensors(0)
+    dataset = dataset_ops._MaxIntraOpParallelismDataset(dataset, 1)
+    graph = graph_pb2.GraphDef().FromString(
+        self.evaluate(dataset._as_serialized_graph()))
+    self.assertTrue(
+        any([node.op != "MaxIntraOpParallelismDataset" for node in graph.node]))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/parallel_interleave_test.py b/tensorflow/python/data/experimental/kernel_tests/parallel_interleave_test.py
index 77f0dc8e81315c1b20b37faf5ca98f69751c59af..9d535316619db395853e83e3c1b2a740965b9f7d 100644
--- a/tensorflow/python/data/experimental/kernel_tests/parallel_interleave_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/parallel_interleave_test.py
@@ -22,6 +22,7 @@ import math
 import threading
 import time
 
+import numpy as np
 from six.moves import zip_longest
 
 from tensorflow.python.data.experimental.ops import interleave_ops
@@ -30,24 +31,18 @@ from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.ops import array_ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import script_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class ParallelInterleaveTest(test_base.DatasetTestBase):
 
   def setUp(self):
 
-    self.input_values = array_ops.placeholder(dtypes.int64, shape=[None])
-    self.cycle_length = array_ops.placeholder(dtypes.int64, shape=[])
-    self.block_length = array_ops.placeholder(dtypes.int64, shape=[])
-    self.sloppy = array_ops.placeholder(dtypes.bool, shape=[])
-    self.buffer_output_elements = array_ops.placeholder(dtypes.int64, shape=[])
-    self.prefetch_input_elements = array_ops.placeholder(dtypes.int64, shape=[])
-
     self.error = None
     self.repeat_count = 2
 
@@ -61,6 +56,9 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
       self.read_coordination_events[i] = threading.Semaphore(0)
       self.write_coordination_events[i] = threading.Event()
 
+  def dataset_fn(self, input_values, cycle_length, block_length, sloppy,
+                 buffer_output_elements, prefetch_input_elements):
+
     def map_py_fn(x):
       self.write_coordination_events[x].wait()
       self.write_coordination_events[x].clear()
@@ -79,16 +77,11 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
       dataset = dataset.repeat(x)
       return dataset.map(map_fn)
 
-    self.dataset = (
-        dataset_ops.Dataset.from_tensor_slices(self.input_values)
-        .repeat(self.repeat_count).apply(
-            interleave_ops.parallel_interleave(interleave_fn, self.cycle_length,
-                                               self.block_length, self.sloppy,
-                                               self.buffer_output_elements,
-                                               self.prefetch_input_elements)))
-    self.iterator = self.dataset.make_initializable_iterator()
-    self.init_op = self.iterator.initializer
-    self.next_element = self.iterator.get_next()
+    return dataset_ops.Dataset.from_tensor_slices(input_values).repeat(
+        self.repeat_count).apply(
+            interleave_ops.parallel_interleave(
+                interleave_fn, cycle_length, block_length, sloppy,
+                buffer_output_elements, prefetch_input_elements))
 
   def _interleave(self, lists, cycle_length, block_length):
     """Python implementation of interleave used for testing."""
@@ -178,26 +171,22 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
   def _testSingleThreaded(self, sloppy=False, prefetch_input_elements=0):
     # cycle_length=1,block_length=1 acts like `Dataset.interleave()` and
     # `Dataset.flat_map()` and is single-threaded. No synchronization required.
-    with self.cached_session() as sess:
-      self._clear_coordination_events()
-      sess.run(
-          self.init_op,
-          feed_dict={
-              self.input_values: [4, 5, 6],
-              self.cycle_length: 1,
-              self.block_length: 1,
-              self.sloppy: sloppy,
-              self.buffer_output_elements: 1,
-              self.prefetch_input_elements: prefetch_input_elements,
-          })
-
-      for expected_element in self._interleave(
-          [[4] * 4, [5] * 5, [6] * 6] * self.repeat_count, 1, 1):
-        self.write_coordination_events[expected_element].set()
-        self.assertEqual(expected_element * expected_element,
-                         self.evaluate(self.next_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(self.next_element)
+    self._clear_coordination_events()
+    next_element = self.getNext(
+        self.dataset_fn(
+            input_values=np.int64([4, 5, 6]),
+            cycle_length=1,
+            block_length=1,
+            sloppy=sloppy,
+            buffer_output_elements=1,
+            prefetch_input_elements=prefetch_input_elements))
+    for expected_element in self._interleave(
+        [[4] * 4, [5] * 5, [6] * 6] * self.repeat_count, 1, 1):
+      self.write_coordination_events[expected_element].set()
+      self.assertEqual(expected_element * expected_element,
+                       self.evaluate(next_element()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
 
   def testSingleThreaded(self):
     self._testSingleThreaded()
@@ -213,64 +202,59 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
 
   def testSingleThreadedRagged(self):
     # Tests a sequence with wildly different elements per iterator.
-    with self.cached_session() as sess:
-      self._clear_coordination_events()
-      sess.run(
-          self.init_op,
-          feed_dict={
-              self.input_values: [3, 7, 4],
-              self.cycle_length: 2,
-              self.block_length: 1,
-              self.sloppy: False,
-              self.buffer_output_elements: 1,
-              self.prefetch_input_elements: 1,
-          })
-
-      # Add coordination values for 3 and 7
-      self.read_coordination_events[3] = threading.Semaphore(0)
-      self.write_coordination_events[3] = threading.Event()
-      self.read_coordination_events[7] = threading.Semaphore(0)
-      self.write_coordination_events[7] = threading.Event()
-
-      for expected_element in self._interleave(
-          [[3] * 3, [7] * 7, [4] * 4] * self.repeat_count, 2, 1):
-        self.write_coordination_events[expected_element].set()
-        output = self.evaluate(self.next_element)
-        self.assertEqual(expected_element * expected_element, output)
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(self.next_element)
+    self._clear_coordination_events()
+    next_element = self.getNext(
+        self.dataset_fn(
+            input_values=np.int64([3, 7, 4]),
+            cycle_length=2,
+            block_length=1,
+            sloppy=False,
+            buffer_output_elements=1,
+            prefetch_input_elements=1))
+
+    # Add coordination values for 3 and 7
+    self.read_coordination_events[3] = threading.Semaphore(0)
+    self.write_coordination_events[3] = threading.Event()
+    self.read_coordination_events[7] = threading.Semaphore(0)
+    self.write_coordination_events[7] = threading.Event()
+
+    for expected_element in self._interleave(
+        [[3] * 3, [7] * 7, [4] * 4] * self.repeat_count, 2, 1):
+      self.write_coordination_events[expected_element].set()
+      output = self.evaluate(next_element())
+      self.assertEqual(expected_element * expected_element, output)
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
 
   def _testTwoThreadsNoContention(self, sloppy=False):
     # num_threads > 1.
     # Explicit coordination should result in `Dataset.interleave()` behavior
-    with self.cached_session() as sess:
-      self._clear_coordination_events()
-      done_first_event = False
-      sess.run(
-          self.init_op,
-          feed_dict={
-              self.input_values: [4, 5, 6],
-              self.cycle_length: 2,
-              self.block_length: 1,
-              self.sloppy: sloppy,
-              self.buffer_output_elements: 1,
-              self.prefetch_input_elements: 1,
-          })
-      for i, expected_element in enumerate(
-          self._interleave([[4] * 4, [5] * 5, [6] * 6] * self.repeat_count, 2,
-                           1)):
-        self.write_coordination_events[expected_element].set()
-        if done_first_event:  # First event starts the worker threads.
-          self.read_coordination_events[expected_element].acquire()
-        actual_element = self.evaluate(self.next_element)
-        if not done_first_event:
-          self.read_coordination_events[expected_element].acquire()
-          done_first_event = True
-        self.assertEqual(expected_element * expected_element, actual_element,
-                         "At index %s: %s expected, got: %s" %
-                         (i, expected_element, actual_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(self.next_element)
+    self._clear_coordination_events()
+    done_first_event = False
+    next_element = self.getNext(
+        self.dataset_fn(
+            input_values=np.int64([4, 5, 6]),
+            cycle_length=2,
+            block_length=1,
+            sloppy=sloppy,
+            buffer_output_elements=1,
+            prefetch_input_elements=1))
+    for i, expected_element in enumerate(
+        self._interleave([[4] * 4, [5] * 5, [6] * 6] * self.repeat_count, 2,
+                         1)):
+      self.write_coordination_events[expected_element].set()
+      if done_first_event:  # First event starts the worker threads.
+        self.read_coordination_events[expected_element].acquire()
+      actual_element = self.evaluate(next_element())
+      if not done_first_event:
+        self.read_coordination_events[expected_element].acquire()
+        done_first_event = True
+      self.assertEqual(
+          expected_element * expected_element, actual_element,
+          "At index %s: %s expected, got: %s" % (i, expected_element,
+                                                 actual_element))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
 
   def testTwoThreadsNoContention(self):
     self._testTwoThreadsNoContention()
@@ -287,38 +271,36 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
     Args:
       sloppy: Whether to be sloppy or not.
     """
-    with self.cached_session() as sess:
-      self._clear_coordination_events()
-      done_first_event = False
-      sess.run(
-          self.init_op,
-          feed_dict={
-              self.input_values: [4, 5, 6],
-              self.cycle_length: 2,
-              self.block_length: 1,
-              self.sloppy: sloppy,
-              self.buffer_output_elements: 1,
-              self.prefetch_input_elements: 1,
-          })
-      for i, expected_element in enumerate(
-          self._interleave([[4] * 4, [5] * 5, [6] * 6] * self.repeat_count, 2,
-                           1)):
-        if done_first_event:  # First event starts the worker threads.
-          self._allow_all_map_threads()
-          self.read_coordination_events[expected_element].acquire()
-        else:
-          self.write_coordination_events[expected_element].set()
-        time.sleep(0.5)  # Sleep to consistently "avoid" the race condition.
-        actual_element = self.evaluate(self.next_element)
-        if not done_first_event:
-          done_first_event = True
-          self.assertTrue(
-              self.read_coordination_events[expected_element].acquire(False))
-        self.assertEqual(expected_element * expected_element, actual_element,
-                         "At index %s: %s expected, got: %s" %
-                         (i, expected_element, actual_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(self.next_element)
+    self._clear_coordination_events()
+    done_first_event = False
+    next_element = self.getNext(
+        self.dataset_fn(
+            input_values=np.int64([4, 5, 6]),
+            cycle_length=2,
+            block_length=1,
+            sloppy=sloppy,
+            buffer_output_elements=1,
+            prefetch_input_elements=1))
+    for i, expected_element in enumerate(
+        self._interleave([[4] * 4, [5] * 5, [6] * 6] * self.repeat_count, 2,
+                         1)):
+      if done_first_event:  # First event starts the worker threads.
+        self._allow_all_map_threads()
+        self.read_coordination_events[expected_element].acquire()
+      else:
+        self.write_coordination_events[expected_element].set()
+      time.sleep(0.5)  # Sleep to consistently "avoid" the race condition.
+      actual_element = self.evaluate(next_element())
+      if not done_first_event:
+        done_first_event = True
+        self.assertTrue(
+            self.read_coordination_events[expected_element].acquire(False))
+      self.assertEqual(
+          expected_element * expected_element, actual_element,
+          "At index %s: %s expected, got: %s" % (i, expected_element,
+                                                 actual_element))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
 
   def testTwoThreadsNoContentionWithRaces(self):
     self._testTwoThreadsNoContentionWithRaces()
@@ -329,34 +311,32 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
   def _testTwoThreadsNoContentionBlockLength(self, sloppy=False):
     # num_threads > 1.
     # Explicit coordination should result in `Dataset.interleave()` behavior
-    with self.cached_session() as sess:
-      self._clear_coordination_events()
-      done_first_event = False
-      sess.run(
-          self.init_op,
-          feed_dict={
-              self.input_values: [4, 5, 6],
-              self.cycle_length: 2,
-              self.block_length: 2,
-              self.sloppy: sloppy,
-              self.buffer_output_elements: 1,
-              self.prefetch_input_elements: 1,
-          })
-      for i, expected_element in enumerate(
-          self._interleave([[4] * 4, [5] * 5, [6] * 6] * self.repeat_count, 2,
-                           2)):
-        self.write_coordination_events[expected_element].set()
-        if done_first_event:  # First event starts the worker threads.
-          self.read_coordination_events[expected_element].acquire()
-        actual_element = self.evaluate(self.next_element)
-        if not done_first_event:
-          done_first_event = True
-          self.read_coordination_events[expected_element].acquire()
-        self.assertEqual(expected_element * expected_element, actual_element,
-                         "At index %s: %s expected, got: %s" %
-                         (i, expected_element, actual_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(self.next_element)
+    self._clear_coordination_events()
+    done_first_event = False
+    next_element = self.getNext(
+        self.dataset_fn(
+            input_values=np.int64([4, 5, 6]),
+            cycle_length=2,
+            block_length=2,
+            sloppy=sloppy,
+            buffer_output_elements=1,
+            prefetch_input_elements=1))
+    for i, expected_element in enumerate(
+        self._interleave([[4] * 4, [5] * 5, [6] * 6] * self.repeat_count, 2,
+                         2)):
+      self.write_coordination_events[expected_element].set()
+      if done_first_event:  # First event starts the worker threads.
+        self.read_coordination_events[expected_element].acquire()
+      actual_element = self.evaluate(next_element())
+      if not done_first_event:
+        done_first_event = True
+        self.read_coordination_events[expected_element].acquire()
+      self.assertEqual(
+          expected_element * expected_element, actual_element,
+          "At index %s: %s expected, got: %s" % (i, expected_element,
+                                                 actual_element))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
 
   def testTwoThreadsNoContentionBlockLength(self):
     self._testTwoThreadsNoContentionBlockLength()
@@ -374,38 +354,36 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
     Args:
       sloppy: Whether to be sloppy or not.
     """
-    with self.cached_session() as sess:
-      self._clear_coordination_events()
-      done_first_event = False
-      sess.run(
-          self.init_op,
-          feed_dict={
-              self.input_values: [4, 5, 6],
-              self.cycle_length: 2,
-              self.block_length: 2,
-              self.sloppy: sloppy,
-              self.buffer_output_elements: 1,
-              self.prefetch_input_elements: 1,
-          })
-      for i, expected_element in enumerate(
-          self._interleave([[4] * 4, [5] * 5, [6] * 6] * self.repeat_count, 2,
-                           2)):
-        if done_first_event:  # First event starts the worker threads.
-          self._allow_all_map_threads()
-          self.read_coordination_events[expected_element].acquire()
-        else:
-          self.write_coordination_events[expected_element].set()
-        time.sleep(0.5)  # Sleep to consistently "avoid" the race condition.
-        actual_element = self.evaluate(self.next_element)
-        if not done_first_event:
-          done_first_event = True
-          self.assertTrue(
-              self.read_coordination_events[expected_element].acquire(False))
-        self.assertEqual(expected_element * expected_element, actual_element,
-                         "At index %s: %s expected, got: %s" %
-                         (i, expected_element, actual_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(self.next_element)
+    self._clear_coordination_events()
+    done_first_event = False
+    next_element = self.getNext(
+        self.dataset_fn(
+            input_values=np.int64([4, 5, 6]),
+            cycle_length=2,
+            block_length=2,
+            sloppy=sloppy,
+            buffer_output_elements=1,
+            prefetch_input_elements=1))
+    for i, expected_element in enumerate(
+        self._interleave([[4] * 4, [5] * 5, [6] * 6] * self.repeat_count, 2,
+                         2)):
+      if done_first_event:  # First event starts the worker threads.
+        self._allow_all_map_threads()
+        self.read_coordination_events[expected_element].acquire()
+      else:
+        self.write_coordination_events[expected_element].set()
+      time.sleep(0.5)  # Sleep to consistently "avoid" the race condition.
+      actual_element = self.evaluate(next_element())
+      if not done_first_event:
+        done_first_event = True
+        self.assertTrue(
+            self.read_coordination_events[expected_element].acquire(False))
+      self.assertEqual(
+          expected_element * expected_element, actual_element,
+          "At index %s: %s expected, got: %s" % (i, expected_element,
+                                                 actual_element))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
 
   def testTwoThreadsNoContentionWithRacesAndBlocking(self):
     self._testTwoThreadsNoContentionWithRacesAndBlocking()
@@ -414,21 +392,18 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
     self._testTwoThreadsNoContentionWithRacesAndBlocking(sloppy=True)
 
   def _testEmptyInput(self, sloppy=False):
-    with self.cached_session() as sess:
-      # Empty input.
-      self._clear_coordination_events()
-      sess.run(
-          self.init_op,
-          feed_dict={
-              self.input_values: [],
-              self.cycle_length: 2,
-              self.block_length: 3,
-              self.sloppy: sloppy,
-              self.buffer_output_elements: 1,
-              self.prefetch_input_elements: 0,
-          })
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(self.next_element)
+    # Empty input.
+    self._clear_coordination_events()
+    next_element = self.getNext(
+        self.dataset_fn(
+            input_values=np.int64([]),
+            cycle_length=2,
+            block_length=3,
+            sloppy=sloppy,
+            buffer_output_elements=1,
+            prefetch_input_elements=0))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
 
   def testEmptyInput(self):
     self._testEmptyInput()
@@ -438,20 +413,17 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
 
   def _testNonEmptyInputIntoEmptyOutputs(self, sloppy=False):
     # Non-empty input leading to empty output.
-    with self.cached_session() as sess:
-      self._clear_coordination_events()
-      sess.run(
-          self.init_op,
-          feed_dict={
-              self.input_values: [0, 0, 0],
-              self.cycle_length: 2,
-              self.block_length: 3,
-              self.sloppy: sloppy,
-              self.buffer_output_elements: 1,
-              self.prefetch_input_elements: 0,
-          })
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(self.next_element)
+    self._clear_coordination_events()
+    next_element = self.getNext(
+        self.dataset_fn(
+            input_values=np.int64([0, 0, 0]),
+            cycle_length=2,
+            block_length=3,
+            sloppy=sloppy,
+            buffer_output_elements=1,
+            prefetch_input_elements=0))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
 
   def testNonEmptyInputIntoEmptyOutputs(self):
     self._testNonEmptyInputIntoEmptyOutputs()
@@ -462,35 +434,33 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
   def _testPartiallyEmptyOutputs(self, sloppy=False, prefetch_input_elements=1):
     race_indices = {2, 8, 14}  # Sequence points when sloppy mode has race conds
     # Mixture of non-empty and empty interleaved datasets.
-    with self.cached_session() as sess:
-      self._clear_coordination_events()
-      done_first_event = False
-      sess.run(
-          self.init_op,
-          feed_dict={
-              self.input_values: [4, 0, 6],
-              self.cycle_length: 2,
-              self.block_length: 1,
-              self.sloppy: sloppy,
-              self.buffer_output_elements: 1,
-              self.prefetch_input_elements: prefetch_input_elements,
-          })
-      for i, expected_element in enumerate(
-          self._interleave([[4] * 4, [], [6] * 6] * self.repeat_count, 2, 1)):
-        self.write_coordination_events[expected_element].set()
-        # First event starts the worker threads. Additionally, when running the
-        # sloppy case with prefetch_input_elements=0, we get stuck if we wait
-        # for the read coordination event for certain event orderings in the
-        # presence of finishing iterators.
-        if done_first_event and not (sloppy and (i in race_indices)):
-          self.read_coordination_events[expected_element].acquire()
-        actual_element = self.evaluate(self.next_element)
-        if not done_first_event or (sloppy and (i in race_indices)):
-          done_first_event = True
-          self.read_coordination_events[expected_element].acquire()
-        self.assertEqual(expected_element * expected_element, actual_element,
-                         "At index %s: %s expected, got: %s" %
-                         (i, expected_element, actual_element))
+    self._clear_coordination_events()
+    done_first_event = False
+    next_element = self.getNext(
+        self.dataset_fn(
+            input_values=np.int64([4, 0, 6]),
+            cycle_length=2,
+            block_length=1,
+            sloppy=sloppy,
+            buffer_output_elements=1,
+            prefetch_input_elements=prefetch_input_elements))
+    for i, expected_element in enumerate(
+        self._interleave([[4] * 4, [], [6] * 6] * self.repeat_count, 2, 1)):
+      self.write_coordination_events[expected_element].set()
+      # First event starts the worker threads. Additionally, when running the
+      # sloppy case with prefetch_input_elements=0, we get stuck if we wait
+      # for the read coordination event for certain event orderings in the
+      # presence of finishing iterators.
+      if done_first_event and not (sloppy and (i in race_indices)):
+        self.read_coordination_events[expected_element].acquire()
+      actual_element = self.evaluate(next_element())
+      if not done_first_event or (sloppy and (i in race_indices)):
+        done_first_event = True
+        self.read_coordination_events[expected_element].acquire()
+      self.assertEqual(
+          expected_element * expected_element, actual_element,
+          "At index %s: %s expected, got: %s" % (i, expected_element,
+                                                 actual_element))
 
   def testPartiallyEmptyOutputs(self):
     self._testPartiallyEmptyOutputs()
@@ -501,89 +471,81 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
   def testDelayedOutputSloppy(self):
     # Explicitly control the sequence of events to ensure we correctly avoid
     # head-of-line blocking.
-    with self.cached_session() as sess:
-      self._clear_coordination_events()
-      sess.run(
-          self.init_op,
-          feed_dict={
-              self.input_values: [4, 5, 6],
-              self.cycle_length: 2,
-              self.block_length: 1,
-              self.sloppy: True,
-              self.buffer_output_elements: 1,
-              self.prefetch_input_elements: 0,
-          })
-
-      mis_ordering = [
-          4, 4, 5, 4, 5, 5, 4, 5, 6, 6, 6, 5, 4, 4, 6, 6, 4, 4, 6, 5, 6, 6, 6,
-          6, 5, 5, 5, 5, 6, 6
-      ]
-      for element in mis_ordering:
-        self.write_coordination_events[element].set()
-        self.assertEqual(element * element, self.evaluate(self.next_element))
-        self.assertTrue(self.read_coordination_events[element].acquire(False))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(self.next_element)
+    self._clear_coordination_events()
+    next_element = self.getNext(
+        self.dataset_fn(
+            input_values=np.int64([4, 5, 6]),
+            cycle_length=2,
+            block_length=1,
+            sloppy=True,
+            buffer_output_elements=1,
+            prefetch_input_elements=0))
+
+    mis_ordering = [
+        4, 4, 5, 4, 5, 5, 4, 5, 6, 6, 6, 5, 4, 4, 6, 6, 4, 4, 6, 5, 6, 6, 6, 6,
+        5, 5, 5, 5, 6, 6
+    ]
+    for element in mis_ordering:
+      self.write_coordination_events[element].set()
+      self.assertEqual(element * element, self.evaluate(next_element()))
+      self.assertTrue(self.read_coordination_events[element].acquire(False))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
 
   def testBlockLengthWithContentionSloppy(self):
-    with self.cached_session() as sess:
-      self._clear_coordination_events()
-      done_first_event = False
-      sess.run(
-          self.init_op,
-          feed_dict={
-              self.input_values: [4, 5, 6],
-              self.cycle_length: 2,
-              self.block_length: 1,
-              self.sloppy: True,
-              self.buffer_output_elements: 1,
-              self.prefetch_input_elements: 1,
-          })
-      # Test against a generating sequence that differs from the uncontended
-      # case, in order to prove sloppy correctness.
-      for i, expected_element in enumerate(
-          self._interleave(
-              [[4] * 4, [5] * 5, [6] * 6] * self.repeat_count,
-              cycle_length=2,
-              block_length=3)):
-        self.write_coordination_events[expected_element].set()
-        if done_first_event:  # First event starts the worker threads.
-          self.read_coordination_events[expected_element].acquire()
-        actual_element = self.evaluate(self.next_element)
-        if not done_first_event:
-          self.read_coordination_events[expected_element].acquire()
-          done_first_event = True
-        self.assertEqual(expected_element * expected_element, actual_element,
-                         "At index %s: %s expected, got: %s" %
-                         (i, expected_element, actual_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(self.next_element)
+    self._clear_coordination_events()
+    done_first_event = False
+    next_element = self.getNext(
+        self.dataset_fn(
+            input_values=np.int64([4, 5, 6]),
+            cycle_length=2,
+            block_length=1,
+            sloppy=True,
+            buffer_output_elements=1,
+            prefetch_input_elements=1))
+    # Test against a generating sequence that differs from the uncontended
+    # case, in order to prove sloppy correctness.
+    for i, expected_element in enumerate(
+        self._interleave(
+            [[4] * 4, [5] * 5, [6] * 6] * self.repeat_count,
+            cycle_length=2,
+            block_length=3)):
+      self.write_coordination_events[expected_element].set()
+      if done_first_event:  # First event starts the worker threads.
+        self.read_coordination_events[expected_element].acquire()
+      actual_element = self.evaluate(next_element())
+      if not done_first_event:
+        self.read_coordination_events[expected_element].acquire()
+        done_first_event = True
+      self.assertEqual(
+          expected_element * expected_element, actual_element,
+          "At index %s: %s expected, got: %s" % (i, expected_element,
+                                                 actual_element))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
 
   def _testEarlyExit(self, sloppy=False):
     # Exiting without consuming all input should not block
-    with self.cached_session() as sess:
-      self._clear_coordination_events()
-      sess.run(
-          self.init_op,
-          feed_dict={
-              self.input_values: [4, 5, 6],
-              self.cycle_length: 3,
-              self.block_length: 2,
-              self.sloppy: sloppy,
-              self.buffer_output_elements: 1,
-              self.prefetch_input_elements: 0,
-          })
-      for i in range(4, 7):
-        self.write_coordination_events[i].set()
-      elem = self.evaluate(self.next_element)  # Start all workers
-      # Allow the one successful worker to progress beyond the py_func again.
-      elem = int(math.sqrt(elem))
-      self.write_coordination_events[elem].set()
-      self.read_coordination_events[elem].acquire()
-      # Allow the prefetch to succeed
-      for i in range(4, 7):
-        self.read_coordination_events[i].acquire()
-        self.write_coordination_events[i].set()
+    self._clear_coordination_events()
+    next_element = self.getNext(
+        self.dataset_fn(
+            input_values=np.int64([4, 5, 6]),
+            cycle_length=3,
+            block_length=2,
+            sloppy=sloppy,
+            buffer_output_elements=1,
+            prefetch_input_elements=0))
+    for i in range(4, 7):
+      self.write_coordination_events[i].set()
+    elem = self.evaluate(next_element())  # Start all workers
+    # Allow the one successful worker to progress beyond the py_func again.
+    elem = int(math.sqrt(elem))
+    self.write_coordination_events[elem].set()
+    self.read_coordination_events[elem].acquire()
+    # Allow the prefetch to succeed
+    for i in range(4, 7):
+      self.read_coordination_events[i].acquire()
+      self.write_coordination_events[i].set()
 
   def testEarlyExit(self):
     self._testEarlyExit()
@@ -603,12 +565,10 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
     dataset = dataset.apply(
         interleave_ops.parallel_interleave(
             interleave_fn, cycle_length=16, block_length=2, sloppy=sloppy))
-    iterator = dataset.make_one_shot_iterator()
-
-    with self.cached_session() as sess:
-      output_values = []
-      for _ in range(30):
-        output_values.append(self.evaluate(iterator.get_next()))
+    get_next = self.getNext(dataset)
+    output_values = []
+    for _ in range(30):
+      output_values.append(self.evaluate(get_next()))
 
     expected_values = self._interleave(
         [[4] * 4, [5] * 5, [6] * 6] * self.repeat_count, 1, 2)
@@ -629,54 +589,47 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
       return dataset_ops.Dataset.from_tensor_slices(
           sparse_ops.sparse_to_dense(x.indices, x.dense_shape, x.values))
 
-    dataset = dataset_ops.Dataset.range(10).map(_map_fn)
-    iterator = dataset.apply(
-        interleave_ops.parallel_interleave(
-            _interleave_fn, cycle_length=1)).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.evaluate(init_op)
-      for i in range(10):
-        for j in range(2):
-          expected = [i, 0] if j % 2 == 0 else [0, -i]
-          self.assertAllEqual(expected, self.evaluate(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
+    dataset = dataset_ops.Dataset.range(10).map(_map_fn).apply(
+        interleave_ops.parallel_interleave(_interleave_fn, cycle_length=1))
+    get_next = self.getNext(dataset)
+
+    for i in range(10):
+      for j in range(2):
+        expected = [i, 0] if j % 2 == 0 else [0, -i]
+        self.assertAllEqual(expected, self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
   def testErrorsInOutputFn(self):
-    with self.cached_session() as sess:
-      self._clear_coordination_events()
-      sess.run(
-          self.init_op,
-          feed_dict={
-              self.input_values: [4, 5, 6],
-              self.cycle_length: 2,
-              self.block_length: 1,
-              self.sloppy: False,
-              self.buffer_output_elements: 1,
-              self.prefetch_input_elements: 0,
-          })
-
-      except_on_element_indices = set([3])
-
-      for i, expected_element in enumerate(
-          self._interleave([[4] * 4, [5] * 5, [6] * 6] * self.repeat_count, 2,
-                           1)):
-        if i in except_on_element_indices:
-          self.error = ValueError()
-          self.write_coordination_events[expected_element].set()
-          with self.assertRaises(errors.InvalidArgumentError):
-            self.evaluate(self.next_element)
-        else:
-          self.write_coordination_events[expected_element].set()
-          actual_element = self.evaluate(self.next_element)
-          self.assertEqual(expected_element * expected_element, actual_element,
-                           "At index %s: %s expected, got: %s" %
-                           (i, expected_element, actual_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(self.next_element)
+    self._clear_coordination_events()
+    next_element = self.getNext(
+        self.dataset_fn(
+            input_values=np.int64([4, 5, 6]),
+            cycle_length=2,
+            block_length=1,
+            sloppy=False,
+            buffer_output_elements=1,
+            prefetch_input_elements=0))
+
+    except_on_element_indices = set([3])
+
+    for i, expected_element in enumerate(
+        self._interleave([[4] * 4, [5] * 5, [6] * 6] * self.repeat_count, 2,
+                         1)):
+      if i in except_on_element_indices:
+        self.error = ValueError()
+        self.write_coordination_events[expected_element].set()
+        with self.assertRaises(errors.InvalidArgumentError):
+          self.evaluate(next_element())
+      else:
+        self.write_coordination_events[expected_element].set()
+        actual_element = self.evaluate(next_element())
+        self.assertEqual(
+            expected_element * expected_element, actual_element,
+            "At index %s: %s expected, got: %s" % (i, expected_element,
+                                                   actual_element))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
 
   def testErrorsInInputFn(self):
 
@@ -693,41 +646,35 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
       dataset = dataset.repeat(x)
       return dataset
 
-    self.dataset = (
-        dataset_ops.Dataset.from_tensor_slices(self.input_values).map(map_fn)
-        .repeat(self.repeat_count).apply(
-            interleave_ops.parallel_interleave(interleave_fn, self.cycle_length,
-                                               self.block_length, self.sloppy,
-                                               self.buffer_output_elements,
-                                               self.prefetch_input_elements)))
-
-    self.iterator = self.dataset.make_initializable_iterator()
-    self.init_op = self.iterator.initializer
-    self.next_element = self.iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(
-          self.init_op,
-          feed_dict={
-              self.input_values: [4, 5, 6],
-              self.cycle_length: 2,
-              self.block_length: 1,
-              self.sloppy: False,
-              self.buffer_output_elements: 1,
-              self.prefetch_input_elements: 0,
-          })
-      for i, expected_element in enumerate(
-          self._interleave([[4] * 4, [5], [6] * 6] * self.repeat_count, 2, 1)):
-        if expected_element == 5:
-          with self.assertRaises(errors.InvalidArgumentError):
-            self.evaluate(self.next_element)
-        else:
-          actual_element = self.evaluate(self.next_element)
-          self.assertEqual(expected_element, actual_element,
-                           "At index %s: %s expected, got: %s" %
-                           (i, expected_element, actual_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(self.next_element)
+    def dataset_fn(input_values, cycle_length, block_length, sloppy,
+                   buffer_output_elements, prefetch_input_elements):
+      return dataset_ops.Dataset.from_tensor_slices(input_values).map(
+          map_fn).repeat(self.repeat_count).apply(
+              interleave_ops.parallel_interleave(
+                  interleave_fn, cycle_length, block_length, sloppy,
+                  buffer_output_elements, prefetch_input_elements))
+
+    next_element = self.getNext(
+        dataset_fn(
+            input_values=np.int64([4, 5, 6]),
+            cycle_length=2,
+            block_length=1,
+            sloppy=False,
+            buffer_output_elements=1,
+            prefetch_input_elements=0))
+    for i, expected_element in enumerate(
+        self._interleave([[4] * 4, [5], [6] * 6] * self.repeat_count, 2, 1)):
+      if expected_element == 5:
+        with self.assertRaises(errors.InvalidArgumentError):
+          self.evaluate(next_element())
+      else:
+        actual_element = self.evaluate(next_element())
+        self.assertEqual(
+            expected_element, actual_element,
+            "At index %s: %s expected, got: %s" % (i, expected_element,
+                                                   actual_element))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
 
   def testErrorsInInterleaveFn(self):
 
@@ -742,41 +689,35 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
       dataset = dataset.repeat(y)
       return dataset
 
-    self.dataset = (
-        dataset_ops.Dataset.from_tensor_slices(self.input_values)
-        .repeat(self.repeat_count).apply(
-            interleave_ops.parallel_interleave(interleave_fn, self.cycle_length,
-                                               self.block_length, self.sloppy,
-                                               self.buffer_output_elements,
-                                               self.prefetch_input_elements)))
-
-    self.iterator = self.dataset.make_initializable_iterator()
-    self.init_op = self.iterator.initializer
-    self.next_element = self.iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(
-          self.init_op,
-          feed_dict={
-              self.input_values: [4, 5, 6],
-              self.cycle_length: 2,
-              self.block_length: 1,
-              self.sloppy: False,
-              self.buffer_output_elements: 1,
-              self.prefetch_input_elements: 0,
-          })
-      for i, expected_element in enumerate(
-          self._interleave([[4] * 4, [5], [6] * 6] * self.repeat_count, 2, 1)):
-        if expected_element == 5:
-          with self.assertRaises(errors.InvalidArgumentError):
-            self.evaluate(self.next_element)
-        else:
-          actual_element = self.evaluate(self.next_element)
-          self.assertEqual(expected_element, actual_element,
-                           "At index %s: %s expected, got: %s" %
-                           (i, expected_element, actual_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(self.next_element)
+    def dataset_fn(input_values, cycle_length, block_length, sloppy,
+                   buffer_output_elements, prefetch_input_elements):
+      return dataset_ops.Dataset.from_tensor_slices(input_values).repeat(
+          self.repeat_count).apply(
+              interleave_ops.parallel_interleave(
+                  interleave_fn, cycle_length, block_length, sloppy,
+                  buffer_output_elements, prefetch_input_elements))
+
+    next_element = self.getNext(
+        dataset_fn(
+            input_values=np.int64([4, 5, 6]),
+            cycle_length=2,
+            block_length=1,
+            sloppy=False,
+            buffer_output_elements=1,
+            prefetch_input_elements=0))
+    for i, expected_element in enumerate(
+        self._interleave([[4] * 4, [5], [6] * 6] * self.repeat_count, 2, 1)):
+      if expected_element == 5:
+        with self.assertRaises(errors.InvalidArgumentError):
+          self.evaluate(next_element())
+      else:
+        actual_element = self.evaluate(next_element())
+        self.assertEqual(
+            expected_element, actual_element,
+            "At index %s: %s expected, got: %s" % (i, expected_element,
+                                                   actual_element))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
 
   def testShutdownRace(self):
     dataset = dataset_ops.Dataset.range(20)
@@ -789,21 +730,17 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
             buffer_output_elements=1,
             prefetch_input_elements=0))
     dataset = dataset.batch(32)
-    iterator = dataset.make_initializable_iterator()
-    next_element = iterator.get_next()
 
     results = []
-    with self.cached_session() as sess:
-      for _ in range(2):
-        elements = []
-        self.evaluate(iterator.initializer)
-        try:
-          while True:
-            elements.extend(self.evaluate(next_element))
-        except errors.OutOfRangeError:
-          pass
-        results.append(elements)
-
+    for _ in range(2):
+      elements = []
+      next_element = self.getNext(dataset)
+      try:
+        while True:
+          elements.extend(self.evaluate(next_element()))
+      except errors.OutOfRangeError:
+        pass
+      results.append(elements)
     self.assertAllEqual(results[0], results[1])
 
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py b/tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py
index c64bb2d299a7bb60d90b0e3b780b1452e1b5c065..238c5cd5060cafe7590fde72e4ac1e7b9b4ea6f4 100644
--- a/tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py
@@ -29,6 +29,7 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
+# TODO(b/117581999): add eager coverage when supported.
 class PrefetchToDeviceTest(test_base.DatasetTestBase):
 
   @test_util.run_deprecated_v1
@@ -38,7 +39,7 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase):
         prefetching_ops.prefetch_to_device("/cpu:1"))
 
     with ops.device("/cpu:1"):
-      iterator = device_dataset.make_one_shot_iterator()
+      iterator = dataset_ops.make_one_shot_iterator(device_dataset)
       next_element = iterator.get_next()
 
     self.assertEqual(host_dataset.output_types, device_dataset.output_types)
@@ -66,7 +67,7 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase):
             "/job:localhost/replica:0/task:0/device:CPU:0"))
 
     with ops.device("/cpu:1"):
-      iterator = device_dataset.make_one_shot_iterator()
+      iterator = dataset_ops.make_one_shot_iterator(device_dataset)
       next_element = iterator.get_next()
 
     self.assertEqual(host_dataset.output_types, device_dataset.output_types)
@@ -92,7 +93,7 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase):
         prefetching_ops.prefetch_to_device("/cpu:1"))
 
     with ops.device("/cpu:1"):
-      iterator = device_dataset.make_one_shot_iterator()
+      iterator = dataset_ops.make_one_shot_iterator(device_dataset)
       next_element = iterator.get_next()
 
     self.assertEqual(host_dataset.output_types, device_dataset.output_types)
@@ -123,7 +124,7 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase):
         prefetching_ops.prefetch_to_device("/cpu:1"))
 
     with ops.device("/cpu:1"):
-      iterator = device_dataset.make_one_shot_iterator()
+      iterator = dataset_ops.make_one_shot_iterator(device_dataset)
       next_element = iterator.get_next()
 
     self.assertEqual(host_dataset.output_types, device_dataset.output_types)
@@ -153,10 +154,12 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase):
     device_dataset = host_dataset.apply(
         prefetching_ops.prefetch_to_device("/gpu:0"))
 
-    iterator = device_dataset.make_one_shot_iterator()
+    iterator = dataset_ops.make_initializable_iterator(device_dataset)
     next_element = iterator.get_next()
 
-    with self.cached_session():
+    with self.cached_session(
+        config=config_pb2.ConfigProto(allow_soft_placement=False)):
+      self.evaluate(iterator.initializer)
       for i in range(10):
         self.assertEqual(i, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
@@ -169,7 +172,7 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase):
         prefetching_ops.prefetch_to_device("/cpu:1"))
 
     with ops.device("/cpu:1"):
-      iterator = device_dataset.make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(device_dataset)
       next_element = iterator.get_next()
 
     self.assertEqual(host_dataset.output_types, device_dataset.output_types)
@@ -201,10 +204,11 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase):
     device_dataset = host_dataset.apply(
         prefetching_ops.prefetch_to_device("/gpu:0"))
 
-    iterator = device_dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(device_dataset)
     next_element = iterator.get_next()
 
-    with self.cached_session():
+    with self.cached_session(
+        config=config_pb2.ConfigProto(allow_soft_placement=False)):
       self.evaluate(iterator.initializer)
       for i in range(5):
         self.assertEqual(i, self.evaluate(next_element))
diff --git a/tensorflow/python/data/experimental/kernel_tests/reader_dataset_ops_test_base.py b/tensorflow/python/data/experimental/kernel_tests/reader_dataset_ops_test_base.py
index 77df8310d439b458c691ccbfb1d6015859c7d015..f36f94c02fec98f95d9cb718ae2d1dd19905b454 100644
--- a/tensorflow/python/data/experimental/kernel_tests/reader_dataset_ops_test_base.py
+++ b/tensorflow/python/data/experimental/kernel_tests/reader_dataset_ops_test_base.py
@@ -26,12 +26,9 @@ from tensorflow.core.example import example_pb2
 from tensorflow.core.example import feature_pb2
 from tensorflow.python.data.experimental.ops import readers
 from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.data.ops import readers as core_readers
-from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.lib.io import python_io
-from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.util import compat
 
@@ -150,26 +147,25 @@ class MakeBatchedFeaturesDatasetTestBase(test_base.DatasetTestBase):
       writer.close()
     return filenames
 
-  def _run_actual_batch(self, outputs, sess, label_key_provided=False):
+  def _run_actual_batch(self, outputs, label_key_provided=False):
     if label_key_provided:
       # outputs would be a tuple of (feature dict, label)
-      label_op = outputs[1]
-      features_op = outputs[0]
+      features, label = self.evaluate(outputs())
     else:
-      features_op = outputs
-      label_op = features_op["label"]
-    file_op = features_op["file"]
-    keywords_indices_op = features_op["keywords"].indices
-    keywords_values_op = features_op["keywords"].values
-    keywords_dense_shape_op = features_op["keywords"].dense_shape
-    record_op = features_op["record"]
-    return sess.run([
-        file_op, keywords_indices_op, keywords_values_op,
-        keywords_dense_shape_op, record_op, label_op
+      features = self.evaluate(outputs())
+      label = features["label"]
+    file_out = features["file"]
+    keywords_indices = features["keywords"].indices
+    keywords_values = features["keywords"].values
+    keywords_dense_shape = features["keywords"].dense_shape
+    record = features["record"]
+    return ([
+        file_out, keywords_indices, keywords_values, keywords_dense_shape,
+        record, label
     ])
 
-  def _next_actual_batch(self, sess, label_key_provided=False):
-    return self._run_actual_batch(self.outputs, sess, label_key_provided)
+  def _next_actual_batch(self, label_key_provided=False):
+    return self._run_actual_batch(self.outputs, label_key_provided)
 
   def _interleave(self, iterators, cycle_length):
     pending_iterators = iterators
@@ -251,7 +247,6 @@ class MakeBatchedFeaturesDatasetTestBase(test_base.DatasetTestBase):
       ]
 
   def verify_records(self,
-                     sess,
                      batch_size,
                      file_index=None,
                      num_epochs=1,
@@ -268,7 +263,7 @@ class MakeBatchedFeaturesDatasetTestBase(test_base.DatasetTestBase):
         num_epochs,
         cycle_length=interleave_cycle_length):
       actual_batch = self._next_actual_batch(
-          sess, label_key_provided=label_key_provided)
+          label_key_provided=label_key_provided)
       for i in range(len(expected_batch)):
         self.assertAllEqual(expected_batch[i], actual_batch[i])
 
@@ -323,21 +318,6 @@ class TFRecordDatasetTestBase(test_base.DatasetTestBase):
 
     self.test_filenames = self._createFiles()
 
-    self.filenames = array_ops.placeholder(dtypes.string, shape=[None])
-    self.num_epochs = array_ops.placeholder_with_default(
-        constant_op.constant(1, dtypes.int64), shape=[])
-    self.compression_type = array_ops.placeholder_with_default("", shape=[])
-    self.batch_size = array_ops.placeholder(dtypes.int64, shape=[])
-
-    repeat_dataset = core_readers.TFRecordDataset(
-        self.filenames, self.compression_type).repeat(self.num_epochs)
-    batch_dataset = repeat_dataset.batch(self.batch_size)
-
-    iterator = iterator_ops.Iterator.from_structure(batch_dataset.output_types)
-    self.init_op = iterator.make_initializer(repeat_dataset)
-    self.init_batch_op = iterator.make_initializer(batch_dataset)
-    self.get_next = iterator.get_next()
-
   def _record(self, f, r):
     return compat.as_bytes("Record %d of file %d" % (r, f))
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/rejection_resample_test.py b/tensorflow/python/data/experimental/kernel_tests/rejection_resample_test.py
index 675209d1899afa9cdaf282da525288020b1ed1d6..4d35b160fdc15e22b9b62718af9407978d20d7e2 100644
--- a/tensorflow/python/data/experimental/kernel_tests/rejection_resample_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/rejection_resample_test.py
@@ -17,11 +17,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import time
 
 from absl.testing import parameterized
 import numpy as np
-from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.python.data.experimental.ops import resampling
 from tensorflow.python.data.kernel_tests import test_base
@@ -36,35 +34,12 @@ from tensorflow.python.platform import test
 from tensorflow.python.util import compat
 
 
-def _time_resampling(
-    test_obj, data_np, target_dist, init_dist, num_to_sample):
-  dataset = dataset_ops.Dataset.from_tensor_slices(data_np).repeat()
-
-  # Reshape distribution via rejection sampling.
-  dataset = dataset.apply(
-      resampling.rejection_resample(
-          class_func=lambda x: x,
-          target_dist=target_dist,
-          initial_dist=init_dist,
-          seed=142))
-
-  get_next = dataset.make_one_shot_iterator().get_next()
-
-  with test_obj.test_session() as sess:
-    start_time = time.time()
-    for _ in xrange(num_to_sample):
-      sess.run(get_next)
-    end_time = time.time()
-
-  return end_time - start_time
-
-
+@test_util.run_all_in_graph_and_eager_modes
 class RejectionResampleTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   @parameterized.named_parameters(
       ("InitialDistributionKnown", True),
       ("InitialDistributionUnknown", False))
-  @test_util.run_deprecated_v1
   def testDistribution(self, initial_known):
     classes = np.random.randint(5, size=(20000,))  # Uniformly sampled
     target_dist = [0.9, 0.05, 0.05, 0.0, 0.0]
@@ -73,17 +48,17 @@ class RejectionResampleTest(test_base.DatasetTestBase, parameterized.TestCase):
     dataset = dataset_ops.Dataset.from_tensor_slices(classes).shuffle(
         200, seed=21).map(lambda c: (c, string_ops.as_string(c))).repeat()
 
-    get_next = dataset.apply(
-        resampling.rejection_resample(
-            target_dist=target_dist,
-            initial_dist=initial_dist,
-            class_func=lambda c, _: c,
-            seed=27)).make_one_shot_iterator().get_next()
+    get_next = self.getNext(
+        dataset.apply(
+            resampling.rejection_resample(
+                target_dist=target_dist,
+                initial_dist=initial_dist,
+                class_func=lambda c, _: c,
+                seed=27)))
 
-    with self.cached_session() as sess:
-      returned = []
-      while len(returned) < 4000:
-        returned.append(sess.run(get_next))
+    returned = []
+    while len(returned) < 4000:
+      returned.append(self.evaluate(get_next()))
 
     returned_classes, returned_classes_and_data = zip(*returned)
     _, returned_data = zip(*returned_classes_and_data)
@@ -99,7 +74,6 @@ class RejectionResampleTest(test_base.DatasetTestBase, parameterized.TestCase):
   @parameterized.named_parameters(
       ("OnlyInitial", True),
       ("NotInitial", False))
-  @test_util.run_deprecated_v1
   def testEdgeCasesSampleFromInitialDataset(self, only_initial_dist):
     init_dist = [0.5, 0.5]
     target_dist = [0.5, 0.5] if only_initial_dist else [0.0, 1.0]
@@ -117,15 +91,13 @@ class RejectionResampleTest(test_base.DatasetTestBase, parameterized.TestCase):
             target_dist=target_dist,
             initial_dist=init_dist))
 
-    get_next = dataset.make_one_shot_iterator().get_next()
+    get_next = self.getNext(dataset)
 
-    with self.cached_session() as sess:
-      returned = []
-      with self.assertRaises(errors.OutOfRangeError):
-        while True:
-          returned.append(sess.run(get_next))
+    returned = []
+    with self.assertRaises(errors.OutOfRangeError):
+      while True:
+        returned.append(self.evaluate(get_next()))
 
-  @test_util.run_deprecated_v1
   def testRandomClasses(self):
     init_dist = [0.25, 0.25, 0.25, 0.25]
     target_dist = [0.0, 0.0, 0.0, 1.0]
@@ -149,13 +121,12 @@ class RejectionResampleTest(test_base.DatasetTestBase, parameterized.TestCase):
             target_dist=target_dist,
             initial_dist=init_dist))
 
-    get_next = dataset.make_one_shot_iterator().get_next()
+    get_next = self.getNext(dataset)
 
-    with self.cached_session() as sess:
-      returned = []
-      with self.assertRaises(errors.OutOfRangeError):
-        while True:
-          returned.append(sess.run(get_next))
+    returned = []
+    with self.assertRaises(errors.OutOfRangeError):
+      while True:
+        returned.append(self.evaluate(get_next()))
 
     classes, _ = zip(*returned)
     bincount = np.bincount(
@@ -165,22 +136,5 @@ class RejectionResampleTest(test_base.DatasetTestBase, parameterized.TestCase):
     self.assertAllClose(target_dist, bincount, atol=1e-2)
 
 
-class ResampleDatasetBenchmark(test.Benchmark):
-
-  def benchmarkResamplePerformance(self):
-    init_dist = [0.25, 0.25, 0.25, 0.25]
-    target_dist = [0.0, 0.0, 0.0, 1.0]
-    num_classes = len(init_dist)
-    # We don't need many samples to test a dirac-delta target distribution
-    num_samples = 1000
-    data_np = np.random.choice(num_classes, num_samples, p=init_dist)
-
-    resample_time = _time_resampling(
-        self, data_np, target_dist, init_dist, num_to_sample=1000)
-
-    self.report_benchmark(
-        iters=1000, wall_time=resample_time, name="benchmark_resample")
-
-
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/restructured_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/restructured_dataset_test.py
index 658e6120cf9e30d7f79e542c8df726d997b1abb9..87a91415b08097c40a60937b4d970cc63183c23e 100644
--- a/tensorflow/python/data/experimental/kernel_tests/restructured_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/restructured_dataset_test.py
@@ -27,6 +27,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
+# TODO(b/117581999): Add eager coverage
 class RestructuredDatasetTest(test_base.DatasetTestBase):
 
   @test_util.run_deprecated_v1
diff --git a/tensorflow/python/data/experimental/kernel_tests/scan_test.py b/tensorflow/python/data/experimental/kernel_tests/scan_test.py
index 43a765b9e02ca45786725b7bfe8996dfe266a827..f5ac0f500746f69f0d91eda5d93f9a967c429aa1 100644
--- a/tensorflow/python/data/experimental/kernel_tests/scan_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/scan_test.py
@@ -24,58 +24,44 @@ import numpy as np
 from tensorflow.python.data.experimental.ops import scan_ops
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import script_ops
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class ScanTest(test_base.DatasetTestBase):
 
   def _counting_dataset(self, start, scan_fn):
     return dataset_ops.Dataset.from_tensors(0).repeat().apply(
         scan_ops.scan(start, scan_fn))
 
-  @test_util.run_deprecated_v1
   def testCount(self):
     def make_scan_fn(step):
       return lambda state, _: (state + step, state)
 
-    start = array_ops.placeholder(dtypes.int32, shape=[])
-    step = array_ops.placeholder(dtypes.int32, shape=[])
-    take = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = self._counting_dataset(
-        start, make_scan_fn(step)).take(take).make_initializable_iterator()
-    next_element = iterator.get_next()
-
-    with self.cached_session() as sess:
-
-      for start_val, step_val, take_val in [(0, 1, 10), (0, 1, 0), (10, 1, 10),
-                                            (10, 2, 10), (10, -1, 10),
-                                            (10, -2, 10)]:
-        sess.run(iterator.initializer,
-                 feed_dict={start: start_val, step: step_val, take: take_val})
-        for expected, _ in zip(
-            itertools.count(start_val, step_val), range(take_val)):
-          self.assertEqual(expected, self.evaluate(next_element))
-        with self.assertRaises(errors.OutOfRangeError):
-          self.evaluate(next_element)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testFibonacci(self):
-    iterator = dataset_ops.Dataset.from_tensors(1).repeat(None).apply(
-        scan_ops.scan([0, 1], lambda a, _: ([a[1], a[0] + a[1]], a[1]))
-    ).make_one_shot_iterator()
+    def dataset_fn(start, step, take):
+      return self._counting_dataset(start, make_scan_fn(step)).take(take)
 
-    if context.executing_eagerly():
-      next_element = iterator.get_next
-    else:
-      get_next = iterator.get_next()
-      next_element = lambda: get_next
+    for start_val, step_val, take_val in [(0, 1, 10), (0, 1, 0), (10, 1, 10),
+                                          (10, 2, 10), (10, -1, 10), (10, -2,
+                                                                      10)]:
+      next_element = self.getNext(dataset_fn(start_val, step_val, take_val))
+      for expected, _ in zip(
+          itertools.count(start_val, step_val), range(take_val)):
+        self.assertEqual(expected, self.evaluate(next_element()))
+      with self.assertRaises(errors.OutOfRangeError):
+        self.evaluate(next_element())
+
+  def testFibonacci(self):
+    data = dataset_ops.Dataset.from_tensors(1).repeat(None).apply(
+        scan_ops.scan([0, 1], lambda a, _: ([a[1], a[0] + a[1]], a[1])))
+    next_element = self.getNext(data)
 
     self.assertEqual(1, self.evaluate(next_element()))
     self.assertEqual(1, self.evaluate(next_element()))
@@ -84,8 +70,10 @@ class ScanTest(test_base.DatasetTestBase):
     self.assertEqual(5, self.evaluate(next_element()))
     self.assertEqual(8, self.evaluate(next_element()))
 
+  # TODO(b/117581999): Add coverage for eager.
   @test_util.run_deprecated_v1
-  def testSparseCount(self):
+  def testSkipEagerSparseCount(self):
+
     def _sparse(i):
       return sparse_tensor.SparseTensorValue(
           indices=np.array([[0, 0]]),
@@ -95,28 +83,20 @@ class ScanTest(test_base.DatasetTestBase):
     def make_scan_fn(step):
       return lambda state, _: (_sparse(state.values[0] + step), state)
 
-    start = array_ops.placeholder(dtypes.int32, shape=[])
-    step = array_ops.placeholder(dtypes.int32, shape=[])
-    take = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = self._counting_dataset(
-        _sparse(start),
-        make_scan_fn(step)).take(take).make_initializable_iterator()
-    next_element = iterator.get_next()
-
-    with self.cached_session() as sess:
-
-      for start_val, step_val, take_val in [(0, 1, 10), (0, 1, 0), (10, 1, 10),
-                                            (10, 2, 10), (10, -1, 10),
-                                            (10, -2, 10)]:
-        sess.run(iterator.initializer,
-                 feed_dict={start: start_val, step: step_val, take: take_val})
-        for expected, _ in zip(
-            itertools.count(start_val, step_val), range(take_val)):
-          self.assertEqual(expected, self.evaluate(next_element).values[0])
-        with self.assertRaises(errors.OutOfRangeError):
-          self.evaluate(next_element)
+    def dataset_fn(start, step, take):
+      return self._counting_dataset(_sparse(start),
+                                    make_scan_fn(step)).take(take)
+
+    for start_val, step_val, take_val in [(0, 1, 10), (0, 1, 0), (10, 1, 10),
+                                          (10, 2, 10), (10, -1, 10), (10, -2,
+                                                                      10)]:
+      next_element = self.getNext(dataset_fn(start_val, step_val, take_val))
+      for expected, _ in zip(
+          itertools.count(start_val, step_val), range(take_val)):
+        self.assertEqual(expected, self.evaluate(next_element()).values[0])
+      with self.assertRaises(errors.OutOfRangeError):
+        self.evaluate(next_element())
 
-  @test_util.run_deprecated_v1
   def testChangingStateShape(self):
     # Test the fixed-point shape invariant calculations: start with
     # initial values with known shapes, and use a scan function that
@@ -134,16 +114,14 @@ class ScanTest(test_base.DatasetTestBase):
     self.assertIs(None, dataset.output_shapes[0][1].ndims)
     self.assertEqual([], dataset.output_shapes[1].as_list())
 
-    iterator = dataset.make_one_shot_iterator()
-    next_element = iterator.get_next()
+    next_element = self.getNext(dataset)
 
-    with self.cached_session() as sess:
-      for i in range(5):
-        (longer_vector_val, larger_rank_val), _ = self.evaluate(next_element)
-        self.assertAllEqual([0] * (2**i), longer_vector_val)
-        self.assertAllEqual(np.array(1, ndmin=i), larger_rank_val)
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(next_element)
+    for i in range(5):
+      (longer_vector_val, larger_rank_val), _ = self.evaluate(next_element())
+      self.assertAllEqual([0] * (2**i), longer_vector_val)
+      self.assertAllEqual(np.array(1, ndmin=i), larger_rank_val)
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
 
   def testIncorrectStateType(self):
 
@@ -170,6 +148,21 @@ class ScanTest(test_base.DatasetTestBase):
       dataset.apply(
           scan_ops.scan(constant_op.constant(1, dtype=dtypes.int32), _scan_fn))
 
+  def testPreserveCardinality(self):
+
+    def scan_fn(state, val):
+
+      def py_fn(_):
+        raise StopIteration()
+
+      return state, script_ops.py_func(py_fn, [val], dtypes.int64)
+
+    dataset = dataset_ops.Dataset.from_tensors(0).apply(
+        scan_ops.scan(constant_op.constant(1), scan_fn))
+    get_next = self.getNext(dataset)
+    with self.assertRaises(errors.InvalidArgumentError):
+      self.evaluate(get_next())
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD b/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD
index c724987b2435aa8b5352587b06e0e487c6cf5d7c..4a2e28f49649ea698e9d426d86dae4bb42cdebf9 100644
--- a/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD
@@ -76,6 +76,7 @@ py_test(
     srcs_version = "PY2AND3",
     tags = [
         "no_pip",
+        "no_windows",
         "notsan",
     ],
     deps = [
@@ -316,6 +317,7 @@ py_test(
     srcs_version = "PY2AND3",
     tags = [
         "no_pip",
+        "no_windows",
         "notap",
     ],
     deps = [
@@ -358,6 +360,9 @@ py_test(
     size = "small",
     srcs = ["matching_files_dataset_serialization_test.py"],
     srcs_version = "PY2AND3",
+    tags = [
+        "no_windows",
+    ],
     deps = [
         ":dataset_serialization_test_base",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/checkpoint_input_pipeline_hook_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/checkpoint_input_pipeline_hook_test.py
index 140ed517ef5eac138bebc9e0aeb024fe0e62cd77..8cc66d0c29392b206015ad886780d854fb2b5d5c 100644
--- a/tensorflow/python/data/experimental/kernel_tests/serialization/checkpoint_input_pipeline_hook_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/checkpoint_input_pipeline_hook_test.py
@@ -21,8 +21,6 @@ from __future__ import print_function
 from tensorflow.python.data.experimental.ops import iterator_ops
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.estimator import estimator
-from tensorflow.python.estimator import model_fn
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -33,6 +31,8 @@ from tensorflow.python.platform import test
 from tensorflow.python.training import checkpoint_management
 from tensorflow.python.training import saver as saver_lib
 from tensorflow.python.training import training_util
+from tensorflow_estimator.python.estimator import estimator
+from tensorflow_estimator.python.estimator import model_fn
 
 
 class CheckpointInputPipelineHookTest(test_base.DatasetTestBase):
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/dataset_serialization_test_base.py b/tensorflow/python/data/experimental/kernel_tests/serialization/dataset_serialization_test_base.py
index 7f435b823975ad7a12661d909f37cebae67a0018..bdbd8702b7f8d315a730c5cd2b000218ea5e19be 100644
--- a/tensorflow/python/data/experimental/kernel_tests/serialization/dataset_serialization_test_base.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/dataset_serialization_test_base.py
@@ -23,6 +23,8 @@ import os
 import numpy as np
 
 from tensorflow.python.data.experimental.ops import iterator_ops as contrib_iterator_ops
+from tensorflow.python.data.experimental.ops.optimization_options import OptimizationOptions
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -73,23 +75,39 @@ class DatasetSerializationTestBase(test.TestCase):
     Raises:
       AssertionError if any test fails.
     """
+    # NOTE: We disable all default optimizations in serialization tests in order
+    # to test the actual dataset in question.
+    options = dataset_ops.Options()
+    options.experimental_optimization = OptimizationOptions()
+    options.experimental_optimization.apply_default_optimizations = False
+
+    def ds_fn1_no_opt():
+      return ds_fn1().with_options(options)
+
     self.verify_unused_iterator(
-        ds_fn1, num_outputs, sparse_tensors=sparse_tensors)
+        ds_fn1_no_opt, num_outputs, sparse_tensors=sparse_tensors)
     self.verify_fully_used_iterator(
-        ds_fn1, num_outputs, sparse_tensors=sparse_tensors)
+        ds_fn1_no_opt, num_outputs, sparse_tensors=sparse_tensors)
     self.verify_exhausted_iterator(
-        ds_fn1, num_outputs, sparse_tensors=sparse_tensors)
+        ds_fn1_no_opt, num_outputs, sparse_tensors=sparse_tensors)
     self.verify_init_before_restore(
-        ds_fn1, num_outputs, sparse_tensors=sparse_tensors)
+        ds_fn1_no_opt, num_outputs, sparse_tensors=sparse_tensors)
     self.verify_multiple_breaks(
-        ds_fn1, num_outputs, sparse_tensors=sparse_tensors)
+        ds_fn1_no_opt, num_outputs, sparse_tensors=sparse_tensors)
     self.verify_reset_restored_iterator(
-        ds_fn1, num_outputs, sparse_tensors=sparse_tensors)
+        ds_fn1_no_opt, num_outputs, sparse_tensors=sparse_tensors)
     self.verify_restore_in_empty_graph(
-        ds_fn1, num_outputs, sparse_tensors=sparse_tensors)
+        ds_fn1_no_opt, num_outputs, sparse_tensors=sparse_tensors)
     if ds_fn2:
+
+      def ds_fn2_no_opt():
+        return ds_fn2().with_options(options)
+
       self.verify_restore_in_modified_graph(
-          ds_fn1, ds_fn2, num_outputs, sparse_tensors=sparse_tensors)
+          ds_fn1_no_opt,
+          ds_fn2_no_opt,
+          num_outputs,
+          sparse_tensors=sparse_tensors)
 
   def verify_unused_iterator(self,
                              ds_fn,
@@ -578,7 +596,7 @@ class DatasetSerializationTestBase(test.TestCase):
     return np.linspace(0, num_outputs, num_samples, dtype=int)
 
   def _build_graph(self, ds_fn, sparse_tensors=False):
-    iterator = ds_fn().make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(ds_fn())
 
     saveable = contrib_iterator_ops.make_saveable_from_iterator(iterator)
     ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/map_and_batch_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/map_and_batch_dataset_serialization_test.py
index 166ffa99ca02eabe8b8b30ba6f1fa8ed99d8b45c..8bfe6ce2f30e02c78f4a5b760849b92dd0a8fc65 100644
--- a/tensorflow/python/data/experimental/kernel_tests/serialization/map_and_batch_dataset_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/map_and_batch_dataset_serialization_test.py
@@ -22,6 +22,7 @@ import math
 from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
 from tensorflow.python.data.experimental.ops import batching
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
@@ -83,6 +84,19 @@ class MapAndBatchDatasetSerializationTest(
     self.run_core_tests(lambda: build_ds(10, True), lambda: build_ds(15, True),
                         num_outputs_drop_remainder)
 
+  def testSparse(self):
+
+    def build_dataset():
+
+      def map_fn(i):
+        return sparse_tensor.SparseTensorValue(
+            indices=[[0]], values=(i * [1]), dense_shape=[1])
+
+      return dataset_ops.Dataset.range(10).apply(
+          batching.map_and_batch(map_fn, 5))
+
+    self.run_core_tests(build_dataset, None, 2)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/range_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/range_dataset_serialization_test.py
index aeb338dfd5e0ca777a7422a113868be338c9dbd6..34419a314938560818f3a9f4cdd1979a8dbb44d4 100644
--- a/tensorflow/python/data/experimental/kernel_tests/serialization/range_dataset_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/range_dataset_serialization_test.py
@@ -56,8 +56,8 @@ class RangeDatasetSerializationTest(
   def testSaveRestore(self):
 
     def _build_graph(start, stop):
-      iterator = dataset_ops.Dataset.range(start,
-                                           stop).make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(
+          dataset_ops.Dataset.range(start, stop))
       init_op = iterator.initializer
       get_next = iterator.get_next()
       save_op = self._save_op(iterator._iterator_resource)
diff --git a/tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py b/tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py
index ce63da6bf988352b2d248cdb8dc9dc3cd8616923..110966a5a08bcc2081abca71f56db736200283b1 100644
--- a/tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py
@@ -23,11 +23,11 @@ from tensorflow.python.data.experimental.ops import shuffle_ops
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class ShuffleAndRepeatTest(test_base.DatasetTestBase):
 
   def _build_ds(self, seed, count=5, num_elements=20):
@@ -35,17 +35,15 @@ class ShuffleAndRepeatTest(test_base.DatasetTestBase):
         shuffle_ops.shuffle_and_repeat(buffer_size=5, count=count, seed=seed))
 
   def _gen_outputs(self, ds_fn, num_outputs, verify_exhausted=True):
-    get_next = ds_fn().make_one_shot_iterator().get_next()
+    get_next = self.getNext(ds_fn())
     outputs = []
-    with self.cached_session() as sess:
-      for _ in range(num_outputs):
-        outputs.append(self.evaluate(get_next))
-      if verify_exhausted:
-        with self.assertRaises(errors.OutOfRangeError):
-          self.evaluate(get_next)
+    for _ in range(num_outputs):
+      outputs.append(self.evaluate(get_next()))
+    if verify_exhausted:
+      with self.assertRaises(errors.OutOfRangeError):
+        self.evaluate(get_next())
     return outputs
 
-  @test_util.run_deprecated_v1
   def testCorrectOutput(self):
     output = self._gen_outputs(lambda: self._build_ds(10), 100)
     self.assertSequenceEqual(
@@ -54,7 +52,6 @@ class ShuffleAndRepeatTest(test_base.DatasetTestBase):
     for i in range(5):
       self.assertSequenceEqual(sorted(output[i * 20:(i + 1) * 20]), range(20))
 
-  @test_util.run_deprecated_v1
   def testReshuffling(self):
     # Check that the output orders of different epochs are indeed different.
     output = self._gen_outputs(lambda: self._build_ds(10), 100)
@@ -63,20 +60,17 @@ class ShuffleAndRepeatTest(test_base.DatasetTestBase):
       epoch2 = output[(i + 1) * 20:(i + 2) * 20]
       self.assertNotEqual(epoch1, epoch2)
 
-  @test_util.run_deprecated_v1
   def testSameOrderForSameSeeds(self):
     output1 = self._gen_outputs(lambda: self._build_ds(10), 100)
     output2 = self._gen_outputs(lambda: self._build_ds(10), 100)
     self.assertEqual(output1, output2)
 
-  @test_util.run_deprecated_v1
   def testDifferentOrderForDifferentSeeds(self):
     output1 = self._gen_outputs(lambda: self._build_ds(10), 100)
     output2 = self._gen_outputs(lambda: self._build_ds(20), 100)
     self.assertNotEqual(output1, output2)
     self.assertEqual(sorted(output1), sorted(output2))
 
-  @test_util.run_deprecated_v1
   def testCountNone(self):
     output1 = self._gen_outputs(
         lambda: self._build_ds(10, count=None), 100, verify_exhausted=False)
@@ -85,7 +79,6 @@ class ShuffleAndRepeatTest(test_base.DatasetTestBase):
     self.assertNotEqual(output1, output2)
     self.assertEqual(sorted(output1), sorted(output2))
 
-  @test_util.run_deprecated_v1
   def testCountMinusOne(self):
     output1 = self._gen_outputs(
         lambda: self._build_ds(10, count=-1), 100, verify_exhausted=False)
@@ -110,12 +103,10 @@ class ShuffleAndRepeatTest(test_base.DatasetTestBase):
                         100)
 
   def testLargeBufferSize(self):
-    with ops.Graph().as_default() as g:
-      ds = dataset_ops.Dataset.range(20).apply(
-          shuffle_ops.shuffle_and_repeat(buffer_size=21))
-      get_next_op = ds.make_one_shot_iterator().get_next()
-      with self.session(graph=g) as sess:
-        self.evaluate(get_next_op)
+    ds = dataset_ops.Dataset.range(20).apply(
+        shuffle_ops.shuffle_and_repeat(buffer_size=21))
+    get_next = self.getNext(ds)
+    self.evaluate(get_next())
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/sleep_test.py b/tensorflow/python/data/experimental/kernel_tests/sleep_test.py
index 8c9e8225d6800862737925a6fc59bfa40004b782..a4fe847f04baa0f8dd7c45bae4e02617e33053ca 100644
--- a/tensorflow/python/data/experimental/kernel_tests/sleep_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/sleep_test.py
@@ -29,25 +29,21 @@ from tensorflow.python.platform import test
 _NUMPY_RANDOM_SEED = 42
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class SleepTest(test_base.DatasetTestBase):
 
-  @test_util.run_deprecated_v1
   def testSleep(self):
     sleep_microseconds = 100
     dataset = dataset_ops.Dataset.range(10).apply(
         sleep.sleep(sleep_microseconds))
-    iterator = dataset.make_initializable_iterator()
-    next_element = iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.evaluate(iterator.initializer)
-      start_time = time.time()
-      for i in range(10):
-        self.assertEqual(i, self.evaluate(next_element))
-      end_time = time.time()
-      self.assertGreater(end_time - start_time, (10 * sleep_microseconds) / 1e6)
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(next_element)
+    next_element = self.getNext(dataset)
+    start_time = time.time()
+    for i in range(10):
+      self.assertEqual(i, self.evaluate(next_element()))
+    end_time = time.time()
+    self.assertGreater(end_time - start_time, (10 * sleep_microseconds) / 1e6)
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test.py
index eb66927ee5c73c67325f3764d29d5c8461c05cbb..fd96c0b52135bb784e3f2bfca3b9b4f697ba78a3 100644
--- a/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test.py
@@ -21,574 +21,454 @@ from __future__ import print_function
 from tensorflow.python.data.experimental.kernel_tests import sql_dataset_test_base
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
 
   # Test that SqlDataset can read from a database table.
   def testReadResultSet(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.string,
-                                                dtypes.string), 2)
-    with self.cached_session() as sess:
-      for _ in range(2):  # Run twice to verify statelessness of db operations.
-        sess.run(
-            init_op,
-            feed_dict={
-                self.query: "SELECT first_name, last_name, motto FROM students "
-                            "ORDER BY first_name DESC"
-            })
-        for _ in range(2):  # Dataset is repeated. See setUp.
-          self.assertEqual((b"John", b"Doe", b"Hi!"), self.evaluate(get_next))
-          self.assertEqual((b"Jane", b"Moe", b"Hi again!"),
-                           self.evaluate(get_next))
-        with self.assertRaises(errors.OutOfRangeError):
-          self.evaluate(get_next)
+    for _ in range(2):  # Run twice to verify statelessness of db operations.
+      dataset = self._createSqlDataset(
+          query="SELECT first_name, last_name, motto FROM students "
+          "ORDER BY first_name DESC",
+          output_types=(dtypes.string, dtypes.string, dtypes.string),
+          num_repeats=2)
+      self.assertDatasetProduces(
+          dataset,
+          expected_output=[(b"John", b"Doe", b"Hi!"),
+                           (b"Jane", b"Moe", b"Hi again!")] * 2,
+          num_test_iterations=2)
 
   # Test that SqlDataset works on a join query.
   def testReadResultSetJoinQuery(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.string,
-                                                dtypes.string))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query:
-                  "SELECT students.first_name, state, motto FROM students "
-                  "INNER JOIN people "
-                  "ON students.first_name = people.first_name "
-                  "AND students.last_name = people.last_name"
-          })
-      self.assertEqual((b"John", b"California", b"Hi!"),
-                       self.evaluate(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT students.first_name, state, motto FROM students "
+            "INNER JOIN people "
+            "ON students.first_name = people.first_name "
+            "AND students.last_name = people.last_name",
+            output_types=(dtypes.string, dtypes.string, dtypes.string)))
+
+    self.assertEqual((b"John", b"California", b"Hi!"),
+                     self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
   # Test that SqlDataset can read a database entry with a null-terminator
   # in the middle of the text and place the entry in a `string` tensor.
   def testReadResultSetNullTerminator(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.string,
-                                                dtypes.string))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query:
-                  "SELECT first_name, last_name, favorite_nonsense_word "
-                  "FROM students ORDER BY first_name DESC"
-          })
-      self.assertEqual((b"John", b"Doe", b"n\0nsense"), self.evaluate(get_next))
-      self.assertEqual((b"Jane", b"Moe", b"nonsense\0"),
-                       self.evaluate(get_next))
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, last_name, favorite_nonsense_word "
+            "FROM students ORDER BY first_name DESC",
+            output_types=(dtypes.string, dtypes.string, dtypes.string)))
+
+    self.assertEqual((b"John", b"Doe", b"n\0nsense"), self.evaluate(get_next()))
+    self.assertEqual((b"Jane", b"Moe", b"nonsense\0"),
+                     self.evaluate(get_next()))
     with self.assertRaises(errors.OutOfRangeError):
-      self.evaluate(get_next)
+      self.evaluate(get_next())
 
   # Test that SqlDataset works when used on two different queries.
   # Because the output types of the dataset must be determined at graph-creation
   # time, the two queries must have the same number and types of columns.
   def testReadResultSetReuseSqlDataset(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.string,
-                                                dtypes.string))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query: "SELECT first_name, last_name, motto FROM students "
-                          "ORDER BY first_name DESC"
-          })
-      self.assertEqual((b"John", b"Doe", b"Hi!"), self.evaluate(get_next))
-      self.assertEqual((b"Jane", b"Moe", b"Hi again!"), self.evaluate(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query: "SELECT first_name, last_name, state FROM people "
-                          "ORDER BY first_name DESC"
-          })
-      self.assertEqual((b"John", b"Doe", b"California"),
-                       self.evaluate(get_next))
-      self.assertEqual((b"Benjamin", b"Franklin", b"Pennsylvania"),
-                       self.evaluate(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, last_name, motto FROM students "
+            "ORDER BY first_name DESC",
+            output_types=(dtypes.string, dtypes.string, dtypes.string)))
+    self.assertEqual((b"John", b"Doe", b"Hi!"), self.evaluate(get_next()))
+    self.assertEqual((b"Jane", b"Moe", b"Hi again!"), self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, last_name, state FROM people "
+            "ORDER BY first_name DESC",
+            output_types=(dtypes.string, dtypes.string, dtypes.string)))
+    self.assertEqual((b"John", b"Doe", b"California"),
+                     self.evaluate(get_next()))
+    self.assertEqual((b"Benjamin", b"Franklin", b"Pennsylvania"),
+                     self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
   # Test that an `OutOfRangeError` is raised on the first call to
   # `get_next_str_only` if result set is empty.
   def testReadEmptyResultSet(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.string,
-                                                dtypes.string))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query: "SELECT first_name, last_name, motto FROM students "
-                          "WHERE first_name = 'Nonexistent'"
-          })
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, last_name, motto FROM students "
+            "WHERE first_name = 'Nonexistent'",
+            output_types=(dtypes.string, dtypes.string, dtypes.string)))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
   # Test that an error is raised when `driver_name` is invalid.
   def testReadResultSetWithInvalidDriverName(self):
-    init_op = self._createSqlDataset((dtypes.string, dtypes.string,
-                                      dtypes.string))[0]
-    with self.cached_session() as sess:
-      with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(
-            init_op,
-            feed_dict={
-                self.driver_name: "sqlfake",
-                self.query: "SELECT first_name, last_name, motto FROM students "
-                            "ORDER BY first_name DESC"
-            })
+    dataset = self._createSqlDataset(
+        driver_name="sqlfake",
+        query="SELECT first_name, last_name, motto FROM students "
+        "ORDER BY first_name DESC",
+        output_types=(dtypes.string, dtypes.string, dtypes.string))
+    self.assertDatasetProduces(
+        dataset, expected_error=(errors.InvalidArgumentError, ""))
 
   # Test that an error is raised when a column name in `query` is nonexistent
   def testReadResultSetWithInvalidColumnName(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.string,
-                                                dtypes.string))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query:
-                  "SELECT first_name, last_name, fake_column FROM students "
-                  "ORDER BY first_name DESC"
-          })
-      with self.assertRaises(errors.UnknownError):
-        self.evaluate(get_next)
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, last_name, fake_column FROM students "
+            "ORDER BY first_name DESC",
+            output_types=(dtypes.string, dtypes.string, dtypes.string)))
+    with self.assertRaises(errors.UnknownError):
+      self.evaluate(get_next())
 
   # Test that an error is raised when there is a syntax error in `query`.
   def testReadResultSetOfQueryWithSyntaxError(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.string,
-                                                dtypes.string))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query:
-                  "SELEmispellECT first_name, last_name, motto FROM students "
-                  "ORDER BY first_name DESC"
-          })
-      with self.assertRaises(errors.UnknownError):
-        self.evaluate(get_next)
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELEmispellECT first_name, last_name, motto FROM students "
+            "ORDER BY first_name DESC",
+            output_types=(dtypes.string, dtypes.string, dtypes.string)))
+    with self.assertRaises(errors.UnknownError):
+      self.evaluate(get_next())
 
   # Test that an error is raised when the number of columns in `query`
-  # does not match the length of `output_types`.
+  # does not match the length of `, output_types`.
   def testReadResultSetWithMismatchBetweenColumnsAndOutputTypes(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.string,
-                                                dtypes.string))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query: "SELECT first_name, last_name FROM students "
-                          "ORDER BY first_name DESC"
-          })
-      with self.assertRaises(errors.InvalidArgumentError):
-        self.evaluate(get_next)
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, last_name FROM students "
+            "ORDER BY first_name DESC",
+            output_types=(dtypes.string, dtypes.string, dtypes.string)))
+    with self.assertRaises(errors.InvalidArgumentError):
+      self.evaluate(get_next())
 
   # Test that no results are returned when `query` is an insert query rather
   # than a select query. In particular, the error refers to the number of
   # output types passed to the op not matching the number of columns in the
   # result set of the query (namely, 0 for an insert statement.)
   def testReadResultSetOfInsertQuery(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.string,
-                                                dtypes.string))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query:
-                  "INSERT INTO students (first_name, last_name, motto) "
-                  "VALUES ('Foo', 'Bar', 'Baz'), ('Fizz', 'Buzz', 'Fizzbuzz')"
-          })
-      with self.assertRaises(errors.InvalidArgumentError):
-        self.evaluate(get_next)
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="INSERT INTO students (first_name, last_name, motto) "
+            "VALUES ('Foo', 'Bar', 'Baz'), ('Fizz', 'Buzz', 'Fizzbuzz')",
+            output_types=(dtypes.string, dtypes.string, dtypes.string)))
+    with self.assertRaises(errors.InvalidArgumentError):
+      self.evaluate(get_next())
 
   # Test that `SqlDataset` can read an integer from a SQLite database table and
   # place it in an `int8` tensor.
   def testReadResultSetInt8(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.int8))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query: "SELECT first_name, desk_number FROM students "
-                          "ORDER BY first_name DESC"
-          })
-      self.assertEqual((b"John", 9), self.evaluate(get_next))
-      self.assertEqual((b"Jane", 127), self.evaluate(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, desk_number FROM students "
+            "ORDER BY first_name DESC",
+            output_types=(dtypes.string, dtypes.int8)))
+    self.assertEqual((b"John", 9), self.evaluate(get_next()))
+    self.assertEqual((b"Jane", 127), self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
   # Test that `SqlDataset` can read a negative or 0-valued integer from a
   # SQLite database table and place it in an `int8` tensor.
   def testReadResultSetInt8NegativeAndZero(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.int8,
-                                                dtypes.int8))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query: "SELECT first_name, income, favorite_negative_number "
-                          "FROM students "
-                          "WHERE first_name = 'John' ORDER BY first_name DESC"
-          })
-      self.assertEqual((b"John", 0, -2), self.evaluate(get_next))
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, income, favorite_negative_number "
+            "FROM students "
+            "WHERE first_name = 'John' ORDER BY first_name DESC",
+            output_types=(dtypes.string, dtypes.int8, dtypes.int8)))
+    self.assertEqual((b"John", 0, -2), self.evaluate(get_next()))
     with self.assertRaises(errors.OutOfRangeError):
-      self.evaluate(get_next)
+      self.evaluate(get_next())
 
   # Test that `SqlDataset` can read a large (positive or negative) integer from
   # a SQLite database table and place it in an `int8` tensor.
   def testReadResultSetInt8MaxValues(self):
-    init_op, get_next = self._createSqlDataset((dtypes.int8, dtypes.int8))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query:
-                  "SELECT desk_number, favorite_negative_number FROM students "
-                  "ORDER BY first_name DESC"
-          })
-      self.assertEqual((9, -2), self.evaluate(get_next))
-      # Max and min values of int8
-      self.assertEqual((127, -128), self.evaluate(get_next))
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT desk_number, favorite_negative_number FROM students "
+            "ORDER BY first_name DESC",
+            output_types=(dtypes.int8, dtypes.int8)))
+    self.assertEqual((9, -2), self.evaluate(get_next()))
+    # Max and min values of int8
+    self.assertEqual((127, -128), self.evaluate(get_next()))
     with self.assertRaises(errors.OutOfRangeError):
-      self.evaluate(get_next)
+      self.evaluate(get_next())
 
   # Test that `SqlDataset` can read an integer from a SQLite database table and
   # place it in an `int16` tensor.
   def testReadResultSetInt16(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.int16))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query: "SELECT first_name, desk_number FROM students "
-                          "ORDER BY first_name DESC"
-          })
-      self.assertEqual((b"John", 9), self.evaluate(get_next))
-      self.assertEqual((b"Jane", 127), self.evaluate(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, desk_number FROM students "
+            "ORDER BY first_name DESC",
+            output_types=(dtypes.string, dtypes.int16)))
+    self.assertEqual((b"John", 9), self.evaluate(get_next()))
+    self.assertEqual((b"Jane", 127), self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
   # Test that `SqlDataset` can read a negative or 0-valued integer from a
   # SQLite database table and place it in an `int16` tensor.
   def testReadResultSetInt16NegativeAndZero(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.int16,
-                                                dtypes.int16))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query: "SELECT first_name, income, favorite_negative_number "
-                          "FROM students "
-                          "WHERE first_name = 'John' ORDER BY first_name DESC"
-          })
-      self.assertEqual((b"John", 0, -2), self.evaluate(get_next))
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, income, favorite_negative_number "
+            "FROM students "
+            "WHERE first_name = 'John' ORDER BY first_name DESC",
+            output_types=(dtypes.string, dtypes.int16, dtypes.int16)))
+    self.assertEqual((b"John", 0, -2), self.evaluate(get_next()))
     with self.assertRaises(errors.OutOfRangeError):
-      self.evaluate(get_next)
+      self.evaluate(get_next())
 
   # Test that `SqlDataset` can read a large (positive or negative) integer from
   # a SQLite database table and place it in an `int16` tensor.
   def testReadResultSetInt16MaxValues(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.int16))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query: "SELECT first_name, favorite_medium_sized_number "
-                          "FROM students ORDER BY first_name DESC"
-          })
-      # Max value of int16
-      self.assertEqual((b"John", 32767), self.evaluate(get_next))
-      # Min value of int16
-      self.assertEqual((b"Jane", -32768), self.evaluate(get_next))
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, favorite_medium_sized_number "
+            "FROM students ORDER BY first_name DESC",
+            output_types=(dtypes.string, dtypes.int16)))
+    # Max value of int16
+    self.assertEqual((b"John", 32767), self.evaluate(get_next()))
+    # Min value of int16
+    self.assertEqual((b"Jane", -32768), self.evaluate(get_next()))
     with self.assertRaises(errors.OutOfRangeError):
-      self.evaluate(get_next)
+      self.evaluate(get_next())
 
   # Test that `SqlDataset` can read an integer from a SQLite database table and
   # place it in an `int32` tensor.
   def testReadResultSetInt32(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.int32))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query: "SELECT first_name, desk_number FROM students "
-                          "ORDER BY first_name DESC"
-          })
-      self.assertEqual((b"John", 9), self.evaluate(get_next))
-      self.assertEqual((b"Jane", 127), self.evaluate(get_next))
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, desk_number FROM students "
+            "ORDER BY first_name DESC",
+            output_types=(dtypes.string, dtypes.int32)))
+    self.assertEqual((b"John", 9), self.evaluate(get_next()))
+    self.assertEqual((b"Jane", 127), self.evaluate(get_next()))
 
   # Test that `SqlDataset` can read a negative or 0-valued integer from a
   # SQLite database table and place it in an `int32` tensor.
   def testReadResultSetInt32NegativeAndZero(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.int32))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query: "SELECT first_name, income FROM students "
-                          "ORDER BY first_name DESC"
-          })
-      self.assertEqual((b"John", 0), self.evaluate(get_next))
-      self.assertEqual((b"Jane", -20000), self.evaluate(get_next))
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, income FROM students "
+            "ORDER BY first_name DESC",
+            output_types=(dtypes.string, dtypes.int32)))
+    self.assertEqual((b"John", 0), self.evaluate(get_next()))
+    self.assertEqual((b"Jane", -20000), self.evaluate(get_next()))
     with self.assertRaises(errors.OutOfRangeError):
-      self.evaluate(get_next)
+      self.evaluate(get_next())
 
   # Test that `SqlDataset` can read a large (positive or negative) integer from
   # a SQLite database table and place it in an `int32` tensor.
   def testReadResultSetInt32MaxValues(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.int32))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query: "SELECT first_name, favorite_number FROM students "
-                          "ORDER BY first_name DESC"
-          })
-      # Max value of int32
-      self.assertEqual((b"John", 2147483647), self.evaluate(get_next))
-      # Min value of int32
-      self.assertEqual((b"Jane", -2147483648), self.evaluate(get_next))
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, favorite_number FROM students "
+            "ORDER BY first_name DESC",
+            output_types=(dtypes.string, dtypes.int32)))
+    # Max value of int32
+    self.assertEqual((b"John", 2147483647), self.evaluate(get_next()))
+    # Min value of int32
+    self.assertEqual((b"Jane", -2147483648), self.evaluate(get_next()))
     with self.assertRaises(errors.OutOfRangeError):
-      self.evaluate(get_next)
+      self.evaluate(get_next())
 
   # Test that `SqlDataset` can read a numeric `varchar` from a SQLite database
   # table and place it in an `int32` tensor.
   def testReadResultSetInt32VarCharColumnAsInt(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.int32))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query: "SELECT first_name, school_id FROM students "
-                          "ORDER BY first_name DESC"
-          })
-      self.assertEqual((b"John", 123), self.evaluate(get_next))
-      self.assertEqual((b"Jane", 1000), self.evaluate(get_next))
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, school_id FROM students "
+            "ORDER BY first_name DESC",
+            output_types=(dtypes.string, dtypes.int32)))
+    self.assertEqual((b"John", 123), self.evaluate(get_next()))
+    self.assertEqual((b"Jane", 1000), self.evaluate(get_next()))
     with self.assertRaises(errors.OutOfRangeError):
-      self.evaluate(get_next)
+      self.evaluate(get_next())
 
   # Test that `SqlDataset` can read an integer from a SQLite database table
   # and place it in an `int64` tensor.
   def testReadResultSetInt64(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.int64))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query: "SELECT first_name, desk_number FROM students "
-                          "ORDER BY first_name DESC"
-          })
-      self.assertEqual((b"John", 9), self.evaluate(get_next))
-      self.assertEqual((b"Jane", 127), self.evaluate(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, desk_number FROM students "
+            "ORDER BY first_name DESC",
+            output_types=(dtypes.string, dtypes.int64)))
+    self.assertEqual((b"John", 9), self.evaluate(get_next()))
+    self.assertEqual((b"Jane", 127), self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
   # Test that `SqlDataset` can read a negative or 0-valued integer from a
   # SQLite database table and place it in an `int64` tensor.
   def testReadResultSetInt64NegativeAndZero(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.int64))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query: "SELECT first_name, income FROM students "
-                          "ORDER BY first_name DESC"
-          })
-      self.assertEqual((b"John", 0), self.evaluate(get_next))
-      self.assertEqual((b"Jane", -20000), self.evaluate(get_next))
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, income FROM students "
+            "ORDER BY first_name DESC",
+            output_types=(dtypes.string, dtypes.int64)))
+    self.assertEqual((b"John", 0), self.evaluate(get_next()))
+    self.assertEqual((b"Jane", -20000), self.evaluate(get_next()))
     with self.assertRaises(errors.OutOfRangeError):
-      self.evaluate(get_next)
+      self.evaluate(get_next())
 
   # Test that `SqlDataset` can read a large (positive or negative) integer from
   # a SQLite database table and place it in an `int64` tensor.
   def testReadResultSetInt64MaxValues(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.int64))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query:
-                  "SELECT first_name, favorite_big_number FROM students "
-                  "ORDER BY first_name DESC"
-          })
-      # Max value of int64
-      self.assertEqual((b"John", 9223372036854775807), self.evaluate(get_next))
-      # Min value of int64
-      self.assertEqual((b"Jane", -9223372036854775808), self.evaluate(get_next))
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, favorite_big_number FROM students "
+            "ORDER BY first_name DESC",
+            output_types=(dtypes.string, dtypes.int64)))
+    # Max value of int64
+    self.assertEqual((b"John", 9223372036854775807), self.evaluate(get_next()))
+    # Min value of int64
+    self.assertEqual((b"Jane", -9223372036854775808), self.evaluate(get_next()))
     with self.assertRaises(errors.OutOfRangeError):
-      self.evaluate(get_next)
+      self.evaluate(get_next())
 
   # Test that `SqlDataset` can read an integer from a SQLite database table and
   # place it in a `uint8` tensor.
   def testReadResultSetUInt8(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.uint8))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query: "SELECT first_name, desk_number FROM students "
-                          "ORDER BY first_name DESC"
-          })
-      self.assertEqual((b"John", 9), self.evaluate(get_next))
-      self.assertEqual((b"Jane", 127), self.evaluate(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, desk_number FROM students "
+            "ORDER BY first_name DESC",
+            output_types=(dtypes.string, dtypes.uint8)))
+    self.assertEqual((b"John", 9), self.evaluate(get_next()))
+    self.assertEqual((b"Jane", 127), self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
   # Test that `SqlDataset` can read the minimum and maximum uint8 values from a
   # SQLite database table and place them in `uint8` tensors.
   def testReadResultSetUInt8MinAndMaxValues(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.uint8))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query: "SELECT first_name, brownie_points FROM students "
-                          "ORDER BY first_name DESC"
-          })
-      # Min value of uint8
-      self.assertEqual((b"John", 0), self.evaluate(get_next))
-      # Max value of uint8
-      self.assertEqual((b"Jane", 255), self.evaluate(get_next))
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, brownie_points FROM students "
+            "ORDER BY first_name DESC",
+            output_types=(dtypes.string, dtypes.uint8)))
+    # Min value of uint8
+    self.assertEqual((b"John", 0), self.evaluate(get_next()))
+    # Max value of uint8
+    self.assertEqual((b"Jane", 255), self.evaluate(get_next()))
     with self.assertRaises(errors.OutOfRangeError):
-      self.evaluate(get_next)
+      self.evaluate(get_next())
 
   # Test that `SqlDataset` can read an integer from a SQLite database table
   # and place it in a `uint16` tensor.
   def testReadResultSetUInt16(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.uint16))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query: "SELECT first_name, desk_number FROM students "
-                          "ORDER BY first_name DESC"
-          })
-      self.assertEqual((b"John", 9), self.evaluate(get_next))
-      self.assertEqual((b"Jane", 127), self.evaluate(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, desk_number FROM students "
+            "ORDER BY first_name DESC",
+            output_types=(dtypes.string, dtypes.uint16)))
+    self.assertEqual((b"John", 9), self.evaluate(get_next()))
+    self.assertEqual((b"Jane", 127), self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
   # Test that `SqlDataset` can read the minimum and maximum uint16 values from a
   # SQLite database table and place them in `uint16` tensors.
   def testReadResultSetUInt16MinAndMaxValues(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.uint16))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query: "SELECT first_name, account_balance FROM students "
-                          "ORDER BY first_name DESC"
-          })
-      # Min value of uint16
-      self.assertEqual((b"John", 0), self.evaluate(get_next))
-      # Max value of uint16
-      self.assertEqual((b"Jane", 65535), self.evaluate(get_next))
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, account_balance FROM students "
+            "ORDER BY first_name DESC",
+            output_types=(dtypes.string, dtypes.uint16)))
+    # Min value of uint16
+    self.assertEqual((b"John", 0), self.evaluate(get_next()))
+    # Max value of uint16
+    self.assertEqual((b"Jane", 65535), self.evaluate(get_next()))
     with self.assertRaises(errors.OutOfRangeError):
-      self.evaluate(get_next)
+      self.evaluate(get_next())
 
   # Test that `SqlDataset` can read a 0-valued and 1-valued integer from a
   # SQLite database table and place them as `True` and `False` respectively
   # in `bool` tensors.
   def testReadResultSetBool(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.bool))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query:
-                  "SELECT first_name, registration_complete FROM students "
-                  "ORDER BY first_name DESC"
-          })
-      self.assertEqual((b"John", True), self.evaluate(get_next))
-      self.assertEqual((b"Jane", False), self.evaluate(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, registration_complete FROM students "
+            "ORDER BY first_name DESC",
+            output_types=(dtypes.string, dtypes.bool)))
+    self.assertEqual((b"John", True), self.evaluate(get_next()))
+    self.assertEqual((b"Jane", False), self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
   # Test that `SqlDataset` can read an integer that is not 0-valued or 1-valued
   # from a SQLite database table and place it as `True` in a `bool` tensor.
   def testReadResultSetBoolNotZeroOrOne(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.bool))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query: "SELECT first_name, favorite_medium_sized_number "
-                          "FROM students ORDER BY first_name DESC"
-          })
-      self.assertEqual((b"John", True), self.evaluate(get_next))
-      self.assertEqual((b"Jane", True), self.evaluate(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, favorite_medium_sized_number "
+            "FROM students ORDER BY first_name DESC",
+            output_types=(dtypes.string, dtypes.bool)))
+    self.assertEqual((b"John", True), self.evaluate(get_next()))
+    self.assertEqual((b"Jane", True), self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
   # Test that `SqlDataset` can read a float from a SQLite database table
   # and place it in a `float64` tensor.
   def testReadResultSetFloat64(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.string,
-                                                dtypes.float64))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query:
-                  "SELECT first_name, last_name, victories FROM townspeople "
-                  "ORDER BY first_name"
-          })
-      self.assertEqual((b"George", b"Washington", 20.0),
-                       self.evaluate(get_next))
-      self.assertEqual((b"John", b"Adams", -19.95), self.evaluate(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, last_name, victories FROM townspeople "
+            "ORDER BY first_name",
+            output_types=(dtypes.string, dtypes.string, dtypes.float64)))
+    self.assertEqual((b"George", b"Washington", 20.0),
+                     self.evaluate(get_next()))
+    self.assertEqual((b"John", b"Adams", -19.95), self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
   # Test that `SqlDataset` can read a float from a SQLite database table beyond
   # the precision of 64-bit IEEE, without throwing an error. Test that
   # `SqlDataset` identifies such a value as equal to itself.
   def testReadResultSetFloat64OverlyPrecise(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.string,
-                                                dtypes.float64))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query:
-                  "SELECT first_name, last_name, accolades FROM townspeople "
-                  "ORDER BY first_name"
-          })
-      self.assertEqual(
-          (b"George", b"Washington",
-           1331241.321342132321324589798264627463827647382647382643874),
-          self.evaluate(get_next))
-      self.assertEqual(
-          (b"John", b"Adams",
-           1331241321342132321324589798264627463827647382647382643874.0),
-          self.evaluate(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, last_name, accolades FROM townspeople "
+            "ORDER BY first_name",
+            output_types=(dtypes.string, dtypes.string, dtypes.float64)))
+    self.assertEqual(
+        (b"George", b"Washington",
+         1331241.321342132321324589798264627463827647382647382643874),
+        self.evaluate(get_next()))
+    self.assertEqual(
+        (b"John", b"Adams",
+         1331241321342132321324589798264627463827647382647382643874.0),
+        self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
   # Test that `SqlDataset` can read a float from a SQLite database table,
   # representing the largest integer representable as a 64-bit IEEE float
   # such that the previous integer is also representable as a 64-bit IEEE float.
   # Test that `SqlDataset` can distinguish these two numbers.
   def testReadResultSetFloat64LargestConsecutiveWholeNumbersNotEqual(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.string,
-                                                dtypes.float64))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query:
-                  "SELECT first_name, last_name, triumphs FROM townspeople "
-                  "ORDER BY first_name"
-          })
-      self.assertNotEqual((b"George", b"Washington", 9007199254740992.0),
-                          self.evaluate(get_next))
-      self.assertNotEqual((b"John", b"Adams", 9007199254740991.0),
-                          self.evaluate(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, last_name, triumphs FROM townspeople "
+            "ORDER BY first_name",
+            output_types=(dtypes.string, dtypes.string, dtypes.float64)))
+    self.assertNotEqual((b"George", b"Washington", 9007199254740992.0),
+                        self.evaluate(get_next()))
+    self.assertNotEqual((b"John", b"Adams", 9007199254740991.0),
+                        self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test_base.py b/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test_base.py
index 6aaaa90c651ebab7ce5d98371d45a7f64831e883..90451b865f842e9f34b332ed6df45f1e4e85b9ff 100644
--- a/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test_base.py
+++ b/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test_base.py
@@ -24,27 +24,23 @@ import sqlite3
 
 from tensorflow.python.data.experimental.ops import readers
 from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.framework import dtypes
-from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
 class SqlDatasetTestBase(test_base.DatasetTestBase):
   """Base class for setting up and testing SqlDataset."""
 
-  def _createSqlDataset(self, output_types, num_repeats=1):
-    dataset = readers.SqlDataset(self.driver_name, self.data_source_name,
-                                 self.query, output_types).repeat(num_repeats)
-    iterator = dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-    return init_op, get_next
+  def _createSqlDataset(self,
+                        query,
+                        output_types,
+                        driver_name="sqlite",
+                        num_repeats=1):
+    dataset = readers.SqlDataset(driver_name, self.data_source_name, query,
+                                 output_types).repeat(num_repeats)
+    return dataset
 
   def setUp(self):
     self.data_source_name = os.path.join(test.get_temp_dir(), "tftest.sqlite")
-    self.driver_name = array_ops.placeholder_with_default(
-        array_ops.constant("sqlite", dtypes.string), shape=[])
-    self.query = array_ops.placeholder(dtypes.string, shape=[])
 
     conn = sqlite3.connect(self.data_source_name)
     c = conn.cursor()
diff --git a/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py
index b89aa20432a43ccf4cd3515433d3bc0cfc281629..59d0ebdb37e08ee15cc16196508813fa18c7287c 100644
--- a/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py
@@ -26,7 +26,6 @@ from tensorflow.python.data.experimental.ops import batching
 from tensorflow.python.data.experimental.ops import optimization
 from tensorflow.python.data.experimental.ops import stats_aggregator
 from tensorflow.python.data.experimental.ops import stats_ops
-from tensorflow.python.data.experimental.ops import stats_options
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
@@ -36,6 +35,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 def function_set_stats_aggregator(dataset,
                                   aggregator,
                                   prefix="",
@@ -46,7 +46,6 @@ def function_set_stats_aggregator(dataset,
 
 def function_apply_options(dataset, aggregator, prefix="", counter_prefix=""):
   options = dataset_ops.Options()
-  options.experimental_stats = stats_options.StatsOptions()
   options.experimental_stats.aggregator = aggregator
   options.experimental_stats.prefix = prefix
   options.experimental_stats.counter_prefix = counter_prefix
@@ -60,133 +59,110 @@ def function_apply_options(dataset, aggregator, prefix="", counter_prefix=""):
 )
 class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase):
 
-  @test_util.run_deprecated_v1
   def testBytesProduced(self, dataset_transformation):
     aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).map(
         lambda x: array_ops.tile([x], ops.convert_to_tensor([x]))).apply(
             stats_ops.bytes_produced_stats("bytes_produced"))
     dataset = dataset_transformation(dataset, aggregator)
-    iterator = dataset.make_initializable_iterator()
-    next_element = iterator.get_next()
+    next_element = self.getNext(dataset, requires_initialization=True)
     summary_t = aggregator.get_summary()
 
-    with self.cached_session() as sess:
-      self.evaluate(iterator.initializer)
-      expected_sum = 0.0
-      for i in range(100):
-        self.assertAllEqual(
-            np.array([i] * i, dtype=np.int64), self.evaluate(next_element))
-        summary_str = self.evaluate(summary_t)
-        self._assertSummaryHasCount(summary_str, "bytes_produced", float(i + 1))
-        expected_sum += i * 8.0
-        self._assertSummaryHasSum(summary_str, "bytes_produced", expected_sum)
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(next_element)
-      summary_str = self.evaluate(summary_t)
-      self._assertSummaryHasCount(summary_str, "bytes_produced", 100.0)
+    expected_sum = 0.0
+    for i in range(100):
+      self.assertAllEqual(
+          np.array([i] * i, dtype=np.int64), self.evaluate(next_element()))
+      summary_str = self.evaluate(aggregator.get_summary())
+      self._assertSummaryHasCount(summary_str, "bytes_produced", float(i + 1))
+      expected_sum += i * 8.0
       self._assertSummaryHasSum(summary_str, "bytes_produced", expected_sum)
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
+    # TODO(shivaniagrawal): ntentional breaking case
+    summary_str = self.evaluate(summary_t)
+    self._assertSummaryHasCount(summary_str, "bytes_produced", 100.0)
+    self._assertSummaryHasSum(summary_str, "bytes_produced", expected_sum)
 
-  @test_util.run_deprecated_v1
   def testLatencyStats(self, dataset_transformation):
     aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).apply(
         stats_ops.latency_stats("record_latency"))
     dataset = dataset_transformation(dataset, aggregator)
-    iterator = dataset.make_initializable_iterator()
-    next_element = iterator.get_next()
-    summary_t = aggregator.get_summary()
+    next_element = self.getNext(dataset, requires_initialization=True)
 
-    with self.cached_session() as sess:
-      self.evaluate(iterator.initializer)
-      for i in range(100):
-        self.assertEqual(i, self.evaluate(next_element))
-        self._assertSummaryHasCount(
-            self.evaluate(summary_t), "record_latency", float(i + 1))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(next_element)
+    for i in range(100):
+      self.assertEqual(i, self.evaluate(next_element()))
       self._assertSummaryHasCount(
-          self.evaluate(summary_t), "record_latency", 100.0)
+          self.evaluate(aggregator.get_summary()), "record_latency",
+          float(i + 1))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
+    self._assertSummaryHasCount(
+        self.evaluate(aggregator.get_summary()), "record_latency", 100.0)
 
-  @test_util.run_deprecated_v1
   def testPrefetchBufferUtilization(self, dataset_transformation):
     aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).map(
         lambda x: array_ops.tile([x], ops.convert_to_tensor([x]))).prefetch(-1)
     dataset = dataset_transformation(dataset, aggregator)
-    iterator = dataset.make_initializable_iterator()
-    next_element = iterator.get_next()
-    summary_t = aggregator.get_summary()
-
-    with self.cached_session() as sess:
-      self.evaluate(iterator.initializer)
-      for i in range(100):
-        self.assertAllEqual(
-            np.array([i] * i, dtype=np.int64), self.evaluate(next_element))
-        summary_str = self.evaluate(summary_t)
-        self._assertSummaryHasCount(summary_str, "Prefetch::buffer_utilization",
-                                    float(i + 1))
-        self._assertSummaryContains(summary_str, "Prefetch::buffer_capacity")
-        self._assertSummaryContains(summary_str, "Prefetch::buffer_size")
-        self._assertSummaryHasRange(summary_str, "Prefetch::buffer_utilization",
-                                    0, 1)
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(next_element)
-      summary_str = self.evaluate(summary_t)
+    next_element = self.getNext(dataset, requires_initialization=True)
+    for i in range(100):
+      self.assertAllEqual(
+          np.array([i] * i, dtype=np.int64), self.evaluate(next_element()))
+      summary_str = self.evaluate(aggregator.get_summary())
       self._assertSummaryHasCount(summary_str, "Prefetch::buffer_utilization",
-                                  100)
+                                  float(i + 1))
+      self._assertSummaryContains(summary_str, "Prefetch::buffer_capacity")
+      self._assertSummaryContains(summary_str, "Prefetch::buffer_size")
+      self._assertSummaryHasRange(summary_str, "Prefetch::buffer_utilization",
+                                  0, 1)
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
+    summary_str = self.evaluate(aggregator.get_summary())
+    self._assertSummaryHasCount(summary_str, "Prefetch::buffer_utilization",
+                                100)
 
-  @test_util.run_deprecated_v1
   def testPrefetchBufferScalars(self, dataset_transformation):
     aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_ops.Dataset.range(10).map(
-        lambda x: array_ops.tile([x], ops.convert_to_tensor([x]))).prefetch(0)
+        lambda x: array_ops.tile([x], ops.convert_to_tensor([x]))).prefetch(1)
     dataset = dataset_transformation(dataset, aggregator)
-    iterator = dataset.make_initializable_iterator()
-    next_element = iterator.get_next()
-    summary_t = aggregator.get_summary()
+    next_element = self.getNext(dataset, requires_initialization=True)
+
+    for i in range(10):
+      self.assertAllEqual(
+          np.array([i] * i, dtype=np.int64), self.evaluate(next_element()))
+      summary_str = self.evaluate(aggregator.get_summary())
+      self._assertSummaryHasScalarValue(summary_str,
+                                        "Prefetch::buffer_capacity", 1)
+      self._assertSummaryHasScalarValue(summary_str, "Prefetch::buffer_size", 1)
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
 
-    with self.cached_session() as sess:
-      self.evaluate(iterator.initializer)
-      for i in range(10):
-        self.assertAllEqual(
-            np.array([i] * i, dtype=np.int64), self.evaluate(next_element))
-        summary_str = self.evaluate(summary_t)
-        self._assertSummaryHasScalarValue(summary_str,
-                                          "Prefetch::buffer_capacity", 0)
-        self._assertSummaryHasScalarValue(summary_str, "Prefetch::buffer_size",
-                                          0)
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(next_element)
-
-  @test_util.run_deprecated_v1
   def testFilteredElementsStats(self, dataset_transformation):
     aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_ops.Dataset.range(101).filter(
         lambda x: math_ops.equal(math_ops.mod(x, 3), 0))
     dataset = dataset_transformation(dataset, aggregator)
-    iterator = dataset.make_initializable_iterator()
-    next_element = iterator.get_next()
-    summary_t = aggregator.get_summary()
+    next_element = self.getNext(dataset, requires_initialization=True)
 
-    with self.test_session() as sess:
-      self.evaluate(iterator.initializer)
-      for i in range(34):
-        self.assertEqual(i * 3, self.evaluate(next_element))
-        if i is not 0:
-          self._assertSummaryHasScalarValue(
-              self.evaluate(summary_t), "Filter::dropped_elements",
-              float(i * 2))
-        self._assertSummaryHasScalarValue(
-            self.evaluate(summary_t), "Filter::filtered_elements", float(i + 1))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(next_element)
-      self._assertSummaryHasScalarValue(
-          self.evaluate(summary_t), "Filter::dropped_elements", 67.0)
+    for i in range(34):
+      self.assertEqual(i * 3, self.evaluate(next_element()))
+      summary_str = self.evaluate(aggregator.get_summary())
+      if i is not 0:
+        self._assertSummaryHasScalarValue(summary_str,
+                                          "Filter::dropped_elements",
+                                          float(i * 2))
       self._assertSummaryHasScalarValue(
-          self.evaluate(summary_t), "Filter::filtered_elements", 34.0)
+          summary_str, "Filter::filtered_elements", float(i + 1))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
+    summary_str = self.evaluate(aggregator.get_summary())
+    self._assertSummaryHasScalarValue(summary_str, "Filter::dropped_elements",
+                                      67.0)
+    self._assertSummaryHasScalarValue(summary_str, "Filter::filtered_elements",
+                                      34.0)
 
-  @test_util.run_deprecated_v1
   def testMapBufferUtilization(self, dataset_transformation):
 
     def dataset_fn():
@@ -201,7 +177,6 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase):
         dataset_transformation,
         function_processing_time=True)
 
-  @test_util.run_deprecated_v1
   def testMapAutoTuneBufferUtilization(self, dataset_transformation):
 
     def dataset_fn():
@@ -219,7 +194,6 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase):
         dataset_transformation,
         function_processing_time=True)
 
-  @test_util.run_deprecated_v1
   def testInterleaveAutoTuneBufferUtilization(self, dataset_transformation):
 
     def dataset_fn():
@@ -236,7 +210,6 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase):
     self._testParallelCallsStats(dataset_fn, "ParallelInterleaveV2", 10,
                                  dataset_transformation)
 
-  @test_util.run_deprecated_v1
   def testMapAndBatchAutoTuneBufferUtilization(self, dataset_transformation):
 
     def dataset_fn():
@@ -258,114 +231,98 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase):
         check_elements=False,
         function_processing_time=True)
 
-  @test_util.run_deprecated_v1
   def testReinitialize(self, dataset_transformation):
     aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).apply(
         stats_ops.latency_stats("record_latency"))
     dataset = dataset_transformation(dataset, aggregator)
-    iterator = dataset.make_initializable_iterator()
-    next_element = iterator.get_next()
-    summary_t = aggregator.get_summary()
 
-    with self.cached_session() as sess:
-      for j in range(5):
-        self.evaluate(iterator.initializer)
-        for i in range(100):
-          self.assertEqual(i, self.evaluate(next_element))
-          self._assertSummaryHasCount(
-              self.evaluate(summary_t), "record_latency",
-              float((j * 100) + i + 1))
-        with self.assertRaises(errors.OutOfRangeError):
-          self.evaluate(next_element)
+    for j in range(5):
+      next_element = self.getNext(dataset, requires_initialization=True)
+      for i in range(100):
+        self.assertEqual(i, self.evaluate(next_element()))
         self._assertSummaryHasCount(
-            self.evaluate(summary_t), "record_latency", (j + 1) * 100.0)
+            self.evaluate(aggregator.get_summary()), "record_latency",
+            float((j * 100) + i + 1))
+      with self.assertRaises(errors.OutOfRangeError):
+        self.evaluate(next_element())
+      self._assertSummaryHasCount(
+          self.evaluate(aggregator.get_summary()), "record_latency",
+          (j + 1) * 100.0)
 
-  @test_util.run_deprecated_v1
   def testNoAggregatorRegistered(self, dataset_transformation):
     dataset = dataset_ops.Dataset.range(100).apply(
         stats_ops.latency_stats("record_latency"))
-    iterator = dataset.make_initializable_iterator()
-    next_element = iterator.get_next()
 
-    with self.cached_session() as sess:
-      self.evaluate(iterator.initializer)
-      for i in range(100):
-        self.assertEqual(i, self.evaluate(next_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(next_element)
+    next_element = self.getNext(dataset, requires_initialization=True)
+
+    for i in range(100):
+      self.assertEqual(i, self.evaluate(next_element()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
 
-  @test_util.run_deprecated_v1
   def testMultipleTags(self, dataset_transformation):
     aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).apply(
         stats_ops.latency_stats("record_latency")).apply(
             stats_ops.latency_stats("record_latency_2"))
     dataset = dataset_transformation(dataset, aggregator)
-    iterator = dataset.make_initializable_iterator()
-    next_element = iterator.get_next()
-    summary_t = aggregator.get_summary()
 
-    with self.cached_session() as sess:
-      self.evaluate(iterator.initializer)
-      for i in range(100):
-        self.assertEqual(i, self.evaluate(next_element))
-        self._assertSummaryHasCount(
-            self.evaluate(summary_t), "record_latency", float(i + 1))
-        self._assertSummaryHasCount(
-            self.evaluate(summary_t), "record_latency_2", float(i + 1))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(next_element)
+    next_element = self.getNext(dataset, requires_initialization=True)
+
+    for i in range(100):
+      self.assertEqual(i, self.evaluate(next_element()))
       self._assertSummaryHasCount(
-          self.evaluate(summary_t), "record_latency", 100.0)
+          self.evaluate(aggregator.get_summary()), "record_latency",
+          float(i + 1))
       self._assertSummaryHasCount(
-          self.evaluate(summary_t), "record_latency_2", 100.0)
+          self.evaluate(aggregator.get_summary()), "record_latency_2",
+          float(i + 1))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
+    self._assertSummaryHasCount(
+        self.evaluate(aggregator.get_summary()), "record_latency", 100.0)
+    self._assertSummaryHasCount(
+        self.evaluate(aggregator.get_summary()), "record_latency_2", 100.0)
 
-  @test_util.run_deprecated_v1
   def testRepeatedTags(self, dataset_transformation):
     aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).apply(
         stats_ops.latency_stats("record_latency")).apply(
             stats_ops.latency_stats("record_latency"))
     dataset = dataset_transformation(dataset, aggregator)
-    iterator = dataset.make_initializable_iterator()
-    next_element = iterator.get_next()
-    summary_t = aggregator.get_summary()
+    next_element = self.getNext(dataset, requires_initialization=True)
 
-    with self.cached_session() as sess:
-      self.evaluate(iterator.initializer)
-      for i in range(100):
-        self.assertEqual(i, self.evaluate(next_element))
-        self._assertSummaryHasCount(
-            self.evaluate(summary_t), "record_latency", float(2 * (i + 1)))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(next_element)
+    for i in range(100):
+      self.assertEqual(i, self.evaluate(next_element()))
       self._assertSummaryHasCount(
-          self.evaluate(summary_t), "record_latency", 200.0)
+          self.evaluate(aggregator.get_summary()), "record_latency",
+          float(2 * (i + 1)))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
+    self._assertSummaryHasCount(
+        self.evaluate(aggregator.get_summary()), "record_latency", 200.0)
 
-  @test_util.run_deprecated_v1
   def testMultipleIteratorsSameAggregator(self, dataset_transformation):
     aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).apply(
         stats_ops.latency_stats("record_latency"))
     dataset = dataset_transformation(dataset, aggregator)
-    iterator_0 = dataset.make_initializable_iterator()
-    iterator_1 = dataset.make_initializable_iterator()
-    next_element = iterator_0.get_next() + iterator_1.get_next()
-    summary_t = aggregator.get_summary()
+    next_element1 = self.getNext(dataset, requires_initialization=True)
+    next_element2 = self.getNext(dataset, requires_initialization=True)
 
-    with self.cached_session() as sess:
-      self.evaluate([iterator_0.initializer, iterator_1.initializer])
-      for i in range(100):
-        self.assertEqual(i * 2, self.evaluate(next_element))
-        self._assertSummaryHasCount(
-            self.evaluate(summary_t), "record_latency", float(2 * (i + 1)))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(next_element)
+    for i in range(100):
+      self.assertEqual(i * 2, self.evaluate(next_element1() + next_element2()))
       self._assertSummaryHasCount(
-          self.evaluate(summary_t), "record_latency", 200.0)
+          self.evaluate(aggregator.get_summary()), "record_latency",
+          float(2 * (i + 1)))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element1())
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element2())
+    self._assertSummaryHasCount(
+        self.evaluate(aggregator.get_summary()), "record_latency", 200.0)
 
-  @test_util.run_deprecated_v1
   def testMultipleDatasetWithPrefixes(self, dataset_transformation):
     aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).apply(
@@ -374,25 +331,27 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase):
     dataset2 = dataset_ops.Dataset.range(100).apply(
         stats_ops.latency_stats("record_latency"))
     dataset2 = dataset_transformation(dataset2, aggregator, prefix="dataset2")
-    iterator_0 = dataset.make_initializable_iterator()
-    iterator_1 = dataset2.make_initializable_iterator()
-    next_element = iterator_0.get_next() + iterator_1.get_next()
-    summary_t = aggregator.get_summary()
+    next_element1 = self.getNext(dataset, requires_initialization=True)
+    next_element2 = self.getNext(dataset2, requires_initialization=True)
 
-    with self.test_session() as sess:
-      self.evaluate([iterator_0.initializer, iterator_1.initializer])
-      for i in range(100):
-        self.assertEqual(i * 2, self.evaluate(next_element))
-        self._assertSummaryHasCount(
-            self.evaluate(summary_t), "dataset1_record_latency", float(i + 1))
-        self._assertSummaryHasCount(
-            self.evaluate(summary_t), "dataset2_record_latency", float(i + 1))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(next_element)
+    for i in range(100):
+      self.assertEqual(i * 2, self.evaluate(next_element1() + next_element2()))
       self._assertSummaryHasCount(
-          self.evaluate(summary_t), "dataset1_record_latency", 100.0)
+          self.evaluate(aggregator.get_summary()), "dataset1_record_latency",
+          float(i + 1))
       self._assertSummaryHasCount(
-          self.evaluate(summary_t), "dataset2_record_latency", 100.0)
+          self.evaluate(aggregator.get_summary()), "dataset2_record_latency",
+          float(i + 1))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element1())
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element2())
+    self._assertSummaryHasCount(
+        self.evaluate(aggregator.get_summary()), "dataset1_record_latency",
+        100.0)
+    self._assertSummaryHasCount(
+        self.evaluate(aggregator.get_summary()), "dataset2_record_latency",
+        100.0)
 
 
 @parameterized.named_parameters(
@@ -406,7 +365,6 @@ class FeatureStatsDatasetTest(
     stats_dataset_test_base.StatsDatasetTestBase,
     reader_dataset_ops_test_base.MakeBatchedFeaturesDatasetTestBase):
 
-  @test_util.run_deprecated_v1
   def testFeaturesStats(self, dataset_transformation):
     num_epochs = 5
     total_records = num_epochs * self._num_records
@@ -435,27 +393,26 @@ class FeatureStatsDatasetTest(
 
     dataset = dataset_transformation(
         dataset_fn(), aggregator, prefix="record_stats")
-    iterator = dataset.make_initializable_iterator()
-    next_element = iterator.get_next()
-    summary_t = aggregator.get_summary()
-
-    with self.test_session() as sess:
-      self.evaluate(iterator.initializer)
-      for _ in range(num_output):
-        self.evaluate(next_element)
 
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(next_element)
-      self._assertSummaryHasCount(
-          self.evaluate(summary_t), "record_stats_features", total_records)
-      self._assertSummaryHasCount(
-          self.evaluate(summary_t), "record_stats_feature-values",
-          total_records)
-      self._assertSummaryHasSum(
-          self.evaluate(summary_t), "record_stats_features", total_records * 4)
-      self._assertSummaryHasSum(
-          self.evaluate(summary_t), "record_stats_feature-values",
-          self._sum_keywords(1) * num_epochs + 3 * total_records)
+    next_element = self.getNext(dataset, requires_initialization=True)
+
+    for _ in range(num_output):
+      self.evaluate(next_element())
+
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
+    self._assertSummaryHasCount(
+        self.evaluate(aggregator.get_summary()), "record_stats_features",
+        total_records)
+    self._assertSummaryHasCount(
+        self.evaluate(aggregator.get_summary()), "record_stats_feature-values",
+        total_records)
+    self._assertSummaryHasSum(
+        self.evaluate(aggregator.get_summary()), "record_stats_features",
+        total_records * 4)
+    self._assertSummaryHasSum(
+        self.evaluate(aggregator.get_summary()), "record_stats_feature-values",
+        self._sum_keywords(1) * num_epochs + 3 * total_records)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/stats_dataset_test_base.py b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_test_base.py
index c5bf9267590b105bcb681455d9488d09451345b9..b80aab994e1754faccde5653de9149f32a5f862c 100644
--- a/tensorflow/python/data/experimental/kernel_tests/stats_dataset_test_base.py
+++ b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_test_base.py
@@ -93,27 +93,23 @@ class StatsDatasetTestBase(test_base.DatasetTestBase):
     aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_fn()
     dataset = dataset_transformation(dataset, aggregator)
-    iterator = dataset.make_initializable_iterator()
-    next_element = iterator.get_next()
-    summary_t = aggregator.get_summary()
+    next_element = self.getNext(dataset, requires_initialization=True)
 
-    with self.cached_session() as sess:
-      sess.run(iterator.initializer)
-      for i in range(num_output):
-        next_ = sess.run(next_element)
-        if check_elements:
-          self.assertAllEqual(np.array([i] * i, dtype=np.int64), next_)
-        summary_str = sess.run(summary_t)
-        if function_processing_time:
-          self._assertSummaryHasCountMoreOrEqualGeneralisedTag(
-              summary_str, "::execution_time", float(i + 1))
-        self._assertSummaryContains(summary_str,
-                                    dataset_name + "::num_parallel_calls")
-        self._assertSummaryContains(summary_str,
-                                    dataset_name + "::active_parallel_calls")
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+    for i in range(num_output):
+      next_ = self.evaluate(next_element())
+      if check_elements:
+        self.assertAllEqual(np.array([i] * i, dtype=np.int64), next_)
+      summary_str = self.evaluate(aggregator.get_summary())
       if function_processing_time:
-        summary_str = sess.run(summary_t)
         self._assertSummaryHasCountMoreOrEqualGeneralisedTag(
-            summary_str, "::execution_time", float(num_output))
+            summary_str, "::execution_time", float(i + 1))
+      self._assertSummaryContains(summary_str,
+                                  dataset_name + "::num_parallel_calls")
+      self._assertSummaryContains(summary_str,
+                                  dataset_name + "::active_parallel_calls")
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
+    if function_processing_time:
+      summary_str = self.evaluate(aggregator.get_summary())
+      self._assertSummaryHasCountMoreOrEqualGeneralisedTag(
+          summary_str, "::execution_time", float(num_output))
diff --git a/tensorflow/python/data/experimental/kernel_tests/tf_record_writer_test.py b/tensorflow/python/data/experimental/kernel_tests/tf_record_writer_test.py
index 8fd0ad50c4483ab321f391d403a2c8bf6ab48b7d..14a4241ec2e6930622aaf9e35ae70e18eaaa004f 100644
--- a/tensorflow/python/data/experimental/kernel_tests/tf_record_writer_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/tf_record_writer_test.py
@@ -23,26 +23,24 @@ from tensorflow.python.data.experimental.ops import writers
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import readers
-from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.lib.io import python_io
 from tensorflow.python.lib.io import tf_record
-from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 from tensorflow.python.util import compat
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class TFRecordWriterTest(test_base.DatasetTestBase):
 
   def setUp(self):
     super(TFRecordWriterTest, self).setUp()
     self._num_records = 7
-    self.filename = array_ops.placeholder(dtypes.string, shape=[])
-    self.compression_type = array_ops.placeholder_with_default("", shape=[])
 
-    input_dataset = readers.TFRecordDataset([self.filename],
-                                            self.compression_type)
-    self.writer = writers.TFRecordWriter(
-        self._outputFilename(), self.compression_type).write(input_dataset)
+  def writer_fn(self, filename, compression_type=""):
+    input_dataset = readers.TFRecordDataset([filename], compression_type)
+    return writers.TFRecordWriter(self._outputFilename(),
+                                  compression_type).write(input_dataset)
 
   def _record(self, i):
     return compat.as_bytes("Record %d" % (i))
@@ -62,56 +60,39 @@ class TFRecordWriterTest(test_base.DatasetTestBase):
     return os.path.join(self.get_temp_dir(), "tf_record.out.txt")
 
   def testWrite(self):
-    with self.cached_session() as sess:
-      sess.run(
-          self.writer, feed_dict={
-              self.filename: self._createFile(),
-          })
+    self.evaluate(self.writer_fn(self._createFile()))
     for i, r in enumerate(tf_record.tf_record_iterator(self._outputFilename())):
       self.assertAllEqual(self._record(i), r)
 
   def testWriteZLIB(self):
     options = tf_record.TFRecordOptions(tf_record.TFRecordCompressionType.ZLIB)
-    with self.cached_session() as sess:
-      sess.run(
-          self.writer,
-          feed_dict={
-              self.filename: self._createFile(options),
-              self.compression_type: "ZLIB",
-          })
+    self.evaluate(
+        self.writer_fn(self._createFile(options), compression_type="ZLIB"))
     for i, r in enumerate(
         tf_record.tf_record_iterator(self._outputFilename(), options=options)):
       self.assertAllEqual(self._record(i), r)
 
   def testWriteGZIP(self):
     options = tf_record.TFRecordOptions(tf_record.TFRecordCompressionType.GZIP)
-    with self.cached_session() as sess:
-      sess.run(
-          self.writer,
-          feed_dict={
-              self.filename: self._createFile(options),
-              self.compression_type: "GZIP",
-          })
+    self.evaluate(
+        self.writer_fn(self._createFile(options), compression_type="GZIP"))
     for i, r in enumerate(
         tf_record.tf_record_iterator(self._outputFilename(), options=options)):
       self.assertAllEqual(self._record(i), r)
 
   def testFailDataset(self):
     with self.assertRaises(TypeError):
-      writers.TFRecordWriter(self._outputFilename(),
-                             self.compression_type).write("whoops")
+      writers.TFRecordWriter(self._outputFilename(), "").write("whoops")
 
   def testFailDType(self):
     input_dataset = dataset_ops.Dataset.from_tensors(10)
     with self.assertRaises(TypeError):
-      writers.TFRecordWriter(self._outputFilename(),
-                             self.compression_type).write(input_dataset)
+      writers.TFRecordWriter(self._outputFilename(), "").write(input_dataset)
 
   def testFailShape(self):
     input_dataset = dataset_ops.Dataset.from_tensors([["hello"], ["world"]])
     with self.assertRaises(TypeError):
-      writers.TFRecordWriter(self._outputFilename(),
-                             self.compression_type).write(input_dataset)
+      writers.TFRecordWriter(self._outputFilename(), "").write(input_dataset)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/unbatch_test.py b/tensorflow/python/data/experimental/kernel_tests/unbatch_test.py
index 9c6830d9934350603a39c8a64ad860b726b62aba..e4034cc43a0cbc6cd0c35595a8a4ca944ca4d07e 100644
--- a/tensorflow/python/data/experimental/kernel_tests/unbatch_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/unbatch_test.py
@@ -36,24 +36,14 @@ from tensorflow.python.platform import test
 from tensorflow.python.util import compat
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase):
 
-  @test_util.run_deprecated_v1
   def testUnbatchWithUnknownRankInput(self):
-    placeholder = array_ops.placeholder(dtypes.int32)
-    dataset = dataset_ops.Dataset.from_tensors(placeholder).apply(
-        batching.unbatch())
-    iterator = dataset.make_initializable_iterator()
-    next_elem = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(iterator.initializer, feed_dict={placeholder: [0, 1, 2, 3]})
-      for i in range(4):
-        self.assertEqual(i, self.evaluate(next_elem))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(next_elem)
+    dataset = dataset_ops.Dataset.from_tensors([0, 1, 2,
+                                                3]).apply(batching.unbatch())
+    self.assertDatasetProduces(dataset, range(4))
 
-  @test_util.run_deprecated_v1
   def testUnbatchScalarDataset(self):
     data = tuple([math_ops.range(10) for _ in range(3)])
     data = dataset_ops.Dataset.from_tensor_slices(data)
@@ -63,17 +53,8 @@ class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase):
     data = data.apply(batching.unbatch())
     self.assertEqual(expected_types, data.output_types)
 
-    iterator = data.make_one_shot_iterator()
-    op = iterator.get_next()
+    self.assertDatasetProduces(data, [(i,) * 3 for i in range(10)])
 
-    with self.cached_session() as sess:
-      for i in range(10):
-        self.assertEqual((i,) * 3, self.evaluate(op))
-
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(op)
-
-  @test_util.run_deprecated_v1
   def testUnbatchDatasetWithStrings(self):
     data = tuple([math_ops.range(10) for _ in range(3)])
     data = dataset_ops.Dataset.from_tensor_slices(data)
@@ -84,18 +65,12 @@ class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase):
     data = data.apply(batching.unbatch())
     self.assertEqual(expected_types, data.output_types)
 
-    iterator = data.make_one_shot_iterator()
-    op = iterator.get_next()
-
-    with self.cached_session() as sess:
-      for i in range(10):
-        self.assertEqual((i, compat.as_bytes(str(i)), i), self.evaluate(op))
-
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(op)
+    self.assertDatasetProduces(
+        data, [(i, compat.as_bytes(str(i)), i) for i in range(10)])
 
+  # TODO(b/117581999): Add eager coverage.
   @test_util.run_deprecated_v1
-  def testUnbatchDatasetWithSparseTensor(self):
+  def testSkipEagerUnbatchDatasetWithSparseTensor(self):
     st = sparse_tensor.SparseTensorValue(
         indices=[[i, i] for i in range(10)],
         values=list(range(10)),
@@ -104,20 +79,20 @@ class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase):
     data = data.apply(batching.unbatch())
     data = data.batch(5)
     data = data.apply(batching.unbatch())
-    iterator = data.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(data)
     next_element = iterator.get_next()
 
-    with self.cached_session() as sess:
-      for i in range(10):
-        st_row = self.evaluate(next_element)
-        self.assertEqual([i], st_row.indices)
-        self.assertEqual([i], st_row.values)
-        self.assertEqual([10], st_row.dense_shape)
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(next_element)
+    for i in range(10):
+      st_row = self.evaluate(next_element)
+      self.assertEqual([i], st_row.indices)
+      self.assertEqual([i], st_row.values)
+      self.assertEqual([10], st_row.dense_shape)
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element)
 
+  # TODO(b/117581999): Add eager coverage.
   @test_util.run_deprecated_v1
-  def testUnbatchDatasetWithDenseAndSparseTensor(self):
+  def testSkipEagerUnbatchDatasetWithDenseAndSparseTensor(self):
     st = sparse_tensor.SparseTensorValue(
         indices=[[i, i] for i in range(10)],
         values=list(range(10)),
@@ -126,20 +101,17 @@ class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase):
     data = data.apply(batching.unbatch())
     data = data.batch(5)
     data = data.apply(batching.unbatch())
-    iterator = data.make_one_shot_iterator()
-    next_element = iterator.get_next()
+    next_element = self.getNext(data)
 
-    with self.cached_session() as sess:
-      for i in range(10):
-        dense_elem, st_row = self.evaluate(next_element)
-        self.assertEqual(i, dense_elem)
-        self.assertEqual([i], st_row.indices)
-        self.assertEqual([i], st_row.values)
-        self.assertEqual([10], st_row.dense_shape)
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(next_element)
+    for i in range(10):
+      dense_elem, st_row = self.evaluate(next_element())
+      self.assertEqual(i, dense_elem)
+      self.assertEqual([i], st_row.indices)
+      self.assertEqual([i], st_row.values)
+      self.assertEqual([10], st_row.dense_shape)
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
 
-  @test_util.run_deprecated_v1
   def testUnbatchSingleElementTupleDataset(self):
     data = tuple([(math_ops.range(10),) for _ in range(3)])
     data = dataset_ops.Dataset.from_tensor_slices(data)
@@ -149,17 +121,8 @@ class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase):
     data = data.apply(batching.unbatch())
     self.assertEqual(expected_types, data.output_types)
 
-    iterator = data.make_one_shot_iterator()
-    op = iterator.get_next()
-
-    with self.cached_session() as sess:
-      for i in range(10):
-        self.assertEqual(((i,),) * 3, self.evaluate(op))
+    self.assertDatasetProduces(data, [((i,),) * 3 for i in range(10)])
 
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(op)
-
-  @test_util.run_deprecated_v1
   def testUnbatchMultiElementTupleDataset(self):
     data = tuple([(math_ops.range(10 * i, 10 * i + 10),
                    array_ops.fill([10], "hi")) for i in range(3)])
@@ -170,29 +133,16 @@ class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase):
     data = data.apply(batching.unbatch())
     self.assertAllEqual(expected_types, data.output_types)
 
-    iterator = data.make_one_shot_iterator()
-    op = iterator.get_next()
-
-    with self.cached_session() as sess:
-      for i in range(10):
-        self.assertEqual(((i, b"hi"), (10 + i, b"hi"), (20 + i, b"hi")),
-                         self.evaluate(op))
-
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(op)
+    self.assertDatasetProduces(
+        data,
+        [((i, b"hi"), (10 + i, b"hi"), (20 + i, b"hi")) for i in range(10)])
 
-  @test_util.run_deprecated_v1
   def testUnbatchEmpty(self):
     data = dataset_ops.Dataset.from_tensors(
         (constant_op.constant([]), constant_op.constant([], shape=[0, 4]),
          constant_op.constant([], shape=[0, 4, 0])))
     data = data.apply(batching.unbatch())
-    iterator = data.make_one_shot_iterator()
-    next_element = iterator.get_next()
-
-    with self.cached_session() as sess:
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(next_element)
+    self.assertDatasetProduces(data, [])
 
   def testUnbatchStaticShapeMismatch(self):
     data = dataset_ops.Dataset.from_tensors((np.arange(7), np.arange(8),
@@ -200,13 +150,14 @@ class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase):
     with self.assertRaises(ValueError):
       data.apply(batching.unbatch())
 
+  # TODO(b/117581999): eager mode doesnt capture raised error, debug.
   @test_util.run_deprecated_v1
-  def testUnbatchDynamicShapeMismatch(self):
+  def testSkipEagerUnbatchDynamicShapeMismatch(self):
     ph1 = array_ops.placeholder(dtypes.int32, shape=[None])
     ph2 = array_ops.placeholder(dtypes.int32, shape=None)
     data = dataset_ops.Dataset.from_tensors((ph1, ph2))
     data = data.apply(batching.unbatch())
-    iterator = data.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(data)
     next_element = iterator.get_next()
 
     with self.cached_session() as sess:
diff --git a/tensorflow/python/data/experimental/kernel_tests/unique_test.py b/tensorflow/python/data/experimental/kernel_tests/unique_test.py
index ddec968858aa9234d019b28d906b24f95c7f7646..42d76a2eb3013625e7807d1f50dd19809a7cd3e4 100644
--- a/tensorflow/python/data/experimental/kernel_tests/unique_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/unique_test.py
@@ -21,12 +21,12 @@ from tensorflow.python.data.experimental.ops import unique
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 from tensorflow.python.util import compat
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class UniqueTest(test_base.DatasetTestBase):
 
   def _testSimpleHelper(self, dtype, test_cases):
@@ -44,19 +44,13 @@ class UniqueTest(test_base.DatasetTestBase):
     current_test_case = []
     dataset = dataset_ops.Dataset.from_generator(lambda: current_test_case,
                                                  dtype).apply(unique.unique())
-    iterator = dataset.make_initializable_iterator()
-    next_element = iterator.get_next()
 
-    with self.cached_session() as sess:
-      for test_case, expected in test_cases:
-        current_test_case = test_case
-        self.evaluate(iterator.initializer)
-        for element in expected:
-          if dtype == dtypes.string:
-            element = compat.as_bytes(element)
-          self.assertAllEqual(element, self.evaluate(next_element))
-        with self.assertRaises(errors.OutOfRangeError):
-          self.evaluate(next_element)
+    for test_case, expected in test_cases:
+      current_test_case = test_case
+      self.assertDatasetProduces(dataset, [
+          compat.as_bytes(element) if dtype == dtypes.string else element
+          for element in expected
+      ])
 
   @test_util.run_deprecated_v1
   def testSimpleInt(self):
diff --git a/tensorflow/python/data/experimental/kernel_tests/wrap_unwrap_test.py b/tensorflow/python/data/experimental/kernel_tests/wrap_unwrap_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a8f50501517c24d5aea78d7dda18240f54921197
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/wrap_unwrap_test.py
@@ -0,0 +1,68 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Wrapping / Unwrapping dataset variants."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class WrapDatasetVariantTest(test_base.DatasetTestBase):
+
+  def testBasic(self):
+    ds = dataset_ops.Dataset.range(100)
+    ds_variant = ds._as_variant_tensor()  # pylint: disable=protected-access
+
+    wrapped_variant = gen_dataset_ops.wrap_dataset_variant(ds_variant)
+    unwrapped_variant = gen_dataset_ops.unwrap_dataset_variant(wrapped_variant)
+
+    variant_ds = dataset_ops._VariantDataset(unwrapped_variant,
+                                             ds._element_structure)
+    get_next = self.getNext(variant_ds, requires_initialization=True)
+    for i in range(100):
+      self.assertEqual(i, self.evaluate(get_next()))
+
+  # TODO(b/117581999): add eager coverage when supported.
+  def testSkipEagerGPU(self):
+    ds = dataset_ops.Dataset.range(100)
+    ds_variant = ds._as_variant_tensor()  # pylint: disable=protected-access
+    wrapped_variant = gen_dataset_ops.wrap_dataset_variant(ds_variant)
+
+    with ops.device("/gpu:0"):
+      gpu_wrapped_variant = array_ops.identity(wrapped_variant)
+
+    unwrapped_variant = gen_dataset_ops.unwrap_dataset_variant(
+        gpu_wrapped_variant)
+    variant_ds = dataset_ops._VariantDataset(unwrapped_variant,
+                                             ds._element_structure)
+    iterator = dataset_ops.make_initializable_iterator(variant_ds)
+    get_next = iterator.get_next()
+
+    with self.cached_session():
+      self.evaluate(iterator.initializer)
+      for i in range(100):
+        self.assertEqual(i, self.evaluate(get_next))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/ops/BUILD b/tensorflow/python/data/experimental/ops/BUILD
index 50f5127833e607c9fde87f1812019f6f6869e7b3..60c20e0bcf2d875a15ffcc4c42d10cb6e0cc25ea 100644
--- a/tensorflow/python/data/experimental/ops/BUILD
+++ b/tensorflow/python/data/experimental/ops/BUILD
@@ -4,6 +4,16 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
+py_library(
+    name = "cardinality",
+    srcs = ["cardinality.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:experimental_dataset_ops_gen",
+        "//tensorflow/python:tensor_util",
+    ],
+)
+
 py_library(
     name = "counter",
     srcs = ["counter.py"],
@@ -54,14 +64,13 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/python:constant_op",
-        "//tensorflow/python:dataset_ops_gen",
         "//tensorflow/python:dtypes",
+        "//tensorflow/python:experimental_dataset_ops_gen",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:random_seed",
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/util:nest",
-        "//tensorflow/python/data/util:sparse",
+        "//tensorflow/python/data/util:structure",
     ],
 )
 
@@ -125,6 +134,7 @@ py_library(
         "//tensorflow/python/data/util:convert",
         "//tensorflow/python/data/util:nest",
         "//tensorflow/python/data/util:sparse",
+        "//tensorflow/python/data/util:structure",
         "//third_party/py/numpy",
     ],
 )
@@ -219,6 +229,8 @@ py_library(
         "//tensorflow/python:dataset_ops_gen",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:tensor_shape",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:structure",
     ],
 )
 
@@ -257,7 +269,7 @@ py_library(
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:structure",
     ],
 )
 
@@ -286,12 +298,13 @@ py_library(
     srcs = ["scan_ops.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:experimental_dataset_ops_gen",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:function",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/util:nest",
         "//tensorflow/python/data/util:sparse",
+        "//tensorflow/python/data/util:structure",
     ],
 )
 
@@ -422,6 +435,7 @@ py_library(
     name = "dataset_ops",
     deps = [
         ":batching",
+        ":cardinality",
         ":counter",
         ":enumerate_ops",
         ":error_ops",
diff --git a/tensorflow/python/data/experimental/ops/batching.py b/tensorflow/python/data/experimental/ops/batching.py
index d8985fd13bf3e976764654f83cf02eb464254d18..29df98f4ea4c90d80f3518684febacc101ec2ba5 100644
--- a/tensorflow/python/data/experimental/ops/batching.py
+++ b/tensorflow/python/data/experimental/ops/batching.py
@@ -24,17 +24,18 @@ from tensorflow.python.data.experimental.ops import grouping
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import convert
 from tensorflow.python.data.util import nest
-from tensorflow.python.data.util import sparse
+from tensorflow.python.data.util import structure
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_array_ops
-from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.util.tf_export import tf_export
@@ -364,23 +365,19 @@ class _UnbatchDataset(dataset_ops.UnaryDataset):
                          "different batch sizes.")
     self._input_dataset = input_dataset
 
+    self._structure = structure.convert_legacy_structure(
+        input_dataset.output_types,
+        nest.map_structure(lambda s: s[1:], input_dataset.output_shapes),
+        input_dataset.output_classes)
+
   def _as_variant_tensor(self):
-    return gen_dataset_ops.unbatch_dataset(
+    return ged_ops.experimental_unbatch_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         **dataset_ops.flat_structure(self))
 
   @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return nest.map_structure(lambda s: s[1:],
-                              self._input_dataset.output_shapes)
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
+  def _element_structure(self):
+    return self._structure
 
 
 @tf_export("data.experimental.unbatch")
@@ -408,21 +405,19 @@ def unbatch():
 
   def _apply_fn(dataset):
     """Function from `Dataset` to `Dataset` that applies the transformation."""
-    if not sparse.any_sparse(dataset.output_classes):
-      return _UnbatchDataset(dataset)
-
     # NOTE(mrry): We must ensure that any SparseTensors in `dataset`
     # are normalized to the rank-1 dense representation, so that the
     # sparse-oblivious unbatching logic will slice them
     # appropriately. This leads to a somewhat inefficient re-encoding step
     # for all SparseTensor components.
-    # TODO(mrry): Consider optimizing this in future
-    # if it turns out to be a bottleneck.
+    # TODO(mrry): Consider optimizing this in future if it turns out to be
+    # a bottleneck.
     def normalize(arg, *rest):
+      # pylint: disable=protected-access
       if rest:
-        return sparse.serialize_many_sparse_tensors((arg,) + rest)
+        return dataset._element_structure._to_batched_tensor_list((arg,) + rest)
       else:
-        return sparse.serialize_many_sparse_tensors(arg)
+        return dataset._element_structure._to_batched_tensor_list(arg)
 
     normalized_dataset = dataset.map(normalize)
 
@@ -453,25 +448,20 @@ class _DenseToSparseBatchDataset(dataset_ops.UnaryDataset):
     self._input_dataset = input_dataset
     self._batch_size = batch_size
     self._row_shape = row_shape
+    self._structure = structure.SparseTensorStructure(
+        input_dataset.output_types,
+        tensor_shape.vector(None).concatenate(self._row_shape))
 
   def _as_variant_tensor(self):
-    return gen_dataset_ops.dense_to_sparse_batch_dataset(
+    return ged_ops.experimental_dense_to_sparse_batch_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         self._batch_size,
         row_shape=convert.partial_shape_to_tensor(self._row_shape),
         **dataset_ops.flat_structure(self))
 
   @property
-  def output_classes(self):
-    return sparse_tensor.SparseTensor
-
-  @property
-  def output_shapes(self):
-    return tensor_shape.vector(None).concatenate(self._row_shape)
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
+  def _element_structure(self):
+    return self._structure
 
 
 class _RestructuredDataset(dataset_ops.UnaryDataset):
@@ -522,13 +512,10 @@ class _RestructuredDataset(dataset_ops.UnaryDataset):
             "Dataset with output types %r cannot be restructured to have "
             "output types %r" % (dataset.output_types, output_types))
 
-    self._output_types = output_types
-
     if output_shapes is None:
       # Inherit shapes from the original `dataset`.
-      self._output_shapes = nest.pack_sequence_as(output_types,
-                                                  nest.flatten(
-                                                      dataset.output_shapes))
+      output_shapes = nest.pack_sequence_as(
+          output_types, nest.flatten(dataset.output_shapes))
     else:
       if not allow_unsafe_cast:
         # Validate that the shapes are compatible.
@@ -543,39 +530,34 @@ class _RestructuredDataset(dataset_ops.UnaryDataset):
                 "Dataset with output shapes %r cannot be restructured to have "
                 "incompatible output shapes %r" % (dataset.output_shapes,
                                                    output_shapes))
-      self._output_shapes = nest.map_structure_up_to(
+      output_shapes = nest.map_structure_up_to(
           output_types, tensor_shape.as_shape, output_shapes)
     if output_classes is None:
       # Inherit class types from the original `dataset`.
-      self._output_classes = nest.pack_sequence_as(output_types,
-                                                   nest.flatten(
-                                                       dataset.output_classes))
-    else:
-      self._output_classes = output_classes
+      output_classes = nest.pack_sequence_as(
+          output_types, nest.flatten(dataset.output_classes))
+
+    self._structure = structure.convert_legacy_structure(
+        output_types, output_shapes, output_classes)
 
   def _as_variant_tensor(self):
     return self._input_dataset._as_variant_tensor()  # pylint: disable=protected-access
 
   @property
-  def output_classes(self):
-    return self._output_classes
+  def _element_structure(self):
+    return self._structure
 
-  @property
-  def output_types(self):
-    return self._output_types
-
-  @property
-  def output_shapes(self):
-    return self._output_shapes
 
-
-class _MapAndBatchDataset(dataset_ops.MapDataset):
+class _MapAndBatchDataset(dataset_ops.UnaryDataset):
   """A `Dataset` that maps a function over a batch of elements."""
 
   def __init__(self, input_dataset, map_func, batch_size, num_parallel_calls,
                drop_remainder):
     """See `Dataset.map()` for details."""
-    super(_MapAndBatchDataset, self).__init__(input_dataset, map_func)
+    super(_MapAndBatchDataset, self).__init__(input_dataset)
+    self._input_dataset = input_dataset
+    self._map_func = dataset_ops.StructuredFunctionWrapper(
+        map_func, "tf.data.experimental.map_and_batch()", dataset=input_dataset)
     self._batch_size_t = ops.convert_to_tensor(
         batch_size, dtype=dtypes.int64, name="batch_size")
     self._num_parallel_calls_t = ops.convert_to_tensor(
@@ -583,36 +565,33 @@ class _MapAndBatchDataset(dataset_ops.MapDataset):
     self._drop_remainder_t = ops.convert_to_tensor(
         drop_remainder, dtype=dtypes.bool, name="drop_remainder")
 
-    self._batch_size = batch_size
-    self._drop_remainder = drop_remainder
+    constant_drop_remainder = tensor_util.constant_value(self._drop_remainder_t)
+    if constant_drop_remainder:
+      # NOTE(mrry): `constant_drop_remainder` may be `None` (unknown statically)
+      # or `False` (explicitly retaining the remainder).
+      self._structure = self._map_func.output_structure._batch(  # pylint: disable=protected-access
+          tensor_util.constant_value(self._batch_size_t))
+    else:
+      self._structure = self._map_func.output_structure._batch(None)  # pylint: disable=protected-access
+
+  def _functions(self):
+    return [self._map_func]
 
   def _as_variant_tensor(self):
     # pylint: disable=protected-access
-    input_resource = self._input_dataset._as_variant_tensor()
-    return gen_dataset_ops.map_and_batch_dataset_v2(
-        input_resource,
-        self._map_func.captured_inputs,
-        f=self._map_func,
+    return ged_ops.experimental_map_and_batch_dataset(
+        self._input_dataset._as_variant_tensor(),
+        self._map_func.function.captured_inputs,
+        f=self._map_func.function,
         batch_size=self._batch_size_t,
         num_parallel_calls=self._num_parallel_calls_t,
         drop_remainder=self._drop_remainder_t,
+        preserve_cardinality=True,
         **dataset_ops.flat_structure(self))
-    # pylint: enable=protected-access
-
-  @property
-  def output_shapes(self):
-    dim = self._batch_size if self._drop_remainder else None
-    return nest.pack_sequence_as(self._output_shapes, [
-        tensor_shape.vector(dim).concatenate(s)
-        for s in nest.flatten(self._output_shapes)
-    ])
 
   @property
-  def output_types(self):
-    return self._output_types
-
-  def _transformation_name(self):
-    return "tf.data.experimental.map_and_batch()"
+  def _element_structure(self):
+    return self._structure
 
 
 @tf_export("data.experimental.map_and_batch")
@@ -644,9 +623,10 @@ def map_and_batch(map_func,
       whether the last batch should be dropped in case its size is smaller than
       desired; the default behavior is not to drop the smaller batch.
     num_parallel_calls: (Optional.) A `tf.int32` scalar `tf.Tensor`,
-        representing the number of elements to process in parallel. If not
-        specified, `batch_size * num_parallel_batches` elements will be
-        processed in parallel.
+      representing the number of elements to process in parallel. If not
+      specified, `batch_size * num_parallel_batches` elements will be processed
+      in parallel. If the value `tf.data.experimental.AUTOTUNE` is used, then
+      the number of parallel calls is set dynamically based on available CPU.
 
   Returns:
     A `Dataset` transformation function, which can be passed to
diff --git a/tensorflow/python/data/experimental/ops/cardinality.py b/tensorflow/python/data/experimental/ops/cardinality.py
new file mode 100644
index 0000000000000000000000000000000000000000..9cf0a8801e8339f233eb61c8e0b1223b8b94358b
--- /dev/null
+++ b/tensorflow/python/data/experimental/ops/cardinality.py
@@ -0,0 +1,50 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Cardinality analysis of `Dataset` objects."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
+from tensorflow.python.util.tf_export import tf_export
+
+
+INFINITE = -1
+UNKNOWN = -2
+tf_export("data.experimental.INFINITE_CARDINALITY").export_constant(
+    __name__, "INFINITE")
+tf_export("data.experimental.UNKNOWN_CARDINALITY").export_constant(
+    __name__, "UNKNOWN")
+
+
+@tf_export("data.experimental.cardinality")
+def cardinality(dataset):
+  """Returns the cardinality of `dataset`, if known.
+
+  The operation returns the cardinality of `dataset`. The operation may return
+  `tf.data.experimental.INFINITE_CARDINALITY` if `dataset` contains an infinite
+  number of elements or `tf.data.experimental.UNKNOWN_CARDINALITY` if the
+  analysis fails to determine the number of elements in `dataset` (e.g. when the
+  dataset source is a file).
+
+  Args:
+    dataset: A `tf.data.Dataset` for which to determine cardinality.
+
+  Returns:
+    A scalar `tf.int64` `Tensor` representing the cardinality of `dataset`. If
+    the cardinality is infinite or unknown, the operation returns the named
+    constant `INFINITE_CARDINALITY` and `UNKNOWN_CARDINALITY` respectively.
+  """
+  return ged_ops.experimental_dataset_cardinality(dataset._as_variant_tensor())  # pylint: disable=protected-access
diff --git a/tensorflow/python/data/experimental/ops/get_single_element.py b/tensorflow/python/data/experimental/ops/get_single_element.py
index 73116edf1288bf252721a5f96cf69b8d590dff14..d649a0701270c55d399af140f5e2bae79484fec2 100644
--- a/tensorflow/python/data/experimental/ops/get_single_element.py
+++ b/tensorflow/python/data/experimental/ops/get_single_element.py
@@ -18,8 +18,6 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.util import nest
-from tensorflow.python.data.util import sparse
 from tensorflow.python.ops import gen_dataset_ops
 from tensorflow.python.util.tf_export import tf_export
 
@@ -63,10 +61,8 @@ def get_single_element(dataset):
   if not isinstance(dataset, dataset_ops.DatasetV2):
     raise TypeError("`dataset` must be a `tf.data.Dataset` object.")
 
-  nested_ret = nest.pack_sequence_as(
-      dataset.output_types, gen_dataset_ops.dataset_to_single_element(
-          dataset._as_variant_tensor(),  # pylint: disable=protected-access
+  # pylint: disable=protected-access
+  return dataset._element_structure._from_compatible_tensor_list(
+      gen_dataset_ops.dataset_to_single_element(
+          dataset._as_variant_tensor(),
           **dataset_ops.flat_structure(dataset)))
-  return sparse.deserialize_sparse_tensors(
-      nested_ret, dataset.output_types, dataset.output_shapes,
-      dataset.output_classes)
diff --git a/tensorflow/python/data/experimental/ops/grouping.py b/tensorflow/python/data/experimental/ops/grouping.py
index 2eef36d0c89a2901d0165ff5b49d90cceae53a46..7a144aa99cb114403b60dca12f0a7173d26b7161 100644
--- a/tensorflow/python/data/experimental/ops/grouping.py
+++ b/tensorflow/python/data/experimental/ops/grouping.py
@@ -28,7 +28,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
 
@@ -243,29 +243,6 @@ def bucket_by_sequence_length(element_length_func,
     return _apply_fn
 
 
-def _map_x_dataset(map_func):
-  """A transformation that maps `map_func` across its input.
-
-  This transformation is similar to `tf.data.Dataset.map`, but in addition to
-  supporting dense and sparse tensor inputs, it also supports dataset inputs.
-
-  Args:
-    map_func: A function mapping a nested structure of tensors and/or datasets
-      (having shapes and types defined by `self.output_shapes` and
-     `self.output_types`) to another nested structure of tensors and/or
-     datasets.
-
-  Returns:
-    Dataset: A `Dataset`.
-  """
-
-  def _apply_fn(dataset):
-    """Function from `Dataset` to `Dataset` that applies the transformation."""
-    return _MapXDataset(dataset, map_func)
-
-  return _apply_fn
-
-
 class _GroupByReducerDataset(dataset_ops.UnaryDataset):
   """A `Dataset` that groups its input and performs a reduction."""
 
@@ -282,50 +259,44 @@ class _GroupByReducerDataset(dataset_ops.UnaryDataset):
 
   def _make_key_func(self, key_func, input_dataset):
     """Make wrapping defun for key_func."""
-    wrapped_func = dataset_ops.StructuredFunctionWrapper(
+    self._key_func = dataset_ops.StructuredFunctionWrapper(
         key_func, self._transformation_name(), dataset=input_dataset)
-    if not (
-        wrapped_func.output_types == dtypes.int64 and
-        wrapped_func.output_shapes.is_compatible_with(tensor_shape.scalar())):
+    if not self._key_func.output_structure.is_compatible_with(
+        structure.TensorStructure(dtypes.int64, [])):
       raise ValueError(
           "`key_func` must return a single tf.int64 tensor. "
           "Got type=%s and shape=%s"
-          % (wrapped_func.output_types, wrapped_func.output_shapes))
-    self._key_func = wrapped_func.function
-
+          % (self._key_func.output_types, self._key_func.output_shapes))
   def _make_init_func(self, init_func):
     """Make wrapping defun for init_func."""
-    wrapped_func = dataset_ops.StructuredFunctionWrapper(
+    self._init_func = dataset_ops.StructuredFunctionWrapper(
         init_func,
         self._transformation_name(),
-        input_classes=ops.Tensor,
-        input_shapes=tensor_shape.scalar(),
-        input_types=dtypes.int64)
-    self._init_func = wrapped_func.function
-    self._state_classes = wrapped_func.output_classes
-    self._state_shapes = wrapped_func.output_shapes
-    self._state_types = wrapped_func.output_types
+        input_structure=structure.TensorStructure(dtypes.int64, []))
 
   def _make_reduce_func(self, reduce_func, input_dataset):
     """Make wrapping defun for reduce_func."""
 
     # Iteratively rerun the reduce function until reaching a fixed point on
-    # `self._state_shapes`.
+    # `self._state_structure`.
+    self._state_structure = self._init_func.output_structure
+    state_types = self._init_func.output_types
+    state_shapes = self._init_func.output_shapes
+    state_classes = self._init_func.output_classes
     need_to_rerun = True
     while need_to_rerun:
 
       wrapped_func = dataset_ops.StructuredFunctionWrapper(
           reduce_func,
           self._transformation_name(),
-          input_classes=(self._state_classes, input_dataset.output_classes),
-          input_shapes=(self._state_shapes, input_dataset.output_shapes),
-          input_types=(self._state_types, input_dataset.output_types),
+          input_structure=structure.NestedStructure(
+              (self._state_structure, input_dataset._element_structure)),  # pylint: disable=protected-access
           add_to_graph=False)
 
       # Extract and validate class information from the returned values.
       for new_state_class, state_class in zip(
           nest.flatten(wrapped_func.output_classes),
-          nest.flatten(self._state_classes)):
+          nest.flatten(state_classes)):
         if not issubclass(new_state_class, state_class):
           raise TypeError(
               "The element classes for the new state must match the initial "
@@ -334,16 +305,15 @@ class _GroupByReducerDataset(dataset_ops.UnaryDataset):
 
       # Extract and validate type information from the returned values.
       for new_state_type, state_type in zip(
-          nest.flatten(wrapped_func.output_types),
-          nest.flatten(self._state_types)):
+          nest.flatten(wrapped_func.output_types), nest.flatten(state_types)):
         if new_state_type != state_type:
           raise TypeError(
               "The element types for the new state must match the initial "
               "state. Expected %s; got %s." %
-              (self._state_types, wrapped_func.output_types))
+              (self._init_func.output_types, wrapped_func.output_types))
 
       # Extract shape information from the returned values.
-      flat_state_shapes = nest.flatten(self._state_shapes)
+      flat_state_shapes = nest.flatten(state_shapes)
       flat_new_state_shapes = nest.flatten(wrapped_func.output_shapes)
       weakened_state_shapes = [
           original.most_specific_compatible_shape(new)
@@ -360,48 +330,40 @@ class _GroupByReducerDataset(dataset_ops.UnaryDataset):
           break
 
       if need_to_rerun:
-        self._state_shapes = nest.pack_sequence_as(self._state_shapes,
-                                                   weakened_state_shapes)
+        state_shapes = nest.pack_sequence_as(
+            self._init_func.output_shapes, weakened_state_shapes)
+        self._state_structure = structure.convert_legacy_structure(
+            state_types, state_shapes, state_classes)
 
-    self._reduce_func = wrapped_func.function
-    self._reduce_func.add_to_graph(ops.get_default_graph())
+    self._reduce_func = wrapped_func
+    self._reduce_func.function.add_to_graph(ops.get_default_graph())
 
   def _make_finalize_func(self, finalize_func):
     """Make wrapping defun for finalize_func."""
-    wrapped_func = dataset_ops.StructuredFunctionWrapper(
-        finalize_func,
-        self._transformation_name(),
-        input_classes=self._state_classes,
-        input_shapes=self._state_shapes,
-        input_types=self._state_types)
-    self._finalize_func = wrapped_func.function
-    self._output_classes = wrapped_func.output_classes
-    self._output_shapes = wrapped_func.output_shapes
-    self._output_types = wrapped_func.output_types
+    self._finalize_func = dataset_ops.StructuredFunctionWrapper(
+        finalize_func, self._transformation_name(),
+        input_structure=self._state_structure)
 
   @property
-  def output_classes(self):
-    return self._output_classes
+  def _element_structure(self):
+    return self._finalize_func.output_structure
 
-  @property
-  def output_shapes(self):
-    return self._output_shapes
-
-  @property
-  def output_types(self):
-    return self._output_types
+  def _functions(self):
+    return [
+        self._key_func, self._init_func, self._reduce_func, self._finalize_func
+    ]
 
   def _as_variant_tensor(self):
-    return gen_dataset_ops.group_by_reducer_dataset(
+    return ged_ops.experimental_group_by_reducer_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
-        self._key_func.captured_inputs,
-        self._init_func.captured_inputs,
-        self._reduce_func.captured_inputs,
-        self._finalize_func.captured_inputs,
-        key_func=self._key_func,
-        init_func=self._init_func,
-        reduce_func=self._reduce_func,
-        finalize_func=self._finalize_func,
+        self._key_func.function.captured_inputs,
+        self._init_func.function.captured_inputs,
+        self._reduce_func.function.captured_inputs,
+        self._finalize_func.function.captured_inputs,
+        key_func=self._key_func.function,
+        init_func=self._init_func.function,
+        reduce_func=self._reduce_func.function,
+        finalize_func=self._finalize_func.function,
         **dataset_ops.flat_structure(self))
 
   def _transformation_name(self):
@@ -426,76 +388,59 @@ class _GroupByWindowDataset(dataset_ops.UnaryDataset):
 
     def window_size_func_wrapper(key):
       return ops.convert_to_tensor(window_size_func(key), dtype=dtypes.int64)
-    wrapped_func = dataset_ops.StructuredFunctionWrapper(
+    self._window_size_func = dataset_ops.StructuredFunctionWrapper(
         window_size_func_wrapper,
         self._transformation_name(),
-        input_classes=ops.Tensor,
-        input_shapes=tensor_shape.scalar(),
-        input_types=dtypes.int64)
-    if not (
-        wrapped_func.output_types == dtypes.int64 and
-        wrapped_func.output_shapes.is_compatible_with(tensor_shape.scalar())):
+        input_structure=structure.TensorStructure(dtypes.int64, []))
+    if not self._window_size_func.output_structure.is_compatible_with(
+        structure.TensorStructure(dtypes.int64, [])):
       raise ValueError(
           "`window_size_func` must return a single tf.int64 scalar tensor.")
-    self._window_size_func = wrapped_func.function
 
   def _make_key_func(self, key_func, input_dataset):
     """Make wrapping defun for key_func."""
 
     def key_func_wrapper(*args):
       return ops.convert_to_tensor(key_func(*args), dtype=dtypes.int64)
-    wrapped_func = dataset_ops.StructuredFunctionWrapper(
+    self._key_func = dataset_ops.StructuredFunctionWrapper(
         key_func_wrapper, self._transformation_name(), dataset=input_dataset)
-    if not (
-        wrapped_func.output_types == dtypes.int64 and
-        wrapped_func.output_shapes.is_compatible_with(tensor_shape.scalar())):
+    if not self._key_func.output_structure.is_compatible_with(
+        structure.TensorStructure(dtypes.int64, [])):
       raise ValueError(
           "`key_func` must return a single tf.int64 scalar tensor.")
-    self._key_func = wrapped_func.function
 
   def _make_reduce_func(self, reduce_func, input_dataset):
     """Make wrapping defun for reduce_func."""
     nested_dataset = dataset_ops.DatasetStructure(
-        structure.Structure._from_legacy_structure(  # pylint: disable=protected-access
-            input_dataset.output_types, input_dataset.output_shapes,
-            input_dataset.output_classes))
-    wrapped_func = dataset_ops.StructuredFunctionWrapper(
-        reduce_func,
-        self._transformation_name(),
-        input_classes=(ops.Tensor, nested_dataset),
-        input_shapes=(tensor_shape.scalar(), nested_dataset),
-        input_types=(dtypes.int64, nested_dataset))
+        input_dataset._element_structure)  # pylint: disable=protected-access
+    input_structure = structure.NestedStructure(
+        (structure.TensorStructure(dtypes.int64, []), nested_dataset))
+    self._reduce_func = dataset_ops.StructuredFunctionWrapper(
+        reduce_func, self._transformation_name(),
+        input_structure=input_structure)
     if not isinstance(
-        wrapped_func.output_structure, dataset_ops.DatasetStructure):
+        self._reduce_func.output_structure, dataset_ops.DatasetStructure):
       raise TypeError("`reduce_func` must return a `Dataset` object.")
     # pylint: disable=protected-access
-    element_structure = wrapped_func.output_structure._element_structure
-    self._output_classes = element_structure._to_legacy_output_classes()
-    self._output_types = element_structure._to_legacy_output_types()
-    self._output_shapes = element_structure._to_legacy_output_shapes()
-    self._reduce_func = wrapped_func.function
+    self._structure = (
+        self._reduce_func.output_structure._element_structure)
 
   @property
-  def output_classes(self):
-    return self._output_classes
+  def _element_structure(self):
+    return self._structure
 
-  @property
-  def output_shapes(self):
-    return self._output_shapes
-
-  @property
-  def output_types(self):
-    return self._output_types
+  def _functions(self):
+    return [self._key_func, self._reduce_func, self._window_size_func]
 
   def _as_variant_tensor(self):
-    return gen_dataset_ops.group_by_window_dataset(
+    return ged_ops.experimental_group_by_window_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
-        self._key_func.captured_inputs,
-        self._reduce_func.captured_inputs,
-        self._window_size_func.captured_inputs,
-        key_func=self._key_func,
-        reduce_func=self._reduce_func,
-        window_size_func=self._window_size_func,
+        self._key_func.function.captured_inputs,
+        self._reduce_func.function.captured_inputs,
+        self._window_size_func.function.captured_inputs,
+        key_func=self._key_func.function,
+        reduce_func=self._reduce_func.function,
+        window_size_func=self._window_size_func.function,
         **dataset_ops.flat_structure(self))
 
   def _transformation_name(self):
@@ -528,42 +473,3 @@ class Reducer(object):
   @property
   def finalize_func(self):
     return self._finalize_func
-
-
-class _MapXDataset(dataset_ops.UnaryDataset):
-  """A `Dataset` that maps a function over elements in its input."""
-
-  def __init__(self, input_dataset, map_func):
-    """See `map_x_dataset()` for details."""
-    super(_MapXDataset, self).__init__(input_dataset)
-    self._input_dataset = input_dataset
-
-    wrapped_func = dataset_ops.StructuredFunctionWrapper(
-        map_func, self._transformation_name(), dataset=input_dataset)
-    self._output_classes = wrapped_func.output_classes
-    self._output_shapes = wrapped_func.output_shapes
-    self._output_types = wrapped_func.output_types
-    self._map_func = wrapped_func.function
-
-  def _as_variant_tensor(self):
-    input_t = self._input_dataset._as_variant_tensor()  # pylint: disable=protected-access
-    return gen_dataset_ops.map_dataset(
-        input_t,
-        self._map_func.captured_inputs,
-        f=self._map_func,
-        **dataset_ops.flat_structure(self))
-
-  @property
-  def output_classes(self):
-    return self._output_classes
-
-  @property
-  def output_shapes(self):
-    return self._output_shapes
-
-  @property
-  def output_types(self):
-    return self._output_types
-
-  def _transformation_name(self):
-    return "tf.data.experimental.map_x_dataset()"
diff --git a/tensorflow/python/data/experimental/ops/indexed_dataset_ops.py b/tensorflow/python/data/experimental/ops/indexed_dataset_ops.py
index 570f0116f7686327f147f96447e87e5ddf8a927c..fdf3692420b1943db0b4ff0de826e6203593e2c7 100644
--- a/tensorflow/python/data/experimental/ops/indexed_dataset_ops.py
+++ b/tensorflow/python/data/experimental/ops/indexed_dataset_ops.py
@@ -22,9 +22,9 @@ import abc
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
 from tensorflow.python.data.util import sparse
+from tensorflow.python.data.util import structure
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
 
 
@@ -94,11 +94,7 @@ class IndexedDataset(dataset_ops.Dataset):
         ged_ops.experimental_materialized_index_dataset_handle(
             container=container,
             shared_name=shared_name,
-            output_types=nest.flatten(
-                sparse.as_dense_types(self.output_types, self.output_classes)),
-            output_shapes=nest.flatten(
-                sparse.as_dense_types(self.output_shapes,
-                                      self.output_classes))))
+            **dataset_ops.flat_structure(self)))
 
     with ops.colocate_with(materialized_resource):
       materializer = ged_ops.experimental_indexed_dataset_materialize(
@@ -107,38 +103,6 @@ class IndexedDataset(dataset_ops.Dataset):
                                       self.output_classes, self.output_types,
                                       self.output_shapes)
 
-  @abc.abstractproperty
-  def output_types(self):
-    """Returns the type of each component of an element of this IndexedDataset.
-
-    Returns:
-      A nested structure of `tf.DType` objects corresponding to each component
-      of an element of this IndexedDataset.
-    """
-    raise NotImplementedError("IndexedDataset.output_types")
-
-  @abc.abstractproperty
-  def output_classes(self):
-    """Returns the class of each component of an element of this IndexedDataset.
-
-    The expected values are `tf.Tensor` and `tf.SparseTensor`.
-
-    Returns:
-      A nested structure of Python `type` objects corresponding to each
-      component of an element of this IndexedDataset.
-    """
-    raise NotImplementedError("IndexedDataset.output_classes")
-
-  @abc.abstractproperty
-  def output_shapes(self):
-    """Returns the shape of each component of an element of this IndexedDataset.
-
-    Returns:
-      A nested structure of `tf.TensorShape` objects corresponding to each
-      component of an element of this IndexedDataset.
-    """
-    raise NotImplementedError("IndexedDataset.output_shapes")
-
   @abc.abstractmethod
   def _as_variant_tensor(self):
     """Creates a `tf.variant` `tf.Tensor` representing this IndexedDataset.
@@ -161,16 +125,8 @@ class IdentityIndexedDataset(IndexedDataset):
     self._size = ops.convert_to_tensor(size, dtype=dtypes.uint64, name="size")
 
   @property
-  def output_types(self):
-    return dtypes.uint64
-
-  @property
-  def output_classes(self):
-    return ops.Tensor
-
-  @property
-  def output_shapes(self):
-    return tensor_shape.scalar()
+  def _element_structure(self):
+    return structure.TensorStructure(dtypes.uint64, [])
 
   def _as_variant_tensor(self):
     return ged_ops.experimental_identity_indexed_dataset(self._size)
diff --git a/tensorflow/python/data/experimental/ops/interleave_ops.py b/tensorflow/python/data/experimental/ops/interleave_ops.py
index 8b0fdfce11d26889770aa84403829b87c6528191..5a719f8ed8f0176f628a89eb1b3e535064d9a72e 100644
--- a/tensorflow/python/data/experimental/ops/interleave_ops.py
+++ b/tensorflow/python/data/experimental/ops/interleave_ops.py
@@ -21,6 +21,7 @@ from tensorflow.python.data.experimental.ops import random_ops
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import readers
 from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import structure
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
@@ -101,6 +102,18 @@ class _DirectedInterleaveDataset(dataset_ops.Dataset):
           data_input.output_classes != data_inputs[0].output_classes):
         raise TypeError("All datasets must have the same type and class.")
 
+    output_shapes = self._data_inputs[0].output_shapes
+    for data_input in self._data_inputs[1:]:
+      output_shapes = nest.pack_sequence_as(output_shapes, [
+          ts1.most_specific_compatible_shape(ts2) for (ts1, ts2) in zip(
+              nest.flatten(output_shapes),
+              nest.flatten(data_input.output_shapes))
+      ])
+
+    self._structure = structure.convert_legacy_structure(
+        data_inputs[0].output_types, output_shapes,
+        data_inputs[0].output_classes)
+
   def _as_variant_tensor(self):
     # pylint: disable=protected-access
     return (
@@ -115,22 +128,8 @@ class _DirectedInterleaveDataset(dataset_ops.Dataset):
     return [self._selector_input] + self._data_inputs
 
   @property
-  def output_classes(self):
-    return self._data_inputs[0].output_classes
-
-  @property
-  def output_shapes(self):
-    ret = self._data_inputs[0].output_shapes
-    for data_input in self._data_inputs[1:]:
-      ret = nest.pack_sequence_as(ret, [
-          ts1.most_specific_compatible_shape(ts2) for (ts1, ts2) in zip(
-              nest.flatten(ret), nest.flatten(data_input.output_shapes))
-      ])
-    return ret
-
-  @property
-  def output_types(self):
-    return self._data_inputs[0].output_types
+  def _element_structure(self):
+    return self._structure
 
 
 @tf_export("data.experimental.sample_from_datasets", v1=[])
diff --git a/tensorflow/python/data/experimental/ops/matching_files.py b/tensorflow/python/data/experimental/ops/matching_files.py
index 8398f86e31c92635366c599010c862b620e462b7..63b99cb1e4533d165902893918d5aea2c6f02613 100644
--- a/tensorflow/python/data/experimental/ops/matching_files.py
+++ b/tensorflow/python/data/experimental/ops/matching_files.py
@@ -19,9 +19,9 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import structure
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
 
 
@@ -37,15 +37,5 @@ class MatchingFilesDataset(dataset_ops.DatasetSource):
     return ged_ops.experimental_matching_files_dataset(self._patterns)
 
   @property
-  def output_classes(self):
-    return ops.Tensor
-
-  @property
-  def output_shapes(self):
-    return tensor_shape.scalar()
-
-  @property
-  def output_types(self):
-    return dtypes.string
-
-
+  def _element_structure(self):
+    return structure.TensorStructure(dtypes.string, [])
diff --git a/tensorflow/python/data/experimental/ops/optimization_options.py b/tensorflow/python/data/experimental/ops/optimization_options.py
index dc9d3193748deb8957bdd9a5d0b25f226e6f1955..41a819d94bb88384c89cbc9b3eb0d4dc59575e0e 100644
--- a/tensorflow/python/data/experimental/ops/optimization_options.py
+++ b/tensorflow/python/data/experimental/ops/optimization_options.py
@@ -26,58 +26,106 @@ from tensorflow.python.util.tf_export import tf_export
 class OptimizationOptions(options.OptionsBase):
   """Represents options for dataset optimizations.
 
-  You can apply `OptimizationOptions` to a `dataset` object, as follows:
+  You can set the optimization options of a dataset through the
+  `experimental_optimization` property of `tf.data.Options`; the property is
+  an instance of `tf.data.experimental.OptimizationOptions`.
 
   ```python
   options = tf.data.Options()
-  options.optimization = tf.data.experimental.OptimizationOptions()
-  options.optimization.map_and_batch_fusion = True
+  options.experimental_optimization.map_vectorization = True
+  options.apply_default_optimizations = False
   dataset = dataset.with_options(options)
   ```
   """
+  apply_default_optimizations = options.create_option(
+      name="apply_default_optimizations",
+      ty=bool,
+      docstring=
+      "Whether to apply default static optimizations. If False, only static "
+      "optimizations that have been explicitly enabled will be applied.")
 
   filter_fusion = options.create_option(
       name="filter_fusion",
       ty=bool,
-      docstring="Whether to fuse filter transformations.")
+      docstring=
+      "Whether to fuse filter transformations. If None, defaults to False.")
 
   hoist_random_uniform = options.create_option(
       name="hoist_random_uniform",
       ty=bool,
       docstring=
-      "Whether to hoist `tf.random_uniform()` ops out of map transformations.")
+      "Whether to hoist `tf.random_uniform()` ops out of map transformations. "
+      "If None, defaults to False.")
 
   map_and_batch_fusion = options.create_option(
       name="map_and_batch_fusion",
       ty=bool,
-      docstring="Whether to fuse map and batch transformations.")
+      docstring=
+      "Whether to fuse map and batch transformations. If None, defaults to "
+      "True.")
 
   map_and_filter_fusion = options.create_option(
       name="map_and_filter_fusion",
       ty=bool,
-      docstring="Whether to fuse map and filter transformations.")
+      docstring=
+      "Whether to fuse map and filter transformations. If None, defaults to "
+      "False.")
 
   map_fusion = options.create_option(
-      name="map_and_filter_fusion",
+      name="map_fusion",
       ty=bool,
-      docstring="Whether to fuse map transformations.")
+      docstring="Whether to fuse map transformations. If None, defaults to "
+      "False.")
 
   map_parallelization = options.create_option(
       name="map_parallelization",
       ty=bool,
-      docstring="Whether to parallelize stateless map transformations.")
+      docstring=
+      "Whether to parallelize stateless map transformations. If None, defaults "
+      "to False.")
 
   map_vectorization = options.create_option(
       name="map_vectorization",
       ty=bool,
-      docstring="Whether to vectorize map transformations.")
+      docstring=
+      "Whether to vectorize map transformations. If None, defaults to False.")
 
   noop_elimination = options.create_option(
       name="noop_elimination",
       ty=bool,
-      docstring="Whether to eliminate no-op transformations.")
+      docstring=
+      "Whether to eliminate no-op transformations. If None, defaults to True.")
 
   shuffle_and_repeat_fusion = options.create_option(
       name="shuffle_and_repeat_fusion",
       ty=bool,
-      docstring="Whether to fuse shuffle and repeat transformations.")
+      docstring="Whether to fuse shuffle and repeat transformations. If None, "
+      "defaults to True.")
+
+  def _static_optimizations(self):
+    """Produces the list of enabled static optimizations."""
+    result = []
+    optimizations_to_enable = [
+        "filter_fusion",
+        "hoist_random_uniform",
+        "map_and_filter_fusion",
+        "map_fusion",
+        "map_parallelization",
+        "map_vectorization",
+    ]
+    for optimization in optimizations_to_enable:
+      if getattr(self, optimization):
+        result.append(optimization)
+
+    if self.apply_default_optimizations is not False:
+      # The following optimizations are turned on by default, unless the
+      # user explicitly disables them.
+      optimizations_to_disable = [
+          "map_and_batch_fusion",
+          "noop_elimination",
+          "shuffle_and_repeat_fusion",
+      ]
+      for optimization in optimizations_to_disable:
+        if getattr(self, optimization) is not False:
+          result.append(optimization)
+    return result
diff --git a/tensorflow/python/data/experimental/ops/parsing_ops.py b/tensorflow/python/data/experimental/ops/parsing_ops.py
index a63eb8c516e7172d573f9abd6b94dd4a2edd2753..deb20d61888adeeff078997fc8adfede604de8eb 100644
--- a/tensorflow/python/data/experimental/ops/parsing_ops.py
+++ b/tensorflow/python/data/experimental/ops/parsing_ops.py
@@ -18,11 +18,11 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import structure
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import gen_experimental_dataset_ops
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.util.tf_export import tf_export
 
@@ -33,8 +33,8 @@ class _ParseExampleDataset(dataset_ops.UnaryDataset):
   def __init__(self, input_dataset, features, num_parallel_calls):
     super(_ParseExampleDataset, self).__init__(input_dataset)
     self._input_dataset = input_dataset
-    if not all(types == dtypes.string
-               for types in nest.flatten(input_dataset.output_types)):
+    if not input_dataset._element_structure.is_compatible_with(  # pylint: disable=protected-access
+        structure.TensorStructure(dtypes.string, [None])):
       raise TypeError("Input dataset should be a dataset of vectors of strings")
     self._num_parallel_calls = num_parallel_calls
     # pylint: disable=protected-access
@@ -67,20 +67,22 @@ class _ParseExampleDataset(dataset_ops.UnaryDataset):
         for _ in range(len(sparse_keys))
     ]
 
-    self._output_shapes = dict(
+    output_shapes = dict(
         zip(self._dense_keys + self._sparse_keys,
             dense_output_shapes + sparse_output_shapes))
-    self._output_types = dict(
+    output_types = dict(
         zip(self._dense_keys + self._sparse_keys,
             self._dense_types + self._sparse_types))
-    self._output_classes = dict(
+    output_classes = dict(
         zip(self._dense_keys + self._sparse_keys,
             [ops.Tensor for _ in range(len(self._dense_defaults))] +
             [sparse_tensor.SparseTensor for _ in range(len(self._sparse_keys))
             ]))
+    self._structure = structure.convert_legacy_structure(
+        output_types, output_shapes, output_classes)
 
   def _as_variant_tensor(self):
-    return gen_dataset_ops.parse_example_dataset(
+    return gen_experimental_dataset_ops.experimental_parse_example_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         self._num_parallel_calls,
         self._dense_defaults,
@@ -91,16 +93,8 @@ class _ParseExampleDataset(dataset_ops.UnaryDataset):
         **dataset_ops.flat_structure(self))
 
   @property
-  def output_shapes(self):
-    return self._output_shapes
-
-  @property
-  def output_types(self):
-    return self._output_types
-
-  @property
-  def output_classes(self):
-    return self._output_classes
+  def _element_structure(self):
+    return self._structure
 
 
 # TODO(b/111553342): add arguments names and example names as well.
diff --git a/tensorflow/python/data/experimental/ops/prefetching_ops.py b/tensorflow/python/data/experimental/ops/prefetching_ops.py
index 9e3ca569292947d3e0f40186af708a664694ca97..e3a8622393309e796cdfc3da3f238c4430cc9237 100644
--- a/tensorflow/python/data/experimental/ops/prefetching_ops.py
+++ b/tensorflow/python/data/experimental/ops/prefetching_ops.py
@@ -19,8 +19,6 @@ from __future__ import print_function
 
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
-from tensorflow.python.data.util import nest
-from tensorflow.python.data.util import sparse
 from tensorflow.python.eager import function
 from tensorflow.python.framework import device as framework_device
 from tensorflow.python.framework import dtypes
@@ -51,8 +49,8 @@ def prefetch_to_device(device, buffer_size=None):
     `tf.data.Dataset.apply`.
   """
   def _apply_fn(dataset):
-    return _CopyToDeviceDataset(
-        dataset, target_device=device).prefetch(buffer_size)
+    return dataset.apply(
+        copy_to_device(target_device=device)).prefetch(buffer_size)
 
   return _apply_fn
 
@@ -71,8 +69,12 @@ def copy_to_device(target_device, source_device="/cpu:0"):
   """
 
   def _apply_fn(dataset):
+    options = dataset_ops.Options()
+    options.experimental_autotune = False
+    options.experimental_optimization.apply_default_optimizations = False
     return _CopyToDeviceDataset(
-        dataset, target_device=target_device, source_device=source_device)
+        dataset, target_device=target_device,
+        source_device=source_device).with_options(options)
 
   return _apply_fn
 
@@ -99,13 +101,6 @@ class _CopyToDeviceDataset(dataset_ops.UnaryUnchangedStructureDataset):
     self._source_device_string = source_device
     self._source_device = ops.convert_to_tensor(source_device)
 
-    self._flat_output_shapes = nest.flatten(
-        sparse.as_dense_shapes(self._input_dataset.output_shapes,
-                               self._input_dataset.output_classes))
-    self._flat_output_types = nest.flatten(
-        sparse.as_dense_types(self._input_dataset.output_types,
-                              self._input_dataset.output_classes))
-
     @function.defun()
     def _init_func():
       """Creates an iterator for the input dataset.
@@ -116,8 +111,7 @@ class _CopyToDeviceDataset(dataset_ops.UnaryUnchangedStructureDataset):
       # pylint: disable=protected-access
       ds_variant = self._input_dataset._as_variant_tensor()
       resource = gen_dataset_ops.anonymous_iterator(
-          output_types=self._flat_output_types,
-          output_shapes=self._flat_output_shapes)
+          **dataset_ops.flat_structure(self._input_dataset))
       with ops.control_dependencies(
           [gen_dataset_ops.make_iterator(ds_variant, resource)]):
         return gen_dataset_ops.iterator_to_string_handle(resource)
@@ -148,8 +142,7 @@ class _CopyToDeviceDataset(dataset_ops.UnaryUnchangedStructureDataset):
         iterator = iterator_ops.Iterator.from_string_handle(
             string_handle, self.output_types, self.output_shapes,
             self.output_classes)
-      ret = iterator.get_next()
-      return nest.flatten(sparse.serialize_sparse_tensors(ret))
+      return self._element_structure._to_tensor_list(iterator.get_next())  # pylint: disable=protected-access
 
     next_func_concrete = _next_func._get_concrete_function_internal()  # pylint: disable=protected-access
 
@@ -159,7 +152,7 @@ class _CopyToDeviceDataset(dataset_ops.UnaryUnchangedStructureDataset):
           target=self._source_device,
           args=[string_handle] +
           next_func_concrete.captured_inputs,
-          Tout=self._flat_output_types,
+          Tout=self._input_dataset._element_structure._flat_types,  # pylint: disable=protected-access
           f=next_func_concrete)
 
     self._next_func = _remote_next_func._get_concrete_function_internal()  # pylint: disable=protected-access
@@ -176,8 +169,7 @@ class _CopyToDeviceDataset(dataset_ops.UnaryUnchangedStructureDataset):
       """
       iterator_resource = gen_dataset_ops.iterator_from_string_handle_v2(
           string_handle,
-          output_types=self._flat_output_types,
-          output_shapes=self._flat_output_shapes)
+          **dataset_ops.flat_structure(self._input_dataset))
       with ops.control_dependencies([
           resource_variable_ops.destroy_resource_op(
               iterator_resource, ignore_lookup_error=True)]):
@@ -189,8 +181,7 @@ class _CopyToDeviceDataset(dataset_ops.UnaryUnchangedStructureDataset):
     def _remote_finalize_func(string_handle):
       return functional_ops.remote_call(
           target=self._source_device,
-          args=[string_handle] +
-          finalize_func_concrete.captured_inputs,
+          args=[string_handle] + finalize_func_concrete.captured_inputs,
           Tout=[dtypes.int64],
           f=finalize_func_concrete)
 
@@ -226,8 +217,7 @@ class _CopyToDeviceDataset(dataset_ops.UnaryUnchangedStructureDataset):
           init_func=self._init_func,
           next_func=self._next_func,
           finalize_func=self._finalize_func,
-          output_types=self._flat_output_types,
-          output_shapes=self._flat_output_shapes)
+          **dataset_ops.flat_structure(self._input_dataset))
 
 
 class _MapOnGpuDataset(dataset_ops.UnaryDataset):
@@ -239,36 +229,27 @@ class _MapOnGpuDataset(dataset_ops.UnaryDataset):
     self._input_dataset = input_dataset
     self._use_inter_op_parallelism = use_inter_op_parallelism
 
-    wrapped_func = dataset_ops.StructuredFunctionWrapper(
+    self._map_func = dataset_ops.StructuredFunctionWrapper(
         map_func,
         self._transformation_name(),
         dataset=input_dataset,
         defun_kwargs={"experimental_ints_on_device": True})
-    self._output_classes = wrapped_func.output_classes
-    self._output_shapes = wrapped_func.output_shapes
-    self._output_types = wrapped_func.output_types
-    self._map_func = wrapped_func.function
+
+  def _functions(self):
+    return [self._map_func]
 
   def _as_variant_tensor(self):
     input_t = self._input_dataset._as_variant_tensor()  # pylint: disable=protected-access
     return ged_ops.experimental_map_dataset(
         input_t,
-        self._map_func.captured_inputs,
-        f=self._map_func,
+        self._map_func.function.captured_inputs,
+        f=self._map_func.function,
         use_inter_op_parallelism=self._use_inter_op_parallelism,
         **dataset_ops.flat_structure(self))
 
   @property
-  def output_classes(self):
-    return self._output_classes
-
-  @property
-  def output_shapes(self):
-    return self._output_shapes
-
-  @property
-  def output_types(self):
-    return self._output_types
+  def _element_structure(self):
+    return self._map_func.output_structure
 
   def _transformation_name(self):
     return "map_on_gpu()"
diff --git a/tensorflow/python/data/experimental/ops/random_ops.py b/tensorflow/python/data/experimental/ops/random_ops.py
index 7bf703502be2a9ca7853873b909b4692f89f3476..cbdf367db6bd5b4ce27e636c08a19cd4fedda041 100644
--- a/tensorflow/python/data/experimental/ops/random_ops.py
+++ b/tensorflow/python/data/experimental/ops/random_ops.py
@@ -21,10 +21,9 @@ import functools
 
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import random_seed
+from tensorflow.python.data.util import structure
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import gen_experimental_dataset_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -38,22 +37,14 @@ class RandomDatasetV2(dataset_ops.DatasetSource):
     self._seed, self._seed2 = random_seed.get_seed(seed)
 
   def _as_variant_tensor(self):
-    return gen_dataset_ops.random_dataset(
+    return gen_experimental_dataset_ops.experimental_random_dataset(
         seed=self._seed,
         seed2=self._seed2,
         **dataset_ops.flat_structure(self))
 
   @property
-  def output_classes(self):
-    return ops.Tensor
-
-  @property
-  def output_shapes(self):
-    return tensor_shape.scalar()
-
-  @property
-  def output_types(self):
-    return dtypes.int64
+  def _element_structure(self):
+    return structure.TensorStructure(dtypes.int64, [])
 
 
 @tf_export(v1=["data.experimental.RandomDataset"])
diff --git a/tensorflow/python/data/experimental/ops/readers.py b/tensorflow/python/data/experimental/ops/readers.py
index 1ba26ed5b9f491d51fde753fd2bb64fb0e992b48..c2d82aeb59174fb9d35c4cc2c3d850fb351d8a90 100644
--- a/tensorflow/python/data/experimental/ops/readers.py
+++ b/tensorflow/python/data/experimental/ops/readers.py
@@ -32,12 +32,11 @@ from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import readers as core_readers
 from tensorflow.python.data.util import convert
 from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import structure
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
 from tensorflow.python.lib.io import file_io
-from tensorflow.python.ops import gen_dataset_ops
 from tensorflow.python.ops import gen_experimental_dataset_ops
 from tensorflow.python.ops import io_ops
 from tensorflow.python.platform import gfile
@@ -573,7 +572,9 @@ class CsvDatasetV2(dataset_ops.DatasetSource):
 
     We can construct a CsvDataset from it as follows:
     ```python
-    dataset = tf.data.experimental.CsvDataset(
+    tf.enable_eager_execution()
+
+     dataset = tf.data.experimental.CsvDataset(
         "my_file*.csv",
         [tf.float32,  # Required field, use dtype or empty tensor
          tf.constant([0.0], dtype=tf.float32),  # Optional field, default to 0.0
@@ -585,13 +586,8 @@ class CsvDatasetV2(dataset_ops.DatasetSource):
 
     The expected output of its iterations is:
     ```python
-    next_element = dataset.make_one_shot_iterator().get_next()
-    with tf.Session() as sess:
-      while True:
-        try:
-          print(sess.run(next_element))
-        except tf.errors.OutOfRangeError:
-          break
+    for element in dataset:
+      print(element)
 
     >> (4.28e10, 5.55e6, 12)
     >> (-5.3e14, 0.0, 2)
@@ -656,11 +652,9 @@ class CsvDatasetV2(dataset_ops.DatasetSource):
         argument_default=[],
         argument_dtype=dtypes.int64,
     )
-    self._output_shapes = tuple(
-        tensor_shape.scalar() for _ in range(len(record_defaults)))
-    self._output_types = tuple(d.dtype for d in self._record_defaults)
-    self._output_classes = tuple(
-        ops.Tensor for _ in range(len(record_defaults)))
+    self._structure = structure.NestedStructure(
+        tuple(structure.TensorStructure(d.dtype, [])
+              for d in self._record_defaults))
 
   def _as_variant_tensor(self):
     # Constructs graph node for the dataset op.
@@ -669,7 +663,7 @@ class CsvDatasetV2(dataset_ops.DatasetSource):
         record_defaults=self._record_defaults,
         buffer_size=self._buffer_size,
         header=self._header,
-        output_shapes=self._output_shapes,
+        output_shapes=self._structure._flat_shapes,  # pylint: disable=protected-access
         field_delim=self._field_delim,
         use_quote_delim=self._use_quote_delim,
         na_value=self._na_value,
@@ -678,16 +672,8 @@ class CsvDatasetV2(dataset_ops.DatasetSource):
     )
 
   @property
-  def output_types(self):
-    return self._output_types
-
-  @property
-  def output_shapes(self):
-    return self._output_shapes
-
-  @property
-  def output_classes(self):
-    return self._output_classes
+  def _element_structure(self):
+    return self._structure
 
 
 @tf_export(v1=["data.experimental.CsvDataset"])
@@ -939,17 +925,14 @@ class SqlDatasetV2(dataset_ops.DatasetSource):
     For example:
 
     ```python
+    tf.enable_eager_execution()
+
     dataset = tf.data.experimental.SqlDataset("sqlite", "/foo/bar.sqlite3",
                                               "SELECT name, age FROM people",
                                               (tf.string, tf.int32))
-    iterator = dataset.make_one_shot_iterator()
-    next_element = iterator.get_next()
     # Prints the rows of the result set of the above query.
-    while True:
-      try:
-        print(sess.run(next_element))
-      except tf.errors.OutOfRangeError:
-        break
+    for element in dataset:
+      print(element)
     ```
 
     Args:
@@ -968,26 +951,18 @@ class SqlDatasetV2(dataset_ops.DatasetSource):
         data_source_name, dtype=dtypes.string, name="data_source_name")
     self._query = ops.convert_to_tensor(
         query, dtype=dtypes.string, name="query")
-    self._output_types = output_types
+    self._structure = structure.NestedStructure(
+        nest.map_structure(
+            lambda dtype: structure.TensorStructure(dtype, []), output_types))
 
   def _as_variant_tensor(self):
-    return gen_dataset_ops.sql_dataset(self._driver_name,
-                                       self._data_source_name, self._query,
-                                       nest.flatten(self.output_types),
-                                       nest.flatten(self.output_shapes))
-
-  @property
-  def output_classes(self):
-    return nest.map_structure(lambda _: ops.Tensor, self._output_types)
-
-  @property
-  def output_shapes(self):
-    return nest.map_structure(lambda _: tensor_shape.TensorShape([]),
-                              self._output_types)
+    return gen_experimental_dataset_ops.experimental_sql_dataset(
+        self._driver_name, self._data_source_name, self._query,
+        nest.flatten(self.output_types), nest.flatten(self.output_shapes))
 
   @property
-  def output_types(self):
-    return self._output_types
+  def _element_structure(self):
+    return self._structure
 
 
 @tf_export(v1=["data.experimental.SqlDataset"])
diff --git a/tensorflow/python/data/experimental/ops/scan_ops.py b/tensorflow/python/data/experimental/ops/scan_ops.py
index 1194238e2f987f9acc9028955f670df9e0efb4ad..5c77ad734348401ed666c562b36ef52ec8c5525b 100644
--- a/tensorflow/python/data/experimental/ops/scan_ops.py
+++ b/tensorflow/python/data/experimental/ops/scan_ops.py
@@ -21,10 +21,10 @@ import collections
 
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
-from tensorflow.python.data.util import sparse
+from tensorflow.python.data.util import structure
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import gen_experimental_dataset_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -49,18 +49,7 @@ class _ScanDataset(dataset_ops.UnaryDataset):
     # Compute initial values for the state classes, shapes and types based on
     # the initial state. The shapes may be refined by running `tf_scan_func` one
     # or more times below.
-    self._state_classes = sparse.get_classes(self._initial_state)
-    self._state_shapes = nest.pack_sequence_as(
-        self._initial_state,
-        [t.get_shape() for t in nest.flatten(self._initial_state)])
-    self._state_types = nest.pack_sequence_as(
-        self._initial_state,
-        [t.dtype for t in nest.flatten(self._initial_state)])
-
-    # Will be populated by calling `tf_scan_func`.
-    self._output_classes = None
-    self._output_shapes = None
-    self._output_types = None
+    self._state_structure = structure.Structure.from_value(self._initial_state)
 
     # Iteratively rerun the scan function until reaching a fixed point on
     # `self._state_shapes`.
@@ -70,9 +59,8 @@ class _ScanDataset(dataset_ops.UnaryDataset):
       wrapped_func = dataset_ops.StructuredFunctionWrapper(
           scan_func,
           self._transformation_name(),
-          input_classes=(self._state_classes, input_dataset.output_classes),
-          input_shapes=(self._state_shapes, input_dataset.output_shapes),
-          input_types=(self._state_types, input_dataset.output_types),
+          input_structure=structure.NestedStructure(
+              (self._state_structure, input_dataset._element_structure)),  # pylint: disable=protected-access
           add_to_graph=False)
       if not (
           isinstance(wrapped_func.output_types, collections.Sequence) and
@@ -83,29 +71,35 @@ class _ScanDataset(dataset_ops.UnaryDataset):
       new_state_classes, self._output_classes = wrapped_func.output_classes
 
       # Extract and validate class information from the returned values.
-      for new_state_class, state_class in zip(
+      new_state_classes, output_classes = wrapped_func.output_classes
+      old_state_classes = self._state_structure._to_legacy_output_classes()  # pylint: disable=protected-access
+      for new_state_class, old_state_class in zip(
           nest.flatten(new_state_classes),
-          nest.flatten(self._state_classes)):
-        if not issubclass(new_state_class, state_class):
+          nest.flatten(old_state_classes)):
+        if not issubclass(new_state_class, old_state_class):
           raise TypeError(
               "The element classes for the new state must match the initial "
               "state. Expected %s; got %s." %
-              (self._state_classes, new_state_classes))
+              (old_state_classes, new_state_classes))
 
       # Extract and validate type information from the returned values.
-      new_state_types, self._output_types = wrapped_func.output_types
-      for new_state_type, state_type in zip(
-          nest.flatten(new_state_types), nest.flatten(self._state_types)):
-        if new_state_type != state_type:
+      new_state_types, output_types = wrapped_func.output_types
+      old_state_types = self._state_structure._to_legacy_output_types()  # pylint: disable=protected-access
+      for new_state_type, old_state_type in zip(
+          nest.flatten(new_state_types), nest.flatten(old_state_types)):
+        if new_state_type != old_state_type:
           raise TypeError(
               "The element types for the new state must match the initial "
               "state. Expected %s; got %s." %
-              (self._state_types, new_state_types))
+              (old_state_types, new_state_types))
 
       # Extract shape information from the returned values.
-      new_state_shapes, self._output_shapes = wrapped_func.output_shapes
+      new_state_shapes, output_shapes = wrapped_func.output_shapes
+      old_state_shapes = self._state_structure._to_legacy_output_shapes()  # pylint: disable=protected-access
+      self._structure = structure.convert_legacy_structure(
+          output_types, output_shapes, output_classes)
 
-      flat_state_shapes = nest.flatten(self._state_shapes)
+      flat_state_shapes = nest.flatten(old_state_shapes)
       flat_new_state_shapes = nest.flatten(new_state_shapes)
       weakened_state_shapes = [
           original.most_specific_compatible_shape(new)
@@ -122,32 +116,34 @@ class _ScanDataset(dataset_ops.UnaryDataset):
           break
 
       if need_to_rerun:
-        self._state_shapes = nest.pack_sequence_as(self._state_shapes,
-                                                   weakened_state_shapes)
+        # TODO(b/110122868): Support a "most specific compatible structure"
+        # method for combining structures, to avoid using legacy structures
+        # in this method.
+        self._state_structure = structure.convert_legacy_structure(
+            old_state_types,
+            nest.pack_sequence_as(old_state_shapes, weakened_state_shapes),
+            old_state_classes)
 
-    self._scan_func = wrapped_func.function
-    self._scan_func.add_to_graph(ops.get_default_graph())
+    self._scan_func = wrapped_func
+    self._scan_func.function.add_to_graph(ops.get_default_graph())
+
+  def _functions(self):
+    return [self._scan_func]
 
   def _as_variant_tensor(self):
-    input_t = self._input_dataset._as_variant_tensor()  # pylint: disable=protected-access
-    return gen_dataset_ops.scan_dataset(
+    # pylint: disable=protected-access
+    input_t = self._input_dataset._as_variant_tensor()
+    return gen_experimental_dataset_ops.experimental_scan_dataset(
         input_t,
-        nest.flatten(sparse.serialize_sparse_tensors(self._initial_state)),
-        self._scan_func.captured_inputs,
-        f=self._scan_func,
+        self._state_structure._to_tensor_list(self._initial_state),
+        self._scan_func.function.captured_inputs,
+        f=self._scan_func.function,
+        preserve_cardinality=True,
         **dataset_ops.flat_structure(self))
 
   @property
-  def output_classes(self):
-    return self._output_classes
-
-  @property
-  def output_shapes(self):
-    return self._output_shapes
-
-  @property
-  def output_types(self):
-    return self._output_types
+  def _element_structure(self):
+    return self._structure
 
   def _transformation_name(self):
     return "tf.data.experimental.scan()"
diff --git a/tensorflow/python/data/experimental/ops/sleep.py b/tensorflow/python/data/experimental/ops/sleep.py
index 5e9d021ada9ce4bd068f8d899d570683e7e5d80b..2da832395b2e665168c1cd9cd7f52fb13e50c830 100644
--- a/tensorflow/python/data/experimental/ops/sleep.py
+++ b/tensorflow/python/data/experimental/ops/sleep.py
@@ -35,18 +35,6 @@ class _SleepDataset(dataset_ops.UnaryUnchangedStructureDataset):
         self._sleep_microseconds,
         **dataset_ops.flat_structure(self))
 
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
-
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
 
 def sleep(sleep_microseconds):
   """Sleeps for `sleep_microseconds` before producing each input element.
diff --git a/tensorflow/python/data/experimental/ops/stats_aggregator.py b/tensorflow/python/data/experimental/ops/stats_aggregator.py
index 5274c816a49bf70bf25b18cf7d981b90e100ba10..d5fcc033ab7df34369e0680275df744c431ed069 100644
--- a/tensorflow/python/data/experimental/ops/stats_aggregator.py
+++ b/tensorflow/python/data/experimental/ops/stats_aggregator.py
@@ -17,7 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -47,7 +47,6 @@ class StatsAggregator(object):
   options = dataset_ops.Options()
   options.experimental_stats = tf.data.experimental.StatsOptions(aggregator)
   dataset = dataset.with_options(options)
-  iterator = dataset.make_one_shot_iterator()
   ```
 
   To get a protocol buffer summary of the currently aggregated statistics,
@@ -69,7 +68,7 @@ class StatsAggregator(object):
 
   def __init__(self):
     """Creates a `StatsAggregator`."""
-    self._resource = gen_dataset_ops.stats_aggregator_handle()
+    self._resource = ged_ops.experimental_stats_aggregator_handle()
 
   # TODO(b/116314787): Update this/add support for V2 summary API.
   def get_summary(self):
@@ -81,4 +80,4 @@ class StatsAggregator(object):
     Returns:
       A scalar string `tf.Tensor` that summarizes the aggregated statistics.
     """
-    return gen_dataset_ops.stats_aggregator_summary(self._resource)
+    return ged_ops.experimental_stats_aggregator_summary(self._resource)
diff --git a/tensorflow/python/data/experimental/ops/stats_ops.py b/tensorflow/python/data/experimental/ops/stats_ops.py
index 95689433bd076c8afd8f027a5b867575dcb68daa..15a9d24546e950543cc3274dbead26178620b5ed 100644
--- a/tensorflow/python/data/experimental/ops/stats_ops.py
+++ b/tensorflow/python/data/experimental/ops/stats_ops.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import gen_experimental_dataset_ops
 from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
@@ -66,8 +66,10 @@ def bytes_produced_stats(tag):
   """
 
   def _apply_fn(dataset):
-    return _StatsDataset(dataset, gen_dataset_ops.bytes_produced_stats_dataset,
-                         tag)
+    return _StatsDataset(
+        dataset,
+        gen_experimental_dataset_ops.experimental_bytes_produced_stats_dataset,
+        tag)
 
   return _apply_fn
 
@@ -89,7 +91,9 @@ def latency_stats(tag):
   """
 
   def _apply_fn(dataset):
-    return _StatsDataset(dataset, gen_dataset_ops.latency_stats_dataset, tag)
+    return _StatsDataset(
+        dataset,
+        gen_experimental_dataset_ops.experimental_latency_stats_dataset, tag)
 
   return _apply_fn
 
diff --git a/tensorflow/python/data/experimental/ops/stats_options.py b/tensorflow/python/data/experimental/ops/stats_options.py
index 6df608c6080d51da7b9a11d0eae7ae0b1d222565..c4c4b1cea0354ed35f60f56f3fdf73f9664d88b2 100644
--- a/tensorflow/python/data/experimental/ops/stats_options.py
+++ b/tensorflow/python/data/experimental/ops/stats_options.py
@@ -28,28 +28,18 @@ from tensorflow.python.util.tf_export import tf_export
 class StatsOptions(options.OptionsBase):
   """Represents options for collecting dataset stats using `StatsAggregator`.
 
-  To apply `StatsOptions` with a `tf.data.Dataset` object, use the following
-  pattern:
+  You can set the stats options of a dataset through the `experimental_stats`
+  property of `tf.data.Options`; the property is an instance of
+  `tf.data.experimental.StatsOptions`. For example, to collect latency stats
+  on all dataset edges, use the following pattern:
 
   ```python
-  aggretator = tf.data.experimental.StatsAggregator()
+  aggregator = tf.data.experimental.StatsAggregator()
 
   options = tf.data.Options()
-  options.experimental_stats = tf.data.experimental.StatsOptions()
   options.experimental_stats.aggregator = aggregator
+  options.experimental_stats.latency_all_edges = True
   dataset = dataset.with_options(options)
-
-  iterator = dataset.make_one_shot_iterator()
-  ```
-
-  Note: a `StatsAggregator` object can be attached either duing construction or
-  can be provided later like in above example.
-
-  ```python
-  aggretator = tf.data.experimental.StatsAggregator()
-  # attach aggregator during construction
-  options.experimental_stats = tf.data.experimental.StatsOptions(aggregator)
-  .....
   ```
   """
 
@@ -64,18 +54,16 @@ class StatsOptions(options.OptionsBase):
       ty=str,
       docstring=
       "Prefix to prepend all statistics recorded for the input `dataset` with.",
-      default="")
+      default_factory=lambda: "")
 
   counter_prefix = options.create_option(
       name="counter_prefix",
       ty=str,
-      docstring=
-      "Prefix for the statistics recorded as counter.",
-      default="")
+      docstring="Prefix for the statistics recorded as counter.",
+      default_factory=lambda: "")
 
   latency_all_edges = options.create_option(
       name="latency_all_edges",
       ty=bool,
       docstring=
-      "Whether to add latency measurements on all edges.",
-      default=True)
+      "Whether to add latency measurements on all edges. Defaults to False.")
diff --git a/tensorflow/python/data/experimental/ops/threading_options.py b/tensorflow/python/data/experimental/ops/threading_options.py
index dbf662186f818a24a3b19ea678f87351ab45ed6e..d713b9ae0753d0c800a7212eccf99684218c193d 100644
--- a/tensorflow/python/data/experimental/ops/threading_options.py
+++ b/tensorflow/python/data/experimental/ops/threading_options.py
@@ -26,11 +26,12 @@ from tensorflow.python.util.tf_export import tf_export
 class ThreadingOptions(options.OptionsBase):
   """Represents options for dataset threading.
 
-  To apply `ThreadingOptions` to a `dataset` object, use the following pattern:
+  You can set the threading options of a dataset through the
+  `experimental_threading` property of `tf.data.Options`; the property is
+  an instance of `tf.data.experimental.ThreadingOptions`.
 
   ```python
   options = tf.data.Options()
-  options.experimental_threading = tf.data.experimental.ThreadingOptions()
   options.experimental_threading.private_threadpool_size = 10
   dataset = dataset.with_options(options)
   ```
@@ -46,5 +47,4 @@ class ThreadingOptions(options.OptionsBase):
       name="private_threadpool_size",
       ty=int,
       docstring=
-      "If set, the dataset will use a private threadpool of the given size.",
-      default=None)
+      "If set, the dataset will use a private threadpool of the given size.")
diff --git a/tensorflow/python/data/experimental/ops/writers.py b/tensorflow/python/data/experimental/ops/writers.py
index cc0a80336c515e629874a5987219e05f0a8918b0..aef6da51409dbe13f59408b650fc5947f088d89d 100644
--- a/tensorflow/python/data/experimental/ops/writers.py
+++ b/tensorflow/python/data/experimental/ops/writers.py
@@ -22,7 +22,7 @@ from tensorflow.python.data.util import convert
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import gen_experimental_dataset_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -56,5 +56,5 @@ class TFRecordWriter(object):
           "`dataset` must produce scalar `DT_STRING` tensors whereas it "
           "produces shape {0} and types {1}".format(dataset.output_shapes,
                                                     dataset.output_types))
-    return gen_dataset_ops.dataset_to_tf_record(
+    return gen_experimental_dataset_ops.experimental_dataset_to_tf_record(
         dataset._as_variant_tensor(), self._filename, self._compression_type)  # pylint: disable=protected-access
diff --git a/tensorflow/python/data/kernel_tests/BUILD b/tensorflow/python/data/kernel_tests/BUILD
index 9f7ce99cbc6414480931c92d6bd1f7ead3ec3fe4..737ba28cebd6b885c71ba53da3130b2d6abf6ee1 100644
--- a/tensorflow/python/data/kernel_tests/BUILD
+++ b/tensorflow/python/data/kernel_tests/BUILD
@@ -97,6 +97,8 @@ tf_py_test(
         "//tensorflow/python/data/util:nest",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:script_ops",
         "//tensorflow/python:sparse_tensor",
     ],
 )
@@ -442,6 +444,19 @@ cuda_py_test(
     ],
 )
 
+tf_py_test(
+    name = "options_test",
+    size = "small",
+    srcs = ["options_test.py"],
+    additional_deps = [
+        ":test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/experimental/ops:optimization_options",
+        "//tensorflow/python/data/experimental/ops:threading_options",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
 tf_py_test(
     name = "padded_batch_test",
     size = "small",
diff --git a/tensorflow/python/data/kernel_tests/dataset_checkpoint_test.py b/tensorflow/python/data/kernel_tests/dataset_checkpoint_test.py
index cdaa4fd4d5f07ff09bf910fe7ca9a8e42544e5b7..6dcd94ea0207a53be1e3444db2a3e6643b8841ed 100644
--- a/tensorflow/python/data/kernel_tests/dataset_checkpoint_test.py
+++ b/tensorflow/python/data/kernel_tests/dataset_checkpoint_test.py
@@ -64,8 +64,8 @@ class DatasetCheckpointTest(test_base.DatasetTestBase):
   def testSaveRestore(self):
 
     def _build_graph(start, stop):
-      iterator = dataset_ops.Dataset.range(start,
-                                           stop).make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(
+          dataset_ops.Dataset.range(start, stop))
       init_op = iterator.initializer
       get_next = iterator.get_next()
       save_op = self._save_op(iterator._iterator_resource)
@@ -114,7 +114,7 @@ class DatasetCheckpointTest(test_base.DatasetTestBase):
 
     def _build_graph(start, stop, num_epochs):
       dataset = dataset_ops.Dataset.range(start, stop).repeat(num_epochs)
-      iterator = dataset.make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(dataset)
       init_op = iterator.initializer
       get_next = iterator.get_next()
       save_op = self._save_op(iterator._iterator_resource)
@@ -161,7 +161,7 @@ class DatasetCheckpointTest(test_base.DatasetTestBase):
 
     def _build_graph(start, stop):
       dataset = dataset_ops.Dataset.range(start, stop)
-      iterator = dataset.make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(dataset)
       init_op = iterator.initializer
       get_next = iterator.get_next()
       save_op = self._save_op(iterator._iterator_resource)
@@ -200,7 +200,7 @@ class DatasetCheckpointTest(test_base.DatasetTestBase):
 
     def _build_graph(start, stop):
       dataset = dataset_ops.Dataset.range(start, stop)
-      iterator = dataset.make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(dataset)
       init_op = iterator.initializer
       get_next = iterator.get_next()
       save_op = self._save_op(iterator._iterator_resource)
@@ -233,8 +233,8 @@ class DatasetCheckpointTest(test_base.DatasetTestBase):
   def testMultipleSaves(self):
 
     def _build_graph(start, stop):
-      iterator = dataset_ops.Dataset.range(start,
-                                           stop).make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(
+          dataset_ops.Dataset.range(start, stop))
       init_op = iterator.initializer
       get_next = iterator.get_next()
       save_op = self._save_op(iterator._iterator_resource)
@@ -276,8 +276,8 @@ class DatasetCheckpointTest(test_base.DatasetTestBase):
   def testSaveRestoreWithRepeat(self):
 
     def _build_graph(start, stop, num_epochs):
-      iterator = dataset_ops.Dataset.range(
-          start, stop).repeat(num_epochs).make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(
+          dataset_ops.Dataset.range(start, stop).repeat(num_epochs))
       init_op = iterator.initializer
       get_next = iterator.get_next()
       save_op = self._save_op(iterator._iterator_resource)
@@ -321,8 +321,8 @@ class DatasetCheckpointTest(test_base.DatasetTestBase):
   def testSaveRestoreExhaustedIterator(self):
 
     def _build_graph(start, stop, num_epochs):
-      iterator = dataset_ops.Dataset.range(
-          start, stop).repeat(num_epochs).make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(
+          dataset_ops.Dataset.range(start, stop).repeat(num_epochs))
       init_op = iterator.initializer
       get_next = iterator.get_next()
       save_op = self._save_op(iterator._iterator_resource)
diff --git a/tensorflow/python/data/kernel_tests/dataset_test.py b/tensorflow/python/data/kernel_tests/dataset_test.py
index 2952c08be02b76fb221ee0f31f4b9fc34a14d659..820bc8e4e2e46202093475cb344a620dc71abe72 100644
--- a/tensorflow/python/data/kernel_tests/dataset_test.py
+++ b/tensorflow/python/data/kernel_tests/dataset_test.py
@@ -207,53 +207,6 @@ class DatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     self.assertEqual(2, inputs.count(ds2))
     self.assertEqual(1, inputs.count(ds3))
 
-  def testOptionsDefault(self):
-    ds = dataset_ops.Dataset.range(0)
-    self.assertEqual(dataset_ops.Options(), ds.options())
-
-  def testOptionsOnce(self):
-    options = dataset_ops.Options()
-    ds = dataset_ops.Dataset.range(0).with_options(options).cache()
-    self.assertEqual(options, ds.options())
-
-  def testOptionsTwiceSame(self):
-    options = dataset_ops.Options()
-    options.experimental_autotune = True
-    ds = dataset_ops.Dataset.range(0).with_options(options).with_options(
-        options)
-    self.assertEqual(options, ds.options())
-
-  def testOptionsTwiceDifferent(self):
-    options1 = dataset_ops.Options()
-    options1.experimental_autotune = True
-    options2 = dataset_ops.Options()
-    options2.experimental_deterministic = False
-    ds = dataset_ops.Dataset.range(0).with_options(options1).with_options(
-        options2)
-    self.assertTrue(ds.options().experimental_autotune)
-    # Explicitly check that flag is False since assertFalse allows None
-    self.assertIs(ds.options().experimental_deterministic, False)
-
-  def testOptionsTwiceDifferentError(self):
-    options1 = dataset_ops.Options()
-    options1.experimental_autotune = True
-    options2 = dataset_ops.Options()
-    options2.experimental_autotune = False
-    with self.assertRaisesRegexp(ValueError,
-                                 "Cannot merge incompatible values"):
-      dataset_ops.Dataset.range(0).with_options(options1).with_options(options2)
-
-  def testOptionsMergeOptionsFromMultipleInputs(self):
-    options1 = dataset_ops.Options()
-    options1.experimental_autotune = True
-    options2 = dataset_ops.Options()
-    options2.experimental_deterministic = True
-    ds = dataset_ops.Dataset.zip(
-        (dataset_ops.Dataset.range(0).with_options(options1),
-         dataset_ops.Dataset.range(0).with_options(options2)))
-    self.assertTrue(ds.options().experimental_autotune)
-    self.assertTrue(ds.options().experimental_deterministic)
-
   # TODO(b/119882922): use-after-free bug in eager mode.
   # pylint: disable=g-long-lambda
   @parameterized.named_parameters(
@@ -313,5 +266,6 @@ class DatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
           round_trip_dataset, [self.evaluate(tf_value_fn())],
           requires_initialization=True)
 
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/kernel_tests/from_generator_test.py b/tensorflow/python/data/kernel_tests/from_generator_test.py
index d58e3c2364d976cf56d0492d3ee7abbef9654343..11919bdaeee3d8b27e0c7644c485be4809213934 100644
--- a/tensorflow/python/data/kernel_tests/from_generator_test.py
+++ b/tensorflow/python/data/kernel_tests/from_generator_test.py
@@ -21,7 +21,6 @@ import threading
 
 import numpy as np
 
-from tensorflow.python.client import session
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
@@ -32,45 +31,27 @@ from tensorflow.python.ops import script_ops
 from tensorflow.python.platform import test
 
 
-class FromGeneratorTest(test_base.DatasetTestBase):
+@test_util.run_all_in_graph_and_eager_modes
+class DatasetConstructorTest(test_base.DatasetTestBase):
 
   def _testFromGenerator(self, generator, elem_sequence, num_repeats,
                          output_types=None):
     if output_types is None:
       output_types = dtypes.int64
-    iterator = (
-        dataset_ops.Dataset.from_generator(generator, output_types=output_types)
-        .repeat(num_repeats)
-        .prefetch(5)
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      for _ in range(2):  # Run twice to test reinitialization.
-        sess.run(init_op)
-        for _ in range(num_repeats):
-          for elem in elem_sequence:
-            self.assertAllEqual(elem, sess.run(get_next))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
+    dataset = dataset_ops.Dataset.from_generator(
+        generator, output_types=output_types).repeat(num_repeats).prefetch(5)
+    self.assertDatasetProduces(
+        dataset,
+        elem_sequence * num_repeats,
+        requires_initialization=True,
+        num_test_iterations=2)
 
   def _testFromGeneratorOneShot(self, generator, elem_sequence, num_repeats):
-    iterator = (
-        dataset_ops.Dataset.from_generator(generator, output_types=dtypes.int64)
-        .repeat(num_repeats)
-        .prefetch(5)
-        .make_one_shot_iterator())
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      for _ in range(num_repeats):
-        for elem in elem_sequence:
-          self.assertAllEqual(elem, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    dataset = dataset_ops.Dataset.from_generator(
+        generator, output_types=dtypes.int64).repeat(num_repeats).prefetch(5)
+    self.assertDatasetProduces(
+        dataset, elem_sequence * num_repeats, num_test_iterations=2)
 
-  @test_util.run_deprecated_v1
   def testFromGeneratorUsingFunction(self):
     def generator():
       for i in range(1, 100):
@@ -81,21 +62,18 @@ class FromGeneratorTest(test_base.DatasetTestBase):
     self._testFromGeneratorOneShot(generator, elem_sequence, 1)
     self._testFromGeneratorOneShot(generator, elem_sequence, 5)
 
-  @test_util.run_deprecated_v1
   def testFromGeneratorUsingList(self):
     generator = lambda: [[i] * i for i in range(1, 100)]
     elem_sequence = list(generator())
     self._testFromGenerator(generator, elem_sequence, 1)
     self._testFromGenerator(generator, elem_sequence, 5)
 
-  @test_util.run_deprecated_v1
   def testFromGeneratorUsingNdarray(self):
     generator = lambda: np.arange(100, dtype=np.int64)
     elem_sequence = list(generator())
     self._testFromGenerator(generator, elem_sequence, 1, output_types=np.int64)
     self._testFromGenerator(generator, elem_sequence, 5, output_types=np.int64)
 
-  @test_util.run_deprecated_v1
   def testFromGeneratorUsingGeneratorExpression(self):
     # NOTE(mrry): Generator *expressions* are not repeatable (or in
     # general reusable), because they eagerly evaluate the `for`
@@ -107,7 +85,6 @@ class FromGeneratorTest(test_base.DatasetTestBase):
     self._testFromGenerator(generator, elem_sequence, 1)
     self._testFromGenerator(generator, elem_sequence, 5)
 
-  @test_util.run_deprecated_v1
   def testFromMultipleConcurrentGenerators(self):
     num_inner_repeats = 5
     num_outer_repeats = 100
@@ -130,23 +107,16 @@ class FromGeneratorTest(test_base.DatasetTestBase):
           output_shapes=([None], [3]))
               .repeat(num_inner_repeats).prefetch(5))
 
-    iterator = (
-        dataset_ops.Dataset.range(num_outer_repeats)
-        .interleave(interleave_fn, cycle_length=10,
-                    block_length=len(input_list))
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for _ in range(num_inner_repeats * num_outer_repeats):
-        for elem in input_list:
-          val0, val1 = sess.run(get_next)
-          self.assertAllEqual(elem[0], val0)
-          self.assertAllEqual(elem[1], val1)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    dataset = dataset_ops.Dataset.range(num_outer_repeats).interleave(
+        interleave_fn, cycle_length=10, block_length=len(input_list))
+    get_next = self.getNext(dataset)
+    for _ in range(num_inner_repeats * num_outer_repeats):
+      for elem in input_list:
+        val0, val1 = self.evaluate(get_next())
+        self.assertAllEqual(elem[0], val0)
+        self.assertAllEqual(elem[1], val1)
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
   # TODO(b/67868766): Reenable this when the source of flakiness is discovered.
   def _testFromGeneratorsRunningInParallel(self):
@@ -189,23 +159,16 @@ class FromGeneratorTest(test_base.DatasetTestBase):
       return dataset_ops.Dataset.from_generator(
           generator, output_types=dtypes.int64, output_shapes=[]).prefetch(2)
 
-    iterator = (
-        dataset_ops.Dataset.range(num_parallel_iterators)
-        .interleave(
-            interleave_fn, cycle_length=num_parallel_iterators, block_length=1)
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for elem in [0, 1]:
-        for _ in range(num_parallel_iterators):
-          self.assertAllEqual(elem, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    dataset = dataset_ops.Dataset.range(num_parallel_iterators).interleave(
+        interleave_fn, cycle_length=num_parallel_iterators, block_length=1)
+    get_next = self.getNext(dataset)
+
+    for elem in [0, 1]:
+      for _ in range(num_parallel_iterators):
+        self.assertAllEqual(elem, self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
-  @test_util.run_deprecated_v1
   def testFromGeneratorImplicitConversion(self):
     def generator():
       yield [1]
@@ -213,45 +176,28 @@ class FromGeneratorTest(test_base.DatasetTestBase):
       yield [3]
 
     for dtype in [dtypes.int8, dtypes.int32, dtypes.int64]:
-      iterator = (dataset_ops.Dataset.from_generator(
+      dataset = dataset_ops.Dataset.from_generator(
           generator, output_types=dtype, output_shapes=[1])
-                  .make_initializable_iterator())
-      init_op = iterator.initializer
-      get_next = iterator.get_next()
-
-      self.assertEqual(dtype, get_next.dtype)
-
-      with self.cached_session() as sess:
-        sess.run(init_op)
-        for expected in [[1], [2], [3]]:
-          next_val = sess.run(get_next)
-          self.assertEqual(dtype.as_numpy_dtype, next_val.dtype)
-          self.assertAllEqual(expected, next_val)
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-  @test_util.run_deprecated_v1
+      get_next = self.getNext(dataset)
+
+      for expected in [[1], [2], [3]]:
+        next_val = self.evaluate(get_next())
+        self.assertEqual(dtype.as_numpy_dtype, next_val.dtype)
+        self.assertAllEqual(expected, next_val)
+      with self.assertRaises(errors.OutOfRangeError):
+        self.evaluate(get_next())
+
   def testFromGeneratorString(self):
     def generator():
       yield "foo"
       yield b"bar"
       yield u"baz"
 
-    iterator = (dataset_ops.Dataset.from_generator(
+    dataset = dataset_ops.Dataset.from_generator(
         generator, output_types=dtypes.string, output_shapes=[])
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for expected in [b"foo", b"bar", b"baz"]:
-        next_val = sess.run(get_next)
-        self.assertAllEqual(expected, next_val)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    self.assertDatasetProduces(
+        dataset, expected_output=[b"foo", b"bar", b"baz"])
 
-  @test_util.run_deprecated_v1
   def testFromGeneratorTypeError(self):
     def generator():
       yield np.array([1, 2, 3], dtype=np.int64)
@@ -259,23 +205,19 @@ class FromGeneratorTest(test_base.DatasetTestBase):
       yield "ERROR"
       yield np.array([7, 8, 9], dtype=np.int64)
 
-    iterator = (dataset_ops.Dataset.from_generator(
+    dataset = dataset_ops.Dataset.from_generator(
         generator, output_types=dtypes.int64, output_shapes=[3])
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      self.assertAllEqual([1, 2, 3], sess.run(get_next))
-      self.assertAllEqual([4, 5, 6], sess.run(get_next))
-      with self.assertRaisesOpError("The expected type was int64"):
-        sess.run(get_next)
-      self.assertAllEqual([7, 8, 9], sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
 
-  @test_util.run_deprecated_v1
+    get_next = self.getNext(dataset)
+
+    self.assertAllEqual([1, 2, 3], self.evaluate(get_next()))
+    self.assertAllEqual([4, 5, 6], self.evaluate(get_next()))
+    with self.assertRaisesOpError("The expected type was int64"):
+      self.evaluate(get_next())
+    self.assertAllEqual([7, 8, 9], self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
   def testFromGeneratorShapeError(self):
     def generator():
       yield np.array([1, 2, 3], dtype=np.int64)
@@ -283,23 +225,18 @@ class FromGeneratorTest(test_base.DatasetTestBase):
       yield np.array([7, 8, 9, 10], dtype=np.int64)
       yield np.array([11, 12, 13], dtype=np.int64)
 
-    iterator = (dataset_ops.Dataset.from_generator(
+    dataset = dataset_ops.Dataset.from_generator(
         generator, output_types=dtypes.int64, output_shapes=[3])
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      self.assertAllEqual([1, 2, 3], sess.run(get_next))
-      self.assertAllEqual([4, 5, 6], sess.run(get_next))
-      with self.assertRaisesOpError(r"element of shape \(3,\) was expected"):
-        sess.run(get_next)
-      self.assertAllEqual([11, 12, 13], sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    get_next = self.getNext(dataset)
+
+    self.assertAllEqual([1, 2, 3], self.evaluate(get_next()))
+    self.assertAllEqual([4, 5, 6], self.evaluate(get_next()))
+    with self.assertRaisesOpError(r"element of shape \(3,\) was expected"):
+      self.evaluate(get_next())
+    self.assertAllEqual([11, 12, 13], self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
-  @test_util.run_deprecated_v1
   def testFromGeneratorStructureError(self):
     def generator():
       yield 1, 2
@@ -308,46 +245,31 @@ class FromGeneratorTest(test_base.DatasetTestBase):
       yield 6, 7, 8
       yield 9, 10
 
-    iterator = (dataset_ops.Dataset.from_generator(
+    dataset = dataset_ops.Dataset.from_generator(
         generator, output_types=(dtypes.int64, dtypes.int64))
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      self.assertEqual((1, 2), sess.run(get_next))
-      self.assertEqual((3, 4), sess.run(get_next))
-      with self.assertRaisesOpError(
-          r"The expected structure was \(tf\.int64, tf\.int64\)"):
-        sess.run(get_next)
-      with self.assertRaisesOpError(
-          r"The expected structure was \(tf\.int64, tf\.int64\)"):
-        sess.run(get_next)
-      self.assertEqual((9, 10), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    get_next = self.getNext(dataset)
+
+    self.assertEqual((1, 2), self.evaluate(get_next()))
+    self.assertEqual((3, 4), self.evaluate(get_next()))
+    with self.assertRaisesOpError(
+        r"The expected structure was \(tf\.int64, tf\.int64\)"):
+      self.evaluate(get_next())
+    with self.assertRaisesOpError(
+        r"The expected structure was \(tf\.int64, tf\.int64\)"):
+      self.evaluate(get_next())
+    self.assertEqual((9, 10), self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
-  @test_util.run_deprecated_v1
   def testFromGeneratorHeterogeneous(self):
     def generator():
       yield 1
       yield [2, 3]
 
-    iterator = (
-        dataset_ops.Dataset.from_generator(
-            generator, output_types=dtypes.int64).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
+    dataset = dataset_ops.Dataset.from_generator(
+        generator, output_types=dtypes.int64)
+    self.assertDatasetProduces(dataset, expected_output=[1, [2, 3]])
 
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      self.assertAllEqual(1, sess.run(get_next))
-      self.assertAllEqual([2, 3], sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  @test_util.run_deprecated_v1
   def testFromGeneratorStopShort(self):
 
     def generator():
@@ -355,18 +277,12 @@ class FromGeneratorTest(test_base.DatasetTestBase):
       yield 1
       yield 2
 
-    iterator = (
-        dataset_ops.Dataset.from_generator(
-            generator, output_types=dtypes.int64).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      self.assertAllEqual(0, sess.run(get_next))
-      self.assertAllEqual(1, sess.run(get_next))
+    dataset = dataset_ops.Dataset.from_generator(
+        generator, output_types=dtypes.int64)
+    get_next = self.getNext(dataset)
+    self.assertAllEqual(0, self.evaluate(get_next()))
+    self.assertAllEqual(1, self.evaluate(get_next()))
 
-  @test_util.run_deprecated_v1
   def testFromGeneratorDestructorCalled(self):
     # Use an `Event` to signal that the generator has been deleted.
     event = threading.Event()
@@ -385,23 +301,18 @@ class FromGeneratorTest(test_base.DatasetTestBase):
       def __del__(self):
         event.set()
 
-    iterator = dataset_ops.Dataset.from_generator(
-        GeneratorWrapper,
-        output_types=dtypes.int64).take(2).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
+    dataset = dataset_ops.Dataset.from_generator(
+        GeneratorWrapper, output_types=dtypes.int64).take(2)
+    get_next = self.getNext(dataset)
 
-    with session.Session() as sess:
-      sess.run(init_op)
-      self.assertAllEqual(42, sess.run(get_next))
-      self.assertAllEqual(42, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-      # Test that `GeneratorWrapper` object is destroyed when the
-      # iterator terminates (and the generator iterator is deleted).
-      self.assertTrue(event.is_set())
+    self.assertAllEqual(42, self.evaluate(get_next()))
+    self.assertAllEqual(42, self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+    # Test that `GeneratorWrapper` object is destroyed when the
+    # iterator terminates (and the generator iterator is deleted).
+    self.assertTrue(event.is_set())
 
-  @test_util.run_deprecated_v1
   def testFromGeneratorWithArgs(self):
 
     def flat_map_fn(elem):
@@ -414,22 +325,10 @@ class FromGeneratorTest(test_base.DatasetTestBase):
           generator_with_arg, output_types=dtypes.int64, output_shapes=(),
           args=(elem,))
 
-    iterator = (dataset_ops.Dataset
-                .range(5)
-                .flat_map(flat_map_fn)
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      expected = [1, 2, 2, 3, 3, 3, 4, 4, 4, 4]
-      for x in expected:
-        self.assertEqual(x, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    dataset = dataset_ops.Dataset.range(5).flat_map(flat_map_fn)
+    self.assertDatasetProduces(
+        dataset, expected_output=[1, 2, 2, 3, 3, 3, 4, 4, 4, 4])
 
-  @test_util.run_deprecated_v1
   def testFromGeneratorWithTwoArgs(self):
 
     def flat_map_fn(elem, message):
@@ -442,27 +341,17 @@ class FromGeneratorTest(test_base.DatasetTestBase):
           generator_with_arg, output_types=(dtypes.int64, dtypes.string),
           output_shapes=((), ()), args=(elem, message))
 
-    iterator = (
-        dataset_ops.Dataset.zip(
-            (dataset_ops.Dataset.range(5),
-             dataset_ops.Dataset.from_tensors("Hi!").repeat(None)))
-        .flat_map(flat_map_fn)
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      expected = [(0, b"Hi!"),
-                  (0, b"Hi!"), (1, b"Hi!"),
-                  (0, b"Hi!"), (1, b"Hi!"), (2, b"Hi!"),
-                  (0, b"Hi!"), (1, b"Hi!"), (2, b"Hi!"), (3, b"Hi!")]
-      for x in expected:
-        self.assertEqual(x, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    dataset = dataset_ops.Dataset.zip(
+        (dataset_ops.Dataset.range(5),
+         dataset_ops.Dataset.from_tensors("Hi!").repeat(None)
+        )).flat_map(flat_map_fn)
+
+    self.assertDatasetProduces(
+        dataset,
+        expected_output=[(0, b"Hi!"), (0, b"Hi!"), (1, b"Hi!"), (0, b"Hi!"),
+                         (1, b"Hi!"), (2, b"Hi!"), (0, b"Hi!"), (1, b"Hi!"),
+                         (2, b"Hi!"), (3, b"Hi!")])
 
-  @test_util.run_deprecated_v1
   def testGeneratorDatasetFinalizeFunctionCalled(self):
     # NOTE(mrry): This test tests the internal `_GeneratorDataset`,
     # which affords more control over what the finalize function can do than
@@ -479,20 +368,15 @@ class FromGeneratorTest(test_base.DatasetTestBase):
                                 stateful=True)
 
     dummy = constant_op.constant(37)
-    iterator = (dataset_ops._GeneratorDataset(dummy, lambda x: x,
-                                              lambda x: x, finalize_fn)
-                .take(2)
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      self.assertAllEqual(37, sess.run(get_next))
-      self.assertAllEqual(37, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-        self.assertTrue(event.is_set())
+    dataset = dataset_ops._GeneratorDataset(dummy, lambda x: x, lambda x: x,
+                                            finalize_fn).take(2)
+    get_next = self.getNext(dataset)
+
+    self.assertAllEqual(37, self.evaluate(get_next()))
+    self.assertAllEqual(37, self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+      self.assertTrue(event.is_set())
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/kernel_tests/from_sparse_tensor_slices_test.py b/tensorflow/python/data/kernel_tests/from_sparse_tensor_slices_test.py
index 80ed26e7fbc38dd8f453ca3c69022350374ed511..ef608ebb67007c7605e7bea36058d0cd5c5d146f 100644
--- a/tensorflow/python/data/kernel_tests/from_sparse_tensor_slices_test.py
+++ b/tensorflow/python/data/kernel_tests/from_sparse_tensor_slices_test.py
@@ -36,8 +36,8 @@ class FromSparseTensorSlicesTest(test_base.DatasetTestBase):
   def testSkipEagerFromSparseTensorSlices(self):
     """Test a dataset based on slices of a `tf.SparseTensor`."""
     st = array_ops.sparse_placeholder(dtypes.float64)
-    iterator = (dataset_ops.Dataset.from_sparse_tensor_slices(st)
-                .make_initializable_iterator())
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.from_sparse_tensor_slices(st))
     init_op = iterator.initializer
     get_next = sparse_tensor.SparseTensor(*iterator.get_next())
 
diff --git a/tensorflow/python/data/kernel_tests/from_tensors_test.py b/tensorflow/python/data/kernel_tests/from_tensors_test.py
index ce7063757243538342ee54ec1e11f68cf382e77f..ab3c15263fdaa0829686f90450e0e79081299a2e 100644
--- a/tensorflow/python/data/kernel_tests/from_tensors_test.py
+++ b/tensorflow/python/data/kernel_tests/from_tensors_test.py
@@ -246,7 +246,7 @@ class FromTensorsTest(test_base.DatasetTestBase):
         dataset = dataset.map(lambda x: x + var_1.read_value())
       sess.run(var_1.initializer)
 
-      iterator = dataset.make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(dataset)
       sess.run(iterator.initializer)
 
       with self.assertRaisesRegexp(
diff --git a/tensorflow/python/data/kernel_tests/interleave_test.py b/tensorflow/python/data/kernel_tests/interleave_test.py
index cd1d850304e060c33411fd3c0543ae5b5f087898..05a211afcc177faaeb1a00ad03d8f117448f8315 100644
--- a/tensorflow/python/data/kernel_tests/interleave_test.py
+++ b/tensorflow/python/data/kernel_tests/interleave_test.py
@@ -116,7 +116,7 @@ def _make_coordinated_sloppy_dataset(input_values, cycle_length, block_length,
   dataset = dataset_ops.Dataset.from_tensor_slices(input_values).repeat(
       2).interleave(interleave_fn, cycle_length, block_length,
                     num_parallel_calls).with_options(options)
-  iterator = dataset.make_one_shot_iterator()
+  iterator = dataset_ops.make_one_shot_iterator(dataset)
   get_next = iterator.get_next()
   return get_next, coordination_events
 
@@ -264,6 +264,7 @@ class InterleaveTest(test_base.DatasetTestBase, parameterized.TestCase):
       ("8", np.int64([4, 0, 6]), 2, 3, 1),
       ("9", np.int64([4, 0, 6]), 2, 3, 2),
   )
+  @test_util.run_v1_only("b/120545219")
   def testSkipEagerSloppyInterleaveInOrder(self, input_values, cycle_length,
                                            block_length, num_parallel_calls):
     get_next, coordination_events = _make_coordinated_sloppy_dataset(
@@ -286,6 +287,7 @@ class InterleaveTest(test_base.DatasetTestBase, parameterized.TestCase):
       ("3", np.int64([4, 5, 6]), 3, 2, 3),
       ("4", np.int64([4, 0, 6]), 2, 3, 2),
   )
+  @test_util.run_v1_only("b/120545219")
   def testSkipEagerSloppyInterleaveOutOfOrder(self, input_values, cycle_length,
                                               block_length, num_parallel_calls):
     get_next, coordination_events = _make_coordinated_sloppy_dataset(
diff --git a/tensorflow/python/data/kernel_tests/iterator_checkpoint_test.py b/tensorflow/python/data/kernel_tests/iterator_checkpoint_test.py
index fc4164c81a226d57084bd36e6caffbe8c3018ccb..91b356691b75eb337ad61643646ba717e4929ab9 100644
--- a/tensorflow/python/data/kernel_tests/iterator_checkpoint_test.py
+++ b/tensorflow/python/data/kernel_tests/iterator_checkpoint_test.py
@@ -40,7 +40,7 @@ class IteratorCheckpointingTest(test_base.DatasetTestBase):
     dataset = dataset_ops.Dataset.from_tensor_slices([1, 2, 3, 4, 5, 6]).map(
         math_ops.square).batch(2)
     iterator = iter(dataset) if context.executing_eagerly(
-    ) else dataset.make_one_shot_iterator()
+    ) else dataset_ops.make_one_shot_iterator(dataset)
     get_next = iterator.get_next if context.executing_eagerly(
     ) else functools.partial(self.evaluate, iterator.get_next())
     checkpoint = checkpointable_utils.Checkpoint(iterator=iterator)
@@ -61,16 +61,16 @@ class IteratorCheckpointingTest(test_base.DatasetTestBase):
         [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
     dataset = dataset.map(math_ops.square).batch(2)
     iterator_1 = iter(dataset) if context.executing_eagerly(
-    ) else dataset.make_one_shot_iterator()
+    ) else dataset_ops.make_one_shot_iterator(dataset)
     get_next_1 = iterator_1.get_next if context.executing_eagerly(
     ) else functools.partial(self.evaluate, iterator_1.get_next())
     iterator_2 = iter(dataset) if context.executing_eagerly(
-    ) else dataset.make_one_shot_iterator()
+    ) else dataset_ops.make_one_shot_iterator(dataset)
     get_next_2 = iterator_2.get_next if context.executing_eagerly(
     ) else functools.partial(self.evaluate, iterator_2.get_next())
     dataset_2 = dataset_ops.Dataset.range(10)
     iterator_3 = iter(dataset_2) if context.executing_eagerly(
-    ) else dataset_2.make_one_shot_iterator()
+    ) else dataset_ops.make_one_shot_iterator(dataset_2)
     get_next_3 = iterator_3.get_next if context.executing_eagerly(
     ) else functools.partial(self.evaluate, iterator_3.get_next())
     checkpoint = checkpointable_utils.Checkpoint(
@@ -93,7 +93,7 @@ class IteratorCheckpointingTest(test_base.DatasetTestBase):
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
     dataset = dataset_ops.Dataset.range(3)
     iterator = iter(dataset) if context.executing_eagerly(
-    ) else dataset.make_one_shot_iterator()
+    ) else dataset_ops.make_one_shot_iterator(dataset)
     get_next = iterator.get_next if context.executing_eagerly(
     ) else functools.partial(self.evaluate, iterator.get_next())
     checkpoint = checkpointable_utils.Checkpoint(iterator=iterator)
@@ -113,7 +113,7 @@ class IteratorCheckpointingTest(test_base.DatasetTestBase):
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
     dataset = dataset_ops.Dataset.range(10)
     iterator = iter(dataset) if context.executing_eagerly(
-    ) else dataset.make_initializable_iterator()
+    ) else dataset_ops.make_initializable_iterator(dataset)
     get_next = iterator.get_next
     checkpoint = checkpointable_utils.Checkpoint(iterator=iterator)
     for i in range(5):
diff --git a/tensorflow/python/data/kernel_tests/iterator_cluster_test.py b/tensorflow/python/data/kernel_tests/iterator_cluster_test.py
index c1f856ec62032d6433625e5b1b14cab668cb8897..20088234953b1cdc8f85381ded45cf22aa93c75a 100644
--- a/tensorflow/python/data/kernel_tests/iterator_cluster_test.py
+++ b/tensorflow/python/data/kernel_tests/iterator_cluster_test.py
@@ -39,6 +39,7 @@ from tensorflow.python.platform import test
 
 class IteratorClusterTest(test.TestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testRemoteIteratorWithoutRemoteCallFail(self):
     worker_config = config_pb2.ConfigProto()
     worker_config.device_count["CPU"] = 2
@@ -47,7 +48,7 @@ class IteratorClusterTest(test.TestCase):
 
     with ops.device("/job:worker/replica:0/task:0/cpu:1"):
       dataset_3 = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
-      iterator_3 = dataset_3.make_one_shot_iterator()
+      iterator_3 = dataset_ops.make_one_shot_iterator(dataset_3)
       iterator_3_handle = iterator_3.string_handle()
 
     with ops.device("/job:worker/replica:0/task:0/cpu:0"):
@@ -62,7 +63,7 @@ class IteratorClusterTest(test.TestCase):
   def _testRemoteIteratorHelper(self, device0, device1, target):
     with ops.device(device1):
       dataset_3 = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
-      iterator_3 = dataset_3.make_one_shot_iterator()
+      iterator_3 = dataset_ops.make_one_shot_iterator(dataset_3)
       iterator_3_handle = iterator_3.string_handle()
 
     @function.Defun(dtypes.string)
@@ -92,6 +93,7 @@ class IteratorClusterTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(remote_op, feed_dict={target_placeholder: device1})
 
+  @test_util.run_v1_only("b/120545219")
   def testRemoteIteratorUsingRemoteCallOp(self):
     worker_config = config_pb2.ConfigProto()
     worker_config.device_count["CPU"] = 2
@@ -102,6 +104,7 @@ class IteratorClusterTest(test.TestCase):
                                    "/job:worker/replica:0/task:0/cpu:1",
                                    worker[0].target)
 
+  @test_util.run_v1_only("b/120545219")
   def testRemoteIteratorUsingRemoteCallOpCrossProcess(self):
     workers, _ = test_util.create_local_cluster(2, 1)
 
@@ -109,6 +112,7 @@ class IteratorClusterTest(test.TestCase):
                                    "/job:worker/replica:0/task:1/cpu:0",
                                    workers[0].target)
 
+  @test_util.run_v1_only("b/120545219")
   def testCaptureHashTableInSharedIterator(self):
     worker, _ = test_util.create_local_cluster(1, 1)
 
@@ -143,6 +147,7 @@ class IteratorClusterTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
+  @test_util.run_v1_only("b/120545219")
   def testImplicitDisposeParallelMapDataset(self):
     # Tests whether a parallel map dataset will be cleaned up correctly when
     # the pipeline does not run it until exhaustion.
@@ -161,7 +166,7 @@ class IteratorClusterTest(test.TestCase):
         dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
         .repeat(None).prefetch(10000))
 
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
diff --git a/tensorflow/python/data/kernel_tests/iterator_test.py b/tensorflow/python/data/kernel_tests/iterator_test.py
index b836a6aecf1aa026a311a32af74406430a8c1642..916cf8bb45ce7dbf55261d3f67ca17c0cdbb10fd 100644
--- a/tensorflow/python/data/kernel_tests/iterator_test.py
+++ b/tensorflow/python/data/kernel_tests/iterator_test.py
@@ -61,7 +61,7 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
     side = constant_op.constant(0.)
     add = lambda x: x + side
     dataset = dataset_ops.Dataset.from_tensor_slices(component).map(add)
-    value = dataset.make_one_shot_iterator().get_next()
+    value = dataset_ops.make_one_shot_iterator(dataset).get_next()
     self.assertIsNone(gradients_impl.gradients(value, component)[0])
     self.assertIsNone(gradients_impl.gradients(value, side)[0])
     self.assertIsNone(gradients_impl.gradients(value, [component, side])[0])
@@ -75,7 +75,7 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
     with self.assertRaisesRegexp(
         ValueError, r"`Dataset.make_one_shot_iterator\(\)` does not support "
         "datasets that capture stateful objects.+myvar"):
-      dataset.make_one_shot_iterator()
+      dataset_ops.make_one_shot_iterator(dataset)
 
   @test_util.run_deprecated_v1
   def testOneShotIterator(self):
@@ -86,9 +86,9 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
     def _map_fn(x, y, z):
       return math_ops.square(x), math_ops.square(y), math_ops.square(z)
 
-    iterator = (
+    iterator = dataset_ops.make_one_shot_iterator(
         dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
-        .repeat(14).make_one_shot_iterator())
+        .repeat(14))
     get_next = iterator.get_next()
 
     self.assertEqual([c.shape[1:] for c in components],
@@ -113,9 +113,9 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
     def _map_fn(x, y, z):
       return math_ops.square(x), math_ops.square(y), math_ops.square(z)
 
-    iterator = (
+    iterator = dataset_ops.make_one_shot_iterator(
         dataset_ops.Dataset.from_tensor_slices(tensor_components)
-        .map(_map_fn).repeat(14).make_one_shot_iterator())
+        .map(_map_fn).repeat(14))
     get_next = iterator.get_next()
 
     self.assertEqual([c.shape[1:] for c in components],
@@ -140,9 +140,9 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
       def _map_fn(x, y, z):
         return math_ops.square(x), math_ops.square(y), math_ops.square(z)
 
-      iterator = (
+      iterator = dataset_ops.make_one_shot_iterator(
           dataset_ops.Dataset.from_tensor_slices(components)
-          .map(_map_fn).repeat(14).make_one_shot_iterator())
+          .map(_map_fn).repeat(14))
       return iterator.get_next()
 
     server = server_lib.Server.create_local_server()
@@ -169,7 +169,7 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
   @test_util.run_deprecated_v1
   def testOneShotIteratorNonBlocking(self):
     dataset = dataset_ops.Dataset.from_tensors([1, 2, 3]).map(lambda x: x * x)
-    iterator = dataset.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
     next_element = iterator.get_next()
 
     # Create a session with a single thread to ensure that the
@@ -211,7 +211,7 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
     dataset = dataset_ops.Dataset.from_tensors(
         array_ops.check_numerics(
             constant_op.constant(1.0) / constant_op.constant(0.0), "oops"))
-    iterator = dataset.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
     next_element = iterator.get_next()
 
     with self.cached_session() as sess:
@@ -289,9 +289,8 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
   @test_util.run_deprecated_v1
   def testNotInitializedError(self):
     components = (np.array(1), np.array([1, 2, 3]), np.array(37.0))
-    iterator = (
-        dataset_ops.Dataset.from_tensors(components)
-        .make_initializable_iterator())
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.from_tensors(components))
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
@@ -404,8 +403,8 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
     dataset_3 = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
     dataset_4 = dataset_ops.Dataset.from_tensor_slices([10, 20, 30, 40])
 
-    iterator_3 = dataset_3.make_one_shot_iterator()
-    iterator_4 = dataset_4.make_one_shot_iterator()
+    iterator_3 = dataset_ops.make_one_shot_iterator(dataset_3)
+    iterator_4 = dataset_ops.make_one_shot_iterator(dataset_4)
 
     handle_placeholder = array_ops.placeholder(dtypes.string, shape=[])
     feedable_iterator = iterator_ops.Iterator.from_string_handle(
@@ -461,8 +460,8 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
       dataset_3 = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
       dataset_4 = dataset_ops.Dataset.from_tensor_slices([10, 20, 30, 40])
 
-      iterator_3 = dataset_3.make_one_shot_iterator()
-      iterator_4 = dataset_4.make_one_shot_iterator()
+      iterator_3 = dataset_ops.make_one_shot_iterator(dataset_3)
+      iterator_4 = dataset_ops.make_one_shot_iterator(dataset_4)
 
       handle_placeholder = array_ops.placeholder(dtypes.string, shape=[])
       feedable_iterator = iterator_ops.Iterator.from_string_handle(
@@ -522,8 +521,8 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
   @test_util.run_deprecated_v1
   def testIteratorStringHandleReuseTensorObject(self):
     dataset = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
-    one_shot_iterator = dataset.make_one_shot_iterator()
-    initializable_iterator = dataset.make_initializable_iterator()
+    one_shot_iterator = dataset_ops.make_one_shot_iterator(dataset)
+    initializable_iterator = dataset_ops.make_initializable_iterator(dataset)
     structure_iterator = iterator_ops.Iterator.from_structure(
         dataset.output_types)
 
@@ -564,10 +563,10 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
         handle_placeholder, dtypes.int32)
 
     with self.cached_session() as sess:
-      handle_int_scalar = sess.run(
-          dataset_int_scalar.make_one_shot_iterator().string_handle())
-      handle_float_vector = sess.run(
-          dataset_float_vector.make_one_shot_iterator().string_handle())
+      handle_int_scalar = sess.run(dataset_ops.make_one_shot_iterator(
+          dataset_int_scalar).string_handle())
+      handle_float_vector = sess.run(dataset_ops.make_one_shot_iterator(
+          dataset_float_vector).string_handle())
 
       self.assertEqual(1,
                        sess.run(
@@ -596,7 +595,7 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
 
     with ops.device("/job:localhost/replica:0/task:0/cpu:1"):
       dataset_3 = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
-      iterator_3 = dataset_3.make_one_shot_iterator()
+      iterator_3 = dataset_ops.make_one_shot_iterator(dataset_3)
       iterator_3_handle = iterator_3.string_handle()
 
     @function.Defun(dtypes.string)
@@ -669,7 +668,7 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
     for device in worker_devices:
       with ops.device(device):
         src = dataset_ops.Dataset.from_tensor_slices([device])
-        itr = src.make_one_shot_iterator()
+        itr = dataset_ops.make_one_shot_iterator(src)
         itr_handles.append(itr.string_handle())
 
     targets = dataset_ops.Dataset.from_tensor_slices(worker_devices)
@@ -687,7 +686,7 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
 
     with ops.device("/job:client"):
       client_dataset = dataset_ops.Dataset.zip((targets, handles)).map(map_fn)
-      itr = client_dataset.make_initializable_iterator()
+      itr = dataset_ops.make_initializable_iterator(client_dataset)
       n = itr.get_next()
 
     with session.Session(s3.target, config=config) as sess:
@@ -705,7 +704,7 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
 
     with ops.device("/job:localhost/replica:0/task:0/cpu:0"):
       dataset_3 = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
-      iterator_3 = dataset_3.make_one_shot_iterator()
+      iterator_3 = dataset_ops.make_one_shot_iterator(dataset_3)
       iterator_3_handle = iterator_3.string_handle()
 
     def _encode_raw(byte_array):
@@ -777,8 +776,8 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
     def _build_range_dataset_graph():
       start = 1
       stop = 10
-      iterator = dataset_ops.Dataset.range(start,
-                                           stop).make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(
+          dataset_ops.Dataset.range(start, stop))
       init_op = iterator.initializer
       get_next = iterator.get_next()
       save_op = _save_op(iterator._iterator_resource)
@@ -787,8 +786,8 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
 
     def _build_reader_dataset_graph():
       filenames = ["test"]  # Does not exist but we don't care in this test.
-      iterator = readers.FixedLengthRecordDataset(
-          filenames, 1, 0, 0).make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(
+          readers.FixedLengthRecordDataset(filenames, 1, 0, 0))
       init_op = iterator.initializer
       get_next_op = iterator.get_next()
       save_op = _save_op(iterator._iterator_resource)
@@ -815,7 +814,7 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
 
   @test_util.run_deprecated_v1
   def testRepeatedGetNextWarning(self):
-    iterator = dataset_ops.Dataset.range(10).make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(dataset_ops.Dataset.range(10))
     warnings.simplefilter("always")
     with warnings.catch_warnings(record=True) as w:
       for _ in range(100):
@@ -858,8 +857,8 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
                             expected_output_classes, expected_output_types,
                             expected_output_shapes):
     tf_value = tf_value_fn()
-    iterator = dataset_ops.Dataset.from_tensors(
-        tf_value).make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(
+        dataset_ops.Dataset.from_tensors(tf_value))
 
     self.assertTrue(expected_element_structure.is_compatible_with(
         iterator._element_structure))
@@ -872,7 +871,8 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
 
   def testIteratorGetNextName(self):
     with ops.Graph().as_default():
-      iterator = dataset_ops.Dataset.from_tensors(37.0).make_one_shot_iterator()
+      iterator = dataset_ops.make_one_shot_iterator(
+          dataset_ops.Dataset.from_tensors(37.0))
       next_element = iterator.get_next(name="overridden_name")
       self.assertEqual("overridden_name", next_element.op.name)
 
diff --git a/tensorflow/python/data/kernel_tests/list_files_test.py b/tensorflow/python/data/kernel_tests/list_files_test.py
index 789f1ab6de7b45cae3b1541d2ecf8ce381e009b3..a70c4b081d5c710082eb485a1dbb6179a90da2ce 100644
--- a/tensorflow/python/data/kernel_tests/list_files_test.py
+++ b/tensorflow/python/data/kernel_tests/list_files_test.py
@@ -82,27 +82,29 @@ class ListFilesTest(test_base.DatasetTestBase):
     dataset = dataset_ops.Dataset.list_files(
         path.join(self.tmp_dir, '*'), shuffle=True, seed=37)
 
-    full_filenames = [compat.as_bytes(path.join(self.tmp_dir, filename))
-                      for filename in filenames]
+    expected_filenames = [
+        compat.as_bytes(path.join(self.tmp_dir, filename))
+        for filename in filenames
+    ]
 
-    all_produced_filenames = []
+    all_actual_filenames = []
     for _ in range(3):
-      produced_filenames = []
+      actual_filenames = []
       next_element = self.getNext(dataset, requires_initialization=True)
       try:
         while True:
-          produced_filenames.append(self.evaluate(next_element()))
+          actual_filenames.append(self.evaluate(next_element()))
       except errors.OutOfRangeError:
         pass
-      all_produced_filenames.append(produced_filenames)
+      all_actual_filenames.append(actual_filenames)
 
     # Each run should produce the same set of filenames, which may be
-    # different from the order of `full_filenames`.
-    self.assertItemsEqual(full_filenames, all_produced_filenames[0])
+    # different from the order of `expected_filenames`.
+    self.assertItemsEqual(expected_filenames, all_actual_filenames[0])
     # However, the different runs should produce filenames in the same order
     # as each other.
-    self.assertEqual(all_produced_filenames[0], all_produced_filenames[1])
-    self.assertEqual(all_produced_filenames[0], all_produced_filenames[2])
+    self.assertEqual(all_actual_filenames[0], all_actual_filenames[1])
+    self.assertEqual(all_actual_filenames[0], all_actual_filenames[2])
 
   # TODO(b/117581999): eager mode assertion fail wrapped, debug.
   def tesSkipEagerEmptyDirectoryInitializer(self):
@@ -169,16 +171,17 @@ class ListFilesTest(test_base.DatasetTestBase):
         path.join(self.tmp_dir, '*'), shuffle=False).repeat(2)
     next_element = self.getNext(dataset)
 
-    full_filenames = []
-    produced_filenames = []
+    expected_filenames = []
+    actual_filenames = []
     for filename in filenames * 2:
-      full_filenames.append(compat.as_bytes(path.join(self.tmp_dir, filename)))
-      produced_filenames.append(compat.as_bytes(self.evaluate(next_element())))
+      expected_filenames.append(
+          compat.as_bytes(path.join(self.tmp_dir, filename)))
+      actual_filenames.append(compat.as_bytes(self.evaluate(next_element())))
     with self.assertRaises(errors.OutOfRangeError):
       self.evaluate(next_element())
-    self.assertItemsEqual(full_filenames, produced_filenames)
-    self.assertEqual(produced_filenames[:len(filenames)],
-                     produced_filenames[len(filenames):])
+    self.assertItemsEqual(expected_filenames, actual_filenames)
+    self.assertEqual(actual_filenames[:len(filenames)],
+                     actual_filenames[len(filenames):])
 
   def testMultiplePatternsAsList(self):
     filenames = ['a.txt', 'b.py', 'c.py', 'd.pyc']
diff --git a/tensorflow/python/data/kernel_tests/map_test.py b/tensorflow/python/data/kernel_tests/map_test.py
index a9c4d79429a2ecf15c1edc5b29db42c9ad0cb9c7..67ef98f9fe9eee52d64c680ea0bd87d63cbf3973 100644
--- a/tensorflow/python/data/kernel_tests/map_test.py
+++ b/tensorflow/python/data/kernel_tests/map_test.py
@@ -28,12 +28,14 @@ from tensorflow.core.framework import attr_value_pb2
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_util
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import data_flow_ops
@@ -45,6 +47,7 @@ from tensorflow.python.ops import script_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
@@ -77,18 +80,24 @@ def _make_coordinated_sloppy_dataset(num_elements, num_parallel_calls):
   options.experimental_deterministic = False
   dataset = dataset_ops.Dataset.range(num_elements).map(
       map_fn, num_parallel_calls).with_options(options)
-  iterator = dataset.make_one_shot_iterator()
+  iterator = dataset_ops.make_one_shot_iterator(dataset)
   next_element = iterator.get_next()
   return next_element, coordination_events
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   def _buildMapDataset(self, components, count):
+
     def _map_fn(x, y, z):
       return math_ops.square(x), math_ops.square(y), math_ops.square(z)
-    return (dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
-            .repeat(count))
+
+    dataset = dataset_ops.Dataset.from_tensor_slices(components).map(
+        _map_fn).repeat(count)
+    self.assertEqual([c.shape[1:] for c in components],
+                     [shape for shape in dataset.output_shapes])
+    return dataset
 
   def testMapDataset(self):
     """Test an dataset that maps a TF function across its input elements."""
@@ -97,34 +106,32 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     components = (np.arange(7),
                   np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
                   np.array(37.0) * np.arange(7))
-    count = array_ops.placeholder(dtypes.int64, shape=[])
-
-    dataset = self._buildMapDataset(components, count)
-    iterator = dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([c.shape[1:] for c in components],
-                     [t.shape for t in get_next])
 
+    # Test single-threaded access to the iterator.
+    get_next = self.getNext(self._buildMapDataset(components, 14))
+    for _ in range(14):
+      for i in range(7):
+        result = self.evaluate(get_next())
+        for component, result_component in zip(components, result):
+          self.assertAllEqual(component[i]**2, result_component)
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
+  # TODO(b/117581999): add eager coverage, different threads run in graph
+  # context.
+  @test_util.run_v1_only("b/120545219")
+  def testSkipEagerMapDatasetMultithreaded(self):
+    # Test multi-threaded access to the same iterator.
+    components = (np.arange(7),
+                  np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
+                  np.array(37.0) * np.arange(7))
+    get_next = self.getNext(self._buildMapDataset(components, 18))
+    results = []
     with self.cached_session() as sess:
-      # Test single-threaded access to the iterator.
-      sess.run(init_op, feed_dict={count: 14})
-      for _ in range(14):
-        for i in range(7):
-          result = sess.run(get_next)
-          for component, result_component in zip(components, result):
-            self.assertAllEqual(component[i]**2, result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Test multi-threaded access to the same iterator.
-      sess.run(init_op, feed_dict={count: 18})
-      results = []
       def iterator_thread():
         while True:
           try:
-            results.append(sess.run(get_next))
+            results.append(sess.run(get_next()))
           except errors.OutOfRangeError:
             return
       threads = [self.checkedThread(target=iterator_thread) for _ in range(8)]
@@ -146,59 +153,66 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   def _buildParallelMapDataset(self, components, count, num_parallel_calls,
                                output_buffer_size):
+
     def _map_fn(x, y, z):
       return math_ops.square(x), math_ops.square(y), math_ops.square(z)
-    return (dataset_ops.Dataset.from_tensor_slices(components)
-            .map(_map_fn, num_parallel_calls=num_parallel_calls)
-            .prefetch(output_buffer_size)
-            .repeat(count))
+
+    dataset = dataset_ops.Dataset.from_tensor_slices(components).map(
+        _map_fn, num_parallel_calls=num_parallel_calls).prefetch(
+            output_buffer_size).repeat(count)
+
+    self.assertEqual([c.shape[1:] for c in components],
+                     [shape for shape in dataset.output_shapes])
+    return dataset
 
   def testParallelMapDataset(self):
     """Test an dataset that maps a TF function across its input elements."""
+
     # The pipeline is TensorSliceDataset -> ParallelMapDataset(square_3) ->
     # RepeatDataset(count).
-    components = (np.arange(7),
-                  np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
-                  np.array(37.0) * np.arange(7))
-    count = array_ops.placeholder(dtypes.int64, shape=[])
-    num_parallel_calls = array_ops.placeholder(dtypes.int32, shape=[])
-    output_buffer_size = array_ops.placeholder(dtypes.int64, shape=[])
+    def do_test(num_parallel_calls, output_buffer_size):
 
-    dataset = self._buildParallelMapDataset(
-        components, count, num_parallel_calls, output_buffer_size)
-    iterator = dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
+      components = (np.arange(7),
+                    np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
+                    np.array(37.0) * np.arange(7))
+      # Test single-threaded access to the iterator.
+      get_next = self.getNext(
+          self._buildParallelMapDataset(components, 14, num_parallel_calls,
+                                        output_buffer_size))
+      for _ in range(14):
+        for i in range(7):
+          result = self.evaluate(get_next())
+          for component, result_component in zip(components, result):
+            self.assertAllEqual(component[i]**2, result_component)
+      with self.assertRaises(errors.OutOfRangeError):
+        self.evaluate(get_next())
 
-    self.assertEqual([c.shape[1:] for c in components],
-                     [t.shape for t in get_next])
+    for num_parallel_calls_val, output_buffer_size_val in [(1, 1), (1, 2), (2,
+                                                                            2),
+                                                           (2, 4), (8, 8),
+                                                           (8, 16)]:
+      do_test(num_parallel_calls_val, output_buffer_size_val)
 
-    with self.cached_session() as sess:
+  # TODO(b/117581999): add eager coverage, different threads run in graph
+  # context.
+  @test_util.run_v1_only("b/120545219")
+  def testSkipEagerParallelMapDatasetMultithreaded(self):
+
+    def do_test(num_parallel_calls, output_buffer_size):
+      # Test multi-threaded access to the same iterator.
+      components = (np.arange(7),
+                    np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
+                    np.array(37.0) * np.arange(7))
+      get_next = self.getNext(
+          self._buildParallelMapDataset(components, 18, num_parallel_calls,
+                                        output_buffer_size))
+      results = []
+      with self.cached_session() as sess:
 
-      def do_test(num_parallel_calls_val, output_buffer_size_val):
-        # Test single-threaded access to the iterator.
-        sess.run(init_op, feed_dict={
-            count: 14,
-            num_parallel_calls: num_parallel_calls_val,
-            output_buffer_size: output_buffer_size_val})
-        for _ in range(14):
-          for i in range(7):
-            result = sess.run(get_next)
-            for component, result_component in zip(components, result):
-              self.assertAllEqual(component[i]**2, result_component)
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-        # Test multi-threaded access to the same iterator.
-        sess.run(init_op, feed_dict={
-            count: 18,
-            num_parallel_calls: num_parallel_calls_val,
-            output_buffer_size: output_buffer_size_val})
-        results = []
         def iterator_thread():
           while True:
             try:
-              results.append(sess.run(get_next))
+              results.append(sess.run(get_next()))
             except errors.OutOfRangeError:
               return
         threads = [self.checkedThread(target=iterator_thread)
@@ -235,14 +249,10 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     dataset = self._buildParallelMapDataset(components, 1000, 100, 100)
     # NOTE(mrry): Also test that the prefetching thread is cancelled correctly.
     dataset = dataset.prefetch(100)
-    iterator = dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
+    get_next = self.getNext(dataset)
 
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for _ in range(3):
-        sess.run(get_next)
+    for _ in range(3):
+      self.evaluate(get_next())
 
   def testParallelMapUnspecifiedOutputSize(self):
     components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)
@@ -250,14 +260,10 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     dataset = (dataset_ops.Dataset.from_tensor_slices(components)
                .map(lambda x: array_ops.check_numerics(x, "message"),
                     num_parallel_calls=2))
-    iterator = dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
+    get_next = self.getNext(dataset)
 
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for _ in range(3):
-        sess.run(get_next)
+    for _ in range(3):
+      self.evaluate(get_next())
 
   def testParallelMapError(self):
     components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)
@@ -265,20 +271,16 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     dataset = (dataset_ops.Dataset.from_tensor_slices(components)
                .map(lambda x: array_ops.check_numerics(x, "message"),
                     num_parallel_calls=2))
-    iterator = dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
+    get_next = self.getNext(dataset)
 
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for _ in range(3):
-        sess.run(get_next)
-      # The 4th element is NaN, so `array_ops.check_numerics()` should fail.
-      with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(get_next)
-      sess.run(get_next)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    for _ in range(3):
+      self.evaluate(get_next())
+    # The 4th element is NaN, so `array_ops.check_numerics()` should fail.
+    with self.assertRaises(errors.InvalidArgumentError):
+      self.evaluate(get_next())
+    self.evaluate(get_next())
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
   def testPrefetchError(self):
     components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)
@@ -286,20 +288,17 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     dataset = (dataset_ops.Dataset.from_tensor_slices(components)
                .map(lambda x: array_ops.check_numerics(x, "message"))
                .prefetch(2))
-    iterator = dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
 
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for _ in range(3):
-        sess.run(get_next)
-      # The 4th element is NaN, so `array_ops.check_numerics()` should fail.
-      with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(get_next)
-      sess.run(get_next)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    get_next = self.getNext(dataset)
+
+    for _ in range(3):
+      self.evaluate(get_next())
+    # The 4th element is NaN, so `array_ops.check_numerics()` should fail.
+    with self.assertRaises(errors.InvalidArgumentError):
+      self.evaluate(get_next())
+    self.evaluate(get_next())
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
   def testCaptureIterator(self):
 
@@ -312,23 +311,22 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       return dataset_ops.Dataset.range(10).map(_map_fn)
 
     def _build_graph():
-      captured_iterator = dataset_ops.Dataset.range(
-          10).make_initializable_iterator()
+      if context.executing_eagerly():
+        captured_iterator = iter(dataset_ops.Dataset.range(10))
+      else:
+        captured_iterator = dataset_ops.Dataset.range(
+            10).make_initializable_iterator()
       ds = _build_ds(captured_iterator)
-      iterator = ds.make_initializable_iterator()
-      init_op = iterator.initializer
-      get_next = iterator.get_next()
-      return captured_iterator.initializer, init_op, get_next
-
-    with ops.Graph().as_default() as g:
-      captured_init_op, init_op, get_next = _build_graph()
-      with self.session(graph=g) as sess:
-        sess.run(captured_init_op)
-        sess.run(init_op)
-        for i in range(10):
-          self.assertEqual(i * i, sess.run(get_next))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
+      return captured_iterator, ds
+
+    captured_iter, ds = _build_graph()
+    if not context.executing_eagerly():
+      self.evaluate(captured_iter.initializer)
+    get_next = self.getNext(ds, requires_initialization=True)
+    for i in range(10):
+      self.assertEqual(i * i, self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
   def testCaptureHashTable(self):
     # NOTE(mrry): We must use the V2 variants of `HashTable`
@@ -343,41 +341,37 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     input_sentences = dataset_ops.Dataset.from_tensor_slices(
         ["brain brain tank salad surgery", "surgery brain"])
 
-    iterator = (input_sentences
-                .map(lambda x: string_ops.string_split([x]).values)
-                .map(table.lookup)
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
+    dataset = input_sentences.map(lambda x: string_ops.string_split([x]).values
+                                 ).map(table.lookup)
 
-    with self.cached_session() as sess:
-      sess.run(table.initializer)
-      sess.run(init_op)
-      sess.run(get_next)
-      sess.run(get_next)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    get_next = self.getNext(dataset, requires_initialization=True)
+
+    self.evaluate(table.initializer)
+    self.evaluate(get_next())
+    self.evaluate(get_next())
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
   def testCaptureQueue(self):
     elements = np.random.randint(100, size=[200])
     queue = data_flow_ops.FIFOQueue(200, dtypes.int64, shapes=[])
     enqueue_op = queue.enqueue_many(elements)
     close_op = queue.close()
-    iterator = (dataset_ops.Dataset.from_tensors(0).repeat(-1)
-                .map(lambda _: queue.dequeue()).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
+    dataset = dataset_ops.Dataset.from_tensors(0).repeat(
+        -1).map(lambda _: queue.dequeue())
 
-    with self.cached_session() as sess:
-      sess.run(enqueue_op)
-      sess.run(close_op)
-      sess.run(init_op)
-      for element in elements:
-        self.assertEqual(element, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    get_next = self.getNext(dataset, requires_initialization=True)
+    self.evaluate(enqueue_op)
+    self.evaluate(close_op)
 
-  def testCaptureSameResourceMultipleTimes(self):
+    for element in elements:
+      self.assertEqual(element, self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
+  # TODO(b/117581999): Possible deadlock in eager mode, debug.
+  @test_util.run_v1_only("b/120545219")
+  def testSkipEagerCaptureSameResourceMultipleTimes(self):
     elements = np.random.randint(100, size=[200])
     queue = data_flow_ops.FIFOQueue(
         200, dtypes.int64, shapes=[], shared_name="shared_queue")
@@ -387,101 +381,84 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     enqueue_op = queue.enqueue_many(elements)
     close_op = queue.close()
 
-    iterator = (dataset_ops.Dataset.from_tensors(0).repeat(-1)
-                .map(lambda _: (queue.dequeue(), queue_2.dequeue()))
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
+    dataset = dataset_ops.Dataset.from_tensors(0).repeat(
+        -1).map(lambda _: (queue.dequeue(), queue_2.dequeue()))
 
-    with self.cached_session() as sess:
-      sess.run(enqueue_op)
-      sess.run(close_op)
-      sess.run(init_op)
-      for i in range(100):
-        self.assertEqual(sorted([elements[i * 2], elements[i * 2 + 1]]),
-                         sorted(sess.run(get_next)))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    self.evaluate(enqueue_op)
+    self.evaluate(close_op)
+    get_next = self.getNext(dataset, requires_initialization=True)
+    for i in range(100):
+      self.assertCountEqual([elements[i * 2], elements[i * 2 + 1]],
+                            self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
   def testCaptureVariable(self):
     counter_var = variable_scope.get_variable(
         "counter", (), dtypes.int32, use_resource=True)
-    iterator = (dataset_ops.Dataset.from_tensors(0).repeat(10)
-                .map(lambda _: counter_var.assign_add(1))
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(counter_var.initializer)
-      sess.run(init_op)
-      for i in range(10):
-        self.assertEqual(i, sess.run(counter_var))
-        self.assertEqual(i + 1, sess.run(get_next))
-      self.assertEqual(10, sess.run(counter_var))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-      self.assertEqual(10, sess.run(counter_var))
-
-  def testCaptureUninitializedVariableError(self):
+    dataset = dataset_ops.Dataset.from_tensors(0).repeat(
+        10).map(lambda _: counter_var.assign_add(1))
+    get_next = self.getNext(dataset, requires_initialization=True)
+
+    self.evaluate(counter_var.initializer)
+
+    for i in range(10):
+      self.assertEqual(i, self.evaluate(counter_var))
+      self.assertEqual(i + 1, self.evaluate(get_next()))
+    self.assertEqual(10, self.evaluate(counter_var))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+    self.assertEqual(10, self.evaluate(counter_var))
+
+  # TODO(b/117581999): error not captured for eager mode, debug.
+  @test_util.run_v1_only("b/120545219")
+  def testSkipEagerCaptureUninitializedVariableError(self):
     counter_var = variable_scope.get_variable(
         "counter", (), dtypes.int32, use_resource=True)
-    iterator = (dataset_ops.Dataset.from_tensors(0).repeat(10)
-                .map(lambda _: counter_var.assign_add(1))
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
+    dataset = dataset_ops.Dataset.from_tensors(0).repeat(
+        10).map(lambda _: counter_var.assign_add(1))
 
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      with self.assertRaises(errors.NotFoundError):
-        sess.run(get_next)
-
-  def testSeededStatefulOperatorIsProperlyStateful(self):
-    iterator = (dataset_ops.Dataset.from_tensors(0).repeat(10)
-                .map(lambda _: random_ops.random_uniform((), seed=11)).batch(2)
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
+    get_next = self.getNext(dataset, requires_initialization=True)
 
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      random_values = []
-      with self.assertRaises(errors.OutOfRangeError):
-        while True:
-          random_values.extend(sess.run(get_next))
-      self.assertEqual(10, len(random_values))
-      self.assertGreater(np.abs(np.diff(random_values)).max(), 1e-6)
-      sess.run(init_op)
-      random_values_2 = []
-      with self.assertRaises(errors.OutOfRangeError):
-        while True:
-          random_values_2.extend(sess.run(get_next))
+    with self.assertRaises(errors.NotFoundError):
+      self.evaluate(get_next())
 
-      # Randomness is repeatable given same seed
-      self.assertAllClose(random_values, random_values_2)
+  def testSeededStatefulOperatorIsProperlyStateful(self):
+    dataset = dataset_ops.Dataset.from_tensors(0).repeat(
+        10).map(lambda _: random_ops.random_uniform((), seed=11)).batch(2)
+
+    get_next = self.getNext(dataset, requires_initialization=True)
+    random_values = []
+    with self.assertRaises(errors.OutOfRangeError):
+      while True:
+        random_values.extend(self.evaluate(get_next()))
+    self.assertLen(random_values, 10)
+    self.assertGreater(np.abs(np.diff(random_values)).max(), 1e-6)
+
+    get_next = self.getNext(dataset, requires_initialization=True)
+    random_values_2 = []
+    with self.assertRaises(errors.OutOfRangeError):
+      while True:
+        random_values_2.extend(self.evaluate(get_next()))
+
+    # Randomness is repeatable given same seed
+    self.assertAllClose(random_values, random_values_2)
 
   def testStatefulMapKeepsStateAcrossIterators(self):
-    iterator = (dataset_ops.Dataset.from_tensors(0).repeat(10)
-                .map(lambda _: random_ops.random_uniform((), seed=11))
-                .repeat(1000)
-                .batch(10)
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
+    dataset = dataset_ops.Dataset.from_tensors(0).repeat(10).map(
+        lambda _: random_ops.random_uniform((), seed=11)).repeat(1000).batch(10)
 
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      random_values = sess.run(get_next)
-
-      # Assert that one of the next 99 batches yielded by the iterator is
-      # different from the first.
-      i = 0
-      while i < 99:
-        if np.any(random_values != sess.run(get_next)):
-          break
-        i += 1
-      self.assertLess(i, 99)
+    get_next = self.getNext(dataset)
+    random_values = self.evaluate(get_next())
+
+    # Assert that one of the next 99 batches yielded by the iterator is
+    # different from the first.
+    i = 0
+    while i < 99:
+      if np.any(random_values != self.evaluate(get_next())):
+        break
+      i += 1
+    self.assertLess(i, 99)
 
   def testStatefulOperationInShortCircuit(self):
     counter_var = variable_scope.get_variable(
@@ -491,37 +468,25 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       counter_var.assign_add(1)
       return x
 
-    iterator = (dataset_ops.Dataset.range(10)
-                .map(increment_fn)
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
+    dataset = dataset_ops.Dataset.range(10).map(increment_fn)
 
-    with self.cached_session() as sess:
-      sess.run(counter_var.initializer)
-      sess.run(init_op)
-      for i in range(10):
-        self.assertEqual(i, sess.run(counter_var))
-        self.assertEqual(i, sess.run(get_next))
-      self.assertEqual(10, sess.run(counter_var))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-      self.assertEqual(10, sess.run(counter_var))
+    get_next = self.getNext(dataset, requires_initialization=True)
 
-  def testMapDict(self):
-    iterator = (dataset_ops.Dataset.range(10)
-                .map(lambda x: {"foo": x * 2, "bar": x ** 2})
-                .map(lambda d: d["foo"] + d["bar"])
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
+    self.evaluate(counter_var.initializer)
+    for i in range(10):
+      self.assertEqual(i, self.evaluate(counter_var))
+      self.assertEqual(i, self.evaluate(get_next()))
+    self.assertEqual(10, self.evaluate(counter_var))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+    self.assertEqual(10, self.evaluate(counter_var))
 
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for i in range(10):
-        self.assertEqual(i * 2 + i**2, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+  def testMapDict(self):
+    dataset = dataset_ops.Dataset.range(10).map(
+        lambda x: {"foo": x * 2, "bar": x**2}).map(
+            lambda d: d["foo"] + d["bar"])
+    self.assertDatasetProduces(
+        dataset, expected_output=[i * 2 + i**2 for i in range(10)])
 
   def testMapNamedtuple(self, count=10):
     # construct dataset of tuples
@@ -544,33 +509,23 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     dataset_tuple = dataset_tuple.map(preprocess_tuple)
     dataset_namedtuple = dataset_namedtuple.map(preprocess_namedtuple)
 
-    next_tuple = dataset_tuple.make_one_shot_iterator().get_next()
-    next_namedtuple = dataset_namedtuple.make_one_shot_iterator().get_next()
+    next_tuple = self.getNext(dataset_tuple)
+    next_namedtuple = self.getNext(dataset_namedtuple)
 
     # make sure both datasets contain the same data
-    with self.cached_session() as sess:
-      for i in range(count):
-        tuple_, namedtuple_ = sess.run([next_tuple, next_namedtuple])
-        self.assertEqual(tuple_, namedtuple_)
-        self.assertEqual(tuple_, (i, -2 * i))
+    for i in range(count):
+      tuple_, namedtuple_ = self.evaluate([next_tuple(), next_namedtuple()])
+      self.assertEqual(tuple_, namedtuple_)
+      self.assertEqual(tuple_, (i, -2 * i))
 
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_namedtuple)
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_namedtuple())
 
   def testUseStepContainerInMap(self):
     row = np.arange(6)
-    iterator = (
-        dataset_ops.Dataset.from_tensors(row)
-        .map(lambda elems: functional_ops.map_fn(lambda x: x * x, elems))
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      self.assertAllEqual(row**2, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    dataset = dataset_ops.Dataset.from_tensors(
+        row).map(lambda elems: functional_ops.map_fn(lambda x: x * x, elems))
+    self.assertDatasetProduces(dataset, expected_output=[row**2])
 
   def testCaseAndCondInMap(self):
 
@@ -598,24 +553,19 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
           pred_fn_pairs, default=multiply, exclusive=True)
 
     def build_dataset(row, num):
-      iterator = (
-          dataset_ops.Dataset.from_tensor_slices(row).map(
-              lambda x: control_map_fn(x, num)).make_initializable_iterator())
-      init_op = iterator.initializer
-      get_next = iterator.get_next()
-      return init_op, get_next
+      dataset = dataset_ops.Dataset.from_tensor_slices(
+          row).map(lambda x: control_map_fn(x, num))
+      return self.getNext(dataset)
 
-    with self.cached_session() as sess:
-      row = np.arange(6)
-      for num in [2, 3, 4]:
-        init_op, get_next = build_dataset(row, num)
-        sess.run(init_op)
-        for i in range(6):
-          self.assertEqual(
-              (i // 2 if i % 2 else i * 2) if (num == 2 or num == 3) else i * 2,
-              sess.run(get_next))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
+    row = np.arange(6)
+    for num in [2, 3, 4]:
+      get_next = build_dataset(row, num)
+      for i in range(6):
+        self.assertEqual(
+            (i // 2 if i % 2 else i * 2) if (num == 2 or num == 3) else i * 2,
+            self.evaluate(get_next()))
+      with self.assertRaises(errors.OutOfRangeError):
+        self.evaluate(get_next())
 
   def testCaseInWhileInMap(self):
 
@@ -637,25 +587,19 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
 
     def build_dataset(row, num):
       # pylint: disable=g-long-lambda
-      iterator = (
-          dataset_ops.Dataset.from_tensors(row).map(
-              lambda elems: functional_ops.map_fn(lambda x:
-                                                  control_map_fn(x, num), elems)
-              ).make_initializable_iterator())
-      init_op = iterator.initializer
-      get_next = iterator.get_next()
-      return init_op, get_next
+      dataset = dataset_ops.Dataset.from_tensors(
+          row).map(lambda elems: functional_ops.map_fn(
+              lambda x: control_map_fn(x, num), elems))
+      return self.getNext(dataset)
 
-    with self.cached_session() as sess:
-      row = np.arange(6)
-      for num in [2, 3, 4]:
-        init_op, get_next = build_dataset(row, num)
-        sess.run(init_op)
-        self.assertAllEqual(
-            [x // 2 if (num == 2 or num == 3) else x * 2 for x in row],
-            sess.run(get_next))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
+    row = np.arange(6)
+    for num in [2, 3, 4]:
+      get_next = build_dataset(row, num)
+      self.assertAllEqual(
+          [x // 2 if (num == 2 or num == 3) else x * 2 for x in row],
+          self.evaluate(get_next()))
+      with self.assertRaises(errors.OutOfRangeError):
+        self.evaluate(get_next())
 
   def testCaseAndCondInWhileInMap(self):
 
@@ -685,22 +629,17 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     row = np.arange(6)
     num = 2
     # pylint: disable=g-long-lambda
-    iterator = (
-        dataset_ops.Dataset.from_tensors(row).map(
-            lambda elems: functional_ops.map_fn(lambda x:
-                                                control_map_fn(x, num), elems)
-            ).make_initializable_iterator())
+    dataset = dataset_ops.Dataset.from_tensors(
+        row).map(lambda elems: functional_ops.map_fn(
+            lambda x: control_map_fn(x, num), elems))
     # pylint: enable=g-long-lambda
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
+    get_next = self.getNext(dataset)
 
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      self.assertAllEqual([(x // 2 if x % 2 else x * 2) if
-                           (num == 2 or num == 3) else x * 2 for x in row],
-                          sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    self.assertAllEqual([(x // 2 if x % 2 else x * 2) if
+                         (num == 2 or num == 3) else x * 2 for x in row],
+                        self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
   def testPrefetch(self):
     # We will use this event to test that `_map_py_func()` has been
@@ -718,59 +657,54 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     def _map_fn(x):
       return script_ops.py_func(_map_py_func, [x], x.dtype)
 
-    buffer_size_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = (
-        dataset_ops.Dataset.range(100)
-        .map(_map_fn)
-        .prefetch(buffer_size_placeholder)
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
+    def do_test(buffer_size):
+      dataset = dataset_ops.Dataset.range(100).map(_map_fn).prefetch(
+          buffer_size)
 
-    with self.cached_session() as sess:
+      get_next = self.getNext(dataset)
       # Simple test that prefetch yields the expected values in the
       # expected order.
-      for buffer_size in [1, 10, 100, 1000]:
-        sess.run(init_op, feed_dict={buffer_size_placeholder: buffer_size})
-        for i in range(100):
-          self.assertEqual(i * i, sess.run(get_next))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-      # We can indirectly observe that varying the buffer size has the
-      # intended effect by observing when `ev` is set (on the 6th
-      # invocation of `_map_py_func()`).
-      # NOTE(mrry): We do not test with `buffer_size ==
-      # set_event_during_invocation`, because we must consume at least
-      # one element to start the prefetching.
-      for buffer_size in range(1, set_event_during_invocation):
-        event_will_be_set_after_consuming = (
-            set_event_during_invocation - buffer_size + 1)
-
-        ev.clear()
-        sess.run(init_op, feed_dict={buffer_size_placeholder: buffer_size})
-        for i in range(event_will_be_set_after_consuming):
-          self.assertFalse(ev.is_set())
-          self.assertEqual(i * i, sess.run(get_next))
-        ev.wait()
-        for i in range(event_will_be_set_after_consuming, 100):
-          self.assertEqual(i * i, sess.run(get_next))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
+      for i in range(100):
+        self.assertEqual(i * i, self.evaluate(get_next()))
+      with self.assertRaises(errors.OutOfRangeError):
+        self.evaluate(get_next())
+
+    for buffer_size in [1, 10, 100, 1000]:
+      do_test(buffer_size)
+
+    # We can indirectly observe that varying the buffer size has the
+    # intended effect by observing when `ev` is set (on the 6th
+    # invocation of `_map_py_func()`).
+    # NOTE(mrry): We do not test with `buffer_size ==
+    # set_event_during_invocation`, because we must consume at least
+    # one element to start the prefetching.
+    def do_test_ev(buffer_size):
+      dataset = dataset_ops.Dataset.range(100).map(_map_fn).prefetch(
+          buffer_size)
+
+      get_next = self.getNext(dataset)
+
+      event_will_be_set_after_consuming = (
+          set_event_during_invocation - buffer_size + 1)
+
+      ev.clear()
+      for i in range(event_will_be_set_after_consuming):
+        self.assertFalse(ev.is_set())
+        self.assertEqual(i * i, self.evaluate(get_next()))
+      ev.wait()
+      for i in range(event_will_be_set_after_consuming, 100):
+        self.assertEqual(i * i, self.evaluate(get_next()))
+      with self.assertRaises(errors.OutOfRangeError):
+        self.evaluate(get_next())
 
-  def testReturnList(self):
-    iterator = (dataset_ops.Dataset.range(10)
-                .map(lambda x: [x, constant_op.constant(37.0)])
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
+    for buffer_size in range(1, set_event_during_invocation):
+      do_test_ev(buffer_size)
 
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for i in range(10):
-        self.assertEqual((i, 37.0), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+  def testReturnList(self):
+    dataset = dataset_ops.Dataset.range(
+        10).map(lambda x: [x, constant_op.constant(37.0)])
+    self.assertDatasetProduces(
+        dataset, expected_output=[(i, 37.0) for i in range(10)])
 
   def testMultiOutputPyFunc(self):
     # The `tf.py_func()` op returns a list of tensors for its outputs.
@@ -780,18 +714,9 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       return script_ops.py_func(
           _map_py_func, [x_tensor], [dtypes.int64, dtypes.float64])
 
-    iterator = (dataset_ops.Dataset.range(10)
-                .map(_map_fn)
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for i in range(10):
-        self.assertEqual((i, 37.0), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    dataset = dataset_ops.Dataset.range(10).map(_map_fn)
+    self.assertDatasetProduces(
+        dataset, expected_output=[(i, 37.0) for i in range(10)])
 
   def testSparse(self):
 
@@ -801,20 +726,9 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
           values=(i * np.array([1])),
           dense_shape=np.array([1, 1]))
 
-    iterator = (dataset_ops.Dataset.range(10)
-                .map(_sparse)
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for i in range(10):
-        actual = sess.run(get_next)
-        self.assertIsInstance(actual, sparse_tensor.SparseTensorValue)
-        self.assertSparseValuesEqual(actual, _sparse(i))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    dataset = dataset_ops.Dataset.range(10).map(_sparse)
+    self.assertDatasetProduces(
+        dataset, expected_output=[_sparse(i) for i in range(10)])
 
   def testSparseChain(self):
 
@@ -828,20 +742,11 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       self.assertTrue(sparse_tensor.is_sparse(i))
       return sparse_ops.sparse_concat(0, [i, i])
 
-    iterator = (
-        dataset_ops.Dataset.range(10).map(_sparse).map(_check)
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
+    dataset = dataset_ops.Dataset.range(10).map(_sparse).map(_check)
 
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for i in range(10):
-        actual = sess.run(get_next)
-        self.assertIsInstance(actual, sparse_tensor.SparseTensorValue)
-        self.assertSparseValuesEqual(actual, _check(_sparse(i)).eval())
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    self.assertDatasetProduces(
+        dataset,
+        expected_output=[self.evaluate(_check(_sparse(i))) for i in range(10)])
 
   def testParallelMapOutOfRangeError(self):
     def raising_py_func(i):
@@ -850,34 +755,18 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       else:
         return i
 
-    iterator = (
-        dataset_ops.Dataset.range(105)
-        .map(lambda x: script_ops.py_func(raising_py_func, [x], dtypes.int64),
-             num_parallel_calls=2)
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for i in range(100):
-        self.assertEqual(i, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    dataset = dataset_ops.Dataset.range(105).map(
+        lambda x: script_ops.py_func(raising_py_func, [x], dtypes.int64),
+        num_parallel_calls=2)
+    get_next = self.getNext(dataset)
+    for i in range(100):
+      self.assertEqual(i, self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
   def testConstantOutput(self):
-    iterator = (
-        dataset_ops.Dataset.range(10).map(lambda x: [x, "hello", 10])
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for i in range(10):
-        self.assertEqual((i, b"hello", 10), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    dataset = dataset_ops.Dataset.range(10).map(lambda x: [x, "hello", 10])
+    self.assertDatasetProduces(dataset, [(i, b"hello", 10) for i in range(10)])
 
   def testWarnOnLookupTable(self):
     def collecting_function(x):
@@ -906,7 +795,7 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
         dataset_ops.Dataset.from_tensor_slices).map(
             lambda ds: ds.batch(3)).flat_map(lambda x: x)
 
-    self.assertDatasetProduces(dataset, [[1.0, 2.0, 3.0]])
+    self.assertDatasetProduces(dataset, expected_output=[[1.0, 2.0, 3.0]])
 
   def testReturnValueError(self):
     dataset = dataset_ops.Dataset.from_tensors([1.0, 2.0, 3.0])
@@ -939,11 +828,8 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       return const_tensor
 
     dataset = dataset.map(broken_function)
-    iterator = dataset.make_initializable_iterator()
-
-    with self.cached_session() as sess:
-      with self.assertRaisesRegexp(errors.InvalidArgumentError, "BrokenConst"):
-        sess.run(iterator.initializer)
+    self.assertDatasetProduces(
+        dataset, expected_error=(errors.InvalidArgumentError, "BrokenConst"))
 
 # pylint: disable=g-long-lambda
   @parameterized.named_parameters(
@@ -966,12 +852,10 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       return tids
 
     dataset = make_dataset_fn(dataset, _map_fn)
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
+    get_next = self.getNext(dataset)
 
-    with self.cached_session() as sess:
-      tids = sess.run(get_next)
-      self.assertTrue(all(tids[0] == tid for tid in tids))
+    tids = self.evaluate(get_next())
+    self.assertTrue(all(tids[0] == tid for tid in tids))
 # pylint: enable=g-long-lambda
 
   @parameterized.named_parameters(
@@ -987,30 +871,28 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
   def testShortCircuit(self, structure, map_fn, num_parallel_calls):
     dataset = self.structuredDataset(structure).repeat().map(
         map_fn, num_parallel_calls=num_parallel_calls)
-    get_next = dataset.make_one_shot_iterator().get_next()
+    get_next = self.getNext(dataset)
 
-    with self.cached_session() as sess:
-      if isinstance(structure, tuple):
-        expected = map_fn(*sess.run(self.structuredElement(structure)))
-      else:
-        expected = map_fn(sess.run(self.structuredElement(structure)))
-      self.assertEqual(expected, sess.run(get_next))
+    if isinstance(structure, tuple):
+      expected = map_fn(*self.evaluate(self.structuredElement(structure)))
+    else:
+      expected = map_fn(self.evaluate(self.structuredElement(structure)))
+    self.assertEqual(expected, self.evaluate(get_next()))
 
   @parameterized.named_parameters(
       ("Sequential", None),
       ("Parallel", 10),
   )
   def testShortCircuitCapturedInput(self, num_parallel_calls):
-    captured_t = array_ops.placeholder(dtypes.int64, shape=[])
+    captured_t = variables.Variable(42)
     dataset = self.structuredDataset(None).repeat().map(
         lambda x: captured_t, num_parallel_calls=num_parallel_calls)
-    iterator = dataset.make_initializable_iterator()
-    get_next = iterator.get_next()
+    self.evaluate(variables.global_variables_initializer())
+    get_next = self.getNext(dataset, requires_initialization=True)
 
-    with self.cached_session() as sess:
-      sess.run(iterator.initializer, feed_dict={captured_t: 42})
-      self.assertEqual(42, sess.run(get_next))
+    self.assertEqual(42, self.evaluate(get_next()))
 
+  # TODO(b/117581999): Add eager coverage.
   @parameterized.named_parameters(
       ("1", 1, 1),
       ("2", 10, 1),
@@ -1019,7 +901,9 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       ("5", 100, 10),
       ("6", 100, 100),
   )
-  def testSloppyInterleaveInOrder(self, num_elements, num_parallel_calls):
+  @test_util.run_v1_only("b/120545219")
+  def testSkipEagerSloppyInterleaveInOrder(self, num_elements,
+                                           num_parallel_calls):
     get_next, coordination_events = _make_coordinated_sloppy_dataset(
         num_elements, num_parallel_calls)
     config = config_pb2.ConfigProto(
@@ -1032,12 +916,15 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
+  # TODO(b/117581999): Add eager coverage.
   @parameterized.named_parameters(
       ("1", 10, 10),
       ("2", 100, 10),
       ("3", 100, 100),
   )
-  def testSloppyInterleaveOutOfOrder(self, num_elements, num_parallel_calls):
+  @test_util.run_v1_only("b/120545219")
+  def testSkipEagerSloppyInterleaveOutOfOrder(self, num_elements,
+                                              num_parallel_calls):
     get_next, coordination_events = _make_coordinated_sloppy_dataset(
         num_elements, num_parallel_calls)
     config = config_pb2.ConfigProto(
@@ -1054,6 +941,22 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
+  @parameterized.named_parameters(
+      ("Map", None),
+      ("ParallelMap", 12),
+  )
+  def testPreserveCardinality(self, num_parallel_calls):
+
+    def py_fn(_):
+      raise StopIteration()
+
+    dataset = dataset_ops.DatasetV2.from_tensors(0).map(
+        lambda x: script_ops.py_func(py_fn, [x], dtypes.int64),
+        num_parallel_calls=num_parallel_calls)
+    get_next = self.getNext(dataset)
+    with self.assertRaises(errors.InvalidArgumentError):
+      self.evaluate(get_next())
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/kernel_tests/multi_device_iterator_test.py b/tensorflow/python/data/kernel_tests/multi_device_iterator_test.py
index 622ebb55dec635c9b28787820ad789abb51a6fdf..433ea620e1698d22a63716b18a6d5dadf1f06dff 100644
--- a/tensorflow/python/data/kernel_tests/multi_device_iterator_test.py
+++ b/tensorflow/python/data/kernel_tests/multi_device_iterator_test.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.data.experimental.ops import optimization
-from tensorflow.python.data.experimental.ops.optimization_options import OptimizationOptions
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import multi_device_iterator_ops
@@ -35,6 +34,7 @@ from tensorflow.python.platform import test
 # TODO(b/117581999): Add eager coverage.
 class MultiDeviceIteratorTest(test_base.DatasetTestBase):
 
+  @test_util.run_v1_only("b/120545219")
   def testNoGetNext(self):
     dataset = dataset_ops.Dataset.range(10)
     multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
@@ -44,6 +44,7 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
     with self.test_session(config=config) as sess:
       self.evaluate(multi_device_iterator.initializer)
 
+  @test_util.run_v1_only("b/120545219")
   def testBasic(self):
     dataset = dataset_ops.Dataset.range(10)
     multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
@@ -60,6 +61,7 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
         self.evaluate(elem_on_1)
         self.evaluate(elem_on_2)
 
+  @test_util.run_v1_only("b/120545219")
   def testOneOnSameDevice(self):
     with ops.device("/cpu:0"):
       dataset = dataset_ops.Dataset.range(10)
@@ -77,6 +79,7 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
         self.evaluate(elem_on_1)
         self.evaluate(elem_on_2)
 
+  @test_util.run_v1_only("b/120545219")
   def testRepeatDevices(self):
     with ops.device("/cpu:0"):
       dataset = dataset_ops.Dataset.range(20)
@@ -99,6 +102,7 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
         self.evaluate(elem_on_3)
         self.evaluate(elem_on_4)
 
+  @test_util.run_v1_only("b/120545219")
   def testNotFullyDivisible(self):
     dataset = dataset_ops.Dataset.range(9)
     multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
@@ -116,6 +120,7 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
         self.evaluate(elem_on_1)
         self.evaluate(elem_on_2)
 
+  @test_util.run_v1_only("b/120545219")
   def testGetNextAsOptional(self):
     dataset = dataset_ops.Dataset.range(9)
     multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
@@ -149,6 +154,7 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
       with self.assertRaises(errors.InvalidArgumentError):
         self.evaluate(elem_on_2_t)
 
+  @test_util.run_v1_only("b/120545219")
   def testUneven(self):
     dataset = dataset_ops.Dataset.range(10)
     multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
@@ -166,6 +172,7 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
         self.evaluate(elem_on_1)
         self.evaluate(elem_on_2)
 
+  @test_util.run_v1_only("b/120545219")
   def testMultipleInitializations(self):
     with ops.device("/cpu:0"):
       epoch = array_ops.placeholder(dtypes.int64, shape=[])
@@ -259,6 +266,7 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
       with self.assertRaises(errors.InvalidArgumentError):
         self.evaluate(elem_on_2_t)
 
+  @test_util.run_v1_only("b/120545219")
   def testOptimization(self):
     dataset = dataset_ops.Dataset.range(10)
     dataset = dataset.apply(optimization.assert_next(["MemoryCacheImpl"]))
@@ -266,7 +274,6 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
     dataset = dataset.cache()
 
     options = dataset_ops.Options()
-    options.experimental_optimization = OptimizationOptions()
     options.experimental_optimization.noop_elimination = True
     dataset = dataset.with_options(options)
 
diff --git a/tensorflow/python/data/kernel_tests/optional_test.py b/tensorflow/python/data/kernel_tests/optional_test.py
index c2c62e9423e6e082fd6fc42668e2827cc06246e1..ba5ee9b6613a1a82000ed41f90f595b3975bedb8 100644
--- a/tensorflow/python/data/kernel_tests/optional_test.py
+++ b/tensorflow/python/data/kernel_tests/optional_test.py
@@ -25,6 +25,7 @@ from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.data.ops import optional_ops
 from tensorflow.python.data.util import structure
+from tensorflow.python.eager import def_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -361,6 +362,25 @@ class OptionalTest(test_base.DatasetTestBase, parameterized.TestCase):
         with self.assertRaises(errors.InvalidArgumentError):
           sess.run(elem_value_t)
 
+  def testFunctionBoundaries(self):
+    @def_function.function
+    def get_optional():
+      x = constant_op.constant(1.0)
+      opt = optional_ops.Optional.from_value(x)
+      # TODO(skyewm): support returning Optionals from functions?
+      return opt._variant_tensor
+
+    # TODO(skyewm): support Optional arguments?
+    @def_function.function
+    def consume_optional(opt_tensor):
+      value_structure = structure.TensorStructure(dtypes.float32, [])
+      opt = optional_ops._OptionalImpl(opt_tensor, value_structure)
+      return opt.get_value()
+
+    opt_tensor = get_optional()
+    val = consume_optional(opt_tensor)
+    self.assertEqual(self.evaluate(val), 1.0)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/kernel_tests/options_test.py b/tensorflow/python/data/kernel_tests/options_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5bad3e7ae58885a5d013b0dc0f9dec41e0204c8
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/options_test.py
@@ -0,0 +1,96 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.Options`."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.experimental.ops import optimization_options
+from tensorflow.python.data.experimental.ops import stats_options
+from tensorflow.python.data.experimental.ops import threading_options
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.platform import test
+
+
+class OptionsTest(test_base.DatasetTestBase):
+
+  def testOptionsDefault(self):
+    ds = dataset_ops.Dataset.range(0)
+    self.assertEqual(dataset_ops.Options(), ds.options())
+
+  def testOptionsOnce(self):
+    options = dataset_ops.Options()
+    ds = dataset_ops.Dataset.range(0).with_options(options).cache()
+    self.assertEqual(options, ds.options())
+
+  def testOptionsTwiceSame(self):
+    options = dataset_ops.Options()
+    options.experimental_autotune = True
+    ds = dataset_ops.Dataset.range(0).with_options(options).with_options(
+        options)
+    self.assertEqual(options, ds.options())
+
+  def testOptionsTwiceDifferent(self):
+    options1 = dataset_ops.Options()
+    options1.experimental_autotune = True
+    options2 = dataset_ops.Options()
+    options2.experimental_deterministic = False
+    ds = dataset_ops.Dataset.range(0).with_options(options1).with_options(
+        options2)
+    self.assertTrue(ds.options().experimental_autotune)
+    # Explicitly check that flag is False since assertFalse allows None
+    self.assertIs(ds.options().experimental_deterministic, False)
+
+  def testOptionsTwiceDifferentError(self):
+    options1 = dataset_ops.Options()
+    options1.experimental_autotune = True
+    options2 = dataset_ops.Options()
+    options2.experimental_autotune = False
+    with self.assertRaisesRegexp(ValueError,
+                                 "Cannot merge incompatible values"):
+      dataset_ops.Dataset.range(0).with_options(options1).with_options(options2)
+
+  def testOptionsMergeOptionsFromMultipleInputs(self):
+    options1 = dataset_ops.Options()
+    options1.experimental_autotune = True
+    options2 = dataset_ops.Options()
+    options2.experimental_deterministic = True
+    ds = dataset_ops.Dataset.zip(
+        (dataset_ops.Dataset.range(0).with_options(options1),
+         dataset_ops.Dataset.range(0).with_options(options2)))
+    self.assertTrue(ds.options().experimental_autotune)
+    self.assertTrue(ds.options().experimental_deterministic)
+
+  def testOptionsHaveDefaults(self):
+    options1 = dataset_ops.Options()
+    options2 = dataset_ops.Options()
+    self.assertIsNot(options1.experimental_optimization,
+                     options2.experimental_optimization)
+    self.assertIsNot(options1.experimental_stats,
+                     options2.experimental_stats)
+    self.assertIsNot(options1.experimental_threading,
+                     options2.experimental_threading)
+    self.assertEquals(options1.experimental_optimization,
+                      optimization_options.OptimizationOptions())
+    self.assertEquals(options1.experimental_stats,
+                      stats_options.StatsOptions())
+    self.assertEquals(options1.experimental_threading,
+                      threading_options.ThreadingOptions())
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/shuffle_test.py b/tensorflow/python/data/kernel_tests/shuffle_test.py
index 05d5d814c01d2640b8d34f937393df6256cde665..13df870938d1cee7b29e0189b9b1db1731bb4114 100644
--- a/tensorflow/python/data/kernel_tests/shuffle_test.py
+++ b/tensorflow/python/data/kernel_tests/shuffle_test.py
@@ -118,9 +118,8 @@ class ShuffleTest(test_base.DatasetTestBase, parameterized.TestCase):
   @test_util.run_deprecated_v1
   def testSkipEagerSeedZero(self):
     """Test for same behavior when the seed is a Python or Tensor zero."""
-    iterator = (
-        dataset_ops.Dataset.range(10).shuffle(10, seed=0)
-        .make_one_shot_iterator())
+    iterator = dataset_ops.make_one_shot_iterator(
+        dataset_ops.Dataset.range(10).shuffle(10, seed=0))
     get_next = iterator.get_next()
 
     elems = []
@@ -131,9 +130,8 @@ class ShuffleTest(test_base.DatasetTestBase, parameterized.TestCase):
         sess.run(get_next)
 
     seed_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = (
-        dataset_ops.Dataset.range(10).shuffle(10, seed=seed_placeholder)
-        .make_initializable_iterator())
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.range(10).shuffle(10, seed=seed_placeholder))
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
@@ -197,7 +195,7 @@ class ShuffleTest(test_base.DatasetTestBase, parameterized.TestCase):
         dataset = dataset_ops.Dataset.range(10).shuffle(
             10, seed=op_level_seed, reshuffle_each_iteration=reshuffle).repeat(
                 3)
-        iterator = dataset.make_one_shot_iterator()
+        iterator = dataset_ops.make_one_shot_iterator(dataset)
         next_element = iterator.get_next()
 
         run_results = []
@@ -224,9 +222,11 @@ class ShuffleTest(test_base.DatasetTestBase, parameterized.TestCase):
           10, reshuffle_each_iteration=reshuffle).repeat(3)
 
       if initializable:
-        iterators = [dataset.make_initializable_iterator() for _ in range(2)]
+        iterators = [dataset_ops.make_initializable_iterator(dataset)
+                     for _ in range(2)]
       else:
-        iterators = [dataset.make_one_shot_iterator() for _ in range(2)]
+        iterators = [dataset_ops.make_one_shot_iterator(dataset)
+                     for _ in range(2)]
 
       results = []
       with self.session(graph=g) as sess:
diff --git a/tensorflow/python/data/kernel_tests/test_base.py b/tensorflow/python/data/kernel_tests/test_base.py
index 03fc0da1495d962915d830a568cda76d4138fe0c..7aa7f33003cf7195f5ecde406e181b26644c8038 100644
--- a/tensorflow/python/data/kernel_tests/test_base.py
+++ b/tensorflow/python/data/kernel_tests/test_base.py
@@ -58,14 +58,14 @@ class DatasetTestBase(test.TestCase):
       A callable that returns the next element of `dataset`.
     """
     if context.executing_eagerly():
-      iterator = dataset.__iter__()
+      iterator = iter(dataset)
       return iterator._next_internal  # pylint: disable=protected-access
     else:
       if requires_initialization:
-        iterator = dataset.make_initializable_iterator()
+        iterator = dataset_ops.make_initializable_iterator(dataset)
         self.evaluate(iterator.initializer)
       else:
-        iterator = dataset.make_one_shot_iterator()
+        iterator = dataset_ops.make_one_shot_iterator(dataset)
       get_next = iterator.get_next()
       return lambda: get_next
 
@@ -88,6 +88,7 @@ class DatasetTestBase(test.TestCase):
   def assertDatasetProduces(self,
                             dataset,
                             expected_output=None,
+                            expected_shapes=None,
                             expected_error=None,
                             requires_initialization=False,
                             num_test_iterations=1,
@@ -98,6 +99,8 @@ class DatasetTestBase(test.TestCase):
       dataset: A dataset to check for the expected output / error.
       expected_output: A list of elements that the dataset is expected to
         produce.
+      expected_shapes: A list of TensorShapes which is expected to match
+        output_shapes of dataset.
       expected_error: A tuple `(type, predicate)` identifying the expected error
         `dataset` should raise. The `type` should match the expected exception
         type, while `predicate` should either be 1) a unary function that inputs
@@ -126,6 +129,8 @@ class DatasetTestBase(test.TestCase):
             dataset, requires_initialization=requires_initialization)
         self.evaluate(get_next())
       return
+    if expected_shapes:
+      self.assertEqual(expected_shapes, dataset.output_shapes)
     self.assertGreater(num_test_iterations, 0)
     for _ in range(num_test_iterations):
       get_next = self.getNext(
diff --git a/tensorflow/python/data/ops/BUILD b/tensorflow/python/data/ops/BUILD
index 0c5acda180f960d428266fc63df4cbafbdff6d7b..fbff7df9c379e04a2b12a14ed5f5534339cde543 100644
--- a/tensorflow/python/data/ops/BUILD
+++ b/tensorflow/python/data/ops/BUILD
@@ -52,6 +52,7 @@ py_library(
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python/compat",
         "//tensorflow/python/data/util:convert",
+        "//tensorflow/python/data/util:structure",
     ],
 )
 
@@ -86,6 +87,7 @@ py_library(
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:util",
         "//tensorflow/python/data/util:structure",
     ],
 )
diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py
index c57ddab15f2785ac03db5ae97ebdb7bfef2cbf73..904c5b4b64d0288ef42f86d6ea9f6fbdaecbc210 100644
--- a/tensorflow/python/data/ops/dataset_ops.py
+++ b/tensorflow/python/data/ops/dataset_ops.py
@@ -69,7 +69,6 @@ class DatasetV2(object):
 
   A `Dataset` can be used to represent an input pipeline as a
   collection of elements (nested structures of tensors) and a "logical
-
   plan" of transformations that act on those elements.
   """
 
@@ -97,6 +96,37 @@ class DatasetV2(object):
 
     raise NotImplementedError("Dataset._inputs")
 
+  def _has_captured_ref(self):
+    """Whether this dataset uses a function that captures ref variables.
+
+    Returns:
+      A boolean, which if true indicates that the dataset or one of its inputs
+      uses a function that captures ref variables.
+    """
+    if context.executing_eagerly():
+      # RefVariables are not supported in eager mode
+      return False
+
+    def is_tensor_or_parent_ref(tensor):
+      if tensor.dtype._is_ref_dtype:  # pylint: disable=protected-access
+        return True
+      return any([is_tensor_or_parent_ref(x) for x in tensor.op.inputs])
+
+    for fn in self._functions():
+      if any([is_tensor_or_parent_ref(t) for t in fn.function.captured_inputs]):
+        return True
+
+    return any(
+        [input_dataset._has_captured_ref() for input_dataset in self._inputs()])  # pylint: disable=protected-access
+
+  def _functions(self):
+    """Returns a list of functions associated with this dataset.
+
+    Returns:
+      A list of `StructuredFunctionWrapper` objects.
+    """
+    return []
+
   def options(self):
     """Returns the options for this dataset and its inputs.
 
@@ -125,7 +155,16 @@ class DatasetV2(object):
             dataset, t_options.max_intra_op_parallelism)
     static_optimizations = options._static_optimizations()  # pylint: disable=protected-access
     if static_optimizations:
-      dataset = _OptimizeDataset(dataset, static_optimizations)
+      if self._has_captured_ref():
+        warnings.warn(
+            "tf.data static optimizations are not compatible with tf.Variable. "
+            "The following optimizations will be disabled: %s. To enable "
+            "optimizations, use resource variables instead by calling "
+            "`tf.enable_resource_variables()` at the start of the program." %
+            ", ".join(static_optimizations))
+      else:
+        dataset = _OptimizeDataset(dataset, static_optimizations)
+
     if options.experimental_autotune is not False:
       dataset = _ModelDataset(dataset)
     if options.experimental_stats and options.experimental_stats.aggregator:  # pylint: disable=line-too-long
@@ -135,51 +174,6 @@ class DatasetV2(object):
           options.experimental_stats.counter_prefix)
     return dataset
 
-  def make_initializable_iterator(self, shared_name=None):
-    """Creates an `Iterator` for enumerating the elements of this dataset.
-
-    Note: The returned iterator will be in an uninitialized state,
-    and you must run the `iterator.initializer` operation before using it:
-
-    ```python
-    dataset = ...
-    iterator = dataset.make_initializable_iterator()
-    # ...
-    sess.run(iterator.initializer)
-    ```
-
-    Args:
-      shared_name: (Optional.) If non-empty, the returned iterator will be
-        shared under the given name across multiple sessions that share the
-        same devices (e.g. when using a remote server).
-
-    Returns:
-      An `Iterator` over the elements of this dataset.
-
-    Raises:
-      RuntimeError: If eager execution is enabled.
-    """
-    if context.executing_eagerly():
-      raise RuntimeError(
-          "dataset.make_initializable_iterator is not supported when eager "
-          "execution is enabled.")
-    dataset = self._apply_options()
-    if shared_name is None:
-      shared_name = ""
-    if compat.forward_compatible(2018, 8, 3):
-      iterator_resource = gen_dataset_ops.iterator_v2(
-          container="", shared_name=shared_name, **flat_structure(self))
-    else:
-      iterator_resource = gen_dataset_ops.iterator(
-          container="", shared_name=shared_name, **flat_structure(self))
-    with ops.colocate_with(iterator_resource):
-      initializer = gen_dataset_ops.make_iterator(
-          dataset._as_variant_tensor(),  # pylint: disable=protected-access
-          iterator_resource)
-    return iterator_ops.Iterator(iterator_resource, initializer,
-                                 dataset.output_types, dataset.output_shapes,
-                                 dataset.output_classes)
-
   def __iter__(self):
     """Creates an `Iterator` for enumerating the elements of this dataset.
 
@@ -193,62 +187,22 @@ class DatasetV2(object):
       RuntimeError: If eager execution is not enabled.
     """
     if context.executing_eagerly():
-      dataset = self._apply_options()
-      return iterator_ops.EagerIterator(dataset)
+      return iterator_ops.EagerIterator(self)
     else:
       raise RuntimeError("dataset.__iter__() is only supported when eager "
                          "execution is enabled.")
 
-  def make_one_shot_iterator(self):
-    """Creates an `Iterator` for enumerating the elements of this dataset.
-
-    Note: The returned iterator will be initialized automatically.
-    A "one-shot" iterator does not currently support re-initialization.
+  @abc.abstractproperty
+  def _element_structure(self):
+    """The structure of an element of this dataset.
 
     Returns:
-      An `Iterator` over the elements of this dataset.
+      A `Structure` object representing the structure of an element of this
+      dataset.
     """
-    if context.executing_eagerly():
-      dataset = self._apply_options()
-      return iterator_ops.EagerIterator(dataset)
-
-    graph_level_seed, op_level_seed = core_random_seed.get_seed(None)
-
-    # NOTE(mrry): We capture by value here to ensure that `_make_dataset()` is
-    # a 0-argument function.
-    @function.Defun(capture_by_value=True)
-    def _make_dataset():
-      """Factory function for a dataset."""
-      # NOTE(mrry): `Defun` does not capture the graph-level seed from the
-      # enclosing graph, so if a graph-level seed is present we set the local
-      # graph seed based on a combination of the graph- and op-level seeds.
-      if graph_level_seed is not None:
-        assert op_level_seed is not None
-        core_random_seed.set_random_seed(
-            (graph_level_seed + 87654321 * op_level_seed) % (2 ** 63 - 1))
-
-      dataset = self._apply_options()
-      return dataset._as_variant_tensor()  # pylint: disable=protected-access
-
-    try:
-      _make_dataset.add_to_graph(ops.get_default_graph())
-    except ValueError as err:
-      if "Cannot capture a stateful node" in str(err):
-        raise ValueError(
-            "Failed to create a one-shot iterator for a dataset. "
-            "`Dataset.make_one_shot_iterator()` does not support datasets that "
-            "capture stateful objects, such as a `Variable` or `LookupTable`. "
-            "In these cases, use `Dataset.make_initializable_iterator()`. "
-            "(Original error: %s)" % err)
-      else:
-        six.reraise(ValueError, err)
+    raise NotImplementedError("Dataset._element_structure")
 
-    return iterator_ops.Iterator(
-        gen_dataset_ops.one_shot_iterator(
-            dataset_factory=_make_dataset, **flat_structure(self)),
-        None, self.output_types, self.output_shapes, self.output_classes)
-
-  @abc.abstractproperty
+  @property
   def output_classes(self):
     """Returns the class of each component of an element of this dataset.
 
@@ -258,9 +212,9 @@ class DatasetV2(object):
       A nested structure of Python `type` objects corresponding to each
       component of an element of this dataset.
     """
-    raise NotImplementedError("Dataset.output_classes")
+    return self._element_structure._to_legacy_output_classes()  # pylint: disable=protected-access
 
-  @abc.abstractproperty
+  @property
   def output_shapes(self):
     """Returns the shape of each component of an element of this dataset.
 
@@ -268,9 +222,9 @@ class DatasetV2(object):
       A nested structure of `tf.TensorShape` objects corresponding to each
       component of an element of this dataset.
     """
-    raise NotImplementedError("Dataset.output_shapes")
+    return self._element_structure._to_legacy_output_shapes()  # pylint: disable=protected-access
 
-  @abc.abstractproperty
+  @property
   def output_types(self):
     """Returns the type of each component of an element of this dataset.
 
@@ -278,7 +232,7 @@ class DatasetV2(object):
       A nested structure of `tf.DType` objects corresponding to each component
       of an element of this dataset.
     """
-    raise NotImplementedError("Dataset.output_types")
+    return self._element_structure._to_legacy_output_types()  # pylint: disable=protected-access
 
   def __repr__(self):
     output_shapes = nest.map_structure(str, self.output_shapes)
@@ -297,7 +251,8 @@ class DatasetV2(object):
     `tf.constant` operations. For large datasets (> 1 GB), this can waste
     memory and run into byte limits of graph serialization. If `tensors`
     contains one or more large NumPy arrays, consider the alternative described
-    in [this guide](https://tensorflow.org/guide/datasets#consuming_numpy_arrays).
+    in [this
+    guide](https://tensorflow.org/guide/datasets#consuming_numpy_arrays).
 
     Args:
       tensors: A nested structure of tensors.
@@ -316,7 +271,8 @@ class DatasetV2(object):
     `tf.constant` operations. For large datasets (> 1 GB), this can waste
     memory and run into byte limits of graph serialization. If `tensors`
     contains one or more large NumPy arrays, consider the alternative described
-    in [this guide](https://tensorflow.org/guide/datasets#consuming_numpy_arrays).
+    in [this guide](
+    https://tensorflow.org/guide/datasets#consuming_numpy_arrays).
 
     Args:
       tensors: A nested structure of tensors, each having the same size in the
@@ -376,17 +332,19 @@ class DatasetV2(object):
 
     ```python
     import itertools
+    tf.enable_eager_execution()
 
     def gen():
       for i in itertools.count(1):
         yield (i, [1] * i)
 
-    ds = Dataset.from_generator(
+    ds = tf.data.Dataset.from_generator(
         gen, (tf.int64, tf.int64), (tf.TensorShape([]), tf.TensorShape([None])))
-    value = ds.make_one_shot_iterator().get_next()
 
-    sess.run(value)  # (1, array([1]))
-    sess.run(value)  # (2, array([1, 1]))
+    for value in ds.take(2):
+      print value
+    # (1, array([1]))
+    # (2, array([1, 1]))
     ```
 
     NOTE: The current implementation of `Dataset.from_generator()` uses
@@ -989,15 +947,18 @@ class DatasetV2(object):
        `self.output_types`) to another nested structure of tensors.
       num_parallel_calls: (Optional.) A `tf.int32` scalar `tf.Tensor`,
         representing the number elements to process in parallel. If not
-        specified, elements will be processed sequentially.
+        specified, elements will be processed sequentially. If the value
+        `tf.data.experimental.AUTOTUNE` is used, then the number of parallel
+        calls is set dynamically based on available CPU.
 
     Returns:
       Dataset: A `Dataset`.
     """
     if num_parallel_calls is None:
-      return MapDataset(self, map_func)
+      return MapDataset(self, map_func, preserve_cardinality=True)
     else:
-      return ParallelMapDataset(self, map_func, num_parallel_calls)
+      return ParallelMapDataset(
+          self, map_func, num_parallel_calls, preserve_cardinality=True)
 
   def flat_map(self, map_func):
     """Maps `map_func` across this dataset and flattens the result.
@@ -1099,7 +1060,9 @@ class DatasetV2(object):
       num_parallel_calls: (Optional.) If specified, the implementation creates
         a threadpool, which is used to fetch inputs from cycle elements
         asynchronously and in parallel. The default behavior is to fetch inputs
-        from cycle elements synchronously with no parallelism.
+        from cycle elements synchronously with no parallelism. If the value
+        `tf.data.experimental.AUTOTUNE` is used, then the number of parallel
+        calls is set dynamically based on available CPU.
 
     Returns:
       Dataset: A `Dataset`.
@@ -1232,27 +1195,23 @@ class DatasetV2(object):
 
     # Compute initial values for the state classes, shapes and types based on
     # the initial state.
-    state_classes = sparse.get_classes(initial_state)
-    state_shapes = nest.pack_sequence_as(
-        initial_state, [t.get_shape() for t in nest.flatten(initial_state)])
-    state_types = nest.pack_sequence_as(
-        initial_state, [t.dtype for t in nest.flatten(initial_state)])
+    state_structure = structure_lib.Structure.from_value(initial_state)
 
     # Iteratively rerun the reduce function until reaching a fixed point on
-    # `self._state_shapes`.
+    # `state_structure`.
     need_to_rerun = True
     while need_to_rerun:
 
       wrapped_func = StructuredFunctionWrapper(
           reduce_func,
           "reduce()",
-          input_classes=(state_classes, self.output_classes),
-          input_shapes=(state_shapes, self.output_shapes),
-          input_types=(state_types, self.output_types),
+          input_structure=structure_lib.NestedStructure(
+              (state_structure, self._element_structure)),
           add_to_graph=False)
 
       # Extract and validate class information from the returned values.
       output_classes = wrapped_func.output_classes
+      state_classes = state_structure._to_legacy_output_classes()  # pylint: disable=protected-access
       for new_state_class, state_class in zip(
           nest.flatten(output_classes), nest.flatten(state_classes)):
         if not issubclass(new_state_class, state_class):
@@ -1263,6 +1222,7 @@ class DatasetV2(object):
 
       # Extract and validate type information from the returned values.
       output_types = wrapped_func.output_types
+      state_types = state_structure._to_legacy_output_types()  # pylint: disable=protected-access
       for new_state_type, state_type in zip(
           nest.flatten(output_types), nest.flatten(state_types)):
         if new_state_type != state_type:
@@ -1273,6 +1233,7 @@ class DatasetV2(object):
 
       # Extract shape information from the returned values.
       output_shapes = wrapped_func.output_shapes
+      state_shapes = state_structure._to_legacy_output_shapes()  # pylint: disable=protected-access
       flat_state_shapes = nest.flatten(state_shapes)
       flat_new_state_shapes = nest.flatten(output_shapes)
       weakened_state_shapes = [
@@ -1290,27 +1251,26 @@ class DatasetV2(object):
           break
 
       if need_to_rerun:
-        state_shapes = nest.pack_sequence_as(state_shapes,
-                                             weakened_state_shapes)
+        # TODO(b/110122868): Support a "most specific compatible structure"
+        # method for combining structures, to avoid using legacy structures
+        # here.
+        state_structure = structure_lib.convert_legacy_structure(
+            state_types,
+            nest.pack_sequence_as(state_shapes, weakened_state_shapes),
+            state_classes)
 
     reduce_func = wrapped_func.function
     reduce_func.add_to_graph(ops.get_default_graph())
 
-    return sparse.deserialize_sparse_tensors(
-        nest.pack_sequence_as(
-            output_types,
-            gen_dataset_ops.reduce_dataset(
-                self._as_variant_tensor(),  # pylint: disable=protected-access
-                nest.flatten(sparse.serialize_sparse_tensors(initial_state)),
-                reduce_func.captured_inputs,
-                f=reduce_func,
-                output_shapes=nest.flatten(
-                    sparse.as_dense_shapes(output_shapes, output_classes)),
-                output_types=nest.flatten(
-                    sparse.as_dense_types(output_types, output_classes)))),
-        output_types,
-        output_shapes,
-        output_classes)
+    # pylint: disable=protected-access
+    return state_structure._from_compatible_tensor_list(
+        gen_dataset_ops.reduce_dataset(
+            self._as_variant_tensor(),
+            state_structure._to_tensor_list(initial_state),
+            reduce_func.captured_inputs,
+            f=reduce_func,
+            output_shapes=state_structure._flat_shapes,
+            output_types=state_structure._flat_types))
 
   def with_options(self, options):
     """Returns a new `tf.data.Dataset` with the given options set.
@@ -1343,6 +1303,116 @@ class DatasetV1(DatasetV2):
   def __init__(self):
     pass
 
+  @deprecation.deprecated(
+      None, "Use `for ... in dataset:` to iterate over a dataset. If using "
+      "`tf.estimator`, return the `Dataset` object directly from your input "
+      "function. As a last resort, you can use "
+      "`tf.compat.v1.data.make_one_shot_iterator(dataset)`.")
+  def make_one_shot_iterator(self):
+    """Creates an `Iterator` for enumerating the elements of this dataset.
+
+    Note: The returned iterator will be initialized automatically.
+    A "one-shot" iterator does not currently support re-initialization.
+
+    Returns:
+      An `Iterator` over the elements of this dataset.
+    """
+    if context.executing_eagerly():
+      return iterator_ops.EagerIterator(self)
+
+    graph_level_seed, op_level_seed = core_random_seed.get_seed(None)
+
+    # NOTE(mrry): We capture by value here to ensure that `_make_dataset()` is
+    # a 0-argument function.
+    @function.Defun(capture_by_value=True)
+    def _make_dataset():
+      """Factory function for a dataset."""
+      # NOTE(mrry): `Defun` does not capture the graph-level seed from the
+      # enclosing graph, so if a graph-level seed is present we set the local
+      # graph seed based on a combination of the graph- and op-level seeds.
+      if graph_level_seed is not None:
+        assert op_level_seed is not None
+        core_random_seed.set_random_seed(
+            (graph_level_seed + 87654321 * op_level_seed) % (2 ** 63 - 1))
+
+      dataset = self._apply_options()
+      return dataset._as_variant_tensor()  # pylint: disable=protected-access
+
+    try:
+      _make_dataset.add_to_graph(ops.get_default_graph())
+    except ValueError as err:
+      if "Cannot capture a stateful node" in str(err):
+        raise ValueError(
+            "Failed to create a one-shot iterator for a dataset. "
+            "`Dataset.make_one_shot_iterator()` does not support datasets that "
+            "capture stateful objects, such as a `Variable` or `LookupTable`. "
+            "In these cases, use `Dataset.make_initializable_iterator()`. "
+            "(Original error: %s)" % err)
+      else:
+        six.reraise(ValueError, err)
+
+    return iterator_ops.Iterator(
+        gen_dataset_ops.one_shot_iterator(
+            dataset_factory=_make_dataset, **flat_structure(self)),
+        None, self.output_types, self.output_shapes, self.output_classes)
+
+  @deprecation.deprecated(
+      None, "Use `for ... in dataset:` to iterate over a dataset. If using "
+      "`tf.estimator`, return the `Dataset` object directly from your input "
+      "function. As a last resort, you can use "
+      "`tf.compat.v1.data.make_initializable_iterator(dataset)`.")
+  def make_initializable_iterator(self, shared_name=None):
+    """Creates an `Iterator` for enumerating the elements of this dataset.
+
+    Note: The returned iterator will be in an uninitialized state,
+    and you must run the `iterator.initializer` operation before using it:
+
+    ```python
+    dataset = ...
+    iterator = dataset.make_initializable_iterator()
+    # ...
+    sess.run(iterator.initializer)
+    ```
+
+    Args:
+      shared_name: (Optional.) If non-empty, the returned iterator will be
+        shared under the given name across multiple sessions that share the
+        same devices (e.g. when using a remote server).
+
+    Returns:
+      An `Iterator` over the elements of this dataset.
+
+    Raises:
+      RuntimeError: If eager execution is enabled.
+    """
+    if context.executing_eagerly():
+      raise RuntimeError(
+          "dataset.make_initializable_iterator is not supported when eager "
+          "execution is enabled.")
+    dataset = self._apply_options()
+    if shared_name is None:
+      shared_name = ""
+    if compat.forward_compatible(2018, 8, 3):
+      iterator_resource = gen_dataset_ops.iterator_v2(
+          container="", shared_name=shared_name, **flat_structure(self))
+    else:
+      iterator_resource = gen_dataset_ops.iterator(
+          container="", shared_name=shared_name, **flat_structure(self))
+    with ops.colocate_with(iterator_resource):
+      initializer = gen_dataset_ops.make_iterator(
+          dataset._as_variant_tensor(),  # pylint: disable=protected-access
+          iterator_resource)
+    return iterator_ops.Iterator(iterator_resource, initializer,
+                                 dataset.output_types, dataset.output_shapes,
+                                 dataset.output_classes)
+
+  @property
+  def _element_structure(self):
+    # TODO(b/110122868): Remove this override once all `Dataset` instances
+    # implement `element_structure`.
+    return structure_lib.convert_legacy_structure(
+        self.output_types, self.output_shapes, self.output_classes)
+
   @staticmethod
   @functools.wraps(DatasetV2.from_tensors)
   def from_tensors(tensors):
@@ -1487,8 +1557,13 @@ class DatasetV1(DatasetV2):
 
   @functools.wraps(DatasetV2.map)
   def map(self, map_func, num_parallel_calls=None):
-    return DatasetV1Adapter(super(DatasetV1, self).map(
-        map_func, num_parallel_calls))
+    if num_parallel_calls is None:
+      return DatasetV1Adapter(
+          MapDataset(self, map_func, preserve_cardinality=False))
+    else:
+      return DatasetV1Adapter(
+          ParallelMapDataset(
+              self, map_func, num_parallel_calls, preserve_cardinality=False))
 
   @functools.wraps(DatasetV2.flat_map)
   def flat_map(self, map_func):
@@ -1536,6 +1611,9 @@ class DatasetV1Adapter(DatasetV1):
   def _as_variant_tensor(self):
     return self._dataset._as_variant_tensor()  # pylint: disable=protected-access
 
+  def _has_captured_ref(self):
+    return self._dataset._has_captured_ref()  # pylint: disable=protected-access
+
   def _inputs(self):
     return self._dataset._inputs()  # pylint: disable=protected-access
 
@@ -1543,25 +1621,63 @@ class DatasetV1Adapter(DatasetV1):
     return self._dataset.options()
 
   @property
-  def output_classes(self):
-    return self._dataset.output_classes
+  def _element_structure(self):
+    return self._dataset._element_structure  # pylint: disable=protected-access
 
-  @property
-  def output_shapes(self):
-    return self._dataset.output_shapes
+  def __iter__(self):
+    return iter(self._dataset)
 
-  @property
-  def output_types(self):
-    return self._dataset.output_types
 
-  def make_initializable_iterator(self, shared_name=None):
-    return self._dataset.make_initializable_iterator(shared_name)
+@tf_export(v1=["data.make_one_shot_iterator"])
+def make_one_shot_iterator(dataset):
+  """Creates a `tf.data.Iterator` for enumerating the elements of a dataset.
 
-  def __iter__(self):
-    return iter(self._dataset)
+  Note: The returned iterator will be initialized automatically.
+  A "one-shot" iterator does not support re-initialization.
+
+  Args:
+    dataset: A `tf.data.Dataset`.
+
+  Returns:
+    A `tf.data.Iterator` over the elements of this dataset.
+  """
+  try:
+    # Call the defined `make_one_shot_iterator()` if there is one, because some
+    # datasets (e.g. for prefetching) override its behavior.
+    return dataset.make_one_shot_iterator()
+  except AttributeError:
+    return DatasetV1Adapter(dataset).make_one_shot_iterator()
 
-  def make_one_shot_iterator(self):
-    return self._dataset.make_one_shot_iterator()
+
+@tf_export(v1=["data.make_initializable_iterator"])
+def make_initializable_iterator(dataset):
+  """Creates a `tf.data.Iterator` for enumerating the elements of a dataset.
+
+  Note: The returned iterator will be in an uninitialized state,
+  and you must run the `iterator.initializer` operation before using it:
+
+  ```python
+  dataset = ...
+  iterator = dataset.make_initializable_iterator()
+  # ...
+  sess.run(iterator.initializer)
+  ```
+
+  Args:
+    dataset: A `tf.data.Dataset`.
+
+  Returns:
+    A `tf.data.Iterator` over the elements of `dataset`.
+
+  Raises:
+    RuntimeError: If eager execution is enabled.
+  """
+  try:
+    # Call the defined `make_one_shot_iterator()` if there is one, because some
+    # datasets (e.g. for prefetching) override its behavior.
+    return dataset.make_initializable_iterator()
+  except AttributeError:
+    return DatasetV1Adapter(dataset).make_initializable_iterator()
 
 
 @tf_export("data.Options")
@@ -1579,55 +1695,51 @@ class Options(options_lib.OptionsBase):
       ty=bool,
       docstring=
       "Whether to dynamically adjust the values of tunable parameters (e.g. "
-      "degrees of parallelism).")
+      "degrees of parallelism). If None, defaults to True.")
 
   experimental_deterministic = options_lib.create_option(
       name="experimental_deterministic",
       ty=bool,
       docstring=
-      "Whether to dynamically adjust the values of tunable parameters (e.g. "
-      "degrees of parallelism).")
+      "Whether the outputs need to be produced in deterministic order. If None,"
+      " defaults to True.")
 
   experimental_numa_aware = options_lib.create_option(
       name="experimental_numa_aware",
       ty=bool,
-      docstring="Whether to use NUMA-aware operations.")
+      docstring=
+      "Whether to use NUMA-aware operations. If None, defaults to False.")
 
   experimental_optimization = options_lib.create_option(
       name="experimental_optimization",
       ty=optimization_options.OptimizationOptions,
-      docstring="Associates the given optimization options with the dataset.")
+      docstring=
+      "The optimization options associated with the dataset. See "
+      "`tf.data.experimental.OptimizationOptions` for more details.",
+      default_factory=optimization_options.OptimizationOptions)
 
   experimental_stats = options_lib.create_option(
       name="experimental_stats",
       ty=stats_options.StatsOptions,
-      docstring="Associates the given statistics options with the dataset.")
+      docstring=
+      "The statistics options associated with the dataset. See "
+      "`tf.data.experimental.StatsOptions` for more details.",
+      default_factory=stats_options.StatsOptions)
 
   experimental_threading = options_lib.create_option(
       name="experimental_threading",
       ty=threading_options.ThreadingOptions,
-      docstring="Associates the given threading options with the dataset.")
+      docstring=
+      "The threading options associated with the dataset. See "
+      "`tf.data.experimental.ThreadingOptions` for more details.",
+      default_factory=threading_options.ThreadingOptions)
 
   def _static_optimizations(self):
     """Produces the list of enabled static optimizations."""
 
     result = []
-    exp_optimization_options = self.experimental_optimization
-    if exp_optimization_options:
-      optimizations = [
-          "filter_fusion",
-          "hoist_random_uniform",
-          "map_and_batch_fusion",
-          "map_and_filter_fusion",
-          "map_fusion",
-          "map_parallelization",
-          "map_vectorization",
-          "noop_elimination",
-          "shuffle_and_repeat_fusion",
-      ]
-      for optimization in optimizations:
-        if getattr(exp_optimization_options, optimization):
-          result.append(optimization)
+    result.extend(self.experimental_optimization._static_optimizations())  # pylint: disable=protected-access
+
     if self.experimental_numa_aware:
       result.append("make_numa_aware")
     if self.experimental_deterministic is False:
@@ -1678,16 +1790,8 @@ class UnaryUnchangedStructureDataset(UnaryDataset):
   """Represents a unary dataset with the same input and output structure."""
 
   @property
-  def output_classes(self):
-    return self._input_dataset.output_classes  # pylint: disable=protected-access
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes  # pylint: disable=protected-access
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types  # pylint: disable=protected-access
+  def _element_structure(self):
+    return self._input_dataset._element_structure  # pylint: disable=protected-access
 
 
 class TensorDataset(DatasetSource):
@@ -1703,31 +1807,16 @@ class TensorDataset(DatasetSource):
               t, name="component_%d" % i)
           for i, t in enumerate(nest.flatten(tensors))
       ])
-
-    self._tensors = sparse.serialize_sparse_tensors(tensors)
-    self._output_classes = sparse.get_classes(tensors)
-    self._output_shapes = nest.pack_sequence_as(
-        tensors, [t.get_shape() for t in nest.flatten(tensors)])
-    self._output_types = nest.pack_sequence_as(
-        tensors, [t.dtype for t in nest.flatten(tensors)])
+    self._structure = structure_lib.Structure.from_value(tensors)
+    self._tensors = self._structure._to_tensor_list(tensors)  # pylint: disable=protected-access
 
   def _as_variant_tensor(self):
     return gen_dataset_ops.tensor_dataset(
-        nest.flatten(self._tensors),
-        output_shapes=nest.flatten(
-            sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
+        self._tensors, output_shapes=self._structure._flat_shapes)  # pylint: disable=protected-access
 
   @property
-  def output_classes(self):
-    return self._output_classes
-
-  @property
-  def output_shapes(self):
-    return self._output_shapes
-
-  @property
-  def output_types(self):
-    return self._output_types
+  def _element_structure(self):
+    return self._structure
 
 
 class TensorSliceDataset(DatasetSource):
@@ -1743,37 +1832,26 @@ class TensorSliceDataset(DatasetSource):
               t, name="component_%d" % i)
           for i, t in enumerate(nest.flatten(tensors))
       ])
-      flat_tensors = nest.flatten(tensors)
+
+    batched_structure = structure_lib.Structure.from_value(tensors)
+    # pylint: disable=protected-access
+    self._tensors = batched_structure._to_batched_tensor_list(tensors)
+    self._structure = batched_structure._unbatch()
+    # pylint: enable=protected-access
 
     batch_dim = tensor_shape.Dimension(tensor_shape.dimension_value(
-        flat_tensors[0].get_shape()[0]))
-    for t in flat_tensors[1:]:
+        self._tensors[0].get_shape()[0]))
+    for t in self._tensors[1:]:
       batch_dim.assert_is_compatible_with(tensor_shape.Dimension(
           tensor_shape.dimension_value(t.get_shape()[0])))
-    self._tensors = sparse.serialize_many_sparse_tensors(tensors)
-    self._output_classes = sparse.get_classes(tensors)
-    self._output_shapes = nest.pack_sequence_as(
-        tensors, [t.get_shape()[1:] for t in nest.flatten(tensors)])
-    self._output_types = nest.pack_sequence_as(
-        tensors, [t.dtype for t in nest.flatten(tensors)])
 
   def _as_variant_tensor(self):
     return gen_dataset_ops.tensor_slice_dataset(
-        nest.flatten(self._tensors),
-        output_shapes=nest.flatten(
-            sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
-
-  @property
-  def output_classes(self):
-    return self._output_classes
+        self._tensors, output_shapes=self._structure._flat_shapes)  # pylint: disable=protected-access
 
   @property
-  def output_shapes(self):
-    return self._output_shapes
-
-  @property
-  def output_types(self):
-    return self._output_types
+  def _element_structure(self):
+    return self._structure
 
 
 class SparseTensorSliceDataset(DatasetSource):
@@ -1786,28 +1864,22 @@ class SparseTensorSliceDataset(DatasetSource):
       raise TypeError("`sparse_tensor` must be a `tf.SparseTensor` object.")
     self._sparse_tensor = sparse_tensor
 
+    indices_shape = self._sparse_tensor.indices.get_shape()
+    shape_shape = self._sparse_tensor.dense_shape.get_shape()
+    rank = (indices_shape.dims[1] - 1).merge_with(shape_shape.dims[0] - 1)
+    self._structure = structure_lib.NestedStructure(
+        (structure_lib.TensorStructure(dtypes.int64, [None, rank]),
+         structure_lib.TensorStructure(self._sparse_tensor.dtype, [None]),
+         structure_lib.TensorStructure(dtypes.int64, [rank])))
+
   def _as_variant_tensor(self):
     return gen_dataset_ops.sparse_tensor_slice_dataset(
         self._sparse_tensor.indices, self._sparse_tensor.values,
         self._sparse_tensor.dense_shape)
 
   @property
-  def output_classes(self):
-    return (ops.Tensor, ops.Tensor, ops.Tensor)
-
-  @property
-  def output_shapes(self):
-    indices_shape = self._sparse_tensor.indices.get_shape()
-    shape_shape = self._sparse_tensor.dense_shape.get_shape()
-    rank = (indices_shape.dims[1] - 1).merge_with(shape_shape.dims[0] - 1)
-    num_values = tensor_shape.Dimension(None)
-    return (tensor_shape.TensorShape([num_values, rank]),
-            tensor_shape.TensorShape([num_values]),
-            tensor_shape.TensorShape([rank]))
-
-  @property
-  def output_types(self):
-    return (dtypes.int64, self._sparse_tensor.dtype, dtypes.int64)
+  def _element_structure(self):
+    return self._structure
 
 
 class _VariantDataset(DatasetV2):
@@ -1825,18 +1897,11 @@ class _VariantDataset(DatasetV2):
     return []
 
   @property
-  def output_classes(self):
-    return self._structure._to_legacy_output_classes()  # pylint: disable=protected-access
-
-  @property
-  def output_shapes(self):
-    return self._structure._to_legacy_output_shapes()  # pylint: disable=protected-access
-
-  @property
-  def output_types(self):
-    return self._structure._to_legacy_output_types()  # pylint: disable=protected-access
+  def _element_structure(self):
+    return self._structure
 
 
+@tf_export("data.experimental.DatasetStructure")
 class DatasetStructure(structure_lib.Structure):
   """Represents a `Dataset` of structured values."""
 
@@ -1860,6 +1925,9 @@ class DatasetStructure(structure_lib.Structure):
   def _to_tensor_list(self, value):
     return [value._as_variant_tensor()]  # pylint: disable=protected-access
 
+  def _to_batched_tensor_list(self, value):
+    raise NotImplementedError("Unbatching for `tf.data.Dataset` objects.")
+
   def _from_tensor_list(self, flat_value):
     if (len(flat_value) != 1 or flat_value[0].dtype != dtypes.variant or
         not flat_value[0].shape.is_compatible_with(tensor_shape.scalar())):
@@ -1873,11 +1941,7 @@ class DatasetStructure(structure_lib.Structure):
 
   @staticmethod
   def from_value(value):
-    # TODO(b/110122868): We can simplify this when a `Dataset` object has a
-    # `Structure`-valued property.
-    element_structure = structure_lib.Structure._from_legacy_structure(
-        value.output_types, value.output_shapes, value.output_classes)
-    return DatasetStructure(element_structure)
+    return DatasetStructure(value._element_structure)  # pylint: disable=protected-access
 
   def _to_legacy_output_types(self):
     return self
@@ -1888,6 +1952,12 @@ class DatasetStructure(structure_lib.Structure):
   def _to_legacy_output_classes(self):
     return self
 
+  def _batch(self, batch_size):
+    raise NotImplementedError("Batching for `tf.data.Dataset` objects.")
+
+  def _unbatch(self):
+    raise NotImplementedError("Unbatching for `tf.data.Dataset` objects.")
+
 
 # pylint: disable=protected-access
 structure_lib.Structure._register_custom_converter(DatasetV2,
@@ -1906,6 +1976,7 @@ class StructuredFunctionWrapper(object):
                input_classes=None,
                input_shapes=None,
                input_types=None,
+               input_structure=None,
                add_to_graph=True,
                defun_kwargs=None):
     """Creates a new `StructuredFunctionWrapper` for the given function.
@@ -1924,6 +1995,8 @@ class StructuredFunctionWrapper(object):
         arguments.
       input_types: (Optional.) A nested structure of `tf.DType`. If given, this
         argument defines the element types and structure for `func` arguments.
+      input_structure: (Optional.) A `Structure` object. If given, this argument
+        defines the element types and structure for `func` arguments.
       add_to_graph: (Optional.) If `True`, the function will be added to the
         default graph.
       defun_kwargs: (Optional.) A dictionary mapping string argument names to
@@ -1934,24 +2007,28 @@ class StructuredFunctionWrapper(object):
       ValueError: If an invalid combination of `dataset`, `input_classes`,
         `input_shapes`, and `input_types` is passed.
     """
-    if dataset is None:
-      if input_classes is None or input_shapes is None or input_types is None:
-        raise ValueError("Either `dataset`, or all of `input_classes`, "
-                         "`input_shapes`, and `input_types` must be specified.")
-      self._input_shapes = input_shapes
-      self._input_types = input_types
-      self._input_classes = input_classes
+    if input_structure is None:
+      if dataset is None:
+        if input_classes is None or input_shapes is None or input_types is None:
+          raise ValueError("Either `dataset`, `input_structure` or all of "
+                           "`input_classes`, `input_shapes`, and `input_types` "
+                           "must be specified.")
+        self._input_structure = structure_lib.convert_legacy_structure(
+            input_types, input_shapes, input_classes)
+      else:
+        if not (input_classes is None and input_shapes is None and
+                input_types is None):
+          raise ValueError("Either `dataset`, `input_structure` or all of "
+                           "`input_classes`, `input_shapes`, and `input_types` "
+                           "must be specified.")
+        self._input_structure = dataset._element_structure  # pylint: disable=protected-access
     else:
-      if not (input_classes is None and input_shapes is None and
-              input_types is None):
-        raise ValueError("Either `dataset`, or all of `input_classes`, "
-                         "`input_shapes`, and `input_types` must be specified.")
-      self._input_shapes = dataset.output_shapes
-      self._input_types = dataset.output_types
-      self._input_classes = dataset.output_classes
-
-    self._input_structure = structure_lib.Structure._from_legacy_structure(  # pylint: disable=protected-access
-        self._input_types, self._input_shapes, self._input_classes)
+      if not (dataset is None and input_classes is None and input_shapes is None
+              and input_types is None):
+        raise ValueError("Either `dataset`, `input_structure`, or all of "
+                         "`input_classes`, `input_shapes`, and `input_types` "
+                         "must be specified.")
+      self._input_structure = input_structure
 
     self._transformation_name = transformation_name
     readable_transformation_name = transformation_name.replace(
@@ -2045,8 +2122,7 @@ def flat_structure(dataset):
     constructors.
   """
   # pylint: disable=protected-access
-  structure = structure_lib.Structure._from_legacy_structure(
-      dataset.output_types, dataset.output_shapes, dataset.output_classes)
+  structure = dataset._element_structure
   return {
       "output_shapes": structure._flat_shapes,
       "output_types": structure._flat_types,
@@ -2072,70 +2148,39 @@ class _GeneratorDataset(DatasetSource):
         destroyed. The return value is ignored.
     """
     super(_GeneratorDataset, self).__init__()
-    # These members will be initialized by `tf_init_func`.
-    self._state_classes = None
-    self._state_shapes = None
-    self._state_types = None
-
     self._init_args = init_args
 
-    init_args_classes = sparse.get_classes(init_args)
-    init_args_shapes = nest.pack_sequence_as(
-        init_args, [t.get_shape() for t in nest.flatten(init_args)])
-    init_args_types = nest.pack_sequence_as(
-        init_args, [t.dtype for t in nest.flatten(init_args)])
+    self._init_structure = structure_lib.Structure.from_value(init_args)
 
-    wrapped_init_func = StructuredFunctionWrapper(
+    self._init_func = StructuredFunctionWrapper(
         init_func,
         self._transformation_name(),
-        input_classes=init_args_classes,
-        input_shapes=init_args_shapes,
-        input_types=init_args_types)
-    self._state_classes = wrapped_init_func.output_classes
-    self._state_shapes = wrapped_init_func.output_shapes
-    self._state_types = wrapped_init_func.output_types
-    self._init_func = wrapped_init_func.function
-
-    wrapped_next_func = StructuredFunctionWrapper(
+        input_structure=self._init_structure)
+
+    self._next_func = StructuredFunctionWrapper(
         next_func,
         self._transformation_name(),
-        input_classes=self._state_classes,
-        input_shapes=self._state_shapes,
-        input_types=self._state_types)
-    self._output_classes = wrapped_next_func.output_classes
-    self._output_shapes = wrapped_next_func.output_shapes
-    self._output_types = wrapped_next_func.output_types
-    self._next_func = wrapped_next_func.function
-
-    wrapped_finalize_func = StructuredFunctionWrapper(
+        input_structure=self._init_func.output_structure)
+
+    self._finalize_func = StructuredFunctionWrapper(
         finalize_func,
         self._transformation_name(),
-        input_classes=self._state_classes,
-        input_shapes=self._state_shapes,
-        input_types=self._state_types)
-    self._finalize_func = wrapped_finalize_func.function
+        input_structure=self._init_func.output_structure)
 
   def _as_variant_tensor(self):
     return gen_dataset_ops.generator_dataset(
-        nest.flatten(self._init_args) + self._init_func.captured_inputs,
-        self._next_func.captured_inputs,
-        self._finalize_func.captured_inputs,
-        init_func=self._init_func,
-        next_func=self._next_func,
-        finalize_func=self._finalize_func,
+        self._init_structure._to_tensor_list(self._init_args)  # pylint: disable=protected-access
+        + self._init_func.function.captured_inputs,
+        self._next_func.function.captured_inputs,
+        self._finalize_func.function.captured_inputs,
+        init_func=self._init_func.function,
+        next_func=self._next_func.function,
+        finalize_func=self._finalize_func.function,
         **flat_structure(self))
 
   @property
-  def output_classes(self):
-    return self._output_classes
-
-  @property
-  def output_shapes(self):
-    return self._output_shapes
-
-  @property
-  def output_types(self):
-    return self._output_types
+  def _element_structure(self):
+    return self._next_func.output_structure
 
   def _transformation_name(self):
     return "Dataset.from_generator()"
@@ -2158,6 +2203,10 @@ class ZipDataset(DatasetV2):
                      "structure of `Dataset` objects.")
         raise TypeError(message)
     self._datasets = datasets
+    self._structure = structure_lib.NestedStructure(
+        nest.pack_sequence_as(
+            self._datasets,
+            [ds._element_structure for ds in nest.flatten(self._datasets)]))  # pylint: disable=protected-access
 
   def _as_variant_tensor(self):
     # pylint: disable=protected-access
@@ -2170,22 +2219,8 @@ class ZipDataset(DatasetV2):
     return nest.flatten(self._datasets)
 
   @property
-  def output_classes(self):
-    return nest.pack_sequence_as(
-        self._datasets,
-        [ds.output_classes for ds in nest.flatten(self._datasets)])
-
-  @property
-  def output_shapes(self):
-    return nest.pack_sequence_as(
-        self._datasets,
-        [ds.output_shapes for ds in nest.flatten(self._datasets)])
-
-  @property
-  def output_types(self):
-    return nest.pack_sequence_as(
-        self._datasets,
-        [ds.output_types for ds in nest.flatten(self._datasets)])
+  def _element_structure(self):
+    return self._structure
 
 
 class ConcatenateDataset(DatasetV2):
@@ -2197,26 +2232,29 @@ class ConcatenateDataset(DatasetV2):
     self._input_dataset = input_dataset
     self._dataset_to_concatenate = dataset_to_concatenate
 
-    self._output_types = input_dataset.output_types
-    if self._output_types != dataset_to_concatenate.output_types:
+    output_types = input_dataset.output_types
+    if output_types != dataset_to_concatenate.output_types:
       raise TypeError(
           "Two datasets to concatenate have different types %s and %s" %
-          (self._output_types, dataset_to_concatenate.output_types))
+          (output_types, dataset_to_concatenate.output_types))
 
-    self._output_classes = input_dataset.output_classes
-    if self._output_classes != dataset_to_concatenate.output_classes:
+    output_classes = input_dataset.output_classes
+    if output_classes != dataset_to_concatenate.output_classes:
       raise TypeError(
           "Two datasets to concatenate have different classes %s and %s" %
-          (self._output_classes, dataset_to_concatenate.output_classes))
+          (output_classes, dataset_to_concatenate.output_classes))
 
     input_shapes = self._input_dataset.output_shapes
-    self._output_shapes = nest.pack_sequence_as(input_shapes, [
+    output_shapes = nest.pack_sequence_as(input_shapes, [
         ts1.most_specific_compatible_shape(ts2)
         for (ts1, ts2) in zip(
             nest.flatten(input_shapes),
             nest.flatten(self._dataset_to_concatenate.output_shapes))
     ])
 
+    self._structure = structure_lib.convert_legacy_structure(
+        output_types, output_shapes, output_classes)
+
     self._input_datasets = [input_dataset, dataset_to_concatenate]
 
   def _as_variant_tensor(self):
@@ -2231,16 +2269,8 @@ class ConcatenateDataset(DatasetV2):
     return [self._input_dataset, self._dataset_to_concatenate]
 
   @property
-  def output_classes(self):
-    return self._output_classes
-
-  @property
-  def output_shapes(self):
-    return self._output_shapes
-
-  @property
-  def output_types(self):
-    return self._output_types
+  def _element_structure(self):
+    return self._structure
 
 
 class RepeatDataset(UnaryUnchangedStructureDataset):
@@ -2299,16 +2329,8 @@ class RangeDataset(DatasetSource):
         **flat_structure(self))
 
   @property
-  def output_classes(self):
-    return ops.Tensor
-
-  @property
-  def output_shapes(self):
-    return tensor_shape.scalar()
-
-  @property
-  def output_types(self):
-    return dtypes.int64
+  def _element_structure(self):
+    return structure_lib.TensorStructure(dtypes.int64, [])
 
 
 class CacheDataset(UnaryUnchangedStructureDataset):
@@ -2421,37 +2443,26 @@ class BatchDataset(UnaryDataset):
     self._drop_remainder = ops.convert_to_tensor(
         drop_remainder, dtype=dtypes.bool, name="drop_remainder")
 
-  def _as_variant_tensor(self):
-    # TODO(jsimsa): Switch to using v2 only any time after 6/30/2018.
-    if smart_cond.smart_constant_value(self._drop_remainder) is False:
-      return gen_dataset_ops.batch_dataset(
-          self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
-          batch_size=self._batch_size,
-          **flat_structure(self))
+    constant_drop_remainder = tensor_util.constant_value(self._drop_remainder)
+    # pylint: disable=protected-access
+    if constant_drop_remainder:
+      # NOTE(mrry): `constant_drop_remainder` may be `None` (unknown statically)
+      # or `False` (explicitly retaining the remainder).
+      self._structure = input_dataset._element_structure._batch(
+          tensor_util.constant_value(self._batch_size))
     else:
-      return gen_dataset_ops.batch_dataset_v2(
-          self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
-          batch_size=self._batch_size,
-          drop_remainder=self._drop_remainder,
-          **flat_structure(self))
+      self._structure = input_dataset._element_structure._batch(None)
 
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    input_shapes = self._input_dataset.output_shapes
-    return nest.pack_sequence_as(input_shapes, [
-        tensor_shape.vector(
-            tensor_util.constant_value(self._batch_size) if smart_cond.
-            smart_constant_value(self._drop_remainder) else None).concatenate(s)
-        for s in nest.flatten(self._input_dataset.output_shapes)
-    ])
+  def _as_variant_tensor(self):
+    return gen_dataset_ops.batch_dataset_v2(
+        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+        batch_size=self._batch_size,
+        drop_remainder=self._drop_remainder,
+        **flat_structure(self))
 
   @property
-  def output_types(self):
-    return self._input_dataset.output_types
+  def _element_structure(self):
+    return self._structure
 
 
 def _is_padded_shape_compatible_with(padded_shape, input_component_shape):
@@ -2600,22 +2611,34 @@ class PaddedBatchDataset(UnaryDataset):
     self._drop_remainder = ops.convert_to_tensor(
         drop_remainder, dtype=dtypes.bool, name="drop_remainder")
 
+    def _padded_shape_to_batch_shape(s):
+      return tensor_shape.vector(
+          tensor_util.constant_value(self._batch_size) if smart_cond.
+          smart_constant_value(self._drop_remainder) else None).concatenate(
+              tensor_util.constant_value_as_shape(s))
+
+    output_shapes = nest.map_structure(
+        _padded_shape_to_batch_shape, self._padded_shapes)
+    self._structure = structure_lib.convert_legacy_structure(
+        self._input_dataset.output_types, output_shapes,
+        self._input_dataset.output_classes)
+
   def _as_variant_tensor(self):
+    # pylint: disable=protected-access
     # TODO(jsimsa): Switch to using v2 only any time after 6/30/2018.
     if smart_cond.smart_constant_value(self._drop_remainder) is False:
       return gen_dataset_ops.padded_batch_dataset(
-          self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+          self._input_dataset._as_variant_tensor(),
           batch_size=self._batch_size,
           padded_shapes=[
               ops.convert_to_tensor(s, dtype=dtypes.int64)
               for s in nest.flatten(self._padded_shapes)
           ],
           padding_values=nest.flatten(self._padding_values),
-          output_shapes=nest.flatten(
-              sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
+          output_shapes=self._structure._flat_shapes)
     else:
       return gen_dataset_ops.padded_batch_dataset_v2(
-          self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+          self._input_dataset._as_variant_tensor(),
           batch_size=self._batch_size,
           padded_shapes=[
               ops.convert_to_tensor(s, dtype=dtypes.int64)
@@ -2623,27 +2646,11 @@ class PaddedBatchDataset(UnaryDataset):
           ],
           padding_values=nest.flatten(self._padding_values),
           drop_remainder=self._drop_remainder,
-          output_shapes=nest.flatten(
-              sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
+          output_shapes=self._structure._flat_shapes)
 
   @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-
-    def _padded_shape_to_batch_shape(s):
-      return tensor_shape.vector(
-          tensor_util.constant_value(self._batch_size) if smart_cond.
-          smart_constant_value(self._drop_remainder) else None).concatenate(
-              tensor_util.constant_value_as_shape(s))
-
-    return nest.map_structure(_padded_shape_to_batch_shape, self._padded_shapes)
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
+  def _element_structure(self):
+    return self._structure
 
 
 def _should_unpack_args(args):
@@ -2672,39 +2679,35 @@ def _warn_if_collections(transformation_name):
 class MapDataset(UnaryDataset):
   """A `Dataset` that maps a function over elements in its input."""
 
-  def __init__(self, input_dataset, map_func, use_inter_op_parallelism=True):
+  def __init__(self,
+               input_dataset,
+               map_func,
+               use_inter_op_parallelism=True,
+               preserve_cardinality=False):
     """See `Dataset.map()` for details."""
     super(MapDataset, self).__init__(input_dataset)
     self._input_dataset = input_dataset
     self._use_inter_op_parallelism = use_inter_op_parallelism
-
-    wrapped_func = StructuredFunctionWrapper(
+    self._preserve_cardinality = preserve_cardinality
+    self._map_func = StructuredFunctionWrapper(
         map_func, self._transformation_name(), dataset=input_dataset)
-    self._output_classes = wrapped_func.output_classes
-    self._output_shapes = wrapped_func.output_shapes
-    self._output_types = wrapped_func.output_types
-    self._map_func = wrapped_func.function
 
   def _as_variant_tensor(self):
     input_t = self._input_dataset._as_variant_tensor()  # pylint: disable=protected-access
     return gen_dataset_ops.map_dataset(
         input_t,
-        self._map_func.captured_inputs,
-        f=self._map_func,
+        self._map_func.function.captured_inputs,
+        f=self._map_func.function,
         use_inter_op_parallelism=self._use_inter_op_parallelism,
+        preserve_cardinality=self._preserve_cardinality,
         **flat_structure(self))
 
-  @property
-  def output_classes(self):
-    return self._output_classes
-
-  @property
-  def output_shapes(self):
-    return self._output_shapes
+  def _functions(self):
+    return [self._map_func]
 
   @property
-  def output_types(self):
-    return self._output_types
+  def _element_structure(self):
+    return self._map_func.output_structure
 
   def _transformation_name(self):
     return "Dataset.map()"
@@ -2717,25 +2720,26 @@ class ParallelMapDataset(MapDataset):
                input_dataset,
                map_func,
                num_parallel_calls,
-               use_inter_op_parallelism=True):
+               use_inter_op_parallelism=True,
+               preserve_cardinality=False):
     """See `Dataset.map()` for details."""
-    super(ParallelMapDataset, self).__init__(input_dataset, map_func,
-                                             use_inter_op_parallelism)
+    super(ParallelMapDataset, self).__init__(
+        input_dataset, map_func, use_inter_op_parallelism, preserve_cardinality)
 
     self._num_parallel_calls = ops.convert_to_tensor(
         num_parallel_calls, dtype=dtypes.int32, name="num_parallel_calls")
 
   def _as_variant_tensor(self):
-    input_t = self._input_dataset._as_variant_tensor()  # pylint: disable=protected-access
     # pylint: disable=protected-access
+    input_t = self._input_dataset._as_variant_tensor()
     return gen_dataset_ops.parallel_map_dataset(
         input_t,
-        self._map_func.captured_inputs,
-        f=self._map_func,
+        self._map_func.function.captured_inputs,
+        f=self._map_func.function,
         num_parallel_calls=self._num_parallel_calls,
         use_inter_op_parallelism=self._use_inter_op_parallelism,
+        preserve_cardinality=self._preserve_cardinality,
         **flat_structure(self))
-    # pylint: enable=protected-access
 
 
 class FlatMapDataset(UnaryDataset):
@@ -2746,35 +2750,25 @@ class FlatMapDataset(UnaryDataset):
     super(FlatMapDataset, self).__init__(input_dataset)
     self._input_dataset = input_dataset
 
-    wrapped_func = StructuredFunctionWrapper(
+    self._map_func = StructuredFunctionWrapper(
         map_func, self._transformation_name(), dataset=input_dataset)
-    if not isinstance(wrapped_func.output_structure, DatasetStructure):
+    if not isinstance(self._map_func.output_structure, DatasetStructure):
       raise TypeError("`map_func` must return a `Dataset` object.")
-    # pylint: disable=protected-access
-    element_structure = wrapped_func.output_structure._element_structure
-    self._output_classes = element_structure._to_legacy_output_classes()
-    self._output_types = element_structure._to_legacy_output_types()
-    self._output_shapes = element_structure._to_legacy_output_shapes()
-    self._map_func = wrapped_func.function
+    self._structure = self._map_func.output_structure._element_structure  # pylint: disable=protected-access
+
+  def _functions(self):
+    return [self._map_func]
 
   def _as_variant_tensor(self):
     return gen_dataset_ops.flat_map_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
-        self._map_func.captured_inputs,
-        f=self._map_func,
+        self._map_func.function.captured_inputs,
+        f=self._map_func.function,
         **flat_structure(self))
 
   @property
-  def output_classes(self):
-    return self._output_classes
-
-  @property
-  def output_shapes(self):
-    return self._output_shapes
-
-  @property
-  def output_types(self):
-    return self._output_types
+  def _element_structure(self):
+    return self._structure
 
   def _transformation_name(self):
     return "Dataset.flat_map()"
@@ -2793,12 +2787,13 @@ class InterleaveDataset(FlatMapDataset):
         block_length, dtype=dtypes.int64, name="block_length")
 
   def _as_variant_tensor(self):
+    # pylint: disable=protected-access
     return gen_dataset_ops.interleave_dataset(
-        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
-        self._map_func.captured_inputs,  # pylint: disable=protected-access
+        self._input_dataset._as_variant_tensor(),
+        self._map_func.function.captured_inputs,
         self._cycle_length,
         self._block_length,
-        f=self._map_func,  # pylint: disable=protected-access
+        f=self._map_func.function,
         **flat_structure(self))
 
   def _transformation_name(self):
@@ -2822,13 +2817,14 @@ class ParallelInterleaveDataset(FlatMapDataset):
         num_parallel_calls, dtype=dtypes.int64, name="num_parallel_calls")
 
   def _as_variant_tensor(self):
+    # pylint: disable=protected-access
     return gen_dataset_ops.parallel_interleave_dataset_v2(
-        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
-        self._map_func.captured_inputs,  # pylint: disable=protected-access
+        self._input_dataset._as_variant_tensor(),
+        self._map_func.function.captured_inputs,
         self._cycle_length,
         self._block_length,
         self._num_parallel_calls,
-        f=self._map_func,  # pylint: disable=protected-access
+        f=self._map_func.function,
         **flat_structure(self))
 
   def _transformation_name(self):
@@ -2844,17 +2840,19 @@ class FilterDataset(UnaryUnchangedStructureDataset):
     self._input_dataset = input_dataset
     wrapped_func = StructuredFunctionWrapper(
         predicate, self._transformation_name(), dataset=input_dataset)
-    if not (
-        wrapped_func.output_types == dtypes.bool and
-        wrapped_func.output_shapes.is_compatible_with(tensor_shape.scalar())):
+    if not wrapped_func.output_structure.is_compatible_with(
+        structure_lib.TensorStructure(dtypes.bool, [])):
       raise ValueError("`predicate` must return a scalar boolean tensor.")
-    self._predicate = wrapped_func.function
+    self._predicate = wrapped_func
+
+  def _functions(self):
+    return [self._predicate]
 
   def _as_variant_tensor(self):
     return gen_dataset_ops.filter_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
-        other_arguments=self._predicate.captured_inputs,
-        predicate=self._predicate,
+        other_arguments=self._predicate.function.captured_inputs,
+        predicate=self._predicate.function,
         **flat_structure(self))
 
   def _transformation_name(self):
@@ -2893,19 +2891,17 @@ class WindowDataset(UnaryDataset):
         stride, dtype=dtypes.int64, name="stride")
     self._drop_remainder = ops.convert_to_tensor(
         drop_remainder, dtype=dtypes.bool, name="drop_remainder")
-    self._output_classes = nest.pack_sequence_as(
+    nest_of_structures = nest.pack_sequence_as(
         input_dataset.output_classes,
         [
-            DatasetStructure(
-                structure_lib.Structure._from_legacy_structure(  # pylint: disable=protected-access
-                    output_type, output_shape, output_class))
+            DatasetStructure(structure_lib.convert_legacy_structure(
+                output_type, output_shape, output_class))
             for output_class, output_shape, output_type in zip(
                 nest.flatten(input_dataset.output_classes),
                 nest.flatten(input_dataset.output_shapes),
                 nest.flatten(input_dataset.output_types))
         ])
-    self._output_shapes = self._output_classes
-    self._output_types = self._output_classes
+    self._structure = structure_lib.NestedStructure(nest_of_structures)
 
   def _as_variant_tensor(self):
     return gen_dataset_ops.window_dataset(
@@ -2917,16 +2913,8 @@ class WindowDataset(UnaryDataset):
         **flat_structure(self))
 
   @property
-  def output_classes(self):
-    return self._output_classes
-
-  @property
-  def output_shapes(self):
-    return self._output_shapes
-
-  @property
-  def output_types(self):
-    return self._output_types
+  def _element_structure(self):
+    return self._structure
 
 
 class _OptionsDataset(UnaryUnchangedStructureDataset):
@@ -2992,7 +2980,7 @@ class _SetStatsAggregatorDataset(UnaryUnchangedStructureDataset):
     self._counter_prefix = counter_prefix
 
   def _as_variant_tensor(self):
-    return gen_dataset_ops.set_stats_aggregator_dataset(
+    return ged_ops.experimental_set_stats_aggregator_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         self._stats_aggregator._resource,  # pylint: disable=protected-access
         self._prefix,
diff --git a/tensorflow/python/data/ops/iterator_ops.py b/tensorflow/python/data/ops/iterator_ops.py
index e2ca64c8025b93589522ade4bec63fd9a995a486..d0e91b01f9138470cd2a06a8b353149b74af2497 100644
--- a/tensorflow/python/data/ops/iterator_ops.py
+++ b/tensorflow/python/data/ops/iterator_ops.py
@@ -23,7 +23,6 @@ import warnings
 from tensorflow.python.compat import compat
 from tensorflow.python.data.ops import optional_ops
 from tensorflow.python.data.util import nest
-from tensorflow.python.data.util import sparse
 from tensorflow.python.data.util import structure as structure_lib
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
@@ -100,10 +99,8 @@ class Iterator(checkpointable.CheckpointableBase):
       raise ValueError("If `structure` is not specified, all of "
                        "`output_types`, `output_shapes`, and `output_classes`"
                        " must be specified.")
-    # pylint: disable=protected-access
-    self._structure = structure_lib.Structure._from_legacy_structure(
+    self._structure = structure_lib.convert_legacy_structure(
         output_types, output_shapes, output_classes)
-    # pylint: enable=protected-access
 
     self._string_handle = gen_dataset_ops.iterator_to_string_handle(
         self._iterator_resource)
@@ -190,34 +187,32 @@ class Iterator(checkpointable.CheckpointableBase):
     if output_classes is None:
       output_classes = nest.map_structure(lambda _: ops.Tensor, output_types)
     nest.assert_same_structure(output_types, output_shapes)
+    output_structure = structure_lib.convert_legacy_structure(
+        output_types, output_shapes, output_classes)
     if shared_name is None:
       shared_name = ""
+    # pylint: disable=protected-access
     if compat.forward_compatible(2018, 8, 3):
       if _device_stack_is_empty():
         with ops.device("/cpu:0"):
           iterator_resource = gen_dataset_ops.iterator_v2(
               container="",
               shared_name=shared_name,
-              output_types=nest.flatten(
-                  sparse.as_dense_types(output_types, output_classes)),
-              output_shapes=nest.flatten(
-                  sparse.as_dense_shapes(output_shapes, output_classes)))
+              output_types=output_structure._flat_types,
+              output_shapes=output_structure._flat_shapes)
       else:
         iterator_resource = gen_dataset_ops.iterator_v2(
             container="",
             shared_name=shared_name,
-            output_types=nest.flatten(
-                sparse.as_dense_types(output_types, output_classes)),
-            output_shapes=nest.flatten(
-                sparse.as_dense_shapes(output_shapes, output_classes)))
+            output_types=output_structure._flat_types,
+            output_shapes=output_structure._flat_shapes)
     else:
       iterator_resource = gen_dataset_ops.iterator(
           container="",
           shared_name=shared_name,
-          output_types=nest.flatten(
-              sparse.as_dense_types(output_types, output_classes)),
-          output_shapes=nest.flatten(
-              sparse.as_dense_shapes(output_shapes, output_classes)))
+          output_types=output_structure._flat_types,
+          output_shapes=output_structure._flat_shapes)
+    # pylint: enable=protected-access
     return Iterator(iterator_resource, None, output_types, output_shapes,
                     output_classes)
 
@@ -280,30 +275,28 @@ class Iterator(checkpointable.CheckpointableBase):
     if output_classes is None:
       output_classes = nest.map_structure(lambda _: ops.Tensor, output_types)
     nest.assert_same_structure(output_types, output_shapes)
+    output_structure = structure_lib.convert_legacy_structure(
+        output_types, output_shapes, output_classes)
     string_handle = ops.convert_to_tensor(string_handle, dtype=dtypes.string)
+    # pylint: disable=protected-access
     if compat.forward_compatible(2018, 8, 3):
       if _device_stack_is_empty():
         with ops.device("/cpu:0"):
           iterator_resource = gen_dataset_ops.iterator_from_string_handle_v2(
               string_handle,
-              output_types=nest.flatten(
-                  sparse.as_dense_types(output_types, output_classes)),
-              output_shapes=nest.flatten(
-                  sparse.as_dense_shapes(output_shapes, output_classes)))
+              output_types=output_structure._flat_types,
+              output_shapes=output_structure._flat_shapes)
       else:
         iterator_resource = gen_dataset_ops.iterator_from_string_handle_v2(
             string_handle,
-            output_types=nest.flatten(
-                sparse.as_dense_types(output_types, output_classes)),
-            output_shapes=nest.flatten(
-                sparse.as_dense_shapes(output_shapes, output_classes)))
+            output_types=output_structure._flat_types,
+            output_shapes=output_structure._flat_shapes)
     else:
       iterator_resource = gen_dataset_ops.iterator_from_string_handle(
           string_handle,
-          output_types=nest.flatten(
-              sparse.as_dense_types(output_types, output_classes)),
-          output_shapes=nest.flatten(
-              sparse.as_dense_shapes(output_shapes, output_classes)))
+          output_types=output_structure._flat_types,
+          output_shapes=output_structure._flat_shapes)
+    # pylint: enable=protected-access
     return Iterator(iterator_resource, None, output_types, output_shapes,
                     output_classes)
 
@@ -530,8 +523,9 @@ class EagerIterator(checkpointable.CheckpointableBase):
     self._device = context.context().device_name
     with ops.device("/cpu:0"):
       # pylint: disable=protected-access
+      dataset = dataset._apply_options()
       ds_variant = dataset._as_variant_tensor()
-      self._structure = structure_lib.Structure._from_legacy_structure(
+      self._structure = structure_lib.convert_legacy_structure(
           dataset.output_types, dataset.output_shapes, dataset.output_classes)
       self._flat_output_types = self._structure._flat_types
       self._flat_output_shapes = self._structure._flat_shapes
@@ -543,6 +537,7 @@ class EagerIterator(checkpointable.CheckpointableBase):
         # Delete the resource when this object is deleted
         self._resource_deleter = resource_variable_ops.EagerResourceDeleter(
             handle=self._resource, handle_device=self._device)
+      # pylint: enable=protected-access
 
   def __iter__(self):
     return self
diff --git a/tensorflow/python/data/ops/multi_device_iterator_ops.py b/tensorflow/python/data/ops/multi_device_iterator_ops.py
index 0f9add6461aeeb1e1d81dfb75fefb345b659c349..45d01564794fc181f27fbf449738e8e55aae40d4 100644
--- a/tensorflow/python/data/ops/multi_device_iterator_ops.py
+++ b/tensorflow/python/data/ops/multi_device_iterator_ops.py
@@ -19,8 +19,6 @@ from __future__ import print_function
 
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
-from tensorflow.python.data.util import nest
-from tensorflow.python.data.util import sparse
 from tensorflow.python.eager import context
 from tensorflow.python.eager import function
 from tensorflow.python.framework import dtypes
@@ -36,16 +34,9 @@ class _PerDeviceGenerator(dataset_ops.Dataset):
   """A `dummy` generator dataset."""
 
   def __init__(self, shard_num, multi_device_iterator_resource, incarnation_id,
-               source_device, target_device, output_shapes, output_types,
-               output_classes):
+               source_device, target_device, element_structure):
     self._target_device = target_device
-    self._output_types = output_types
-    self._output_shapes = output_shapes
-    self._output_classes = output_classes
-    self._flat_output_shapes = nest.flatten(
-        sparse.as_dense_shapes(self._output_shapes, self._output_classes))
-    self._flat_output_types = nest.flatten(
-        sparse.as_dense_types(self._output_types, self._output_classes))
+    self._structure = element_structure
 
     multi_device_iterator_string_handle = (
         gen_dataset_ops.multi_device_iterator_to_string_handle(
@@ -70,17 +61,18 @@ class _PerDeviceGenerator(dataset_ops.Dataset):
 
     @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.string)])
     def _next_func(string_handle):
+      # pylint: disable=protected-access
       multi_device_iterator = (
           gen_dataset_ops.multi_device_iterator_from_string_handle(
               string_handle=string_handle,
-              output_types=self._flat_output_types,
-              output_shapes=self._flat_output_shapes))
+              output_types=self._structure._flat_types,
+              output_shapes=self._structure._flat_shapes))
       return gen_dataset_ops.multi_device_iterator_get_next_from_shard(
           multi_device_iterator=multi_device_iterator,
           shard_num=shard_num,
           incarnation_id=incarnation_id,
-          output_types=self._flat_output_types,
-          output_shapes=self._flat_output_shapes)
+          output_types=self._structure._flat_types,
+          output_shapes=self._structure._flat_shapes)
 
     next_func_concrete = _next_func._get_concrete_function_internal()  # pylint: disable=protected-access
 
@@ -90,9 +82,8 @@ class _PerDeviceGenerator(dataset_ops.Dataset):
     def _remote_next_func(string_handle):
       return functional_ops.remote_call(
           target=source_device,
-          args=[string_handle] +
-          next_func_concrete.captured_inputs,
-          Tout=self._flat_output_types,
+          args=[string_handle] + next_func_concrete.captured_inputs,
+          Tout=self._structure._flat_types,  # pylint: disable=protected-access
           f=next_func_concrete)
 
     self._next_func = _remote_next_func._get_concrete_function_internal()  # pylint: disable=protected-access
@@ -108,8 +99,7 @@ class _PerDeviceGenerator(dataset_ops.Dataset):
     def _remote_finalize_func(string_handle):
       return functional_ops.remote_call(
           target=source_device,
-          args=[string_handle] +
-          finalize_func_concrete.captured_inputs,
+          args=[string_handle] + finalize_func_concrete.captured_inputs,
           Tout=[dtypes.int64],
           f=finalize_func_concrete)
 
@@ -126,24 +116,15 @@ class _PerDeviceGenerator(dataset_ops.Dataset):
           init_func=self._init_func,
           next_func=self._next_func,
           finalize_func=self._finalize_func,
-          output_types=self._flat_output_types,
-          output_shapes=self._flat_output_shapes)
+          **dataset_ops.flat_structure(self))
 
   def _inputs(self):
     # TODO(b/116506223): Determine which datasets should be used as inputs here.
     return []
 
   @property
-  def output_types(self):
-    return self._output_types
-
-  @property
-  def output_shapes(self):
-    return self._output_shapes
-
-  @property
-  def output_classes(self):
-    return self._output_classes
+  def _element_structure(self):
+    return self._structure
 
 
 class MultiDeviceIterator(object):
@@ -183,13 +164,6 @@ class MultiDeviceIterator(object):
     self._source_device = source_device
     self._source_device_tensor = ops.convert_to_tensor(source_device)
 
-    self._flat_output_shapes = nest.flatten(
-        sparse.as_dense_shapes(self._dataset.output_shapes,
-                               self._dataset.output_classes))
-    self._flat_output_types = nest.flatten(
-        sparse.as_dense_types(self._dataset.output_types,
-                              self._dataset.output_classes))
-
     # Create the MultiDeviceIterator.
     with ops.device(self._source_device):
       self._multi_device_iterator_resource = (
@@ -197,8 +171,7 @@ class MultiDeviceIterator(object):
               devices=self._devices,
               shared_name="",
               container="",
-              output_types=self._flat_output_types,
-              output_shapes=self._flat_output_shapes))
+              **dataset_ops.flat_structure(dataset)))
 
       # The incarnation ID is used to ensure consistency between the per-device
       # iterators and the multi-device iterator.
@@ -216,13 +189,14 @@ class MultiDeviceIterator(object):
     for i, device in enumerate(self._devices):
       ds = _PerDeviceGenerator(
           i, self._multi_device_iterator_resource, self._incarnation_id,
-          self._source_device_tensor, device, self._dataset.output_shapes,
-          self._dataset.output_types, self._dataset.output_classes)
+          self._source_device_tensor, device, dataset._element_structure)  # pylint: disable=protected-access
       if prefetch_buffer_size > 0:
         ds = ds.prefetch(prefetch_buffer_size)
-      # TODO(jsimsa): Enable auto-tuning when supported for non-CPU devices.
+      # TODO(jsimsa): Enable auto-tuning and optimizations when supported for
+      # non-CPU devices.
       options = dataset_ops.Options()
       options.experimental_autotune = False
+      options.experimental_optimization.apply_default_optimizations = False
       ds = ds.with_options(options)
       with ops.device(device):
         self._device_iterators.append(ds.make_initializable_iterator())
diff --git a/tensorflow/python/data/ops/optional_ops.py b/tensorflow/python/data/ops/optional_ops.py
index 4113b7ed315ede5789d21cfe9c59ab91d2d6e4ec..dcb743bee01964baf06543587661bb73b2225abb 100644
--- a/tensorflow/python/data/ops/optional_ops.py
+++ b/tensorflow/python/data/ops/optional_ops.py
@@ -26,6 +26,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
 @six.add_metaclass(abc.ABCMeta)
@@ -145,6 +146,7 @@ class _OptionalImpl(Optional):
     return self._value_structure
 
 
+@tf_export("data.experimental.OptionalStructure")
 class OptionalStructure(structure.Structure):
   """Represents an optional potentially containing a structured value."""
 
@@ -167,6 +169,10 @@ class OptionalStructure(structure.Structure):
   def _to_tensor_list(self, value):
     return [value._variant_tensor]  # pylint: disable=protected-access
 
+  def _to_batched_tensor_list(self, value):
+    raise NotImplementedError(
+        "Unbatching for `tf.data.experimental.Optional` objects.")
+
   def _from_tensor_list(self, flat_value):
     if (len(flat_value) != 1 or flat_value[0].dtype != dtypes.variant or
         not flat_value[0].shape.is_compatible_with(tensor_shape.scalar())):
@@ -191,6 +197,14 @@ class OptionalStructure(structure.Structure):
   def _to_legacy_output_classes(self):
     return self
 
+  def _batch(self, batch_size):
+    raise NotImplementedError(
+        "Batching for `tf.data.experimental.Optional` objects.")
+
+  def _unbatch(self):
+    raise NotImplementedError(
+        "Unbatching for `tf.data.experimental.Optional` objects.")
+
 
 # pylint: disable=protected-access
 structure.Structure._register_custom_converter(Optional,
diff --git a/tensorflow/python/data/ops/readers.py b/tensorflow/python/data/ops/readers.py
index a93f3e44d1081e9adeafc4c88d2794f1433443e7..0d6023dea28e3cefa13b32717e2aee87ac2c2bbf 100644
--- a/tensorflow/python/data/ops/readers.py
+++ b/tensorflow/python/data/ops/readers.py
@@ -20,11 +20,13 @@ from __future__ import print_function
 from tensorflow.python.compat import compat
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import convert
+from tensorflow.python.data.util import structure
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -63,16 +65,8 @@ class TextLineDatasetV2(dataset_ops.DatasetSource):
         self._filenames, self._compression_type, self._buffer_size)
 
   @property
-  def output_classes(self):
-    return ops.Tensor
-
-  @property
-  def output_shapes(self):
-    return tensor_shape.scalar()
-
-  @property
-  def output_types(self):
-    return dtypes.string
+  def _element_structure(self):
+    return structure.TensorStructure(dtypes.string, [])
 
 
 @tf_export(v1=["data.TextLineDataset"])
@@ -125,16 +119,8 @@ class _TFRecordDataset(dataset_ops.DatasetSource):
         self._filenames, self._compression_type, self._buffer_size)
 
   @property
-  def output_classes(self):
-    return ops.Tensor
-
-  @property
-  def output_shapes(self):
-    return tensor_shape.TensorShape([])
-
-  @property
-  def output_types(self):
-    return dtypes.string
+  def _element_structure(self):
+    return structure.TensorStructure(dtypes.string, [])
 
 
 class ParallelInterleaveDataset(dataset_ops.InterleaveDataset):
@@ -158,15 +144,15 @@ class ParallelInterleaveDataset(dataset_ops.InterleaveDataset):
 
   def _as_variant_tensor(self):
     # pylint: disable=protected-access
-    return gen_dataset_ops.parallel_interleave_dataset(
+    return ged_ops.experimental_parallel_interleave_dataset(
         self._input_dataset._as_variant_tensor(),
-        self._map_func.captured_inputs,
+        self._map_func.function.captured_inputs,
         self._cycle_length,
         self._block_length,
         self._sloppy,
         self._buffer_output_elements,
         self._prefetch_input_elements,
-        f=self._map_func,
+        f=self._map_func.function,
         **dataset_ops.flat_structure(self))
     # pylint: enable=protected-access
 
@@ -247,16 +233,8 @@ class TFRecordDatasetV2(dataset_ops.DatasetV2):
     return self._impl._inputs()  # pylint: disable=protected-access
 
   @property
-  def output_classes(self):
-    return self._impl.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._impl.output_shapes
-
-  @property
-  def output_types(self):
-    return self._impl.output_types
+  def _element_structure(self):
+    return structure.TensorStructure(dtypes.string, [])
 
 
 @tf_export(v1=["data.TFRecordDataset"])
@@ -347,16 +325,8 @@ class FixedLengthRecordDatasetV2(dataset_ops.DatasetSource):
           self._footer_bytes, self._buffer_size)
 
   @property
-  def output_classes(self):
-    return ops.Tensor
-
-  @property
-  def output_shapes(self):
-    return tensor_shape.scalar()
-
-  @property
-  def output_types(self):
-    return dtypes.string
+  def _element_structure(self):
+    return structure.TensorStructure(dtypes.string, [])
 
 
 @tf_export(v1=["data.FixedLengthRecordDataset"])
diff --git a/tensorflow/python/data/util/BUILD b/tensorflow/python/data/util/BUILD
index f15ebc32a833369a8862a884929eca9e09ed1229..04e80299e0d57965c21b88bd94250cb62e76d452 100644
--- a/tensorflow/python/data/util/BUILD
+++ b/tensorflow/python/data/util/BUILD
@@ -93,6 +93,7 @@ py_test(
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python:variables",
+        "//tensorflow/python/data/kernel_tests:test_base",
         "@absl_py//absl/testing:parameterized",
     ],
 )
diff --git a/tensorflow/python/data/util/options.py b/tensorflow/python/data/util/options.py
index 9badba8e5670c749b833da7f1e2094f4f3548098..3c79197fae8d6df91ba477db8f9475dfd3fb61c9 100644
--- a/tensorflow/python/data/util/options.py
+++ b/tensorflow/python/data/util/options.py
@@ -31,7 +31,8 @@ class OptionsBase(object):
   """
 
   def __init__(self):
-    self._options = {}
+    # NOTE: Cannot use `self._options` here as we override `__setattr__`
+    object.__setattr__(self, "_options", {})
 
   def __eq__(self, other):
     if not isinstance(other, self.__class__):
@@ -47,28 +48,40 @@ class OptionsBase(object):
     else:
       return NotImplemented
 
+  def __setattr__(self, name, value):
+    if hasattr(self, name):
+      object.__setattr__(self, name, value)
+    else:
+      raise AttributeError(
+          "Cannot set the property %s on %s." % (name, type(self).__name__))
+
 
-def create_option(name, ty, docstring, default=None):
+def create_option(name, ty, docstring, default_factory=lambda: None):
   """Creates a type-checked property.
 
   Args:
-    name: the name to use
-    ty: the type to use
-    docstring: the docstring to use
-    default: the default value to use
+    name: The name to use.
+    ty: The type to use. The type of the property will be validated when it
+      is set.
+    docstring: The docstring to use.
+    default_factory: A callable that takes no arguments and returns a default
+      value to use if not set.
 
   Returns:
     A type-checked property.
   """
 
-  def get_fn(self):
-    return self._options.get(name, default)  # pylint: disable=protected-access
+  def get_fn(option):
+    # pylint: disable=protected-access
+    if name not in option._options:
+      option._options[name] = default_factory()
+    return option._options.get(name)
 
-  def set_fn(self, value):
+  def set_fn(option, value):
     if not isinstance(value, ty):
       raise TypeError("Property \"%s\" must be of type %s, got: %r (type: %r)" %
                       (name, ty, value, type(value)))
-    self._options[name] = value  # pylint: disable=protected-access
+    option._options[name] = value  # pylint: disable=protected-access
 
   return property(get_fn, set_fn, None, docstring)
 
diff --git a/tensorflow/python/data/util/options_test.py b/tensorflow/python/data/util/options_test.py
index c5169835a322923d7bf2d644717870d87bfab13f..b21afbd455db6c7f3da61df3e1dd8a4897603b85 100644
--- a/tensorflow/python/data/util/options_test.py
+++ b/tensorflow/python/data/util/options_test.py
@@ -24,9 +24,12 @@ from tensorflow.python.platform import test
 
 class _TestOptions(options.OptionsBase):
   x = options.create_option(
-      name="x", ty=int, docstring="the answer to everything", default=42)
+      name="x",
+      ty=int,
+      docstring="the answer to everything",
+      default_factory=lambda: 42)
   y = options.create_option(
-      name="y", ty=float, docstring="a tasty pie", default=3.14)
+      name="y", ty=float, docstring="a tasty pie", default_factory=lambda: 3.14)
 
 
 class _NestedTestOptions(options.OptionsBase):
@@ -91,6 +94,13 @@ class OptionsTest(test.TestCase):
     with self.assertRaises(TypeError):
       options.merge_options(options1, options2)
 
+  def testNoSpuriousAttrs(self):
+    test_options = _TestOptions()
+    with self.assertRaises(AttributeError):
+      test_options.wrong_attr = True
+    with self.assertRaises(AttributeError):
+      _ = test_options.wrong_attr
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/util/structure.py b/tensorflow/python/data/util/structure.py
index 3cf67b07453cdb58abaec0e854eec9847a7a833f..9de0c4da0ebe0beec31aa652397f06d6dc665e63 100644
--- a/tensorflow/python/data/util/structure.py
+++ b/tensorflow/python/data/util/structure.py
@@ -28,11 +28,13 @@ from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import sparse_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
 _STRUCTURE_CONVERSION_FUNCTION_REGISTRY = {}
 
 
+@tf_export("data.experimental.Structure")
 @six.add_metaclass(abc.ABCMeta)
 class Structure(object):
   """Represents structural information, such as type and shape, about a value.
@@ -111,6 +113,26 @@ class Structure(object):
     """
     raise NotImplementedError("Structure._to_tensor_list()")
 
+  @abc.abstractmethod
+  def _to_batched_tensor_list(self, value):
+    """Returns a flat list of rank >= 1 `tf.Tensor` representing `value`.
+
+    This method can be used, along with `self._flat_shapes` and
+    `self._flat_types` to represent structured values in lower level APIs
+    (such as plain TensorFlow operations) that do not understand structure,
+    *and* that require that the plain tensors have a rank of at least one
+    (e.g. for the purpose of slicing the tensors).
+
+    Requires: `self.is_compatible_with(Structure.from_value(value))`.
+
+    Args:
+      value: A value with compatible structure.
+
+    Returns:
+      A flat list of `tf.Tensor` representing `value`.
+    """
+    raise NotImplementedError("Structure._to_batched_tensor_list()")
+
   @abc.abstractmethod
   def _from_tensor_list(self, flat_value):
     """Builds a flat list of `tf.Tensor` into a value matching this structure.
@@ -144,6 +166,23 @@ class Structure(object):
     """
     return self._from_tensor_list(flat_value)
 
+  @abc.abstractmethod
+  def _batch(self, batch_size):
+    """Returns a structure representing a batch of objects with this structure.
+
+    Args:
+      batch_size: An `int` representing the number of elements in a batch,
+        or `None` if the batch size may vary.
+
+    Returns:
+      A `Structure` representing a batch of objects with this structure.
+    """
+    raise NotImplementedError("Structure._batch()")
+
+  @abc.abstractmethod
+  def _unbatch(self):
+    raise NotImplementedError("Structure._unbatch()")
+
   @staticmethod
   def from_value(value):
     """Returns a `Structure` that represents the given `value`.
@@ -177,56 +216,6 @@ class Structure(object):
         raise TypeError("Could not build a structure for %r" % value)
       return TensorStructure.from_value(tensor)
 
-  @staticmethod
-  def _from_legacy_structure(output_types, output_shapes, output_classes):
-    """Returns a `Structure` that represents the given legacy structure.
-
-    This method provides a way to convert from the existing `Dataset` and
-    `Iterator` structure-related properties to a `Structure` object.
-
-    TODO(b/110122868): Remove this method once `Structure` is used throughout
-    `tf.data`.
-
-    Args:
-      output_types: A nested structure of `tf.DType` objects corresponding to
-        each component of a structured value.
-      output_shapes: A nested structure of `tf.TensorShape` objects
-        corresponding to each component a structured value.
-      output_classes: A nested structure of Python `type` objects corresponding
-        to each component of a structured value.
-
-    Returns:
-      A `Structure`.
-
-    Raises:
-      TypeError: If a structure cannot be built the arguments, because one of
-        the component classes in `output_classes` is not supported.
-    """
-    flat_types = nest.flatten(output_types)
-    flat_shapes = nest.flatten(output_shapes)
-    flat_classes = nest.flatten(output_classes)
-    flat_ret = []
-    for flat_type, flat_shape, flat_class in zip(flat_types, flat_shapes,
-                                                 flat_classes):
-      if isinstance(flat_class, Structure):
-        flat_ret.append(flat_class)
-      elif issubclass(flat_class, sparse_tensor_lib.SparseTensor):
-        flat_ret.append(SparseTensorStructure(flat_type, flat_shape))
-      elif issubclass(flat_class, ops.Tensor):
-        flat_ret.append(TensorStructure(flat_type, flat_shape))
-      else:
-        # NOTE(mrry): Since legacy structures produced by iterators only
-        # comprise Tensors, SparseTensors, and nests, we do not need to
-        # support all structure types here.
-        raise TypeError(
-            "Could not build a structure for output class %r" % flat_type)
-
-    ret = nest.pack_sequence_as(output_classes, flat_ret)
-    if isinstance(ret, Structure):
-      return ret
-    else:
-      return NestedStructure(ret)
-
   @staticmethod
   def _register_custom_converter(type_object, converter_fn):
     """Registers `converter_fn` for converting values of the given type.
@@ -252,9 +241,63 @@ class Structure(object):
     raise NotImplementedError("Structure._to_legacy_output_classes()")
 
 
+def convert_legacy_structure(output_types, output_shapes, output_classes):
+  """Returns a `Structure` that represents the given legacy structure.
+
+  This method provides a way to convert from the existing `Dataset` and
+  `Iterator` structure-related properties to a `Structure` object. A "legacy"
+  structure is represented by the `tf.data.Dataset.output_types`,
+  `tf.data.Dataset.output_shapes`, and `tf.data.Dataset.output_classes`
+  properties.
+
+  TODO(b/110122868): Remove this function once `Structure` is used throughout
+  `tf.data`.
+
+  Args:
+    output_types: A nested structure of `tf.DType` objects corresponding to
+      each component of a structured value.
+    output_shapes: A nested structure of `tf.TensorShape` objects
+      corresponding to each component a structured value.
+    output_classes: A nested structure of Python `type` objects corresponding
+      to each component of a structured value.
+
+  Returns:
+    A `Structure`.
+
+  Raises:
+    TypeError: If a structure cannot be built from the arguments, because one of
+      the component classes in `output_classes` is not supported.
+  """
+  flat_types = nest.flatten(output_types)
+  flat_shapes = nest.flatten(output_shapes)
+  flat_classes = nest.flatten(output_classes)
+  flat_ret = []
+  for flat_type, flat_shape, flat_class in zip(flat_types, flat_shapes,
+                                               flat_classes):
+    if isinstance(flat_class, Structure):
+      flat_ret.append(flat_class)
+    elif issubclass(flat_class, sparse_tensor_lib.SparseTensor):
+      flat_ret.append(SparseTensorStructure(flat_type, flat_shape))
+    elif issubclass(flat_class, ops.Tensor):
+      flat_ret.append(TensorStructure(flat_type, flat_shape))
+    else:
+      # NOTE(mrry): Since legacy structures produced by iterators only
+      # comprise Tensors, SparseTensors, and nests, we do not need to
+      # support all structure types here.
+      raise TypeError(
+          "Could not build a structure for output class %r" % flat_type)
+
+  ret = nest.pack_sequence_as(output_classes, flat_ret)
+  if isinstance(ret, Structure):
+    return ret
+  else:
+    return NestedStructure(ret)
+
+
 # NOTE(mrry): The following classes make extensive use of non-public methods of
 # their base class, so we disable the protected-access lint warning once here.
 # pylint: disable=protected-access
+@tf_export("data.experimental.NestedStructure")
 class NestedStructure(Structure):
   """Represents a nested structure in which each leaf is a `Structure`."""
 
@@ -310,21 +353,45 @@ class NestedStructure(Structure):
       ret.extend(structure._to_tensor_list(sub_value))
     return ret
 
+  def _to_batched_tensor_list(self, value):
+    ret = []
+
+    try:
+      flat_value = nest.flatten_up_to(self._nested_structure, value)
+    except (ValueError, TypeError):
+      raise ValueError("The value %r is not compatible with the nested "
+                       "structure %r." % (value, self._nested_structure))
+
+    for sub_value, structure in zip(flat_value, self._flat_nested_structure):
+      if not structure.is_compatible_with(Structure.from_value(sub_value)):
+        raise ValueError("Component value %r is not compatible with the nested "
+                         "structure %r." % (sub_value, structure))
+      ret.extend(structure._to_batched_tensor_list(sub_value))
+    return ret
+
   def _from_tensor_list(self, flat_value):
     if len(flat_value) != len(self._flat_types):
       raise ValueError("Expected %d flat values in NestedStructure but got %d."
                        % (len(self._flat_types), len(flat_value)))
 
     flat_ret = []
-    for sub_value, structure in zip(flat_value, self._flat_nested_structure):
-      flat_ret.append(structure._from_tensor_list([sub_value]))
+    i = 0
+    for structure in self._flat_nested_structure:
+      num_flat_values = len(structure._flat_types)
+      sub_value = flat_value[i:i + num_flat_values]
+      flat_ret.append(structure._from_tensor_list(sub_value))
+      i += num_flat_values
 
     return nest.pack_sequence_as(self._nested_structure, flat_ret)
 
   def _from_compatible_tensor_list(self, flat_value):
     flat_ret = []
-    for sub_value, structure in zip(flat_value, self._flat_nested_structure):
-      flat_ret.append(structure._from_compatible_tensor_list([sub_value]))
+    i = 0
+    for structure in self._flat_nested_structure:
+      num_flat_values = len(structure._flat_types)
+      sub_value = flat_value[i:i + num_flat_values]
+      flat_ret.append(structure._from_compatible_tensor_list(sub_value))
+      i += num_flat_values
 
     return nest.pack_sequence_as(self._nested_structure, flat_ret)
 
@@ -347,7 +414,16 @@ class NestedStructure(Structure):
     return nest.map_structure(
         lambda s: s._to_legacy_output_classes(), self._nested_structure)
 
+  def _batch(self, batch_size):
+    return NestedStructure(nest.map_structure(
+        lambda s: s._batch(batch_size), self._nested_structure))
+
+  def _unbatch(self):
+    return NestedStructure(nest.map_structure(
+        lambda s: s._unbatch(), self._nested_structure))
 
+
+@tf_export("data.experimental.TensorStructure")
 class TensorStructure(Structure):
   """Represents structural information about a `tf.Tensor`."""
 
@@ -374,6 +450,11 @@ class TensorStructure(Structure):
                        "and shape %s." % (value, self._dtype, self._shape))
     return [value]
 
+  def _to_batched_tensor_list(self, value):
+    if self._shape.merge_with(value.shape).ndims == 0:
+      raise ValueError("Unbatching a tensor is only supported for rank >= 1")
+    return [value]
+
   def _from_tensor_list(self, flat_value):
     if len(flat_value) != 1:
       raise ValueError("TensorStructure corresponds to a single tf.Tensor.")
@@ -405,7 +486,18 @@ class TensorStructure(Structure):
   def _to_legacy_output_classes(self):
     return ops.Tensor
 
+  def _batch(self, batch_size):
+    return TensorStructure(
+        self._dtype,
+        tensor_shape.TensorShape([batch_size]).concatenate(self._shape))
+
+  def _unbatch(self):
+    if self._shape.ndims == 0:
+      raise ValueError("Unbatching a tensor is only supported for rank >= 1")
+    return TensorStructure(self._dtype, self._shape[1:])
+
 
+@tf_export("data.experimental.SparseTensorStructure")
 class SparseTensorStructure(Structure):
   """Represents structural information about a `tf.SparseTensor`."""
 
@@ -433,6 +525,13 @@ class SparseTensorStructure(Structure):
   def _to_tensor_list(self, value):
     return [sparse_ops.serialize_sparse(value, out_type=dtypes.variant)]
 
+  def _to_batched_tensor_list(self, value):
+    if self._dense_shape.merge_with(
+        tensor_util.constant_value_as_shape(value.dense_shape)).ndims == 0:
+      raise ValueError(
+          "Unbatching a sparse tensor is only supported for rank >= 1")
+    return [sparse_ops.serialize_many_sparse(value, out_type=dtypes.variant)]
+
   def _from_tensor_list(self, flat_value):
     if (len(flat_value) != 1 or flat_value[0].dtype != dtypes.variant or
         not flat_value[0].shape.is_compatible_with(tensor_shape.vector(3))):
@@ -462,3 +561,13 @@ class SparseTensorStructure(Structure):
 
   def _to_legacy_output_classes(self):
     return sparse_tensor_lib.SparseTensor
+
+  def _batch(self, batch_size):
+    return SparseTensorStructure(
+        self._dtype,
+        tensor_shape.TensorShape([batch_size]).concatenate(self._dense_shape))
+
+  def _unbatch(self):
+    if self._dense_shape.ndims == 0:
+      raise ValueError("Unbatching a tensor is only supported for rank >= 1")
+    return SparseTensorStructure(self._dtype, self._dense_shape[1:])
diff --git a/tensorflow/python/data/util/structure_test.py b/tensorflow/python/data/util/structure_test.py
index e9e2f5be0ae60e71909d49f87ebb5a1deaef4809..91dcfa6f6089bf052526e17ca8f0e646f7e86d71 100644
--- a/tensorflow/python/data/util/structure_test.py
+++ b/tensorflow/python/data/util/structure_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 from absl.testing import parameterized
 import numpy as np
 
+from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.util import nest
 from tensorflow.python.data.util import structure
 from tensorflow.python.framework import constant_op
@@ -34,7 +35,7 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
-class StructureTest(test.TestCase, parameterized.TestCase):
+class StructureTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   # NOTE(mrry): The arguments must be lifted into lambdas because otherwise they
   # will be executed before the (eager- or graph-mode) test environment has been
@@ -352,12 +353,141 @@ class StructureTest(test.TestCase, parameterized.TestCase):
            "b": (structure.SparseTensorStructure(dtypes.int32, [2, 2]),
                  structure.TensorStructure(dtypes.string, []))})),
   )
-  def testFromLegacyStructure(self, output_types, output_shapes, output_classes,
-                              expected_structure):
-    actual_structure = structure.Structure._from_legacy_structure(
+  def testConvertLegacyStructure(self, output_types, output_shapes,
+                                 output_classes, expected_structure):
+    actual_structure = structure.convert_legacy_structure(
         output_types, output_shapes, output_classes)
     self.assertTrue(expected_structure.is_compatible_with(actual_structure))
     self.assertTrue(actual_structure.is_compatible_with(expected_structure))
 
+  def testNestedNestedStructure(self):
+    # Although `Structure.from_value()` will not construct one, a nested
+    # structure containing nested `NestedStructure` objects can occur if a
+    # structure is constructed manually.
+    s = structure.NestedStructure(
+        (structure.TensorStructure(dtypes.int64, []),
+         structure.NestedStructure(
+             (structure.TensorStructure(dtypes.float32, []),
+              structure.TensorStructure(dtypes.string, [])))))
+
+    int64_t = constant_op.constant(37, dtype=dtypes.int64)
+    float32_t = constant_op.constant(42.0)
+    string_t = constant_op.constant("Foo")
+
+    nested_tensors = (int64_t, (float32_t, string_t))
+
+    tensor_list = s._to_tensor_list(nested_tensors)
+    for expected, actual in zip([int64_t, float32_t, string_t], tensor_list):
+      self.assertIs(expected, actual)
+
+    (actual_int64_t, (actual_float32_t, actual_string_t)) = s._from_tensor_list(
+        tensor_list)
+    self.assertIs(int64_t, actual_int64_t)
+    self.assertIs(float32_t, actual_float32_t)
+    self.assertIs(string_t, actual_string_t)
+
+    (actual_int64_t, (actual_float32_t, actual_string_t)) = (
+        s._from_compatible_tensor_list(tensor_list))
+    self.assertIs(int64_t, actual_int64_t)
+    self.assertIs(float32_t, actual_float32_t)
+    self.assertIs(string_t, actual_string_t)
+
+  @parameterized.named_parameters(
+      ("Tensor", structure.TensorStructure(dtypes.float32, []), 32,
+       structure.TensorStructure(dtypes.float32, [32])),
+      ("TensorUnknown", structure.TensorStructure(dtypes.float32, []), None,
+       structure.TensorStructure(dtypes.float32, [None])),
+      ("SparseTensor", structure.SparseTensorStructure(dtypes.float32, [None]),
+       32, structure.SparseTensorStructure(dtypes.float32, [32, None])),
+      ("SparseTensorUnknown",
+       structure.SparseTensorStructure(dtypes.float32, [4]), None,
+       structure.SparseTensorStructure(dtypes.float32, [None, 4])),
+      ("Nest", structure.NestedStructure({
+          "a": structure.TensorStructure(dtypes.float32, []),
+          "b": (structure.SparseTensorStructure(dtypes.int32, [2, 2]),
+                structure.TensorStructure(dtypes.string, []))}), 128,
+       structure.NestedStructure({
+           "a": structure.TensorStructure(dtypes.float32, [128]),
+           "b": (structure.SparseTensorStructure(dtypes.int32, [128, 2, 2]),
+                 structure.TensorStructure(dtypes.string, [128]))})),
+  )
+  def testBatch(self, element_structure, batch_size,
+                expected_batched_structure):
+    batched_structure = element_structure._batch(batch_size)
+    self.assertTrue(
+        batched_structure.is_compatible_with(expected_batched_structure))
+    self.assertTrue(
+        expected_batched_structure.is_compatible_with(batched_structure))
+
+  @parameterized.named_parameters(
+      ("Tensor", structure.TensorStructure(dtypes.float32, [32]),
+       structure.TensorStructure(dtypes.float32, [])),
+      ("TensorUnknown", structure.TensorStructure(dtypes.float32, [None]),
+       structure.TensorStructure(dtypes.float32, [])),
+      ("SparseTensor",
+       structure.SparseTensorStructure(dtypes.float32, [32, None]),
+       structure.SparseTensorStructure(dtypes.float32, [None])),
+      ("SparseTensorUnknown",
+       structure.SparseTensorStructure(dtypes.float32, [None, 4]),
+       structure.SparseTensorStructure(dtypes.float32, [4])),
+      ("Nest", structure.NestedStructure({
+          "a": structure.TensorStructure(dtypes.float32, [128]),
+          "b": (structure.SparseTensorStructure(dtypes.int32, [128, 2, 2]),
+                structure.TensorStructure(dtypes.string, [None]))}),
+       structure.NestedStructure({
+           "a": structure.TensorStructure(dtypes.float32, []),
+           "b": (structure.SparseTensorStructure(dtypes.int32, [2, 2]),
+                 structure.TensorStructure(dtypes.string, []))})),
+  )
+  def testUnbatch(self, element_structure, expected_unbatched_structure):
+    unbatched_structure = element_structure._unbatch()
+    self.assertTrue(
+        unbatched_structure.is_compatible_with(expected_unbatched_structure))
+    self.assertTrue(
+        expected_unbatched_structure.is_compatible_with(unbatched_structure))
+
+  # pylint: disable=g-long-lambda
+  @parameterized.named_parameters(
+      ("Tensor", lambda: constant_op.constant([[1.0, 2.0], [3.0, 4.0]]),
+       lambda: constant_op.constant([1.0, 2.0])),
+      ("SparseTensor", lambda: sparse_tensor.SparseTensor(
+          indices=[[0, 0], [1, 1]], values=[13, 27], dense_shape=[2, 2]),
+       lambda: sparse_tensor.SparseTensor(
+           indices=[[0]], values=[13], dense_shape=[2])),
+      ("Nest", lambda: (
+          constant_op.constant([[1.0, 2.0], [3.0, 4.0]]),
+          sparse_tensor.SparseTensor(
+              indices=[[0, 0], [1, 1]], values=[13, 27], dense_shape=[2, 2])),
+       lambda: (constant_op.constant([1.0, 2.0]), sparse_tensor.SparseTensor(
+           indices=[[0]], values=[13], dense_shape=[2]))),
+  )
+  def testToBatchedTensorList(self, value_fn, element_0_fn):
+    batched_value = value_fn()
+    s = structure.Structure.from_value(batched_value)
+    batched_tensor_list = s._to_batched_tensor_list(batched_value)
+
+    # The batch dimension is 2 for all of the test cases.
+    # NOTE(mrry): `tf.shape()` does not currently work for the DT_VARIANT
+    # tensors in which we store sparse tensors.
+    for t in batched_tensor_list:
+      if t.dtype != dtypes.variant:
+        self.assertEqual(2, self.evaluate(array_ops.shape(t)[0]))
+
+    # Test that the 0th element from the unbatched tensor is equal to the
+    # expected value.
+    expected_element_0 = self.evaluate(element_0_fn())
+    unbatched_s = s._unbatch()
+    actual_element_0 = unbatched_s._from_tensor_list(
+        [t[0] for t in batched_tensor_list])
+
+    for expected, actual in zip(
+        nest.flatten(expected_element_0), nest.flatten(actual_element_0)):
+      if sparse_tensor.is_sparse(expected):
+        self.assertSparseValuesEqual(expected, actual)
+      else:
+        self.assertAllEqual(expected, actual)
+
+  # pylint: enable=g-long-lambda
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/debug/BUILD b/tensorflow/python/debug/BUILD
index c6abd476d9d274a3aab270a548f5b0ebd3b6d257..1dcdb880f553422c53cd8323ff888dc2e1c60719 100644
--- a/tensorflow/python/debug/BUILD
+++ b/tensorflow/python/debug/BUILD
@@ -1132,4 +1132,7 @@ sh_test(
         ":debug_tflearn_iris",
         ":offline_analyzer",
     ],
+    tags = [
+        "no_windows",
+    ],
 )
diff --git a/tensorflow/python/debug/cli/analyzer_cli_test.py b/tensorflow/python/debug/cli/analyzer_cli_test.py
index 322ecf94667f29eba4ecbfbd42368e9890e8f36a..586982dc4bf3511925f46268c537ed53d54ed700 100644
--- a/tensorflow/python/debug/cli/analyzer_cli_test.py
+++ b/tensorflow/python/debug/cli/analyzer_cli_test.py
@@ -573,6 +573,7 @@ def create_analyzer_cli(dump):
   return analyzer, registry
 
 
+@test_util.run_v1_only("b/120545219")
 class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
 
   @classmethod
@@ -645,7 +646,6 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
     self.assertEqual(len("Size (B)") + 1, dump_size_col_width)
     self.assertEqual(len("Op type") + 1, op_type_col_width)
 
-  @test_util.run_deprecated_v1
   def testMeasureTensorListColumnWidthsGivesRightAnswerForData(self):
     dump = self._debug_dump.dumped_tensor_data[0]
     self.assertLess(dump.dump_size_bytes, 1000)
@@ -661,7 +661,6 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
     # column should be determined by the length of "VariableV2".
     self.assertEqual(len("VariableV2") + 1, op_type_col_width)
 
-  @test_util.run_deprecated_v1
   def testListTensors(self):
     # Use shorthand alias for the command prefix.
     out = self._registry.dispatch_command("lt", [])
@@ -675,7 +674,6 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
     # Check the main menu.
     check_main_menu(self, out, list_tensors_enabled=False)
 
-  @test_util.run_deprecated_v1
   def testListTensorsInReverseTimeOrderWorks(self):
     # Use shorthand alias for the command prefix.
     out = self._registry.dispatch_command("lt", ["-s", "timestamp", "-r"])
@@ -691,7 +689,6 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
         reverse=True)
     check_main_menu(self, out, list_tensors_enabled=False)
 
-  @test_util.run_deprecated_v1
   def testListTensorsInDumpSizeOrderWorks(self):
     out = self._registry.dispatch_command("lt", ["-s", "dump_size"])
     assert_listed_tensors(
@@ -705,7 +702,6 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
         sort_by="dump_size")
     check_main_menu(self, out, list_tensors_enabled=False)
 
-  @test_util.run_deprecated_v1
   def testListTensorsInReverseDumpSizeOrderWorks(self):
     out = self._registry.dispatch_command("lt", ["-s", "dump_size", "-r"])
     assert_listed_tensors(
@@ -725,7 +721,6 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
     self.assertIn("ValueError: Unsupported key to sort tensors by: foobar",
                   out.lines)
 
-  @test_util.run_deprecated_v1
   def testListTensorsInOpTypeOrderWorks(self):
     # Use shorthand alias for the command prefix.
     out = self._registry.dispatch_command("lt", ["-s", "op_type"])
@@ -741,7 +736,6 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
         reverse=False)
     check_main_menu(self, out, list_tensors_enabled=False)
 
-  @test_util.run_deprecated_v1
   def testListTensorsInReverseOpTypeOrderWorks(self):
     # Use shorthand alias for the command prefix.
     out = self._registry.dispatch_command("lt", ["-s", "op_type", "-r"])
@@ -757,7 +751,6 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
         reverse=True)
     check_main_menu(self, out, list_tensors_enabled=False)
 
-  @test_util.run_deprecated_v1
   def testListTensorsInTensorNameOrderWorks(self):
     # Use shorthand alias for the command prefix.
     out = self._registry.dispatch_command("lt", ["-s", "tensor_name"])
@@ -773,7 +766,6 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
         reverse=False)
     check_main_menu(self, out, list_tensors_enabled=False)
 
-  @test_util.run_deprecated_v1
   def testListTensorsInReverseTensorNameOrderWorks(self):
     # Use shorthand alias for the command prefix.
     out = self._registry.dispatch_command("lt", ["-s", "tensor_name", "-r"])
@@ -789,7 +781,6 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
         reverse=True)
     check_main_menu(self, out, list_tensors_enabled=False)
 
-  @test_util.run_deprecated_v1
   def testListTensorsFilterByNodeNameRegex(self):
     out = self._registry.dispatch_command("list_tensors",
                                           ["--node_name_filter", ".*read.*"])
@@ -803,7 +794,6 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
     assert_listed_tensors(self, out, [], [], node_name_regex="^read")
     check_main_menu(self, out, list_tensors_enabled=False)
 
-  @test_util.run_deprecated_v1
   def testListTensorFilterByOpTypeRegex(self):
     out = self._registry.dispatch_command("list_tensors",
                                           ["--op_type_filter", "Identity"])
@@ -832,7 +822,6 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
         op_type_regex="(Add|MatMul)")
     check_main_menu(self, out, list_tensors_enabled=False)
 
-  @test_util.run_deprecated_v1
   def testListTensorWithFilterAndNodeNameExclusionWorks(self):
     # First, create and register the filter.
     def is_2x1_vector(datum, tensor):
@@ -889,7 +878,6 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
     out = self._registry.dispatch_command("list_tensors", ["--bar"])
     check_syntax_error_output(self, out, "list_tensors")
 
-  @test_util.run_deprecated_v1
   def testNodeInfoByNodeName(self):
     node_name = "simple_mul_add/matmul"
     out = self._registry.dispatch_command("node_info", [node_name])
@@ -914,7 +902,6 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
         [(len(out.lines[0]) - len(node_name), len(out.lines[0]), "bold")],
         out.font_attr_segs[0])
 
-  @test_util.run_deprecated_v1
   def testNodeInfoShowAttributes(self):
     node_name = "simple_mul_add/matmul"
     out = self._registry.dispatch_command("node_info", ["-a", node_name])
@@ -938,7 +925,6 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
         print_tensor_node_name=node_name,
         list_outputs_node_name=node_name)
 
-  @test_util.run_deprecated_v1
   def testNodeInfoShowDumps(self):
     node_name = "simple_mul_add/matmul"
     out = self._registry.dispatch_command("node_info", ["-d", node_name])
@@ -963,7 +949,6 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
                     len(out.lines[16]) - len(out.lines[16].strip()),
                     len(out.lines[16]), "pt %s:0 -n 0" % node_name)
 
-  @test_util.run_deprecated_v1
   def testNodeInfoShowStackTraceUnavailableIsIndicated(self):
     self._debug_dump.set_python_graph(None)
 
@@ -987,7 +972,6 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
         print_tensor_node_name=node_name,
         list_outputs_node_name=node_name)
 
-  @test_util.run_deprecated_v1
   def testNodeInfoShowStackTraceAvailableWorks(self):
     self._debug_dump.set_python_graph(self._sess.graph)
 
@@ -1011,7 +995,6 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
         print_tensor_node_name=node_name,
         list_outputs_node_name=node_name)
 
-  @test_util.run_deprecated_v1
   def testNodeInfoByTensorName(self):
     node_name = "simple_mul_add/u/read"
     tensor_name = node_name + ":0"
@@ -1381,7 +1364,6 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
         break
     return index
 
-  @test_util.run_deprecated_v1
   def testPrintSourceForOpNamesWholeFileWorks(self):
     self._debug_dump.set_python_graph(self._sess.graph)
     out = self._registry.dispatch_command(
@@ -1434,7 +1416,6 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
     self.assertEqual("pt simple_mul_add/add",
                      out.font_attr_segs[index + 1][0][2].content)
 
-  @test_util.run_deprecated_v1
   def testPrintSourceForTensorNamesWholeFileWorks(self):
     self._debug_dump.set_python_graph(self._sess.graph)
     out = self._registry.dispatch_command(
@@ -1455,7 +1436,6 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
     self.assertEqual("pt simple_mul_add/u:0",
                      out.font_attr_segs[index + 2][0][2].content)
 
-  @test_util.run_deprecated_v1
   def testPrintSourceForOpNamesStartingAtSpecifiedLineWorks(self):
     self._debug_dump.set_python_graph(self._sess.graph)
     out = self._registry.dispatch_command(
@@ -1482,7 +1462,6 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
     self.assertEqual("pt simple_mul_add/u/read",
                      out.font_attr_segs[index + 3][0][2].content)
 
-  @test_util.run_deprecated_v1
   def testPrintSourceForOpNameSettingMaximumElementCountWorks(self):
     self._debug_dump.set_python_graph(self._sess.graph)
     out = self._registry.dispatch_command(
@@ -1527,7 +1506,6 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
         self.assertTrue(cli_shared.COLOR_GRAY in attr_seg[2] or
                         attr_seg[2] == cli_shared.COLOR_GRAY)
 
-  @test_util.run_deprecated_v1
   def testListSourceWithNodeNameFilterWithMatchesWorks(self):
     self._debug_dump.set_python_graph(self._sess.graph)
     out = self._registry.dispatch_command("list_source", ["-n", ".*/read"])
@@ -1691,6 +1669,7 @@ class AnalyzerCLIPrintLargeTensorTest(test_util.TensorFlowTestCase):
     self.assertNotIn("...,", out.lines[4])
 
 
+@test_util.run_v1_only("b/120545219")
 class AnalyzerCLIControlDepTest(test_util.TensorFlowTestCase):
 
   @classmethod
@@ -1742,7 +1721,6 @@ class AnalyzerCLIControlDepTest(test_util.TensorFlowTestCase):
     # Tear down temporary dump directory.
     shutil.rmtree(cls._dump_root)
 
-  @test_util.run_deprecated_v1
   def testNodeInfoWithControlDependencies(self):
     # Call node_info on a node with control inputs.
     out = self._registry.dispatch_command("node_info",
@@ -1783,7 +1761,6 @@ class AnalyzerCLIControlDepTest(test_util.TensorFlowTestCase):
                     len(out.lines[z_line]),
                     "ni -a -d -t control_deps/ctrl_dep_z")
 
-  @test_util.run_deprecated_v1
   def testListInputsNonRecursiveNoControl(self):
     """List inputs non-recursively, without any control inputs."""
 
@@ -1826,7 +1803,6 @@ class AnalyzerCLIControlDepTest(test_util.TensorFlowTestCase):
                     len(out.lines[3]) - len("control_deps/ctrl_dep_y"),
                     len(out.lines[3]), "li -c -r control_deps/ctrl_dep_y")
 
-  @test_util.run_deprecated_v1
   def testListInputsNonRecursiveNoControlUsingTensorName(self):
     """List inputs using the name of an output tensor of the node."""
 
@@ -1855,7 +1831,6 @@ class AnalyzerCLIControlDepTest(test_util.TensorFlowTestCase):
                     len(out.lines[3]) - len("control_deps/ctrl_dep_y"),
                     len(out.lines[3]), "li -c -r control_deps/ctrl_dep_y")
 
-  @test_util.run_deprecated_v1
   def testListInputsNonRecursiveWithControls(self):
     """List inputs non-recursively, with control inputs."""
     node_name = "control_deps/ctrl_dep_z"
@@ -1886,7 +1861,6 @@ class AnalyzerCLIControlDepTest(test_util.TensorFlowTestCase):
                     len(out.lines[5]) - len("control_deps/x"),
                     len(out.lines[5]), "li -c -r control_deps/x")
 
-  @test_util.run_deprecated_v1
   def testListInputsRecursiveWithControls(self):
     """List inputs recursively, with control inputs."""
     node_name = "control_deps/ctrl_dep_z"
@@ -1932,7 +1906,6 @@ class AnalyzerCLIControlDepTest(test_util.TensorFlowTestCase):
                     len(out.lines[18]) - len("control_deps/x"),
                     len(out.lines[18]), "li -c -r control_deps/x")
 
-  @test_util.run_deprecated_v1
   def testListInputsRecursiveWithControlsWithDepthLimit(self):
     """List inputs recursively, with control inputs and a depth limit."""
     node_name = "control_deps/ctrl_dep_z"
@@ -1992,7 +1965,6 @@ class AnalyzerCLIControlDepTest(test_util.TensorFlowTestCase):
         "ERROR: There is no node named \"control_deps/z/foo\" in the "
         "partition graphs"], out.lines)
 
-  @test_util.run_deprecated_v1
   def testListRecipientsRecursiveWithControlsWithDepthLimit(self):
     """List recipients recursively, with control inputs and a depth limit."""
 
@@ -2025,6 +1997,7 @@ class AnalyzerCLIControlDepTest(test_util.TensorFlowTestCase):
                      out.font_attr_segs[0])
 
 
+@test_util.run_v1_only("b/120545219")
 class AnalyzerCLIWhileLoopTest(test_util.TensorFlowTestCase):
 
   @classmethod
@@ -2064,7 +2037,6 @@ class AnalyzerCLIWhileLoopTest(test_util.TensorFlowTestCase):
     # Tear down temporary dump directory.
     shutil.rmtree(cls._dump_root)
 
-  @test_util.run_deprecated_v1
   def testMultipleDumpsPrintTensorNoNumber(self):
     output = self._registry.dispatch_command("pt", ["while/Identity:0"])
 
@@ -2082,7 +2054,6 @@ class AnalyzerCLIWhileLoopTest(test_util.TensorFlowTestCase):
     self.assertEqual("For example:", output.lines[-2])
     self.assertEqual("  print_tensor while/Identity:0 -n 0", output.lines[-1])
 
-  @test_util.run_deprecated_v1
   def testMultipleDumpsPrintTensorWithNumber(self):
     for i in xrange(5):
       output = self._registry.dispatch_command(
@@ -2096,7 +2067,6 @@ class AnalyzerCLIWhileLoopTest(test_util.TensorFlowTestCase):
       self.assertTrue(output.lines[4].startswith("array(%d" % i))
       self.assertTrue(output.lines[4].endswith(")"))
 
-  @test_util.run_deprecated_v1
   def testMultipleDumpsPrintTensorInvalidNumber(self):
     output = self._registry.dispatch_command("pt",
                                              ["while/Identity:0", "-n", "10"])
diff --git a/tensorflow/python/debug/cli/cli_shared_test.py b/tensorflow/python/debug/cli/cli_shared_test.py
index d191a234fde730bfd03f80e008d210f8588889ef..66a12efda53470b33edf4788984e632bfe55f2b9 100644
--- a/tensorflow/python/debug/cli/cli_shared_test.py
+++ b/tensorflow/python/debug/cli/cli_shared_test.py
@@ -105,6 +105,7 @@ class TimeToReadableStrTest(test_util.TensorFlowTestCase):
       cli_shared.time_to_readable_str(100, force_time_unit="ks")
 
 
+@test_util.run_v1_only("b/120545219")
 class GetRunStartIntroAndDescriptionTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
@@ -334,6 +335,7 @@ class GetRunStartIntroAndDescriptionTest(test_util.TensorFlowTestCase):
     self.assertEqual("run #1: 1 fetch (a:0); 1 feed (foo)", short_description)
 
 
+@test_util.run_v1_only("b/120545219")
 class GetErrorIntroTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
@@ -342,7 +344,6 @@ class GetErrorIntroTest(test_util.TensorFlowTestCase):
   def tearDown(self):
     ops.reset_default_graph()
 
-  @test_util.run_deprecated_v1
   def testShapeError(self):
     tf_error = errors.OpError(None, self.var_a.initializer, "foo description",
                               None)
diff --git a/tensorflow/python/debug/cli/profile_analyzer_cli_test.py b/tensorflow/python/debug/cli/profile_analyzer_cli_test.py
index effcd500c7032fc5d545205a09070c38f20f84bb..d6d2b58b5f8138643bb4b9886da01b72295b5df7 100644
--- a/tensorflow/python/debug/cli/profile_analyzer_cli_test.py
+++ b/tensorflow/python/debug/cli/profile_analyzer_cli_test.py
@@ -70,6 +70,7 @@ def _assert_no_lines_match(pattern, lines):
         "%s matched at least one line in %s." % (pattern, str(lines)))
 
 
+@test_util.run_v1_only("b/120545219")
 class ProfileAnalyzerListProfileTest(test_util.TensorFlowTestCase):
 
   def testNodeInfoEmpty(self):
@@ -321,6 +322,7 @@ class ProfileAnalyzerListProfileTest(test_util.TensorFlowTestCase):
     _assert_at_least_one_line_matches(r"Device Total.*0\.009ms", prof_output)
 
 
+@test_util.run_v1_only("b/120545219")
 class ProfileAnalyzerPrintSourceTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
@@ -348,7 +350,6 @@ class ProfileAnalyzerPrintSourceTest(test_util.TensorFlowTestCase):
     ops.reset_default_graph()
     super(ProfileAnalyzerPrintSourceTest, self).tearDown()
 
-  @test_util.run_deprecated_v1
   def testPrintSourceForWhileLoop(self):
     prof_output = self.prof_analyzer.print_source([__file__])
 
@@ -362,7 +363,6 @@ class ProfileAnalyzerPrintSourceTest(test_util.TensorFlowTestCase):
         r"\[(\|)+(\s)*\] .*us .*7\(55\) .*L%d.*(\S)+" % self.loop_lineno,
         prof_output.lines)
 
-  @test_util.run_deprecated_v1
   def testPrintSourceOutputContainsClickableLinks(self):
     prof_output = self.prof_analyzer.print_source([__file__])
     any_match, line_index = _at_least_one_line_matches(
@@ -379,7 +379,6 @@ class ProfileAnalyzerPrintSourceTest(test_util.TensorFlowTestCase):
         break
     self.assertTrue(any_menu_item_match)
 
-  @test_util.run_deprecated_v1
   def testPrintSourceWithNonDefaultTimeUnit(self):
     prof_output = self.prof_analyzer.print_source([
         __file__, "--time_unit", "ms"])
@@ -394,7 +393,6 @@ class ProfileAnalyzerPrintSourceTest(test_util.TensorFlowTestCase):
         r"\[(\|)+(\s)*\] .*ms .*7\(55\) .*L%d.*(\S)+" % self.loop_lineno,
         prof_output.lines)
 
-  @test_util.run_deprecated_v1
   def testPrintSourceWithNodeNameFilter(self):
     prof_output = self.prof_analyzer.print_source([
         __file__, "--node_name_filter", "x$"])
@@ -427,7 +425,6 @@ class ProfileAnalyzerPrintSourceTest(test_util.TensorFlowTestCase):
         break
     self.assertTrue(any_menu_item_match)
 
-  @test_util.run_deprecated_v1
   def testPrintSourceWithOpTypeFilter(self):
     prof_output = self.prof_analyzer.print_source([
         __file__, "--op_type_filter", "Less"])
diff --git a/tensorflow/python/debug/cli/stepper_cli_test.py b/tensorflow/python/debug/cli/stepper_cli_test.py
index 7b8a42c25380dde8bc2ce0d34eb79f2ddd54922f..5cf69d0168b70a4d03162512b5024736c50cf23a 100644
--- a/tensorflow/python/debug/cli/stepper_cli_test.py
+++ b/tensorflow/python/debug/cli/stepper_cli_test.py
@@ -129,6 +129,7 @@ def _parse_updated(lines):
   return updated
 
 
+@test_util.run_v1_only("b/120545219")
 class NodeStepperSimpleGraphTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
diff --git a/tensorflow/python/debug/examples/debug_errors.py b/tensorflow/python/debug/examples/debug_errors.py
index 28abc9734370630b864da4f693cbddd88c382502..e3692072cc558fa11a47daafb6fb0834d70ee654 100644
--- a/tensorflow/python/debug/examples/debug_errors.py
+++ b/tensorflow/python/debug/examples/debug_errors.py
@@ -77,4 +77,5 @@ if __name__ == "__main__":
       default=False,
       help="Use debugger to track down bad values during training")
   FLAGS, unparsed = parser.parse_known_args()
-  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
+  with tf.Graph().as_default():
+    tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/python/debug/examples/debug_fibonacci.py b/tensorflow/python/debug/examples/debug_fibonacci.py
index 3821b393ec6847db71b7c4b7396b1ed448ae9538..777fb089881a069e403eb897f4efabcff815e2bf 100644
--- a/tensorflow/python/debug/examples/debug_fibonacci.py
+++ b/tensorflow/python/debug/examples/debug_fibonacci.py
@@ -100,4 +100,5 @@ if __name__ == "__main__":
       "--debug flag.")
 
   FLAGS, unparsed = parser.parse_known_args()
-  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
+  with tf.Graph().as_default():
+    tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/python/debug/examples/debug_keras.py b/tensorflow/python/debug/examples/debug_keras.py
index 3272d85ade957b254b2c1a0977156179cd71bb9d..019121fa0a61a4e69ce370bac23c4575a27a72c9 100644
--- a/tensorflow/python/debug/examples/debug_keras.py
+++ b/tensorflow/python/debug/examples/debug_keras.py
@@ -86,4 +86,5 @@ if __name__ == "__main__":
       default=2,
       help="Number of epochs to train the model for.")
   FLAGS, unparsed = parser.parse_known_args()
-  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
+  with tf.Graph().as_default():
+    tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/python/debug/examples/debug_mnist.py b/tensorflow/python/debug/examples/debug_mnist.py
index ab1c90371cd18bbaf278b72248bcc7e9e9c34b06..09fb06c9c065f544a4c9bb47b96157704a8306e2 100644
--- a/tensorflow/python/debug/examples/debug_mnist.py
+++ b/tensorflow/python/debug/examples/debug_mnist.py
@@ -190,4 +190,5 @@ if __name__ == "__main__":
       "the gRPC address (e.g., localhost:1234). Mutually exclusive with the "
       "--debug flag.")
   FLAGS, unparsed = parser.parse_known_args()
-  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
+  with tf.Graph().as_default():
+    tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/python/debug/lib/debug_gradients_test.py b/tensorflow/python/debug/lib/debug_gradients_test.py
index 1c531478638d9a84cc8083b32689ba44abcc0bb7..885691c3ef71ba995ec3ab38e2d1bda7e1e30b1a 100644
--- a/tensorflow/python/debug/lib/debug_gradients_test.py
+++ b/tensorflow/python/debug/lib/debug_gradients_test.py
@@ -36,6 +36,7 @@ from tensorflow.python.platform import googletest
 from tensorflow.python.training import gradient_descent
 
 
+@test_util.run_v1_only("b/120545219")
 class IdentifyGradientTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
@@ -54,7 +55,6 @@ class IdentifyGradientTest(test_util.TensorFlowTestCase):
     ops.reset_default_graph()
     debug_gradients.clear_gradient_debuggers()
 
-  @test_util.run_deprecated_v1
   def testIdentifyGradientGivesCorrectTensorObjectWithoutContextManager(self):
     grad_debugger = debug_gradients.GradientsDebugger()
     id_grad_w = grad_debugger.identify_gradient(self.w)
@@ -85,7 +85,6 @@ class IdentifyGradientTest(test_util.TensorFlowTestCase):
     self.assertIsInstance(w_grad, ops.Tensor)
     self.assertAllClose(1.0, self.sess.run(w_grad))
 
-  @test_util.run_deprecated_v1
   def testIdentifyGradientGivesCorrectTensorObjectWithTfGradients(self):
     grad_debugger = debug_gradients.GradientsDebugger()
     id_grad_w = grad_debugger.identify_gradient(self.w)
@@ -117,7 +116,6 @@ class IdentifyGradientTest(test_util.TensorFlowTestCase):
     self.assertIsInstance(w_grad, ops.Tensor)
     self.assertAllClose(1.0, self.sess.run(w_grad))
 
-  @test_util.run_deprecated_v1
   def testCallingIdentifyGradientTwiceWithTheSameGradientsDebuggerErrors(self):
     grad_debugger = debug_gradients.GradientsDebugger()
     grad_debugger.identify_gradient(self.w)
@@ -125,7 +123,6 @@ class IdentifyGradientTest(test_util.TensorFlowTestCase):
                                  "The graph already contains an op named .*"):
       grad_debugger.identify_gradient(self.w)
 
-  @test_util.run_deprecated_v1
   def testIdentifyGradientWorksOnMultipleLosses(self):
     grad_debugger_1 = debug_gradients.GradientsDebugger()
     grad_debugger_2 = debug_gradients.GradientsDebugger()
@@ -154,7 +151,6 @@ class IdentifyGradientTest(test_util.TensorFlowTestCase):
     self.assertAllClose(2.0 * 5.0, self.sess.run(dz1_dy))
     self.assertAllClose(0.5 * (5.0**-0.5), self.sess.run(dz2_dy))
 
-  @test_util.run_deprecated_v1
   def testIdentifyGradientRaisesLookupErrorForUnknownXTensor(self):
     grad_debugger_1 = debug_gradients.GradientsDebugger()
     grad_debugger_2 = debug_gradients.GradientsDebugger()
@@ -175,7 +171,6 @@ class IdentifyGradientTest(test_util.TensorFlowTestCase):
         r"This GradientsDebugger has not received any gradient tensor for "):
       grad_debugger_2.gradient_tensor(self.w)
 
-  @test_util.run_deprecated_v1
   def testIdentifyGradientRaisesTypeErrorForNonTensorOrTensorNameInput(self):
     grad_debugger = debug_gradients.GradientsDebugger()
     with self.assertRaisesRegexp(
@@ -184,7 +179,6 @@ class IdentifyGradientTest(test_util.TensorFlowTestCase):
         r"has type .*Operation.*"):
       grad_debugger.gradient_tensor(variables.global_variables_initializer())
 
-  @test_util.run_deprecated_v1
   def testIdentifyGradientTensorWorksWithGradientDescentOptimizer(self):
     grad_debugger = debug_gradients.GradientsDebugger()
     id_grad_w = grad_debugger.identify_gradient(self.w)
@@ -200,7 +194,6 @@ class IdentifyGradientTest(test_util.TensorFlowTestCase):
     self.assertIsInstance(w_grad, ops.Tensor)
     self.assertAllClose(1.0, self.sess.run(w_grad))
 
-  @test_util.run_deprecated_v1
   def testWatchGradientsByXTensorNamesWorks(self):
     y = math_ops.add(self.w, -1.0, name="y")
 
@@ -227,7 +220,6 @@ class IdentifyGradientTest(test_util.TensorFlowTestCase):
     self.assertIsInstance(w_grad, ops.Tensor)
     self.assertAllClose(1.0, self.sess.run(w_grad))
 
-  @test_util.run_deprecated_v1
   def testWatchGradientsByXTensorNamesWorksWithoutContextManager(self):
     y = math_ops.add(self.w, -1.0, name="y")
 
@@ -254,7 +246,6 @@ class IdentifyGradientTest(test_util.TensorFlowTestCase):
     self.assertIsInstance(w_grad, ops.Tensor)
     self.assertAllClose(1.0, self.sess.run(w_grad))
 
-  @test_util.run_deprecated_v1
   def testWatchGradientsWorksOnRefTensor(self):
     y = math_ops.add(self.w, -1.0, name="y")
 
@@ -273,7 +264,6 @@ class IdentifyGradientTest(test_util.TensorFlowTestCase):
     self.assertAllClose(3.0, self.sess.run(
         grad_debugger.gradient_tensor("u:0")))
 
-  @test_util.run_deprecated_v1
   def testWatchGradientsWorksOnMultipleTensors(self):
     y = math_ops.add(self.w, -1.0, name="y")
 
@@ -294,7 +284,6 @@ class IdentifyGradientTest(test_util.TensorFlowTestCase):
     self.assertAllClose(3.0, self.sess.run(
         grad_debugger.gradient_tensor("u:0")))
 
-  @test_util.run_deprecated_v1
   def testWatchGradientsByXTensorsWorks(self):
     y = math_ops.add(self.w, -1.0, name="foo/y")
     z = math_ops.square(y, name="foo/z")
@@ -317,7 +306,6 @@ class IdentifyGradientTest(test_util.TensorFlowTestCase):
     self.assertAllClose(10.0, self.sess.run(w_grad))
     self.assertAllClose(30.0, self.sess.run(u_grad))
 
-  @test_util.run_deprecated_v1
   def testWatchGradientsByTensorCanWorkOnMultipleLosses(self):
     y = math_ops.add(self.w, -1.0, name="y")
     z1 = math_ops.square(y, name="z1")
@@ -343,7 +331,6 @@ class IdentifyGradientTest(test_util.TensorFlowTestCase):
     self.assertAllClose(2.0 * 5.0, self.sess.run(dz1_dy))
     self.assertAllClose(0.5 * (5.0**-0.5), self.sess.run(dz2_dy))
 
-  @test_util.run_deprecated_v1
   def testGradientsValuesFromDumpWorks(self):
     y = math_ops.add(self.w, -1.0, name="y")
     z = math_ops.square(y, name="z")
diff --git a/tensorflow/python/debug/lib/debug_utils_test.py b/tensorflow/python/debug/lib/debug_utils_test.py
index cf59b30e3dab4493bc846b73bbd768821d32751c..9d59cfc1792a8df472998e115dc01387a9ba3cdf 100644
--- a/tensorflow/python/debug/lib/debug_utils_test.py
+++ b/tensorflow/python/debug/lib/debug_utils_test.py
@@ -185,7 +185,7 @@ class DebugUtilsTest(test_util.TensorFlowTestCase):
     self.assertEqual(["file:///tmp/tfdbg_1", "file:///tmp/tfdbg_2"],
                      watch_0.debug_urls)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testWatchGraph_allNodes(self):
     debug_utils.watch_graph(
         self._run_options,
@@ -217,7 +217,7 @@ class DebugUtilsTest(test_util.TensorFlowTestCase):
     self.assertTrue("p1" in node_names)
     self.assertTrue("s" in node_names)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testWatchGraph_nodeNameWhitelist(self):
     debug_utils.watch_graph(
         self._run_options,
@@ -232,7 +232,7 @@ class DebugUtilsTest(test_util.TensorFlowTestCase):
         sorted(["a1_init", "a1", "a1/Assign", "a1/read", "p1"]),
         sorted(node_names))
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testWatchGraph_opTypeWhitelist(self):
     debug_utils.watch_graph(
         self._run_options,
@@ -258,7 +258,7 @@ class DebugUtilsTest(test_util.TensorFlowTestCase):
         ["DebugIdentity"], ["file:///tmp/tfdbg_1"])
     self.assertEqual(["p1"], node_names)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testWatchGraph_tensorDTypeWhitelist(self):
     debug_utils.watch_graph(
         self._run_options,
@@ -271,7 +271,7 @@ class DebugUtilsTest(test_util.TensorFlowTestCase):
         ["DebugIdentity"], ["file:///tmp/tfdbg_1"])
     self.assertItemsEqual(["a1", "a1/Assign", "b", "b/Assign"], node_names)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testWatchGraph_nodeNameAndTensorDTypeWhitelists(self):
     debug_utils.watch_graph(
         self._run_options,
@@ -285,7 +285,7 @@ class DebugUtilsTest(test_util.TensorFlowTestCase):
         ["DebugIdentity"], ["file:///tmp/tfdbg_1"])
     self.assertItemsEqual(["a1", "a1/Assign"], node_names)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testWatchGraph_nodeNameBlacklist(self):
     debug_utils.watch_graph_with_blacklists(
         self._run_options,
@@ -300,7 +300,7 @@ class DebugUtilsTest(test_util.TensorFlowTestCase):
         sorted(["b_init", "b", "b/Assign", "b/read", "c", "s"]),
         sorted(node_names))
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testWatchGraph_opTypeBlacklist(self):
     debug_utils.watch_graph_with_blacklists(
         self._run_options,
@@ -313,7 +313,7 @@ class DebugUtilsTest(test_util.TensorFlowTestCase):
         ["DebugIdentity"], ["file:///tmp/tfdbg_1"])
     self.assertEqual(sorted(["p1", "s"]), sorted(node_names))
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testWatchGraph_nodeNameAndOpTypeBlacklists(self):
     debug_utils.watch_graph_with_blacklists(
         self._run_options,
@@ -327,7 +327,7 @@ class DebugUtilsTest(test_util.TensorFlowTestCase):
         ["DebugIdentity"], ["file:///tmp/tfdbg_1"])
     self.assertEqual(["s"], node_names)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testWatchGraph_tensorDTypeBlacklists(self):
     debug_utils.watch_graph_with_blacklists(
         self._run_options,
@@ -344,7 +344,7 @@ class DebugUtilsTest(test_util.TensorFlowTestCase):
     self.assertNotIn("b/Assign", node_names)
     self.assertIn("s", node_names)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testWatchGraph_nodeNameAndTensorDTypeBlacklists(self):
     debug_utils.watch_graph_with_blacklists(
         self._run_options,
diff --git a/tensorflow/python/debug/lib/dist_session_debug_grpc_test.py b/tensorflow/python/debug/lib/dist_session_debug_grpc_test.py
index 74498c8ea3dd494cd8fc6237b60b11a202497990..2405e29aaa51c2e0c422fa6f950ec46553ae75c0 100644
--- a/tensorflow/python/debug/lib/dist_session_debug_grpc_test.py
+++ b/tensorflow/python/debug/lib/dist_session_debug_grpc_test.py
@@ -44,6 +44,7 @@ from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging
 
 
+@test_util.run_v1_only("b/120545219")
 class DistributedSessionDebugTest(test_util.TensorFlowTestCase):
   """Test the debugging of distributed sessions."""
 
diff --git a/tensorflow/python/debug/lib/session_debug_file_test.py b/tensorflow/python/debug/lib/session_debug_file_test.py
index f5f9ba29ab56e6fbcb8e4f2beea70130bdbff926..16ab815d92ddffe2108776388f668427fd140f06 100644
--- a/tensorflow/python/debug/lib/session_debug_file_test.py
+++ b/tensorflow/python/debug/lib/session_debug_file_test.py
@@ -34,6 +34,7 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 
 
+@test_util.run_v1_only("b/120545219")
 class SessionDebugFileTest(session_debug_testlib.SessionDebugTestBase):
 
   def _debug_urls(self, run_number=None):
@@ -45,7 +46,6 @@ class SessionDebugFileTest(session_debug_testlib.SessionDebugTestBase):
     else:
       return os.path.join(self._dump_root, "run_%d" % run_number)
 
-  @test_util.run_deprecated_v1
   def testAllowsDifferentWatchesOnDifferentRuns(self):
     """Test watching different tensors on different runs of the same graph."""
 
diff --git a/tensorflow/python/debug/lib/session_debug_grpc_test.py b/tensorflow/python/debug/lib/session_debug_grpc_test.py
index bfc9a3a382744676fafe9f280ab54f8dee3fedcb..472e2449156fefc2c00bb4079018de224097692e 100644
--- a/tensorflow/python/debug/lib/session_debug_grpc_test.py
+++ b/tensorflow/python/debug/lib/session_debug_grpc_test.py
@@ -91,6 +91,7 @@ class GrpcDebugServerTest(test_util.TensorFlowTestCase):
     server.stop_server().wait()
 
 
+@test_util.run_v1_only("b/120545219")
 class SessionDebugGrpcTest(session_debug_testlib.SessionDebugTestBase):
 
   @classmethod
@@ -353,6 +354,7 @@ class SessionDebugConcurrentTest(
     return urls
 
 
+@test_util.run_v1_only("b/120545219")
 class SessionDebugGrpcGatingTest(test_util.TensorFlowTestCase):
   """Test server gating of debug ops."""
 
@@ -730,6 +732,7 @@ class SessionDebugGrpcGatingTest(test_util.TensorFlowTestCase):
       self.assertEqual("DebugNumericSummary", debug_watch.debug_op)
 
 
+@test_util.run_v1_only("b/120545219")
 class DelayedDebugServerTest(test_util.TensorFlowTestCase):
 
   def testDebuggedSessionRunWorksWithDelayedDebugServerStartup(self):
diff --git a/tensorflow/python/debug/lib/session_debug_testlib.py b/tensorflow/python/debug/lib/session_debug_testlib.py
index 25ef91b575957164691bccd9d15107d9a4812eac..5165febff52506d07e2d3b0aea361c31567cc419 100644
--- a/tensorflow/python/debug/lib/session_debug_testlib.py
+++ b/tensorflow/python/debug/lib/session_debug_testlib.py
@@ -84,6 +84,7 @@ class _RNNCellForTest(rnn_cell_impl.RNNCell):
     return (math_ops.multiply(self._w, input_), state)
 
 
+@test_util.run_v1_only("b/120545219")
 class SessionDebugTestBase(test_util.TensorFlowTestCase):
   """Base class for unit tests of tfdbg running with tf.Session."""
 
diff --git a/tensorflow/python/debug/lib/source_utils_test.py b/tensorflow/python/debug/lib/source_utils_test.py
index 9083297fdbb661f4dc5bfb6193712e21ad42340b..4f4aea032132d09f025392587038b79d7f0804c5 100644
--- a/tensorflow/python/debug/lib/source_utils_test.py
+++ b/tensorflow/python/debug/lib/source_utils_test.py
@@ -216,6 +216,7 @@ class SourceHelperTest(test_util.TensorFlowTestCase):
     os.remove(unrelated_source_path)
 
 
+@test_util.run_v1_only("b/120545219")
 class ListSourceAgainstDumpTest(test_util.TensorFlowTestCase):
 
   def createAndRunGraphWithWhileLoop(self):
diff --git a/tensorflow/python/debug/lib/stepper_test.py b/tensorflow/python/debug/lib/stepper_test.py
index 3839c671982f80158273ea40de73ff920306316d..9e78e207b80a99f3812c5909cf3753d90eab3680 100644
--- a/tensorflow/python/debug/lib/stepper_test.py
+++ b/tensorflow/python/debug/lib/stepper_test.py
@@ -33,6 +33,7 @@ from tensorflow.python.platform import googletest
 from tensorflow.python.training import gradient_descent
 
 
+@test_util.run_v1_only("b/120545219")
 class StepperTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
@@ -443,6 +444,7 @@ class StepperTest(test_util.TensorFlowTestCase):
           self.assertAllClose(-4.0, result["fz"]["z"])
 
 
+@test_util.run_v1_only("b/120545219")
 class StepperTestWithPlaceHolders(test_util.TensorFlowTestCase):
 
   def setUp(self):
@@ -577,6 +579,7 @@ class StepperTestWithPlaceHolders(test_util.TensorFlowTestCase):
       self.assertAllClose([[-1.0], [6.0]], stepper.finalize())
 
 
+@test_util.run_v1_only("b/120545219")
 class StepperAssignAddTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
@@ -692,6 +695,7 @@ class StepperAssignAddTest(test_util.TensorFlowTestCase):
       self.assertAllClose(12.0, stepper.cont(self.v))
 
 
+@test_util.run_v1_only("b/120545219")
 class StepperBackwardRunTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
diff --git a/tensorflow/python/debug/wrappers/disk_usage_test.py b/tensorflow/python/debug/wrappers/disk_usage_test.py
index 0874525966ceb34b9cb99df9affd63cf1865b663..88b1cd540de7a6a56db6e5165be53ae8c9c2df26 100644
--- a/tensorflow/python/debug/wrappers/disk_usage_test.py
+++ b/tensorflow/python/debug/wrappers/disk_usage_test.py
@@ -32,6 +32,7 @@ from tensorflow.python.platform import googletest
 from tensorflow.python.training import monitored_session
 
 
+@test_util.run_v1_only("b/120545219")
 class DumpingDebugWrapperDiskUsageLimitTest(test_util.TensorFlowTestCase):
 
   @classmethod
diff --git a/tensorflow/python/debug/wrappers/dumping_wrapper_test.py b/tensorflow/python/debug/wrappers/dumping_wrapper_test.py
index 11011a5c1342b281ab86c7f861d895f570bd037d..42e3b09382d825840ea12eeaf2baf35f33c17da9 100644
--- a/tensorflow/python/debug/wrappers/dumping_wrapper_test.py
+++ b/tensorflow/python/debug/wrappers/dumping_wrapper_test.py
@@ -41,6 +41,7 @@ from tensorflow.python.platform import googletest
 from tensorflow.python.training import monitored_session
 
 
+@test_util.run_v1_only("b/120545219")
 class DumpingDebugWrapperSessionTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
diff --git a/tensorflow/python/debug/wrappers/framework_test.py b/tensorflow/python/debug/wrappers/framework_test.py
index 68584b4ede46f2e61310c262d543837b71542de4..a50fa7cf4b870868a61ea4df173fc24bc8a8e110 100644
--- a/tensorflow/python/debug/wrappers/framework_test.py
+++ b/tensorflow/python/debug/wrappers/framework_test.py
@@ -141,6 +141,7 @@ class TestDebugWrapperSessionBadAction(framework.BaseDebugWrapperSession):
     return framework.OnRunEndResponse()
 
 
+@test_util.run_v1_only("b/120545219")
 class DebugWrapperSessionTest(test_util.TensorFlowTestCase):
 
   def _no_rewrite_session_config(self):
diff --git a/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py b/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py
index 149a7497df8fecc19a665afc1483ad55c890c335..e38df861f5b633baf94c99e4892e1bd90943337d 100644
--- a/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py
+++ b/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py
@@ -127,6 +127,7 @@ class LocalCLIDebuggerWrapperSessionForTest(
         return e.exit_token
 
 
+@test_util.run_v1_only("b/120545219")
 class LocalCLIDebugWrapperSessionTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD
index 2d9a1764db78ca8919d8f8b53a9f4de21ac4e174..887c61cb8fd81c6be4d20ba6b25c2997cea8cb7f 100644
--- a/tensorflow/python/distribute/BUILD
+++ b/tensorflow/python/distribute/BUILD
@@ -103,6 +103,18 @@ cuda_py_test(
     ],
 )
 
+py_library(
+    name = "distribute",
+    srcs = [
+        "__init__.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":distribute_lib",
+        ":mirrored_strategy",
+    ],
+)
+
 py_library(
     name = "distribute_lib",
     srcs = [
diff --git a/tensorflow/python/distribute/__init__.py b/tensorflow/python/distribute/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ff912ae10d8336cfeeb42d060bd0d9c52e24482
--- /dev/null
+++ b/tensorflow/python/distribute/__init__.py
@@ -0,0 +1,25 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Distribution Strategy library."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=unused-import
+from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import distribution_strategy_context
+from tensorflow.python.distribute import mirrored_strategy
+# pylint: enable=unused-import
diff --git a/tensorflow/python/distribute/cluster_resolver/cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/cluster_resolver.py
index 7774ac0e122a532e1e0280f185ead3022a0b89d6..73188bd7caaeb8f60e1e19dc11ce20e0a4349433 100644
--- a/tensorflow/python/distribute/cluster_resolver/cluster_resolver.py
+++ b/tensorflow/python/distribute/cluster_resolver/cluster_resolver.py
@@ -22,6 +22,8 @@ import abc
 
 import six
 
+from tensorflow.python.client import session
+from tensorflow.python.framework import ops
 from tensorflow.python.training.server_lib import ClusterSpec
 
 
@@ -32,6 +34,14 @@ def format_master_url(master, rpc_layer=None):
     return master
 
 
+def get_accelerator_devices(master, config_proto):
+  # TODO(frankchn): Add support for eager mode as well as graph mode.
+  with ops.Graph().as_default():
+    with session.Session(master, config=config_proto) as s:
+      devices = s.list_devices()
+  return devices
+
+
 @six.add_metaclass(abc.ABCMeta)
 class ClusterResolver(object):
   """Abstract class for all implementations of ClusterResolvers.
@@ -91,8 +101,11 @@ class ClusterResolver(object):
     """
     raise NotImplementedError()
 
-  @abc.abstractmethod
-  def num_accelerators_per_worker(self, session_config=None):
+  def num_accelerators(self,
+                       task_type=None,
+                       task_index=None,
+                       accelerator_type='GPU',
+                       config_proto=None):
     """Returns the number of accelerator cores per worker.
 
     This returns the number of accelerator cores (such as GPUs and TPUs)
@@ -100,11 +113,24 @@ class ClusterResolver(object):
     should return 0. This method will query the master for this information
     if it is not otherwise known.
 
+    Optionally, we allow callers to specify the task_type, task_index, and
+    rpc_layer, if they want to target a specific TensorFlow process to query
+    the number of accelerators. This is to support heterogenous environments,
+    where the number of accelerators cores per host is different.
+
     Args:
-      session_config: (Optional) Configuration for starting a new session to
+      task_type: (Optional) The type of the TensorFlow task of the machine we
+        want to query.
+      task_index: (Optional) The index of the TensorFlow task of the machine we
+        want to query.
+      accelerator_type: (Optional) The type of accelerator we are trying to
+        query (defaults to 'GPU').
+      config_proto: (Optional) Configuration for starting a new session to
         query how many accelerator cores it has.
     """
-    raise NotImplementedError()
+    master = self.master(task_type, task_index)
+    devices = get_accelerator_devices(master, config_proto)
+    return sum(1 for d in devices if d.device_type == accelerator_type)
 
   @abc.abstractproperty
   def environment(self):
@@ -116,7 +142,7 @@ class SimpleClusterResolver(ClusterResolver):
   """Simple implementation of ClusterResolver that accepts a ClusterSpec."""
 
   def __init__(self, cluster_spec, master='', task_type=None, task_index=None,
-               environment='', num_accelerators_per_worker=0,
+               environment='', num_accelerators=0,
                rpc_layer=None):
     """Creates a SimpleClusterResolver from a ClusterSpec."""
     super(SimpleClusterResolver, self).__init__()
@@ -124,7 +150,7 @@ class SimpleClusterResolver(ClusterResolver):
     self._task_type = task_type
     self._task_index = task_index
     self._environment = environment
-    self._num_accelerators_per_worker = num_accelerators_per_worker
+    self._num_accelerators = num_accelerators
     self._rpc_layer = rpc_layer
 
     if not isinstance(cluster_spec, ClusterSpec):
@@ -180,17 +206,27 @@ class SimpleClusterResolver(ClusterResolver):
   def environment(self):
     return self._environment
 
-  def num_accelerators_per_worker(self, session_config=None):
+  def num_accelerators(self,
+                       task_type=None,
+                       task_index=None,
+                       accelerator_type='GPU',
+                       config_proto=None):
     """Returns the number of accelerator cores per worker.
 
+    The SimpleClusterResolver does not do automatic detection of accelerators,
+    so a TensorFlow session will never be created, and thus all arguments are
+    unused and we simply return whatever was passed in when this object was
+    initialized.
+
     Args:
-      session_config: Unused. The SimpleClusterResolver does not do automatic
-        detection of accelerators, so a TensorFlow session will never be
-        created, and thus a `session_config` is never necessary here, and will
-        be ignored.
+      task_type: Unused.
+      task_index: Unused.
+      accelerator_type: Unused.
+      config_proto: Unused.
     """
-    del session_config
-    return self._num_accelerators_per_worker
+    # Unused
+    del task_type, task_index, accelerator_type, config_proto
+    return self._num_accelerators
 
   @property
   def rpc_layer(self):
@@ -361,9 +397,13 @@ class UnionClusterResolver(ClusterResolver):
   def environment(self):
     return self._cluster_resolvers[0].environment
 
-  def num_accelerators_per_worker(self, session_config=None):
-    return self._cluster_resolvers[0].num_accelerators_per_worker(
-        session_config)
+  def num_accelerators(self,
+                       task_type=None,
+                       task_index=None,
+                       accelerator_type='GPU',
+                       config_proto=None):
+    return self._cluster_resolvers[0].num_accelerators(
+        task_type, task_index, accelerator_type, config_proto)
 
   @property
   def rpc_layer(self):
diff --git a/tensorflow/python/distribute/cluster_resolver/cluster_resolver_test.py b/tensorflow/python/distribute/cluster_resolver/cluster_resolver_test.py
index b5448faec6b2d929bcbb95b7b56f2197f40caaaa..0ff6b6be62122b3a7b71124613a694d9bb5fd357 100644
--- a/tensorflow/python/distribute/cluster_resolver/cluster_resolver_test.py
+++ b/tensorflow/python/distribute/cluster_resolver/cluster_resolver_test.py
@@ -18,11 +18,64 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.client import session
+from tensorflow.python.distribute.cluster_resolver import ClusterResolver
 from tensorflow.python.distribute.cluster_resolver import SimpleClusterResolver
 from tensorflow.python.distribute.cluster_resolver import UnionClusterResolver
 from tensorflow.python.platform import test
 from tensorflow.python.training import server_lib
 
+mock = test.mock
+
+
+class MockBaseClusterResolver(ClusterResolver):
+
+  def cluster_spec(self):
+    return None
+
+  def master(self, task_type=None, task_index=None, rpc_layer=None):
+    return ""
+
+  def environment(self):
+    return ""
+
+
+class BaseClusterResolverTest(test.TestCase):
+
+  @mock.patch.object(session.BaseSession, "list_devices")
+  def testNumAcceleratorsSuccess(self, mock_list_devices):
+    device_names = [
+        "/job:worker/task:0/device:GPU:0",
+        "/job:worker/task:0/device:GPU:1",
+        "/job:worker/task:0/device:GPU:2",
+        "/job:worker/task:0/device:GPU:3",
+    ]
+    device_list = [
+        session._DeviceAttributes(
+            name, "GPU", 1024, 0) for name in device_names
+    ]
+    mock_list_devices.return_value = device_list
+
+    resolver = MockBaseClusterResolver()
+    self.assertEqual(resolver.num_accelerators(), 4)
+
+  @mock.patch.object(session.BaseSession, "list_devices")
+  def testNumAcceleratorsFilterSuccess(self, mock_list_devices):
+    device_names = [
+        "/job:worker/task:0/device:TPU:0",
+        "/job:worker/task:0/device:TPU:1",
+        "/job:worker/task:0/device:TPU:2",
+        "/job:worker/task:0/device:TPU:3",
+    ]
+    device_list = [
+        session._DeviceAttributes(
+            name, "TPU", 1024, 0) for name in device_names
+    ]
+    mock_list_devices.return_value = device_list
+
+    resolver = MockBaseClusterResolver()
+    self.assertEqual(resolver.num_accelerators(), 0)
+
 
 class UnionClusterResolverTest(test.TestCase):
   # TODO(frankchn): Transform to parameterized test after it is included in the
@@ -65,13 +118,13 @@ class UnionClusterResolverTest(test.TestCase):
 
     simple_resolver = SimpleClusterResolver(base_cluster_spec, task_type="ps",
                                             task_index=1, environment="cloud",
-                                            num_accelerators_per_worker=8,
+                                            num_accelerators=8,
                                             rpc_layer="grpc")
 
     self.assertEqual(simple_resolver.task_type, "ps")
     self.assertEqual(simple_resolver.task_index, 1)
     self.assertEqual(simple_resolver.environment, "cloud")
-    self.assertEqual(simple_resolver.num_accelerators_per_worker(), 8)
+    self.assertEqual(simple_resolver.num_accelerators(), 8)
     self.assertEqual(simple_resolver.rpc_layer, "grpc")
 
   def testOverrideSimpleClusterResolver(self):
@@ -82,7 +135,7 @@ class UnionClusterResolverTest(test.TestCase):
 
     simple_resolver = SimpleClusterResolver(base_cluster_spec, task_type="ps",
                                             task_index=1, environment="cloud",
-                                            num_accelerators_per_worker=8,
+                                            num_accelerators=8,
                                             rpc_layer="grpc")
 
     simple_resolver.task_type = "worker"
@@ -130,7 +183,7 @@ class UnionClusterResolverTest(test.TestCase):
     })
     resolver1 = SimpleClusterResolver(cluster_spec_1, task_type="ps",
                                       task_index=1, environment="cloud",
-                                      num_accelerators_per_worker=8,
+                                      num_accelerators=8,
                                       rpc_layer="grpc")
 
     cluster_spec_2 = server_lib.ClusterSpec({
@@ -139,7 +192,7 @@ class UnionClusterResolverTest(test.TestCase):
     })
     resolver2 = SimpleClusterResolver(cluster_spec_2, task_type="worker",
                                       task_index=2, environment="local",
-                                      num_accelerators_per_worker=16,
+                                      num_accelerators=16,
                                       rpc_layer="http")
 
     union_resolver = UnionClusterResolver(resolver1, resolver2)
@@ -147,7 +200,7 @@ class UnionClusterResolverTest(test.TestCase):
     self.assertEqual(union_resolver.task_type, "ps")
     self.assertEqual(union_resolver.task_index, 1)
     self.assertEqual(union_resolver.environment, "cloud")
-    self.assertEqual(union_resolver.num_accelerators_per_worker(), 8)
+    self.assertEqual(union_resolver.num_accelerators(), 8)
     self.assertEqual(union_resolver.rpc_layer, "grpc")
 
     union_resolver.task_type = "worker"
diff --git a/tensorflow/python/distribute/cluster_resolver/gce_cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/gce_cluster_resolver.py
index b167bc8fc85c83083a0130e7f108981ecbb783a7..06512613cbe34b09730dd7c6914ea9d7098204d5 100644
--- a/tensorflow/python/distribute/cluster_resolver/gce_cluster_resolver.py
+++ b/tensorflow/python/distribute/cluster_resolver/gce_cluster_resolver.py
@@ -51,7 +51,6 @@ class GceClusterResolver(ClusterResolver):
                task_type='worker',
                task_index=0,
                rpc_layer='grpc',
-               num_accelerators_per_worker=0,
                credentials='default',
                service=None):
     """Creates a new GceClusterResolver object.
@@ -73,8 +72,6 @@ class GceClusterResolver(ClusterResolver):
         can be distinguished from each other.
       rpc_layer: The RPC layer TensorFlow should use to communicate across
         instances.
-      num_accelerators_per_worker: Number of accelerators (GPUs) present per
-        instance.
       credentials: GCE Credentials. If nothing is specified, this defaults to
         GoogleCredentials.get_application_default().
       service: The GCE API object returned by the googleapiclient.discovery
@@ -200,7 +197,3 @@ class GceClusterResolver(ClusterResolver):
   @rpc_layer.setter
   def rpc_layer(self, rpc_layer):
     self._rpc_layer = rpc_layer
-
-  def num_accelerators_per_worker(self, session_config=None):
-    del session_config  # Unused, since this is set manually in __init__.
-    return self._num_accelerators_per_worker
diff --git a/tensorflow/python/distribute/cluster_resolver/kubernetes_cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/kubernetes_cluster_resolver.py
index 041c0815409affb7a371e8504583ade57c02df3b..7ff6ec0f2d5c6f6d2315e98cf5e7250b118fbadd 100644
--- a/tensorflow/python/distribute/cluster_resolver/kubernetes_cluster_resolver.py
+++ b/tensorflow/python/distribute/cluster_resolver/kubernetes_cluster_resolver.py
@@ -18,7 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.client import device_lib
 from tensorflow.python.distribute.cluster_resolver.cluster_resolver import ClusterResolver
 from tensorflow.python.distribute.cluster_resolver.cluster_resolver import format_master_url
 from tensorflow.python.training import server_lib
@@ -108,16 +107,14 @@ class KubernetesClusterResolver(ClusterResolver):
     Returns:
       The name or URL of the session master.
     """
+    task_type = task_type if task_type is not None else self.task_type
+    task_index = task_index if task_index is not None else self.task_index
+
     if task_type is not None and task_index is not None:
       return format_master_url(
           self.cluster_spec().task_address(task_type, task_index),
           rpc_layer or self.rpc_layer)
 
-    if self.task_type is not None and self.task_index is not None:
-      return format_master_url(
-          self.cluster_spec().task_address(self.task_type, self.task_index),
-          rpc_layer or self.rpc_layer)
-
     return ''
 
   def cluster_spec(self):
@@ -167,7 +164,3 @@ class KubernetesClusterResolver(ClusterResolver):
     on internal systems.
     """
     return ''
-
-  def num_accelerators_per_worker(self, session_config=None):
-    local_devices = device_lib.list_local_devices(session_config)
-    return len([d for d in local_devices if d.device_type == 'GPU'])
diff --git a/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver.py
index fd3c6d6a18fcdcf5e476cc088d7f7e6f006da479..9dbe25b613447fde2140585742d005dab82fb018 100644
--- a/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver.py
+++ b/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver.py
@@ -23,6 +23,7 @@ import os
 import subprocess
 
 from tensorflow.python.distribute.cluster_resolver.cluster_resolver import ClusterResolver
+from tensorflow.python.distribute.cluster_resolver.cluster_resolver import format_master_url
 from tensorflow.python.training.server_lib import ClusterSpec
 
 
@@ -206,10 +207,13 @@ class SlurmClusterResolver(ClusterResolver):
     """
     task_type = task_type if task_type is not None else self.task_type
     task_index = task_index if task_index is not None else self.task_index
-    rpc_layer = rpc_layer or self.rpc_layer
-    master = self.cluster_spec().task_address(task_type, task_index)
 
-    return '%s://%s' % (rpc_layer, master) if rpc_layer else master
+    if task_type is not None and task_index is not None:
+      return format_master_url(
+          self.cluster_spec().task_address(task_type, task_index),
+          rpc_layer or self.rpc_layer)
+
+    return ''
 
   @property
   def environment(self):
@@ -221,6 +225,11 @@ class SlurmClusterResolver(ClusterResolver):
     """
     return ''
 
-  def num_accelerators_per_worker(self, session_config=None):
-    del session_config  # Unused, since this is set in __init__ manually.
+  def num_accelerators(self,
+                       task_type=None,
+                       task_index=None,
+                       accelerator_type='GPU',
+                       config_proto=None):
+    # Unused, since this is set in __init__ manually.
+    del task_type, task_index, accelerator_type, config_proto
     return self._gpus_per_node
diff --git a/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver.py
index a3246e77f4d4e666cf29ea6dad9a53a6ab915d9e..8d530cc15a035afcf2d3356599ed06e0b9d9a4cd 100644
--- a/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver.py
+++ b/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver.py
@@ -54,8 +54,7 @@ class TFConfigClusterResolver(ClusterResolver):
                task_type=None,
                task_index=None,
                rpc_layer=None,
-               environment=None,
-               num_accelerators_per_worker=0):
+               environment=None):
     """Creates a new TFConfigClusterResolver.
 
     Args:
@@ -66,15 +65,11 @@ class TFConfigClusterResolver(ClusterResolver):
       rpc_layer: (String, optional) Overrides the rpc layer TensorFlow uses.
       environment: (String, optional) Overrides the environment TensorFlow
         operates in.
-      num_accelerators_per_worker: (Integer, optional) Specifies the number of
-        accelerators (e.g. GPUs, TPUs, others) that each node has.
     """
-
     self._task_type = task_type
     self._task_index = task_index
     self._rpc_layer = rpc_layer
     self._environment = environment
-    self._num_accelerators_per_worker = num_accelerators_per_worker
 
   @property
   def task_type(self):
@@ -115,11 +110,6 @@ class TFConfigClusterResolver(ClusterResolver):
   def rpc_layer(self, rpc_layer):
     self._rpc_layer = rpc_layer
 
-  def num_accelerators_per_worker(self, session_config=None):
-    # TODO(frankchn): Connect to server (w/ session_config) in the future.
-    del session_config  # Unused, we do not connect to another server here.
-    return self._num_accelerators_per_worker
-
   def cluster_spec(self):
     """Returns a ClusterSpec based on the TF_CONFIG environment variable.
 
diff --git a/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver_test.py b/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver_test.py
index c20e51bc0bb88364b94766217825ad029fc97bdd..36b3bb9c1e1a32960525f8cff7f852e204c72211 100644
--- a/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver_test.py
+++ b/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver_test.py
@@ -168,13 +168,11 @@ class TFConfigClusterResolverTest(test.TestCase):
     }
     """
 
-    cluster_resolver = TFConfigClusterResolver(task_type='ps', task_index=0,
-                                               num_accelerators_per_worker=8)
+    cluster_resolver = TFConfigClusterResolver(task_type='ps', task_index=0)
 
     self.assertEqual('grpc://ps0:2222', cluster_resolver.master())
     self.assertEqual('ps', cluster_resolver.task_type)
     self.assertEqual(0, cluster_resolver.task_index)
-    self.assertEqual(8, cluster_resolver.num_accelerators_per_worker())
 
     cluster_resolver.task_type = 'worker'
     cluster_resolver.task_index = 1
diff --git a/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver.py
index 1956bd75a87ab3a85e690a29fabf3a12842487b2..52ac07d7ea5ea32833f0138441da54d69ae1ce4c 100644
--- a/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver.py
+++ b/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver.py
@@ -18,13 +18,18 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import os
+import re
 
 from six.moves.urllib.request import Request
 from six.moves.urllib.request import urlopen
 
 from tensorflow.python.distribute.cluster_resolver.cluster_resolver import ClusterResolver
 from tensorflow.python.distribute.cluster_resolver.cluster_resolver import format_master_url
+from tensorflow.python.distribute.cluster_resolver.cluster_resolver import get_accelerator_devices
+from tensorflow.python.framework import errors
+from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import server_lib
 from tensorflow.python.util import compat
 
@@ -41,6 +46,45 @@ _ENDPOINTS_SEPARATOR = ','
 _DEFAULT_ENV_VARIABLE = 'TPU_NAME'
 _DISCOVERY_SERVICE_URL_ENV_VARIABLE = 'TPU_API_DISCOVERY_URL'
 
+_TPU_DEVICE_REGEX = re.compile(
+    r'.*task:(?P<host_id>\d+)/.*device:TPU:(?P<core_id>\d+)$')
+_TPU_CONN_RETRIES = 120
+
+DeviceDetails = collections.namedtuple(
+    'DeviceDetails', ['device_map', 'total_cores'])
+
+
+def _get_device_dict_and_cores(devices):
+  """Returns a dict of hosts to cores and total cores given devices names.
+
+  Returns a namedtuple with two attributes:
+    device_map: A map of host_ids to a list of core_ids.
+    total_cores: The total number of cores within the TPU system.
+
+  Args:
+    devices: A list of devices returned by session.list_devices()
+  """
+  device_map = collections.defaultdict(list)
+  num_cores = 0
+  for device in devices:
+    match = _TPU_DEVICE_REGEX.match(device.name)
+    if match:
+      host_id = match.group('host_id')
+      core_id = match.group('core_id')
+      device_map[host_id].append(core_id)
+      num_cores += 1
+  return DeviceDetails(device_map, num_cores)
+
+
+def _verify_and_return_same_core_count(device_dict):
+  """Verifies that every device in device_dict has the same number of cores."""
+  num_cores_per_host_set = (
+      {len(core_ids) for core_ids in device_dict.values()})
+  if len(num_cores_per_host_set) != 1:
+    raise RuntimeError('TPU cores on each device is not the same. This '
+                       'should never happen. Devices: {}'.format(device_dict))
+  return num_cores_per_host_set.pop()
+
 
 class TPUClusterResolver(ClusterResolver):
   """Cluster Resolver for Google Cloud TPUs.
@@ -92,7 +136,8 @@ class TPUClusterResolver(ClusterResolver):
         self._tpu == compat.as_bytes('local') or
         self._tpu.startswith(compat.as_bytes('/bns')) or
         self._tpu.startswith(compat.as_bytes('localhost:')) or
-        self._tpu.startswith(compat.as_bytes('grpc://'))):
+        self._tpu.startswith(compat.as_bytes('grpc://')) or
+        self._tpu.startswith(compat.as_bytes('uptc://'))):
       return False
     return True
 
@@ -196,13 +241,14 @@ class TPUClusterResolver(ClusterResolver):
     elif tpu == 'local' or not tpu:
       # Google environment, where the TPU is attached to the host.
       self._environment = 'google'
-    elif tpu.startswith('/bns'):
+    elif tpu.startswith('/bns') or tpu.startswith('uptc://'):
       # Google environment, where we reach the TPU through BNS.
       self._environment = 'google'
 
     # If TPU is in the Google environment or exists locally, we don't use any
     # RPC layer.
-    if tpu.startswith('/bns') or tpu == 'local' or not tpu:
+    if tpu.startswith('/bns') or tpu.startswith(
+        'uptc://') or tpu == 'local' or not tpu:
       self.rpc_layer = None
     else:
       self.rpc_layer = 'grpc'
@@ -385,18 +431,49 @@ class TPUClusterResolver(ClusterResolver):
 
     return server_lib.ClusterSpec(cluster_spec)
 
-  def num_accelerators_per_worker(self, session_config=None):
+  def num_accelerators(self,
+                       task_type=None,
+                       task_index=None,
+                       accelerator_type='TPU',
+                       config_proto=None):
     """Returns the number of TPU cores per worker.
 
-    This defaults to 8 for all current TPU configurations, and we do not need
-    to query any remote systems for this.
+    Connects to the master and list all the devices present in the master,
+    and counts them up. Also verifies that the device counts per host in the
+    cluster is the same before returning the number of TPU cores per host.
 
     Args:
-      session_config: Unused. Not currently necessary to query anything as this
-        number is 8 for all TPU configurations.
+      task_type: Unused.
+      task_index: Unused.
+      accelerator_type: Unused.
+      config_proto: Used to create a connection to a TPU master in order to
+        retrieve the system metadata.
+
+    Raises:
+      RuntimeError: If we cannot talk to a TPU worker after retrying or if the
+        number of TPU devices per host is different.
     """
-    del session_config  # Unused. Not necessary to query anything.
-    return 8
+    retry_count = 1
+    # TODO(b/120564445): Replace with standard library for retries.
+    while True:
+      try:
+        device_details = _get_device_dict_and_cores(
+            get_accelerator_devices(self.master(), config_proto=config_proto))
+        break
+      except errors.DeadlineExceededError:
+        error_message = ('Failed to connect to master. The TPU might not be '
+                         'ready (e.g. still scheduling) or the master '
+                         'address is incorrect: got (%s)' % self.master())
+        if retry_count <= _TPU_CONN_RETRIES:
+          logging.warning(error_message)
+          logging.warning('Retrying (%d/%d)...', retry_count, _TPU_CONN_RETRIES)
+          retry_count += 1
+        else:
+          raise RuntimeError(error_message)
+
+    if device_details.total_cores:
+      return _verify_and_return_same_core_count(device_details.device_map)
+    return 0
 
   @property
   def environment(self):
@@ -404,7 +481,8 @@ class TPUClusterResolver(ClusterResolver):
     return self._environment
 
   def _start_local_server(self):
-    address = self._requestComputeMetadata('instance/network-interfaces/0/ip')
+    address = compat.as_text(self._requestComputeMetadata(
+        'instance/network-interfaces/0/ip'))
     self._server = server_lib.Server(
         {
             'local': ['0.0.0.0:0']
diff --git a/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver_test.py b/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver_test.py
index 0f22ede3d9b6f5af4691872fc63216c0cf0c2b3a..27d92608fa2db95944c94160d716a033ab2f78a2 100644
--- a/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver_test.py
+++ b/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver_test.py
@@ -20,7 +20,10 @@ from __future__ import print_function
 
 import os
 
-from tensorflow.python.distribute.cluster_resolver import TPUClusterResolver
+from tensorflow.python.client import session
+from tensorflow.python.distribute import cluster_resolver
+from tensorflow.python.distribute.cluster_resolver import tpu_cluster_resolver
+from tensorflow.python.framework import errors
 from tensorflow.python.platform import test
 from tensorflow.python.training import server_lib
 from tensorflow.python.util import compat
@@ -101,7 +104,8 @@ class TPUClusterResolverTest(test.TestCase):
 
     return mock_client
 
-  @mock.patch.object(TPUClusterResolver, '_requestComputeMetadata',
+  @mock.patch.object(cluster_resolver.TPUClusterResolver,
+                     '_requestComputeMetadata',
                      mock_request_compute_metadata)
   def testRetrieveProjectAndZoneFromMetadata(self):
     tpu_map = {
@@ -112,7 +116,7 @@ class TPUClusterResolverTest(test.TestCase):
         }
     }
 
-    tpu_cluster_resolver = TPUClusterResolver(
+    resolver = cluster_resolver.TPUClusterResolver(
         project=None,
         zone=None,
         tpu=['test-tpu-1'],
@@ -120,7 +124,7 @@ class TPUClusterResolverTest(test.TestCase):
         service=self.mock_service_client(tpu_map=tpu_map),
         coordinator_name='coordinator')
 
-    actual_cluster_spec = tpu_cluster_resolver.cluster_spec()
+    actual_cluster_spec = resolver.cluster_spec()
     expected_proto = """
     job {
       name: 'coordinator'
@@ -130,11 +134,12 @@ class TPUClusterResolverTest(test.TestCase):
       name: 'worker'
       tasks { key: 0 value: '10.1.2.3:8470' }
     }
-    """ % tpu_cluster_resolver._coordinator_port
+    """ % resolver._coordinator_port
     self._verifyClusterSpecEquality(actual_cluster_spec, str(expected_proto))
-    self.assertEqual(tpu_cluster_resolver.master(), 'grpc://10.1.2.3:8470')
+    self.assertEqual(resolver.master(), 'grpc://10.1.2.3:8470')
 
-  @mock.patch.object(TPUClusterResolver, '_requestComputeMetadata',
+  @mock.patch.object(cluster_resolver.TPUClusterResolver,
+                     '_requestComputeMetadata',
                      mock_request_compute_metadata)
   def testRetrieveProjectAndZoneFromMetadataNoCoordinator(self):
     tpu_map = {
@@ -145,7 +150,7 @@ class TPUClusterResolverTest(test.TestCase):
         }
     }
 
-    tpu_cluster_resolver = TPUClusterResolver(
+    resolver = cluster_resolver.TPUClusterResolver(
         project=None,
         zone=None,
         tpu=['test-tpu-1'],
@@ -153,14 +158,15 @@ class TPUClusterResolverTest(test.TestCase):
         credentials=None,
         service=self.mock_service_client(tpu_map=tpu_map))
 
-    actual_cluster_spec = tpu_cluster_resolver.cluster_spec()
+    actual_cluster_spec = resolver.cluster_spec()
     expected_proto = """
     job { name: 'worker' tasks { key: 0 value: '10.1.2.3:8470' } }
     """
     self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
-    self.assertEqual(tpu_cluster_resolver.master(), 'grpc://10.1.2.3:8470')
+    self.assertEqual(resolver.master(), 'grpc://10.1.2.3:8470')
 
-  @mock.patch.object(TPUClusterResolver, '_requestComputeMetadata',
+  @mock.patch.object(cluster_resolver.TPUClusterResolver,
+                     '_requestComputeMetadata',
                      mock_request_compute_metadata)
   def testUnhealthyCloudTpu(self):
     tpu_map = {
@@ -171,7 +177,7 @@ class TPUClusterResolverTest(test.TestCase):
         }
     }
 
-    tpu_cluster_resolver = TPUClusterResolver(
+    resolver = cluster_resolver.TPUClusterResolver(
         project=None,
         zone=None,
         tpu='test-tpu-1',
@@ -180,9 +186,10 @@ class TPUClusterResolverTest(test.TestCase):
         service=self.mock_service_client(tpu_map=tpu_map))
 
     with self.assertRaises(RuntimeError):
-      tpu_cluster_resolver.cluster_spec()
+      resolver.cluster_spec()
 
-  @mock.patch.object(TPUClusterResolver, '_requestComputeMetadata',
+  @mock.patch.object(cluster_resolver.TPUClusterResolver,
+                     '_requestComputeMetadata',
                      mock_request_compute_metadata)
   def testNotReadyCloudTpu(self):
     tpu_map = {
@@ -193,7 +200,7 @@ class TPUClusterResolverTest(test.TestCase):
         }
     }
 
-    tpu_cluster_resolver = TPUClusterResolver(
+    resolver = cluster_resolver.TPUClusterResolver(
         project=None,
         zone=None,
         tpu='test-tpu-1',
@@ -202,7 +209,7 @@ class TPUClusterResolverTest(test.TestCase):
         service=self.mock_service_client(tpu_map=tpu_map))
 
     with self.assertRaises(RuntimeError):
-      tpu_cluster_resolver.cluster_spec()
+      resolver.cluster_spec()
 
   def testSimpleSuccessfulRetrieval(self):
     tpu_map = {
@@ -213,7 +220,7 @@ class TPUClusterResolverTest(test.TestCase):
         }
     }
 
-    tpu_cluster_resolver = TPUClusterResolver(
+    resolver = cluster_resolver.TPUClusterResolver(
         project='test-project',
         zone='us-central1-c',
         tpu=['test-tpu-1'],
@@ -222,13 +229,13 @@ class TPUClusterResolverTest(test.TestCase):
         credentials=None,
         service=self.mock_service_client(tpu_map=tpu_map))
 
-    actual_cluster_spec = tpu_cluster_resolver.cluster_spec()
+    actual_cluster_spec = resolver.cluster_spec()
     expected_proto = """
     job { name: 'coordinator' tasks { key: 0 value: '10.128.1.5:10203' } }
     job { name: 'worker' tasks { key: 0 value: '10.1.2.3:8470' } }
     """
     self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
-    self.assertEqual(tpu_cluster_resolver.master(), 'grpc://10.1.2.3:8470')
+    self.assertEqual(resolver.master(), 'grpc://10.1.2.3:8470')
 
   def testNewNetworkEndpointFormat(self):
     tpu_map = {
@@ -241,7 +248,7 @@ class TPUClusterResolverTest(test.TestCase):
         }
     }
 
-    tpu_cluster_resolver = TPUClusterResolver(
+    resolver = cluster_resolver.TPUClusterResolver(
         project='test-project',
         zone='us-central1-c',
         tpu='test-tpu-1',
@@ -250,15 +257,16 @@ class TPUClusterResolverTest(test.TestCase):
         credentials=None,
         service=self.mock_service_client(tpu_map=tpu_map))
 
-    actual_cluster_spec = tpu_cluster_resolver.cluster_spec()
+    actual_cluster_spec = resolver.cluster_spec()
     expected_proto = """
     job { name: 'coordinator' tasks { key: 0 value: '10.128.1.5:10203' } }
     job { name: 'worker' tasks { key: 0 value: '10.2.3.4:8470' } }
     """
     self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
-    self.assertEqual('grpc://10.2.3.4:8470', tpu_cluster_resolver.master())
+    self.assertEqual('grpc://10.2.3.4:8470', resolver.master())
 
-  @mock.patch.object(TPUClusterResolver, '_requestComputeMetadata',
+  @mock.patch.object(cluster_resolver.TPUClusterResolver,
+                     '_requestComputeMetadata',
                      mock_request_compute_metadata)
   def testPodResolution(self):
     tpu_map = {
@@ -286,13 +294,13 @@ class TPUClusterResolverTest(test.TestCase):
         }
     }
 
-    tpu_cluster_resolver = TPUClusterResolver(
+    resolver = cluster_resolver.TPUClusterResolver(
         tpu='test-tpu-1',
         credentials=None,
         service=self.mock_service_client(tpu_map=tpu_map),
         coordinator_name='coordinator')
 
-    actual_cluster_spec = tpu_cluster_resolver.cluster_spec()
+    actual_cluster_spec = resolver.cluster_spec()
     expected_proto = """
     job {
       name: 'coordinator',
@@ -305,9 +313,9 @@ class TPUClusterResolverTest(test.TestCase):
       tasks { key: 2 value: '10.2.3.6:8470' }
       tasks { key: 3 value: '10.2.3.7:8470' }
     }
-    """ % tpu_cluster_resolver._coordinator_port
+    """ % resolver._coordinator_port
     self._verifyClusterSpecEquality(actual_cluster_spec, str(expected_proto))
-    self.assertEqual(tpu_cluster_resolver.master(), 'grpc://10.2.3.4:8470')
+    self.assertEqual(resolver.master(), 'grpc://10.2.3.4:8470')
 
   def testPodResolutionNoCoordinator(self):
     tpu_map = {
@@ -335,7 +343,7 @@ class TPUClusterResolverTest(test.TestCase):
         }
     }
 
-    tpu_cluster_resolver = TPUClusterResolver(
+    resolver = cluster_resolver.TPUClusterResolver(
         project='test-project',
         zone='us-central1-c',
         tpu='test-tpu-1',
@@ -343,7 +351,7 @@ class TPUClusterResolverTest(test.TestCase):
         credentials=None,
         service=self.mock_service_client(tpu_map=tpu_map))
 
-    actual_cluster_spec = tpu_cluster_resolver.cluster_spec()
+    actual_cluster_spec = resolver.cluster_spec()
     expected_proto = """
     job {
       name: 'worker'
@@ -354,13 +362,13 @@ class TPUClusterResolverTest(test.TestCase):
     }
     """
     self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
-    self.assertEqual(tpu_cluster_resolver.master(), 'grpc://10.2.3.4:8470')
+    self.assertEqual(resolver.master(), 'grpc://10.2.3.4:8470')
 
   def testGetMasterNoEntries(self):
     tpu_map = {}
 
     with self.assertRaises(ValueError):
-      TPUClusterResolver(
+      cluster_resolver.TPUClusterResolver(
           project='test-project',
           zone='us-central1-c',
           tpu=[],
@@ -370,14 +378,14 @@ class TPUClusterResolverTest(test.TestCase):
 
   # TODO(saeta): Convert to parameterized test when included in OSS TF.
   def verifyShouldResolve(self, tpu, should_resolve):
-    tpu_cluster_resolver = TPUClusterResolver(
+    resolver = cluster_resolver.TPUClusterResolver(
         project='test-project',
         zone='us-central1-c',
         tpu=tpu,
         coordinator_name=None,
         credentials=None,
         service=self.mock_service_client(tpu_map={}))
-    self.assertEqual(should_resolve, tpu_cluster_resolver._shouldResolve(),
+    self.assertEqual(should_resolve, resolver._shouldResolve(),
                      "TPU: '%s'" % tpu)
 
   def testShouldResolveNoName(self):
@@ -402,25 +410,26 @@ class TPUClusterResolverTest(test.TestCase):
     self.verifyShouldResolve('grpctpu', True)
 
   def testNoCallComputeMetadata(self):
-    tpu_cluster_resolver = TPUClusterResolver(tpu='/bns/foo/bar')
+    resolver = cluster_resolver.TPUClusterResolver(
+        tpu='/bns/foo/bar')
     self.assertEqual(
-        compat.as_bytes('/bns/foo/bar'), tpu_cluster_resolver.master())
-    self.assertEqual(None, tpu_cluster_resolver.cluster_spec())
+        compat.as_bytes('/bns/foo/bar'), resolver.master())
+    self.assertEqual(None, resolver.cluster_spec())
 
   def testGkeEnvironmentForDonut(self):
     os.environ['KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS'] = 'grpc://10.120.27.5:8470'
 
     self.assertIn('KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS', os.environ)
-    self.assertTrue(TPUClusterResolver._inGke())
+    self.assertTrue(cluster_resolver.TPUClusterResolver._inGke())
     self.assertEqual(
         compat.as_bytes('grpc://10.120.27.5:8470'),
-        compat.as_bytes(TPUClusterResolver._gkeEndpoints()))
+        compat.as_bytes(cluster_resolver.TPUClusterResolver._gkeEndpoints()))
 
-    tpu_cluster_resolver = TPUClusterResolver()
+    resolver = cluster_resolver.TPUClusterResolver()
     self.assertEqual(
         compat.as_bytes('grpc://10.120.27.5:8470'),
-        compat.as_bytes(tpu_cluster_resolver.master()))
-    actual_cluster_spec = tpu_cluster_resolver.cluster_spec()
+        compat.as_bytes(resolver.master()))
+    actual_cluster_spec = resolver.cluster_spec()
     expected_proto = """
     job {
       name: 'worker'
@@ -438,19 +447,19 @@ class TPUClusterResolverTest(test.TestCase):
                                                      'grpc://10.120.27.8:8470')
 
     self.assertIn('KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS', os.environ)
-    self.assertTrue(TPUClusterResolver._inGke())
+    self.assertTrue(cluster_resolver.TPUClusterResolver._inGke())
     self.assertEqual(
         compat.as_bytes('grpc://10.120.27.5:8470,'
                         'grpc://10.120.27.6:8470,'
                         'grpc://10.120.27.7:8470,'
                         'grpc://10.120.27.8:8470'),
-        compat.as_bytes(TPUClusterResolver._gkeEndpoints()))
+        compat.as_bytes(cluster_resolver.TPUClusterResolver._gkeEndpoints()))
 
-    tpu_cluster_resolver = TPUClusterResolver()
+    resolver = cluster_resolver.TPUClusterResolver()
     self.assertEqual(
         compat.as_bytes('grpc://10.120.27.5:8470'),
-        compat.as_bytes(tpu_cluster_resolver.master()))
-    actual_cluster_spec = tpu_cluster_resolver.cluster_spec()
+        compat.as_bytes(resolver.master()))
+    actual_cluster_spec = resolver.cluster_spec()
     expected_proto = """
     job {
       name: 'worker'
@@ -467,18 +476,21 @@ class TPUClusterResolverTest(test.TestCase):
   def testEnvironmentDiscoveryUrl(self):
     os.environ['TPU_API_DISCOVERY_URL'] = 'https://{api}.internal/{apiVersion}'
     self.assertEqual('https://{api}.internal/{apiVersion}',
-                     TPUClusterResolver._environmentDiscoveryUrl())
+                     (cluster_resolver.TPUClusterResolver.
+                      _environmentDiscoveryUrl()))
 
   def testEnvironmentAndRpcDetectionForGoogle(self):
-    tpu_cluster_resolver = TPUClusterResolver(tpu='/bns/ab/cd/ef')
-    self.assertEqual(tpu_cluster_resolver.environment, 'google')
-    self.assertEqual(tpu_cluster_resolver.rpc_layer, None)
+    resolver = cluster_resolver.TPUClusterResolver(
+        tpu='/bns/ab/cd/ef')
+    self.assertEqual(resolver.environment, 'google')
+    self.assertEqual(resolver.rpc_layer, None)
 
   def testEnvironmentAndRpcDetectionForGrpcString(self):
-    tpu_cluster_resolver = TPUClusterResolver(tpu='grpc://10.1.2.3:8470')
-    self.assertEqual(tpu_cluster_resolver.environment, '')
-    self.assertEqual(tpu_cluster_resolver.rpc_layer, 'grpc')
-    self.assertEqual(tpu_cluster_resolver.master(), 'grpc://10.1.2.3:8470')
+    resolver = cluster_resolver.TPUClusterResolver(
+        tpu='grpc://10.1.2.3:8470')
+    self.assertEqual(resolver.environment, '')
+    self.assertEqual(resolver.rpc_layer, 'grpc')
+    self.assertEqual(resolver.master(), 'grpc://10.1.2.3:8470')
 
   def testOverrideTaskTypeAndIndexAndGetMaster(self):
     tpu_map = {
@@ -506,7 +518,7 @@ class TPUClusterResolverTest(test.TestCase):
         }
     }
 
-    tpu_cluster_resolver = TPUClusterResolver(
+    resolver = cluster_resolver.TPUClusterResolver(
         project='test-project',
         zone='us-central1-c',
         tpu='test-tpu-1',
@@ -514,17 +526,103 @@ class TPUClusterResolverTest(test.TestCase):
         credentials=None,
         service=self.mock_service_client(tpu_map=tpu_map))
 
-    self.assertEqual(tpu_cluster_resolver.master(), 'grpc://10.2.3.4:8470')
+    self.assertEqual(resolver.master(), 'grpc://10.2.3.4:8470')
 
-    tpu_cluster_resolver.task_type = 'worker'
-    tpu_cluster_resolver.task_index = 3
-    self.assertEqual(tpu_cluster_resolver.master(), 'grpc://10.2.3.7:8470')
+    resolver.task_type = 'worker'
+    resolver.task_index = 3
+    self.assertEqual(resolver.master(), 'grpc://10.2.3.7:8470')
 
     self.assertEqual(
-        tpu_cluster_resolver.master(
+        resolver.master(
             task_type='worker', task_index=2, rpc_layer='test'),
         'test://10.2.3.6:8470')
 
+  def testGetDeviceDictAndCoresWithTPUs(self):
+    device_names = [
+        '/job:tpu_worker/task:0/device:TPU:0',
+        '/job:tpu_worker/task:1/device:TPU:1',
+        '/job:tpu_worker/task:2/device:TPU:0',
+        '/job:tpu_worker/task:3/device:TPU:1',
+        '/job:tpu_worker/task:0/device:TPU:4',
+        '/job:tpu_worker/task:1/device:TPU:5',
+        '/job:tpu_worker/task:2/device:TPU:4',
+        '/job:tpu_worker/task:3/device:TPU:5',
+    ]
+    device_list = [
+        session._DeviceAttributes(
+            name, 'TPU', 1024, 0) for name in device_names
+    ]
+
+    device_details = tpu_cluster_resolver._get_device_dict_and_cores(
+        device_list)
+    self.assertEqual(device_details.total_cores, 8)
+    self.assertEqual(device_details.device_map,
+                     {'0': ['0', '4'],
+                      '1': ['1', '5'],
+                      '2': ['0', '4'],
+                      '3': ['1', '5']})
+
+  def testGetDeviceDictAndCoresWithCPUsAndGPUs(self):
+    device_names = [
+        '/job:tpu_worker/task:0/device:CPU:0',
+        '/job:tpu_worker/task:1/device:CPU:0',
+        '/job:tpu_worker/task:2/device:CPU:0',
+        '/job:tpu_worker/task:3/device:CPU:0',
+        '/job:tpu_worker/task:0/device:GPU:1',
+        '/job:tpu_worker/task:1/device:GPU:1',
+        '/job:tpu_worker/task:2/device:GPU:1',
+        '/job:tpu_worker/task:3/device:GPU:1',
+    ]
+    device_list = [
+        session._DeviceAttributes(
+            name, 'XLA', 1024, 0) for name in device_names
+    ]
+
+    device_dict, num_cores = tpu_cluster_resolver._get_device_dict_and_cores(
+        device_list)
+    self.assertEqual(num_cores, 0)
+    self.assertEqual(device_dict, {})
+
+  def testVerifySameCoreCount(self):
+    self.assertEqual(
+        tpu_cluster_resolver._verify_and_return_same_core_count(
+            {0: [0, 1, 2, 3, 4, 5, 6, 7]}), 8)
+    self.assertEqual(
+        tpu_cluster_resolver._verify_and_return_same_core_count(
+            {0: [0, 1], 1: [2, 3]}), 2)
+    with self.assertRaises(RuntimeError):
+      tpu_cluster_resolver._verify_and_return_same_core_count(
+          {0: [0], 1: [1, 2]})
+
+  @mock.patch.object(session.BaseSession, 'list_devices')
+  def testNumAcceleratorsSuccess(self, mock_list_devices):
+    device_names = [
+        '/job:tpu_worker/task:0/device:TPU:0',
+        '/job:tpu_worker/task:1/device:TPU:1',
+        '/job:tpu_worker/task:2/device:TPU:0',
+        '/job:tpu_worker/task:3/device:TPU:1',
+        '/job:tpu_worker/task:0/device:TPU:4',
+        '/job:tpu_worker/task:1/device:TPU:5',
+        '/job:tpu_worker/task:2/device:TPU:4',
+        '/job:tpu_worker/task:3/device:TPU:5',
+    ]
+    device_list = [
+        session._DeviceAttributes(
+            name, 'TPU', 1024, 0) for name in device_names
+    ]
+    mock_list_devices.return_value = device_list
+
+    resolver = cluster_resolver.TPUClusterResolver(tpu='')
+    self.assertEqual(resolver.num_accelerators(), 2)
+
+  @mock.patch.object(session.BaseSession, 'list_devices')
+  def testNumAcceleratorsRetryFailure(self, mock_list_devices):
+    resolver = cluster_resolver.TPUClusterResolver(tpu='')
+    mock_list_devices.side_effect = errors.DeadlineExceededError(
+        None, None, 'timeout')
+    with self.assertRaises(RuntimeError):
+      resolver.num_accelerators()
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/distribute/cross_device_ops.py b/tensorflow/python/distribute/cross_device_ops.py
index a88ed6253318a445ae3331b30763e15d14a0e458..23349a965e8c168874dd65e09e9cfa75c9c46fde 100644
--- a/tensorflow/python/distribute/cross_device_ops.py
+++ b/tensorflow/python/distribute/cross_device_ops.py
@@ -53,15 +53,53 @@ def validate_destinations(destinations):
   if not isinstance(
       destinations,
       (value_lib.DistributedValues, resource_variable_ops.ResourceVariable,
-       value_lib.AggregatingVariable, six.string_types, list)):
+       value_lib.AggregatingVariable, six.string_types, list, tuple,
+       value_lib.TPUMirroredVariable)):
     raise ValueError("destinations must be one of a `DistributedValues` object,"
-                     " a tf.Variable object, a device string, a list of device "
-                     "strings")
+                     " a tf.Variable object, a device string, a list or tuple "
+                     "of device strings")
 
   if not check_destinations(destinations):
     raise ValueError("destinations can not be empty")
 
 
+def reduce_non_distributed_value(extended, reduce_op, value, destinations):
+  """Reduce a non-DistributedValue `value` to `destinations`."""
+  if isinstance(value, value_lib.DistributedValues):
+    raise ValueError("You are passing a `DistributedValue` to "
+                     "`reduce_non_distributed_value`, which is not allowed.")
+
+  # If the same value is present on all replicas then the PerReplica value will
+  # be a single value. We also handle the case when `value` is a single value
+  # and equal to 0.
+  if value == 0:
+    return 0
+  # If there is only a single value and the reduce op is MEAN,
+  # that value should be on all destinations.
+  if reduce_op == reduce_util.ReduceOp.MEAN:
+    return value
+
+  validate_destinations(destinations)
+  # We do not support a reduce op of SUM if the value is the same across
+  # all replicas. We call this as part of assign functions for MirroredVariables
+  # and summing up identical values across replicas is not clearly defined.
+  if (len(extended.worker_devices) != 1 or
+      not check_destinations(destinations)):
+    raise ValueError("A non-DistributedValues value %s cannot be reduced with "
+                     "the given reduce op %s." % (value, reduce_op))
+  # TODO(anjalisridhar): Moves these methods to a device utility file?
+  devices = get_devices_from(destinations)
+  if len(devices) == 1:
+    with ops.device(devices[0]):
+      return array_ops.identity(value)
+  else:
+    value_updates = {}
+    for d in devices:
+      with ops.device(d):
+        value_updates[d] = array_ops.identity(value)
+    return value_lib.Mirrored(value_updates)
+
+
 def _make_tensor_into_per_replica(input_tensor):
   """Converts a single tensor into a PerReplica object."""
   if isinstance(input_tensor, (tuple, list)):
diff --git a/tensorflow/python/distribute/device_util.py b/tensorflow/python/distribute/device_util.py
index 70e1ca4b5d77e5e7529cb0d06a9ffb4657dc74fe..34474582adfa8c73c4a7bbbe130dcf6faf88ce0b 100644
--- a/tensorflow/python/distribute/device_util.py
+++ b/tensorflow/python/distribute/device_util.py
@@ -50,7 +50,7 @@ def canonicalize(d, default=None):
   # Fill in missing device fields using defaults.
   result = tf_device.DeviceSpec(
       replica=0, task=0, device_type="CPU", device_index=0)
-  if context.executing_eagerly():
+  if ops.executing_eagerly_outside_functions():
     result.job = "localhost"
   if default:
     result.merge_from(tf_device.DeviceSpec.from_string(default))
diff --git a/tensorflow/python/distribute/distribute_coordinator.py b/tensorflow/python/distribute/distribute_coordinator.py
index c0f9b8a1fdfdf8bd95375f489058cadcd63c9cb9..78c995a57823c5ad274eebd52f39dcad81a67e19 100644
--- a/tensorflow/python/distribute/distribute_coordinator.py
+++ b/tensorflow/python/distribute/distribute_coordinator.py
@@ -29,6 +29,7 @@ from tensorflow.python.client import session
 from tensorflow.python.distribute import distribute_coordinator_context
 from tensorflow.python.distribute import multi_worker_util
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import coordinator
 from tensorflow.python.training import monitored_session
 from tensorflow.python.training import server_lib
 
@@ -328,7 +329,8 @@ def _run_single_worker(worker_fn,
                        task_id,
                        session_config,
                        rpc_layer="",
-                       worker_barrier=None):
+                       worker_barrier=None,
+                       coord=None):
   """Runs a single worker by calling `worker_fn` under context."""
   session_config = copy.deepcopy(session_config)
   strategy = copy.deepcopy(strategy)
@@ -350,7 +352,11 @@ def _run_single_worker(worker_fn,
       rpc_layer=rpc_layer,
       worker_barrier=worker_barrier)
   with context:
-    return worker_fn(strategy)
+    if coord:
+      with coord.stop_on_exception():
+        return worker_fn(strategy)
+    else:
+      return worker_fn(strategy)
 
 
 def _split_cluster_for_evaluator(cluster_spec, task_type):
@@ -423,6 +429,7 @@ def _run_std_server(cluster_spec=None,
 def _run_between_graph_client(worker_fn, strategy, eval_fn, eval_strategy,
                               cluster_spec, session_config, rpc_layer):
   """Runs a standalone client for between-graph replication."""
+  coord = coordinator.Coordinator()
   eval_thread = None
   if _TaskType.EVALUATOR in cluster_spec.jobs:
     eval_thread = threading.Thread(
@@ -431,6 +438,7 @@ def _run_between_graph_client(worker_fn, strategy, eval_fn, eval_strategy,
               session_config),
         kwargs={
             "rpc_layer": rpc_layer,
+            "coord": coord,
         })
     eval_thread.start()
 
@@ -444,18 +452,18 @@ def _run_between_graph_client(worker_fn, strategy, eval_fn, eval_strategy,
                 session_config),
           kwargs={
               "rpc_layer": rpc_layer,
-              "worker_barrier": worker_barrier
+              "worker_barrier": worker_barrier,
+              "coord": coord,
           })
       t.start()
       threads.append(t)
 
-  # TODO(yuefengz): wrap threads into thread coordinator?
-  for t in threads:
-    t.join()
-
-  # TODO(yuefengz): is it necessary to join eval thread?
   if eval_thread:
-    eval_thread.join()
+    # TODO(yuefengz): is it necessary to join eval thread?
+    threads_to_join = threads + [eval_thread]
+  else:
+    threads_to_join = threads
+  coord.join(threads_to_join)
 
   # TODO(yuefengz): we probably want to return results from all workers?
   return None
@@ -464,6 +472,7 @@ def _run_between_graph_client(worker_fn, strategy, eval_fn, eval_strategy,
 def _run_in_graph_client(worker_fn, strategy, eval_fn, eval_strategy,
                          cluster_spec, session_config, rpc_layer):
   """Runs a standalone client for in-graph replication."""
+  coord = coordinator.Coordinator()
   eval_thread = None
   if _TaskType.EVALUATOR in cluster_spec.jobs:
     eval_thread = threading.Thread(
@@ -472,6 +481,7 @@ def _run_in_graph_client(worker_fn, strategy, eval_fn, eval_strategy,
               session_config),
         kwargs={
             "rpc_layer": rpc_layer,
+            "coord": coord,
         })
     eval_thread.start()
 
@@ -482,9 +492,12 @@ def _run_in_graph_client(worker_fn, strategy, eval_fn, eval_strategy,
       None,
       None,
       session_config,
-      rpc_layer=rpc_layer)
+      rpc_layer=rpc_layer,
+      coord=coord)
+
   if eval_thread:
-    eval_thread.join()
+    coord.join([eval_thread])
+
   return worker_result
 
 
diff --git a/tensorflow/python/distribute/distribute_coordinator_test.py b/tensorflow/python/distribute/distribute_coordinator_test.py
index f2cb950aada5a7aea7c239ec822893d56dece0bd..dbed3e7f593f2dc48f112a47dd82579e5a3683ed 100644
--- a/tensorflow/python/distribute/distribute_coordinator_test.py
+++ b/tensorflow/python/distribute/distribute_coordinator_test.py
@@ -230,7 +230,7 @@ class DistributeCoordinatorTestBase(test.TestCase):
       with ops.device("/job:worker/task:0"):
         result = math_ops.add_n(xs)
 
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       result_value = sess.run(result)
     self.assertEqual(result_value, expected)
     if result_value == expected:
@@ -278,7 +278,7 @@ class DistributeCoordinatorTestBase(test.TestCase):
       train_op = control_flow_ops.group([x_add, y_sub])
 
       if context.is_chief:
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
 
       # Synchronize workers after initializaton.
       if context.has_barrier:
@@ -427,6 +427,7 @@ class DistributeCoordinatorTestStandaloneMode(DistributeCoordinatorTestBase):
     # Each finished worker will increment self._result_correct.
     self.assertEqual(self._result_correct, NUM_WORKERS)
 
+  @test_util.run_v1_only("b/120545219")
   def testBetweenGraphWithMonitoredSession(self):
     """Test monitored session in standalone client mode."""
     distribute_coordinator.run_distribute_coordinator(
@@ -600,6 +601,7 @@ class DistributeCoordinatorTestInpendentWorkerMode(
     # Each finished worker will increment self._result_correct.
     self.assertEqual(self._result_correct, NUM_WORKERS)
 
+  @test_util.run_v1_only("b/120545219")
   def testBetweenGraphWithMonitoredSession(self):
     cluster_spec = self._create_cluster_spec(
         num_workers=NUM_WORKERS, num_ps=NUM_PS)
diff --git a/tensorflow/python/distribute/distribute_lib.py b/tensorflow/python/distribute/distribute_lib.py
index eddd6ff8b1650711cc53d63e21e263c67ece6271..60bb75ded007d82e82c1de6db13905e4de73d480 100644
--- a/tensorflow/python/distribute/distribute_lib.py
+++ b/tensorflow/python/distribute/distribute_lib.py
@@ -422,6 +422,42 @@ class DistributionStrategy(object):
     return self.extended._make_input_fn_iterator(  # pylint: disable=protected-access
         input_fn, replication_mode=replication_mode)
 
+  def experimental_run(self, fn, input_iterator=None):
+    """Runs ops in `fn` on each replica, with inputs from `input_iterator`.
+
+    When eager execution is enabled, executes ops specified by `fn` on each
+    replica.  Otherwise, builds a graph to execute the ops on each replica.
+
+    Each replica will take a single, different input from the inputs provided by
+    one `get_next` call on the input iterator.
+
+    `fn` may call `tf.distribute.get_replica_context()` to access members such
+    as `replica_id_in_sync_group`.
+
+    IMPORTANT: Depending on the `DistributionStrategy` being used, and whether
+    eager execution is enabled, `fn` may be called one or more times (once for
+    each replica).
+
+    Args:
+      fn: function to run. The inputs to the function must match the outputs of
+        `input_iterator.get_next()`. The output must be a `tf.nest` of
+        `Tensor`s.
+      input_iterator: (Optional) input iterator from which the inputs are taken.
+
+    Returns:
+      Merged return value of `fn` across replicas. The structure of the return
+      value is the same as the return value from `fn`. Each element in the
+      structure can either be `PerReplica` (if the values are unsynchronized),
+      `Mirrored` (if the values are kept in sync), or `Tensor` (if running on a
+      single replica).
+    """
+    with self.scope():
+      if input_iterator is None:
+        return self._extended.call_for_each_replica(fn)
+      else:
+        inputs = input_iterator.get_next()
+        return self._extended.call_for_each_replica(fn, args=(inputs,))
+
   @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
   def broadcast(self, tensor, destinations=None):
     """DEPRECATED: use extended.broadcast_to() instead."""
@@ -565,8 +601,8 @@ class DistributionStrategy(object):
         variable created in `scope`.
 
     Returns:
-      A list of values contained in `value`. If `value` represents a single
-      value, this returns `[value].`
+      A tuple of values contained in `value`. If `value` represents a single
+      value, this returns `(value,).`
     """
     return self._extended._unwrap(value)  # pylint: disable=protected-access
 
@@ -1346,14 +1382,14 @@ class DistributionStrategyExtended(object):
 
   @property
   def worker_devices(self):
-    """Returns the list of devices used to run `call_for_each_replica()` calls.
+    """Returns the tuple of all devices used to for compute replica execution.
     """
     # TODO(josh11b): More docstring
     raise NotImplementedError("must be implemented in descendants")
 
   @property
   def parameter_devices(self):
-    """Returns the list of devices used for variable and `update` placement."""
+    """Returns the tuple of all devices used to place variables."""
     # TODO(josh11b): More docstring
     raise NotImplementedError("must be implemented in descendants")
 
@@ -1513,9 +1549,9 @@ class ReplicaContext(object):
 
   @property
   def devices(self):
-    """The devices this replica is to be executed on, as a list of strings."""
+    """The devices this replica is to be executed on, as a tuple of strings."""
     require_replica_context(self)
-    return [device_util.current()]
+    return (device_util.current(),)
 
   # TODO(josh11b): Implement `start_all_reduce(method, t)` for efficient
   # all-reduce. It would return a function returning the result of reducing `t`
@@ -1605,7 +1641,7 @@ class _DefaultDistributionExtended(DistributionStrategyExtended):
     return array_ops.identity(replica_local_var)
 
   def _unwrap(self, distributed_value):
-    return [distributed_value]
+    return (distributed_value,)
 
   def value_container(self, value):
     return value
diff --git a/tensorflow/python/distribute/estimator_training.py b/tensorflow/python/distribute/estimator_training.py
index 549fa8fb8aaaa047402f2bfedda9cb4c648fe861..7d5f231c37da41f10f945adc468f40ffd0ecc743 100644
--- a/tensorflow/python/distribute/estimator_training.py
+++ b/tensorflow/python/distribute/estimator_training.py
@@ -354,7 +354,7 @@ def estimator_evaluate(estimator, evaluate_distributed_fn, hooks):
   if (estimator._config._distribute_coordinator_mode !=
       dc.CoordinatorMode.STANDALONE_CLIENT):
     raise ValueError('Only `STANDALONE_CLIENT` mode is supported when you call '
-                     '`Estimator.train`')
+                     '`Estimator.evaluate`')
 
   if estimator._config._eval_distribute.extended.experimental_between_graph:
     # TODO(yuefengz): remove this limitation once we figure out how to merge
diff --git a/tensorflow/python/distribute/input_ops.py b/tensorflow/python/distribute/input_ops.py
index d7974942a144646a99bce77fb5a8e50e569e0b65..2ded209701e74afe45fc96d66fab65b3ae250596 100644
--- a/tensorflow/python/distribute/input_ops.py
+++ b/tensorflow/python/distribute/input_ops.py
@@ -84,7 +84,7 @@ def auto_shard_dataset(dataset, num_shards, index):
       elif hasattr(dataset, "_map_func"):
         # TODO(priyag): Make this check more robust by enforcing some common
         # property on all map/flatmap/interleave datasets.
-        map_func_def = dataset._map_func.definition
+        map_func_def = dataset._map_func.function.definition
         for node in map_func_def.node_def:
           if node.op in _READER_DATASET_OPS:
             found_reader_op = True
diff --git a/tensorflow/python/distribute/mirrored_strategy.py b/tensorflow/python/distribute/mirrored_strategy.py
index d6d40df5ce900ccd8731a8706cdc3a071de4f5bf..fb3cf844492d2cc796c2ee7dbfe7a2f0550cb249 100644
--- a/tensorflow/python/distribute/mirrored_strategy.py
+++ b/tensorflow/python/distribute/mirrored_strategy.py
@@ -43,18 +43,24 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.training import coordinator
 from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
 
 
 # TODO(josh11b): Replace asserts in this file with if ...: raise ...
 
 
 @contextlib.contextmanager
-def _enter_graph(g):
-  if context.executing_eagerly():
+def _enter_graph(g, eager, creator_stack=None):
+  """Context manager for selecting a graph and maybe eager mode."""
+  if eager:
     with g.as_default(), context.eager_mode():
+      if creator_stack is not None:
+        g._variable_creator_stack = creator_stack  # pylint: disable=protected-access
       yield
   else:
     with g.as_default():
+      if creator_stack is not None:
+        g._variable_creator_stack = creator_stack  # pylint: disable=protected-access
       yield
 
 
@@ -68,10 +74,9 @@ class _RequestedStop(Exception):  # pylint: disable=g-bad-exception-name
   pass
 
 
-# _call_for_each_replica and _reduce_non_distributed_value are not members of
-# MirroredStrategy so that they are generally not allowed to use anything
-# specific to MirroredStrategy and thus can be shared with other distribution
-# strategies.
+# _call_for_each_replica is not a member of MirroredStrategy so that it is
+# not allowed to use anything specific to MirroredStrategy and thus
+# can be shared with other distribution strategies.
 
 
 # TODO(yuefengz): maybe create a common class for those who need to call this
@@ -168,7 +173,12 @@ def _call_for_each_replica(distribution, fn, args, kwargs):
           # capture the name_scope from the first MRT and assume it is
           # the same for all other MRTs.
           mtt_captured_name_scope = threads[0].captured_name_scope
-          with ops.name_scope(mtt_captured_name_scope):
+          # Capture and merge the control dependencies from all the threads.
+          mtt_captured_control_deps = set()
+          for t in threads:
+            mtt_captured_control_deps.update(t.captured_control_deps)
+          with ops.name_scope(mtt_captured_name_scope),\
+              ops.control_dependencies(mtt_captured_control_deps):
             merge_result = threads[0].merge_fn(distribution, *merge_args,
                                                **merge_kwargs)
           for t in threads:
@@ -181,43 +191,6 @@ def _call_for_each_replica(distribution, fn, args, kwargs):
   return values.regroup({t.device: t.main_result for t in threads})
 
 
-def _reduce_non_distributed_value(extended, reduce_op, value, destinations):
-  """Reduce a non-DistributedValue `value` to `destinations`."""
-  if isinstance(value, values.DistributedValues):
-    raise ValueError("You are passing a `DistributedValue` to "
-                     "`_reduce_non_distributed_value`, which is not allowed.")
-
-  # If the same value is present on all replicas then the PerReplica value will
-  # be a single value. We also handle the case when `value` is a single value
-  # and equal to 0.
-  if value == 0:
-    return 0
-  # If there is only a single value and the reduce op is MEAN,
-  # that value should be on all destinations.
-  if reduce_op == reduce_util.ReduceOp.MEAN:
-    return value
-
-  cross_device_ops_lib.validate_destinations(destinations)
-  # We do not support a reduce op of SUM if the value is the same across
-  # all replicas. We call this as part of assign functions for MirroredVariables
-  # and summing up identical values across replicas is not clearly defined.
-  if (len(extended.worker_devices) != 1 or
-      not cross_device_ops_lib.check_destinations(destinations)):
-    raise ValueError("A non-DistributedValues value %s cannot be reduced with "
-                     "the given reduce op %s." % (value, reduce_op))
-  # TODO(anjalisridhar): Moves these methods to a device utility file?
-  devices = cross_device_ops_lib.get_devices_from(destinations)
-  if len(devices) == 1:
-    with ops.device(devices[0]):
-      return array_ops.identity(value)
-  else:
-    value_updates = {}
-    for d in devices:
-      with ops.device(d):
-        value_updates[d] = array_ops.identity(value)
-    return values.Mirrored(value_updates)
-
-
 def _create_mirrored_variable(devices, real_mirrored_creator, *args, **kwargs):  # pylint: disable=g-missing-docstring
   # Figure out what collections this variable should be added to.
   # We'll add the MirroredVariable to those collections instead.
@@ -296,6 +269,133 @@ def _create_mirrored_variable(devices, real_mirrored_creator, *args, **kwargs):
   return result
 
 
+def _is_device_list_local(devices):
+  """Checks whether the devices list is for local or multi-worker.
+
+  Args:
+    devices: a list of device strings, either local for remote devices.
+
+  Returns:
+    a boolean indicating whether these device strings are for local or for
+    remote.
+
+  Raises:
+    ValueError: if device strings are not consistent.
+  """
+  all_local = None
+  for d in devices:
+    d_spec = tf_device.DeviceSpec().parse_from_string(d)
+    is_local = d_spec.job in (None, "localhost")
+
+    if all_local is None:  # Determine all_local from first device.
+      all_local = is_local
+
+    if all_local:
+      if not is_local:
+        raise ValueError("Local device string cannot have job specified other "
+                         "than 'localhost'")
+    else:
+      if is_local:
+        raise ValueError("Remote device string must have job specified.")
+      if d_spec.task is None:
+        raise ValueError("Remote device string must have task specified.")
+  return all_local
+
+
+def _cluster_spec_to_device_list(cluster_spec, num_gpus_per_worker):
+  """Returns a device list given a cluster spec."""
+  cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
+  devices = []
+  for task_type in ("chief", "worker"):
+    for task_id in range(len(cluster_spec.as_dict().get(task_type, []))):
+      if num_gpus_per_worker is 0:
+        devices.append("/job:%s/task:%d" % (task_type, task_id))
+      else:
+        devices.extend([
+            "/job:%s/task:%d/device:GPU:%i" % (task_type, task_id, gpu_id)
+            for gpu_id in range(num_gpus_per_worker)
+        ])
+  return devices
+
+
+def _group_device_list(devices):
+  """Groups the devices list by task_type and task_id.
+
+  Args:
+    devices: a list of device strings for remote devices.
+
+  Returns:
+    a dict of list of device strings mapping from task_type to a list of devices
+    for the task_type in the asceding order of task_id.
+  """
+  assert not _is_device_list_local(devices)
+  device_dict = {}
+
+  for d in devices:
+    d_spec = tf_device.DeviceSpec().parse_from_string(d)
+
+    # Create an entry for the task_type.
+    if d_spec.job not in device_dict:
+      device_dict[d_spec.job] = []
+
+    # Fill the device list for task_type until it covers the task_id.
+    while len(device_dict[d_spec.job]) <= d_spec.task:
+      device_dict[d_spec.job].append([])
+
+    device_dict[d_spec.job][d_spec.task].append(d)
+
+  return device_dict
+
+
+def _infer_num_gpus_per_worker(devices):
+  """Infers the number of GPUs on each worker.
+
+  Currently to make multi-worker cross device ops work, we need all workers to
+  have the same number of GPUs.
+
+  Args:
+    devices: a list of device strings, can be either local devices or remote
+      devices.
+
+  Returns:
+    number of GPUs per worker.
+
+  Raises:
+    ValueError if workers have different number of GPUs or GPU indices are not
+    consecutive and starting from 0.
+  """
+  if _is_device_list_local(devices):
+    return len([d for d in devices if "GPU" in d.upper()])
+  else:
+    device_dict = _group_device_list(devices)
+    num_gpus = None
+    for _, devices_in_task in device_dict.items():
+      for device_in_task in devices_in_task:
+        if num_gpus is None:
+          num_gpus = len([d for d in device_in_task if "GPU" in d.upper()])
+
+        # Verify other workers have the same number of GPUs.
+        elif (
+            num_gpus != len([d for d in device_in_task if "GPU" in d.upper()])):
+          raise ValueError("All workers should have the same number of GPUs.")
+
+        for d in device_in_task:
+          d_spec = tf_device.DeviceSpec().parse_from_string(d)
+          if (d_spec.device_type.upper() == "GPU" and
+              d_spec.device_index >= num_gpus):
+            raise ValueError("Device_index on a worker should be consecutive "
+                             "and start from 0.")
+    return num_gpus
+
+
+def all_local_devices(num_gpus=None):
+  if num_gpus is None:
+    num_gpus = context.num_gpus()
+  return (tuple("/device:GPU:%d" % i for i in range(num_gpus)) or
+          ("/device:CPU:0",))
+
+
+@tf_export("distribute.MirroredStrategy")
 class MirroredStrategy(distribute_lib.DistributionStrategy):
   """Mirrors vars to distribute across multiple devices and machines.
 
@@ -306,86 +406,73 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
 
   Args:
     devices: a list of device strings.
-    num_gpus_per_worker: number of GPUs per worker.
     cross_device_ops: optional, a descedant of `CrossDeviceOps`. If this is not
       set, nccl will be use by default.
   """
 
-  def __init__(self,
-               devices=None,
-               num_gpus_per_worker=None,
-               cross_device_ops=None):
-    extended = MirroredExtended(self, devices, num_gpus_per_worker,
-                                cross_device_ops)
+  def __init__(self, devices=None, cross_device_ops=None):
+    extended = MirroredExtended(
+        self, devices=devices, cross_device_ops=cross_device_ops)
     super(MirroredStrategy, self).__init__(extended)
 
 
 class MirroredExtended(distribute_lib.DistributionStrategyExtended):
   """Implementation of MirroredStrategy."""
 
-  def __init__(self,
-               container_strategy,
-               devices=None,
-               num_gpus_per_worker=None,
-               cross_device_ops=None):
+  def __init__(self, container_strategy, devices=None, cross_device_ops=None):
     super(MirroredExtended, self).__init__(container_strategy)
+    if devices is None:
+      devices = all_local_devices()
+    if not devices:
+      raise ValueError("Got an empty `devices` list. Please make sure the "
+                       "`devices` you pass in is not empty.")
     self._cross_device_ops = cross_device_ops
-    # Remember num GPUs which might be needed by `configure` method.
-    self._num_gpus = num_gpus_per_worker
+    self._initialize_strategy(devices)
 
-    self._initialize_local(self._num_gpus, devices)
+  def _initialize_strategy(self, devices):
+    # The _initialize_strategy method is intended to be used by distribute
+    # coordinator as well.
+    if _is_device_list_local(devices):
+      self._initialize_local(devices)
+    else:
+      self._initialize_multi_worker(devices)
 
-  def _initialize_local(self, num_gpus, devices):
+  def _initialize_local(self, devices):
     """Initializes the object for local training."""
-    self._cluster_spec = None
-    # Convert `num_gpus` into `devices`, shouldn't specify both.
-    if devices is None:
-      if num_gpus is None:
-        num_gpus = context.num_gpus()
-      if num_gpus == 0:
-        devices = ["/device:CPU:0"]
-      else:
-        devices = ["/device:GPU:%d" % d for d in range(num_gpus)]
-    elif num_gpus is not None:
-      raise ValueError("Must only specify one of `devices` and `num_gpus`.")
-    self._num_gpus = num_gpus
-    # TODO(yuefengz): consider setting the default device.
-
+    self._local_mode = True
     assert devices, "Must specify at least one device."
     assert len(set(devices)) == len(devices), (
         "No duplicates allowed in `devices` argument.")
     # TODO(josh11b): Require at least 2 devices?
-    self._devices = [device_util.resolve(d) for d in devices]
+    self._devices = tuple(device_util.resolve(d) for d in devices)
     self._canonical_device_set = set(self._devices)
     self._device_index = values.PerReplica(
         {d: i for i, d in enumerate(devices)})
 
-  def _initialize_multi_worker(self, num_gpus, cluster_spec):
+    self._inferred_cross_device_ops = cross_device_ops_lib.choose_the_best(
+        devices)
+
+  def _initialize_multi_worker(self, devices):
     """Initializes the object for multi-worker training."""
-    cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
-    self._cluster_spec = cluster_spec
+    self._local_mode = False
+
+    assert devices, "Must specify at least one device."
+    assert len(set(devices)) == len(devices), (
+        "No duplicates allowed in `devices` argument.")
+    # TODO(josh11b): Require at least 2 devices?
+    self._devices = tuple(device_util.resolve(d) for d in devices)
+    self._canonical_device_set = set(self._devices)
+    self._device_index = values.PerReplica(
+        {d: i for i, d in enumerate(devices)})
 
+    device_dict = _group_device_list(devices)
     self._workers = []
+    self._worker_devices = []
     for job in ["chief", "worker"]:
-      for task in range(len(cluster_spec.as_dict().get(job, []))):
-        self._workers.append("/job:%s/task:%d" % (job, task))
-
-    if num_gpus is None:
-      raise ValueError("`num_gpus` is required if `cluster_spec` is given.")
-    if num_gpus > 0:
-      self._worker_devices = [
-          (worker, [
-              device_util.canonicalize(worker + "/device:GPU:%d" % gpu)
-              for gpu in range(num_gpus)
-          ]) for worker in self._workers
-      ]
-    else:
-      self._worker_devices = [
-          (worker, [device_util.canonicalize(worker, "/device:CPU:0")])
-          for worker in self._workers
-      ]
-
-    devices = nest.flatten([l for _, l in self._worker_devices])
+      for task in range(len(device_dict.get(job, []))):
+        worker = "/job:%s/task:%d" % (job, task)
+        self._workers.append(worker)
+        self._worker_devices.append((worker, device_dict[job][task]))
 
     # Setting `_default_device` will add a device scope in the
     # distribution.scope. We set the default device to the first worker. When
@@ -396,14 +483,8 @@ class MirroredExtended(distribute_lib.DistributionStrategyExtended):
     # "/job:worker/task:0/device:CPU:0". Note this is not used in replica mode.
     self._default_device = self._workers[0]
 
-    assert devices, "Must specify at least one device."
-    assert len(set(devices)) == len(devices), (
-        "No duplicates allowed in `devices` argument.")
-    # TODO(josh11b): Require at least 2 devices?
-    self._devices = [device_util.resolve(d) for d in devices]
-    self._canonical_device_set = set(self._devices)
-    self._device_index = values.PerReplica(
-        {d: i for i, d in enumerate(devices)})
+    self._inferred_cross_device_ops = cross_device_ops_lib.MultiWorkerAllReduce(
+        self._workers, _infer_num_gpus_per_worker(self._devices))
 
   def _create_variable(self, next_creator, *args, **kwargs):
     """Create a mirrored variable. See `DistributionStrategy.scope`."""
@@ -413,7 +494,7 @@ class MirroredExtended(distribute_lib.DistributionStrategyExtended):
     def _real_mirrored_creator(devices, *args, **kwargs):  # pylint: disable=g-missing-docstring
       index = {}
       for i, d in enumerate(devices):
-        with ops.device(d):
+        with ops.init_scope(), ops.device(d):
           if i > 0:
             # Give replicas meaningful distinct names:
             var0name = index[devices[0]].name.split(":")[0]
@@ -444,21 +525,22 @@ class MirroredExtended(distribute_lib.DistributionStrategyExtended):
                                      **kwargs)
 
   def _distribute_dataset(self, dataset_fn):
-    if self._cluster_spec:
+    if self._local_mode:
+      return values.PerReplicaDataset(
+          self._call_dataset_fn(dataset_fn), self._devices)
+    else:
       return values.MultiWorkerDataset(
           functools.partial(self._call_dataset_fn, dataset_fn),
           self._worker_devices,
           auto_shard=False)
-    else:
-      return values.PerReplicaDataset(
-          self._call_dataset_fn(dataset_fn), self._devices)
 
   def _make_dataset_iterator(self, dataset):
-    if self._cluster_spec:
-      worker_device_pairs = self._worker_devices
-    else:
+    if self._local_mode:
       worker = device_util.canonicalize("/device:CPU:0")
       worker_device_pairs = [(worker, self._devices)]
+    else:
+      worker_device_pairs = self._worker_devices
+
     return values.DatasetIterator(dataset, worker_device_pairs,
                                   self._num_replicas_in_sync)
 
@@ -467,13 +549,14 @@ class MirroredExtended(distribute_lib.DistributionStrategyExtended):
       input_fn,
       replication_mode=distribute_lib.InputReplicationMode.PER_WORKER):
     input_contexts = []
-    if self._cluster_spec:
-      num_workers = len(self._worker_devices)
-      worker_device_pairs = self._worker_devices
-    else:
+    if self._local_mode:
       num_workers = 1
       worker = device_util.canonicalize("/device:CPU:0")
       worker_device_pairs = [(worker, self._devices)]
+    else:
+      num_workers = len(self._worker_devices)
+      worker_device_pairs = self._worker_devices
+
     for i in range(num_workers):
       input_contexts.append(distribute_lib.InputContext(
           num_input_pipelines=num_workers,
@@ -568,23 +651,12 @@ class MirroredExtended(distribute_lib.DistributionStrategyExtended):
       session_config.CopyFrom(self._update_config_proto(session_config))
 
     if cluster_spec:
-      self._initialize_multi_worker(self._num_gpus, cluster_spec)
-
-    if self._cross_device_ops is None:
-      if self._cluster_spec:
-        # It currently cannot detect the toplogy of remote workers. So we
-        # hard-code the multi-worker all-reduce algorithm for now.
-        if len(self._workers) == 1:
-          # The default is "nccl".
-          self._cross_device_ops = (
-              cross_device_ops_lib.AllReduceCrossDeviceOps())
-        else:
-          # The default is hierarchical reduce and broadcast.
-          self._cross_device_ops = cross_device_ops_lib.MultiWorkerAllReduce(
-              self._workers, self._num_gpus)
-      else:
-        self._cross_device_ops = cross_device_ops_lib.choose_the_best(
-            self._devices, session_config=session_config)
+      # TODO(yuefengz): remove the following code once cluster_resolver is
+      # added.
+      num_gpus_per_worker = _infer_num_gpus_per_worker(self._devices)
+      multi_worker_devices = _cluster_spec_to_device_list(
+          cluster_spec, num_gpus_per_worker)
+      self._initialize_multi_worker(multi_worker_devices)
 
   def _update_config_proto(self, config_proto):
     updated_config = copy.deepcopy(config_proto)
@@ -592,20 +664,20 @@ class MirroredExtended(distribute_lib.DistributionStrategyExtended):
     return updated_config
 
   def _get_cross_device_ops(self):
-    if self._cross_device_ops is None:
-      self._cross_device_ops = (
-          cross_device_ops_lib.ReductionToOneDeviceCrossDeviceOps())
-    return self._cross_device_ops
+    return self._cross_device_ops or self._inferred_cross_device_ops
 
   def _reduce_to(self, reduce_op, value, destinations):
+    if (isinstance(value, values.Mirrored) and
+        reduce_op == reduce_util.ReduceOp.MEAN):
+      return value
     assert not isinstance(value, values.Mirrored)
     if not isinstance(value, values.DistributedValues):
       # This function handles reducing values that are not PerReplica or
       # Mirrored values. For example, the same value could be present on all
       # replicas in which case `value` would be a single value or value could
       # be 0.
-      return _reduce_non_distributed_value(self, reduce_op, value,
-                                           destinations)
+      return cross_device_ops_lib.reduce_non_distributed_value(
+          self, reduce_op, value, destinations)
     return self._get_cross_device_ops().reduce(
         reduce_op, value, destinations=destinations)
 
@@ -627,7 +699,7 @@ class MirroredExtended(distribute_lib.DistributionStrategyExtended):
     return values.update_regroup(self, updates, group)
 
   def _update_non_slot(self, colocate_with, fn, args, kwargs, group):
-    assert isinstance(colocate_with, list)
+    assert isinstance(colocate_with, tuple)
     # TODO(josh11b): In eager mode, use one thread per device.
     updates = {}
     for d in colocate_with:
@@ -648,9 +720,9 @@ class MirroredExtended(distribute_lib.DistributionStrategyExtended):
     if isinstance(val, values.DistributedValues):
       # Return in a deterministic order.
       if set(val.devices) == self._canonical_device_set:
-        return [val.get(device=d) for d in self._devices]
-      return [val.get(device=d) for d in sorted(val.devices)]
-    return [val]
+        return tuple(val.get(device=d) for d in self._devices)
+      return tuple(val.get(device=d) for d in sorted(val.devices))
+    return (val,)
 
   def value_container(self, val):
     return values.value_container(val)
@@ -661,12 +733,11 @@ class MirroredExtended(distribute_lib.DistributionStrategyExtended):
 
   @property
   def worker_devices(self):
-    # Make a copy to prevent users from accidentally mutating our copy.
-    return list(self._devices)
+    return self._devices
 
   @property
   def parameter_devices(self):
-    return list(self._devices)
+    return self._devices
 
   @property
   def experimental_between_graph(self):
@@ -686,7 +757,7 @@ class MirroredExtended(distribute_lib.DistributionStrategyExtended):
 
   def non_slot_devices(self, var_list):
     del var_list
-    return list(self._devices)
+    return tuple(self._devices)
 
   def _get_devices_from(self, colocate_with=None):
     if colocate_with is None:
@@ -735,14 +806,19 @@ class MirroredExtended(distribute_lib.DistributionStrategyExtended):
       self.has_paused = threading.Event()
       # These fields have to do with inheriting various contexts from the
       # parent thread:
+      ctx = context.context()
+      self.in_eager = ctx.executing_eagerly()
       # pylint: disable=protected-access
-      self.context_mode = context.context()._eager_context.mode
-      if not context.context()._context_handle:
-        context.context()._initialize_handle_and_devices()
+      if not ctx._context_handle:
+        ctx._initialize_handle_and_devices()
       self.context_device_policy = (
           pywrap_tensorflow.TFE_ContextGetDevicePlacementPolicy(
-              context.context()._context_handle))
+              ctx._context_handle))
       self.graph = ops.get_default_graph()
+      with ops.init_scope():
+        self._init_in_eager = context.executing_eagerly()
+        self._init_graph = ops.get_default_graph()
+
       self._variable_creator_stack = self.graph._variable_creator_stack[:]
       self._captured_var_scope = variable_scope.get_variable_scope()
       # Adding a "/" at end lets us re-enter this scope later.
@@ -756,16 +832,16 @@ class MirroredExtended(distribute_lib.DistributionStrategyExtended):
 
     def run(self):
       # pylint: disable=protected-access
-      self.graph._variable_creator_stack = self._variable_creator_stack
       self.should_run.wait()
       self.should_run.clear()
       try:
         if self.coord.should_stop():
           return
         with self.coord.stop_on_exception(), \
-            context.context()._mode(self.context_mode), \
+            _enter_graph(self._init_graph, self._init_in_eager), \
+            _enter_graph(self.graph, self.in_eager,
+                         self._variable_creator_stack), \
             context.context().device_policy(self.context_device_policy), \
-            _enter_graph(self.graph), \
             MirroredReplicaContext(self.distribution, constant_op.constant(
                 self.replica_id, dtypes.int32)), \
             ops.device(self.device), \
@@ -799,6 +875,8 @@ class MirroredReplicaContext(distribute_lib.ReplicaContext):
     # Adding a "/" at end lets us re-enter this scope later.
     if t.captured_name_scope:
       t.captured_name_scope += "/"
+
+    t.captured_control_deps = t.graph._current_control_dependencies()  # pylint: disable=protected-access
     t.has_paused.set()
     t.should_run.wait()
     t.should_run.clear()
diff --git a/tensorflow/python/distribute/values.py b/tensorflow/python/distribute/values.py
index f3f7676f04a014a4f947ee8ea81e4c77181f0514..a5918b7b731fabf61ae66ab982d0dfc7eb3906de 100644
--- a/tensorflow/python/distribute/values.py
+++ b/tensorflow/python/distribute/values.py
@@ -100,10 +100,21 @@ class DistributedValues(object):
   # DistributionStrategy implementations.
 
 
+# NOTE(josh11b,apassos): It would be great if we could inspect the values this was
+# initialized with and use that to generate the overloaded operators here.
+# Unfortunately, Python's rules for special methods don't allow this, see
+# https://docs.python.org/3/reference/datamodel.html#special-method-names
+# "if a class defines a method named __getitem__(), and x is an instance of
+# this class, then x[i] is roughly equivalent to type(x).__getitem__(x, i)."
+# In particular, these special methods don't go through __getattr__, and
+# it will only use those methods if they are defined in the class, not the
+# object.
 class DistributedDelegate(DistributedValues):
   """A map from device to values; acts as the same type as the values."""
 
   def __getattr__(self, name):
+    # TODO(priyag): This needs to be made robust against pitfalls from mix use
+    # __getattr__ and @property. See b/120402273.
     return getattr(self.get(), name)
 
   # pylint: disable=multiple-statements
@@ -559,6 +570,12 @@ class TPUMirroredVariable(checkpointable.CheckpointableBase):
       # See https://docs.python.org/3/library/constants.html#NotImplemented
       return NotImplemented
 
+  def __str__(self):
+    return "%s:%s" % (self.__class__.__name__, self._index)
+
+  def __repr__(self):
+    return "%s(%r)" % (self.__class__.__name__, self._index)
+
   @property
   def handle(self):
     # If we're in a tpu.rewrite(), return the replicated handle.
@@ -1160,7 +1177,7 @@ class PerReplicaDataset(object):
     # Eager mode prefetching would error out in constructor. Only remaining
     # case is non-prefetching in eager mode. We delegate to
     # PerReplicaDataIterator to handle that case.
-    dataset_iterator = self._dataset.make_one_shot_iterator()
+    dataset_iterator = dataset_ops.make_one_shot_iterator(self._dataset)
     return PerReplicaDataIterator(
         dataset_iterator, self._devices, prefetch_on_device=False)
 
@@ -1175,7 +1192,7 @@ class PerReplicaDataset(object):
       dataset_iterator = multi_device_iterator_ops.MultiDeviceIterator(
           self._dataset, self._devices)
     else:
-      dataset_iterator = self._dataset.make_initializable_iterator()
+      dataset_iterator = dataset_ops.make_initializable_iterator(self._dataset)
     return PerReplicaDataIterator(
         dataset_iterator,
         self._devices,
@@ -1293,14 +1310,15 @@ class MultiWorkerDataset(object):
     iterators = []
     for worker, dataset in self._datasets:
       with ops.device(worker):
-        iterators.append((worker, dataset.make_one_shot_iterator()))
+        iterators.append((worker, dataset_ops.make_one_shot_iterator(dataset)))
     return MultiWorkerDataIterator(iterators, self._worker_device_pairs)
 
   def make_initializable_iterator(self):
     iterators = []
     for worker, dataset in self._datasets:
       with ops.device(worker):
-        iterators.append((worker, dataset.make_initializable_iterator()))
+        iterators.append(
+            (worker, dataset_ops.make_initializable_iterator(dataset)))
     return MultiWorkerDataIterator(iterators, self._worker_device_pairs)
 
 
@@ -1512,7 +1530,7 @@ class _SingleWorkerDatasetIterator(object):
         # TODO(priyag): Measure the performance of this approach vs calling
         # get_next on the original dataset N times.
         dataset = self._dataset.batch(len(self._devices), drop_remainder=True)
-        iterator = dataset.make_one_shot_iterator()
+        iterator = dataset_ops.make_one_shot_iterator(dataset)
       else:
         iterator = multi_device_iterator_ops.MultiDeviceIterator(
             self._dataset, self._devices)
diff --git a/tensorflow/python/eager/BUILD b/tensorflow/python/eager/BUILD
index 5a18afa0fcd7e0d9d0bf9bc1db6ac44979eeab01..f43cf9327a1ad6b2b83ebcb2482ad3fc27515251 100644
--- a/tensorflow/python/eager/BUILD
+++ b/tensorflow/python/eager/BUILD
@@ -255,6 +255,18 @@ py_library(
     ],
 )
 
+py_test(
+    name = "execution_callbacks_test",
+    srcs = ["execution_callbacks_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":execution_callbacks",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+    ],
+)
+
 py_library(
     name = "graph_only_ops",
     srcs = ["graph_only_ops.py"],
diff --git a/tensorflow/python/eager/backprop.py b/tensorflow/python/eager/backprop.py
index 29f9b2cda3aa2c6e7fff6c6df10fed81779d02c7..481f680f567c92fe67d92b80c423ff8a8b5ec642 100644
--- a/tensorflow/python/eager/backprop.py
+++ b/tensorflow/python/eager/backprop.py
@@ -1104,8 +1104,13 @@ class GradientTape(object):
         dimension of `target` and `source` do not match.
     """
     target_shape = target.shape
-    if not target_shape.with_rank_at_least(2)[0].is_compatible_with(
-        source.shape.with_rank_at_least(2)[0]):
+    if target_shape.rank is None:
+      dim = Dimension(None)
+    else:
+      dim = target_shape.dims[0]
+    if not (target_shape.with_rank_at_least(2) and
+            source.shape.with_rank_at_least(2) and
+            dim.is_compatible_with(source.shape[0])):
       raise ValueError(
           "Need first dimension of target shape (%s) and "
           "source shape (%s) to match." % (target.shape, source.shape))
diff --git a/tensorflow/python/eager/backprop_test.py b/tensorflow/python/eager/backprop_test.py
index 3cec40a48f739fa032f400f76c89db5ef9d4229d..477d18e214133b8f7eaa60f472adf67b8961411d 100644
--- a/tensorflow/python/eager/backprop_test.py
+++ b/tensorflow/python/eager/backprop_test.py
@@ -648,6 +648,7 @@ class BackpropTest(test.TestCase):
       g.gradient(x, y)
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_v1_only('b/120545219')
   def testGradientTapeWithCond(self):
     x = constant_op.constant(3.0)
 
@@ -669,6 +670,7 @@ class BackpropTest(test.TestCase):
       self.assertEqual(self.evaluate(dy), 6.0)
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_v1_only('b/120545219')
   def testGradientTapeWithWhileLoop(self):
     i = constant_op.constant(1)
     x = constant_op.constant(2.)
@@ -704,6 +706,7 @@ class BackpropTest(test.TestCase):
 
   @test_util.assert_no_new_tensors
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_v1_only('b/120545219')
   def testPersistentTape(self):
     with backprop.GradientTape(persistent=True) as g:
       x = constant_op.constant(3.0)
@@ -1243,16 +1246,19 @@ class JacobianTest(test.TestCase):
     answer = [array_ops.diag(2 * x * y), array_ops.diag(x * x)]
     return jacobian, answer
 
+  @test_util.run_v1_only('b/120545219')
   def testPfor(self):
     jacobian, answer = self._jacobian(experimental_use_pfor=True)
     for j, a in zip(jacobian, answer):
       self.assertAllEqual(a, j)
 
+  @test_util.run_v1_only('b/120545219')
   def testWhileLoop(self):
     jacobian, answer = self._jacobian(experimental_use_pfor=False)
     for j, a in zip(jacobian, answer):
       self.assertAllEqual(a, j)
 
+  @test_util.run_v1_only('b/120545219')
   def testPforDefun(self):
 
     @function.defun
@@ -1263,6 +1269,7 @@ class JacobianTest(test.TestCase):
     for j, a in zip(jacobian, answer):
       self.assertAllEqual(a, j)
 
+  @test_util.run_v1_only('b/120545219')
   def testWhileLoopDefun(self):
 
     @function.defun
@@ -1273,6 +1280,7 @@ class JacobianTest(test.TestCase):
     for j, a in zip(jacobian, answer):
       self.assertAllEqual(a, j)
 
+  @test_util.run_v1_only('b/120545219')
   def testPersistentTape(self):
     if not context.executing_eagerly():
       return
@@ -1283,6 +1291,7 @@ class JacobianTest(test.TestCase):
     with self.assertRaisesRegexp(RuntimeError, 'persistent'):
       g.jacobian(y, x, experimental_use_pfor=False)
 
+  @test_util.run_v1_only('b/120545219')
   def testPforException(self):
     var = variables.Variable([1.])
 
@@ -1303,6 +1312,7 @@ class JacobianTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, 'No converter'):
       g.jacobian(y, x, experimental_use_pfor=True)
 
+  @test_util.run_v1_only('b/120545219')
   def test_parallel_iterations(self):
     with backprop.GradientTape(persistent=True) as g:
       x = constant_op.constant([[1., 2], [3, 4]])
diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py
index 2f6b038dda945f20fa610a94e02b0dfb59dcab25..848b300ebaee249245554d8b1d5cb405ae1400a1 100644
--- a/tensorflow/python/eager/context.py
+++ b/tensorflow/python/eager/context.py
@@ -478,10 +478,6 @@ class Context(object):
     Raises:
       ValueError: If name is not a string or is an invalid device name.
     """
-    devices = self._context_devices
-    if devices is None:
-      self._initialize_handle_and_devices()
-      devices = self._context_devices
     eager_context = self._eager_context
     old_device_name = eager_context.device_name
     old_device_spec = eager_context.device_spec
@@ -502,7 +498,9 @@ class Context(object):
         if old_device_name:
           new_device_spec = copy.copy(old_device_spec)
         else:
-          new_device_spec = pydev.DeviceSpec.from_string(devices[0])
+          self._initialize_handle_and_devices()
+          new_device_spec = pydev.DeviceSpec.from_string(
+              self._context_devices[0])
         new_device_spec.merge_from(device_spec)
       else:
         new_device_spec = pydev.DeviceSpec.from_string("")
@@ -925,6 +923,10 @@ def add_function(fdef):
 # but they do all import this file.  Note that IS_IN_GRAPH_MODE and
 # in_graph_mode are both parameterless functions.
 def _tmp_in_graph_mode():
+  if context_safe() is None:
+    # Context not yet initialized. Assume graph mode following the
+    # default implementation in `is_in_graph_mode`.
+    return True
   return not executing_eagerly()
 
 
diff --git a/tensorflow/python/eager/def_function.py b/tensorflow/python/eager/def_function.py
index 6bacd7a962fdefb8caf11189b0681694d23b97f0..5e7e866fd8bd399591b32fd4e10dcde28cd2f412 100644
--- a/tensorflow/python/eager/def_function.py
+++ b/tensorflow/python/eager/def_function.py
@@ -242,16 +242,20 @@ class PolymorphicFunction(object):
       raise NotImplementedError()
     self._created_variables = None
     self._stateful_fn = None
+    self._stateless_fn = None
     self._descriptor_cache = weakref.WeakKeyDictionary()
     self._name = name
 
   def _defun_with_scope(self, scope):
     """Creates a defun wrapped inside a variable creator scope."""
 
+    weak_wrapped_fn = None
     def wrapped_fn(*args, **kwds):
       with variable_scope.variable_creator_scope(scope):
-        # __wrapped__ allows AutoGraph to swap in a converted function.
-        return wrapped_fn.__wrapped__(*args, **kwds)
+        # __wrapped__ allows AutoGraph to swap in a converted function. We give
+        # the function a weak reference to itself to avoid a reference cycle.
+        return weak_wrapped_fn().__wrapped__(*args, **kwds)
+    weak_wrapped_fn = weakref.ref(wrapped_fn)
 
     # TODO(mdan): Pipe self._experimental_autograph_options through.
     return function_lib.defun(
@@ -262,21 +266,22 @@ class PolymorphicFunction(object):
   def _initialize(self, args, kwds, add_initializers_to=None):
     """Initializes, on the first call."""
 
-    self._created_variables = []
+    created_variables = []
 
     def variable_capturing_scope(unused_next_creator, **kwds):
       """Creates UnliftedInitializerVariables and saves references to them."""
       v = UnliftedInitializerVariable(
           add_initializers_to=add_initializers_to, **kwds)
-      self._created_variables.append(weakref.ref(v))
+      created_variables.append(weakref.ref(v))
       return v
 
+    self._created_variables = created_variables
     self._stateful_fn = self._defun_with_scope(variable_capturing_scope)
     self._stateful_fn._name = self._name  # pylint: disable=protected-access
-
     # Force the definition of the function for these arguments
     self._concrete_stateful_fn = (
-        self._stateful_fn._get_concrete_function_internal(*args, **kwds))  # pylint: disable=protected-access
+        self._stateful_fn._get_concrete_function_internal_garbage_collected(  # pylint: disable=protected-access
+            *args, **kwds))
 
     def invalid_creator_scope(*unused_args, **unused_kwds):
       """Disables variable creation."""
@@ -342,6 +347,10 @@ class PolymorphicFunction(object):
     """The python function wrapped in this tf.function."""
     return self._python_function
 
+  @property
+  def input_signature(self):
+    return self._input_signature
+
   def get_initialization_function(self, *args, **kwargs):
     """Returns a `Function` object which initializes this function's variables.
 
@@ -378,6 +387,26 @@ class PolymorphicFunction(object):
 
     return initialize_variables.get_concrete_function()
 
+  @property
+  def _cached_input_signatures(self):
+    """All input signatures used to call this PolymorphicFunction."""
+    seen = set()
+    # Preserves signature ordering rather than returning a set() so that we
+    # don't need to re-sort signatures later to work around Python 2's set
+    # nondeterminism.
+    # pylint: disable=protected-access
+    concrete_functions = []
+    if self._stateful_fn:
+      concrete_functions.extend(self._stateful_fn._function_cache.values())
+    if self._stateless_fn:
+      concrete_functions.extend(self._stateless_fn._function_cache.values())
+    for concrete_function in concrete_functions:
+      signature = concrete_function._python_call_signature
+      if signature not in seen:
+        yield signature
+        seen.add(signature)
+    # pylint: enable=protected-access
+
   def get_concrete_function(self, *args, **kwargs):
     """Returns a `Function` object specialized to inputs and execution context.
 
@@ -528,12 +557,33 @@ def function(func=None,
   assert f(x, y).numpy() == g(x, y).numpy()
 
   # Tensors and tf.Variables used by the Python function are captured in the
-  # traced graph.
+  # graph.
   @tf.function
   def h():
     return f(x, y)
 
   assert (h().numpy() == f(x, y).numpy()).all()
+
+  # Data-dependent control flow is also captured in the graph. Supported
+  # control flow statements include `if`, `for`, `break`, `continue`, `return`.
+  @tf.function
+  def g(x):
+    if tf.reduce_sum(x) > 0:
+      return x * x
+    else:
+      return -x // 2
+
+  # print and TensorFlow side effects are supported, but exercise caution when
+  # using Python side effects like mutating objects, saving to files, etc.
+  l = []
+
+  @tf.function
+  def g(x):
+    for i in x:
+      print(i)                              # Works
+      tf.assign(v, i)                       # Works
+      tf.py_func(lambda i: l.append(i))(i)  # Works
+      l.append(i)                           # Caution! Doesn't work.
   ```
 
   _Referencing `tf.Variable`s_
@@ -605,6 +655,7 @@ def function(func=None,
   ```
 
   _Input Signatures_
+
   `function` instantiates a separate graph for every unique set of input
   shapes and datatypes. For example, the following code snippet will result
   in three distinct graphs being traced, as each input has a different
@@ -638,9 +689,15 @@ def function(func=None,
   When an `input_signature` is specified, the callable will only accept `Tensor`
   (or NumPy `ndarray`) objects as arguments.
 
-  _Tracing_
-  Note that `function` only traces TensorFlow operations, all the other
-  Python code that `func` executes will shape the _construction_ of the graph.
+  _Tracing and staging_
+
+  When `autograph` is `True`, all Python code that depends on `Tensor` values is
+  staged into a TensorFlow graph. When `autograph` is `False`, the function is
+  traced and control flow is not allowed to depend on data.
+
+  Note that `function` only stages TensorFlow operations, all Python code that
+  `func` executes and does not depend on data will shape the _construction_ of
+  the graph.
   For example, consider the following:
 
   ```python
@@ -653,21 +710,26 @@ def function(func=None,
   ```
 
   `add_noise()` will return a different output every time it is invoked.
-  However, `traced` will return the same value every time it is called, since a
-  particular random value generated by the `np.random.randn` call will be
-  inserted in the traced TensorFlow graph as a constant. In this particular
-  example, replacing `np.random.randn(5, 5)` with `tf.random_normal((5, 5))`
-  will result in the same behavior for `add_noise()` and `traced()`.
+  However, `add_noise` will return the same value every time it is called,
+  since a particular random value generated by the `np.random.randn` call will
+  be inserted in the traced/staged TensorFlow graph as a constant. In this
+  particular example, replacing `np.random.randn(5, 5)` with
+  `tf.random_normal((5, 5))` will result in the same behavior for `add_noise()`
+  and `traced()`.
 
   _Python Side-Effects_
+
   A corollary of the previous discussion on tracing is the following: If a
   Python function `func` has Python side-effects, then executing `func` multiple
-  times
-  may not be semantically equivalent to executing `F = tf.function(func)`
+  times may not be semantically equivalent to executing `F = tf.function(func)`
   multiple times; this difference is due to the fact that `function` only
   captures the subgraph of TensorFlow operations that is constructed when `func`
   is invoked to trace a graph.
 
+  The same is true if code with Python side effects is used inside control flow,
+  such as a loop. If your code uses side effects that are not intended to
+  control graph construction, wrap them inside `tf.py_func`.
+
   Args:
     func: function to be compiled. If `func` is None, returns a decorator that
       can be invoked with a single argument - `func`. The end result is
diff --git a/tensorflow/python/eager/def_function_test.py b/tensorflow/python/eager/def_function_test.py
index 4100a10044c3c39763de8bb3eec645e278d94e19..77cc8ee981a176f9f57028832039fa9bfe1f47a1 100644
--- a/tensorflow/python/eager/def_function_test.py
+++ b/tensorflow/python/eager/def_function_test.py
@@ -18,6 +18,7 @@ from __future__ import division
 from __future__ import print_function
 
 import functools
+import weakref
 
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import def_function
@@ -25,6 +26,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_spec
+from tensorflow.python.framework import test_util
 from tensorflow.python.keras.engine import training
 from tensorflow.python.keras.layers import core
 from tensorflow.python.ops import math_ops
@@ -53,6 +55,13 @@ class _ModelWithOptimizer(training.Model):
     return {'loss': loss}
 
 
+class _HasDecoratedMethod(object):
+
+  @def_function.function
+  def f(self, x):
+    return x * 3.
+
+
 class DefFunctionTest(test.TestCase):
 
   def testNoVariables(self):
@@ -238,6 +247,47 @@ class DefFunctionTest(test.TestCase):
     concrete = compute.get_concrete_function(
         tensor_spec.TensorSpec(None, dtypes.float32))
     self.assertAllClose(4., concrete(constant_op.constant(2.)))
+    input_signature, = compute._cached_input_signatures
+    self.assertEqual(
+        tuple(input_signature),
+        (tensor_spec.TensorSpec(None, dtypes.float32),))
+
+  def test_serialization_signature_cache(self):
+
+    @def_function.function
+    def f(x, y):
+      return x, y
+
+    f(constant_op.constant([[3., 4.]]), constant_op.constant([2.]))
+    f(constant_op.constant([[3, 4, 5]]), constant_op.constant([2]))
+    self.assertEqual(
+        set(f._cached_input_signatures),
+        set(((tensor_spec.TensorSpec([1, 2], dtypes.float32),
+              tensor_spec.TensorSpec([1], dtypes.float32)),
+             (tensor_spec.TensorSpec([1, 3], dtypes.int32),
+              tensor_spec.TensorSpec([1], dtypes.int32)))))
+
+  @test_util.assert_no_garbage_created
+  def testFunctionReferenceCycles(self):
+    fn = def_function.function(lambda x: 2. * x)
+    fn(constant_op.constant(4.0))
+    weak_fn = weakref.ref(fn)
+    del fn
+    # Tests that the weak reference we made to the function is now dead, which
+    # means the object has been deleted. This should be true as long as the
+    # function itself is not involved in a reference cycle.
+    self.assertIs(None, weak_fn())
+
+  @test_util.assert_no_garbage_created
+  def testMethodReferenceCycles(self):
+    has_decorated_method = _HasDecoratedMethod()
+    has_decorated_method.f(constant_op.constant(5.))
+    weak_fn = weakref.ref(has_decorated_method.f)
+    del has_decorated_method
+    # Tests that the weak reference we made to the function is now dead, which
+    # means the object has been deleted. This should be true as long as the
+    # function itself is not involved in a reference cycle.
+    self.assertIs(None, weak_fn())
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/eager/execute.py b/tensorflow/python/eager/execute.py
index 6f8c780170cc8e3bfe5aa23603c0448e70b5e49c..7415a0ae22a3492fc9179a0cae37d09e9c1ad9aa 100644
--- a/tensorflow/python/eager/execute.py
+++ b/tensorflow/python/eager/execute.py
@@ -66,12 +66,6 @@ def quick_execute(op_name, num_outputs, inputs, attrs, ctx, name=None):
     six.raise_from(core._status_to_exception(e.code, message), None)
   except TypeError as e:
     if any(ops._is_keras_symbolic_tensor(x) for x in inputs):
-      if any(isinstance(x, ops.EagerTensor) for x in inputs):
-        raise TypeError("You are attempting to mix computation of symbolic "
-                        "Tensors (computation rooted at tf.keras.Input()) "
-                        "and concrete values. This is not supported. "
-                        "If you need this support, file an issue on the "
-                        "TensorFlow GitHub repository.")
       raise core._SymbolicException
     raise e
   # pylint: enable=protected-access
diff --git a/tensorflow/python/eager/execution_callbacks.py b/tensorflow/python/eager/execution_callbacks.py
index 80ff4459d60a33d1a02f14acaafb8370a48fb6ca..af1afa3454655df233d8530bb89ae31c840de052 100644
--- a/tensorflow/python/eager/execution_callbacks.py
+++ b/tensorflow/python/eager/execution_callbacks.py
@@ -18,7 +18,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import contextlib
 import functools
+import enum  # pylint: disable=g-bad-import-order
 
 import numpy as np
 
@@ -28,8 +30,25 @@ from tensorflow.python.eager import core
 from tensorflow.python.eager import execute
 from tensorflow.python.platform import tf_logging as logging
 
-_DEFAULT_CALLBACK_ACTION = "raise"
-_VALID_CALLBACK_ACTIONS = (None, "ignore", "print", "raise", "warn")
+
+class ExecutionCallback(enum.Enum):
+  """Valid callback actions.
+
+  These can be passed to `seterr` or `errstate` to create callbacks when
+  specific events occur (e.g. an operation produces `NaN`s).
+
+  IGNORE: take no action.
+  PRINT:  print a warning to `stdout`.
+  RAISE:  raise an error (e.g. `InfOrNanError`).
+  WARN:   print a warning using `tf.logging.warn`.
+  """
+
+  IGNORE = "ignore"
+  PRINT = "print"
+  RAISE = "raise"
+  WARN = "warn"
+
+_DEFAULT_CALLBACK_ACTION = ExecutionCallback.RAISE
 
 
 # TODO(cais): Consider moving this exception class to errors_impl.py.
@@ -133,11 +152,8 @@ def inf_nan_callback(op_type,
       the output tensor values.
     check_nan: (`bool`) Whether this callback should check for `nan` values in
       the output tensor values.
-    action: (`str`) Action to be taken by the callback when `inf` or `nan`
-      values are detected. Possible values {"raise", "warn", "print"}
-      `"raise"`: Raise a `InfOrNanError`.
-      `"warn"`: Log a warning using `tf.logging.warn`.
-      `"print"`: Print a message to `sys.stdout`.
+    action: (`ExecutionCallback`) Action to be taken by the callback when
+      `inf` or `nan` values are detected.
 
   Raises:
     InfOrNanError: iff `inf` or `nan` values are seen in any of `outputs` and
@@ -146,6 +162,7 @@ def inf_nan_callback(op_type,
   """
   del attrs, inputs  # Not used.
 
+  action = ExecutionCallback(action)
   ctx = context.context()
 
   for index, output in enumerate(outputs):
@@ -174,16 +191,16 @@ def inf_nan_callback(op_type,
           continue
 
         error = InfOrNanError(op_type, op_name, index, len(outputs), value)
-        if action == "print":
+        if action == ExecutionCallback.PRINT:
           print("Warning: %s" % str(error))
-        elif action == "warn":
+        elif action == ExecutionCallback.WARN:
           logging.warn(str(error))
-        elif action == "raise":
+        elif action == ExecutionCallback.RAISE:
           raise error
         else:
           raise ValueError(
               "Invalid action for inf_nan_callback: %s. Valid actions are: "
-              "{print | warn | raise}" % action)
+              "{PRINT | WARN | RAISE}" % action)
 
 
 def inf_callback(op_type,
@@ -276,7 +293,7 @@ def seterr(inf_or_nan=None):
 
   Example:
   ```python
-  tfe.seterr(inf_or_nan="raise")
+  tfe.seterr(inf_or_nan=ExecutionCallback.RAISE)
   a = tf.constant(10.0)
   b = tf.constant(0.0)
   try:
@@ -284,18 +301,14 @@ def seterr(inf_or_nan=None):
   except Exception as e:
     print("Caught Exception: %s" % e)
 
-  tfe.seterr(inf_or_nan="ignore")
+  tfe.seterr(inf_or_nan=ExecutionCallback.IGNORE)
   c = a / b  # <-- Does NOT raise exception anymore.
   ```
 
   Args:
-    inf_or_nan: Set action for infinity (`inf`) and NaN (`nan`) values.
-      Possible values: `{"ignore", "print", "raise", "warn"}`.
-      `"ignore"`: take no action when `inf` values appear.
-      `"print"`: print a warning to `stdout`.
-      `"raise"`: raise an `InfOrNanError`.
-      `"warn"`: print a warning using `tf.logging.warn`.
-      A value of `None` leads to no change in the action of the condition.
+    inf_or_nan: An `ExecutionCallback` determining the action for infinity
+      (`inf`) and NaN (`nan`) values. A value of `None` leads to no change in
+      the action of the condition.
 
   Returns:
     A dictionary of old actions.
@@ -303,12 +316,8 @@ def seterr(inf_or_nan=None):
   Raises:
     ValueError: If the value of any keyword arguments is invalid.
   """
-  if inf_or_nan not in _VALID_CALLBACK_ACTIONS:
-    raise ValueError(
-        "Invalid action value for inf_or_nan: %s. "
-        "Valid actions are %s." % (inf_or_nan, _VALID_CALLBACK_ACTIONS))
-
-  old_settings = {"inf_or_nan": "ignore"}
+  inf_or_nan = ExecutionCallback(inf_or_nan) if inf_or_nan is not None else None
+  old_settings = {"inf_or_nan": ExecutionCallback.IGNORE}
   default_context = context.context()
 
   carryover_callbacks = []
@@ -330,8 +339,39 @@ def seterr(inf_or_nan=None):
     default_context.clear_post_execution_callbacks()
     for callback in carryover_callbacks:
       default_context.add_post_execution_callback(callback)
-    if inf_or_nan != "ignore":
+    if inf_or_nan != ExecutionCallback.IGNORE:
       default_context.add_post_execution_callback(
           functools.partial(inf_nan_callback, action=inf_or_nan))
 
   return old_settings
+
+
+@contextlib.contextmanager
+def errstate(inf_or_nan=None):
+  """Context manager setting error state.
+
+  Example:
+  ```
+  c = tf.log(0.)  # -inf
+
+  with errstate(inf_or_nan=ExecutionCallback.RAISE):
+    tf.log(0.)  # <-- Raises InfOrNanError.
+  ```
+
+  Args:
+    inf_or_nan: An `ExecutionCallback` determining the action for infinity
+      (`inf`) and NaN (`nan`) values. A value of `None` leads to no change in
+      the action of the condition.
+
+  Yields:
+    None.
+
+  Raises:
+    ValueError: If the value of any keyword arguments is invalid.
+  """
+  if not context.executing_eagerly():
+    yield
+  else:
+    old_settings = seterr(inf_or_nan=inf_or_nan)
+    yield
+    seterr(**old_settings)
diff --git a/tensorflow/python/eager/execution_callbacks_test.py b/tensorflow/python/eager/execution_callbacks_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..b8b786ad2eeff5513ab0c6b2072d7b91975ee1f4
--- /dev/null
+++ b/tensorflow/python/eager/execution_callbacks_test.py
@@ -0,0 +1,58 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for eager execution_callbacks."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.eager import execution_callbacks
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+RAISE = execution_callbacks.ExecutionCallback.RAISE
+IGNORE = execution_callbacks.ExecutionCallback.IGNORE
+
+
+def log_zero():
+  """Computes `log(0.0)`."""
+  return math_ops.log(constant_op.constant(0.))
+
+
+class ExecutionCallbacksTest(test.TestCase):
+
+  def test_errstate_inf_raise(self):
+    with execution_callbacks.errstate(inf_or_nan=RAISE):
+      with self.assertRaises(execution_callbacks.InfOrNanError):
+        log_zero()
+
+  def test_errstate_inf_ignore(self):
+    with execution_callbacks.errstate(inf_or_nan=IGNORE):
+      self.assertEqual(-float("inf"), log_zero().numpy())
+
+  def test_errstate_nesting(self):
+    with execution_callbacks.errstate(inf_or_nan=RAISE):
+      with execution_callbacks.errstate(inf_or_nan=IGNORE):
+        self.assertEqual(-float("inf"), log_zero().numpy())
+
+      with self.assertRaises(execution_callbacks.InfOrNanError):
+        log_zero()
+
+
+if __name__ == "__main__":
+  ops.enable_eager_execution()
+  test.main()
diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index 9d05a660b1f5ebefce54626fcf98eaebff073d49..885403dd10ca5c2e5b63acea14c95550d0c18e6d 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -50,6 +50,7 @@ from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import compat
+from tensorflow.python.util import memory
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_inspect
@@ -149,10 +150,9 @@ class _EagerDefinedFunction(object):
       outputs: the tensors in the graph which will be outputs to the function
       attrs: dict mapping names of attributes to their AttrValue values
     """
-    operations = [
-        op for op in graph.get_operations()
-        if op not in set(arg.op for arg in inputs)
-    ]
+    input_ops = set(arg.op for arg in inputs)
+    operations = [op for op in graph.get_operations() if op not in input_ops]
+
     fn = pywrap_tensorflow.TF_GraphToFunction_wrapper(
         graph._c_graph,  # pylint: disable=protected-access
         compat.as_str(name),
@@ -340,7 +340,7 @@ class Function(object):
       TypeError: For invalid positional/keyword argument combinations.
     """
     if self._arg_keywords is None or self._num_positional_args is None:
-      if self._signature:
+      if self._signature is not None:
         if kwargs:
           raise NotImplementedError(
               "Keyword arguments not supported when calling a "
@@ -748,6 +748,19 @@ class Function(object):
     return ret
 
 
+class UnknownArgument(object):
+  """Signifies an argument which is not currently handled."""
+  pass
+
+
+def _encode_arg_for_serialization(arg):
+  """A representation for this argument, for serializing signatures."""
+  if isinstance(arg, ops.Tensor):
+    return tensor_spec.TensorSpec(arg.shape, arg.dtype)
+  else:
+    return UnknownArgument()
+
+
 pywrap_tensorflow.RegisterType("Tensor", ops.Tensor)
 pywrap_tensorflow.RegisterType("IndexedSlices", ops.IndexedSlices)
 
@@ -804,6 +817,8 @@ class PolymorphicFunction(object):
     self._name = name
     self._autograph = autograph
     self._function_cache = collections.OrderedDict()
+    self._garbage_collector = _PolymorphicFunctionGarbageCollector(
+        self._function_cache)
     self._function_attributes = attributes or {}
 
     self._lock = threading.Lock()
@@ -857,13 +872,24 @@ class PolymorphicFunction(object):
     """Returns the wrapped Python function."""
     return self._python_function
 
-  def _get_concrete_function_internal(self, *args, **kwargs):
-    """Bypasses error checking when getting a graph function."""
+  def _get_concrete_function_internal_garbage_collected(self, *args, **kwargs):
+    """Returns a concrete function which cleans up its graph function."""
     if self._input_signature:
       args, kwargs = None, None
     graph_function, _, _ = self._maybe_define_function(args, kwargs)
     return graph_function
 
+  def _get_concrete_function_internal(self, *args, **kwargs):
+    """Bypasses error checking when getting a graph function."""
+    graph_function = self._get_concrete_function_internal_garbage_collected(
+        *args, **kwargs)
+    # We're returning this concrete function to someone, and they may keep a
+    # reference to the FuncGraph without keeping a reference to the Function
+    # object. So we won't clean up the reference cycles manually and instead
+    # will leave them to Python's garbage collector.
+    graph_function._garbage_collector.release()  # pylint: disable=protected-access
+    return graph_function
+
   def get_concrete_function(self, *args, **kwargs):
     """Returns a `Function` object specialized to inputs and execution context.
 
@@ -982,11 +1008,17 @@ class PolymorphicFunction(object):
       input_signature = self._flat_input_signature
 
     ctx = context.context()
-    with ops.init_scope():
-      # The graph, or whether we're executing eagerly, should be a part of the
-      # cache key so we don't improperly capture tensors such as variables.
-      executing_eagerly = ctx.executing_eagerly()
-      parent_graph = None if executing_eagerly else ops.get_default_graph()
+
+    # Don't need to open an init_scope if the _cache_key call is in eager mode
+    # already.
+    executing_eagerly = ctx.executing_eagerly()
+    parent_graph = None
+    if not executing_eagerly:
+      with ops.init_scope():
+        # The graph, or whether we're executing eagerly, should be a part of the
+        # cache key so we don't improperly capture tensors such as variables.
+        executing_eagerly = ctx.executing_eagerly()
+        parent_graph = None if executing_eagerly else ops.get_default_graph()
 
     # pylint: disable=protected-access
     default_graph = ops.get_default_graph()
@@ -1157,6 +1189,22 @@ class PolymorphicFunction(object):
                 autograph=self._autograph,
                 arg_names=arg_names),
             self._function_attributes)
+        if self._input_signature:
+          python_call_signature = self._input_signature
+        else:
+          python_call_signature = tuple(
+              _encode_arg_for_serialization(arg) for arg in args)
+        # pylint: disable=protected-access
+        # Save information about non-Tensor arguments with the concrete
+        # function. Used to serialize PolymorphicFunctions.
+        graph_function._python_call_signature = python_call_signature
+        # Tell the Function to clean up its graph once it goes out of
+        # scope. Function does not do this in its constructor since it gets used
+        # in some places (like Keras) where the FuncGraph lives longer than the
+        # Function.
+        graph_function._garbage_collector = _FunctionGarbageCollector(
+            graph_function.graph)
+        # pylint: enable=protected-access
         self._function_cache[cache_key] = graph_function
       return graph_function, args, kwargs
 
@@ -1197,19 +1245,18 @@ def validate_signature(signature):
 def defun(func=None, input_signature=None, autograph=True):
   """Compiles a Python function into a callable TensorFlow graph.
 
-  `defun` (short for "define function") trace-compiles a Python function
+  `defun` (short for "define function") compiles a Python function
   composed of TensorFlow operations into a callable that executes a `tf.Graph`
   containing those operations. The callable produced by `defun` contains only
   the subgraph of TensorFlow operations that were executed when the Python
   function was called with a particular input signature, defined as a list
   of the shapes and dtypes of the Python function's Tensor-valued arguments and
-  the values of its non-Tensor Python objects. In particular, `defun` is _not_ a
-  compiler for arbitrary Python code.
+  the values of its non-Tensor Python objects.
 
   When eager execution is enabled, the ability to create graphs from Python
   functions makes it possible to incrementally trade off debugability and
   interactivity for performance.  Functions compiled with `defun` cannot be
-  inspected with `pdb` and `print` statements; however, executing a graph
+  inspected with `pdb`; however, executing a graph
   generated by `defun` sometimes takes less time and memory than eagerly
   executing the corresponding Python function, since specifying computations as
   graphs allows for optimizations like automatic buffer reuse and
@@ -1300,6 +1347,7 @@ def defun(func=None, input_signature=None, autograph=True):
   outer graph otherwise.
 
   _Input Signatures_
+
   By default, `F = tf.contrib.eager.defun(f)` instantiates a separate graph
   for every unique sequence of the shapes and dtypes of Tensor arguments and
   the values of Python objects it is invoked with. For example, calling
@@ -1358,6 +1406,7 @@ def defun(func=None, input_signature=None, autograph=True):
   Tensors as arguments and must not take unnamed keyword arguments (**kwargs).
 
   _Tracing_
+
   Be aware that because `F` only logs TensorFlow operations, all the other
   Python code that `f` executes will only shape the _construction_ of the graphs
   that `F` executes: the Python code won't be executed when the graphs
@@ -1383,6 +1432,7 @@ def defun(func=None, input_signature=None, autograph=True):
   replace the call to `np.random.randn` with `tf.random_normal((5, 5))`.
 
   _Python Side-Effects_
+
   A corollary of the previous discussion on tracing is the following: If a
   Python function `f` has Python side-effects, then executing `f` multiple times
   will not necessarily be semantically equivalent to executing `F =
@@ -1390,7 +1440,8 @@ def defun(func=None, input_signature=None, autograph=True):
   that `defun` only captures the subgraph of TensorFlow operations that is
   constructed when `f` is called in a graph-building context.
 
-  _Python Control Flow_.
+  _Python Control Flow_
+
   The structure of many machine learning computations depend upon whether one is
   training or validating, and it is common to nest specialized logic under `if
   training:` blocks. By mapping each input signature to a unique graph, `defun`
@@ -1419,27 +1470,26 @@ def defun(func=None, input_signature=None, autograph=True):
   exact_outputs = lossy_matmul(W, x, training=False)
   ```
 
-  On the other hand, because `defun` generates graphs by tracing and not by
-  source code analysis, it fully unrolls Python `for` and `while` loops,
-  potentially creating large graphs. If your Python function has native loops
-  that run for many iterations, consider replacing them with `tf.while_loop`
-  operations.
+  _TensorFlow Control Flow_
 
-  When constructing graphs, `tf.Tensor` objects cannot be used as Python
-  `bool` objects. This means, for example, that you should replace code in `f`
-  resembling
+  When `autograph` is `True`, data-dependent control flow is allowed as well.
+  Control flow statements that depend on `Tensor` values are staged into
+  corresponding TensorFlow ops. For example, the following code will work as
+  expected:
 
   ```python
-
-  if tensor < 10:
-    true_fn()
-  else:
-    false_fn()
+  @tf.contrib.eager.defun
+  def dynamic_rnn_loop(cell, seq):
+    state, output = cell.zero_state()
+    for input in seq:
+      state, output = cell(input, state)
+    return output
   ```
 
-  with `tf.cond(tensor < 10, true_fn, false_fn)`.
+  For more information see `tf.autograph`.
 
   _Variables_
+
   TensorFlow operations related to variable creation and initialization are
   automatically lifted out of the graphs generated by `defun`. In practice, this
   implies that variable creation and initialization only happen the first time
@@ -1611,14 +1661,24 @@ def class_method_to_instance_method(original_function, instance):
   assert hasattr(original_function, "_input_signature")
   assert hasattr(original_function, "python_function")
 
+  weak_bound_method_wrapper = None
   def bound_method_wrapper(*args, **kwargs):
+    """Wraps either a dummy MethodType or a converted AutoGraph function."""
     # __wrapped__ allows AutoGraph to swap in a converted function.
-    wrapped_fn = bound_method_wrapper.__wrapped__
-    # If __wrapped__ was not replaced, then call original_function.
-    # TODO(b/119246461): This needs to be simplified.
-    if tf_inspect.ismethod(wrapped_fn):
+    strong_bound_method_wrapper = weak_bound_method_wrapper()
+    wrapped_fn = strong_bound_method_wrapper.__wrapped__
+
+    if wrapped_fn is strong_bound_method_wrapper.__original_wrapped__:
+      # If __wrapped__ was not replaced, then call original_function.
       wrapped_fn = original_function.python_function
+      if tf_inspect.ismethod(wrapped_fn):
+        wrapped_fn = six.get_unbound_function(wrapped_fn)
+      return wrapped_fn(weak_instance(), *args, **kwargs)
+
+    # If __wrapped__ was replaced, then it is always an unbound function
+    # that takes self as first argument.
     return wrapped_fn(weak_instance(), *args, **kwargs)
+  weak_bound_method_wrapper = weakref.ref(bound_method_wrapper)
 
   # pylint: disable=protected-access
   # We make a dummy MethodType object to generate the correct bound method
@@ -1635,3 +1695,33 @@ def class_method_to_instance_method(original_function, instance):
   wrapped_instance_func = tf_decorator.make_decorator(
       original_function.python_function, instance_func)
   return wrapped_instance_func
+
+
+class _PolymorphicFunctionGarbageCollector(object):
+  """Cleans up cycles when a defun goes out of scope."""
+
+  def __init__(self, cache):
+    self._cache = cache
+
+  def __del__(self):
+    if func_graph_module is None or memory is None:
+      return
+    while self._cache:
+      self._cache.popitem()
+    memory.dismantle_ordered_dict(self._cache)
+
+
+class _FunctionGarbageCollector(object):
+  """Cleans up reference cycles when a Function goes out of scope."""
+
+  def __init__(self, func_graph):
+    self._func_graph = func_graph
+
+  def release(self):
+    """Call off the FuncGraph deletion."""
+    self._func_graph = None
+
+  def __del__(self):
+    if func_graph_module is None or memory is None or self._func_graph is None:
+      return
+    func_graph_module.dismantle_func_graph(self._func_graph)
diff --git a/tensorflow/python/eager/function_gradients_test.py b/tensorflow/python/eager/function_gradients_test.py
index 9b83f57089a16c1a2942b674450b78ec8d74bd6e..98dec0b361b76eadbb107a7cd42e4deba6f2ea25 100644
--- a/tensorflow/python/eager/function_gradients_test.py
+++ b/tensorflow/python/eager/function_gradients_test.py
@@ -187,7 +187,7 @@ class FunctionGradientsTest(test.TestCase, parameterized.TestCase):
 
     self.assertAllEqual(2, g(constant_op.constant(2.)))
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only('b/120545219')
   def testGraphModeEagerGradError(self):
     with context.graph_mode():
       def f():
diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py
index 2f3eff6d6aaf637e993bb05210482725c2aea290..c7959441d8f0bec8bb90f77c79ac6f495dbfa94d 100644
--- a/tensorflow/python/eager/function_test.py
+++ b/tensorflow/python/eager/function_test.py
@@ -152,8 +152,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
 
     pair = collections.namedtuple('pair', ['a', 'b'])
 
-    # TODO(b/120222989) remove autograph=False.
-    @def_function.function(autograph=False)
+    @def_function.function()
     def pairs_mul(pair_a, pair_b):
       return pair(matmul(pair_a.a, pair_b.a), matmul(pair_a.b, pair_b.b))
 
@@ -429,20 +428,21 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
       self.evaluate(variables.global_variables_initializer())
     self.assertEqual(self.evaluate(value), 2.0)
 
-  @test_util.run_in_graph_and_eager_modes
+  @test_util.also_run_as_tf_function
   def testInitScopeTensorInitializationInFunction(self):
 
     @def_function.function
     def tensor_init():
       with ops.init_scope():
         const = constant_op.constant(2.0)
+      # Note: this variable bypasses tf.function's variable creation
+      # requirements by bypassing variable_creator_scope by using
+      # ResourceVariable instead of Variable.
       self.v = resource_variable_ops.ResourceVariable(const)
       return self.v.read_value()
 
     value = tensor_init()
-    if not context.executing_eagerly():
-      self.evaluate(variables.global_variables_initializer())
-    self.assertEqual(self.evaluate(value), 2.0)
+    self.assertAllEqual(value, 2.0)
 
   def testDefunShapeInferenceWithCapturedResourceVariable(self):
     v = resource_variable_ops.ResourceVariable([[1, 2], [3, 4]])
@@ -462,6 +462,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     var_t = resource_variable_ops.read_variable_op(var_handle, dtype=v.dtype)
     self.assertEqual(var_t.shape, tensor_shape.TensorShape([2, 2]))
 
+  @test_util.enable_control_flow_v2
   def testVariableInLoopInFunction(self):
 
     @function.defun
@@ -544,7 +545,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     self.assertIsInstance(
         self.v, resource_variable_ops.ResourceVariable)
 
-  def disabled_testRunMetadata(self):
+  def testRunMetadata(self):
 
     @def_function.function
     def f(x):
@@ -579,7 +580,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
           return self.v * 2
 
       o = HasAVar()
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       call = def_function.function(o.call)
       op = call()
       self.assertAllEqual(self.evaluate(op), 2.0)
@@ -936,9 +937,6 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
 
     self.assertAllClose([[[[4.0]]]], self.evaluate(y))
 
-    # Remove reference cycles in model
-    test_util.dismantle_polymorphic_function(model)
-
   @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
   def testDefunKerasModelCall(self):
     model = MiniModel()
@@ -952,8 +950,6 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
 
     self.assertAllEqual([[3.0]], self.evaluate(y))
 
-    # Remove reference cycles in defun.
-    test_util.dismantle_polymorphic_function(model.call)
     # Break the reference cycle between the MiniModel and the defun:
     # MiniModel --(through its `call` method)--> PolymorphicFunction
     # PolymorphicFunction --(instancemethod on MiniModel)--> MiniModel
@@ -963,6 +959,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
   # construction. Eager's configuration is controlled in `__main__`.
   @test_util.run_in_graph_and_eager_modes(
       config=config_pb2.ConfigProto(device_count={'CPU': 4}))
+  @test_util.run_v1_only('b/120545219')
   def testDeviceAnnotationsRespected(self):
 
     def multi_device_fn():
@@ -1001,6 +998,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
 
   @test_util.run_in_graph_and_eager_modes(
       config=config_pb2.ConfigProto(device_count={'CPU': 2}))
+  @test_util.run_v1_only('b/120545219')
   def testCallingGraphFunctionOnDifferentDevice(self):
 
     def func():
@@ -1302,7 +1300,6 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
       defined.get_concrete_function(
           tensor_spec.TensorSpec(shape=(3,), dtype=dtypes.float32))
 
-  @test_util.run_deprecated_v1
   def testInputSignatureForFunctionWithNonTensorInputsNotAllowed(self):
 
     def foo(a, training=True):
@@ -1311,10 +1308,16 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
       else:
         return -1.0 * a
 
-    signature = [tensor_spec.TensorSpec([], dtypes.float32)] * 2
+    signature = [
+        tensor_spec.TensorSpec([], dtypes.float32),
+        tensor_spec.TensorSpec([], dtypes.bool),
+    ]
     defined = def_function.function(foo, input_signature=signature)
     a = constant_op.constant(1.0)
-    with self.assertRaises(TypeError):
+    with self.assertRaisesRegexp(
+        ValueError,
+        'When input_signature is provided, all inputs to '
+        'the Python function must be Tensors.'):
       defined(a, training=True)
 
   def testInputSignatureWithKeywordPositionalArgs(self):
@@ -2033,6 +2036,19 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
             5,
             add_five(constant_op.constant(0, dtype=dtypes.int32)).numpy())
 
+  @test_util.assert_no_garbage_created
+  def testReferenceCycles(self):
+
+    fn = function.defun(lambda x: 2. * x)
+
+    fn(constant_op.constant(4.0))
+    weak_fn = weakref.ref(fn)
+    del fn
+    # Tests that the weak reference we made to the function is now dead, which
+    # means the object has been deleted. This should be true as long as the
+    # function itself is not involved in a reference cycle.
+    self.assertIs(None, weak_fn())
+
 
 if __name__ == '__main__':
   ops.enable_eager_execution(
diff --git a/tensorflow/python/eager/pywrap_tensor.cc b/tensorflow/python/eager/pywrap_tensor.cc
index 0d0f70d543d0b5579cd08aa1284920c28c7d60e6..30a93fb0e421e0b26f517a03302d2e96913d8b9a 100644
--- a/tensorflow/python/eager/pywrap_tensor.cc
+++ b/tensorflow/python/eager/pywrap_tensor.cc
@@ -220,6 +220,14 @@ TFE_TensorHandle* ConvertToEagerTensor(PyObject* value, PyObject* dtype) {
       return nullptr;
     }
   }
+  tensorflow::Safe_PyObjectPtr value_decrefer;
+  if (PyArray_IsScalar(value, Generic)) {
+    // Convert numpy scalars to numpy arrays.
+    value = PyArray_FromScalar(value, nullptr);
+    // The returned value needs to be DECREF'd, but the original value was
+    // created in python code, and doesn't need to be DECREF'd.
+    value_decrefer.reset(value);
+  }
   if (PyArray_Check(value)) {
     int desired_np_dtype = -1;
     if (desired_dtype >= 0) {
diff --git a/tensorflow/python/eager/tensor_test.py b/tensorflow/python/eager/tensor_test.py
index 25442ff0485dc2bbe2f08dedeb1cde9859691454..0ee2ff68c209aa13aaeb32be610302c11616b9d7 100644
--- a/tensorflow/python/eager/tensor_test.py
+++ b/tensorflow/python/eager/tensor_test.py
@@ -95,6 +95,18 @@ class TFETensorTest(test_util.TensorFlowTestCase):
     t = _create_tensor(values)
     self.assertAllEqual(values, t)
 
+  @test_util.assert_no_new_pyobjects_executing_eagerly
+  def testNumpyDtypeSurvivesThroughTensorConversion(self):
+    scalar_creators = [np.int32, np.int64, np.float32, np.float64]
+    conversion_functions = [ops.convert_to_tensor, constant_op.constant]
+
+    for scalar_creator in scalar_creators:
+      for conversion_function in conversion_functions:
+        np_val = scalar_creator(3)
+        tensor_val = conversion_function(np_val)
+        self.assertEqual(tensor_val.numpy().dtype, np_val.dtype)
+        self.assertEqual(tensor_val.numpy(), np_val)
+
   def testNumpyValueWithCast(self):
     values = np.array([3.0], dtype=np.float32)
     t = _create_tensor(values, dtype=dtypes.float64)
diff --git a/tensorflow/python/eager/wrap_function_test.py b/tensorflow/python/eager/wrap_function_test.py
index b32b6ca42691a6261576da6b105a0afc97e0ec63..d34e9228f332ad01f709c99e6988975c8061798d 100644
--- a/tensorflow/python/eager/wrap_function_test.py
+++ b/tensorflow/python/eager/wrap_function_test.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 
 from tensorflow.python.eager import wrap_function
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_spec
@@ -70,6 +71,14 @@ class WrapFunctionTest(test.TestCase):
     f_pruned = f_wrapped.prune(x_in[0], [x_out[0]])
     self.assertAllEqual(f_pruned(ops.convert_to_tensor(2.0)), [4.0])
 
+  def testNoArguments(self):
+
+    def f():
+      return constant_op.constant(1.)
+
+    f_wrapped = wrap_function.wrap_function(f, [])
+    self.assertAllEqual(1.0, f_wrapped())
+
 
 if __name__ == '__main__':
   ops.enable_eager_execution()
diff --git a/tensorflow/python/feature_column/feature_column_v2.py b/tensorflow/python/feature_column/feature_column_v2.py
index 2af2b9f254abcb4a2e7a4b655a581338a9622ad3..914044d6d627684aed7a67b9a560ce6b78c0fc1e 100644
--- a/tensorflow/python/feature_column/feature_column_v2.py
+++ b/tensorflow/python/feature_column/feature_column_v2.py
@@ -1354,8 +1354,9 @@ def bucketized_column(source_column, boundaries):
     raise ValueError(
         'source_column must be one-dimensional column. '
         'Given: {}'.format(source_column))
-  if (not boundaries or
-      not (isinstance(boundaries, list) or isinstance(boundaries, tuple))):
+  if not boundaries:
+    raise ValueError('boundaries must not be empty.')
+  if not (isinstance(boundaries, list) or isinstance(boundaries, tuple)):
     raise ValueError('boundaries must be a sorted list.')
   for i in range(len(boundaries) - 1):
     if boundaries[i] >= boundaries[i + 1]:
@@ -3111,7 +3112,7 @@ class EmbeddingColumn(
           'Suggested fix: Use one of sequence_categorical_column_with_*. '
           'Given (type {}): {}'.format(self.name, type(self.categorical_column),
                                        self.categorical_column))
-    sparse_tensors = self.categorical_column.get_sequence_sparse_tensors(
+    sparse_tensors = self.categorical_column.get_sparse_tensors(
         transformation_cache, state_manager)
     dense_tensor = self._get_dense_tensor_internal(sparse_tensors,
                                                    state_manager)
@@ -3307,7 +3308,7 @@ class SharedEmbeddingColumn(
           'Suggested fix A: If you wish to use input_layer, use a '
           'non-sequence categorical_column_with_*. '
           'Suggested fix B: If you wish to create sequence input, use '
-          'sequence_input_layer instead of input_layer. '
+          'SequenceFeatureLayer instead of FeatureLayer. '
           'Given (type {}): {}'.format(self.name, type(self.categorical_column),
                                        self.categorical_column))
     return self._get_dense_tensor_internal(transformation_cache, state_manager)
@@ -3321,12 +3322,12 @@ class SharedEmbeddingColumn(
       raise ValueError(
           'In embedding_column: {}. '
           'categorical_column must be of type SequenceCategoricalColumn '
-          'to use sequence_input_layer. '
+          'to use SequenceFeatureLayer. '
           'Suggested fix: Use one of sequence_categorical_column_with_*. '
           'Given (type {}): {}'.format(self.name, type(self.categorical_column),
                                        self.categorical_column))
-    dense_tensor = self.get_dense_tensor_internal(transformation_cache,
-                                                  state_manager)
+    dense_tensor = self._get_dense_tensor_internal(transformation_cache,
+                                                   state_manager)
     sparse_tensors = self.categorical_column.get_sparse_tensors(
         transformation_cache, state_manager)
     sequence_length = fc_old._sequence_length_from_sparse_tensor(  # pylint: disable=protected-access
@@ -4469,8 +4470,8 @@ def _verify_static_batch_size_equality(tensors, columns):
 
 
 class SequenceCategoricalColumn(
-    FeatureColumn,
-    fc_old._CategoricalColumn,  # pylint: disable=protected-access
+    CategoricalColumn,
+    fc_old._SequenceCategoricalColumn,  # pylint: disable=protected-access
     collections.namedtuple('SequenceCategoricalColumn',
                            ('categorical_column'))):
   """Represents sequences of categorical data."""
@@ -4533,7 +4534,7 @@ class SequenceCategoricalColumn(
       weight_tensor = sparse_ops.sparse_reshape(weight_tensor, target_shape)
     return CategoricalColumn.IdWeightPair(id_tensor, weight_tensor)
 
-  def get_sequence_sparse_tensors(self, transformation_cache, state_manager):
+  def get_sparse_tensors(self, transformation_cache, state_manager):
     """Returns an IdWeightPair.
 
     `IdWeightPair` is a pair of `SparseTensor`s which represents ids and
@@ -4696,7 +4697,10 @@ def deserialize_feature_column(config,
         'Expected FeatureColumn class, instead found: {}'.format(cls))
 
   # Always deserialize the FeatureColumn, in order to get the name.
-  new_instance = cls._from_config(cls_config, columns_by_name=columns_by_name)  # pylint: disable=protected-access
+  new_instance = cls._from_config(  # pylint: disable=protected-access
+      cls_config,
+      custom_objects=custom_objects,
+      columns_by_name=columns_by_name)
 
   # If the name already exists, re-use the column from columns_by_name,
   # (new_instance remains unused).
diff --git a/tensorflow/python/feature_column/feature_column_v2_test.py b/tensorflow/python/feature_column/feature_column_v2_test.py
index 083cd526e468a50714d19eaff2bccc8685d9c972..a2474253697ad526c33c0099bf955b96000cf0f7 100644
--- a/tensorflow/python/feature_column/feature_column_v2_test.py
+++ b/tensorflow/python/feature_column/feature_column_v2_test.py
@@ -31,7 +31,6 @@ from tensorflow.python import keras
 from tensorflow.python.client import session
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
-from tensorflow.python.estimator.inputs import numpy_io
 from tensorflow.python.feature_column import feature_column as fc_old
 from tensorflow.python.feature_column import feature_column_v2 as fc
 from tensorflow.python.framework import constant_op
@@ -50,6 +49,7 @@ from tensorflow.python.platform import test
 from tensorflow.python.training import coordinator
 from tensorflow.python.training import queue_runner_impl
 from tensorflow.python.training import rmsprop
+from tensorflow_estimator.python.estimator.inputs import numpy_io
 
 
 def _initialized_session(config=None):
@@ -485,7 +485,7 @@ class BucketizedColumnTest(test.TestCase):
   def test_invalid_boundaries(self):
     a = fc.numeric_column('aaa')
     with self.assertRaisesRegexp(ValueError,
-                                 'boundaries must be a sorted list'):
+                                 'boundaries must not be empty'):
       fc.bucketized_column(a, boundaries=None)
     with self.assertRaisesRegexp(ValueError,
                                  'boundaries must be a sorted list'):
diff --git a/tensorflow/python/framework/auto_control_deps.py b/tensorflow/python/framework/auto_control_deps.py
index 30dc959e9a9f717bdb5c56bfbdde5ffa9d48c257..a72ded11314d4b491292aed73364be7d875baa86 100644
--- a/tensorflow/python/framework/auto_control_deps.py
+++ b/tensorflow/python/framework/auto_control_deps.py
@@ -100,6 +100,7 @@ class AutomaticControlDependencies(object):
     # graph (but that would mess up devices and collections at least,
     # probably other things as well).
     self._graph = ops.get_default_graph()
+    self._graph._add_control_dependencies = True  # pylint: disable=protected-access
     self._n_operations = len(self._graph.get_operations())
     return self
 
@@ -170,6 +171,14 @@ class AutomaticControlDependencies(object):
       raise RuntimeError(
           "Graph changed while trying to add control dependencies.")
 
+    # pylint: disable=protected-access
+    if hasattr(self._graph, "outer_graph"):
+      outer_val = self._graph.outer_graph._add_control_dependencies
+      self._graph._add_control_dependencies = outer_val
+    else:
+      self._graph._add_control_dependencies = False
+    # pylint: enable=protected-access
+
     # map from resource tensor to the last op which used it
     last_op_using_resource_tensor = {}
     # set of conditional and loop exits
diff --git a/tensorflow/python/framework/auto_control_deps_test.py b/tensorflow/python/framework/auto_control_deps_test.py
index a1dff9e8349aba3fb16ac57314f0ea34a37f2c5b..d81adef26a06ca231d640a9d4e0c4262926aad58 100644
--- a/tensorflow/python/framework/auto_control_deps_test.py
+++ b/tensorflow/python/framework/auto_control_deps_test.py
@@ -24,6 +24,7 @@ from tensorflow.python.framework import auto_control_deps as acd
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -38,7 +39,7 @@ class AutomaticControlDependenciesTest(test.TestCase):
   def testBasic(self):
     with context.graph_mode(), self.cached_session():
       v = resource_variable_ops.ResourceVariable(1.0)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       with acd.AutomaticControlDependencies() as c:
         v.assign(v + 1)
         v.assign(2 * v)
@@ -46,10 +47,11 @@ class AutomaticControlDependenciesTest(test.TestCase):
         val = c.mark_as_return(val)
       self.assertAllEqual(val.eval(), 4.0)
 
+  @test_util.run_v1_only("b/120545219")
   def testCondMustRun(self):
     with context.graph_mode(), self.cached_session():
       v = resource_variable_ops.ResourceVariable(1.0)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       p = array_ops.placeholder(dtype=dtypes.bool)
       with acd.AutomaticControlDependencies() as c:
 
@@ -67,10 +69,11 @@ class AutomaticControlDependenciesTest(test.TestCase):
       self.assertAllEqual(val.eval(feed_dict={p: False}), 5.0)
       self.assertAllEqual(val.eval(feed_dict={p: True}), 6.0)
 
+  @test_util.run_v1_only("b/120545219")
   def testCondMustRunSeparateRead(self):
     with context.graph_mode(), self.cached_session():
       v = resource_variable_ops.ResourceVariable(1.0)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       p = array_ops.placeholder(dtype=dtypes.bool)
       with acd.AutomaticControlDependencies() as c:
 
@@ -90,10 +93,11 @@ class AutomaticControlDependenciesTest(test.TestCase):
       one.eval(feed_dict={p: True})
       self.assertAllEqual(v.read_value().eval(), 6.0)
 
+  @test_util.run_v1_only("b/120545219")
   def testCondNested(self):
     with context.graph_mode(), self.cached_session():
       v = resource_variable_ops.ResourceVariable(1.0)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       p = array_ops.placeholder(dtype=dtypes.bool)
       q = array_ops.placeholder(dtype=dtypes.bool)
       with acd.AutomaticControlDependencies() as c:
@@ -124,10 +128,11 @@ class AutomaticControlDependenciesTest(test.TestCase):
       self.assertAllEqual(val.eval(feed_dict={p: True, q: True}), 7.0)
       self.assertAllEqual(val.eval(feed_dict={p: True, q: False}), 8.0)
 
+  @test_util.run_v1_only("b/120545219")
   def testCondOneBranch(self):
     with context.graph_mode(), self.cached_session():
       v = resource_variable_ops.ResourceVariable(1.0)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       p = array_ops.placeholder(dtype=dtypes.bool)
       with acd.AutomaticControlDependencies() as c:
 
@@ -144,10 +149,11 @@ class AutomaticControlDependenciesTest(test.TestCase):
       self.assertAllEqual(val.eval(feed_dict={p: False}), 5.0)
       self.assertAllEqual(val.eval(feed_dict={p: True}), 5.0)
 
+  @test_util.run_v1_only("b/120545219")
   def testCondOneBranchUpdateBefore(self):
     with context.graph_mode(), self.cached_session():
       v = resource_variable_ops.ResourceVariable(1.0)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       p = array_ops.placeholder(dtype=dtypes.bool)
       with acd.AutomaticControlDependencies() as c:
         v.assign(v * 2)
@@ -165,10 +171,11 @@ class AutomaticControlDependenciesTest(test.TestCase):
       self.assertAllEqual(val.eval(feed_dict={p: False}), 6.0)
       self.assertAllEqual(val.eval(feed_dict={p: True}), 12.0)
 
+  @test_util.run_v1_only("b/120545219")
   def testCondOneBranchUpdateAfter(self):
     with context.graph_mode(), self.cached_session():
       v = resource_variable_ops.ResourceVariable(1.0)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       p = array_ops.placeholder(dtype=dtypes.bool)
       with acd.AutomaticControlDependencies() as c:
 
@@ -204,7 +211,7 @@ class AutomaticControlDependenciesTest(test.TestCase):
   def testDecorator(self):
     with context.graph_mode(), self.cached_session():
       v = resource_variable_ops.ResourceVariable(1.0)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       @acd.automatic_control_dependencies
       def f():
diff --git a/tensorflow/python/framework/dtypes.py b/tensorflow/python/framework/dtypes.py
index f7a12d27df7b90b45cf0e02920b7199aeb310213..9a4fe4e93b32aeedcb74cf0f7b2703f64d9db23a 100644
--- a/tensorflow/python/framework/dtypes.py
+++ b/tensorflow/python/framework/dtypes.py
@@ -347,7 +347,7 @@ tf_export("dtypes.uint32", "uint32").export_constant(__name__, "uint32")
 uint64 = DType(types_pb2.DT_UINT64)
 tf_export("dtypes.uint64", "uint64").export_constant(__name__, "uint64")
 int16 = DType(types_pb2.DT_INT16)
-tf_export("dtypes.uint16", "int16").export_constant(__name__, "int16")
+tf_export("dtypes.int16", "int16").export_constant(__name__, "int16")
 int8 = DType(types_pb2.DT_INT8)
 tf_export("dtypes.int8", "int8").export_constant(__name__, "int8")
 string = DType(types_pb2.DT_STRING)
diff --git a/tensorflow/python/framework/error_interpolation.py b/tensorflow/python/framework/error_interpolation.py
index 37a634d80679b095d319cabcd29208a35c4fe44f..557f947291ca57da17071e91f7cba2aa0c7a8a70 100644
--- a/tensorflow/python/framework/error_interpolation.py
+++ b/tensorflow/python/framework/error_interpolation.py
@@ -29,6 +29,7 @@ import re
 
 import six
 
+from tensorflow.python.framework.ops import Tensor
 from tensorflow.python.util import tf_stack
 
 _NAME_REGEX = r"[A-Za-z0-9.][A-Za-z0-9_.\-/]*?"
@@ -216,11 +217,13 @@ def _get_defining_frame_from_op(op):
   return frame
 
 
-def compute_field_dict(op):
+def compute_field_dict(op, strip_file_prefix=""):
   """Return a dictionary mapping interpolation tokens to values.
 
   Args:
     op: op.Operation object having a _traceback member.
+    strip_file_prefix: The common path in the stacktrace. We remove the prefix
+    from the file names.
 
   Returns:
     A dictionary mapping string tokens to string values.  The keys are shown
@@ -248,6 +251,8 @@ def compute_field_dict(op):
   """
   frame = _get_defining_frame_from_op(op)
   filename = frame[tf_stack.TB_FILENAME]
+  if filename.startswith(strip_file_prefix):
+    filename = filename[len(strip_file_prefix):]
   lineno = frame[tf_stack.TB_LINENO]
   defined_at = " (defined at %s:%d)" % (filename, lineno)
   colocation_summary = _compute_colocation_summary_from_op(op)
@@ -265,11 +270,110 @@ def compute_field_dict(op):
   return field_dict
 
 
+def _common_prefix(all_ops):
+  """Determines the common prefix from the paths of the stacktrace of 'all_ops'.
+
+  For example, if the paths are '/foo/bar/baz/' and '/foo/car', this would
+  return '/foo'.
+
+  Args:
+    all_ops: All the input nodes in the form of a list of lists of ops.
+
+  Returns:
+    The common prefix.
+  """
+  files = set()
+  for ops in all_ops:
+    if ops is None:
+      continue
+    for op in ops:
+      # pylint: disable=protected-access
+      tf_traceback = tf_stack.convert_stack(op._traceback)
+      # pylint: enable=protected-access
+      for frame in tf_traceback:
+        filename = frame[tf_stack.TB_FILENAME]
+        if "<embedded" not in filename:
+          files.add(filename)
+  return os.path.split(os.path.commonprefix(list(files)))[0]
+
+
+def _sources_for_node(name, graph):
+  """Gets the top-level root input nodes for 'name' node.
+
+  We recursively traverse the graph from 'name' node to its inputs and collect
+  all the nodes which don't have any inputs.
+
+  Args:
+    name: The name of the node.
+    graph: The graph containing the node.
+
+  Returns:
+    The unique top-level root input nodes.
+  """
+  def _helper(name, graph, seen_names, inputs):
+    """Recursive helper. 'seen_names' and 'inputs' are mutated."""
+    if name.startswith("^"):
+      name = name[1:]
+    try:
+      op = graph.as_graph_element(name)
+    except KeyError:
+      return
+    if isinstance(op, Tensor):
+      op = op.op
+    name = op.name
+    if name in seen_names:
+      return
+    seen_names.add(name)
+    if not op.node_def.input:
+      inputs.add(op)
+      return
+    for n in op.node_def.input:
+      _helper(n, graph, seen_names, inputs)
+
+  names = set()
+  inputs = set()
+  _helper(name, graph, names, inputs)
+  return list(inputs)
+
+
+def _build_error_message(op, input_ops, common_prefix):
+  """Returns the formatted error message for the given op.
+
+  Args:
+    op: The node.
+    input_ops: The input nodes to the 'op' node
+    common_prefix: The prefix path common to the stacktrace of inputs.
+
+  Returns:
+    The formatted error message for the given op. The error message also
+    includes the information about the input sources for the given op.
+  """
+  field_dict = compute_field_dict(op, common_prefix)
+  msg = "node %s%s " % (op.name, field_dict["defined_at"])
+  input_debug_info = []
+  # This stores the line numbers that we have already printed.
+  done = set()
+  done.add(field_dict["defined_at"])
+  for op_inp in input_ops:
+    field_dict_inp = compute_field_dict(op_inp, common_prefix)
+    if field_dict_inp["defined_at"] not in done:
+      input_debug_info.append(
+          " %s%s" % (op_inp.name, field_dict_inp["defined_at"]))
+      done.add(field_dict_inp["defined_at"])
+  if input_debug_info:
+    end_msg = ("\nInput Source operations connected to node %s:\n") % (op.name)
+    end_msg += "\t\n".join(input_debug_info)
+  else:
+    end_msg = ""
+  return msg, end_msg
+
+
 def interpolate(error_message, graph):
   """Interpolates an error message.
 
   The error message can contain tags of the form `{{type name}}` which will be
-  replaced.
+  replaced. For example: "{{node <name>}}" would get expanded to:
+  "node <name>(defined at <path>)".
 
   Args:
     error_message: A string to interpolate.
@@ -281,25 +385,41 @@ def interpolate(error_message, graph):
   """
   seps, tags = _parse_message(error_message)
   subs = []
-  end_msg = ""
+  end_msg = collections.defaultdict(list)
+  tagged_ops = []
 
   for t in tags:
     try:
       op = graph.get_operation_by_name(t.name)
     except KeyError:
       op = None
-
-    msg = "{{%s %s}}" % (t.type, t.name)
-    if op is not None:
-      field_dict = compute_field_dict(op)
-      if t.type == "node":
-        msg = "node %s%s " % (t.name, field_dict["defined_at"])
-      elif t.type == "colocation_node":
-        msg = "node %s%s having device %s " % (t.name, field_dict["defined_at"],
-                                               field_dict["devices"])
-        end_msg += "\n\n" + field_dict["devs_and_colocs"]
+    if op is None:
+      tagged_ops.append(None)
+    else:
+      tagged_ops.append([op] + _sources_for_node(op.name, graph))
+
+  common_prefix = _common_prefix(tagged_ops)
+  for tag, ops in zip(tags, tagged_ops):
+    msg = "{{%s %s}}" % (tag.type, tag.name)
+    if ops is not None:
+      if tag.type == "node":
+        msg, source_msg = _build_error_message(ops[0], ops[1:], common_prefix)
+        if source_msg:
+          end_msg["source_nodes"].append(source_msg)
+      elif tag.type == "colocation_node":
+        field_dict = compute_field_dict(ops[0], common_prefix)
+        msg = "node %s%s placed on device %s " % (
+            ops[0].name, field_dict["defined_at"], field_dict["devices"])
+        end_msg["colocations"].append(field_dict["devs_and_colocs"])
     subs.append(msg)
-  subs.append(end_msg)
+
+  if "source_nodes" in end_msg:
+    subs.append("\n\nErrors may have originated from an input operation.")
+    subs.append("\n".join(end_msg["source_nodes"]))
+    end_msg.pop("source_nodes", None)
+  for k, messages in end_msg.items():
+    subs.append("Additional information about %s:" % k)
+    subs.append("\n".join(messages))
 
   return "".join(
       itertools.chain(*six.moves.zip_longest(seps, subs, fillvalue="")))
diff --git a/tensorflow/python/framework/error_interpolation_test.py b/tensorflow/python/framework/error_interpolation_test.py
index 1b77548592cec08ff4fadfe2e740b746c6a9d115..d835ada086ae46acef81cce77ff66ec2b38c2ae9 100644
--- a/tensorflow/python/framework/error_interpolation_test.py
+++ b/tensorflow/python/framework/error_interpolation_test.py
@@ -19,11 +19,14 @@ from __future__ import division
 from __future__ import print_function
 
 import os
+import re
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import error_interpolation
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.framework import traceable_stack
+from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 from tensorflow.python.util import tf_stack
 
@@ -112,6 +115,7 @@ class ComputeColocationSummaryFromOpTest(test.TestCase):
     self.assertIn("No node-device colocations", summary)
 
 
+@test_util.run_v1_only("b/120545219")
 class InterpolateFilenamesAndLineNumbersTest(test.TestCase):
 
   def setUp(self):
@@ -193,6 +197,45 @@ class InterpolateFilenamesAndLineNumbersTest(test.TestCase):
     self.assertRegexpMatches(interpolated_string, "constant_op.py:[0-9]+.*")
 
 
+@test_util.run_v1_only("b/120545219")
+class InputNodesTest(test.TestCase):
+
+  def setUp(self):
+    # Add nodes to the graph for retrieval by name later.
+    one = constant_op.constant(1, name="One")
+    two = constant_op.constant(2, name="Two")
+    three = math_ops.add(one, two, name="Three")
+    self.graph = three.graph
+
+    # Change the list of bad file substrings so that constant_op.py is chosen
+    # as the defining stack frame for constant_op.constant ops.
+    self.old_bad_strings = error_interpolation._BAD_FILE_SUBSTRINGS
+    error_interpolation._BAD_FILE_SUBSTRINGS = [
+        "%sops.py" % os.sep,
+        "%sutil" % os.sep,
+    ]
+
+  def tearDown(self):
+    error_interpolation._BAD_FILE_SUBSTRINGS = self.old_bad_strings
+
+  def testNoInputs(self):
+    two_tags_with_seps = ";;;{{node One}},,,{{node Two}};;;"
+    interpolated_string = error_interpolation.interpolate(
+        two_tags_with_seps, self.graph)
+    expected_regex = (
+        r"^;;;.*constant_op.py:[0-9]+\) ,,,.*constant_op.py:[0-9]+\) ;;;$")
+    self.assertRegexpMatches(interpolated_string, expected_regex)
+
+  def testBasicInputs(self):
+    tag = ";;;{{node Three}};;;"
+    interpolated_string = error_interpolation.interpolate(tag, self.graph)
+    expected_regex = re.compile(
+        r"^;;;.*op_def_library.py:[0-9]+\) ;;;.*Input.*constant_op.py:[0-9]+\)",
+        re.DOTALL)
+    self.assertRegexpMatches(interpolated_string, expected_regex)
+
+
+@test_util.run_v1_only("b/120545219")
 class InterpolateDeviceSummaryTest(test.TestCase):
 
   def _fancy_device_function(self, unused_op):
@@ -236,6 +279,7 @@ class InterpolateDeviceSummaryTest(test.TestCase):
     self.assertRegexpMatches(result, expected_re)
 
 
+@test_util.run_v1_only("b/120545219")
 class InterpolateColocationSummaryTest(test.TestCase):
 
   def setUp(self):
@@ -260,11 +304,13 @@ class InterpolateColocationSummaryTest(test.TestCase):
 
     self.graph = node_three.graph
 
+  @test_util.run_v1_only("b/120545219")
   def testNodeThreeHasColocationInterpolation(self):
     message = "{{colocation_node Three_with_one}}"
     result = error_interpolation.interpolate(message, self.graph)
     self.assertIn("colocate_with(One)", result)
 
+  @test_util.run_v1_only("b/120545219")
   def testNodeFourHasColocationInterpolationForNodeThreeOnly(self):
     message = "{{colocation_node Four_with_three}}"
     result = error_interpolation.interpolate(message, self.graph)
@@ -273,12 +319,14 @@ class InterpolateColocationSummaryTest(test.TestCase):
         "One", result,
         "Node One should not appear in Four_with_three's summary:\n%s" % result)
 
+  @test_util.run_v1_only("b/120545219")
   def testNodeFiveHasColocationInterpolationForNodeOneAndTwo(self):
     message = "{{colocation_node Five_with_one_with_two}}"
     result = error_interpolation.interpolate(message, self.graph)
     self.assertIn("colocate_with(One)", result)
     self.assertIn("colocate_with(Two)", result)
 
+  @test_util.run_v1_only("b/120545219")
   def testColocationInterpolationForNodeLackingColocation(self):
     message = "{{colocation_node One}}"
     result = error_interpolation.interpolate(message, self.graph)
diff --git a/tensorflow/python/framework/func_graph.py b/tensorflow/python/framework/func_graph.py
index f74d072e8e2329c29dd755d044ee2ed1308e622d..f8be5e9edf9168e942a170961c564eb33670c7ab 100644
--- a/tensorflow/python/framework/func_graph.py
+++ b/tensorflow/python/framework/func_graph.py
@@ -35,7 +35,9 @@ from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.util import compat
+from tensorflow.python.util import memory
 from tensorflow.python.util import nest
+from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util.lazy_loader import LazyLoader
 
@@ -108,36 +110,20 @@ class FuncGraph(ops.Graph):
 
     graph = self.outer_graph
 
-    # pylint: disable=protected-access
-    # TODO(b/112906995, nareshmodi): distribution strategy depends on inheriting
-    # this stack from the default graph even in eager mode. Maybe it should be
-    # part of the eager context? This would also allow us to remove a
-    # get_default_graph() call from the function cache lookup.
-    self._distribution_strategy_stack = list(graph._distribution_strategy_stack)
-    # We ignore device placements from any outer scopes while tracing the
-    # function when possible, to avoid hard-coding them in the function
-    # graph. "Default" placements come from the PartitionedCallOp's placement,
-    # so that the same trace of the Python function may be placed on several
-    # different devices and saved functions may be placed on new devices when
-    # restored.
     if context.executing_eagerly():
       self.seed = context.global_seed()
-      self._xla_compile = (context.context().device_spec.device_type == "TPU")
-      if self._distribution_strategy_stack or self._xla_compile:
-        self._add_device_to_stack(context.context().device_name)
+      device_type = context.context().device_spec.device_type
+      self._xla_compile = (device_type == "TPU" or device_type == "XLA_GPU"
+                           or device_type == "XLA_CPU")
     else:
       self.seed = graph.seed
       self._xla_compile = getattr(graph, "_xla_compile", False)
       # TODO(allenl): Figure out if we can remove colocation stack
       # specialization (currently used in cond_v2), here and in the cache key.
-      self._colocation_stack = graph._colocation_stack.copy()
-      if (self._distribution_strategy_stack
-          or self._xla_compile
-          or device_stack_has_callable(graph._device_function_stack)):
-        # Hard-code devices from device functions in the function body
-        self._device_function_stack = graph._device_function_stack.copy()
+      self._colocation_stack = graph._colocation_stack.copy()  # pylint: disable=protected-access
+
     if not self._read_only_collections:
-      self._collections = graph._collections
+      self._collections = graph._collections  # pylint: disable=protected-access
     else:
       for collection_name in graph.get_all_collection_keys():
         if collection_name not in WHITELIST_COLLECTIONS:
@@ -147,11 +133,63 @@ class FuncGraph(ops.Graph):
         self._collections[collection_name] = graph.get_collection_ref(
             collection_name)
 
-    self._variable_creator_stack = graph._variable_creator_stack
-    # Inherit the graph key, since this is used for matching variables in
-    # optimizers.
-    self._graph_key = graph._graph_key
-    # pylint: enable=protected-access
+  def as_default(self):
+    outer_cm = super(FuncGraph, self).as_default()
+
+    @tf_contextlib.contextmanager
+    def inner_cm():
+      """Context manager for copying distribute.Strategy scope information."""
+      graph = ops.get_default_graph()
+      # pylint: disable=protected-access
+      # TODO(b/112906995, nareshmodi): distribution strategy depends on
+      # inheriting this stack from the default graph even in eager mode. Maybe
+      # it should be part of the eager context? This would also allow us to
+      # remove a get_default_graph() call from the function cache lookup.
+      old_strategy_stack = self._distribution_strategy_stack
+      self._distribution_strategy_stack = list(
+          graph._distribution_strategy_stack)
+      # We ignore device placements from any outer scopes while tracing the
+      # function when possible, to avoid hard-coding them in the function
+      # graph. "Default" placements come from the PartitionedCallOp's placement,
+      # so that the same trace of the Python function may be placed on several
+      # different devices and saved functions may be placed on new devices when
+      # restored.
+      old_device_stack = self._device_function_stack
+      if context.executing_eagerly():
+        if self._distribution_strategy_stack or self._xla_compile:
+          self._add_device_to_stack(context.context().device_name)
+      else:
+        if (self._distribution_strategy_stack
+            or self._xla_compile
+            or device_stack_has_callable(graph._device_function_stack)):
+          # Hard-code devices from device functions in the function body
+          self._device_function_stack = graph._device_function_stack.copy()
+
+      old_creator_stack = self._variable_creator_stack
+      self._variable_creator_stack = graph._variable_creator_stack
+      # Inherit the graph key, since this is used for matching variables in
+      # optimizers.
+      old_graph_key = self._graph_key
+      self._graph_key = graph._graph_key
+      # pylint: enable=protected-access
+
+      with outer_cm as g:
+        try:
+          yield g
+        finally:
+          self._distribution_strategy_stack = old_strategy_stack
+          self._device_function_stack = old_device_stack
+          self._variable_creator_stack = old_creator_stack
+          self._graph_key = old_graph_key
+    return inner_cm()
+
+  @property
+  def output_types(self):
+    return [t.dtype for t in self.outputs]
+
+  @property
+  def output_shapes(self):
+    return [t.shape for t in self.outputs]
 
   @property
   def variables(self):
@@ -642,3 +680,22 @@ def _get_defun_inputs_from_kwargs(kwargs):
     names = []
     flat_args = []
   return _get_defun_inputs(flat_args, names, structure=kwargs)
+
+
+def dismantle_func_graph(func_graph):
+  """Removes reference cycles in `func_graph` FuncGraph.
+
+  Helpful for making sure the garbage collector doesn't need to run when
+  the FuncGraph goes out of scope, e.g. in tests using defun with
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True).
+
+  Args:
+    func_graph: A `FuncGraph` object to destroy. `func_graph` is unusable
+      after this function.
+  """
+  # TODO(b/115366440): Delete this method when a custom OrderedDict is added.
+  # Clearing captures using clear() leaves some cycles around.
+  while func_graph.captures:
+    func_graph.captures.popitem()
+  memory.dismantle_ordered_dict(func_graph.captures)
+  ops.dismantle_graph(func_graph)
diff --git a/tensorflow/python/framework/function.py b/tensorflow/python/framework/function.py
index 622686ce005ef3dd29a94624d24dd0cb809881f6..afc11b17bfd1447e502906bb973eb5746dfe0274 100644
--- a/tensorflow/python/framework/function.py
+++ b/tensorflow/python/framework/function.py
@@ -209,6 +209,7 @@ class _DefinedFunction(object):
                out_names=None,
                shape_func=None,
                capture_by_value=False,
+               whitelisted_stateful_ops=None,
                **kwargs):
     """Creates _DefinedFunction.
 
@@ -229,6 +230,8 @@ class _DefinedFunction(object):
         output shapes.
       capture_by_value: Boolean (defaults to False). If True, captured values
         will be copied into the function body.
+      whitelisted_stateful_ops: A set of ops that if stateful we ignore and
+        copy into the function body, when `capture_by_value` is True.
       **kwargs: The keyword arguments. **kwargs is passed to every call
         site of this function.
 
@@ -244,6 +247,9 @@ class _DefinedFunction(object):
     self._out_names = out_names
     self._shape_func = shape_func
     self._capture_by_value = capture_by_value
+    self._whitelisted_stateful_ops = whitelisted_stateful_ops
+    if self._whitelisted_stateful_ops is None:
+      self._whitelisted_stateful_ops = set()
     self._extra_kwargs = kwargs
     # Constructed only when C API is disabled, lazily
     self._definition = None
@@ -340,8 +346,13 @@ class _DefinedFunction(object):
       return
 
     temp_graph = func_graph_from_py_func(
-        self._func, self._arg_names, self._arg_types, self._func_name,
-        self._capture_by_value, self._caller_device)
+        self._func,
+        self._arg_names,
+        self._arg_types,
+        self._func_name,
+        self._capture_by_value,
+        self._caller_device,
+        whitelisted_stateful_ops=self._whitelisted_stateful_ops)
 
     self._extra_inputs = temp_graph.extra_inputs
     # pylint: disable=protected-access
@@ -625,9 +636,11 @@ class _FuncGraph(ops.Graph):
   function argument and the caller passes in the captured tensor.
   """
 
-  def __init__(self, name, capture_by_value, *args, **kwargs):
+  def __init__(self, name, capture_by_value, whitelisted_stateful_ops, *args,
+               **kwargs):
     super(_FuncGraph, self).__init__(*args, **kwargs)
     self._capture_by_value = capture_by_value
+    self._whitelisted_stateful_ops = whitelisted_stateful_ops
     self._building_function = True
     self._outer_graph = ops.get_default_graph()
     self._vscope = vs.get_variable_scope()
@@ -785,7 +798,7 @@ class _FuncGraph(ops.Graph):
     # pylint: disable=protected-access
     op_def = graph_to_function_def._get_op_def(op)
     # pylint: enable=protected-access
-    if op_def.is_stateful:
+    if op_def.is_stateful and op not in self._whitelisted_stateful_ops:
       raise ValueError("Cannot capture a stateful node (name:%s, type:%s) "
                        "by value." % (op.name, op.type))
     elif op.type in ("Placeholder", "PlaceholderV2"):
@@ -807,10 +820,17 @@ class _FuncGraph(ops.Graph):
     return captured_op
 
 
-def func_graph_from_py_func(func, arg_names, arg_types, name=None,
-                            capture_by_value=False, device=None,
-                            colocation_stack=None, container=None,
-                            collections_ref=None, arg_shapes=None):
+def func_graph_from_py_func(func,
+                            arg_names,
+                            arg_types,
+                            name=None,
+                            capture_by_value=False,
+                            device=None,
+                            colocation_stack=None,
+                            container=None,
+                            collections_ref=None,
+                            arg_shapes=None,
+                            whitelisted_stateful_ops=None):
   """Returns a _FuncGraph generated from `func`.
 
   Args:
@@ -828,6 +848,8 @@ def func_graph_from_py_func(func, arg_names, arg_types, name=None,
     collections_ref: A reference to a collections dict the _FuncGraph should
       use internally.
     arg_shapes: A sequence of the function's argument shapes.
+    whitelisted_stateful_ops: A set of ops that if stateful we ignore and
+      re-create.
 
   Returns:
     A _FuncGraph.
@@ -837,7 +859,7 @@ def func_graph_from_py_func(func, arg_names, arg_types, name=None,
   """
   if not name:
     name = function_utils.get_func_name(func)
-  func_graph = _FuncGraph(name, capture_by_value)
+  func_graph = _FuncGraph(name, capture_by_value, whitelisted_stateful_ops)
 
   with func_graph.as_default(), ops.device(device):
     # pylint: disable=protected-access
@@ -971,17 +993,18 @@ def _call(sig, *inputs, **kwargs):
   name = kwargs.pop("name", None)
   g = ops.get_default_graph()
   func_name = sig.name
+  if name is None:
+    name = func_name
   attrs = _parse_kwargs_as_attrs(func_name, **kwargs)
   output_types = [dtypes.DType(x.type) for x in sig.output_arg]
-  with ops.name_scope(name, func_name, inputs) as name:
-    op = g.create_op(
-        func_name,
-        list(inputs),
-        output_types,
-        name=name,
-        attrs=attrs,
-        op_def=sig,
-        compute_shapes=False)
+  op = g.create_op(
+      func_name,
+      list(inputs),
+      output_types,
+      name=name,
+      attrs=attrs,
+      op_def=sig,
+      compute_shapes=False)
   if op.outputs:
     if len(op.outputs) == 1:
       ret = op.outputs[0]
@@ -1024,12 +1047,13 @@ def _from_definition(fdef, grad_func=None):
   c_func = c_api.TF_FunctionImportFunctionDef(serialized)
   result._c_func = c_api_util.ScopedTFFunction(c_func)
   result._extra_inputs = []
+  result._op_def = fdef.signature
   # pylint: enable=protected-access
 
   return result
 
 
-def _from_library(lib):
+def from_library(lib):
   """Creates _DefinedFunctions initialized from a FunctionDefLibrary proto.
 
   This method handles assigning the correct gradient functions to each
diff --git a/tensorflow/python/framework/function_def_to_graph.py b/tensorflow/python/framework/function_def_to_graph.py
index 1803cb906d8ed079a6f798e91e2eaf0dbb64d5bc..4d1aabde06984ded2a6e04d549538bc0afdbdc75 100644
--- a/tensorflow/python/framework/function_def_to_graph.py
+++ b/tensorflow/python/framework/function_def_to_graph.py
@@ -174,9 +174,7 @@ def function_def_to_graph_def(fdef, input_shapes=None):
   # Update inputs of all nodes in graph.
   for node_def in graph_def.node:
     for i in range(len(node_def.input)):
-      # TODO(apassos): how can it not be there?
-      if node_def.input[i] in nested_to_flat_tensor_name:
-        node_def.input[i] = nested_to_flat_tensor_name[node_def.input[i]]
+      node_def.input[i] = nested_to_flat_tensor_name[node_def.input[i]]
 
   return graph_def, nested_to_flat_tensor_name
 
diff --git a/tensorflow/python/framework/function_test.py b/tensorflow/python/framework/function_test.py
index d71f06ea5280da901d503220e9ce5100b9d979b3..7543376bcf274dc6edf821e19838c4aa574826ff 100644
--- a/tensorflow/python/framework/function_test.py
+++ b/tensorflow/python/framework/function_test.py
@@ -1054,6 +1054,28 @@ class FunctionTest(test.TestCase):
         self.assertFalse(all(val3 == val1))
         self.assertFalse(all(val4 == val2))
 
+  def testStatefulFunctionWithWhitelisting(self):
+    t = random_ops.random_uniform([100], maxval=10, dtype=dtypes.int32)
+
+    @function.Defun(capture_by_value=True)
+    def StatefulFn():
+      return t + constant_op.constant(3, dtype=dtypes.int32)
+
+    # First time we try to capture a stateful RandomUniform op.
+    with self.assertRaisesRegexp(ValueError, "Cannot capture a stateful node"):
+      res = StatefulFn()
+
+    # This time we whitelist this op, so that its recreated.
+    @function.Defun(capture_by_value=True, whitelisted_stateful_ops=set([t.op]))
+    def StatefulFn2():
+      return t + constant_op.constant(3, dtype=dtypes.int32)
+
+    res = StatefulFn2()
+    with session.Session() as sess:
+      r = sess.run(res)
+      for i in r:
+        self.assertGreaterEqual(i, 3)
+
   @test_util.run_deprecated_v1
   def testSameFunctionOnTwoDevices(self):
 
@@ -1265,7 +1287,7 @@ class FunctionsFromProtos(test.TestCase):
       gradients_impl.gradients([f1, f2, f3, f4], c)
 
     library = g.as_graph_def().library
-    new_funcs = function._from_library(library)
+    new_funcs = function.from_library(library)
 
     def CheckNewFunc(func):
       new_func = [f for f in new_funcs if f.name == func.name]
@@ -1281,7 +1303,7 @@ class FunctionsFromProtos(test.TestCase):
 
   def testFromLibraryEmptyLib(self):
     library = function_pb2.FunctionDefLibrary()
-    self.assertEqual(len(function._from_library(library)), 0)
+    self.assertEqual(len(function.from_library(library)), 0)
 
   def testFromLibraryMissingFuncDef(self):
 
@@ -1305,7 +1327,7 @@ class FunctionsFromProtos(test.TestCase):
     with self.assertRaisesRegexp(
         ValueError,
         "FunctionDefLibrary missing 'G1_[0-9a-zA-Z]{8,11}' FunctionDef"):
-      function._from_library(library)
+      function.from_library(library)
 
     # Create invalid function def that is missing F1 function def
     library = function_pb2.FunctionDefLibrary()
@@ -1315,7 +1337,7 @@ class FunctionsFromProtos(test.TestCase):
     with self.assertRaisesRegexp(
         ValueError,
         "FunctionDefLibrary missing 'F1_[0-9a-zA-Z]{8,11}' FunctionDef"):
-      function._from_library(library)
+      function.from_library(library)
 
   def testFromLibraryCyclicGradFuncs(self):
 
@@ -1344,7 +1366,7 @@ class FunctionsFromProtos(test.TestCase):
 
     with self.assertRaisesRegexp(
         ValueError, "FunctionDefLibrary contains cyclic gradient functions!"):
-      function._from_library(library)
+      function.from_library(library)
 
   def testExperimentalAttrs(self):
 
diff --git a/tensorflow/python/framework/graph_util_test.py b/tensorflow/python/framework/graph_util_test.py
index 4e7408ad49f1a5cd318ba5c569edb7ee3e496977..dd26b8a78e9d2e13b34770775fcb1219745396e0 100644
--- a/tensorflow/python/framework/graph_util_test.py
+++ b/tensorflow/python/framework/graph_util_test.py
@@ -103,7 +103,7 @@ class DeviceFunctionsTest(test.TestCase):
     self.assertDeviceEqual(var_5.device, "/device:GPU:0")
     self.assertDeviceEqual(var_6.device, "/device:CPU:0")
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testNestedDeviceFunctions(self):
     with ops.Graph().as_default():
       var_0 = variables.VariableV1(0)
diff --git a/tensorflow/python/framework/importer.py b/tensorflow/python/framework/importer.py
index 98c7aeccc4b19edfc433a6556108ef8b77d12aa4..c737bd48811a664a6d51af26d1137223ba74379c 100644
--- a/tensorflow/python/framework/importer.py
+++ b/tensorflow/python/framework/importer.py
@@ -442,11 +442,9 @@ def import_graph_def(graph_def,
     _ProcessNewOps(graph)
 
   if graph_def.library and graph_def.library.function:
-    # pylint: disable=protected-access
-    functions = function._from_library(graph_def.library)
+    functions = function.from_library(graph_def.library)
     for f in functions:
       f.add_to_graph(graph)
-    # pylint: enable=protected-access
 
   # Treat input mappings that don't appear in the graph as an error, because
   # they are likely to be due to a typo.
diff --git a/tensorflow/python/framework/meta_graph_test.py b/tensorflow/python/framework/meta_graph_test.py
index 46ce4616a5099860649974d8575daa5b8268db35..e6e87881649729ca65db8cba9914e29b5a0d064e 100644
--- a/tensorflow/python/framework/meta_graph_test.py
+++ b/tensorflow/python/framework/meta_graph_test.py
@@ -528,7 +528,7 @@ class ScopedMetaGraphTest(test.TestCase):
         actual_grad_value = self.evaluate(grad)
         self.assertEqual(expected_grad_value, actual_grad_value)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testImportWhileLoopInWhileLoop(self):
     # Create a simple while loop.
     with ops.Graph().as_default():
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index b5175d3c93bff6de590a4da4382e2dc531299560..27c56ef990a8d32c5c224c635d59761f09d482a1 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -43,7 +43,6 @@ from tensorflow.python.eager import tape
 from tensorflow.python.framework import c_api_util
 from tensorflow.python.framework import device as pydev
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import error_interpolation
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import op_def_registry
 from tensorflow.python.framework import registry
@@ -1113,10 +1112,9 @@ def internal_convert_to_tensor(value,
   if ctx is None: ctx = context.context()
   if isinstance(value, EagerTensor):
     if ctx.executing_eagerly():
-      # Fast path for EagerTensors that don't need any conversion.
-      # Note that we don't check that value's dtype matches the dtype
-      # argument.  We expect that the C runtime will do that checking
-      # when we execute the kernel.
+      if dtype is not None:
+        dtype = dtypes.as_dtype(dtype)
+        value = _TensorTensorConversionFunction(value, dtype=dtype)
       return value
     else:
       graph = get_default_graph()
@@ -2087,6 +2085,31 @@ class Operation(object):
         tensor._as_tf_output(),  # pylint: disable=protected-access
         self._tf_input(index))
 
+  def _add_while_inputs(self, tensors):
+    """See AddWhileInputHack in python_api.h.
+
+    NOTE: This is for TF internal use only. Please don't use it.
+
+    Args:
+      tensors: list of Tensors
+
+    Raises:
+      TypeError: if tensor is not a Tensor,
+        or if input tensor type is not convertible to dtype.
+      ValueError: if the Tensor is from a different graph.
+    """
+    for tensor in tensors:
+      if not isinstance(tensor, Tensor):
+        raise TypeError("tensor must be a Tensor: %s" % tensor)
+      _assert_same_graph(self, tensor)
+
+      # Reset cached inputs.
+      self._inputs_val = None
+      c_api.AddWhileInputHack(
+          self._graph._c_graph,  # pylint: disable=protected-access
+          tensor._as_tf_output(),  # pylint: disable=protected-access
+          self._c_op)
+
   def _add_control_inputs(self, ops):
     """Add a list of new control inputs to this operation.
 
@@ -2873,6 +2896,9 @@ class Graph(object):
     self._last_loss_reduction = None
     self._container = ""
     self._registered_ops = op_def_registry.get_registered_ops()
+    # Set to True if this graph is being built in an
+    # AutomaticControlDependencies context.
+    self._add_control_dependencies = False
 
     # TODO(skyewm): fold as much of the above as possible into the C
     # implementation
@@ -3307,36 +3333,6 @@ class Graph(object):
     self._create_op_helper(ret, compute_device=compute_device)
     return ret
 
-  def _make_colocation_conflict_message(self, op, colocation_op):
-    """Return detailed error message about device conflict due to colocation."""
-    # Example error message:
-    #   Tried to colocate op 'a' (defined at file1.py:149) having device
-    #   '/device:GPU:0' with op 'b' (defined at file2:96) which had an
-    #   incompatible device '/device:CPU:0'.
-    #
-    #   No node-device colocations were active during op 'a' creation.
-    #   Device assignments active during op 'a' creation:
-    #     with tf.device(/device:GPU:0): file1.py:148>
-    #
-    #   Node-device colocations active during op 'b' creation:
-    #     with tf.colocate_with(a): file2.py:93>
-    #   Device assignments active during op 'b' creation:
-    #     with tf.device(/cpu:0): file2.py:94
-    op_info = error_interpolation.compute_field_dict(op)
-    coloc_op_info = error_interpolation.compute_field_dict(colocation_op)
-    msg = ("Tried to colocate op '{op_name}'{op_loc} having device '{op_dev}' "
-           "with op '{coloc_op_name}'{coloc_op_loc} which had an incompatible "
-           "device '{coloc_op_dev}'.\n\n{op_summary}\n\n{coloc_op_summary}"
-           .format(op_name=op.name,
-                   op_loc=op_info["defined_at"],
-                   op_dev=op.device,
-                   op_summary=op_info["devs_and_colocs"],
-                   coloc_op_name=colocation_op.name,
-                   coloc_op_loc=coloc_op_info["defined_at"],
-                   coloc_op_dev=colocation_op.device,
-                   coloc_op_summary=coloc_op_info["devs_and_colocs"]))
-    return msg
-
   def _create_op_helper(self, op, compute_device=True):
     """Common logic for creating an op in this graph."""
     # Apply any additional attributes requested. Do not overwrite any existing
@@ -3389,12 +3385,9 @@ class Graph(object):
       for colocation_op in self._colocation_stack.peek_objs():
         all_colocation_groups.extend(colocation_op.colocation_groups())
         if colocation_op.device:
-          if (op.device and pydev.canonical_name(op.device) !=
-              pydev.canonical_name(colocation_op.device)):
-            msg = self._make_colocation_conflict_message(op, colocation_op)
-            logging.warning(msg)
-          else:
-            op._set_device(colocation_op.device)  # pylint: disable=protected-access
+          # pylint: disable=protected-access
+          op._set_device(colocation_op.device)
+          # pylint: enable=protected-access
 
       all_colocation_groups = sorted(set(all_colocation_groups))
       # pylint: disable=protected-access
@@ -5385,6 +5378,10 @@ def init_scope():
 
 def executing_eagerly_outside_functions():
   """Returns True if executing eagerly, even if inside a graph function."""
+  # Fastpath for when this is called eagerly (its not necessary to init_scope).
+  if context.executing_eagerly():
+    return True
+
   with init_scope():
     return context.executing_eagerly()
 
@@ -5393,7 +5390,7 @@ def inside_function():
   return get_default_graph().building_function
 
 
-@tf_export("enable_eager_execution")
+@tf_export(v1=["enable_eager_execution"])
 def enable_eager_execution(config=None,
                            device_policy=None,
                            execution_mode=None):
@@ -5464,6 +5461,17 @@ def enable_eager_execution(config=None,
         server_def=None)
 
 
+@tf_export(v1=["disable_eager_execution"])
+def disable_eager_execution():
+  """Disables eager execution.
+
+  This function can only be called before any Graphs, Ops, or Tensors have been
+  created. It can be used at the beginning of the program for complex migration
+  projects from TensorFlow 1.x to 2.x.
+  """
+  context.default_execution_mode = context.GRAPH_MODE
+
+
 def enable_eager_execution_internal(config=None,
                                     device_policy=None,
                                     execution_mode=None,
@@ -5471,6 +5479,7 @@ def enable_eager_execution_internal(config=None,
   """Enables eager execution for the lifetime of this program.
 
   Most of the doc string for enable_eager_execution is relevant here as well.
+
   Args:
     config: See enable_eager_execution doc string
     device_policy: See enable_eager_execution doc string
diff --git a/tensorflow/python/framework/ops_test.py b/tensorflow/python/framework/ops_test.py
index 04a9ed05fd2631bfd78414ebe30bcb8721f34f3b..2d7ee1a99e02cbb663df38ae17d8772fa6f11816 100644
--- a/tensorflow/python/framework/ops_test.py
+++ b/tensorflow/python/framework/ops_test.py
@@ -146,15 +146,14 @@ class TensorAndShapeTest(test_util.TensorFlowTestCase):
 
 class IndexedSlicesTest(test_util.TensorFlowTestCase):
 
-  @test_util.run_deprecated_v1
+  @test_util.run_in_graph_and_eager_modes
   def testToTensor(self):
-    with self.cached_session():
-      values = constant_op.constant([2, 3, 5, 7], shape=[2, 2])
-      indices = constant_op.constant([0, 2])
-      dense_shape = constant_op.constant([3, 2])
-      x = ops.IndexedSlices(values, indices, dense_shape)
-      tensor = ops.convert_to_tensor(x, name="tensor")
-      self.assertAllEqual(tensor.eval(), [[2, 3], [0, 0], [5, 7]])
+    values = constant_op.constant([2, 3, 5, 7], shape=[2, 2])
+    indices = constant_op.constant([0, 2])
+    dense_shape = constant_op.constant([3, 2])
+    x = ops.IndexedSlices(values, indices, dense_shape)
+    tensor = ops.convert_to_tensor(x, name="tensor")
+    self.assertAllEqual(self.evaluate(tensor), [[2, 3], [0, 0], [5, 7]])
 
   @test_util.run_deprecated_v1
   def testNegation(self):
@@ -325,12 +324,12 @@ class OperationTest(test_util.TensorFlowTestCase):
     op = test_ops.a()
     self.assertEqual(tensor_shape.unknown_shape(), op.get_shape())
 
+  @test_util.run_in_graph_and_eager_modes
   def testConvertToTensorNestedArray(self):
-    with self.cached_session():
-      values = [[2], [3], [5], [7]]
-      tensor = ops.convert_to_tensor(values)
-      self.assertAllEqual((4, 1), tensor.get_shape().as_list())
-      self.assertAllEqual(values, self.evaluate(tensor))
+    values = [[2], [3], [5], [7]]
+    tensor = ops.convert_to_tensor(values)
+    self.assertAllEqual((4, 1), tensor.get_shape().as_list())
+    self.assertAllEqual(values, self.evaluate(tensor))
 
   def testShapeTuple(self):
     with self.cached_session():
@@ -346,57 +345,61 @@ class OperationTest(test_util.TensorFlowTestCase):
       converted = ops.convert_to_tensor(1)
       self.assertTrue(isinstance(converted, ops.EagerTensor))
 
-  @test_util.run_deprecated_v1
+  @test_util.run_in_graph_and_eager_modes
   def testConvertToTensorNestedTuple(self):
-    with self.cached_session():
-      values = ((2,), (3,), (5,), (7,))
-      tensor = ops.convert_to_tensor(values)
-      self.assertAllEqual((4, 1), tensor.get_shape().as_list())
-      self.assertAllEqual(values, ops.convert_to_tensor(values).eval())
+    values = ((2,), (3,), (5,), (7,))
+    tensor = ops.convert_to_tensor(values)
+    self.assertAllEqual((4, 1), tensor.get_shape().as_list())
+    self.assertAllEqual(values, self.evaluate(ops.convert_to_tensor(values)))
 
+  @test_util.run_in_graph_and_eager_modes
   def testConvertToTensorNestedTensors(self):
-    with self.cached_session():
-      values = ((2,), (3,), (5,), (7,))
-      tensor = ops.convert_to_tensor(
-          [constant_op.constant(row) for row in values])
-      self.assertAllEqual((4, 1), tensor.get_shape().as_list())
-      self.assertAllEqual(values, self.evaluate(tensor))
-      tensor = ops.convert_to_tensor(
-          [[constant_op.constant(v) for v in row] for row in values])
-      self.assertAllEqual((4, 1), tensor.get_shape().as_list())
-      self.assertAllEqual(values, self.evaluate(tensor))
+    values = ((2,), (3,), (5,), (7,))
+    tensor = ops.convert_to_tensor(
+        [constant_op.constant(row) for row in values])
+    self.assertAllEqual((4, 1), tensor.get_shape().as_list())
+    self.assertAllEqual(values, self.evaluate(tensor))
+    tensor = ops.convert_to_tensor(
+        [[constant_op.constant(v) for v in row] for row in values])
+    self.assertAllEqual((4, 1), tensor.get_shape().as_list())
+    self.assertAllEqual(values, self.evaluate(tensor))
 
+  @test_util.run_in_graph_and_eager_modes
   def testConvertToTensorNestedMix(self):
-    with self.cached_session():
-      values = ([2], (3,), [constant_op.constant(5)], constant_op.constant([7]))
-      tensor = ops.convert_to_tensor(values)
-      self.assertAllEqual((4, 1), tensor.get_shape().as_list())
-      self.assertAllEqual(((2,), (3,), (5,), (7,)), self.evaluate(tensor))
+    values = ([2], (3,), [constant_op.constant(5)], constant_op.constant([7]))
+    tensor = ops.convert_to_tensor(values)
+    self.assertAllEqual((4, 1), tensor.get_shape().as_list())
+    self.assertAllEqual(((2,), (3,), (5,), (7,)), self.evaluate(tensor))
 
+  @test_util.run_in_graph_and_eager_modes
   def testConvertToTensorPreferred(self):
-    with self.cached_session():
-      values = [2, 3, 5, 7]
-      tensor = ops.convert_to_tensor(values, preferred_dtype=dtypes.float32)
-      self.assertEqual(dtypes.float32, tensor.dtype)
+    values = [2, 3, 5, 7]
+    tensor = ops.convert_to_tensor(values, preferred_dtype=dtypes.float32)
+    self.assertEqual(dtypes.float32, tensor.dtype)
 
-    with self.cached_session():
-      # Convert empty tensor to anything.
-      values = []
-      tensor = ops.convert_to_tensor(values, preferred_dtype=dtypes.int64)
-      self.assertEqual(dtypes.int64, tensor.dtype)
+    # Convert empty tensor to anything.
+    values = []
+    tensor = ops.convert_to_tensor(values, preferred_dtype=dtypes.int64)
+    self.assertEqual(dtypes.int64, tensor.dtype)
 
-    with self.cached_session():
-      # The preferred dtype is a type error and will convert to
-      # float32 instead.
-      values = [1.23]
-      tensor = ops.convert_to_tensor(values, preferred_dtype=dtypes.int64)
-      self.assertEqual(dtypes.float32, tensor.dtype)
+    # The preferred dtype is a type error and will convert to
+    # float32 instead.
+    values = [1.23]
+    tensor = ops.convert_to_tensor(values, preferred_dtype=dtypes.int64)
+    self.assertEqual(dtypes.float32, tensor.dtype)
 
+  @test_util.run_in_graph_and_eager_modes
   def testConvertToInvalidTensorType(self):
     with self.assertRaises(TypeError):
       # Forcing an invalid dtype should fail with a type error.
       values = [1.23]
-      _ = ops.convert_to_tensor(values, dtype=dtypes.int64)
+      ops.convert_to_tensor(values, dtype=dtypes.int64)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testConvertToTensorFromInvalidTensor(self):
+    tensor = constant_op.constant(42.0, dtype=dtypes.float32)
+    with self.assertRaises(ValueError):
+      ops.convert_to_tensor(tensor, dtype=dtypes.int32)
 
   @test_util.run_deprecated_v1
   def testNoConvert(self):
@@ -601,6 +604,35 @@ class OperationTest(test_util.TensorFlowTestCase):
     ):
       x.op._update_input(1, x)  # pylint: disable=protected-access
 
+  @test_util.enable_control_flow_v2
+  @test_util.run_v1_only("b/120545219")
+  def testAddWhileInput(self):
+    @eager_function.defun
+    def test():
+      output = control_flow_ops.while_loop(lambda x: x < 3, lambda x: x + 1,
+                                           [1])
+      while_op = output.op.inputs[0].op
+      self.assertEqual(while_op.type, "While")
+      orig_num_inputs = len(while_op.inputs)
+
+      # Make sure we can handle the while op having a control input.
+      while_op._add_control_input(constant_op.constant(0).op)
+
+      new_input1 = constant_op.constant(1.0)
+      new_input2 = constant_op.constant(True)
+
+      while_op._set_type_list_attr("T",
+                                   [t.dtype for t in while_op.inputs] +
+                                   [new_input1.dtype, new_input2.dtype])
+
+      while_op._add_while_inputs([new_input1, new_input2])
+      # Can't add an edge beyond what's specified by "T"
+      with self.assertRaises(errors.OutOfRangeError):
+        while_op._add_while_inputs([new_input2])
+      self.assertEqual(len(while_op.inputs), orig_num_inputs + 2)  # pylint: disable=g-deprecated-assert
+
+    test()
+
   @test_util.run_deprecated_v1
   def testOpDef(self):
     x = constant_op.constant(0)
@@ -752,7 +784,7 @@ class CreateOpFromTFOperationTest(test_util.TensorFlowTestCase):
     self.assertEqual(op3.name, "myop_2")
     self.assertEqual(op4.name, "myop_1_1")
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testCond(self):
     g = ops.Graph()
     with g.as_default():
@@ -782,7 +814,7 @@ class CreateOpFromTFOperationTest(test_util.TensorFlowTestCase):
                      "cond/cond_text")
     # pylint: enable=protected-access
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testWhileLoop(self):
     g = ops.Graph()
     with g.as_default():
@@ -812,7 +844,7 @@ class CreateOpFromTFOperationTest(test_util.TensorFlowTestCase):
                      "myloop/while_context")
     # pylint: enable=protected-access
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testWhileLoopWithInternalControlDep(self):
     g = ops.Graph()
     with g.as_default():
@@ -836,7 +868,7 @@ class CreateOpFromTFOperationTest(test_util.TensorFlowTestCase):
     # Internal control dep is preserved
     self.assertEqual(op.control_inputs, [c])
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testWhileLoopWithExternalControlDep(self):
     g = ops.Graph()
     with g.as_default():
@@ -2255,7 +2287,7 @@ class InitScopeTest(test_util.TensorFlowTestCase):
       self.assertEqual(4, int(compiled_outer(inner=compiled_inner)))
       self.assertEqual(7, int(compiled_outer(inner=compiled_inner)))
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testFallsBackToGlobalGraphWhenAllGraphsAreBuildingFunctions(self):
     with context.graph_mode():
       ops.reset_default_graph()
@@ -2822,43 +2854,6 @@ class ColocationGroupTest(test_util.TensorFlowTestCase):
       b = variables.Variable([3.0], name="b")
     self.assertEqual([b"loc:@a"], b.op.colocation_groups())
 
-  @test_util.run_deprecated_v1
-  def testInconsistentDeviceWithinColocate(self):
-    with ops.device("/device:GPU:0"):
-      a = constant_op.constant([2.0], name="a")
-      with ops.colocate_with(a.op):
-        # This is allowed due to legacy but clearly wrong, since we
-        # should really be colocating with 'a'.  We allow devices to
-        # override colocate_with, but we log warnings to suggest that
-        # this is probably unintentional or misguided.
-        with ops.device("/cpu:0"):
-          b = constant_op.constant([3.0], name="b")
-
-    self.assertEqual("/device:CPU:0", b.device)
-
-  @test_util.run_deprecated_v1
-  def testMakeColocationConflictMessage(self):
-    """Test that provides an example of a complicated error message."""
-    # We could test the message with any ops, but this test will be more
-    # instructive with a real colocation conflict.
-    with ops.device("/device:GPU:0"):
-      a = constant_op.constant([2.0], name="a")
-      with ops.colocate_with(a.op):
-        with ops.device("/cpu:0"):
-          b = constant_op.constant([3.0], name="b")
-    # The definition-location of the nodes will be wrong because of running
-    # from within a TF unittest.  The rest of the info should be correct.
-    message = ops.get_default_graph()._make_colocation_conflict_message(a.op,
-                                                                        b.op)
-    self.assertRegexpMatches(message,
-                             r"Tried to colocate op 'a' \(defined at.*\)")
-    self.assertRegexpMatches(message, "No node-device.*'a'")
-    self.assertRegexpMatches(message, "Device assignments active.*'a'")
-    self.assertRegexpMatches(message, "GPU:0")
-    self.assertRegexpMatches(message, "Node-device colocations active.*'b'")
-    self.assertRegexpMatches(message, "Device assignments active.*'b'")
-    self.assertRegexpMatches(message, "cpu:0")
-
 
 class DeprecatedTest(test_util.TensorFlowTestCase):
 
@@ -3003,7 +2998,7 @@ class TracebackTest(test_util.TensorFlowTestCase):
 
 class EnableEagerExecutionTest(test_util.TensorFlowTestCase):
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testBadArgumentsToEnableEagerExecution(self):
     with self.assertRaisesRegexp(TypeError, "config must be a tf.ConfigProto"):
       ops.enable_eager_execution(context.DEVICE_PLACEMENT_SILENT)
diff --git a/tensorflow/python/framework/python_op_gen.cc b/tensorflow/python/framework/python_op_gen.cc
index 465016b808726f28909013e994b9b23b915d982a..d460168631c3032bb91894c9997b2de29bf026e6 100644
--- a/tensorflow/python/framework/python_op_gen.cc
+++ b/tensorflow/python/framework/python_op_gen.cc
@@ -142,6 +142,7 @@ class GenEagerPythonOp : public python_op_gen_internal::GenPythonOp {
   void AddEagerAttrs(const string& indentation);
   void AddEagerExecute(const string& indentation,
                        const string& num_outputs_expr);
+  void AddDispatch(const string& prefix);
 
   void AddAttrForArg(const string& attr, int arg_index) {
     gtl::InsertIfNotPresent(&inferred_attrs_, attr,
@@ -356,9 +357,14 @@ string GenEagerPythonOp::Code() {
 
 void GenEagerPythonOp::HandleGraphMode(const string& function_setup) {
   strings::StrAppend(&result_, "  # Add nodes to the TensorFlow graph.\n");
-  strings::StrAppend(&result_, function_setup,
-                     "  _, _, _op = _op_def_lib._apply_op_helper(\n");
-  AddBodyNoReturn("        ");
+  strings::StrAppend(&result_, function_setup);
+  if (api_def_.visibility() == ApiDef::VISIBLE) {
+    strings::StrAppend(&result_, "  try:\n  ");
+  }
+  strings::StrAppend(&result_, "  _, _, _op = _op_def_lib._apply_op_helper(\n");
+  AddBodyNoReturn(strings::StrCat("        \"", op_def_.name(), "\", "));
+  AddDispatch("  ");
+
   if (num_outs_ > 0) {
     strings::StrAppend(&result_, "  _result = _op.outputs[:]\n");
     // Special case handling for stateful op with single list output
@@ -628,6 +634,9 @@ void GenEagerPythonOp::AddEagerFunctionTeardown(
 bool GenEagerPythonOp::AddEagerFastPathAndGraphCode(
     const string& parameters, const std::vector<string>& output_sizes,
     const string& eager_not_allowed_error) {
+  if (api_def_.visibility() == ApiDef::VISIBLE) {
+    strings::StrAppend(&result_, "@_dispatch.add_dispatch_list\n");
+  }
   AddExport();
   AddDefLine(function_name_, parameters);
   AddDocStringDescription();
@@ -758,6 +767,7 @@ void GenEagerPythonOp::AddEagerFastPathExecute() {
   strings::StrAppend(&result_, "      except _core._SymbolicException:\n");
   strings::StrAppend(&result_,
                      "        pass  # Add nodes to the TensorFlow graph.\n");
+  AddDispatch("      ");
 
   // Any errors thrown from execute need to be unwrapped from
   // _NotOkStatusException.
@@ -898,6 +908,19 @@ void GenEagerPythonOp::AddEagerExecute(const string& indentation,
                      WordWrap(return_prefix, return_args, kRightMargin), "\n");
 }
 
+void GenEagerPythonOp::AddDispatch(const string& prefix) {
+  if (api_def_.visibility() != ApiDef::VISIBLE) return;
+
+  strings::StrAppend(&result_, prefix, "except (TypeError, ValueError):\n");
+  strings::StrAppend(&result_, prefix, "  result = _dispatch.dispatch(\n");
+  AddBodyNoReturn(strings::StrCat(prefix, "        ", function_name_, ", "));
+  strings::StrAppend(&result_, prefix,
+                     "  if result is not "
+                     "_dispatch.OpDispatcher.NOT_SUPPORTED:\n");
+  strings::StrAppend(&result_, prefix, "    return result\n");
+  strings::StrAppend(&result_, prefix, "  raise\n");
+}
+
 string GetPythonOps(const OpList& ops, const ApiDefMap& api_defs,
                     const std::vector<string>& hidden_ops, bool require_shapes,
                     const string& source_file_name = "") {
@@ -937,6 +960,7 @@ from tensorflow.python.framework import op_def_registry as _op_def_registry
 from tensorflow.python.framework import ops as _ops
 from tensorflow.python.framework import op_def_library as _op_def_library
 from tensorflow.python.util.deprecation import deprecated_endpoints
+from tensorflow.python.util import dispatch as _dispatch
 from tensorflow.python.util.tf_export import tf_export
 
 )");
diff --git a/tensorflow/python/framework/python_op_gen_internal.cc b/tensorflow/python/framework/python_op_gen_internal.cc
index 65b9ad5c6a2b5170a70ce376114feff27bb622d2..cbdeecfbfb93ad776ff9d3db755503c47970d330 100644
--- a/tensorflow/python/framework/python_op_gen_internal.cc
+++ b/tensorflow/python/framework/python_op_gen_internal.cc
@@ -804,8 +804,8 @@ void GenPythonOp::AddDocStringOutputs() {
 }
 
 void GenPythonOp::AddBody(const string& prefix) {
-  const string apply_prefix =
-      strings::StrCat(prefix, "_result = _op_def_lib.apply_op(");
+  const string apply_prefix = strings::StrCat(
+      prefix, "_result = _op_def_lib.apply_op(\"", op_def_.name(), "\", ");
   AddBodyNoReturn(apply_prefix);
   if (num_outs_ > 1) {
     strings::StrAppend(&result_, prefix, "_result = _", op_def_.name(),
@@ -815,7 +815,7 @@ void GenPythonOp::AddBody(const string& prefix) {
 }
 
 void GenPythonOp::AddBodyNoReturn(const string& apply_prefix) {
-  string args = strings::StrCat("\"", op_def_.name(), "\", ");
+  string args;
   for (size_t i = 0; i < param_names_.size(); ++i) {
     strings::StrAppend(&args, AvoidPythonReserved(param_names_[i].GetName()),
                        "=", param_names_[i].GetRenameTo(), ", ");
diff --git a/tensorflow/python/framework/sparse_tensor.py b/tensorflow/python/framework/sparse_tensor.py
index 3643fc5e00475b8d2ebc2e2fc23fa6fd19bea114..5e1a95a26be034bff0a1f5eb996ac6f16c61e282 100644
--- a/tensorflow/python/framework/sparse_tensor.py
+++ b/tensorflow/python/framework/sparse_tensor.py
@@ -244,7 +244,7 @@ class SparseTensor(_TensorLike):
 
 SparseTensorValue = collections.namedtuple(
     "SparseTensorValue", ["indices", "values", "dense_shape"])
-tf_export("SparseTensorValue")(SparseTensorValue)
+tf_export(v1=["SparseTensorValue"])(SparseTensorValue)
 pywrap_tensorflow.RegisterType("SparseTensorValue", SparseTensorValue)
 
 
diff --git a/tensorflow/python/framework/subscribe_test.py b/tensorflow/python/framework/subscribe_test.py
index 61c6ea651903b0434835f9f7b8ba5ed490a74415..a74e96f9d9d6469b66426dd85628f926297afcd0 100644
--- a/tensorflow/python/framework/subscribe_test.py
+++ b/tensorflow/python/framework/subscribe_test.py
@@ -215,7 +215,7 @@ class SubscribeTest(test_util.TensorFlowTestCase):
     self.assertIn('graph2', shared)
     self.assertIn('graph3', shared)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only('b/120545219')
   def testSubscribeVariable(self):
     """Confirm that variables can be subscribed."""
     v1 = variables.VariableV1(0.0)
@@ -254,7 +254,7 @@ class SubscribeTest(test_util.TensorFlowTestCase):
       # Make sure the values read from the variable match the expected ones.
       self.assertEqual([0.0, 3.0], shared)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only('b/120545219')
   def testResourceType(self):
     """Confirm that subscribe correctly handles tensors with 'resource' type."""
     tensor_array = tensor_array_ops.TensorArray(
@@ -344,7 +344,7 @@ class SubscribeTest(test_util.TensorFlowTestCase):
     self.assertEqual(add.device, add_sub.device)
     self.assertEqual(mul.device, mul_sub.device)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only('b/120545219')
   def test_subscribe_tensors_within_control_flow_context(self):
     """Side effect ops are added with the same control flow context."""
     c1 = constant_op.constant(10)
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index 155e45d16c0cd65bf8079c0d4c66ecf9919d6fc2..ffab93c84eac1e024914a05058a5d2bc49ae5fb9 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -54,6 +54,7 @@ from tensorflow.python import tf2
 from tensorflow.python.client import device_lib
 from tensorflow.python.client import session
 from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
 from tensorflow.python.eager import tape
 from tensorflow.python.framework import device as pydev
 from tensorflow.python.framework import dtypes
@@ -66,16 +67,16 @@ from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import versions
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import tensor_array_ops
+from tensorflow.python.ops import control_flow_util
+from tensorflow.python.ops import script_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import server_lib
 from tensorflow.python.util import compat
 from tensorflow.python.util import deprecation
-from tensorflow.python.util import memory
 from tensorflow.python.util import nest
+from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_inspect
 from tensorflow.python.util.protobuf import compare
 from tensorflow.python.util.tf_export import tf_export
@@ -406,42 +407,12 @@ def enable_control_flow_v2(fn):
   """
 
   def wrapper(*args, **kwargs):
-    enable_cond_v2_old = control_flow_ops.ENABLE_COND_V2
-    enable_while_v2_old = control_flow_ops.ENABLE_WHILE_V2
-    enable_tensor_array_v2_old = tensor_array_ops.ENABLE_TENSOR_ARRAY_V2
-    control_flow_ops.ENABLE_COND_V2 = True
-    control_flow_ops.ENABLE_WHILE_V2 = True
-    tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 = True
+    enable_control_flow_v2_old = control_flow_util.ENABLE_CONTROL_FLOW_V2
+    control_flow_util.ENABLE_CONTROL_FLOW_V2 = True
     try:
       fn(*args, **kwargs)
     finally:
-      control_flow_ops.ENABLE_COND_V2 = enable_cond_v2_old
-      control_flow_ops.ENABLE_WHILE_V2 = enable_while_v2_old
-      tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 = enable_tensor_array_v2_old
-
-  return wrapper
-
-
-def enable_tensor_array_v2(fn):
-  """Decorator for enabling _GraphTensorArrayV2 on a test.
-
-  Note this enables _GraphTensorArrayV2 after running the test class's
-  setup/teardown methods.
-
-  Args:
-    fn: the function to be wrapped
-
-  Returns:
-    The wrapped function
-  """
-
-  def wrapper(*args, **kwargs):
-    enable_tensor_array_v2_old = tensor_array_ops.ENABLE_TENSOR_ARRAY_V2
-    tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 = True
-    try:
-      fn(*args, **kwargs)
-    finally:
-      tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 = enable_tensor_array_v2_old
+      control_flow_util.ENABLE_CONTROL_FLOW_V2 = enable_control_flow_v2_old
 
   return wrapper
 
@@ -490,11 +461,12 @@ def with_control_flow_v2(cls):
   Returns:
     cls with new test methods added
   """
-  if control_flow_ops.ENABLE_WHILE_V2 and control_flow_ops.ENABLE_COND_V2:
+  if control_flow_util.ENABLE_CONTROL_FLOW_V2:
     return cls
 
   for name, value in cls.__dict__.copy().items():
-    if (callable(value) and name.startswith("test") and
+    if (callable(value) and
+        name.startswith(unittest.TestLoader.testMethodPrefix) and
         not getattr(value, "_disable_control_flow_v2", False)):
       setattr(cls, name + "WithControlFlowV2", enable_control_flow_v2(value))
   return cls
@@ -893,8 +865,10 @@ def run_all_in_graph_and_eager_modes(cls):
   """Execute all test methods in the given class with and without eager."""
   base_decorator = run_in_graph_and_eager_modes
   for name, value in cls.__dict__.copy().items():
-    if callable(value) and name.startswith("test") and not (
-        name.startswith("testSkipEager") or name.startswith("test_skip_eager")):
+    if (callable(value) and
+        name.startswith(unittest.TestLoader.testMethodPrefix) and
+        not (name.startswith("testSkipEager")
+             or name.startswith("test_skip_eager"))):
       setattr(cls, name, base_decorator(value))
   return cls
 
@@ -1006,6 +980,59 @@ def run_in_graph_and_eager_modes(func=None,
   return decorator
 
 
+def py_func_if_in_function(f):
+
+  def decorated(*args, **kwds):
+    if not ops.get_default_graph()._building_function:
+      return f(*args, **kwds)
+
+    tensor_args, tensor_indices = zip(
+        *[(x, i) for i, x in enumerate(args)
+          if isinstance(x, (ops.Tensor, variables.Variable))])
+
+    def inner_f(*inner_tensor_args):
+      my_args = list(args)
+      for i, n in zip(tensor_indices, inner_tensor_args):
+        my_args[i] = n
+      return f(*my_args, **kwds)
+
+    return script_ops.py_func(inner_f, tensor_args, [])
+
+  return tf_decorator.make_decorator(f, decorated)
+
+
+def also_run_as_tf_function(f):
+  """Runs the decorated test twice--once as is, once inside a tf.function.
+
+  This allows you to run a test both in eager execution and inside a
+  tf.function, exercising the two execution modes supported in tf 2.0. The test
+  assertions are automatically done inside tf.py_funcs, and tf.function ensures
+  that they run in the proper order and with the proper side effects.
+
+  Currently variable creation is not supported in tests annotated with this
+  decorator since it's tricky to ensure the variable doesn't get repeatedly
+  created when retracing the tf.function.
+
+  Args:
+    f: the test method to be decorated
+
+  Returns:
+    The decorated test method, which will run both in eager and inside a
+    tf.function.
+  """
+
+  def decorated(*args, **kwds):
+    def bound_f():
+      f(*args, **kwds)
+    with context.eager_mode():
+      # Running in eager mode
+      bound_f()
+      # Running as TF function
+      def_function.function(bound_f)()
+
+  return decorated
+
+
 def run_deprecated_v1(func=None):
   """Execute the decorated test in graph mode.
 
@@ -1041,6 +1068,148 @@ def run_deprecated_v1(func=None):
   return decorator
 
 
+def run_v1_only(reason, func=None):
+  """Execute the decorated test only if running in v1 mode.
+
+  This function is intended to be applied to tests that exercise v1 only
+  functionality. If the test is run in v2 mode it will simply be skipped.
+
+  Args:
+    reason: string giving a reason for limiting the test to v1 only.
+    func: function to be annotated. If `func` is None, this method returns a
+      decorator the can be applied to a function. If `func` is not None this
+      returns the decorator applied to `func`.
+
+  Returns:
+    Returns a decorator that will conditionally skip the decorated test method.
+  """
+
+  def decorator(f):
+    if tf_inspect.isclass(f):
+      setup = f.__dict__.get("setUp")
+      if setup is not None:
+        setattr(f, "setUp", decorator(setup))
+
+      for name, value in f.__dict__.copy().items():
+        if (callable(value) and
+            name.startswith(unittest.TestLoader.testMethodPrefix)):
+          setattr(f, name, decorator(value))
+
+      return f
+
+    def decorated(self, *args, **kwargs):
+      if tf2.enabled():
+        self.skipTest(reason)
+
+      f(self, *args, **kwargs)
+
+    return decorated
+
+  if func is not None:
+    return decorator(func)
+
+  return decorator
+
+
+def run_v2_only(func=None):
+  """Execute the decorated test only if running in v2 mode.
+
+  This function is intended to be applied to tests that exercise v2 only
+  functionality. If the test is run in v1 mode it will simply be skipped.
+
+  Args:
+    func: function to be annotated. If `func` is None, this method returns a
+      decorator the can be applied to a function. If `func` is not None this
+      returns the decorator applied to `func`.
+
+  Returns:
+    Returns a decorator that will conditionally skip the decorated test method.
+  """
+
+  def decorator(f):
+    if tf_inspect.isclass(f):
+      raise ValueError("`run_v2_only` only supports test methods.")
+
+    def decorated(self, *args, **kwargs):
+      if not tf2.enabled():
+        self.skipTest("Test is only comptaible in v2")
+
+      f(self, *args, **kwargs)
+
+    return decorated
+
+  if func is not None:
+    return decorator(func)
+
+  return decorator
+
+
+def run_gpu_only(func=None):
+  """Execute the decorated test only if a GPU is available.
+
+  This function is intended to be applied to tests that require the precense
+  of a GPU. If a GPU is absent, it will simply be skipped.
+
+  Args:
+    func: function to be annotated. If `func` is None, this method returns a
+      decorator the can be applied to a function. If `func` is not None this
+      returns the decorator applied to `func`.
+
+  Returns:
+    Returns a decorator that will conditionally skip the decorated test method.
+  """
+
+  def decorator(f):
+    if tf_inspect.isclass(f):
+      raise ValueError("`run_gpu_only` only supports test methods.")
+
+    def decorated(self, *args, **kwargs):
+      if not is_gpu_available():
+        self.skipTest("Test requires GPU")
+
+      f(self, *args, **kwargs)
+
+    return decorated
+
+  if func is not None:
+    return decorator(func)
+
+  return decorator
+
+
+def run_cuda_only(func=None):
+  """Execute the decorated test only if a GPU is available.
+
+  This function is intended to be applied to tests that require the precense
+  of a CUDA GPU. If a CUDA GPU is absent, it will simply be skipped.
+
+  Args:
+    func: function to be annotated. If `func` is None, this method returns a
+      decorator the can be applied to a function. If `func` is not None this
+      returns the decorator applied to `func`.
+
+  Returns:
+    Returns a decorator that will conditionally skip the decorated test method.
+  """
+
+  def decorator(f):
+    if tf_inspect.isclass(f):
+      raise ValueError("`run_cuda_only` only supports test methods.")
+
+    def decorated(self, *args, **kwargs):
+      if not is_gpu_available(cuda_only=True):
+        self.skipTest("Test requires CUDA GPU")
+
+      f(self, *args, **kwargs)
+
+    return decorated
+
+  if func is not None:
+    return decorator(func)
+
+  return decorator
+
+
 @tf_export("test.is_gpu_available")
 def is_gpu_available(cuda_only=False, min_cuda_compute_capability=None):
   """Returns whether TensorFlow can access a GPU.
@@ -1132,6 +1301,63 @@ class CapturedWrites(object):
     return output_data
 
 
+class FakeEagerSession(object):
+  """Fake session so tests that conditionally use placeholders can use eager.
+
+  There are a number of tests that conditionally use placeholders for shape
+  inference. The pattern is demonstrated here:
+
+  ```python
+  with self.cached_session() as sess:
+    if static_shape:
+      y = math_ops.matmul(x, ...)
+      feed_dict = {}
+    else:
+      x_ph = array_ops.placeholder(...)
+      y = math_ops.matmul(x_ph, ...)
+      feed_dict = {x_ph: x}
+    val = sess.run(y, feed_dict=feed_dict)
+  ```
+
+  Since the feed_dict is empty when not using placeholders we should be able to
+  call self.evaluate(), however this requires rewriting the test case.
+  This class shold be considered a stop-gap solution to get tests running with
+  eager with minimal changes to the actual test.
+  """
+
+  def __init__(self, test_case):
+    self._test_case = test_case
+
+  def run(self, fetches, *args, **kwargs):
+    """Evalaute `fetches`.
+
+    Fail if additional args are specified.
+
+    Args:
+      fetches: A Tensor or a nested list/tuple of Tensors.
+      *args: Positional arguments
+      **kwargs: Keyword arguments
+
+    Raises:
+      RuntimeError: If args or kwargs are specified.
+
+    Returns:
+      Tensors as numpy values.
+    """
+    feed_dict = kwargs.pop("feed_dict", {})
+    if feed_dict:
+      raise RuntimeError(
+          "feed_dict is not supported when eager execution is enabled "
+          "(in this case, sess.run(t) is shorthand for t.numpy()")
+
+    if args or kwargs:
+      raise RuntimeError(
+          "Optional args are not supported when eager execution is enabled "
+          "(in this case, sess.run(t) is shorthand for t.numpy()")
+
+    return self._test_case.evaluate(fetches)
+
+
 class ErrorLoggingSession(session.Session):
   """Wrapper around a Session that logs errors in run().
   """
@@ -1173,6 +1399,10 @@ class TensorFlowTestCase(googletest.TestCase):
     ops.reset_default_graph()
     random_seed.set_random_seed(random_seed.DEFAULT_GRAPH_SEED)
 
+    # Avoiding calling setUp() for the poorly named test_session method.
+    if self.id().endswith(".test_session"):
+      self.skipTest("Not a test.")
+
   def tearDown(self):
     for thread in self._threads:
       thread.check_termination()
@@ -1439,7 +1669,7 @@ class TensorFlowTestCase(googletest.TestCase):
       the graph building and execution code in a test case.
     """
     if context.executing_eagerly():
-      yield None
+      yield FakeEagerSession(self)
     else:
       sess = self._get_cached_session(
           graph, config, force_gpu, crash_if_inconsistent_args=True)
@@ -1458,7 +1688,6 @@ class TensorFlowTestCase(googletest.TestCase):
     """Use cached_session instead."""
     if self.id().endswith(".test_session"):
       self.skipTest("Not a test.")
-
     if context.executing_eagerly():
       yield None
     else:
@@ -1581,8 +1810,8 @@ class TensorFlowTestCase(googletest.TestCase):
     return ret
 
 
-# pylint: enable=invalid-name
-
+  # pylint: enable=invalid-name
+  @py_func_if_in_function
   def assertNear(self, f1, f2, err, msg=None):
     """Asserts that two floats are near each other.
 
@@ -1601,6 +1830,7 @@ class TensorFlowTestCase(googletest.TestCase):
         "%f != %f +/- %f%s" % (f1, f2, err, " (%s)" % msg
                                if msg is not None else ""))
 
+  @py_func_if_in_function
   def assertArrayNear(self, farray1, farray2, err, msg=None):
     """Asserts that two float arrays are near each other.
 
@@ -1620,6 +1850,7 @@ class TensorFlowTestCase(googletest.TestCase):
   def _NDArrayNear(self, ndarray1, ndarray2, err):
     return np.linalg.norm(ndarray1 - ndarray2) < err
 
+  @py_func_if_in_function
   def assertNDArrayNear(self, ndarray1, ndarray2, err, msg=None):
     """Asserts that two numpy arrays have near values.
 
@@ -1635,7 +1866,7 @@ class TensorFlowTestCase(googletest.TestCase):
     # If a is a tensor then convert it to ndarray
     if isinstance(a, ops.Tensor):
       if isinstance(a, ops._EagerTensorBase):
-        return a.numpy()
+        a = a.numpy()
       else:
         a = self.evaluate(a)
     if not isinstance(a, np.ndarray):
@@ -1757,6 +1988,7 @@ class TensorFlowTestCase(googletest.TestCase):
         e.args = ((e.args[0] + " : " + msg,) + e.args[1:])
         raise
 
+  @py_func_if_in_function
   def assertAllClose(self, a, b, rtol=1e-6, atol=1e-6, msg=None):
     """Asserts that two structures of numpy arrays or Tensors, have near values.
 
@@ -1782,6 +2014,7 @@ class TensorFlowTestCase(googletest.TestCase):
     """
     self._assertAllCloseRecursive(a, b, rtol=rtol, atol=atol, msg=msg)
 
+  @py_func_if_in_function
   def assertAllCloseAccordingToType(self,
                                     a,
                                     b,
@@ -1829,6 +2062,7 @@ class TensorFlowTestCase(googletest.TestCase):
 
     self.assertAllClose(a, b, rtol=rtol, atol=atol, msg=msg)
 
+  @py_func_if_in_function
   def assertNotAllClose(self, a, b, **kwargs):
     """Assert that two numpy arrays, or or Tensors, do not have near values.
 
@@ -1847,6 +2081,7 @@ class TensorFlowTestCase(googletest.TestCase):
       return
     raise AssertionError("The two values are close at all elements")
 
+  @py_func_if_in_function
   def assertAllEqual(self, a, b, msg=None):
     """Asserts that two numpy arrays or Tensors have the same values.
 
@@ -1889,6 +2124,7 @@ class TensorFlowTestCase(googletest.TestCase):
       msgs.append("not equal rhs = {}".format(y))
       np.testing.assert_array_equal(a, b, err_msg="\n".join(msgs))
 
+  @py_func_if_in_function
   def assertAllGreater(self, a, comparison_target):
     """Assert element values are all greater than a target value.
 
@@ -1900,6 +2136,7 @@ class TensorFlowTestCase(googletest.TestCase):
     a = self._GetNdArray(a)
     self.assertGreater(np.min(a), comparison_target)
 
+  @py_func_if_in_function
   def assertAllLess(self, a, comparison_target):
     """Assert element values are all less than a target value.
 
@@ -1911,6 +2148,7 @@ class TensorFlowTestCase(googletest.TestCase):
     a = self._GetNdArray(a)
     self.assertLess(np.max(a), comparison_target)
 
+  @py_func_if_in_function
   def assertAllGreaterEqual(self, a, comparison_target):
     """Assert element values are all greater than or equal to a target value.
 
@@ -1922,6 +2160,7 @@ class TensorFlowTestCase(googletest.TestCase):
     a = self._GetNdArray(a)
     self.assertGreaterEqual(np.min(a), comparison_target)
 
+  @py_func_if_in_function
   def assertAllLessEqual(self, a, comparison_target):
     """Assert element values are all less than or equal to a target value.
 
@@ -1964,6 +2203,7 @@ class TensorFlowTestCase(googletest.TestCase):
       lines.append(prefix + "...")
     return lines
 
+  @py_func_if_in_function
   def assertAllInRange(self,
                        target,
                        lower_bound,
@@ -2022,6 +2262,7 @@ class TensorFlowTestCase(googletest.TestCase):
           "Subscript(s) and value(s) of the offending elements:\n" +
           "\n".join(self._format_subscripts(violation_subscripts, target)))
 
+  @py_func_if_in_function
   def assertAllInSet(self, target, expected_set):
     """Assert that elements of a Tensor are all in a given closed set.
 
@@ -2043,6 +2284,7 @@ class TensorFlowTestCase(googletest.TestCase):
       raise AssertionError("%d unique element(s) are not in the set %s: %s" %
                            (np.size(diff), expected_set, diff))
 
+  @py_func_if_in_function
   def assertDTypeEqual(self, target, expected_dtype):
     """Assert ndarray data type is equal to expected.
 
@@ -2347,42 +2589,3 @@ def set_producer_version(graph, producer_version):
   with graph.as_default():
     importer.import_graph_def(graph_def)
   assert graph.graph_def_versions.producer, producer_version
-
-
-def dismantle_func_graph(func_graph):
-  """Removes reference cycles in `func_graph` FuncGraph.
-
-  Helpful for making sure the garbage collector doesn't need to run when
-  the FuncGraph goes out of scope, e.g. in tests using defun with
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True).
-
-  Args:
-    func_graph: A `FuncGraph` object to destroy. `func_graph` is unusable
-      after this function.
-  """
-  # TODO(b/115366440): Delete this method when a custom OrderedDict is added.
-  # Clearing captures using clear() leaves some cycles around.
-  while func_graph.captures:
-    func_graph.captures.popitem()
-  memory.dismantle_ordered_dict(func_graph.captures)
-  ops.dismantle_graph(func_graph)
-
-
-def dismantle_polymorphic_function(func):
-  """Removes reference cycles in PolymorphicFunction `func`.
-
-  Helpful for making sure the garbage collector doesn't need to run when
-  PolymorphicFunction goes out of scope, e.g. in tests using defun with
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True).
-
-  Args:
-    func: A `PolymorphicFunction` object to destroy. `func` is unusable
-      after this function.
-  """
-  # TODO(b/115366440): Delete this method when a custom OrderedDict is added
-  cache = func._function_cache  # pylint: disable=protected-access
-  for concrete_func in cache.values():
-    dismantle_func_graph(concrete_func.graph)
-  while cache:
-    cache.popitem()
-  memory.dismantle_ordered_dict(cache)
diff --git a/tensorflow/python/grappler/datasets_test.py b/tensorflow/python/grappler/datasets_test.py
index bd870ad8de4b6526f778fa94e8b71cc789dfe99e..6937301ab255b87fa51444b70bc0e2b20d306ea3 100644
--- a/tensorflow/python/grappler/datasets_test.py
+++ b/tensorflow/python/grappler/datasets_test.py
@@ -48,7 +48,7 @@ class GrapplerTest(test.TestCase):
     for test_case in test_cases:
       with ops.Graph().as_default() as g:
         dataset = dataset_ops.Dataset.from_tensors(test_case['tensor'])
-        iterator = dataset.make_one_shot_iterator()
+        iterator = dataset_ops.make_one_shot_iterator(dataset)
         get_next = iterator.get_next()
         train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
         train_op.append(get_next)
@@ -73,7 +73,7 @@ class GrapplerTest(test.TestCase):
     for test_case in test_cases:
       with ops.Graph().as_default() as g:
         dataset = dataset_ops.Dataset.from_tensor_slices(test_case['tensor'])
-        iterator = dataset.make_one_shot_iterator()
+        iterator = dataset_ops.make_one_shot_iterator(dataset)
         get_next = iterator.get_next()
         train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
         train_op.append(get_next)
@@ -109,7 +109,7 @@ class GrapplerTest(test.TestCase):
             make_generator(test_case['tensor']),
             dtypes.int64,
             output_shapes=test_case['shape'])
-        iterator = dataset.make_one_shot_iterator()
+        iterator = dataset_ops.make_one_shot_iterator(dataset)
         get_next = iterator.get_next()
         train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
         train_op.append(get_next)
@@ -122,7 +122,7 @@ class GrapplerTest(test.TestCase):
   def testRange(self):
     with ops.Graph().as_default() as g:
       dataset = dataset_ops.Dataset.range(42)
-      iterator = dataset.make_one_shot_iterator()
+      iterator = dataset_ops.make_one_shot_iterator(dataset)
       get_next = iterator.get_next()
       train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
       train_op.append(get_next)
@@ -148,7 +148,7 @@ class GrapplerTest(test.TestCase):
       with ops.Graph().as_default() as g:
         dataset = dataset_ops.Dataset.from_tensors(test_case['tensor'])
         dataset = fn(dataset, test_case['tensor'], test_case['shape'])
-        iterator = dataset.make_one_shot_iterator()
+        iterator = dataset_ops.make_one_shot_iterator(dataset)
         get_next = iterator.get_next()
         train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
         train_op.append(get_next)
@@ -252,7 +252,7 @@ class GrapplerTest(test.TestCase):
       with ops.Graph().as_default() as g:
         dataset = dataset_ops.Dataset.from_tensors(test_case['tensor'])
         dataset = dataset.batch(42)
-        iterator = dataset.make_one_shot_iterator()
+        iterator = dataset_ops.make_one_shot_iterator(dataset)
         get_next = iterator.get_next()
         train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
         train_op.append(get_next)
@@ -281,7 +281,7 @@ class GrapplerTest(test.TestCase):
       with ops.Graph().as_default() as g:
         dataset = dataset_ops.Dataset.from_tensors(test_case['tensor'])
         dataset = dataset.padded_batch(42, padded_shapes=test_case['shape'][1:])
-        iterator = dataset.make_one_shot_iterator()
+        iterator = dataset_ops.make_one_shot_iterator(dataset)
         get_next = iterator.get_next()
         train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
         train_op.append(get_next)
@@ -318,7 +318,7 @@ class GrapplerTest(test.TestCase):
           return dataset_fn
 
         dataset = dataset.flat_map(make_dataset(test_case['tensor']))
-        iterator = dataset.make_one_shot_iterator()
+        iterator = dataset_ops.make_one_shot_iterator(dataset)
         get_next = iterator.get_next()
         train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
         train_op.append(get_next)
@@ -353,7 +353,7 @@ class GrapplerTest(test.TestCase):
 
         dataset = dataset.interleave(
             make_dataset(test_case['tensor']), cycle_length=42)
-        iterator = dataset.make_one_shot_iterator()
+        iterator = dataset_ops.make_one_shot_iterator(dataset)
         get_next = iterator.get_next()
         train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
         train_op.append(get_next)
@@ -382,7 +382,7 @@ class GrapplerTest(test.TestCase):
       with ops.Graph().as_default() as g:
         dataset = dataset_ops.Dataset.from_tensors(test_case['tensor'])
         dataset = dataset.map(array_ops.transpose)
-        iterator = dataset.make_one_shot_iterator()
+        iterator = dataset_ops.make_one_shot_iterator(dataset)
         get_next = iterator.get_next()
         train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
         train_op.append(get_next)
diff --git a/tensorflow/python/grappler/item_test.py b/tensorflow/python/grappler/item_test.py
index 78604b259cac7216e69025b8b66a6072930dd5ba..c02fd9f55b885c0e8b0647a74547887eff7453f0 100644
--- a/tensorflow/python/grappler/item_test.py
+++ b/tensorflow/python/grappler/item_test.py
@@ -108,7 +108,7 @@ class ItemTest(test.TestCase):
     newest_tf_item = grappler_item.tf_item
     self.assertEqual(new_tf_item, newest_tf_item)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only('b/120545219')
   def testColocationContraints(self):
     with ops.Graph().as_default() as g:
       c = constant_op.constant([10])
diff --git a/tensorflow/python/grappler/memory_optimizer_test.py b/tensorflow/python/grappler/memory_optimizer_test.py
index 6eb16fbd39e39021fa29e74ac9765028da344401..e2864ebb4df646262456f2d04e4a24bdd06482b7 100644
--- a/tensorflow/python/grappler/memory_optimizer_test.py
+++ b/tensorflow/python/grappler/memory_optimizer_test.py
@@ -62,7 +62,7 @@ class MemoryOptimizerSwapTest(test.TestCase):
     self.assertEqual(len(graph.node), graph_size)
     self.assertItemsEqual([node.name for node in graph.node], nodes)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only('b/120545219')
   def testSimpleSwap(self):
     """Check that the swap annotations are followed."""
     a = variables.VariableV1(10, name='a')
diff --git a/tensorflow/python/grappler/tf_optimizer_test.py b/tensorflow/python/grappler/tf_optimizer_test.py
index 06ccaa813f2cd33fc5550959ba2669426e3bf41b..8186c81378af7c9fdbd39d4001998d2f959d4dd3 100644
--- a/tensorflow/python/grappler/tf_optimizer_test.py
+++ b/tensorflow/python/grappler/tf_optimizer_test.py
@@ -57,7 +57,7 @@ class PyWrapOptimizeGraphTest(test.TestCase):
     self.assertEqual(len(graph.node), 1)
     self.assertItemsEqual([node.name for node in graph.node], ['d'])
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only('b/120545219')
   def testKeepNodes(self):
     g = ops.Graph()
     with g.as_default():
@@ -86,7 +86,7 @@ class PyWrapOptimizeGraphTest(test.TestCase):
     self.assertEqual(len(optimized_graph_nodes), len(expected_nodes))
     self.assertAllInSet(optimized_graph_nodes, expected_nodes)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only('b/120545219')
   def testLoops(self):
     g = ops.Graph()
     with g.as_default():
diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD
index a573fd5cfbe3d37df872efb604cc7ee60bbef83e..3b3986dc2f09c1633f2802170c2b324907664854 100755
--- a/tensorflow/python/keras/BUILD
+++ b/tensorflow/python/keras/BUILD
@@ -41,6 +41,7 @@ py_library(
         "datasets/mnist.py",
         "datasets/reuters.py",
         "estimator/__init__.py",
+        "keras_parameterized.py",
         "preprocessing/__init__.py",
         "preprocessing/image.py",
         "preprocessing/sequence.py",
@@ -60,10 +61,13 @@ py_library(
         ":engine",
         ":layers",
         ":pil_for_keras",
+        "@keras_applications_archive//:keras_applications",
         "//tensorflow/python:training",
         "//tensorflow/python/keras/optimizer_v2",
+        # TODO(kathywu): move saving into engine after resolving circular
+        # dependencies between Keras and SavedModel
+        "//tensorflow/python/keras/saving",
         "//tensorflow/python/saved_model",
-        "@keras_applications_archive//:keras_applications",
     ],
 )
 
@@ -144,6 +148,7 @@ py_library(
         "utils/data_utils.py",
         "utils/io_utils.py",
         "utils/losses_utils.py",
+        "utils/metrics_utils.py",
     ],
     srcs_version = "PY2AND3",
     deps = [
@@ -214,6 +219,7 @@ py_test(
         "//tensorflow/python:layers",
         "//tensorflow/python:nn",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -226,6 +232,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -238,6 +245,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -251,6 +259,7 @@ py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:init_ops",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -262,6 +271,7 @@ py_test(
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -286,6 +296,19 @@ py_test(
     size = "small",
     srcs = ["losses_test.py"],
     srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "metrics_functional_test",
+    size = "medium",
+    srcs = ["metrics_functional_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
@@ -297,12 +320,8 @@ py_test(
     name = "metrics_test",
     size = "medium",
     srcs = ["metrics_test.py"],
+    shard_count = 4,
     srcs_version = "PY2AND3",
-    tags = [
-        "manual",
-        "no_oss",
-        "notap",
-    ],
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
@@ -332,6 +351,7 @@ py_test(
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -345,6 +365,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -352,12 +373,13 @@ py_test(
     name = "convolutional_test",
     size = "large",
     srcs = ["layers/convolutional_test.py"],
-    shard_count = 4,
+    shard_count = 11,
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -383,6 +405,7 @@ py_test(
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -390,12 +413,13 @@ py_test(
     name = "core_test",
     size = "medium",
     srcs = ["layers/core_test.py"],
-    shard_count = 2,
+    shard_count = 3,
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -405,6 +429,7 @@ cuda_py_test(
     srcs = ["layers/embeddings_test.py"],
     additional_deps = [
         ":keras",
+        "@absl_py//absl/testing:parameterized",
         "//tensorflow/python:client_testlib",
     ],
 )
@@ -420,6 +445,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -432,6 +458,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -443,6 +470,7 @@ py_test(
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -450,12 +478,14 @@ py_test(
     name = "normalization_test",
     size = "medium",
     srcs = ["layers/normalization_test.py"],
+    shard_count = 3,
     srcs_version = "PY2AND3",
     tags = ["notsan"],
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -469,6 +499,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -476,12 +507,14 @@ py_test(
     name = "gru_test",
     size = "large",
     srcs = ["layers/gru_test.py"],
+    shard_count = 2,
     srcs_version = "PY2AND3",
     tags = ["notsan"],  # http://b/62136390
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -499,30 +532,48 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
 py_test(
     name = "recurrent_test",
-    size = "large",
+    size = "medium",
     srcs = ["layers/recurrent_test.py"],
+    shard_count = 4,
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
 cuda_py_test(
-    name = "unified_rnn_test",
+    name = "unified_lstm_test",
     size = "medium",
-    srcs = ["layers/unified_rnn_test.py"],
+    srcs = ["layers/unified_lstm_test.py"],
     additional_deps = [
         ":keras",
+        "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
     ],
+    shard_count = 4,
+)
+
+cuda_py_test(
+    name = "unified_gru_test",
+    size = "medium",
+    srcs = ["layers/unified_gru_test.py"],
+    additional_deps = [
+        ":keras",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+    ],
+    shard_count = 4,
 )
 
 py_test(
@@ -533,6 +584,7 @@ py_test(
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -550,6 +602,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -563,6 +616,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -582,6 +636,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -593,6 +648,7 @@ py_test(
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -620,6 +676,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -632,6 +689,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -640,6 +698,7 @@ cuda_py_test(
     srcs = ["utils/multi_gpu_utils_test.py"],
     additional_deps = [
         ":keras",
+        "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
     ],
@@ -655,6 +714,7 @@ cuda_py_test(
     srcs = ["engine/training_gpu_test.py"],
     additional_deps = [
         ":keras",
+        "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
     ],
@@ -682,6 +742,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -694,6 +755,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -706,6 +768,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -719,6 +782,22 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "correctness_test",
+    size = "medium",
+    srcs = ["engine/correctness_test.py"],
+    shard_count = 2,
+    srcs_version = "PY2AND3",
+    tags = ["notsan"],
+    deps = [
+        ":keras",
+        "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -726,13 +805,14 @@ py_test(
     name = "training_test",
     size = "medium",
     srcs = ["engine/training_test.py"],
-    shard_count = 4,
+    shard_count = 16,
     srcs_version = "PY2AND3",
     tags = ["notsan"],
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -740,6 +820,7 @@ py_test(
     name = "training_dataset_test",
     size = "medium",
     srcs = ["engine/training_dataset_test.py"],
+    shard_count = 4,
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
@@ -751,11 +832,11 @@ py_test(
 
 py_test(
     name = "training_generator_test",
-    size = "enormous",
+    size = "large",
     srcs = ["engine/training_generator_test.py"],
+    shard_count = 3,
     srcs_version = "PY2AND3",
     tags = [
-        "no_oss",
         "notsan",
     ],
     deps = [
@@ -791,6 +872,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -804,6 +886,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -811,19 +894,20 @@ py_test(
     name = "model_subclassing_test",
     size = "medium",
     srcs = ["model_subclassing_test.py"],
-    shard_count = 2,
+    shard_count = 4,
     srcs_version = "PY2AND3",
     tags = ["notsan"],
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
 py_test(
     name = "topology_test",
-    size = "small",
+    size = "medium",
     srcs = ["engine/topology_test.py"],
     srcs_version = "PY2AND3",
     tags = [
@@ -833,6 +917,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -845,6 +930,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -886,6 +972,7 @@ py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:training",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -903,15 +990,16 @@ py_test(
     ],
 )
 
-py_library(
-    name = "testing_utils",
-    srcs = [
-        "testing_utils.py",
-    ],
+py_test(
+    name = "keras_parameterized_test",
+    size = "small",
+    srcs = ["keras_parameterized_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["notsan"],
     deps = [
         ":keras",
-        "//tensorflow/python:util",
+        "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
diff --git a/tensorflow/python/keras/__init__.py b/tensorflow/python/keras/__init__.py
index be46a894e1b9979ea682aa2b635dc68da35c6097..2a6de2de88b728d64db49a366ee907e0b6ae06bf 100644
--- a/tensorflow/python/keras/__init__.py
+++ b/tensorflow/python/keras/__init__.py
@@ -41,6 +41,8 @@ from tensorflow.python.keras import wrappers
 from tensorflow.python.keras.layers import Input
 from tensorflow.python.keras.models import Model
 from tensorflow.python.keras.models import Sequential
+from tensorflow.python.keras.saving.saved_model import export
+from tensorflow.python.keras.saving.saved_model import load_from_saved_model
 
 from tensorflow.python.util.tf_export import tf_export
 
diff --git a/tensorflow/python/keras/activations_test.py b/tensorflow/python/keras/activations_test.py
index 6b7bfb698b8abef4a3e0ac115f2f247103b92abc..33001f419ef076b1473b5407bc6a5ba4ee788104 100644
--- a/tensorflow/python/keras/activations_test.py
+++ b/tensorflow/python/keras/activations_test.py
@@ -31,6 +31,7 @@ def _ref_softmax(values):
   return e / np.sum(e)
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class KerasActivationsTest(test.TestCase):
 
   def test_serialization(self):
@@ -46,12 +47,11 @@ class KerasActivationsTest(test.TestCase):
       assert fn == ref_fn
 
   def test_softmax(self):
-    with self.cached_session():
-      x = keras.backend.placeholder(ndim=2)
-      f = keras.backend.function([x], [keras.activations.softmax(x)])
-      test_values = np.random.random((2, 5))
+    x = keras.backend.placeholder(ndim=2)
+    f = keras.backend.function([x], [keras.activations.softmax(x)])
+    test_values = np.random.random((2, 5))
 
-      result = f([test_values])[0]
+    result = f([test_values])[0]
     expected = _ref_softmax(test_values[0])
     self.assertAllClose(result[0], expected, rtol=1e-05)
 
@@ -60,40 +60,36 @@ class KerasActivationsTest(test.TestCase):
       keras.activations.softmax(x)
 
   def test_temporal_softmax(self):
-    with self.cached_session():
-      x = keras.backend.placeholder(shape=(2, 2, 3))
-      f = keras.backend.function([x], [keras.activations.softmax(x)])
-      test_values = np.random.random((2, 2, 3)) * 10
-      result = f([test_values])[0]
+    x = keras.backend.placeholder(shape=(2, 2, 3))
+    f = keras.backend.function([x], [keras.activations.softmax(x)])
+    test_values = np.random.random((2, 2, 3)) * 10
+    result = f([test_values])[0]
     expected = _ref_softmax(test_values[0, 0])
     self.assertAllClose(result[0, 0], expected, rtol=1e-05)
 
-  @test_util.run_deprecated_v1
   def test_selu(self):
     x = keras.backend.placeholder(ndim=2)
     f = keras.backend.function([x], [keras.activations.selu(x)])
     alpha = 1.6732632423543772848170429916717
     scale = 1.0507009873554804934193349852946
 
-    with self.cached_session():
-      positive_values = np.array([[1, 2]], dtype=keras.backend.floatx())
-      result = f([positive_values])[0]
-      self.assertAllClose(result, positive_values * scale, rtol=1e-05)
+    positive_values = np.array([[1, 2]], dtype=keras.backend.floatx())
+    result = f([positive_values])[0]
+    self.assertAllClose(result, positive_values * scale, rtol=1e-05)
 
-      negative_values = np.array([[-1, -2]], dtype=keras.backend.floatx())
-      result = f([negative_values])[0]
-      true_result = (np.exp(negative_values) - 1) * scale * alpha
-      self.assertAllClose(result, true_result)
+    negative_values = np.array([[-1, -2]], dtype=keras.backend.floatx())
+    result = f([negative_values])[0]
+    true_result = (np.exp(negative_values) - 1) * scale * alpha
+    self.assertAllClose(result, true_result)
 
   def test_softplus(self):
     def softplus(x):
       return np.log(np.ones_like(x) + np.exp(x))
 
-    with self.cached_session():
-      x = keras.backend.placeholder(ndim=2)
-      f = keras.backend.function([x], [keras.activations.softplus(x)])
-      test_values = np.random.random((2, 5))
-      result = f([test_values])[0]
+    x = keras.backend.placeholder(ndim=2)
+    f = keras.backend.function([x], [keras.activations.softplus(x)])
+    test_values = np.random.random((2, 5))
+    result = f([test_values])[0]
     expected = softplus(test_values)
     self.assertAllClose(result, expected, rtol=1e-05)
 
@@ -101,11 +97,10 @@ class KerasActivationsTest(test.TestCase):
     def softsign(x):
       return np.divide(x, np.ones_like(x) + np.absolute(x))
 
-    with self.cached_session():
-      x = keras.backend.placeholder(ndim=2)
-      f = keras.backend.function([x], [keras.activations.softsign(x)])
-      test_values = np.random.random((2, 5))
-      result = f([test_values])[0]
+    x = keras.backend.placeholder(ndim=2)
+    f = keras.backend.function([x], [keras.activations.softsign(x)])
+    test_values = np.random.random((2, 5))
+    result = f([test_values])[0]
     expected = softsign(test_values)
     self.assertAllClose(result, expected, rtol=1e-05)
 
@@ -118,68 +113,60 @@ class KerasActivationsTest(test.TestCase):
         return z / (1 + z)
     sigmoid = np.vectorize(ref_sigmoid)
 
-    with self.cached_session():
-      x = keras.backend.placeholder(ndim=2)
-      f = keras.backend.function([x], [keras.activations.sigmoid(x)])
-      test_values = np.random.random((2, 5))
-      result = f([test_values])[0]
+    x = keras.backend.placeholder(ndim=2)
+    f = keras.backend.function([x], [keras.activations.sigmoid(x)])
+    test_values = np.random.random((2, 5))
+    result = f([test_values])[0]
     expected = sigmoid(test_values)
     self.assertAllClose(result, expected, rtol=1e-05)
 
-  @test_util.run_deprecated_v1
   def test_hard_sigmoid(self):
     def ref_hard_sigmoid(x):
       x = (x * 0.2) + 0.5
       z = 0.0 if x <= 0 else (1.0 if x >= 1 else x)
       return z
     hard_sigmoid = np.vectorize(ref_hard_sigmoid)
-    with self.cached_session():
-      x = keras.backend.placeholder(ndim=2)
-      f = keras.backend.function([x], [keras.activations.hard_sigmoid(x)])
-      test_values = np.random.random((2, 5))
-      result = f([test_values])[0]
+    x = keras.backend.placeholder(ndim=2)
+    f = keras.backend.function([x], [keras.activations.hard_sigmoid(x)])
+    test_values = np.random.random((2, 5))
+    result = f([test_values])[0]
     expected = hard_sigmoid(test_values)
     self.assertAllClose(result, expected, rtol=1e-05)
 
   def test_relu(self):
-    with self.cached_session():
-      x = keras.backend.placeholder(ndim=2)
-      f = keras.backend.function([x], [keras.activations.relu(x)])
-      test_values = np.random.random((2, 5))
-      result = f([test_values])[0]
+    x = keras.backend.placeholder(ndim=2)
+    f = keras.backend.function([x], [keras.activations.relu(x)])
+    test_values = np.random.random((2, 5))
+    result = f([test_values])[0]
     # No negative values in test values...
     self.assertAllClose(result, test_values, rtol=1e-05)
 
-  @test_util.run_deprecated_v1
   def test_elu(self):
-    with self.cached_session():
-      x = keras.backend.placeholder(ndim=2)
-      f = keras.backend.function([x], [keras.activations.elu(x, 0.5)])
-      test_values = np.random.random((2, 5))
-      result = f([test_values])[0]
-      self.assertAllClose(result, test_values, rtol=1e-05)
-      negative_values = np.array([[-1, -2]], dtype=keras.backend.floatx())
-      result = f([negative_values])[0]
-      true_result = (np.exp(negative_values) - 1) / 2
+    x = keras.backend.placeholder(ndim=2)
+    f = keras.backend.function([x], [keras.activations.elu(x, 0.5)])
+    test_values = np.random.random((2, 5))
+    result = f([test_values])[0]
+    self.assertAllClose(result, test_values, rtol=1e-05)
+    negative_values = np.array([[-1, -2]], dtype=keras.backend.floatx())
+    result = f([negative_values])[0]
+    true_result = (np.exp(negative_values) - 1) / 2
     self.assertAllClose(result, true_result)
 
   def test_tanh(self):
-    with self.cached_session():
-      test_values = np.random.random((2, 5))
-      x = keras.backend.placeholder(ndim=2)
-      exp = keras.activations.tanh(x)
-      f = keras.backend.function([x], [exp])
-      result = f([test_values])[0]
+    test_values = np.random.random((2, 5))
+    x = keras.backend.placeholder(ndim=2)
+    exp = keras.activations.tanh(x)
+    f = keras.backend.function([x], [exp])
+    result = f([test_values])[0]
     expected = np.tanh(test_values)
     self.assertAllClose(result, expected, rtol=1e-05)
 
   def test_exponential(self):
-    with self.cached_session():
-      test_values = np.random.random((2, 5))
-      x = keras.backend.placeholder(ndim=2)
-      exp = keras.activations.exponential(x)
-      f = keras.backend.function([x], [exp])
-      result = f([test_values])[0]
+    test_values = np.random.random((2, 5))
+    x = keras.backend.placeholder(ndim=2)
+    exp = keras.activations.exponential(x)
+    f = keras.backend.function([x], [exp])
+    result = f([test_values])[0]
     expected = np.exp(test_values)
     self.assertAllClose(result, expected, rtol=1e-05)
 
diff --git a/tensorflow/python/keras/backend.py b/tensorflow/python/keras/backend.py
index 7dae203f8a08ba8285312b12b4bbd151c4be1224..23623e6e2c06a8b913175244f851fdd13c948e75 100644
--- a/tensorflow/python/keras/backend.py
+++ b/tensorflow/python/keras/backend.py
@@ -568,6 +568,10 @@ def _get_available_gpus():
   Returns:
       A list of available GPU devices.
   """
+  if ops.executing_eagerly_outside_functions():
+    # Returns names of devices directly.
+    return [name for name in context.list_devices() if 'GPU' in name]
+
   global _LOCAL_DEVICES
   if _LOCAL_DEVICES is None:
     _LOCAL_DEVICES = get_session().list_devices()
@@ -2555,6 +2559,7 @@ def arange(start, stop=None, step=1, dtype='int32'):
     result = cast(result, dtype)
   return result
 
+
 @tf_export('keras.backend.tile')
 def tile(x, n):
   """Creates a tensor by tiling `x` by `n`.
@@ -2926,17 +2931,12 @@ class GraphExecutionFunction(object):
   def __init__(self, inputs, outputs, updates=None, name=None,
                **session_kwargs):
     updates = updates or []
-    if not isinstance(inputs, (list, tuple)):
-      raise TypeError('`inputs` to a Keras backend function '
-                      'should be a list or tuple.')
-    if not isinstance(outputs, (list, tuple)):
-      raise TypeError('`outputs` of a Keras backend function '
-                      'should be a list or tuple.')
     if not isinstance(updates, (list, tuple)):
       raise TypeError('`updates` in a Keras backend function '
                       'should be a list or tuple.')
-    self.inputs = list(inputs)
-    self.outputs = list(outputs)
+    self.inputs = nest.flatten(inputs)
+    self._outputs_structure = outputs
+    self.outputs = nest.flatten(outputs)
     with ops.control_dependencies(self.outputs):
       updates_ops = []
       for update in updates:
@@ -3033,8 +3033,7 @@ class GraphExecutionFunction(object):
         self.fetch_callbacks[fetch](output)
 
   def __call__(self, inputs):
-    if not isinstance(inputs, (list, tuple)):
-      raise TypeError('`inputs` should be a list or tuple.')
+    inputs = nest.flatten(inputs)
 
     session = get_session()
     feed_arrays = []
@@ -3077,7 +3076,8 @@ class GraphExecutionFunction(object):
     fetched = self._callable_fn(*array_vals,
                                 run_metadata=self.run_metadata)
     self._call_fetch_callbacks(fetched[-len(self._fetches):])
-    return fetched[:len(self.outputs)]
+    return nest.pack_sequence_as(self._outputs_structure,
+                                 fetched[:len(self.outputs)])
 
 
 class EagerExecutionFunction(object):
@@ -3093,17 +3093,12 @@ class EagerExecutionFunction(object):
 
   def __init__(self, inputs, outputs, updates=None, name=None):
     updates = updates or []
-    if not isinstance(inputs, (list, tuple)):
-      raise TypeError('`inputs` to a Keras backend function '
-                      'should be a list or tuple.')
-    if not isinstance(outputs, (list, tuple)):
-      raise TypeError('`outputs` of a Keras backend function '
-                      'should be a list or tuple.')
     if not isinstance(updates, (list, tuple)):
       raise TypeError('`updates` in a Keras backend function '
                       'should be a list or tuple.')
-    self.inputs = list(inputs)
-    self.outputs = list(outputs)
+    self.inputs = nest.flatten(inputs)
+    self._outputs_structure = outputs
+    self.outputs = nest.flatten(outputs)
     self.name = name
 
     graph = get_graph()
@@ -3153,6 +3148,7 @@ class EagerExecutionFunction(object):
               x.op.inputs[0])
 
   def __call__(self, inputs):
+    inputs = nest.flatten(inputs)
     converted_inputs = []
     for tensor, value in zip(self.inputs, inputs):
       if value is None:
@@ -3161,14 +3157,16 @@ class EagerExecutionFunction(object):
         if value is None:
           raise ValueError(
               'You must feed a value for placeholder %s' % (tensor,))
-      value = ops.convert_to_tensor(value, dtype=tensor.dtype)
+      if not isinstance(value, ops.Tensor):
+        value = ops.convert_to_tensor(value, dtype=tensor.dtype)
       if value.dtype != tensor.dtype:
         # Temporary workaround due to `convert_to_tensor` not casting floats.
         # See b/119637405
         value = math_ops.cast(value, tensor.dtype)
       converted_inputs.append(value)
     outputs = self._graph_fn(*converted_inputs)
-    return [x.numpy() for x in outputs]
+    return nest.pack_sequence_as(self._outputs_structure,
+                                 [x.numpy() for x in outputs])
 
 
 @tf_export('keras.backend.function')
@@ -3858,19 +3856,23 @@ def categorical_crossentropy(target, output, from_logits=False, axis=-1):
   Raises:
       ValueError: if `axis` is neither -1 nor one of the axes of `output`.
   """
-  rank = len(output.shape)
-  axis = axis % rank
-  # Note: nn.softmax_cross_entropy_with_logits_v2
-  # expects logits, Keras expects probabilities.
   if not from_logits:
-    # scale preds so that the class probas of each sample sum to 1
-    output = output / math_ops.reduce_sum(output, axis, True)
-    # manual computation of crossentropy
-    epsilon_ = _to_tensor(epsilon(), output.dtype.base_dtype)
-    output = clip_ops.clip_by_value(output, epsilon_, 1. - epsilon_)
-    return -math_ops.reduce_sum(target * math_ops.log(output), axis)
-  else:
-    return nn.softmax_cross_entropy_with_logits_v2(labels=target, logits=output)
+    if context.executing_eagerly() or output.op.type != 'Softmax':
+      axis = axis % len(output.shape)
+      # scale preds so that the class probas of each sample sum to 1
+      output = output / math_ops.reduce_sum(output, axis, True)
+      # manual computation of crossentropy
+      epsilon_ = _to_tensor(epsilon(), output.dtype.base_dtype)
+      output = clip_ops.clip_by_value(output, epsilon_, 1. - epsilon_)
+      return -math_ops.reduce_sum(target * math_ops.log(output), axis)
+    else:
+      # When softmax activation function is used for output operation, we
+      # use logits from the softmax function directly to compute loss in order
+      # to prevent collapsing zero when training.
+      # See b/117284466
+      assert len(output.op.inputs) == 1
+      output = output.op.inputs[0]
+  return nn.softmax_cross_entropy_with_logits_v2(labels=target, logits=output)
 
 
 @tf_export('keras.backend.sparse_categorical_crossentropy')
@@ -3894,19 +3896,25 @@ def sparse_categorical_crossentropy(target, output, from_logits=False, axis=-1):
   Raises:
       ValueError: if `axis` is neither -1 nor one of the axes of `output`.
   """
+  if not from_logits:
+    if context.executing_eagerly() or output.op.type != 'Softmax':
+      epsilon_ = _to_tensor(epsilon(), output.dtype.base_dtype)
+      output = clip_ops.clip_by_value(output, epsilon_, 1 - epsilon_)
+      output = math_ops.log(output)
+    else:
+      # When softmax activation function is used for output operation, we
+      # use logits from the softmax function directly to compute loss in order
+      # to prevent collapsing zero when training.
+      # See b/117284466
+      assert len(output.op.inputs) == 1
+      output = output.op.inputs[0]
+
   rank = len(output.shape)
   axis = axis % rank
   if axis != rank - 1:
     permutation = list(range(axis)) + list(range(axis + 1, rank)) + [axis]
     output = array_ops.transpose(output, perm=permutation)
 
-  # Note: nn.sparse_softmax_cross_entropy_with_logits
-  # expects logits, Keras expects probabilities.
-  if not from_logits:
-    epsilon_ = _to_tensor(epsilon(), output.dtype.base_dtype)
-    output = clip_ops.clip_by_value(output, epsilon_, 1 - epsilon_)
-    output = math_ops.log(output)
-
   output_shape = output.shape
   targets = cast(flatten(target), 'int64')
   logits = array_ops.reshape(output, [-1, int(output_shape[-1])])
@@ -3933,13 +3941,18 @@ def binary_crossentropy(target, output, from_logits=False):
   Returns:
       A tensor.
   """
-  # Note: nn.sigmoid_cross_entropy_with_logits
-  # expects logits, Keras expects probabilities.
   if not from_logits:
-    # transform back to logits
-    epsilon_ = _to_tensor(epsilon(), output.dtype.base_dtype)
-    output = clip_ops.clip_by_value(output, epsilon_, 1 - epsilon_)
-    output = math_ops.log(output / (1 - output))
+    if context.executing_eagerly() or output.op.type != 'Sigmoid':
+      # transform back to logits
+      epsilon_ = _to_tensor(epsilon(), output.dtype.base_dtype)
+      output = clip_ops.clip_by_value(output, epsilon_, 1 - epsilon_)
+      output = math_ops.log(output / (1 - output))
+    else:
+      # When sigmoid activation function is used for output operation, we
+      # use logits from the sigmoid function directly to compute loss in order
+      # to prevent collapsing zero when training.
+      assert len(output.op.inputs) == 1
+      output = output.op.inputs[0]
   return nn.sigmoid_cross_entropy_with_logits(labels=target, logits=output)
 
 
diff --git a/tensorflow/python/keras/backend_test.py b/tensorflow/python/keras/backend_test.py
index fa32b1ecd7a7d797beb5477d1ae2150a06d6c46c..4b83f0bf664e3cdffee889f504dc2fc47a94a1ce 100644
--- a/tensorflow/python/keras/backend_test.py
+++ b/tensorflow/python/keras/backend_test.py
@@ -1069,13 +1069,13 @@ class BackendNNOpsTest(test.TestCase, parameterized.TestCase):
                                                              initial_states,
                                                              **kwargs)
         # check static shape inference
-        self.assertEquals(last_output.get_shape().as_list(),
-                          [num_samples, output_dim])
-        self.assertEquals(outputs.get_shape().as_list(),
-                          [num_samples, timesteps, output_dim])
+        self.assertEqual(last_output.get_shape().as_list(),
+                         [num_samples, output_dim])
+        self.assertEqual(outputs.get_shape().as_list(),
+                         [num_samples, timesteps, output_dim])
         for state in new_states:
-          self.assertEquals(state.get_shape().as_list(),
-                            [num_samples, output_dim])
+          self.assertEqual(state.get_shape().as_list(),
+                           [num_samples, output_dim])
 
         last_output_list[i].append(keras.backend.eval(last_output))
         outputs_list[i].append(keras.backend.eval(outputs))
@@ -1173,7 +1173,7 @@ class BackendNNOpsTest(test.TestCase, parameterized.TestCase):
         self.assertEqual(outputs.get_shape().as_list(),
                          [num_samples, timesteps, output_dim])
         # for state in new_states:
-        #   self.assertEquals(state.get_shape().as_list(),
+        #   self.assertEqual(state.get_shape().as_list(),
         #                     [num_samples, output_dim])
         self.assertEqual(new_states[0].get_shape().as_list(),
                          [num_samples, output_dim])
@@ -1422,7 +1422,7 @@ class TestCTC(test.TestCase):
                 decode_truth[i] == keras.backend.eval(decode_pred_tf[i])))
       self.assertAllClose(log_prob_truth, log_prob_pred)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only('b/120545219')
   def test_ctc_batch_cost(self):
     with self.cached_session():
       label_lens = np.expand_dims(np.asarray([5, 4]), 1)
@@ -1695,6 +1695,39 @@ class BackendGraphTests(test.TestCase):
       self.assertEqual(callback.times_called, 1)
       self.assertEqual(callback.callback_result, 200)
 
+  @test_util.run_in_graph_and_eager_modes
+  def test_function_dict_outputs(self):
+    x_ph = keras.backend.placeholder(shape=(), name='x')
+    y_ph = keras.backend.placeholder(shape=(), name='y')
+    outputs = {'x*y': y_ph * x_ph, 'x*x': x_ph * x_ph}
+
+    f = keras.backend.function(inputs=[x_ph, y_ph], outputs=outputs)
+    x, y = 2., 5.
+    results = f([x, y])
+
+    self.assertEqual(results['x*y'], 10.)
+    self.assertEqual(results['x*x'], 4)
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_function_dict_inputs(self):
+    placeholders = {
+        'x': keras.backend.placeholder(shape=()),
+        'y': keras.backend.placeholder(shape=())
+    }
+    outputs = [placeholders['x'] * placeholders['y']]
+
+    f = keras.backend.function(inputs=placeholders, outputs=outputs)
+    results = f({'x': 2., 'y': 3.})
+    self.assertEqual(results[0], 6.)
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_function_single_input_output(self):
+    x_ph = keras.backend.placeholder(shape=(), name='x')
+    output = x_ph * x_ph
+    f = keras.backend.function(x_ph, output)
+    result = f(2.)
+    self.assertEqual(result, 4.)
+
   def test_placeholder(self):
     x = keras.backend.placeholder(shape=(3, 4))
     self.assertEqual(x.get_shape().as_list(), [3, 4])
diff --git a/tensorflow/python/keras/callbacks.py b/tensorflow/python/keras/callbacks.py
index 3e3c087e3fc5b715dec95c6d743bc970a48c9d2d..1cb326752759f0e51ea6cb418a3aca2a9cbb7e16 100644
--- a/tensorflow/python/keras/callbacks.py
+++ b/tensorflow/python/keras/callbacks.py
@@ -24,7 +24,6 @@ import copy
 import csv
 import io
 import json
-import math
 import os
 import time
 
@@ -35,7 +34,6 @@ from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.keras import backend as K
-from tensorflow.python.keras.engine.training_utils import standardize_input_data
 from tensorflow.python.keras.utils.data_utils import Sequence
 from tensorflow.python.keras.utils.generic_utils import Progbar
 from tensorflow.python.ops import array_ops
@@ -47,45 +45,38 @@ from tensorflow.python.summary import summary as tf_summary
 from tensorflow.python.training import saver
 from tensorflow.python.util.tf_export import tf_export
 
-
 try:
   import requests
 except ImportError:
   requests = None
 
 
+_TRAIN = 'train'
+_TEST = 'test'
+_PREDICT = 'predict'
+
+
 # pylint: disable=protected-access
 def configure_callbacks(callbacks,
                         model,
                         do_validation=False,
-                        val_inputs=None,
-                        val_targets=None,
-                        val_sample_weights=None,
                         batch_size=None,
                         epochs=None,
                         steps_per_epoch=None,
                         samples=None,
-                        validation_steps=None,
                         verbose=1,
                         count_mode='steps',
-                        mode='train'):
+                        mode=_TRAIN):
   """Configures callbacks for use in various training loops.
 
   Arguments:
       callbacks: List of Callbacks.
       model: Model being trained.
       do_validation: Whether or not validation loop will be run.
-      val_inputs: Inputs to Model for validation loop. Can be any
-        data format Keras accepts.
-      val_targets: Targets for Model for validation loop. Can be any
-        data format Keras accepts.
-      val_sample_weights: Sample weights for Model for validation loop.
-        Can be any data format Keras accepts.
       batch_size: Number of samples per batch.
       epochs: Number of epoch to train.
       steps_per_epoch: Number of batches to run per training epoch.
       samples: Number of training samples.
-      validation_steps: Number of batches to run per validation epoch.
       verbose: int, 0 or 1. Keras logging verbosity to pass to ProgbarLogger.
       count_mode: One of 'steps' or 'samples'. Per-batch or per-sample count.
       mode: String. One of 'train', 'test', or 'predict'. Which loop mode to
@@ -102,7 +93,7 @@ def configure_callbacks(callbacks,
     callbacks = []
 
   # Add additional callbacks during training.
-  if mode == 'train':
+  if mode == _TRAIN:
     model.history = History()
     stateful_metric_names = None
     if hasattr(model, 'metrics_names'):
@@ -122,12 +113,10 @@ def configure_callbacks(callbacks,
   callback_metrics = []
   # When we have deferred build scenario with iterator input, we will compile
   # when we standardize first batch of data.
-  if mode != 'predict' and hasattr(model, 'metrics_names'):
+  if mode != _PREDICT and hasattr(model, 'metrics_names'):
     callback_metrics = copy.copy(model.metrics_names)
     if do_validation:
       callback_metrics += ['val_' + n for n in model.metrics_names]
-  if validation_steps is None and isinstance(val_inputs, Sequence):
-    validation_steps = len(val_inputs)
   callback_params = {
       'batch_size': batch_size,
       'epochs': epochs,
@@ -136,30 +125,15 @@ def configure_callbacks(callbacks,
       'verbose': verbose,
       'do_validation': do_validation,
       'metrics': callback_metrics,
-      'validation_steps': validation_steps
   }
   callback_list.set_params(callback_params)
 
-  # Pass validation data to callbacks
-  # TODO(omalleyt): remove this once val hooks are ready.
-  if model._distribution_strategy or not val_inputs:
-    val_data = []
-  else:
-    if not model.run_eagerly:
-      # Need to create the eval_function before start of the first epoch
-      # because TensorBoard callback on_epoch_begin adds summary to the
-      # list of fetches of the eval_function
-      callback_model._make_eval_function()
-    if _is_generator_like(val_inputs):
-      val_data = val_inputs
-    else:
-      val_data = val_inputs + val_targets
-      if val_sample_weights:
-        val_data += val_sample_weights
-      if not isinstance(K.symbolic_learning_phase(), int):
-        val_data += [False]
-  for cbk in callbacks:
-    cbk.validation_data = val_data
+  if (do_validation and not model._distribution_strategy and
+      not model.run_eagerly):
+    # Need to create the eval_function before start of the first epoch
+    # because TensorBoard callback on_epoch_begin adds summary to the
+    # list of fetches of the eval_function
+    callback_model._make_eval_function()
 
   callback_list.model.stop_training = False
   return callback_list
@@ -172,6 +146,17 @@ def _is_generator_like(data):
       data, (Sequence, iterator_ops.Iterator, iterator_ops.EagerIterator)))
 
 
+def make_logs(model, logs, outputs, mode, prefix=''):
+  """Computes logs for sending to `on_batch_end` methods."""
+  if mode in {_TRAIN, _TEST}:
+    if hasattr(model, 'metrics_names'):
+      for label, output in zip(model.metrics_names, outputs):
+        logs[prefix + label] = output
+  else:
+    logs['outputs'] = outputs
+  return logs
+
+
 class CallbackList(object):
   """Container abstracting a list of callbacks.
 
@@ -209,10 +194,6 @@ class CallbackList(object):
 
   def _call_batch_hook(self, mode, hook, batch, logs=None):
     """Helper function for all batch_{begin | end} methods."""
-    # TODO(omalleyt): add batch hooks for test/predict.
-    if mode != 'train':
-      return
-
     hook_name = 'on_{mode}_batch_{hook}'.format(mode=mode, hook=hook)
     if hook == 'begin':
       self._t_enter_batch = time.time()
@@ -237,87 +218,175 @@ class CallbackList(object):
 
   def _call_begin_hook(self, mode):
     """Helper function for on_{train|test|predict}_begin methods."""
-    # TODO(omalleyt): add test/predict methods.
-    if mode == 'train':
+    if mode == _TRAIN:
       self.on_train_begin()
+    elif mode == _TEST:
+      self.on_test_begin()
+    else:
+      self.on_predict_begin()
 
   def _call_end_hook(self, mode):
     """Helper function for on_{train|test|predict}_end methods."""
-    # TODO(omalleyt): add test/predict methods.
-    if mode == 'train':
+    if mode == _TRAIN:
       self.on_train_end()
+    elif mode == _TEST:
+      self.on_test_end()
+    else:
+      self.on_predict_end()
 
   def on_batch_begin(self, batch, logs=None):
-    self._call_batch_hook('train', 'begin', batch, logs=logs)
+    self._call_batch_hook(_TRAIN, 'begin', batch, logs=logs)
 
   def on_batch_end(self, batch, logs=None):
-    self._call_batch_hook('train', 'end', batch, logs=logs)
+    self._call_batch_hook(_TRAIN, 'end', batch, logs=logs)
 
   def on_epoch_begin(self, epoch, logs=None, mode='train'):
-    """Called at the start of an epoch.
+    """Calls the `on_epoch_begin` methods of its callbacks.
 
     Arguments:
         epoch: integer, index of epoch.
-        logs: dictionary of logs.
+        logs: dict. Currently no data is passed to this argument for this method
+          but that may change in the future.
         mode: One of 'train'/'test'/'predict'
     """
-    if mode == 'train':
+    if mode == _TRAIN:
       logs = logs or {}
       for callback in self.callbacks:
         callback.on_epoch_begin(epoch, logs)
     self._reset_batch_timing()
 
   def on_epoch_end(self, epoch, logs=None, mode='train'):
-    """Called at the end of an epoch.
+    """Calls the `on_epoch_end` methods of its callbacks.
 
     Arguments:
         epoch: integer, index of epoch.
-        logs: dictionary of logs.
+        logs: dict, metric results for this training epoch, and for the
+          validation epoch if validation is performed. Validation result keys
+          are prefixed with `val_`.
         mode: One of 'train'/'test'/'predict'
     """
-    if mode == 'train':
+    if mode == _TRAIN:
       logs = logs or {}
       for callback in self.callbacks:
         callback.on_epoch_end(epoch, logs)
 
   def on_train_batch_begin(self, batch, logs=None):
-    """Called at the beginning of a training batch in `fit` methods.
+    """Calls the `on_train_batch_begin` methods of its callbacks.
 
     Arguments:
         batch: integer, index of batch within the current epoch.
-        logs: dictionary of logs.
+        logs: dict. Has keys `batch` and `size` representing the current batch
+          number and the size of the batch.
     """
-    self._call_batch_hook('train', 'begin', batch, logs=logs)
+    self._call_batch_hook(_TRAIN, 'begin', batch, logs=logs)
 
   def on_train_batch_end(self, batch, logs=None):
-    """Called at the end of a training batch in `fit` methods.
+    """Calls the `on_train_batch_end` methods of its callbacks.
 
     Arguments:
         batch: integer, index of batch within the current epoch.
-        logs: dictionary of logs.
+        logs: dict. Metric results for this batch.
     """
-    self._call_batch_hook('train', 'end', batch, logs=logs)
+    self._call_batch_hook(_TRAIN, 'end', batch, logs=logs)
+
+  def on_test_batch_begin(self, batch, logs=None):
+    """Calls the `on_test_batch_begin` methods of its callbacks.
+
+    Arguments:
+        batch: integer, index of batch within the current epoch.
+        logs: dict. Has keys `batch` and `size` representing the current batch
+          number and the size of the batch.
+    """
+    self._call_batch_hook(_TEST, 'begin', batch, logs=logs)
+
+  def on_test_batch_end(self, batch, logs=None):
+    """Calls the `on_test_batch_end` methods of its callbacks.
+
+    Arguments:
+        batch: integer, index of batch within the current epoch.
+        logs: dict. Metric results for this batch.
+    """
+    self._call_batch_hook(_TEST, 'end', batch, logs=logs)
+
+  def on_predict_batch_begin(self, batch, logs=None):
+    """Calls the `on_predict_batch_begin` methods of its callbacks.
+
+    Arguments:
+        batch: integer, index of batch within the current epoch.
+        logs: dict. Has keys `batch` and `size` representing the current batch
+          number and the size of the batch.
+    """
+    self._call_batch_hook(_PREDICT, 'begin', batch, logs=logs)
+
+  def on_predict_batch_end(self, batch, logs=None):
+    """Calls the `on_predict_batch_end` methods of its callbacks.
+
+    Arguments:
+        batch: integer, index of batch within the current epoch.
+        logs: dict. Metric results for this batch.
+    """
+    self._call_batch_hook(_PREDICT, 'end', batch, logs=logs)
 
   def on_train_begin(self, logs=None):
-    """Called at the beginning of training.
+    """Calls the `on_train_begin` methods of its callbacks.
 
     Arguments:
-        logs: dictionary of logs.
+        logs: dict. Currently no data is passed to this argument for this method
+          but that may change in the future.
     """
-    logs = logs or {}
     for callback in self.callbacks:
       callback.on_train_begin(logs)
 
   def on_train_end(self, logs=None):
-    """Called at the end of training.
+    """Calls the `on_train_end` methods of its callbacks.
 
     Arguments:
-        logs: dictionary of logs.
+        logs: dict. Currently no data is passed to this argument for this method
+          but that may change in the future.
     """
-    logs = logs or {}
     for callback in self.callbacks:
       callback.on_train_end(logs)
 
+  def on_test_begin(self, logs=None):
+    """Calls the `on_test_begin` methods of its callbacks.
+
+    Arguments:
+        logs: dict. Currently no data is passed to this argument for this method
+          but that may change in the future.
+    """
+    for callback in self.callbacks:
+      callback.on_test_begin(logs)
+
+  def on_test_end(self, logs=None):
+    """Calls the `on_test_end` methods of its callbacks.
+
+    Arguments:
+        logs: dict. Currently no data is passed to this argument for this method
+          but that may change in the future.
+    """
+    for callback in self.callbacks:
+      callback.on_test_end(logs)
+
+  def on_predict_begin(self, logs=None):
+    """Calls the 'on_predict_begin` methods of its callbacks.
+
+    Arguments:
+        logs: dict. Currently no data is passed to this argument for this method
+          but that may change in the future.
+    """
+    for callback in self.callbacks:
+      callback.on_predict_begin(logs)
+
+  def on_predict_end(self, logs=None):
+    """Calls the `on_predict_end` methods of its callbacks.
+
+    Arguments:
+        logs: dict. Currently no data is passed to this argument for this method
+          but that may change in the future.
+    """
+    for callback in self.callbacks:
+      callback.on_predict_end(logs)
+
   def __iter__(self):
     return iter(self.callbacks)
 
@@ -360,31 +429,169 @@ class Callback(object):
   def set_model(self, model):
     self.model = model
 
-  def on_epoch_begin(self, epoch, logs=None):
-    pass
-
-  def on_epoch_end(self, epoch, logs=None):
-    pass
-
   def on_batch_begin(self, batch, logs=None):
-    pass
+    """A backwards compatibility alias for `on_train_batch_begin`."""
 
   def on_batch_end(self, batch, logs=None):
-    pass
+    """A backwards compatibility alias for `on_train_batch_end`."""
+
+  def on_epoch_begin(self, epoch, logs=None, mode='train'):
+    """Called at the start of an epoch.
+
+    Subclasses should override for any actions to run.
+
+    Arguments:
+        epoch: integer, index of epoch.
+        logs: dict. Currently no data is passed to this argument for this method
+          but that may change in the future.
+        mode: One of 'train'/'test'/'predict'
+    """
+
+  def on_epoch_end(self, epoch, logs=None, mode='train'):
+    """Called at the end of an epoch.
+
+    Subclasses should override for any actions to run.
+
+    Arguments:
+        epoch: integer, index of epoch.
+        logs: dict, metric results for this training epoch, and for the
+          validation epoch if validation is performed. Validation result keys
+          are prefixed with `val_`.
+        mode: One of 'train'/'test'/'predict'
+    """
 
   def on_train_batch_begin(self, batch, logs=None):
-    # For backwards compatibility
+    """Called at the beginning of a training batch in `fit` methods.
+
+    Subclasses should override for any actions to run.
+
+    Arguments:
+        batch: integer, index of batch within the current epoch.
+        logs: dict. Has keys `batch` and `size` representing the current batch
+          number and the size of the batch.
+    """
+    # For backwards compatibility.
     self.on_batch_begin(batch, logs=logs)
 
   def on_train_batch_end(self, batch, logs=None):
-    # For backwards compatibility
+    """Called at the end of a training batch in `fit` methods.
+
+    Subclasses should override for any actions to run.
+
+    Arguments:
+        batch: integer, index of batch within the current epoch.
+        logs: dict. Metric results for this batch.
+    """
+    # For backwards compatibility.
     self.on_batch_end(batch, logs=logs)
 
+  def on_test_batch_begin(self, batch, logs=None):
+    """Called at the beginning of a batch in `evaluate` methods.
+
+    Also called at the beginning of a validation batch in the `fit`
+    methods, if validation data is provided.
+
+    Subclasses should override for any actions to run.
+
+    Arguments:
+        batch: integer, index of batch within the current epoch.
+        logs: dict. Has keys `batch` and `size` representing the current batch
+          number and the size of the batch.
+    """
+
+  def on_test_batch_end(self, batch, logs=None):
+    """Called at the end of a batch in `evaluate` methods.
+
+    Also called at the end of a validation batch in the `fit`
+    methods, if validation data is provided.
+
+    Subclasses should override for any actions to run.
+
+    Arguments:
+        batch: integer, index of batch within the current epoch.
+        logs: dict. Metric results for this batch.
+    """
+
+  def on_predict_batch_begin(self, batch, logs=None):
+    """Called at the beginning of a batch in `predict` methods.
+
+    Subclasses should override for any actions to run.
+
+    Arguments:
+        batch: integer, index of batch within the current epoch.
+        logs: dict. Has keys `batch` and `size` representing the current batch
+          number and the size of the batch.
+    """
+
+  def on_predict_batch_end(self, batch, logs=None):
+    """Called at the end of a batch in `predict` methods.
+
+    Subclasses should override for any actions to run.
+
+    Arguments:
+        batch: integer, index of batch within the current epoch.
+        logs: dict. Metric results for this batch.
+    """
+
   def on_train_begin(self, logs=None):
-    pass
+    """Called at the beginning of training.
+
+    Subclasses should override for any actions to run.
+
+    Arguments:
+        logs: dict. Currently no data is passed to this argument for this method
+          but that may change in the future.
+    """
 
   def on_train_end(self, logs=None):
-    pass
+    """Called at the end of training.
+
+    Subclasses should override for any actions to run.
+
+    Arguments:
+        logs: dict. Currently no data is passed to this argument for this method
+          but that may change in the future.
+    """
+
+  def on_test_begin(self, logs=None):
+    """Called at the beginning of evaluation or validation.
+
+    Subclasses should override for any actions to run.
+
+    Arguments:
+        logs: dict. Currently no data is passed to this argument for this method
+          but that may change in the future.
+    """
+
+  def on_test_end(self, logs=None):
+    """Called at the end of evaluation or validation.
+
+    Subclasses should override for any actions to run.
+
+    Arguments:
+        logs: dict. Currently no data is passed to this argument for this method
+          but that may change in the future.
+    """
+
+  def on_predict_begin(self, logs=None):
+    """Called at the beginning of prediction.
+
+    Subclasses should override for any actions to run.
+
+    Arguments:
+        logs: dict. Currently no data is passed to this argument for this method
+          but that may change in the future.
+    """
+
+  def on_predict_end(self, logs=None):
+    """Called at the end of prediction.
+
+    Subclasses should override for any actions to run.
+
+    Arguments:
+        logs: dict. Currently no data is passed to this argument for this method
+          but that may change in the future.
+    """
 
 
 @tf_export('keras.callbacks.BaseLogger')
@@ -957,6 +1164,7 @@ class TensorBoard(Callback):
     self.batch_size = batch_size
     self._current_batch = 0
     self._total_batches_seen = 0
+    self._total_val_batches_seen = 0
     self.embeddings_freq = embeddings_freq
     self.embeddings_layer_names = embeddings_layer_names
     self.embeddings_metadata = embeddings_metadata
@@ -1045,8 +1253,10 @@ class TensorBoard(Callback):
     # If both embedding_freq and embeddings_data are available, we will
     # visualize embeddings.
     if self.embeddings_freq and self.embeddings_data is not None:
-      self.embeddings_data = standardize_input_data(self.embeddings_data,
-                                                    model.input_names)
+      # Avoid circular dependency.
+      from tensorflow.python.keras.engine import training_utils  # pylint: disable=g-import-not-at-top
+      self.embeddings_data = training_utils.standardize_input_data(
+          self.embeddings_data, model.input_names)
 
       # If embedding_layer_names are not provided, get all of the embedding
       # layers from the model.
@@ -1111,10 +1321,8 @@ class TensorBoard(Callback):
       projector.visualize_embeddings(self.writer, config)
 
   def _fetch_callback(self, summary):
-    self.writer.add_summary(
-        summary,
-        self._epoch + self._current_val_batch / self._validation_batches)
-    self._current_val_batch += 1
+    self.writer.add_summary(summary, self._total_val_batches_seen)
+    self._total_val_batches_seen += 1
 
   def _write_custom_summaries(self, step, logs=None):
     """Writes metrics out as custom scalar summaries.
@@ -1145,22 +1353,6 @@ class TensorBoard(Callback):
         self.writer.add_summary(summary, step)
     self.writer.flush()
 
-  def on_train_begin(self, logs=None):
-    """Checks if histogram summaries can be run."""
-    # will never be set when in eager
-    if self.histogram_freq:
-      if self.params.get('validation_steps', None) is not None:
-        self._validation_batches = self.params['validation_steps']
-      elif self.validation_data:
-        self._validation_batches = math.ceil(
-            self.validation_data[0].shape[0] / self.batch_size)
-      else:
-        raise ValueError('If printing histograms, validation data must be '
-                         'provided.')
-      if self._validation_batches == 0:
-        raise ValueError(
-            'If printing histograms, validation data must have length > 0.')
-
   def on_batch_end(self, batch, logs=None):
     """Writes scalar summaries for metrics on every training batch."""
     # Don't output batch_size and batch number as Tensorboard summaries
@@ -1181,7 +1373,6 @@ class TensorBoard(Callback):
     # check if histogram summary should be run for this epoch
     if self.histogram_freq and epoch % self.histogram_freq == 0:
       self._epoch = epoch
-      self._current_val_batch = 0
       # pylint: disable=protected-access
       # add the histogram summary op if it should run this epoch
       if self.merged not in self.model._eval_function.fetches:
diff --git a/tensorflow/python/keras/callbacks_test.py b/tensorflow/python/keras/callbacks_test.py
index ed05572c68bf6e8b70c6e6f520dda015c95b7dfa..ef469c5e4f5deb3e4f0cff7cb3deea95d0266d9b 100644
--- a/tensorflow/python/keras/callbacks_test.py
+++ b/tensorflow/python/keras/callbacks_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import csv
 import os
 import re
@@ -33,10 +34,10 @@ from tensorflow.python import keras
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import test_util
+from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.summary.writer import writer_cache
 from tensorflow.python.training import adam
 
 try:
@@ -58,6 +59,142 @@ NUM_HIDDEN = 5
 BATCH_SIZE = 5
 
 
+class Counter(keras.callbacks.Callback):
+  """Counts the number of times each callback method was run.
+
+  Attributes:
+    method_counts: dict. Contains the counts of time  each callback method was
+      run.
+  """
+
+  def __init__(self):
+    self.method_counts = collections.defaultdict(int)
+    methods_to_count = [
+        'on_batch_begin', 'on_batch_end', 'on_epoch_begin', 'on_epoch_end',
+        'on_predict_batch_begin', 'on_predict_batch_end', 'on_predict_begin',
+        'on_predict_end', 'on_test_batch_begin', 'on_test_batch_end',
+        'on_test_begin', 'on_test_end', 'on_train_batch_begin',
+        'on_train_batch_end', 'on_train_begin', 'on_train_end'
+    ]
+    for method_name in methods_to_count:
+      setattr(self, method_name,
+              self.wrap_with_counts(method_name, getattr(self, method_name)))
+
+  def wrap_with_counts(self, method_name, method):
+
+    def _call_and_count(*args, **kwargs):
+      self.method_counts[method_name] += 1
+      return method(*args, **kwargs)
+
+    return _call_and_count
+
+
+@keras_parameterized.run_with_all_model_types
+@keras_parameterized.run_all_keras_modes
+class CallbackCountsTest(keras_parameterized.TestCase):
+
+  def _check_counts(self, counter, expected_counts):
+    """Checks that the counts registered by `counter` are those expected."""
+    for method_name, expected_count in expected_counts.items():
+      self.assertEqual(
+          counter.method_counts[method_name],
+          expected_count,
+          msg='For method {}: expected {}, got: {}'.format(
+              method_name, expected_count, counter.method_counts[method_name]))
+
+  def _get_model(self):
+    layers = [
+        keras.layers.Dense(10, activation='relu'),
+        keras.layers.Dense(1, activation='sigmoid')
+    ]
+    model = testing_utils.get_model_from_layers(layers, input_shape=(10,))
+    model.compile(
+        adam.AdamOptimizer(0.001),
+        'binary_crossentropy',
+        run_eagerly=testing_utils.should_run_eagerly())
+    return model
+
+  def test_callback_hooks_are_called_in_fit(self):
+    x, y = np.ones((10, 10)), np.ones((10, 1))
+    val_x, val_y = np.ones((4, 10)), np.ones((4, 1))
+
+    model = self._get_model()
+    counter = Counter()
+    model.fit(
+        x,
+        y,
+        validation_data=(val_x, val_y),
+        batch_size=2,
+        epochs=5,
+        callbacks=[counter])
+
+    self._check_counts(
+        counter, {
+            'on_batch_begin': 25,
+            'on_batch_end': 25,
+            'on_epoch_begin': 5,
+            'on_epoch_end': 5,
+            'on_predict_batch_begin': 0,
+            'on_predict_batch_end': 0,
+            'on_predict_begin': 0,
+            'on_predict_end': 0,
+            'on_test_batch_begin': 10,
+            'on_test_batch_end': 10,
+            'on_test_begin': 5,
+            'on_test_end': 5,
+            'on_train_batch_begin': 25,
+            'on_train_batch_end': 25,
+            'on_train_begin': 1,
+            'on_train_end': 1
+        })
+
+  def test_callback_hooks_are_called_in_evaluate(self):
+    x, y = np.ones((10, 10)), np.ones((10, 1))
+
+    model = self._get_model()
+    counter = Counter()
+    model.evaluate(x, y, batch_size=2, callbacks=[counter])
+    self._check_counts(
+        counter, {
+            'on_test_batch_begin': 5,
+            'on_test_batch_end': 5,
+            'on_test_begin': 1,
+            'on_test_end': 1
+        })
+
+  def test_callback_hooks_are_called_in_predict(self):
+    x = np.ones((10, 10))
+
+    model = self._get_model()
+    counter = Counter()
+    model.predict(x, batch_size=2, callbacks=[counter])
+    self._check_counts(
+        counter, {
+            'on_predict_batch_begin': 5,
+            'on_predict_batch_end': 5,
+            'on_predict_begin': 1,
+            'on_predict_end': 1
+        })
+
+  def test_callback_list_methods(self):
+    counter = Counter()
+    callback_list = keras.callbacks.CallbackList([counter])
+
+    batch = 0
+    callback_list.on_test_batch_begin(batch)
+    callback_list.on_test_batch_end(batch)
+    callback_list.on_predict_batch_begin(batch)
+    callback_list.on_predict_batch_end(batch)
+
+    self._check_counts(
+        counter, {
+            'on_test_batch_begin': 1,
+            'on_test_batch_end': 1,
+            'on_predict_batch_begin': 1,
+            'on_predict_batch_end': 1
+        })
+
+
 class KerasCallbacksTest(test.TestCase):
 
   def test_ModelCheckpoint(self):
@@ -404,7 +541,6 @@ class KerasCallbacksTest(test.TestCase):
           float(keras.backend.get_value(
               model.optimizer.lr)) - 0.01 / 4) < keras.backend.epsilon()
 
-  @test_util.run_deprecated_v1
   def test_ReduceLROnPlateau(self):
     with self.cached_session():
       np.random.seed(1337)
@@ -780,79 +916,6 @@ class KerasCallbacksTest(test.TestCase):
           data_generator(True), len(x_train), epochs=2, callbacks=cbks)
       assert os.path.exists(temp_dir)
 
-  @test_util.run_deprecated_v1
-  def test_TensorBoard_histogram_freq_must_have_validation_data(self):
-    np.random.seed(1337)
-    tmpdir = self.get_temp_dir()
-    self.addCleanup(shutil.rmtree, tmpdir, ignore_errors=True)
-
-    with self.cached_session():
-      filepath = os.path.join(tmpdir, 'logs')
-
-      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-          train_samples=TRAIN_SAMPLES,
-          test_samples=TEST_SAMPLES,
-          input_shape=(INPUT_DIM,),
-          num_classes=NUM_CLASSES)
-      y_test = keras.utils.to_categorical(y_test)
-      y_train = keras.utils.to_categorical(y_train)
-
-      def data_generator(train):
-        if train:
-          max_batch_index = len(x_train) // BATCH_SIZE
-        else:
-          max_batch_index = len(x_test) // BATCH_SIZE
-        i = 0
-        while 1:
-          if train:
-            # simulate multi-input/output models
-            yield (x_train[i * BATCH_SIZE: (i + 1) * BATCH_SIZE],
-                   y_train[i * BATCH_SIZE: (i + 1) * BATCH_SIZE])
-          else:
-            yield (x_test[i * BATCH_SIZE: (i + 1) * BATCH_SIZE],
-                   y_test[i * BATCH_SIZE: (i + 1) * BATCH_SIZE])
-          i += 1
-          i %= max_batch_index
-
-      inp = keras.Input((INPUT_DIM,))
-      hidden = keras.layers.Dense(2, activation='relu')(inp)
-      hidden = keras.layers.Dropout(0.1)(hidden)
-      output = keras.layers.Dense(NUM_CLASSES, activation='softmax')(hidden)
-      model = keras.models.Model(inputs=inp, outputs=output)
-      model.compile(loss='categorical_crossentropy',
-                    optimizer='sgd',
-                    metrics=['accuracy'])
-
-      # we must generate new callbacks for each test, as they aren't stateless
-      def callbacks_factory(histogram_freq):
-        return [keras.callbacks.TensorBoard(
-            log_dir=filepath,
-            histogram_freq=histogram_freq,
-            write_images=True, write_grads=True,
-            batch_size=5)]
-
-      # fit w/o validation data should raise ValueError if histogram_freq > 0
-      cbs = callbacks_factory(histogram_freq=1)
-      with self.assertRaises(ValueError):
-        model.fit(
-            x_train, y_train, batch_size=BATCH_SIZE, callbacks=cbs, epochs=3)
-
-      for cb in cbs:
-        cb.on_train_end()
-
-      # fit generator without validation data should raise ValueError if
-      # histogram_freq > 0
-      cbs = callbacks_factory(histogram_freq=1)
-      with self.assertRaises(ValueError):
-        model.fit_generator(
-            data_generator(True), len(x_train), epochs=2, callbacks=cbs)
-
-      for cb in cbs:
-        cb.on_train_end()
-
-      # Make sure file writer cache is clear to avoid failures during cleanup.
-      writer_cache.FileWriterCache.clear()
-
   @test_util.run_deprecated_v1
   def test_TensorBoard_multi_input_output(self):
     np.random.seed(1337)
@@ -1001,7 +1064,7 @@ class KerasCallbacksTest(test.TestCase):
           epochs=3,
           verbose=0)
 
-      self.assertAllEqual(tsb.writer.steps_seen, [0, 0.5, 1, 1.5, 2, 2.5])
+      self.assertAllEqual(tsb.writer.steps_seen, [0, 1, 2, 3, 4, 5])
 
   @test_util.run_deprecated_v1
   def test_Tensorboard_histogram_summaries_with_generator(self):
diff --git a/tensorflow/python/keras/constraints_test.py b/tensorflow/python/keras/constraints_test.py
index 4f674ea7c5826f916f31f08d60d060e024931a9f..92bc4852cff849674457a6546340a7a2bdd9b79f 100644
--- a/tensorflow/python/keras/constraints_test.py
+++ b/tensorflow/python/keras/constraints_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python import keras
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
@@ -35,6 +36,7 @@ def get_example_array():
   return example_array
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class KerasConstraintsTest(test.TestCase):
 
   def test_serialization(self):
@@ -49,54 +51,47 @@ class KerasConstraintsTest(test.TestCase):
       assert fn.__class__ == ref_fn.__class__
 
   def test_max_norm(self):
-    with self.cached_session():
-      array = get_example_array()
-      for m in get_test_values():
-        norm_instance = keras.constraints.max_norm(m)
-        normed = norm_instance(keras.backend.variable(array))
-        assert np.all(keras.backend.eval(normed) < m)
-
-      # a more explicit example
-      norm_instance = keras.constraints.max_norm(2.0)
-      x = np.array([[0, 0, 0], [1.0, 0, 0], [3, 0, 0], [3, 3, 3]]).T
-      x_normed_target = np.array([[0, 0, 0], [1.0, 0, 0],
-                                  [2.0, 0, 0],
-                                  [2. / np.sqrt(3),
-                                   2. / np.sqrt(3),
-                                   2. / np.sqrt(3)]]).T
-      x_normed_actual = keras.backend.eval(
-          norm_instance(keras.backend.variable(x)))
-      self.assertAllClose(x_normed_actual, x_normed_target, rtol=1e-05)
+    array = get_example_array()
+    for m in get_test_values():
+      norm_instance = keras.constraints.max_norm(m)
+      normed = norm_instance(keras.backend.variable(array))
+      assert np.all(keras.backend.eval(normed) < m)
+
+    # a more explicit example
+    norm_instance = keras.constraints.max_norm(2.0)
+    x = np.array([[0, 0, 0], [1.0, 0, 0], [3, 0, 0], [3, 3, 3]]).T
+    x_normed_target = np.array(
+        [[0, 0, 0], [1.0, 0, 0], [2.0, 0, 0],
+         [2. / np.sqrt(3), 2. / np.sqrt(3), 2. / np.sqrt(3)]]).T
+    x_normed_actual = keras.backend.eval(
+        norm_instance(keras.backend.variable(x)))
+    self.assertAllClose(x_normed_actual, x_normed_target, rtol=1e-05)
 
   def test_non_neg(self):
-    with self.cached_session():
-      non_neg_instance = keras.constraints.non_neg()
-      normed = non_neg_instance(keras.backend.variable(get_example_array()))
-      assert np.all(np.min(keras.backend.eval(normed), axis=1) == 0.)
+    non_neg_instance = keras.constraints.non_neg()
+    normed = non_neg_instance(keras.backend.variable(get_example_array()))
+    assert np.all(np.min(keras.backend.eval(normed), axis=1) == 0.)
 
   def test_unit_norm(self):
-    with self.cached_session():
-      unit_norm_instance = keras.constraints.unit_norm()
-      normalized = unit_norm_instance(
-          keras.backend.variable(get_example_array()))
-      norm_of_normalized = np.sqrt(
-          np.sum(keras.backend.eval(normalized) ** 2, axis=0))
-      # In the unit norm constraint, it should be equal to 1.
-      difference = norm_of_normalized - 1.
-      largest_difference = np.max(np.abs(difference))
-      assert np.abs(largest_difference) < 10e-5
+    unit_norm_instance = keras.constraints.unit_norm()
+    normalized = unit_norm_instance(keras.backend.variable(get_example_array()))
+    norm_of_normalized = np.sqrt(
+        np.sum(keras.backend.eval(normalized)**2, axis=0))
+    # In the unit norm constraint, it should be equal to 1.
+    difference = norm_of_normalized - 1.
+    largest_difference = np.max(np.abs(difference))
+    assert np.abs(largest_difference) < 10e-5
 
   def test_min_max_norm(self):
-    with self.cached_session():
-      array = get_example_array()
-      for m in get_test_values():
-        norm_instance = keras.constraints.min_max_norm(min_value=m,
-                                                       max_value=m * 2)
-        normed = norm_instance(keras.backend.variable(array))
-        value = keras.backend.eval(normed)
-        l2 = np.sqrt(np.sum(np.square(value), axis=0))
-        assert not l2[l2 < m]
-        assert not l2[l2 > m * 2 + 1e-5]
+    array = get_example_array()
+    for m in get_test_values():
+      norm_instance = keras.constraints.min_max_norm(
+          min_value=m, max_value=m * 2)
+      normed = norm_instance(keras.backend.variable(array))
+      value = keras.backend.eval(normed)
+      l2 = np.sqrt(np.sum(np.square(value), axis=0))
+      assert not l2[l2 < m]
+      assert not l2[l2 > m * 2 + 1e-5]
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py
index 8e353003429a6e57da3a40b36ff9aa3d708f8a11..aeed75065295cba79fe35a775616343a5c872c80 100644
--- a/tensorflow/python/keras/engine/base_layer.py
+++ b/tensorflow/python/keras/engine/base_layer.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import functools
 import inspect  # Necessary supplement to tf_inspect to deal with variadic args.
+import itertools
 
 import numpy as np
 from six.moves import zip  # pylint: disable=redefined-builtin
@@ -45,6 +46,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.checkpointable import layer_utils as checkpointable_layer_utils
 from tensorflow.python.util import function_utils
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_decorator
@@ -82,6 +84,12 @@ class Layer(checkpointable.CheckpointableBase):
     name: String name of the layer.
     dtype: Default dtype of the layer's weights (default of `None` means use the
       type of the first input).
+    dynamic: Set this to `True` if your layer should only be run eagerly, and
+      should not be used to generate a static computation graph.
+      This would be the case for a Tree-RNN or a recursive network,
+      for example, or generally for any layer that manipulates tensors
+      using Python control flow. If `False`, we assume that the layer can
+      safely be used to generate a static computation graph.
 
   Read-only properties:
     name: The name of the layer (string).
@@ -102,7 +110,8 @@ class Layer(checkpointable.CheckpointableBase):
   """
 
   @checkpointable.no_automatic_dependency_tracking
-  def __init__(self, trainable=True, name=None, dtype=None, **kwargs):
+  def __init__(self, trainable=True, name=None, dtype=None, dynamic=False,
+               **kwargs):
     # These properties should be set by the user via keyword arguments.
     # note that 'dtype', 'input_shape' and 'batch_input_shape'
     # are only applicable to input layers: do not pass these keywords
@@ -135,8 +144,10 @@ class Layer(checkpointable.CheckpointableBase):
 
     self._init_set_name(name)
     self._activity_regularizer = kwargs.pop('activity_regularizer', None)
-    self._trainable_weights = []
-    self._non_trainable_weights = []
+    if not hasattr(self, '_trainable_weights'):
+      self._trainable_weights = []
+    if not hasattr(self, '_non_trainable_weights'):
+      self._non_trainable_weights = []
     self._updates = []
     # A list of zero-argument lambdas which return Tensors, used for variable
     # regularizers.
@@ -164,6 +175,8 @@ class Layer(checkpointable.CheckpointableBase):
                                    hasattr(self, 'compute_mask'))
     self._call_convention = (base_layer_utils
                              .CallConvention.EXPLICIT_INPUTS_ARGUMENT)
+    if not hasattr(self, '_layers'):
+      self._layers = []  # Dependencies tracked via attribute assignment.
 
     # These lists will be filled via successive calls
     # to self._add_inbound_node().
@@ -177,7 +190,7 @@ class Layer(checkpointable.CheckpointableBase):
       self._expects_training_arg = False
 
     # Whether the `call` method can be used to build a TF graph without issues.
-    self._call_is_graph_friendly = True
+    self._dynamic = dynamic
 
     # Manage input shape information if passed.
     if 'input_shape' in kwargs or 'batch_input_shape' in kwargs:
@@ -509,7 +522,6 @@ class Layer(checkpointable.CheckpointableBase):
     # mode when all inputs can be traced back to `keras.Input()` (when building
     # models using the functional API).
     build_graph = tf_utils.are_all_symbolic_tensors(input_list)
-    executing_eagerly = context.executing_eagerly()
 
     # Handle Keras mask propagation from previous layer to current layer.
     previous_mask = None
@@ -517,16 +529,13 @@ class Layer(checkpointable.CheckpointableBase):
                         self._compute_previous_mask):
       previous_mask = base_layer_utils.collect_previous_mask(inputs)
       if not hasattr(self, '_call_fn_args'):
-        self._call_fn_args = self._no_dependency(
-            function_utils.fn_args(self.call))
+        self._call_fn_args = function_utils.fn_args(self.call)
       if ('mask' in self._call_fn_args and 'mask' not in kwargs and
           not generic_utils.is_all_none(previous_mask)):
         # The previous layer generated a mask, and mask was not explicitly pass
         # to __call__, hence we set previous_mask as the default value.
         kwargs['mask'] = previous_mask
 
-    input_shapes = None
-
     with ops.name_scope(self._name_scope()):
       if not self.built:
         # Build layer if applicable (if the `build` method has been overridden).
@@ -543,30 +552,28 @@ class Layer(checkpointable.CheckpointableBase):
             self.input_spec, inputs, self.name)
         graph = backend.get_graph()
         with graph.as_default():
-          if not executing_eagerly:
-            # In graph mode, failure to build the layer's graph
-            # implies a user-side bug. We don't catch exceptions.
-            outputs = self.call(inputs, *args, **kwargs)
-          else:
+          if not self.dynamic:
             try:
               outputs = self.call(inputs, *args, **kwargs)
-            except Exception:  # pylint: disable=broad-except
-              # Any issue during graph-building means we will later run the
-              # model in eager mode, whether the issue was related to
-              # graph mode or not. This provides a nice debugging experience.
-              self._call_is_graph_friendly = False
-              # We will use static shape inference to return symbolic tensors
-              # matching the specifications of the layer outputs.
-              # Since we have set `self._call_is_graph_friendly = False`,
-              # we will never attempt to run the underlying TF graph (which is
-              # disconnected).
-              # TODO(fchollet): consider py_func as an alternative, which
-              # would enable us to run the underlying graph if needed.
-              input_shapes = nest.map_structure(lambda x: x.shape, inputs)
-              output_shapes = self.compute_output_shape(input_shapes)
-              outputs = nest.map_structure(
-                  lambda shape: backend.placeholder(shape, dtype=self.dtype),
-                  output_shapes)
+            except TypeError as e:
+              messages = ['`tf.Tensor` as a Python `bool` is not allowed',
+                          'Tensor objects are only iterable when eager']
+              for msg in messages:
+                if msg in str(e):
+                  raise TypeError('You are attempting to use Python control '
+                                  'flow in a layer that was not declared to be '
+                                  'dynamic. Pass `dynamic=True` to the class '
+                                  'constructor.\nEncountered error:\n"""\n' +
+                                  str(e) + '\n"""')
+              raise e
+          else:
+            # We will use static shape inference to return symbolic tensors
+            # matching the specifications of the layer outputs.
+            # Since `self.dynamic` is True, we will never attempt to
+            # run the underlying TF graph (which is disconnected).
+            # TODO(fchollet): consider py_func as an alternative, which
+            # would enable us to run the underlying graph if needed.
+            outputs = self._symbolic_call(inputs)
 
           if outputs is None:
             raise ValueError('A layer\'s `call` method should return a '
@@ -580,7 +587,9 @@ class Layer(checkpointable.CheckpointableBase):
           if hasattr(self, '_set_inputs') and not self.inputs:
             # Subclassed network: explicitly set metadata normally set by
             # a call to self._set_inputs().
-            # This is not relevant in eager execution.
+            # TODO(b/120997007): This should be done in Eager as well, but
+            # causes garbage collection issues because of the placeholders
+            # created on the default Keras graph.
             self._set_inputs(inputs, outputs)
       else:
         # Eager execution on data tensors.
@@ -605,6 +614,10 @@ class Layer(checkpointable.CheckpointableBase):
   def name(self):
     return self._name
 
+  @property
+  def dynamic(self):
+    return self._dynamic
+
   @property
   def activity_regularizer(self):
     """Optional regularizer function for the output of this layer."""
@@ -613,18 +626,24 @@ class Layer(checkpointable.CheckpointableBase):
   @activity_regularizer.setter
   def activity_regularizer(self, regularizer):
     """Optional regularizer function for the output of this layer."""
-    self._activity_regularizer = self._no_dependency(regularizer)
+    self._activity_regularizer = regularizer
 
   @property
   def trainable_weights(self):
-    return self._trainable_weights if self.trainable else []
+    if self.trainable:
+      nested = self._gather_children_attribute('trainable_weights')
+      return self._trainable_weights + nested
+    else:
+      return []
 
   @property
   def non_trainable_weights(self):
     if self.trainable:
-      return self._non_trainable_weights
+      nested = self._gather_children_attribute('non_trainable_weights')
+      return self._non_trainable_weights + nested
     else:
-      return self._trainable_weights + self._non_trainable_weights
+      nested = self._gather_children_attribute('weights')
+      return self._trainable_weights + self._non_trainable_weights + nested
 
   @property
   def weights(self):
@@ -639,7 +658,7 @@ class Layer(checkpointable.CheckpointableBase):
   def updates(self):
     if not self.trainable and not self.stateful:
       return []
-    return self._updates
+    return self._updates + self._gather_children_attribute('updates')
 
   @property
   def losses(self):
@@ -661,7 +680,7 @@ class Layer(checkpointable.CheckpointableBase):
       loss_tensor = regularizer()
       if loss_tensor is not None:
         collected_losses.append(loss_tensor)
-    return collected_losses
+    return collected_losses + self._gather_children_attribute('losses')
 
   @doc_controls.for_subclass_implementers
   def add_loss(self, losses, inputs=None):
@@ -1311,9 +1330,8 @@ class Layer(checkpointable.CheckpointableBase):
 
     def _loss_for_variable(v):
       """Creates a regularization loss `Tensor` for variable `v`."""
-      with ops.colocate_with(v):
-        with ops.name_scope(name + '/Regularizer'):
-          regularization = regularizer(v)
+      with ops.name_scope(name + '/Regularizer'):
+        regularization = regularizer(v)
       return regularization
 
     if isinstance(variable, tf_variables.PartitionedVariable):
@@ -1558,23 +1576,6 @@ class Layer(checkpointable.CheckpointableBase):
     else:
       return values
 
-  @property
-  def _static_graph_friendly(self):
-    """Whether the layer can be called to create a static graph.
-
-    Because of nesting, there are two components to being "graph-friendly":
-      1) all inner layers are graph-friendly
-      2) the way they are composed is graph-friendly.
-    We denote the latter as "_call_is_graph_friendly", and define
-    "_static_graph_friendly" as being the combination of
-    "_call_is_graph_friendly" and "all inner layers are _static_graph_friendly".
-    For atomic layers (no inner layers), this is just "_call_is_graph_friendly".
-
-    Returns:
-      Boolean.
-    """
-    return self._call_is_graph_friendly
-
   def _maybe_build(self, inputs):
     # Check input assumptions set before layer building, e.g. input rank.
     input_spec.assert_input_compatibility(
@@ -1592,6 +1593,57 @@ class Layer(checkpointable.CheckpointableBase):
     if not hasattr(self.build, '_is_default'):
       self.build(input_shapes)
 
+  def _symbolic_call(self, inputs):
+    input_shapes = nest.map_structure(lambda x: x.shape, inputs)
+    output_shapes = self.compute_output_shape(input_shapes)
+    return nest.map_structure(
+        lambda shape: backend.placeholder(shape, dtype=self.dtype),
+        output_shapes)
+
+  def __setattr__(self, name, value):
+    if (not getattr(self, '_setattr_tracking', True) or
+        getattr(self, '_is_graph_network', False)):
+      super(Layer, self).__setattr__(name, value)
+      return
+
+    # Append value to self._layers if relevant
+    if (isinstance(value, Layer) or
+        checkpointable_layer_utils.has_weights(value)):
+      # Initialize `_layers` here in case `__init__` has not yet been called.
+      if not hasattr(self, '_layers'):
+        self._layers = []
+      # We need to check object identity to avoid de-duplicating empty
+      # container types which compare equal.
+      if not any((layer is value for layer in self._layers)):
+        self._layers.append(value)
+        if hasattr(value, '_use_resource_variables'):
+          # Legacy layers (V1 tf.layers) must always use
+          # resource variables.
+          value._use_resource_variables = True
+
+    # Append value to list of trainable / non-trainable weights if relevant
+    if isinstance(value, tf_variables.Variable):
+      # Users may add extra weights/variables
+      # simply by assigning them to attributes (invalid for graph networks)
+      if not hasattr(self, '_trainable_weights'):
+        self._trainable_weights = []
+      if not hasattr(self, '_non_trainable_weights'):
+        self._non_trainable_weights = []
+      if value not in self._trainable_weights + self._non_trainable_weights:
+        if value.trainable:
+          self._trainable_weights.append(value)
+        else:
+          self._non_trainable_weights.append(value)
+    super(Layer, self).__setattr__(name, value)
+
+  def _gather_children_attribute(self, attribute):
+    assert attribute in {'weights', 'trainable_weights',
+                         'non_trainable_weights', 'updates', 'losses'}
+    if hasattr(self, '_layers'):
+      return list(itertools.chain.from_iterable(
+          getattr(layer, attribute) for layer in self._layers))
+    return []
+
 
 class Node(object):
   """A `Node` describes the connectivity between two layers.
diff --git a/tensorflow/python/keras/engine/base_layer_test.py b/tensorflow/python/keras/engine/base_layer_test.py
index 798775b6a5b29aa72a2c766584811aa469db2471..ebee4a3043e57d149bb8d81812e7568aff8f8eb8 100644
--- a/tensorflow/python/keras/engine/base_layer_test.py
+++ b/tensorflow/python/keras/engine/base_layer_test.py
@@ -18,20 +18,28 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python import keras
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.keras.engine import base_layer
+from tensorflow.python.keras.optimizer_v2 import rmsprop
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
-from tensorflow.python.training.rmsprop import RMSPropOptimizer
 
 
 class DynamicLayer1(base_layer.Layer):
 
+  def __init__(self, dynamic=False, **kwargs):
+    super(DynamicLayer1, self).__init__(dynamic=dynamic, **kwargs)
+
   def call(self, inputs):
     if math_ops.reduce_sum(inputs) > 0:
       return math_ops.sqrt(inputs)
@@ -44,6 +52,9 @@ class DynamicLayer1(base_layer.Layer):
 
 class DynamicLayer2(base_layer.Layer):
 
+  def __init__(self, dynamic=False, **kwargs):
+    super(DynamicLayer2, self).__init__(dynamic=dynamic, **kwargs)
+
   def call(self, inputs):
     samples = []
     for sample in inputs:
@@ -59,67 +70,145 @@ class InvalidLayer(base_layer.Layer):
   def call(self, inputs):
     raise ValueError('You did something wrong!')
 
-  def compute_output_shape(self, input_shape):
-    return input_shape
-
 
-class BaseLayerTest(test.TestCase):
+class BaseLayerTest(test.TestCase, parameterized.TestCase):
 
-  def test_dynamic_layer_in_functional_model_in_graph_mode(self):
+  @parameterized.parameters(DynamicLayer1, DynamicLayer2)
+  def test_dynamic_layer_in_functional_model_in_graph_mode(self, layer_class):
     with context.graph_mode():
       inputs = keras.Input((3,))
+      # Works when `dynamic=True` is declared.
+      outputs = layer_class(dynamic=True)(inputs)
+      model = keras.Model(inputs, outputs)
+      self.assertEqual(model.dynamic, True)
+      # But then you cannot run the model since you're in a graph scope.
       with self.assertRaisesRegexp(
-          TypeError, 'Using a `tf.Tensor` as a Python `bool` is not allowed'):
-        _ = DynamicLayer1()(inputs)
+          ValueError, 'You must enable eager execution'):
+        model.compile(rmsprop.RMSprop(0.001), loss='mse')
 
-      inputs = keras.Input((3,))
+      # Fails when `dynamic=True` not declared.
       with self.assertRaisesRegexp(
-          TypeError, 'Tensor objects are only iterable when eager'):
-        _ = DynamicLayer2()(inputs)
+          TypeError, 'attempting to use Python control flow'):
+        _ = layer_class()(inputs)
 
-  def test_dynamic_layer_in_functional_model_in_eager_mode(self):
+  @parameterized.parameters(DynamicLayer1, DynamicLayer2)
+  def test_dynamic_layer_in_functional_model_in_eager_mode(self, layer_class):
     inputs = keras.Input((3,))
-    outputs = DynamicLayer1()(inputs)
-    model = keras.Model(inputs, outputs)
-    self.assertEqual(model._static_graph_friendly, False)
-    model.compile(RMSPropOptimizer(0.001), loss='mse')
-    model.train_on_batch(np.random.random((2, 3)), np.random.random((2, 3)))
-
-    inputs = keras.Input((3,))
-    outputs = DynamicLayer2()(inputs)
+    # Fails when `dynamic=True` not declared.
+    with self.assertRaisesRegexp(
+        TypeError, 'attempting to use Python control flow'):
+      _ = layer_class()(inputs)
+    # Works when `dynamic=True` is declared.
+    outputs = layer_class(dynamic=True)(inputs)
     model = keras.Model(inputs, outputs)
-    self.assertEqual(model._static_graph_friendly, False)
-    model.compile(RMSPropOptimizer(0.001), loss='mse')
+    self.assertEqual(model.dynamic, True)
+    model.compile(rmsprop.RMSprop(0.001), loss='mse')
+    self.assertEqual(model.run_eagerly, True)
     model.train_on_batch(np.random.random((2, 3)), np.random.random((2, 3)))
 
-  def nested_dynamic_layers_in_eager_mode(self):
+  def test_nested_dynamic_layers_in_eager_mode(self):
     inputs = keras.Input((3,))
-    outputs = DynamicLayer1()(inputs)
+    outputs = DynamicLayer1(dynamic=True)(inputs)
     inner_model = keras.Model(inputs, outputs)
+    self.assertEqual(inner_model.dynamic, True)
 
     inputs = keras.Input((3,))
-    x = DynamicLayer2()(inputs)
+    x = DynamicLayer2(dynamic=True)(inputs)
     outputs = inner_model(x)
 
     model = keras.Model(inputs, outputs)
-    self.assertEqual(model._static_graph_friendly, False)
-    model.compile(RMSPropOptimizer(0.001), loss='mse')
+    self.assertEqual(model.dynamic, True)
+    model.compile(rmsprop.RMSprop(0.001), loss='mse')
+    self.assertEqual(model.run_eagerly, True)
     model.train_on_batch(np.random.random((2, 3)), np.random.random((2, 3)))
 
-  def test_invalid_forward_pass_in_graph_mode(self):
-    with context.graph_mode():
-      inputs = keras.Input((3,))
-      with self.assertRaisesRegexp(ValueError, 'You did something wrong!'):
-        _ = InvalidLayer()(inputs)
+  def test_dynamic_layers_in_sequential_model(self):
+    # Without input_shape argument
+    model = keras.Sequential([DynamicLayer1(dynamic=True),
+                              keras.layers.Dense(3),
+                              DynamicLayer2(dynamic=True)])
+    self.assertEqual(model.dynamic, True)
+    model.compile(rmsprop.RMSprop(0.001), loss='mse')
+    self.assertEqual(model.run_eagerly, True)
+    model.train_on_batch(np.random.random((2, 3)), np.random.random((2, 3)))
+
+    # With input_shape argument
+    model = keras.Sequential([DynamicLayer1(dynamic=True, input_shape=(3,)),
+                              DynamicLayer2(dynamic=True)])
+    self.assertEqual(model.dynamic, True)
+    model.compile(rmsprop.RMSprop(0.001), loss='mse')
+    self.assertEqual(model.run_eagerly, True)
+    model.train_on_batch(np.random.random((2, 3)), np.random.random((2, 3)))
+
+  def test_dynamic_layers_in_subclassed_model(self):
 
-  def test_invalid_forward_pass_in_eager_mode(self):
+    class MyModel(keras.Model):
+
+      def __init__(self):
+        super(MyModel, self).__init__()
+        self.layer1 = DynamicLayer1(dynamic=True)
+
+      def call(self, inputs):
+        return self.layer1(inputs)
+
+    model = MyModel()
+    self.assertEqual(model.dynamic, True)
+    model.compile(rmsprop.RMSprop(0.001), loss='mse')
+    self.assertEqual(model.run_eagerly, True)
+    model.train_on_batch(np.random.random((2, 3)), np.random.random((2, 3)))
+
+  def test_dynamic_subclassed_model_no_shape_inference(self):
+
+    class MyModel(keras.Model):
+
+      def __init__(self):
+        super(MyModel, self).__init__(dynamic=True)
+        self.layer1 = keras.layers.Dense(3)
+        self.layer2 = keras.layers.Dense(3)
+
+      def call(self, inputs):
+        if math_ops.reduce_sum(inputs) > 0:
+          return self.layer1(inputs)
+        else:
+          return self.layer2(inputs)
+
+    model = MyModel()
+    self.assertEqual(model.dynamic, True)
+    model.compile(rmsprop.RMSprop(0.001), loss='mse')
+    self.assertEqual(model.run_eagerly, True)
+    model.train_on_batch(np.random.random((2, 3)), np.random.random((2, 3)))
+    self.assertEqual(model.outputs, [None])
+
+  def test_dynamic_subclassed_model_with_shape_inference(self):
+
+    class MyModel(keras.Model):
+
+      def __init__(self):
+        super(MyModel, self).__init__(dynamic=True)
+        self.layer1 = keras.layers.Dense(3)
+        self.layer2 = keras.layers.Dense(3)
+
+      def call(self, inputs):
+        if math_ops.reduce_sum(inputs) > 0:
+          return self.layer1(inputs)
+        else:
+          return self.layer2(inputs)
+
+      def compute_output_shape(self, input_shape):
+        return tensor_shape.TensorShape(
+            tuple(input_shape[:-1].as_list()) + (3,))
+
+    model = MyModel()
+    self.assertEqual(model.dynamic, True)
+    model.compile(rmsprop.RMSprop(0.001), loss='mse')
+    model.train_on_batch(np.random.random((2, 3)), np.random.random((2, 3)))
+    self.assertEqual(model.outputs[0].shape.as_list(), [None, 3])
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_invalid_forward_pass(self):
     inputs = keras.Input((3,))
-    outputs = InvalidLayer()(inputs)
-    model = keras.Model(inputs, outputs)
-    self.assertEqual(model._static_graph_friendly, False)
-    model.compile(RMSPropOptimizer(0.001), loss='mse')
     with self.assertRaisesRegexp(ValueError, 'You did something wrong!'):
-      model.train_on_batch(np.random.random((2, 3)), np.random.random((2, 3)))
+      _ = InvalidLayer()(inputs)
 
   def test_using_symbolic_tensors_with_tf_ops(self):
     # Single-input.
@@ -149,7 +238,7 @@ class BaseLayerTest(test.TestCase):
     with ops.Graph().as_default():
       x1 = array_ops.ones((3, 3))
     x2 = array_ops.ones((3, 3))
-    self.assertTrue(isinstance(x2, ops.EagerTensor))
+    self.assertIsInstance(x2, ops.EagerTensor)
     with self.assertRaisesRegexp(TypeError,
                                  'provided list of inputs contains '
                                  'objects other than \'EagerTensor\''):
@@ -164,22 +253,117 @@ class BaseLayerTest(test.TestCase):
                                  'objects other than \'EagerTensor\''):
       math_ops.matmul(x1, x2)
 
+  @test_util.run_in_graph_and_eager_modes
   def test_mixing_keras_symbolic_tensors_and_eager_tensors(self):
     x1 = keras.Input((3,))
     x2 = array_ops.ones((3, 3))
-    with self.assertRaisesRegexp(
-        TypeError,
-        'mix computation of symbolic Tensors'):
-      math_ops.matmul(x1, x2)
-
+    y = math_ops.matmul(x1, x2)
+    self.assertEqual(y.graph, keras.backend.get_graph())
+    fn = keras.backend.function(inputs=[x1], outputs=[y])
+    x_val = np.random.random((3, 3))
+    y_val = np.ones((3, 3))
+    self.assertAllClose(fn([x_val])[0],
+                        np.matmul(x_val, y_val),
+                        atol=1e-5)
+
+  @test_util.run_in_graph_and_eager_modes
   def test_mixing_keras_symbolic_tensors_and_numpy_arrays(self):
-    # For the time being we treat Numpy arrays as EagerTensors when mixing both.
     x1 = keras.Input((3,))
     x2 = np.ones((3, 3), dtype='float32')
-    with self.assertRaisesRegexp(
-        TypeError,
-        'mix computation of symbolic Tensors'):
-      math_ops.matmul(x1, x2)
+    y = math_ops.matmul(x1, x2)
+    self.assertEqual(y.graph, keras.backend.get_graph())
+    fn = keras.backend.function(inputs=[x1], outputs=[y])
+    x_val = np.random.random((3, 3))
+    y_val = np.ones((3, 3))
+    self.assertAllClose(fn([x_val])[0],
+                        np.matmul(x_val, y_val),
+                        atol=1e-5)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class NestedTrackingTest(test.TestCase):
+
+  def test_nested_layer_variable_tracking(self):
+    # Test that variables from nested sublayers are
+    # being tracked by subclassed layers.
+
+    class MyLayer(keras.layers.Layer):
+
+      def __init__(self):
+        super(MyLayer, self).__init__()
+        self.dense1 = keras.layers.Dense(1)
+        self.dense2 = keras.layers.BatchNormalization()
+
+      def build(self, input_shape):
+        self.v1 = self.add_weight('v1', shape=input_shape[1:].as_list())
+        self.v2 = variables.Variable(
+            name='v2',
+            initial_value=np.zeros(input_shape[1:].as_list(), dtype='float32'),
+            trainable=False)
+
+      def call(self, inputs):
+        x = self.dense1(inputs) + self.dense2(inputs)
+        return x + self.v1 + self.v2
+
+    layer = MyLayer()
+    inputs = keras.Input((1,))
+    _ = layer(inputs)
+
+    self.assertEqual(len(layer.weights), 8)
+    self.assertEqual(len(layer.trainable_weights), 5)
+    self.assertEqual(len(layer.non_trainable_weights), 3)
+
+    layer.dense1.trainable = False
+    self.assertEqual(len(layer.weights), 8)
+    self.assertEqual(len(layer.trainable_weights), 3)
+    self.assertEqual(len(layer.non_trainable_weights), 5)
+
+    layer.trainable = False
+    self.assertEqual(len(layer.weights), 8)
+    self.assertEqual(len(layer.trainable_weights), 0)
+    self.assertEqual(len(layer.non_trainable_weights), 8)
+
+  def test_nested_layer_updates_losses_tracking(self):
+    # Test that updates and losses from nested sublayers are
+    # being tracked by subclassed layers.
+
+    class UpdateAndLossLayer(keras.layers.Layer):
+
+      def build(self, _):
+        self.v1 = self.add_weight('v1', shape=())
+
+      def call(self, inputs):
+        self.add_loss(math_ops.reduce_sum(inputs))
+        self.add_update(state_ops.assign_add(self.v1, 1))
+        return inputs + 1
+
+    class MyLayer(keras.layers.Layer):
+
+      def build(self, _):
+        self.v1 = self.add_weight('v1', shape=())
+
+      def __init__(self):
+        super(MyLayer, self).__init__()
+        self.ul1 = UpdateAndLossLayer()
+        self.ul2 = UpdateAndLossLayer()
+
+      def call(self, inputs):
+        self.add_loss(math_ops.reduce_sum(inputs))
+        self.add_update(state_ops.assign_add(self.v1, 1))
+        x = self.ul1(inputs)
+        return self.ul2(x)
+
+    layer = MyLayer()
+
+    if context.executing_eagerly():
+      inputs = array_ops.ones((3, 1))
+      _ = layer(inputs)
+      self.assertEqual(len(layer.losses), 3)
+    else:
+      inputs = keras.Input((1,))
+      _ = layer(inputs)
+      self.assertEqual(len(layer.losses), 3)
+      self.assertEqual(len(layer.updates), 3)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/engine/correctness_test.py b/tensorflow/python/keras/engine/correctness_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c2f3b040de3269c6921d95d8a845869511ac0634
--- /dev/null
+++ b/tensorflow/python/keras/engine/correctness_test.py
@@ -0,0 +1,147 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for numerical correctness."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python import keras
+from tensorflow.python.keras import keras_parameterized
+from tensorflow.python.keras import testing_utils
+from tensorflow.python.platform import test
+
+
+class Bias(keras.layers.Layer):
+  """Layer that add a bias to its inputs."""
+
+  def build(self, input_shape):
+    self.bias = self.add_variable('bias', (1,), initializer='zeros')
+
+  def call(self, inputs):
+    return inputs + self.bias
+
+
+class MultiInputSubclassed(keras.Model):
+  """Subclassed Model that adds its inputs and then adds a bias."""
+
+  def __init__(self):
+    super(MultiInputSubclassed, self).__init__()
+    self.add = keras.layers.Add()
+    self.bias = Bias()
+
+  def call(self, inputs):
+    added = self.add(inputs)
+    return self.bias(added)
+
+
+def multi_input_functional():
+  """Functional Model that adds its inputs and then adds a bias."""
+  input_1 = keras.Input(shape=(1,))
+  input_2 = keras.Input(shape=(1,))
+  input_3 = keras.Input(shape=(1,))
+  added = keras.layers.Add()([input_1, input_2, input_3])
+  output = Bias()(added)
+  return keras.Model([input_1, input_2, input_3], output)
+
+
+@keras_parameterized.run_with_all_model_types
+@keras_parameterized.run_all_keras_modes
+class SimpleBiasTest(keras_parameterized.TestCase):
+
+  def _get_simple_bias_model(self):
+    model = testing_utils.get_model_from_layers([Bias()], input_shape=(1,))
+    model.compile(keras.optimizer_v2.gradient_descent.SGD(0.1), 'mae')
+    return model
+
+  def test_simple_bias_fit(self):
+    x = np.array([[0.], [1.], [2.]])
+    y = np.array([[0.5], [2.], [3.5]])
+    model = self._get_simple_bias_model()
+
+    history = model.fit(x, y, batch_size=3, epochs=5)
+    self.assertAllClose(history.history['loss'], [1., 0.9, 0.8, 0.7, 0.6])
+
+  def test_simple_bias_evaluate(self):
+    x = np.array([[0.], [1.], [2.]])
+    y = np.array([[1.], [3.], [5.]])
+    model = self._get_simple_bias_model()
+
+    loss = model.evaluate(x, y, batch_size=1)
+    self.assertAlmostEqual(loss, 2.)
+
+  def test_simple_bias_predict(self):
+    x = np.array([[0.], [1.], [2.]])
+    model = self._get_simple_bias_model()
+
+    pred = model.predict(x, batch_size=1)
+    self.assertAllClose(x, pred)
+
+
+@keras_parameterized.run_all_keras_modes
+class MultipleInputTest(keras_parameterized.TestCase):
+
+  def _get_multiple_input_model(self, subclassed=True):
+    if subclassed:
+      model = MultiInputSubclassed()
+    else:
+      model = multi_input_functional()
+    model.compile(keras.optimizer_v2.gradient_descent.SGD(0.1), 'mae')
+    return model
+
+  @parameterized.named_parameters(('subclassed', True), ('functional', False))
+  def test_multiple_input_fit(self, subclassed):
+    x = [
+        np.array([[1.], [2.], [3.]]),
+        np.array([[4.], [5.], [6.]]),
+        np.array([[7.], [8.], [9.]])
+    ]
+    y = np.array([[12.5], [16.], [19.5]])
+
+    model = self._get_multiple_input_model(subclassed)
+    history = model.fit(x, y, batch_size=3, epochs=5)
+    self.assertAllClose(history.history['loss'], [1., 0.9, 0.8, 0.7, 0.6])
+
+  @parameterized.named_parameters(('subclassed', True), ('functional', False))
+  def test_multiple_input_evaluate(self, subclassed):
+    x = [
+        np.array([[1.], [2.], [3.]]),
+        np.array([[4.], [5.], [6.]]),
+        np.array([[7.], [8.], [9.]])
+    ]
+    y = np.array([[13.], [17.], [21.]])
+
+    model = self._get_multiple_input_model(subclassed)
+    loss = model.evaluate(x, y, batch_size=3)
+    self.assertAlmostEqual(loss, 2.)
+
+  @parameterized.named_parameters(('subclassed', True), ('functional', False))
+  def test_multiple_input_predict(self, subclassed):
+    x = [
+        np.array([[1.], [2.], [3.]]),
+        np.array([[4.], [5.], [6.]]),
+        np.array([[7.], [8.], [9.]])
+    ]
+
+    model = self._get_multiple_input_model(subclassed)
+    pred = model.predict(x, batch_size=1)
+    self.assertAllClose(pred, [[12.], [15.], [18.]])
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/engine/distributed_training_utils.py b/tensorflow/python/keras/engine/distributed_training_utils.py
index d100182381ed597651f9c95f3efc624502d819ce..32129afe64761048ed219a4e0caaae19292b9bc4 100644
--- a/tensorflow/python/keras/engine/distributed_training_utils.py
+++ b/tensorflow/python/keras/engine/distributed_training_utils.py
@@ -199,11 +199,19 @@ def validate_callbacks(input_callbacks, optimizer, current_strategy):
       # running ops.
       if isinstance(callback, callbacks.TensorBoard):
         if callback.__getattribute__('histogram_freq'):
-          raise ValueError('histogram_freq in the TensorBoard callback is not '
-                           'supported when using DistributionStrategy.')
+          logging.warning(
+              UserWarning(
+                  '`histogram_freq` in the TensorBoard callback is not '
+                  'supported when using DistributionStrategy. Setting '
+                  '`histogram_freq` to `0`.'))
+          callback.histogram_freq = 0
         if callback.__getattribute__('write_grads'):
-          raise ValueError('write_grads in the TensorBoard callback is not '
-                           'supported when using DistributionStrategy.')
+          logging.warning(
+              UserWarning(
+                  '`write_grads` in the TensorBoard callback is not supported '
+                  'when using DistributionStrategy. Setting `write_grads` '
+                  'to `False`.'))
+          callback.histogram_freq = False
 
 
 def validate_distributed_dataset_inputs(distribution_strategy, x, y,
diff --git a/tensorflow/python/keras/engine/feature_columns_integration_test.py b/tensorflow/python/keras/engine/feature_columns_integration_test.py
index b7549e013c909a72198018985e2c96d2c20199ea..b3f8cfe72585188d631c072b690729054d5db775 100644
--- a/tensorflow/python/keras/engine/feature_columns_integration_test.py
+++ b/tensorflow/python/keras/engine/feature_columns_integration_test.py
@@ -18,15 +18,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python import keras
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.eager import context
 from tensorflow.python.feature_column import feature_column_lib as fc
-from tensorflow.python.framework import test_util as tf_test_util
+from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import metrics as metrics_module
+from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
 from tensorflow.python.training import rmsprop
 
@@ -44,12 +43,12 @@ class TestDNNModel(keras.models.Model):
     return net
 
 
-class FeatureColumnsIntegrationTest(test.TestCase, parameterized.TestCase):
+class FeatureColumnsIntegrationTest(keras_parameterized.TestCase):
   """Most Sequential model API tests are covered in `training_test.py`.
 
   """
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_sequential_model(self):
     columns = [fc.numeric_column('a')]
     model = keras.models.Sequential([
@@ -60,7 +59,8 @@ class FeatureColumnsIntegrationTest(test.TestCase, parameterized.TestCase):
     model.compile(
         optimizer=rmsprop.RMSPropOptimizer(1e-3),
         loss='categorical_crossentropy',
-        metrics=['accuracy'])
+        metrics=['accuracy'],
+        run_eagerly=testing_utils.should_run_eagerly())
 
     x = {'a': np.random.random((10, 1))}
     y = np.random.randint(20, size=(10, 1))
@@ -70,7 +70,7 @@ class FeatureColumnsIntegrationTest(test.TestCase, parameterized.TestCase):
     model.evaluate(x, y, batch_size=5)
     model.predict(x, batch_size=5)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_sequential_model_with_ds_input(self):
     columns = [fc.numeric_column('a')]
     model = keras.models.Sequential([
@@ -81,7 +81,8 @@ class FeatureColumnsIntegrationTest(test.TestCase, parameterized.TestCase):
     model.compile(
         optimizer=rmsprop.RMSPropOptimizer(1e-3),
         loss='categorical_crossentropy',
-        metrics=['accuracy'])
+        metrics=['accuracy'],
+        run_eagerly=testing_utils.should_run_eagerly())
 
     y = np.random.randint(20, size=(100, 1))
     y = keras.utils.to_categorical(y, num_classes=20)
@@ -94,7 +95,7 @@ class FeatureColumnsIntegrationTest(test.TestCase, parameterized.TestCase):
     model.evaluate(ds, steps=1)
     model.predict(ds, steps=1)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_subclassed_model_with_feature_columns(self):
     col_a = fc.numeric_column('a')
     col_b = fc.numeric_column('b')
@@ -104,7 +105,8 @@ class FeatureColumnsIntegrationTest(test.TestCase, parameterized.TestCase):
     dnn_model.compile(
         optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
         loss='categorical_crossentropy',
-        metrics=['accuracy'])
+        metrics=['accuracy'],
+        run_eagerly=testing_utils.should_run_eagerly())
 
     x = {'a': np.random.random((10, 1)), 'b': np.random.random((10, 1))}
     y = np.random.randint(20, size=(10, 1))
@@ -114,10 +116,8 @@ class FeatureColumnsIntegrationTest(test.TestCase, parameterized.TestCase):
     dnn_model.evaluate(x=x, y=y, batch_size=5)
     dnn_model.predict(x=x, batch_size=5)
 
-  @parameterized.parameters(True, False)
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_subclassed_model_with_feature_columns_with_ds_input(self,
-                                                               run_eagerly):
+  @keras_parameterized.run_all_keras_modes
+  def test_subclassed_model_with_feature_columns_with_ds_input(self):
     col_a = fc.numeric_column('a')
     col_b = fc.numeric_column('b')
 
@@ -127,7 +127,7 @@ class FeatureColumnsIntegrationTest(test.TestCase, parameterized.TestCase):
         optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
         loss='categorical_crossentropy',
         metrics=['accuracy'],
-        run_eagerly=run_eagerly and context.executing_eagerly())
+        run_eagerly=testing_utils.should_run_eagerly())
 
     y = np.random.randint(20, size=(100, 1))
     y = keras.utils.to_categorical(y, num_classes=20)
@@ -140,7 +140,8 @@ class FeatureColumnsIntegrationTest(test.TestCase, parameterized.TestCase):
     dnn_model.evaluate(ds, steps=1)
     dnn_model.predict(ds, steps=1)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  # TODO(kaftan) seems to throw an error when enabled.
+  @keras_parameterized.run_all_keras_modes
   def DISABLED_test_function_model_feature_layer_input(self):
     col_a = fc.numeric_column('a')
     col_b = fc.numeric_column('b')
@@ -166,7 +167,8 @@ class FeatureColumnsIntegrationTest(test.TestCase, parameterized.TestCase):
     data = ({'a': np.arange(10), 'b': np.arange(10)}, np.arange(10, 20))
     print(model.fit(*data, epochs=1))
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  # TODO(kaftan) seems to throw an error when enabled.
+  @keras_parameterized.run_all_keras_modes
   def DISABLED_test_function_model_multiple_feature_layer_inputs(self):
     col_a = fc.numeric_column('a')
     col_b = fc.numeric_column('b')
diff --git a/tensorflow/python/keras/engine/network.py b/tensorflow/python/keras/engine/network.py
index 93ae667c8ba113304beeb1b6c891bd29b2fdbf30..7435da61cc92765846962e4f518147a80038e787 100644
--- a/tensorflow/python/keras/engine/network.py
+++ b/tensorflow/python/keras/engine/network.py
@@ -38,11 +38,11 @@ from tensorflow.python.keras import backend
 from tensorflow.python.keras.engine import base_layer
 from tensorflow.python.keras.engine import base_layer_utils
 from tensorflow.python.keras.engine import saving
+from tensorflow.python.keras.engine import training_utils
 from tensorflow.python.keras.utils import generic_utils
 from tensorflow.python.keras.utils import layer_utils
 from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.keras.utils.io_utils import ask_to_proceed_with_overwrite
-from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpoint_management
 from tensorflow.python.training.checkpointable import base as checkpointable
@@ -142,7 +142,6 @@ class Network(base_layer.Layer):
     self._metrics_tensors = {}
     self._scope = None  # Never used.
     self._reuse = None  # Never used.
-    self._call_is_graph_friendly = True
     if context.executing_eagerly():
       self._graph = None
     else:
@@ -174,43 +173,7 @@ class Network(base_layer.Layer):
       self.outputs = list(outputs)
     else:
       self.outputs = [outputs]
-
-    # Check for redundancy in inputs.
-    if len(set(self.inputs)) != len(self.inputs):
-      raise ValueError('The list of inputs passed to the model '
-                       'is redundant. '
-                       'All inputs should only appear once.'
-                       ' Found: ' + str(self.inputs))
-    for x in self.inputs:
-      # Check that x has appropriate `_keras_history` metadata.
-      if not hasattr(x, '_keras_history'):
-        cls_name = self.__class__.__name__
-        raise ValueError('Input tensors to a ' + cls_name + ' ' +
-                         'must come from `tf.keras.Input`. '
-                         'Received: ' + str(x) +
-                         ' (missing previous layer metadata).')
-      # Check that x is an input tensor.
-      # pylint: disable=protected-access
-      layer, node_index, tensor_index = x._keras_history
-      if len(layer._inbound_nodes) > 1 or (
-          layer._inbound_nodes and layer._inbound_nodes[0].inbound_layers):
-        cls_name = self.__class__.__name__
-        logging.warning(cls_name + ' inputs must come from '
-                        '`tf.keras.Input` (thus holding past layer metadata), '
-                        'they cannot be the output of '
-                        'a previous non-Input layer. '
-                        'Here, a tensor specified as '
-                        'input to "' + self.name + '" was not an Input tensor, '
-                        'it was generated by layer ' + layer.name + '.\n'
-                        'Note that input tensors are '
-                        'instantiated via `tensor = tf.keras.Input(shape)`.\n'
-                        'The tensor that caused the issue was: ' + str(x.name))
-    for x in self.outputs:
-      if not hasattr(x, '_keras_history'):
-        cls_name = self.__class__.__name__
-        raise ValueError('Output tensors to a ' + cls_name + ' must be '
-                         'the output of a TensorFlow `Layer` '
-                         '(thus holding past layer metadata). Found: ' + str(x))
+    self._validate_graph_inputs_and_outputs()
 
     self._base_init(name=name)
     self._compute_previous_mask = (
@@ -221,6 +184,7 @@ class Network(base_layer.Layer):
     self.built = True
     self._compute_output_and_mask_jointly = True
     self._is_graph_network = True
+    self._dynamic = False
 
     self._input_layers = []
     self._output_layers = []
@@ -287,9 +251,10 @@ class Network(base_layer.Layer):
       self.output_names.append(layer.name)
 
   @checkpointable.no_automatic_dependency_tracking
-  def _init_subclassed_network(self, name=None):
+  def _init_subclassed_network(self, name=None, dynamic=False):
     self._base_init(name=name)
     self._is_graph_network = False
+    self._dynamic = dynamic
     call_argspec = tf_inspect.getfullargspec(self.call)
     if 'training' in call_argspec.args:
       self._expects_training_arg = True
@@ -301,10 +266,10 @@ class Network(base_layer.Layer):
     self.built = False
 
   @property
-  def _static_graph_friendly(self):
+  def dynamic(self):
     if self._is_graph_network:
-      return all(layer._static_graph_friendly for layer in self.layers)
-    return self._call_is_graph_friendly
+      return any(layer.dynamic for layer in self.layers)
+    return self._dynamic or any(layer.dynamic for layer in self.layers)
 
   def _determine_call_convention(self, call_argspec):
     """Decides how `self.call()` is invoked. See `CallConvention`."""
@@ -362,71 +327,31 @@ class Network(base_layer.Layer):
       self._track_checkpointable(
           layer, name='layer-%d' % layer_index, overwrite=True)
 
-  def _no_dependency(self, value):
-    """Override to allow `Layer` to disable dependency tracking.
-
-    `CheckpointableBase` defines this method, whose semantics are "if a subclass
-    does dependency tracking, this method exempts `value`." Layer uses
-    `_no_dependency` to exempt some of its attribute assignments (conditional on
-    attribute assignment causing tracking in the subclass).
-
-    Args:
-      value: An object which will be assigned to an object attribute, whose
-        value should not be tracked.
-
-    Returns:
-      A wrapped object which, when assigned to an attribute, will not be
-      tracked (`value` will be stored in the attribute).
-    """
-    return data_structures.NoDependency(value)
-
   def __setattr__(self, name, value):
     if not getattr(self, '_setattr_tracking', True):
       super(Network, self).__setattr__(name, value)
       return
-    no_dependency = isinstance(value, data_structures.NoDependency)
-    value = data_structures.sticky_attribute_assignment(
-        checkpointable=self, value=value, name=name)
     if (isinstance(value, (base_layer.Layer,
-                           Network,
                            data_structures.CheckpointableDataStructure))
         or checkpointable_layer_utils.has_weights(value)):
       try:
-        is_graph_network = self._is_graph_network
+        self._is_graph_network
       except AttributeError:
         raise RuntimeError('It looks like you are subclassing `Model` and you '
                            'forgot to call `super(YourClass, self).__init__()`.'
                            ' Always start with this line.')
-      if not is_graph_network:
-        # We need to check object identity to avoid de-duplicating empty
-        # container types which compare equal.
-        if not any((layer is value for layer in self._layers)):
-          self._layers.append(value)
-          if hasattr(value, '_use_resource_variables'):
-            # In subclassed models, legacy layers (tf.layers) must always use
-            # resource variables.
-            value._use_resource_variables = True
-    if (not no_dependency
-        and isinstance(value, checkpointable.CheckpointableBase)):
-      if (  # For subclassed models only, users may add extra weights/variables
-            # simply by assigning them to attributes.
-          not self._is_graph_network
-          and isinstance(value, variables.Variable)):
-        if value.trainable:
-          # Could already be added via `add_weight`.
-          if value not in self._trainable_weights:
-            self._trainable_weights.append(value)
-        else:
-          if value not in self._non_trainable_weights:
-            self._non_trainable_weights.append(value)
+    # Keep track of checkpointable objects,
+    # for the needs of `self.save/save_weights`.
+    value = data_structures.sticky_attribute_assignment(
+        checkpointable=self, value=value, name=name)
+    super(Network, self).__setattr__(name, value)
 
-    # Keeping track of metric instance created in subclassed model/layer.
+    # Keep track of metric instance created in subclassed model/layer.
     # We do this so that we can maintain the correct order of metrics by adding
     # the instance to the `metrics` list as soon as it is created.
     from tensorflow.python.keras import metrics as metrics_module  # pylint: disable=g-import-not-at-top
     if isinstance(value, metrics_module.Metric):
       self._metrics.append(value)
-    super(Network, self).__setattr__(name, value)
 
   @property
   def stateful(self):
@@ -1074,6 +999,8 @@ class Network(base_layer.Layer):
               else:
                 if context.executing_eagerly():
                   output_tensors = layer(computed_tensor, **kwargs)
+                elif layer.dynamic:
+                  output_tensors = layer._symbolic_call(computed_tensor)  # pylint: disable=protected-call
                 else:
                   output_tensors = layer.call(computed_tensor, **kwargs)
                 if hasattr(layer, 'compute_mask'):
@@ -1098,6 +1025,8 @@ class Network(base_layer.Layer):
               else:
                 if context.executing_eagerly():
                   output_tensors = layer(computed_tensors, **kwargs)
+                elif layer.dynamic:
+                  output_tensors = layer._symbolic_call(computed_tensors)  # pylint: disable=protected-call
                 else:
                   output_tensors = layer.call(computed_tensors, **kwargs)
                 if hasattr(layer, 'compute_mask'):
@@ -1692,6 +1621,62 @@ class Network(base_layer.Layer):
                               positions=positions,
                               print_fn=print_fn)
 
+  def _validate_graph_inputs_and_outputs(self):
+    """Validates the inputs and outputs of a Graph Network."""
+    # Check for redundancy in inputs.
+    if len(set(self.inputs)) != len(self.inputs):
+      raise ValueError('The list of inputs passed to the model '
+                       'is redundant. '
+                       'All inputs should only appear once.'
+                       ' Found: ' + str(self.inputs))
+
+    for x in self.inputs:
+      # Check that x has appropriate `_keras_history` metadata.
+      if not hasattr(x, '_keras_history'):
+        cls_name = self.__class__.__name__
+        raise ValueError('Input tensors to a ' + cls_name + ' ' +
+                         'must come from `tf.keras.Input`. '
+                         'Received: ' + str(x) +
+                         ' (missing previous layer metadata).')
+      # Check that x is an input tensor.
+      # pylint: disable=protected-access
+      layer, _, _ = x._keras_history
+      if len(layer._inbound_nodes) > 1 or (
+          layer._inbound_nodes and layer._inbound_nodes[0].inbound_layers):
+        cls_name = self.__class__.__name__
+        logging.warning(cls_name + ' inputs must come from '
+                        '`tf.keras.Input` (thus holding past layer metadata), '
+                        'they cannot be the output of '
+                        'a previous non-Input layer. '
+                        'Here, a tensor specified as '
+                        'input to "' + self.name + '" was not an Input tensor, '
+                        'it was generated by layer ' + layer.name + '.\n'
+                        'Note that input tensors are '
+                        'instantiated via `tensor = tf.keras.Input(shape)`.\n'
+                        'The tensor that caused the issue was: ' + str(x.name))
+
+    # Check compatibility of batch sizes of Input Layers.
+    input_batch_sizes = [
+        training_utils.get_static_batch_size(x._keras_history[0])
+        for x in self.inputs
+    ]
+    consistent_batch_size = None
+    for batch_size in input_batch_sizes:
+      if batch_size is not None:
+        if (consistent_batch_size is not None and
+            batch_size != consistent_batch_size):
+          raise ValueError('The specified batch sizes of the Input Layers'
+                           ' are incompatible. Found batch sizes: {}'.format(
+                               input_batch_sizes))
+        consistent_batch_size = batch_size
+
+    for x in self.outputs:
+      if not hasattr(x, '_keras_history'):
+        cls_name = self.__class__.__name__
+        raise ValueError('Output tensors to a ' + cls_name + ' must be '
+                         'the output of a TensorFlow `Layer` '
+                         '(thus holding past layer metadata). Found: ' + str(x))
+
 
 def _is_hdf5_filepath(filepath):
   return (filepath.endswith('.h5') or filepath.endswith('.keras') or
diff --git a/tensorflow/python/keras/engine/saving_test.py b/tensorflow/python/keras/engine/saving_test.py
index 6d9d9a2fcae53ffacf9297b1fbfa4ad2155a8aa8..92fac6f24285017422d4daa5d1524d6787227bba 100644
--- a/tensorflow/python/keras/engine/saving_test.py
+++ b/tensorflow/python/keras/engine/saving_test.py
@@ -30,6 +30,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
+from tensorflow.python.keras import optimizers
 from tensorflow.python.keras.engine import saving
 from tensorflow.python.keras.engine import training
 from tensorflow.python.lib.io import file_io
@@ -332,7 +333,7 @@ class TestWeightSavingAndLoading(test.TestCase, parameterized.TestCase):
 
 class TestWholeModelSaving(test.TestCase):
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only('b/120994067')
   def test_sequential_model_saving(self):
     if h5py is None:
       self.skipTest('h5py required to run this test')
@@ -383,7 +384,10 @@ class TestWholeModelSaving(test.TestCase):
 
       out = model.predict(x)
       out2 = new_model.predict(x)
-      self.assertAllClose(out, out2, atol=1e-05)
+
+      # TODO(b/120930751) This tolerance should be 1e-05,
+      # very concerning that its not.
+      self.assertAllClose(out, out2, atol=1e-03)
 
   @test_util.run_deprecated_v1
   def test_sequential_model_saving_without_input_shape(self):
@@ -635,8 +639,8 @@ class TestWholeModelSaving(test.TestCase):
       os.close(fd)
       os.remove(fname)
 
-  @test_util.run_deprecated_v1
   def test_saving_model_with_long_weights_names(self):
+    self.skipTest('b/120921503')
     if h5py is None:
       self.skipTest('h5py required to run this test')
 
@@ -756,14 +760,13 @@ class SubclassedModel(training.Model):
 
 class TestWeightSavingAndLoadingTFFormat(test.TestCase):
 
-  @test_util.run_deprecated_v1
   def test_keras_optimizer_warning(self):
     graph = ops.Graph()
     with graph.as_default(), self.session(graph):
       model = keras.models.Sequential()
       model.add(keras.layers.Dense(2, input_shape=(3,)))
       model.add(keras.layers.Dense(3))
-      model.compile(loss='mse', optimizer='adam', metrics=['acc'])
+      model.compile(loss='mse', optimizer=optimizers.Adam(), metrics=['acc'])
       model._make_train_function()
       temp_dir = self.get_temp_dir()
       prefix = os.path.join(temp_dir, 'ckpt')
diff --git a/tensorflow/python/keras/engine/sequential.py b/tensorflow/python/keras/engine/sequential.py
index 6e2f76cd80fce76958999f8ed4f6bf5ed950b289..5a42afe847b50de71a7946183598d0b81d07fd56 100644
--- a/tensorflow/python/keras/engine/sequential.py
+++ b/tensorflow/python/keras/engine/sequential.py
@@ -25,6 +25,7 @@ from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.keras import layers as layer_module
 from tensorflow.python.keras.engine import base_layer
+from tensorflow.python.keras.engine import training_utils
 from tensorflow.python.keras.engine.input_layer import Input
 from tensorflow.python.keras.engine.input_layer import InputLayer
 from tensorflow.python.keras.engine.network import Network
@@ -120,8 +121,8 @@ class Sequential(Model):
     return layers[:]
 
   @property
-  def _static_graph_friendly(self):
-    return all(layer._static_graph_friendly for layer in self.layers)
+  def dynamic(self):
+    return any(layer.dynamic for layer in self.layers)
 
   @checkpointable.no_automatic_dependency_tracking
   def add(self, layer):
@@ -150,7 +151,7 @@ class Sequential(Model):
         assert len(layer._inbound_nodes[-1].output_tensors) == 1
         set_inputs = True
       else:
-        batch_shape, dtype = get_input_shape_and_dtype(layer)
+        batch_shape, dtype = training_utils.get_input_shape_and_dtype(layer)
         if batch_shape:
           # Instantiate an input layer.
           x = Input(
@@ -252,7 +253,12 @@ class Sequential(Model):
           with ops.name_scope(layer._name_scope()):
             layer._maybe_build(x)
           layer.built = True
-        x = layer.call(x, **kwargs)
+        if context.executing_eagerly():
+          x = layer(x, **kwargs)
+        elif layer.dynamic:
+          x = layer._symbolic_call(x)
+        else:
+          x = layer.call(x, **kwargs)
         if layer.supports_masking:
           mask = layer.compute_mask(x, mask)
         else:
@@ -359,38 +365,3 @@ class Sequential(Model):
     if self.layers and hasattr(self.layers[0], 'input_spec'):
       return self.layers[0].input_spec
     return None
-
-
-def get_input_shape_and_dtype(layer):
-  """Retrieve input shape and input dtype of layer if applicable.
-
-  Args:
-    layer: Layer (or model) instance.
-
-  Returns:
-    Tuple (input_shape, input_dtype). Both could be None if the layer
-      does not have a defined input shape.
-
-  Raises:
-    ValueError: in case an empty Sequential or Graph Network is passed.
-  """
-  if ((isinstance(layer, Model) and layer._is_graph_network)
-      or isinstance(layer, Sequential)):
-    # We were passed a model as first layer.
-    # This requires a specific way to figure out the
-    # input shape and dtype.
-    if not layer.layers:
-      raise ValueError('Cannot add an empty model '
-                       'to a `Sequential` model.')
-    # In case of nested models: recover the first layer
-    # of the deepest model to infer input shape and dtype.
-    layer = layer.layers[0]
-    while ((isinstance(layer, Model) and layer._is_graph_network)
-           or isinstance(layer, Sequential)):
-      layer = layer.layers[0]
-
-  if hasattr(layer, '_batch_input_shape'):
-    batch_shape = layer._batch_input_shape
-    dtype = layer.dtype
-    return batch_shape, dtype
-  return None, None
diff --git a/tensorflow/python/keras/engine/sequential_test.py b/tensorflow/python/keras/engine/sequential_test.py
index fbf893c663375ef747fe167812e757ba651825ef..30a41e39b714534260e46cc7d9f446f42b29b929 100644
--- a/tensorflow/python/keras/engine/sequential_test.py
+++ b/tensorflow/python/keras/engine/sequential_test.py
@@ -26,17 +26,18 @@ from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import context
 from tensorflow.python.eager import function
 from tensorflow.python.framework import test_util as tf_test_util
+from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 from tensorflow.python.training import rmsprop
 
 
-class TestSequential(test.TestCase, parameterized.TestCase):
+class TestSequential(keras_parameterized.TestCase):
   """Most Sequential model API tests are covered in `training_test.py`.
   """
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_basic_methods(self):
     model = keras.models.Sequential()
     model.add(keras.layers.Dense(1, input_dim=2))
@@ -47,7 +48,7 @@ class TestSequential(test.TestCase, parameterized.TestCase):
     self.assertEqual(len(model.weights), 2 * 2)
     self.assertEqual(model.get_layer(name='dp').name, 'dp')
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_sequential_pop(self):
     num_hidden = 5
     input_dim = 3
@@ -56,14 +57,16 @@ class TestSequential(test.TestCase, parameterized.TestCase):
 
     model = testing_utils.get_small_sequential_mlp(
         num_hidden, num_classes, input_dim)
-    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3))
+    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3),
+                  run_eagerly=testing_utils.should_run_eagerly())
     x = np.random.random((batch_size, input_dim))
     y = np.random.random((batch_size, num_classes))
     model.fit(x, y, epochs=1)
     model.pop()
     self.assertEqual(len(model.layers), 1)
     self.assertEqual(model.output_shape, (None, num_hidden))
-    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3))
+    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3),
+                  run_eagerly=testing_utils.should_run_eagerly())
     y = np.random.random((batch_size, num_hidden))
     model.fit(x, y, epochs=1)
 
@@ -79,7 +82,7 @@ class TestSequential(test.TestCase, parameterized.TestCase):
     with self.assertRaises(TypeError):
       model.pop()
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_sequential_deferred_build_with_np_arrays(self):
     num_hidden = 5
     input_dim = 3
@@ -90,7 +93,8 @@ class TestSequential(test.TestCase, parameterized.TestCase):
     model.compile(
         loss='mse',
         optimizer=rmsprop.RMSPropOptimizer(1e-3),
-        metrics=[keras.metrics.CategoricalAccuracy()])
+        metrics=[keras.metrics.CategoricalAccuracy()],
+        run_eagerly=testing_utils.should_run_eagerly())
     self.assertEqual(len(model.layers), 2)
     self.assertEqual(len(model.weights), 0)
     self.assertFalse(model.built)
@@ -102,7 +106,7 @@ class TestSequential(test.TestCase, parameterized.TestCase):
     self.assertFalse(model._is_graph_network)
     self.assertEqual(len(model.weights), 2 * 2)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_sequential_deferred_build_with_dataset_iterators(self):
     num_hidden = 5
     input_dim = 3
@@ -114,7 +118,8 @@ class TestSequential(test.TestCase, parameterized.TestCase):
     model.compile(
         loss='mse',
         optimizer=rmsprop.RMSPropOptimizer(1e-3),
-        metrics=[keras.metrics.CategoricalAccuracy()])
+        metrics=[keras.metrics.CategoricalAccuracy()],
+        run_eagerly=testing_utils.should_run_eagerly())
     self.assertEqual(len(model.layers), 2)
     self.assertEqual(len(model.weights), 0)
     self.assertFalse(model.built)
@@ -124,13 +129,14 @@ class TestSequential(test.TestCase, parameterized.TestCase):
     dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
     dataset = dataset.repeat(100)
     dataset = dataset.batch(10)
-    iterator = dataset.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
 
     model.fit(iterator, epochs=1, steps_per_epoch=steps_per_epoch)
     self.assertTrue(model.built)
     self.assertEqual(len(model.weights), 2 * 2)
     self.assertFalse(model._is_graph_network)
 
+  # TODO(kaftan) This test fails w/ run_with_all_keras_modes. File ticket
   @parameterized.parameters((True,), (False,))
   @tf_test_util.run_deprecated_v1
   def test_training_and_eval_methods_on_symbolic_tensors(self, deferred):
@@ -175,7 +181,7 @@ class TestSequential(test.TestCase, parameterized.TestCase):
           validation_data=(inputs, targets),
           validation_steps=2)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_invalid_use_cases(self):
     # Added objects must be layer instances
     with self.assertRaises(TypeError):
@@ -199,7 +205,7 @@ class TestSequential(test.TestCase, parameterized.TestCase):
       model.add(keras.layers.Dense(1, input_dim=1))
       model.add(MyLayer())
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_nested_sequential_trainability(self):
     input_dim = 20
     num_units = 10
@@ -220,7 +226,6 @@ class TestSequential(test.TestCase, parameterized.TestCase):
     inner_model.trainable = True
     self.assertEqual(len(model.trainable_weights), 4)
 
-  @tf_test_util.run_deprecated_v1
   def test_sequential_update_disabling(self):
     val_a = np.random.random((10, 4))
     val_out = np.random.random((10, 4))
@@ -249,7 +254,7 @@ class TestSequential(test.TestCase, parameterized.TestCase):
       x2 = model.predict(val_a)
       assert np.abs(np.sum(x1 - x2)) > 1e-5
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_sequential_deferred_build_serialization(self):
     num_hidden = 5
     input_dim = 3
@@ -260,7 +265,8 @@ class TestSequential(test.TestCase, parameterized.TestCase):
     model.compile(
         loss='mse',
         optimizer=rmsprop.RMSPropOptimizer(1e-3),
-        metrics=[keras.metrics.CategoricalAccuracy()])
+        metrics=[keras.metrics.CategoricalAccuracy()],
+        run_eagerly=testing_utils.should_run_eagerly())
     self.assertFalse(model.built)
 
     x = np.random.random((batch_size, input_dim))
@@ -275,13 +281,13 @@ class TestSequential(test.TestCase, parameterized.TestCase):
     self.assertEqual(len(new_model.layers), 2)
     self.assertEqual(len(new_model.weights), 4)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_sequential_shape_inference_deferred(self):
     model = testing_utils.get_small_sequential_mlp(4, 5)
     output_shape = model.compute_output_shape((None, 7))
     self.assertEqual(tuple(output_shape.as_list()), (None, 5))
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_sequential_build_deferred(self):
     model = testing_utils.get_small_sequential_mlp(4, 5)
 
@@ -298,18 +304,19 @@ class TestSequential(test.TestCase, parameterized.TestCase):
     self.assertTrue(model.built)
     self.assertEqual(len(model.weights), 8)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_sequential_nesting(self):
     model = testing_utils.get_small_sequential_mlp(4, 3)
     inner_model = testing_utils.get_small_sequential_mlp(4, 5)
     model.add(inner_model)
 
-    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3))
+    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3),
+                  run_eagerly=testing_utils.should_run_eagerly())
     x = np.random.random((2, 6))
     y = np.random.random((2, 5))
     model.fit(x, y, epochs=1)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_variable_names(self):
     model = keras.models.Sequential([keras.layers.Dense(3)])
     model.add(keras.layers.Dense(2))
@@ -319,7 +326,7 @@ class TestSequential(test.TestCase, parameterized.TestCase):
          'sequential/dense_1/kernel:0', 'sequential/dense_1/bias:0'],
         [v.name for v in model.variables])
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_input_assumptions_propagation(self):
     model = keras.models.Sequential()
     model.add(keras.layers.Dense(1))
@@ -329,9 +336,9 @@ class TestSequential(test.TestCase, parameterized.TestCase):
         model(1.0)
 
 
-class TestSequentialEagerIntegration(test.TestCase):
+class TestSequentialEagerIntegration(keras_parameterized.TestCase):
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_defun_on_call(self):
     # Check that one can subclass Sequential and place the `call` in a `defun`.
 
@@ -345,17 +352,19 @@ class TestSequentialEagerIntegration(test.TestCase):
     model.add(keras.layers.Dense(4, activation='relu'))
     model.add(keras.layers.Dense(5, activation='softmax'))
 
-    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3))
+    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3),
+                  run_eagerly=testing_utils.should_run_eagerly())
 
     x = np.random.random((2, 6))
     y = np.random.random((2, 5))
     model.fit(x, y, epochs=1)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_build_before_fit(self):
     # Fix for b/112433577
     model = testing_utils.get_small_sequential_mlp(4, 5)
-    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3))
+    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3),
+                  run_eagerly=testing_utils.should_run_eagerly())
 
     model.build((None, 6))
 
@@ -363,7 +372,7 @@ class TestSequentialEagerIntegration(test.TestCase):
     y = np.random.random((2, 5))
     model.fit(x, y, epochs=1)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_sequential_model_fails_with_dict_inputs(self):
     num_classes = 5
     model = testing_utils.get_small_sequential_mlp(
@@ -372,7 +381,8 @@ class TestSequentialEagerIntegration(test.TestCase):
         rmsprop.RMSPropOptimizer(learning_rate=0.001),
         metrics=['acc'],
         weighted_metrics=['mae'],
-        loss='categorical_crossentropy')
+        loss='categorical_crossentropy',
+        run_eagerly=testing_utils.should_run_eagerly())
 
     x = {'dense_input': np.random.random((10, 1))}
     y = np.random.randint(num_classes, size=(10, 1))
diff --git a/tensorflow/python/keras/engine/topology_test.py b/tensorflow/python/keras/engine/topology_test.py
index 03bfd35589cedbccbd30b25218d529d41c8869ae..cd1f4d16971a52d595ff4967a999ab75b04bcebe 100644
--- a/tensorflow/python/keras/engine/topology_test.py
+++ b/tensorflow/python/keras/engine/topology_test.py
@@ -26,8 +26,11 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
+from tensorflow.python.keras import keras_parameterized
+from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.engine import input_layer as input_layer_lib
 from tensorflow.python.keras.engine import network as network_lib
+from tensorflow.python.keras.optimizer_v2 import gradient_descent
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
@@ -40,7 +43,7 @@ except ImportError:
   yaml = None
 
 
-class TopologyConstructionTest(test.TestCase):
+class TopologyConstructionTest(keras_parameterized.TestCase):
 
   @test_util.run_deprecated_v1
   def test_get_updates(self):
@@ -107,6 +110,7 @@ class TopologyConstructionTest(test.TestCase):
     self.assertEqual(len(network.updates), 5)
     self.assertEqual(len(network.get_updates_for(x4)), 2)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_get_updates_bn(self):
     x1 = input_layer_lib.Input(shape=(1,))
     layer = keras.layers.BatchNormalization()
@@ -179,6 +183,7 @@ class TopologyConstructionTest(test.TestCase):
     self.assertEqual(len(network.losses), 5)
     self.assertEqual(len(network.get_losses_for(x4)), 2)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testTopologicalAttributes(self):
     # test layer attributes / methods related to cross-layer connectivity.
     a = input_layer_lib.Input(shape=(32,), name='input_a')
@@ -236,6 +241,7 @@ class TopologyConstructionTest(test.TestCase):
       b_2 = dense(b)
       _ = new_dense.output_shape
 
+  @test_util.run_in_graph_and_eager_modes()
   def testTopologicalAttributesMultiOutputLayer(self):
 
     class PowersLayer(keras.layers.Layer):
@@ -252,6 +258,7 @@ class TopologyConstructionTest(test.TestCase):
     self.assertEqual(test_layer.input_shape, (None, 32))
     self.assertEqual(test_layer.output_shape, [(None, 32), (None, 32)])
 
+  @test_util.run_in_graph_and_eager_modes()
   def testTopologicalAttributesMultiInputLayer(self):
 
     class AddLayer(keras.layers.Layer):
@@ -303,6 +310,7 @@ class TopologyConstructionTest(test.TestCase):
     self.assertEqual(network.non_trainable_weights,
                      dense.trainable_weights + dense.non_trainable_weights)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_trainable_weights(self):
     a = keras.layers.Input(shape=(2,))
     b = keras.layers.Dense(1)(a)
@@ -423,6 +431,7 @@ class TopologyConstructionTest(test.TestCase):
     self.assertEqual(dense.get_output_mask_at(0), None)
     self.assertEqual(dense.get_output_mask_at(1), None)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_multi_input_layer(self):
     with self.cached_session():
       # test multi-input layer
@@ -557,6 +566,7 @@ class TopologyConstructionTest(test.TestCase):
       fn_outputs = fn([input_a_np, input_b_np])
       self.assertListEqual([x.shape for x in fn_outputs], [(10, 7), (10, 64)])
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_multi_input_multi_output_recursion(self):
     with self.cached_session():
       # test multi-input multi-output
@@ -630,6 +640,7 @@ class TopologyConstructionTest(test.TestCase):
         yaml_str = model.to_yaml()
         keras.models.model_from_yaml(yaml_str)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_invalid_graphs(self):
     a = keras.layers.Input(shape=(32,), name='input_a')
     b = keras.layers.Input(shape=(32,), name='input_b')
@@ -719,6 +730,7 @@ class TopologyConstructionTest(test.TestCase):
     x = keras.layers.Input(tensor=x)
     keras.layers.Dense(2)(x)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_basic_masking(self):
     a = keras.layers.Input(shape=(10, 32), name='input_a')
     b = keras.layers.Masking()(a)
@@ -785,121 +797,128 @@ class TopologyConstructionTest(test.TestCase):
     loss = model_b.evaluate(x)
     self.assertEqual(loss, 4.)
 
+  @keras_parameterized.run_all_keras_modes
   def test_layer_sharing_at_heterogenous_depth(self):
-    with self.cached_session():
-      x_val = np.random.random((10, 5))
+    x_val = np.random.random((10, 5))
 
-      x = input_layer_lib.Input(shape=(5,))
-      a = keras.layers.Dense(5, name='A')
-      b = keras.layers.Dense(5, name='B')
-      output = a(b(a(b(x))))
-      m = keras.models.Model(x, output)
+    x = input_layer_lib.Input(shape=(5,))
+    a = keras.layers.Dense(5, name='A')
+    b = keras.layers.Dense(5, name='B')
+    output = a(b(a(b(x))))
+    m = keras.models.Model(x, output)
+    m.run_eagerly = testing_utils.should_run_eagerly()
 
-      output_val = m.predict(x_val)
+    output_val = m.predict(x_val)
 
-      config = m.get_config()
-      weights = m.get_weights()
+    config = m.get_config()
+    weights = m.get_weights()
 
-      m2 = keras.models.Model.from_config(config)
-      m2.set_weights(weights)
+    m2 = keras.models.Model.from_config(config)
+    m2.set_weights(weights)
 
-      output_val_2 = m2.predict(x_val)
-      self.assertAllClose(output_val, output_val_2, atol=1e-6)
+    output_val_2 = m2.predict(x_val)
+    self.assertAllClose(output_val, output_val_2, atol=1e-6)
 
+  @keras_parameterized.run_all_keras_modes
   def test_layer_sharing_at_heterogenous_depth_with_concat(self):
-    with self.cached_session():
-      input_shape = (16, 9, 3)
-      input_layer = input_layer_lib.Input(shape=input_shape)
+    input_shape = (16, 9, 3)
+    input_layer = input_layer_lib.Input(shape=input_shape)
 
-      a = keras.layers.Dense(3, name='dense_A')
-      b = keras.layers.Dense(3, name='dense_B')
-      c = keras.layers.Dense(3, name='dense_C')
+    a = keras.layers.Dense(3, name='dense_A')
+    b = keras.layers.Dense(3, name='dense_B')
+    c = keras.layers.Dense(3, name='dense_C')
 
-      x1 = b(a(input_layer))
-      x2 = a(c(input_layer))
-      output = keras.layers.concatenate([x1, x2])
+    x1 = b(a(input_layer))
+    x2 = a(c(input_layer))
+    output = keras.layers.concatenate([x1, x2])
 
-      m = keras.models.Model(inputs=input_layer, outputs=output)
+    m = keras.models.Model(inputs=input_layer, outputs=output)
+    m.run_eagerly = testing_utils.should_run_eagerly()
 
-      x_val = np.random.random((10, 16, 9, 3))
-      output_val = m.predict(x_val)
+    x_val = np.random.random((10, 16, 9, 3))
+    output_val = m.predict(x_val)
 
-      config = m.get_config()
-      weights = m.get_weights()
+    config = m.get_config()
+    weights = m.get_weights()
 
-      m2 = keras.models.Model.from_config(config)
-      m2.set_weights(weights)
+    m2 = keras.models.Model.from_config(config)
+    m2.set_weights(weights)
 
-      output_val_2 = m2.predict(x_val)
-      self.assertAllClose(output_val, output_val_2, atol=1e-6)
+    output_val_2 = m2.predict(x_val)
+    self.assertAllClose(output_val, output_val_2, atol=1e-6)
 
-  @test_util.run_deprecated_v1
+  @keras_parameterized.run_all_keras_modes
   def test_explicit_training_argument(self):
-    with self.cached_session():
-      a = keras.layers.Input(shape=(2,))
-      b = keras.layers.Dropout(0.5)(a)
-      base_model = keras.models.Model(a, b)
-
-      a = keras.layers.Input(shape=(2,))
-      b = base_model(a, training=False)
-      model = keras.models.Model(a, b)
-
-      x = np.ones((100, 2))
-      y = np.ones((100, 2))
-      model.compile(optimizer='sgd', loss='mse')
-      loss = model.train_on_batch(x, y)
-      self.assertEqual(loss, 0)  # In inference mode, output is equal to input.
-
-      a = keras.layers.Input(shape=(2,))
-      b = base_model(a, training=True)
-      model = keras.models.Model(a, b)
-      preds = model.predict(x)
-      self.assertEqual(np.min(preds), 0.)  # At least one unit was dropped.
+    a = keras.layers.Input(shape=(2,))
+    b = keras.layers.Dropout(0.5)(a)
+    base_model = keras.models.Model(a, b)
 
-  def test_multi_output_model_with_none_masking(self):
+    a = keras.layers.Input(shape=(2,))
+    b = base_model(a, training=False)
+    model = keras.models.Model(a, b)
 
-    with self.cached_session():
+    x = np.ones((100, 2))
+    y = np.ones((100, 2))
+    model.compile(
+        optimizer=gradient_descent.SGD(),
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
+    loss = model.train_on_batch(x, y)
+    self.assertEqual(loss, 0)  # In inference mode, output is equal to input.
+
+    a = keras.layers.Input(shape=(2,))
+    b = base_model(a, training=True)
+    model = keras.models.Model(a, b)
+    preds = model.predict(x)
+    self.assertEqual(np.min(preds), 0.)  # At least one unit was dropped.
 
-      def func(x):
-        return [x * 0.2, x * 0.3]
+  @keras_parameterized.run_all_keras_modes
+  def test_multi_output_model_with_none_masking(self):
+    def func(x):
+      return [x * 0.2, x * 0.3]
 
-      def output_shape(input_shape):
-        return [input_shape, input_shape]
+    def output_shape(input_shape):
+      return [input_shape, input_shape]
 
-      i = keras.layers.Input(shape=(3, 2, 1))
-      o = keras.layers.Lambda(function=func, output_shape=output_shape)(i)
+    i = keras.layers.Input(shape=(3, 2, 1))
+    o = keras.layers.Lambda(function=func, output_shape=output_shape)(i)
 
-      self.assertEqual(keras.backend.int_shape(o[0]), (None, 3, 2, 1))
-      self.assertEqual(keras.backend.int_shape(o[1]), (None, 3, 2, 1))
+    self.assertEqual(keras.backend.int_shape(o[0]), (None, 3, 2, 1))
+    self.assertEqual(keras.backend.int_shape(o[1]), (None, 3, 2, 1))
 
-      o = keras.layers.add(o)
-      model = keras.Model(i, o)
+    o = keras.layers.add(o)
+    model = keras.Model(i, o)
+    model.run_eagerly = testing_utils.should_run_eagerly()
 
-      i2 = keras.layers.Input(shape=(3, 2, 1))
-      o2 = model(i2)
-      model2 = keras.Model(i2, o2)
+    i2 = keras.layers.Input(shape=(3, 2, 1))
+    o2 = model(i2)
+    model2 = keras.Model(i2, o2)
+    model2.run_eagerly = testing_utils.should_run_eagerly()
 
-      x = np.random.random((4, 3, 2, 1))
-      out = model2.predict(x)
-      assert out.shape == (4, 3, 2, 1)
-      self.assertAllClose(out, x * 0.2 + x * 0.3, atol=1e-4)
+    x = np.random.random((4, 3, 2, 1))
+    out = model2.predict(x)
+    assert out.shape == (4, 3, 2, 1)
+    self.assertAllClose(out, x * 0.2 + x * 0.3, atol=1e-4)
 
+  @keras_parameterized.run_all_keras_modes
   def test_constant_initializer_with_numpy(self):
+    initializer = keras.initializers.Constant(np.ones((3, 2)))
+    model = keras.models.Sequential()
+    model.add(
+        keras.layers.Dense(2, input_shape=(3,), kernel_initializer=initializer))
+    model.add(keras.layers.Dense(3))
+    model.compile(
+        loss='mse',
+        optimizer=gradient_descent.SGD(),
+        metrics=['acc'],
+        run_eagerly=testing_utils.should_run_eagerly())
 
-    with self.cached_session():
-      initializer = keras.initializers.Constant(np.ones((3, 2)))
-      model = keras.models.Sequential()
-      model.add(keras.layers.Dense(2, input_shape=(3,),
-                                   kernel_initializer=initializer))
-      model.add(keras.layers.Dense(3))
-      model.compile(loss='mse', optimizer='sgd', metrics=['acc'])
-
-      json_str = model.to_json()
-      keras.models.model_from_json(json_str)
+    json_str = model.to_json()
+    keras.models.model_from_json(json_str)
 
-      if yaml is not None:
-        yaml_str = model.to_yaml()
-        keras.models.model_from_yaml(yaml_str)
+    if yaml is not None:
+      yaml_str = model.to_yaml()
+      keras.models.model_from_yaml(yaml_str)
 
 
 class DeferredModeTest(test.TestCase):
@@ -928,7 +947,7 @@ class DeferredModeTest(test.TestCase):
       self.assertEqual(outputs.shape.as_list(), [10, 4])
 
   @test_util.run_in_graph_and_eager_modes()
-  def testMultiIONetworkbuilding(self):
+  def testMultiIONetworkBuilding(self):
     input_a = input_layer_lib.Input(shape=(32,))
     input_b = input_layer_lib.Input(shape=(16,))
     a = keras.layers.Dense(16)(input_a)
@@ -953,7 +972,7 @@ class DeferredModeTest(test.TestCase):
       self.assertEqual(outputs[1].shape.as_list(), [10, 2])
 
 
-class DefaultShapeInferenceBehaviorTest(test.TestCase):
+class DefaultShapeInferenceBehaviorTest(keras_parameterized.TestCase):
 
   def _testShapeInference(self, model, input_shape, expected_output_shape):
     input_value = np.random.random(input_shape)
@@ -1121,7 +1140,7 @@ class DefaultShapeInferenceBehaviorTest(test.TestCase):
     output = model(sample_input)
     self.assertEqual(output.shape, (1, 3))
 
-  @test_util.run_in_graph_and_eager_modes()
+  @keras_parameterized.run_all_keras_modes
   def test_sequential_as_downstream_of_masking_layer(self):
     inputs = keras.layers.Input(shape=(3, 4))
     x = keras.layers.Masking(mask_value=0., input_shape=(3, 4))(inputs)
@@ -1131,7 +1150,10 @@ class DefaultShapeInferenceBehaviorTest(test.TestCase):
 
     x = keras.layers.wrappers.TimeDistributed(s)(x)
     model = keras.Model(inputs=inputs, outputs=x)
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(1e-3), loss='mse')
+    model.compile(
+        optimizer=rmsprop.RMSPropOptimizer(1e-3),
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
 
     model_input = np.random.randint(
         low=1, high=5, size=(10, 3, 4)).astype('float32')
diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py
index a66d8866bd37f30735736659700c38e10ec40f2d..320e76162e9bfe4fa3f176501d1fec43d06134d2 100644
--- a/tensorflow/python/keras/engine/training.py
+++ b/tensorflow/python/keras/engine/training.py
@@ -27,6 +27,7 @@ from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.eager import context
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import losses
@@ -39,6 +40,7 @@ from tensorflow.python.keras.engine import training_eager
 from tensorflow.python.keras.engine import training_generator
 from tensorflow.python.keras.engine import training_utils
 from tensorflow.python.keras.engine.network import Network
+from tensorflow.python.keras.optimizer_v2 import optimizer_v2
 from tensorflow.python.keras.utils import data_utils
 from tensorflow.python.keras.utils.generic_utils import slice_arrays
 from tensorflow.python.keras.utils.losses_utils import squeeze_or_expand_dimensions
@@ -127,272 +129,413 @@ class Model(Network):
 
     self.run_eagerly = None
 
-  def _set_sample_weight_attributes(self, sample_weight_mode,
-                                    skip_target_weighing_indices):
-    """Sets sample weight related attributes on the model."""
-    sample_weights, sample_weight_modes = training_utils.prepare_sample_weights(
-        self.output_names, sample_weight_mode, skip_target_weighing_indices)
-    self.sample_weights = sample_weights
-    self.sample_weight_modes = sample_weight_modes
-    self._feed_sample_weight_modes = [
-        sample_weight_modes[i]
-        for i in range(len(self.outputs))
-        if i not in skip_target_weighing_indices
-    ]
-    self._feed_sample_weights = [
-        sample_weights[i]
-        for i in range(len(sample_weights))
-        if i not in skip_target_weighing_indices
-    ]
-
-  def _cache_output_metric_attributes(self, metrics, weighted_metrics):
-    """Caches metric name and function attributes for every model output."""
-    output_shapes = [
-        None if output is None else output.get_shape().as_list()
-        for output in self.outputs
-    ]
-    self._per_output_metrics = training_utils.collect_per_output_metric_info(
-        metrics, self.output_names, output_shapes, self.loss_functions)
-    self._per_output_weighted_metrics = \
-        training_utils.collect_per_output_metric_info(
-            weighted_metrics, self.output_names, output_shapes,
-            self.loss_functions, self.sample_weights)
-
-  def _add_unique_metric_name(self, metric_name, output_index):
-    """Makes the metric name unique and adds it to the model's metric name list.
-
-      If there are multiple outputs for which the metrics are calculated, the
-      metric names have to be made unique by appending an integer.
+  @checkpointable.no_automatic_dependency_tracking
+  def compile(self,
+              optimizer,
+              loss=None,
+              metrics=None,
+              loss_weights=None,
+              sample_weight_mode=None,
+              weighted_metrics=None,
+              target_tensors=None,
+              distribute=None,
+              **kwargs):
+    """Configures the model for training.
 
     Arguments:
-      metric_name: Metric name that corresponds to the metric specified by the
-          user. For example: 'acc'.
-      output_index: The index of the model output for which the metric name is
-        being added.
+        optimizer: String (name of optimizer) or optimizer instance.
+            See `tf.keras.optimizers`.
+        loss: String (name of objective function) or objective function.
+            See `tf.losses`. If the model has multiple outputs, you can use a
+            different loss on each output by passing a dictionary or a list of
+            losses. The loss value that will be minimized by the model
+            will then be the sum of all individual losses.
+        metrics: List of metrics to be evaluated by the model
+            during training and testing.
+            Typically you will use `metrics=['accuracy']`.
+            To specify different metrics for different outputs of a
+            multi-output model, you could also pass a dictionary,
+            such as `metrics={'output_a': 'accuracy'}`.
+        loss_weights: Optional list or dictionary specifying scalar
+            coefficients (Python floats) to weight the loss contributions
+            of different model outputs.
+            The loss value that will be minimized by the model
+            will then be the *weighted sum* of all individual losses,
+            weighted by the `loss_weights` coefficients.
+            If a list, it is expected to have a 1:1 mapping
+            to the model's outputs. If a tensor, it is expected to map
+            output names (strings) to scalar coefficients.
+        sample_weight_mode: If you need to do timestep-wise
+            sample weighting (2D weights), set this to `"temporal"`.
+            `None` defaults to sample-wise weights (1D).
+            If the model has multiple outputs, you can use a different
+            `sample_weight_mode` on each output by passing a
+            dictionary or a list of modes.
+        weighted_metrics: List of metrics to be evaluated and weighted
+            by sample_weight or class_weight during training and testing.
+        target_tensors: By default, Keras will create placeholders for the
+            model's target, which will be fed with the target data during
+            training. If instead you would like to use your own
+            target tensors (in turn, Keras will not expect external
+            Numpy data for these targets at training time), you
+            can specify them via the `target_tensors` argument. It can be
+            a single tensor (for a single-output model), a list of tensors,
+            or a dict mapping output names to target tensors.
+        distribute: The DistributionStrategy instance that we want to use to
+            distribute the training of the model.
+        **kwargs: These arguments are passed to `tf.Session.run`.
 
-    Returns:
-      string, name of the model's unique metric name
+    Raises:
+        ValueError: In case of invalid arguments for
+            `optimizer`, `loss`, `metrics` or `sample_weight_mode`.
     """
-    if len(self.output_names) > 1:
-      metric_name = '%s_%s' % (self.output_names[output_index], metric_name)
-    j = 1
-    base_metric_name = metric_name
-    while metric_name in self._compile_metrics_names:
-      metric_name = '%s_%d' % (base_metric_name, j)
-      j += 1
-
-    return metric_name
-
-  @property
-  def metrics(self):
-    """Returns the model's metrics added using `compile`, `add_metric` APIs."""
-    metrics = []
-    if self._is_compiled:
-      metrics += self._compile_stateful_metric_functions
-    return metrics + super(Model, self).metrics
+    run_eagerly = kwargs.pop('run_eagerly', None)
+    self._run_eagerly = run_eagerly
+    optimizer = optimizers.get(optimizer)
 
-  @property
-  def metrics_names(self):
-    """Returns the model's display labels for all outputs."""
-    metrics_names = []
-    if self._is_compiled:
-      metrics_names += self._compile_metrics_names  # Includes names of losses.
+    # Validate that arguments passed by the user to `compile` are supported by
+    # DistributionStrategy.
+    if distribute:
+      if not isinstance(optimizer,
+                        (tf_optimizer_module.Optimizer, optimizers.TFOptimizer,
+                         optimizer_v2.OptimizerV2)):
+        raise NotImplementedError(
+            'optimizer must be an instance of '
+            'tf.train.Optimizer, not a %s' % type(optimizer))
+      if sample_weight_mode:
+        raise NotImplementedError('sample_weight_mode is not supported with '
+                                  'DistributionStrategy.')
+      if weighted_metrics:
+        raise NotImplementedError('weighted_metrics is not supported with '
+                                  'DistributionStrategy.')
+      if target_tensors:
+        raise ValueError('target_tensors is not supported with '
+                         'DistributionStrategy.')
 
-    # Add metric names from layers.
-    for layer in self.layers:
-      metrics_names += [m.name for m in layer._metrics]  # pylint: disable=protected-access
-    metrics_names += [m.name for m in self._metrics]
-    return metrics_names
+    loss = loss or {}
+    if self.run_eagerly and not isinstance(
+        optimizer, (tf_optimizer_module.Optimizer, optimizers.TFOptimizer,
+                    optimizer_v2.OptimizerV2)):
+      raise ValueError(
+          'When running a model in eager execution, the optimizer must be an '
+          'instance of tf.train.Optimizer. Received: '
+          '%s' % optimizer)
 
-  @property
-  def _all_metrics_tensors(self):
-    """Returns the network's symbolic metric tensors."""
-    metrics_tensors = {}
-    if self._is_compiled:
-      metrics_tensors.update(self._compile_metrics_tensors)
-    metrics_tensors.update(super(Model, self)._all_metrics_tensors)
-    return metrics_tensors
+    self.optimizer = optimizer
+    # We've disabled automatic dependency tracking for this method, but do want
+    # to add a checkpoint dependency on the optimizer if it's checkpointable.
+    if isinstance(self.optimizer, checkpointable.CheckpointableBase):
+      self._track_checkpointable(
+          self.optimizer, name='optimizer', overwrite=True)
+    self.loss = loss
+    self._compile_metrics = metrics or []
+    self.loss_weights = loss_weights
+    self.sample_weight_mode = sample_weight_mode
+    self._compile_weighted_metrics = weighted_metrics
+    if self.run_eagerly and target_tensors is not None:
+      raise ValueError(
+          'target_tensors argument is not supported when '
+          'running a model eagerly.')
+    self.target_tensors = target_tensors
 
-  @property
-  def _all_stateful_metrics_tensors(self):
-    """Returns the network's symbolic metric tensors."""
-    metrics_tensors = {}
-    if self._is_compiled:
-      metrics_tensors.update(self._compile_stateful_metrics_tensors)
-    metrics_tensors.update(super(Model, self)._all_metrics_tensors)
-    return metrics_tensors
+    # Set DistributionStrategy specific parameters.
+    self._distribution_strategy = distribute
+    # Reset the value of grouped_model
+    self._grouped_model = None
+    if self._distribution_strategy is not None:
+      distributed_training_utils.configure_and_create_session(
+          self._distribution_strategy)
+    # Initialize model metric attributes.
+    self._init_metric_attributes()
+    if not self.built:
+      # Model is not compilable because it does not know its number of inputs
+      # and outputs, nor their shapes and names. We will compile after the first
+      # time the model gets called on training data.
+      return
+    self._is_compiled = True
 
-  def _init_metric_attributes(self):
-    """Initialized model metric attributes."""
-    # List of all metric names in the model.
-    self._compile_metrics_names = ['loss']
-    # List of stateful metric functions. Used for resetting metric state during
-    # training/eval.
-    # This includes loss functions when there are multiple outputs.
-    self._compile_stateful_metric_functions = []
-    # Dict of all aggregated metric result tensors. This includes aggregated
-    # loss result tensors when there are multiple outputs.
-    self._compile_stateful_metrics_tensors = {}
-    # Dict of all metric result tensors (aggregated or not - based on the
-    # values given in compile.). This includes aggregated loss result tensors
-    # when there are multiple outputs.
-    self._compile_metrics_tensors = {}
+    # Prepare loss functions.
+    if isinstance(loss, dict):
+      for name in loss:
+        if name not in self.output_names:
+          raise ValueError(
+              'Unknown entry in loss '
+              'dictionary: "' + name + '". '
+              'Only expected the following keys: ' + str(self.output_names))
+      loss_functions = []
+      for name in self.output_names:
+        if name not in loss:
+          logging.warning(
+              'Output "' + name +
+              '" missing from loss dictionary. We assume '
+              'this was done on purpose. The fit and evaluate APIs will not be '
+              'expecting any data to be passed to "' + name + '".')
+        loss_functions.append(training_utils.get_loss_function(loss.get(name)))
+    elif isinstance(loss, list):
+      if len(loss) != len(self.outputs):
+        raise ValueError('When passing a list as loss, '
+                         'it should have one entry per model outputs. '
+                         'The model has ' + str(len(self.outputs)) +
+                         ' outputs, but you passed loss=' + str(loss))
+      loss_functions = [training_utils.get_loss_function(l) for l in loss]
+    else:
+      loss_function = training_utils.get_loss_function(loss)
+      loss_functions = [loss_function for _ in range(len(self.outputs))]
+    self.loss_functions = loss_functions
 
-  def _set_per_output_metric_attributes(self, metrics_dict, output_index):
-    """Sets the metric attributes on the model for the given output.
+    skip_target_indices = []
+    skip_target_weighing_indices = []
+    self._feed_outputs = []
+    self._feed_output_names = []
+    self._feed_output_shapes = []
+    self._feed_loss_fns = []
+    for i in range(len(loss_functions)):
+      if loss_functions[i] is None:
+        skip_target_indices.append(i)
+        skip_target_weighing_indices.append(i)
 
-    Arguments:
-      metrics_dict: A dict with metric names as keys and metric fns as values.
-      output_index: The index of the model output for which the metric
-        attributes are added.
+    # Prepare output masks.
+    if not self.run_eagerly:
+      masks = [getattr(x, '_keras_mask', None) for x in self.outputs]
+      if not isinstance(masks, list):
+        masks = [masks]
 
-    Returns:
-      Metrics dict updated with unique metric names as keys.
-    """
-    updated_metrics_dict = collections.OrderedDict()
-    for metric_name, (metric_fn, stateful_metric_fn) in metrics_dict.items():
-      metric_name = self._add_unique_metric_name(metric_name, output_index)
-      updated_metrics_dict[metric_name] = (metric_fn, stateful_metric_fn)
-      # Keep track of metric name, function and stateful function.
-      self._compile_metrics_names.append(metric_name)
-      self._compile_stateful_metric_functions.append(stateful_metric_fn)
-    return updated_metrics_dict
-
-  def _set_metric_attributes(self, outputs, skip_target_indices=None):
-    """Sets the metric attributes on the model for all the model outputs."""
-    skip_target_indices = skip_target_indices or []
-    updated_per_output_metrics = []
-    updated_per_output_weighted_metrics = []
-    for i in range(len(outputs)):
-      if i in skip_target_indices:
-        updated_per_output_metrics.append(self._per_output_metrics[i])
-        updated_per_output_weighted_metrics.append(
-            self._per_output_weighted_metrics[i])
-        continue
-      updated_per_output_metrics.append(
-          self._set_per_output_metric_attributes(self._per_output_metrics[i],
-                                                 i))
-      updated_per_output_weighted_metrics.append(
-          self._set_per_output_metric_attributes(
-              self._per_output_weighted_metrics[i], i))
-
-    self._per_output_metrics = updated_per_output_metrics
-    self._per_output_weighted_metrics = updated_per_output_weighted_metrics
+    # Prepare loss weights.
+    if loss_weights is None:
+      loss_weights_list = [1. for _ in range(len(self.outputs))]
+    elif isinstance(loss_weights, dict):
+      for name in loss_weights:
+        if name not in self.output_names:
+          raise ValueError(
+              'Unknown entry in loss_weights '
+              'dictionary: "' + name + '". '
+              'Only expected the following keys: ' + str(self.output_names))
+      loss_weights_list = []
+      for name in self.output_names:
+        loss_weights_list.append(loss_weights.get(name, 1.))
+    elif isinstance(loss_weights, list):
+      if len(loss_weights) != len(self.outputs):
+        raise ValueError(
+            'When passing a list as loss_weights, '
+            'it should have one entry per model output. '
+            'The model has ' + str(len(self.outputs)) +
+            ' outputs, but you passed loss_weights=' + str(loss_weights))
+      loss_weights_list = loss_weights
+    else:
+      raise TypeError('Could not interpret loss_weights argument: ' +
+                      str(loss_weights) + ' - expected a list of dicts.')
+    self.loss_weights_list = loss_weights_list
 
-  def _handle_per_output_metrics(self,
-                                 metrics_dict,
-                                 y_true,
-                                 y_pred,
-                                 mask,
-                                 weights=None,
-                                 return_stateful_result=True):
-    """Calls metric functions for a single output.
+    # Initialization for Eager mode execution.
+    if self.run_eagerly:
+      # Prepare sample weights.
+      self._set_sample_weight_attributes(sample_weight_mode,
+                                         skip_target_weighing_indices)
+      # Save all metric attributes per output of the model.
+      self._cache_output_metric_attributes(metrics, weighted_metrics)
 
-    Arguments:
-      metrics_dict: A dict with metric names as keys and metric fns as values.
-      y_true: Target output.
-      y_pred: Predicted output.
-      mask: Computed mask value for the current output.
-      weights: Weights to be applied on the current output.
-      return_stateful_result: Boolean, indicates whether the stateful
-        (aggregated)/stateless metric result should be returned.
+      if target_tensors is not None:
+        raise ValueError('target_tensors are not currently supported in Eager '
+                         'mode.')
+      self.total_loss = None
+      for i in range(len(self.outputs)):
+        if len(self.outputs) > 1:
+          self._compile_metrics_names.append(self.output_names[i] + '_loss')
 
-    Returns:
-      A list of metric result tensors.
-    """
-    metric_results = []
-    for metric_name, (metric_fn, stateful_fn) in metrics_dict.items():
-      with K.name_scope(metric_name):
+      # Set metric attributes on model.
+      self._set_metric_attributes(
+          self.outputs,
+          skip_target_indices=skip_target_indices,
+      )
 
-        def _call_stateful_fn(fn):
-          return training_utils.call_metric_function(
-              fn, y_true, y_pred, weights=weights, mask=mask)
+      self.targets = []
+      for i in range(len(self.outputs)):
+        self._feed_output_names.append(self.output_names[i])
+      self._collected_trainable_weights = self.trainable_weights
+      return
 
-        def _call_stateless_fn(fn):
-          weighted_metric_fn = training_utils.weighted_masked_objective(fn)
-          return weighted_metric_fn(y_true, y_pred, weights=weights, mask=mask)
+    with K.get_graph().as_default():
+      # Prepare targets of model.
+      self.targets = []
+      self._feed_targets = []
+      if target_tensors not in (None, []):
+        if isinstance(target_tensors, list):
+          if len(target_tensors) != len(self.outputs):
+            raise ValueError(
+                'When passing a list as `target_tensors`, '
+                'it should have one entry per model output. '
+                'The model has %s outputs, but you passed target_tensors=%s' %
+                (len(self.outputs), target_tensors))
+        elif isinstance(target_tensors, dict):
+          for name in target_tensors:
+            if name not in self.output_names:
+              raise ValueError(
+                  'Unknown entry in `target_tensors` '
+                  'dictionary: "' + name + '". '
+                  'Only expected the following keys: ' + str(self.output_names))
+          tmp_target_tensors = []
+          for name in self.output_names:
+            tmp_target_tensors.append(target_tensors.get(name, None))
+          target_tensors = tmp_target_tensors
+        elif tensor_util.is_tensor(target_tensors):
+          target_tensors = [target_tensors]
+        else:
+          raise TypeError('Expected `target_tensors` to be a list or tuple or '
+                          'dict or a single tensor, but got:', target_tensors)
 
-        def _track_metric_tensors(name, stateless_result, stateful_result):
-          self._compile_metrics_tensors[name] = stateless_result
-          self._compile_stateful_metrics_tensors[name] = stateful_result
+      for i in range(len(self.outputs)):
+        if i in skip_target_indices:
+          self.targets.append(None)
+        else:
+          shape = K.int_shape(self.outputs[i])
+          name = self.output_names[i]
+          if target_tensors not in (None, []):
+            target = target_tensors[i]
+          else:
+            target = None
+          if target is None or K.is_placeholder(target):
+            if target is None:
+              target_dtype = losses.LABEL_DTYPES_FOR_LOSSES.get(
+                  self.loss_functions[i],
+                  K.dtype(self.outputs[i]))
 
-        if isinstance(metric_fn, metrics_module.Metric):
-          # If the given metric fn is stateful, call the fn and return result.
-          metric_result = _call_stateful_fn(metric_fn)
-          metric_results.append(metric_result)
-          if not self.run_eagerly:
-            _track_metric_tensors(metric_name, metric_result, metric_result)
-        elif self.run_eagerly:
-          # In eager mode, if the given metric fn is not stateful, we invoke the
-          # given fn or its stateful version based on the given flag.
-          if return_stateful_result:
-            metric_result = _call_stateful_fn(stateful_fn)
+              target = K.placeholder(
+                  ndim=len(shape),
+                  name=name + '_target',
+                  sparse=K.is_sparse(self.outputs[i]),
+                  dtype=target_dtype)
+            self._feed_targets.append(target)
+            self._feed_outputs.append(self.outputs[i])
+            self._feed_output_names.append(name)
+            self._feed_output_shapes.append(shape)
+            self._feed_loss_fns.append(self.loss_functions[i])
           else:
-            metric_result = _call_stateless_fn(metric_fn)
-          metric_results.append(metric_result)
-        else:
-          # In graph mode, we build the sub-graph for both the stateful and the
-          # stateless fns.
-          stateful_metric_result = _call_stateful_fn(stateful_fn)
-          metric_result = _call_stateless_fn(metric_fn)
-          _track_metric_tensors(metric_name, metric_result,
-                                stateful_metric_result)
+            skip_target_weighing_indices.append(i)
+          self.targets.append(target)
 
-    return metric_results
+      # Prepare sample weights.
+      self._set_sample_weight_attributes(sample_weight_mode,
+                                         skip_target_weighing_indices)
+      # Save all metric attributes per output of the model.
+      self._cache_output_metric_attributes(metrics, weighted_metrics)
 
-  def _handle_metrics(self,
-                      outputs,
-                      skip_target_indices=None,
-                      targets=None,
-                      sample_weights=None,
-                      masks=None,
-                      return_stateful_result=True):
-    """Handles calling metric functions.
+      # Compute total loss.
+      total_loss = None
+      with K.name_scope('loss'):
+        for i in range(len(self.outputs)):
+          if i in skip_target_indices:
+            continue
+          y_true = self.targets[i]
+          y_pred = self.outputs[i]
+          loss_fn = loss_functions[i]
+          sample_weight = self.sample_weights[i]
+          mask = masks[i]
+          loss_weight = loss_weights_list[i]
+          with K.name_scope(self.output_names[i] + '_loss'):
+            if isinstance(loss_fn, losses.Loss):
+              if mask is not None:
+                mask = math_ops.cast(mask, y_pred.dtype)
+                # Update weights with mask.
+                if sample_weight is None:
+                  sample_weight = mask
+                else:
+                  # Update dimensions of weights to match with mask if possible.
+                  mask, _, sample_weight = squeeze_or_expand_dimensions(
+                      mask, None, sample_weight)
+                  sample_weight *= mask
+              output_loss = loss_fn(y_true, y_pred, sample_weight=sample_weight)
+            else:
+              weighted_loss = training_utils.weighted_masked_objective(loss_fn)
+              output_loss = weighted_loss(y_true, y_pred, sample_weight, mask)
 
-    Arguments:
-      outputs: List of outputs (predictions).
-      skip_target_indices: Optional. List of target ids to skip.
-      targets: List of targets.
-      sample_weights: Optional list of sample weight arrays.
-      masks: List of computed output mask values.
-      return_stateful_result: Boolean, indicates whether the stateful
-        (aggregated)/stateless metric result should be returned.
+          if len(self.outputs) > 1:
+            # Keep track of the un-aggregated loss result tensor.
+            self._compile_metrics_tensors[self.output_names[i] +
+                                          '_loss'] = output_loss
 
-    Returns:
-      A list of metric result tensors.
-    """
-    skip_target_indices = skip_target_indices or []
-    metric_results = []
-    with K.name_scope('metrics'):
-      # Invoke all metrics added using `compile`.
-      for i in range(len(outputs)):
-        if i in skip_target_indices:
-          continue
-        output = outputs[i] if outputs else None
-        target = targets[i] if targets else None
-        output_mask = masks[i] if masks else None
-        metric_results.extend(
-            self._handle_per_output_metrics(
-                self._per_output_metrics[i],
-                target,
-                output,
-                output_mask,
-                return_stateful_result=return_stateful_result))
-        metric_results.extend(
-            self._handle_per_output_metrics(
-                self._per_output_weighted_metrics[i],
-                target,
-                output,
-                output_mask,
-                weights=sample_weights[i],
-                return_stateful_result=return_stateful_result))
+            # Keep track of stateful result tensor and function for the loss.
+            loss_name = loss_fn.name if isinstance(
+                loss_fn, losses.Loss) else loss_fn.__name__
+            mean_wrapped_loss = metrics_module.MeanMetricWrapper(
+                loss_fn, name=loss_name)
+            result_tensor = training_utils.call_metric_function(
+                mean_wrapped_loss,
+                y_true,
+                y_pred,
+                weights=sample_weight,
+                mask=mask)
+            self._compile_stateful_metrics_tensors[self.output_names[i] +
+                                                   '_loss'] = result_tensor
+            self._compile_stateful_metric_functions.append(mean_wrapped_loss)
 
-    # Add metric results from the `add_metric` metrics in eager mode.
-    if context.executing_eagerly():
-      for m in self.metrics:
-        if m not in self._compile_stateful_metric_functions:
-          metric_results.append(m.result())
-    return metric_results
+            self._compile_metrics_names.append(self.output_names[i] + '_loss')
+          if total_loss is None:
+            total_loss = loss_weight * output_loss
+          else:
+            total_loss += loss_weight * output_loss
+        if total_loss is None:
+          if not self.losses:
+            raise ValueError('The model cannot be compiled '
+                             'because it has no loss to optimize.')
+          else:
+            total_loss = 0.
+
+        # Add regularization penalties
+        # and other layer-specific losses.
+        for loss_tensor in self.losses:
+          total_loss += loss_tensor
+
+      # Set metric attributes on model.
+      self._set_metric_attributes(
+          self.outputs,
+          skip_target_indices=skip_target_indices,
+      )
+      # Invoke metric functions for all the outputs.
+      self._handle_metrics(
+          self.outputs,
+          masks=masks,
+          targets=self.targets,
+          skip_target_indices=skip_target_indices,
+          sample_weights=self.sample_weights)
+
+      # Prepare gradient updates and state updates.
+      self.total_loss = total_loss
+
+      # Functions for train, test and predict will
+      # be compiled lazily when required.
+      # This saves time when the user is not using all functions.
+      self._function_kwargs = kwargs
+
+      self._fit_function = None
+      self._eval_function = None
+      self.train_function = None
+      self.test_function = None
+      self.predict_function = None
+
+      # Collected trainable weights, sorted in topological order.
+      trainable_weights = self.trainable_weights
+      self._collected_trainable_weights = trainable_weights
+
+  @property
+  def metrics(self):
+    """Returns the model's metrics added using `compile`, `add_metric` APIs."""
+    metrics = []
+    if self._is_compiled:
+      metrics += self._compile_stateful_metric_functions
+    return metrics + super(Model, self).metrics
+
+  @property
+  def metrics_names(self):
+    """Returns the model's display labels for all outputs."""
+    metrics_names = []
+    if self._is_compiled:
+      metrics_names += self._compile_metrics_names  # Includes names of losses.
+
+    # Add metric names from layers.
+    for layer in self.layers:
+      metrics_names += [m.name for m in layer._metrics]  # pylint: disable=protected-access
+    metrics_names += [m.name for m in self._metrics]
+    return metrics_names
 
   @property
   def run_eagerly(self):
@@ -411,16 +554,23 @@ class Model(Network):
     if self._run_eagerly is True and not context.executing_eagerly():
       raise ValueError('You can only set `run_eagerly=True` if eager execution '
                        'is enabled.')
-    if self._static_graph_friendly:
+    if not self.dynamic:
       if self._run_eagerly is None:
         return False
       else:
         return self._run_eagerly
     else:
+      if not context.executing_eagerly():
+        raise ValueError('Your model contains layers that can only be '
+                         'successfully run in eager execution (layers '
+                         'constructed with `dynamic=True`). '
+                         'You must enable eager execution with '
+                         '`tf.enable_eager_execution()`.')
       if self._run_eagerly is False:
         # TODO(fchollet): consider using py_func to enable this.
         raise ValueError('Your model contains layers that can only be '
-                         'successfully run in eager execution. '
+                         'successfully run in eager execution (layers '
+                         'constructed with `dynamic=True`). '
                          'You cannot set `run_eagerly=False`.')
       return context.executing_eagerly()
 
@@ -428,2060 +578,2008 @@ class Model(Network):
   def run_eagerly(self, value):
     self._run_eagerly = value
 
-  @checkpointable.no_automatic_dependency_tracking
-  def compile(self,
-              optimizer,
-              loss=None,
-              metrics=None,
-              loss_weights=None,
-              sample_weight_mode=None,
-              weighted_metrics=None,
-              target_tensors=None,
-              distribute=None,
-              **kwargs):
-    """Configures the model for training.
+  def fit(self,
+          x=None,
+          y=None,
+          batch_size=None,
+          epochs=1,
+          verbose=1,
+          callbacks=None,
+          validation_split=0.,
+          validation_data=None,
+          shuffle=True,
+          class_weight=None,
+          sample_weight=None,
+          initial_epoch=0,
+          steps_per_epoch=None,
+          validation_steps=None,
+          max_queue_size=10,
+          workers=1,
+          use_multiprocessing=False,
+          **kwargs):
+    """Trains the model for a fixed number of epochs (iterations on a dataset).
 
     Arguments:
-        optimizer: String (name of optimizer) or optimizer instance.
-            See [optimizers](/api_docs/python/tf/keras/optimizers).
-        loss: String (name of objective function) or objective function.
-            See [losses](/api_docs/python/tf/losses).
-            If the model has multiple outputs, you can use a different loss
-            on each output by passing a dictionary or a list of losses.
-            The loss value that will be minimized by the model
-            will then be the sum of all individual losses.
-        metrics: List of metrics to be evaluated by the model
-            during training and testing.
-            Typically you will use `metrics=['accuracy']`.
-            To specify different metrics for different outputs of a
-            multi-output model, you could also pass a dictionary,
-            such as `metrics={'output_a': 'accuracy'}`.
-        loss_weights: Optional list or dictionary specifying scalar
-            coefficients (Python floats) to weight the loss contributions
-            of different model outputs.
-            The loss value that will be minimized by the model
-            will then be the *weighted sum* of all individual losses,
-            weighted by the `loss_weights` coefficients.
-            If a list, it is expected to have a 1:1 mapping
-            to the model's outputs. If a tensor, it is expected to map
-            output names (strings) to scalar coefficients.
-        sample_weight_mode: If you need to do timestep-wise
-            sample weighting (2D weights), set this to `"temporal"`.
-            `None` defaults to sample-wise weights (1D).
-            If the model has multiple outputs, you can use a different
-            `sample_weight_mode` on each output by passing a
-            dictionary or a list of modes.
-        weighted_metrics: List of metrics to be evaluated and weighted
-            by sample_weight or class_weight during training and testing.
-        target_tensors: By default, Keras will create placeholders for the
-            model's target, which will be fed with the target data during
-            training. If instead you would like to use your own
-            target tensors (in turn, Keras will not expect external
-            Numpy data for these targets at training time), you
-            can specify them via the `target_tensors` argument. It can be
-            a single tensor (for a single-output model), a list of tensors,
-            or a dict mapping output names to target tensors.
-        distribute: The DistributionStrategy instance that we want to use to
-            distribute the training of the model.
-        **kwargs: These arguments are passed to `tf.Session.run`.
-
-    Raises:
-        ValueError: In case of invalid arguments for
-            `optimizer`, `loss`, `metrics` or `sample_weight_mode`.
-    """
-    run_eagerly = kwargs.pop('run_eagerly', None)
-    self._run_eagerly = run_eagerly
-
-    # Validate that arguments passed by the user to `compile` are supported by
-    # DistributionStrategy.
-    if distribute:
-      if not isinstance(
-          optimizer, (tf_optimizer_module.Optimizer, optimizers.TFOptimizer)):
-        raise NotImplementedError(
-            'optimizer must be an instance of '
-            'tf.train.Optimizer, not a %s' % type(optimizer))
-      if sample_weight_mode:
-        raise NotImplementedError('sample_weight_mode is not supported with '
-                                  'DistributionStrategy.')
-      if weighted_metrics:
-        raise NotImplementedError('weighted_metrics is not supported with '
-                                  'DistributionStrategy.')
-      if target_tensors:
-        raise ValueError('target_tensors is not supported with '
-                         'DistributionStrategy.')
-
-    loss = loss or {}
-    if self.run_eagerly and not isinstance(
-        optimizer, (tf_optimizer_module.Optimizer, optimizers.TFOptimizer)):
-      raise ValueError(
-          'When running a model in eager execution, the optimizer must be an '
-          'instance of tf.train.Optimizer. Received: '
-          '%s' % optimizer)
-
-    self.optimizer = optimizers.get(optimizer)
-    # We've disabled automatic dependency tracking for this method, but do want
-    # to add a checkpoint dependency on the optimizer if it's checkpointable.
-    if isinstance(self.optimizer, checkpointable.CheckpointableBase):
-      self._track_checkpointable(
-          self.optimizer, name='optimizer', overwrite=True)
-    self.loss = loss
-    self._compile_metrics = metrics or []
-    self.loss_weights = loss_weights
-    self.sample_weight_mode = sample_weight_mode
-    self._compile_weighted_metrics = weighted_metrics
-    if self.run_eagerly and target_tensors is not None:
-      raise ValueError(
-          'target_tensors argument is not supported when '
-          'running a model eagerly.')
-    self.target_tensors = target_tensors
-
-    # Set DistributionStrategy specific parameters.
-    self._distribution_strategy = distribute
-    # Reset the value of grouped_model
-    self._grouped_model = None
-    if self._distribution_strategy is not None:
-      distributed_training_utils.configure_and_create_session(
-          self._distribution_strategy)
-    # Initialize model metric attributes.
-    self._init_metric_attributes()
-    if not self.built:
-      # Model is not compilable because it does not know its number of inputs
-      # and outputs, nor their shapes and names. We will compile after the first
-      # time the model gets called on training data.
-      return
-    self._is_compiled = True
-
-    # Prepare loss functions.
-    if isinstance(loss, dict):
-      for name in loss:
-        if name not in self.output_names:
-          raise ValueError(
-              'Unknown entry in loss '
-              'dictionary: "' + name + '". '
-              'Only expected the following keys: ' + str(self.output_names))
-      loss_functions = []
-      for name in self.output_names:
-        if name not in loss:
-          logging.warning(
-              'Output "' + name +
-              '" missing from loss dictionary. We assume '
-              'this was done on purpose. The fit and evaluate APIs will not be '
-              'expecting any data to be passed to "' + name + '".')
-        loss_functions.append(training_utils.get_loss_function(loss.get(name)))
-    elif isinstance(loss, list):
-      if len(loss) != len(self.outputs):
-        raise ValueError('When passing a list as loss, '
-                         'it should have one entry per model outputs. '
-                         'The model has ' + str(len(self.outputs)) +
-                         ' outputs, but you passed loss=' + str(loss))
-      loss_functions = [training_utils.get_loss_function(l) for l in loss]
-    else:
-      loss_function = training_utils.get_loss_function(loss)
-      loss_functions = [loss_function for _ in range(len(self.outputs))]
-    self.loss_functions = loss_functions
-
-    skip_target_indices = []
-    skip_target_weighing_indices = []
-    self._feed_outputs = []
-    self._feed_output_names = []
-    self._feed_output_shapes = []
-    self._feed_loss_fns = []
-    for i in range(len(loss_functions)):
-      if loss_functions[i] is None:
-        skip_target_indices.append(i)
-        skip_target_weighing_indices.append(i)
-
-    # Prepare output masks.
-    if not self.run_eagerly:
-      masks = [getattr(x, '_keras_mask', None) for x in self.outputs]
-      if not isinstance(masks, list):
-        masks = [masks]
-
-    # Prepare loss weights.
-    if loss_weights is None:
-      loss_weights_list = [1. for _ in range(len(self.outputs))]
-    elif isinstance(loss_weights, dict):
-      for name in loss_weights:
-        if name not in self.output_names:
-          raise ValueError(
-              'Unknown entry in loss_weights '
-              'dictionary: "' + name + '". '
-              'Only expected the following keys: ' + str(self.output_names))
-      loss_weights_list = []
-      for name in self.output_names:
-        loss_weights_list.append(loss_weights.get(name, 1.))
-    elif isinstance(loss_weights, list):
-      if len(loss_weights) != len(self.outputs):
-        raise ValueError(
-            'When passing a list as loss_weights, '
-            'it should have one entry per model output. '
-            'The model has ' + str(len(self.outputs)) +
-            ' outputs, but you passed loss_weights=' + str(loss_weights))
-      loss_weights_list = loss_weights
-    else:
-      raise TypeError('Could not interpret loss_weights argument: ' +
-                      str(loss_weights) + ' - expected a list of dicts.')
-    self.loss_weights_list = loss_weights_list
-
-    # Initialization for Eager mode execution.
-    if self.run_eagerly:
-      # Prepare sample weights.
-      self._set_sample_weight_attributes(sample_weight_mode,
-                                         skip_target_weighing_indices)
-      # Save all metric attributes per output of the model.
-      self._cache_output_metric_attributes(metrics, weighted_metrics)
-
-      if target_tensors is not None:
-        raise ValueError('target_tensors are not currently supported in Eager '
-                         'mode.')
-      self.total_loss = None
-      for i in range(len(self.outputs)):
-        if len(self.outputs) > 1:
-          self._compile_metrics_names.append(self.output_names[i] + '_loss')
-
-      # Set metric attributes on model.
-      self._set_metric_attributes(
-          self.outputs,
-          skip_target_indices=skip_target_indices,
-      )
-
-      self.targets = []
-      for i in range(len(self.outputs)):
-        self._feed_output_names.append(self.output_names[i])
-      self._collected_trainable_weights = self.trainable_weights
-      return
-
-    with K.get_graph().as_default():
-      # Prepare targets of model.
-      self.targets = []
-      self._feed_targets = []
-      if target_tensors not in (None, []):
-        if isinstance(target_tensors, list):
-          if len(target_tensors) != len(self.outputs):
-            raise ValueError(
-                'When passing a list as `target_tensors`, '
-                'it should have one entry per model output. '
-                'The model has %s outputs, but you passed target_tensors=%s' %
-                (len(self.outputs), target_tensors))
-        elif isinstance(target_tensors, dict):
-          for name in target_tensors:
-            if name not in self.output_names:
-              raise ValueError(
-                  'Unknown entry in `target_tensors` '
-                  'dictionary: "' + name + '". '
-                  'Only expected the following keys: ' + str(self.output_names))
-          tmp_target_tensors = []
-          for name in self.output_names:
-            tmp_target_tensors.append(target_tensors.get(name, None))
-          target_tensors = tmp_target_tensors
-        elif tensor_util.is_tensor(target_tensors):
-          target_tensors = [target_tensors]
-        else:
-          raise TypeError('Expected `target_tensors` to be a list or tuple or '
-                          'dict or a single tensor, but got:', target_tensors)
-
-      for i in range(len(self.outputs)):
-        if i in skip_target_indices:
-          self.targets.append(None)
-        else:
-          shape = K.int_shape(self.outputs[i])
-          name = self.output_names[i]
-          if target_tensors not in (None, []):
-            target = target_tensors[i]
-          else:
-            target = None
-          if target is None or K.is_placeholder(target):
-            if target is None:
-              target_dtype = losses.LABEL_DTYPES_FOR_LOSSES.get(
-                  self.loss_functions[i],
-                  K.dtype(self.outputs[i]))
+        x: Input data. It could be:
+          - A Numpy array (or array-like), or a list of arrays
+            (in case the model has multiple inputs).
+          - A TensorFlow tensor, or a list of tensors
+            (in case the model has multiple inputs).
+          - A dict mapping input names to the corresponding array/tensors,
+            if the model has named inputs.
+          - A `tf.data` dataset or a dataset iterator. Should return a tuple
+            of either `(inputs, targets)` or
+            `(inputs, targets, sample_weights)`.
+          - A generator or `keras.utils.Sequence` returning `(inputs, targets)`
+            or `(inputs, targets, sample weights)`.
+        y: Target data. Like the input data `x`,
+          it could be either Numpy array(s) or TensorFlow tensor(s).
+          It should be consistent with `x` (you cannot have Numpy inputs and
+          tensor targets, or inversely). If `x` is a dataset, dataset
+          iterator, generator, or `keras.utils.Sequence` instance, `y` should
+          not be specified (since targets will be obtained from `x`).
+        batch_size: Integer or `None`.
+            Number of samples per gradient update.
+            If unspecified, `batch_size` will default to 32.
+            Do not specify the `batch_size` if your data is in the
+            form of symbolic tensors, dataset, dataset iterators,
+            generators, or `keras.utils.Sequence` instances (since they generate
+            batches).
+        epochs: Integer. Number of epochs to train the model.
+            An epoch is an iteration over the entire `x` and `y`
+            data provided.
+            Note that in conjunction with `initial_epoch`,
+            `epochs` is to be understood as "final epoch".
+            The model is not trained for a number of iterations
+            given by `epochs`, but merely until the epoch
+            of index `epochs` is reached.
+        verbose: Integer. 0, 1, or 2. Verbosity mode.
+            0 = silent, 1 = progress bar, 2 = one line per epoch.
+        callbacks: List of `keras.callbacks.Callback` instances.
+            List of callbacks to apply during training.
+            See `tf.keras.callbacks`.
+        validation_split: Float between 0 and 1.
+            Fraction of the training data to be used as validation data.
+            The model will set apart this fraction of the training data,
+            will not train on it, and will evaluate
+            the loss and any model metrics
+            on this data at the end of each epoch.
+            The validation data is selected from the last samples
+            in the `x` and `y` data provided, before shuffling. This argument is
+            not supported when `x` is a dataset, dataset iterator, generator or
+           `keras.utils.Sequence` instance.
+        validation_data: Data on which to evaluate
+            the loss and any model metrics at the end of each epoch.
+            The model will not be trained on this data.
+            `validation_data` will override `validation_split`.
+            `validation_data` could be:
+              - tuple `(x_val, y_val)` of Numpy arrays or tensors
+              - tuple `(x_val, y_val, val_sample_weights)` of Numpy arrays
+              - dataset or a dataset iterator
+            For the first two cases, `batch_size` must be provided.
+            For the last case, `validation_steps` must be provided.
+        shuffle: Boolean (whether to shuffle the training data
+            before each epoch) or str (for 'batch').
+            'batch' is a special option for dealing with the
+            limitations of HDF5 data; it shuffles in batch-sized chunks.
+            Has no effect when `steps_per_epoch` is not `None`.
+        class_weight: Optional dictionary mapping class indices (integers)
+            to a weight (float) value, used for weighting the loss function
+            (during training only).
+            This can be useful to tell the model to
+            "pay more attention" to samples from
+            an under-represented class.
+        sample_weight: Optional Numpy array of weights for
+            the training samples, used for weighting the loss function
+            (during training only). You can either pass a flat (1D)
+            Numpy array with the same length as the input samples
+            (1:1 mapping between weights and samples),
+            or in the case of temporal data,
+            you can pass a 2D array with shape
+            `(samples, sequence_length)`,
+            to apply a different weight to every timestep of every sample.
+            In this case you should make sure to specify
+            `sample_weight_mode="temporal"` in `compile()`. This argument is not
+            supported when `x` is a dataset, dataset iterator, generator, or
+           `keras.utils.Sequence` instance, instead provide the sample_weights
+            as the third element of `x`.
+        initial_epoch: Integer.
+            Epoch at which to start training
+            (useful for resuming a previous training run).
+        steps_per_epoch: Integer or `None`.
+            Total number of steps (batches of samples)
+            before declaring one epoch finished and starting the
+            next epoch. When training with input tensors such as
+            TensorFlow data tensors, the default `None` is equal to
+            the number of samples in your dataset divided by
+            the batch size, or 1 if that cannot be determined.
+        validation_steps: Only relevant if `validation_data` is provided and
+            is a dataset or dataset iterator. Total number of steps (batches of
+            samples) to draw before stopping when performing validation
+            at the end of every epoch.
+        max_queue_size: Integer. Used for generator or `keras.utils.Sequence`
+            input only. Maximum size for the generator queue.
+            If unspecified, `max_queue_size` will default to 10.
+        workers: Integer. Used for generator or `keras.utils.Sequence` input
+            only. Maximum number of processes to spin up
+            when using process-based threading. If unspecified, `workers`
+            will default to 1. If 0, will execute the generator on the main
+            thread.
+        use_multiprocessing: Boolean. Used for generator or
+            `keras.utils.Sequence` input only. If `True`, use process-based
+            threading. If unspecified, `use_multiprocessing` will default to
+            `False`. Note that because this implementation relies on
+            multiprocessing, you should not pass non-picklable arguments to
+            the generator as they can't be passed easily to children processes.
+        **kwargs: Used for backwards compatibility.
 
-              target = K.placeholder(
-                  ndim=len(shape),
-                  name=name + '_target',
-                  sparse=K.is_sparse(self.outputs[i]),
-                  dtype=target_dtype)
-            self._feed_targets.append(target)
-            self._feed_outputs.append(self.outputs[i])
-            self._feed_output_names.append(name)
-            self._feed_output_shapes.append(shape)
-            self._feed_loss_fns.append(self.loss_functions[i])
-          else:
-            skip_target_weighing_indices.append(i)
-          self.targets.append(target)
+    Returns:
+        A `History` object. Its `History.history` attribute is
+        a record of training loss values and metrics values
+        at successive epochs, as well as validation loss values
+        and validation metrics values (if applicable).
 
-      # Prepare sample weights.
-      self._set_sample_weight_attributes(sample_weight_mode,
-                                         skip_target_weighing_indices)
-      # Save all metric attributes per output of the model.
-      self._cache_output_metric_attributes(metrics, weighted_metrics)
+    Raises:
+        RuntimeError: If the model was never compiled.
+        ValueError: In case of mismatch between the provided input data
+            and what the model expects.
+    """
+    # TODO(fchollet): this method may be creating reference cycles, which would
+    # lead to accumulating garbage in memory when called in a loop. Investigate.
+    if data_utils.is_generator_or_sequence(x):
+      training_utils.check_generator_arguments(y, sample_weight)
+      return self.fit_generator(
+          x,
+          steps_per_epoch=steps_per_epoch,
+          epochs=epochs,
+          verbose=verbose,
+          callbacks=callbacks,
+          validation_data=validation_data,
+          validation_steps=validation_steps,
+          class_weight=class_weight,
+          max_queue_size=max_queue_size,
+          workers=workers,
+          use_multiprocessing=use_multiprocessing,
+          shuffle=shuffle,
+          initial_epoch=initial_epoch)
 
-      # Compute total loss.
-      total_loss = None
-      with K.name_scope('loss'):
-        for i in range(len(self.outputs)):
-          if i in skip_target_indices:
-            continue
-          y_true = self.targets[i]
-          y_pred = self.outputs[i]
-          loss_fn = loss_functions[i]
-          sample_weight = self.sample_weights[i]
-          mask = masks[i]
-          loss_weight = loss_weights_list[i]
-          with K.name_scope(self.output_names[i] + '_loss'):
-            if isinstance(loss_fn, losses.Loss):
-              if mask is not None:
-                mask = math_ops.cast(mask, y_pred.dtype)
-                # Update weights with mask.
-                if sample_weight is None:
-                  sample_weight = mask
-                else:
-                  # Update dimensions of weights to match with mask if possible.
-                  mask, _, sample_weight = squeeze_or_expand_dimensions(
-                      mask, None, sample_weight)
-                  sample_weight *= mask
-              output_loss = loss_fn(y_true, y_pred, sample_weight=sample_weight)
-            else:
-              weighted_loss = training_utils.weighted_masked_objective(loss_fn)
-              output_loss = weighted_loss(y_true, y_pred, sample_weight, mask)
+    # Legacy support
+    if 'nb_epoch' in kwargs:
+      logging.warning(
+          'The `nb_epoch` argument in `fit` '
+          'has been renamed `epochs`.')
+      epochs = kwargs.pop('nb_epoch')
+    if kwargs:
+      raise TypeError('Unrecognized keyword arguments: ' + str(kwargs))
 
-          if len(self.outputs) > 1:
-            # Keep track of the un-aggregated loss result tensor.
-            self._compile_metrics_tensors[self.output_names[i] +
-                                          '_loss'] = output_loss
+    # Validate and standardize user data.
+    if self._distribution_strategy:
+      distributed_training_utils.validate_callbacks(callbacks, self.optimizer,
+                                                    self._distribution_strategy)
 
-            # Keep track of stateful result tensor and function for the loss.
-            loss_name = loss_fn.name if isinstance(
-                loss_fn, losses.Loss) else loss_fn.__name__
-            mean_wrapped_loss = metrics_module.MeanMetricWrapper(
-                loss_fn, name=loss_name)
-            result_tensor = training_utils.call_metric_function(
-                mean_wrapped_loss,
-                y_true,
-                y_pred,
-                weights=sample_weight,
-                mask=mask)
-            self._compile_stateful_metrics_tensors[self.output_names[i] +
-                                                   '_loss'] = result_tensor
-            self._compile_stateful_metric_functions.append(mean_wrapped_loss)
+      distributed_training_utils.validate_inputs(
+          x, y, self._distribution_strategy)
 
-            self._compile_metrics_names.append(self.output_names[i] + '_loss')
-          if total_loss is None:
-            total_loss = loss_weight * output_loss
-          else:
-            total_loss += loss_weight * output_loss
-        if total_loss is None:
-          if not self.losses:
-            raise ValueError('The model cannot be compiled '
-                             'because it has no loss to optimize.')
-          else:
-            total_loss = 0.
+      first_x_value = nest.flatten(x)[0]
+      if isinstance(first_x_value, np.ndarray):
+        steps_per_epoch, batch_size = (
+            distributed_training_utils.get_input_params(
+                self._distribution_strategy, first_x_value, steps_per_epoch,
+                batch_size, is_training=True))
 
-        # Add regularization penalties
-        # and other layer-specific losses.
-        for loss_tensor in self.losses:
-          total_loss += loss_tensor
+    batch_size = self._validate_or_infer_batch_size(batch_size, steps_per_epoch,
+                                                    x)
 
-      # Set metric attributes on model.
-      self._set_metric_attributes(
-          self.outputs,
-          skip_target_indices=skip_target_indices,
-      )
-      # Invoke metric functions for all the outputs.
-      self._handle_metrics(
-          self.outputs,
-          masks=masks,
-          targets=self.targets,
-          skip_target_indices=skip_target_indices,
-          sample_weights=self.sample_weights)
+    x, y, sample_weights = self._standardize_user_data(
+        x,
+        y,
+        sample_weight=sample_weight,
+        class_weight=class_weight,
+        batch_size=batch_size,
+        check_steps=True,
+        steps_name='steps_per_epoch',
+        steps=steps_per_epoch,
+        validation_split=validation_split,
+        shuffle=shuffle)
 
-      # Prepare gradient updates and state updates.
-      self.total_loss = total_loss
+    # Prepare validation data.
+    if validation_data:
+      if (isinstance(validation_data, iterator_ops.Iterator) or
+          isinstance(validation_data, iterator_ops.EagerIterator) or
+          isinstance(validation_data, dataset_ops.DatasetV2)):
+        val_x = validation_data
+        val_y = None
+        val_sample_weight = None
+      elif len(validation_data) == 2:
+        val_x, val_y = validation_data  # pylint: disable=unpacking-non-sequence
+        val_sample_weight = None
+      elif len(validation_data) == 3:
+        val_x, val_y, val_sample_weight = validation_data  # pylint: disable=unpacking-non-sequence
+      else:
+        raise ValueError(
+            'When passing a `validation_data` argument, '
+            'it must contain either 2 items (x_val, y_val), '
+            'or 3 items (x_val, y_val, val_sample_weights), '
+            'or alternatively it could be a dataset or a '
+            'dataset or a dataset iterator. '
+            'However we received `validation_data=%s`' % validation_data)
 
-      # Functions for train, test and predict will
-      # be compiled lazily when required.
-      # This saves time when the user is not using all functions.
-      self._function_kwargs = kwargs
+      # Validate and standardize validation data.
+      if self._distribution_strategy:
+        distributed_training_utils.validate_inputs(
+            val_x, val_y, self._distribution_strategy)
+        first_valx_value = nest.flatten(val_x)[0]
+        if isinstance(first_valx_value, np.ndarray):
+          validation_steps, _ = distributed_training_utils.get_input_params(
+              self._distribution_strategy, first_valx_value, validation_steps,
+              batch_size)
 
-      self._fit_function = None
-      self._eval_function = None
-      self.train_function = None
-      self.test_function = None
-      self.predict_function = None
+      val_x, val_y, val_sample_weights = self._standardize_user_data(
+          val_x,
+          val_y,
+          sample_weight=val_sample_weight,
+          batch_size=batch_size,
+          steps=validation_steps)
 
-      # Collected trainable weights, sorted in topological order.
-      trainable_weights = self.trainable_weights
-      self._collected_trainable_weights = trainable_weights
+    elif validation_split and 0. < validation_split < 1.:
+      if training_utils.has_symbolic_tensors(x):
+        raise ValueError('If your data is in the form of symbolic tensors, '
+                         'you cannot use `validation_split`.')
+      if hasattr(x[0], 'shape'):
+        split_at = int(x[0].shape[0] * (1. - validation_split))
+      else:
+        split_at = int(len(x[0]) * (1. - validation_split))
+      x, val_x = (slice_arrays(x, 0, split_at), slice_arrays(x, split_at))
+      y, val_y = (slice_arrays(y, 0, split_at), slice_arrays(y, split_at))
+      sample_weights, val_sample_weights = (slice_arrays(
+          sample_weights, 0, split_at), slice_arrays(sample_weights, split_at))
+    elif validation_steps:
+      val_x = []
+      val_y = []
+      val_sample_weights = []
+    else:
+      val_x = None
+      val_y = None
+      val_sample_weights = None
 
-  def _check_trainable_weights_consistency(self):
-    """Check trainable weights count consistency.
+    if (self.run_eagerly or (isinstance(x, iterator_ops.EagerIterator) and
+                             not self._distribution_strategy)):
+      return training_generator.fit_generator(
+          self, (x, y, sample_weights),
+          steps_per_epoch=steps_per_epoch,
+          batch_size=batch_size,
+          epochs=epochs,
+          shuffle=shuffle,
+          verbose=verbose,
+          callbacks=callbacks,
+          validation_data=validation_data,
+          validation_steps=validation_steps,
+          workers=0,
+          initial_epoch=initial_epoch)
+    elif distributed_training_utils.is_tpu_strategy(
+        self._distribution_strategy):
+      return training_distributed.experimental_fit_loop(
+          self,
+          x,
+          epochs=epochs,
+          verbose=verbose,
+          callbacks=callbacks,
+          val_iterator=val_x,
+          initial_epoch=initial_epoch,
+          steps_per_epoch=steps_per_epoch,
+          validation_steps=validation_steps)
+    else:
+      return training_arrays.fit_loop(
+          self,
+          x,
+          y,
+          sample_weights=sample_weights,
+          batch_size=batch_size,
+          epochs=epochs,
+          verbose=verbose,
+          callbacks=callbacks,
+          val_inputs=val_x,
+          val_targets=val_y,
+          val_sample_weights=val_sample_weights,
+          shuffle=shuffle,
+          initial_epoch=initial_epoch,
+          steps_per_epoch=steps_per_epoch,
+          validation_steps=validation_steps)
 
-    This will raise a warning if `trainable_weights` and
-    `_collected_trainable_weights` are inconsistent (i.e. have different
-    number of parameters).
-    Inconsistency will typically arise when one modifies `model.trainable`
-    without calling `model.compile` again.
-    """
-    if not hasattr(self, '_collected_trainable_weights'):
-      return
+  def evaluate(self,
+               x=None,
+               y=None,
+               batch_size=None,
+               verbose=1,
+               sample_weight=None,
+               steps=None,
+               callbacks=None,
+               max_queue_size=10,
+               workers=1,
+               use_multiprocessing=False):
+    """Returns the loss value & metrics values for the model in test mode.
 
-    if len(self.trainable_weights) != len(self._collected_trainable_weights):
-      logging.log_first_n(
-          logging.WARN, 'Discrepancy between trainable weights and collected'
-          ' trainable weights, did you set `model.trainable`'
-          ' without calling `model.compile` after ?', 1)
+    Computation is done in batches.
 
-  def _make_train_function_helper(self, fn_name, outputs, metric_updates=None):
-    if not hasattr(self, fn_name):
-      raise RuntimeError('You must compile your model before using it.')
-    self._check_trainable_weights_consistency()
-    if getattr(self, fn_name) is None:
-      inputs = (self._feed_inputs +
-                self._feed_targets +
-                self._feed_sample_weights)
-      if not isinstance(K.symbolic_learning_phase(), int):
-        inputs += [K.symbolic_learning_phase()]
+    Arguments:
+        x: Input data. It could be:
+          - A Numpy array (or array-like), or a list of arrays
+            (in case the model has multiple inputs).
+          - A TensorFlow tensor, or a list of tensors
+            (in case the model has multiple inputs).
+          - A dict mapping input names to the corresponding array/tensors,
+            if the model has named inputs.
+          - A `tf.data` dataset or a dataset iterator.
+          - A generator or `keras.utils.Sequence` instance.
+        y: Target data. Like the input data `x`,
+          it could be either Numpy array(s) or TensorFlow tensor(s).
+          It should be consistent with `x` (you cannot have Numpy inputs and
+          tensor targets, or inversely).
+          If `x` is a dataset, dataset iterator, generator or
+          `keras.utils.Sequence` instance, `y` should not be specified (since
+          targets will be obtained from the iterator/dataset).
+        batch_size: Integer or `None`.
+            Number of samples per gradient update.
+            If unspecified, `batch_size` will default to 32.
+            Do not specify the `batch_size` is your data is in the
+            form of symbolic tensors, dataset, dataset iterators,
+            generators, or `keras.utils.Sequence` instances (since they generate
+            batches).
+        verbose: 0 or 1. Verbosity mode.
+            0 = silent, 1 = progress bar.
+        sample_weight: Optional Numpy array of weights for
+            the test samples, used for weighting the loss function.
+            You can either pass a flat (1D)
+            Numpy array with the same length as the input samples
+            (1:1 mapping between weights and samples),
+            or in the case of temporal data,
+            you can pass a 2D array with shape
+            `(samples, sequence_length)`,
+            to apply a different weight to every timestep of every sample.
+            In this case you should make sure to specify
+            `sample_weight_mode="temporal"` in `compile()`. This argument is not
+            supported when `x` is a dataset or a dataset iterator, instead pass
+            sample weights as the third element of `x`.
+        steps: Integer or `None`.
+            Total number of steps (batches of samples)
+            before declaring the evaluation round finished.
+            Ignored with the default value of `None`.
+        callbacks: List of `keras.callbacks.Callback` instances.
+            List of callbacks to apply during evaluation.
+            See [callbacks](/api_docs/python/tf/keras/callbacks).
+        max_queue_size: Integer. Used for generator or `keras.utils.Sequence`
+            input only. Maximum size for the generator queue.
+            If unspecified, `max_queue_size` will default to 10.
+        workers: Integer. Used for generator or `keras.utils.Sequence` input
+            only. Maximum number of processes to spin up when using
+            process-based threading. If unspecified, `workers` will default
+            to 1. If 0, will execute the generator on the main thread.
+        use_multiprocessing: Boolean. Used for generator or
+            `keras.utils.Sequence` input only. If `True`, use process-based
+            threading. If unspecified, `use_multiprocessing` will default to
+            `False`. Note that because this implementation relies on
+            multiprocessing, you should not pass non-picklable arguments to
+            the generator as they can't be passed easily to children processes.
 
-      with K.get_graph().as_default():
-        with K.name_scope('training'):
-          with K.name_scope(self.optimizer.__class__.__name__):
-            # Training updates
-            updates = self.optimizer.get_updates(
-                params=self._collected_trainable_weights, loss=self.total_loss)
-      # Unconditional updates
-      updates += self.get_updates_for(None)
-      # Conditional updates relevant to this model
-      updates += self.get_updates_for(self.inputs)
-      # Add stateful metrics updates.
-      if metric_updates is not None:
-        updates += metric_updates
+    Returns:
+        Scalar test loss (if the model has a single output and no metrics)
+        or list of scalars (if the model has multiple outputs
+        and/or metrics). The attribute `model.metrics_names` will give you
+        the display labels for the scalar outputs.
 
-      with K.name_scope('training'):
-        # Gets loss and metrics. Updates weights at each call.
-        fn = K.function(
-            inputs,
-            outputs,
-            updates=updates,
-            name='train_function',
-            **self._function_kwargs)
-        setattr(self, fn_name, fn)
+    Raises:
+        ValueError: in case of invalid arguments.
+    """
+    if data_utils.is_generator_or_sequence(x):
+      training_utils.check_generator_arguments(y, sample_weight)
+      return self.evaluate_generator(
+          x,
+          steps=steps,
+          verbose=verbose,
+          max_queue_size=max_queue_size,
+          workers=workers,
+          use_multiprocessing=use_multiprocessing)
+    # Validate and standardize user data.
+    if self._distribution_strategy:
+      distributed_training_utils.validate_inputs(
+          x, y, self._distribution_strategy)
+      first_x_value = nest.flatten(x)[0]
+      if isinstance(first_x_value, np.ndarray):
+        steps, batch_size = distributed_training_utils.get_input_params(
+            self._distribution_strategy, first_x_value, steps, batch_size)
 
-  def _make_train_function(self):
-    metrics_tensors = [
-        self._all_metrics_tensors[m] for m in self.metrics_names[1:]
-    ]
-    self._make_train_function_helper('train_function',
-                                     [self.total_loss] + metrics_tensors)
+    batch_size = self._validate_or_infer_batch_size(batch_size, steps, x)
 
-  def _make_fit_function(self):
-    # TODO(psv/anjalisridhar): Remove updates after we fix b/118841692
-    # Stateful metrics updates
-    metric_updates = []
-    for m in self.metrics:
-      metric_updates += m.updates
+    x, y, sample_weights = self._standardize_user_data(
+        x,
+        y,
+        sample_weight=sample_weight,
+        batch_size=batch_size,
+        check_steps=True,
+        steps_name='steps',
+        steps=steps)
 
-    metrics_tensors = [
-        self._all_stateful_metrics_tensors[m] for m in self.metrics_names[1:]
-    ]
-    self._make_train_function_helper(
-        '_fit_function', [self.total_loss] + metrics_tensors, metric_updates)
+    if (self.run_eagerly or (isinstance(x, iterator_ops.EagerIterator) and
+                             not self._distribution_strategy)):
+      return training_generator.evaluate_generator(
+          self, (x, y, sample_weights),
+          steps=steps,
+          batch_size=batch_size,
+          verbose=verbose,
+          workers=0,
+          callbacks=callbacks)
+    elif distributed_training_utils.is_tpu_strategy(
+        self._distribution_strategy):
+      return training_distributed.experimental_test_loop(
+          self, iterator=x, verbose=verbose, steps=steps)
+    else:
+      return training_arrays.test_loop(
+          self,
+          inputs=x,
+          targets=y,
+          sample_weights=sample_weights,
+          batch_size=batch_size,
+          verbose=verbose,
+          steps=steps,
+          callbacks=callbacks)
 
-  def _make_test_function_helper(self, fn_name, outputs, metric_updates=None):
-    if not hasattr(self, fn_name):
-      raise RuntimeError('You must compile your model before using it.')
-    if getattr(self, fn_name) is None:
-      inputs = (self._feed_inputs +
-                self._feed_targets +
-                self._feed_sample_weights)
+  def predict(self,
+              x,
+              batch_size=None,
+              verbose=0,
+              steps=None,
+              callbacks=None,
+              max_queue_size=10,
+              workers=1,
+              use_multiprocessing=False):
+    """Generates output predictions for the input samples.
 
-      with K.name_scope('evaluation'):
-        updates = self.state_updates
-        # Add stateful metrics updates.
-        if metric_updates is not None:
-          updates += metric_updates
-        # Return loss and metrics, no gradient updates.
-        # Does update the network states.
-        fn = K.function(
-            inputs,
-            outputs,
-            updates=updates,
-            name='test_function',
-            **self._function_kwargs)
-        setattr(self, fn_name, fn)
+    Computation is done in batches.
 
-  def _make_test_function(self):
-    metrics_tensors = [
-        self._all_metrics_tensors[m] for m in self.metrics_names[1:]
-    ]
-    self._make_test_function_helper('test_function',
-                                    [self.total_loss] + metrics_tensors)
+    Arguments:
+         x: Input samples. It could be:
+          - A Numpy array (or array-like), or a list of arrays
+            (in case the model has multiple inputs).
+          - A TensorFlow tensor, or a list of tensors
+            (in case the model has multiple inputs).
+          - A `tf.data` dataset or a dataset iterator.
+          - A generator or `keras.utils.Sequence` instance.
+        batch_size: Integer or `None`.
+            Number of samples per gradient update.
+            If unspecified, `batch_size` will default to 32.
+            Do not specify the `batch_size` is your data is in the
+            form of symbolic tensors, dataset, dataset iterators,
+            generators, or `keras.utils.Sequence` instances (since they generate
+            batches).
+        verbose: Verbosity mode, 0 or 1.
+        steps: Total number of steps (batches of samples)
+            before declaring the prediction round finished.
+            Ignored with the default value of `None`.
+        callbacks: List of `keras.callbacks.Callback` instances.
+            List of callbacks to apply during prediction.
+            See [callbacks](/api_docs/python/tf/keras/callbacks).
+        max_queue_size: Integer. Used for generator or `keras.utils.Sequence`
+            input only. Maximum size for the generator queue.
+            If unspecified, `max_queue_size` will default to 10.
+        workers: Integer. Used for generator or `keras.utils.Sequence` input
+            only. Maximum number of processes to spin up when using
+            process-based threading. If unspecified, `workers` will default
+            to 1. If 0, will execute the generator on the main thread.
+        use_multiprocessing: Boolean. Used for generator or
+            `keras.utils.Sequence` input only. If `True`, use process-based
+            threading. If unspecified, `use_multiprocessing` will default to
+            `False`. Note that because this implementation relies on
+            multiprocessing, you should not pass non-picklable arguments to
+            the generator as they can't be passed easily to children processes.
 
-  def _make_eval_function(self):
-    metrics_tensors = [
-        self._all_stateful_metrics_tensors[m] for m in self.metrics_names[1:]
-    ]
-    self._make_test_function_helper('_eval_function',
-                                    [self.total_loss] + metrics_tensors)
 
-  def _make_predict_function(self):
-    if not hasattr(self, 'predict_function'):
-      self.predict_function = None
-    if self.predict_function is None:
-      inputs = self._feed_inputs
-      # Gets network outputs. Does not update weights.
-      # Does update the network states.
-      kwargs = getattr(self, '_function_kwargs', {})
-      with K.name_scope('predict'):
-        self.predict_function = K.function(
-            inputs,
-            self.outputs,
-            updates=self.state_updates,
-            name='predict_function',
-            **kwargs)
+    Returns:
+        Numpy array(s) of predictions.
+
+    Raises:
+        ValueError: In case of mismatch between the provided
+            input data and the model's expectations,
+            or in case a stateful model receives a number of samples
+            that is not a multiple of the batch size.
+    """
+    if data_utils.is_generator_or_sequence(x):
+      return self.predict_generator(
+          x,
+          steps=steps,
+          verbose=verbose,
+          max_queue_size=max_queue_size,
+          workers=workers,
+          use_multiprocessing=use_multiprocessing)
+    if self._distribution_strategy:
+      distributed_training_utils.validate_inputs(
+          x, None, self._distribution_strategy)
+      first_x_value = nest.flatten(x)[0]
+      if isinstance(first_x_value, np.ndarray):
+        steps, batch_size = distributed_training_utils.get_input_params(
+            self._distribution_strategy, first_x_value, steps, batch_size)
 
-  def _get_execution_function(self, mode):
-    if mode == 'train':
-      self._make_fit_function()
-      return self._fit_function
-    if mode == 'test':
-      self._make_eval_function()
-      return self._eval_function
-    if mode == 'predict':
-      self._make_predict_function()
-      return self.predict_function
+    batch_size = self._validate_or_infer_batch_size(batch_size, steps, x)
 
-  def _get_iterator_get_next_tensors(self, iterator):
-    get_next_op = self._iterator_get_next.get(iterator, None)
-    if get_next_op is None:
-      get_next_op = iterator.get_next()
-      self._iterator_get_next[iterator] = get_next_op
-    return get_next_op
+    # Validate and standardize user data.
+    if self._distribution_strategy:
+      x, _, _ = self._standardize_user_data(
+          x, check_steps=True, steps_name='steps', steps=steps,
+          batch_size=batch_size)
+    else:
+      # TODO(anjalisridhar): We don't pass batch_size here for some reason. This
+      # means we need to special case distribution strategy which needs the
+      # batch size.
+      x, _, _ = self._standardize_user_data(
+          x, check_steps=True, steps_name='steps', steps=steps)
 
-  def _distribution_standardize_user_data(self,
-                                          x,
-                                          y=None,
-                                          sample_weight=None,
-                                          class_weight=None,
-                                          batch_size=None,
-                                          check_steps=False,
-                                          steps_name='steps',
-                                          steps=None,
-                                          validation_split=0,
-                                          shuffle=False):
-    """Runs validation checks on input and target data passed by the user.
+    if (self.run_eagerly or (isinstance(x, iterator_ops.EagerIterator) and
+                             not self._distribution_strategy)):
+      return training_generator.predict_generator(
+          self,
+          x,
+          steps=steps,
+          batch_size=batch_size,
+          verbose=verbose,
+          workers=0,
+          callbacks=callbacks)
+    elif distributed_training_utils.is_tpu_strategy(
+        self._distribution_strategy):
+      return training_distributed.experimental_predict_loop(
+          self, x, verbose=verbose, steps=steps)
+    else:
+      return training_arrays.predict_loop(
+          self,
+          x,
+          batch_size=batch_size,
+          verbose=verbose,
+          steps=steps,
+          callbacks=callbacks)
 
-    This is called when using DistributionStrategy to train, evaluate or serve
-    the model.
+  def reset_metrics(self):
+    """Resets the state of metrics."""
+    if hasattr(self, 'metrics'):
+      for m in self.metrics:
+        m.reset_states()
+      if self._distribution_strategy:
+        training_distributed._reset_metrics(self)  # pylint: disable=protected-access
+
+  def train_on_batch(self,
+                     x,
+                     y=None,
+                     sample_weight=None,
+                     class_weight=None,
+                     reset_metrics=True):
+    """Runs a single gradient update on a single batch of data.
 
-    Args:
-      x: Input data. A numpy array or `tf.data` dataset.
-      y: Target data. A numpy array or None if x is a `tf.data` dataset.
-      sample_weight: An optional sample-weight array passed by the user to
-        weight the importance of each sample in `x`.
-      class_weight: An optional class-weight array by the user to
-        weight the importance of samples in `x` based on the class they belong
-        to, as conveyed by `y`.
-      batch_size: Integer batch size. If provided, it is used to run additional
-        validation checks on stateful models.
-      check_steps: boolean, True if we want to check for validity of `steps` and
-        False, otherwise.
-      steps_name: The public API's parameter name for `steps`.
-      steps: Integer or `None`. Total number of steps (batches of samples) to
-        execute.
-      validation_split: Float between 0 and 1.
-        Fraction of the training data to be used as validation data.
-      shuffle: Boolean whether to shuffle the training data before each epoch.
+    Arguments:
+        x: Input data. It could be:
+          - A Numpy array (or array-like), or a list of arrays
+              (in case the model has multiple inputs).
+          - A TensorFlow tensor, or a list of tensors
+              (in case the model has multiple inputs).
+          - A dict mapping input names to the corresponding array/tensors,
+              if the model has named inputs.
+          - A `tf.data` dataset or a dataset iterator.
+        y: Target data. Like the input data `x`, it could be either Numpy
+          array(s) or TensorFlow tensor(s). It should be consistent with `x`
+          (you cannot have Numpy inputs and tensor targets, or inversely). If
+          `x` is a dataset or a dataset iterator, `y` should not be specified
+          (since targets will be obtained from the iterator).
+        sample_weight: Optional array of the same length as x, containing
+          weights to apply to the model's loss for each sample. In the case of
+          temporal data, you can pass a 2D array with shape (samples,
+          sequence_length), to apply a different weight to every timestep of
+          every sample. In this case you should make sure to specify
+          sample_weight_mode="temporal" in compile(). This argument is not
+          supported when `x` is a dataset or a dataset iterator.
+        class_weight: Optional dictionary mapping class indices (integers) to a
+          weight (float) to apply to the model's loss for the samples from this
+          class during training. This can be useful to tell the model to "pay
+          more attention" to samples from an under-represented class.
+        reset_metrics: If `True`, the metrics returned will be only for this
+          batch. If `False`, the metrics will be statefully accumulated across
+          batches.
 
     Returns:
-      Iterator for reading the dataset `x`.
+        Scalar training loss
+        (if the model has a single output and no metrics)
+        or list of scalars (if the model has multiple outputs
+        and/or metrics). The attribute `model.metrics_names` will give you
+        the display labels for the scalar outputs.
 
     Raises:
-      ValueError: In case of invalid user-provided data.
-      RuntimeError: If the model was never compiled.
+      ValueError: In case of invalid user-provided arguments.
     """
-    if class_weight:
-      raise NotImplementedError('`class_weight` is currently not supported '
-                                'when using DistributionStrategy.')
-
-    if (sample_weight is not None and sample_weight.all() and
-        distributed_training_utils.is_tpu_strategy(
-            self._distribution_strategy)):
-      raise NotImplementedError('`sample_weight` is currently not supported '
-                                'when using TPUStrategy.')
+    if self._distribution_strategy:
+      raise NotImplementedError('`train_on_batch` is not supported for models '
+                                'compiled with DistributionStrategy.')
+    # Validate and standardize user data.
+    x, y, sample_weights = self._standardize_user_data(
+        x, y, sample_weight=sample_weight, class_weight=class_weight)
 
-    # Validates `steps` argument right at the beginning since we use it to
-    # construct the dataset object.
-    # TODO(anjalisridhar): Remove this check once we refactor the
-    # _standardize_user_data code path. This check is already present elsewhere
-    # in the codebase.
-    if check_steps and isinstance(x, dataset_ops.DatasetV2) and steps is None:
-      raise ValueError('When using Datasets as input, '
-                       'you should specify the `{steps_name}` argument.'
-                       .format(steps_name=steps_name))
+    if self.run_eagerly:
+      outputs = training_eager.train_on_batch(
+          self, x, y, sample_weights=sample_weights)
+    else:
+      if not isinstance(K.symbolic_learning_phase(), int):
+        ins = x + y + sample_weights + [True]
+      else:
+        ins = x + y + sample_weights
 
-    first_x_value = nest.flatten(x)[0]
-    if isinstance(first_x_value, np.ndarray):
-      # We need to use the drop_remainder argument to allow for a static
-      # input shape which is required for TPUs.
-      drop_remainder = self._distribution_strategy.require_static_shapes
-      if y is not None:
-        var_x = distributed_training_utils.get_var_for_numpy(
-            self._distribution_strategy, x)
-        var_y = distributed_training_utils.get_var_for_numpy(
-            self._distribution_strategy, y)
-        if sample_weight is not None:
-          var_sample_weights = distributed_training_utils.get_var_for_numpy(
-              self._distribution_strategy, sample_weight)
+      if reset_metrics:
+        self._make_train_function()
+        outputs = self.train_function(ins)  # pylint: disable=not-callable
+      else:
+        self._make_fit_function()
+        outputs = self._fit_function(ins)  # pylint: disable=not-callable
 
-          x = dataset_ops.Dataset.from_tensor_slices((var_x, var_y,
-                                                      var_sample_weights))
-        else:
-          x = dataset_ops.Dataset.from_tensor_slices((var_x, var_y))
+    if reset_metrics:
+      self.reset_metrics()
 
-        x = dataset_ops.Dataset.from_tensor_slices((var_x, var_y))
-        if shuffle:
-          # 1024 is a good buffer size since it is much larger than the average
-          # batch size provided by the user and provides sufficient randomness.
-          # One thing to keep in mind is the memory usage based on the size of
-          # each sample.
-          x = x.shuffle(1024)
-        x = x.repeat()
-        x = x.batch(batch_size, drop_remainder=drop_remainder)
-        y = None
-        sample_weight = None
-      else:
-        # This case is for the predict call where the dataset only contains
-        # inputs and no targets, i.e. it does not return a tuple
-        var_x = distributed_training_utils.get_var_for_numpy(
-            self._distribution_strategy, x)
-        x = dataset_ops.Dataset.from_tensor_slices(var_x)
-        x = x.batch(batch_size, drop_remainder=drop_remainder)
+    if len(outputs) == 1:
+      return outputs[0]
+    return outputs
 
-    assert isinstance(x, dataset_ops.DatasetV2)
+  def test_on_batch(self, x, y=None, sample_weight=None, reset_metrics=True):
+    """Test the model on a single batch of samples.
 
-    with self._distribution_strategy.scope():
-      iterator = self._distribution_strategy.make_dataset_iterator(x)
-      init_op = iterator.initialize()
-      if not context.executing_eagerly():
-        K.get_session().run(init_op)
+    Arguments:
+        x: Input data. It could be:
+          - A Numpy array (or array-like), or a list of arrays
+            (in case the model has multiple inputs).
+          - A TensorFlow tensor, or a list of tensors
+            (in case the model has multiple inputs).
+          - A dict mapping input names to the corresponding array/tensors,
+            if the model has named inputs.
+          - A `tf.data` dataset or a dataset iterator.
+        y: Target data. Like the input data `x`,
+          it could be either Numpy array(s) or TensorFlow tensor(s).
+          It should be consistent with `x` (you cannot have Numpy inputs and
+          tensor targets, or inversely). If `x` is a dataset or a
+          dataset iterator, `y` should not be specified
+          (since targets will be obtained from the iterator).
+        sample_weight: Optional array of the same length as x, containing
+            weights to apply to the model's loss for each sample.
+            In the case of temporal data, you can pass a 2D array
+            with shape (samples, sequence_length),
+            to apply a different weight to every timestep of every sample.
+            In this case you should make sure to specify
+            sample_weight_mode="temporal" in compile(). This argument is not
+            supported when `x` is a dataset or a dataset iterator.
+        reset_metrics: If `True`, the metrics returned will be only for this
+          batch. If `False`, the metrics will be statefully accumulated across
+          batches.
 
-    training_utils.validate_iterator_input(x, y, sample_weight,
-                                           validation_split)
-    return iterator
+    Returns:
+        Scalar test loss (if the model has a single output and no metrics)
+        or list of scalars (if the model has multiple outputs
+        and/or metrics). The attribute `model.metrics_names` will give you
+        the display labels for the scalar outputs.
 
-  def _standardize_user_data(self,
-                             x,
-                             y=None,
-                             sample_weight=None,
-                             class_weight=None,
-                             batch_size=None,
-                             check_steps=False,
-                             steps_name='steps',
-                             steps=None,
-                             validation_split=0,
-                             shuffle=False):
-    """Runs validation checks on input and target data passed by the user.
+    Raises:
+        ValueError: In case of invalid user-provided arguments.
+    """
+    if self._distribution_strategy:
+      raise NotImplementedError('`test_on_batch` is not supported for models '
+                                'compiled with DistributionStrategy.')
+    # Validate and standardize user data.
+    x, y, sample_weights = self._standardize_user_data(
+        x, y, sample_weight=sample_weight)
 
-    Also standardizes the data to lists of arrays, in order.
+    if self.run_eagerly:
+      outputs = training_eager.test_on_batch(
+          self, x, y, sample_weights=sample_weights)
+    else:
+      inputs = x + y + sample_weights
+      if reset_metrics:
+        self._make_test_function()
+        outputs = self.test_function(inputs)  # pylint: disable=not-callable
+      else:
+        self._make_eval_function()
+        outputs = self._eval_function(inputs)  # pylint: disable=not-callable
 
-    Also builds and compiles the model on the fly if it is a subclassed model
-    that has never been called before (and thus has no inputs/outputs).
+    if reset_metrics:
+      self.reset_metrics()
 
-    This is a purely internal method, subject to refactoring at any time.
+    if len(outputs) == 1:
+      return outputs[0]
+    return outputs
 
-    Args:
-      x: Input data. It could be:
-        - A Numpy array (or array-like), or a list of arrays
-          (in case the model has multiple inputs).
-        - A TensorFlow tensor, or a list of tensors
-          (in case the model has multiple inputs).
-        - A dict mapping input names to the corresponding array/tensors,
-          if the model has named inputs.
-        - A `tf.data` dataset or a dataset iterator.
-      y: Target data. Like the input data `x`,
-        it could be either Numpy array(s) or TensorFlow tensor(s).
-        It should be consistent with `x` (you cannot have Numpy inputs and
-        tensor targets, or inversely). If `x` is a dataset or a
-        dataset iterator, `y` should not be specified
-        (since targets will be obtained from the iterator).
-      sample_weight: An optional sample-weight array passed by the user to
-        weight the importance of each sample in `x`.
-      class_weight: An optional class-weight array by the user to
-        weight the importance of samples in `x` based on the class they belong
-        to, as conveyed by `y`.
-      batch_size: Integer batch size. If provided, it is used to run additional
-        validation checks on stateful models.
-      check_steps: boolean, True if we want to check for validity of `steps` and
-        False, otherwise. For example, when we are standardizing one batch of
-        data for train_on_batch/predict_on_batch/test_on_batch APIs, `steps`
-        value is not required and we should not check for its validity in these
-        cases.
-      steps_name: The public API's parameter name for `steps`.
-      steps: Integer or `None`. Total number of steps (batches of samples) to
-        execute.
-      validation_split: Float between 0 and 1.
-        Fraction of the training data to be used as validation data.
-      shuffle: Boolean whether to shuffle the training data before each epoch.
+  def predict_on_batch(self, x):
+    """Returns predictions for a single batch of samples.
+
+    Arguments:
+        x: Input data. It could be:
+          - A Numpy array (or array-like), or a list of arrays
+            (in case the model has multiple inputs).
+          - A TensorFlow tensor, or a list of tensors
+            (in case the model has multiple inputs).
+          - A `tf.data` dataset or a dataset iterator.
 
     Returns:
-      A tuple of 3: inputs (arrays or dicts, depending on whether `x` was a dict
-      or not), target arrays, sample-weight arrays.
-      If the model's input and targets are symbolic, these lists are empty
-      (since the model takes no user-provided data, instead the data comes
-      from the symbolic inputs/targets).
+        Numpy array(s) of predictions.
 
     Raises:
-      ValueError: In case of invalid user-provided data.
-      RuntimeError: If the model was never compiled.
+        ValueError: In case of mismatch between given number of inputs and
+          expectations of the model.
     """
     if self._distribution_strategy:
-      iterator = self._distribution_standardize_user_data(
-          x,
-          y,
-          sample_weight=sample_weight,
-          class_weight=class_weight,
-          batch_size=batch_size,
-          check_steps=check_steps,
-          steps_name=steps_name,
-          steps=steps,
-          validation_split=validation_split,
-          shuffle=shuffle)
-      return iterator, None, None
-
-    if isinstance(x, dataset_ops.DatasetV2):
-      if context.executing_eagerly():
-        x = x.make_one_shot_iterator()
-      else:
-        if x in self._dataset_iterator_cache:
-          x = self._dataset_iterator_cache[x]
-        else:
-          iterator = x.make_initializable_iterator()
-          self._dataset_iterator_cache[x] = iterator
-          x = iterator
-        K.get_session().run(x.initializer)
+      raise NotImplementedError('`predict_on_batch` is not supported for '
+                                'models compiled with DistributionStrategy.')
+    # Validate and standardize user data.
+    inputs, _, _ = self._standardize_user_data(x)
+    if self.run_eagerly:
+      if (isinstance(inputs, iterator_ops.EagerIterator) or
+          (isinstance(inputs, dataset_ops.DatasetV2))):
+        inputs = training_utils.cast_if_floating_dtype(inputs)
+      elif isinstance(inputs, collections.Sequence):
+        inputs = [
+            ops.convert_to_tensor(val, dtype=K.floatx()) for val in inputs]
 
-    # Validates `steps` argument based on x's type.
-    if check_steps:
-      training_utils.check_steps_argument(x, steps, steps_name)
+        # Unwrap lists with only one input, as we do when training on batch
+        if len(inputs) == 1:
+          inputs = inputs[0]
 
-    is_x_eager_iterator = isinstance(x, iterator_ops.EagerIterator)
-    is_x_iterator = isinstance(x, iterator_ops.Iterator)
+      return self(inputs)  # pylint: disable=not-callable
 
-    # Validate user inputs when data is given as a dataset or dataset iterator.
-    if is_x_iterator or is_x_eager_iterator:
-      training_utils.validate_iterator_input(x, y, sample_weight,
-                                             validation_split)
+    self._make_predict_function()
+    outputs = self.predict_function(inputs)
 
-    # For eager iterators, when we have to process multiple batches of samples,
-    # we will standardize the data when we actually loop over iterator and get
-    # the batches. For now, we just return the iterator as is.
-    if is_x_eager_iterator:
-      return x, y, sample_weight
+    if len(outputs) == 1:
+      return outputs[0]
+    return outputs
 
-    # If input data is a dataset iterator in graph mode or if it is an eager
-    # iterator and only one batch of samples is required, we fetch the data
-    # tensors from the iterator and then standardize them.
-    if is_x_iterator or is_x_eager_iterator:
-      try:
-        if is_x_iterator:
-          next_element = self._get_iterator_get_next_tensors(x)
-        else:
-          next_element = x.get_next()
-      except errors.OutOfRangeError:
-        raise RuntimeError('Your dataset iterator ran out of data; '
-                           'Make sure that your dataset can generate '
-                           'required number of samples.')
+  def fit_generator(self,
+                    generator,
+                    steps_per_epoch=None,
+                    epochs=1,
+                    verbose=1,
+                    callbacks=None,
+                    validation_data=None,
+                    validation_steps=None,
+                    class_weight=None,
+                    max_queue_size=10,
+                    workers=1,
+                    use_multiprocessing=False,
+                    shuffle=True,
+                    initial_epoch=0):
+    """Fits the model on data yielded batch-by-batch by a Python generator.
 
-      if isinstance(next_element, (list, tuple)):
-        if len(next_element) not in [2, 3]:
-          raise ValueError(
-              'Please provide model inputs as a list or tuple of 2  or 3'
-              'elements: (input, target) or (input, target, sample_weights)'
-              'Received %s' % next_element)
-        if len(next_element) == 2:
-          x, y = next_element
-        else:
-          x, y, sample_weight = next_element
-      else:
-        x = next_element
-    x, y, sample_weights = self._standardize_weights(x, y, sample_weight,
-                                                     class_weight, batch_size)
-    return x, y, sample_weights
+    The generator is run in parallel to the model, for efficiency.
+    For instance, this allows you to do real-time data augmentation
+    on images on CPU in parallel to training your model on GPU.
 
-  def _standardize_weights(self, x, y, sample_weight=None, class_weight=None,
-                           batch_size=None,):
-    # TODO(sourabhbajaj): Split input validation from weight standardization.
-    if sample_weight is not None and class_weight is not None:
-      logging.warning(
-          'Received both a `sample_weight` and `class_weight` argument. '
-          'The `class_weight` argument will be ignored.')
-    # First, we build/compile the model on the fly if necessary.
-    all_inputs = []
-    is_build_called = False
-    is_compile_called = False
-    # Whether this is a subclassed model that expects dictionary inputs
-    # rather than list inputs (e.g. FeatureColumn-based models).
-    dict_inputs = False
-    if not self.inputs:
-      # We need to use `x` to set the model inputs.
-      # We type-check that `x` and `y` are either single arrays
-      # or lists of arrays.
-      if isinstance(x, (list, tuple)):
-        if not all(isinstance(v, np.ndarray) or
-                   tensor_util.is_tensor(v) for v in x):
-          raise ValueError('Please provide as model inputs either a single '
-                           'array or a list of arrays. You passed: x=' + str(x))
-        all_inputs += list(x)
-      elif isinstance(x, dict):
-        dict_inputs = True
-        keys = sorted(x.keys())
-        all_inputs = [x[k] for k in keys]
-      else:
-        if not isinstance(x, np.ndarray) and not tensor_util.is_tensor(x):
-          raise ValueError('Please provide as model inputs either a single '
-                           'array or a list of arrays. You passed: x=' + str(x))
-        all_inputs.append(x)
+    The use of `keras.utils.Sequence` guarantees the ordering
+    and guarantees the single use of every input per epoch when
+    using `use_multiprocessing=True`.
 
-      # Build the model using the retrieved inputs (value or symbolic).
-      # If values, then in symbolic-mode placeholders will be created
-      # to match the value shapes.
-      if not self.inputs:
-        is_build_called = True
-        cast_inputs = x
-        if training_utils.has_tensors(x):
-          cast_inputs = training_utils.cast_if_floating_dtype(x)
-        self._set_inputs(cast_inputs)
-    else:
-      dict_inputs = isinstance(self.inputs, dict)
-    if dict_inputs and context.executing_eagerly():
-      # No support for graph functions when the model expects dictionary inputs
-      # (i.e. FeatureColumn-based models).
-      self.run_eagerly = True
+    Arguments:
+        generator: A generator or an instance of `Sequence`
+          (`keras.utils.Sequence`)
+            object in order to avoid duplicate data
+            when using multiprocessing.
+            The output of the generator must be either
+            - a tuple `(inputs, targets)`
+            - a tuple `(inputs, targets, sample_weights)`.
+            This tuple (a single output of the generator) makes a single batch.
+            Therefore, all arrays in this tuple must have the same length (equal
+            to the size of this batch). Different batches may have different
+              sizes.
+            For example, the last batch of the epoch is commonly smaller than
+              the
+            others, if the size of the dataset is not divisible by the batch
+              size.
+            The generator is expected to loop over its data
+            indefinitely. An epoch finishes when `steps_per_epoch`
+            batches have been seen by the model.
+        steps_per_epoch: Total number of steps (batches of samples)
+            to yield from `generator` before declaring one epoch
+            finished and starting the next epoch. It should typically
+            be equal to the number of samples of your dataset
+            divided by the batch size.
+            Optional for `Sequence`: if unspecified, will use
+            the `len(generator)` as a number of steps.
+        epochs: Integer, total number of iterations on the data.
+        verbose: Verbosity mode, 0, 1, or 2.
+        callbacks: List of callbacks to be called during training.
+        validation_data: This can be either
+            - a generator for the validation data
+            - a tuple (inputs, targets)
+            - a tuple (inputs, targets, sample_weights).
+        validation_steps: Only relevant if `validation_data`
+            is a generator. Total number of steps (batches of samples)
+            to yield from `generator` before stopping.
+            Optional for `Sequence`: if unspecified, will use
+            the `len(validation_data)` as a number of steps.
+        class_weight: Dictionary mapping class indices to a weight
+            for the class.
+        max_queue_size: Integer. Maximum size for the generator queue.
+            If unspecified, `max_queue_size` will default to 10.
+        workers: Integer. Maximum number of processes to spin up
+            when using process-based threading.
+            If unspecified, `workers` will default to 1. If 0, will
+            execute the generator on the main thread.
+        use_multiprocessing: Boolean.
+            If `True`, use process-based threading.
+            If unspecified, `use_multiprocessing` will default to `False`.
+            Note that because this implementation relies on multiprocessing,
+            you should not pass non-picklable arguments to the generator
+            as they can't be passed easily to children processes.
+        shuffle: Boolean. Whether to shuffle the order of the batches at
+            the beginning of each epoch. Only used with instances
+            of `Sequence` (`keras.utils.Sequence`).
+            Has no effect when `steps_per_epoch` is not `None`.
+        initial_epoch: Epoch at which to start training
+            (useful for resuming a previous training run)
 
-    if y is not None:
-      if not self.optimizer:
-        raise RuntimeError('You must compile a model before '
-                           'training/testing. '
-                           'Use `model.compile(optimizer, loss)`.')
-      if not self._is_compiled:
-        # On-the-fly compilation of the model.
-        # We need to use `y` to set the model targets.
-        if training_utils.has_tensors(y):
-          y = training_utils.cast_if_floating_dtype(y)
-        if isinstance(y, (list, tuple)):
-          if not all(isinstance(v, np.ndarray) or
-                     tensor_util.is_tensor(v) for v in y):
-            raise ValueError('Please provide as model targets either a single '
-                             'array or a list of arrays. '
-                             'You passed: y=' + str(y))
-          all_inputs += list(y)
-        elif isinstance(y, dict):
-          raise ValueError('Please do not pass a dictionary as model targets.')
-        else:
-          if not isinstance(y, np.ndarray) and not tensor_util.is_tensor(y):
-            raise ValueError('Please provide as model targets either a single '
-                             'array or a list of arrays. '
-                             'You passed: y=' + str(y))
-          all_inputs.append(y)
+    Returns:
+        A `History` object.
 
-        # Typecheck that all inputs are *either* value *or* symbolic.
-        # TODO(fchollet): this check could be removed in Eager mode?
-        if any(tensor_util.is_tensor(v) for v in all_inputs):
-          if not all(tensor_util.is_tensor(v) for v in all_inputs):
-            raise ValueError('Do not pass inputs that mix Numpy arrays and '
-                             'TensorFlow tensors. '
-                             'You passed: x=' + str(x) + '; y=' + str(y))
+    Example:
 
-        if self.run_eagerly:
-          target_tensors = None
-        else:
-          # Handle target tensors if any passed.
-          if not isinstance(y, (list, tuple)):
-            y = [y]
-          target_tensors = [v for v in y if _is_symbolic_tensor(v)]
-        is_compile_called = True
-        self.compile(
-            optimizer=self.optimizer,
-            loss=self.loss,
-            metrics=self._compile_metrics,
-            weighted_metrics=self._compile_weighted_metrics,
-            loss_weights=self.loss_weights,
-            target_tensors=target_tensors,
-            run_eagerly=self.run_eagerly)
+    ```python
+        def generate_arrays_from_file(path):
+            while 1:
+                f = open(path)
+                for line in f:
+                    # create numpy arrays of input data
+                    # and labels, from each line in the file
+                    x1, x2, y = process_line(line)
+                    yield ({'input_1': x1, 'input_2': x2}, {'output': y})
+                f.close()
 
-    # In graph mode, if we had just set inputs and targets as symbolic tensors
-    # by invoking build and compile on the model respectively, we do not have to
-    # feed anything to the model. Model already has input and target data as
-    # part of the graph.
-    # Note: in this case, `any` and `all` are equivalent since we disallow
-    # mixed symbolic/value inputs.
-    if (not self.run_eagerly and is_build_called and
-        is_compile_called and
-        any(_is_symbolic_tensor(v) for v in all_inputs)):
-      return [], [], []
+        model.fit_generator(generate_arrays_from_file('/my_file.txt'),
+                            steps_per_epoch=10000, epochs=10)
+    ```
+    Raises:
+        ValueError: In case the generator yields data in an invalid format.
+    """
+    if self._distribution_strategy:
+      raise NotImplementedError('`fit_generator` is not supported for '
+                                'models compiled with DistributionStrategy.')
+    return training_generator.fit_generator(
+        self,
+        generator,
+        steps_per_epoch=steps_per_epoch,
+        epochs=epochs,
+        verbose=verbose,
+        callbacks=callbacks,
+        validation_data=validation_data,
+        validation_steps=validation_steps,
+        class_weight=class_weight,
+        max_queue_size=max_queue_size,
+        workers=workers,
+        use_multiprocessing=use_multiprocessing,
+        shuffle=shuffle,
+        initial_epoch=initial_epoch)
 
-    # What follows is input validation and standardization to list format,
-    # in the case where all inputs are value arrays.
+  def evaluate_generator(self,
+                         generator,
+                         steps=None,
+                         callbacks=None,
+                         max_queue_size=10,
+                         workers=1,
+                         use_multiprocessing=False,
+                         verbose=0):
+    """Evaluates the model on a data generator.
 
-    if self.run_eagerly:
-      # In eager mode, do not do shape validation
-      # since the network has no input nodes (placeholders) to be fed.
-      feed_input_names = self.input_names
-      feed_input_shapes = None
-    elif not self._is_graph_network:
-      # Case: symbolic-mode subclassed network. Do not do shape validation.
-      feed_input_names = self._feed_input_names
-      feed_input_shapes = None
-    else:
-      # Case: symbolic-mode graph network.
-      # In this case, we run extensive shape validation checks.
-      feed_input_names = self._feed_input_names
-      feed_input_shapes = self._feed_input_shapes
+    The generator should return the same kind of data
+    as accepted by `test_on_batch`.
 
-    # Standardize the inputs.
-    x = training_utils.standardize_input_data(
-        x,
-        feed_input_names,
-        feed_input_shapes,
-        check_batch_axis=False,  # Don't enforce the batch size.
-        exception_prefix='input')
+    Arguments:
+        generator: Generator yielding tuples (inputs, targets)
+            or (inputs, targets, sample_weights)
+            or an instance of `keras.utils.Sequence`
+            object in order to avoid duplicate data
+            when using multiprocessing.
+        steps: Total number of steps (batches of samples)
+            to yield from `generator` before stopping.
+            Optional for `Sequence`: if unspecified, will use
+            the `len(generator)` as a number of steps.
+        callbacks: List of `keras.callbacks.Callback` instances.
+            List of callbacks to apply during evaluation.
+            See [callbacks](/api_docs/python/tf/keras/callbacks).
+        max_queue_size: maximum size for the generator queue
+        workers: Integer. Maximum number of processes to spin up
+            when using process-based threading.
+            If unspecified, `workers` will default to 1. If 0, will
+            execute the generator on the main thread.
+        use_multiprocessing: Boolean.
+            If `True`, use process-based threading.
+            If unspecified, `use_multiprocessing` will default to `False`.
+            Note that because this implementation relies on multiprocessing,
+            you should not pass non-picklable arguments to the generator
+            as they can't be passed easily to children processes.
+        verbose: Verbosity mode, 0 or 1.
 
-    if y is not None:
-      if not self._is_graph_network:
-        feed_output_names = self._feed_output_names
-        feed_output_shapes = None
-        # Sample weighting not supported in this case.
-        # TODO(fchollet): consider supporting it.
-        feed_sample_weight_modes = [None for _ in self.outputs]
-      else:
-        feed_output_names = self._feed_output_names
-        feed_sample_weight_modes = self._feed_sample_weight_modes
-        feed_output_shapes = []
-        for output_shape, loss_fn in zip(self._feed_output_shapes,
-                                         self._feed_loss_fns):
-          if loss_fn is losses.sparse_categorical_crossentropy:
-            if K.image_data_format() == 'channels_first':
-              feed_output_shapes.append(
-                  (output_shape[0], 1) + output_shape[2:])
-            else:
-              feed_output_shapes.append(output_shape[:-1] + (1,))
-          elif (not hasattr(loss_fn, '__name__') or
-                getattr(losses, loss_fn.__name__, None) is None):
-            # If `loss_fn` is not a function (e.g. callable class)
-            # or if it not in the `losses` module, then
-            # it is a user-defined loss and we make no assumptions
-            # about it.
-            feed_output_shapes.append(None)
-          else:
-            feed_output_shapes.append(output_shape)
+    Returns:
+        Scalar test loss (if the model has a single output and no metrics)
+        or list of scalars (if the model has multiple outputs
+        and/or metrics). The attribute `model.metrics_names` will give you
+        the display labels for the scalar outputs.
 
-      # Standardize the outputs.
-      y = training_utils.standardize_input_data(
-          y,
-          feed_output_names,
-          # Don't enforce target shapes to match output shapes.
-          # Precise checks will be run in `check_loss_and_target_compatibility`.
-          shapes=None,
-          check_batch_axis=False,  # Don't enforce the batch size.
-          exception_prefix='target')
+    Raises:
+        ValueError: in case of invalid arguments.
 
-      # Generate sample-wise weight values given the `sample_weight` and
-      # `class_weight` arguments.
-      sample_weights = training_utils.standardize_sample_weights(
-          sample_weight, feed_output_names)
-      class_weights = training_utils.standardize_class_weights(
-          class_weight, feed_output_names)
-      sample_weights = [
-          training_utils.standardize_weights(ref, sw, cw, mode)
-          for (ref, sw, cw, mode) in zip(y, sample_weights, class_weights,
-                                         feed_sample_weight_modes)
-      ]
-      # Check that all arrays have the same length.
-      if not self._distribution_strategy:
-        training_utils.check_array_lengths(x, y, sample_weights)
-        if self._is_graph_network and not self.run_eagerly:
-          # Additional checks to avoid users mistakenly using improper loss fns.
-          training_utils.check_loss_and_target_compatibility(
-              y, self._feed_loss_fns, feed_output_shapes)
-    else:
-      y = []
-      sample_weights = []
+    Raises:
+        ValueError: In case the generator yields data in an invalid format.
+    """
+    if self._distribution_strategy:
+      raise NotImplementedError('`evaluate_generator` is not supported for '
+                                'models compiled with DistributionStrategy.')
+    return training_generator.evaluate_generator(
+        self,
+        generator,
+        steps=steps,
+        max_queue_size=max_queue_size,
+        workers=workers,
+        use_multiprocessing=use_multiprocessing,
+        verbose=verbose,
+        callbacks=callbacks)
 
-    if self.stateful and batch_size:
-      # Check that for stateful networks, number of samples is a multiple
-      # of the static batch size.
-      if x[0].shape[0] % batch_size != 0:
-        raise ValueError('In a stateful network, '
-                         'you should only pass inputs with '
-                         'a number of samples that can be '
-                         'divided by the batch size. Found: ' +
-                         str(x[0].shape[0]) + ' samples')
+  def predict_generator(self,
+                        generator,
+                        steps=None,
+                        callbacks=None,
+                        max_queue_size=10,
+                        workers=1,
+                        use_multiprocessing=False,
+                        verbose=0):
+    """Generates predictions for the input samples from a data generator.
 
-    # If dictionary inputs were provided, we return a dictionary as well.
-    if dict_inputs:
-      x = dict(zip(feed_input_names, x))
-    return x, y, sample_weights
+    The generator should return the same kind of data as accepted by
+    `predict_on_batch`.
 
-  @checkpointable.no_automatic_dependency_tracking
-  def _set_inputs(self, inputs, outputs=None, training=None):
-    """Set model's input and output specs based on the input data received.
+    Arguments:
+        generator: Generator yielding batches of input samples
+            or an instance of `keras.utils.Sequence` object in order to
+            avoid duplicate data when using multiprocessing.
+        steps: Total number of steps (batches of samples)
+            to yield from `generator` before stopping.
+            Optional for `Sequence`: if unspecified, will use
+            the `len(generator)` as a number of steps.
+        callbacks: List of `keras.callbacks.Callback` instances.
+            List of callbacks to apply during prediction.
+            See [callbacks](/api_docs/python/tf/keras/callbacks).
+        max_queue_size: Maximum size for the generator queue.
+        workers: Integer. Maximum number of processes to spin up
+            when using process-based threading.
+            If unspecified, `workers` will default to 1. If 0, will
+            execute the generator on the main thread.
+        use_multiprocessing: Boolean.
+            If `True`, use process-based threading.
+            If unspecified, `use_multiprocessing` will default to `False`.
+            Note that because this implementation relies on multiprocessing,
+            you should not pass non-picklable arguments to the generator
+            as they can't be passed easily to children processes.
+        verbose: verbosity mode, 0 or 1.
 
-    This is to be used for Model subclasses, which do not know at instantiation
-    time what their inputs look like.
+    Returns:
+        Numpy array(s) of predictions.
 
-    Args:
-      inputs: Single array, or list of arrays. The arrays could be placeholders,
-        Numpy arrays, or data tensors.
-        - if placeholders: the model is built on top of these placeholders,
-          and we expect Numpy data to be fed for them when calling `fit`/etc.
-        - if Numpy data: we create placeholders matching the shape of the Numpy
-          arrays. We expect Numpy data to be fed for these placeholders
-          when calling `fit`/etc.
-        - if data tensors: the model is built on top of these tensors.
-          We do not expect any Numpy data to be provided when calling `fit`/etc.
-      outputs: None, a data tensor, or a list of tensors. If None, the
-        outputs will be determined by invoking `self.call()`, otherwise the
-        provided value will be used.
-      training: Boolean or None. Only relevant in symbolic mode. Specifies
-        whether to build the model's graph in inference mode (False), training
-        mode (True), or using the Keras learning phase (None).
     Raises:
-      ValueError: If dict inputs are passed to a Sequential Model where the
-        first layer isn't FeatureLayer.
+        ValueError: In case the generator yields data in an invalid format.
     """
-    if self.inputs:
-      raise ValueError('Model inputs are already set.')
+    if self._distribution_strategy:
+      raise NotImplementedError('`predict_generator` is not supported for '
+                                'models compiled with DistributionStrategy.')
+    return training_generator.predict_generator(
+        self,
+        generator,
+        steps=steps,
+        max_queue_size=max_queue_size,
+        workers=workers,
+        use_multiprocessing=use_multiprocessing,
+        verbose=verbose,
+        callbacks=callbacks)
 
-    if self.__class__.__name__ == 'Sequential' and not self.built:
-      if tensor_util.is_tensor(inputs):
-        input_shape = (None,) + tuple(inputs.shape.as_list()[1:])
-      elif isinstance(inputs, dict):
-        # We assert that the first layer is a FeatureLayer.
-        if not training_utils.is_feature_layer(self.layers[0]):
-          raise ValueError('Passing a dictionary input to a Sequential Model '
-                           'which doesn\'t have FeatureLayer as the first layer'
-                           ' is an error.')
-        input_shape = (None,)
-      else:
-        input_shape = (None,) + tuple(inputs.shape[1:])
-      self._build_input_shape = input_shape
+  def _get_callback_model(self):
+    """Returns the Callback Model for this Model."""
 
-    # On-the-fly setting of symbolic model inputs (either by using the tensor
-    # provided, or by creating a placeholder if Numpy data was provided).
-    model_inputs = training_utils.ModelInputs(inputs)
-    inputs = model_inputs.get_symbolic_inputs()
-    self.inputs = model_inputs.get_symbolic_inputs(return_single_as_list=True)
-    self.input_names = model_inputs.get_input_names()
+    if hasattr(self, '_replicated_model') and self._replicated_model:
+      # When using training_distributed, we set the callback model
+      # to an instance of the `DistributedModel` that we create in
+      # the `compile` call. The `DistributedModel` is initialized
+      # with the first replicated model. We need to set the callback
+      # model to a DistributedModel to allow us to override saving
+      # and loading weights when we checkpoint the model during training.
+      return self._replicated_model
+    if hasattr(self, 'callback_model') and self.callback_model:
+      return self.callback_model
+    return self
 
-    self._feed_inputs = []
-    self._feed_input_names = []
-    self._feed_input_shapes = []
+  def _make_callback_model(self, grouped_model):
+    first_replicated_model = self._distribution_strategy.unwrap(
+        grouped_model)[0]
+    # We initialize the callback model with the first replicated model.
+    self._replicated_model = DistributedCallbackModel(first_replicated_model)
+    self._replicated_model.set_original_model(self)
 
-    for k, v in model_inputs.as_dict():
-      if K.is_placeholder(v):
-        self._feed_inputs.append(v)
-        self._feed_input_names.append(k)
-        self._feed_input_shapes.append(K.int_shape(v))
+  def _validate_or_infer_batch_size(self, batch_size, steps, x):
+    """Validates that the `batch_size` provided is consistent with InputLayer.
 
-    # TODO(fchollet): consider calling `_maybe_build` before calling the model.
+    It's possible that the user specified a static batch size in their
+    InputLayer. If so, this method checks the provided `batch_size` and `x`
+    arguments are consistent with this static batch size. Also, if
+    `batch_size` is `None`, this method will attempt to infer the batch size
+    from the static batch size of the InputLayer.
 
-    if outputs is None:
-      # Obtain symbolic outputs by calling the model.
-      with K.get_graph().as_default():
-        if self._expects_training_arg:
-          outputs = self.call(inputs, training=training)
-        else:
-          outputs = self.call(inputs)
+    Arguments:
+      batch_size: The batch_size provided as an argument to
+        fit/evaluate/predict.
+      steps: The steps provided as an argument to fit/evaluate/predict.
+      x: The data passed as `x` to fit/evaluate/predict.
 
-    outputs = nest.flatten(outputs)
-    self.outputs = outputs
-    self.output_names = [
-        'output_%d' % (i + 1) for i in range(len(self.outputs))]
-    self.built = True
+    Returns:
+      The validated batch_size, auto-inferred from the first layer if not
+      provided.
+    """
+    layers = super(Model, self).layers  # Avoids the override in Sequential.
+    if layers:
+      first_layer = layers[0]
+      static_batch_size = training_utils.get_static_batch_size(first_layer)
+      if static_batch_size is not None:
+
+        # Check `batch_size` argument is consistent with InputLayer.
+        if batch_size is not None and batch_size != static_batch_size:
+          raise ValueError('The `batch_size` argument value {} is incompatible '
+                           'with the specified batch size of your Input Layer: '
+                           '{}'.format(batch_size, static_batch_size))
+
+        # Check Dataset/Iterator batch size is consistent with InputLayer.
+        if isinstance(x, (dataset_ops.DatasetV2, iterator_ops.Iterator,
+                          iterator_ops.EagerIterator)):
+          ds_batch_size = tensor_shape.as_dimension(
+              nest.flatten(x.output_shapes)[0][0]).value
+          if ds_batch_size is not None and ds_batch_size != static_batch_size:
+            raise ValueError('The batch output shape of your `Dataset` is {}, '
+                             'which is incompatible with the specified batch '
+                             'size of your Input Layer: {}'.format(
+                                 ds_batch_size, static_batch_size))
+
+        # Set inferred batch size from the InputLayer.
+        if steps is None:
+          batch_size = static_batch_size
 
-  def fit(self,
-          x=None,
-          y=None,
-          batch_size=None,
-          epochs=1,
-          verbose=1,
-          callbacks=None,
-          validation_split=0.,
-          validation_data=None,
-          shuffle=True,
-          class_weight=None,
-          sample_weight=None,
-          initial_epoch=0,
-          steps_per_epoch=None,
-          validation_steps=None,
-          max_queue_size=10,
-          workers=1,
-          use_multiprocessing=False,
-          **kwargs):
-    """Trains the model for a fixed number of epochs (iterations on a dataset).
+    if batch_size is None and steps is None:
+      # Backwards compatibility
+      batch_size = 32
+    return batch_size
 
-    Arguments:
-        x: Input data. It could be:
-          - A Numpy array (or array-like), or a list of arrays
-            (in case the model has multiple inputs).
-          - A TensorFlow tensor, or a list of tensors
-            (in case the model has multiple inputs).
-          - A dict mapping input names to the corresponding array/tensors,
-            if the model has named inputs.
-          - A `tf.data` dataset or a dataset iterator. Should return a tuple
-            of either `(inputs, targets)` or
-            `(inputs, targets, sample_weights)`.
-          - A generator or `keras.utils.Sequence` returning `(inputs, targets)`
-            or `(inputs, targets, sample weights)`.
-        y: Target data. Like the input data `x`,
-          it could be either Numpy array(s) or TensorFlow tensor(s).
-          It should be consistent with `x` (you cannot have Numpy inputs and
-          tensor targets, or inversely). If `x` is a dataset, dataset
-          iterator, generator, or `keras.utils.Sequence` instance, `y` should
-          not be specified (since targets will be obtained from `x`).
-        batch_size: Integer or `None`.
-            Number of samples per gradient update.
-            If unspecified, `batch_size` will default to 32.
-            Do not specify the `batch_size` if your data is in the
-            form of symbolic tensors, dataset, dataset iterators,
-            generators, or `keras.utils.Sequence` instances (since they generate
-            batches).
-        epochs: Integer. Number of epochs to train the model.
-            An epoch is an iteration over the entire `x` and `y`
-            data provided.
-            Note that in conjunction with `initial_epoch`,
-            `epochs` is to be understood as "final epoch".
-            The model is not trained for a number of iterations
-            given by `epochs`, but merely until the epoch
-            of index `epochs` is reached.
-        verbose: Integer. 0, 1, or 2. Verbosity mode.
-            0 = silent, 1 = progress bar, 2 = one line per epoch.
-        callbacks: List of `keras.callbacks.Callback` instances.
-            List of callbacks to apply during training.
-            See [callbacks](/api_docs/python/tf/keras/callbacks).
-        validation_split: Float between 0 and 1.
-            Fraction of the training data to be used as validation data.
-            The model will set apart this fraction of the training data,
-            will not train on it, and will evaluate
-            the loss and any model metrics
-            on this data at the end of each epoch.
-            The validation data is selected from the last samples
-            in the `x` and `y` data provided, before shuffling. This argument is
-            not supported when `x` is a dataset, dataset iterator, generator or
-           `keras.utils.Sequence` instance.
-        validation_data: Data on which to evaluate
-            the loss and any model metrics at the end of each epoch.
-            The model will not be trained on this data.
-            `validation_data` will override `validation_split`.
-            `validation_data` could be:
-              - tuple `(x_val, y_val)` of Numpy arrays or tensors
-              - tuple `(x_val, y_val, val_sample_weights)` of Numpy arrays
-              - dataset or a dataset iterator
-            For the first two cases, `batch_size` must be provided.
-            For the last case, `validation_steps` must be provided.
-        shuffle: Boolean (whether to shuffle the training data
-            before each epoch) or str (for 'batch').
-            'batch' is a special option for dealing with the
-            limitations of HDF5 data; it shuffles in batch-sized chunks.
-            Has no effect when `steps_per_epoch` is not `None`.
-        class_weight: Optional dictionary mapping class indices (integers)
-            to a weight (float) value, used for weighting the loss function
-            (during training only).
-            This can be useful to tell the model to
-            "pay more attention" to samples from
-            an under-represented class.
-        sample_weight: Optional Numpy array of weights for
-            the training samples, used for weighting the loss function
-            (during training only). You can either pass a flat (1D)
-            Numpy array with the same length as the input samples
-            (1:1 mapping between weights and samples),
-            or in the case of temporal data,
-            you can pass a 2D array with shape
-            `(samples, sequence_length)`,
-            to apply a different weight to every timestep of every sample.
-            In this case you should make sure to specify
-            `sample_weight_mode="temporal"` in `compile()`. This argument is not
-            supported when `x` is a dataset, dataset iterator, generator, or
-           `keras.utils.Sequence` instance, instead provide the sample_weights
-            as the third element of `x`.
-        initial_epoch: Integer.
-            Epoch at which to start training
-            (useful for resuming a previous training run).
-        steps_per_epoch: Integer or `None`.
-            Total number of steps (batches of samples)
-            before declaring one epoch finished and starting the
-            next epoch. When training with input tensors such as
-            TensorFlow data tensors, the default `None` is equal to
-            the number of samples in your dataset divided by
-            the batch size, or 1 if that cannot be determined.
-        validation_steps: Only relevant if `validation_data` is provided and
-            is a dataset or dataset iterator. Total number of steps (batches of
-            samples) to draw before stopping when performing validation
-            at the end of every epoch.
-        max_queue_size: Integer. Used for generator or `keras.utils.Sequence`
-            input only. Maximum size for the generator queue.
-            If unspecified, `max_queue_size` will default to 10.
-        workers: Integer. Used for generator or `keras.utils.Sequence` input
-            only. Maximum number of processes to spin up
-            when using process-based threading. If unspecified, `workers`
-            will default to 1. If 0, will execute the generator on the main
-            thread.
-        use_multiprocessing: Boolean. Used for generator or
-            `keras.utils.Sequence` input only. If `True`, use process-based
-            threading. If unspecified, `use_multiprocessing` will default to
-            `False`. Note that because this implementation relies on
-            multiprocessing, you should not pass non-picklable arguments to
-            the generator as they can't be passed easily to children processes.
-        **kwargs: Used for backwards compatibility.
+  @property
+  def _default_save_signature(self):
+    return training_utils.trace_model_call(self)
+
+  def _set_sample_weight_attributes(self, sample_weight_mode,
+                                    skip_target_weighing_indices):
+    """Sets sample weight related attributes on the model."""
+    sample_weights, sample_weight_modes = training_utils.prepare_sample_weights(
+        self.output_names, sample_weight_mode, skip_target_weighing_indices)
+    self.sample_weights = sample_weights
+    self.sample_weight_modes = sample_weight_modes
+    self._feed_sample_weight_modes = [
+        sample_weight_modes[i]
+        for i in range(len(self.outputs))
+        if i not in skip_target_weighing_indices
+    ]
+    self._feed_sample_weights = [
+        sample_weights[i]
+        for i in range(len(sample_weights))
+        if i not in skip_target_weighing_indices
+    ]
 
-    Returns:
-        A `History` object. Its `History.history` attribute is
-        a record of training loss values and metrics values
-        at successive epochs, as well as validation loss values
-        and validation metrics values (if applicable).
+  def _cache_output_metric_attributes(self, metrics, weighted_metrics):
+    """Caches metric name and function attributes for every model output."""
+    output_shapes = []
+    for output in self.outputs:
+      if output is None or output.shape.rank is None:
+        output_shapes.append(None)
+      else:
+        output_shapes.append(output.shape.as_list())
+    self._per_output_metrics = training_utils.collect_per_output_metric_info(
+        metrics, self.output_names, output_shapes, self.loss_functions)
+    self._per_output_weighted_metrics = \
+        training_utils.collect_per_output_metric_info(
+            weighted_metrics, self.output_names, output_shapes,
+            self.loss_functions, self.sample_weights)
 
-    Raises:
-        RuntimeError: If the model was never compiled.
-        ValueError: In case of mismatch between the provided input data
-            and what the model expects.
+  def _add_unique_metric_name(self, metric_name, output_index):
+    """Makes the metric name unique and adds it to the model's metric name list.
+
+      If there are multiple outputs for which the metrics are calculated, the
+      metric names have to be made unique by appending an integer.
+
+    Arguments:
+      metric_name: Metric name that corresponds to the metric specified by the
+          user. For example: 'acc'.
+      output_index: The index of the model output for which the metric name is
+        being added.
+
+    Returns:
+      string, name of the model's unique metric name
     """
-    # TODO(fchollet): this method may be creating reference cycles, which would
-    # lead to accumulating garbage in memory when called in a loop. Investigate.
-    if data_utils.is_generator_or_sequence(x):
-      training_utils.check_generator_arguments(y, sample_weight)
-      return self.fit_generator(
-          x,
-          steps_per_epoch=steps_per_epoch,
-          epochs=epochs,
-          verbose=verbose,
-          callbacks=callbacks,
-          validation_data=validation_data,
-          validation_steps=validation_steps,
-          class_weight=class_weight,
-          max_queue_size=max_queue_size,
-          workers=workers,
-          use_multiprocessing=use_multiprocessing,
-          shuffle=shuffle,
-          initial_epoch=initial_epoch)
+    if len(self.output_names) > 1:
+      metric_name = '%s_%s' % (self.output_names[output_index], metric_name)
+    j = 1
+    base_metric_name = metric_name
+    while metric_name in self._compile_metrics_names:
+      metric_name = '%s_%d' % (base_metric_name, j)
+      j += 1
 
-    # Legacy support
-    if 'nb_epoch' in kwargs:
-      logging.warning(
-          'The `nb_epoch` argument in `fit` '
-          'has been renamed `epochs`.')
-      epochs = kwargs.pop('nb_epoch')
-    if kwargs:
-      raise TypeError('Unrecognized keyword arguments: ' + str(kwargs))
+    return metric_name
 
-    # Validate and standardize user data.
-    if self._distribution_strategy:
-      distributed_training_utils.validate_callbacks(callbacks, self.optimizer,
-                                                    self._distribution_strategy)
+  @property
+  def _all_metrics_tensors(self):
+    """Returns the network's symbolic metric tensors."""
+    metrics_tensors = {}
+    if self._is_compiled:
+      metrics_tensors.update(self._compile_metrics_tensors)
+    metrics_tensors.update(super(Model, self)._all_metrics_tensors)
+    return metrics_tensors
 
-      distributed_training_utils.validate_inputs(
-          x, y, self._distribution_strategy)
+  @property
+  def _all_stateful_metrics_tensors(self):
+    """Returns the network's symbolic metric tensors."""
+    metrics_tensors = {}
+    if self._is_compiled:
+      metrics_tensors.update(self._compile_stateful_metrics_tensors)
+    metrics_tensors.update(super(Model, self)._all_metrics_tensors)
+    return metrics_tensors
 
-      first_x_value = nest.flatten(x)[0]
-      if isinstance(first_x_value, np.ndarray):
-        steps_per_epoch, batch_size = (
-            distributed_training_utils.get_input_params(
-                self._distribution_strategy, first_x_value, steps_per_epoch,
-                batch_size, is_training=True))
+  def _init_metric_attributes(self):
+    """Initialized model metric attributes."""
+    # List of all metric names in the model.
+    self._compile_metrics_names = ['loss']
+    # List of stateful metric functions. Used for resetting metric state during
+    # training/eval.
+    # This includes loss functions when there are multiple outputs.
+    self._compile_stateful_metric_functions = []
+    # Dict of all aggregated metric result tensors. This includes aggregated
+    # loss result tensors when there are multiple outputs.
+    self._compile_stateful_metrics_tensors = {}
+    # Dict of all metric result tensors (aggregated or not - based on the
+    # values given in compile.). This includes aggregated loss result tensors
+    # when there are multiple outputs.
+    self._compile_metrics_tensors = {}
 
-    # Backwards compatibility
-    if batch_size is None and steps_per_epoch is None:
-      batch_size = 32
+  def _set_per_output_metric_attributes(self, metrics_dict, output_index):
+    """Sets the metric attributes on the model for the given output.
 
-    x, y, sample_weights = self._standardize_user_data(
-        x,
-        y,
-        sample_weight=sample_weight,
-        class_weight=class_weight,
-        batch_size=batch_size,
-        check_steps=True,
-        steps_name='steps_per_epoch',
-        steps=steps_per_epoch,
-        validation_split=validation_split,
-        shuffle=shuffle)
+    Arguments:
+      metrics_dict: A dict with metric names as keys and metric fns as values.
+      output_index: The index of the model output for which the metric
+        attributes are added.
 
-    # Prepare validation data.
-    if validation_data:
-      if (isinstance(validation_data, iterator_ops.Iterator) or
-          isinstance(validation_data, iterator_ops.EagerIterator) or
-          isinstance(validation_data, dataset_ops.DatasetV2)):
-        val_x = validation_data
-        val_y = None
-        val_sample_weight = None
-      elif len(validation_data) == 2:
-        val_x, val_y = validation_data  # pylint: disable=unpacking-non-sequence
-        val_sample_weight = None
-      elif len(validation_data) == 3:
-        val_x, val_y, val_sample_weight = validation_data  # pylint: disable=unpacking-non-sequence
-      else:
-        raise ValueError(
-            'When passing a `validation_data` argument, '
-            'it must contain either 2 items (x_val, y_val), '
-            'or 3 items (x_val, y_val, val_sample_weights), '
-            'or alternatively it could be a dataset or a '
-            'dataset or a dataset iterator. '
-            'However we received `validation_data=%s`' % validation_data)
+    Returns:
+      Metrics dict updated with unique metric names as keys.
+    """
+    updated_metrics_dict = collections.OrderedDict()
+    for metric_name, (metric_fn, stateful_metric_fn) in metrics_dict.items():
+      metric_name = self._add_unique_metric_name(metric_name, output_index)
+      updated_metrics_dict[metric_name] = (metric_fn, stateful_metric_fn)
+      # Keep track of metric name, function and stateful function.
+      self._compile_metrics_names.append(metric_name)
+      self._compile_stateful_metric_functions.append(stateful_metric_fn)
+    return updated_metrics_dict
 
-      # Validate and standardize validation data.
-      if self._distribution_strategy:
-        distributed_training_utils.validate_inputs(
-            val_x, val_y, self._distribution_strategy)
-        first_valx_value = nest.flatten(val_x)[0]
-        if isinstance(first_valx_value, np.ndarray):
-          validation_steps, _ = distributed_training_utils.get_input_params(
-              self._distribution_strategy, first_valx_value, validation_steps,
-              batch_size)
+  def _set_metric_attributes(self, outputs, skip_target_indices=None):
+    """Sets the metric attributes on the model for all the model outputs."""
+    skip_target_indices = skip_target_indices or []
+    updated_per_output_metrics = []
+    updated_per_output_weighted_metrics = []
+    for i in range(len(outputs)):
+      if i in skip_target_indices:
+        updated_per_output_metrics.append(self._per_output_metrics[i])
+        updated_per_output_weighted_metrics.append(
+            self._per_output_weighted_metrics[i])
+        continue
+      updated_per_output_metrics.append(
+          self._set_per_output_metric_attributes(self._per_output_metrics[i],
+                                                 i))
+      updated_per_output_weighted_metrics.append(
+          self._set_per_output_metric_attributes(
+              self._per_output_weighted_metrics[i], i))
 
-      val_x, val_y, val_sample_weights = self._standardize_user_data(
-          val_x,
-          val_y,
-          sample_weight=val_sample_weight,
-          batch_size=batch_size,
-          steps=validation_steps)
+    self._per_output_metrics = updated_per_output_metrics
+    self._per_output_weighted_metrics = updated_per_output_weighted_metrics
 
-    elif validation_split and 0. < validation_split < 1.:
-      if training_utils.has_symbolic_tensors(x):
-        raise ValueError('If your data is in the form of symbolic tensors, '
-                         'you cannot use `validation_split`.')
-      if hasattr(x[0], 'shape'):
-        split_at = int(x[0].shape[0] * (1. - validation_split))
-      else:
-        split_at = int(len(x[0]) * (1. - validation_split))
-      x, val_x = (slice_arrays(x, 0, split_at), slice_arrays(x, split_at))
-      y, val_y = (slice_arrays(y, 0, split_at), slice_arrays(y, split_at))
-      sample_weights, val_sample_weights = (slice_arrays(
-          sample_weights, 0, split_at), slice_arrays(sample_weights, split_at))
-    elif validation_steps:
-      val_x = []
-      val_y = []
-      val_sample_weights = []
-    else:
-      val_x = None
-      val_y = None
-      val_sample_weights = None
+  def _handle_per_output_metrics(self,
+                                 metrics_dict,
+                                 y_true,
+                                 y_pred,
+                                 mask,
+                                 weights=None,
+                                 return_stateful_result=True):
+    """Calls metric functions for a single output.
+
+    Arguments:
+      metrics_dict: A dict with metric names as keys and metric fns as values.
+      y_true: Target output.
+      y_pred: Predicted output.
+      mask: Computed mask value for the current output.
+      weights: Weights to be applied on the current output.
+      return_stateful_result: Boolean, indicates whether the stateful
+        (aggregated)/stateless metric result should be returned.
+
+    Returns:
+      A list of metric result tensors.
+    """
+    metric_results = []
+    for metric_name, (metric_fn, stateful_fn) in metrics_dict.items():
+      with K.name_scope(metric_name):
+
+        def _call_stateful_fn(fn):
+          return training_utils.call_metric_function(
+              fn, y_true, y_pred, weights=weights, mask=mask)
+
+        def _call_stateless_fn(fn):
+          weighted_metric_fn = training_utils.weighted_masked_objective(fn)
+          return weighted_metric_fn(y_true, y_pred, weights=weights, mask=mask)
 
-    if self.run_eagerly:
-      return training_eager.fit_loop(
-          self,
-          inputs=x,
-          targets=y,
-          sample_weights=sample_weights,
-          class_weight=class_weight,
-          batch_size=batch_size,
-          epochs=epochs,
-          verbose=verbose,
-          callbacks=callbacks,
-          val_inputs=val_x,
-          val_targets=val_y,
-          val_sample_weights=val_sample_weights,
-          shuffle=shuffle,
-          initial_epoch=initial_epoch,
-          steps_per_epoch=steps_per_epoch,
-          validation_steps=validation_steps)
-    elif distributed_training_utils.is_tpu_strategy(
-        self._distribution_strategy):
-      return training_distributed.experimental_fit_loop(
-          self,
-          x,
-          epochs=epochs,
-          verbose=verbose,
-          callbacks=callbacks,
-          val_iterator=val_x,
-          initial_epoch=initial_epoch,
-          steps_per_epoch=steps_per_epoch,
-          validation_steps=validation_steps)
-    elif (isinstance(x, iterator_ops.EagerIterator) and
-          not self._distribution_strategy):
-      return training_generator.fit_generator(
-          self,
-          x,
-          steps_per_epoch=steps_per_epoch,
-          epochs=epochs,
-          verbose=verbose,
-          callbacks=callbacks,
-          validation_data=validation_data,
-          validation_steps=validation_steps,
-          workers=0,
-          initial_epoch=initial_epoch)
-    else:
-      return training_arrays.fit_loop(
-          self,
-          x,
-          y,
-          sample_weights=sample_weights,
-          batch_size=batch_size,
-          epochs=epochs,
-          verbose=verbose,
-          callbacks=callbacks,
-          val_inputs=val_x,
-          val_targets=val_y,
-          val_sample_weights=val_sample_weights,
-          shuffle=shuffle,
-          initial_epoch=initial_epoch,
-          steps_per_epoch=steps_per_epoch,
-          validation_steps=validation_steps)
+        def _track_metric_tensors(name, stateless_result, stateful_result):
+          self._compile_metrics_tensors[name] = stateless_result
+          self._compile_stateful_metrics_tensors[name] = stateful_result
 
-  def evaluate(self,
-               x=None,
-               y=None,
-               batch_size=None,
-               verbose=1,
-               sample_weight=None,
-               steps=None,
-               max_queue_size=10,
-               workers=1,
-               use_multiprocessing=False):
-    """Returns the loss value & metrics values for the model in test mode.
+        if isinstance(metric_fn, metrics_module.Metric):
+          # If the given metric fn is stateful, call the fn and return result.
+          metric_result = _call_stateful_fn(metric_fn)
+          metric_results.append(metric_result)
+          if not self.run_eagerly:
+            _track_metric_tensors(metric_name, metric_result, metric_result)
+        elif self.run_eagerly:
+          # In eager mode, if the given metric fn is not stateful, we invoke the
+          # given fn or its stateful version based on the given flag.
+          if return_stateful_result:
+            metric_result = _call_stateful_fn(stateful_fn)
+          else:
+            metric_result = _call_stateless_fn(metric_fn)
+          metric_results.append(metric_result)
+        else:
+          # In graph mode, we build the sub-graph for both the stateful and the
+          # stateless fns.
+          stateful_metric_result = _call_stateful_fn(stateful_fn)
+          metric_result = _call_stateless_fn(metric_fn)
+          _track_metric_tensors(metric_name, metric_result,
+                                stateful_metric_result)
 
-    Computation is done in batches.
+    return metric_results
+
+  def _handle_metrics(self,
+                      outputs,
+                      skip_target_indices=None,
+                      targets=None,
+                      sample_weights=None,
+                      masks=None,
+                      return_stateful_result=True):
+    """Handles calling metric functions.
 
     Arguments:
-        x: Input data. It could be:
-          - A Numpy array (or array-like), or a list of arrays
-            (in case the model has multiple inputs).
-          - A TensorFlow tensor, or a list of tensors
-            (in case the model has multiple inputs).
-          - A dict mapping input names to the corresponding array/tensors,
-            if the model has named inputs.
-          - A `tf.data` dataset or a dataset iterator.
-          - A generator or `keras.utils.Sequence` instance.
-        y: Target data. Like the input data `x`,
-          it could be either Numpy array(s) or TensorFlow tensor(s).
-          It should be consistent with `x` (you cannot have Numpy inputs and
-          tensor targets, or inversely).
-          If `x` is a dataset, dataset iterator, generator or
-          `keras.utils.Sequence` instance, `y` should not be specified (since
-          targets will be obtained from the iterator/dataset).
-        batch_size: Integer or `None`.
-            Number of samples per gradient update.
-            If unspecified, `batch_size` will default to 32.
-            Do not specify the `batch_size` is your data is in the
-            form of symbolic tensors, dataset, dataset iterators,
-            generators, or `keras.utils.Sequence` instances (since they generate
-            batches).
-        verbose: 0 or 1. Verbosity mode.
-            0 = silent, 1 = progress bar.
-        sample_weight: Optional Numpy array of weights for
-            the test samples, used for weighting the loss function.
-            You can either pass a flat (1D)
-            Numpy array with the same length as the input samples
-            (1:1 mapping between weights and samples),
-            or in the case of temporal data,
-            you can pass a 2D array with shape
-            `(samples, sequence_length)`,
-            to apply a different weight to every timestep of every sample.
-            In this case you should make sure to specify
-            `sample_weight_mode="temporal"` in `compile()`. This argument is not
-            supported when `x` is a dataset or a dataset iterator, instead pass
-            sample weights as the third element of `x`.
-        steps: Integer or `None`.
-            Total number of steps (batches of samples)
-            before declaring the evaluation round finished.
-            Ignored with the default value of `None`.
-        max_queue_size: Integer. Used for generator or `keras.utils.Sequence`
-            input only. Maximum size for the generator queue.
-            If unspecified, `max_queue_size` will default to 10.
-        workers: Integer. Used for generator or `keras.utils.Sequence` input
-            only. Maximum number of processes to spin up when using
-            process-based threading. If unspecified, `workers` will default
-            to 1. If 0, will execute the generator on the main thread.
-        use_multiprocessing: Boolean. Used for generator or
-            `keras.utils.Sequence` input only. If `True`, use process-based
-            threading. If unspecified, `use_multiprocessing` will default to
-            `False`. Note that because this implementation relies on
-            multiprocessing, you should not pass non-picklable arguments to
-            the generator as they can't be passed easily to children processes.
+      outputs: List of outputs (predictions).
+      skip_target_indices: Optional. List of target ids to skip.
+      targets: List of targets.
+      sample_weights: Optional list of sample weight arrays.
+      masks: List of computed output mask values.
+      return_stateful_result: Boolean, indicates whether the stateful
+        (aggregated)/stateless metric result should be returned.
 
     Returns:
-        Scalar test loss (if the model has a single output and no metrics)
-        or list of scalars (if the model has multiple outputs
-        and/or metrics). The attribute `model.metrics_names` will give you
-        the display labels for the scalar outputs.
+      A list of metric result tensors.
+    """
+    skip_target_indices = skip_target_indices or []
+    metric_results = []
+    with K.name_scope('metrics'):
+      # Invoke all metrics added using `compile`.
+      for i in range(len(outputs)):
+        if i in skip_target_indices:
+          continue
+        output = outputs[i] if outputs else None
+        target = targets[i] if targets else None
+        output_mask = masks[i] if masks else None
+        metric_results.extend(
+            self._handle_per_output_metrics(
+                self._per_output_metrics[i],
+                target,
+                output,
+                output_mask,
+                return_stateful_result=return_stateful_result))
+        metric_results.extend(
+            self._handle_per_output_metrics(
+                self._per_output_weighted_metrics[i],
+                target,
+                output,
+                output_mask,
+                weights=sample_weights[i],
+                return_stateful_result=return_stateful_result))
 
-    Raises:
-        ValueError: in case of invalid arguments.
+    # Add metric results from the `add_metric` metrics in eager mode.
+    if context.executing_eagerly():
+      for m in self.metrics:
+        if m not in self._compile_stateful_metric_functions:
+          metric_results.append(m.result())
+    return metric_results
+
+  def _check_trainable_weights_consistency(self):
+    """Check trainable weights count consistency.
+
+    This will raise a warning if `trainable_weights` and
+    `_collected_trainable_weights` are inconsistent (i.e. have different
+    number of parameters).
+    Inconsistency will typically arise when one modifies `model.trainable`
+    without calling `model.compile` again.
     """
-    if data_utils.is_generator_or_sequence(x):
-      training_utils.check_generator_arguments(y, sample_weight)
-      return self.evaluate_generator(
-          x,
-          steps=steps,
-          verbose=verbose,
-          max_queue_size=max_queue_size,
-          workers=workers,
-          use_multiprocessing=use_multiprocessing)
-    # Validate and standardize user data.
-    if self._distribution_strategy:
-      distributed_training_utils.validate_inputs(
-          x, y, self._distribution_strategy)
-      first_x_value = nest.flatten(x)[0]
-      if isinstance(first_x_value, np.ndarray):
-        steps, batch_size = distributed_training_utils.get_input_params(
-            self._distribution_strategy, first_x_value, steps, batch_size)
+    if not hasattr(self, '_collected_trainable_weights'):
+      return
 
-    # Backwards compatibility.
-    if batch_size is None and steps is None:
-      batch_size = 32
+    if len(self.trainable_weights) != len(self._collected_trainable_weights):
+      logging.log_first_n(
+          logging.WARN, 'Discrepancy between trainable weights and collected'
+          ' trainable weights, did you set `model.trainable`'
+          ' without calling `model.compile` after ?', 1)
+
+  def _make_train_function_helper(self, fn_name, outputs, metric_updates=None):
+    if not hasattr(self, fn_name):
+      raise RuntimeError('You must compile your model before using it.')
+    self._check_trainable_weights_consistency()
+    if getattr(self, fn_name) is None:
+      inputs = (self._feed_inputs +
+                self._feed_targets +
+                self._feed_sample_weights)
+      if not isinstance(K.symbolic_learning_phase(), int):
+        inputs += [K.symbolic_learning_phase()]
+
+      with K.get_graph().as_default():
+        with K.name_scope('training'):
+          with K.name_scope(self.optimizer.__class__.__name__):
+            # Training updates
+            updates = self.optimizer.get_updates(
+                params=self._collected_trainable_weights, loss=self.total_loss)
+      # Unconditional updates
+      updates += self.get_updates_for(None)
+      # Conditional updates relevant to this model
+      updates += self.get_updates_for(self.inputs)
+      # Add stateful metrics updates.
+      if metric_updates is not None:
+        updates += metric_updates
 
-    x, y, sample_weights = self._standardize_user_data(
-        x,
-        y,
-        sample_weight=sample_weight,
-        batch_size=batch_size,
-        check_steps=True,
-        steps_name='steps',
-        steps=steps)
+      with K.name_scope('training'):
+        # Gets loss and metrics. Updates weights at each call.
+        fn = K.function(
+            inputs,
+            outputs,
+            updates=updates,
+            name='train_function',
+            **self._function_kwargs)
+        setattr(self, fn_name, fn)
 
-    if self.run_eagerly:
-      return training_eager.test_loop(
-          self,
-          inputs=x,
-          targets=y,
-          sample_weights=sample_weights,
-          batch_size=batch_size,
-          verbose=verbose,
-          steps=steps)
-    elif distributed_training_utils.is_tpu_strategy(
-        self._distribution_strategy):
-      return training_distributed.experimental_test_loop(
-          self, iterator=x, verbose=verbose, steps=steps)
-    elif isinstance(x, iterator_ops.EagerIterator):
-      return training_generator.evaluate_generator(
-          self,
-          x,
-          steps=steps,
-          verbose=verbose,
-          workers=0)
-    else:
-      return training_arrays.test_loop(
-          self,
-          inputs=x,
-          targets=y,
-          sample_weights=sample_weights,
-          batch_size=batch_size,
-          verbose=verbose,
-          steps=steps)
+  def _make_train_function(self):
+    metrics_tensors = [
+        self._all_metrics_tensors[m] for m in self.metrics_names[1:]
+    ]
+    self._make_train_function_helper('train_function',
+                                     [self.total_loss] + metrics_tensors)
 
-  def predict(self,
-              x,
-              batch_size=None,
-              verbose=0,
-              steps=None,
-              max_queue_size=10,
-              workers=1,
-              use_multiprocessing=False):
-    """Generates output predictions for the input samples.
+  def _make_fit_function(self):
+    metrics_tensors = [
+        self._all_stateful_metrics_tensors[m] for m in self.metrics_names[1:]
+    ]
+    self._make_train_function_helper(
+        '_fit_function', [self.total_loss] + metrics_tensors)
 
-    Computation is done in batches.
+  def _make_test_function_helper(self, fn_name, outputs, metric_updates=None):
+    if not hasattr(self, fn_name):
+      raise RuntimeError('You must compile your model before using it.')
+    if getattr(self, fn_name) is None:
+      inputs = (self._feed_inputs +
+                self._feed_targets +
+                self._feed_sample_weights)
 
-    Arguments:
-         x: Input samples. It could be:
-          - A Numpy array (or array-like), or a list of arrays
-            (in case the model has multiple inputs).
-          - A TensorFlow tensor, or a list of tensors
-            (in case the model has multiple inputs).
-          - A `tf.data` dataset or a dataset iterator.
-          - A generator or `keras.utils.Sequence` instance.
-        batch_size: Integer or `None`.
-            Number of samples per gradient update.
-            If unspecified, `batch_size` will default to 32.
-            Do not specify the `batch_size` is your data is in the
-            form of symbolic tensors, dataset, dataset iterators,
-            generators, or `keras.utils.Sequence` instances (since they generate
-            batches).
-        verbose: Verbosity mode, 0 or 1.
-        steps: Total number of steps (batches of samples)
-            before declaring the prediction round finished.
-            Ignored with the default value of `None`.
-        max_queue_size: Integer. Used for generator or `keras.utils.Sequence`
-            input only. Maximum size for the generator queue.
-            If unspecified, `max_queue_size` will default to 10.
-        workers: Integer. Used for generator or `keras.utils.Sequence` input
-            only. Maximum number of processes to spin up when using
-            process-based threading. If unspecified, `workers` will default
-            to 1. If 0, will execute the generator on the main thread.
-        use_multiprocessing: Boolean. Used for generator or
-            `keras.utils.Sequence` input only. If `True`, use process-based
-            threading. If unspecified, `use_multiprocessing` will default to
-            `False`. Note that because this implementation relies on
-            multiprocessing, you should not pass non-picklable arguments to
-            the generator as they can't be passed easily to children processes.
+      with K.name_scope('evaluation'):
+        updates = self.state_updates
+        # Add stateful metrics updates.
+        if metric_updates is not None:
+          updates += metric_updates
+        # Return loss and metrics, no gradient updates.
+        # Does update the network states.
+        fn = K.function(
+            inputs,
+            outputs,
+            updates=updates,
+            name='test_function',
+            **self._function_kwargs)
+        setattr(self, fn_name, fn)
 
+  def _make_test_function(self):
+    metrics_tensors = [
+        self._all_metrics_tensors[m] for m in self.metrics_names[1:]
+    ]
+    self._make_test_function_helper('test_function',
+                                    [self.total_loss] + metrics_tensors)
 
-    Returns:
-        Numpy array(s) of predictions.
+  def _make_eval_function(self):
+    metrics_tensors = [
+        self._all_stateful_metrics_tensors[m] for m in self.metrics_names[1:]
+    ]
+    self._make_test_function_helper(
+        '_eval_function', [self.total_loss] + metrics_tensors)
 
-    Raises:
-        ValueError: In case of mismatch between the provided
-            input data and the model's expectations,
-            or in case a stateful model receives a number of samples
-            that is not a multiple of the batch size.
-    """
-    if data_utils.is_generator_or_sequence(x):
-      return self.predict_generator(
-          x,
-          steps=steps,
-          verbose=verbose,
-          max_queue_size=max_queue_size,
-          workers=workers,
-          use_multiprocessing=use_multiprocessing)
-    if self._distribution_strategy:
-      distributed_training_utils.validate_inputs(
-          x, None, self._distribution_strategy)
-      first_x_value = nest.flatten(x)[0]
-      if isinstance(first_x_value, np.ndarray):
-        steps, batch_size = distributed_training_utils.get_input_params(
-            self._distribution_strategy, first_x_value, steps, batch_size)
+  def _make_predict_function(self):
+    if not hasattr(self, 'predict_function'):
+      self.predict_function = None
+    if self.predict_function is None:
+      inputs = self._feed_inputs
+      # Gets network outputs. Does not update weights.
+      # Does update the network states.
+      kwargs = getattr(self, '_function_kwargs', {})
+      with K.name_scope('predict'):
+        self.predict_function = K.function(
+            inputs,
+            self.outputs,
+            updates=self.state_updates,
+            name='predict_function',
+            **kwargs)
 
-    # Backwards compatibility.
-    if batch_size is None and steps is None:
-      batch_size = 32
+  def _make_execution_function(self, mode):
+    if mode == 'train':
+      self._make_fit_function()
+      return self._fit_function
+    if mode == 'test':
+      self._make_eval_function()
+      return self._eval_function
+    if mode == 'predict':
+      self._make_predict_function()
+      return self.predict_function
 
-    # Validate and standardize user data.
-    if self._distribution_strategy:
-      x, _, _ = self._standardize_user_data(
-          x, check_steps=True, steps_name='steps', steps=steps,
-          batch_size=batch_size)
-    else:
-      # TODO(anjalisridhar): We don't pass batch_size here for some reason. This
-      # means we need to special case distribution strategy which needs the
-      # batch size.
-      x, _, _ = self._standardize_user_data(
-          x, check_steps=True, steps_name='steps', steps=steps)
+  def _get_iterator_get_next_tensors(self, iterator):
+    get_next_op = self._iterator_get_next.get(iterator, None)
+    if get_next_op is None:
+      get_next_op = iterator.get_next()
+      self._iterator_get_next[iterator] = get_next_op
+    return get_next_op
 
-    if self.run_eagerly:
-      return training_eager.predict_loop(
-          self, x, batch_size=batch_size, verbose=verbose, steps=steps)
-    elif distributed_training_utils.is_tpu_strategy(
-        self._distribution_strategy):
-      return training_distributed.experimental_predict_loop(
-          self, x, verbose=verbose, steps=steps)
-    elif isinstance(x, iterator_ops.EagerIterator):
-      return training_generator.predict_generator(
-          self,
-          x,
-          steps=steps,
-          verbose=verbose,
-          workers=0)
-    else:
-      return training_arrays.predict_loop(
-          self, x, batch_size=batch_size, verbose=verbose, steps=steps)
+  def _distribution_standardize_user_data(self,
+                                          x,
+                                          y=None,
+                                          sample_weight=None,
+                                          class_weight=None,
+                                          batch_size=None,
+                                          check_steps=False,
+                                          steps_name='steps',
+                                          steps=None,
+                                          validation_split=0,
+                                          shuffle=False):
+    """Runs validation checks on input and target data passed by the user.
 
-  def train_on_batch(self, x, y=None, sample_weight=None, class_weight=None):
-    """Runs a single gradient update on a single batch of data.
+    This is called when using DistributionStrategy to train, evaluate or serve
+    the model.
 
-    Arguments:
-        x: Input data. It could be:
-          - A Numpy array (or array-like), or a list of arrays
-              (in case the model has multiple inputs).
-          - A TensorFlow tensor, or a list of tensors
-              (in case the model has multiple inputs).
-          - A dict mapping input names to the corresponding array/tensors,
-              if the model has named inputs.
-          - A `tf.data` dataset or a dataset iterator.
-        y: Target data. Like the input data `x`, it could be either Numpy
-          array(s) or TensorFlow tensor(s). It should be consistent with `x`
-          (you cannot have Numpy inputs and tensor targets, or inversely). If
-          `x` is a dataset or a dataset iterator, `y` should not be specified
-          (since targets will be obtained from the iterator).
-        sample_weight: Optional array of the same length as x, containing
-          weights to apply to the model's loss for each sample. In the case of
-          temporal data, you can pass a 2D array with shape (samples,
-          sequence_length), to apply a different weight to every timestep of
-          every sample. In this case you should make sure to specify
-          sample_weight_mode="temporal" in compile(). This argument is not
-          supported when `x` is a dataset or a dataset iterator.
-        class_weight: Optional dictionary mapping class indices (integers) to a
-          weight (float) to apply to the model's loss for the samples from this
-          class during training. This can be useful to tell the model to "pay
-          more attention" to samples from an under-represented class.
+    Args:
+      x: Input data. A numpy array or `tf.data` dataset.
+      y: Target data. A numpy array or None if x is a `tf.data` dataset.
+      sample_weight: An optional sample-weight array passed by the user to
+        weight the importance of each sample in `x`.
+      class_weight: An optional class-weight array by the user to
+        weight the importance of samples in `x` based on the class they belong
+        to, as conveyed by `y`.
+      batch_size: Integer batch size. If provided, it is used to run additional
+        validation checks on stateful models.
+      check_steps: boolean, True if we want to check for validity of `steps` and
+        False, otherwise.
+      steps_name: The public API's parameter name for `steps`.
+      steps: Integer or `None`. Total number of steps (batches of samples) to
+        execute.
+      validation_split: Float between 0 and 1.
+        Fraction of the training data to be used as validation data.
+      shuffle: Boolean whether to shuffle the training data before each epoch.
 
     Returns:
-        Scalar training loss
-        (if the model has a single output and no metrics)
-        or list of scalars (if the model has multiple outputs
-        and/or metrics). The attribute `model.metrics_names` will give you
-        the display labels for the scalar outputs.
+      Iterator for reading the dataset `x`.
 
     Raises:
-      ValueError: In case of invalid user-provided arguments.
+      ValueError: In case of invalid user-provided data.
+      RuntimeError: If the model was never compiled.
     """
-    if self._distribution_strategy:
-      raise NotImplementedError('`train_on_batch` is not supported for models '
-                                'compiled with DistributionStrategy.')
-    # Validate and standardize user data.
-    x, y, sample_weights = self._standardize_user_data(
-        x, y, sample_weight=sample_weight, class_weight=class_weight)
+    if class_weight:
+      raise NotImplementedError('`class_weight` is currently not supported '
+                                'when using DistributionStrategy.')
 
-    if self.run_eagerly:
-      outputs = training_eager.train_on_batch(
-          self, x, y, sample_weights=sample_weights)
-    else:
-      if not isinstance(K.symbolic_learning_phase(), int):
-        ins = x + y + sample_weights + [True]
-      else:
-        ins = x + y + sample_weights
+    if (sample_weight is not None and sample_weight.all() and
+        distributed_training_utils.is_tpu_strategy(
+            self._distribution_strategy)):
+      raise NotImplementedError('`sample_weight` is currently not supported '
+                                'when using TPUStrategy.')
 
-      self._make_train_function()
-      outputs = self.train_function(ins)  # pylint: disable=not-callable
+    # Validates `steps` argument right at the beginning since we use it to
+    # construct the dataset object.
+    # TODO(anjalisridhar): Remove this check once we refactor the
+    # _standardize_user_data code path. This check is already present elsewhere
+    # in the codebase.
+    if check_steps and isinstance(x, dataset_ops.DatasetV2) and steps is None:
+      raise ValueError('When using Datasets as input, '
+                       'you should specify the `{steps_name}` argument.'
+                       .format(steps_name=steps_name))
 
-    if len(outputs) == 1:
-      return outputs[0]
-    return outputs
+    first_x_value = nest.flatten(x)[0]
+    if isinstance(first_x_value, np.ndarray):
+      # We need to use the drop_remainder argument to allow for a static
+      # input shape which is required for TPUs.
+      drop_remainder = self._distribution_strategy.require_static_shapes
+      if y is not None:
+        var_x = distributed_training_utils.get_var_for_numpy(
+            self._distribution_strategy, x)
+        var_y = distributed_training_utils.get_var_for_numpy(
+            self._distribution_strategy, y)
+        if sample_weight is not None:
+          var_sample_weights = distributed_training_utils.get_var_for_numpy(
+              self._distribution_strategy, sample_weight)
 
-  def test_on_batch(self, x, y=None, sample_weight=None):
-    """Test the model on a single batch of samples.
+          x = dataset_ops.Dataset.from_tensor_slices((var_x, var_y,
+                                                      var_sample_weights))
+        else:
+          x = dataset_ops.Dataset.from_tensor_slices((var_x, var_y))
 
-    Arguments:
-        x: Input data. It could be:
-          - A Numpy array (or array-like), or a list of arrays
-            (in case the model has multiple inputs).
-          - A TensorFlow tensor, or a list of tensors
-            (in case the model has multiple inputs).
-          - A dict mapping input names to the corresponding array/tensors,
-            if the model has named inputs.
-          - A `tf.data` dataset or a dataset iterator.
-        y: Target data. Like the input data `x`,
-          it could be either Numpy array(s) or TensorFlow tensor(s).
-          It should be consistent with `x` (you cannot have Numpy inputs and
-          tensor targets, or inversely). If `x` is a dataset or a
-          dataset iterator, `y` should not be specified
-          (since targets will be obtained from the iterator).
-        sample_weight: Optional array of the same length as x, containing
-            weights to apply to the model's loss for each sample.
-            In the case of temporal data, you can pass a 2D array
-            with shape (samples, sequence_length),
-            to apply a different weight to every timestep of every sample.
-            In this case you should make sure to specify
-            sample_weight_mode="temporal" in compile(). This argument is not
-            supported when `x` is a dataset or a dataset iterator.
+        x = dataset_ops.Dataset.from_tensor_slices((var_x, var_y))
+        if shuffle:
+          # 1024 is a good buffer size since it is much larger than the average
+          # batch size provided by the user and provides sufficient randomness.
+          # One thing to keep in mind is the memory usage based on the size of
+          # each sample.
+          x = x.shuffle(1024)
+        x = x.repeat()
+        x = x.batch(batch_size, drop_remainder=drop_remainder)
+        y = None
+        sample_weight = None
+      else:
+        # This case is for the predict call where the dataset only contains
+        # inputs and no targets, i.e. it does not return a tuple
+        var_x = distributed_training_utils.get_var_for_numpy(
+            self._distribution_strategy, x)
+        x = dataset_ops.Dataset.from_tensor_slices(var_x)
+        x = x.batch(batch_size, drop_remainder=drop_remainder)
 
-    Returns:
-        Scalar test loss (if the model has a single output and no metrics)
-        or list of scalars (if the model has multiple outputs
-        and/or metrics). The attribute `model.metrics_names` will give you
-        the display labels for the scalar outputs.
+    assert isinstance(x, dataset_ops.DatasetV2)
 
-    Raises:
-        ValueError: In case of invalid user-provided arguments.
-    """
-    if self._distribution_strategy:
-      raise NotImplementedError('`test_on_batch` is not supported for models '
-                                'compiled with DistributionStrategy.')
-    # Validate and standardize user data.
-    x, y, sample_weights = self._standardize_user_data(
-        x, y, sample_weight=sample_weight)
+    with self._distribution_strategy.scope():
+      iterator = self._distribution_strategy.make_dataset_iterator(x)
+      init_op = iterator.initialize()
+      if not context.executing_eagerly():
+        K.get_session().run(init_op)
 
-    if self.run_eagerly:
-      outputs = training_eager.test_on_batch(
-          self, x, y, sample_weights=sample_weights)
-    else:
-      inputs = x + y + sample_weights
-      self._make_test_function()
-      outputs = self.test_function(inputs)  # pylint: disable=not-callable
+    training_utils.validate_iterator_input(x, y, sample_weight,
+                                           validation_split)
+    return iterator
 
-    if len(outputs) == 1:
-      return outputs[0]
-    return outputs
+  def _standardize_user_data(self,
+                             x,
+                             y=None,
+                             sample_weight=None,
+                             class_weight=None,
+                             batch_size=None,
+                             check_steps=False,
+                             steps_name='steps',
+                             steps=None,
+                             validation_split=0,
+                             shuffle=False):
+    """Runs validation checks on input and target data passed by the user.
 
-  def predict_on_batch(self, x):
-    """Returns predictions for a single batch of samples.
+    Also standardizes the data to lists of arrays, in order.
 
-    Arguments:
-        x: Input data. It could be:
-          - A Numpy array (or array-like), or a list of arrays
-            (in case the model has multiple inputs).
-          - A TensorFlow tensor, or a list of tensors
-            (in case the model has multiple inputs).
-          - A `tf.data` dataset or a dataset iterator.
+    Also builds and compiles the model on the fly if it is a subclassed model
+    that has never been called before (and thus has no inputs/outputs).
+
+    This is a purely internal method, subject to refactoring at any time.
+
+    Args:
+      x: Input data. It could be:
+        - A Numpy array (or array-like), or a list of arrays
+          (in case the model has multiple inputs).
+        - A TensorFlow tensor, or a list of tensors
+          (in case the model has multiple inputs).
+        - A dict mapping input names to the corresponding array/tensors,
+          if the model has named inputs.
+        - A `tf.data` dataset or a dataset iterator.
+      y: Target data. Like the input data `x`,
+        it could be either Numpy array(s) or TensorFlow tensor(s).
+        It should be consistent with `x` (you cannot have Numpy inputs and
+        tensor targets, or inversely). If `x` is a dataset or a
+        dataset iterator, `y` should not be specified
+        (since targets will be obtained from the iterator).
+      sample_weight: An optional sample-weight array passed by the user to
+        weight the importance of each sample in `x`.
+      class_weight: An optional class-weight array by the user to
+        weight the importance of samples in `x` based on the class they belong
+        to, as conveyed by `y`.
+      batch_size: Integer batch size. If provided, it is used to run additional
+        validation checks on stateful models.
+      check_steps: boolean, True if we want to check for validity of `steps` and
+        False, otherwise. For example, when we are standardizing one batch of
+        data for train_on_batch/predict_on_batch/test_on_batch APIs, `steps`
+        value is not required and we should not check for its validity in these
+        cases.
+      steps_name: The public API's parameter name for `steps`.
+      steps: Integer or `None`. Total number of steps (batches of samples) to
+        execute.
+      validation_split: Float between 0 and 1.
+        Fraction of the training data to be used as validation data.
+      shuffle: Boolean whether to shuffle the training data before each epoch.
 
     Returns:
-        Numpy array(s) of predictions.
+      A tuple of 3: inputs (arrays or dicts, depending on whether `x` was a dict
+      or not), target arrays, sample-weight arrays.
+      If the model's input and targets are symbolic, these lists are empty
+      (since the model takes no user-provided data, instead the data comes
+      from the symbolic inputs/targets).
 
     Raises:
-        ValueError: In case of mismatch between given number of inputs and
-          expectations of the model.
+      ValueError: In case of invalid user-provided data.
+      RuntimeError: If the model was never compiled.
     """
     if self._distribution_strategy:
-      raise NotImplementedError('`predict_on_batch` is not supported for '
-                                'models compiled with DistributionStrategy.')
-    # Validate and standardize user data.
-    inputs, _, _ = self._standardize_user_data(x)
-    if self.run_eagerly:
-      if (isinstance(inputs, iterator_ops.EagerIterator) or
-          (isinstance(inputs, dataset_ops.DatasetV2))):
-        inputs = training_utils.cast_if_floating_dtype(inputs)
-      elif isinstance(inputs, collections.Sequence):
-        inputs = [
-            ops.convert_to_tensor(val, dtype=K.floatx()) for val in inputs]
-      return self(inputs)  # pylint: disable=not-callable
+      iterator = self._distribution_standardize_user_data(
+          x,
+          y,
+          sample_weight=sample_weight,
+          class_weight=class_weight,
+          batch_size=batch_size,
+          check_steps=check_steps,
+          steps_name=steps_name,
+          steps=steps,
+          validation_split=validation_split,
+          shuffle=shuffle)
+      return iterator, None, None
+
+    if isinstance(x, dataset_ops.DatasetV2):
+      if context.executing_eagerly():
+        x = iter(x)
+      else:
+        if x in self._dataset_iterator_cache:
+          x = self._dataset_iterator_cache[x]
+        else:
+          iterator = dataset_ops.make_initializable_iterator(x)
+          self._dataset_iterator_cache[x] = iterator
+          x = iterator
+        K.get_session().run(x.initializer)
+
+    # Validates `steps` argument based on x's type.
+    if check_steps:
+      training_utils.check_steps_argument(x, steps, steps_name)
 
-    self._make_predict_function()
-    outputs = self.predict_function(inputs)
+    is_x_eager_iterator = isinstance(x, iterator_ops.EagerIterator)
+    is_x_iterator = isinstance(x, iterator_ops.Iterator)
 
-    if len(outputs) == 1:
-      return outputs[0]
-    return outputs
+    # Validate user inputs when data is given as a dataset or dataset iterator.
+    if is_x_iterator or is_x_eager_iterator:
+      training_utils.validate_iterator_input(x, y, sample_weight,
+                                             validation_split)
 
-  def fit_generator(self,
-                    generator,
-                    steps_per_epoch=None,
-                    epochs=1,
-                    verbose=1,
-                    callbacks=None,
-                    validation_data=None,
-                    validation_steps=None,
-                    class_weight=None,
-                    max_queue_size=10,
-                    workers=1,
-                    use_multiprocessing=False,
-                    shuffle=True,
-                    initial_epoch=0):
-    """Fits the model on data yielded batch-by-batch by a Python generator.
+    # For eager iterators, when we have to process multiple batches of samples,
+    # we will standardize the data when we actually loop over iterator and get
+    # the batches. For now, we just return the iterator as is.
+    if is_x_eager_iterator:
+      return x, y, sample_weight
 
-    The generator is run in parallel to the model, for efficiency.
-    For instance, this allows you to do real-time data augmentation
-    on images on CPU in parallel to training your model on GPU.
+    # If input data is a dataset iterator in graph mode or if it is an eager
+    # iterator and only one batch of samples is required, we fetch the data
+    # tensors from the iterator and then standardize them.
+    if is_x_iterator:
+      try:
+        next_element = self._get_iterator_get_next_tensors(x)
+      except errors.OutOfRangeError:
+        raise RuntimeError('Your dataset iterator ran out of data; '
+                           'Make sure that your dataset can generate '
+                           'required number of samples.')
 
-    The use of `keras.utils.Sequence` guarantees the ordering
-    and guarantees the single use of every input per epoch when
-    using `use_multiprocessing=True`.
+      if isinstance(next_element, (list, tuple)):
+        if len(next_element) not in [2, 3]:
+          raise ValueError(
+              'Please provide model inputs as a list or tuple of 2  or 3'
+              'elements: (input, target) or (input, target, sample_weights)'
+              'Received %s' % next_element)
+        if len(next_element) == 2:
+          x, y = next_element
+        else:
+          x, y, sample_weight = next_element
+      else:
+        x = next_element
 
-    Arguments:
-        generator: A generator or an instance of `Sequence`
-          (`keras.utils.Sequence`)
-            object in order to avoid duplicate data
-            when using multiprocessing.
-            The output of the generator must be either
-            - a tuple `(inputs, targets)`
-            - a tuple `(inputs, targets, sample_weights)`.
-            This tuple (a single output of the generator) makes a single batch.
-            Therefore, all arrays in this tuple must have the same length (equal
-            to the size of this batch). Different batches may have different
-              sizes.
-            For example, the last batch of the epoch is commonly smaller than
-              the
-            others, if the size of the dataset is not divisible by the batch
-              size.
-            The generator is expected to loop over its data
-            indefinitely. An epoch finishes when `steps_per_epoch`
-            batches have been seen by the model.
-        steps_per_epoch: Total number of steps (batches of samples)
-            to yield from `generator` before declaring one epoch
-            finished and starting the next epoch. It should typically
-            be equal to the number of samples of your dataset
-            divided by the batch size.
-            Optional for `Sequence`: if unspecified, will use
-            the `len(generator)` as a number of steps.
-        epochs: Integer, total number of iterations on the data.
-        verbose: Verbosity mode, 0, 1, or 2.
-        callbacks: List of callbacks to be called during training.
-        validation_data: This can be either
-            - a generator for the validation data
-            - a tuple (inputs, targets)
-            - a tuple (inputs, targets, sample_weights).
-        validation_steps: Only relevant if `validation_data`
-            is a generator. Total number of steps (batches of samples)
-            to yield from `generator` before stopping.
-            Optional for `Sequence`: if unspecified, will use
-            the `len(validation_data)` as a number of steps.
-        class_weight: Dictionary mapping class indices to a weight
-            for the class.
-        max_queue_size: Integer. Maximum size for the generator queue.
-            If unspecified, `max_queue_size` will default to 10.
-        workers: Integer. Maximum number of processes to spin up
-            when using process-based threading.
-            If unspecified, `workers` will default to 1. If 0, will
-            execute the generator on the main thread.
-        use_multiprocessing: Boolean.
-            If `True`, use process-based threading.
-            If unspecified, `use_multiprocessing` will default to `False`.
-            Note that because this implementation relies on multiprocessing,
-            you should not pass non-picklable arguments to the generator
-            as they can't be passed easily to children processes.
-        shuffle: Boolean. Whether to shuffle the order of the batches at
-            the beginning of each epoch. Only used with instances
-            of `Sequence` (`keras.utils.Sequence`).
-            Has no effect when `steps_per_epoch` is not `None`.
-        initial_epoch: Epoch at which to start training
-            (useful for resuming a previous training run)
+    if sample_weight is not None and class_weight is not None:
+      logging.warning(
+          'Received both a `sample_weight` and `class_weight` argument. '
+          'The `class_weight` argument will be ignored.')
+    # First, we build/compile the model on the fly if necessary.
+    all_inputs = []
+    is_build_called = False
+    is_compile_called = False
+    # Whether this is a subclassed model that expects dictionary inputs
+    # rather than list inputs (e.g. FeatureColumn-based models).
+    dict_inputs = False
+    if not self.inputs:
+      # We need to use `x` to set the model inputs.
+      # We type-check that `x` and `y` are either single arrays
+      # or lists of arrays.
+      if isinstance(x, (list, tuple)):
+        if not all(isinstance(v, np.ndarray) or
+                   tensor_util.is_tensor(v) for v in x):
+          raise ValueError('Please provide as model inputs either a single '
+                           'array or a list of arrays. You passed: x=' + str(x))
+        all_inputs += list(x)
+      elif isinstance(x, dict):
+        dict_inputs = True
+        keys = sorted(x.keys())
+        all_inputs = [x[k] for k in keys]
+      else:
+        if not isinstance(x, np.ndarray) and not tensor_util.is_tensor(x):
+          raise ValueError('Please provide as model inputs either a single '
+                           'array or a list of arrays. You passed: x=' + str(x))
+        all_inputs.append(x)
 
-    Returns:
-        A `History` object.
+      # Build the model using the retrieved inputs (value or symbolic).
+      # If values or generated from a dataset, then in symbolic-mode
+      # placeholders will be created to match the value shapes.
+      is_build_called = True
+      if is_x_iterator:
+        cast_inputs = nest.map_structure(lambda v: v.shape, x)
+      elif training_utils.has_tensors(x):
+        cast_inputs = training_utils.cast_if_floating_dtype(x)
+      else:
+        cast_inputs = x
+      self._set_inputs(cast_inputs)
+    else:
+      dict_inputs = isinstance(self.inputs, dict)
+    if dict_inputs and context.executing_eagerly():
+      # No support for graph functions when the model expects dictionary inputs
+      # (i.e. FeatureColumn-based models).
+      self.run_eagerly = True
 
-    Example:
+    if y is not None:
+      if not self.optimizer:
+        raise RuntimeError('You must compile a model before '
+                           'training/testing. '
+                           'Use `model.compile(optimizer, loss)`.')
+      if not self._is_compiled:
+        # On-the-fly compilation of the model.
+        # We need to use `y` to set the model targets.
+        if training_utils.has_tensors(y):
+          y = training_utils.cast_if_floating_dtype(y)
+        if isinstance(y, (list, tuple)):
+          if not all(isinstance(v, np.ndarray) or
+                     tensor_util.is_tensor(v) for v in y):
+            raise ValueError('Please provide as model targets either a single '
+                             'array or a list of arrays. '
+                             'You passed: y=' + str(y))
+          all_inputs += list(y)
+        elif isinstance(y, dict):
+          raise ValueError('Please do not pass a dictionary as model targets.')
+        else:
+          if not isinstance(y, np.ndarray) and not tensor_util.is_tensor(y):
+            raise ValueError('Please provide as model targets either a single '
+                             'array or a list of arrays. '
+                             'You passed: y=' + str(y))
+          all_inputs.append(y)
 
-    ```python
-        def generate_arrays_from_file(path):
-            while 1:
-                f = open(path)
-                for line in f:
-                    # create numpy arrays of input data
-                    # and labels, from each line in the file
-                    x1, x2, y = process_line(line)
-                    yield ({'input_1': x1, 'input_2': x2}, {'output': y})
-                f.close()
+        # Typecheck that all inputs are *either* value *or* symbolic.
+        # TODO(fchollet): this check could be removed in Eager mode?
+        if any(tensor_util.is_tensor(v) for v in all_inputs):
+          if not all(tensor_util.is_tensor(v) for v in all_inputs):
+            raise ValueError('Do not pass inputs that mix Numpy arrays and '
+                             'TensorFlow tensors. '
+                             'You passed: x=' + str(x) + '; y=' + str(y))
 
-        model.fit_generator(generate_arrays_from_file('/my_file.txt'),
-                            steps_per_epoch=10000, epochs=10)
-    ```
-    Raises:
-        ValueError: In case the generator yields data in an invalid format.
-    """
-    if self._distribution_strategy:
-      raise NotImplementedError('`fit_generator` is not supported for '
-                                'models compiled with DistributionStrategy.')
-    return training_generator.fit_generator(
-        self,
-        generator,
-        steps_per_epoch=steps_per_epoch,
-        epochs=epochs,
-        verbose=verbose,
-        callbacks=callbacks,
-        validation_data=validation_data,
-        validation_steps=validation_steps,
-        class_weight=class_weight,
-        max_queue_size=max_queue_size,
-        workers=workers,
-        use_multiprocessing=use_multiprocessing,
-        shuffle=shuffle,
-        initial_epoch=initial_epoch)
+        if self.run_eagerly or is_x_iterator:
+          target_tensors = None
+        else:
+          # Handle target tensors if any passed.
+          if not isinstance(y, (list, tuple)):
+            y = [y]
+          target_tensors = [v for v in y if _is_symbolic_tensor(v)]
+        is_compile_called = True
+        self.compile(
+            optimizer=self.optimizer,
+            loss=self.loss,
+            metrics=self._compile_metrics,
+            weighted_metrics=self._compile_weighted_metrics,
+            loss_weights=self.loss_weights,
+            target_tensors=target_tensors,
+            run_eagerly=self.run_eagerly)
 
-  def evaluate_generator(self,
-                         generator,
-                         steps=None,
-                         max_queue_size=10,
-                         workers=1,
-                         use_multiprocessing=False,
-                         verbose=0):
-    """Evaluates the model on a data generator.
+    # In graph mode, if we had just set inputs and targets as symbolic tensors
+    # by invoking build and compile on the model respectively, we do not have to
+    # feed anything to the model. Model already has input and target data as
+    # part of the graph.
+    # Note: in this case, `any` and `all` are equivalent since we disallow
+    # mixed symbolic/value inputs.
+    if (not self.run_eagerly and is_build_called and is_compile_called and
+        not is_x_iterator and any(_is_symbolic_tensor(v) for v in all_inputs)):
+      return [], [], []
+
+    # What follows is input validation and standardization to list format,
+    # in the case where all inputs are value arrays.
 
-    The generator should return the same kind of data
-    as accepted by `test_on_batch`.
+    if self.run_eagerly:
+      # In eager mode, do not do shape validation
+      # since the network has no input nodes (placeholders) to be fed.
+      feed_input_names = self.input_names
+      feed_input_shapes = None
+    elif not self._is_graph_network:
+      # Case: symbolic-mode subclassed network. Do not do shape validation.
+      feed_input_names = self._feed_input_names
+      feed_input_shapes = None
+    else:
+      # Case: symbolic-mode graph network.
+      # In this case, we run extensive shape validation checks.
+      feed_input_names = self._feed_input_names
+      feed_input_shapes = self._feed_input_shapes
 
-    Arguments:
-        generator: Generator yielding tuples (inputs, targets)
-            or (inputs, targets, sample_weights)
-            or an instance of `keras.utils.Sequence`
-            object in order to avoid duplicate data
-            when using multiprocessing.
-        steps: Total number of steps (batches of samples)
-            to yield from `generator` before stopping.
-            Optional for `Sequence`: if unspecified, will use
-            the `len(generator)` as a number of steps.
-        max_queue_size: maximum size for the generator queue
-        workers: Integer. Maximum number of processes to spin up
-            when using process-based threading.
-            If unspecified, `workers` will default to 1. If 0, will
-            execute the generator on the main thread.
-        use_multiprocessing: Boolean.
-            If `True`, use process-based threading.
-            If unspecified, `use_multiprocessing` will default to `False`.
-            Note that because this implementation relies on multiprocessing,
-            you should not pass non-picklable arguments to the generator
-            as they can't be passed easily to children processes.
-        verbose: Verbosity mode, 0 or 1.
+    # Standardize the inputs.
+    x = training_utils.standardize_input_data(
+        x,
+        feed_input_names,
+        feed_input_shapes,
+        check_batch_axis=False,  # Don't enforce the batch size.
+        exception_prefix='input')
 
-    Returns:
-        Scalar test loss (if the model has a single output and no metrics)
-        or list of scalars (if the model has multiple outputs
-        and/or metrics). The attribute `model.metrics_names` will give you
-        the display labels for the scalar outputs.
+    if y is not None:
+      if not self._is_graph_network:
+        feed_output_names = self._feed_output_names
+        feed_output_shapes = None
+        # Sample weighting not supported in this case.
+        # TODO(fchollet): consider supporting it.
+        feed_sample_weight_modes = [None for _ in self.outputs]
+      else:
+        feed_output_names = self._feed_output_names
+        feed_sample_weight_modes = self._feed_sample_weight_modes
+        feed_output_shapes = []
+        for output_shape, loss_fn in zip(self._feed_output_shapes,
+                                         self._feed_loss_fns):
+          if loss_fn is losses.sparse_categorical_crossentropy:
+            if K.image_data_format() == 'channels_first':
+              feed_output_shapes.append(
+                  (output_shape[0], 1) + output_shape[2:])
+            else:
+              feed_output_shapes.append(output_shape[:-1] + (1,))
+          elif (not hasattr(loss_fn, '__name__') or
+                getattr(losses, loss_fn.__name__, None) is None):
+            # If `loss_fn` is not a function (e.g. callable class)
+            # or if it not in the `losses` module, then
+            # it is a user-defined loss and we make no assumptions
+            # about it.
+            feed_output_shapes.append(None)
+          else:
+            feed_output_shapes.append(output_shape)
 
-    Raises:
-        ValueError: in case of invalid arguments.
+      # Standardize the outputs.
+      y = training_utils.standardize_input_data(
+          y,
+          feed_output_names,
+          # Don't enforce target shapes to match output shapes.
+          # Precise checks will be run in `check_loss_and_target_compatibility`.
+          shapes=None,
+          check_batch_axis=False,  # Don't enforce the batch size.
+          exception_prefix='target')
 
-    Raises:
-        ValueError: In case the generator yields data in an invalid format.
-    """
-    if self._distribution_strategy:
-      raise NotImplementedError('`evaluate_generator` is not supported for '
-                                'models compiled with DistributionStrategy.')
-    return training_generator.evaluate_generator(
-        self,
-        generator,
-        steps=steps,
-        max_queue_size=max_queue_size,
-        workers=workers,
-        use_multiprocessing=use_multiprocessing,
-        verbose=verbose)
+      # Generate sample-wise weight values given the `sample_weight` and
+      # `class_weight` arguments.
+      sample_weights = training_utils.standardize_sample_weights(
+          sample_weight, feed_output_names)
+      class_weights = training_utils.standardize_class_weights(
+          class_weight, feed_output_names)
+      sample_weights = [
+          training_utils.standardize_weights(ref, sw, cw, mode)
+          for (ref, sw, cw, mode) in zip(y, sample_weights, class_weights,
+                                         feed_sample_weight_modes)
+      ]
+      # Check that all arrays have the same length.
+      if not self._distribution_strategy:
+        training_utils.check_array_lengths(x, y, sample_weights)
+        if self._is_graph_network and not self.run_eagerly:
+          # Additional checks to avoid users mistakenly using improper loss fns.
+          training_utils.check_loss_and_target_compatibility(
+              y, self._feed_loss_fns, feed_output_shapes)
+    else:
+      y = []
+      sample_weights = []
 
-  def predict_generator(self,
-                        generator,
-                        steps=None,
-                        max_queue_size=10,
-                        workers=1,
-                        use_multiprocessing=False,
-                        verbose=0):
-    """Generates predictions for the input samples from a data generator.
+    if self.stateful and batch_size:
+      # Check that for stateful networks, number of samples is a multiple
+      # of the static batch size.
+      if x[0].shape[0] % batch_size != 0:
+        raise ValueError('In a stateful network, '
+                         'you should only pass inputs with '
+                         'a number of samples that can be '
+                         'divided by the batch size. Found: ' +
+                         str(x[0].shape[0]) + ' samples')
 
-    The generator should return the same kind of data as accepted by
-    `predict_on_batch`.
+    # If dictionary inputs were provided, we return a dictionary as well.
+    if dict_inputs:
+      x = dict(zip(feed_input_names, x))
+    return x, y, sample_weights
 
-    Arguments:
-        generator: Generator yielding batches of input samples
-            or an instance of `keras.utils.Sequence` object in order to
-            avoid duplicate data when using multiprocessing.
-        steps: Total number of steps (batches of samples)
-            to yield from `generator` before stopping.
-            Optional for `Sequence`: if unspecified, will use
-            the `len(generator)` as a number of steps.
-        max_queue_size: Maximum size for the generator queue.
-        workers: Integer. Maximum number of processes to spin up
-            when using process-based threading.
-            If unspecified, `workers` will default to 1. If 0, will
-            execute the generator on the main thread.
-        use_multiprocessing: Boolean.
-            If `True`, use process-based threading.
-            If unspecified, `use_multiprocessing` will default to `False`.
-            Note that because this implementation relies on multiprocessing,
-            you should not pass non-picklable arguments to the generator
-            as they can't be passed easily to children processes.
-        verbose: verbosity mode, 0 or 1.
+  @checkpointable.no_automatic_dependency_tracking
+  def _set_inputs(self, inputs, outputs=None, training=None):
+    """Set model's input and output specs based on the input data received.
 
-    Returns:
-        Numpy array(s) of predictions.
+    This is to be used for Model subclasses, which do not know at instantiation
+    time what their inputs look like.
 
+    Args:
+      inputs: Single array, or list of arrays. The arrays could be placeholders,
+        Numpy arrays, data tensors, or TensorShapes.
+        - if placeholders: the model is built on top of these placeholders,
+          and we expect Numpy data to be fed for them when calling `fit`/etc.
+        - if Numpy data or TensorShapes: we create placeholders matching the
+          TensorShapes or shapes of the Numpy arrays. We expect Numpy data to be
+          fed for these placeholders when calling `fit`/etc.
+        - if data tensors: the model is built on top of these tensors.
+          We do not expect any Numpy data to be provided when calling `fit`/etc.
+      outputs: None, a data tensor, or a list of tensors. If None, the
+        outputs will be determined by invoking `self.call()`, otherwise the
+        provided value will be used.
+      training: Boolean or None. Only relevant in symbolic mode. Specifies
+        whether to build the model's graph in inference mode (False), training
+        mode (True), or using the Keras learning phase (None).
     Raises:
-        ValueError: In case the generator yields data in an invalid format.
+      ValueError: If dict inputs are passed to a Sequential Model where the
+        first layer isn't FeatureLayer.
     """
-    if self._distribution_strategy:
-      raise NotImplementedError('`predict_generator` is not supported for '
-                                'models compiled with DistributionStrategy.')
-    return training_generator.predict_generator(
-        self,
-        generator,
-        steps=steps,
-        max_queue_size=max_queue_size,
-        workers=workers,
-        use_multiprocessing=use_multiprocessing,
-        verbose=verbose)
+    if self.inputs:
+      raise ValueError('Model inputs are already set.')
 
-  def _get_callback_model(self):
-    """Returns the Callback Model for this Model."""
+    if self.__class__.__name__ == 'Sequential' and not self.built:
+      if tensor_util.is_tensor(inputs):
+        input_shape = (None,) + tuple(inputs.shape.as_list()[1:])
+      elif isinstance(inputs, tensor_shape.TensorShape):
+        input_shape = (None,) + tuple(inputs.as_list()[1:])
+      elif isinstance(inputs, dict):
+        # We assert that the first layer is a FeatureLayer.
+        if not training_utils.is_feature_layer(self.layers[0]):
+          raise ValueError('Passing a dictionary input to a Sequential Model '
+                           'which doesn\'t have FeatureLayer as the first layer'
+                           ' is an error.')
+        input_shape = (None,)
+      else:
+        input_shape = (None,) + tuple(inputs.shape[1:])
+      self._build_input_shape = input_shape
 
-    if hasattr(self, '_replicated_model') and self._replicated_model:
-      # When using training_distributed, we set the callback model
-      # to an instance of the `DistributedModel` that we create in
-      # the `compile` call. The `DistributedModel` is initialized
-      # with the first replicated model. We need to set the callback
-      # model to a DistributedModel to allow us to override saving
-      # and loading weights when we checkpoint the model during training.
-      return self._replicated_model
-    if hasattr(self, 'callback_model') and self.callback_model:
-      return self.callback_model
-    return self
+    # On-the-fly setting of symbolic model inputs (either by using the tensor
+    # provided, or by creating a placeholder if Numpy data was provided).
+    model_inputs = training_utils.ModelInputs(inputs)
+    inputs = model_inputs.get_symbolic_inputs()
+    self.inputs = model_inputs.get_symbolic_inputs(return_single_as_list=True)
+    self.input_names = model_inputs.get_input_names()
 
-  def _make_callback_model(self, grouped_model):
-    first_replicated_model = self._distribution_strategy.unwrap(
-        grouped_model)[0]
-    # We initialize the callback model with the first replicated model.
-    self._replicated_model = DistributedCallbackModel(first_replicated_model)
-    self._replicated_model.set_original_model(self)
+    self._feed_inputs = []
+    self._feed_input_names = []
+    self._feed_input_shapes = []
+
+    for k, v in model_inputs.as_dict():
+      if K.is_placeholder(v):
+        self._feed_input_names.append(k)
+        self._feed_inputs.append(v)
+        self._feed_input_shapes.append(K.int_shape(v))
+
+    # TODO(fchollet): consider calling `_maybe_build` before calling the model.
+    if outputs is None:
+      if not self._dynamic:
+        # The network may include dynamic layers but its `call`
+        # itself isn't dynamic.
+        # Obtain symbolic outputs by calling the model.
+        with K.get_graph().as_default():
+          if self._expects_training_arg:
+            outputs = self.call(inputs, training=training)
+          else:
+            outputs = self.call(inputs)
+      else:
+        # Case: network's `call` is dynamic.
+        try:
+          outputs = self._symbolic_call(inputs)
+        except NotImplementedError:
+          # Static shape inference was not implemented for this dynamic net.
+          # Do not specify symbolic outputs.
+          outputs = None
+
+    outputs = nest.flatten(outputs)
+    self.outputs = outputs
+    self.output_names = training_utils.generic_output_names(outputs)
+    self.built = True
 
 
 class DistributedCallbackModel(Model):
diff --git a/tensorflow/python/keras/engine/training_arrays.py b/tensorflow/python/keras/engine/training_arrays.py
index 390357303e2d519e20fe492313806944b643624a..03033c33348e7336883ba8ff16db0ee229512ac0 100644
--- a/tensorflow/python/keras/engine/training_arrays.py
+++ b/tensorflow/python/keras/engine/training_arrays.py
@@ -23,6 +23,7 @@ import functools
 
 import numpy as np
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import errors
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import callbacks as cbks
@@ -38,91 +39,6 @@ except ImportError:
   issparse = None
 
 
-class Aggregator(object):
-  """Abstract base class used to aggregate batch-level outputs of a loop.
-
-  Arguments:
-    use_steps: Whether the loop is using `step` or `batch_size`.
-    num_samples_or_steps: Either `batch_size*num_batches` or `steps`.
-  """
-
-  def __init__(self, use_steps, num_samples_or_steps):
-    self.use_steps = use_steps
-    self.num_samples_or_steps = num_samples_or_steps
-    self.results = []
-
-  def create(self, batch_outs):
-    """Create the initial results from the first batch outputs.
-
-    Arguments:
-      batch_outs: A list of batch-level outputs.
-    """
-    raise NotImplementedError
-
-  def aggregate(self, batch_outs, batch_start=None, batch_end=None):
-    """Aggregate batch-level results into total results.
-
-    Arguments:
-      batch_outs: A list of batch-level outputs.
-      batch_start: The start index of this batch. Always `None` if `use_steps`
-        is `True`.
-      batch_end: The end index of this batch. Always `None` if `use_steps` is
-        `True`.
-    """
-    raise NotImplementedError
-
-  def finalize(self):
-    """Prepare the total results to be returned."""
-    raise NotImplementedError
-
-
-class MetricsAggregator(Aggregator):
-  """Aggregator that calculates loss and metrics info."""
-
-  def create(self, batch_outs):
-    self.results = [0.] * len(batch_outs)
-
-  def aggregate(self, batch_outs, batch_start=None, batch_end=None):
-    # Loss.
-    if self.use_steps:
-      self.results[0] += batch_outs[0]
-    else:
-      self.results[0] += batch_outs[0] * (batch_end - batch_start)
-    # Metrics (always stateful, just grab current values.)
-    self.results[1:] = batch_outs[1:]
-
-  def finalize(self):
-    self.results[0] /= self.num_samples_or_steps
-
-
-class OutputsAggregator(Aggregator):
-  """Aggregator that concatenates outputs."""
-
-  def create(self, batch_outs):
-    if self.use_steps:
-      # Cannot pre-allocate the returned NumPy arrays bc
-      # batch sizes are unknown. Concatenate batches at the end.
-      for _ in batch_outs:
-        self.results.append([])
-    else:
-      # Pre-allocate NumPy arrays.
-      for batch_out in batch_outs:
-        shape = (self.num_samples_or_steps,) + batch_out.shape[1:]
-        self.results.append(np.zeros(shape, dtype=batch_out.dtype))
-
-  def aggregate(self, batch_outs, batch_start=None, batch_end=None):
-    if self.use_steps:
-      for i, batch_out in enumerate(batch_outs):
-        self.results[i].append(batch_out)
-    else:
-      for i, batch_out in enumerate(batch_outs):
-        self.results[i][batch_start:batch_end] = batch_out
-
-  def finalize(self):
-    if self.use_steps:
-      self.results = [np.concatenate(result, axis=0) for result in self.results]
-
-
 def _get_model_feed(model, mode):
   if mode == 'predict':
     feed = model._feed_inputs
@@ -152,13 +68,6 @@ def _print_train_info(inputs, val_inputs, steps_per_epoch, verbose):
           (inputs[0].shape[0], val_inputs[0].shape[0]))
 
 
-def _get_progbar(model, count_mode):
-  stateful_metric_names = None
-  if hasattr(model, 'metrics_names'):
-    stateful_metric_names = model.metrics_names[1:]  # Exclude `loss`
-  return cbks.ProgbarLogger(count_mode, stateful_metrics=stateful_metric_names)
-
-
 def _get_num_samples_or_steps(ins, batch_size, steps_per_epoch):
   """Returns total number of samples (when training in batch mode) or steps."""
   if steps_per_epoch:
@@ -167,18 +76,6 @@ def _get_num_samples_or_steps(ins, batch_size, steps_per_epoch):
                                           'steps_per_epoch')
 
 
-def _make_logs(model, outputs, mode, prefix=''):
-  """Used to make logs to send to `on_batch_end` methods."""
-  logs = {}
-  # TODO(omalleyt): handle outputs in prediction when Callback
-  # hooks are ready.
-  if mode in ['train', 'test']:
-    if hasattr(model, 'metrics_names'):
-      for label, output in zip(model.metrics_names, outputs):
-        logs[prefix + label] = output
-  return logs
-
-
 def _prepare_feed_values(model, inputs, targets, sample_weights, mode):
   """Prepare feed values to the model execution function.
 
@@ -193,8 +90,22 @@ def _prepare_feed_values(model, inputs, targets, sample_weights, mode):
     Feed values for the model in the given mode.
   """
   if model._distribution_strategy:
-    return training_distributed._prepare_feed_values(model, inputs, targets,
-                                                     sample_weights, mode)
+    def get_distributed_inputs():
+      return training_distributed._prepare_feed_values(
+          model, inputs, targets, sample_weights, mode)
+
+    # In the eager case, we want to call the input method per step, so return
+    # a lambda from here that can be called. Note that this is applicable only
+    # in Distribution Strategy case as it follows the same code path for both
+    # eager and graph modes.
+    # TODO(priyag,omalleyt): Either we should move the training DS with
+    # EagerIterator to use training_generator code path, or figure out how to
+    # set a symbolic Iterator out of a Dataset when in eager mode.
+    if context.executing_eagerly():
+      return get_distributed_inputs
+    else:
+      return get_distributed_inputs()
+
   inputs = training_utils.ModelInputs(inputs).as_list()
   targets = targets or []
   sample_weights = sample_weights or []
@@ -204,11 +115,11 @@ def _prepare_feed_values(model, inputs, targets, sample_weights, mode):
   return ins
 
 
-def _get_execution_function(model, mode):
-  """Get function to run one step of model execution."""
+def _make_execution_function(model, mode):
+  """Makes function to run one step of model execution."""
   if model._distribution_strategy:
-    return training_distributed._get_execution_function(model, mode)
-  return model._get_execution_function(mode)
+    return training_distributed._make_execution_function(model, mode)
+  return model._make_execution_function(mode)
 
 
 def model_iteration(model,
@@ -227,6 +138,7 @@ def model_iteration(model,
                     steps_per_epoch=None,
                     validation_steps=None,
                     mode='train',
+                    validation_in_fit=False,
                     **kwargs):
   """Loop function for arrays of data with modes 'train'/'test'/'predict'.
 
@@ -253,6 +165,9 @@ def model_iteration(model,
       validation_steps: Number of steps to run validation for (only if doing
         validation from data tensors). Ignored with the default value of `None`.
       mode: One of 'train'/'test'/'predict'.
+      validation_in_fit: if true, then this method is invoked from within
+        training iteration (for validation). In this case, do not copy weights
+        when using a tf.distribute.Strategy.
       **kwargs: Additional arguments for backwards compatibility.
 
   Returns:
@@ -277,7 +192,7 @@ def model_iteration(model,
     scope.__enter__()
 
   # Get step function and loop type.
-  f = _get_execution_function(model, mode)
+  f = _make_execution_function(model, mode)
   use_steps = steps_per_epoch is not None
   do_validation = val_inputs is not None
 
@@ -292,24 +207,19 @@ def model_iteration(model,
       callbacks,
       model,
       do_validation=do_validation,
-      val_inputs=val_inputs,
-      val_targets=val_targets,
-      val_sample_weights=val_sample_weights,
       batch_size=batch_size,
       epochs=epochs,
       steps_per_epoch=steps_per_epoch,
       samples=num_samples_or_steps,
-      validation_steps=validation_steps,
       verbose=0,  # Handle ProgBarLogger separately in this loop.
-      count_mode=count_mode,
       mode=mode)
   # TODO(omalleyt): Handle ProgBar as part of Callbacks once hooks are ready.
-  progbar = _get_progbar(model, count_mode)
+  progbar = training_utils.get_progbar(model, count_mode)
   progbar.params = callbacks.params
   progbar.params['verbose'] = verbose
 
   # Find beforehand arrays that need sparse-to-dense conversion.
-  if issparse is not None:
+  if issparse is not None and not use_steps:
     indices_for_conversion_to_dense = []
     feed = _get_model_feed(model, mode)
     for i, (input_data, feed_tensor) in enumerate(zip(ins, feed)):
@@ -318,26 +228,27 @@ def model_iteration(model,
 
   # Select aggregation method.
   if mode == 'predict':
-    aggregator = OutputsAggregator(use_steps, num_samples_or_steps)
+    aggregator = training_utils.OutputsAggregator(use_steps,
+                                                  num_samples_or_steps)
   else:
-    aggregator = MetricsAggregator(use_steps, num_samples_or_steps)
+    aggregator = training_utils.MetricsAggregator(use_steps,
+                                                  num_samples_or_steps)
 
-  if model._distribution_strategy:
-    training_distributed._copy_weights_to_distributed_model(model)
+  if model._distribution_strategy and not validation_in_fit:
+    training_distributed._copy_weights_to_distributed_model(
+        model, model._grouped_model)
 
   callbacks.model.stop_training = False
   callbacks._call_begin_hook(mode)
   progbar.on_train_begin()
+
   for epoch in range(initial_epoch, epochs):
     if callbacks.model.stop_training:
       break
 
     # Setup work for each epoch
-    results = []
     epoch_logs = {}
-    if hasattr(model, 'metrics'):
-      for m in model.metrics:
-        m.reset_states()
+    model.reset_metrics()
     callbacks.on_epoch_begin(epoch, epoch_logs, mode=mode)
     progbar.on_epoch_begin(epoch, epoch_logs)
 
@@ -350,7 +261,9 @@ def model_iteration(model,
 
         # Get outputs.
         try:
-          batch_outs = f(ins)
+          # `ins` can be callable in DistributionStrategy + eager case.
+          actual_inputs = ins() if callable(ins) else ins
+          batch_outs = f(actual_inputs)
         except errors.OutOfRangeError:
           logging.warning('Your dataset iterator ran out of data; '
                           'interrupting training. Make sure that your dataset '
@@ -372,7 +285,7 @@ def model_iteration(model,
         aggregator.aggregate(batch_outs)
 
         # Callbacks batch end.
-        batch_logs.update(_make_logs(model, batch_outs, mode))
+        batch_logs = cbks.make_logs(model, batch_logs, batch_outs, mode)
         callbacks._call_batch_hook(mode, 'end', step, batch_logs)
         progbar.on_batch_end(step, batch_logs)
 
@@ -423,7 +336,7 @@ def model_iteration(model,
         aggregator.aggregate(batch_outs, batch_start, batch_end)
 
         # Callbacks batch end.
-        batch_logs.update(_make_logs(model, batch_outs, mode))
+        batch_logs = cbks.make_logs(model, batch_logs, batch_outs, mode)
         callbacks._call_batch_hook(mode, 'end', batch_index, batch_logs)
         progbar.on_batch_end(batch_index, batch_logs)
 
@@ -432,7 +345,7 @@ def model_iteration(model,
 
     aggregator.finalize()
     results = aggregator.results
-    epoch_logs.update(_make_logs(model, results, mode))
+    epoch_logs = cbks.make_logs(model, epoch_logs, results, mode)
     if len(results) == 1:
       results = results[0]
 
@@ -447,17 +360,26 @@ def model_iteration(model,
           steps_per_epoch=validation_steps,
           callbacks=callbacks,
           verbose=0,
-          mode='test')
+          mode='test',
+          validation_in_fit=True)
       if not isinstance(val_results, list):
         val_results = [val_results]
-      epoch_logs.update(_make_logs(model, val_results, mode, prefix='val_'))
+      epoch_logs = cbks.make_logs(
+          model, epoch_logs, val_results, mode, prefix='val_')
+
+    if mode == 'train':
+      # Epochs only apply to `fit`.
+      callbacks.on_epoch_end(epoch, epoch_logs, mode=mode)
+      progbar.on_epoch_end(epoch, epoch_logs)
 
-    callbacks.on_epoch_end(epoch, epoch_logs, mode=mode)
-    progbar.on_epoch_end(epoch, epoch_logs)
   callbacks._call_end_hook(mode)
 
   if model._distribution_strategy:
-    training_distributed._copy_weights_to_original_model(model, mode)
+    # TODO(priyag, psv): Copy back metrics to the original model as well?
+    if not validation_in_fit:
+      training_distributed._copy_weights_to_original_model(
+          model, model._grouped_model, mode)
+
     scope.__exit__(None, None, None)
 
   if mode == 'train':
diff --git a/tensorflow/python/keras/engine/training_dataset_test.py b/tensorflow/python/keras/engine/training_dataset_test.py
index 8020326377eea537a8bd173356fefadc66892190..d6cc93d1ef77b14142851e6267158d61edcbc13b 100644
--- a/tensorflow/python/keras/engine/training_dataset_test.py
+++ b/tensorflow/python/keras/engine/training_dataset_test.py
@@ -20,14 +20,14 @@ from __future__ import print_function
 
 import logging
 
-from absl.testing import parameterized
-
 import numpy as np
 
 from tensorflow.python import keras
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util as tf_test_util
+from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.ops.losses import losses_impl
@@ -36,29 +36,24 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training.rmsprop import RMSPropOptimizer
 
 
-class TestTrainingWithDatasetIterators(test.TestCase, parameterized.TestCase):
+class TestTrainingWithDatasetIterators(keras_parameterized.TestCase):
 
-  @parameterized.parameters(
-      {'model': 'functional'},
-      {'model': 'subclass'},
-  )
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_training_and_eval_methods_on_iterators_single_io(self, model):
-    if model == 'functional':
-      model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
-    elif model == 'subclass':
-      model = testing_utils.get_small_sequential_mlp(1, 4)
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
+  def test_training_and_eval_methods_on_iterators_single_io(self):
+    model = testing_utils.get_small_mlp(1, 4, input_dim=3)
     optimizer = RMSPropOptimizer(learning_rate=0.001)
     loss = 'mse'
     metrics = ['mae', metrics_module.CategoricalAccuracy()]
-    model.compile(optimizer, loss, metrics=metrics)
+    model.compile(optimizer, loss, metrics=metrics,
+                  run_eagerly=testing_utils.should_run_eagerly())
 
-    inputs = np.zeros((10, 3))
-    targets = np.zeros((10, 4))
+    inputs = np.zeros((10, 3), np.float32)
+    targets = np.zeros((10, 4), np.float32)
     dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
     dataset = dataset.repeat(100)
     dataset = dataset.batch(10)
-    iterator = dataset.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
 
     model.fit(iterator, epochs=1, steps_per_epoch=2, verbose=1)
     model.evaluate(iterator, steps=2, verbose=1)
@@ -104,20 +99,22 @@ class TestTrainingWithDatasetIterators(test.TestCase, parameterized.TestCase):
                                  'you should specify the `steps` argument'):
       model.predict(iterator, verbose=0)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
   def test_get_next_op_created_once(self):
-    model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
+    model = testing_utils.get_small_mlp(1, 4, input_dim=3)
     optimizer = RMSPropOptimizer(learning_rate=0.001)
     loss = 'mse'
     metrics = ['mae']
-    model.compile(optimizer, loss, metrics=metrics)
+    model.compile(optimizer, loss, metrics=metrics,
+                  run_eagerly=testing_utils.should_run_eagerly())
 
-    inputs = np.zeros((10, 3))
-    targets = np.zeros((10, 4))
+    inputs = np.zeros((10, 3), np.float32)
+    targets = np.zeros((10, 4), np.float32)
     dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
     dataset = dataset.repeat(100)
     dataset = dataset.batch(10)
-    iterator = dataset.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
 
     model.fit(iterator, epochs=1, steps_per_epoch=2, verbose=1)
     # Finalize graph to make sure we are not appending another iterator
@@ -125,20 +122,22 @@ class TestTrainingWithDatasetIterators(test.TestCase, parameterized.TestCase):
     ops.get_default_graph().finalize()
     model.fit(iterator, epochs=1, steps_per_epoch=2, verbose=1)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
   def test_iterators_running_out_of_data(self):
-    model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
+    model = testing_utils.get_small_mlp(1, 4, input_dim=3)
     optimizer = RMSPropOptimizer(learning_rate=0.001)
     loss = 'mse'
     metrics = ['mae']
-    model.compile(optimizer, loss, metrics=metrics)
+    model.compile(optimizer, loss, metrics=metrics,
+                  run_eagerly=testing_utils.should_run_eagerly())
 
-    inputs = np.zeros((10, 3))
-    targets = np.zeros((10, 4))
+    inputs = np.zeros((10, 3), np.float32)
+    targets = np.zeros((10, 4), np.float32)
     dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
     dataset = dataset.repeat(2)
     dataset = dataset.batch(10)
-    iterator = dataset.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
 
     with test.mock.patch.object(logging, 'warning') as mock_log:
       model.fit(iterator, epochs=1, steps_per_epoch=3, verbose=0)
@@ -147,18 +146,25 @@ class TestTrainingWithDatasetIterators(test.TestCase, parameterized.TestCase):
           'dataset iterator ran out of data')
 
 
-class TestTrainingWithDataset(test.TestCase, parameterized.TestCase):
+class TestTrainingWithDataset(keras_parameterized.TestCase):
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
   def test_calling_model_on_same_dataset(self):
-    model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
+    if ((not testing_utils.should_run_eagerly())
+        and testing_utils.get_model_type() == 'subclass'
+        and context.executing_eagerly()):
+      self.skipTest('b/120673224')
+
+    model = testing_utils.get_small_mlp(1, 4, input_dim=3)
     optimizer = RMSPropOptimizer(learning_rate=0.001)
     loss = 'mse'
     metrics = ['mae']
-    model.compile(optimizer, loss, metrics=metrics)
+    model.compile(optimizer, loss, metrics=metrics,
+                  run_eagerly=testing_utils.should_run_eagerly())
 
-    inputs = np.zeros((10, 3))
-    targets = np.zeros((10, 4))
+    inputs = np.zeros((10, 3), np.float32)
+    targets = np.zeros((10, 4), np.float32)
     dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
     dataset = dataset.repeat(100)
     dataset = dataset.batch(10)
@@ -172,16 +178,18 @@ class TestTrainingWithDataset(test.TestCase, parameterized.TestCase):
     model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
               validation_data=dataset, validation_steps=2)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
   def test_training_and_eval_methods_on_dataset(self):
-    model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
+    model = testing_utils.get_small_mlp(1, 4, input_dim=3)
     optimizer = RMSPropOptimizer(learning_rate=0.001)
     loss = 'mse'
     metrics = ['mae', metrics_module.CategoricalAccuracy()]
-    model.compile(optimizer, loss, metrics=metrics)
+    model.compile(optimizer, loss, metrics=metrics,
+                  run_eagerly=testing_utils.should_run_eagerly())
 
-    inputs = np.zeros((10, 3))
-    targets = np.zeros((10, 4))
+    inputs = np.zeros((10, 3), np.float32)
+    targets = np.zeros((10, 4), np.float32)
     dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
     dataset = dataset.repeat(100)
     dataset = dataset.batch(10)
@@ -230,13 +238,15 @@ class TestTrainingWithDataset(test.TestCase, parameterized.TestCase):
                                  'you should specify the `steps` argument'):
       model.predict(dataset, verbose=0)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
   def test_dataset_with_sample_weights(self):
-    model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
+    model = testing_utils.get_small_mlp(1, 4, input_dim=3)
     optimizer = RMSPropOptimizer(learning_rate=0.001)
     loss = 'mse'
     metrics = ['mae', metrics_module.CategoricalAccuracy()]
-    model.compile(optimizer, loss, metrics=metrics)
+    model.compile(optimizer, loss, metrics=metrics,
+                  run_eagerly=testing_utils.should_run_eagerly())
 
     inputs = np.zeros((10, 3), np.float32)
     targets = np.zeros((10, 4), np.float32)
@@ -250,21 +260,15 @@ class TestTrainingWithDataset(test.TestCase, parameterized.TestCase):
     model.evaluate(dataset, steps=2, verbose=1)
     model.predict(dataset, steps=2)
 
-  @parameterized.parameters(
-      {'model': 'functional'},
-      {'model': 'subclass'},
-  )
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_dataset_with_sparse_labels(self, model):
-    if model == 'functional':
-      model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
-    elif model == 'subclass':
-      model = testing_utils.get_small_sequential_mlp(1, 4)
-
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
+  def test_dataset_with_sparse_labels(self):
+    model = testing_utils.get_small_mlp(1, 4, input_dim=3)
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
     for loss in ['sparse_categorical_crossentropy',
                  losses_impl.sparse_softmax_cross_entropy]:
-      optimizer = RMSPropOptimizer(learning_rate=0.001)
-      model.compile(optimizer, loss)
+      model.compile(optimizer, loss,
+                    run_eagerly=testing_utils.should_run_eagerly())
 
       inputs = np.zeros((10, 3), dtype=np.float32)
       targets = np.random.randint(0, 4, size=10, dtype=np.int32)
@@ -304,28 +308,31 @@ class TestTrainingWithDataset(test.TestCase, parameterized.TestCase):
         model.train_on_batch(dataset)
 
 
-class TestMetricsWithDatasetIterators(test.TestCase):
+class TestMetricsWithDatasetIterators(keras_parameterized.TestCase):
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
   def test_metrics_correctness_with_iterator(self):
-    model = keras.Sequential()
-    model.add(
-        keras.layers.Dense(
-            8, activation='relu', input_dim=4, kernel_initializer='ones'))
-    model.add(
-        keras.layers.Dense(
-            1, activation='sigmoid', kernel_initializer='ones'))
+    layers = [
+        keras.layers.Dense(8, activation='relu', input_dim=4,
+                           kernel_initializer='ones'),
+        keras.layers.Dense(1, activation='sigmoid', kernel_initializer='ones')
+    ]
+
+    model = testing_utils.get_model_from_layers(layers, (4,))
+
     model.compile(
         loss='binary_crossentropy',
         metrics=['accuracy', metrics_module.BinaryAccuracy()],
-        optimizer=RMSPropOptimizer(learning_rate=0.001))
+        optimizer=RMSPropOptimizer(learning_rate=0.001),
+        run_eagerly=testing_utils.should_run_eagerly())
 
     np.random.seed(123)
     x = np.random.randint(10, size=(100, 4)).astype(np.float32)
     y = np.random.randint(2, size=(100, 1)).astype(np.float32)
     dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
     dataset = dataset.batch(10)
-    iterator = dataset.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
     outs = model.evaluate(iterator, steps=10)
     self.assertEqual(np.around(outs[1], decimals=1), 0.5)
     self.assertEqual(np.around(outs[2], decimals=1), 0.5)
@@ -334,7 +341,7 @@ class TestMetricsWithDatasetIterators(test.TestCase):
     dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
     dataset = dataset.repeat(100)
     dataset = dataset.batch(10)
-    iterator = dataset.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
     outs = model.evaluate(iterator, steps=10)
     self.assertEqual(outs[1], 0.)
     self.assertEqual(outs[2], 0.)
diff --git a/tensorflow/python/keras/engine/training_distributed.py b/tensorflow/python/keras/engine/training_distributed.py
index 7cf961b9ec3bda4c1cfae2cefac0b520790370c4..ffb0266911e4d2d4ae5939d2744d99fabaab5267 100644
--- a/tensorflow/python/keras/engine/training_distributed.py
+++ b/tensorflow/python/keras/engine/training_distributed.py
@@ -163,17 +163,13 @@ def experimental_fit_loop(model,
   do_validation = bool(validation_steps)
 
   # Copy the weights from the original model to each of the replicated models.
-  orig_model_weights = model.get_weights()
   with current_strategy.scope():
-    distributed_model = current_strategy.unwrap(model._grouped_model_train)[0]
-    distributed_training_utils.set_weights(
-        current_strategy, distributed_model, orig_model_weights)
+    _copy_weights_to_distributed_model(model, model._grouped_model_train)
+
   callbacks = cbks.configure_callbacks(
       callbacks,
       model,
       do_validation=do_validation,
-      val_inputs=None,
-      val_targets=None,
       epochs=epochs,
       steps_per_epoch=steps_per_epoch,
       verbose=verbose)
@@ -187,6 +183,8 @@ def experimental_fit_loop(model,
 
   callbacks.on_train_begin()
   for epoch in range(initial_epoch, epochs):
+    with current_strategy.scope():
+      _reset_metrics(model, model._grouped_model_train)
     callbacks.on_epoch_begin(epoch)
     epoch_logs = {}
     step_index = 0
@@ -219,9 +217,8 @@ def experimental_fit_loop(model,
       # Since we create a new clone from the original model we need to copy
       # the weights back to the original model before we can run validation.
       with current_strategy.scope():
-        updated_weights = current_strategy.unwrap(
-            model._grouped_model_train)[0].get_weights()
-        model.set_weights(updated_weights)
+        _copy_weights_to_original_model(model, model._grouped_model_train,
+                                        'train')
 
       val_outs = experimental_test_loop(  # pylint: disable=undefined-variable
           model,
@@ -242,9 +239,7 @@ def experimental_fit_loop(model,
 
   # Copy the weights back from the replicated model to the original model.
   with current_strategy.scope():
-    updated_weights = current_strategy.unwrap(
-        model._grouped_model_train)[0].get_weights()
-    model.set_weights(updated_weights)
+    _copy_weights_to_original_model(model, model._grouped_model_train, 'train')
 
   K.get_session().run(current_strategy.finalize())
   return model.history
@@ -347,22 +342,26 @@ def experimental_test_loop(model,
     progbar = Progbar(target=steps)
 
   # Copy the weights from the original model to each of the replicated models.
-  orig_model_weights = model.get_weights()
   with current_strategy.scope():
-    distributed_model = current_strategy.unwrap(model._grouped_model_test)[0]
-    distributed_training_utils.set_weights(
-        current_strategy, distributed_model, orig_model_weights)
-
+    _copy_weights_to_distributed_model(model, model._grouped_model_test)
+    _reset_metrics(model, model._grouped_model_test)
   assert steps is not None
   outs = [0.] * len(model.metrics_names)
   for step in range(steps):
     _, batch_outs = K.get_session().run([test_op, output_tensors])
     for i, label in enumerate(model.metrics_names):
-      outs[i] += batch_outs[label]
+      if i == 0:
+        # Loss is stateless metrics.
+        outs[i] += batch_outs[label]
+      else:
+        # For all stateful metrics, the aggregation is handled by mirrored vars.
+        outs[i] = batch_outs[label]
+
     if verbose >= 1:
       progbar.update(step + 1)
-  for i in range(len(outs)):
-    outs[i] /= (steps)
+
+  if len(outs) >= 0:
+    outs[0] /= (steps)
 
   if initialize_finalize_strategy:
     K.get_session().run(current_strategy.finalize())
@@ -457,12 +456,9 @@ def experimental_predict_loop(model, iterator, verbose=0, steps=None):
     progbar = Progbar(target=steps)
 
   # Copy the weights from the original model to each of the replicated models.
-  orig_model_weights = model.get_weights()
   with current_strategy.scope():
-    distributed_model = current_strategy.unwrap(model._grouped_model_predict)[0]
-    distributed_training_utils.set_weights(
-        current_strategy, distributed_model, orig_model_weights)
-
+    _copy_weights_to_distributed_model(model, model._grouped_model_predict)
+    _reset_metrics(model, model._grouped_model_predict)
   assert steps is not None
   # Since we do not know how many samples we will see, we cannot pre-allocate
   # the returned Numpy arrays. Instead, we store one array per batch seen
@@ -574,18 +570,15 @@ def _get_input_from_iterator(iterator, model):
   # Validate that all the elements in x and y are of the same type and shape.
   # We can then pass the first element of x and y to `_standardize_weights`
   # below and be confident of the output.
-  x_values, y_values, sample_weights_values = distributed_training_utils.\
-    validate_distributed_dataset_inputs(model._distribution_strategy, x, y,
-                                        sample_weights)
-  model._standardize_weights(x_values, y_values,
-                             sample_weight=sample_weights_values)
+  distributed_training_utils.validate_distributed_dataset_inputs(
+      model._distribution_strategy, x, y, sample_weights)
   return x, y, sample_weights
 
 
-def _get_execution_function(model, mode):
-  """Get function to run one step of distributed model execution."""
+def _make_execution_function(model, mode):
+  """Makes function to run one step of distributed model execution."""
   if context.executing_eagerly():
-    return _get_eager_execution_function(model, mode)
+    return _make_eager_execution_function(model, mode)
 
   strategy = model._distribution_strategy
   if not model._grouped_model:
@@ -593,7 +586,7 @@ def _get_execution_function(model, mode):
         model, strategy, make_callback_model=(mode == 'train'))
 
   def _per_device_function(model):
-    f = model._get_execution_function(mode)
+    f = model._make_execution_function(mode)
     return (f.inputs, f.outputs, f.updates_op, f.session_kwargs)
 
   with strategy.scope():
@@ -631,15 +624,15 @@ def _get_execution_function(model, mode):
         **all_session_args)
 
 
-def _get_eager_execution_function(model, mode):
-  """Get function to run one step of distributed model eager execution."""
+def _make_eager_execution_function(model, mode):
+  """Makes function to run one step of distributed model eager execution."""
   strategy = model._distribution_strategy
   if not model._grouped_model:
     clone_model_on_replicas(
         model, strategy, make_callback_model=(mode == 'train'))
 
   def _per_device_function(model):
-    f = model._get_execution_function(mode)
+    f = model._make_execution_function(mode)
     return (f.inputs, f.outputs)
 
   # NOTE(priyag): Try creating a new FuncGraph within DS scope instead of using
@@ -657,7 +650,8 @@ def _get_eager_execution_function(model, mode):
     (all_inputs, all_outputs, _, _) = distributed_training_utils.unwrap_values(
         strategy,
         grouped_inputs,
-        grouped_outputs)
+        grouped_outputs,
+        with_loss_tensor=(mode != 'predict'))
 
     return K.function(
         all_inputs,
@@ -696,22 +690,23 @@ def _prepare_feed_values(model, inputs, targets, sample_weights, mode):
   return ins
 
 
-def _copy_weights_to_distributed_model(model):
+def _copy_weights_to_distributed_model(original_model, grouped_model):
   """Copies weights from original model to distributed models."""
-  if model._distribution_strategy:
-    # Copy the weights from the original model to each of the replicated models.
-    orig_model_weights = model.get_weights()
-    distributed_model = model._distribution_strategy.unwrap(
-        model._grouped_model)[0]
-    distributed_training_utils.set_weights(
-        model._distribution_strategy, distributed_model, orig_model_weights)
+  strategy = original_model._distribution_strategy
+  if strategy:
+    # Copy the weights from the original model to each of the replicated
+    # models.
+    orig_model_weights = original_model.get_weights()
+    distributed_model = strategy.unwrap(grouped_model)[0]
+    distributed_training_utils.set_weights(strategy, distributed_model,
+                                           orig_model_weights)
 
 
-def _copy_weights_to_original_model(model, mode):
+def _copy_weights_to_original_model(model, grouped_model, mode):
   """Copies weights from first distributed model back to original model."""
   if model._distribution_strategy and mode == 'train':
     updated_weights = model._distribution_strategy.unwrap(
-        model._grouped_model)[0].get_weights()
+        grouped_model)[0].get_weights()
     model.set_weights(updated_weights)
 
 
@@ -725,3 +720,11 @@ def _per_device_aggregate_batch(batch_outs, model, mode):
       total_batch_outs.append(np.concatenate(nest.flatten(nested_outs)))
     return total_batch_outs
   return batch_outs
+
+
+def _reset_metrics(model, distributed_model=None):
+  if model._distribution_strategy:
+    distributed_model = (
+        distributed_model or
+        model._distribution_strategy.unwrap(model._grouped_model)[0])
+    distributed_model.reset_metrics()
diff --git a/tensorflow/python/keras/engine/training_eager.py b/tensorflow/python/keras/engine/training_eager.py
index cd85c365db4a0f694ba859e04ccbe6a4f3c84ce8..895db5bc633669641b0493b8bfb918094f312513 100644
--- a/tensorflow/python/keras/engine/training_eager.py
+++ b/tensorflow/python/keras/engine/training_eager.py
@@ -20,19 +20,12 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
-import copy
 
-import numpy as np
-
-from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.eager.backprop import GradientTape
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.keras import backend
-from tensorflow.python.keras import callbacks as cbks
 from tensorflow.python.keras import losses as losses_module
-from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.keras.engine import training_utils
 from tensorflow.python.keras.utils import generic_utils
 from tensorflow.python.keras.utils.losses_utils import squeeze_or_expand_dimensions
@@ -181,361 +174,6 @@ def _model_loss(model,
   return outs, total_loss, loss_metrics, aggregated_loss_metrics, masks
 
 
-def iterator_fit_loop(model,
-                      inputs,
-                      class_weight,
-                      steps_per_epoch,
-                      epoch_logs,
-                      val_inputs=None,
-                      val_targets=None,
-                      val_sample_weights=None,
-                      epochs=1,
-                      verbose=1,
-                      callbacks=None,
-                      validation_steps=None,
-                      do_validation=False,
-                      batch_size=None,
-                      output_loss_metrics=None):
-  """Fit function for eager execution when input is given as dataset iterator.
-
-  Updates the given epoch logs.
-
-  Arguments:
-      model: Instance of the `Model`.
-      inputs: Input dataset iterator.
-      class_weight: Optional class-weight array to weight the importance of
-          samples in `inputs` based on the class they belong to, as conveyed by
-          the targets from the `inputs` iterator.
-      steps_per_epoch: Total number of steps (batches of samples)
-          before declaring one epoch finished and starting the
-          next epoch.
-      epoch_logs: Dictionary of logs from every epoch.
-      val_inputs: Input data for validation.
-      val_targets: Target data for validation.
-      val_sample_weights: Sample weight data for validation.
-      epochs: Number of times to iterate over the data
-      verbose: Verbosity mode, 0, 1 or 2
-      callbacks: CallbackList instance. Controls callbacks during training.
-      validation_steps: Number of steps to run validation for (only if doing
-        validation from data tensors). Ignored with default value of `None`.
-      do_validation: Boolean value indicating whether we should do validation.
-      batch_size: int, val_inputs and val_targets will be evaled batch by
-        batch with size batch_size if they are array.
-      output_loss_metrics: List of metrics that are used to aggregated output
-        loss values.
-
-  Raises:
-      ValueError: In case of mismatch between given number of inputs and
-        expectations of the model.
-  """
-  assert isinstance(inputs, iterator_ops.EagerIterator)
-
-  # make sure either x,y or x,y,sample_weights is provided
-  if (not isinstance(inputs.output_shapes, collections.Sequence) or
-      len(inputs.output_shapes) not in (2, 3)):
-    raise ValueError('Please provide either inputs and targets '
-                     'or inputs, targets, and sample_weights')
-
-  for step_index in range(steps_per_epoch):
-    batch_logs = {'batch': step_index, 'size': 1}
-    callbacks.on_batch_begin(step_index, batch_logs)
-
-    # Get data from the iterator.
-    try:
-      next_element = inputs.get_next()
-    except errors.OutOfRangeError:
-      logging.warning(
-          'Your dataset iterator ran out of data; interrupting training. Make '
-          'sure that your dataset can generate at least '
-          '`steps_per_epoch * epochs` batches (in this case, %d batches). You '
-          'may need to use the repeat() function when building your '
-          'dataset.' % steps_per_epoch * epochs)
-      break
-
-    if len(inputs.output_shapes) == 2:
-      x, y = next_element
-      sample_weights = None
-    else:
-      x, y, sample_weights = next_element
-
-    # Validate and standardize data.
-    x, y, sample_weights = model._standardize_user_data(
-        x, y, sample_weight=sample_weights, class_weight=class_weight)
-    x = training_utils.cast_if_floating_dtype(x)
-    y = training_utils.cast_if_floating_dtype(y)
-    if sample_weights:
-      sample_weights = [
-          training_utils.cast_if_floating_dtype(
-              ops.convert_to_tensor(val, dtype=backend.floatx()))
-          if val is not None else None for val in sample_weights
-      ]
-
-    # Train model.
-    outs, loss, _, aggregated_loss_metrics, masks = _process_single_batch(
-        model,
-        x,
-        y,
-        output_loss_metrics=output_loss_metrics,
-        sample_weights=sample_weights,
-        training=True)
-    outs = generic_utils.to_list(outs)
-
-    if step_index == 0:
-      # Set stateful_metrics in callbacks. We do not do this before the
-      # `steps_per_epoch` loop because model will be compiled only in the first
-      # iteration of this loop in the deferred build scenario.
-      for cbk in callbacks:
-        if (isinstance(cbk, cbks.BaseLogger) or
-            isinstance(cbk, cbks.ProgbarLogger)):
-          cbk.stateful_metrics = model.metrics_names[1:]  # Exclude `loss`
-
-      callback_metrics = copy.copy(model.metrics_names)
-      if do_validation:
-        callback_metrics += ['val_' + n for n in model.metrics_names]
-      callbacks.set_params({
-          'batch_size': batch_size,
-          'epochs': epochs,
-          'steps': steps_per_epoch,
-          'verbose': verbose,
-          'do_validation': do_validation,
-          'metrics': callback_metrics or [],
-          'validation_steps': validation_steps
-      })
-
-    # Calculate metrics.
-    for l, o in zip(model.metrics_names, outs):
-      batch_logs[l] = o
-    metrics_results = _eager_metrics_fn(
-        model, outs, y, sample_weights=sample_weights, masks=masks)
-    batch_logs['loss'] = tensor_util.constant_value(backend.mean(loss))
-
-    for k, v in zip(
-        model.metrics_names,
-        [backend.mean(loss)] + aggregated_loss_metrics + metrics_results):
-      batch_logs[k] = tensor_util.constant_value(v)
-    callbacks.on_batch_end(step_index, batch_logs)
-    if callbacks.model.stop_training:
-      break
-
-    if step_index == steps_per_epoch - 1:
-      if do_validation:
-        val_outs = test_loop(
-            model,
-            val_inputs,
-            val_targets,
-            sample_weights=val_sample_weights,
-            steps=validation_steps,
-            verbose=0,
-            batch_size=batch_size)
-        if not isinstance(val_outs, list):
-          val_outs = [val_outs]
-        # Same labels assumed.
-        for l, o in zip(model.metrics_names, val_outs):
-          epoch_logs['val_' + l] = o
-
-
-def iterator_test_loop(model, inputs, steps, verbose=0):
-  """Test function for eager execution when input is given as dataset iterator.
-
-  Arguments:
-      model: Model instance that is being evaluated in Eager mode.
-      inputs: Input dataset iterator.
-      steps: Total number of steps (batches of samples) before declaring
-      predictions finished.
-      verbose: Verbosity mode.
-
-  Returns:
-      Scalar loss (if the model has a single output and no metrics)
-      or list of scalars (if the model has multiple outputs
-      and/or metrics). The attribute `model.metrics_names` will give you
-      the display labels for the scalar outputs.
-
-  Raises:
-      ValueError: In case of mismatch between given number of inputs and
-        expectations of the model.
-  """
-  assert isinstance(inputs, iterator_ops.EagerIterator)
-  # make sure either x,y or x,y,sample_weights is provided
-  if (not isinstance(inputs.output_shapes, collections.Sequence) or
-      len(inputs.output_shapes) < 2 or len(inputs.output_shapes) > 3):
-    raise ValueError('Please provide either inputs and targets'
-                     'or inputs, targets, and sample_weights')
-  outs = []
-
-  # Create metric wrapper for the losses.
-  output_loss_metrics = []
-  for i in range(len(model.outputs)):
-    loss_fn = model.loss_functions[i]
-    loss_name = loss_fn.name if isinstance(
-        loss_fn, losses_module.Loss) else loss_fn.__name__
-    mean_wrapped_loss = metrics_module.MeanMetricWrapper(
-        loss_fn, name=loss_name)
-    output_loss_metrics.append(mean_wrapped_loss)
-
-  num_samples = 0
-  if verbose == 1:
-    progbar = generic_utils.Progbar(target=steps)
-  for step_index in range(steps):
-    # Get data from the iterator.
-    try:
-      next_element = inputs.get_next()
-    except errors.OutOfRangeError:
-      logging.warning(
-          'Your dataset iterator ran out of data interrupting testing. '
-          'Make sure that your dataset can generate at least `steps` batches '
-          '(in this case, %d batches). You may need to use the repeat() '
-          'function when building your dataset.', steps)
-      break
-
-    if len(inputs.output_shapes) == 2:
-      x, y = next_element
-      sample_weights = None
-    else:
-      x, y, sample_weights = next_element
-
-    # Validate and standardize data.
-    x, y, sample_weights = model._standardize_user_data(
-        x, y, sample_weight=sample_weights)
-    x = training_utils.cast_if_floating_dtype(x)
-    y = training_utils.cast_if_floating_dtype(y)
-    if sample_weights:
-      sample_weights = [
-          training_utils.cast_if_floating_dtype(
-              ops.convert_to_tensor(val, dtype=backend.floatx()))
-          if val is not None else None for val in sample_weights
-      ]
-
-    if step_index == 0:
-      # Get stateful metrics indices. We do not do this before the `steps` loop
-      # because model will be compiled only in the first iteration of this loop
-      # in the deferred build scenario.
-      if hasattr(model, '_compile_metrics'):
-        for m in model.metrics:
-          m.reset_states()
-      for m in output_loss_metrics:
-        m.reset_states()
-
-    # Calculate model output, loss values.
-    loss_outs, loss, _, aggregated_loss_metrics, masks = _model_loss(
-        model,
-        x,
-        y,
-        output_loss_metrics=output_loss_metrics,
-        sample_weights=sample_weights,
-        training=False)
-    metrics_results = _eager_metrics_fn(
-        model, loss_outs, y, sample_weights=sample_weights, masks=masks)
-    batch_outs = []
-    for _, v in zip(
-        model.metrics_names,
-        [backend.mean(loss)] + aggregated_loss_metrics + metrics_results):
-      batch_outs.append(tensor_util.constant_value(v))
-
-    # Get current step size.
-    if isinstance(x, list):
-      step_size = x[0].get_shape().as_list()[0]
-    elif isinstance(x, dict):
-      step_size = list(x.values())[0].get_shape().as_list()[0]
-    else:
-      step_size = x.get_shape().as_list()[0]
-
-    # Accumulate results in output array.
-    if not isinstance(batch_outs, list):
-      batch_outs = [batch_outs]
-    if step_index == 0:
-      for _ in enumerate(batch_outs):
-        outs.append(0.)
-    outs[0] += batch_outs[0] * step_size  # index 0 = 'loss'
-    outs[1:] = batch_outs[1:]
-
-    # Calculate sample size.
-    num_samples += step_size
-    if verbose == 1:
-      progbar.update(step_index + 1)
-
-  outs[0] /= num_samples  # index 0 = 'loss'
-  if len(outs) == 1:
-    return outs[0]
-  return outs
-
-
-def iterator_predict_loop(model, inputs, steps, verbose=0):
-  """Predict function for eager execution when input is dataset iterator.
-
-  Arguments:
-      model: Instance of `Model`.
-      inputs: Input dataset iterator.
-      steps: Total number of steps (batches of samples) before declaring
-          `_predict_loop` finished.
-      verbose: Verbosity mode.
-
-  Returns:
-      Array of predictions (if the model has a single output)
-      or list of arrays of predictions (if the model has multiple outputs).
-
-  Raises:
-      ValueError: In case of mismatch between given number of inputs and
-        expectations of the model.
-  """
-  assert isinstance(inputs, iterator_ops.EagerIterator)
-  if not isinstance(inputs.output_shapes,
-                    collections.Sequence) or len(inputs.output_shapes) > 3:
-    raise ValueError(
-        'Please provide data as a list or tuple of 1, 2, or 3 elements '
-        ' - `(input)`, or `(input, target)`, or `(input, target,'
-        'sample_weights)`. Received %s. We do not use the `target` or'
-        '`sample_weights` value here.' % inputs.output_shapes)
-  outs = []
-  if verbose == 1:
-    progbar = generic_utils.Progbar(target=steps)
-
-  for step_index in range(steps):
-    # Get data from the iterator.
-    try:
-      next_element = inputs.get_next()
-    except errors.OutOfRangeError:
-      logging.warning(
-          'Your dataset iterator ran out of data; interrupting prediction. '
-          'Make sure that your dataset can generate at least `steps` batches '
-          '(in this case, %d batches). You may need to use the repeat() '
-          'function when building your dataset.', steps)
-      break
-
-    # expects a tuple, where first element of tuple represents inputs
-    x = next_element[0]
-
-    # Validate and standardize data.
-    x, _, _ = model._standardize_user_data(x)
-    x = training_utils.cast_if_floating_dtype(x)
-
-    if isinstance(x, list) and len(x) == 1:
-      x = x[0]
-
-    if model._expects_training_arg:
-      batch_outs = model.call(x, training=False)
-    else:
-      batch_outs = model.call(x)
-    if not isinstance(batch_outs, list):
-      batch_outs = [batch_outs]
-
-    # We collect the results from every step and then concatenate them once
-    # in the end. This is an expensive process. We are doing this because we
-    # do not know the number of samples beforehand.
-    if step_index == 0:
-      for _ in batch_outs:
-        outs.append([])
-    for i, batch_out in enumerate(batch_outs):
-      outs[i].append(backend.get_value(batch_out))
-
-    if verbose == 1:
-      progbar.update(step_index + 1)
-  for i, out in enumerate(outs):
-    outs[i] = np.concatenate(tuple(out), axis=0)
-  if len(outs) == 1:
-    return outs[0]
-  return outs
-
-
 def _process_single_batch(model,
                           inputs,
                           targets,
@@ -606,15 +244,15 @@ def train_on_batch(model, inputs, targets, sample_weights=None):
       inputs = training_utils.cast_if_floating_dtype(inputs)
       targets = training_utils.cast_if_floating_dtype(targets)
     else:
-      inputs = [
-          ops.convert_to_tensor(val, dtype=backend.floatx()) for val in inputs
-      ]
-      targets = [
-          ops.convert_to_tensor(val, dtype=backend.floatx()) for val in targets
-      ]
+      inputs = training_utils.cast_if_floating_dtype([
+          ops.convert_to_tensor(val) for val in inputs
+      ])
+      targets = training_utils.cast_if_floating_dtype([
+          ops.convert_to_tensor(val) for val in targets
+      ])
   if sample_weights:
     sample_weights = [
-        ops.convert_to_tensor(val, dtype=backend.floatx())
+        training_utils.cast_if_floating_dtype(ops.convert_to_tensor(val))
         if val is not None else None for val in sample_weights
     ]
 
@@ -628,7 +266,7 @@ def train_on_batch(model, inputs, targets, sample_weights=None):
       targets,
       sample_weights=sample_weights,
       masks=masks,
-      return_stateful_result=False)
+      return_stateful_result=True)
   loss = generic_utils.to_list(loss)
 
   return [
@@ -654,15 +292,15 @@ def test_on_batch(model, inputs, targets, sample_weights=None):
       inputs = training_utils.cast_if_floating_dtype(inputs)
       targets = training_utils.cast_if_floating_dtype(targets)
     else:
-      inputs = [
-          ops.convert_to_tensor(val, dtype=backend.floatx()) for val in inputs
-      ]
-      targets = [
-          ops.convert_to_tensor(val, dtype=backend.floatx()) for val in targets
-      ]
+      inputs = training_utils.cast_if_floating_dtype([
+          ops.convert_to_tensor(val) for val in inputs
+      ])
+      targets = training_utils.cast_if_floating_dtype([
+          ops.convert_to_tensor(val) for val in targets
+      ])
   if sample_weights:
     sample_weights = [
-        ops.convert_to_tensor(val, dtype=backend.floatx())
+        training_utils.cast_if_floating_dtype(ops.convert_to_tensor(val))
         if val is not None else None for val in sample_weights
     ]
   outs, loss, loss_metrics, _, masks = _model_loss(
@@ -675,186 +313,10 @@ def test_on_batch(model, inputs, targets, sample_weights=None):
       targets,
       sample_weights=sample_weights,
       masks=masks,
-      return_stateful_result=False)
+      return_stateful_result=True)
   loss = generic_utils.to_list(loss)
 
   return [
       tensor_util.constant_value(v)
       for v in loss + loss_metrics + metrics_results
   ]
-
-
-def fit_loop(model,
-             inputs,
-             targets,
-             sample_weights=None,
-             class_weight=None,
-             val_inputs=None,
-             val_targets=None,
-             val_sample_weights=None,
-             batch_size=None,
-             epochs=1,
-             verbose=1,
-             callbacks=None,
-             shuffle=True,
-             initial_epoch=0,
-             steps_per_epoch=None,
-             validation_steps=None):
-  """Fit function for eager execution.
-
-  Arguments:
-      model: Instance of the model that is being executed in Eager mode.
-      inputs: List of input arrays.
-      targets: List of target arrays.
-      sample_weights: Optional list of sample weight arrays.
-      class_weight: Optional class-weight array to weight the importance of
-          samples in `inputs` based on the class they belong to, as conveyed by
-          `targets`.
-      val_inputs: Input data for validation.
-      val_targets: Target data for validation.
-      val_sample_weights: Sample weight data for validation.
-      batch_size: Integer batch size or None if unknown.
-      epochs: Number of times to iterate over the data
-      verbose: Verbosity mode, 0, 1 or 2
-      callbacks: List of callbacks to be called during training
-      shuffle: Whether to shuffle the data at the beginning of each epoch
-      initial_epoch: Epoch at which to start training
-          (useful for resuming a previous training run)
-      steps_per_epoch: Total number of steps (batches of samples)
-          before declaring one epoch finished and starting the
-          next epoch. Ignored with the default value of `None`.
-      validation_steps: Number of steps to run validation for (only if doing
-        validation from data tensors). Ignored with default value of `None`.
-
-  Returns:
-      `History` object.
-
-  Raises:
-    ValueError: In case of invalid argument values.
-  """
-  # Convert training inputs to an EagerIterator
-  inputs, steps_per_epoch = training_utils.convert_to_iterator(
-      x=inputs,
-      y=targets,
-      sample_weights=sample_weights,
-      batch_size=batch_size,
-      steps_per_epoch=steps_per_epoch,
-      epochs=epochs,
-      shuffle=shuffle)
-  # Required for eager execution
-  with backend.learning_phase_scope(1):
-    do_validation = val_inputs is not None
-    callbacks = cbks.configure_callbacks(
-        callbacks,
-        model,
-        do_validation=do_validation,
-        batch_size=batch_size,
-        epochs=epochs,
-        steps_per_epoch=steps_per_epoch,
-        val_inputs=val_inputs,
-        val_targets=val_targets,
-        val_sample_weights=val_sample_weights,
-        validation_steps=validation_steps,
-        verbose=verbose)
-
-    # Create metric wrapper for the losses.
-    output_loss_metrics = []
-    for i in range(len(model.outputs)):
-      loss_fn = model.loss_functions[i]
-      loss_name = loss_fn.name if isinstance(
-          loss_fn, losses_module.Loss) else loss_fn.__name__
-      mean_wrapped_loss = metrics_module.MeanMetricWrapper(
-          loss_fn, name=loss_name)
-      output_loss_metrics.append(mean_wrapped_loss)
-
-    callbacks.on_train_begin()
-    for epoch in range(initial_epoch, epochs):
-      if model._is_compiled:  # Model may not be compiled the first time.
-        # Reset stateful metrics
-        for m in model.metrics:
-          m.reset_states()
-
-      for m in output_loss_metrics:
-        m.reset_states()
-
-      callbacks.on_epoch_begin(epoch)
-      epoch_logs = {}
-      iterator_fit_loop(
-          model,
-          inputs,
-          class_weight,
-          steps_per_epoch=steps_per_epoch,
-          epoch_logs=epoch_logs,
-          val_inputs=val_inputs,
-          val_targets=val_targets,
-          val_sample_weights=val_sample_weights,
-          epochs=epochs,
-          verbose=verbose,
-          callbacks=callbacks,
-          validation_steps=validation_steps,
-          do_validation=do_validation,
-          batch_size=batch_size,
-          output_loss_metrics=output_loss_metrics)
-      callbacks.on_epoch_end(epoch, epoch_logs)
-      if callbacks.model.stop_training:
-        break
-  callbacks.on_train_end()
-  return model.history
-
-
-def test_loop(model, inputs, targets,
-              sample_weights=None,
-              batch_size=None,
-              verbose=0,
-              steps=None):
-  """Test function for eager execution.
-
-  Arguments:
-      model: Model instance that is being evaluated in Eager mode.
-      inputs: List of input arrays.
-      targets: List of target arrays.
-      sample_weights: Optional list of sample weight arrays.
-      batch_size: integer batch size or `None`.
-      verbose: verbosity mode.
-      steps: Total number of steps (batches of samples)
-          before declaring predictions finished.
-          Ignored with the default value of `None`.
-
-  Returns:
-      Scalar loss (if the model has a single output and no metrics)
-      or list of scalars (if the model has multiple outputs
-      and/or metrics). The attribute `model.metrics_names` will give you
-      the display labels for the scalar outputs.
-  """
-  inputs, steps = training_utils.convert_to_iterator(
-      x=inputs,
-      y=targets,
-      sample_weights=sample_weights,
-      batch_size=batch_size,
-      steps_per_epoch=steps,
-      is_validation=True)
-  with backend.learning_phase_scope(0):
-    return iterator_test_loop(model, inputs, steps, verbose=verbose)
-
-
-def predict_loop(model, inputs, batch_size=32, verbose=0, steps=None):
-  """Predict function for eager execution.
-
-  Arguments:
-      model: Instance of `Model`.
-      inputs: List of input arrays.
-      batch_size: integer batch size.
-      verbose: verbosity mode.
-      steps: Total number of steps (batches of samples)
-          before declaring `_predict_loop` finished.
-          Ignored with the default value of `None`.
-
-  Returns:
-      Array of predictions (if the model has a single output)
-      or list of arrays of predictions
-      (if the model has multiple outputs).
-  """
-  with backend.learning_phase_scope(0):
-    inputs, steps = training_utils.convert_to_iterator(
-        x=inputs, batch_size=batch_size, steps_per_epoch=steps)
-    return iterator_predict_loop(model, inputs, steps, verbose=verbose)
diff --git a/tensorflow/python/keras/engine/training_eager_test.py b/tensorflow/python/keras/engine/training_eager_test.py
index d769143106a56c6079ab70dd4b1bfcbdf6d75483..27eaea23ba09d1405ca16f3beaa2f4c4f4a18661 100644
--- a/tensorflow/python/keras/engine/training_eager_test.py
+++ b/tensorflow/python/keras/engine/training_eager_test.py
@@ -24,25 +24,27 @@ from tensorflow.python import keras
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
+from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import metrics as metrics_module
+from tensorflow.python.keras import testing_utils
+from tensorflow.python.keras.optimizer_v2 import rmsprop
 from tensorflow.python.platform import test
-from tensorflow.python.training.rmsprop import RMSPropOptimizer
 
 
-class TrainingTest(test.TestCase):
+class TrainingTest(keras_parameterized.TestCase):
 
+  @keras_parameterized.run_with_all_model_types(exclude_models='sequential')
   def test_model_methods_with_eager_tensors_multi_io(self):
-    a = keras.layers.Input(shape=(3,), name='input_a')
-    b = keras.layers.Input(shape=(3,), name='input_b')
+    input_a = keras.layers.Input(shape=(3,), name='input_a')
+    input_b = keras.layers.Input(shape=(3,), name='input_b')
 
     dense = keras.layers.Dense(4, name='dense')
-    c = dense(a)
-    d = dense(b)
-    e = keras.layers.Dropout(0.5, name='dropout')(c)
+    dropout = keras.layers.Dropout(0.5, name='dropout')
 
-    model = keras.models.Model([a, b], [d, e])
+    model = testing_utils.get_multi_io_model(
+        [input_a, dense], [input_b, dense, dropout])
 
-    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    optimizer = rmsprop.RMSprop(learning_rate=0.001)
     loss = 'mse'
     loss_weights = [1., 0.5]
     metrics = ['mae', metrics_module.CategoricalAccuracy()]
@@ -56,60 +58,59 @@ class TrainingTest(test.TestCase):
 
     input_a = keras.backend.zeros(shape=(10, 3))
     input_b = keras.backend.zeros(shape=(10, 3))
-    target_d = keras.backend.zeros(shape=(10, 4))
-    target_e = keras.backend.zeros(shape=(10, 4))
+    target_a = keras.backend.zeros(shape=(10, 4))
+    target_b = keras.backend.zeros(shape=(10, 4))
 
     model.fit(
-        [input_a, input_b], [target_d, target_e],
+        [input_a, input_b], [target_a, target_b],
         epochs=1,
         batch_size=5,
         verbose=0)
     # Test: no shuffle.
     model.fit(
-        [input_a, input_b], [target_d, target_e],
+        [input_a, input_b], [target_a, target_b],
         epochs=1,
         batch_size=5,
         verbose=0,
         shuffle=False)
     # Test: validation data.
-    model.fit([input_a, input_b], [target_d, target_e],
+    model.fit([input_a, input_b], [target_a, target_b],
               epochs=1, batch_size=2, verbose=0,
-              validation_data=([input_a, input_b], [target_d, target_e]))
-    model.train_on_batch([input_a, input_b], [target_d, target_e])
+              validation_data=([input_a, input_b], [target_a, target_b]))
+    model.train_on_batch([input_a, input_b], [target_a, target_b])
     model.predict([input_a, input_b], batch_size=5)
-    model.evaluate([input_a, input_b], [target_d, target_e],
+    model.evaluate([input_a, input_b], [target_a, target_b],
                    batch_size=2, verbose=0)
-    model.test_on_batch([input_a, input_b], [target_d, target_e])
+    model.test_on_batch([input_a, input_b], [target_a, target_b])
 
     # Test: mix np and tensors.
     input_b = np.zeros(shape=(10, 3)).astype('float32')
-    target_e = np.zeros(shape=(10, 4)).astype('float32')
+    target_b = np.zeros(shape=(10, 4)).astype('float32')
     model.fit(
-        [input_a, input_b], [target_d, target_e],
+        [input_a, input_b], [target_a, target_b],
         epochs=1,
         batch_size=5,
         verbose=0)
-    model.fit([input_a, input_b], [target_d, target_e],
+    model.fit([input_a, input_b], [target_a, target_b],
               epochs=1, batch_size=2, verbose=0,
-              validation_data=([input_a, input_b], [target_d, target_e]))
+              validation_data=([input_a, input_b], [target_a, target_b]))
     model.fit(
-        [input_a, input_b], [target_d, target_e],
+        [input_a, input_b], [target_a, target_b],
         epochs=1,
         batch_size=5,
         verbose=0,
         shuffle=False)
-    model.train_on_batch([input_a, input_b], [target_d, target_e])
+    model.train_on_batch([input_a, input_b], [target_a, target_b])
     model.predict([input_a, input_b], batch_size=5)
-    model.evaluate([input_a, input_b], [target_d, target_e],
+    model.evaluate([input_a, input_b], [target_a, target_b],
                    batch_size=2, verbose=0)
-    model.test_on_batch([input_a, input_b], [target_d, target_e])
+    model.test_on_batch([input_a, input_b], [target_a, target_b])
 
+  @keras_parameterized.run_with_all_model_types
   def test_model_methods_with_eager_tensors_single_io(self):
-    x = keras.layers.Input(shape=(3,), name='input')
-    y = keras.layers.Dense(4, name='dense')(x)
-    model = keras.Model(x, y)
+    model = testing_utils.get_small_mlp(10, 4, 3)
 
-    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    optimizer = rmsprop.RMSprop(learning_rate=0.001)
     loss = 'mse'
     metrics = ['mae', metrics_module.CategoricalAccuracy()]
     model.compile(optimizer, loss, metrics=metrics, run_eagerly=True)
@@ -126,21 +127,20 @@ class TrainingTest(test.TestCase):
     model.train_on_batch(inputs, targets)
     model.test_on_batch(inputs, targets)
 
+  @keras_parameterized.run_with_all_model_types
   def test_model_fit_and_validation_with_missing_arg_errors(self):
-    x = keras.layers.Input(shape=(3,), name='input')
-    y = keras.layers.Dense(4, name='dense')(x)
-    model = keras.Model(x, y)
-    model.compile(optimizer=RMSPropOptimizer(learning_rate=0.001),
+    model = testing_utils.get_small_mlp(10, 4, 3)
+    model.compile(optimizer=rmsprop.RMSprop(learning_rate=0.001),
                   loss='mse',
                   run_eagerly=True)
 
     x = keras.backend.zeros(shape=(10, 3))
     y = keras.backend.zeros(shape=(10, 4))
     dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).repeat(10).batch(5)
-    iterator = dataset.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
     validation_dataset = dataset_ops.Dataset.from_tensor_slices(
         (x, y)).repeat(10).batch(5)
-    validation_iterator = validation_dataset.make_one_shot_iterator()
+    validation_iterator = dataset_ops.make_one_shot_iterator(validation_dataset)
 
     with self.assertRaisesRegexp(
         ValueError, r'specify .* `steps_per_epoch`'):
@@ -152,19 +152,21 @@ class TrainingTest(test.TestCase):
           ValueError, r'provide either `batch_size` or `validation_steps`'):
         model.fit(iterator, steps_per_epoch=2, epochs=1, verbose=0,
                   validation_data=(x, y))
-    with self.assertRaisesRegexp(
-        ValueError, r'specify the number of steps'):
+    with self.assertRaisesRegexp(ValueError,
+                                 'specify the `validation_steps` argument.'):
       model.fit(iterator, steps_per_epoch=2, epochs=1, verbose=0,
                 validation_data=validation_dataset)
-    with self.assertRaisesRegexp(
-        ValueError, r'specify the number of steps'):
+    with self.assertRaisesRegexp(ValueError,
+                                 'specify the `validation_steps` argument.'):
       model.fit(iterator, steps_per_epoch=2, epochs=1, verbose=0,
                 validation_data=validation_iterator)
 
+  # TODO(b/120931266): Enable test on subclassed models after bug causing an
+  # extra dimension to be added to predict outputs is fixed.
+  @keras_parameterized.run_with_all_model_types(exclude_models='subclass')
   def test_generator_methods(self):
-    model = keras.Sequential()
-    model.add(keras.layers.Dense(4, input_shape=(3,)))
-    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    model = testing_utils.get_small_mlp(10, 4, 3)
+    optimizer = rmsprop.RMSprop(learning_rate=0.001)
     model.compile(
         optimizer,
         loss='mse',
@@ -189,50 +191,50 @@ class TrainingTest(test.TestCase):
     self.assertEqual(out.shape, (30, 4))
 
 
-class CorrectnessTest(test.TestCase):
+class CorrectnessTest(keras_parameterized.TestCase):
 
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
   def test_loss_correctness(self):
     # Test that training loss is the same in eager and graph
     # (by comparing it to a reference value in a deterministic case)
-    model = keras.Sequential()
-    model.add(keras.layers.Dense(3,
-                                 activation='relu',
-                                 input_dim=4,
-                                 kernel_initializer='ones'))
-    model.add(keras.layers.Dense(2,
-                                 activation='softmax',
-                                 kernel_initializer='ones'))
+    layers = [
+        keras.layers.Dense(3, activation='relu',
+                           kernel_initializer='ones'),
+        keras.layers.Dense(2, activation='softmax', kernel_initializer='ones')]
+    model = testing_utils.get_model_from_layers(layers, input_shape=(4,))
     model.compile(loss='sparse_categorical_crossentropy',
-                  optimizer=RMSPropOptimizer(learning_rate=0.001),
-                  run_eagerly=False)
+                  optimizer=rmsprop.RMSprop(learning_rate=0.001),
+                  run_eagerly=testing_utils.should_run_eagerly())
     x = np.ones((100, 4))
     np.random.seed(123)
     y = np.random.randint(0, 1, size=(100, 1))
     history = model.fit(x, y, epochs=1, batch_size=10)
-    self.assertAlmostEqual(history.history['loss'][-1], 0.6173, 4)
+    self.assertAlmostEqual(history.history['loss'][-1], 0.5836, 4)
 
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
   def test_loss_correctness_with_iterator(self):
     # Test that training loss is the same in eager and graph
     # (by comparing it to a reference value in a deterministic case)
-    model = keras.Sequential()
-    model.add(
-        keras.layers.Dense(
-            3, activation='relu', input_dim=4, kernel_initializer='ones'))
-    model.add(
-        keras.layers.Dense(2, activation='softmax', kernel_initializer='ones'))
+    layers = [
+        keras.layers.Dense(3, activation='relu',
+                           kernel_initializer='ones'),
+        keras.layers.Dense(2, activation='softmax', kernel_initializer='ones')]
+    model = testing_utils.get_model_from_layers(layers, input_shape=(4,))
     model.compile(
         loss='sparse_categorical_crossentropy',
-        optimizer=RMSPropOptimizer(learning_rate=0.001),
-        run_eagerly=True)
+        optimizer=rmsprop.RMSprop(learning_rate=0.001),
+        run_eagerly=testing_utils.should_run_eagerly())
     x = np.ones((100, 4), dtype=np.float32)
     np.random.seed(123)
     y = np.random.randint(0, 1, size=(100, 1))
     dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
     dataset = dataset.repeat(100)
     dataset = dataset.batch(10)
-    iterator = dataset.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
     history = model.fit(iterator, epochs=1, steps_per_epoch=10)
-    self.assertAlmostEqual(history.history['loss'][-1], 0.6173, 4)
+    self.assertAlmostEqual(history.history['loss'][-1], 0.5836, 4)
 
   def test_loss_in_call(self):
 
diff --git a/tensorflow/python/keras/engine/training_generator.py b/tensorflow/python/keras/engine/training_generator.py
index e7310a7bb97d417808563cb318d45c37e6b30814..bc6a3e8dd0be81ff2af8150c4d62e9416ced4f4f 100644
--- a/tensorflow/python/keras/engine/training_generator.py
+++ b/tensorflow/python/keras/engine/training_generator.py
@@ -19,421 +19,440 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
+import math
+
 import numpy as np
 
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.eager import context
 from tensorflow.python.framework import errors
+from tensorflow.python.keras import backend
 from tensorflow.python.keras import callbacks as cbks
+from tensorflow.python.keras.engine import training_utils
 from tensorflow.python.keras.utils import data_utils
-from tensorflow.python.keras.utils.generic_utils import Progbar
+from tensorflow.python.keras.utils import generic_utils
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import nest
+
+
+def model_iteration(model,
+                    data,
+                    steps_per_epoch=None,
+                    epochs=1,
+                    verbose=1,
+                    callbacks=None,
+                    validation_data=None,
+                    validation_steps=None,
+                    class_weight=None,
+                    max_queue_size=10,
+                    workers=1,
+                    use_multiprocessing=False,
+                    shuffle=False,
+                    initial_epoch=0,
+                    mode='train',
+                    batch_size=None,
+                    **kwargs):
+  """Loop function for arrays of data with modes 'train'/'test'/'predict'.
+
+  Arguments:
+      model: Keras Model instance.
+      data: Either a tuple of NumPy/Tensor inputs (i.e. `(x,)` or `(x, y)` or
+        `(x, y, sample_weights)`) or a generator or
+        `keras.utils.data_utils.Sequence` object or Eager Iterator or Dataset.
+      steps_per_epoch: Total number of steps (batches of samples) before
+        declaring one epoch finished and starting the next epoch. Ignored with
+        the default value of `None`.
+      epochs: Number of times to iterate over the data.
+      verbose: Verbosity mode, 0, 1 or 2.
+      callbacks: List of callbacks to be called during training.
+      validation_data: Either a tuple of NumPy/Tensor inputs (i.e. `(x,)` or
+        `(x, y)` or `(x, y, sample_weights)`) or a generator or
+        `keras.utils.data_utils.Sequence` object or Eager Iterator or Dataset.
+      validation_steps: Total number of steps (batches of samples) before
+        declaring validation finished.
+      class_weight: Dictionary mapping class indices to a weight for the class.
+      max_queue_size: Integer. Maximum size for the generator queue. If
+        unspecified, `max_queue_size` will default to 10.
+      workers: Integer. Maximum number of processes to spin up when using
+        process-based threading. If unspecified, `workers` will default to 1. If
+        0, will execute the generator on the main thread.
+      use_multiprocessing: Boolean. If `True`, use process-based threading. If
+        unspecified, `use_multiprocessing` will default to `False`. Note that
+        because this implementation relies on multiprocessing, you should not
+        pass non-picklable arguments to the generator as they can't be passed
+        easily to children processes.
+      shuffle: Boolean. Whether to shuffle the order of the batches at the
+        beginning of each epoch. Only used with instances of `Sequence`
+        (`keras.utils.Sequence`). Has no effect when `steps_per_epoch` is not
+        `None`.
+      initial_epoch: Epoch at which to start training (useful for resuming a
+        previous training run).
+      mode: One of 'train'/'test'/'predict'.
+      batch_size: Integer batch size or None if unknown. Will only be used if
+        `data` is in NumPy/Tensor format.
+      **kwargs: Additional arguments for backwards compatibility. `steps` is
+        accepted as an alias for `steps_per_epoch`.
+
+  Returns:
+      - In 'train' mode: `History` object.
+      - In 'test' mode: Evaluation metrics.
+      - In 'predict' mode: Outputs of the Model called on inputs.
+
+  Raises:
+      ValueError: in case of invalid arguments.
+  """
+  if 'steps' in kwargs:
+    steps_per_epoch = kwargs['steps']
+
+  # Convert to a format that supports `next(generator)`.
+  generator, steps_per_epoch = convert_to_generator_like(
+      data,
+      steps_per_epoch=steps_per_epoch,
+      batch_size=batch_size,
+      epochs=epochs - initial_epoch,
+      shuffle=shuffle)
+
+  do_validation = validation_data is not None
+  should_set_learning_phase = context.executing_eagerly() and model.run_eagerly
+  is_sequence = isinstance(generator, data_utils.Sequence)
+  _validate_arguments(is_sequence, use_multiprocessing, workers,
+                      steps_per_epoch, validation_data, validation_steps, mode,
+                      kwargs)
+
+  batch_function = _make_execution_function(
+      model, mode, class_weight=class_weight)
+
+  # Create the queue for the generator.
+  output_generator, enqueuer = _make_enqueued_generator(
+      generator,
+      workers=workers,
+      use_multiprocessing=use_multiprocessing,
+      max_queue_size=max_queue_size,
+      shuffle=shuffle)
+
+  num_samples_or_steps, use_steps = _get_num_samples_or_steps(
+      data, steps_per_epoch)
+
+  count_mode = 'steps' if use_steps else 'samples'
+  callbacks = cbks.configure_callbacks(
+      callbacks,
+      model,
+      do_validation=do_validation,
+      epochs=epochs,
+      steps_per_epoch=steps_per_epoch,
+      batch_size=batch_size,
+      samples=num_samples_or_steps,
+      verbose=0,  # Handle ProgBar as part of Callbacks once hooks are ready.
+      mode=mode)
+  # TODO(omalleyt): Handle ProgBar as part of Callbacks once hooks are ready.
+  progbar = training_utils.get_progbar(model, count_mode)
+  progbar.params = callbacks.params
+  progbar.params['verbose'] = verbose
+
+  if mode == 'predict':
+    aggregator = training_utils.OutputsAggregator(True, steps_per_epoch)
+  else:
+    aggregator = training_utils.MetricsAggregator(True, steps_per_epoch)
 
+  if should_set_learning_phase:
+    old_learning_phase = backend.learning_phase()
+    backend.set_learning_phase(1 if mode == 'train' else 0)
 
-def fit_generator(model,
-                  generator,
-                  steps_per_epoch=None,
-                  epochs=1,
-                  verbose=1,
-                  callbacks=None,
-                  validation_data=None,
-                  validation_steps=None,
-                  class_weight=None,
-                  max_queue_size=10,
-                  workers=1,
-                  use_multiprocessing=False,
-                  shuffle=True,
-                  initial_epoch=0):
-  """See docstring for `Model.fit_generator`."""
-  epoch = initial_epoch
-
-  do_validation = bool(validation_data)
-  if not context.executing_eagerly():
-    model._make_train_function()
-    if do_validation:
-      model._make_test_function()
+  callbacks.model.stop_training = False
+  callbacks._call_begin_hook(mode)
+  progbar.on_train_begin()
+  for epoch in range(initial_epoch, epochs):
+    if callbacks.model.stop_training:
+      break
 
-  is_sequence = isinstance(generator, data_utils.Sequence)
-  if not is_sequence and use_multiprocessing and workers > 1:
-    logging.warning(
-        UserWarning('Using a generator with `use_multiprocessing=True`'
-                    ' and multiple workers may duplicate your data.'
-                    ' Please consider using the`keras.utils.Sequence'
-                    ' class.'))
-  if steps_per_epoch is None:
-    if is_sequence:
-      steps_per_epoch = len(generator)
-    else:
-      raise ValueError('Please specify the `steps_per_epoch` argument.')
+    # Setup work for each epoch.
+    model.reset_metrics()
+    epoch_logs = {}
+    callbacks.on_epoch_begin(epoch, epoch_logs, mode=mode)
+    progbar.on_epoch_begin(epoch, epoch_logs)
 
-  if (isinstance(validation_data, dataset_ops.DatasetV2) and
-      context.executing_eagerly()):
-    validation_data = validation_data.make_one_shot_iterator()
-  val_gen = (data_utils.is_generator_or_sequence(validation_data) or
-             isinstance(validation_data, iterator_ops.EagerIterator))
-  if (val_gen and not isinstance(validation_data, data_utils.Sequence) and
-      not validation_steps):
-    raise ValueError('Please specify the `validation_steps` argument.')
+    for step in range(steps_per_epoch):
+      batch_data = _get_next_batch(output_generator, mode)
+      if batch_data is None:
+        callbacks.model.stop_training = True
+        break
 
-  enqueuer = None
-  val_enqueuer = None
+      # `batch_size` used for validation data if validation
+      # data is NumPy/EagerTensors.
+      batch_size = int(nest.flatten(batch_data)[0].shape[0])
 
-  try:
-    val_x, val_y, val_sample_weights = validation_data, None, None
-    if do_validation and not val_gen:
-      # Prepare data for validation
-      if len(validation_data) == 2:
-        val_x, val_y = validation_data  # pylint: disable=unpacking-non-sequence
-        val_sample_weights = None
-      elif len(validation_data) == 3:
-        val_x, val_y, val_sample_weights = validation_data  # pylint: disable=unpacking-non-sequence
-      else:
-        raise ValueError(
-            '`validation_data` should be a tuple '
-            '`(val_x, val_y, val_sample_weight)` '
-            'or `(val_x, val_y)`. Found: ' + str(validation_data))
-      val_x, val_y, val_sample_weights = model._standardize_user_data(
-          val_x, val_y, val_sample_weights)
-
-    callbacks = cbks.configure_callbacks(
-        callbacks,
-        model,
-        do_validation=do_validation,
-        val_inputs=val_x,
-        val_targets=val_y,
-        val_sample_weights=val_sample_weights,
-        epochs=epochs,
-        validation_steps=validation_steps,
-        steps_per_epoch=steps_per_epoch,
-        verbose=verbose)
-
-    if workers > 0:
-      if is_sequence:
-        enqueuer = data_utils.OrderedEnqueuer(
-            generator,
-            use_multiprocessing=use_multiprocessing,
-            shuffle=shuffle)
-      else:
-        enqueuer = data_utils.GeneratorEnqueuer(
-            generator,
-            use_multiprocessing=use_multiprocessing)
-      enqueuer.start(workers=workers, max_queue_size=max_queue_size)
-      output_generator = enqueuer.get()
-    else:
-      if is_sequence:
-        output_generator = data_utils.iter_sequence_infinite(generator)
-      else:
-        output_generator = generator
+      # Callbacks batch begin.
+      batch_logs = {'batch': step, 'size': batch_size}
+      callbacks._call_batch_hook(mode, 'begin', step, batch_logs)
+      progbar.on_batch_begin(step, batch_logs)
+
+      batch_outs = batch_function(*batch_data)
+      if not isinstance(batch_outs, list):
+        batch_outs = [batch_outs]
+
+      # Aggregate results.
+      if step == 0:
+        aggregator.create(batch_outs)
+      aggregator.aggregate(batch_outs)
+
+      # Callbacks batch end.
+      batch_logs = cbks.make_logs(model, batch_logs, batch_outs, mode)
+      callbacks._call_batch_hook(mode, 'end', step, batch_logs)
+      progbar.on_batch_end(step, batch_logs)
 
-    callbacks.on_train_begin()
-    # Construct epoch logs.
-    epoch_logs = {}
-    while epoch < epochs:
-      for m in model.metrics:
-        m.reset_states()
-      callbacks.on_epoch_begin(epoch)
-      steps_done = 0
-      batch_index = 0
-      while steps_done < steps_per_epoch:
-        generator_output = next(output_generator)
-        if not hasattr(generator_output, '__len__'):
-          raise ValueError('Output of generator should be '
-                           'a tuple `(x, y, sample_weight)` '
-                           'or `(x, y)`. Found: ' + str(generator_output))
-
-        if len(generator_output) == 2:
-          x, y = generator_output
-          sample_weight = None
-        elif len(generator_output) == 3:
-          x, y, sample_weight = generator_output
-        else:
-          raise ValueError('Output of generator should be '
-                           'a tuple `(x, y, sample_weight)` '
-                           'or `(x, y)`. Found: ' + str(generator_output))
-        # build batch logs
-        batch_logs = {}
-        if isinstance(x, list):
-          batch_size = x[0].shape[0]
-        elif isinstance(x, dict):
-          batch_size = list(x.values())[0].shape[0]
-        else:
-          batch_size = x.shape[0]
-        batch_logs['batch'] = int(batch_index)
-        batch_logs['size'] = int(batch_size)
-        callbacks.on_batch_begin(batch_index, batch_logs)
-
-        outs = model.train_on_batch(
-            x, y, sample_weight=sample_weight, class_weight=class_weight)
-
-        if not isinstance(outs, list):
-          outs = [outs]
-        for l, o in zip(model.metrics_names, outs):
-          batch_logs[l] = o
-
-        callbacks.on_batch_end(batch_index, batch_logs)
-
-        batch_index += 1
-        steps_done += 1
-
-        # Epoch finished.
-        if steps_done >= steps_per_epoch and do_validation:
-          if val_gen:
-            val_outs = evaluate_generator(
-                model,
-                validation_data,
-                validation_steps,
-                workers=workers,
-                use_multiprocessing=use_multiprocessing,
-                max_queue_size=max_queue_size)
-          else:
-            # No need for try/except because
-            # data has already been validated.
-            val_outs = model.evaluate(
-                val_x,
-                val_y,
-                batch_size=batch_size,
-                sample_weight=val_sample_weights,
-                verbose=0)
-          if not isinstance(val_outs, list):
-            val_outs = [val_outs]
-          # Same labels assumed.
-          for l, o in zip(model.metrics_names, val_outs):
-            epoch_logs['val_' + l] = o
-
-        if callbacks.model.stop_training:
-          break
-
-      callbacks.on_epoch_end(epoch, epoch_logs)
-      epoch += 1
       if callbacks.model.stop_training:
         break
 
+    aggregator.finalize()
+    results = aggregator.results
+    epoch_logs = cbks.make_logs(model, epoch_logs, results, mode)
+    if len(results) == 1:
+      results = results[0]
+
+    # Run the test loop every epoch during training.
+    if do_validation and not callbacks.model.stop_training:
+      val_results = model_iteration(
+          model,
+          validation_data,
+          steps_per_epoch=validation_steps,
+          batch_size=batch_size,
+          class_weight=class_weight,
+          workers=workers,
+          use_multiprocessing=use_multiprocessing,
+          max_queue_size=max_queue_size,
+          callbacks=callbacks,
+          verbose=0,
+          mode='test')
+
+      if not isinstance(val_results, list):
+        val_results = [val_results]
+      epoch_logs = cbks.make_logs(
+          model, epoch_logs, val_results, mode, prefix='val_')
+
+    if mode == 'train':
+      # Epochs only apply to `fit`.
+      callbacks.on_epoch_end(epoch, epoch_logs, mode=mode)
+      progbar.on_epoch_end(epoch, epoch_logs)
+
+  callbacks._call_end_hook(mode)
+
+  if enqueuer is not None:
+    enqueuer.stop()
+
+  if should_set_learning_phase:
+    backend.set_learning_phase(old_learning_phase)
+
+  if mode == 'train':
+    return model.history
+  return results
+
+
+# Maintain compatibility with the existing names.
+fit_generator = functools.partial(model_iteration, mode='train')
+evaluate_generator = functools.partial(
+    model_iteration, mode='test', shuffle=False)
+predict_generator = functools.partial(
+    model_iteration, mode='predict', shuffle=False)
+
+
+def _get_next_batch(output_generator, mode):
+  """Retrieves the next batch of input data."""
+  try:
+    generator_output = next(output_generator)
   except (errors.OutOfRangeError, StopIteration):
-    logging.warning(
-        'Your dataset iterator ran out of data interrupting testing. '
-        'Make sure that your dataset can generate at least `steps_per_epoch` '
-        'batches (in this case, %d batches). You may need to use the '
-        'repeat() function when building your dataset.', steps_per_epoch)
-
-  finally:
-    try:
-      if enqueuer is not None:
-        enqueuer.stop()
-    finally:
-      if val_enqueuer is not None:
-        val_enqueuer.stop()
-
-  callbacks.on_train_end()
-  return model.history
-
-
-def evaluate_generator(model,
-                       generator,
-                       steps=None,
-                       max_queue_size=10,
-                       workers=1,
-                       use_multiprocessing=False,
-                       verbose=0):
-  """See docstring for `Model.evaluate_generator`."""
-  if not context.executing_eagerly():
-    model._make_test_function()
-
-  if hasattr(model, '_compile_metrics'):
-    for m in model.metrics:
-      m.reset_states()
-
-  steps_done = 0
-  all_outs = []
-  batch_sizes = []
-  is_sequence = isinstance(generator, data_utils.Sequence)
+    # Returning `None` will trigger looping to stop.
+    logging.warning('Your dataset iterator ran out of data.')
+    return None
+  if not isinstance(generator_output, tuple):
+    if mode == 'predict':
+      # Always wrap in a tuple.
+      return (generator_output,)
+    else:
+      raise ValueError('Output of generator should be '
+                       'a tuple `(x, y, sample_weight)` '
+                       'or `(x, y)`. Found: ' + str(generator_output))
+
+  if len(generator_output) < 1 or len(generator_output) > 3:
+    raise ValueError('Output of generator should be '
+                     'a tuple `(x, y, sample_weight)` '
+                     'or `(x, y)` or (x,). Found: ' + str(generator_output))
+  return generator_output
+
+
+def _validate_arguments(is_sequence, use_multiprocessing, workers,
+                        steps_per_epoch, validation_data, validation_steps,
+                        mode, kwargs):
+  """Raises errors if arguments are invalid.
+
+  Arguments:
+    is_sequence: Boolean, whether data is a `keras.utils.data_utils.Sequence`
+      instance.
+    use_multiprocessing: Boolean. If `True`, use process-based threading. If
+      unspecified, `use_multiprocessing` will default to `False`. Note that
+      because this implementation relies on multiprocessing, you should not pass
+      non-picklable arguments to the generator as they can't be passed easily to
+      children processes.
+    workers: Integer. Maximum number of processes to spin up when using
+      process-based threading. If unspecified, `workers` will default to 1. If
+      0, will execute the generator on the main thread.
+    steps_per_epoch: Total number of steps (batches of samples) before declaring
+      one epoch finished and starting the next epoch. Ignored with the default
+      value of `None`.
+    validation_data: Either a tuple of NumPy/Tensor inputs (i.e. `(x,)` or `(x,
+      y)` or `(x, y, sample_weights)`) or a generator or
+      `keras.utils.data_utils.Sequence` object or Eager Iterator or Dataset.
+    validation_steps: Total number of steps (batches of samples) before
+      declaring validation finished.
+    mode: One of 'train'/'test'/'predict'.
+    kwargs: Additional arguments for backwards compatibility.
+
+  Raises:
+    ValueError: If `steps_per_epoch` or `validation_steps` are not passed
+      for data types that require them, or if unrecognized keyword
+      arguments are passed.
+  """
   if not is_sequence and use_multiprocessing and workers > 1:
     logging.warning(
         UserWarning('Using a generator with `use_multiprocessing=True`'
                     ' and multiple workers may duplicate your data.'
-                    ' Please consider using the`keras.utils.Sequence'
+                    ' Please consider using the `keras.utils.Sequence`'
                     ' class.'))
-  if steps is None:
+
+  if steps_per_epoch is None:
+    arg_name = 'steps_per_epoch' if mode == 'train' else 'steps'
+    raise ValueError('Please specify the number of steps via the '
+                     '`{}` argument.'.format(arg_name))
+
+  val_gen = (
+      data_utils.is_generator_or_sequence(validation_data) or
+      isinstance(validation_data, iterator_ops.EagerIterator) or
+      isinstance(validation_data, dataset_ops.DatasetV2))
+  if (val_gen and not isinstance(validation_data, data_utils.Sequence) and
+      not validation_steps):
+    raise ValueError('Please specify the `validation_steps` argument.')
+
+  if any(k != 'steps' for k in kwargs):
+    raise ValueError('Invalid arguments passed: {}'.format(
+        [k for k in kwargs if k != 'steps']))
+
+
+def convert_to_generator_like(data,
+                              batch_size=None,
+                              steps_per_epoch=None,
+                              epochs=1,
+                              shuffle=False):
+  """Make a generator out of NumPy or EagerTensor inputs.
+
+  Arguments:
+    data: Either a generator or `keras.utils.data_utils.Sequence` object or
+      `Dataset` or `EagerIterator` or a {1,2,3}-tuple of NumPy arrays or
+      EagerTensors. If a tuple, the elements represent `(x, y, sample_weights)`
+      and may be `None` or `[None]`.
+    batch_size: Used when creating a generator out of tuples of NumPy arrays or
+      EagerTensors.
+    steps_per_epoch: Steps of the generator to run each epoch.
+    epochs: Total number of epochs to run.
+    shuffle: Whether the data should be shuffled.
+
+  Returns:
+    - Generator or `keras.utils.data_utils.Sequence` or EagerIterator.
+
+  Raises:
+    - ValueError: If `batch_size` is not provided for NumPy or EagerTensor
+      inputs.
+  """
+  if isinstance(data, tuple):
+    # Scrub `Nones` that might have been passed for `targets`, `sample_weights`.
+    data = tuple(
+        ele for ele in data if not all(e is None for e in nest.flatten(ele)))
+    if len(data) == 1:
+      data = data[0]
+
+  if data_utils.is_generator_or_sequence(data) or isinstance(
+      data, iterator_ops.EagerIterator):
+    if isinstance(data, data_utils.Sequence):
+      steps_per_epoch = len(data)
+    return data, steps_per_epoch
+  if isinstance(data, dataset_ops.DatasetV2):
+    return dataset_ops.make_one_shot_iterator(data), steps_per_epoch
+
+  # Create generator from NumPy or EagerTensor Input.
+  num_samples = int(nest.flatten(data)[0].shape[0])
+  if batch_size is None:
+    raise ValueError('You must specify `batch_size`')
+  steps_per_epoch = int(math.ceil(num_samples / batch_size))
+
+  def _gen(data):
+    """Makes a generator out of a structure of NumPy/EagerTensors."""
+    index_array = np.arange(num_samples)
+    for _ in range(epochs):
+      if shuffle:
+        np.random.shuffle(index_array)
+      batches = generic_utils.make_batches(num_samples, batch_size)
+      for (batch_start, batch_end) in batches:
+        batch_ids = index_array[batch_start:batch_end]
+        flat_batch_data = training_utils.slice_arrays(
+            nest.flatten(data), batch_ids, contiguous=(not shuffle))
+        yield nest.pack_sequence_as(data, flat_batch_data)
+
+  return _gen(data), steps_per_epoch
+
+
+def _make_enqueued_generator(generator,
+                             workers=1,
+                             use_multiprocessing=False,
+                             max_queue_size=10,
+                             shuffle=False):
+  """Create a buffered queue of next elements of the generator."""
+  is_sequence = isinstance(generator, data_utils.Sequence)
+  enqueuer = None
+  if workers > 0:
     if is_sequence:
-      steps = len(generator)
+      enqueuer = data_utils.OrderedEnqueuer(
+          generator, use_multiprocessing=use_multiprocessing, shuffle=shuffle)
     else:
-      raise ValueError('Please specify the `steps` argument.')
-  enqueuer = None
-
-  try:
-    if workers > 0:
-      if is_sequence:
-        enqueuer = data_utils.OrderedEnqueuer(
-            generator, use_multiprocessing=use_multiprocessing)
-      else:
-        enqueuer = data_utils.GeneratorEnqueuer(
-            generator,
-            use_multiprocessing=use_multiprocessing)
-      enqueuer.start(workers=workers, max_queue_size=max_queue_size)
-      output_generator = enqueuer.get()
+      enqueuer = data_utils.GeneratorEnqueuer(
+          generator, use_multiprocessing=use_multiprocessing)
+    enqueuer.start(workers=workers, max_queue_size=max_queue_size)
+    output_generator = enqueuer.get()
+  else:
+    if is_sequence:
+      output_generator = data_utils.iter_sequence_infinite(generator)
     else:
-      if is_sequence:
-        output_generator = data_utils.iter_sequence_infinite(generator)
-      else:
-        output_generator = generator
-
-    if verbose == 1:
-      progbar = Progbar(target=steps)
-
-    while steps_done < steps:
-      generator_output = next(output_generator)
-      if not hasattr(generator_output, '__len__'):
-        raise ValueError('Output of generator should be a tuple '
-                         '(x, y, sample_weight) '
-                         'or (x, y). Found: ' + str(generator_output))
-      if len(generator_output) == 2:
-        x, y = generator_output
-        sample_weight = None
-      elif len(generator_output) == 3:
-        x, y, sample_weight = generator_output
-      else:
-        raise ValueError('Output of generator should be a tuple '
-                         '(x, y, sample_weight) '
-                         'or (x, y). Found: ' + str(generator_output))
-      outs = model.test_on_batch(x, y, sample_weight=sample_weight)
-
-      if isinstance(x, list):
-        batch_size = int(x[0].shape[0])
-      elif isinstance(x, dict):
-        batch_size = int(list(x.values())[0].shape[0])
-      else:
-        batch_size = int(x.shape[0])
-      if batch_size == 0:
-        raise ValueError('Received an empty batch. '
-                         'Batches should at least contain one item.')
-      all_outs.append(outs)
-
-      steps_done += 1
-      batch_sizes.append(batch_size)
-      if verbose == 1:
-        progbar.update(steps_done)
+      output_generator = generator
+  return output_generator, enqueuer
+
+
+def _make_execution_function(model, mode, class_weight=None):
+  """Makes function to run one step of model execution."""
+  if mode == 'train':
+    if not context.executing_eagerly():
+      model._make_fit_function()
+    f = functools.partial(model.train_on_batch, class_weight=class_weight)
+  elif mode == 'test':
+    if not context.executing_eagerly():
+      model._make_eval_function()
+    f = model.test_on_batch
+  else:
+    # Match signature of other modes to allow
+    # 1, 2, or 3-tuples from generator
+    def predict_on_batch(x, y=None, sample_weights=None):  # pylint: disable=unused-argument
+      return model.predict_on_batch(x)
 
-  except (errors.OutOfRangeError, StopIteration):
-    logging.warning(
-        'Your dataset iterator ran out of data interrupting testing. '
-        'Make sure that your dataset can generate at least `steps` '
-        'batches (in this case, %d batches). You may need to use the '
-        'repeat() function when building your dataset.', steps)
+    f = predict_on_batch
 
-  finally:
-    if enqueuer is not None:
-      enqueuer.stop()
+  # Maintain stateful metrics across batch-level calls.
+  if mode != 'predict':
+    f = functools.partial(f, reset_metrics=False)
 
-  if not isinstance(outs, list):
-    return np.average(np.asarray(all_outs), weights=batch_sizes)
-  else:
-    averages = [float(all_outs[-1][0])]  # index 0 = 'loss'
-    averages.extend([
-        np.average([out[i]
-                    for out in all_outs], weights=batch_sizes)
-        for i in range(1, len(outs))
-    ])
-    return averages
-
-
-def predict_generator(model,
-                      generator,
-                      steps=None,
-                      max_queue_size=10,
-                      workers=1,
-                      use_multiprocessing=False,
-                      verbose=0):
-  """See docstring for `Model.predict_generator`."""
-  if not context.executing_eagerly():
-    model._make_predict_function()
-
-  steps_done = 0
-  all_outs = []
-  is_sequence = isinstance(generator, data_utils.Sequence)
-  if not is_sequence and use_multiprocessing and workers > 1:
-    logging.warning(
-        UserWarning('Using a generator with `use_multiprocessing=True`'
-                    ' and multiple workers may duplicate your data.'
-                    ' Please consider using the`keras.utils.Sequence'
-                    ' class.'))
-  if steps is None:
-    if is_sequence:
-      steps = len(generator)
-    else:
-      raise ValueError('Please specify the `steps` argument.')
-  enqueuer = None
+  return f
 
-  try:
-    if workers > 0:
-      if is_sequence:
-        enqueuer = data_utils.OrderedEnqueuer(
-            generator, use_multiprocessing=use_multiprocessing)
-      else:
-        enqueuer = data_utils.GeneratorEnqueuer(
-            generator,
-            use_multiprocessing=use_multiprocessing)
-      enqueuer.start(workers=workers, max_queue_size=max_queue_size)
-      output_generator = enqueuer.get()
-    else:
-      if is_sequence:
-        output_generator = data_utils.iter_sequence_infinite(generator)
-      else:
-        output_generator = generator
-
-    if verbose == 1:
-      progbar = Progbar(target=steps)
-
-    while steps_done < steps:
-      generator_output = next(output_generator)
-      if isinstance(generator_output, tuple):
-        # Compatibility with the generators
-        # used for training.
-        if len(generator_output) == 1:
-          x = generator_output[0]
-        elif len(generator_output) == 2:
-          x, _ = generator_output
-        elif len(generator_output) == 3:
-          x, _, _ = generator_output
-        else:
-          raise ValueError('Output of generator should be '
-                           'a tuple `(x, y, sample_weight)` '
-                           'or `(x, y)`. Found: ' + str(generator_output))
-      else:
-        # Assumes a generator that only
-        # yields inputs (not targets and sample weights).
-        x = generator_output
-
-      outs = model.predict_on_batch(x)
-      if not isinstance(outs, list):
-        outs = [outs]
-
-      if not all_outs:
-        for out in outs:
-          all_outs.append([])
-
-      for i, out in enumerate(outs):
-        all_outs[i].append(out)
-      steps_done += 1
-      if verbose == 1:
-        progbar.update(steps_done)
 
-  except (errors.OutOfRangeError, StopIteration):
-    logging.warning(
-        'Your dataset iterator ran out of data interrupting testing. '
-        'Make sure that your dataset can generate at least `steps` '
-        'batches (in this case, %d batches). You may need to use the '
-        'repeat() function when building your dataset.', steps)
-
-  finally:
-    if enqueuer is not None:
-      enqueuer.stop()
-
-  if len(all_outs) == 1:
-    if steps_done == 1:
-      return all_outs[0][0]
-    else:
-      return np.concatenate(all_outs[0])
-  if steps_done == 1:
-    return [out[0] for out in all_outs]
-  else:
-    return [np.concatenate(out) for out in all_outs]
+def _get_num_samples_or_steps(data, steps_per_epoch):
+  """Returns number of samples or steps, and whether to use steps count mode."""
+  flat_inputs = nest.flatten(data)
+  if hasattr(flat_inputs[0], 'shape'):
+    return int(flat_inputs[0].shape[0]), False
+  return steps_per_epoch, True
diff --git a/tensorflow/python/keras/engine/training_generator_test.py b/tensorflow/python/keras/engine/training_generator_test.py
index 42cfa3bc70d4e6a172593f7951c96eb7edfd5610..90c45dfcb7fdae23ffba5c0a8e72404f3b9350dd 100644
--- a/tensorflow/python/keras/engine/training_generator_test.py
+++ b/tensorflow/python/keras/engine/training_generator_test.py
@@ -25,11 +25,17 @@ from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python import keras
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import iterator_ops
+from tensorflow.python.eager import context
 from tensorflow.python.framework import test_util as tf_test_util
+from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.keras import testing_utils
+from tensorflow.python.keras.engine import training_generator
+from tensorflow.python.keras.optimizer_v2 import rmsprop
 from tensorflow.python.platform import test
-from tensorflow.python.training.rmsprop import RMSPropOptimizer
+from tensorflow.python.util import nest
 
 
 def custom_generator(mode=2):
@@ -55,23 +61,20 @@ def custom_generator(mode=2):
       yield x, y, w
 
 
-@tf_test_util.run_all_in_graph_and_eager_modes
-class TestGeneratorMethods(test.TestCase, parameterized.TestCase):
+class TestGeneratorMethods(keras_parameterized.TestCase):
 
   @unittest.skipIf(
       os.name == 'nt',
       'use_multiprocessing=True does not work on windows properly.')
-  @parameterized.parameters('sequential', 'functional')
-  def test_fit_generator_method(self, model_type):
-    if model_type == 'sequential':
-      model = testing_utils.get_small_sequential_mlp(
-          num_hidden=3, num_classes=4, input_dim=2)
-    else:
-      model = testing_utils.get_small_functional_mlp(
-          num_hidden=3, num_classes=4, input_dim=2)
+  # TODO(b/120940700): Bug with subclassed model inputs.
+  @keras_parameterized.run_with_all_model_types(exclude_models='subclass')
+  @keras_parameterized.run_all_keras_modes
+  def test_fit_generator_method(self):
+    model = testing_utils.get_small_mlp(
+        num_hidden=3, num_classes=4, input_dim=2)
     model.compile(
         loss='mse',
-        optimizer='sgd',
+        optimizer=rmsprop.RMSprop(1e-3),
         metrics=['mae', metrics_module.CategoricalAccuracy()])
 
     model.fit_generator(custom_generator(),
@@ -104,19 +107,17 @@ class TestGeneratorMethods(test.TestCase, parameterized.TestCase):
   @unittest.skipIf(
       os.name == 'nt',
       'use_multiprocessing=True does not work on windows properly.')
-  @parameterized.parameters('sequential', 'functional')
-  def test_evaluate_generator_method(self, model_type):
-    if model_type == 'sequential':
-      model = testing_utils.get_small_sequential_mlp(
-          num_hidden=3, num_classes=4, input_dim=2)
-    else:
-      model = testing_utils.get_small_functional_mlp(
-          num_hidden=3, num_classes=4, input_dim=2)
+  # TODO(b/120940700): Bug with subclassed model inputs.
+  @keras_parameterized.run_with_all_model_types(exclude_models='subclass')
+  @keras_parameterized.run_all_keras_modes
+  def test_evaluate_generator_method(self):
+    model = testing_utils.get_small_mlp(
+        num_hidden=3, num_classes=4, input_dim=2)
     model.compile(
         loss='mse',
-        optimizer='sgd',
-        metrics=['mae', metrics_module.CategoricalAccuracy()])
-    model.summary()
+        optimizer=rmsprop.RMSprop(1e-3),
+        metrics=['mae', metrics_module.CategoricalAccuracy()],
+        run_eagerly=testing_utils.should_run_eagerly())
 
     model.evaluate_generator(custom_generator(),
                              steps=5,
@@ -137,18 +138,12 @@ class TestGeneratorMethods(test.TestCase, parameterized.TestCase):
   @unittest.skipIf(
       os.name == 'nt',
       'use_multiprocessing=True does not work on windows properly.')
-  @parameterized.parameters('sequential', 'functional')
-  def test_predict_generator_method(self, model_type):
-    if model_type == 'sequential':
-      model = testing_utils.get_small_sequential_mlp(
-          num_hidden=3, num_classes=4, input_dim=2)
-    else:
-      model = testing_utils.get_small_functional_mlp(
-          num_hidden=3, num_classes=4, input_dim=2)
-    model.compile(
-        loss='mse',
-        optimizer='sgd',
-        metrics=['mae', metrics_module.CategoricalAccuracy()])
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
+  def test_predict_generator_method(self):
+    model = testing_utils.get_small_mlp(
+        num_hidden=3, num_classes=4, input_dim=2)
+    model.run_eagerly = testing_utils.should_run_eagerly()
 
     model.predict_generator(custom_generator(),
                             steps=5,
@@ -178,13 +173,17 @@ class TestGeneratorMethods(test.TestCase, parameterized.TestCase):
                             max_queue_size=10,
                             workers=0)
 
+  # TODO(b/120940700): Bug with subclassed model inputs.
+  @keras_parameterized.run_with_all_model_types(exclude_models='subclass')
+  @keras_parameterized.run_all_keras_modes
   def test_generator_methods_with_sample_weights(self):
-    model = keras.models.Sequential()
-    model.add(keras.layers.Dense(4, input_shape=(2,)))
+    model = testing_utils.get_small_mlp(
+        num_hidden=3, num_classes=4, input_dim=2)
     model.compile(
         loss='mse',
-        optimizer='sgd',
-        metrics=['mae', metrics_module.CategoricalAccuracy()])
+        optimizer=rmsprop.RMSprop(1e-3),
+        metrics=['mae', metrics_module.CategoricalAccuracy()],
+        run_eagerly=testing_utils.should_run_eagerly())
 
     model.fit_generator(custom_generator(mode=3),
                         steps_per_epoch=5,
@@ -209,15 +208,19 @@ class TestGeneratorMethods(test.TestCase, parameterized.TestCase):
                              max_queue_size=10,
                              use_multiprocessing=False)
 
+  # TODO(b/120940700): Bug with subclassed model inputs.
+  @keras_parameterized.run_with_all_model_types(exclude_models='subclass')
+  @keras_parameterized.run_all_keras_modes
   def test_generator_methods_invalid_use_case(self):
 
     def invalid_generator():
       while 1:
         yield 0
 
-    model = keras.models.Sequential()
-    model.add(keras.layers.Dense(4, input_shape=(2,)))
-    model.compile(loss='mse', optimizer='sgd')
+    model = testing_utils.get_small_mlp(
+        num_hidden=3, num_classes=4, input_dim=2)
+    model.compile(loss='mse', optimizer=rmsprop.RMSprop(1e-3),
+                  run_eagerly=testing_utils.should_run_eagerly())
 
     with self.assertRaises(ValueError):
       model.fit_generator(invalid_generator(),
@@ -246,6 +249,9 @@ class TestGeneratorMethods(test.TestCase, parameterized.TestCase):
                                max_queue_size=10,
                                use_multiprocessing=False)
 
+  # TODO(b/120940700): Bug with subclassed model inputs.
+  @keras_parameterized.run_with_all_model_types(exclude_models='subclass')
+  @keras_parameterized.run_all_keras_modes
   def test_generator_input_to_fit_eval_predict(self):
     val_data = np.ones([10, 10], np.float32), np.ones([10, 1], np.float32)
 
@@ -253,12 +259,11 @@ class TestGeneratorMethods(test.TestCase, parameterized.TestCase):
       while True:
         yield np.ones([10, 10], np.float32), np.ones([10, 1], np.float32)
 
-    inputs = keras.layers.Input(shape=(10,))
-    x = keras.layers.Dense(10, activation='relu')(inputs)
-    outputs = keras.layers.Dense(1, activation='sigmoid')(x)
-    model = keras.Model(inputs, outputs)
+    model = testing_utils.get_small_mlp(
+        num_hidden=10, num_classes=1, input_dim=10)
 
-    model.compile(RMSPropOptimizer(0.001), 'binary_crossentropy')
+    model.compile(rmsprop.RMSprop(0.001), 'binary_crossentropy',
+                  run_eagerly=testing_utils.should_run_eagerly())
     model.fit(
         ones_generator(),
         steps_per_epoch=2,
@@ -268,9 +273,11 @@ class TestGeneratorMethods(test.TestCase, parameterized.TestCase):
     model.predict(ones_generator(), steps=2)
 
 
-@tf_test_util.run_all_in_graph_and_eager_modes
-class TestGeneratorMethodsWithSequences(test.TestCase):
+class TestGeneratorMethodsWithSequences(keras_parameterized.TestCase):
 
+  # TODO(b/120940700): Bug with subclassed model inputs.
+  @keras_parameterized.run_with_all_model_types(exclude_models='subclass')
+  @keras_parameterized.run_all_keras_modes
   def test_training_with_sequences(self):
 
     class DummySequence(keras.utils.Sequence):
@@ -281,9 +288,9 @@ class TestGeneratorMethodsWithSequences(test.TestCase):
       def __len__(self):
         return 10
 
-    model = keras.models.Sequential()
-    model.add(keras.layers.Dense(4, input_shape=(2,)))
-    model.compile(loss='mse', optimizer='sgd')
+    model = testing_utils.get_small_mlp(
+        num_hidden=3, num_classes=4, input_dim=2)
+    model.compile(loss='mse', optimizer=rmsprop.RMSprop(1e-3))
 
     model.fit_generator(DummySequence(),
                         steps_per_epoch=10,
@@ -300,6 +307,9 @@ class TestGeneratorMethodsWithSequences(test.TestCase):
                         workers=0,
                         use_multiprocessing=False)
 
+  # TODO(b/120940700): Bug with subclassed model inputs.
+  @keras_parameterized.run_with_all_model_types(exclude_models='subclass')
+  @keras_parameterized.run_all_keras_modes
   def test_sequence_input_to_fit_eval_predict(self):
     val_data = np.ones([10, 10], np.float32), np.ones([10, 1], np.float32)
 
@@ -311,12 +321,10 @@ class TestGeneratorMethodsWithSequences(test.TestCase):
       def __len__(self):
         return 2
 
-    inputs = keras.layers.Input(shape=(10,))
-    x = keras.layers.Dense(10, activation='relu')(inputs)
-    outputs = keras.layers.Dense(1, activation='sigmoid')(x)
-    model = keras.Model(inputs, outputs)
+    model = testing_utils.get_small_mlp(
+        num_hidden=10, num_classes=1, input_dim=10)
 
-    model.compile(RMSPropOptimizer(0.001), 'binary_crossentropy')
+    model.compile(rmsprop.RMSprop(0.001), 'binary_crossentropy')
     model.fit(CustomSequence(), validation_data=val_data, epochs=2)
     model.evaluate(CustomSequence())
     model.predict(CustomSequence())
@@ -329,5 +337,56 @@ class TestGeneratorMethodsWithSequences(test.TestCase):
       model.fit(CustomSequence(), sample_weight=np.ones([10, 1]))
 
 
+@tf_test_util.run_all_in_graph_and_eager_modes
+class TestConvertToGeneratorLike(test.TestCase, parameterized.TestCase):
+  simple_inputs = (np.ones((10, 10)), np.ones((10, 1)))
+  nested_inputs = ((np.ones((10, 10)), np.ones((10, 20))), (np.ones((10, 1)),
+                                                            np.ones((10, 3))))
+
+  def _make_dataset(self, inputs, batches):
+    return dataset_ops.DatasetV2.from_tensors(inputs).repeat(batches)
+
+  def _make_iterator(self, inputs, batches):
+    return dataset_ops.make_one_shot_iterator(
+        self._make_dataset(inputs, batches))
+
+  def _make_generator(self, inputs, batches):
+
+    def _gen():
+      for _ in range(batches):
+        yield inputs
+
+    return _gen()
+
+  def _make_numpy(self, inputs, _):
+    return inputs
+
+  @parameterized.named_parameters(
+      ('simple_dataset', _make_dataset, simple_inputs),
+      ('simple_iterator', _make_iterator, simple_inputs),
+      ('simple_generator', _make_generator, simple_inputs),
+      ('simple_numpy', _make_numpy, simple_inputs),
+      ('nested_dataset', _make_dataset, nested_inputs),
+      ('nested_iterator', _make_iterator, nested_inputs),
+      ('nested_generator', _make_generator, nested_inputs),
+      ('nested_numpy', _make_numpy, nested_inputs))
+  def test_convert_to_generator_like(self, input_fn, inputs):
+    expected_batches = 5
+    data = input_fn(self, inputs, expected_batches)
+
+    # Dataset and Iterator not supported in Legacy Graph mode.
+    if (not context.executing_eagerly() and
+        isinstance(data, (dataset_ops.DatasetV2, iterator_ops.Iterator))):
+      return
+
+    generator, steps = training_generator.convert_to_generator_like(
+        data, batch_size=2, steps_per_epoch=expected_batches)
+    self.assertEqual(steps, expected_batches)
+
+    for _ in range(expected_batches):
+      outputs = next(generator)
+    nest.assert_same_structure(outputs, inputs)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/engine/training_test.py b/tensorflow/python/keras/engine/training_test.py
index 4674bce2449ed3891f89fcd616a51f5c5c31b16f..887e3b84b53378420d0b568050ca56736fbb5d62 100644
--- a/tensorflow/python/keras/engine/training_test.py
+++ b/tensorflow/python/keras/engine/training_test.py
@@ -32,6 +32,7 @@ from tensorflow.python.eager import function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util as tf_test_util
+from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.callbacks import Callback
@@ -50,19 +51,20 @@ except ImportError:
   scipy_sparse = None
 
 
-class TrainingTest(test.TestCase):
+class TrainingTest(keras_parameterized.TestCase):
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_with_all_model_types(exclude_models='sequential')
+  @keras_parameterized.run_all_keras_modes
   def test_fit_on_arrays(self):
-    a = keras.layers.Input(shape=(3,), name='input_a')
-    b = keras.layers.Input(shape=(3,), name='input_b')
+    input_a = keras.layers.Input(shape=(3,), name='input_a')
+    input_b = keras.layers.Input(shape=(3,), name='input_b')
 
     dense = keras.layers.Dense(4, name='dense')
-    c = dense(a)
-    d = dense(b)
-    e = keras.layers.Dropout(0.5, name='dropout')(c)
+    dropout = keras.layers.Dropout(0.5, name='dropout')
+    branch_a = [input_a, dense]
+    branch_b = [input_b, dense, dropout]
 
-    model = keras.models.Model([a, b], [d, e])
+    model = testing_utils.get_multi_io_model(branch_a, branch_b)
 
     optimizer = RMSPropOptimizer(learning_rate=0.001)
     loss = 'mse'
@@ -71,7 +73,8 @@ class TrainingTest(test.TestCase):
         optimizer,
         loss,
         metrics=[metrics_module.CategoricalAccuracy(), 'mae'],
-        loss_weights=loss_weights)
+        loss_weights=loss_weights,
+        run_eagerly=testing_utils.should_run_eagerly())
 
     input_a_np = np.random.random((10, 3))
     input_b_np = np.random.random((10, 3))
@@ -135,61 +138,63 @@ class TrainingTest(test.TestCase):
         verbose=0,
         validation_split=0.2)
 
-    # Test with dictionary inputs
-    model.fit(
-        {
-            'input_a': input_a_np,
-            'input_b': input_b_np
-        }, {
-            'dense': output_d_np,
-            'dropout': output_e_np
-        },
-        epochs=1,
-        batch_size=5,
-        verbose=0)
-    model.fit(
-        {
-            'input_a': input_a_np,
-            'input_b': input_b_np
-        }, {
-            'dense': output_d_np,
-            'dropout': output_e_np
-        },
-        epochs=1,
-        batch_size=5,
-        verbose=1)
-    model.fit(
-        {
-            'input_a': input_a_np,
-            'input_b': input_b_np
-        }, {
-            'dense': output_d_np,
-            'dropout': output_e_np
-        },
-        validation_data=({
-            'input_a': input_a_np,
-            'input_b': input_b_np
-        }, {
-            'dense': output_d_np,
-            'dropout': output_e_np
-        }),
-        epochs=1,
-        batch_size=5,
-        verbose=0)
-    model.train_on_batch({
-        'input_a': input_a_np,
-        'input_b': input_b_np
-    }, {
-        'dense': output_d_np,
-        'dropout': output_e_np
-    })
+    if testing_utils.get_model_type() == 'functional':
+      # Test with dictionary inputs
+      model.fit(
+          {
+              'input_a': input_a_np,
+              'input_b': input_b_np
+          }, {
+              'dense': output_d_np,
+              'dropout': output_e_np
+          },
+          epochs=1,
+          batch_size=5,
+          verbose=0)
+      model.fit(
+          {
+              'input_a': input_a_np,
+              'input_b': input_b_np
+          }, {
+              'dense': output_d_np,
+              'dropout': output_e_np
+          },
+          epochs=1,
+          batch_size=5,
+          verbose=1)
+      model.fit(
+          {
+              'input_a': input_a_np,
+              'input_b': input_b_np
+          }, {
+              'dense': output_d_np,
+              'dropout': output_e_np
+          },
+          validation_data=({
+              'input_a': input_a_np,
+              'input_b': input_b_np
+          }, {
+              'dense': output_d_np,
+              'dropout': output_e_np
+          }),
+          epochs=1,
+          batch_size=5,
+          verbose=0)
+      model.train_on_batch({
+          'input_a': input_a_np,
+          'input_b': input_b_np
+      }, {
+          'dense': output_d_np,
+          'dropout': output_e_np
+      })
 
     # Test with lists for loss, metrics
     loss = ['mae', 'mse']
     model.compile(
         optimizer,
         loss,
-        metrics=[metrics_module.CategoricalAccuracy(), 'mae'])
+        metrics=[metrics_module.CategoricalAccuracy(), 'mae'],
+        run_eagerly=testing_utils.should_run_eagerly())
     model.fit(
         [input_a_np, input_b_np], [output_d_np, output_e_np],
         epochs=1,
@@ -197,13 +202,15 @@ class TrainingTest(test.TestCase):
         verbose=0)
 
     # Test with dictionaries for loss, metrics, loss weights
-    loss = {'dense': 'mse', 'dropout': 'mae'}
-    loss_weights = {'dense': 1., 'dropout': 0.5}
-    metrics = {
-        'dense': 'mse',
-        'dropout': metrics_module.CategoricalAccuracy()
-    }
-    model.compile(optimizer, loss, metrics=metrics, loss_weights=loss_weights)
+    if testing_utils.get_model_type() == 'functional':
+      loss = {'dense': 'mse', 'dropout': 'mae'}
+      loss_weights = {'dense': 1., 'dropout': 0.5}
+      metrics = {
+          'dense': 'mse',
+          'dropout': metrics_module.CategoricalAccuracy()
+      }
+      model.compile(optimizer, loss, metrics=metrics, loss_weights=loss_weights,
+                    run_eagerly=testing_utils.should_run_eagerly())
     model.fit(
         [input_a_np, input_b_np], [output_d_np, output_e_np],
         epochs=1,
@@ -239,11 +246,14 @@ class TrainingTest(test.TestCase):
     x = keras.layers.Input(shape=(3,), name='input_a')
     y = keras.layers.Dense(4)(x)
     model = keras.models.Model(x, y)
-    model.compile(optimizer, loss='mse')
+    model.compile(optimizer, loss='mse',
+                  run_eagerly=testing_utils.should_run_eagerly())
     # This will work
     model.fit([input_a_np], output_d_np, epochs=1)
-    with self.assertRaises(ValueError):
-      model.fit([input_a_np, input_a_np], output_d_np, epochs=1)
+    # TODO(gsundeep) Test only works in eager, file ticket
+    if testing_utils.should_run_eagerly() and context.executing_eagerly():
+      with self.assertRaises(ValueError):
+        model.fit([input_a_np, input_a_np], output_d_np, epochs=1)
 
     # Test model on a list of floats
     input_a_np = np.random.random((10, 3))
@@ -255,7 +265,7 @@ class TrainingTest(test.TestCase):
               batch_size=5,
               verbose=2)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_evaluate_predict_on_arrays(self):
     a = keras.layers.Input(shape=(3,), name='input_a')
     b = keras.layers.Input(shape=(3,), name='input_b')
@@ -275,7 +285,8 @@ class TrainingTest(test.TestCase):
         loss,
         metrics=['mae', metrics_module.CategoricalAccuracy()],
         loss_weights=loss_weights,
-        sample_weight_mode=None)
+        sample_weight_mode=None,
+        run_eagerly=testing_utils.should_run_eagerly())
 
     input_a_np = np.random.random((10, 3))
     input_b_np = np.random.random((10, 3))
@@ -336,7 +347,7 @@ class TrainingTest(test.TestCase):
     })
     self.assertEqual(len(out), 2)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_activity_regularizer_fit(self):
     loss = {}
     for reg in [None, 'l2']:
@@ -352,12 +363,13 @@ class TrainingTest(test.TestCase):
       y = np.ones((10, 1), 'float32')
 
       optimizer = RMSPropOptimizer(learning_rate=0.001)
-      model.compile(optimizer, 'binary_crossentropy')
+      model.compile(optimizer, 'binary_crossentropy',
+                    run_eagerly=testing_utils.should_run_eagerly())
       model.fit(x, y, batch_size=2, epochs=5)
       loss[reg] = model.evaluate(x, y)
     self.assertLess(loss[None], loss['l2'])
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_activity_regularizer_loss_value(self):
     inputs = keras.layers.Input(shape=(10,))
     outputs = keras.layers.Dense(
@@ -370,11 +382,12 @@ class TrainingTest(test.TestCase):
     x = np.ones((10, 10), 'float32')
     y = np.ones((10, 1), 'float32')
     optimizer = RMSPropOptimizer(learning_rate=0.001)
-    model.compile(optimizer, 'binary_crossentropy')
+    model.compile(optimizer, 'binary_crossentropy',
+                  run_eagerly=testing_utils.should_run_eagerly())
     loss = model.test_on_batch(x, y)
     self.assertAlmostEqual(0.01, loss, places=4)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_activity_regularizer_batch_independent(self):
     inputs = keras.layers.Input(shape=(10,))
     x = keras.layers.Dense(
@@ -384,7 +397,8 @@ class TrainingTest(test.TestCase):
     model = keras.Model(inputs, outputs)
 
     optimizer = RMSPropOptimizer(learning_rate=0.001)
-    model.compile(optimizer, 'binary_crossentropy')
+    model.compile(optimizer, 'binary_crossentropy',
+                  run_eagerly=testing_utils.should_run_eagerly())
 
     x = np.ones((10, 10), 'float32')
     y = np.ones((10, 1), 'float32')
@@ -396,7 +410,7 @@ class TrainingTest(test.TestCase):
 
     self.assertAlmostEqual(loss_small_batch, loss_big_batch, places=4)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_activity_regularizer_in_model_call(self):
 
     class MyModel(keras.Model):
@@ -410,46 +424,55 @@ class TrainingTest(test.TestCase):
     _ = model(x)
     self.assertEqual(1, len(model.losses))
 
+  @keras_parameterized.run_all_keras_modes
   def test_training_on_sparse_data_with_dense_placeholders(self):
+    # TODO(kaftan) Test seems to not work, file ticket
+    if testing_utils.should_run_eagerly() and context.executing_eagerly():
+      self.skipTest('Skipping running model eagerly.')
+
     if scipy_sparse is None:
       return
 
-    with self.cached_session():
-      test_inputs = [
-          scipy_sparse.random(6, 3, density=0.25).tocsr() for _ in range(2)
-      ]
-      test_outputs = [
-          scipy_sparse.random(6, i, density=0.25).tocsr() for i in range(3, 5)
-      ]
-      in1 = keras.layers.Input(shape=(3,))
-      in2 = keras.layers.Input(shape=(3,))
-      out1 = keras.layers.Dropout(0.5, name='dropout')(in1)
-      out2 = keras.layers.Dense(4, name='dense_1')(in2)
-      model = keras.Model([in1, in2], [out1, out2])
-      model.predict(test_inputs, batch_size=2)
-      optimizer = RMSPropOptimizer(learning_rate=0.001)
-      model.compile(
-          optimizer,
-          'mse',
-          metrics=['mae', metrics_module.CategoricalAccuracy()])
-      model.fit(test_inputs, test_outputs,
-                epochs=1, batch_size=2, validation_split=0.5)
-      model.evaluate(test_inputs, test_outputs, batch_size=2)
+    test_inputs = [
+        scipy_sparse.random(6, 3, density=0.25).tocsr() for _ in range(2)
+    ]
+    test_outputs = [
+        scipy_sparse.random(6, i, density=0.25).tocsr() for i in range(3, 5)
+    ]
+    in1 = keras.layers.Input(shape=(3,))
+    in2 = keras.layers.Input(shape=(3,))
+    out1 = keras.layers.Dropout(0.5, name='dropout')(in1)
+    out2 = keras.layers.Dense(4, name='dense_1')(in2)
+    model = keras.Model([in1, in2], [out1, out2])
+    model.predict(test_inputs, batch_size=2)
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    model.compile(
+        optimizer,
+        'mse',
+        metrics=['mae', metrics_module.CategoricalAccuracy()],
+        run_eagerly=testing_utils.should_run_eagerly())
+    model.fit(test_inputs, test_outputs,
+              epochs=1, batch_size=2, validation_split=0.5)
+    model.evaluate(test_inputs, test_outputs, batch_size=2)
 
+  @keras_parameterized.run_all_keras_modes
   def test_compile_with_sparse_placeholders(self):
-    with self.cached_session():
-      input_layer = keras.layers.Input(shape=(10,), sparse=True)
-      weights = variables_lib.Variable(
-          np.ones((10, 1)).astype(np.float32), name='weights')
-      weights_mult = lambda x: sparse_ops.sparse_tensor_dense_matmul(x, weights)
-      output_layer = keras.layers.Lambda(weights_mult)(input_layer)
-      model = keras.Model([input_layer], output_layer)
-      model.compile(
-          loss='binary_crossentropy',
-          optimizer=keras.optimizers.Adam(lr=0.0001),
-          metrics=['accuracy'])
+    # TODO(kaftan) Test seems to not work, file ticket
+    if testing_utils.should_run_eagerly() and context.executing_eagerly():
+      self.skipTest('Skipping running model eagerly.')
+
+    input_layer = keras.layers.Input(shape=(10,), sparse=True)
+    weights = variables_lib.Variable(
+        np.ones((10, 1)).astype(np.float32), name='weights')
+    weights_mult = lambda x: sparse_ops.sparse_tensor_dense_matmul(x, weights)
+    output_layer = keras.layers.Lambda(weights_mult)(input_layer)
+    model = keras.Model([input_layer], output_layer)
+    model.compile(
+        loss='binary_crossentropy',
+        optimizer=keras.optimizers.Adam(lr=0.0001),
+        metrics=['accuracy'],
+        run_eagerly=testing_utils.should_run_eagerly())
 
-  @tf_test_util.run_deprecated_v1
   def test_that_trainable_disables_updates(self):
     val_a = np.random.random((10, 4))
     val_out = np.random.random((10, 4))
@@ -546,14 +569,15 @@ class TrainingTest(test.TestCase):
               'val_loss', 'val_weighted_mean_absolute_error'
           ]))
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_mismatched_output_shape_and_target_shape(self):
     model = keras.Sequential([
         keras.layers.Dense(2, input_shape=(3, 4)),
         keras.layers.Dense(5),
     ])
     model.compile(RMSPropOptimizer(learning_rate=0.001),
-                  loss='sparse_categorical_crossentropy')
+                  loss='sparse_categorical_crossentropy',
+                  run_eagerly=testing_utils.should_run_eagerly())
     # Test with Numpy data
     x_train = np.random.random((10, 3, 4))
     y_train = np.random.randint(0, 5, size=(10, 3))
@@ -563,7 +587,7 @@ class TrainingTest(test.TestCase):
     dataset = dataset_ops.Dataset.from_tensor_slices((x_train, y_train))
     dataset = dataset.repeat(10)
     dataset = dataset.batch(10)
-    iterator = dataset.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
     model.fit(iterator, epochs=1, steps_per_epoch=2)
 
     if context.executing_eagerly():
@@ -588,14 +612,15 @@ class TrainingTest(test.TestCase):
       self.assertAllEqual(
           self.evaluate(layer.losses), self.evaluate(get_losses()))
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_logging(self):
     mock_stdout = io.BytesIO() if six.PY2 else io.StringIO()
     model = keras.models.Sequential()
     model.add(keras.layers.Dense(10, activation='relu'))
     model.add(keras.layers.Dense(1, activation='sigmoid'))
     model.compile(
-        RMSPropOptimizer(learning_rate=0.001), loss='binary_crossentropy')
+        RMSPropOptimizer(learning_rate=0.001), loss='binary_crossentropy',
+        run_eagerly=testing_utils.should_run_eagerly())
     with test.mock.patch.object(sys, 'stdout', mock_stdout):
       model.fit(
           np.ones((10, 10), 'float32'), np.ones((10, 1), 'float32'), epochs=10)
@@ -629,10 +654,153 @@ class TrainingTest(test.TestCase):
               epochs=1,
               batch_size=5)
 
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_static_batch_in_input_layer(self):
+
+    class Counter(keras.callbacks.Callback):
+
+      def __init__(self):
+        self.batches = 0
 
-class TestExceptionsAndWarnings(test.TestCase):
+      def on_batch_end(self, batch, logs=None):
+        self.batches += 1
+
+    x, y = np.ones((64, 10), 'float32'), np.ones((64, 1), 'float32')
+
+    for batch_size, expected_batches in [(None, 2), (4, 16)]:
+      inputs = keras.Input(batch_size=batch_size, shape=(10,))
+      outputs = keras.layers.Dense(1, activation='sigmoid')(inputs)
+      model = keras.Model(inputs, outputs)
+
+      model.compile(keras.optimizer_v2.adam.Adam(0.001), 'binary_crossentropy')
+      counter = Counter()
+      model.fit(x, y, callbacks=[counter])
+      self.assertEqual(counter.batches, expected_batches)
+
+      model = keras.Sequential(
+          [keras.layers.Dense(1, batch_input_shape=(batch_size, 10))])
+      model.compile(keras.optimizer_v2.adam.Adam(0.001), 'binary_crossentropy')
+      counter = Counter()
+      model.fit(x, y, callbacks=[counter])
+      self.assertEqual(counter.batches, expected_batches)
 
   @tf_test_util.run_in_graph_and_eager_modes
+  def test_static_batch_in_input_layer_consistency_checks(self):
+    x, y = np.ones((64, 10), 'float32'), np.ones((64, 1), 'float32')
+
+    inputs = keras.Input(batch_size=2, shape=(10,))
+    outputs = keras.layers.Dense(1, activation='sigmoid')(inputs)
+    model = keras.Model(inputs, outputs)
+    model.compile(keras.optimizer_v2.adam.Adam(0.001), 'binary_crossentropy')
+    with self.assertRaisesRegexp(ValueError,
+                                 'incompatible with the specified batch size'):
+      model.fit(x, y, batch_size=4)
+
+    data = dataset_ops.DatasetV2.from_tensor_slices((x, y))
+    data = data.batch(4, drop_remainder=True)
+    with self.assertRaisesRegexp(ValueError,
+                                 'incompatible with the specified batch size'):
+      model.fit(data, steps_per_epoch=16)
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_compatible_batch_size_functional_model(self):
+
+    class MyLayer(keras.layers.Layer):
+
+      def call(self, inputs):
+        return array_ops.concat(inputs, axis=0)
+
+    input1 = keras.Input(batch_size=2, shape=(10,))
+    input2 = keras.Input(batch_size=3, shape=(10,))
+    outputs = MyLayer()([input1, input2])
+    with self.assertRaisesRegexp(ValueError,
+                                 'specified batch sizes of the Input Layers'):
+      keras.Model([input1, input2], outputs)
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_calling_subclass_model_on_different_datasets(self):
+
+    class SubclassedModel(keras.models.Model):
+
+      def call(self, inputs):
+        return inputs * 2
+
+    model = SubclassedModel()
+    dataset_one = dataset_ops.Dataset.range(2).batch(2)
+    dataset_two = dataset_ops.Dataset.range(3, 10).batch(2)
+    self.assertAllEqual([[0], [2]], model.predict(dataset_one, steps=1))
+    self.assertAllEqual([[6], [8], [10], [12]],
+                        model.predict(dataset_two, steps=2))
+
+  def test_training_on_sparse_categorical_crossentropy_loss_with_softmax(self):
+    with context.eager_mode():
+      np.random.seed(1337)
+      train_x = np.ones((100, 4))
+      train_y = np.random.randint(0, 1, size=(100, 1))
+
+      reference_model = testing_utils.get_small_sequential_mlp(16, 2,
+                                                               input_dim=4)
+      reference_model.compile(loss='sparse_categorical_crossentropy',
+                              optimizer=RMSPropOptimizer(learning_rate=0.001),
+                              run_eagerly=True)
+      fixed_weights = reference_model.get_weights()
+      reference_model_loss = reference_model.train_on_batch(train_x, train_y)
+
+      test_model = testing_utils.get_small_sequential_mlp(16, 2, input_dim=4)
+      test_model.compile(loss='sparse_categorical_crossentropy',
+                         optimizer=RMSPropOptimizer(learning_rate=0.001),
+                         run_eagerly=False)
+      test_model.set_weights(fixed_weights)
+      test_model_loss = test_model.train_on_batch(train_x, train_y)
+      self.assertAlmostEqual(test_model_loss, reference_model_loss, places=4)
+
+  def test_training_on_categorical_crossentropy_loss_with_softmax(self):
+    with context.eager_mode():
+      np.random.seed(1337)
+      train_x = np.ones((100, 4))
+      train_y = keras.utils.to_categorical(np.random.randint(0, 1,
+                                                             size=(100, 1)), 2)
+
+      reference_model = testing_utils.get_small_sequential_mlp(16, 2,
+                                                               input_dim=4)
+      reference_model.compile(loss='categorical_crossentropy',
+                              optimizer=RMSPropOptimizer(learning_rate=0.001),
+                              run_eagerly=True)
+      fixed_weights = reference_model.get_weights()
+      reference_model_loss = reference_model.train_on_batch(train_x, train_y)
+
+      test_model = testing_utils.get_small_sequential_mlp(16, 2, input_dim=4)
+      test_model.compile(loss='categorical_crossentropy',
+                         optimizer=RMSPropOptimizer(learning_rate=0.001),
+                         run_eagerly=False)
+      test_model.set_weights(fixed_weights)
+      test_model_loss = test_model.train_on_batch(train_x, train_y)
+      self.assertAlmostEqual(test_model_loss, reference_model_loss, places=4)
+
+  def test_training_on_binary_crossentropy_loss(self):
+    with context.eager_mode():
+      train_x = np.ones((100, 4), dtype=np.float32)
+      train_y = np.ones((100, 1), dtype=np.float32)
+      reference_model = testing_utils.get_small_sequential_mlp(16, 1,
+                                                               input_dim=4)
+      reference_model.compile(loss='binary_crossentropy',
+                              optimizer=RMSPropOptimizer(learning_rate=0.001),
+                              run_eagerly=True)
+      fixed_weights = reference_model.get_weights()
+      reference_model_loss = reference_model.train_on_batch(train_x, train_y)
+
+      test_model = testing_utils.get_small_sequential_mlp(16, 1, input_dim=4)
+      test_model.compile(loss='binary_crossentropy',
+                         optimizer=RMSPropOptimizer(learning_rate=0.001),
+                         run_eagerly=False)
+      test_model.set_weights(fixed_weights)
+      test_model_loss = test_model.train_on_batch(train_x, train_y)
+      self.assertAlmostEqual(test_model_loss, reference_model_loss, places=4)
+
+
+class TestExceptionsAndWarnings(keras_parameterized.TestCase):
+
+  @keras_parameterized.run_all_keras_modes
   def test_invalid_loss(self):
     num_classes = 5
     train_samples = 1000
@@ -659,9 +827,10 @@ class TestExceptionsAndWarnings(test.TestCase):
         model.fit(x_train, y_train)
 
       with self.assertRaises(ValueError):
-        model.compile(optimizer, loss=None)
+        model.compile(optimizer, loss=None,
+                      run_eagerly=testing_utils.should_run_eagerly())
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_compile_warning_for_loss_missing_output(self):
     with self.cached_session():
       inp = keras.layers.Input(shape=(16,), name='input_a')
@@ -679,21 +848,23 @@ class TestExceptionsAndWarnings(test.TestCase):
             metrics={
                 'dense_2': 'categorical_accuracy',
                 'dense_1': metrics_module.CategoricalAccuracy(),
-            })
+            },
+            run_eagerly=testing_utils.should_run_eagerly())
         msg = ('Output "dense_1" missing from loss dictionary. We assume this '
                'was done on purpose. The fit and evaluate APIs will not be '
                'expecting any data to be passed to "dense_1".')
         self.assertRegexpMatches(str(mock_log.call_args), msg)
 
 
-class LossWeightingTest(test.TestCase):
+class LossWeightingTest(keras_parameterized.TestCase):
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_class_weights(self):
     num_classes = 5
     batch_size = 5
-    epochs = 5
+    epochs = 10
     weighted_class = 3
+    weight = 10.
     train_samples = 1000
     test_samples = 1000
     input_dim = 5
@@ -705,7 +876,8 @@ class LossWeightingTest(test.TestCase):
         loss='categorical_crossentropy',
         metrics=['acc', metrics_module.CategoricalAccuracy()],
         weighted_metrics=['mae', metrics_module.CategoricalAccuracy()],
-        optimizer=RMSPropOptimizer(learning_rate=learning_rate))
+        optimizer=RMSPropOptimizer(learning_rate=learning_rate),
+        run_eagerly=testing_utils.should_run_eagerly())
 
     np.random.seed(1337)
     (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
@@ -721,7 +893,7 @@ class LossWeightingTest(test.TestCase):
     test_ids = np.where(int_y_test == np.array(weighted_class))[0]
 
     class_weight = dict([(i, 1.) for i in range(num_classes)])
-    class_weight[weighted_class] = 2.
+    class_weight[weighted_class] = weight
 
     sample_weight = np.ones((y_train.shape[0]))
     sample_weight[int_y_train == weighted_class] = 2.
@@ -757,12 +929,13 @@ class LossWeightingTest(test.TestCase):
         x_test[test_ids, :], y_test[test_ids, :], verbose=0)
     self.assertLess(score[0], ref_score[0])
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_sample_weights(self):
     num_classes = 5
     batch_size = 5
-    epochs = 5
+    epochs = 10
     weighted_class = 3
+    weight = 10.
     train_samples = 1000
     test_samples = 1000
     input_dim = 5
@@ -774,7 +947,8 @@ class LossWeightingTest(test.TestCase):
         RMSPropOptimizer(learning_rate=learning_rate),
         metrics=['acc', metrics_module.CategoricalAccuracy()],
         weighted_metrics=['mae', metrics_module.CategoricalAccuracy()],
-        loss='categorical_crossentropy')
+        loss='categorical_crossentropy',
+        run_eagerly=testing_utils.should_run_eagerly())
 
     np.random.seed(43)
     (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
@@ -790,7 +964,7 @@ class LossWeightingTest(test.TestCase):
     test_ids = np.where(int_y_test == np.array(weighted_class))[0]
 
     sample_weight = np.ones((y_train.shape[0]))
-    sample_weight[int_y_train == weighted_class] = 2.
+    sample_weight[int_y_train == weighted_class] = weight
 
     model.fit(
         x_train,
@@ -822,13 +996,15 @@ class LossWeightingTest(test.TestCase):
           x_test[test_ids, :], y_test[test_ids, :], verbose=0)
       self.assertLess(score[0], ref_score[0])
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_warning_for_concurrent_sample_and_class_weights(self):
+
     model = keras.models.Sequential()
     model.add(keras.layers.Dense(10, input_shape=(3,)))
     model.compile(
         loss='mse',
-        optimizer=RMSPropOptimizer(learning_rate=0.01))
+        optimizer=RMSPropOptimizer(learning_rate=0.01),
+        run_eagerly=testing_utils.should_run_eagerly())
     x_train = np.random.random((10, 3))
     y_train = np.random.random((10, 10))
     sample_weight = np.ones((y_train.shape[0]))
@@ -842,15 +1018,22 @@ class LossWeightingTest(test.TestCase):
           verbose=0,
           sample_weight=sample_weight,
           class_weight=class_weight)
-      msg = ('The `class_weight` argument will be ignored.')
-      self.assertRegexpMatches(str(mock_log.call_args), msg)
+      msg = 'The `class_weight` argument will be ignored.'
 
-  @tf_test_util.run_in_graph_and_eager_modes
+      msg_found = False
+      for call_args in mock_log.call_args_list:
+        if msg in str(call_args):
+          msg_found = True
+
+      self.assertTrue(msg_found)
+
+  @keras_parameterized.run_all_keras_modes
   def test_temporal_sample_weights(self):
     num_classes = 5
     batch_size = 5
-    epochs = 5
+    epochs = 10
     weighted_class = 3
+    weight = 10.
     train_samples = 1000
     test_samples = 1000
     input_dim = 5
@@ -879,7 +1062,7 @@ class LossWeightingTest(test.TestCase):
       test_ids = np.where(int_y_test == np.array(weighted_class))[0]
 
       sample_weight = np.ones((y_train.shape[0]))
-      sample_weight[int_y_train == weighted_class] = 2.
+      sample_weight[int_y_train == weighted_class] = weight
 
       temporal_x_train = np.reshape(x_train, (len(x_train), 1,
                                               x_train.shape[1]))
@@ -900,10 +1083,11 @@ class LossWeightingTest(test.TestCase):
 
       model.compile(
           RMSPropOptimizer(learning_rate=learning_rate),
-          loss='binary_crossentropy',
+          loss='categorical_crossentropy',
           metrics=['acc', metrics_module.CategoricalAccuracy()],
           weighted_metrics=['mae', metrics_module.CategoricalAccuracy()],
-          sample_weight_mode='temporal')
+          sample_weight_mode='temporal',
+          run_eagerly=testing_utils.should_run_eagerly())
 
       model.fit(
           temporal_x_train,
@@ -935,7 +1119,7 @@ class LossWeightingTest(test.TestCase):
             temporal_x_test[test_ids], temporal_y_test[test_ids], verbose=0)
         self.assertLess(score[0], ref_score[0])
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_class_weight_invalid_use_case(self):
     num_classes = 5
     train_samples = 1000
@@ -952,7 +1136,8 @@ class LossWeightingTest(test.TestCase):
               input_shape=(timesteps, input_dim)))
       model.add(keras.layers.Activation('softmax'))
       optimizer = RMSPropOptimizer(learning_rate=learning_rate)
-      model.compile(optimizer, loss='binary_crossentropy')
+      model.compile(optimizer, loss='binary_crossentropy',
+                    run_eagerly=testing_utils.should_run_eagerly())
 
       (x_train, y_train), _ = testing_utils.get_test_data(
           train_samples=train_samples,
@@ -970,14 +1155,16 @@ class LossWeightingTest(test.TestCase):
 
       with self.assertRaises(ValueError):
         model.compile(
-            optimizer, loss='binary_crossentropy', sample_weight_mode=[])
+            optimizer, loss='binary_crossentropy', sample_weight_mode=[],
+            run_eagerly=testing_utils.should_run_eagerly())
 
       # Build multi-output model
       x = keras.Input((3,))
       y1 = keras.layers.Dense(4, name='1')(x)
       y2 = keras.layers.Dense(4, name='2')(x)
       model = keras.models.Model(x, [y1, y2])
-      model.compile(optimizer, loss='mse')
+      model.compile(optimizer, loss='mse',
+                    run_eagerly=testing_utils.should_run_eagerly())
       x_np = np.random.random((10, 3))
       y_np = np.random.random((10, 4))
       w_np = np.random.random((10,))
@@ -1004,7 +1191,7 @@ class LossWeightingTest(test.TestCase):
         model.fit(x_np, [y_np, y_np], epochs=1,
                   sample_weight={'1': bad_w_np})
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_default_sample_weight(self):
     """Verifies that fit works without having to set sample_weight."""
 
@@ -1025,38 +1212,46 @@ class LossWeightingTest(test.TestCase):
       optimizer = RMSPropOptimizer(learning_rate=learning_rate)
 
       # sample_weight_mode is a list and mode value is None
-      model.compile(optimizer, loss='mse', sample_weight_mode=[None])
+      model.compile(optimizer, loss='mse', sample_weight_mode=[None],
+                    run_eagerly=testing_utils.should_run_eagerly())
       model.fit(x, y, epochs=1, batch_size=10)
 
       # sample_weight_mode is a list and mode value is `temporal`
-      model.compile(optimizer, loss='mse', sample_weight_mode=['temporal'])
+      model.compile(optimizer, loss='mse', sample_weight_mode=['temporal'],
+                    run_eagerly=testing_utils.should_run_eagerly())
       model.fit(x, y, epochs=1, batch_size=10)
 
       # sample_weight_mode is a dict and mode value is None
       model.compile(
-          optimizer, loss='mse', sample_weight_mode={'time_distributed': None})
+          optimizer, loss='mse', sample_weight_mode={'time_distributed': None},
+          run_eagerly=testing_utils.should_run_eagerly())
       model.fit(x, y, epochs=1, batch_size=10)
 
       # sample_weight_mode is a dict and mode value is `temporal`
       model.compile(
           optimizer,
           loss='mse',
-          sample_weight_mode={'time_distributed': 'temporal'})
+          sample_weight_mode={'time_distributed': 'temporal'},
+          run_eagerly=testing_utils.should_run_eagerly())
       model.fit(x, y, epochs=1, batch_size=10)
 
       # sample_weight_mode is a not a list/dict and mode value is None
-      model.compile(optimizer, loss='mse', sample_weight_mode=None)
+      model.compile(optimizer, loss='mse', sample_weight_mode=None,
+                    run_eagerly=testing_utils.should_run_eagerly())
       model.fit(x, y, epochs=1, batch_size=10)
 
       # sample_weight_mode is a not a list/dict and mode value is `temporal`
-      model.compile(optimizer, loss='mse', sample_weight_mode='temporal')
+      model.compile(optimizer, loss='mse', sample_weight_mode='temporal',
+                    run_eagerly=testing_utils.should_run_eagerly())
       model.fit(x, y, epochs=1, batch_size=10)
 
 
-class LossMaskingTest(test.TestCase):
+class LossMaskingTest(keras_parameterized.TestCase):
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_masking_graph_sequential(self):
+    if testing_utils.should_run_eagerly():
+      self.skipTest('b/120495761')
     with self.cached_session():
       x = np.array([[[1], [1]], [[0], [0]]])
       model = keras.models.Sequential()
@@ -1064,13 +1259,16 @@ class LossMaskingTest(test.TestCase):
       model.add(
           keras.layers.TimeDistributed(
               keras.layers.Dense(1, kernel_initializer='one')))
-      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001),
+                    run_eagerly=testing_utils.should_run_eagerly())
       y = np.array([[[1], [1]], [[1], [1]]])
       loss = model.train_on_batch(x, y)
       self.assertEqual(float(loss), 0.)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_masking_deferred_sequential(self):
+    if testing_utils.should_run_eagerly():
+      self.skipTest('b/120495761')
     with self.cached_session():
       x = np.array([[[1], [1]], [[0], [0]]])
       model = keras.models.Sequential()
@@ -1078,13 +1276,16 @@ class LossMaskingTest(test.TestCase):
       model.add(
           keras.layers.TimeDistributed(
               keras.layers.Dense(1, kernel_initializer='one')))
-      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001),
+                    run_eagerly=testing_utils.should_run_eagerly())
       y = np.array([[[1], [1]], [[1], [1]]])
       loss = model.train_on_batch(x, y)
       self.assertEqual(float(loss), 0.)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_masking_functional(self):
+    if testing_utils.should_run_eagerly():
+      self.skipTest('b/120495761')
     with self.cached_session():
       x = np.array([[[1], [1]], [[0], [0]]])
       inputs = keras.layers.Input((2, 1))
@@ -1092,12 +1293,13 @@ class LossMaskingTest(test.TestCase):
       outputs = keras.layers.TimeDistributed(
           keras.layers.Dense(1, kernel_initializer='one'))(outputs)
       model = keras.Model(inputs, outputs)
-      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001),
+                    run_eagerly=testing_utils.should_run_eagerly())
       y = np.array([[[1], [1]], [[1], [1]]])
       loss = model.train_on_batch(x, y)
       self.assertEqual(float(loss), 0.)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_mask_argument_in_layer(self):
     # Test that the mask argument gets correctly passed to a layer in the
     # functional API.
@@ -1122,7 +1324,8 @@ class LossMaskingTest(test.TestCase):
       outputs = CustomMaskedLayer()(masked)
 
       model = keras.Model(inputs, outputs)
-      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001),
+                    run_eagerly=testing_utils.should_run_eagerly())
       y = np.random.random((5, 3))
       model.train_on_batch(x, y)
 
@@ -1145,9 +1348,8 @@ class LossMaskingTest(test.TestCase):
               keras.backend.variable(weights), keras.backend.variable(mask)))
 
 
-class TestDynamicTrainability(test.TestCase):
+class TestDynamicTrainability(keras_parameterized.TestCase):
 
-  @tf_test_util.run_deprecated_v1
   def test_trainable_warning(self):
     with self.cached_session():
       x = np.random.random((5, 3))
@@ -1161,7 +1363,6 @@ class TestDynamicTrainability(test.TestCase):
       model.train_on_batch(x, y)
       self.assertRaises(Warning)
 
-  @tf_test_util.run_deprecated_v1
   def test_trainable_argument(self):
     with self.cached_session():
       x = np.random.random((5, 3))
@@ -1290,144 +1491,152 @@ class TestDynamicTrainability(test.TestCase):
       self.assertListEqual(outer_model.trainable_weights, [])
 
 
-class TestTrainingWithDataTensors(test.TestCase):
+class TestTrainingWithDataTensors(keras_parameterized.TestCase):
 
-  @tf_test_util.run_deprecated_v1
+  @keras_parameterized.run_all_keras_modes
   def test_training_and_eval_methods_on_symbolic_tensors_single_io(self):
-    with self.cached_session():
-      x = keras.layers.Input(shape=(3,), name='input')
-      y = keras.layers.Dense(4, name='dense')(x)
-      model = keras.Model(x, y)
+    # TODO(kaftan) Test seems to not work, file ticket
+    if  context.executing_eagerly():
+      self.skipTest('Skipping eager execution.')
 
-      optimizer = RMSPropOptimizer(learning_rate=0.001)
-      loss = 'mse'
-      model.compile(
-          optimizer,
-          loss,
-          metrics=['mae', metrics_module.CategoricalAccuracy()])
-
-      inputs = keras.backend.zeros(shape=(10, 3))
-      targets = keras.backend.zeros(shape=(10, 4))
-
-      model.fit(inputs, targets, epochs=1, steps_per_epoch=2, verbose=0)
-      model.evaluate(inputs, targets, steps=2, verbose=0)
-      model.predict(inputs, steps=2)
-      model.train_on_batch(inputs, targets)
-      model.test_on_batch(inputs, targets)
-      model.fit(inputs, targets,
-                epochs=1, steps_per_epoch=2, verbose=0,
-                validation_data=(inputs, targets), validation_steps=2)
-
-      # Test with dynamic shape
-      inputs = array_ops.placeholder_with_default(
-          np.zeros((2, 3)), shape=tensor_shape.TensorShape([None, 3]))
-      targets = array_ops.placeholder_with_default(
-          np.zeros((2, 4)), shape=tensor_shape.TensorShape([None, 4]))
-      self.assertEqual(inputs.shape.dims[0].value, None)
-      model.fit(inputs, targets, epochs=1, steps_per_epoch=2, verbose=0)
-      model.evaluate(inputs, targets, steps=2, verbose=0)
-      model.predict(inputs, steps=2)
-      model.train_on_batch(inputs, targets)
-      model.test_on_batch(inputs, targets)
-      model.fit(inputs, targets,
-                epochs=1, steps_per_epoch=2, verbose=0,
-                validation_data=(inputs, targets), validation_steps=2)
+    x = keras.layers.Input(shape=(3,), name='input')
+    y = keras.layers.Dense(4, name='dense')(x)
+    model = keras.Model(x, y)
 
-  @tf_test_util.run_deprecated_v1
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    loss = 'mse'
+    model.compile(
+        optimizer,
+        loss,
+        metrics=['mae', metrics_module.CategoricalAccuracy()],
+        run_eagerly=testing_utils.should_run_eagerly())
+
+    inputs = keras.backend.zeros(shape=(10, 3))
+    targets = keras.backend.zeros(shape=(10, 4))
+
+    model.fit(inputs, targets, epochs=1, steps_per_epoch=2, verbose=0)
+    model.evaluate(inputs, targets, steps=2, verbose=0)
+    model.predict(inputs, steps=2)
+    model.train_on_batch(inputs, targets)
+    model.test_on_batch(inputs, targets)
+    model.fit(inputs, targets,
+              epochs=1, steps_per_epoch=2, verbose=0,
+              validation_data=(inputs, targets), validation_steps=2)
+
+    # Test with dynamic shape
+    inputs = array_ops.placeholder_with_default(
+        np.zeros((2, 3)), shape=tensor_shape.TensorShape([None, 3]))
+    targets = array_ops.placeholder_with_default(
+        np.zeros((2, 4)), shape=tensor_shape.TensorShape([None, 4]))
+    self.assertEqual(inputs.shape.dims[0].value, None)
+    model.fit(inputs, targets, epochs=1, steps_per_epoch=2, verbose=0)
+    model.evaluate(inputs, targets, steps=2, verbose=0)
+    model.predict(inputs, steps=2)
+    model.train_on_batch(inputs, targets)
+    model.test_on_batch(inputs, targets)
+    model.fit(inputs, targets,
+              epochs=1, steps_per_epoch=2, verbose=0,
+              validation_data=(inputs, targets), validation_steps=2)
+
+  @keras_parameterized.run_all_keras_modes
   def test_training_and_eval_methods_on_symbolic_tensors_multi_io(self):
-    with self.cached_session():
-      a = keras.layers.Input(shape=(3,), name='input_a')
-      b = keras.layers.Input(shape=(3,), name='input_b')
+    # TODO(kaftan) Test seems to not work, file ticket
+    if context.executing_eagerly():
+      self.skipTest('Skipping eager execution.')
 
-      dense = keras.layers.Dense(4, name='dense')
-      c = dense(a)
-      d = dense(b)
-      e = keras.layers.Dropout(0.5, name='dropout')(c)
+    a = keras.layers.Input(shape=(3,), name='input_a')
+    b = keras.layers.Input(shape=(3,), name='input_b')
 
-      model = keras.models.Model([a, b], [d, e])
+    dense = keras.layers.Dense(4, name='dense')
+    c = dense(a)
+    d = dense(b)
+    e = keras.layers.Dropout(0.5, name='dropout')(c)
 
-      optimizer = 'rmsprop'
-      loss = 'mse'
-      loss_weights = [1., 0.5]
-      model.compile(
-          optimizer,
-          loss,
-          metrics=['mae', metrics_module.CategoricalAccuracy()],
-          loss_weights=loss_weights)
+    model = keras.models.Model([a, b], [d, e])
 
-      input_a_tf = keras.backend.zeros(shape=(10, 3))
-      input_b_tf = keras.backend.zeros(shape=(10, 3))
+    optimizer = 'rmsprop'
+    loss = 'mse'
+    loss_weights = [1., 0.5]
+    model.compile(
+        optimizer,
+        loss,
+        metrics=['mae', metrics_module.CategoricalAccuracy()],
+        loss_weights=loss_weights,
+        run_eagerly=testing_utils.should_run_eagerly())
 
-      output_d_tf = keras.backend.zeros(shape=(10, 4))
-      output_e_tf = keras.backend.zeros(shape=(10, 4))
+    input_a_tf = keras.backend.zeros(shape=(10, 3))
+    input_b_tf = keras.backend.zeros(shape=(10, 3))
 
+    output_d_tf = keras.backend.zeros(shape=(10, 4))
+    output_e_tf = keras.backend.zeros(shape=(10, 4))
+
+    model.fit(
+        [input_a_tf, input_b_tf], [output_d_tf, output_e_tf],
+        epochs=1,
+        steps_per_epoch=2,
+        verbose=0)
+    with self.assertRaisesRegexp(ValueError,
+                                 'should specify the `steps_per_epoch`'):
       model.fit(
           [input_a_tf, input_b_tf], [output_d_tf, output_e_tf],
           epochs=1,
-          steps_per_epoch=2,
+          batch_size=5,
           verbose=0)
-      with self.assertRaisesRegexp(ValueError,
-                                   'should specify the `steps_per_epoch`'):
-        model.fit(
-            [input_a_tf, input_b_tf], [output_d_tf, output_e_tf],
-            epochs=1,
-            batch_size=5,
-            verbose=0)
-      model.train_on_batch([input_a_tf, input_b_tf], [output_d_tf, output_e_tf])
+    model.train_on_batch([input_a_tf, input_b_tf], [output_d_tf, output_e_tf])
 
-      # Test with dictionary inputs
-      model.fit(
-          {'input_a': input_a_tf,
-           'input_b': input_b_tf},
-          {'dense': output_d_tf,
-           'dropout': output_e_tf},
-          epochs=1,
-          steps_per_epoch=2,
-          verbose=0)
-      model.fit(
-          {'input_a': input_a_tf,
-           'input_b': input_b_tf},
-          {'dense': output_d_tf,
-           'dropout': output_e_tf},
-          validation_data=({'input_a': input_a_tf,
-                            'input_b': input_b_tf},
-                           {'dense': output_d_tf,
-                            'dropout': output_e_tf}),
-          epochs=1,
-          steps_per_epoch=2,
-          validation_steps=2,
-          verbose=0)
-      model.train_on_batch(
-          {'input_a': input_a_tf,
-           'input_b': input_b_tf},
-          {'dense': output_d_tf,
-           'dropout': output_e_tf})
+    # Test with dictionary inputs
+    model.fit(
+        {'input_a': input_a_tf,
+         'input_b': input_b_tf},
+        {'dense': output_d_tf,
+         'dropout': output_e_tf},
+        epochs=1,
+        steps_per_epoch=2,
+        verbose=0)
+    model.fit(
+        {'input_a': input_a_tf,
+         'input_b': input_b_tf},
+        {'dense': output_d_tf,
+         'dropout': output_e_tf},
+        validation_data=({'input_a': input_a_tf,
+                          'input_b': input_b_tf},
+                         {'dense': output_d_tf,
+                          'dropout': output_e_tf}),
+        epochs=1,
+        steps_per_epoch=2,
+        validation_steps=2,
+        verbose=0)
+    model.train_on_batch(
+        {'input_a': input_a_tf,
+         'input_b': input_b_tf},
+        {'dense': output_d_tf,
+         'dropout': output_e_tf})
 
-      # Test with validation data
+    # Test with validation data
+    model.fit(
+        [input_a_tf, input_b_tf], [output_d_tf, output_e_tf],
+        validation_data=([input_a_tf, input_b_tf],
+                         [output_d_tf, output_e_tf]),
+        epochs=1,
+        steps_per_epoch=2,
+        validation_steps=2,
+        verbose=0)
+    # Test with validation split
+    with self.assertRaisesRegexp(ValueError,
+                                 'you cannot use `validation_split`'):
       model.fit(
           [input_a_tf, input_b_tf], [output_d_tf, output_e_tf],
-          validation_data=([input_a_tf, input_b_tf],
-                           [output_d_tf, output_e_tf]),
-          epochs=1,
+          epochs=2,
           steps_per_epoch=2,
-          validation_steps=2,
-          verbose=0)
-      # Test with validation split
-      with self.assertRaisesRegexp(ValueError,
-                                   'you cannot use `validation_split`'):
-        model.fit(
-            [input_a_tf, input_b_tf], [output_d_tf, output_e_tf],
-            epochs=2,
-            steps_per_epoch=2,
-            verbose=0,
-            validation_split=0.2,
-            validation_steps=2)
-
-      # Test evaluation / prediction methods
-      model.evaluate([input_a_tf, input_b_tf], [output_d_tf, output_e_tf],
-                     steps=2, verbose=0)
-      model.predict([input_a_tf, input_b_tf], steps=2)
-      model.test_on_batch([input_a_tf, input_b_tf], [output_d_tf, output_e_tf])
+          verbose=0,
+          validation_split=0.2,
+          validation_steps=2)
+
+    # Test evaluation / prediction methods
+    model.evaluate([input_a_tf, input_b_tf], [output_d_tf, output_e_tf],
+                   steps=2, verbose=0)
+    model.predict([input_a_tf, input_b_tf], steps=2)
+    model.test_on_batch([input_a_tf, input_b_tf], [output_d_tf, output_e_tf])
 
   @tf_test_util.run_deprecated_v1
   def test_model_with_input_feed_tensor(self):
@@ -1866,10 +2075,10 @@ class TestTrainingWithDataTensors(test.TestCase):
                            [output_a_np, output_b_np])
 
 
-class TestTrainingWithMetrics(test.TestCase):
+class TestTrainingWithMetrics(keras_parameterized.TestCase):
   """Training tests related to metrics."""
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_metrics_names(self):
     a = keras.layers.Input(shape=(3,), name='input_a')
     b = keras.layers.Input(shape=(3,), name='input_b')
@@ -1883,7 +2092,8 @@ class TestTrainingWithMetrics(test.TestCase):
 
     optimizer = RMSPropOptimizer(learning_rate=0.001)
     metrics = ['mse', metrics_module.BinaryAccuracy()]
-    model.compile(optimizer, loss='mae', metrics=metrics)
+    model.compile(optimizer, loss='mae', metrics=metrics,
+                  run_eagerly=testing_utils.should_run_eagerly())
     reference_metric_names = [
         'loss', 'dense_loss', 'dropout_loss', 'dense_mean_squared_error',
         'dense_binary_accuracy', 'dropout_mean_squared_error',
@@ -1903,7 +2113,7 @@ class TestTrainingWithMetrics(test.TestCase):
               batch_size=5)
     self.assertEqual(reference_metric_names, model.metrics_names)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_metrics_correctness(self):
     model = keras.Sequential()
     model.add(
@@ -1915,7 +2125,8 @@ class TestTrainingWithMetrics(test.TestCase):
     model.compile(
         loss='mae',
         metrics=['accuracy', metrics_module.BinaryAccuracy()],
-        optimizer=RMSPropOptimizer(learning_rate=0.001))
+        optimizer=RMSPropOptimizer(learning_rate=0.001),
+        run_eagerly=testing_utils.should_run_eagerly())
 
     # verify correctness of stateful and stateless metrics.
     x = np.ones((100, 4))
@@ -1929,7 +2140,7 @@ class TestTrainingWithMetrics(test.TestCase):
     self.assertEqual(outs[1], 0.)
     self.assertEqual(outs[2], 0.)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_metrics_correctness_with_weighted_metrics(self):
     np.random.seed(1337)
     x = np.array([[[1.], [1.]], [[0.], [0.]]])
@@ -1942,7 +2153,8 @@ class TestTrainingWithMetrics(test.TestCase):
         RMSPropOptimizer(learning_rate=0.001),
         loss='mse',
         sample_weight_mode='temporal',
-        weighted_metrics=['accuracy', 'mse'])
+        weighted_metrics=['accuracy', 'mse'],
+        run_eagerly=testing_utils.should_run_eagerly())
     y = np.array([[[1.], [1.]], [[1.], [1.]]])
 
     outs = model.evaluate(x, y)
@@ -1964,7 +2176,7 @@ class TestTrainingWithMetrics(test.TestCase):
     mse2 = model.evaluate(x, y, sample_weight=w, batch_size=10)[2]
     self.assertNear(mse1, mse2, err=1e-7)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_metric_state_reset_between_fit_and_evaluate(self):
     model = keras.Sequential()
     model.add(keras.layers.Dense(3, activation='relu', input_dim=4))
@@ -1973,7 +2185,8 @@ class TestTrainingWithMetrics(test.TestCase):
     model.compile(
         loss='mae',
         metrics=[acc_obj],
-        optimizer=RMSPropOptimizer(learning_rate=0.001))
+        optimizer=RMSPropOptimizer(learning_rate=0.001),
+        run_eagerly=testing_utils.should_run_eagerly())
 
     x_train = np.random.random((100, 4))
     y_train = np.random.random((100, 1))
@@ -1985,7 +2198,7 @@ class TestTrainingWithMetrics(test.TestCase):
     model.evaluate(x_test, y_test, batch_size=5)
     self.assertEqual(self.evaluate(acc_obj.count), 10)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_invalid_metrics(self):
     num_classes = 5
     input_dim = 5
@@ -1999,10 +2212,13 @@ class TestTrainingWithMetrics(test.TestCase):
       model.compile(
           RMSPropOptimizer(learning_rate=0.001),
           loss='categorical_crossentropy',
-          metrics=metrics_module.CategoricalAccuracy())
+          metrics=metrics_module.CategoricalAccuracy(),
+          run_eagerly=testing_utils.should_run_eagerly())
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_metrics_masking(self):
+    if testing_utils.should_run_eagerly():
+      self.skipTest('b/120495761')
     with self.cached_session():
       np.random.seed(1337)
       model = keras.models.Sequential()
@@ -2013,7 +2229,8 @@ class TestTrainingWithMetrics(test.TestCase):
       model.compile(
           RMSPropOptimizer(learning_rate=0.001),
           loss='mse',
-          weighted_metrics=['accuracy'])
+          weighted_metrics=['accuracy'],
+          run_eagerly=testing_utils.should_run_eagerly())
 
       # verify that masking is applied.
       x = np.array([[[1], [1]], [[1], [1]], [[0], [0]]])
@@ -2061,7 +2278,7 @@ class TestTrainingWithMetrics(test.TestCase):
       model.train_on_batch(inputs, targets)
       model.test_on_batch(inputs, targets)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_add_metric_in_model_call(self):
 
     class TestModel(keras.Model):
@@ -2080,7 +2297,8 @@ class TestTrainingWithMetrics(test.TestCase):
         return self.dense1(x)
 
     model = TestModel()
-    model.compile(loss='mse', optimizer=RMSPropOptimizer(0.01))
+    model.compile(loss='mse', optimizer=RMSPropOptimizer(0.01),
+                  run_eagerly=testing_utils.should_run_eagerly())
 
     x = np.ones(shape=(10, 1))
     y = np.ones(shape=(10, 2))
@@ -2098,45 +2316,7 @@ class TestTrainingWithMetrics(test.TestCase):
     model.train_on_batch(x, y)
     model.test_on_batch(x, y)
 
-  def test_add_metric_in_model_call_run_eagerly(self):
-    with context.eager_mode():
-
-      class TestModel(keras.Model):
-
-        def __init__(self):
-          super(TestModel, self).__init__(name='test_model')
-          self.dense1 = keras.layers.Dense(2, kernel_initializer='ones')
-          self.mean = metrics_module.Mean(name='metric_1')
-
-        def call(self, x):
-          self.add_metric(
-              math_ops.reduce_sum(x), name='metric_2', aggregation='mean')
-          # Provide same name as in the instance created in __init__
-          # for eager mode
-          self.add_metric(self.mean(x), name='metric_1')
-          return self.dense1(x)
-
-      model = TestModel()
-      model.compile(
-          loss='mse', optimizer=RMSPropOptimizer(0.01), run_eagerly=True)
-
-      x = np.ones(shape=(10, 1))
-      y = np.ones(shape=(10, 2))
-      history = model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y))
-      self.assertAlmostEqual(history.history['metric_1'][-1], 1, 0)
-      self.assertAlmostEqual(history.history['val_metric_1'][-1], 1, 0)
-      self.assertAlmostEqual(history.history['metric_2'][-1], 5, 0)
-      self.assertAlmostEqual(history.history['val_metric_2'][-1], 5, 0)
-
-      eval_results = model.evaluate(x, y, batch_size=5)
-      self.assertAlmostEqual(eval_results[1], 1, 0)
-      self.assertAlmostEqual(eval_results[2], 5, 0)
-
-      model.predict(x, batch_size=5)
-      model.train_on_batch(x, y)
-      model.test_on_batch(x, y)
-
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_add_metric_in_layer_call(self):
 
     class TestLayer(keras.layers.Layer):
@@ -2154,7 +2334,8 @@ class TestTrainingWithMetrics(test.TestCase):
     model = keras.Sequential()
     model.add(TestLayer(input_shape=(1,)))
     model.add(keras.layers.Dense(2, kernel_initializer='ones'))
-    model.compile(loss='mse', optimizer=RMSPropOptimizer(0.01))
+    model.compile(loss='mse', optimizer=RMSPropOptimizer(0.01),
+                  run_eagerly=testing_utils.should_run_eagerly())
 
     x = np.ones(shape=(10, 1))
     y = np.ones(shape=(10, 2))
@@ -2162,33 +2343,6 @@ class TestTrainingWithMetrics(test.TestCase):
     self.assertEqual(history.history['metric_1'][-1], 5)
     self.assertAlmostEqual(history.history['val_metric_1'][-1], 5, 0)
 
-  def test_add_metric_in_layer_call_run_eagerly(self):
-    with context.eager_mode():
-
-      class TestLayer(keras.layers.Layer):
-
-        def build(self, input_shape):
-          self.a = self.add_variable(
-              'a', (1, 1), initializer='ones', trainable=False)
-          self.built = True
-
-        def call(self, inputs):
-          self.add_metric(
-              math_ops.reduce_sum(inputs), name='metric_1', aggregation='mean')
-          return inputs + 1
-
-      model = keras.Sequential()
-      model.add(TestLayer(input_shape=(1,)))
-      model.add(keras.layers.Dense(2, kernel_initializer='ones'))
-      model.compile(
-          loss='mse', optimizer=RMSPropOptimizer(0.01), run_eagerly=True)
-
-      x = np.ones(shape=(10, 1))
-      y = np.ones(shape=(10, 2))
-      history = model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y))
-      self.assertEqual(history.history['metric_1'][-1], 5)
-      self.assertAlmostEqual(history.history['val_metric_1'][-1], 5, 0)
-
   @tf_test_util.run_deprecated_v1
   def test_model_metrics_list(self):
     with self.cached_session():
@@ -2244,7 +2398,7 @@ class TestTrainingWithMetrics(test.TestCase):
           names.append(m.__name__)
       self.assertEqual(names, ['categorical_accuracy', 'metric_1'])
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_multiple_add_metric_calls(self):
 
     class TestModel(keras.Model):
@@ -2263,7 +2417,8 @@ class TestTrainingWithMetrics(test.TestCase):
         return self.dense1(x)
 
     model = TestModel()
-    model.compile(loss='mse', optimizer=RMSPropOptimizer(0.01))
+    model.compile(loss='mse', optimizer=RMSPropOptimizer(0.01),
+                  run_eagerly=testing_utils.should_run_eagerly())
 
     x = np.ones(shape=(10, 1))
     y = np.ones(shape=(10, 2))
@@ -2302,7 +2457,7 @@ class TestTrainingWithMetrics(test.TestCase):
           'eager execution.'):
         model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y))
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_duplicate_metric_name_in_add_metric(self):
 
     class TestModel(keras.Model):
@@ -2318,7 +2473,8 @@ class TestTrainingWithMetrics(test.TestCase):
         return self.dense1(x)
 
     model = TestModel()
-    model.compile(loss='mse', optimizer=RMSPropOptimizer(0.01))
+    model.compile(loss='mse', optimizer=RMSPropOptimizer(0.01),
+                  run_eagerly=testing_utils.should_run_eagerly())
 
     x = np.ones(shape=(10, 1))
     y = np.ones(shape=(10, 2))
@@ -2328,8 +2484,11 @@ class TestTrainingWithMetrics(test.TestCase):
         'We found 2 metrics with the name: "metric_1"'):
       model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y))
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_multiple_no_name_input_to_add_metric(self):
+    # TODO(kaftan) Test seems to not work, file ticket
+    if testing_utils.should_run_eagerly() and context.executing_eagerly():
+      self.skipTest('Skipping running model eagerly.')
 
     class TestModel(keras.Model):
 
@@ -2343,7 +2502,8 @@ class TestTrainingWithMetrics(test.TestCase):
         return self.dense1(x)
 
     model = TestModel()
-    model.compile(loss='mse', optimizer=RMSPropOptimizer(0.01))
+    model.compile(loss='mse', optimizer=RMSPropOptimizer(0.01),
+                  run_eagerly=testing_utils.should_run_eagerly())
     x = np.ones(shape=(10, 1))
     y = np.ones(shape=(10, 2))
     model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y))
diff --git a/tensorflow/python/keras/engine/training_utils.py b/tensorflow/python/keras/engine/training_utils.py
index 347582aa95a4a65a5b64c45e5c3f0b533584e69c..64c6f727c91be77f41398f6f63147c481b4e8cc2 100644
--- a/tensorflow/python/keras/engine/training_utils.py
+++ b/tensorflow/python/keras/engine/training_utils.py
@@ -18,23 +18,27 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import abc
 from collections import OrderedDict
 import copy
-import math
 
 import numpy as np
 import six
 
-from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_spec
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.keras import backend as K
+from tensorflow.python.keras import callbacks as cbks
 from tensorflow.python.keras import losses
 from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.keras.engine import base_layer
+from tensorflow.python.keras.utils import generic_utils
 from tensorflow.python.keras.utils.losses_utils import squeeze_or_expand_dimensions
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
@@ -42,143 +46,138 @@ from tensorflow.python.ops import weights_broadcast_ops
 from tensorflow.python.util import nest
 
 
-def _map_nested(data, func):
-  """Maps each nested element using func."""
-  if isinstance(data, list):
-    return [_map_nested(nested_data, func) for nested_data in data]
-  elif isinstance(data, tuple):
-    return tuple(_map_nested(nested_data, func) for nested_data in data)
-  elif isinstance(data, dict):
-    return {
-        k: _map_nested(nested_data, func) for k, nested_data in data.items()
-    }
-  else:
-    return func(data)
+@six.add_metaclass(abc.ABCMeta)
+class Aggregator(object):
+  """Abstract base class used to aggregate batch-level outputs of a loop.
 
+  Attributes:
+    use_steps: Whether the loop is using `step` or `batch_size`.
+    num_samples_or_steps: Either `batch_size*num_batches` or `steps`.
+    results: What to return at the end of the aggregation loop.
+  """
 
-def _nested_all(data, cond_func):
-  """Checks if all elements in a nested structure satisfy cond_func."""
-  if isinstance(data, (tuple, list)):
-    return all(_nested_all(nested_data, cond_func) for nested_data in data)
-  elif isinstance(data, dict):
-    return all(
-        _nested_all(nested_data, cond_func) for nested_data in data.values())
-  else:
-    return cond_func(data)
+  def __init__(self, use_steps, num_samples_or_steps):
+    self.use_steps = use_steps
+    self.num_samples_or_steps = num_samples_or_steps
+    self.results = []
 
+  @abc.abstractmethod
+  def create(self, batch_outs):
+    """Creates the initial results from the first batch outputs.
 
-def _nested_any(data, cond_func):
-  """Checks if any nested_elements in a nested structure satisfy cond_func."""
-  if isinstance(data, (tuple, list)):
-    return any(_nested_any(nested_data, cond_func) for nested_data in data)
-  elif isinstance(data, dict):
-    return any(
-        [_nested_any(nested_data, cond_func) for nested_data in data.values()])
-  else:
-    return cond_func(data)
-
-
-def _convert_lists_to_tuples(data):
-  """Converts all lists to tuples, since Datasets expect tuples."""
-  if isinstance(data, (tuple, list)):
-    return tuple(_convert_lists_to_tuples(nested_data) for nested_data in data)
-  elif isinstance(data, dict):
-    return {
-        k: _convert_lists_to_tuples(nested_data)
-        for k, nested_data in data.items()
-    }
-  else:
-    return data
+    Arguments:
+      batch_outs: A list of batch-level outputs.
+    """
+    NotImplementedError('Must be implemented in subclasses.')
 
+  @abc.abstractmethod
+  def aggregate(self, batch_outs, batch_start=None, batch_end=None):
+    """Aggregates batch-level results into total results.
 
-def _get_batch_axis_size(data):
-  """Returns batch axis shape for nested data."""
-  if isinstance(data, (tuple, list)):
-    return _get_batch_axis_size(data[0])
-  elif isinstance(data, dict):
-    return _get_batch_axis_size(list(data.values()))
-  else:
-    return int(data.shape[0])
+    Arguments:
+      batch_outs: A list of batch-level outputs.
+      batch_start: The start index of this batch. Always `None` if `use_steps`
+        is `True`.
+      batch_end: The end index of this batch. Always `None` if `use_steps` is
+        `True`.
+    """
+    NotImplementedError('Must be implemented in subclasses.')
 
+  @abc.abstractmethod
+  def finalize(self):
+    """Prepares the total results to be returned."""
+    NotImplementedError('Must be implemented in subclasses.')
 
-def convert_to_iterator(x=None,
-                        y=None,
-                        sample_weights=None,
-                        batch_size=None,
-                        steps_per_epoch=None,
-                        epochs=1,
-                        shuffle=False,
-                        is_validation=False):
-  """Converts NumPy arrays or EagerTensors to an EagerIterator.
 
-  Combines all provided data into a single EagerIterator.
+class MetricsAggregator(Aggregator):
+  """Aggregator that calculates loss and metrics info."""
 
-  Arguments:
-      x: NumPy array or EagerTensor,  or list of Numpy arrays or EagerTensors
-        representing inputs to a model.
-      y: Optional. NumPy array or EagerTensor, or list of Numpy arrays or
-        EagerTensors representing targets of a model.
-      sample_weights: Optional NumPy array or EagerTensor representing sample
-        weights.
-      batch_size: Used to batch data and calculate how many steps EagerIterator
-        should take per epoch.
-      steps_per_epoch: If provided, how many steps EagerIterator should take per
-        epoch.
-      epochs: Epochs to repeat iterator for.
-      shuffle: Whether to shuffle data after each epoch.
-      is_validation: Whether this call is for validation during a training
-        (e.g., `fit()`) call. This info is used to construct error messages
-        (if any).
+  def create(self, batch_outs):
+    self.results = [0.] * len(batch_outs)
 
-  Raises:
-      ValueError: if steps_per_epoch cannot be calculated from the data
-      provided.
+  def aggregate(self, batch_outs, batch_start=None, batch_end=None):
+    # Loss.
+    if self.use_steps:
+      self.results[0] += batch_outs[0]
+    else:
+      self.results[0] += batch_outs[0] * (batch_end - batch_start)
+    # Metrics (always stateful, just grab current values.)
+    self.results[1:] = batch_outs[1:]
 
-  Returns:
-      (Iterator, steps_per_epoch).
+  def finalize(self):
+    self.results[0] /= self.num_samples_or_steps
 
-  """
-  if isinstance(x, iterator_ops.EagerIterator):
-    if steps_per_epoch is None:
-      raise ValueError('You must specify the number of steps (number of batches'
-                       ' to draw from the iterator).')
-    return x, steps_per_epoch
-
-  if not _nested_any(sample_weights, lambda x: x is None):
-    data = (x, y, sample_weights)
-  elif not _nested_any(y, lambda x: x is None):
-    data = (x, y)
-  else:
-    # always wrap in a tuple, so we know y, sample_weights weren't set
-    # even when x has multiple elements
-    data = (x,)
-
-  data = _convert_lists_to_tuples(data)
-  if steps_per_epoch is None and batch_size is not None:
-    num_samples = _get_batch_axis_size(data)
-    steps_per_epoch = int(math.ceil(num_samples / int(batch_size)))
-
-  if steps_per_epoch is None:
-    alternative_arg_name = (
-        'validation_steps' if is_validation else 'steps_per_epoch')
-    raise ValueError(
-        'Could not determine how to convert EagerTensors into EagerIterator. '
-        'Please provide either `batch_size` or '
-        '`%s`.' % alternative_arg_name)
 
-  # TODO(omalleyt) for NumPy arrays in graph mode
-  # placeholder ops should be used
-  # this is only ideal for eager mode
-  dataset = dataset_ops.Dataset.from_tensor_slices(data)
+class OutputsAggregator(Aggregator):
+  """Aggregator that concatenates outputs."""
+
+  def create(self, batch_outs):
+    if self.use_steps:
+      # Cannot pre-allocate the returned NumPy arrays bc
+      # batch sizes are unknown. Concatenate batches at the end.
+      for _ in batch_outs:
+        self.results.append([])
+    else:
+      # Pre-allocate NumPy arrays.
+      for batch_out in batch_outs:
+        shape = (self.num_samples_or_steps,) + batch_out.shape[1:]
+        self.results.append(np.zeros(shape, dtype=batch_out.dtype))
+
+  def aggregate(self, batch_outs, batch_start=None, batch_end=None):
+    if self.use_steps:
+      for i, batch_out in enumerate(batch_outs):
+        self.results[i].append(batch_out)
+    else:
+      for i, batch_out in enumerate(batch_outs):
+        self.results[i][batch_start:batch_end] = batch_out
+
+  def finalize(self):
+    if self.use_steps:
+      self.results = [np.concatenate(result, axis=0) for result in self.results]
+
+
+def get_progbar(model, count_mode):
+  """Get Progbar."""
+  stateful_metric_names = None
+  if hasattr(model, 'metrics_names'):
+    stateful_metric_names = model.metrics_names[1:]  # Exclude `loss`
+  return cbks.ProgbarLogger(count_mode, stateful_metrics=stateful_metric_names)
+
 
-  if batch_size is not None:
-    dataset = dataset.batch(batch_size)
-  if shuffle:
-    dataset = dataset.shuffle(buffer_size=10000)
-  dataset = dataset.repeat(epochs)
-  iterator = dataset.make_one_shot_iterator()
+def slice_arrays(arrays, indices, contiguous=True):
+  """Slices batches out of provided arrays (workaround for eager tensors).
 
-  return iterator, steps_per_epoch
+  Unfortunately eager tensors don't have the same slicing behavior as
+  Numpy arrays (they follow the same slicing behavior as symbolic TF tensors),
+  hence we cannot use `generic_utils.slice_arrays` directly
+  and we have to implement this workaround based on `concat`. This has a
+  performance cost.
+
+  Arguments:
+    arrays: Single array or list of arrays.
+    indices: List of indices in the array that should be included in the output
+      batch.
+    contiguous: Boolean flag indicating whether the indices are contiguous.
+
+  Returns:
+    Slice of data (either single array or list of arrays).
+  """
+  converted_to_list = False
+  if not isinstance(arrays, list):
+    converted_to_list = True
+    arrays = [arrays]
+  if any(tensor_util.is_tensor(x) for x in arrays):
+    if not contiguous:
+      entries = [[x[i:i + 1] for i in indices] for x in arrays]
+      slices = [array_ops.concat(x, axis=0) for x in entries]
+    else:
+      slices = [x[indices[0]:indices[-1] + 1] for x in arrays]
+  else:
+    slices = generic_utils.slice_arrays(arrays, indices)
+
+  if converted_to_list:
+    slices = slices[0]
+  return slices
 
 
 def check_num_samples(ins,
@@ -223,10 +222,14 @@ def check_num_samples(ins,
   return None  # Edge case where ins == [static_learning_phase]
 
 
-def standardize_single_array(x):
+def standardize_single_array(x, expected_shape=None):
+  """Expand data of shape (x,) to (x, 1), unless len(expected_shape)==1."""
   if x is None:
     return None
-  if x.shape is not None and len(x.shape) == 1:
+
+  if (x.shape is not None
+      and len(x.shape) == 1
+      and (expected_shape is None or len(expected_shape) != 1)):
     if tensor_util.is_tensor(x):
       x = array_ops.expand_dims(x, axis=1)
     else:
@@ -292,7 +295,11 @@ def standardize_input_data(data,
   else:
     data = data.values if data.__class__.__name__ == 'DataFrame' else data
     data = [data]
-  data = [standardize_single_array(x) for x in data]
+  if shapes is not None:
+    data = [standardize_single_array(x, shape)
+            for (x, shape) in zip(data, shapes)]
+  else:
+    data = [standardize_single_array(x) for x in data]
 
   if len(data) != len(names):
     if data and hasattr(data[0], 'shape'):
@@ -1090,6 +1097,9 @@ class ModelInputs(object):
 
   def get_symbolic_inputs(self, return_single_as_list=False):
     """Returns inputs to be set as self.inputs for a model."""
+    # TODO(karmel): There is a side-effect here where what you get
+    # with as_list and as_dict depends on whether you have called this
+    # method first, since it modifies in place.
     for i in range(len(self._flattened_inputs)):
       k = self._input_names[i]
       v = self._flattened_inputs[i]
@@ -1097,6 +1107,7 @@ class ModelInputs(object):
         v = np.asarray(v)
         if v.ndim == 1:
           v = np.expand_dims(v, 1)
+
       if isinstance(v, (np.ndarray, ops.EagerTensor)):
         # We fix the placeholder shape except the batch size.
         # This is suboptimal, but it is the best we can do with the info
@@ -1104,6 +1115,10 @@ class ModelInputs(object):
         # to specify custom placeholders if the need arises.
         shape = (None,) + tuple(v.shape[1:])
         v = K.placeholder(shape=shape, name=k)
+      elif isinstance(v, tensor_shape.TensorShape):
+        shape = (None,) + tuple(v.as_list()[1:])
+        v = K.placeholder(shape=shape, name=k)
+
       self._flattened_inputs[i] = v
 
     if self._is_dict:
@@ -1120,3 +1135,112 @@ class ModelInputs(object):
   def as_list(self):
     """Returning the inputs as a list."""
     return self._flattened_inputs
+
+
+# Allow use of methods not exposed to the user.
+# pylint: disable=protected-access
+def get_input_shape_and_dtype(layer):
+  """Retrieves input shape and input dtype of layer if applicable.
+
+  Args:
+    layer: Layer (or model) instance.
+
+  Returns:
+    Tuple (input_shape, input_dtype). Both could be None if the layer
+      does not have a defined input shape.
+
+  Raises:
+    ValueError: in case an empty Sequential or Graph Network is passed.
+  """
+
+  def _is_graph_model(layer):
+    return ((hasattr(layer, '_is_graph_network') and layer._is_graph_network) or
+            layer.__class__.__name__ == 'Sequential')
+
+  # In case of nested models: recover the first layer
+  # of the deepest model to infer input shape and dtype.
+  # Subclassed Models may not have been built so can't be checked.
+  while _is_graph_model(layer):
+    if not layer.layers:
+      raise ValueError('An empty Model cannot be used as a Layer.')
+    layer = layer.layers[0]
+
+  if hasattr(layer, '_batch_input_shape'):
+    return layer._batch_input_shape, layer.dtype
+  return None, None
+
+
+# pylint: enable=protected-access
+
+
+def get_static_batch_size(layer):
+  """Gets the static batch size of a Layer.
+
+  Arguments:
+    layer: a `Layer` instance.
+
+  Returns:
+    The static batch size of a Layer.
+  """
+  batch_input_shape, _ = get_input_shape_and_dtype(layer)
+  if batch_input_shape is not None:
+    return tensor_shape.as_dimension(batch_input_shape[0]).value
+  return None
+
+
+def generic_output_names(outputs_list):
+  return ['output_%d' % (i + 1) for i in range(len(outputs_list))]
+
+
+def trace_model_call(model, input_signature=None):
+  """Trace the model call to create a tf.function for exporting a Keras model.
+
+  Args:
+    model: A Keras model.
+    input_signature: optional, a list of tf.TensorSpec objects specifying the
+      inputs to the model.
+
+  Returns:
+    A tf.function wrapping the model's call function with input signatures set.
+
+  Raises:
+    ValueError: if input signature cannot be inferred from the model.
+  """
+  if input_signature is None:
+    if isinstance(model.call, def_function.PolymorphicFunction):
+      input_signature = model.call.input_signature
+
+  if input_signature is None:
+    try:
+      inputs = model.inputs
+      input_names = model.input_names
+    except AttributeError:
+      raise ValueError(
+          'Model {} cannot be saved because the input shapes have not been '
+          'set. Usually, input shapes are automatically determined from calling'
+          ' .fit() or .predict(). To manually set the shapes, call '
+          'model._set_inputs(inputs).'.format(model))
+    input_specs = []
+    for input_tensor, input_name in zip(inputs, input_names):
+      input_specs.append(tensor_spec.TensorSpec(
+          shape=input_tensor.shape, dtype=input_tensor.dtype,
+          name=input_name))
+    # The input signature of the call function is a list with one element, since
+    # all tensor inputs must be passed in as the first argument.
+    input_signature = [input_specs] if len(input_specs) > 1 else input_specs
+
+  @def_function.function(input_signature=input_signature)
+  def _wrapped_model(*args):
+    """A concrete tf.function that wraps the model's call function."""
+    # When given a single input, Keras models will call the model on the tensor
+    # rather than a list consisting of the single tensor.
+    inputs = args[0] if len(input_signature) == 1 else list(args)
+    outputs_list = nest.flatten(model(inputs=inputs))
+    try:
+      output_names = model.output_names
+    except AttributeError:
+      output_names = generic_output_names(outputs_list)
+    return {name: output for name, output in zip(output_names, outputs_list)}
+
+  return _wrapped_model
+
diff --git a/tensorflow/python/keras/engine/training_utils_test.py b/tensorflow/python/keras/engine/training_utils_test.py
index d6a92dec7c445c49e07426d2d022055272ebeeb3..d8acec32cb65ffb2bbf517007802504e7c184544 100644
--- a/tensorflow/python/keras/engine/training_utils_test.py
+++ b/tensorflow/python/keras/engine/training_utils_test.py
@@ -18,147 +18,30 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
+
 import numpy as np
 
+
+from tensorflow.python.client import session as session_lib
+from tensorflow.python import keras
 from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_spec
 from tensorflow.python.framework import tensor_util
-from tensorflow.python.framework import test_util
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras import keras_parameterized
+from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.engine import training_utils
 from tensorflow.python.keras.utils import tf_utils
+from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
-
-
-class TrainingUtilTest(test.TestCase):
-
-  @test_util.run_in_graph_and_eager_modes
-  def test_convert_to_iterator_single_numpy(self):
-    batch_size = 2
-    a = np.ones([10, 10])
-    iterator, steps_per_epoch = training_utils.convert_to_iterator(
-        x=a, batch_size=batch_size)
-    self.assertEquals(steps_per_epoch, 5)
-
-    expected_batch = a[:batch_size, :]
-    actual_batch, = iterator.get_next()
-    self.assertAllEqual(expected_batch, actual_batch)
-
-  @test_util.run_in_graph_and_eager_modes
-  def test_convert_to_iterator_single_tensor(self):
-    batch_size = 2
-    a = ops.convert_to_tensor(np.ones([10, 10]))
-    iterator, steps_per_epoch = training_utils.convert_to_iterator(
-        x=a, batch_size=batch_size)
-    self.assertEquals(steps_per_epoch, 5)
-
-    expected_batch = a[:batch_size, :]
-    actual_batch, = iterator.get_next()
-    self.assertAllEqual(expected_batch, actual_batch)
-
-  @test_util.run_in_graph_and_eager_modes
-  def test_convert_to_iterator_y(self):
-    batch_size = 2
-    a = np.ones([10, 100])
-    b = np.ones([10, 10])
-    iterator, steps_per_epoch = training_utils.convert_to_iterator(
-        x=a, y=b, batch_size=batch_size)
-    self.assertEquals(steps_per_epoch, 5)
-
-    expected_x = a[:batch_size, :]
-    expected_y = b[:batch_size, :]
-    actual_x, actual_y = iterator.get_next()
-    self.assertAllEqual(expected_x, actual_x)
-    self.assertAllEqual(expected_y, actual_y)
-
-  @test_util.run_in_graph_and_eager_modes
-  def test_convert_to_iterator_sample_weights(self):
-    batch_size = 2
-    a = ops.convert_to_tensor(np.ones([10, 100]))
-    b = ops.convert_to_tensor(np.ones([10, 10]))
-    sw = ops.convert_to_tensor(np.ones([10]))
-    iterator, steps_per_epoch = training_utils.convert_to_iterator(
-        x=a, y=b, sample_weights=sw, batch_size=batch_size)
-    self.assertEquals(steps_per_epoch, 5)
-
-    expected_x = a[:batch_size, :]
-    expected_y = b[:batch_size, :]
-    expected_sw = sw[:batch_size]
-    actual_x, actual_y, actual_sw = iterator.get_next()
-    self.assertAllEqual(expected_x, actual_x)
-    self.assertAllEqual(expected_y, actual_y)
-    self.assertAllEqual(expected_sw, actual_sw)
-
-  @test_util.run_in_graph_and_eager_modes
-  def test_convert_to_iterator_nested(self):
-    batch_size = 2
-    x = {'1': np.ones([10, 100]), '2': [np.zeros([10, 10]), np.ones([10, 20])]}
-    iterator, steps_per_epoch = training_utils.convert_to_iterator(
-        x=x, batch_size=batch_size)
-    self.assertEquals(steps_per_epoch, 5)
-
-    expected_x1 = x['1'][:batch_size, :]
-    expected_x2_0 = x['2'][0][:batch_size, :]
-    expected_x2_1 = x['2'][1][:batch_size, :]
-
-    actual_x, = iterator.get_next()
-    actual_x1 = actual_x['1'][:batch_size, :]
-    actual_x2_0 = actual_x['2'][0][:batch_size, :]
-    actual_x2_1 = actual_x['2'][1][:batch_size, :]
-
-    self.assertAllEqual(expected_x1, actual_x1)
-    self.assertAllEqual(expected_x2_0, actual_x2_0)
-    self.assertAllEqual(expected_x2_1, actual_x2_1)
-
-  @test_util.run_in_graph_and_eager_modes
-  def test_convert_to_iterator_epochs(self):
-    batch_size = 2
-    a = np.ones([10, 10])
-    iterator, steps_per_epoch = training_utils.convert_to_iterator(
-        x=a, batch_size=batch_size, epochs=2)
-    self.assertEquals(steps_per_epoch, 5)
-
-    expected_batch = a[:batch_size, :]
-    # loop through one whole epoch
-    for _ in range(6):
-      actual_batch, = iterator.get_next()
-    self.assertAllEqual(expected_batch, actual_batch)
-
-  @test_util.run_in_graph_and_eager_modes
-  def test_convert_to_iterator_insufficient_info(self):
-    # with batch_size and steps_per_epoch not set
-    with self.assertRaises(ValueError):
-      a = np.ones([10, 10])
-      _ = training_utils.convert_to_iterator(x=a)
-
-  def test_nested_all(self):
-    nested_data = {'a': True, 'b': [True, True, (False, True)]}
-    all_true = training_utils._nested_all(nested_data, lambda x: x)
-    self.assertEquals(all_true, False)
-
-    nested_data = {'a': True, 'b': [True, True, (True, True)]}
-    all_true = training_utils._nested_all(nested_data, lambda x: x)
-    self.assertEquals(all_true, True)
-
-  def test_nested_any(self):
-    nested_data = [False, {'a': False, 'b': (False, True)}]
-    any_true = training_utils._nested_any(nested_data, lambda x: x)
-    self.assertEquals(any_true, True)
-
-    nested_data = [False, {'a': False, 'b': (False, False)}]
-    any_true = training_utils._nested_any(nested_data, lambda x: x)
-    self.assertEquals(any_true, False)
-
-  def test_check_array_lengths(self):
-    training_utils.check_array_lengths(None, None, None)
-    a_np = np.random.random((4, 3, 3))
-    training_utils.check_array_lengths(a_np, a_np, a_np)
-    training_utils.check_array_lengths(
-        [a_np, a_np], [a_np, a_np], [a_np, a_np])
-    training_utils.check_array_lengths([None], [None], [None])
-
-    b_np = np.random.random((3, 4))
-    with self.assertRaises(ValueError):
-      training_utils.check_array_lengths([a_np], [b_np], None)
+from tensorflow.python.saved_model import loader
+from tensorflow.python.saved_model import save as save_lib
+from tensorflow.python.saved_model import signature_constants
+from tensorflow.python.saved_model import tag_constants
 
 
 class ModelInputsTest(test.TestCase):
@@ -166,28 +49,28 @@ class ModelInputsTest(test.TestCase):
   def test_single_thing(self):
     a = np.ones(10)
     model_inputs = training_utils.ModelInputs(a)
-    self.assertEquals(['input_1'], model_inputs.get_input_names())
+    self.assertEqual(['input_1'], model_inputs.get_input_names())
     vals = model_inputs.get_symbolic_inputs()
     self.assertTrue(tensor_util.is_tensor(vals))
     vals = model_inputs.get_symbolic_inputs(return_single_as_list=True)
-    self.assertEquals(1, len(vals))
+    self.assertEqual(1, len(vals))
     self.assertTrue(tensor_util.is_tensor(vals[0]))
 
   def test_single_thing_eager(self):
     with context.eager_mode():
       a = np.ones(10)
       model_inputs = training_utils.ModelInputs(a)
-      self.assertEquals(['input_1'], model_inputs.get_input_names())
+      self.assertEqual(['input_1'], model_inputs.get_input_names())
       val = model_inputs.get_symbolic_inputs()
       self.assertTrue(tf_utils.is_symbolic_tensor(val))
       vals = model_inputs.get_symbolic_inputs(return_single_as_list=True)
-      self.assertEquals(1, len(vals))
+      self.assertEqual(1, len(vals))
       self.assertTrue(tf_utils.is_symbolic_tensor(vals[0]))
 
   def test_list(self):
     a = [np.ones(10), np.ones(20)]
     model_inputs = training_utils.ModelInputs(a)
-    self.assertEquals(['input_1', 'input_2'], model_inputs.get_input_names())
+    self.assertEqual(['input_1', 'input_2'], model_inputs.get_input_names())
     vals = model_inputs.get_symbolic_inputs()
     self.assertTrue(tensor_util.is_tensor(vals[0]))
     self.assertTrue(tensor_util.is_tensor(vals[1]))
@@ -196,7 +79,7 @@ class ModelInputsTest(test.TestCase):
     with context.eager_mode():
       a = [np.ones(10), np.ones(20)]
       model_inputs = training_utils.ModelInputs(a)
-      self.assertEquals(['input_1', 'input_2'], model_inputs.get_input_names())
+      self.assertEqual(['input_1', 'input_2'], model_inputs.get_input_names())
       vals = model_inputs.get_symbolic_inputs()
       self.assertTrue(tf_utils.is_symbolic_tensor(vals[0]))
       self.assertTrue(tf_utils.is_symbolic_tensor(vals[1]))
@@ -204,7 +87,7 @@ class ModelInputsTest(test.TestCase):
   def test_dict(self):
     a = {'b': np.ones(10), 'a': np.ones(20)}
     model_inputs = training_utils.ModelInputs(a)
-    self.assertEquals(['a', 'b'], model_inputs.get_input_names())
+    self.assertEqual(['a', 'b'], model_inputs.get_input_names())
     vals = model_inputs.get_symbolic_inputs()
     self.assertTrue(tensor_util.is_tensor(vals['a']))
     self.assertTrue(tensor_util.is_tensor(vals['b']))
@@ -213,11 +96,174 @@ class ModelInputsTest(test.TestCase):
     with context.eager_mode():
       a = {'b': np.ones(10), 'a': np.ones(20)}
       model_inputs = training_utils.ModelInputs(a)
-      self.assertEquals(['a', 'b'], model_inputs.get_input_names())
+      self.assertEqual(['a', 'b'], model_inputs.get_input_names())
       vals = model_inputs.get_symbolic_inputs()
       self.assertTrue(tf_utils.is_symbolic_tensor(vals['a']))
       self.assertTrue(tf_utils.is_symbolic_tensor(vals['b']))
 
 
+class TraceModelCallTest(keras_parameterized.TestCase):
+
+  def _assert_all_close(self, expected, actual):
+    if not context.executing_eagerly():
+      with self.cached_session() as sess:
+        K._initialize_variables(sess)
+        self.assertAllClose(expected, actual)
+    else:
+      self.assertAllClose(expected, actual)
+
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
+  def test_trace_model_outputs(self):
+    input_dim = 5 if testing_utils.get_model_type() == 'functional' else None
+    model = testing_utils.get_small_mlp(10, 3, input_dim)
+    inputs = array_ops.ones((8, 5))
+
+    if input_dim is None:
+      with self.assertRaisesRegexp(ValueError,
+                                   'input shapes have not been set'):
+        training_utils.trace_model_call(model)
+      model._set_inputs(inputs)
+
+    fn = training_utils.trace_model_call(model)
+    signature_outputs = fn(inputs)
+    expected_outputs = {model.output_names[0]: model(inputs)}
+
+    self._assert_all_close(expected_outputs, signature_outputs)
+
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
+  def test_trace_model_outputs_after_fitting(self):
+    input_dim = 5 if testing_utils.get_model_type() == 'functional' else None
+    model = testing_utils.get_small_mlp(10, 3, input_dim)
+    model.compile(optimizer='sgd', loss='mse')
+    model.fit(x=np.random.random((8, 5)),
+              y=np.random.random((8, 3)), epochs=2)
+
+    inputs = array_ops.ones((8, 5))
+
+    fn = training_utils.trace_model_call(model)
+    signature_outputs = fn(inputs)
+    expected_outputs = {model.output_names[0]: model(inputs)}
+
+    self._assert_all_close(expected_outputs, signature_outputs)
+
+  @keras_parameterized.run_with_all_model_types(exclude_models='sequential')
+  @keras_parameterized.run_all_keras_modes
+  def test_trace_multi_io_model_outputs(self):
+    input_dim = 5
+    num_classes = 3
+    num_classes_b = 4
+    input_a = keras.layers.Input(shape=(input_dim,), name='input_a')
+    input_b = keras.layers.Input(shape=(input_dim,), name='input_b')
+
+    dense = keras.layers.Dense(num_classes, name='dense')
+    dense2 = keras.layers.Dense(num_classes_b, name='dense2')
+    dropout = keras.layers.Dropout(0.5, name='dropout')
+    branch_a = [input_a, dense]
+    branch_b = [input_b, dense, dense2, dropout]
+
+    model = testing_utils.get_multi_io_model(branch_a, branch_b)
+
+    input_a_np = np.random.random((10, input_dim)).astype(np.float32)
+    input_b_np = np.random.random((10, input_dim)).astype(np.float32)
+
+    if testing_utils.get_model_type() == 'subclass':
+      with self.assertRaisesRegexp(ValueError,
+                                   'input shapes have not been set'):
+        training_utils.trace_model_call(model)
+
+    model.compile(optimizer='sgd', loss='mse')
+    model.fit(x=[np.random.random((8, input_dim)).astype(np.float32),
+                 np.random.random((8, input_dim)).astype(np.float32)],
+              y=[np.random.random((8, num_classes)).astype(np.float32),
+                 np.random.random((8, num_classes_b)).astype(np.float32)],
+              epochs=2)
+
+    fn = training_utils.trace_model_call(model)
+    signature_outputs = fn([input_a_np, input_b_np])
+    outputs = model([input_a_np, input_b_np])
+    expected_outputs = {model.output_names[0]: outputs[0],
+                        model.output_names[1]: outputs[1]}
+
+    self._assert_all_close(expected_outputs, signature_outputs)
+
+  @keras_parameterized.run_all_keras_modes
+  def test_specify_input_signature(self):
+    model = testing_utils.get_small_sequential_mlp(10, 3, None)
+    inputs = array_ops.ones((8, 5))
+
+    with self.assertRaisesRegexp(ValueError, 'input shapes have not been set'):
+      training_utils.trace_model_call(model)
+
+    fn = training_utils.trace_model_call(
+        model, [tensor_spec.TensorSpec(shape=[None, 5], dtype=dtypes.float32)])
+    signature_outputs = fn(inputs)
+    expected_outputs = {model.output_names[0]: model(inputs)}
+    self._assert_all_close(expected_outputs, signature_outputs)
+
+  @keras_parameterized.run_all_keras_modes
+  def test_subclassed_model_with_input_signature(self):
+
+    class Model(keras.Model):
+
+      def __init__(self):
+        super(Model, self).__init__()
+        self.dense = keras.layers.Dense(3, name='dense')
+
+      @def_function.function(
+          input_signature=[[tensor_spec.TensorSpec([None, 5], dtypes.float32),
+                            tensor_spec.TensorSpec([None], dtypes.float32)]],)
+      def call(self, inputs, *args):
+        x, y = inputs
+        return self.dense(x) + y
+
+    model = Model()
+    fn = training_utils.trace_model_call(model)
+    x = array_ops.ones((8, 5), dtype=dtypes.float32)
+    y = array_ops.ones((3,), dtype=dtypes.float32)
+    expected_outputs = {'output_1': model([x, y])}
+    signature_outputs = fn([x, y])
+    self._assert_all_close(expected_outputs, signature_outputs)
+
+
+def _import_and_infer(save_dir, inputs):
+  """Import a SavedModel into a TF 1.x-style graph and run `signature_key`."""
+  graph = ops.Graph()
+  with graph.as_default(), session_lib.Session() as session:
+    model = loader.load(session, [tag_constants.SERVING], save_dir)
+    signature = model.signature_def[
+        signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
+    assert set(inputs.keys()) == set(signature.inputs.keys())
+    feed_dict = {}
+    for arg_name in inputs.keys():
+      feed_dict[graph.get_tensor_by_name(signature.inputs[arg_name].name)] = (
+          inputs[arg_name])
+    output_dict = {}
+    for output_name, output_tensor_info in signature.outputs.items():
+      output_dict[output_name] = graph.get_tensor_by_name(
+          output_tensor_info.name)
+    return session.run(output_dict, feed_dict=feed_dict)
+
+
+class ModelSaveTest(keras_parameterized.TestCase):
+
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes(always_skip_v1=True)
+  def test_model_save(self):
+    input_dim = 5
+    model = testing_utils.get_small_mlp(10, 3, input_dim)
+    inputs = array_ops.ones((8, 5))
+
+    if testing_utils.get_model_type() == 'subclass':
+      model._set_inputs(inputs)
+
+    save_dir = os.path.join(self.get_temp_dir(), 'saved_model')
+    save_lib.save(model, save_dir)
+
+    self.assertAllClose(
+        {model.output_names[0]: model.predict_on_batch(inputs)},
+        _import_and_infer(save_dir, {model.input_names[0]: np.ones((8, 5))}))
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/estimator/__init__.py b/tensorflow/python/keras/estimator/__init__.py
index 3c1a63d6dfdb3b4324e7f29b77d1bceb8d2bf9d1..dcd0600897005f1905b5f6b65cdc0f225172fa1b 100644
--- a/tensorflow/python/keras/estimator/__init__.py
+++ b/tensorflow/python/keras/estimator/__init__.py
@@ -65,7 +65,7 @@ def model_to_estimator(
     raise NotImplementedError(
         'tf.keras.estimator.model_to_estimator function not available in your '
         'installation.')
-  keras_lib.model_to_estimator(
+  return keras_lib.model_to_estimator(
       keras_model=keras_model,
       keras_model_path=keras_model_path,
       custom_objects=custom_objects,
diff --git a/tensorflow/python/keras/integration_test.py b/tensorflow/python/keras/integration_test.py
index f1a0932613bcf4f067e590817375994c26edeb2a..fbe3508f07d85d91c845a9defd2f3660d0b25754 100644
--- a/tensorflow/python/keras/integration_test.py
+++ b/tensorflow/python/keras/integration_test.py
@@ -35,7 +35,6 @@ class KerasIntegrationTest(test.TestCase):
   def test_version(self):
     self.assertTrue(keras.__version__.endswith('-tf'))
 
-  @test_util.run_deprecated_v1
   def test_vector_classification_sequential(self):
     with self.cached_session():
       np.random.seed(1337)
@@ -168,7 +167,6 @@ class KerasIntegrationTest(test.TestCase):
                           verbose=2)
       self.assertGreater(history.history['val_acc'][-1], 0.7)
 
-  @test_util.run_deprecated_v1
   def test_video_classification_functional(self):
     with self.cached_session():
       np.random.seed(1337)
@@ -197,7 +195,6 @@ class KerasIntegrationTest(test.TestCase):
                           verbose=2)
       self.assertGreater(history.history['val_acc'][-1], 0.7)
 
-  @test_util.run_deprecated_v1
   def test_vector_classification_shared_sequential(self):
     # Test that Sequential models that feature internal updates
     # and internal losses can be shared.
@@ -232,7 +229,6 @@ class KerasIntegrationTest(test.TestCase):
                           verbose=2)
       self.assertGreater(history.history['val_acc'][-1], 0.7)
 
-  @test_util.run_deprecated_v1
   def test_vector_classification_shared_model(self):
     # Test that functional models that feature internal updates
     # and internal losses can be shared.
diff --git a/tensorflow/python/keras/keras_parameterized.py b/tensorflow/python/keras/keras_parameterized.py
new file mode 100644
index 0000000000000000000000000000000000000000..d76bbadeb3613a8e71b1a6fc313fb7e68630de93
--- /dev/null
+++ b/tensorflow/python/keras/keras_parameterized.py
@@ -0,0 +1,298 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for unit-testing Keras."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import functools
+import itertools
+import unittest
+
+from absl.testing import parameterized
+
+from tensorflow.python import keras
+from tensorflow.python import tf2
+from tensorflow.python.eager import context
+from tensorflow.python.keras import testing_utils
+from tensorflow.python.platform import test
+from tensorflow.python.util import nest
+
+
+class TestCase(test.TestCase, parameterized.TestCase):
+
+  def tearDown(self):
+    keras.backend.clear_session()
+    super(TestCase, self).tearDown()
+
+
+# TODO(kaftan): Possibly enable 'subclass_custom_build' when tests begin to pass
+# it. Or perhaps make 'subclass' always use a custom build method.
+def run_with_all_model_types(
+    test_or_class=None,
+    exclude_models=None):
+  """Execute the decorated test with all Keras model types.
+
+  This decorator is intended to be applied either to individual test methods in
+  a `keras_parameterized.TestCase` class, or directly to a test class that
+  extends it. Doing so will cause the contents of the individual test
+  method (or all test methods in the class) to be executed multiple times - once
+  for each Keras model type.
+
+  The Keras model types are: ['functional', 'subclass', 'sequential']
+
+  Note: if stacking this decorator with absl.testing's parameterized decorators,
+  those should be at the bottom of the stack.
+
+  Various methods in `testing_utils` to get models will auto-generate a model
+  of the currently active Keras model type. This allows unittests to confirm
+  the equivalence between different Keras models.
+
+  For example, consider the following unittest:
+
+  ```python
+  class MyTests(testing_utils.KerasTestCase):
+
+    @testing_utils.run_with_all_model_types(
+      exclude_models = ['sequential'])
+    def test_foo(self):
+      model = testing_utils.get_small_mlp(1, 4, input_dim=3)
+      optimizer = RMSPropOptimizer(learning_rate=0.001)
+      loss = 'mse'
+      metrics = ['mae']
+      model.compile(optimizer, loss, metrics=metrics)
+
+      inputs = np.zeros((10, 3))
+      targets = np.zeros((10, 4))
+      dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+      dataset = dataset.repeat(100)
+      dataset = dataset.batch(10)
+
+      model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1)
+
+  if __name__ == "__main__":
+    tf.test.main()
+  ```
+
+  This test tries building a small mlp as both a functional model and as a
+  subclass model.
+
+  We can also annotate the whole class if we want this to apply to all tests in
+  the class:
+  ```python
+  @testing_utils.run_with_all_model_types(exclude_models = ['sequential'])
+  class MyTests(testing_utils.KerasTestCase):
+
+    def test_foo(self):
+      model = testing_utils.get_small_mlp(1, 4, input_dim=3)
+      optimizer = RMSPropOptimizer(learning_rate=0.001)
+      loss = 'mse'
+      metrics = ['mae']
+      model.compile(optimizer, loss, metrics=metrics)
+
+      inputs = np.zeros((10, 3))
+      targets = np.zeros((10, 4))
+      dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+      dataset = dataset.repeat(100)
+      dataset = dataset.batch(10)
+
+      model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1)
+
+  if __name__ == "__main__":
+    tf.test.main()
+  ```
+
+
+  Args:
+    test_or_class: test method or class to be annotated. If None,
+      this method returns a decorator that can be applied to a test method or
+      test class. If it is not None this returns the decorator applied to the
+      test or class.
+    exclude_models: A collection of Keras model types to not run.
+      (May also be a single model type not wrapped in a collection).
+      Defaults to None.
+
+  Returns:
+    Returns a decorator that will run the decorated test method multiple times:
+    once for each desired Keras model type.
+
+  Raises:
+    ImportError: If abseil parameterized is not installed or not included as
+      a target dependency.
+  """
+  model_types = ['functional', 'subclass', 'sequential']
+  params = [('_%s' % model, model) for model in model_types
+            if model not in nest.flatten(exclude_models)]
+
+  def single_method_decorator(f):
+    """Decorator that constructs the test cases."""
+    # Use named_parameters so it can be individually run from the command line
+    @parameterized.named_parameters(*params)
+    @functools.wraps(f)
+    def decorated(self, model_type, *args, **kwargs):
+      """A run of a single test case w/ the specified model type."""
+      with testing_utils.model_type_scope(model_type):
+        f(self, *args, **kwargs)
+
+    return decorated
+
+  return _test_or_class_decorator(test_or_class, single_method_decorator)
+
+
+def run_all_keras_modes(
+    test_or_class=None,
+    config=None,
+    always_skip_v1=False):
+  """Execute the decorated test with all keras execution modes.
+
+  This decorator is intended to be applied either to individual test methods in
+  a `keras_parameterized.TestCase` class, or directly to a test class that
+  extends it. Doing so will cause the contents of the individual test
+  method (or all test methods in the class) to be executed multiple times -
+  once executing in legacy graph mode, once running eagerly and with
+  `should_run_eagerly` returning True, and once running eagerly with
+  `should_run_eagerly` returning False.
+
+  If Tensorflow v2 behavior is enabled, legacy graph mode will be skipped, and
+  the test will only run twice.
+
+  Note: if stacking this decorator with absl.testing's parameterized decorators,
+  those should be at the bottom of the stack.
+
+  For example, consider the following unittest:
+
+  ```python
+  class MyTests(testing_utils.KerasTestCase):
+
+    @testing_utils.run_all_keras_modes
+    def test_foo(self):
+      model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
+      optimizer = RMSPropOptimizer(learning_rate=0.001)
+      loss = 'mse'
+      metrics = ['mae']
+      model.compile(optimizer, loss, metrics=metrics,
+                    run_eagerly=testing_utils.should_run_eagerly())
+
+      inputs = np.zeros((10, 3))
+      targets = np.zeros((10, 4))
+      dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+      dataset = dataset.repeat(100)
+      dataset = dataset.batch(10)
+
+      model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1)
+
+  if __name__ == "__main__":
+    tf.test.main()
+  ```
+
+  This test will try compiling & fitting the small functional mlp using all
+  three Keras execution modes.
+
+  Args:
+    test_or_class: test method or class to be annotated. If None,
+      this method returns a decorator that can be applied to a test method or
+      test class. If it is not None this returns the decorator applied to the
+      test or class.
+    config: An optional config_pb2.ConfigProto to use to configure the
+      session when executing graphs.
+    always_skip_v1: If True, does not try running the legacy graph mode even
+      when Tensorflow v2 behavior is not enabled.
+
+  Returns:
+    Returns a decorator that will run the decorated test method multiple times.
+
+  Raises:
+    ImportError: If abseil parameterized is not installed or not included as
+      a target dependency.
+  """
+  params = [('_v2_eager', 'v2_eager'),
+            ('_v2_function', 'v2_function')]
+  if not (always_skip_v1 or tf2.enabled()):
+    params.append(('_v1_graph', 'v1_graph'))
+
+  def single_method_decorator(f):
+    """Decorator that constructs the test cases."""
+
+    # Use named_parameters so it can be individually run from the command line
+    @parameterized.named_parameters(*params)
+    @functools.wraps(f)
+    def decorated(self, run_mode, *args, **kwargs):
+      """A run of a single test case w/ specified run mode."""
+      if run_mode == 'v1_graph':
+        with context.graph_mode(), testing_utils.run_eagerly_scope(False):
+          with self.test_session(use_gpu=True, config=config):
+            f(self, *args, **kwargs)
+      elif run_mode == 'v2_function':
+        with context.eager_mode():
+          with testing_utils.run_eagerly_scope(False):
+            f(self, *args, **kwargs)
+      elif run_mode == 'v2_eager':
+        with context.eager_mode():
+          with testing_utils.run_eagerly_scope(True):
+            f(self, *args, **kwargs)
+      else:
+        return ValueError('Unknown run mode %s' % run_mode)
+
+    return decorated
+
+  return _test_or_class_decorator(test_or_class, single_method_decorator)
+
+
+def _test_or_class_decorator(test_or_class, single_method_decorator):
+  """Decorate a test or class with a decorator intended for one method.
+
+  If the test_or_class is a class:
+    This will apply the decorator to all test methods in the class.
+
+  If the test_or_class is an iterable of already-parameterized test cases:
+    This will apply the decorator to all the cases, and then flatten the
+    resulting cross-product of test cases. This allows stacking the Keras
+    parameterized decorators w/ each other, and to apply them to test methods
+    that have already been marked with an absl parameterized decorator.
+
+  Otherwise, treat the obj as a single method and apply the decorator directly.
+
+  Args:
+    test_or_class: A test method (that may have already been decorated with a
+      parameterized decorator, or a test class that extends
+      keras_parameterized.TestCase
+    single_method_decorator:
+      A parameterized decorator intended for a single test method.
+  Returns:
+    The decorated result.
+  """
+  def _decorate_test_or_class(obj):
+    if isinstance(obj, collections.Iterable):
+      return itertools.chain.from_iterable(
+          single_method_decorator(method) for method in obj)
+    if isinstance(obj, type):
+      cls = obj
+      for name, value in cls.__dict__.copy().items():
+        if callable(value) and name.startswith(
+            unittest.TestLoader.testMethodPrefix):
+          setattr(cls, name, single_method_decorator(value))
+
+      cls = type(cls).__new__(type(cls), cls.__name__, cls.__bases__,
+                              cls.__dict__.copy())
+      return cls
+
+    return single_method_decorator(obj)
+
+  if test_or_class is not None:
+    return _decorate_test_or_class(test_or_class)
+
+  return _decorate_test_or_class
diff --git a/tensorflow/python/keras/keras_parameterized_test.py b/tensorflow/python/keras/keras_parameterized_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0ff40cfc7a17114fad20a51f29a6aed89b56015
--- /dev/null
+++ b/tensorflow/python/keras/keras_parameterized_test.py
@@ -0,0 +1,552 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Keras testing_utils."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import unittest
+
+from absl.testing import parameterized
+
+from tensorflow.python import keras
+from tensorflow.python import tf2
+from tensorflow.python.eager import context
+from tensorflow.python.keras import keras_parameterized
+from tensorflow.python.keras import testing_utils
+from tensorflow.python.platform import googletest
+
+
+class KerasParameterizedTest(keras_parameterized.TestCase):
+
+  def test_run_with_all_model_types(self):
+    model_types = []
+    models = []
+
+    class ExampleTest(keras_parameterized.TestCase):
+
+      def runTest(self):
+        pass
+
+      @keras_parameterized.run_with_all_model_types
+      def testBody(self):
+        model_types.append(testing_utils.get_model_type())
+        models.append(testing_utils.get_small_mlp(1, 4, input_dim=3))
+
+    e = ExampleTest()
+    e.testBody_functional()
+    e.testBody_subclass()
+    e.testBody_sequential()
+
+    self.assertLen(model_types, 3)
+    self.assertAllEqual(model_types, [
+        "functional",
+        "subclass",
+        "sequential"
+    ])
+
+    # Validate that the models are what they should be
+    self.assertTrue(models[0]._is_graph_network)
+    self.assertFalse(models[1]._is_graph_network)
+    self.assertNotIsInstance(models[0], keras.models.Sequential)
+    self.assertNotIsInstance(models[1], keras.models.Sequential)
+    self.assertIsInstance(models[2], keras.models.Sequential)
+
+    ts = unittest.makeSuite(ExampleTest)
+    res = unittest.TestResult()
+    ts.run(res)
+
+    self.assertLen(model_types, 6)
+
+  def test_run_with_all_model_types_and_extra_params(self):
+    model_types = []
+    models = []
+
+    class ExampleTest(keras_parameterized.TestCase):
+
+      def runTest(self):
+        pass
+
+      @keras_parameterized.run_with_all_model_types
+      @parameterized.named_parameters(
+          [dict(testcase_name="_0", with_brackets=True),
+           dict(testcase_name="_1", with_brackets=False)])
+      def testBody(self, with_brackets):
+        with_brackets = "with_brackets" if with_brackets else "without_brackets"
+        model_types.append((with_brackets, testing_utils.get_model_type()))
+        models.append(testing_utils.get_small_mlp(1, 4, input_dim=3))
+
+    e = ExampleTest()
+    e.testBody_0_functional()
+    e.testBody_0_subclass()
+    e.testBody_0_sequential()
+    e.testBody_1_functional()
+    e.testBody_1_subclass()
+    e.testBody_1_sequential()
+
+    self.assertLen(model_types, 6)
+    self.assertAllEqual(model_types, [
+        ("with_brackets", "functional"),
+        ("with_brackets", "subclass"),
+        ("with_brackets", "sequential"),
+        ("without_brackets", "functional"),
+        ("without_brackets", "subclass"),
+        ("without_brackets", "sequential"),
+    ])
+
+    # Validate that the models are what they should be
+    self.assertTrue(models[0]._is_graph_network)
+    self.assertFalse(models[1]._is_graph_network)
+    self.assertNotIsInstance(models[0], keras.models.Sequential)
+    self.assertNotIsInstance(models[1], keras.models.Sequential)
+    self.assertIsInstance(models[2], keras.models.Sequential)
+
+    ts = unittest.makeSuite(ExampleTest)
+    res = unittest.TestResult()
+    ts.run(res)
+
+    self.assertLen(model_types, 12)
+
+  def test_run_with_all_model_types_exclude_one(self):
+    model_types = []
+    models = []
+
+    class ExampleTest(keras_parameterized.TestCase):
+
+      def runTest(self):
+        pass
+
+      @keras_parameterized.run_with_all_model_types(exclude_models="sequential")
+      def testBody(self):
+        model_types.append(testing_utils.get_model_type())
+        models.append(testing_utils.get_small_mlp(1, 4, input_dim=3))
+
+    e = ExampleTest()
+    if hasattr(e, "testBody_functional"):
+      e.testBody_functional()
+    if hasattr(e, "testBody_subclass"):
+      e.testBody_subclass()
+    if hasattr(e, "testBody_sequential"):
+      e.testBody_sequential()
+
+    self.assertLen(model_types, 2)
+    self.assertAllEqual(model_types, [
+        "functional",
+        "subclass"
+    ])
+
+    # Validate that the models are what they should be
+    self.assertTrue(models[0]._is_graph_network)
+    self.assertFalse(models[1]._is_graph_network)
+    self.assertNotIsInstance(models[0], keras.models.Sequential)
+    self.assertNotIsInstance(models[1], keras.models.Sequential)
+
+    ts = unittest.makeSuite(ExampleTest)
+    res = unittest.TestResult()
+    ts.run(res)
+
+    self.assertLen(model_types, 4)
+
+  def test_run_with_all_model_types_exclude_multiple(self):
+    model_types = []
+    models = []
+
+    class ExampleTest(keras_parameterized.TestCase):
+
+      def runTest(self):
+        pass
+
+      @keras_parameterized.run_with_all_model_types(
+          exclude_models=["sequential", "functional"])
+      def testBody(self):
+        model_types.append(testing_utils.get_model_type())
+        models.append(testing_utils.get_small_mlp(1, 4, input_dim=3))
+
+    e = ExampleTest()
+    if hasattr(e, "testBody_functional"):
+      e.testBody_functional()
+    if hasattr(e, "testBody_subclass"):
+      e.testBody_subclass()
+    if hasattr(e, "testBody_sequential"):
+      e.testBody_sequential()
+
+    self.assertLen(model_types, 1)
+    self.assertAllEqual(model_types, [
+        "subclass"
+    ])
+
+    # Validate that the models are what they should be
+    self.assertFalse(models[0]._is_graph_network)
+    self.assertNotIsInstance(models[0], keras.models.Sequential)
+
+    ts = unittest.makeSuite(ExampleTest)
+    res = unittest.TestResult()
+    ts.run(res)
+
+    self.assertLen(model_types, 2)
+
+  def test_run_all_keras_modes(self):
+    l = []
+
+    class ExampleTest(keras_parameterized.TestCase):
+
+      def runTest(self):
+        pass
+
+      @keras_parameterized.run_all_keras_modes
+      def testBody(self):
+        mode = "eager" if context.executing_eagerly() else "graph"
+        should_run_eagerly = testing_utils.should_run_eagerly()
+        l.append((mode, should_run_eagerly))
+
+    e = ExampleTest()
+    if not tf2.enabled():
+      e.testBody_v1_graph()
+    e.testBody_v2_eager()
+    e.testBody_v2_function()
+
+    if not tf2.enabled():
+      self.assertLen(l, 3)
+      self.assertAllEqual(l, [
+          ("graph", False),
+          ("eager", True),
+          ("eager", False),
+      ])
+
+      ts = unittest.makeSuite(ExampleTest)
+      res = unittest.TestResult()
+      ts.run(res)
+      self.assertLen(l, 6)
+    else:
+      self.assertLen(l, 2)
+      self.assertAllEqual(l, [
+          ("eager", True),
+          ("eager", False),
+      ])
+
+      ts = unittest.makeSuite(ExampleTest)
+      res = unittest.TestResult()
+      ts.run(res)
+      self.assertLen(l, 4)
+
+  def test_run_all_keras_modes_extra_params(self):
+    l = []
+
+    class ExampleTest(keras_parameterized.TestCase):
+
+      def runTest(self):
+        pass
+
+      @keras_parameterized.run_all_keras_modes
+      @parameterized.named_parameters(
+          [dict(testcase_name="_0", with_brackets=True),
+           dict(testcase_name="_1", with_brackets=False)])
+      def testBody(self, with_brackets):
+        mode = "eager" if context.executing_eagerly() else "graph"
+        with_brackets = "with_brackets" if with_brackets else "without_brackets"
+        should_run_eagerly = testing_utils.should_run_eagerly()
+        l.append((with_brackets, mode, should_run_eagerly))
+
+    e = ExampleTest()
+    if not tf2.enabled():
+      e.testBody_0_v1_graph()
+      e.testBody_1_v1_graph()
+
+    e.testBody_0_v2_eager()
+    e.testBody_0_v2_function()
+    e.testBody_1_v2_eager()
+    e.testBody_1_v2_function()
+
+    expected_combinations = {
+        ("with_brackets", "eager", True),
+        ("with_brackets", "eager", False),
+        ("without_brackets", "eager", True),
+        ("without_brackets", "eager", False),
+    }
+
+    if not tf2.enabled():
+      expected_combinations = expected_combinations.union({
+          ("with_brackets", "graph", False),
+          ("without_brackets", "graph", False),
+      })
+
+    self.assertLen(l, len(expected_combinations))
+    self.assertEqual(set(l), expected_combinations)
+
+    ts = unittest.makeSuite(ExampleTest)
+    res = unittest.TestResult()
+    ts.run(res)
+
+    self.assertLen(l, len(expected_combinations) * 2)
+
+  def test_run_all_keras_modes_always_skip_v1(self):
+    l = []
+
+    class ExampleTest(keras_parameterized.TestCase):
+
+      def runTest(self):
+        pass
+
+      @keras_parameterized.run_all_keras_modes(always_skip_v1=True)
+      def testBody(self):
+        mode = "eager" if context.executing_eagerly() else "graph"
+        should_run_eagerly = testing_utils.should_run_eagerly()
+        l.append((mode, should_run_eagerly))
+
+    e = ExampleTest()
+    if hasattr(e, "testBody_v1_graph"):
+      e.testBody_v1_graph()
+    if hasattr(e, "testBody_v2_eager"):
+      e.testBody_v2_eager()
+    if hasattr(e, "testBody_v2_function"):
+      e.testBody_v2_function()
+
+    self.assertLen(l, 2)
+    self.assertEqual(set(l), {
+        ("eager", True),
+        ("eager", False),
+    })
+
+  def test_run_all_keras_modes_with_all_model_types(self):
+    l = []
+
+    class ExampleTest(keras_parameterized.TestCase):
+
+      def runTest(self):
+        pass
+
+      @keras_parameterized.run_with_all_model_types
+      @keras_parameterized.run_all_keras_modes
+      def testBody(self):
+        mode = "eager" if context.executing_eagerly() else "graph"
+        should_run_eagerly = testing_utils.should_run_eagerly()
+        l.append((mode, should_run_eagerly, testing_utils.get_model_type()))
+
+    e = ExampleTest()
+    e.testBody_v2_eager_functional()
+    e.testBody_v2_function_functional()
+    e.testBody_v2_eager_sequential()
+    e.testBody_v2_function_sequential()
+    e.testBody_v2_eager_subclass()
+    e.testBody_v2_function_subclass()
+
+    if not tf2.enabled():
+      e.testBody_v1_graph_functional()
+      e.testBody_v1_graph_sequential()
+      e.testBody_v1_graph_subclass()
+
+    expected_combinations = {
+        ("eager", True, "functional"),
+        ("eager", False, "functional"),
+        ("eager", True, "sequential"),
+        ("eager", False, "sequential"),
+        ("eager", True, "subclass"),
+        ("eager", False, "subclass"),
+    }
+
+    if not tf2.enabled():
+      expected_combinations = expected_combinations.union({
+          ("graph", False, "functional"),
+          ("graph", False, "sequential"),
+          ("graph", False, "subclass"),
+      })
+
+    self.assertLen(l, len(expected_combinations))
+    self.assertEqual(set(l), expected_combinations)
+
+    ts = unittest.makeSuite(ExampleTest)
+    res = unittest.TestResult()
+    ts.run(res)
+
+    self.assertLen(l, len(expected_combinations) * 2)
+
+  def test_run_all_model_types_with_all_keras_modes(self):
+    l = []
+
+    class ExampleTest(keras_parameterized.TestCase):
+
+      def runTest(self):
+        pass
+
+      @keras_parameterized.run_all_keras_modes
+      @keras_parameterized.run_with_all_model_types
+      def testBody(self):
+        mode = "eager" if context.executing_eagerly() else "graph"
+        should_run_eagerly = testing_utils.should_run_eagerly()
+        l.append((mode, should_run_eagerly, testing_utils.get_model_type()))
+
+    e = ExampleTest()
+    e.testBody_functional_v2_eager()
+    e.testBody_functional_v2_function()
+    e.testBody_sequential_v2_eager()
+    e.testBody_sequential_v2_function()
+    e.testBody_subclass_v2_eager()
+    e.testBody_subclass_v2_function()
+
+    if not tf2.enabled():
+      e.testBody_functional_v1_graph()
+      e.testBody_sequential_v1_graph()
+      e.testBody_subclass_v1_graph()
+
+    expected_combinations = {
+        ("eager", True, "functional"),
+        ("eager", False, "functional"),
+        ("eager", True, "sequential"),
+        ("eager", False, "sequential"),
+        ("eager", True, "subclass"),
+        ("eager", False, "subclass"),
+    }
+
+    if not tf2.enabled():
+      expected_combinations = expected_combinations.union({
+          ("graph", False, "functional"),
+          ("graph", False, "sequential"),
+          ("graph", False, "subclass"),
+      })
+
+    self.assertLen(l, len(expected_combinations))
+    self.assertEqual(set(l), expected_combinations)
+
+    ts = unittest.makeSuite(ExampleTest)
+    res = unittest.TestResult()
+    ts.run(res)
+
+    self.assertLen(l, len(expected_combinations) * 2)
+
+  def test_run_all_keras_modes_with_all_model_types_annotate_class(self):
+    l = []
+
+    @keras_parameterized.run_with_all_model_types
+    @keras_parameterized.run_all_keras_modes
+    class ExampleTest(keras_parameterized.TestCase):
+
+      def runTest(self):
+        pass
+
+      @parameterized.named_parameters(dict(testcase_name="_arg",
+                                           arg=True))
+      def testBody(self, arg):
+        mode = "eager" if context.executing_eagerly() else "graph"
+        should_run_eagerly = testing_utils.should_run_eagerly()
+        l.append((mode, should_run_eagerly, testing_utils.get_model_type()))
+
+    e = ExampleTest()
+    e.testBody_arg_v2_eager_functional()
+    e.testBody_arg_v2_function_functional()
+    e.testBody_arg_v2_eager_sequential()
+    e.testBody_arg_v2_function_sequential()
+    e.testBody_arg_v2_eager_subclass()
+    e.testBody_arg_v2_function_subclass()
+
+    if not tf2.enabled():
+      e.testBody_arg_v1_graph_functional()
+      e.testBody_arg_v1_graph_sequential()
+      e.testBody_arg_v1_graph_subclass()
+
+    expected_combinations = {
+        ("eager", True, "functional"),
+        ("eager", False, "functional"),
+        ("eager", True, "sequential"),
+        ("eager", False, "sequential"),
+        ("eager", True, "subclass"),
+        ("eager", False, "subclass"),
+    }
+
+    if not tf2.enabled():
+      expected_combinations = expected_combinations.union({
+          ("graph", False, "functional"),
+          ("graph", False, "sequential"),
+          ("graph", False, "subclass"),
+      })
+
+    self.assertLen(l, len(expected_combinations))
+    self.assertEqual(set(l), expected_combinations)
+
+    ts = unittest.makeSuite(ExampleTest)
+    res = unittest.TestResult()
+    ts.run(res)
+
+    self.assertLen(l, len(expected_combinations) * 2)
+
+  def test_run_all_keras_modes_with_all_model_types_annotate_class_2(self):
+    l = []
+
+    @keras_parameterized.run_with_all_model_types
+    class ExampleTest(keras_parameterized.TestCase):
+
+      def runTest(self):
+        pass
+
+      @keras_parameterized.run_all_keras_modes
+      @parameterized.named_parameters(dict(testcase_name="_arg",
+                                           arg=True))
+      def testBody(self, arg):
+        mode = "eager" if context.executing_eagerly() else "graph"
+        should_run_eagerly = testing_utils.should_run_eagerly()
+        l.append((mode, should_run_eagerly, testing_utils.get_model_type()))
+
+    e = ExampleTest()
+    e.testBody_arg_v2_eager_functional()
+    e.testBody_arg_v2_function_functional()
+    e.testBody_arg_v2_eager_sequential()
+    e.testBody_arg_v2_function_sequential()
+    e.testBody_arg_v2_eager_subclass()
+    e.testBody_arg_v2_function_subclass()
+
+    if not tf2.enabled():
+      e.testBody_arg_v1_graph_functional()
+      e.testBody_arg_v1_graph_sequential()
+      e.testBody_arg_v1_graph_subclass()
+
+    expected_combinations = {
+        ("eager", True, "functional"),
+        ("eager", False, "functional"),
+        ("eager", True, "sequential"),
+        ("eager", False, "sequential"),
+        ("eager", True, "subclass"),
+        ("eager", False, "subclass"),
+    }
+
+    if not tf2.enabled():
+      expected_combinations = expected_combinations.union({
+          ("graph", False, "functional"),
+          ("graph", False, "sequential"),
+          ("graph", False, "subclass"),
+      })
+
+    self.assertLen(l, len(expected_combinations))
+    self.assertEqual(set(l), expected_combinations)
+
+    ts = unittest.makeSuite(ExampleTest)
+    res = unittest.TestResult()
+    ts.run(res)
+
+    self.assertLen(l, len(expected_combinations) * 2)
+
+  @keras_parameterized.run_all_keras_modes
+  @parameterized.named_parameters(dict(testcase_name="argument",
+                                       arg=True))
+  def test_run_all_keras_modes_extra_params_2(self, arg):
+    self.assertEqual(arg, True)
+
+  @keras_parameterized.run_with_all_model_types
+  @parameterized.named_parameters(dict(testcase_name="argument",
+                                       arg=True))
+  def test_run_with_all_model_types_extra_params_2(self, arg):
+    self.assertEqual(arg, True)
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/python/keras/layers/__init__.py b/tensorflow/python/keras/layers/__init__.py
index 49990b6bf4f617dff1f6dc827ba03aa66f41f568..285388f340fc9aa6890a7d141127d1192d565528 100644
--- a/tensorflow/python/keras/layers/__init__.py
+++ b/tensorflow/python/keras/layers/__init__.py
@@ -149,6 +149,8 @@ from tensorflow.python.keras.layers.recurrent import PeepholeLSTMCell
 from tensorflow.python.keras.layers.recurrent import SimpleRNN
 from tensorflow.python.keras.layers.recurrent import GRU
 from tensorflow.python.keras.layers.recurrent import LSTM
+from tensorflow.python.keras.layers.recurrent import UnifiedGRU
+from tensorflow.python.keras.layers.recurrent import UnifiedLSTM
 
 # Convolutional-recurrent layers.
 from tensorflow.python.keras.layers.convolutional_recurrent import ConvLSTM2D
diff --git a/tensorflow/python/keras/layers/advanced_activations_test.py b/tensorflow/python/keras/layers/advanced_activations_test.py
index 4aadf535e0cd2161b37cb26eb4cdd9a1da457a68..f32bb457c825d9769c6dccf625d9318c07843237 100644
--- a/tensorflow/python/keras/layers/advanced_activations_test.py
+++ b/tensorflow/python/keras/layers/advanced_activations_test.py
@@ -20,13 +20,13 @@ from __future__ import print_function
 
 from tensorflow.python import keras
 from tensorflow.python.eager import context
-from tensorflow.python.framework import test_util as tf_test_util
+from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
 
 
-@tf_test_util.run_all_in_graph_and_eager_modes
-class AdvancedActivationsTest(test.TestCase):
+@keras_parameterized.run_all_keras_modes
+class AdvancedActivationsTest(keras_parameterized.TestCase):
 
   def test_leaky_relu(self):
     for alpha in [0., .5, -1.]:
diff --git a/tensorflow/python/keras/layers/convolutional_test.py b/tensorflow/python/keras/layers/convolutional_test.py
index d3339a8413095cae2b74e19d768fcda0e1b4e4fb..81af06b4eca3a962d95b59e73dc3148d0312c733 100644
--- a/tensorflow/python/keras/layers/convolutional_test.py
+++ b/tensorflow/python/keras/layers/convolutional_test.py
@@ -24,13 +24,13 @@ import numpy as np
 
 from tensorflow.python import keras
 from tensorflow.python.eager import context
-from tensorflow.python.framework import test_util as tf_test_util
+from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
 
 
-@tf_test_util.run_all_in_graph_and_eager_modes
-class Convolution1DTest(test.TestCase):
+@keras_parameterized.run_all_keras_modes
+class Convolution1DTest(keras_parameterized.TestCase):
 
   def _run_test(self, kwargs, arg, values):
     num_samples = 2
@@ -100,8 +100,8 @@ class Convolution1DTest(test.TestCase):
       self.assertEqual(layer.bias.constraint, b_constraint)
 
 
-@tf_test_util.run_all_in_graph_and_eager_modes
-class Conv2DTest(test.TestCase):
+@keras_parameterized.run_all_keras_modes
+class Conv2DTest(keras_parameterized.TestCase):
 
   def _run_test(self, kwargs, arg, values):
     num_samples = 2
@@ -175,8 +175,8 @@ class Conv2DTest(test.TestCase):
       self.assertEqual(layer.bias.constraint, b_constraint)
 
 
-@tf_test_util.run_all_in_graph_and_eager_modes
-class Conv2DTransposeTest(test.TestCase):
+@keras_parameterized.run_all_keras_modes
+class Conv2DTransposeTest(keras_parameterized.TestCase):
 
   def _run_test(self, kwargs, arg, values):
     num_samples = 2
@@ -267,8 +267,8 @@ class Conv2DTransposeTest(test.TestCase):
                              expected_output=expected_output)
 
 
-@tf_test_util.run_all_in_graph_and_eager_modes
-class Conv3DTransposeTest(test.TestCase):
+@keras_parameterized.run_all_keras_modes
+class Conv3DTransposeTest(keras_parameterized.TestCase):
 
   def _run_test(self, kwargs, arg, values):
     num_samples = 2
@@ -336,8 +336,8 @@ class Conv3DTransposeTest(test.TestCase):
       self.assertEqual(layer.bias.constraint, b_constraint)
 
 
-@tf_test_util.run_all_in_graph_and_eager_modes
-class SeparableConv1DTest(test.TestCase):
+@keras_parameterized.run_all_keras_modes
+class SeparableConv1DTest(keras_parameterized.TestCase):
 
   def _run_test(self, kwargs, arg, values):
     num_samples = 2
@@ -411,8 +411,8 @@ class SeparableConv1DTest(test.TestCase):
       self.assertEqual(layer.bias.constraint, b_constraint)
 
 
-@tf_test_util.run_all_in_graph_and_eager_modes
-class SeparableConv2DTest(test.TestCase):
+@keras_parameterized.run_all_keras_modes
+class SeparableConv2DTest(keras_parameterized.TestCase):
 
   def _run_test(self, kwargs, arg, values):
     num_samples = 2
@@ -489,8 +489,8 @@ class SeparableConv2DTest(test.TestCase):
       self.assertEqual(layer.bias.constraint, b_constraint)
 
 
-@tf_test_util.run_all_in_graph_and_eager_modes
-class Conv3DTest(test.TestCase):
+@keras_parameterized.run_all_keras_modes
+class Conv3DTest(keras_parameterized.TestCase):
 
   def _run_test(self, kwargs, arg, values):
     num_samples = 2
@@ -557,8 +557,8 @@ class Conv3DTest(test.TestCase):
       self.assertEqual(layer.bias.constraint, b_constraint)
 
 
-@tf_test_util.run_all_in_graph_and_eager_modes
-class ZeroPaddingTest(test.TestCase):
+@keras_parameterized.run_all_keras_modes
+class ZeroPaddingTest(keras_parameterized.TestCase):
 
   def test_zero_padding_1d(self):
     num_samples = 2
@@ -726,8 +726,8 @@ class ZeroPaddingTest(test.TestCase):
       keras.layers.ZeroPadding3D(padding=None)
 
 
-@tf_test_util.run_all_in_graph_and_eager_modes
-class UpSamplingTest(test.TestCase):
+@keras_parameterized.run_all_keras_modes
+class UpSamplingTest(keras_parameterized.TestCase):
 
   def test_upsampling_1d(self):
     with self.session(use_gpu=True):
@@ -875,8 +875,8 @@ class UpSamplingTest(test.TestCase):
               np.testing.assert_allclose(np_output, expected_out)
 
 
-@tf_test_util.run_all_in_graph_and_eager_modes
-class CroppingTest(test.TestCase):
+@keras_parameterized.run_all_keras_modes
+class CroppingTest(keras_parameterized.TestCase):
 
   def test_cropping_1d(self):
     num_samples = 2
@@ -1017,8 +1017,8 @@ class CroppingTest(test.TestCase):
       keras.layers.Cropping3D(cropping=None)
 
 
-@tf_test_util.run_all_in_graph_and_eager_modes
-class DepthwiseConv2DTest(test.TestCase):
+@keras_parameterized.run_all_keras_modes
+class DepthwiseConv2DTest(keras_parameterized.TestCase):
 
   def _run_test(self, kwargs, arg, values):
     num_samples = 2
@@ -1044,17 +1044,18 @@ class DepthwiseConv2DTest(test.TestCase):
       self._run_test(kwargs, 'data_format', ['channels_first'])
     self._run_test(kwargs, 'depth_multiplier', [1, 2])
 
-    kwargs = {'kernel_size': 3,
-              'padding': 'valid',
-              'data_format': 'channels_first',
-              'activation': None,
-              'depthwise_regularizer': 'l2',
-              'bias_regularizer': 'l2',
-              'activity_regularizer': 'l2',
-              'depthwise_constraint': 'unit_norm',
-              'use_bias': True,
-              'strides': (2, 2),
-             }
+    kwargs = {
+        'kernel_size': 3,
+        'padding': 'valid',
+        'data_format': 'channels_last',
+        'activation': None,
+        'depthwise_regularizer': 'l2',
+        'bias_regularizer': 'l2',
+        'activity_regularizer': 'l2',
+        'depthwise_constraint': 'unit_norm',
+        'use_bias': True,
+        'strides': (2, 2),
+    }
     self._run_test(kwargs, 'depth_multiplier', [1])
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/layers/core.py b/tensorflow/python/keras/layers/core.py
index 56dd70558cc6c1bf41211924ad5f8f9750ce8993..39bcb82c720f2530cfed3d16e61013ce3ffdb852 100644
--- a/tensorflow/python/keras/layers/core.py
+++ b/tensorflow/python/keras/layers/core.py
@@ -506,6 +506,9 @@ class Permute(Layer):
 class Flatten(Layer):
   """Flattens the input. Does not affect the batch size.
 
+  If inputs are shaped `(batch,)` without a channel dimension, then flattening
+  adds an extra channel dimension and output shapes are `(batch, 1)`.
+
   Arguments:
       data_format: A string,
           one of `channels_last` (default) or `channels_first`.
@@ -534,23 +537,28 @@ class Flatten(Layer):
   def __init__(self, data_format=None, **kwargs):
     super(Flatten, self).__init__(**kwargs)
     self.data_format = conv_utils.normalize_data_format(data_format)
-    self.input_spec = InputSpec(min_ndim=2)
+    self.input_spec = InputSpec(min_ndim=1)
 
   def call(self, inputs):
-    if self.data_format == 'channels_first':
+    if (self.data_format == 'channels_first'
+        and K.ndim(inputs) is not None and K.ndim(inputs) > 1):
       permutation = [0]
       permutation.extend([i for i in
                           range(2, K.ndim(inputs))])
       permutation.append(1)
       inputs = array_ops.transpose(inputs, perm=permutation)
 
-    outputs = array_ops.reshape(inputs, (array_ops.shape(inputs)[0], -1))
+    outputs = array_ops.reshape(
+        inputs, (tensor_shape.dimension_value(inputs.shape[0]) or
+                 array_ops.shape(inputs)[0], -1))
     if not context.executing_eagerly():
       outputs.set_shape(self.compute_output_shape(inputs.get_shape()))
     return outputs
 
   def compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    if not input_shape:
+      output_shape = tensor_shape.TensorShape([1])
     output_shape = [input_shape[0]]
     if all(input_shape[1:]):
       output_shape += [np.prod(input_shape[1:])]
diff --git a/tensorflow/python/keras/layers/core_test.py b/tensorflow/python/keras/layers/core_test.py
index aad6ab8171ee6e7ff2d0d24b6dc37f556ddc6476..9df40f806fa2cd78699218298b6d31199ed126d6 100644
--- a/tensorflow/python/keras/layers/core_test.py
+++ b/tensorflow/python/keras/layers/core_test.py
@@ -22,43 +22,36 @@ import numpy as np
 
 from tensorflow.python import keras
 from tensorflow.python.eager import context
-from tensorflow.python.framework import test_util as tf_test_util
+from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
-class CoreLayersTest(test.TestCase):
-
-  def test_masking(self):
-    with self.cached_session():
-      testing_utils.layer_test(
-          keras.layers.Masking, kwargs={}, input_shape=(3, 2, 3))
+@keras_parameterized.run_all_keras_modes
+class DropoutLayersTest(keras_parameterized.TestCase):
 
   def test_dropout(self):
-    with self.cached_session():
-      testing_utils.layer_test(
-          keras.layers.Dropout, kwargs={'rate': 0.5}, input_shape=(3, 2))
+    testing_utils.layer_test(
+        keras.layers.Dropout, kwargs={'rate': 0.5}, input_shape=(3, 2))
 
-    with self.cached_session():
-      testing_utils.layer_test(
-          keras.layers.Dropout,
-          kwargs={'rate': 0.5,
-                  'noise_shape': [3, 1]},
-          input_shape=(3, 2))
-
-    # https://github.com/tensorflow/tensorflow/issues/14819
-    with self.cached_session():
-      dropout = keras.layers.Dropout(0.5)
-      self.assertEqual(True, dropout.supports_masking)
-
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_spatial_dropout(self):
+    testing_utils.layer_test(
+        keras.layers.Dropout,
+        kwargs={'rate': 0.5,
+                'noise_shape': [3, 1]},
+        input_shape=(3, 2))
+
+  def test_dropout_supports_masking(self):
+    dropout = keras.layers.Dropout(0.5)
+    self.assertEqual(True, dropout.supports_masking)
+
+  def test_spatial_dropout_1d(self):
     testing_utils.layer_test(
         keras.layers.SpatialDropout1D,
         kwargs={'rate': 0.5},
         input_shape=(2, 3, 4))
 
+  def test_spatial_dropout_2d(self):
     testing_utils.layer_test(
         keras.layers.SpatialDropout2D,
         kwargs={'rate': 0.5},
@@ -69,6 +62,7 @@ class CoreLayersTest(test.TestCase):
         kwargs={'rate': 0.5, 'data_format': 'channels_first'},
         input_shape=(2, 3, 4, 5))
 
+  def test_spatial_dropout_3d(self):
     testing_utils.layer_test(
         keras.layers.SpatialDropout3D,
         kwargs={'rate': 0.5},
@@ -79,80 +73,9 @@ class CoreLayersTest(test.TestCase):
         kwargs={'rate': 0.5, 'data_format': 'channels_first'},
         input_shape=(2, 3, 4, 4, 5))
 
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_activation(self):
-    # with string argument
-    testing_utils.layer_test(
-        keras.layers.Activation,
-        kwargs={'activation': 'relu'},
-        input_shape=(3, 2))
-
-    # with function argument
-    testing_utils.layer_test(
-        keras.layers.Activation,
-        kwargs={'activation': keras.backend.relu},
-        input_shape=(3, 2))
-
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_reshape(self):
-    testing_utils.layer_test(
-        keras.layers.Reshape,
-        kwargs={'target_shape': (8, 1)},
-        input_shape=(3, 2, 4))
-
-    testing_utils.layer_test(
-        keras.layers.Reshape,
-        kwargs={'target_shape': (-1, 1)},
-        input_shape=(3, 2, 4))
-
-    testing_utils.layer_test(
-        keras.layers.Reshape,
-        kwargs={'target_shape': (1, -1)},
-        input_shape=(3, 2, 4))
-
-    testing_utils.layer_test(
-        keras.layers.Reshape,
-        kwargs={'target_shape': (-1, 1)},
-        input_shape=(None, None, 2))
-
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_permute(self):
-    testing_utils.layer_test(
-        keras.layers.Permute, kwargs={'dims': (2, 1)}, input_shape=(3, 2, 4))
-
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_permute_errors_on_invalid_starting_dims_index(self):
-    with self.assertRaisesRegexp(ValueError, r'Invalid permutation .*dims.*'):
-      testing_utils.layer_test(
-          keras.layers.Permute,
-          kwargs={'dims': (0, 1, 2)}, input_shape=(3, 2, 4))
-
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_permute_errors_on_invalid_set_of_dims_indices(self):
-    with self.assertRaisesRegexp(ValueError, r'Invalid permutation .*dims.*'):
-      testing_utils.layer_test(
-          keras.layers.Permute,
-          kwargs={'dims': (1, 4, 2)}, input_shape=(3, 2, 4))
-
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_flatten(self):
-    testing_utils.layer_test(
-        keras.layers.Flatten, kwargs={}, input_shape=(3, 2, 4))
 
-    # Test channels_first
-    inputs = np.random.random((10, 3, 5, 5)).astype('float32')
-    outputs = testing_utils.layer_test(
-        keras.layers.Flatten,
-        kwargs={'data_format': 'channels_first'},
-        input_data=inputs)
-    target_outputs = np.reshape(
-        np.transpose(inputs, (0, 2, 3, 1)), (-1, 5 * 5 * 3))
-    self.assertAllClose(outputs, target_outputs)
-
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_repeat_vector(self):
-    testing_utils.layer_test(
-        keras.layers.RepeatVector, kwargs={'n': 3}, input_shape=(3, 2))
+@keras_parameterized.run_all_keras_modes
+class LambdaLayerTest(keras_parameterized.TestCase):
 
   def test_lambda(self):
     testing_utils.layer_test(
@@ -188,7 +111,6 @@ class CoreLayersTest(test.TestCase):
     config = ld.get_config()
     ld = keras.layers.Lambda.from_config(config)
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_lambda_multiple_inputs(self):
     ld = keras.layers.Lambda(lambda x: x[0], output_shape=lambda x: x[0])
     x1 = np.ones([3, 2], np.float32)
@@ -196,64 +118,19 @@ class CoreLayersTest(test.TestCase):
     out = ld([x1, x2])
     self.assertAllEqual(out.shape, [3, 2])
 
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_dense(self):
-    testing_utils.layer_test(
-        keras.layers.Dense, kwargs={'units': 3}, input_shape=(3, 2))
-
-    testing_utils.layer_test(
-        keras.layers.Dense, kwargs={'units': 3}, input_shape=(3, 4, 2))
-
-    testing_utils.layer_test(
-        keras.layers.Dense, kwargs={'units': 3}, input_shape=(None, None, 2))
-
-    testing_utils.layer_test(
-        keras.layers.Dense, kwargs={'units': 3}, input_shape=(3, 4, 5, 2))
-
-  def test_dense_regularization(self):
-    with self.cached_session():
-      layer = keras.layers.Dense(
-          3,
-          kernel_regularizer=keras.regularizers.l1(0.01),
-          bias_regularizer='l1',
-          activity_regularizer='l2',
-          name='dense_reg')
-      layer(keras.backend.variable(np.ones((2, 4))))
-      self.assertEqual(3, len(layer.losses))
-
-  def test_dense_constraints(self):
-    with self.cached_session():
-      k_constraint = keras.constraints.max_norm(0.01)
-      b_constraint = keras.constraints.max_norm(0.01)
-      layer = keras.layers.Dense(
-          3, kernel_constraint=k_constraint, bias_constraint=b_constraint)
-      layer(keras.backend.variable(np.ones((2, 4))))
-      self.assertEqual(layer.kernel.constraint, k_constraint)
-      self.assertEqual(layer.bias.constraint, b_constraint)
-
-  def test_activity_regularization(self):
-    with self.cached_session():
-      layer = keras.layers.ActivityRegularization(l1=0.1)
-      layer(keras.backend.variable(np.ones((2, 4))))
-      self.assertEqual(1, len(layer.losses))
-      _ = layer.get_config()
-
   def test_lambda_output_shape(self):
-    with self.cached_session():
-      l = keras.layers.Lambda(lambda x: x + 1, output_shape=(1, 1))
-      l(keras.backend.variable(np.ones((1, 1))))
-      self.assertEqual((1, 1), l.get_config()['output_shape'])
+    l = keras.layers.Lambda(lambda x: x + 1, output_shape=(1, 1))
+    l(keras.backend.variable(np.ones((1, 1))))
+    self.assertEqual((1, 1), l.get_config()['output_shape'])
 
   def test_lambda_output_shape_function(self):
     def get_output_shape(input_shape):
       return 1 * input_shape
 
-    with self.cached_session():
-      l = keras.layers.Lambda(lambda x: x + 1, output_shape=get_output_shape)
-      l(keras.backend.variable(np.ones((1, 1))))
-      self.assertEqual('lambda', l.get_config()['output_shape_type'])
+    l = keras.layers.Lambda(lambda x: x + 1, output_shape=get_output_shape)
+    l(keras.backend.variable(np.ones((1, 1))))
+    self.assertEqual('lambda', l.get_config()['output_shape_type'])
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_lambda_output_shape_autocalculate_multiple_inputs(self):
 
     def lambda_fn(x):
@@ -263,7 +140,6 @@ class CoreLayersTest(test.TestCase):
     output_shape = l.compute_output_shape([(10, 10), (10, 20)])
     self.assertAllEqual((10, 20), output_shape)
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_lambda_output_shape_list_multiple_outputs(self):
 
     def lambda_fn(x):
@@ -273,7 +149,6 @@ class CoreLayersTest(test.TestCase):
     output_shape = l.compute_output_shape([(10, 10), (10, 20)])
     self.assertAllEqual([(10, 10), (10, 20)], output_shape)
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_lambda_output_shape_tuple_with_none(self):
 
     def lambda_fn(x):
@@ -283,7 +158,6 @@ class CoreLayersTest(test.TestCase):
     output_shape = l.compute_output_shape((5, 10, 20))
     self.assertAllEqual([5, None, 10], output_shape.as_list())
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_lambda_output_shape_function_multiple_outputs(self):
 
     def lambda_fn(x):
@@ -297,19 +171,144 @@ class CoreLayersTest(test.TestCase):
     self.assertAllEqual([(10, 10), (10, 20)], output_shape)
 
   def test_lambda_config_serialization(self):
-    with self.cached_session():
-      # test serialization with output_shape and output_shape_type
-      layer = keras.layers.Lambda(lambda x: x + 1, output_shape=(1, 1))
-      layer(keras.backend.variable(np.ones((1, 1))))
-      config = layer.get_config()
-      layer = keras.layers.deserialize({
-          'class_name': 'Lambda',
-          'config': config
-      })
-
-      layer = keras.layers.Lambda.from_config(config)
-
-  @tf_test_util.run_in_graph_and_eager_modes
+    # Test serialization with output_shape and output_shape_type
+    layer = keras.layers.Lambda(lambda x: x + 1, output_shape=(1, 1))
+    layer(keras.backend.variable(np.ones((1, 1))))
+    config = layer.get_config()
+    layer = keras.layers.deserialize({
+        'class_name': 'Lambda',
+        'config': config
+    })
+    layer = keras.layers.Lambda.from_config(config)
+
+
+@keras_parameterized.run_all_keras_modes
+class CoreLayersTest(keras_parameterized.TestCase):
+
+  def test_masking(self):
+    testing_utils.layer_test(
+        keras.layers.Masking, kwargs={}, input_shape=(3, 2, 3))
+
+  def test_activation(self):
+    # with string argument
+    testing_utils.layer_test(
+        keras.layers.Activation,
+        kwargs={'activation': 'relu'},
+        input_shape=(3, 2))
+
+    # with function argument
+    testing_utils.layer_test(
+        keras.layers.Activation,
+        kwargs={'activation': keras.backend.relu},
+        input_shape=(3, 2))
+
+  def test_reshape(self):
+    testing_utils.layer_test(
+        keras.layers.Reshape,
+        kwargs={'target_shape': (8, 1)},
+        input_shape=(3, 2, 4))
+
+    testing_utils.layer_test(
+        keras.layers.Reshape,
+        kwargs={'target_shape': (-1, 1)},
+        input_shape=(3, 2, 4))
+
+    testing_utils.layer_test(
+        keras.layers.Reshape,
+        kwargs={'target_shape': (1, -1)},
+        input_shape=(3, 2, 4))
+
+    testing_utils.layer_test(
+        keras.layers.Reshape,
+        kwargs={'target_shape': (-1, 1)},
+        input_shape=(None, None, 2))
+
+  def test_permute(self):
+    testing_utils.layer_test(
+        keras.layers.Permute, kwargs={'dims': (2, 1)}, input_shape=(3, 2, 4))
+
+  def test_permute_errors_on_invalid_starting_dims_index(self):
+    with self.assertRaisesRegexp(ValueError, r'Invalid permutation .*dims.*'):
+      testing_utils.layer_test(
+          keras.layers.Permute,
+          kwargs={'dims': (0, 1, 2)}, input_shape=(3, 2, 4))
+
+  def test_permute_errors_on_invalid_set_of_dims_indices(self):
+    with self.assertRaisesRegexp(ValueError, r'Invalid permutation .*dims.*'):
+      testing_utils.layer_test(
+          keras.layers.Permute,
+          kwargs={'dims': (1, 4, 2)}, input_shape=(3, 2, 4))
+
+  def test_flatten(self):
+    testing_utils.layer_test(
+        keras.layers.Flatten, kwargs={}, input_shape=(3, 2, 4))
+
+    # Test channels_first
+    inputs = np.random.random((10, 3, 5, 5)).astype('float32')
+    outputs = testing_utils.layer_test(
+        keras.layers.Flatten,
+        kwargs={'data_format': 'channels_first'},
+        input_data=inputs)
+    target_outputs = np.reshape(
+        np.transpose(inputs, (0, 2, 3, 1)), (-1, 5 * 5 * 3))
+    self.assertAllClose(outputs, target_outputs)
+
+  def test_flatten_scalar_channels(self):
+    testing_utils.layer_test(
+        keras.layers.Flatten, kwargs={}, input_shape=(3,))
+
+    # Test channels_first
+    inputs = np.random.random((10,)).astype('float32')
+    outputs = testing_utils.layer_test(
+        keras.layers.Flatten,
+        kwargs={'data_format': 'channels_first'},
+        input_data=inputs)
+    target_outputs = np.expand_dims(inputs, -1)
+    self.assertAllClose(outputs, target_outputs)
+
+  def test_repeat_vector(self):
+    testing_utils.layer_test(
+        keras.layers.RepeatVector, kwargs={'n': 3}, input_shape=(3, 2))
+
+  def test_dense(self):
+    testing_utils.layer_test(
+        keras.layers.Dense, kwargs={'units': 3}, input_shape=(3, 2))
+
+    testing_utils.layer_test(
+        keras.layers.Dense, kwargs={'units': 3}, input_shape=(3, 4, 2))
+
+    testing_utils.layer_test(
+        keras.layers.Dense, kwargs={'units': 3}, input_shape=(None, None, 2))
+
+    testing_utils.layer_test(
+        keras.layers.Dense, kwargs={'units': 3}, input_shape=(3, 4, 5, 2))
+
+  def test_dense_regularization(self):
+    layer = keras.layers.Dense(
+        3,
+        kernel_regularizer=keras.regularizers.l1(0.01),
+        bias_regularizer='l1',
+        activity_regularizer='l2',
+        name='dense_reg')
+    layer(keras.backend.variable(np.ones((2, 4))))
+    self.assertEqual(3, len(layer.losses))
+
+  def test_dense_constraints(self):
+    k_constraint = keras.constraints.max_norm(0.01)
+    b_constraint = keras.constraints.max_norm(0.01)
+    layer = keras.layers.Dense(
+        3, kernel_constraint=k_constraint, bias_constraint=b_constraint)
+    layer(keras.backend.variable(np.ones((2, 4))))
+    self.assertEqual(layer.kernel.constraint, k_constraint)
+    self.assertEqual(layer.bias.constraint, b_constraint)
+
+  def test_activity_regularization(self):
+    layer = keras.layers.ActivityRegularization(l1=0.1)
+    layer(keras.backend.variable(np.ones((2, 4))))
+    self.assertEqual(1, len(layer.losses))
+    config = layer.get_config()
+    self.assertEqual(config.pop('l1'), 0.1)
+
   def test_numpy_inputs(self):
     if context.executing_eagerly():
       layer = keras.layers.RepeatVector(2)
diff --git a/tensorflow/python/keras/layers/cudnn_recurrent.py b/tensorflow/python/keras/layers/cudnn_recurrent.py
index 81f292817fd989ee0aa256ada64e09b32a79ac2b..e9925eeba655b9ce067b114cfd9db5cef1a366ef 100644
--- a/tensorflow/python/keras/layers/cudnn_recurrent.py
+++ b/tensorflow/python/keras/layers/cudnn_recurrent.py
@@ -26,6 +26,7 @@ from tensorflow.python.keras import constraints
 from tensorflow.python.keras import initializers
 from tensorflow.python.keras import regularizers
 from tensorflow.python.keras.engine.input_spec import InputSpec
+from tensorflow.python.keras.layers import recurrent
 from tensorflow.python.keras.layers.recurrent import RNN
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_cudnn_rnn_ops
@@ -80,11 +81,6 @@ class _CuDNNRNN(RNN):
     self._num_inputs = None
     self._vector_shape = constant_op.constant([-1])
 
-  def _canonical_to_params(self, weights, biases):
-    weights = [array_ops.reshape(x, self._vector_shape) for x in weights]
-    biases = [array_ops.reshape(x, self._vector_shape) for x in biases]
-    return array_ops.concat(weights + biases, axis=0)
-
   def call(self, inputs, mask=None, training=None, initial_state=None):
     if isinstance(mask, list):
       mask = mask[0]
@@ -162,7 +158,7 @@ class _CuDNNRNN(RNN):
         RNN, self).get_losses_for(inputs=inputs)
 
 
-@tf_export('keras.layers.CuDNNGRU')
+@tf_export(v1=['keras.layers.CuDNNGRU'])
 class CuDNNGRU(_CuDNNRNN):
   """Fast GRU implementation backed by cuDNN.
 
@@ -279,7 +275,7 @@ class CuDNNGRU(_CuDNNRNN):
     input_h = initial_state[0]
     input_h = array_ops.expand_dims(input_h, axis=0)
 
-    params = self._canonical_to_params(
+    params = recurrent._canonical_to_params(    # pylint: disable=protected-access
         weights=[
             self.kernel[:, self.units:self.units * 2],
             self.kernel[:, :self.units],
@@ -296,7 +292,7 @@ class CuDNNGRU(_CuDNNRNN):
             self.bias[self.units * 3:self.units * 4],
             self.bias[self.units * 5:],
         ],
-    )
+        shape=self._vector_shape)
 
     outputs, h, _, _ = gen_cudnn_rnn_ops.cudnn_rnn(
         inputs,
@@ -339,7 +335,7 @@ class CuDNNGRU(_CuDNNRNN):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@tf_export('keras.layers.CuDNNLSTM')
+@tf_export(v1=['keras.layers.CuDNNLSTM'])
 class CuDNNLSTM(_CuDNNRNN):
   """Fast LSTM implementation backed by cuDNN.
 
@@ -474,7 +470,7 @@ class CuDNNLSTM(_CuDNNRNN):
     input_h = array_ops.expand_dims(input_h, axis=0)
     input_c = array_ops.expand_dims(input_c, axis=0)
 
-    params = self._canonical_to_params(
+    params = recurrent._canonical_to_params(    # pylint: disable=protected-access
         weights=[
             self.kernel[:, :self.units],
             self.kernel[:, self.units:self.units * 2],
@@ -495,7 +491,7 @@ class CuDNNLSTM(_CuDNNRNN):
             self.bias[self.units * 6:self.units * 7],
             self.bias[self.units * 7:],
         ],
-    )
+        shape=self._vector_shape)
 
     outputs, h, c, _ = gen_cudnn_rnn_ops.cudnn_rnn(
         inputs,
diff --git a/tensorflow/python/keras/layers/embeddings_test.py b/tensorflow/python/keras/layers/embeddings_test.py
index aaa17b7e96078dea9b84e0f0e62a4bdcbe071fa0..ac3acad7accb2a9d9d8858af973b61023dcfbc22 100644
--- a/tensorflow/python/keras/layers/embeddings_test.py
+++ b/tensorflow/python/keras/layers/embeddings_test.py
@@ -23,15 +23,19 @@ import numpy as np
 from tensorflow.python import keras
 from tensorflow.python.eager import backprop
 from tensorflow.python.framework import test_util as tf_test_util
+from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
 from tensorflow.python.training import adagrad
 
 
-class EmbeddingTest(test.TestCase):
+class EmbeddingTest(keras_parameterized.TestCase):
 
-  @tf_test_util.run_in_graph_and_eager_modes(use_gpu=False)
+  @keras_parameterized.run_all_keras_modes
   def test_embedding(self):
+    if tf_test_util.is_gpu_available():
+      self.skipTest('Only test embedding on CPU.')
+
     testing_utils.layer_test(
         keras.layers.Embedding,
         kwargs={'output_dim': 4,
@@ -69,18 +73,17 @@ class EmbeddingTest(test.TestCase):
         input_dtype='int32',
         expected_output_dtype='float32')
 
-  @tf_test_util.run_in_graph_and_eager_modes()
+  @keras_parameterized.run_all_keras_modes
   def test_embedding_correctness(self):
     layer = keras.layers.Embedding(output_dim=2, input_dim=2)
-    layer.build((None, 2))
-    matrix = np.array([[1, 1], [2, 2]])
-    layer.set_weights([matrix])
+    model = keras.models.Sequential([layer])
 
-    inputs = keras.backend.constant([[0, 1, 0]], dtype='int32')
-    outputs = keras.backend.eval(layer(inputs))
+    layer.set_weights([np.array([[1, 1], [2, 2]])])
+    model.run_eagerly = testing_utils.should_run_eagerly()
+    outputs = model.predict(np.array([[0, 1, 0]], dtype='int32'))
     self.assertAllClose(outputs, [[[1, 1], [2, 2], [1, 1]]])
 
-  @tf_test_util.run_in_graph_and_eager_modes()
+  @tf_test_util.run_in_graph_and_eager_modes
   def test_eager_gpu_cpu(self):
     l = keras.layers.Embedding(output_dim=2, input_dim=2)
     l.build((None, 2))
diff --git a/tensorflow/python/keras/layers/gru_test.py b/tensorflow/python/keras/layers/gru_test.py
index 9988c9fae5808a5cad47464addbb3f5e33953e66..d05e7eeb633e4e9b4c255e13ef7b21ad71ab4348 100644
--- a/tensorflow/python/keras/layers/gru_test.py
+++ b/tensorflow/python/keras/layers/gru_test.py
@@ -22,14 +22,15 @@ import numpy as np
 
 from tensorflow.python import keras
 from tensorflow.python.framework import test_util as tf_test_util
+from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
 from tensorflow.python.training.rmsprop import RMSPropOptimizer
 
 
-class GRULayerTest(test.TestCase):
+@keras_parameterized.run_all_keras_modes
+class GRULayerTest(keras_parameterized.TestCase):
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_return_sequences_GRU(self):
     num_samples = 2
     timesteps = 3
@@ -41,7 +42,6 @@ class GRULayerTest(test.TestCase):
                 'return_sequences': True},
         input_shape=(num_samples, timesteps, embedding_dim))
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_dynamic_behavior_GRU(self):
     num_samples = 2
     timesteps = 3
@@ -50,12 +50,12 @@ class GRULayerTest(test.TestCase):
     layer = keras.layers.GRU(units, input_shape=(None, embedding_dim))
     model = keras.models.Sequential()
     model.add(layer)
-    model.compile(RMSPropOptimizer(0.01), 'mse')
+    model.compile(RMSPropOptimizer(0.01), 'mse',
+                  run_eagerly=testing_utils.should_run_eagerly())
     x = np.random.random((num_samples, timesteps, embedding_dim))
     y = np.random.random((num_samples, units))
     model.train_on_batch(x, y)
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_dropout_GRU(self):
     num_samples = 2
     timesteps = 3
@@ -68,7 +68,6 @@ class GRULayerTest(test.TestCase):
                 'recurrent_dropout': 0.1},
         input_shape=(num_samples, timesteps, embedding_dim))
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_implementation_mode_GRU(self):
     num_samples = 2
     timesteps = 3
@@ -81,12 +80,83 @@ class GRULayerTest(test.TestCase):
                   'implementation': mode},
           input_shape=(num_samples, timesteps, embedding_dim))
 
+  def test_reset_after_GRU(self):
+    num_samples = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+
+    (x_train, y_train), _ = testing_utils.get_test_data(
+        train_samples=num_samples,
+        test_samples=0,
+        input_shape=(timesteps, embedding_dim),
+        num_classes=units)
+    y_train = keras.utils.to_categorical(y_train, units)
+
+    inputs = keras.layers.Input(shape=[timesteps, embedding_dim])
+    gru_layer = keras.layers.GRU(units,
+                                 reset_after=True)
+    output = gru_layer(inputs)
+    gru_model = keras.models.Model(inputs, output)
+    gru_model.compile(RMSPropOptimizer(0.01), 'mse',
+                      run_eagerly=testing_utils.should_run_eagerly())
+    gru_model.fit(x_train, y_train)
+    gru_model.predict(x_train)
+
+  def test_with_masking_layer_GRU(self):
+    layer_class = keras.layers.GRU
+    inputs = np.random.random((2, 3, 4))
+    targets = np.abs(np.random.random((2, 3, 5)))
+    targets /= targets.sum(axis=-1, keepdims=True)
+    model = keras.models.Sequential()
+    model.add(keras.layers.Masking(input_shape=(3, 4)))
+    model.add(layer_class(units=5, return_sequences=True, unroll=False))
+    model.compile(loss='categorical_crossentropy',
+                  optimizer=RMSPropOptimizer(0.01),
+                  run_eagerly=testing_utils.should_run_eagerly())
+    model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
+
+
+@tf_test_util.run_all_in_graph_and_eager_modes
+class GRULayerGenericTest(test.TestCase):
+
+  def test_constraints_GRU(self):
+    embedding_dim = 4
+    layer_class = keras.layers.GRU
+    k_constraint = keras.constraints.max_norm(0.01)
+    r_constraint = keras.constraints.max_norm(0.01)
+    b_constraint = keras.constraints.max_norm(0.01)
+    layer = layer_class(
+        5,
+        return_sequences=False,
+        weights=None,
+        input_shape=(None, embedding_dim),
+        kernel_constraint=k_constraint,
+        recurrent_constraint=r_constraint,
+        bias_constraint=b_constraint)
+    layer.build((None, None, embedding_dim))
+    self.assertEqual(layer.cell.kernel.constraint, k_constraint)
+    self.assertEqual(layer.cell.recurrent_kernel.constraint, r_constraint)
+    self.assertEqual(layer.cell.bias.constraint, b_constraint)
+
+  def test_from_config_GRU(self):
+    layer_class = keras.layers.GRU
+    for stateful in (False, True):
+      l1 = layer_class(units=1, stateful=stateful)
+      l2 = layer_class.from_config(l1.get_config())
+      assert l1.get_config() == l2.get_config()
+
+
+class GRULayerGraphOnlyTest(test.TestCase):
+
+  @tf_test_util.run_v1_only('b/120545219')
   def test_statefulness_GRU(self):
     num_samples = 2
     timesteps = 3
     embedding_dim = 4
     units = 2
     layer_class = keras.layers.GRU
+
     with self.cached_session():
       model = keras.models.Sequential()
       model.add(
@@ -143,67 +213,26 @@ class GRULayerTest(test.TestCase):
 
       np.testing.assert_allclose(out7, out6, atol=1e-5)
 
+  # b/120919032
+  @tf_test_util.run_deprecated_v1
   def test_regularizers_GRU(self):
     embedding_dim = 4
     layer_class = keras.layers.GRU
-    with self.cached_session():
-      layer = layer_class(
-          5,
-          return_sequences=False,
-          weights=None,
-          input_shape=(None, embedding_dim),
-          kernel_regularizer=keras.regularizers.l1(0.01),
-          recurrent_regularizer=keras.regularizers.l1(0.01),
-          bias_regularizer='l2',
-          activity_regularizer='l1')
-      layer.build((None, None, 2))
-      self.assertEqual(len(layer.losses), 3)
-
-      x = keras.backend.variable(np.ones((2, 3, 2)))
-      layer(x)
-      self.assertEqual(len(layer.get_losses_for(x)), 1)
-
-  def test_constraints_GRU(self):
-    embedding_dim = 4
-    layer_class = keras.layers.GRU
-    with self.cached_session():
-      k_constraint = keras.constraints.max_norm(0.01)
-      r_constraint = keras.constraints.max_norm(0.01)
-      b_constraint = keras.constraints.max_norm(0.01)
-      layer = layer_class(
-          5,
-          return_sequences=False,
-          weights=None,
-          input_shape=(None, embedding_dim),
-          kernel_constraint=k_constraint,
-          recurrent_constraint=r_constraint,
-          bias_constraint=b_constraint)
-      layer.build((None, None, embedding_dim))
-      self.assertEqual(layer.cell.kernel.constraint, k_constraint)
-      self.assertEqual(layer.cell.recurrent_kernel.constraint, r_constraint)
-      self.assertEqual(layer.cell.bias.constraint, b_constraint)
-
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_with_masking_layer_GRU(self):
-    layer_class = keras.layers.GRU
-    with self.cached_session():
-      inputs = np.random.random((2, 3, 4))
-      targets = np.abs(np.random.random((2, 3, 5)))
-      targets /= targets.sum(axis=-1, keepdims=True)
-      model = keras.models.Sequential()
-      model.add(keras.layers.Masking(input_shape=(3, 4)))
-      model.add(layer_class(units=5, return_sequences=True, unroll=False))
-      model.compile(loss='categorical_crossentropy',
-                    optimizer=RMSPropOptimizer(0.01))
-      model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
-
-  def test_from_config_GRU(self):
-    layer_class = keras.layers.GRU
-    for stateful in (False, True):
-      l1 = layer_class(units=1, stateful=stateful)
-      l2 = layer_class.from_config(l1.get_config())
-      assert l1.get_config() == l2.get_config()
-
+    layer = layer_class(
+        5,
+        return_sequences=False,
+        weights=None,
+        input_shape=(None, embedding_dim),
+        kernel_regularizer=keras.regularizers.l1(0.01),
+        recurrent_regularizer=keras.regularizers.l1(0.01),
+        bias_regularizer='l2',
+        activity_regularizer='l1')
+    layer.build((None, None, 2))
+    self.assertEqual(len(layer.losses), 3)
+
+    x = keras.backend.variable(np.ones((2, 3, 2)))
+    layer(x)
+    self.assertEqual(len(layer.get_losses_for(x)), 1)
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/layers/local_test.py b/tensorflow/python/keras/layers/local_test.py
index a23c5c38fe9fc5599b9f1bb1bd8d83b529e42af2..e4f4d0a639a6bac4605b3f03e23c6f14a2fdaa88 100644
--- a/tensorflow/python/keras/layers/local_test.py
+++ b/tensorflow/python/keras/layers/local_test.py
@@ -294,8 +294,8 @@ class LocallyConnectedImplementationModeTest(test.TestCase):
                         # Compare outputs after a few training steps.
                         out_1 = model_1.call(inputs)
                         out_2 = model_2.call(inputs)
-                        self.assertAllCloseAccordingToType(out_1, out_2,
-                                                           atol=1e-4)
+                        self.assertAllCloseAccordingToType(
+                            out_1, out_2, atol=2e-4)
 
   @tf_test_util.run_in_graph_and_eager_modes
   def test_make_2d(self):
diff --git a/tensorflow/python/keras/layers/lstm_test.py b/tensorflow/python/keras/layers/lstm_test.py
index 732bbcfa18ecc668f53d8f768b686913167ac790..b132d2ee8ea4c89e043ece1f029f7d65c0f79c23 100644
--- a/tensorflow/python/keras/layers/lstm_test.py
+++ b/tensorflow/python/keras/layers/lstm_test.py
@@ -18,10 +18,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python import keras
 from tensorflow.python.framework import test_util as tf_test_util
+from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
 from tensorflow.python.training import adam
@@ -29,8 +31,8 @@ from tensorflow.python.training import gradient_descent
 from tensorflow.python.training.rmsprop import RMSPropOptimizer
 
 
-@tf_test_util.run_all_in_graph_and_eager_modes
-class LSTMLayerTest(test.TestCase):
+@keras_parameterized.run_all_keras_modes
+class LSTMLayerTest(keras_parameterized.TestCase):
 
   def test_return_sequences_LSTM(self):
     num_samples = 2
@@ -56,7 +58,7 @@ class LSTMLayerTest(test.TestCase):
     layer = keras.layers.LSTM(units, return_sequences=True)
     model.add(layer)
     outputs = model.layers[-1].output
-    self.assertEquals(outputs.get_shape().as_list(), [None, timesteps, units])
+    self.assertEqual(outputs.get_shape().as_list(), [None, timesteps, units])
 
   def test_dynamic_behavior_LSTM(self):
     num_samples = 2
@@ -66,7 +68,9 @@ class LSTMLayerTest(test.TestCase):
     layer = keras.layers.LSTM(units, input_shape=(None, embedding_dim))
     model = keras.models.Sequential()
     model.add(layer)
-    model.compile(RMSPropOptimizer(0.001), 'mse')
+    model.compile(RMSPropOptimizer(0.001), 'mse',
+                  run_eagerly=testing_utils.should_run_eagerly())
+
     x = np.random.random((num_samples, timesteps, embedding_dim))
     y = np.random.random((num_samples, units))
     model.train_on_batch(x, y)
@@ -83,17 +87,17 @@ class LSTMLayerTest(test.TestCase):
                 'recurrent_dropout': 0.1},
         input_shape=(num_samples, timesteps, embedding_dim))
 
-  def test_implementation_mode_LSTM(self):
+  @parameterized.parameters([0, 1, 2])
+  def test_implementation_mode_LSTM(self, implementation_mode):
     num_samples = 2
     timesteps = 3
     embedding_dim = 4
     units = 2
-    for mode in [0, 1, 2]:
-      testing_utils.layer_test(
-          keras.layers.LSTM,
-          kwargs={'units': units,
-                  'implementation': mode},
-          input_shape=(num_samples, timesteps, embedding_dim))
+    testing_utils.layer_test(
+        keras.layers.LSTM,
+        kwargs={'units': units,
+                'implementation': implementation_mode},
+        input_shape=(num_samples, timesteps, embedding_dim))
 
   def test_constraints_LSTM(self):
     embedding_dim = 4
@@ -114,7 +118,6 @@ class LSTMLayerTest(test.TestCase):
     self.assertEqual(layer.cell.recurrent_kernel.constraint, r_constraint)
     self.assertEqual(layer.cell.bias.constraint, b_constraint)
 
-  @tf_test_util.run_deprecated_v1
   def test_with_masking_layer_LSTM(self):
     layer_class = keras.layers.LSTM
     inputs = np.random.random((2, 3, 4))
@@ -124,10 +127,10 @@ class LSTMLayerTest(test.TestCase):
     model.add(keras.layers.Masking(input_shape=(3, 4)))
     model.add(layer_class(units=5, return_sequences=True, unroll=False))
     model.compile(loss='categorical_crossentropy',
-                  optimizer=RMSPropOptimizer(0.01))
+                  optimizer=RMSPropOptimizer(0.01),
+                  run_eagerly=testing_utils.should_run_eagerly())
     model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
 
-  @tf_test_util.run_deprecated_v1
   def test_masking_with_stacking_LSTM(self):
     inputs = np.random.random((2, 3, 4))
     targets = np.abs(np.random.random((2, 3, 5)))
@@ -137,7 +140,8 @@ class LSTMLayerTest(test.TestCase):
     lstm_cells = [keras.layers.LSTMCell(10), keras.layers.LSTMCell(5)]
     model.add(keras.layers.RNN(lstm_cells, return_sequences=True, unroll=False))
     model.compile(loss='categorical_crossentropy',
-                  optimizer=RMSPropOptimizer(0.01))
+                  optimizer=RMSPropOptimizer(0.01),
+                  run_eagerly=testing_utils.should_run_eagerly())
     model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
 
   def test_from_config_LSTM(self):
@@ -166,7 +170,8 @@ class LSTMLayerTest(test.TestCase):
 
     model = keras.models.Model([inputs] + initial_state, output)
     model.compile(loss='categorical_crossentropy',
-                  optimizer=adam.AdamOptimizer())
+                  optimizer=adam.AdamOptimizer(),
+                  run_eagerly=testing_utils.should_run_eagerly())
 
     inputs = np.random.random((num_samples, timesteps, embedding_dim))
     initial_state = [np.random.random((num_samples, units))
@@ -191,7 +196,8 @@ class LSTMLayerTest(test.TestCase):
 
     model = keras.models.Model(inputs, output)
     model.compile(loss='categorical_crossentropy',
-                  optimizer=adam.AdamOptimizer())
+                  optimizer=adam.AdamOptimizer(),
+                  run_eagerly=testing_utils.should_run_eagerly())
 
     inputs = np.random.random((num_samples, timesteps, embedding_dim))
     targets = np.random.random((num_samples, units))
@@ -241,7 +247,8 @@ class LSTMLayerTest(test.TestCase):
 
     model = keras.models.Model([inputs] + initial_state, output)
     model.compile(loss='categorical_crossentropy',
-                  optimizer=RMSPropOptimizer(0.01))
+                  optimizer=RMSPropOptimizer(0.01),
+                  run_eagerly=testing_utils.should_run_eagerly())
 
     inputs = np.random.random((num_samples, timesteps, embedding_dim))
     initial_state = [np.random.random((num_samples, units))
@@ -302,7 +309,8 @@ class LSTMLayerTest(test.TestCase):
 
     model = keras.models.Model(inputs, output)
     model.compile(loss='categorical_crossentropy',
-                  optimizer=adam.AdamOptimizer())
+                  optimizer=adam.AdamOptimizer(),
+                  run_eagerly=testing_utils.should_run_eagerly())
 
     main_inputs = np.random.random((num_samples, timesteps, embedding_dim))
     initial_state = [np.random.random((num_samples, units))
@@ -313,7 +321,6 @@ class LSTMLayerTest(test.TestCase):
 
 class LSTMLayerGraphOnlyTest(test.TestCase):
 
-  @tf_test_util.run_deprecated_v1
   def test_statefulness_LSTM(self):
     num_samples = 2
     timesteps = 3
@@ -377,25 +384,25 @@ class LSTMLayerGraphOnlyTest(test.TestCase):
 
       self.assertAllClose(out7, out6, atol=1e-5)
 
+  # b/120919032
   @tf_test_util.run_deprecated_v1
   def test_regularizers_LSTM(self):
     embedding_dim = 4
     layer_class = keras.layers.LSTM
-    with self.cached_session():
-      layer = layer_class(
-          5,
-          return_sequences=False,
-          weights=None,
-          input_shape=(None, embedding_dim),
-          kernel_regularizer=keras.regularizers.l1(0.01),
-          recurrent_regularizer=keras.regularizers.l1(0.01),
-          bias_regularizer='l2',
-          activity_regularizer='l1')
-      layer.build((None, None, 2))
-      self.assertEqual(len(layer.losses), 3)
-      x = keras.backend.variable(np.ones((2, 3, 2)))
-      layer(x)
-      self.assertEqual(len(layer.get_losses_for(x)), 1)
+    layer = layer_class(
+        5,
+        return_sequences=False,
+        weights=None,
+        input_shape=(None, embedding_dim),
+        kernel_regularizer=keras.regularizers.l1(0.01),
+        recurrent_regularizer=keras.regularizers.l1(0.01),
+        bias_regularizer='l2',
+        activity_regularizer='l1')
+    layer.build((None, None, 2))
+    self.assertEqual(len(layer.losses), 3)
+    x = keras.backend.variable(np.ones((2, 3, 2)))
+    layer(x)
+    self.assertEqual(len(layer.get_losses_for(x)), 1)
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/layers/merge.py b/tensorflow/python/keras/layers/merge.py
index 45e705c69606c4dd839429597aa9903a9442234a..c73b21d96552c0ce58915273e6291b3fe2848105 100644
--- a/tensorflow/python/keras/layers/merge.py
+++ b/tensorflow/python/keras/layers/merge.py
@@ -87,7 +87,7 @@ class _Merge(Layer):
   def build(self, input_shape):
     # Used purely for shape validation.
     if not isinstance(input_shape, list):
-      raise ValueError('A merge layer should be called ' 'on a list of inputs.')
+      raise ValueError('A merge layer should be called on a list of inputs.')
     if len(input_shape) < 2:
       raise ValueError('A merge layer should be called '
                        'on a list of at least 2 inputs. '
@@ -118,7 +118,7 @@ class _Merge(Layer):
 
   def call(self, inputs):
     if not isinstance(inputs, list):
-      raise ValueError('A merge layer should be called ' 'on a list of inputs.')
+      raise ValueError('A merge layer should be called on a list of inputs.')
     if self._reshape_required:
       reshaped_inputs = []
       input_ndims = list(map(K.ndim, inputs))
@@ -504,7 +504,7 @@ class Dot(_Merge):
 
   def _merge_function(self, inputs):
     if len(inputs) != 2:
-      raise ValueError('A `Dot` layer should be called ' 'on exactly 2 inputs')
+      raise ValueError('A `Dot` layer should be called on exactly 2 inputs')
     x1 = inputs[0]
     x2 = inputs[1]
     if isinstance(self.axes, int):
diff --git a/tensorflow/python/keras/layers/merge_test.py b/tensorflow/python/keras/layers/merge_test.py
index fcb161ae20a4caeaa9514477529c2885d6e5bd41..f962a75b32421860296476607a5dacdaaf5468cd 100644
--- a/tensorflow/python/keras/layers/merge_test.py
+++ b/tensorflow/python/keras/layers/merge_test.py
@@ -22,12 +22,13 @@ import numpy as np
 
 from tensorflow.python import keras
 from tensorflow.python.framework import test_util as tf_test_util
-from tensorflow.python.ops import array_ops
+from tensorflow.python.keras import keras_parameterized
+from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
 
 
-@tf_test_util.run_all_in_graph_and_eager_modes
-class MergeLayersTest(test.TestCase):
+@keras_parameterized.run_all_keras_modes
+class MergeLayersTest(keras_parameterized.TestCase):
 
   def test_merge_add(self):
     i1 = keras.layers.Input(shape=(4, 5))
@@ -35,8 +36,9 @@ class MergeLayersTest(test.TestCase):
     i3 = keras.layers.Input(shape=(4, 5))
 
     o = keras.layers.add([i1, i2, i3])
-    self.assertListEqual(o.get_shape().as_list(), [None, 4, 5])
+    self.assertListEqual(o.shape.as_list(), [None, 4, 5])
     model = keras.models.Model([i1, i2, i3], o)
+    model.run_eagerly = testing_utils.should_run_eagerly()
 
     x1 = np.random.random((2, 4, 5))
     x2 = np.random.random((2, 4, 5))
@@ -45,25 +47,14 @@ class MergeLayersTest(test.TestCase):
     self.assertEqual(out.shape, (2, 4, 5))
     self.assertAllClose(out, x1 + x2 + x3, atol=1e-4)
 
-  def test_merge_elementwise_errors(self):
-    i1 = keras.layers.Input(shape=(4, 5))
-    i2 = keras.layers.Input(shape=(4, 6))
-    with self.assertRaises(ValueError):
-      keras.layers.add([i1, i2])
-    with self.assertRaises(ValueError):
-      keras.layers.add([i1])
-    with self.assertRaises(ValueError):
-      keras.layers.add(i1)
-    with self.assertRaises(ValueError):
-      keras.layers.add([i1])
-
   def test_merge_multiply(self):
     i1 = keras.layers.Input(shape=(4, 5))
     i2 = keras.layers.Input(shape=(4, 5))
     i3 = keras.layers.Input(shape=(4, 5))
     o = keras.layers.multiply([i1, i2, i3])
-    self.assertListEqual(o.get_shape().as_list(), [None, 4, 5])
+    self.assertListEqual(o.shape.as_list(), [None, 4, 5])
     model = keras.models.Model([i1, i2, i3], o)
+    model.run_eagerly = testing_utils.should_run_eagerly()
 
     x1 = np.random.random((2, 4, 5))
     x2 = np.random.random((2, 4, 5))
@@ -76,8 +67,9 @@ class MergeLayersTest(test.TestCase):
     i1 = keras.layers.Input(shape=(4, 5))
     i2 = keras.layers.Input(shape=(4, 5))
     o = keras.layers.average([i1, i2])
-    self.assertListEqual(o.get_shape().as_list(), [None, 4, 5])
+    self.assertListEqual(o.shape.as_list(), [None, 4, 5])
     model = keras.models.Model([i1, i2], o)
+    model.run_eagerly = testing_utils.should_run_eagerly()
 
     x1 = np.random.random((2, 4, 5))
     x2 = np.random.random((2, 4, 5))
@@ -89,8 +81,9 @@ class MergeLayersTest(test.TestCase):
     i1 = keras.layers.Input(shape=(4, 5))
     i2 = keras.layers.Input(shape=(4, 5))
     o = keras.layers.maximum([i1, i2])
-    self.assertListEqual(o.get_shape().as_list(), [None, 4, 5])
+    self.assertListEqual(o.shape.as_list(), [None, 4, 5])
     model = keras.models.Model([i1, i2], o)
+    model.run_eagerly = testing_utils.should_run_eagerly()
 
     x1 = np.random.random((2, 4, 5))
     x2 = np.random.random((2, 4, 5))
@@ -102,8 +95,9 @@ class MergeLayersTest(test.TestCase):
     i1 = keras.layers.Input(shape=(4, 5))
     i2 = keras.layers.Input(shape=(4, 5))
     o = keras.layers.minimum([i1, i2])
-    self.assertListEqual(o.get_shape().as_list(), [None, 4, 5])
+    self.assertListEqual(o.shape.as_list(), [None, 4, 5])
     model = keras.models.Model([i1, i2], o)
+    model.run_eagerly = testing_utils.should_run_eagerly()
 
     x1 = np.random.random((2, 4, 5))
     x2 = np.random.random((2, 4, 5))
@@ -115,8 +109,9 @@ class MergeLayersTest(test.TestCase):
     i1 = keras.layers.Input(shape=(4, 5))
     i2 = keras.layers.Input(shape=(4, 5))
     o = keras.layers.concatenate([i1, i2], axis=1)
-    self.assertListEqual(o.get_shape().as_list(), [None, 8, 5])
+    self.assertListEqual(o.shape.as_list(), [None, 8, 5])
     model = keras.models.Model([i1, i2], o)
+    model.run_eagerly = testing_utils.should_run_eagerly()
 
     x1 = np.random.random((2, 4, 5))
     x2 = np.random.random((2, 4, 5))
@@ -124,22 +119,13 @@ class MergeLayersTest(test.TestCase):
     self.assertEqual(out.shape, (2, 8, 5))
     self.assertAllClose(out, np.concatenate([x1, x2], axis=1), atol=1e-4)
 
-  def test_concatenate_errors(self):
-    i1 = keras.layers.Input(shape=(4, 5))
-    i2 = keras.layers.Input(shape=(3, 5))
-    with self.assertRaisesRegexp(ValueError, 'inputs with matching shapes'):
-      keras.layers.concatenate([i1, i2], axis=-1)
-    with self.assertRaisesRegexp(ValueError, 'called on a list'):
-      keras.layers.concatenate(i1, axis=-1)
-    with self.assertRaisesRegexp(ValueError, 'called on a list'):
-      keras.layers.concatenate([i1], axis=-1)
-
   def test_merge_dot(self):
     i1 = keras.layers.Input(shape=(4,))
     i2 = keras.layers.Input(shape=(4,))
     o = keras.layers.dot([i1, i2], axes=1)
-    self.assertListEqual(o.get_shape().as_list(), [None, 1])
+    self.assertListEqual(o.shape.as_list(), [None, 1])
     model = keras.models.Model([i1, i2], o)
+    model.run_eagerly = testing_utils.should_run_eagerly()
     _ = keras.layers.Dot(axes=1).get_config()
 
     x1 = np.random.random((2, 4))
@@ -153,8 +139,9 @@ class MergeLayersTest(test.TestCase):
 
     # Test with negative tuple of axes.
     o = keras.layers.dot([i1, i2], axes=(-1, -1))
-    self.assertListEqual(o.get_shape().as_list(), [None, 1])
+    self.assertListEqual(o.shape.as_list(), [None, 1])
     model = keras.models.Model([i1, i2], o)
+    model.run_eagerly = testing_utils.should_run_eagerly()
     out = model.predict([x1, x2])
     self.assertEqual(out.shape, (2, 1))
     self.assertAllClose(out, expected, atol=1e-4)
@@ -163,6 +150,32 @@ class MergeLayersTest(test.TestCase):
     layer = keras.layers.Dot(axes=-1)
     self.assertEqual(layer.compute_output_shape([(4, 5), (4, 5)]), (4, 1))
 
+
+@tf_test_util.run_all_in_graph_and_eager_modes
+class MergeLayersTestNoExecution(test.TestCase):
+
+  def test_merge_elementwise_errors(self):
+    i1 = keras.layers.Input(shape=(4, 5))
+    i2 = keras.layers.Input(shape=(4, 6))
+    with self.assertRaises(ValueError):
+      keras.layers.add([i1, i2])
+    with self.assertRaises(ValueError):
+      keras.layers.add([i1])
+    with self.assertRaises(ValueError):
+      keras.layers.add(i1)
+    with self.assertRaises(ValueError):
+      keras.layers.add([i1])
+
+  def test_concatenate_errors(self):
+    i1 = keras.layers.Input(shape=(4, 5))
+    i2 = keras.layers.Input(shape=(3, 5))
+    with self.assertRaisesRegexp(ValueError, 'inputs with matching shapes'):
+      keras.layers.concatenate([i1, i2], axis=-1)
+    with self.assertRaisesRegexp(ValueError, 'called on a list'):
+      keras.layers.concatenate(i1, axis=-1)
+    with self.assertRaisesRegexp(ValueError, 'called on a list'):
+      keras.layers.concatenate([i1], axis=-1)
+
   def test_dot_errors(self):
     i1 = keras.layers.Input(shape=(4, 5))
     i2 = keras.layers.Input(shape=(4, 6))
@@ -183,7 +196,7 @@ class MergeLayersTest(test.TestCase):
     i1 = keras.layers.Input(shape=(4, 5))
     i2 = keras.layers.Input(shape=(4, 5))
     y = keras.layers.subtract([i1, i2])
-    self.assertEqual(y.get_shape().as_list(), [None, 4, 5])
+    self.assertEqual(y.shape.as_list(), [None, 4, 5])
 
     # Test invalid use cases
     i1 = keras.layers.Input(shape=(4, 5))
@@ -193,39 +206,32 @@ class MergeLayersTest(test.TestCase):
     with self.assertRaises(ValueError):
       keras.layers.subtract([i1, i1, i1])
 
-
-class MergeLayersGraphOnlyTest(test.TestCase):
-
   def test_merge_add_masking(self):
-    with self.cached_session():
-      i1 = keras.layers.Input(shape=(4, 5))
-      i2 = keras.layers.Input(shape=(4, 5))
-      m1 = keras.layers.Masking()(i1)
-      layer = keras.layers.Add()
-      o = layer([m1, i2])
-      self.assertListEqual(o.get_shape().as_list(), [None, 4, 5])
-      mask = layer.output_mask
-      self.assertListEqual(mask.get_shape().as_list(), [None, 4])
-
-  @tf_test_util.run_deprecated_v1
+    i1 = keras.layers.Input(shape=(4, 5))
+    i2 = keras.layers.Input(shape=(4, 5))
+    m1 = keras.layers.Masking()(i1)
+    layer = keras.layers.Add()
+    o = layer([m1, i2])
+    self.assertListEqual(o.shape.as_list(), [None, 4, 5])
+    mask = layer.output_mask
+    self.assertListEqual(mask.shape.as_list(), [None, 4])
+
   def test_merge_add_dynamic_shape(self):
-    with self.cached_session():
-      i1 = array_ops.placeholder(shape=(4, None), dtype='float32')
-      i2 = array_ops.placeholder(shape=(4, 5), dtype='float32')
-      layer = keras.layers.Add()
-      o = layer([i1, i2])
-      self.assertListEqual(o.get_shape().as_list(), [4, 5])
+    i1 = keras.Input(batch_shape=(4, None), dtype='float32')
+    i2 = keras.Input(batch_shape=(4, 5), dtype='float32')
+    layer = keras.layers.Add()
+    o = layer([i1, i2])
+    self.assertListEqual(o.shape.as_list(), [4, 5])
 
   def test_merge_concatenate_masking(self):
-    with self.cached_session():
-      i1 = keras.layers.Input(shape=(4, 5))
-      i2 = keras.layers.Input(shape=(4, 5))
-      m1 = keras.layers.Masking()(i1)
-      layer = keras.layers.Concatenate()
-      o = layer([m1, i2])
-      self.assertListEqual(o.get_shape().as_list(), [None, 4, 10])
-      mask = layer.output_mask
-      self.assertListEqual(mask.get_shape().as_list(), [None, 4])
+    i1 = keras.layers.Input(shape=(4, 5))
+    i2 = keras.layers.Input(shape=(4, 5))
+    m1 = keras.layers.Masking()(i1)
+    layer = keras.layers.Concatenate()
+    o = layer([m1, i2])
+    self.assertListEqual(o.shape.as_list(), [None, 4, 10])
+    mask = layer.output_mask
+    self.assertListEqual(mask.shape.as_list(), [None, 4])
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/layers/noise_test.py b/tensorflow/python/keras/layers/noise_test.py
index 325dd933b21bd4182fcd8c20493acba70834383f..f1537a6919f6a13c4e1c5bd793f01f63fb7dc834 100644
--- a/tensorflow/python/keras/layers/noise_test.py
+++ b/tensorflow/python/keras/layers/noise_test.py
@@ -19,13 +19,13 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python import keras
-from tensorflow.python.framework import test_util as tf_test_util
+from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
 
 
-@tf_test_util.run_all_in_graph_and_eager_modes
-class NoiseLayersTest(test.TestCase):
+@keras_parameterized.run_all_keras_modes
+class NoiseLayersTest(keras_parameterized.TestCase):
 
   def test_GaussianNoise(self):
     testing_utils.layer_test(
diff --git a/tensorflow/python/keras/layers/normalization.py b/tensorflow/python/keras/layers/normalization.py
index d9584976555478be8efd6476ea4f6248369e0956..ee37e8a2422e5c1a942a773aaf40f404a25c641f 100644
--- a/tensorflow/python/keras/layers/normalization.py
+++ b/tensorflow/python/keras/layers/normalization.py
@@ -18,7 +18,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import contextlib
+
 from tensorflow.python import tf2
+from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -37,7 +40,6 @@ from tensorflow.python.ops import nn
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -412,11 +414,19 @@ class BatchNormalizationV2(Layer):
   def _assign_moving_average(self, variable, value, momentum):
     with ops.name_scope(None, 'AssignMovingAvg',
                         [variable, value, momentum]) as scope:
-      with ops.colocate_with(variable):
+      # TODO(apassos,srbs,skyewm): the colocation constraints here are disabled
+      # because of a bug which leads cond_v2 to skip rewriting them creating
+      # conflicts.
+      if tf2.enabled():
+        cm = contextlib.contextmanager(lambda: (yield))()
+      else:
+        cm = ops.colocate_with(variable)
+      with cm:
         decay = ops.convert_to_tensor(1.0 - momentum, name='decay')
         if decay.dtype != variable.dtype.base_dtype:
           decay = math_ops.cast(decay, variable.dtype.base_dtype)
-        update_delta = (variable - math_ops.cast(value, variable.dtype)) * decay
+        update_delta = (
+            variable - math_ops.cast(value, variable.dtype)) * decay
         return state_ops.assign_sub(variable, update_delta, name=scope)
 
   def _fused_batch_norm(self, inputs, training):
diff --git a/tensorflow/python/keras/layers/normalization_test.py b/tensorflow/python/keras/layers/normalization_test.py
index 9138c0a08a32f21f4352598e570a383e06d7c9a2..f81ddcecb42662c8cfa481808919c4382771467b 100644
--- a/tensorflow/python/keras/layers/normalization_test.py
+++ b/tensorflow/python/keras/layers/normalization_test.py
@@ -22,15 +22,16 @@ import numpy as np
 
 from tensorflow.python import keras
 from tensorflow.python.framework import test_util as tf_test_util
+from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.layers import normalization
 from tensorflow.python.platform import test
 from tensorflow.python.training import gradient_descent
 
 
-@tf_test_util.run_all_in_graph_and_eager_modes
-class NormalizationLayersTest(test.TestCase):
+class BatchNormalizationTest(keras_parameterized.TestCase):
 
+  @keras_parameterized.run_all_keras_modes
   def test_basic_batchnorm(self):
     testing_utils.layer_test(
         keras.layers.BatchNormalization,
@@ -55,15 +56,8 @@ class NormalizationLayersTest(test.TestCase):
         kwargs={'scale': False,
                 'center': False},
         input_shape=(3, 3))
-    testing_utils.layer_test(
-        normalization.BatchNormalizationV2,
-        kwargs={'fused': True},
-        input_shape=(3, 3, 3, 3))
-    testing_utils.layer_test(
-        normalization.BatchNormalizationV2,
-        kwargs={'fused': None},
-        input_shape=(3, 3, 3))
 
+  @tf_test_util.run_in_graph_and_eager_modes
   def test_batchnorm_weights(self):
     layer = keras.layers.BatchNormalization(scale=False, center=False)
     layer.build((None, 3, 4))
@@ -75,6 +69,7 @@ class NormalizationLayersTest(test.TestCase):
     self.assertEqual(len(layer.trainable_weights), 2)
     self.assertEqual(len(layer.weights), 4)
 
+  @tf_test_util.run_in_graph_and_eager_modes
   def test_batchnorm_regularization(self):
     layer = keras.layers.BatchNormalization(
         gamma_regularizer='l1', beta_regularizer='l1')
@@ -87,36 +82,7 @@ class NormalizationLayersTest(test.TestCase):
     self.assertEqual(layer.gamma.constraint, max_norm)
     self.assertEqual(layer.beta.constraint, max_norm)
 
-  def _test_batchnorm_correctness(self, dtype, use_v2=True, fused=False):
-    model = keras.models.Sequential()
-    layer_ctor = (normalization.BatchNormalizationV2 if use_v2
-                  else normalization.BatchNormalizationV1)
-    norm = layer_ctor(input_shape=(2, 2, 2), momentum=0.8, fused=fused)
-    model.add(norm)
-    model.compile(loss='mse',
-                  optimizer=gradient_descent.GradientDescentOptimizer(0.01))
-
-    # centered on 5.0, variance 10.0
-    x = (np.random.normal(loc=5.0, scale=10.0, size=(1000, 2, 2, 2))
-         .astype(dtype))
-    model.fit(x, x, epochs=4, verbose=0)
-    out = model.predict(x)
-    out -= keras.backend.eval(norm.beta)
-    out /= keras.backend.eval(norm.gamma)
-
-    np.testing.assert_allclose(out.mean(), 0.0, atol=1e-1)
-    np.testing.assert_allclose(out.std(), 1.0, atol=1e-1)
-
-  def test_batchnorm_correctness(self):
-    self._test_batchnorm_correctness(np.float32)
-    self._test_batchnorm_correctness(np.float32, fused=True)
-    self._test_batchnorm_correctness(np.float32, use_v2=False)
-
-  def test_batchnorm_mixed_precision(self):
-    self._test_batchnorm_correctness(np.float16)
-    self._test_batchnorm_correctness(np.float16, fused=True)
-    self._test_batchnorm_correctness(np.float16, use_v2=False)
-
+  @keras_parameterized.run_all_keras_modes
   def test_batchnorm_convnet(self):
     if test.is_gpu_available(cuda_only=True):
       with self.session(use_gpu=True):
@@ -125,7 +91,8 @@ class NormalizationLayersTest(test.TestCase):
             axis=1, input_shape=(3, 4, 4), momentum=0.8)
         model.add(norm)
         model.compile(loss='mse',
-                      optimizer=gradient_descent.GradientDescentOptimizer(0.01))
+                      optimizer=gradient_descent.GradientDescentOptimizer(0.01),
+                      run_eagerly=testing_utils.should_run_eagerly())
 
         # centered on 5.0, variance 10.0
         x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 3, 4, 4))
@@ -137,13 +104,15 @@ class NormalizationLayersTest(test.TestCase):
         np.testing.assert_allclose(np.mean(out, axis=(0, 2, 3)), 0.0, atol=1e-1)
         np.testing.assert_allclose(np.std(out, axis=(0, 2, 3)), 1.0, atol=1e-1)
 
+  @keras_parameterized.run_all_keras_modes
   def test_batchnorm_convnet_channel_last(self):
     model = keras.models.Sequential()
     norm = keras.layers.BatchNormalization(
         axis=-1, input_shape=(4, 4, 3), momentum=0.8)
     model.add(norm)
     model.compile(loss='mse',
-                  optimizer=gradient_descent.GradientDescentOptimizer(0.01))
+                  optimizer=gradient_descent.GradientDescentOptimizer(0.01),
+                  run_eagerly=testing_utils.should_run_eagerly())
 
     # centered on 5.0, variance 10.0
     x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 4, 4, 3))
@@ -155,6 +124,28 @@ class NormalizationLayersTest(test.TestCase):
     np.testing.assert_allclose(np.mean(out, axis=(0, 1, 2)), 0.0, atol=1e-1)
     np.testing.assert_allclose(np.std(out, axis=(0, 1, 2)), 1.0, atol=1e-1)
 
+  @keras_parameterized.run_all_keras_modes
+  def test_batchnorm_correctness(self):
+    _run_batchnorm_correctness_test(
+        normalization.BatchNormalization, dtype='float32')
+    _run_batchnorm_correctness_test(
+        normalization.BatchNormalization, dtype='float32', fused=True)
+    _run_batchnorm_correctness_test(
+        normalization.BatchNormalization, dtype='float32', fused=False)
+
+  @keras_parameterized.run_all_keras_modes
+  def test_batchnorm_mixed_precision(self):
+    _run_batchnorm_correctness_test(
+        normalization.BatchNormalization, dtype='float16')
+    _run_batchnorm_correctness_test(
+        normalization.BatchNormalization, dtype='float16', fused=True)
+    _run_batchnorm_correctness_test(
+        normalization.BatchNormalization, dtype='float16', fused=False)
+
+
+class BatchNormalizationV1Test(test.TestCase):
+
+  @tf_test_util.run_in_graph_and_eager_modes
   def test_v1_fused_attribute(self):
     norm = normalization.BatchNormalizationV1()
     inp = keras.layers.Input((4, 4, 4))
@@ -173,6 +164,21 @@ class NormalizationLayersTest(test.TestCase):
     norm(inp)
     self.assertEqual(norm.fused, False)
 
+
+class BatchNormalizationV2Test(keras_parameterized.TestCase):
+
+  @keras_parameterized.run_all_keras_modes
+  def test_basic_batchnorm_v2(self):
+    testing_utils.layer_test(
+        normalization.BatchNormalizationV2,
+        kwargs={'fused': True},
+        input_shape=(3, 3, 3, 3))
+    testing_utils.layer_test(
+        normalization.BatchNormalizationV2,
+        kwargs={'fused': None},
+        input_shape=(3, 3, 3))
+
+  @tf_test_util.run_in_graph_and_eager_modes
   def test_v2_fused_attribute(self):
     norm = normalization.BatchNormalizationV2()
     self.assertEqual(norm.fused, None)
@@ -227,6 +233,26 @@ class NormalizationLayersTest(test.TestCase):
       norm(inp)
 
 
+def _run_batchnorm_correctness_test(layer, dtype='float32', fused=False):
+  model = keras.models.Sequential()
+  norm = layer(input_shape=(2, 2, 2), momentum=0.8, fused=fused)
+  model.add(norm)
+  model.compile(loss='mse',
+                optimizer=gradient_descent.GradientDescentOptimizer(0.01),
+                run_eagerly=testing_utils.should_run_eagerly())
+
+  # centered on 5.0, variance 10.0
+  x = (np.random.normal(loc=5.0, scale=10.0, size=(1000, 2, 2, 2))
+       .astype(dtype))
+  model.fit(x, x, epochs=4, verbose=0)
+  out = model.predict(x)
+  out -= keras.backend.eval(norm.beta)
+  out /= keras.backend.eval(norm.gamma)
+
+  np.testing.assert_allclose(out.mean(), 0.0, atol=1e-1)
+  np.testing.assert_allclose(out.std(), 1.0, atol=1e-1)
+
+
 class NormalizationLayersGraphModeOnlyTest(test.TestCase):
 
   def test_shared_batchnorm(self):
@@ -308,6 +334,8 @@ class NormalizationLayersGraphModeOnlyTest(test.TestCase):
     Computes mean and std for current inputs then
     applies batch normalization using them.
     """
+    # TODO(fchollet): enable in all execution modes when issue with
+    # learning phase setting is resolved.
     with self.cached_session():
       bn_mean = 0.5
       bn_std = 10.
diff --git a/tensorflow/python/keras/layers/recurrent.py b/tensorflow/python/keras/layers/recurrent.py
index 5d0efc2f16c3367bd08c76e7c9ea88f7bcb729d0..3051416c6e0304b0f1d0bc5b56f53affdf8e1c24 100644
--- a/tensorflow/python/keras/layers/recurrent.py
+++ b/tensorflow/python/keras/layers/recurrent.py
@@ -19,9 +19,15 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import uuid
+
 import numpy as np
 
 from tensorflow.python.eager import context
+from tensorflow.python.eager import function
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras import activations
 from tensorflow.python.keras import backend as K
@@ -33,6 +39,7 @@ from tensorflow.python.keras.engine.input_spec import InputSpec
 from tensorflow.python.keras.utils import generic_utils
 from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_cudnn_rnn_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training.checkpointable import base as checkpointable
@@ -40,6 +47,14 @@ from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 
 
+# The following string constants are used by Defun approach for unified backend
+# of LSTM and GRU.
+_DEFUN_API_NAME_ATTRIBUTE = 'experimental_api_implements'
+_DEFUN_DEVICE_ATTRIBUTE = 'experimental_api_preferred_device'
+_CPU_DEVICE_NAME = 'CPU'
+_GPU_DEVICE_NAME = 'GPU'
+
+
 @tf_export('keras.layers.StackedRNNCells')
 class StackedRNNCells(Layer):
   """Wrapper allowing a stack of RNN cells to behave as a single cell.
@@ -63,6 +78,7 @@ class StackedRNNCells(Layer):
   ```
   """
 
+  @checkpointable.no_automatic_dependency_tracking
   def __init__(self, cells, **kwargs):
     for cell in cells:
       if not hasattr(cell, 'call'):
@@ -427,6 +443,7 @@ class RNN(Layer):
   ```
   """
 
+  @checkpointable.no_automatic_dependency_tracking
   def __init__(self,
                cell,
                return_sequences=False,
@@ -746,35 +763,12 @@ class RNN(Layer):
            training=None,
            initial_state=None,
            constants=None):
-    # input shape: `(samples, time (padded with zeros), input_dim)`
-    # note that the .build() method of subclasses MUST define
-    # self.input_spec and self.state_spec with complete input shapes.
-    if isinstance(inputs, list):
-      # get initial_state from full input spec
-      # as they could be copied to multiple GPU.
-      if self._num_constants is None:
-        initial_state = inputs[1:]
-      else:
-        initial_state = inputs[1:-self._num_constants]
-        constants = inputs[-self._num_constants:]
-      if len(initial_state) == 0:
-        initial_state = None
-      inputs = inputs[0]
-    if initial_state is not None:
-      pass
-    elif self.stateful:
-      initial_state = self.states
-    else:
-      initial_state = self.get_initial_state(inputs)
+    inputs, initial_state, constants = self._process_inputs(
+        inputs, initial_state, constants)
 
     if isinstance(mask, list):
       mask = mask[0]
 
-    if len(initial_state) != len(self.states):
-      raise ValueError(
-          'Layer has ' + str(len(self.states)) + ' states but was passed ' +
-          str(len(initial_state)) + ' initial states.')
-
     if nest.is_sequence(inputs):
       # In the case of nested input, use the first element for shape check.
       input_shape = K.int_shape(nest.flatten(inputs)[0])
@@ -854,6 +848,34 @@ class RNN(Layer):
     else:
       return output
 
+  def _process_inputs(self, inputs, initial_state, constants):
+    # input shape: `(samples, time (padded with zeros), input_dim)`
+    # note that the .build() method of subclasses MUST define
+    # self.input_spec and self.state_spec with complete input shapes.
+    if isinstance(inputs, list):
+      # get initial_state from full input spec
+      # as they could be copied to multiple GPU.
+      if self._num_constants is None:
+        initial_state = inputs[1:]
+      else:
+        initial_state = inputs[1:-self._num_constants]
+        constants = inputs[-self._num_constants:]
+      if len(initial_state) == 0:
+        initial_state = None
+      inputs = inputs[0]
+    if initial_state is not None:
+      pass
+    elif self.stateful:
+      initial_state = self.states
+    else:
+      initial_state = self.get_initial_state(inputs)
+
+    if len(initial_state) != len(self.states):
+      raise ValueError('Layer has ' + str(len(self.states)) +
+                       ' states but was passed ' + str(len(initial_state)) +
+                       ' initial states.')
+    return inputs, initial_state, constants
+
   def reset_states(self, states=None):
     if not self.stateful:
       raise AttributeError('Layer must be stateful.')
@@ -1485,12 +1507,6 @@ class GRUCell(Layer):
                                   initializer=self.bias_initializer,
                                   regularizer=self.bias_regularizer,
                                   constraint=self.bias_constraint)
-      if not self.reset_after:
-        self.input_bias, self.recurrent_bias = self.bias, None
-      else:
-        self.input_bias = K.flatten(self.bias[0])
-        self.recurrent_bias = K.flatten(self.bias[1])
-
     else:
       self.bias = None
     self.built = True
@@ -1517,6 +1533,12 @@ class GRUCell(Layer):
     # dropout matrices for recurrent units
     rec_dp_mask = self._recurrent_dropout_mask
 
+    if self.use_bias:
+      if not self.reset_after:
+        input_bias, recurrent_bias = self.bias, None
+      else:
+        input_bias, recurrent_bias = array_ops.unstack(self.bias)
+
     if self.implementation == 1:
       if 0. < self.dropout < 1.:
         inputs_z = inputs * dp_mask[0]
@@ -1532,9 +1554,9 @@ class GRUCell(Layer):
       x_h = K.dot(inputs_h, self.kernel[:, self.units * 2:])
 
       if self.use_bias:
-        x_z = K.bias_add(x_z, self.input_bias[:self.units])
-        x_r = K.bias_add(x_r, self.input_bias[self.units: self.units * 2])
-        x_h = K.bias_add(x_h, self.input_bias[self.units * 2:])
+        x_z = K.bias_add(x_z, input_bias[:self.units])
+        x_r = K.bias_add(x_r, input_bias[self.units: self.units * 2])
+        x_h = K.bias_add(x_h, input_bias[self.units * 2:])
 
       if 0. < self.recurrent_dropout < 1.:
         h_tm1_z = h_tm1 * rec_dp_mask[0]
@@ -1549,10 +1571,9 @@ class GRUCell(Layer):
       recurrent_r = K.dot(h_tm1_r,
                           self.recurrent_kernel[:, self.units:self.units * 2])
       if self.reset_after and self.use_bias:
-        recurrent_z = K.bias_add(recurrent_z, self.recurrent_bias[:self.units])
+        recurrent_z = K.bias_add(recurrent_z, recurrent_bias[:self.units])
         recurrent_r = K.bias_add(recurrent_r,
-                                 self.recurrent_bias[self.units:
-                                                     self.units * 2])
+                                 recurrent_bias[self.units:self.units * 2])
 
       z = self.recurrent_activation(x_z + recurrent_z)
       r = self.recurrent_activation(x_r + recurrent_r)
@@ -1561,8 +1582,7 @@ class GRUCell(Layer):
       if self.reset_after:
         recurrent_h = K.dot(h_tm1_h, self.recurrent_kernel[:, self.units * 2:])
         if self.use_bias:
-          recurrent_h = K.bias_add(recurrent_h,
-                                   self.recurrent_bias[self.units * 2:])
+          recurrent_h = K.bias_add(recurrent_h, recurrent_bias[self.units * 2:])
         recurrent_h = r * recurrent_h
       else:
         recurrent_h = K.dot(r * h_tm1_h,
@@ -1577,7 +1597,7 @@ class GRUCell(Layer):
       matrix_x = K.dot(inputs, self.kernel)
       if self.use_bias:
         # biases: bias_z_i, bias_r_i, bias_h_i
-        matrix_x = K.bias_add(matrix_x, self.input_bias)
+        matrix_x = K.bias_add(matrix_x, input_bias)
 
       x_z = matrix_x[:, :self.units]
       x_r = matrix_x[:, self.units: 2 * self.units]
@@ -1590,7 +1610,7 @@ class GRUCell(Layer):
         # hidden state projected by all gate matrices at once
         matrix_inner = K.dot(h_tm1, self.recurrent_kernel)
         if self.use_bias:
-          matrix_inner = K.bias_add(matrix_inner, self.recurrent_bias)
+          matrix_inner = K.bias_add(matrix_inner, recurrent_bias)
       else:
         # hidden state projected separately for update/reset and new
         matrix_inner = K.dot(h_tm1, self.recurrent_kernel[:, :2 * self.units])
@@ -1643,7 +1663,7 @@ class GRUCell(Layer):
     return _generate_zero_filled_state_for_cell(self, inputs, batch_size, dtype)
 
 
-@tf_export('keras.layers.GRU')
+@tf_export(v1=['keras.layers.GRU'])
 class GRU(RNN):
   """Gated Recurrent Unit - Cho et al. 2014.
 
@@ -1902,6 +1922,391 @@ class GRU(RNN):
     return cls(**config)
 
 
+@tf_export('keras.layers.GRU', v1=[])
+class UnifiedGRU(GRU):
+  """Gated Recurrent Unit - Cho et al. 2014.
+
+  `UnifiedGRU` unifies the implementations between standard `GRU` layer and
+  `CuDNNGRU` layer. Based on available runtime hardware and constraints,
+  `UnifiedGRU` will choose different implementations to maximize the
+  performance. For instance, if GPU is available and all the parameters meet the
+  requirement of CuDNN kernel, `UnifiedGRU` will use CuDNN kernel for the
+  calculation. The requirements to use CuDNN kernel are:
+
+    1. `activation` == 'tanh'
+    2. `recurrent_activation` == 'sigmoid'
+    3. `recurrent_dropout` == 0
+    4. `unroll` is False
+    5. `use_bias` is True
+    6. `reset_after` is True
+    7. Use masking in previous layers.
+
+  There are two variants. The default one is based on
+  [v3](https://arxiv.org/abs/1406.1078v3) and has reset gate applied to hidden
+  state before matrix multiplication. The other one is based on
+  [original](https://arxiv.org/abs/1406.1078v1) and has the order reversed.
+
+  The second variant is compatible with CuDNNGRU (GPU-only) and allows
+  inference on CPU. Thus it has separate biases for `kernel` and
+  `recurrent_kernel`. Use `'reset_after'=True` and
+  `recurrent_activation='sigmoid'`.
+
+  Arguments:
+      units: Positive integer, dimensionality of the output space.
+      activation: Activation function to use.
+          Default: hyperbolic tangent (`tanh`).
+          If you pass `None`, no activation is applied
+          (ie. "linear" activation: `a(x) = x`).
+      recurrent_activation: Activation function to use
+          for the recurrent step.
+          Default: sigmoid (`sigmoid`).
+          If you pass `None`, no activation is applied
+          (ie. "linear" activation: `a(x) = x`).
+      use_bias: Boolean, whether the layer uses a bias vector.
+      kernel_initializer: Initializer for the `kernel` weights matrix,
+          used for the linear transformation of the inputs.
+      recurrent_initializer: Initializer for the `recurrent_kernel`
+          weights matrix,
+          used for the linear transformation of the recurrent state.
+      bias_initializer: Initializer for the bias vector.
+      kernel_regularizer: Regularizer function applied to
+          the `kernel` weights matrix.
+      recurrent_regularizer: Regularizer function applied to
+          the `recurrent_kernel` weights matrix.
+      bias_regularizer: Regularizer function applied to the bias vector.
+      activity_regularizer: Regularizer function applied to
+          the output of the layer (its "activation")..
+      kernel_constraint: Constraint function applied to
+          the `kernel` weights matrix.
+      recurrent_constraint: Constraint function applied to
+          the `recurrent_kernel` weights matrix.
+      bias_constraint: Constraint function applied to the bias vector.
+      dropout: Float between 0 and 1.
+          Fraction of the units to drop for
+          the linear transformation of the inputs.
+      recurrent_dropout: Float between 0 and 1.
+          Fraction of the units to drop for
+          the linear transformation of the recurrent state.
+      implementation: Implementation mode, either 1 or 2.
+          Mode 1 will structure its operations as a larger number of
+          smaller dot products and additions, whereas mode 2 will
+          batch them into fewer, larger operations. These modes will
+          have different performance profiles on different hardware and
+          for different applications.
+      return_sequences: Boolean. Whether to return the last output
+          in the output sequence, or the full sequence.
+      return_state: Boolean. Whether to return the last state
+          in addition to the output.
+      go_backwards: Boolean (default False).
+          If True, process the input sequence backwards and return the
+          reversed sequence.
+      stateful: Boolean (default False). If True, the last state
+          for each sample at index i in a batch will be used as initial
+          state for the sample of index i in the following batch.
+      unroll: Boolean (default False).
+          If True, the network will be unrolled,
+          else a symbolic loop will be used.
+          Unrolling can speed-up a RNN,
+          although it tends to be more memory-intensive.
+          Unrolling is only suitable for short sequences.
+      reset_after: GRU convention (whether to apply reset gate after or
+          before matrix multiplication). False = "before",
+          True = "after" (default and CuDNN compatible).
+  """
+
+  def __init__(self,
+               units,
+               activation='tanh',
+               recurrent_activation='sigmoid',
+               use_bias=True,
+               kernel_initializer='glorot_uniform',
+               recurrent_initializer='orthogonal',
+               bias_initializer='zeros',
+               kernel_regularizer=None,
+               recurrent_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               kernel_constraint=None,
+               recurrent_constraint=None,
+               bias_constraint=None,
+               dropout=0.,
+               recurrent_dropout=0.,
+               implementation=1,
+               return_sequences=False,
+               return_state=False,
+               go_backwards=False,
+               stateful=False,
+               unroll=False,
+               time_major=False,
+               reset_after=True,
+               **kwargs):
+    # return_runtime is a flag for testing, which shows the real backend
+    # implementation chosen by grappler in graph mode.
+    self._return_runtime = kwargs.pop('return_runtime', False)
+
+    super(UnifiedGRU, self).__init__(
+        units,
+        activation=activation,
+        recurrent_activation=recurrent_activation,
+        use_bias=use_bias,
+        kernel_initializer=kernel_initializer,
+        recurrent_initializer=recurrent_initializer,
+        bias_initializer=bias_initializer,
+        kernel_regularizer=kernel_regularizer,
+        recurrent_regularizer=recurrent_regularizer,
+        bias_regularizer=bias_regularizer,
+        activity_regularizer=activity_regularizer,
+        kernel_constraint=kernel_constraint,
+        recurrent_constraint=recurrent_constraint,
+        bias_constraint=bias_constraint,
+        dropout=dropout,
+        recurrent_dropout=recurrent_dropout,
+        implementation=implementation,
+        return_sequences=return_sequences,
+        return_state=return_state,
+        go_backwards=go_backwards,
+        stateful=stateful,
+        unroll=unroll,
+        time_major=time_major,
+        reset_after=reset_after,
+        **kwargs)
+    self._dropout_mask = None
+    # CuDNN uses following setting by default and not configurable.
+    self.could_use_cudnn = (
+        activation == 'tanh' and recurrent_activation == 'sigmoid' and
+        recurrent_dropout == 0 and not unroll and use_bias and
+        reset_after is True)
+
+  def call(self, inputs, mask=None, training=None, initial_state=None):
+    # GRU does not support constants. Ignore it during process.
+    inputs, initial_state, _ = self._process_inputs(inputs, initial_state, None)
+
+    if isinstance(mask, list):
+      mask = mask[0]
+
+    input_shape = K.int_shape(inputs)
+    timesteps = input_shape[0] if self.time_major else input_shape[1]
+
+    if mask is not None or not self.could_use_cudnn:
+      # CuDNN does not support masking, fall back to use the normal GRU.
+      kwargs = {'training': training}
+      self.cell._dropout_mask = None
+      self.cell._recurrent_dropout_mask = None
+
+      def step(cell_inputs, cell_states):
+        return self.cell.call(cell_inputs, cell_states, **kwargs)
+
+      last_output, outputs, states = K.rnn(
+          step,
+          inputs,
+          initial_state,
+          constants=None,
+          go_backwards=self.go_backwards,
+          mask=mask,
+          unroll=self.unroll,
+          input_length=timesteps,
+          time_major=self.time_major,
+          zero_output_for_mask=self.zero_output_for_mask)
+      # This is a dummy tensor for testing purpose.
+      runtime = constant_op.constant(
+          'unknown', dtype=dtypes.string, name='runtime')
+    else:
+      last_output, outputs, runtime, states = self._defun_gru_call(
+          inputs, initial_state, training)
+
+    if self.stateful:
+      updates = [state_ops.assign(self.states[0], states[0])]
+      self.add_update(updates, inputs)
+
+    if self.return_sequences:
+      output = outputs
+    else:
+      output = last_output
+
+    if self.return_state:
+      return [output] + states
+    elif self._return_runtime:
+      return output, runtime
+    else:
+      return output
+
+  def _defun_gru_call(self, inputs, initial_state, training):
+    # Use the new defun approach for backend implementation swap.
+    # Note that different implementations need to have same function
+    # signature, eg, the tensor parameters need to have same shape and dtypes.
+    if self.go_backwards:
+      # Reverse time axis.
+      inputs = K.reverse(inputs, 0 if self.time_major else 1)
+    if 0 < self.dropout < 1:
+      if self._dropout_mask is None:
+        self._dropout_mask = _generate_dropout_mask(
+            array_ops.ones_like(inputs),
+            self.dropout,
+            training=training,
+            count=3)
+
+      inputs *= self._dropout_mask[0]
+    experimental_api_name = 'gru_' + str(uuid.uuid4())
+    defun_standard_gru = _generate_defun_backend(
+        experimental_api_name, _CPU_DEVICE_NAME, standard_gru)
+    defun_cudnn_gru = _generate_defun_backend(
+        experimental_api_name, _GPU_DEVICE_NAME, cudnn_gru)
+    if ops.executing_eagerly_outside_functions():
+      # Under eager context, the device placement is already known. Prefer the
+      # GPU implementation when GPU is available.
+      if context.num_gpus() > 0:
+        last_output, outputs, new_h, runtime = defun_cudnn_gru(
+            inputs=inputs,
+            init_h=initial_state[0],
+            kernel=self.cell.kernel,
+            recurrent_kernel=self.cell.recurrent_kernel,
+            bias=self.cell.bias,
+            time_major=self.time_major)
+      else:
+        last_output, outputs, new_h, runtime = defun_standard_gru(
+            inputs=inputs,
+            init_h=initial_state[0],
+            kernel=self.cell.kernel,
+            recurrent_kernel=self.cell.recurrent_kernel,
+            bias=self.cell.bias,
+            activation=self.activation,
+            recurrent_activation=self.recurrent_activation,
+            time_major=self.time_major)
+    else:
+      # Call the normal GRU impl and register the CuDNN impl function. The
+      # grappler will kick in during session execution to optimize the graph.
+      last_output, outputs, new_h, runtime = defun_standard_gru(
+          inputs=inputs,
+          init_h=initial_state[0],
+          kernel=self.cell.kernel,
+          recurrent_kernel=self.cell.recurrent_kernel,
+          bias=self.cell.bias,
+          activation=self.activation,
+          recurrent_activation=self.recurrent_activation,
+          time_major=self.time_major)
+
+      function.register(defun_cudnn_gru, inputs, initial_state[0],
+                        self.cell.kernel, self.cell.recurrent_kernel,
+                        self.cell.bias, self.time_major)
+    states = [new_h]
+    return last_output, outputs, runtime, states
+
+
+def standard_gru(inputs, init_h, kernel, recurrent_kernel, bias, activation,
+                 recurrent_activation, time_major):
+  """GRU with standard kernel implementation.
+
+  This implementation can be run on all types of hardware.
+
+  This implementation lifts out all the layer weights and make them function
+  parameters. It has same number of tensor input params as the CuDNN
+  counterpart. The RNN step logic has been simplified, eg dropout and mask is
+  removed since CuDNN implementation does not support that.
+
+  Args:
+    inputs: input tensor of GRU layer.
+    init_h: initial state tensor for the cell output.
+    kernel: weights for cell kernel.
+    recurrent_kernel: weights for cell recurrent kernel.
+    bias: weights for cell kernel bias and recurrent bias. The bias contains the
+      combined input_bias and recurrent_bias.
+    activation: Activation function to use for output.
+    recurrent_activation: Activation function to use for hidden recurrent state.
+    time_major: boolean, whether the inputs are in the format of
+      [time, batch, feature] or [batch, time, feature].
+
+  Returns:
+    last_output: output tensor for the last timestep, which has shape
+      [batch, units].
+    outputs: output tensor for all timesteps, which has shape
+      [batch, time, units].
+    state_0: the cell output, which has same shape as init_h.
+    runtime: constant string tensor which indicate real runtime hardware. This
+      value is for testing purpose and should be used by user.
+  """
+  input_shape = K.int_shape(inputs)
+  timesteps = input_shape[0] if time_major else input_shape[1]
+
+  input_bias, recurrent_bias = array_ops.unstack(bias)
+
+  def step(cell_inputs, cell_states):
+    """Step function that will be used by Keras RNN backend."""
+    h_tm1 = cell_states[0]
+
+    # inputs projected by all gate matrices at once
+    matrix_x = K.dot(cell_inputs, kernel)
+    matrix_x = K.bias_add(matrix_x, input_bias)
+
+    x_z, x_r, x_h = array_ops.split(matrix_x, 3, axis=1)
+
+    # hidden state projected by all gate matrices at once
+    matrix_inner = K.dot(h_tm1, recurrent_kernel)
+    matrix_inner = K.bias_add(matrix_inner, recurrent_bias)
+
+    recurrent_z, recurrent_r, recurrent_h = array_ops.split(matrix_inner, 3,
+                                                            axis=1)
+    z = recurrent_activation(x_z + recurrent_z)
+    r = recurrent_activation(x_r + recurrent_r)
+    hh = activation(x_h + r * recurrent_h)
+
+    # previous and candidate state mixed by update gate
+    h = z * h_tm1 + (1 - z) * hh
+    return h, [h]
+
+  last_output, outputs, new_states = K.rnn(
+      step,
+      inputs, [init_h],
+      constants=None,
+      unroll=False,
+      time_major=time_major,
+      input_length=timesteps)
+  return last_output, outputs, new_states[0], constant_op.constant(
+      'cpu', dtype=dtypes.string, name='runtime')
+
+
+def cudnn_gru(inputs, init_h, kernel, recurrent_kernel, bias, time_major):
+  """GRU with CuDNN implementation which is only available for GPU."""
+  if not time_major:
+    inputs = array_ops.transpose(inputs, perm=(1, 0, 2))
+  init_h = array_ops.expand_dims(init_h, axis=0)
+
+  weights = array_ops.split(kernel, 3, axis=1)
+  weights += array_ops.split(recurrent_kernel, 3, axis=1)
+  # Note that the bias was initialized as shape (2, 3 * units), flat it into
+  # (6 * units)
+  bias = array_ops.split(K.flatten(bias), 6)
+  # Note that the gate order for CuDNN is different from the canonical format.
+  # canonical format is [z, r, h], whereas CuDNN is [r, z, h]. The swap need to
+  # be done for kernel, recurrent_kernel, input_bias, recurrent_bias.
+  # z is update gate weights.
+  # r is reset gate weights.
+  # h is output gate weights.
+  weights[0], weights[1] = weights[1], weights[0]
+  weights[3], weights[4] = weights[4], weights[3]
+  bias[0], bias[1] = bias[1], bias[0]
+  bias[3], bias[4] = bias[4], bias[3]
+
+  params = _canonical_to_params(
+      weights=weights,
+      biases=bias,
+      shape=constant_op.constant([-1]),
+      transpose_weights=True)
+
+  outputs, h, _, _ = gen_cudnn_rnn_ops.cudnn_rnn(
+      inputs,
+      input_h=init_h,
+      input_c=0,
+      params=params,
+      is_training=True,
+      rnn_mode='gru')
+  last_output = outputs[-1]
+  if not time_major:
+    outputs = array_ops.transpose(outputs, perm=[1, 0, 2])
+  h = h[0]
+  return last_output, outputs, h, constant_op.constant(
+      'cudnn', dtype=dtypes.string, name='runtime')
+
+
 @tf_export('keras.layers.LSTMCell')
 class LSTMCell(Layer):
   """Cell class for the LSTM layer.
@@ -2262,7 +2667,7 @@ class PeepholeLSTMCell(LSTMCell):
     return c, o
 
 
-@tf_export('keras.layers.LSTM')
+@tf_export(v1=['keras.layers.LSTM'])
 class LSTM(RNN):
   """Long Short-Term Memory layer - Hochreiter 1997.
 
@@ -2521,6 +2926,376 @@ class LSTM(RNN):
     return cls(**config)
 
 
+@tf_export('keras.layers.LSTM', v1=[])
+class UnifiedLSTM(LSTM):
+  """Long Short-Term Memory layer - Hochreiter 1997.
+
+  `UnifiedLSTM` unifies the implementations between standard `LSTM` layer and
+  `CuDNNLSTM` layer. Based on available runtime hardware and constrains,
+  `UnifiedLSTM` will choose different implementations to maximize the
+  performance. For instance, if GPU is available and all the parameters meet the
+  requirement of CuDNN kernel, `UnifiedLSTM` will use CuDNN kernel for the
+  calculation.
+
+  Arguments:
+    units: Positive integer, dimensionality of the output space.
+    activation: Activation function to use.
+      Default: hyperbolic tangent (`tanh`). If you pass `None`, no activation
+      is applied (ie. "linear" activation: `a(x) = x`).
+    recurrent_activation: Activation function to use for the recurrent step.
+      Default: sigmoid (`sigmoid`). If you pass `None`, no activation is
+      applied (ie. "linear" activation: `a(x) = x`).
+    use_bias: Boolean, whether the layer uses a bias vector.
+    kernel_initializer: Initializer for the `kernel` weights matrix, used for
+      the linear transformation of the inputs..
+    recurrent_initializer: Initializer for the `recurrent_kernel` weights
+      matrix, used for the linear transformation of the recurrent state..
+    bias_initializer: Initializer for the bias vector.
+    unit_forget_bias: Boolean. If True, add 1 to the bias of the forget gate at
+      initialization. Setting it to true will also force
+      `bias_initializer="zeros"`. This is recommended in [Jozefowicz et
+          al.](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf)
+    kernel_regularizer: Regularizer function applied to the `kernel` weights
+      matrix.
+    recurrent_regularizer: Regularizer function applied to the
+      `recurrent_kernel` weights matrix.
+    bias_regularizer: Regularizer function applied to the bias vector.
+    activity_regularizer: Regularizer function applied to the output of the
+      layer (its "activation")..
+    kernel_constraint: Constraint function applied to the `kernel` weights
+      matrix.
+    recurrent_constraint: Constraint function applied to the `recurrent_kernel`
+      weights matrix.
+    bias_constraint: Constraint function applied to the bias vector.
+    dropout: Float between 0 and 1. Fraction of the units to drop for the linear
+      transformation of the inputs.
+    recurrent_dropout: Float between 0 and 1. Fraction of the units to drop for
+      the linear transformation of the recurrent state.
+    implementation: Implementation mode, either 1 or 2. Mode 1 will structure
+      its operations as a larger number of smaller dot products and additions,
+      whereas mode 2 will batch them into fewer, larger operations. These modes
+      will have different performance profiles on different hardware and for
+      different applications.
+    return_sequences: Boolean. Whether to return the last output. in the output
+      sequence, or the full sequence.
+    return_state: Boolean. Whether to return the last state in addition to the
+      output.
+    go_backwards: Boolean (default False). If True, process the input sequence
+      backwards and return the reversed sequence.
+    stateful: Boolean (default False). If True, the last state for each sample
+      at index i in a batch will be used as initial state for the sample of
+      index i in the following batch.
+    unroll: Boolean (default False). If True, the network will be unrolled, else
+      a symbolic loop will be used. Unrolling can speed-up a RNN, although it
+      tends to be more memory-intensive. Unrolling is only suitable for short
+      sequences.
+  """
+
+  def __init__(self,
+               units,
+               activation='tanh',
+               recurrent_activation='sigmoid',
+               use_bias=True,
+               kernel_initializer='glorot_uniform',
+               recurrent_initializer='orthogonal',
+               bias_initializer='zeros',
+               unit_forget_bias=True,
+               kernel_regularizer=None,
+               recurrent_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               kernel_constraint=None,
+               recurrent_constraint=None,
+               bias_constraint=None,
+               dropout=0.,
+               recurrent_dropout=0.,
+               implementation=1,
+               return_sequences=False,
+               return_state=False,
+               go_backwards=False,
+               stateful=False,
+               time_major=False,
+               unroll=False,
+               **kwargs):
+    # return_runtime is a flag for testing, which shows the real backend
+    # implementation chosen by grappler in graph mode.
+    self.return_runtime = kwargs.pop('return_runtime', False)
+
+    super(UnifiedLSTM, self).__init__(
+        units,
+        activation=activation,
+        recurrent_activation=recurrent_activation,
+        use_bias=use_bias,
+        kernel_initializer=kernel_initializer,
+        recurrent_initializer=recurrent_initializer,
+        bias_initializer=bias_initializer,
+        unit_forget_bias=unit_forget_bias,
+        kernel_regularizer=kernel_regularizer,
+        recurrent_regularizer=recurrent_regularizer,
+        bias_regularizer=bias_regularizer,
+        activity_regularizer=activity_regularizer,
+        kernel_constraint=kernel_constraint,
+        recurrent_constraint=recurrent_constraint,
+        bias_constraint=bias_constraint,
+        dropout=dropout,
+        recurrent_dropout=recurrent_dropout,
+        implementation=implementation,
+        return_sequences=return_sequences,
+        return_state=return_state,
+        go_backwards=go_backwards,
+        stateful=stateful,
+        time_major=time_major,
+        unroll=unroll,
+        **kwargs)
+
+    self.state_spec = [
+        InputSpec(shape=(None, dim)) for dim in (self.units, self.units)
+    ]
+    self._dropout_mask = None
+    self.could_use_cudnn = (
+        activation == 'tanh' and recurrent_activation == 'sigmoid' and
+        recurrent_dropout == 0 and not unroll and use_bias)
+
+  def call(self, inputs, mask=None, training=None, initial_state=None):
+    # LSTM does not support constants. Ignore it during process.
+    inputs, initial_state, _ = self._process_inputs(inputs, initial_state, None)
+
+    if isinstance(mask, list):
+      mask = mask[0]
+
+    input_shape = K.int_shape(inputs)
+    timesteps = input_shape[0] if self.time_major else input_shape[1]
+
+    if mask is not None or not self.could_use_cudnn:
+      # CuDNN does not support masking, fall back to use the normal LSTM.
+      kwargs = {'training': training}
+
+      def step(inputs, states):
+        return self.cell.call(inputs, states, **kwargs)
+
+      last_output, outputs, states = K.rnn(
+          step,
+          inputs,
+          initial_state,
+          constants=None,
+          go_backwards=self.go_backwards,
+          mask=mask,
+          unroll=self.unroll,
+          input_length=timesteps,
+          time_major=self.time_major,
+          zero_output_for_mask=self.zero_output_for_mask)
+      runtime = constant_op.constant(
+          'unknown', dtype=dtypes.string, name='runtime')
+    else:
+      # Use the new defun approach for backend implementation swap.
+      # Note that different implementations need to have same function
+      # signature, eg, the tensor parameters need to have same shape and dtypes.
+      # Since the CuDNN has an extra set of bias, those bias will be passed to
+      # both normal and CuDNN implementations.
+      if self.go_backwards:
+        # Reverse time axis.
+        inputs = K.reverse(inputs, 0 if self.time_major else 1)
+
+      if 0 < self.dropout < 1:
+        if self._dropout_mask is None:
+          self._dropout_mask = _generate_dropout_mask(
+              array_ops.ones_like(inputs),
+              self.dropout,
+              training=training,
+              count=4)
+
+        inputs *= self._dropout_mask[0]
+
+      # Each time a defun function is called, we will give a unique identifiable
+      # API name, so that the grappler won't get confused when it sees multiple
+      # LSTM layer added into same graph, and it will be able to pair up the
+      # different implementations across them.
+      experimental_api_name = 'lstm_' + str(uuid.uuid4())
+      defun_standard_lstm = _generate_defun_backend(
+          experimental_api_name, _CPU_DEVICE_NAME, standard_lstm)
+      defun_cudnn_lstm = _generate_defun_backend(
+          experimental_api_name, _GPU_DEVICE_NAME, cudnn_lstm)
+
+      if ops.executing_eagerly_outside_functions():
+        # Under eager context, the device placement is already known. Prefer the
+        # GPU implementation here.
+        if context.num_gpus() > 0:
+          last_output, outputs, new_h, new_c, runtime = defun_cudnn_lstm(
+              inputs, initial_state[0], initial_state[1], self.cell.kernel,
+              self.cell.recurrent_kernel, self.cell.bias, self.time_major)
+        else:
+          last_output, outputs, new_h, new_c, runtime = defun_standard_lstm(
+              inputs, initial_state[0], initial_state[1], self.cell.kernel,
+              self.cell.recurrent_kernel, self.cell.bias, self.activation,
+              self.recurrent_activation, self.time_major)
+      else:
+        # Call the normal LSTM impl and register the CuDNN impl function. The
+        # grappler will kick in during session execution to optimize the graph.
+        last_output, outputs, new_h, new_c, runtime = defun_standard_lstm(
+            inputs, initial_state[0], initial_state[1], self.cell.kernel,
+            self.cell.recurrent_kernel, self.cell.bias, self.activation,
+            self.recurrent_activation, self.time_major)
+
+        function.register(defun_cudnn_lstm, inputs, initial_state[0],
+                          initial_state[1], self.cell.kernel,
+                          self.cell.recurrent_kernel, self.cell.bias,
+                          self.time_major)
+      states = [new_h, new_c]
+
+    if self.stateful:
+      updates = []
+      for i in range(len(states)):
+        updates.append(state_ops.assign(self.states[i], states[i]))
+      self.add_update(updates, inputs)
+
+    if self.return_sequences:
+      output = outputs
+    else:
+      output = last_output
+
+    if self.return_state:
+      return [output] + states
+    elif self.return_runtime:
+      return output, runtime
+    else:
+      return output
+
+
+def _canonical_to_params(weights, biases, shape, transpose_weights=False):
+  """Utility function convert variable to CuDNN compatible parameter.
+
+  Note that Keras weights for kernels are different from the CuDNN format. Eg.:
+
+  ```
+    Keras                 CuDNN
+    [[0, 1, 2],  <--->  [[0, 2, 4],
+     [3, 4, 5]]          [1, 3, 5]]
+  ```
+
+  If the input weights need to be in a unified format, then set
+  `transpose_weights=True` to convert the weights.
+
+  Args:
+    weights: list of weights for the individual kernels and recurrent kernels.
+    biases: list of biases for individual gate.
+    shape: the shape for the converted variables that will be feed to CuDNN.
+    transpose_weights: boolean, whether to transpose the weights.
+
+  Returns:
+    The converted weights that can be feed to CuDNN ops as param.
+  """
+  def convert(w):
+    return array_ops.transpose(w) if transpose_weights else w
+
+  weights = [array_ops.reshape(convert(x), shape) for x in weights]
+  biases = [array_ops.reshape(x, shape) for x in biases]
+  return array_ops.concat(weights + biases, axis=0)
+
+
+def standard_lstm(inputs, init_h, init_c, kernel, recurrent_kernel, bias,
+                  activation, recurrent_activation, time_major):
+  """LSTM with standard kernel implementation.
+
+  This implementation can be run on all types for hardware.
+
+  This implementation lifts out all the layer weights and make them function
+  parameters. It has same number of tensor input params as the CuDNN
+  counterpart. The RNN step logic has been simplified, eg dropout and mask is
+  removed since CuDNN implementation does not support that.
+
+  Note that the first half of the bias tensor should be ignored by this impl.
+  The CuDNN impl need an extra set of input gate bias. In order to make the both
+  function take same shape of parameter, that extra set of bias is also feed
+  here.
+
+  Args:
+    inputs: input tensor of LSTM layer.
+    init_h: initial state tensor for the cell output.
+    init_c: initial state tensor for the cell hidden state.
+    kernel: weights for cell kernel.
+    recurrent_kernel: weights for cell recurrent kernel.
+    bias: weights for cell kernel bias and recurrent bias. Only recurrent bias
+      is used in this case.
+    activation: Activation function to use for output.
+    recurrent_activation: Activation function to use for hidden recurrent state.
+    time_major: boolean, whether the inputs are in the format of
+      [time, batch, feature] or [batch, time, feature].
+
+  Returns:
+    last_output: output tensor for the last timestep, which has shape
+      [batch, units].
+    outputs: output tensor for all timesteps, which has shape
+      [batch, time, units].
+    state_0: the cell output, which has same shape as init_h.
+    state_1: the cell hidden state, which has same shape as init_c.
+    runtime: constant string tensor which indicate real runtime hardware. This
+      value is for testing purpose and should be used by user.
+  """
+  input_shape = K.int_shape(inputs)
+  timesteps = input_shape[0] if time_major else input_shape[1]
+
+  def step(cell_inputs, cell_states):
+    """Step function that will be used by Keras RNN backend."""
+    h_tm1 = cell_states[0]  # previous memory state
+    c_tm1 = cell_states[1]  # previous carry state
+
+    z = K.dot(cell_inputs, kernel)
+    z += K.dot(h_tm1, recurrent_kernel)
+    z = K.bias_add(z, bias)
+
+    z0, z1, z2, z3 = array_ops.split(z, 4, axis=1)
+
+    i = recurrent_activation(z0)
+    f = recurrent_activation(z1)
+    c = f * c_tm1 + i * activation(z2)
+    o = recurrent_activation(z3)
+
+    h = o * activation(c)
+    return h, [h, c]
+
+  last_output, outputs, new_states = K.rnn(
+      step,
+      inputs, [init_h, init_c],
+      constants=None,
+      unroll=False,
+      time_major=time_major,
+      input_length=timesteps)
+  return last_output, outputs, new_states[0], new_states[
+      1], constant_op.constant('cpu', dtype=dtypes.string, name='runtime')
+
+
+def cudnn_lstm(inputs, input_h, input_c, kernel, recurrent_kernel, bias,
+               time_major):
+  """LSTM with CuDNN implementation which is only available for GPU."""
+  if not time_major:
+    inputs = array_ops.transpose(inputs, perm=(1, 0, 2))
+  input_h = array_ops.expand_dims(input_h, axis=0)
+  input_c = array_ops.expand_dims(input_c, axis=0)
+
+  weights = array_ops.split(kernel, 4, axis=1)
+  weights += array_ops.split(recurrent_kernel, 4, axis=1)
+  # CuDNN has an extra set of bias for inputs, we disable them (setting to 0),
+  # so that mathematically it is same as the canonical LSTM implementation.
+  full_bias = array_ops.concat((array_ops.zeros_like(bias), bias), 0)
+
+  params = _canonical_to_params(
+      weights=weights,
+      biases=array_ops.split(full_bias, 8),
+      shape=constant_op.constant([-1]),
+      transpose_weights=True)
+
+  outputs, h, c, _ = gen_cudnn_rnn_ops.cudnn_rnn(
+      inputs, input_h=input_h, input_c=input_c, params=params, is_training=True)
+  last_output = outputs[-1]
+  if not time_major:
+    outputs = array_ops.transpose(outputs, perm=[1, 0, 2])
+  h = h[0]
+  c = c[0]
+
+  return last_output, outputs, h, c, constant_op.constant(
+      'cudnn', dtype=dtypes.string, name='runtime')
+
+
 def _generate_dropout_mask(ones, rate, training=None, count=1):
   def dropped_inputs():
     return K.dropout(ones, rate)
@@ -2627,3 +3402,12 @@ def _generate_zero_filled_state(batch_size_tensor, state_size, dtype):
     return nest.map_structure(create_zeros, state_size)
   else:
     return create_zeros(state_size)
+
+
+def _generate_defun_backend(unique_api_name, preferred_device, func):
+  function_attributes = {
+      _DEFUN_API_NAME_ATTRIBUTE: unique_api_name,
+      _DEFUN_DEVICE_ATTRIBUTE: preferred_device,
+  }
+  return function.defun_with_attributes(func=func,
+                                        attributes=function_attributes)
diff --git a/tensorflow/python/keras/layers/recurrent_test.py b/tensorflow/python/keras/layers/recurrent_test.py
index b1449069e3279e27b08ecc383e72aed63525e521..e9bf788740fcb355a3cb2143ca4f0db1dcc8b802 100644
--- a/tensorflow/python/keras/layers/recurrent_test.py
+++ b/tensorflow/python/keras/layers/recurrent_test.py
@@ -30,7 +30,8 @@ from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.framework import test_util
+from tensorflow.python.keras import keras_parameterized
+from tensorflow.python.keras import testing_utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
@@ -48,8 +49,8 @@ NestedInput = collections.namedtuple('NestedInput', ['t1', 't2'])
 NestedState = collections.namedtuple('NestedState', ['s1', 's2'])
 
 
-@test_util.run_all_in_graph_and_eager_modes
-class RNNTest(test.TestCase):
+@keras_parameterized.run_all_keras_modes
+class RNNTest(keras_parameterized.TestCase):
 
   def test_minimal_rnn_cell_non_layer(self):
 
@@ -73,7 +74,8 @@ class RNNTest(test.TestCase):
     y = layer(x)
     model = keras.models.Model(x, y)
     model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+                  loss='mse',
+                  run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32)))
 
     # Test stacking.
@@ -84,7 +86,8 @@ class RNNTest(test.TestCase):
     y = layer(x)
     model = keras.models.Model(x, y)
     model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+                  loss='mse',
+                  run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32)))
 
   def test_minimal_rnn_cell_non_layer_multiple_states(self):
@@ -112,7 +115,8 @@ class RNNTest(test.TestCase):
     y = layer(x)
     model = keras.models.Model(x, y)
     model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+                  loss='mse',
+                  run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32)))
 
     # Test stacking.
@@ -125,7 +129,8 @@ class RNNTest(test.TestCase):
     y = layer(x)
     model = keras.models.Model(x, y)
     model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+                  loss='mse',
+                  run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32)))
 
   def test_minimal_rnn_cell_layer(self):
@@ -165,7 +170,8 @@ class RNNTest(test.TestCase):
     y = layer(x)
     model = keras.models.Model(x, y)
     model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+                  loss='mse',
+                  run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32)))
 
     # Test basic case serialization.
@@ -189,7 +195,8 @@ class RNNTest(test.TestCase):
     y = layer(x)
     model = keras.models.Model(x, y)
     model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+                  loss='mse',
+                  run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32)))
 
     # Test stacked RNN serialization.
@@ -228,7 +235,8 @@ class RNNTest(test.TestCase):
 
     model = keras.models.Model(x, y)
     model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+                  loss='mse',
+                  run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(
         np.zeros((batch, time_step, embedding_dim)),
         np.zeros((batch, time_step, units)))
@@ -246,7 +254,8 @@ class RNNTest(test.TestCase):
     y = keras.layers.Lambda(lambda t: array_ops.transpose(t, [1, 0, 2]))(y)
     model = keras.models.Model(x, y)
     model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+                  loss='mse',
+                  run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(
         np.zeros((batch, time_step, embedding_dim)),
         np.zeros((batch, time_step, cell_units[-1])))
@@ -261,7 +270,8 @@ class RNNTest(test.TestCase):
     y = keras.layers.Lambda(lambda t: array_ops.transpose(t, [1, 0, 2]))(rnn)
     model = keras.models.Model(x, y)
     model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+                  loss='mse',
+                  run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(
         np.zeros((batch, time_step, embedding_dim)),
         np.zeros((batch, time_step, units)))
@@ -273,7 +283,8 @@ class RNNTest(test.TestCase):
 
     model = keras.models.Model(x, y)
     model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+                  loss='mse',
+                  run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(
         np.zeros((batch, time_step, embedding_dim)),
         np.zeros((batch, time_step, units)))
@@ -347,7 +358,8 @@ class RNNTest(test.TestCase):
 
     model = keras.models.Model([x, c], y)
     model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+                  loss='mse',
+                  run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(
         [np.zeros((6, 5, 5)), np.zeros((6, 3))],
         np.zeros((6, 32))
@@ -385,7 +397,8 @@ class RNNTest(test.TestCase):
     y = layer(x, constants=c)
     model = keras.models.Model([x, c], y)
     model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+                  loss='mse',
+                  run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(
         [np.zeros((6, 5, 5)), np.zeros((6, 3))],
         np.zeros((6, 32))
@@ -399,7 +412,8 @@ class RNNTest(test.TestCase):
     y = layer(x, constants=c)
     model = keras.models.Model([x, c], y)
     model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+                  loss='mse',
+                  run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(
         [np.zeros((6, 5, 5)), np.zeros((6, 3))],
         np.zeros((6, 32))
@@ -471,7 +485,8 @@ class RNNTest(test.TestCase):
     y = layer(x, initial_state=s, constants=c)
     model = keras.models.Model([x, s, c], y)
     model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+                  loss='mse',
+                  run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(
         [np.zeros((6, 5, 5)), np.zeros((6, 32)), np.zeros((6, 3))],
         np.zeros((6, 32))
@@ -601,7 +616,8 @@ class RNNTest(test.TestCase):
       y = layer(x)
       model = keras.models.Model(x, y)
       model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                    loss='mse')
+                    loss='mse',
+                    run_eagerly=testing_utils.should_run_eagerly())
 
       # Test basic case serialization.
       x_np = np.random.random((6, 5, 5))
@@ -623,7 +639,8 @@ class RNNTest(test.TestCase):
       y = layer(x)
       model = keras.models.Model(x, y)
       model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                    loss='mse')
+                    loss='mse',
+                    run_eagerly=testing_utils.should_run_eagerly())
 
       # Test stacked RNN serialization.
       x_np = np.random.random((6, 5, 5))
@@ -647,7 +664,7 @@ class RNNTest(test.TestCase):
     x = keras.Input((None, 5))
     y = layer(x)
     model = keras.models.Model(x, y)
-    model.compile('sgd', 'mse')
+    model.compile('sgd', 'mse', run_eagerly=testing_utils.should_run_eagerly())
     x_np = np.random.random((6, 5, 5))
     y_np = np.random.random((6, 3))
     model.train_on_batch(x_np, y_np)
@@ -690,7 +707,8 @@ class RNNTest(test.TestCase):
     model = keras.models.Sequential()
     model.add(rnn(2))
     model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+                  loss='mse',
+                  run_eagerly=testing_utils.should_run_eagerly())
     model.fit(x, y, epochs=1, batch_size=1)
 
     # check whether the model variables are present in the
@@ -723,7 +741,8 @@ class RNNTest(test.TestCase):
 
     model = keras.models.Model(x, y)
     model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+                  loss='mse',
+                  run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(
         np.zeros((batch, time_step, input_a, input_b)),
         np.zeros((batch, unit_a, unit_b)))
@@ -739,7 +758,8 @@ class RNNTest(test.TestCase):
     y = layer(x)
     model = keras.models.Model(x, y)
     model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+                  loss='mse',
+                  run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(
         np.zeros((batch, time_step, input_a, input_b)),
         np.zeros((batch, unit_a * 4, unit_b * 4)))
@@ -762,7 +782,8 @@ class RNNTest(test.TestCase):
 
     model = keras.models.Model([x, s], y)
     model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+                  loss='mse',
+                  run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch([
         np.zeros((batch, time_step, input_a, input_b)),
         np.zeros((batch, unit_a, unit_b))
@@ -799,7 +820,8 @@ class RNNTest(test.TestCase):
 
     model = keras.models.Model(x, y)
     model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+                  loss='mse',
+                  run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(
         np.zeros((batch, time_step, input_size)),
         np.zeros((batch, input_size)))
@@ -854,7 +876,8 @@ class RNNTest(test.TestCase):
 
     model = keras.models.Model((input_1, input_2), outputs)
     model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+                  loss='mse',
+                  run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(
         [np.zeros((batch, t, i1)), np.zeros((batch, t, i2, i3))],
         [np.zeros((batch, o1)), np.zeros((batch, o2, o3))])
@@ -875,7 +898,8 @@ class RNNTest(test.TestCase):
 
     model = keras.models.Model([input_1, input_2], outputs)
     model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+                  loss='mse',
+                  run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(
         [np.zeros((batch, t, i1)),
          np.zeros((batch, t, i2, i3))],
@@ -903,7 +927,8 @@ class RNNTest(test.TestCase):
 
     model = keras.models.Model([input_1, input_2], [output1, output2])
     model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+                  loss='mse',
+                  run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(
         [np.zeros((batch, t, i1)),
          np.zeros((batch, t, i2, i3))],
@@ -927,7 +952,8 @@ class RNNTest(test.TestCase):
 
     model = keras.models.Model([input_1, input_2], [output1, output2])
     model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+                  loss='mse',
+                  run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(
         [np.zeros((batch, t, i1)),
          np.zeros((batch, t, i2, i3))],
@@ -960,7 +986,8 @@ class RNNTest(test.TestCase):
     model = keras.models.Model([input_1, input_2, init_s1, init_s2],
                                [output1, output2])
     model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+                  loss='mse',
+                  run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(
         [np.zeros((batch, t, i1)),
          np.zeros((batch, t, i2, i3)),
@@ -991,7 +1018,8 @@ class RNNTest(test.TestCase):
     model = keras.models.Model([input_1, input_2, init_s1, init_s2],
                                [output1, output2])
     model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+                  loss='mse',
+                  run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(
         [np.zeros((batch, t, i1)),
          np.zeros((batch, t, i2, i3)),
@@ -1004,18 +1032,17 @@ class RNNTest(test.TestCase):
   def test_peephole_lstm_cell(self):
 
     def _run_cell(cell_fn, **kwargs):
-      with self.cached_session() as sess:
-        inputs = array_ops.one_hot([1, 2, 3, 4], 4)
-        cell = cell_fn(5, **kwargs)
-        cell.build(inputs.shape)
-        initial_state = cell.get_initial_state(
-            inputs=inputs, batch_size=4, dtype=dtypes.float32)
-        inputs, _ = cell(inputs, initial_state)
-        output = inputs
-        if not context.executing_eagerly():
-          self.evaluate(variables_lib.global_variables_initializer())
-          output = self.evaluate(output)
-        return output
+      inputs = array_ops.one_hot([1, 2, 3, 4], 4)
+      cell = cell_fn(5, **kwargs)
+      cell.build(inputs.shape)
+      initial_state = cell.get_initial_state(
+          inputs=inputs, batch_size=4, dtype=dtypes.float32)
+      inputs, _ = cell(inputs, initial_state)
+      output = inputs
+      if not context.executing_eagerly():
+        self.evaluate(variables_lib.global_variables_initializer())
+        output = self.evaluate(output)
+      return output
 
     random_seed.set_random_seed(12345)
     # `recurrent_activation` kwarg is set to sigmoid as that is hardcoded into
@@ -1067,7 +1094,8 @@ class RNNTest(test.TestCase):
         Cell(), return_state=True)(x_masked, initial_state=s_0)
     model = keras.models.Model([x, s_0], [y, s])
     model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+                  loss='mse',
+                  run_eagerly=testing_utils.should_run_eagerly())
 
     # last time step masked
     x_np = np.array([[[1.], [2.], [0.]]])
@@ -1091,7 +1119,8 @@ class RNNTest(test.TestCase):
       y = layer(masked_input)
       model = keras.models.Model(x, y)
       model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                    loss='mse')
+                    loss='mse',
+                    run_eagerly=testing_utils.should_run_eagerly())
 
       np_x = np.ones((6, 5, 5))
       result_1 = model.predict(np_x)
diff --git a/tensorflow/python/keras/layers/simplernn_test.py b/tensorflow/python/keras/layers/simplernn_test.py
index b49b159b7199cb29e2c719cfa2c7a415c445d475..b5063850f0cd56348ed477c598faef031c71ef8a 100644
--- a/tensorflow/python/keras/layers/simplernn_test.py
+++ b/tensorflow/python/keras/layers/simplernn_test.py
@@ -22,14 +22,15 @@ import numpy as np
 
 from tensorflow.python import keras
 from tensorflow.python.framework import test_util as tf_test_util
+from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
 from tensorflow.python.training import gradient_descent
 from tensorflow.python.training.rmsprop import RMSPropOptimizer
 
 
-@tf_test_util.run_all_in_graph_and_eager_modes
-class SimpleRNNLayerTest(test.TestCase):
+@keras_parameterized.run_all_keras_modes
+class SimpleRNNLayerTest(keras_parameterized.TestCase):
 
   def test_return_sequences_SimpleRNN(self):
     num_samples = 2
@@ -98,7 +99,6 @@ class SimpleRNNLayerTest(test.TestCase):
     self.assertEqual(layer.cell.recurrent_kernel.constraint, r_constraint)
     self.assertEqual(layer.cell.bias.constraint, b_constraint)
 
-  @tf_test_util.run_deprecated_v1
   def test_with_masking_layer_SimpleRNN(self):
     layer_class = keras.layers.SimpleRNN
     inputs = np.random.random((2, 3, 4))
@@ -118,93 +118,91 @@ class SimpleRNNLayerTest(test.TestCase):
       l2 = layer_class.from_config(l1.get_config())
       assert l1.get_config() == l2.get_config()
 
-
-class SimpleRNNLayerGraphOnlyTest(test.TestCase):
-
-  @tf_test_util.run_deprecated_v1
   def test_statefulness_SimpleRNN(self):
     num_samples = 2
     timesteps = 3
     embedding_dim = 4
     units = 2
     layer_class = keras.layers.SimpleRNN
-    with self.cached_session():
-      model = keras.models.Sequential()
-      model.add(
-          keras.layers.Embedding(
-              4,
-              embedding_dim,
-              mask_zero=True,
-              input_length=timesteps,
-              batch_input_shape=(num_samples, timesteps)))
-      layer = layer_class(
-          units, return_sequences=False, stateful=True, weights=None)
-      model.add(layer)
-      model.compile(optimizer=gradient_descent.GradientDescentOptimizer(0.01),
-                    loss='mse')
-      out1 = model.predict(np.ones((num_samples, timesteps)))
-      self.assertEqual(out1.shape, (num_samples, units))
-
-      # train once so that the states change
-      model.train_on_batch(
-          np.ones((num_samples, timesteps)), np.ones((num_samples, units)))
-      out2 = model.predict(np.ones((num_samples, timesteps)))
-
-      # if the state is not reset, output should be different
-      self.assertNotEqual(out1.max(), out2.max())
-
-      # check that output changes after states are reset
-      # (even though the model itself didn't change)
-      layer.reset_states()
-      out3 = model.predict(np.ones((num_samples, timesteps)))
-      self.assertNotEqual(out2.max(), out3.max())
-
-      # check that container-level reset_states() works
-      model.reset_states()
-      out4 = model.predict(np.ones((num_samples, timesteps)))
-      np.testing.assert_allclose(out3, out4, atol=1e-5)
-
-      # check that the call to `predict` updated the states
-      out5 = model.predict(np.ones((num_samples, timesteps)))
-      self.assertNotEqual(out4.max(), out5.max())
-
-      # Check masking
-      layer.reset_states()
-
-      left_padded_input = np.ones((num_samples, timesteps))
-      left_padded_input[0, :1] = 0
-      left_padded_input[1, :2] = 0
-      out6 = model.predict(left_padded_input)
-
-      layer.reset_states()
-
-      right_padded_input = np.ones((num_samples, timesteps))
-      right_padded_input[0, -1:] = 0
-      right_padded_input[1, -2:] = 0
-      out7 = model.predict(right_padded_input)
-
-      np.testing.assert_allclose(out7, out6, atol=1e-5)
+    model = keras.models.Sequential()
+    model.add(
+        keras.layers.Embedding(
+            4,
+            embedding_dim,
+            mask_zero=True,
+            input_length=timesteps,
+            batch_input_shape=(num_samples, timesteps)))
+    layer = layer_class(
+        units, return_sequences=False, stateful=True, weights=None)
+    model.add(layer)
+    model.compile(optimizer=gradient_descent.GradientDescentOptimizer(0.01),
+                  loss='mse')
+    out1 = model.predict(np.ones((num_samples, timesteps)))
+    self.assertEqual(out1.shape, (num_samples, units))
+
+    # train once so that the states change
+    model.train_on_batch(
+        np.ones((num_samples, timesteps)), np.ones((num_samples, units)))
+    out2 = model.predict(np.ones((num_samples, timesteps)))
+
+    # if the state is not reset, output should be different
+    self.assertNotEqual(out1.max(), out2.max())
+
+    # check that output changes after states are reset
+    # (even though the model itself didn't change)
+    layer.reset_states()
+    out3 = model.predict(np.ones((num_samples, timesteps)))
+    self.assertNotEqual(out2.max(), out3.max())
+
+    # check that container-level reset_states() works
+    model.reset_states()
+    out4 = model.predict(np.ones((num_samples, timesteps)))
+    np.testing.assert_allclose(out3, out4, atol=1e-5)
+
+    # check that the call to `predict` updated the states
+    out5 = model.predict(np.ones((num_samples, timesteps)))
+    self.assertNotEqual(out4.max(), out5.max())
 
+    # Check masking
+    layer.reset_states()
+
+    left_padded_input = np.ones((num_samples, timesteps))
+    left_padded_input[0, :1] = 0
+    left_padded_input[1, :2] = 0
+    out6 = model.predict(left_padded_input)
+
+    layer.reset_states()
+
+    right_padded_input = np.ones((num_samples, timesteps))
+    right_padded_input[0, -1:] = 0
+    right_padded_input[1, -2:] = 0
+    out7 = model.predict(right_padded_input)
+
+    np.testing.assert_allclose(out7, out6, atol=1e-5)
+
+
+class SimpleRNNLayerGraphOnlyTest(test.TestCase):
+
+  # b/120919032
   @tf_test_util.run_deprecated_v1
   def test_regularizers_SimpleRNN(self):
     embedding_dim = 4
     layer_class = keras.layers.SimpleRNN
-    with self.cached_session():
-      layer = layer_class(
-          5,
-          return_sequences=False,
-          weights=None,
-          input_shape=(None, embedding_dim),
-          kernel_regularizer=keras.regularizers.l1(0.01),
-          recurrent_regularizer=keras.regularizers.l1(0.01),
-          bias_regularizer='l2',
-          activity_regularizer='l1')
-      layer.build((None, None, 2))
-      self.assertEqual(len(layer.losses), 3)
-
-      x = keras.backend.variable(np.ones((2, 3, 2)))
-      layer(x)
-      self.assertEqual(len(layer.get_losses_for(x)), 1)
+    layer = layer_class(
+        5,
+        return_sequences=False,
+        weights=None,
+        input_shape=(None, embedding_dim),
+        kernel_regularizer=keras.regularizers.l1(0.01),
+        recurrent_regularizer=keras.regularizers.l1(0.01),
+        bias_regularizer='l2',
+        activity_regularizer='l1')
+    layer.build((None, None, 2))
+    self.assertEqual(len(layer.losses), 3)
+
+    x = keras.backend.variable(np.ones((2, 3, 2)))
+    layer(x)
+    self.assertEqual(len(layer.get_losses_for(x)), 1)
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/layers/unified_gru_test.py b/tensorflow/python/keras/layers/unified_gru_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d482b866d04674d4cb199f61e10802054226780
--- /dev/null
+++ b/tensorflow/python/keras/layers/unified_gru_test.py
@@ -0,0 +1,599 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for UnifiedGRU layer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import shutil
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
+from tensorflow.python import keras
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras import keras_parameterized
+from tensorflow.python.keras import testing_utils
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.ops.losses import losses
+from tensorflow.python.platform import test
+from tensorflow.python.training import gradient_descent
+
+
+# Global config for grappler setting that is used for graph mode test.
+_rewrites = rewriter_config_pb2.RewriterConfig()
+_rewrites.function_optimization = rewriter_config_pb2.RewriterConfig.OFF
+_customer_optimizer = _rewrites.custom_optimizers.add()
+_customer_optimizer.name = 'ExperimentalImplementationSelector'
+_rewrites.min_graph_nodes = -1
+_graph_options = config_pb2.GraphOptions(rewrite_options=_rewrites)
+_config = config_pb2.ConfigProto(graph_options=_graph_options)
+
+
+@keras_parameterized.run_all_keras_modes(config=_config)
+class UnifiedGRUTest(keras_parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      ('non_tan_activation', 'relu', 'sigmoid', 0, False, True, True),
+      ('non_sigmoid_recur_activation', 'tanh', 'relu', 0, False, True, True),
+      ('use_recurrent_dropout', 'tanh', 'sigmoid', 0.1, False, True, True),
+      ('unroll', 'tanh', 'sigmoid', 0, True, True, True),
+      ('not_use_bias', 'tanh', 'sigmoid', 0, False, False, True),
+      ('not_reset_after', 'tanh', 'sigmoid', 0, False, True, False)
+  )
+  def test_could_use_defun_backend(self, activation, recurrent_activation,
+                                   recurrent_dropout, unroll, use_bias,
+                                   reset_after):
+    layer = keras.layers.UnifiedGRU(1,
+                                    activation=activation,
+                                    recurrent_activation=recurrent_activation,
+                                    recurrent_dropout=recurrent_dropout,
+                                    unroll=unroll,
+                                    use_bias=use_bias,
+                                    reset_after=reset_after)
+    self.assertFalse(layer.could_use_cudnn)
+
+  def test_keras_model_with_gru(self):
+    input_shape = 10
+    rnn_state_size = 8
+    output_shape = 8
+    timestep = 4
+    batch = 100
+    epoch = 10
+
+    (x_train, y_train), _ = testing_utils.get_test_data(
+        train_samples=batch,
+        test_samples=0,
+        input_shape=(timestep, input_shape),
+        num_classes=output_shape)
+    y_train = keras.utils.to_categorical(y_train, output_shape)
+
+    layer = keras.layers.UnifiedGRU(rnn_state_size)
+
+    inputs = keras.layers.Input(
+        shape=[timestep, input_shape], dtype=dtypes.float32)
+
+    outputs = layer(inputs)
+    model = keras.models.Model(inputs, outputs)
+    model.compile('rmsprop', loss='mse')
+    model.fit(x_train, y_train, epochs=epoch)
+    model.evaluate(x_train, y_train)
+    model.predict(x_train)
+
+  def test_dynamic_behavior_GRU(self):
+    num_samples = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+    layer = keras.layers.UnifiedGRU(units, input_shape=(None, embedding_dim))
+    model = keras.models.Sequential()
+    model.add(layer)
+    model.compile(gradient_descent.GradientDescentOptimizer(0.001), 'mse')
+    x = np.random.random((num_samples, timesteps, embedding_dim))
+    y = np.random.random((num_samples, units))
+    model.train_on_batch(x, y)
+
+  def test_stacking_GRU(self):
+    inputs = np.random.random((2, 3, 4))
+    targets = np.abs(np.random.random((2, 3, 5)))
+    targets /= targets.sum(axis=-1, keepdims=True)
+    model = keras.models.Sequential()
+    model.add(keras.layers.UnifiedGRU(10, return_sequences=True, unroll=False))
+    model.add(keras.layers.UnifiedGRU(5, return_sequences=True, unroll=False))
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=gradient_descent.GradientDescentOptimizer(0.01))
+    model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
+
+  def test_from_config_GRU(self):
+    layer_class = keras.layers.UnifiedGRU
+    for stateful in (False, True):
+      l1 = layer_class(units=1, stateful=stateful)
+      l2 = layer_class.from_config(l1.get_config())
+      assert l1.get_config() == l2.get_config()
+
+
+# TODO(scottzhu): Re-enable those tests in v2 mode once bugs attached are fixed.
+@test_util.run_v1_only
+class GRULayerV1OnlyTest(test.TestCase, parameterized.TestCase):
+
+  # b/120911602
+  def test_unified_gru_feature_parity_with_canonical_gru(self):
+    with context.eager_mode():
+      # Run this test under eager only due to b/120160788 for model.set_weights.
+      input_shape = 10
+      rnn_state_size = 8
+      timestep = 4
+      batch = 20
+
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=batch,
+          test_samples=0,
+          input_shape=(timestep, input_shape),
+          num_classes=rnn_state_size)
+      y_train = keras.utils.to_categorical(y_train, rnn_state_size)
+
+      inputs = keras.layers.Input(
+          shape=[timestep, input_shape], dtype=dtypes.float32)
+      gru_layer = keras.layers.GRU(rnn_state_size,
+                                   recurrent_activation='sigmoid',
+                                   reset_after=True)
+      output = gru_layer(inputs)
+      gru_model = keras.models.Model(inputs, output)
+      weights = gru_model.get_weights()
+      y_1 = gru_model.predict(x_train)
+      gru_model.compile('rmsprop', 'mse')
+      gru_model.fit(x_train, y_train)
+      y_2 = gru_model.predict(x_train)
+
+      with test_util.device(use_gpu=True):
+        cudnn_layer = keras.layers.UnifiedGRU(rnn_state_size,
+                                              recurrent_activation='sigmoid',
+                                              reset_after=True)
+        cudnn_model = keras.models.Model(inputs, cudnn_layer(inputs))
+      cudnn_model.set_weights(weights)
+      y_3 = cudnn_model.predict(x_train)
+      cudnn_model.compile('rmsprop', 'mse')
+      cudnn_model.fit(x_train, y_train)
+      y_4 = cudnn_model.predict(x_train)
+
+      self.assertAllClose(y_1, y_3)
+      self.assertAllClose(y_2, y_4)
+
+  # b/120911602
+  @parameterized.named_parameters(
+      # test_name, use_bias, bias_initializer, activation
+      ('normal', True, 'zeros'),
+      ('no_bias', False, 'zeros'),
+      ('random_bias', True, 'random_uniform'),
+  )
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_unified_gru_model_save_load(self, use_bias, bias_initializer):
+    temp_dir = self.get_temp_dir()
+    self.addCleanup(shutil.rmtree, temp_dir)
+    h5_path = os.path.join(temp_dir, 'test.h5')
+
+    batch = 10
+    timestep = 3
+    input_dim = 5
+    units = 2
+
+    x = np.random.random((batch, timestep, input_dim))
+
+    def build_model():
+      inputs = keras.layers.Input(
+          shape=[timestep, input_dim], dtype=dtypes.float32)
+      layer = keras.layers.UnifiedGRU(
+          units,
+          use_bias=use_bias,
+          bias_initializer=bias_initializer)
+      output = layer(inputs)
+      return keras.models.Model(inputs, output), layer
+
+    model, layer = build_model()
+    y_ref = model.predict(x)
+    model.save_weights(h5_path)
+
+    cloned_model, new_layer = build_model()
+    cloned_model.load_weights(h5_path)
+    y = cloned_model.predict(x)
+
+    self.assertAllClose(y, y_ref)
+    self.assertAllClose(layer.get_weights(), new_layer.get_weights())
+
+  # b/120911602
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_unified_gru_output_on_multiple_kernel(self):
+    input_shape = 10
+    rnn_state_size = 8
+    timestep = 4
+    batch = 100
+
+    x_train = np.random.random((batch, timestep, input_shape))
+
+    inputs = keras.layers.Input(
+        shape=[timestep, input_shape], dtype=dtypes.float32)
+    with test_util.device(use_gpu=False):
+      layer = keras.layers.UnifiedGRU(rnn_state_size)
+      output = layer(inputs)
+      cpu_model = keras.models.Model(inputs, output)
+      weights = cpu_model.get_weights()
+      y_1 = cpu_model.predict(x_train)
+
+    with test_util.device(use_gpu=True):
+      layer = keras.layers.UnifiedGRU(rnn_state_size)
+      output = layer(inputs)
+      gpu_model = keras.models.Model(inputs, output)
+      gpu_model.set_weights(weights)
+      y_2 = gpu_model.predict(x_train)
+
+    # Note that CuDNN uses 'sigmoid' as activation, so the unified GRU uses
+    # 'sigmoid' as default. Construct the canonical GRU with sigmoid to achieve
+    # the same output.
+    with test_util.device(use_gpu=True):
+      layer = keras.layers.GRU(rnn_state_size,
+                               recurrent_activation='sigmoid',
+                               reset_after=True)
+      output = layer(inputs)
+      canonical_model = keras.models.Model(inputs, output)
+      canonical_model.set_weights(weights)
+      y_3 = canonical_model.predict(x_train)
+
+    self.assertAllClose(y_1, y_2)
+    self.assertAllClose(y_2, y_3)
+
+  # b/120911602
+  @parameterized.named_parameters(
+      # test_name, time_major, go_backwards
+      ('normal', False, False),
+      ('time_major', True, False),
+      ('go_backwards', False, True),
+      ('both', True, True),
+  )
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_time_major_and_go_backward(self, time_major, go_backwards):
+    input_shape = 10
+    rnn_state_size = 8
+    timestep = 4
+    batch = 100
+
+    x_train = np.random.random((batch, timestep, input_shape))
+
+    def build_model(layer_cls):
+      inputs = keras.layers.Input(
+          shape=[timestep, input_shape], dtype=dtypes.float32)
+      layer = layer_cls(rnn_state_size,
+                        recurrent_activation='sigmoid',
+                        time_major=time_major,
+                        return_sequences=True,
+                        go_backwards=go_backwards,
+                        reset_after=True)
+      if time_major:
+        converted_input = keras.layers.Lambda(
+            lambda t: array_ops.transpose(t, [1, 0, 2]))(inputs)
+        outputs = layer(converted_input)
+        outputs = keras.layers.Lambda(
+            lambda t: array_ops.transpose(t, [1, 0, 2]))(outputs)
+      else:
+        outputs = layer(inputs)
+      return keras.models.Model(inputs, outputs)
+
+    gru_model = build_model(keras.layers.GRU)
+    y_ref = gru_model.predict(x_train)
+    weights = gru_model.get_weights()
+
+    unified_gru_model = build_model(keras.layers.UnifiedGRU)
+    unified_gru_model.set_weights(weights)
+    y = unified_gru_model.predict(x_train)
+
+    self.assertAllClose(y, y_ref)
+
+  # b/120911602
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_with_masking_layer_GRU(self):
+    layer_class = keras.layers.UnifiedGRU
+    inputs = np.random.random((2, 3, 4))
+    targets = np.abs(np.random.random((2, 3, 5)))
+    targets /= targets.sum(axis=-1, keepdims=True)
+    model = keras.models.Sequential()
+    model.add(keras.layers.Masking(input_shape=(3, 4)))
+    model.add(layer_class(units=5, return_sequences=True, unroll=False))
+    model.compile(loss='categorical_crossentropy',
+                  optimizer=gradient_descent.GradientDescentOptimizer(0.001))
+    model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
+
+  # b/120911602
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_masking_with_stacking_GRU(self):
+    inputs = np.random.random((2, 3, 4))
+    targets = np.abs(np.random.random((2, 3, 5)))
+    targets /= targets.sum(axis=-1, keepdims=True)
+    model = keras.models.Sequential()
+    model.add(keras.layers.Masking(input_shape=(3, 4)))
+    model.add(keras.layers.UnifiedGRU(10, return_sequences=True, unroll=False))
+    model.add(keras.layers.UnifiedGRU(5, return_sequences=True, unroll=False))
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=gradient_descent.GradientDescentOptimizer(0.01))
+    model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
+
+  # b/120911602
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_return_sequences_GRU(self):
+    num_samples = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+    testing_utils.layer_test(
+        keras.layers.UnifiedGRU,
+        kwargs={'units': units,
+                'return_sequences': True},
+        input_shape=(num_samples, timesteps, embedding_dim))
+
+  # b/120911602
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_dropout_GRU(self):
+    num_samples = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+    testing_utils.layer_test(
+        keras.layers.UnifiedGRU,
+        kwargs={'units': units,
+                'dropout': 0.1,
+                'recurrent_dropout': 0.1},
+        input_shape=(num_samples, timesteps, embedding_dim))
+
+  # b/120911602
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_constraints_GRU(self):
+    embedding_dim = 4
+    layer_class = keras.layers.UnifiedGRU
+    k_constraint = keras.constraints.max_norm(0.01)
+    r_constraint = keras.constraints.max_norm(0.01)
+    b_constraint = keras.constraints.max_norm(0.01)
+    layer = layer_class(
+        5,
+        return_sequences=False,
+        weights=None,
+        input_shape=(None, embedding_dim),
+        kernel_constraint=k_constraint,
+        recurrent_constraint=r_constraint,
+        bias_constraint=b_constraint)
+    layer.build((None, None, embedding_dim))
+    self.assertEqual(layer.cell.kernel.constraint, k_constraint)
+    self.assertEqual(layer.cell.recurrent_kernel.constraint, r_constraint)
+    self.assertEqual(layer.cell.bias.constraint, b_constraint)
+
+  # b/120911602
+  @parameterized.parameters([0, 1, 2])
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_implementation_mode_GRU(self, implementation_mode):
+    num_samples = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+    testing_utils.layer_test(
+        keras.layers.UnifiedGRU,
+        kwargs={'units': units,
+                'implementation': implementation_mode},
+        input_shape=(num_samples, timesteps, embedding_dim))
+
+  # b/120911602
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_statefulness_GRU(self):
+    num_samples = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+    layer_class = keras.layers.UnifiedGRU
+    model = keras.models.Sequential()
+    model.add(
+        keras.layers.Embedding(
+            4,
+            embedding_dim,
+            mask_zero=True,
+            input_length=timesteps,
+            batch_input_shape=(num_samples, timesteps)))
+    layer = layer_class(
+        units, return_sequences=False, stateful=True, weights=None)
+    model.add(layer)
+    model.compile(optimizer='sgd', loss='mse')
+    out1 = model.predict(np.ones((num_samples, timesteps)))
+    self.assertEqual(out1.shape, (num_samples, units))
+
+    # train once so that the states change
+    model.train_on_batch(
+        np.ones((num_samples, timesteps)), np.ones((num_samples, units)))
+    out2 = model.predict(np.ones((num_samples, timesteps)))
+
+    # if the state is not reset, output should be different
+    self.assertNotEqual(out1.max(), out2.max())
+
+    # check that output changes after states are reset
+    # (even though the model itself didn't change)
+    layer.reset_states()
+    out3 = model.predict(np.ones((num_samples, timesteps)))
+    self.assertNotEqual(out2.max(), out3.max())
+
+    # check that container-level reset_states() works
+    model.reset_states()
+    out4 = model.predict(np.ones((num_samples, timesteps)))
+    np.testing.assert_allclose(out3, out4, atol=1e-5)
+
+    # check that the call to `predict` updated the states
+    out5 = model.predict(np.ones((num_samples, timesteps)))
+    self.assertNotEqual(out4.max(), out5.max())
+
+    # Check masking
+    layer.reset_states()
+
+    left_padded_input = np.ones((num_samples, timesteps))
+    left_padded_input[0, :1] = 0
+    left_padded_input[1, :2] = 0
+    out6 = model.predict(left_padded_input)
+
+    layer.reset_states()
+
+    right_padded_input = np.ones((num_samples, timesteps))
+    right_padded_input[0, -1:] = 0
+    right_padded_input[1, -2:] = 0
+    out7 = model.predict(right_padded_input)
+
+    np.testing.assert_allclose(out7, out6, atol=1e-5)
+
+
+class GRULayerGraphOnlyTest(test.TestCase):
+
+  # Need session for test
+  @test_util.run_deprecated_v1
+  def test_unifiedGRU(self):
+    input_shape = 10
+    rnn_state_size = 8
+    output_shape = 8
+    timestep = 4
+    batch = 100
+    epoch = 1
+
+    with self.cached_session(config=_config, use_gpu=True) as sess:
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=batch,
+          test_samples=0,
+          input_shape=(timestep, input_shape),
+          num_classes=output_shape)
+      y_train = keras.utils.to_categorical(y_train, output_shape)
+
+      layer = keras.layers.UnifiedGRU(rnn_state_size, return_runtime=True)
+
+      inputs = array_ops.placeholder(
+          dtypes.float32, shape=(None, timestep, input_shape), name='inputs')
+      predict = array_ops.placeholder(
+          dtypes.float32, shape=(None, output_shape), name='predict')
+
+      outputs, runtime = layer(inputs)
+      loss = losses.softmax_cross_entropy(predict, outputs)
+      optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+      train_op = optimizer.minimize(loss)
+
+      sess.run([variables.global_variables_initializer()])
+      existing_loss = 0
+      for _ in range(epoch):
+        loss_value, _, runtime_value = sess.run([loss, train_op, runtime], {
+            inputs: x_train,
+            predict: y_train
+        })
+        if test.is_gpu_available():
+          self.assertEqual(runtime_value, b'cudnn')
+        else:
+          self.assertEqual(runtime_value, b'cpu')
+        # Make sure the loss is updated for every epoch
+        # (layer weights properly updated).
+        self.assertNotEqual(existing_loss, loss_value)
+        existing_loss = loss_value
+
+  # Need session for test
+  @test_util.run_deprecated_v1
+  def test_UnifiedGRU_with_cond(self):
+    # This test is to demonstrate the graph rewrite of grappler plugin under
+    # the condition that the function returns different number of internal
+    # states.
+    input_shape = 10
+    rnn_state_size = 8
+    output_shape = 8
+    timestep = 4
+    batch = 100
+    epoch = 1
+
+    with self.cached_session(config=_config, use_gpu=True) as sess:
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=batch,
+          test_samples=0,
+          input_shape=(timestep, input_shape),
+          num_classes=output_shape)
+      y_train = keras.utils.to_categorical(y_train, output_shape)
+
+      layer = keras.layers.UnifiedGRU(rnn_state_size, return_runtime=True)
+
+      inputs = array_ops.placeholder(
+          dtypes.float32, shape=(None, timestep, input_shape), name='inputs')
+      predict = array_ops.placeholder(
+          dtypes.float32, shape=(None, output_shape), name='predict')
+
+      zeros = array_ops.zeros([batch, output_shape])
+      dummy_runtime = constant_op.constant(
+          'unknown', dtype=dtypes.string, name='runtime')
+      a = constant_op.constant(0)
+      b = constant_op.constant(1)
+      # Will always run the GRU layer.
+      outputs, runtime = control_flow_ops.cond(
+          gen_math_ops.less(a, b),
+          lambda: layer(inputs),
+          lambda: (zeros, dummy_runtime))
+      loss = losses.softmax_cross_entropy(predict, outputs)
+      optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+      train_op = optimizer.minimize(loss)
+
+      sess.run([variables.global_variables_initializer()])
+      existing_loss = 0
+
+      for _ in range(epoch):
+        loss_value, _, runtime_value = sess.run([loss, train_op, runtime], {
+            inputs: x_train,
+            predict: y_train
+        })
+        if test.is_gpu_available():
+          self.assertEqual(runtime_value, b'cudnn')
+        else:
+          self.assertEqual(runtime_value, b'cpu')
+        # Make sure the loss is updated for every epoch
+        # (layer weights properly updated).
+        self.assertNotEqual(existing_loss, loss_value)
+        existing_loss = loss_value
+
+  # b/120919032
+  @test_util.run_deprecated_v1
+  def test_regularizers_GRU(self):
+    embedding_dim = 4
+    layer_class = keras.layers.UnifiedGRU
+    with self.cached_session(config=_config):
+      layer = layer_class(
+          5,
+          return_sequences=False,
+          weights=None,
+          input_shape=(None, embedding_dim),
+          kernel_regularizer=keras.regularizers.l1(0.01),
+          recurrent_regularizer=keras.regularizers.l1(0.01),
+          bias_regularizer='l2',
+          activity_regularizer='l1')
+      layer.build((None, None, 2))
+      self.assertEqual(len(layer.losses), 3)
+
+      x = keras.backend.variable(np.ones((2, 3, 2)))
+      layer(x)
+      self.assertEqual(len(layer.get_losses_for(x)), 1)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/layers/unified_lstm_test.py b/tensorflow/python/keras/layers/unified_lstm_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c51304666d3f6b830c5a815db385921838ca9694
--- /dev/null
+++ b/tensorflow/python/keras/layers/unified_lstm_test.py
@@ -0,0 +1,917 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for UnifiedLSTM layer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import shutil
+import time
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
+from tensorflow.python import keras
+from tensorflow.python.client import session as session_lib
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras import keras_parameterized
+from tensorflow.python.keras import testing_utils
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.ops.losses import losses
+from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import gradient_descent
+
+
+# Global config for grappler setting that is used for graph mode test.
+_rewrites = rewriter_config_pb2.RewriterConfig()
+_rewrites.function_optimization = rewriter_config_pb2.RewriterConfig.OFF
+_customer_optimizer = _rewrites.custom_optimizers.add()
+_customer_optimizer.name = 'ExperimentalImplementationSelector'
+_rewrites.min_graph_nodes = -1
+_graph_options = config_pb2.GraphOptions(rewrite_options=_rewrites)
+_config = config_pb2.ConfigProto(graph_options=_graph_options)
+
+
+@keras_parameterized.run_all_keras_modes(config=_config)
+class UnifiedLSTMTest(keras_parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      ('non_tan_activation', 'relu', 'sigmoid', 0, False, True),
+      ('non_sigmoid_recur_activation', 'tanh', 'relu', 0, False, True),
+      ('use_recurrent_dropout', 'tanh', 'sigmoid', 0.1, False, True),
+      ('unroll', 'tanh', 'sigmoid', 0, True, True),
+      ('not_use_bias', 'tanh', 'sigmoid', 0, False, False),
+  )
+  def test_could_use_defun_backend(self, activation, recurrent_activation,
+                                   recurrent_dropout, unroll, use_bias):
+    layer = keras.layers.UnifiedLSTM(
+        1,
+        activation=activation,
+        recurrent_activation=recurrent_activation,
+        recurrent_dropout=recurrent_dropout,
+        unroll=unroll,
+        use_bias=use_bias)
+    self.assertFalse(layer.could_use_cudnn)
+
+  def test_static_shape_inference_LSTM(self):
+    # Github issue: 15165
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+
+    model = keras.models.Sequential()
+    inputs = keras.layers.Dense(
+        embedding_dim, input_shape=(timesteps, embedding_dim))
+    model.add(inputs)
+    layer = keras.layers.UnifiedLSTM(units, return_sequences=True)
+    model.add(layer)
+    outputs = model.layers[-1].output
+    self.assertEqual(outputs.get_shape().as_list(), [None, timesteps, units])
+
+  def test_dynamic_behavior_LSTM(self):
+    num_samples = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+    layer = keras.layers.UnifiedLSTM(units, input_shape=(None, embedding_dim))
+    model = keras.models.Sequential()
+    model.add(layer)
+    model.compile(gradient_descent.GradientDescentOptimizer(0.001), 'mse')
+    x = np.random.random((num_samples, timesteps, embedding_dim))
+    y = np.random.random((num_samples, units))
+    model.train_on_batch(x, y)
+
+  def test_stacking_LSTM(self):
+    inputs = np.random.random((2, 3, 4))
+    targets = np.abs(np.random.random((2, 3, 5)))
+    targets /= targets.sum(axis=-1, keepdims=True)
+    model = keras.models.Sequential()
+    model.add(keras.layers.UnifiedLSTM(10, return_sequences=True, unroll=False))
+    model.add(keras.layers.UnifiedLSTM(5, return_sequences=True, unroll=False))
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=gradient_descent.GradientDescentOptimizer(0.01))
+    model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
+
+  def test_from_config_LSTM(self):
+    layer_class = keras.layers.UnifiedLSTM
+    for stateful in (False, True):
+      l1 = layer_class(units=1, stateful=stateful)
+      l2 = layer_class.from_config(l1.get_config())
+      assert l1.get_config() == l2.get_config()
+
+  def test_specify_initial_state_keras_tensor(self):
+    num_states = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 3
+    num_samples = 2
+
+    # Test with Keras tensor
+    inputs = keras.Input((timesteps, embedding_dim))
+    initial_state = [keras.Input((units,)) for _ in range(num_states)]
+    layer = keras.layers.UnifiedLSTM(units)
+    if len(initial_state) == 1:
+      output = layer(inputs, initial_state=initial_state[0])
+    else:
+      output = layer(inputs, initial_state=initial_state)
+    assert initial_state[0] in layer._inbound_nodes[0].input_tensors
+
+    model = keras.models.Model([inputs] + initial_state, output)
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=gradient_descent.GradientDescentOptimizer(0.01))
+
+    inputs = np.random.random((num_samples, timesteps, embedding_dim))
+    initial_state = [
+        np.random.random((num_samples, units)) for _ in range(num_states)
+    ]
+    targets = np.random.random((num_samples, units))
+    model.train_on_batch([inputs] + initial_state, targets)
+
+  def DISABLED_test_specify_initial_state_non_keras_tensor(self):
+    num_states = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 3
+    num_samples = 2
+
+    # Test with non-Keras tensor
+    inputs = keras.Input((timesteps, embedding_dim))
+    initial_state = [
+        keras.backend.random_normal_variable((num_samples, units), 0, 1)
+        for _ in range(num_states)
+    ]
+    layer = keras.layers.UnifiedLSTM(units)
+    output = layer(inputs, initial_state=initial_state)
+
+    model = keras.models.Model(inputs, output)
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=gradient_descent.GradientDescentOptimizer(0.01))
+
+    inputs = np.random.random((num_samples, timesteps, embedding_dim))
+    targets = np.random.random((num_samples, units))
+    model.train_on_batch(inputs, targets)
+
+  def test_reset_states_with_values(self):
+    num_states = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 3
+    num_samples = 2
+
+    layer = keras.layers.UnifiedLSTM(units, stateful=True)
+    layer.build((num_samples, timesteps, embedding_dim))
+    layer.reset_states()
+    assert len(layer.states) == num_states
+    assert layer.states[0] is not None
+    self.assertAllClose(
+        keras.backend.eval(layer.states[0]),
+        np.zeros(keras.backend.int_shape(layer.states[0])),
+        atol=1e-4)
+    state_shapes = [keras.backend.int_shape(state) for state in layer.states]
+    values = [np.ones(shape) for shape in state_shapes]
+    if len(values) == 1:
+      values = values[0]
+    layer.reset_states(values)
+    self.assertAllClose(
+        keras.backend.eval(layer.states[0]),
+        np.ones(keras.backend.int_shape(layer.states[0])),
+        atol=1e-4)
+
+    # Test with invalid data
+    with self.assertRaises(ValueError):
+      layer.reset_states([1] * (len(layer.states) + 1))
+
+  def test_specify_state_with_masking(self):
+    num_states = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 3
+    num_samples = 2
+
+    inputs = keras.Input((timesteps, embedding_dim))
+    _ = keras.layers.Masking()(inputs)
+    initial_state = [keras.Input((units,)) for _ in range(num_states)]
+    output = keras.layers.UnifiedLSTM(units)(
+        inputs, initial_state=initial_state)
+
+    model = keras.models.Model([inputs] + initial_state, output)
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=gradient_descent.GradientDescentOptimizer(0.01))
+
+    inputs = np.random.random((num_samples, timesteps, embedding_dim))
+    initial_state = [
+        np.random.random((num_samples, units)) for _ in range(num_states)
+    ]
+    targets = np.random.random((num_samples, units))
+    model.train_on_batch([inputs] + initial_state, targets)
+
+  def test_return_state(self):
+    num_states = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 3
+    num_samples = 2
+
+    inputs = keras.Input(batch_shape=(num_samples, timesteps, embedding_dim))
+    layer = keras.layers.UnifiedLSTM(units, return_state=True, stateful=True)
+    outputs = layer(inputs)
+    state = outputs[1:]
+    assert len(state) == num_states
+    model = keras.models.Model(inputs, state[0])
+
+    inputs = np.random.random((num_samples, timesteps, embedding_dim))
+    state = model.predict(inputs)
+    self.assertAllClose(keras.backend.eval(layer.states[0]), state, atol=1e-4)
+
+  def test_state_reuse(self):
+    timesteps = 3
+    embedding_dim = 4
+    units = 3
+    num_samples = 2
+
+    inputs = keras.Input(batch_shape=(num_samples, timesteps, embedding_dim))
+    layer = keras.layers.UnifiedLSTM(
+        units, return_state=True, return_sequences=True)
+    outputs = layer(inputs)
+    output, state = outputs[0], outputs[1:]
+    output = keras.layers.UnifiedLSTM(units)(output, initial_state=state)
+    model = keras.models.Model(inputs, output)
+
+    inputs = np.random.random((num_samples, timesteps, embedding_dim))
+    model.predict(inputs)
+
+  def test_initial_states_as_other_inputs(self):
+    timesteps = 3
+    embedding_dim = 4
+    units = 3
+    num_samples = 2
+    num_states = 2
+    layer_class = keras.layers.UnifiedLSTM
+
+    # Test with Keras tensor
+    main_inputs = keras.Input((timesteps, embedding_dim))
+    initial_state = [keras.Input((units,)) for _ in range(num_states)]
+    inputs = [main_inputs] + initial_state
+
+    layer = layer_class(units)
+    output = layer(inputs)
+    assert initial_state[0] in layer._inbound_nodes[0].input_tensors
+
+    model = keras.models.Model(inputs, output)
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=gradient_descent.GradientDescentOptimizer(0.01))
+
+    main_inputs = np.random.random((num_samples, timesteps, embedding_dim))
+    initial_state = [
+        np.random.random((num_samples, units)) for _ in range(num_states)
+    ]
+    targets = np.random.random((num_samples, units))
+    model.train_on_batch([main_inputs] + initial_state, targets)
+
+
+class LSTMLayerGraphOnlyTest(test.TestCase):
+
+  # Need session for test
+  @test_util.run_deprecated_v1
+  def test_unifiedLSTM(self):
+    input_shape = 10
+    rnn_state_size = 8
+    output_shape = 8
+    timestep = 4
+    batch = 100
+    epoch = 1
+
+    with self.cached_session(config=_config, use_gpu=True) as sess:
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=batch,
+          test_samples=0,
+          input_shape=(timestep, input_shape),
+          num_classes=output_shape)
+      y_train = keras.utils.to_categorical(y_train, output_shape)
+
+      layer = keras.layers.UnifiedLSTM(rnn_state_size, return_runtime=True)
+
+      inputs = array_ops.placeholder(
+          dtypes.float32, shape=(None, timestep, input_shape), name='inputs')
+      predict = array_ops.placeholder(
+          dtypes.float32, shape=(None, output_shape), name='predict')
+
+      outputs, runtime = layer(inputs)
+      loss = losses.softmax_cross_entropy(predict, outputs)
+      optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+      train_op = optimizer.minimize(loss)
+
+      sess.run([variables.global_variables_initializer()])
+      existing_loss = 0
+      for _ in range(epoch):
+        loss_value, _, runtime_value = sess.run([loss, train_op, runtime], {
+            inputs: x_train,
+            predict: y_train
+        })
+        if test.is_gpu_available():
+          self.assertEqual(runtime_value, b'cudnn')
+        else:
+          self.assertEqual(runtime_value, b'cpu')
+        # Make sure the loss is updated for every epoch
+        # (layer weights properly updated).
+        self.assertNotEqual(existing_loss, loss_value)
+        existing_loss = loss_value
+
+  # Need session for test
+  @test_util.run_deprecated_v1
+  def test_unifiedLSTM_with_cond(self):
+    # This test is to demonstrate the graph rewrite of grappler plugin under
+    # the condition that the function returns different number of internal
+    # states.
+    input_shape = 10
+    rnn_state_size = 8
+    output_shape = 8
+    timestep = 4
+    batch = 100
+    epoch = 1
+
+    with self.cached_session(config=_config, use_gpu=True) as sess:
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=batch,
+          test_samples=0,
+          input_shape=(timestep, input_shape),
+          num_classes=output_shape)
+      y_train = keras.utils.to_categorical(y_train, output_shape)
+
+      layer = keras.layers.UnifiedLSTM(rnn_state_size, return_runtime=True)
+
+      inputs = array_ops.placeholder(
+          dtypes.float32, shape=(None, timestep, input_shape), name='inputs')
+      predict = array_ops.placeholder(
+          dtypes.float32, shape=(None, output_shape), name='predict')
+
+      zeros = array_ops.zeros([batch, output_shape])
+      dummy_runtime = constant_op.constant(
+          'unknown', dtype=dtypes.string, name='runtime')
+      a = constant_op.constant(0)
+      b = constant_op.constant(1)
+      # Will always run the lstm layer.
+      outputs, runtime = control_flow_ops.cond(
+          gen_math_ops.less(a, b),
+          lambda: layer(inputs),
+          lambda: (zeros, dummy_runtime))
+      loss = losses.softmax_cross_entropy(predict, outputs)
+      optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+      train_op = optimizer.minimize(loss)
+
+      sess.run([variables.global_variables_initializer()])
+      existing_loss = 0
+
+      for _ in range(epoch):
+        loss_value, _, runtime_value = sess.run([loss, train_op, runtime], {
+            inputs: x_train,
+            predict: y_train
+        })
+        if test.is_gpu_available():
+          self.assertEqual(runtime_value, b'cudnn')
+        else:
+          self.assertEqual(runtime_value, b'cpu')
+        # Make sure the loss is updated for every epoch
+        # (layer weights properly updated).
+        self.assertNotEqual(existing_loss, loss_value)
+        existing_loss = loss_value
+
+  # b/120919032
+  @test_util.run_deprecated_v1
+  def test_regularizers_LSTM(self):
+    embedding_dim = 4
+    layer_class = keras.layers.UnifiedLSTM
+    layer = layer_class(
+        5,
+        return_sequences=False,
+        weights=None,
+        input_shape=(None, embedding_dim),
+        kernel_regularizer=keras.regularizers.l1(0.01),
+        recurrent_regularizer=keras.regularizers.l1(0.01),
+        bias_regularizer='l2',
+        activity_regularizer='l1')
+    layer.build((None, None, 2))
+    self.assertEqual(len(layer.losses), 3)
+    x = keras.backend.variable(np.ones((2, 3, 2)))
+    layer(x)
+    self.assertEqual(len(layer.get_losses_for(x)), 1)
+
+
+# TODO(scottzhu): Re-enable those tests in v2 mode once bugs attached are fixed.
+@test_util.run_v1_only
+class LSTMLayerV1OnlyTest(test.TestCase, parameterized.TestCase):
+
+  # b/120911602
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_dropout_LSTM(self):
+    num_samples = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+    testing_utils.layer_test(
+        keras.layers.UnifiedLSTM,
+        kwargs={
+            'units': units,
+            'dropout': 0.1,
+            'recurrent_dropout': 0.1
+        },
+        input_shape=(num_samples, timesteps, embedding_dim))
+
+  # b/120911602
+  def test_unified_lstm_feature_parity_with_canonical_lstm(self):
+    with context.eager_mode():
+      # Run this test under eager only due to b/120160788 for model.set_weights.
+      input_shape = 10
+      rnn_state_size = 8
+      timestep = 4
+      batch = 20
+
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=batch,
+          test_samples=0,
+          input_shape=(timestep, input_shape),
+          num_classes=rnn_state_size)
+      y_train = keras.utils.to_categorical(y_train, rnn_state_size)
+
+      inputs = keras.layers.Input(
+          shape=[timestep, input_shape], dtype=dtypes.float32)
+      lstm_layer = keras.layers.LSTM(rnn_state_size,
+                                     recurrent_activation='sigmoid')
+      output = lstm_layer(inputs)
+      lstm_model = keras.models.Model(inputs, output)
+      weights = lstm_model.get_weights()
+      y_1 = lstm_model.predict(x_train)
+      lstm_model.compile('rmsprop', 'mse')
+      lstm_model.fit(x_train, y_train)
+      y_2 = lstm_model.predict(x_train)
+
+      with test_util.device(use_gpu=True):
+        cudnn_layer = keras.layers.UnifiedLSTM(rnn_state_size)
+        cudnn_model = keras.models.Model(inputs, cudnn_layer(inputs))
+      cudnn_model.set_weights(weights)
+      y_3 = cudnn_model.predict(x_train)
+      cudnn_model.compile('rmsprop', 'mse')
+      cudnn_model.fit(x_train, y_train)
+      y_4 = cudnn_model.predict(x_train)
+
+      self.assertAllClose(y_1, y_3)
+      self.assertAllClose(y_2, y_4)
+
+  # b/120911602
+  @parameterized.named_parameters(('v0', 0), ('v1', 1), ('v2', 2))
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_implementation_mode_LSTM(self, implementation_mode):
+    num_samples = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+    testing_utils.layer_test(
+        keras.layers.UnifiedLSTM,
+        kwargs={
+            'units': units,
+            'implementation': implementation_mode
+        },
+        input_shape=(num_samples, timesteps, embedding_dim))
+
+    layer_class = keras.layers.UnifiedLSTM
+    k_constraint = keras.constraints.max_norm(0.01)
+    r_constraint = keras.constraints.max_norm(0.01)
+    b_constraint = keras.constraints.max_norm(0.01)
+    layer = layer_class(
+        5,
+        return_sequences=False,
+        weights=None,
+        input_shape=(None, embedding_dim),
+        kernel_constraint=k_constraint,
+        recurrent_constraint=r_constraint,
+        bias_constraint=b_constraint)
+    layer.build((None, None, embedding_dim))
+    self.assertEqual(layer.cell.kernel.constraint, k_constraint)
+    self.assertEqual(layer.cell.recurrent_kernel.constraint, r_constraint)
+    self.assertEqual(layer.cell.bias.constraint, b_constraint)
+
+    layer_class = keras.layers.UnifiedLSTM
+    inputs = np.random.random((2, 3, 4))
+    targets = np.abs(np.random.random((2, 3, 5)))
+    targets /= targets.sum(axis=-1, keepdims=True)
+    model = keras.models.Sequential()
+    model.add(keras.layers.Masking(input_shape=(3, 4)))
+    model.add(layer_class(units=5, return_sequences=True, unroll=False))
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=gradient_descent.GradientDescentOptimizer(0.01))
+    model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
+
+  # b/120911602
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_masking_with_stacking_LSTM(self):
+    inputs = np.random.random((2, 3, 4))
+    targets = np.abs(np.random.random((2, 3, 5)))
+    targets /= targets.sum(axis=-1, keepdims=True)
+    model = keras.models.Sequential()
+    model.add(keras.layers.Masking(input_shape=(3, 4)))
+    model.add(keras.layers.UnifiedLSTM(10, return_sequences=True, unroll=False))
+    model.add(keras.layers.UnifiedLSTM(5, return_sequences=True, unroll=False))
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=gradient_descent.GradientDescentOptimizer(0.01))
+    model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
+
+  # b/120911602
+  @parameterized.named_parameters(
+      # test_name, time_major, go_backwards
+      ('normal', False, False),
+      ('time_major', True, False),
+      ('go_backwards', False, True),
+      ('both', True, True),
+  )
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_time_major_and_go_backward(self, time_major, go_backwards):
+    input_shape = 10
+    rnn_state_size = 8
+    timestep = 4
+    batch = 100
+
+    x_train = np.random.random((batch, timestep, input_shape))
+
+    def build_model(layer_cls):
+      inputs = keras.layers.Input(
+          shape=[timestep, input_shape], dtype=dtypes.float32)
+      layer = layer_cls(rnn_state_size,
+                        recurrent_activation='sigmoid',
+                        time_major=time_major,
+                        return_sequences=True,
+                        go_backwards=go_backwards)
+      if time_major:
+        converted_input = keras.layers.Lambda(
+            lambda t: array_ops.transpose(t, [1, 0, 2]))(inputs)
+        outputs = layer(converted_input)
+        outputs = keras.layers.Lambda(
+            lambda t: array_ops.transpose(t, [1, 0, 2]))(outputs)
+      else:
+        outputs = layer(inputs)
+      return keras.models.Model(inputs, outputs)
+
+    lstm_model = build_model(keras.layers.LSTM)
+    y_ref = lstm_model.predict(x_train)
+    weights = lstm_model.get_weights()
+
+    unified_lstm_model = build_model(keras.layers.UnifiedLSTM)
+    unified_lstm_model.set_weights(weights)
+    y = unified_lstm_model.predict(x_train)
+
+    self.assertAllClose(y, y_ref)
+
+    input_shape = 10
+    rnn_state_size = 8
+    output_shape = 8
+    timestep = 4
+    batch = 100
+    epoch = 10
+
+    (x_train, y_train), _ = testing_utils.get_test_data(
+        train_samples=batch,
+        test_samples=0,
+        input_shape=(timestep, input_shape),
+        num_classes=output_shape)
+    y_train = keras.utils.to_categorical(y_train, output_shape)
+
+    layer = keras.layers.UnifiedLSTM(rnn_state_size)
+
+    inputs = keras.layers.Input(
+        shape=[timestep, input_shape], dtype=dtypes.float32)
+
+    outputs = layer(inputs)
+    model = keras.models.Model(inputs, outputs)
+    model.compile('rmsprop', loss='mse')
+    model.fit(x_train, y_train, epochs=epoch)
+    model.evaluate(x_train, y_train)
+    model.predict(x_train)
+
+  # b/120911602
+  @parameterized.named_parameters(
+      # test_name, use_bias, bias_initializer, activation
+      ('normal', True, 'zeros'),
+      ('no_bias', False, 'zeros'),
+      ('random_bias', True, 'random_uniform'),
+  )
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_unified_lstm_model_save_load(self, use_bias, bias_initializer):
+    temp_dir = self.get_temp_dir()
+    self.addCleanup(shutil.rmtree, temp_dir)
+    h5_path = os.path.join(temp_dir, 'test.h5')
+
+    batch = 10
+    timestep = 3
+    input_dim = 5
+    units = 2
+
+    x = np.random.random((batch, timestep, input_dim))
+
+    def build_model():
+      inputs = keras.layers.Input(
+          shape=[timestep, input_dim], dtype=dtypes.float32)
+      layer = keras.layers.UnifiedLSTM(
+          units,
+          use_bias=use_bias,
+          bias_initializer=bias_initializer)
+      output = layer(inputs)
+      return keras.models.Model(inputs, output), layer
+
+    model, layer = build_model()
+    y_ref = model.predict(x)
+    model.save_weights(h5_path)
+
+    cloned_model, new_layer = build_model()
+    cloned_model.load_weights(h5_path)
+    y = cloned_model.predict(x)
+
+    self.assertAllClose(y, y_ref)
+    self.assertAllClose(layer.get_weights(), new_layer.get_weights())
+
+  # b/120911602
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_unified_lstm_output_on_multiple_kernel(self):
+    input_shape = 10
+    rnn_state_size = 8
+    timestep = 4
+    batch = 100
+
+    x_train = np.random.random((batch, timestep, input_shape))
+
+    inputs = keras.layers.Input(
+        shape=[timestep, input_shape], dtype=dtypes.float32)
+    with test_util.device(use_gpu=False):
+      layer = keras.layers.UnifiedLSTM(rnn_state_size)
+      output = layer(inputs)
+      cpu_model = keras.models.Model(inputs, output)
+      weights = cpu_model.get_weights()
+    y_1 = cpu_model.predict(x_train)
+
+    with test_util.device(use_gpu=True):
+      layer = keras.layers.UnifiedLSTM(rnn_state_size)
+      output = layer(inputs)
+      gpu_model = keras.models.Model(inputs, output)
+      gpu_model.set_weights(weights)
+    y_2 = gpu_model.predict(x_train)
+
+    # Note that CuDNN uses 'sigmoid' as activation, so the unified LSTM uses
+    # 'sigmoid' as default. Construct the canonical LSTM with sigmoid to achieve
+    # the same output.
+    with test_util.device(use_gpu=True):
+      layer = keras.layers.LSTM(rnn_state_size, recurrent_activation='sigmoid')
+      output = layer(inputs)
+      canonical_model = keras.models.Model(inputs, output)
+      # Remove the extra cudnn bias since canonical lstm will not use it.
+      canonical_model.set_weights(weights[:3])
+    y_3 = canonical_model.predict(x_train)
+
+    self.assertAllClose(y_1, y_2)
+    self.assertAllClose(y_2, y_3)
+
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_return_sequences_LSTM(self):
+    num_samples = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+    testing_utils.layer_test(
+        keras.layers.UnifiedLSTM,
+        kwargs={
+            'units': units,
+            'return_sequences': True
+        },
+        input_shape=(num_samples, timesteps, embedding_dim))
+
+  # b/120911602
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_statefulness_LSTM(self):
+    num_samples = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+    layer_class = keras.layers.UnifiedLSTM
+    model = keras.models.Sequential()
+    model.add(
+        keras.layers.Embedding(
+            4,
+            embedding_dim,
+            mask_zero=True,
+            input_length=timesteps,
+            batch_input_shape=(num_samples, timesteps)))
+    layer = layer_class(
+        units, return_sequences=False, stateful=True, weights=None)
+    model.add(layer)
+    model.compile(
+        optimizer=gradient_descent.GradientDescentOptimizer(0.01), loss='mse')
+    out1 = model.predict(np.ones((num_samples, timesteps)))
+    self.assertEqual(out1.shape, (num_samples, units))
+
+    # train once so that the states change
+    model.train_on_batch(
+        np.ones((num_samples, timesteps)), np.ones((num_samples, units)))
+    out2 = model.predict(np.ones((num_samples, timesteps)))
+
+    # if the state is not reset, output should be different
+    self.assertNotEqual(out1.max(), out2.max())
+
+    # check that output changes after states are reset
+    # (even though the model itself didn't change)
+    layer.reset_states()
+    out3 = model.predict(np.ones((num_samples, timesteps)))
+    self.assertNotEqual(out2.max(), out3.max())
+
+    # check that container-level reset_states() works
+    model.reset_states()
+    out4 = model.predict(np.ones((num_samples, timesteps)))
+    self.assertAllClose(out3, out4, atol=1e-5)
+
+    # check that the call to `predict` updated the states
+    out5 = model.predict(np.ones((num_samples, timesteps)))
+    self.assertNotEqual(out4.max(), out5.max())
+
+    # Check masking
+    layer.reset_states()
+
+    left_padded_input = np.ones((num_samples, timesteps))
+    left_padded_input[0, :1] = 0
+    left_padded_input[1, :2] = 0
+    out6 = model.predict(left_padded_input)
+
+    layer.reset_states()
+
+    right_padded_input = np.ones((num_samples, timesteps))
+    right_padded_input[0, -1:] = 0
+    right_padded_input[1, -2:] = 0
+    out7 = model.predict(right_padded_input)
+
+    self.assertAllClose(out7, out6, atol=1e-5)
+
+
+class UnifiedLSTMPerformanceTest(test.Benchmark):
+
+  def _measure_performance(self, test_config, model, x_train, y_train):
+    batch = test_config['batch']
+    epoch = test_config['epoch']
+    warmup_epoch = test_config['warmup_epoch']
+
+    # warm up the model
+    model.fit(x_train, y_train, batch_size=batch, epochs=warmup_epoch)
+    start_time = time.time()
+    model.fit(x_train, y_train, batch_size=batch, epochs=epoch - warmup_epoch)
+    end_time = time.time()
+    return (end_time - start_time) / (epoch - warmup_epoch)
+
+  def _time_performance_run_cudnn_lstm(self, test_config, x_train, y_train):
+    # Get the performance number for standard Cudnn LSTM
+    input_shape = test_config['input_shape']
+    rnn_state_size = test_config['rnn_state_size']
+    timestep = test_config['timestep']
+
+    cudnn_lstm_layer = keras.layers.CuDNNLSTM(rnn_state_size)
+    inputs = keras.layers.Input(
+        shape=[timestep, input_shape], dtype=dtypes.float32)
+
+    outputs = cudnn_lstm_layer(inputs)
+    model = keras.models.Model(inputs, outputs)
+    model.compile('sgd', 'mse')
+
+    sec_per_epoch = self._measure_performance(
+        test_config, model, x_train, y_train)
+    logging.info('Average performance for %s per epoch is: %s',
+                 'CuDNN LSTM', sec_per_epoch)
+    return sec_per_epoch
+
+  def _time_performance_run_unifed_lstm_gpu(
+      self, test_config, x_train, y_train):
+    # Get performance number for Unified_LSTM with grappler swap the impl
+    input_shape = test_config['input_shape']
+    rnn_state_size = test_config['rnn_state_size']
+    timestep = test_config['timestep']
+
+    layer = keras.layers.UnifiedLSTM(rnn_state_size)
+    inputs = keras.layers.Input(
+        shape=[timestep, input_shape], dtype=dtypes.float32)
+
+    outputs = layer(inputs)
+    model = keras.models.Model(inputs, outputs)
+    model.compile('sgd', 'mse')
+
+    sec_per_epoch = self._measure_performance(
+        test_config, model, x_train, y_train)
+    logging.info('Average performance for %s per epoch is: %s',
+                 'Unified LSTM', sec_per_epoch)
+    return sec_per_epoch
+
+  def _time_performance_run_normal_lstm(
+      self, test_config, x_train, y_train):
+    # Get performance number for standard LSTM on GPU.
+    input_shape = test_config['input_shape']
+    rnn_state_size = test_config['rnn_state_size']
+    timestep = test_config['timestep']
+
+    layer = keras.layers.LSTM(rnn_state_size)
+    inputs = keras.layers.Input(
+        shape=[timestep, input_shape], dtype=dtypes.float32)
+
+    outputs = layer(inputs)
+    model = keras.models.Model(inputs, outputs)
+    model.compile('sgd', 'mse')
+
+    sec_per_epoch = self._measure_performance(
+        test_config, model, x_train, y_train)
+    logging.info('Average performance for %s per epoch is: %s',
+                 'Normal LSTM', sec_per_epoch)
+    return sec_per_epoch
+
+  def _benchmark_performance_with_standard_cudnn_impl(self):
+    if not test.is_gpu_available():
+      self.skipTest('performance test will only run on GPU')
+
+    mode = 'eager' if context.executing_eagerly() else 'graph'
+    batch = 64
+    num_batch = 10
+    test_config = {
+        'input_shape': 128,
+        'rnn_state_size': 64,
+        'output_shape': 64,
+        'timestep': 50,
+        'batch': batch,
+        'epoch': 20,
+        # The performance for warmup epoch is ignored.
+        'warmup_epoch': 1,
+    }
+    (x_train, y_train), _ = testing_utils.get_test_data(
+        train_samples=(batch * num_batch),
+        test_samples=0,
+        input_shape=(test_config['timestep'], test_config['input_shape']),
+        num_classes=test_config['output_shape'])
+    y_train = keras.utils.to_categorical(y_train, test_config['output_shape'])
+
+    cudnn_sec_per_epoch = self._time_performance_run_cudnn_lstm(
+        test_config, x_train, y_train)
+    unified_lstm_sec_per_epoch = self._time_performance_run_unifed_lstm_gpu(
+        test_config, x_train, y_train)
+    normal_lstm_sec_per_epoch = self._time_performance_run_normal_lstm(
+        test_config, x_train, y_train)
+
+    cudnn_vs_unified = cudnn_sec_per_epoch / unified_lstm_sec_per_epoch
+    unified_vs_normal = normal_lstm_sec_per_epoch / unified_lstm_sec_per_epoch
+
+    self.report_benchmark(name='keras_cudnn_lstm_' + mode,
+                          wall_time=cudnn_sec_per_epoch,
+                          iters=test_config['epoch'],
+                          extras=test_config)
+    self.report_benchmark(name='keras_unified_lstm_' + mode,
+                          wall_time=unified_lstm_sec_per_epoch,
+                          iters=test_config['epoch'],
+                          extras=test_config)
+    self.report_benchmark(name='keras_canonical_lstm_' + mode,
+                          wall_time=normal_lstm_sec_per_epoch,
+                          iters=test_config['epoch'],
+                          extras=test_config)
+
+    logging.info('Expect the performance of Unified LSTM is within 80% of '
+                 'CuDNN LSTM, got {0:.2f}%'.format(cudnn_vs_unified * 100))
+    logging.info('Expect the performance of Unified LSTM is more than 5 times'
+                 ' of normal LSTM, got {0:.2f}'.format(unified_vs_normal))
+
+  def benchmark_performance_graph(self):
+    with context.graph_mode(), session_lib.Session(config=_config):
+      self._benchmark_performance_with_standard_cudnn_impl()
+
+  def benchmark_performance_eager(self):
+    with context.eager_mode():
+      self._benchmark_performance_with_standard_cudnn_impl()
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/layers/unified_rnn_test.py b/tensorflow/python/keras/layers/unified_rnn_test.py
deleted file mode 100644
index e26e47000d813c6b584640dbb563ffe10b76fbd4..0000000000000000000000000000000000000000
--- a/tensorflow/python/keras/layers/unified_rnn_test.py
+++ /dev/null
@@ -1,642 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for UnifiedLSTM layer."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import collections
-import time
-
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.core.protobuf import rewriter_config_pb2
-from tensorflow.python import keras
-from tensorflow.python.eager import context
-from tensorflow.python.eager import function
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.framework import test_util
-from tensorflow.python.keras import activations
-from tensorflow.python.keras import backend as K
-from tensorflow.python.keras import constraints
-from tensorflow.python.keras import initializers
-from tensorflow.python.keras import regularizers
-from tensorflow.python.keras import testing_utils
-from tensorflow.python.keras.engine.input_spec import InputSpec
-from tensorflow.python.keras.layers.cudnn_recurrent import CuDNNLSTM
-from tensorflow.python.keras.layers.recurrent import RNN
-from tensorflow.python.keras.utils import tf_utils
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import gen_cudnn_rnn_ops
-from tensorflow.python.ops import gen_math_ops
-from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import variables
-from tensorflow.python.ops.losses import losses
-from tensorflow.python.platform import test
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import gradient_descent
-
-
-class RNNTest(test.TestCase):
-
-  rewrites = rewriter_config_pb2.RewriterConfig()
-  rewrites.function_optimization = rewriter_config_pb2.RewriterConfig.OFF
-  customer_optimizer = rewrites.custom_optimizers.add()
-  customer_optimizer.name = 'ExperimentalImplementationSelector'
-  rewrites.min_graph_nodes = -1
-  graph_options = config_pb2.GraphOptions(rewrite_options=rewrites)
-  config = config_pb2.ConfigProto(graph_options=graph_options)
-
-  def setUp(self):
-    self.config = RNNTest.config
-
-  def tearDown(self):
-    ops.reset_default_graph()
-
-  @test_util.run_deprecated_v1
-  def test_unifiedRNN(self):
-    input_shape = 10
-    rnn_state_size = 8
-    output_shape = 8
-    timestep = 4
-    batch = 100
-    epoch = 1
-
-    with self.cached_session(config=self.config, use_gpu=True) as sess:
-      (x_train, y_train), _ = testing_utils.get_test_data(
-          train_samples=batch,
-          test_samples=0,
-          input_shape=(timestep, input_shape),
-          num_classes=output_shape)
-      y_train = keras.utils.to_categorical(y_train, output_shape)
-
-      layer = UnifiedLSTM(rnn_state_size)
-
-      inputs = array_ops.placeholder(
-          dtypes.float32, shape=(None, timestep, input_shape), name='inputs')
-      predict = array_ops.placeholder(
-          dtypes.float32, shape=(None, output_shape), name='predict')
-
-      outputs, runtime = layer(inputs)
-      loss = losses.softmax_cross_entropy(predict, outputs)
-      optimizer = gradient_descent.GradientDescentOptimizer(0.001)
-      train_op = optimizer.minimize(loss)
-
-      sess.run([variables.global_variables_initializer()])
-      existing_loss = 0
-      for _ in range(epoch):
-        loss_value, _, runtime_value = sess.run([loss, train_op, runtime], {
-            inputs: x_train,
-            predict: y_train
-        })
-        if test.is_gpu_available():
-          self.assertEquals(runtime_value, b'cudnn')
-        else:
-          self.assertEquals(runtime_value, b'cpu')
-        # Make sure the loss is updated for every epoch
-        # (layer weights properly updated).
-        self.assertNotEqual(existing_loss, loss_value)
-        existing_loss = loss_value
-
-  @test_util.run_deprecated_v1
-  def test_unifiedRNN_with_cond(self):
-    # This test is to demonstrate the graph rewrite of grappler plugin under
-    # the condition that the function returns different number of internal
-    # states.
-    input_shape = 10
-    rnn_state_size = 8
-    output_shape = 8
-    timestep = 4
-    batch = 100
-    epoch = 1
-
-    with self.cached_session(config=self.config, use_gpu=True) as sess:
-      (x_train, y_train), _ = testing_utils.get_test_data(
-          train_samples=batch,
-          test_samples=0,
-          input_shape=(timestep, input_shape),
-          num_classes=output_shape)
-      y_train = keras.utils.to_categorical(y_train, output_shape)
-
-      layer = UnifiedLSTM(rnn_state_size)
-
-      inputs = array_ops.placeholder(
-          dtypes.float32, shape=(None, timestep, input_shape), name='inputs')
-      predict = array_ops.placeholder(
-          dtypes.float32, shape=(None, output_shape), name='predict')
-
-      zeros = array_ops.zeros([batch, output_shape])
-      dummy_runtime = constant_op.constant(
-          'unknown', dtype=dtypes.string, name='runtime')
-      a = constant_op.constant(0)
-      b = constant_op.constant(1)
-      # Will always run the lstm layer.
-      outputs, runtime = control_flow_ops.cond(
-          gen_math_ops.less(a, b),
-          lambda: layer(inputs),
-          lambda: (zeros, dummy_runtime))
-      loss = losses.softmax_cross_entropy(predict, outputs)
-      optimizer = gradient_descent.GradientDescentOptimizer(0.001)
-      train_op = optimizer.minimize(loss)
-
-      sess.run([variables.global_variables_initializer()])
-      existing_loss = 0
-
-      for _ in range(epoch):
-        loss_value, _, runtime_value = sess.run([loss, train_op, runtime], {
-            inputs: x_train,
-            predict: y_train
-        })
-        if test.is_gpu_available():
-          self.assertEquals(runtime_value, b'cudnn')
-        else:
-          self.assertEquals(runtime_value, b'cpu')
-        # Make sure the loss is updated for every epoch
-        # (layer weights properly updated).
-        self.assertNotEqual(existing_loss, loss_value)
-        existing_loss = loss_value
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def test_keras_model_with_lstm(self):
-    input_shape = 10
-    rnn_state_size = 8
-    output_shape = 8
-    timestep = 4
-    batch = 100
-    epoch = 10
-
-    (x_train, y_train), _ = testing_utils.get_test_data(
-        train_samples=batch,
-        test_samples=0,
-        input_shape=(timestep, input_shape),
-        num_classes=output_shape)
-    y_train = keras.utils.to_categorical(y_train, output_shape)
-
-    layer = UnifiedLSTM(rnn_state_size)
-
-    inputs = keras.layers.Input(
-        shape=[timestep, input_shape], dtype=dtypes.float32)
-
-    outputs, unused_runtime = layer(inputs)
-    model = keras.models.Model(inputs, outputs)
-    model.compile('rmsprop', loss='mse')
-    model.fit(x_train, y_train, epochs=epoch)
-
-  def _measure_performance(self, test_config, model, x_train, y_train):
-    batch = test_config['batch']
-    epoch = test_config['epoch']
-    warmup_epoch = test_config['warmup_epoch']
-
-    # warm up the model
-    model.fit(x_train, y_train, batch_size=batch, epochs=warmup_epoch)
-    start_time = time.time()
-    model.fit(x_train, y_train, batch_size=batch, epochs=epoch - warmup_epoch)
-    end_time = time.time()
-    return (end_time - start_time) / (epoch - warmup_epoch)
-
-  def _time_performance_run_cudnn_lstm(self, test_config, x_train, y_train):
-    # Get the performance number for standard Cudnn LSTM
-    input_shape = test_config['input_shape']
-    rnn_state_size = test_config['rnn_state_size']
-    timestep = test_config['timestep']
-
-    cudnn_lstm_layer = CuDNNLSTM(rnn_state_size)
-    inputs = keras.layers.Input(
-        shape=[timestep, input_shape], dtype=dtypes.float32)
-
-    outputs = cudnn_lstm_layer(inputs)
-    model = keras.models.Model(inputs, outputs)
-    model.compile('sgd', 'mse')
-
-    sec_per_epoch = self._measure_performance(
-        test_config, model, x_train, y_train)
-    logging.info('Average performance for %s per epoch is: %s',
-                 'CuDNN LSTM', sec_per_epoch)
-    return sec_per_epoch
-
-  def _time_performance_run_unifed_lstm_gpu(
-      self, test_config, x_train, y_train):
-    # Get performance number for Unified_LSTM with grappler swap the impl
-    input_shape = test_config['input_shape']
-    rnn_state_size = test_config['rnn_state_size']
-    timestep = test_config['timestep']
-
-    layer = UnifiedLSTM(rnn_state_size)
-    inputs = keras.layers.Input(
-        shape=[timestep, input_shape], dtype=dtypes.float32)
-
-    outputs, _ = layer(inputs)
-    model = keras.models.Model(inputs, outputs)
-    model.compile('sgd', 'mse')
-
-    sec_per_epoch = self._measure_performance(
-        test_config, model, x_train, y_train)
-    logging.info('Average performance for %s per epoch is: %s',
-                 'Unified LSTM', sec_per_epoch)
-    return sec_per_epoch
-
-  def _time_performance_run_normal_lstm(
-      self, test_config, x_train, y_train):
-    # Get performance number for standard LSTM on GPU.
-    input_shape = test_config['input_shape']
-    rnn_state_size = test_config['rnn_state_size']
-    timestep = test_config['timestep']
-
-    layer = keras.layers.LSTM(rnn_state_size)
-    inputs = keras.layers.Input(
-        shape=[timestep, input_shape], dtype=dtypes.float32)
-
-    outputs = layer(inputs)
-    model = keras.models.Model(inputs, outputs)
-    model.compile('sgd', 'mse')
-
-    sec_per_epoch = self._measure_performance(
-        test_config, model, x_train, y_train)
-    logging.info('Average performance for %s per epoch is: %s',
-                 'Normal LSTM', sec_per_epoch)
-    return sec_per_epoch
-
-  @test_util.run_in_graph_and_eager_modes(config=config, use_gpu=True)
-  def test_performance_with_standard_cudnn_impl(self):
-    if not test.is_gpu_available():
-      self.skipTest('performance test will only run on GPU')
-
-    batch = 64
-    num_batch = 10
-    test_config = {
-        'input_shape': 128,
-        'rnn_state_size': 64,
-        'output_shape': 64,
-        'timestep': 50,
-        'batch': batch,
-        'epoch': 20,
-        # The performance for warmup epoch is ignored.
-        'warmup_epoch': 1,
-    }
-    (x_train, y_train), _ = testing_utils.get_test_data(
-        train_samples=(batch * num_batch),
-        test_samples=0,
-        input_shape=(test_config['timestep'], test_config['input_shape']),
-        num_classes=test_config['output_shape'])
-    y_train = keras.utils.to_categorical(y_train, test_config['output_shape'])
-
-    cudnn_duration = self._time_performance_run_cudnn_lstm(
-        test_config, x_train, y_train)
-    unified_lstm_gpu_duration = self._time_performance_run_unifed_lstm_gpu(
-        test_config, x_train, y_train)
-    normal_lstm_duration = self._time_performance_run_normal_lstm(
-        test_config, x_train, y_train)
-
-    cudnn_vs_unified = cudnn_duration / unified_lstm_gpu_duration
-    unified_vs_normal = normal_lstm_duration / unified_lstm_gpu_duration
-
-    # TODO(scottzhu): reeanble the test after moving it to benchmark test suite.
-    # The current test has performance flakiness issue.
-    logging.info('Expect the performance of Unified LSTM is within 80% of '
-                 'CuDNN LSTM, got {0:.2f}%'.format(cudnn_vs_unified * 100))
-    logging.info('Expect the performance of Unified LSTM is more than 5 times'
-                 ' of normal LSTM, got {0:.2f}'.format(unified_vs_normal))
-
-    # Assert the performance diff should be within 80% of the native cudnn.
-    # self.assertGreaterEqual(
-    #     cudnn_vs_unified, 0.80,
-    #     'Expect the performance of Unified LSTM is within 80% of CuDNN LSTM, '
-    #     'but got {0:.2f}%'.format(cudnn_vs_unified * 100))
-    # # Assert the performance diff between CPU impl and GPU impl should be more
-    # # than 5 times.
-    # self.assertGreaterEqual(
-    #     unified_vs_normal, 5,
-    #     'Expect the performance of Unified LSTM is more than 5 times of '
-    #     'normal LSTM, but got {0:.2f}'.format(unified_vs_normal))
-
-
-class UnifiedLSTM(RNN):
-
-  def __init__(self,
-               units,
-               activation='tanh',
-               recurrent_activation='hard_sigmoid',
-               kernel_initializer='glorot_uniform',
-               recurrent_initializer='orthogonal',
-               bias_initializer='zeros',
-               unit_forget_bias=True,
-               kernel_regularizer=None,
-               recurrent_regularizer=None,
-               bias_regularizer=None,
-               activity_regularizer=None,
-               kernel_constraint=None,
-               recurrent_constraint=None,
-               bias_constraint=None,
-               return_sequences=False,
-               return_state=False,
-               go_backwards=False,
-               stateful=False,
-               time_major=False,
-               **kwargs):
-    super(RNN, self).__init__(**kwargs)  # pylint: disable=bad-super-call
-    self.units = units
-    cell_spec = collections.namedtuple('cell', ['state_size', 'output_size'])
-    self.cell = cell_spec(
-        state_size=(self.units, self.units), output_size=self.units)
-    self.activation = activations.get(activation)
-    self.recurrent_activation = activations.get(recurrent_activation)
-    self.kernel_initializer = initializers.get(kernel_initializer)
-    self.recurrent_initializer = initializers.get(recurrent_initializer)
-    self.bias_initializer = initializers.get(bias_initializer)
-    self.unit_forget_bias = unit_forget_bias
-
-    self.kernel_regularizer = regularizers.get(kernel_regularizer)
-    self.recurrent_regularizer = regularizers.get(recurrent_regularizer)
-    self.bias_regularizer = regularizers.get(bias_regularizer)
-    self.activity_regularizer = regularizers.get(activity_regularizer)
-
-    self.kernel_constraint = constraints.get(kernel_constraint)
-    self.recurrent_constraint = constraints.get(recurrent_constraint)
-    self.bias_constraint = constraints.get(bias_constraint)
-
-    self.return_sequences = return_sequences
-    self.return_state = return_state
-    self.go_backwards = go_backwards
-    self.stateful = stateful
-    self.time_major = time_major
-    self._num_constants = None
-    self._num_inputs = None
-    self._states = None
-    self.input_spec = [InputSpec(ndim=3)]
-    self.state_spec = [
-        InputSpec(shape=(None, dim)) for dim in (self.units, self.units)
-    ]
-
-  @tf_utils.shape_type_conversion
-  def build(self, input_shape):
-    super(UnifiedLSTM, self).build(input_shape)
-    if isinstance(input_shape, list):
-      input_shape = input_shape[0]
-    input_dim = int(input_shape[-1])
-
-    self.kernel = self.add_weight(
-        shape=(input_dim, self.units * 4),
-        name='kernel',
-        dtype=dtypes.float32,
-        use_resource=True,
-        initializer=self.kernel_initializer,
-        regularizer=self.kernel_regularizer,
-        constraint=self.kernel_constraint)
-    self.recurrent_kernel = self.add_weight(
-        shape=(self.units, self.units * 4),
-        name='recurrent_kernel',
-        dtype=dtypes.float32,
-        use_resource=True,
-        initializer=self.recurrent_initializer,
-        regularizer=self.recurrent_regularizer,
-        constraint=self.recurrent_constraint)
-
-    # Normal LSTM has 4 bias instead of 8.
-    if self.unit_forget_bias:
-
-      def bias_initializer(_, *args, **kwargs):
-        return array_ops.concat([
-            self.bias_initializer((self.units * 5,), *args, **kwargs),
-            initializers.Ones()((self.units,), *args, **kwargs),
-            self.bias_initializer((self.units * 2,), *args, **kwargs),
-        ],
-                                axis=0)
-    else:
-      bias_initializer = self.bias_initializer
-    self.bias = self.add_weight(
-        shape=(self.units * 8,),
-        name='bias',
-        dtype=dtypes.float32,
-        use_resource=True,
-        initializer=bias_initializer,
-        regularizer=self.bias_regularizer,
-        constraint=self.bias_constraint)
-    self.built = True
-
-  def call(self, inputs, mask=None, training=None, initial_state=None):
-    if isinstance(inputs, list):
-      initial_state = inputs[1:]
-      inputs = inputs[0]
-    elif initial_state is not None:
-      pass
-    elif self.stateful:
-      initial_state = self.states
-    else:
-      initial_state = self.get_initial_state(inputs)
-
-    if len(initial_state) != len(self.states):
-      raise ValueError('Layer has ' + str(len(self.states)) +
-                       ' states but was passed ' + str(len(initial_state)) +
-                       ' initial states.')
-
-    if self.go_backwards:
-      # Reverse time axis.
-      inputs = K.reverse(inputs, 1)
-
-    if ops.executing_eagerly_outside_functions():
-      if context.num_gpus() > 0:
-        outputs, [new_h, new_c], runtime = cudnn_lstm(
-            inputs, initial_state[0], initial_state[1], self.kernel,
-            self.recurrent_kernel, self.bias, self.units)
-      else:
-        outputs, [new_h, new_c], runtime = normal_lstm(
-            inputs, initial_state[0], initial_state[1], self.kernel,
-            self.recurrent_kernel, self.bias, self.units, self.activation,
-            self.recurrent_activation)
-    else:
-      outputs, [new_h, new_c], runtime = normal_lstm(
-          inputs, initial_state[0], initial_state[1], self.kernel,
-          self.recurrent_kernel, self.bias, self.units, self.activation,
-          self.recurrent_activation)
-
-      function.register(cudnn_lstm, inputs, initial_state[0], initial_state[1],
-                        self.kernel, self.recurrent_kernel, self.bias,
-                        self.units)
-
-    states = [new_h, new_c]
-
-    if self.stateful:
-      updates = []
-      for i in range(len(states)):
-        updates.append(state_ops.assign(self.states[i], states[i]))
-      self.add_update(updates, inputs)
-
-    if self.return_sequences:
-      output = outputs
-    else:
-      output = outputs[:, -1, :]
-
-    if self.return_state:
-      return [output] + states
-    else:
-      return output, runtime
-
-  @tf_utils.shape_type_conversion
-  def compute_output_shape(self, input_shape):
-    if isinstance(input_shape, list):
-      input_shape = input_shape[0]
-
-    if _is_multiple_state(self.cell.state_size):
-      state_size = self.cell.state_size
-    else:
-      state_size = [self.cell.state_size]
-
-    if getattr(self.cell, 'output_size', None) is not None:
-      output_dim = tensor_shape.as_shape(self.cell.output_size).as_list()
-    else:
-      # Note that state_size[0] could be a tensor_shape or int.
-      output_dim = tensor_shape.as_shape(state_size[0]).as_list()
-
-    if self.return_sequences:
-      output_shape = tuple([input_shape[0], input_shape[1]] + output_dim)
-    else:
-      output_shape = tuple([input_shape[0]] + output_dim)
-
-    if self.return_state:
-      state_shape = [
-          tuple([input_shape[0]] + tensor_shape.as_shape(dim).as_list())
-          for dim in state_size
-      ]
-      return [output_shape] + state_shape
-    else:
-      return output_shape
-
-  @property
-  def trainable_weights(self):
-    if self.trainable and self.built:
-      return [self.kernel, self.recurrent_kernel, self.bias]
-    return []
-
-  @property
-  def non_trainable_weights(self):
-    if not self.trainable and self.built:
-      return [self.kernel, self.recurrent_kernel, self.bias]
-    return []
-
-  @property
-  def losses(self):
-    return super(RNN, self).losses
-
-  def get_losses_for(self, inputs=None):
-    return super(RNN, self).get_losses_for(inputs=inputs)   # pylint: disable=bad-super-call
-
-  def get_weights(self):
-    return super(RNN, self).get_weights()  # pylint: disable=bad-super-call
-
-
-def _canonical_to_params(weights, biases, shape):
-  weights = [array_ops.reshape(x, shape) for x in weights]
-  biases = [array_ops.reshape(x, shape) for x in biases]
-  return array_ops.concat(weights + biases, axis=0)
-
-
-def _is_multiple_state(state_size):
-  """Check whether the state_size contains multiple states."""
-  return (hasattr(state_size, '__len__') and
-          not isinstance(state_size, tensor_shape.TensorShape))
-
-
-@function.defun_with_attributes(
-    attributes={
-        'experimental_api_implements': 'lstm',
-        'experimental_api_preferred_device': 'CPU'
-    })
-def normal_lstm(inputs, init_h, init_c, kernel, recurrent_kernel, bias, units,
-                activation, recurrent_activation):
-  input_shape = K.int_shape(inputs)
-  timesteps = input_shape[1]
-
-  def step(cell_inputs, cell_states):
-    h_tm1 = cell_states[0]  # previous memory state
-    c_tm1 = cell_states[1]  # previous carry state
-
-    # Only use the second half of the bias weights.
-    _, real_bias = array_ops.split(bias, 2)
-
-    z = K.dot(cell_inputs, kernel)
-    z += K.dot(h_tm1, recurrent_kernel)
-    z = K.bias_add(z, real_bias)
-
-    z0 = z[:, :units]
-    z1 = z[:, units:2 * units]
-    z2 = z[:, 2 * units:3 * units]
-    z3 = z[:, 3 * units:]
-
-    i = recurrent_activation(z0)
-    f = recurrent_activation(z1)
-    c = f * c_tm1 + i * activation(z2)
-    o = recurrent_activation(z3)
-
-    h = o * activation(c)
-    return h, [h, c]
-
-  _, outputs, new_states = K.rnn(
-      step,
-      inputs, [init_h, init_c],
-      constants=None,
-      unroll=False,
-      input_length=timesteps)
-  return outputs, new_states, constant_op.constant(
-      'cpu', dtype=dtypes.string, name='runtime')
-
-
-@function.defun_with_attributes(
-    attributes={
-        'experimental_api_implements': 'lstm',
-        'experimental_api_preferred_device': 'GPU'
-    })
-def cudnn_lstm(inputs, input_h, input_c, kernel, recurrent_kernel, bias, units):
-  inputs = array_ops.transpose(inputs, perm=(1, 0, 2))
-  input_h = array_ops.expand_dims(input_h, axis=0)
-  input_c = array_ops.expand_dims(input_c, axis=0)
-
-  params = _canonical_to_params(
-      weights=[
-          kernel[:, :units],
-          kernel[:, units:units * 2],
-          kernel[:, units * 2:units * 3],
-          kernel[:, units * 3:],
-          recurrent_kernel[:, :units],
-          recurrent_kernel[:, units:units * 2],
-          recurrent_kernel[:, units * 2:units * 3],
-          recurrent_kernel[:, units * 3:],
-      ],
-      biases=[
-          bias[:units],
-          bias[units:units * 2],
-          bias[units * 2:units * 3],
-          bias[units * 3:units * 4],
-          bias[units * 4:units * 5],
-          bias[units * 5:units * 6],
-          bias[units * 6:units * 7],
-          bias[units * 7:],
-      ],
-      shape=constant_op.constant([-1]))
-
-  outputs, h, c, _ = gen_cudnn_rnn_ops.cudnn_rnn(
-      inputs, input_h=input_h, input_c=input_c, params=params)
-  outputs = array_ops.transpose(outputs, perm=[1, 0, 2])
-  h = h[0]
-  c = c[0]
-  return outputs, [h, c], constant_op.constant(
-      'cudnn', dtype=dtypes.string, name='runtime')
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/python/keras/layers/wrappers.py b/tensorflow/python/keras/layers/wrappers.py
index 67b154141efc036b5fa7920c8179b35f5eb38cc1..c78807611bd8b60c7cbc38828ce0da780c5554e1 100644
--- a/tensorflow/python/keras/layers/wrappers.py
+++ b/tensorflow/python/keras/layers/wrappers.py
@@ -29,6 +29,7 @@ from tensorflow.python.keras.layers.recurrent import _standardize_args
 from tensorflow.python.keras.utils import generic_utils
 from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.ops import array_ops
+from tensorflow.python.training.checkpointable import base as checkpointable
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 
@@ -45,6 +46,7 @@ class Wrapper(Layer):
       layer: The layer to be wrapped.
   """
 
+  @checkpointable.no_automatic_dependency_tracking
   def __init__(self, layer, **kwargs):
     assert isinstance(layer, Layer)
     self.layer = layer
@@ -380,6 +382,7 @@ class Bidirectional(Wrapper):
   ```
   """
 
+  @checkpointable.no_automatic_dependency_tracking
   def __init__(self, layer, merge_mode='concat', weights=None, **kwargs):
     if not isinstance(layer, Layer):
       raise ValueError(
diff --git a/tensorflow/python/keras/layers/wrappers_test.py b/tensorflow/python/keras/layers/wrappers_test.py
index 427314faf178bb295cfa43f83395ff9e3cec235a..46d5487b2c00fa3177a595774dc7ce8d40655f2e 100644
--- a/tensorflow/python/keras/layers/wrappers_test.py
+++ b/tensorflow/python/keras/layers/wrappers_test.py
@@ -192,8 +192,8 @@ class TimeDistributedTest(test.TestCase):
     x = keras.layers.Input(shape=(3, 2))
     layer = keras.layers.TimeDistributed(keras.layers.BatchNormalization())
     _ = layer(x)
-    self.assertEquals(len(layer.updates), 2)
-    self.assertEquals(len(layer.trainable_weights), 2)
+    self.assertEqual(len(layer.updates), 2)
+    self.assertEqual(len(layer.trainable_weights), 2)
     layer.trainable = False
     assert not layer.updates
     assert not layer.trainable_weights
@@ -201,7 +201,6 @@ class TimeDistributedTest(test.TestCase):
     assert len(layer.updates) == 2
     assert len(layer.trainable_weights) == 2
 
-  @tf_test_util.run_deprecated_v1
   def test_TimeDistributed_with_masked_embedding_and_unspecified_shape(self):
     with self.cached_session():
       # test with unspecified shape and Embeddings with mask_zero
@@ -234,7 +233,6 @@ class TimeDistributedTest(test.TestCase):
         self.assertAllEqual(mask_outputs_val[i], ref_mask_val[i])
       self.assertIs(mask_outputs[-1], None)  # final layer
 
-  @tf_test_util.run_deprecated_v1
   def test_TimeDistributed_with_masking_layer(self):
     with self.cached_session():
       # test with Masking layer
@@ -377,7 +375,7 @@ class BidirectionalTest(test.TestCase):
       model.compile(loss='mse', optimizer='sgd')
       model.fit(x, y, epochs=1, batch_size=1)
 
-  @tf_test_util.run_deprecated_v1
+  @tf_test_util.run_v1_only('b/120545219')
   def test_Bidirectional_merged_value(self):
     rnn = keras.layers.LSTM
     samples = 2
@@ -508,7 +506,7 @@ class BidirectionalTest(test.TestCase):
       layer.trainable = True
       assert len(layer.trainable_weights) == 6
 
-  @tf_test_util.run_deprecated_v1
+  @tf_test_util.run_v1_only('b/120545219')
   def test_Bidirectional_updates(self):
     with self.cached_session():
       x = keras.layers.Input(shape=(3, 2))
diff --git a/tensorflow/python/keras/losses.py b/tensorflow/python/keras/losses.py
index f6953f84995fc36200adb987329dfbac583d3686..4c584d0ff059ba8eabd3de06ebb06b2703400a73 100644
--- a/tensorflow/python/keras/losses.py
+++ b/tensorflow/python/keras/losses.py
@@ -28,6 +28,7 @@ from tensorflow.python.keras import backend as K
 from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras.utils.generic_utils import serialize_keras_object
 from tensorflow.python.keras.utils.losses_utils import compute_weighted_loss
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops.losses import losses_impl
@@ -268,6 +269,132 @@ class MeanSquaredLogarithmicError(Loss):
     return mean_squared_logarithmic_error(y_true, y_pred)
 
 
+@tf_export('keras.losses.BinaryCrossentropy')
+class BinaryCrossentropy(Loss):
+  """Computes the binary cross entropy loss between the labels and predictions.
+
+  Usage:
+
+  ```python
+  bce = tf.keras.losses.BinaryCrossentropy()
+  loss = bce([0., 0., 1., 1.], [1., 1., 1., 0.])
+  print('Loss: ', loss.numpy())  # Loss: 12.007
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.keras.losses.BinaryCrossentropy())
+  ````
+
+  Args:
+    from_logits: Whether `output` is expected to be a logits tensor. By default,
+      we consider that `output` encodes a probability distribution.
+    label_smoothing: If greater than `0` then smooth the labels.
+    reduction: Type of `tf.losses.Reduction` to apply to loss. Default value is
+      `SUM_OVER_BATCH_SIZE`.
+    name: Optional name for the op.
+  """
+
+  def __init__(self,
+               from_logits=False,
+               label_smoothing=0,
+               reduction=losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE,
+               name=None):
+    super(BinaryCrossentropy, self).__init__(reduction=reduction, name=name)
+    self.from_logits = from_logits
+    self.label_smoothing = label_smoothing
+
+  def call(self, y_true, y_pred):
+    """Invokes the `BinaryCrossentropy` instance.
+
+    Args:
+      y_true: Ground truth values.
+      y_pred: The predicted values.
+
+    Returns:
+      Binary cross entropy losses.
+    """
+    y_pred = ops.convert_to_tensor(y_pred)
+    y_true = math_ops.cast(y_true, y_pred.dtype)
+
+    if self.label_smoothing > 0:
+      y_true = y_true * (1 - self.label_smoothing) + 0.5 * self.label_smoothing
+
+    return binary_crossentropy(y_true, y_pred, from_logits=self.from_logits)
+
+
+@tf_export('keras.losses.CategoricalCrossentropy')
+class CategoricalCrossentropy(Loss):
+  """Computes categorical cross entropy loss between the `y_true` and `y_pred`.
+
+  Usage:
+
+  ```python
+  cce = tf.keras.losses.CategoricalCrossentropy()
+  loss = cce(
+    [[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]],
+    [[.9, .05, .05], [.5, .89, .6], [.05, .01, .94]])
+  print('Loss: ', loss.numpy())  # Loss: 0.3239
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.keras.losses.CategoricalCrossentropy())
+  ````
+
+  Args:
+    from_logits: Whether `output` is expected to be a logits tensor. By default,
+      we consider that `output` encodes a probability distribution.
+    label_smoothing: If greater than `0` then smooth the labels. This option is
+      currently not supported when `y_pred` is a sparse input (not one-hot).
+    reduction: Type of `tf.losses.Reduction` to apply to loss. Default value is
+      `SUM_OVER_BATCH_SIZE`.
+    name: Optional name for the op.
+  """
+
+  def __init__(self,
+               from_logits=False,
+               label_smoothing=0,
+               reduction=losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE,
+               name=None):
+    super(CategoricalCrossentropy, self).__init__(
+        reduction=reduction, name=name)
+    self.from_logits = from_logits
+    self.label_smoothing = label_smoothing
+
+  def call(self, y_true, y_pred):
+    """Invokes the `CategoricalCrossentropy` instance.
+
+    Args:
+      y_true: Ground truth values.
+      y_pred: The predicted values.
+
+    Returns:
+      Categorical cross entropy losses.
+    """
+    y_pred = ops.convert_to_tensor(y_pred)
+    y_true = ops.convert_to_tensor(y_true)
+    is_sparse = y_pred.shape != y_true.shape
+
+    if is_sparse:
+      return sparse_categorical_crossentropy(
+          y_true, y_pred, from_logits=self.from_logits)
+    else:
+      y_true = math_ops.cast(y_true, y_pred.dtype)
+      if self.label_smoothing > 0:
+        num_classes = math_ops.cast(array_ops.shape(y_true)[1], y_pred.dtype)
+        smooth_positives = 1.0 - self.label_smoothing
+        smooth_negatives = self.label_smoothing / num_classes
+        y_true = y_true * smooth_positives + smooth_negatives
+
+      return categorical_crossentropy(
+          y_true, y_pred, from_logits=self.from_logits)
+
+
 @tf_export('keras.metrics.mean_squared_error',
            'keras.metrics.mse',
            'keras.metrics.MSE',
@@ -355,20 +482,22 @@ def logcosh(y_true, y_pred):
 
 @tf_export('keras.metrics.categorical_crossentropy',
            'keras.losses.categorical_crossentropy')
-def categorical_crossentropy(y_true, y_pred):
-  return K.categorical_crossentropy(y_true, y_pred)
+def categorical_crossentropy(y_true, y_pred, from_logits=False):
+  return K.categorical_crossentropy(y_true, y_pred, from_logits=from_logits)
 
 
 @tf_export('keras.metrics.sparse_categorical_crossentropy',
            'keras.losses.sparse_categorical_crossentropy')
-def sparse_categorical_crossentropy(y_true, y_pred):
-  return K.sparse_categorical_crossentropy(y_true, y_pred)
+def sparse_categorical_crossentropy(y_true, y_pred, from_logits=False):
+  return K.sparse_categorical_crossentropy(
+      y_true, y_pred, from_logits=from_logits)
 
 
 @tf_export('keras.metrics.binary_crossentropy',
            'keras.losses.binary_crossentropy')
-def binary_crossentropy(y_true, y_pred):
-  return K.mean(K.binary_crossentropy(y_true, y_pred), axis=-1)
+def binary_crossentropy(y_true, y_pred, from_logits=False):
+  return K.mean(
+      K.binary_crossentropy(y_true, y_pred, from_logits=from_logits), axis=-1)
 
 
 @tf_export('keras.metrics.kullback_leibler_divergence',
diff --git a/tensorflow/python/keras/losses_test.py b/tensorflow/python/keras/losses_test.py
index cbf3c3524ccfb788cfdaaf9325ea361e3e59cd5a..bc040fb685759ef20b698642dd9becb303562e73 100644
--- a/tensorflow/python/keras/losses_test.py
+++ b/tensorflow/python/keras/losses_test.py
@@ -27,6 +27,7 @@ from tensorflow.python import keras
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops.losses import losses_impl
 from tensorflow.python.platform import test
 
@@ -94,6 +95,45 @@ class KerasLossesTest(test.TestCase):
       objective_output = keras.losses.sparse_categorical_crossentropy(y_a, y_b)
       assert keras.backend.eval(objective_output).shape == (6,)
 
+  @test_util.run_in_graph_and_eager_modes
+  def test_categorical_crossentropy_loss(self):
+    target = keras.backend.variable(np.random.randint(0, 1, (5, 1)))
+    logits = keras.backend.variable(np.random.random((5, 1)))
+    softmax_output = keras.backend.softmax(logits)
+    output_from_logit = keras.losses.categorical_crossentropy(
+        target, logits, from_logits=True)
+    output_from_softmax = keras.losses.categorical_crossentropy(
+        target, softmax_output)
+    np.testing.assert_allclose(
+        keras.backend.eval(output_from_logit),
+        keras.backend.eval(output_from_softmax), atol=1e-5)
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_sparse_categorical_crossentropy_loss(self):
+    target = keras.backend.variable(np.random.randint(0, 1, (5, 1)))
+    logits = keras.backend.variable(np.random.random((5, 1)))
+    softmax_output = keras.backend.softmax(logits)
+    output_from_logit = keras.losses.sparse_categorical_crossentropy(
+        target, logits, from_logits=True)
+    output_from_softmax = keras.losses.sparse_categorical_crossentropy(
+        target, softmax_output)
+    np.testing.assert_allclose(
+        keras.backend.eval(output_from_logit),
+        keras.backend.eval(output_from_softmax), atol=1e-5)
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_binary_crossentropy_loss(self):
+    target = keras.backend.variable(np.random.randint(0, 1, (5, 1)))
+    logits = keras.backend.variable(np.random.random((5, 1)))
+    sigmoid_output = keras.backend.sigmoid(logits)
+    output_from_logit = keras.losses.binary_crossentropy(
+        target, logits, from_logits=True)
+    output_from_sigmoid = keras.losses.binary_crossentropy(
+        target, sigmoid_output)
+    np.testing.assert_allclose(
+        keras.backend.eval(output_from_logit),
+        keras.backend.eval(output_from_sigmoid), atol=1e-5)
+
   def test_serialization(self):
     fn = keras.losses.get('mse')
     config = keras.losses.serialize(fn)
@@ -499,5 +539,276 @@ class CosineProximityTest(test.TestCase):
     self.assertAlmostEqual(self.evaluate(loss), 0., 3)
 
 
+@test_util.run_all_in_graph_and_eager_modes
+class BinaryCrossentropyTest(test.TestCase):
+
+  def test_config(self):
+    bce_obj = keras.losses.BinaryCrossentropy(
+        reduction=losses_impl.ReductionV2.SUM, name='bce_1')
+    self.assertEqual(bce_obj.name, 'bce_1')
+    self.assertEqual(bce_obj.reduction, losses_impl.ReductionV2.SUM)
+
+  def test_all_correct_unweighted(self):
+    y_true = constant_op.constant([[1, 0, 0], [0, 1, 0], [0, 0, 1]],
+                                  dtype=dtypes.float32)
+    bce_obj = keras.losses.BinaryCrossentropy()
+    loss = bce_obj(y_true, y_true)
+    self.assertAlmostEqual(self.evaluate(loss), 0.0, 3)
+
+    # Test with logits.
+    logits = constant_op.constant([[100.0, -100.0, -100.0],
+                                   [-100.0, 100.0, -100.0],
+                                   [-100.0, -100.0, 100.0]])
+    bce_obj = keras.losses.BinaryCrossentropy(from_logits=True)
+    loss = bce_obj(y_true, logits)
+    self.assertAlmostEqual(self.evaluate(loss), 0.0, 3)
+
+  def test_unweighted(self):
+    bce_obj = keras.losses.BinaryCrossentropy()
+    y_true = constant_op.constant([1, 0, 1, 0, 0, 1], shape=(2, 3))
+    y_pred = constant_op.constant([1, 1, 1, 0, 1, 0],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = bce_obj(y_true, y_pred)
+    self.assertAlmostEqual(self.evaluate(loss), 8.0004, 3)
+
+    # Test with logits.
+    logits = constant_op.constant([10., 10., 10., -10., 10, -10],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    bce_obj = keras.losses.BinaryCrossentropy(from_logits=True)
+    loss = bce_obj(y_true, logits)
+    self.assertAlmostEqual(self.evaluate(loss), 5., 3)
+
+  def test_scalar_weighted(self):
+    bce_obj = keras.losses.BinaryCrossentropy()
+    y_true = constant_op.constant([1, 0, 1, 0, 0, 1], shape=(2, 3))
+    y_pred = constant_op.constant([1, 1, 1, 0, 1, 0],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = bce_obj(y_true, y_pred, sample_weight=2.3)
+    self.assertAlmostEqual(self.evaluate(loss), 18.4010, 3)
+
+    # Test with logits.
+    y_true = array_ops.ones((32, 1))
+    logits = array_ops.ones((32, 1), dtype=dtypes.float32)
+    bce_obj = keras.losses.BinaryCrossentropy(from_logits=True)
+    loss = bce_obj(y_true, logits, sample_weight=2.3)
+    self.assertAlmostEqual(self.evaluate(loss), 0.7205, 3)
+
+  def test_sample_weighted(self):
+    bce_obj = keras.losses.BinaryCrossentropy()
+    y_true = constant_op.constant([1, 0, 1, 0, 0, 1], shape=(2, 3))
+    y_pred = constant_op.constant([1, 1, 1, 0, 1, 0],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float64)
+    sample_weight = constant_op.constant([1.2, 3.4], shape=(2, 1))
+    loss = bce_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), 21.4907, 3)
+
+    # Test with logits.
+    y_true = constant_op.constant([[0, 0, 1], [1, 0, 0], [0, 1, 0]])
+    logits = constant_op.constant(
+        [[100.0, -100.0, -100.0], [-100.0, 100.0, -100.0],
+         [-100.0, -100.0, 100.0]],
+        dtype=dtypes.float64)
+    weights = constant_op.constant([3, 2, 8])
+    bce_obj = keras.losses.BinaryCrossentropy(from_logits=True)
+    loss = bce_obj(y_true, logits, sample_weight=weights)
+    self.assertAlmostEqual(self.evaluate(loss), 288.8888, 3)
+
+  def test_no_reduction(self):
+    y_true = constant_op.constant(((1, 0, 1), (1, 1, 0), (0, 1, 1)))
+    logits = constant_op.constant(((100.0, -100.0, 100.0),
+                                   (100.0, -100.0, 100.0),
+                                   (100.0, 100.0, -100.0)))
+    bce_obj = keras.losses.BinaryCrossentropy(
+        from_logits=True, reduction=losses_impl.ReductionV2.NONE)
+    loss = bce_obj(y_true, logits)
+    self.assertAllClose((0., 66.6666, 66.6666), self.evaluate(loss), 3)
+
+  def test_label_smoothing(self):
+    logits = constant_op.constant([[100.0, -100.0, -100.0]])
+    y_true = constant_op.constant([[1, 0, 1]])
+    label_smoothing = 0.1
+    # Loss: max(x, 0) - x * z + log(1 + exp(-abs(x)))
+    # Label smoothing: z' = z * (1 - L) + 0.5L
+    #                  1  = 1 - 0.5L
+    #                  0  = 0.5L
+    # Applying the above two fns to the given input:
+    # (100 - 100 * (1 - 0.5 L)  + 0 +
+    #  0   + 100 * (0.5 L)      + 0 +
+    #  0   + 100 * (1 - 0.5 L)  + 0) * (1/3)
+    #  = (100 + 50L) * 1/3
+    bce_obj = keras.losses.BinaryCrossentropy(
+        from_logits=True, label_smoothing=label_smoothing)
+    loss = bce_obj(y_true, logits)
+    expected_value = (100.0 + 50.0 * label_smoothing) / 3.0
+    self.assertAlmostEqual(self.evaluate(loss), expected_value, 3)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class CategoricalCrossentropyTest(test.TestCase):
+
+  def test_config(self):
+    cce_obj = keras.losses.CategoricalCrossentropy(
+        reduction=losses_impl.ReductionV2.SUM, name='bce_1')
+    self.assertEqual(cce_obj.name, 'bce_1')
+    self.assertEqual(cce_obj.reduction, losses_impl.ReductionV2.SUM)
+
+  def test_all_correct_unweighted(self):
+    y_true = constant_op.constant([[1, 0, 0], [0, 1, 0], [0, 0, 1]],
+                                  dtype=dtypes.int64)
+    y_pred = constant_op.constant([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]],
+                                  dtype=dtypes.float32)
+    cce_obj = keras.losses.CategoricalCrossentropy()
+    loss = cce_obj(y_true, y_pred)
+    self.assertAlmostEqual(self.evaluate(loss), 0.0, 3)
+
+    # Test with logits.
+    logits = constant_op.constant([[10., 0., 0.], [0., 10., 0.], [0., 0., 10.]])
+    cce_obj = keras.losses.CategoricalCrossentropy(from_logits=True)
+    loss = cce_obj(y_true, logits)
+    self.assertAlmostEqual(self.evaluate(loss), 0.0, 3)
+
+  def test_unweighted(self):
+    cce_obj = keras.losses.CategoricalCrossentropy()
+    y_true = constant_op.constant([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
+    y_pred = constant_op.constant(
+        [[.9, .05, .05], [.5, .89, .6], [.05, .01, .94]], dtype=dtypes.float32)
+    loss = cce_obj(y_true, y_pred)
+    self.assertAlmostEqual(self.evaluate(loss), .3239, 3)
+
+    # Test with logits.
+    logits = constant_op.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]])
+    cce_obj = keras.losses.CategoricalCrossentropy(from_logits=True)
+    loss = cce_obj(y_true, logits)
+    self.assertAlmostEqual(self.evaluate(loss), .0573, 3)
+
+  def test_scalar_weighted(self):
+    cce_obj = keras.losses.CategoricalCrossentropy()
+    y_true = constant_op.constant([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
+    y_pred = constant_op.constant(
+        [[.9, .05, .05], [.5, .89, .6], [.05, .01, .94]], dtype=dtypes.float32)
+    loss = cce_obj(y_true, y_pred, sample_weight=2.3)
+    self.assertAlmostEqual(self.evaluate(loss), .7449, 3)
+
+    # Test with logits.
+    logits = constant_op.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]])
+    cce_obj = keras.losses.CategoricalCrossentropy(from_logits=True)
+    loss = cce_obj(y_true, logits, sample_weight=2.3)
+    self.assertAlmostEqual(self.evaluate(loss), .1317, 3)
+
+  def test_sample_weighted(self):
+    cce_obj = keras.losses.CategoricalCrossentropy()
+    y_true = constant_op.constant([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
+    y_pred = constant_op.constant(
+        [[.9, .05, .05], [.5, .89, .6], [.05, .01, .94]], dtype=dtypes.float32)
+    sample_weight = constant_op.constant([[1.2], [3.4], [5.6]], shape=(3, 1))
+    loss = cce_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), 1.0696, 3)
+
+    # Test with logits.
+    logits = constant_op.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]])
+    cce_obj = keras.losses.CategoricalCrossentropy(from_logits=True)
+    loss = cce_obj(y_true, logits, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), 0.31829, 3)
+
+  def test_no_reduction(self):
+    y_true = constant_op.constant([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
+    logits = constant_op.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]])
+    cce_obj = keras.losses.CategoricalCrossentropy(
+        from_logits=True, reduction=losses_impl.ReductionV2.NONE)
+    loss = cce_obj(y_true, logits)
+    self.assertAllClose((0.001822, 0.000459, 0.169846), self.evaluate(loss), 3)
+
+  def test_label_smoothing(self):
+    logits = constant_op.constant([[100.0, -100.0, -100.0]])
+    y_true = constant_op.constant([[1, 0, 0]])
+    label_smoothing = 0.1
+    # Softmax Cross Entropy Loss: -\sum_i p_i \log q_i
+    # where for a softmax activation
+    # \log q_i = x_i - \log \sum_j \exp x_j
+    #          = x_i - x_max - \log \sum_j \exp (x_j - x_max)
+    # For our activations, [100, -100, -100]
+    # \log ( exp(0) + exp(-200) + exp(-200) ) = 0
+    # so our log softmaxes become: [0, -200, -200]
+    # Label smoothing: z' = z * (1 - L) + L/n
+    #                  1  = 1 - L + L/n
+    #                  0  = L/n
+    # Applying the above two fns to the given input:
+    # -0 * (1 - L + L/n) + 200 * L/n + 200 * L/n = 400 L/n
+    cce_obj = keras.losses.CategoricalCrossentropy(
+        from_logits=True, label_smoothing=label_smoothing)
+    loss = cce_obj(y_true, logits)
+    expected_value = 400.0 * label_smoothing / 3.0
+    self.assertAlmostEqual(self.evaluate(loss), expected_value, 3)
+
+  def test_all_correct_unweighted_sparse(self):
+    y_true = constant_op.constant([[0], [1], [2]], dtype=dtypes.int64)
+    y_pred = constant_op.constant([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]],
+                                  dtype=dtypes.float32)
+    cce_obj = keras.losses.CategoricalCrossentropy()
+    loss = cce_obj(y_true, y_pred)
+    self.assertAlmostEqual(self.evaluate(loss), 0.0, 3)
+
+    # Test with logits.
+    logits = constant_op.constant([[10., 0., 0.], [0., 10., 0.], [0., 0., 10.]])
+    cce_obj = keras.losses.CategoricalCrossentropy(from_logits=True)
+    loss = cce_obj(y_true, logits)
+    self.assertAlmostEqual(self.evaluate(loss), 0.0, 3)
+
+  def test_unweighted_sparse(self):
+    cce_obj = keras.losses.CategoricalCrossentropy()
+    y_true = constant_op.constant([0, 1, 2])
+    y_pred = constant_op.constant(
+        [[.9, .05, .05], [.5, .89, .6], [.05, .01, .94]], dtype=dtypes.float32)
+    loss = cce_obj(y_true, y_pred)
+    self.assertAlmostEqual(self.evaluate(loss), .3239, 3)
+
+    # Test with logits.
+    logits = constant_op.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]])
+    cce_obj = keras.losses.CategoricalCrossentropy(from_logits=True)
+    loss = cce_obj(y_true, logits)
+    self.assertAlmostEqual(self.evaluate(loss), .0573, 3)
+
+  def test_scalar_weighted_sparse(self):
+    cce_obj = keras.losses.CategoricalCrossentropy()
+    y_true = constant_op.constant([[0], [1], [2]])
+    y_pred = constant_op.constant(
+        [[.9, .05, .05], [.5, .89, .6], [.05, .01, .94]], dtype=dtypes.float32)
+    loss = cce_obj(y_true, y_pred, sample_weight=2.3)
+    self.assertAlmostEqual(self.evaluate(loss), .7449, 3)
+
+    # Test with logits.
+    logits = constant_op.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]])
+    cce_obj = keras.losses.CategoricalCrossentropy(from_logits=True)
+    loss = cce_obj(y_true, logits, sample_weight=2.3)
+    self.assertAlmostEqual(self.evaluate(loss), .1317, 3)
+
+  def test_sample_weighted_sparse(self):
+    cce_obj = keras.losses.CategoricalCrossentropy()
+    y_true = constant_op.constant([[0], [1], [2]])
+    y_pred = constant_op.constant(
+        [[.9, .05, .05], [.5, .89, .6], [.05, .01, .94]], dtype=dtypes.float32)
+    sample_weight = constant_op.constant([[1.2], [3.4], [5.6]], shape=(3, 1))
+    loss = cce_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), 1.0696, 3)
+
+    # Test with logits.
+    logits = constant_op.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]])
+    cce_obj = keras.losses.CategoricalCrossentropy(from_logits=True)
+    loss = cce_obj(y_true, logits, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), 0.31829, 3)
+
+  def test_no_reduction_sparse(self):
+    y_true = constant_op.constant([[0], [1], [2]])
+    logits = constant_op.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]])
+    cce_obj = keras.losses.CategoricalCrossentropy(
+        from_logits=True, reduction=losses_impl.ReductionV2.NONE)
+    loss = cce_obj(y_true, logits)
+    self.assertAllClose((0.001822, 0.000459, 0.169846), self.evaluate(loss), 3)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/metrics.py b/tensorflow/python/keras/metrics.py
index 3c2682e4c6ff07c0d7371bf7edef159f0a22501f..c8ccb7f624292639b5c9e3be1604a2e572ee8693 100644
--- a/tensorflow/python/keras/metrics.py
+++ b/tensorflow/python/keras/metrics.py
@@ -28,6 +28,7 @@ from enum import Enum
 import numpy as np
 import six
 
+from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.eager import context
 from tensorflow.python.eager import function
 from tensorflow.python.framework import dtypes
@@ -60,7 +61,6 @@ from tensorflow.python.ops import nn
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.ops import weights_broadcast_ops
-from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util.tf_export import tf_export
 from tensorflow.tools.docs import doc_controls
@@ -171,12 +171,18 @@ class _ConfusionMatrix(Enum):
 
 
 def _assert_thresholds_range(thresholds):
-  invalid_thresholds = [t for t in thresholds if t < 0 or t > 1]
-  if any(invalid_thresholds):
+  invalid_thresholds = [t for t in thresholds if t is None or t < 0 or t > 1]
+  if invalid_thresholds:
     raise ValueError('Threshold values must be in [0, 1]. Invalid values: {}'
                      .format(invalid_thresholds))
 
 
+def _parse_init_thresholds(thresholds, default_threshold=0.5):
+  thresholds = to_list(default_threshold if thresholds is None else thresholds)
+  _assert_thresholds_range(thresholds)
+  return thresholds
+
+
 def _update_confusion_matrix_variables(variables_to_update,
                                        y_true,
                                        y_pred,
@@ -511,7 +517,7 @@ class Metric(Layer):
   ### End: For use by subclasses ###
 
 
-@tf_export('metrics.Mean', 'keras.metrics.Mean')
+@tf_export('keras.metrics.Mean')
 class Mean(Metric):
   """Computes the (weighted) mean of the given values.
 
@@ -528,7 +534,7 @@ class Mean(Metric):
   Usage:
 
   ```python
-  m = tf.metrics.Mean()
+  m = tf.keras.metrics.Mean()
   m.update_state([1, 3, 5, 7])
   print('Final result: ', m.result().numpy())  # Final result: 4.0
   ```
@@ -537,7 +543,7 @@ class Mean(Metric):
 
   ```python
   model = keras.models.Model(inputs, outputs)
-  model.add_metric(metrics_module.Mean(name='mean_1')(outputs))
+  model.add_metric(tf.keras.metrics.Mean(name='mean_1')(outputs))
   model.compile('sgd', loss='mse')
   ```
   """
@@ -651,7 +657,7 @@ class MeanMetricWrapper(Mean):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@tf_export('metrics.Accuracy', 'keras.metrics.Accuracy')
+@tf_export('keras.metrics.Accuracy')
 class Accuracy(MeanMetricWrapper):
   """Calculates how often predictions matches labels.
 
@@ -670,7 +676,7 @@ class Accuracy(MeanMetricWrapper):
   Usage:
 
   ```python
-  m = tf.metrics.Accuracy()
+  m = tf.keras.metrics.Accuracy()
   m.update_state([1, 2, 3, 4], [0, 2, 3, 4])
   print('Final result: ', m.result().numpy())  # Final result: 0.75
   ```
@@ -679,7 +685,7 @@ class Accuracy(MeanMetricWrapper):
 
   ```python
   model = keras.models.Model(inputs, outputs)
-  model.compile('sgd', loss='mse', metrics=[tf.metrics.Accuracy()])
+  model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.Accuracy()])
   ```
   """
 
@@ -693,7 +699,7 @@ class Accuracy(MeanMetricWrapper):
     return super(Accuracy, cls).from_config(config)
 
 
-@tf_export('metrics.BinaryAccuracy', 'keras.metrics.BinaryAccuracy')
+@tf_export('keras.metrics.BinaryAccuracy')
 class BinaryAccuracy(MeanMetricWrapper):
   """Calculates how often predictions matches labels.
 
@@ -712,7 +718,7 @@ class BinaryAccuracy(MeanMetricWrapper):
   Usage:
 
   ```python
-  m = tf.metrics.BinaryAccuracy()
+  m = tf.keras.metrics.BinaryAccuracy()
   m.update_state([1, 1, 0, 0], [0.98, 1, 0, 0.6])
   print('Final result: ', m.result().numpy())  # Final result: 0.75
   ```
@@ -721,7 +727,7 @@ class BinaryAccuracy(MeanMetricWrapper):
 
   ```python
   model = keras.models.Model(inputs, outputs)
-  model.compile('sgd', loss='mse', metrics=[tf.metrics.BinaryAccuracy()])
+  model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.BinaryAccuracy()])
   ```
   """
 
@@ -744,8 +750,7 @@ class BinaryAccuracy(MeanMetricWrapper):
     return super(BinaryAccuracy, cls).from_config(config)
 
 
-@tf_export(
-    'metrics.CategoricalAccuracy', 'keras.metrics.CategoricalAccuracy')
+@tf_export('keras.metrics.CategoricalAccuracy')
 class CategoricalAccuracy(MeanMetricWrapper):
   """Calculates how often predictions matches labels.
 
@@ -768,7 +773,7 @@ class CategoricalAccuracy(MeanMetricWrapper):
   Usage:
 
   ```python
-  m = tf.metrics.CategoricalAccuracy()
+  m = tf.keras.metrics.CategoricalAccuracy()
   m.update_state([[0, 0, 1], [0, 1, 0]], [[0.1, 0.9, 0.8], [0.05, 0.95, 0]])
   print('Final result: ', m.result().numpy())  # Final result: 0.5
   ```
@@ -777,7 +782,10 @@ class CategoricalAccuracy(MeanMetricWrapper):
 
   ```python
   model = keras.models.Model(inputs, outputs)
-  model.compile('sgd', loss='mse', metrics=[tf.metrics.CategoricalAccuracy()])
+  model.compile(
+    'sgd',
+    loss='mse',
+    metrics=[tf.keras.metrics.CategoricalAccuracy()])
   ```
   """
 
@@ -798,9 +806,7 @@ class CategoricalAccuracy(MeanMetricWrapper):
     return super(CategoricalAccuracy, cls).from_config(config)
 
 
-@tf_export(
-    'metrics.SparseCategoricalAccuracy',
-    'keras.metrics.SparseCategoricalAccuracy')
+@tf_export('keras.metrics.SparseCategoricalAccuracy')
 class SparseCategoricalAccuracy(MeanMetricWrapper):
   """Calculates how often predictions matches integer labels.
 
@@ -820,7 +826,7 @@ class SparseCategoricalAccuracy(MeanMetricWrapper):
   Usage:
 
   ```python
-  m = tf.metrics.SparseCategoricalAccuracy()
+  m = tf.keras.metrics.SparseCategoricalAccuracy()
   m.update_state([[2], [1]], [[0.1, 0.9, 0.8], [0.05, 0.95, 0]])
   print('Final result: ', m.result().numpy())  # Final result: 0.5
   ```
@@ -832,7 +838,7 @@ class SparseCategoricalAccuracy(MeanMetricWrapper):
   model.compile(
       'sgd',
       loss='mse',
-      metrics=[tf.metrics.SparseCategoricalAccuracy()])
+      metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])
   ```
   """
 
@@ -869,12 +875,11 @@ class _ConfusionMatrixConditionCount(Metric):
     """
     super(_ConfusionMatrixConditionCount, self).__init__(name=name, dtype=dtype)
     self._confusion_matrix_cond = confusion_matrix_cond
-    self.thresholds = 0.5 if thresholds is None else thresholds
-    thresholds = to_list(thresholds)
-    _assert_thresholds_range(thresholds)
+    self.thresholds = _parse_init_thresholds(
+        thresholds, default_threshold=0.5)
     self.accumulator = self.add_weight(
         'accumulator',
-        shape=(len(thresholds),),
+        shape=(len(self.thresholds),),
         initializer=init_ops.zeros_initializer)
 
   def update_state(self, y_true, y_pred, sample_weight=None):
@@ -895,10 +900,10 @@ class _ConfusionMatrixConditionCount(Metric):
     }, y_true, y_pred, self.thresholds, sample_weight)
 
   def result(self):
-    if isinstance(self.thresholds, (list, tuple)):
-      result = self.accumulator
-    else:
+    if len(self.thresholds) == 1:
       result = self.accumulator[0]
+    else:
+      result = self.accumulator
     return ops.convert_to_tensor(result)
 
   def reset_states(self):
@@ -907,7 +912,7 @@ class _ConfusionMatrixConditionCount(Metric):
       K.set_value(v, np.zeros((num_thresholds,)))
 
 
-@tf_export('metrics.FalsePositives', 'keras.metrics.FalsePositives')
+@tf_export('keras.metrics.FalsePositives')
 class FalsePositives(_ConfusionMatrixConditionCount):
   """Calculates the number of false positives.
 
@@ -925,7 +930,7 @@ class FalsePositives(_ConfusionMatrixConditionCount):
   Usage:
 
   ```python
-  m = tf.metrics.FalsePositives()
+  m = tf.keras.metrics.FalsePositives()
   m.update_state([0, 1, 0, 0], [0, 0, 1, 1])
   print('Final result: ', m.result().numpy())  # Final result: 2
   ```
@@ -934,7 +939,7 @@ class FalsePositives(_ConfusionMatrixConditionCount):
 
   ```python
   model = keras.models.Model(inputs, outputs)
-  model.compile('sgd', loss='mse', metrics=[tf.metrics.FalsePositives()])
+  model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.FalsePositives()])
   ```
   """
 
@@ -957,7 +962,7 @@ class FalsePositives(_ConfusionMatrixConditionCount):
         dtype=dtype)
 
 
-@tf_export('metrics.FalseNegatives', 'keras.metrics.FalseNegatives')
+@tf_export('keras.metrics.FalseNegatives')
 class FalseNegatives(_ConfusionMatrixConditionCount):
   """Calculates the number of false negatives.
 
@@ -975,7 +980,7 @@ class FalseNegatives(_ConfusionMatrixConditionCount):
   Usage:
 
   ```python
-  m = tf.metrics.FalseNegatives()
+  m = tf.keras.metrics.FalseNegatives()
   m.update_state([0, 1, 1, 1], [0, 1, 0, 0])
   print('Final result: ', m.result().numpy())  # Final result: 2
   ```
@@ -984,7 +989,7 @@ class FalseNegatives(_ConfusionMatrixConditionCount):
 
   ```python
   model = keras.models.Model(inputs, outputs)
-  model.compile('sgd', loss='mse', metrics=[tf.metrics.FalseNegatives()])
+  model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.FalseNegatives()])
   ```
   """
 
@@ -1007,7 +1012,7 @@ class FalseNegatives(_ConfusionMatrixConditionCount):
         dtype=dtype)
 
 
-@tf_export('metrics.TrueNegatives', 'keras.metrics.TrueNegatives')
+@tf_export('keras.metrics.TrueNegatives')
 class TrueNegatives(_ConfusionMatrixConditionCount):
   """Calculates the number of true negatives.
 
@@ -1025,7 +1030,7 @@ class TrueNegatives(_ConfusionMatrixConditionCount):
   Usage:
 
   ```python
-  m = tf.metrics.TrueNegatives()
+  m = tf.keras.metrics.TrueNegatives()
   m.update_state([0, 1, 0, 0], [1, 1, 0, 0])
   print('Final result: ', m.result().numpy())  # Final result: 2
   ```
@@ -1034,7 +1039,7 @@ class TrueNegatives(_ConfusionMatrixConditionCount):
 
   ```python
   model = keras.models.Model(inputs, outputs)
-  model.compile('sgd', loss='mse', metrics=[tf.metrics.TrueNegatives()])
+  model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.TrueNegatives()])
   ```
   """
 
@@ -1057,7 +1062,7 @@ class TrueNegatives(_ConfusionMatrixConditionCount):
         dtype=dtype)
 
 
-@tf_export('metrics.TruePositives', 'keras.metrics.TruePositives')
+@tf_export('keras.metrics.TruePositives')
 class TruePositives(_ConfusionMatrixConditionCount):
   """Calculates the number of true positives.
 
@@ -1075,7 +1080,7 @@ class TruePositives(_ConfusionMatrixConditionCount):
   Usage:
 
   ```python
-  m = tf.metrics.TruePositives()
+  m = tf.keras.metrics.TruePositives()
   m.update_state([0, 1, 1, 1], [1, 0, 1, 1])
   print('Final result: ', m.result().numpy())  # Final result: 2
   ```
@@ -1084,7 +1089,7 @@ class TruePositives(_ConfusionMatrixConditionCount):
 
   ```python
   model = keras.models.Model(inputs, outputs)
-  model.compile('sgd', loss='mse', metrics=[tf.metrics.TruePositives()])
+  model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.TruePositives()])
   ```
   """
 
@@ -1107,7 +1112,7 @@ class TruePositives(_ConfusionMatrixConditionCount):
         dtype=dtype)
 
 
-@tf_export('metrics.Precision', 'keras.metrics.Precision')
+@tf_export('keras.metrics.Precision')
 class Precision(Metric):
   """Computes the precision of the predictions with respect to the labels.
 
@@ -1126,7 +1131,7 @@ class Precision(Metric):
   Usage:
 
   ```python
-  m = tf.metrics.Precision()
+  m = tf.keras.metrics.Precision()
   m.update_state([0, 1, 1, 1], [1, 0, 1, 1])
   print('Final result: ', m.result().numpy())  # Final result: 0.66
   ```
@@ -1135,7 +1140,7 @@ class Precision(Metric):
 
   ```python
   model = keras.models.Model(inputs, outputs)
-  model.compile('sgd', loss='mse', metrics=[tf.metrics.Precision()])
+  model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.Precision()])
   ```
   """
 
@@ -1152,16 +1157,15 @@ class Precision(Metric):
       dtype: (Optional) data type of the metric result.
     """
     super(Precision, self).__init__(name=name, dtype=dtype)
-    self.thresholds = 0.5 if thresholds is None else thresholds
-    thresholds = to_list(thresholds)
-    _assert_thresholds_range(thresholds)
+    self.thresholds = _parse_init_thresholds(
+        thresholds, default_threshold=0.5)
     self.tp = self.add_weight(
         'true_positives',
-        shape=(len(thresholds),),
+        shape=(len(self.thresholds),),
         initializer=init_ops.zeros_initializer)
     self.fp = self.add_weight(
         'false_positives',
-        shape=(len(thresholds),),
+        shape=(len(self.thresholds),),
         initializer=init_ops.zeros_initializer)
 
   def update_state(self, y_true, y_pred, sample_weight=None):
@@ -1184,7 +1188,7 @@ class Precision(Metric):
 
   def result(self):
     result = math_ops.div_no_nan(self.tp, self.tp + self.fp)
-    return result if isinstance(self.thresholds, (list, tuple)) else result[0]
+    return result[0] if len(self.thresholds) == 1 else result
 
   def reset_states(self):
     num_thresholds = len(to_list(self.thresholds))
@@ -1192,7 +1196,7 @@ class Precision(Metric):
       K.set_value(v, np.zeros((num_thresholds,)))
 
 
-@tf_export('metrics.Recall', 'keras.metrics.Recall')
+@tf_export('keras.metrics.Recall')
 class Recall(Metric):
   """Computes the recall of the predictions with respect to the labels.
 
@@ -1211,7 +1215,7 @@ class Recall(Metric):
   Usage:
 
   ```python
-  m = tf.metrics.Recall()
+  m = tf.keras.metrics.Recall()
   m.update_state([0, 1, 1, 1], [1, 0, 1, 1])
   print('Final result: ', m.result().numpy())  # Final result: 0.66
   ```
@@ -1220,7 +1224,7 @@ class Recall(Metric):
 
   ```python
   model = keras.models.Model(inputs, outputs)
-  model.compile('sgd', loss='mse', metrics=[tf.metrics.Recall()])
+  model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.Recall()])
   ```
   """
 
@@ -1237,16 +1241,15 @@ class Recall(Metric):
       dtype: (Optional) data type of the metric result.
     """
     super(Recall, self).__init__(name=name, dtype=dtype)
-    self.thresholds = 0.5 if thresholds is None else thresholds
-    thresholds = to_list(thresholds)
-    _assert_thresholds_range(thresholds)
+    self.thresholds = _parse_init_thresholds(
+        thresholds, default_threshold=0.5)
     self.tp = self.add_weight(
         'true_positives',
-        shape=(len(thresholds),),
+        shape=(len(self.thresholds),),
         initializer=init_ops.zeros_initializer)
     self.fn = self.add_weight(
         'false_negatives',
-        shape=(len(thresholds),),
+        shape=(len(self.thresholds),),
         initializer=init_ops.zeros_initializer)
 
   def update_state(self, y_true, y_pred, sample_weight=None):
@@ -1269,7 +1272,7 @@ class Recall(Metric):
 
   def result(self):
     result = math_ops.div_no_nan(self.tp, self.tp + self.fn)
-    return result if isinstance(self.thresholds, (list, tuple)) else result[0]
+    return result[0] if len(self.thresholds) == 1 else result
 
   def reset_states(self):
     num_thresholds = len(to_list(self.thresholds))
@@ -1341,6 +1344,7 @@ class SensitivitySpecificityBase(Metric):
       K.set_value(v, np.zeros((num_thresholds,)))
 
 
+@tf_export('keras.metrics.SensitivityAtSpecificity')
 class SensitivityAtSpecificity(SensitivitySpecificityBase):
   """Computes the sensitivity at a given specificity.
 
@@ -1363,7 +1367,7 @@ class SensitivityAtSpecificity(SensitivitySpecificityBase):
   Usage:
 
   ```python
-  m = tf.metrics.SensitivityAtSpecificity(0.4, num_thresholds=1)
+  m = tf.keras.metrics.SensitivityAtSpecificity(0.4, num_thresholds=1)
   m.update_state([0, 0, 1, 1], [0, 0.5, 0.3, 0.9])
   print('Final result: ', m.result().numpy())  # Final result: 0.5
   ```
@@ -1375,7 +1379,7 @@ class SensitivityAtSpecificity(SensitivitySpecificityBase):
   model.compile(
       'sgd',
       loss='mse',
-      metrics=[tf.metrics.SensitivityAtSpecificity()])
+      metrics=[tf.keras.metrics.SensitivityAtSpecificity()])
   ```
   """
 
@@ -1409,6 +1413,7 @@ class SensitivityAtSpecificity(SensitivitySpecificityBase):
                                self.tp[min_index] + self.fn[min_index])
 
 
+@tf_export('keras.metrics.SpecificityAtSensitivity')
 class SpecificityAtSensitivity(SensitivitySpecificityBase):
   """Computes the specificity at a given sensitivity.
 
@@ -1431,7 +1436,7 @@ class SpecificityAtSensitivity(SensitivitySpecificityBase):
   Usage:
 
   ```python
-  m = tf.metrics.SpecificityAtSensitivity(0.8, num_thresholds=1)
+  m = tf.keras.metrics.SpecificityAtSensitivity(0.8, num_thresholds=1)
   m.update_state([0, 0, 1, 1], [0, 0.5, 0.3, 0.9])
   print('Final result: ', m.result().numpy())  # Final result: 1.0
   ```
@@ -1443,7 +1448,7 @@ class SpecificityAtSensitivity(SensitivitySpecificityBase):
   model.compile(
       'sgd',
       loss='mse',
-      metrics=[tf.metrics.SpecificityAtSensitivity()])
+      metrics=[tf.keras.metrics.SpecificityAtSensitivity()])
   ```
   """
 
diff --git a/tensorflow/python/keras/metrics_functional_test.py b/tensorflow/python/keras/metrics_functional_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..513daaf9fcc01cc6741df1b698190ade1e848492
--- /dev/null
+++ b/tensorflow/python/keras/metrics_functional_test.py
@@ -0,0 +1,122 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Keras metrics functions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.eager import context
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras import metrics
+from tensorflow.python.platform import test
+
+
+class KerasFunctionalMetricsTest(test.TestCase):
+
+  def test_metrics(self):
+    with self.cached_session():
+      y_a = K.variable(np.random.random((6, 7)))
+      y_b = K.variable(np.random.random((6, 7)))
+      for metric in [metrics.binary_accuracy, metrics.categorical_accuracy]:
+        output = metric(y_a, y_b)
+        self.assertEqual(K.eval(output).shape, (6,))
+
+  def test_sparse_categorical_accuracy_int(self):
+    with self.cached_session():
+      metric = metrics.sparse_categorical_accuracy
+      y_true = K.variable(np.random.randint(0, 7, (6,)))
+      y_pred = K.variable(np.random.random((6, 7)))
+      self.assertEqual(K.eval(metric(y_true, y_pred)).shape, (6,))
+
+      # Test correctness if the shape of y_true is (num_samples,)
+      y_true = K.variable([1., 0., 0., 0.])
+      y_pred = K.variable([[0.8, 0.2], [0.6, 0.4], [0.7, 0.3], [0.9, 0.1]])
+      print(K.eval(metric(y_true, y_pred)))
+      self.assertAllEqual(K.eval(metric(y_true, y_pred)), [0., 1., 1., 1.])
+
+      # Test correctness if the shape of y_true is (num_samples, 1)
+      y_true = K.variable([[1.], [0.], [0.], [0.]])
+      y_pred = K.variable([[0.8, 0.2], [0.6, 0.4], [0.7, 0.3], [0.9, 0.1]])
+      print(K.eval(metric(y_true, y_pred)))
+      self.assertAllEqual(K.eval(metric(y_true, y_pred)), [0., 1., 1., 1.])
+
+  def test_sparse_categorical_accuracy_float(self):
+    with self.cached_session():
+      metric = metrics.sparse_categorical_accuracy
+      y_true = K.variable(np.random.random((6,)))
+      y_pred = K.variable(np.random.random((6, 7)))
+      self.assertEqual(K.eval(metric(y_true, y_pred)).shape, (6,))
+
+  def test_sparse_categorical_accuracy_eager(self):
+    """Tests that ints passed in via Eager return results. See b/113504761."""
+    with context.eager_mode():
+      metric = metrics.sparse_categorical_accuracy
+      y_true = np.arange(6).reshape([6, 1])
+      y_pred = np.arange(36).reshape([6, 6])
+      self.assertAllEqual(metric(y_true, y_pred), [0., 0., 0., 0., 0., 1.])
+
+  def test_sparse_categorical_accuracy_float_eager(self):
+    """Tests that floats passed in via Eager return results. See b/113504761."""
+    with context.eager_mode():
+      metric = metrics.sparse_categorical_accuracy
+      y_true = np.arange(6, dtype=np.float32).reshape([6, 1])
+      y_pred = np.arange(36).reshape([6, 6])
+      self.assertAllEqual(metric(y_true, y_pred), [0., 0., 0., 0., 0., 1.])
+
+  def test_sparse_top_k_categorical_accuracy(self):
+    with self.cached_session():
+      # Test correctness if the shape of y_true is (num_samples, 1)
+      y_pred = K.variable(np.array([[0.3, 0.2, 0.1], [0.1, 0.2, 0.7]]))
+      y_true = K.variable(np.array([[1], [0]]))
+      result = K.eval(
+          metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=3))
+      self.assertEqual(result, 1)
+      result = K.eval(
+          metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=2))
+      self.assertEqual(result, 0.5)
+      result = K.eval(
+          metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=1))
+      self.assertEqual(result, 0.)
+
+      # Test correctness if the shape of y_true is (num_samples,)
+      y_pred = K.variable(np.array([[0.3, 0.2, 0.1], [0.1, 0.2, 0.7]]))
+      y_true = K.variable(np.array([1, 0]))
+      result = K.eval(
+          metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=3))
+      self.assertEqual(result, 1)
+      result = K.eval(
+          metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=2))
+      self.assertEqual(result, 0.5)
+      result = K.eval(
+          metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=1))
+      self.assertEqual(result, 0.)
+
+  def test_top_k_categorical_accuracy(self):
+    with self.cached_session():
+      y_pred = K.variable(np.array([[0.3, 0.2, 0.1], [0.1, 0.2, 0.7]]))
+      y_true = K.variable(np.array([[0, 1, 0], [1, 0, 0]]))
+      result = K.eval(metrics.top_k_categorical_accuracy(y_true, y_pred, k=3))
+      self.assertEqual(result, 1)
+      result = K.eval(metrics.top_k_categorical_accuracy(y_true, y_pred, k=2))
+      self.assertEqual(result, 0.5)
+      result = K.eval(metrics.top_k_categorical_accuracy(y_true, y_pred, k=1))
+      self.assertEqual(result, 0.)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/metrics_test.py b/tensorflow/python/keras/metrics_test.py
index 92398acd8e6dc683e37cf759c667c4665961b356..9720d910eb337580c2e630b5dfb8888f8843c271 100644
--- a/tensorflow/python/keras/metrics_test.py
+++ b/tensorflow/python/keras/metrics_test.py
@@ -27,10 +27,10 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
-from tensorflow.python.keras import backend as K
+from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import layers
 from tensorflow.python.keras import metrics
-from tensorflow.python.keras.models import Sequential
+from tensorflow.python.keras import testing_utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
@@ -40,98 +40,11 @@ from tensorflow.python.training.checkpointable import util as checkpointable_uti
 from tensorflow.python.training.rmsprop import RMSPropOptimizer
 
 
-class KerasMetricsTest(test.TestCase):
-
-  def test_metrics(self):
-    with self.cached_session():
-      y_a = K.variable(np.random.random((6, 7)))
-      y_b = K.variable(np.random.random((6, 7)))
-      for metric in [metrics.binary_accuracy, metrics.categorical_accuracy]:
-        output = metric(y_a, y_b)
-        self.assertEqual(K.eval(output).shape, (6,))
-
-  def test_sparse_categorical_accuracy_int(self):
-    with self.cached_session():
-      metric = metrics.sparse_categorical_accuracy
-      y_true = K.variable(np.random.randint(0, 7, (6,)))
-      y_pred = K.variable(np.random.random((6, 7)))
-      self.assertEqual(K.eval(metric(y_true, y_pred)).shape, (6,))
-
-      # Test correctness if the shape of y_true is (num_samples,)
-      y_true = K.variable([1., 0., 0., 0.])
-      y_pred = K.variable([[0.8, 0.2], [0.6, 0.4], [0.7, 0.3], [0.9, 0.1]])
-      print(K.eval(metric(y_true, y_pred)))
-      self.assertAllEqual(K.eval(metric(y_true, y_pred)), [0., 1., 1., 1.])
-
-      # Test correctness if the shape of y_true is (num_samples, 1)
-      y_true = K.variable([[1.], [0.], [0.], [0.]])
-      y_pred = K.variable([[0.8, 0.2], [0.6, 0.4], [0.7, 0.3], [0.9, 0.1]])
-      print(K.eval(metric(y_true, y_pred)))
-      self.assertAllEqual(K.eval(metric(y_true, y_pred)), [0., 1., 1., 1.])
-
-  def test_sparse_categorical_accuracy_float(self):
-    with self.cached_session():
-      metric = metrics.sparse_categorical_accuracy
-      y_true = K.variable(np.random.random((6,)))
-      y_pred = K.variable(np.random.random((6, 7)))
-      self.assertEqual(K.eval(metric(y_true, y_pred)).shape, (6,))
-
-  def test_sparse_categorical_accuracy_eager(self):
-    """Tests that ints passed in via Eager return results. See b/113504761."""
-    with context.eager_mode():
-      metric = metrics.sparse_categorical_accuracy
-      y_true = np.arange(6).reshape([6, 1])
-      y_pred = np.arange(36).reshape([6, 6])
-      self.assertAllEqual(metric(y_true, y_pred), [0., 0., 0., 0., 0., 1.])
-
-  def test_sparse_categorical_accuracy_float_eager(self):
-    """Tests that floats passed in via Eager return results. See b/113504761."""
-    with context.eager_mode():
-      metric = metrics.sparse_categorical_accuracy
-      y_true = np.arange(6, dtype=np.float32).reshape([6, 1])
-      y_pred = np.arange(36).reshape([6, 6])
-      self.assertAllEqual(metric(y_true, y_pred), [0., 0., 0., 0., 0., 1.])
-
-  def test_sparse_top_k_categorical_accuracy(self):
-    with self.cached_session():
-      # Test correctness if the shape of y_true is (num_samples, 1)
-      y_pred = K.variable(np.array([[0.3, 0.2, 0.1], [0.1, 0.2, 0.7]]))
-      y_true = K.variable(np.array([[1], [0]]))
-      result = K.eval(
-          metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=3))
-      self.assertEqual(result, 1)
-      result = K.eval(
-          metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=2))
-      self.assertEqual(result, 0.5)
-      result = K.eval(
-          metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=1))
-      self.assertEqual(result, 0.)
-
-      # Test correctness if the shape of y_true is (num_samples,)
-      y_pred = K.variable(np.array([[0.3, 0.2, 0.1], [0.1, 0.2, 0.7]]))
-      y_true = K.variable(np.array([1, 0]))
-      result = K.eval(
-          metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=3))
-      self.assertEqual(result, 1)
-      result = K.eval(
-          metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=2))
-      self.assertEqual(result, 0.5)
-      result = K.eval(
-          metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=1))
-      self.assertEqual(result, 0.)
-
-  def test_top_k_categorical_accuracy(self):
-    with self.cached_session():
-      y_pred = K.variable(np.array([[0.3, 0.2, 0.1], [0.1, 0.2, 0.7]]))
-      y_true = K.variable(np.array([[0, 1, 0], [1, 0, 0]]))
-      result = K.eval(metrics.top_k_categorical_accuracy(y_true, y_pred, k=3))
-      self.assertEqual(result, 1)
-      result = K.eval(metrics.top_k_categorical_accuracy(y_true, y_pred, k=2))
-      self.assertEqual(result, 0.5)
-      result = K.eval(metrics.top_k_categorical_accuracy(y_true, y_pred, k=1))
-      self.assertEqual(result, 0.)
-
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+@test_util.run_all_in_graph_and_eager_modes
+class KerasMeanTest(test.TestCase):
+
+  # TODO(b/120949004): Re-enable garbage collection check
+  # @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
   def test_mean(self):
     m = metrics.Mean(name='my_mean')
 
@@ -163,7 +76,6 @@ class KerasMetricsTest(test.TestCase):
     self.assertEqual(self.evaluate(m.total), 0)
     self.assertEqual(self.evaluate(m.count), 0)
 
-  @test_util.run_in_graph_and_eager_modes
   def test_mean_with_sample_weight(self):
     m = metrics.Mean(dtype=dtypes.float64)
     self.assertEqual(m.dtype, dtypes.float64)
@@ -227,7 +139,6 @@ class KerasMetricsTest(test.TestCase):
       self.assertAlmostEqual(self.evaluate(m.count), 1.7, 2)  # 0.5 + 1.2
       self.assertAlmostEqual(result, 52 / 1.7, 2)
 
-  @test_util.run_in_graph_and_eager_modes
   def test_save_restore(self):
     checkpoint_directory = self.get_temp_dir()
     checkpoint_prefix = os.path.join(checkpoint_directory, 'ckpt')
@@ -258,7 +169,10 @@ class KerasMetricsTest(test.TestCase):
     self.assertEqual(200., self.evaluate(restore_mean.result()))
     self.assertEqual(3, self.evaluate(restore_mean.count))
 
-  @test_util.run_in_graph_and_eager_modes
+
+@test_util.run_all_in_graph_and_eager_modes
+class KerasAccuracyTest(test.TestCase):
+
   def test_accuracy(self):
     acc_obj = metrics.Accuracy(name='my acc')
 
@@ -280,7 +194,6 @@ class KerasMetricsTest(test.TestCase):
     result = self.evaluate(result_t)
     self.assertAlmostEqual(result, 0.96, 2)  # 4.5/4.7
 
-  @test_util.run_in_graph_and_eager_modes
   def test_binary_accuracy(self):
     acc_obj = metrics.BinaryAccuracy(name='my acc')
 
@@ -313,7 +226,6 @@ class KerasMetricsTest(test.TestCase):
     result = self.evaluate(result_t)
     self.assertAlmostEqual(result, 0.67, 2)  # 4.5/6.7
 
-  @test_util.run_in_graph_and_eager_modes
   def test_binary_accuracy_threshold(self):
     acc_obj = metrics.BinaryAccuracy(threshold=0.7)
     self.evaluate(variables.variables_initializer(acc_obj.variables))
@@ -321,7 +233,6 @@ class KerasMetricsTest(test.TestCase):
     result = self.evaluate(result_t)
     self.assertAlmostEqual(result, 0.5, 2)
 
-  @test_util.run_in_graph_and_eager_modes
   def test_categorical_accuracy(self):
     acc_obj = metrics.CategoricalAccuracy(name='my acc')
 
@@ -345,7 +256,6 @@ class KerasMetricsTest(test.TestCase):
     result = self.evaluate(result_t)
     self.assertAlmostEqual(result, 0.93, 2)  # 2.5/2.7
 
-  @test_util.run_in_graph_and_eager_modes
   def test_sparse_categorical_accuracy(self):
     acc_obj = metrics.SparseCategoricalAccuracy(name='my acc')
 
@@ -369,18 +279,11 @@ class KerasMetricsTest(test.TestCase):
     result = self.evaluate(result_t)
     self.assertAlmostEqual(result, 0.93, 2)  # 2.5/2.7
 
-
-def _get_simple_sequential_model(compile_metrics):
-  model = Sequential()
-  model.add(
-      layers.Dense(
-          3, activation='relu', input_dim=4, kernel_initializer='ones'))
-  model.add(layers.Dense(1, activation='sigmoid', kernel_initializer='ones'))
-  model.compile(
-      loss='mae',
-      metrics=compile_metrics,
-      optimizer=RMSPropOptimizer(learning_rate=0.001))
-  return model
+  def test_assert_thresholds_range(self):
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'Threshold values must be in \[0, 1\]. Invalid values: \[None\]'):
+      metrics._assert_thresholds_range([None, 0.5])
 
 
 @test_util.run_all_in_graph_and_eager_modes
@@ -451,16 +354,6 @@ class FalsePositivesTest(test.TestCase):
         r'Threshold values must be in \[0, 1\]. Invalid values: \[-1, 2\]'):
       metrics.FalsePositives(thresholds=[-1, 0.5, 2])
 
-  def test_reset_states(self):
-    fp_obj = metrics.FalsePositives()
-    model = _get_simple_sequential_model([fp_obj])
-    x = np.ones((100, 4))
-    y = np.zeros((100, 1))
-    model.evaluate(x, y)
-    self.assertEqual(self.evaluate(fp_obj.accumulator), 100.)
-    model.evaluate(x, y)
-    self.assertEqual(self.evaluate(fp_obj.accumulator), 100.)
-
 
 @test_util.run_all_in_graph_and_eager_modes
 class FalseNegativesTest(test.TestCase):
@@ -523,16 +416,6 @@ class FalseNegativesTest(test.TestCase):
     result = fn_obj(y_true, y_pred, sample_weight=sample_weight)
     self.assertAllClose([4., 16., 23.], self.evaluate(result))
 
-  def test_reset_states(self):
-    fn_obj = metrics.FalseNegatives()
-    model = _get_simple_sequential_model([fn_obj])
-    x = np.zeros((100, 4))
-    y = np.ones((100, 1))
-    model.evaluate(x, y)
-    self.assertEqual(self.evaluate(fn_obj.accumulator), 100.)
-    model.evaluate(x, y)
-    self.assertEqual(self.evaluate(fn_obj.accumulator), 100.)
-
 
 @test_util.run_all_in_graph_and_eager_modes
 class TrueNegativesTest(test.TestCase):
@@ -595,16 +478,6 @@ class TrueNegativesTest(test.TestCase):
     result = tn_obj(y_true, y_pred, sample_weight=sample_weight)
     self.assertAllClose([5., 15., 23.], self.evaluate(result))
 
-  def test_reset_states(self):
-    tn_obj = metrics.TrueNegatives()
-    model = _get_simple_sequential_model([tn_obj])
-    x = np.zeros((100, 4))
-    y = np.zeros((100, 1))
-    model.evaluate(x, y)
-    self.assertEqual(self.evaluate(tn_obj.accumulator), 100.)
-    model.evaluate(x, y)
-    self.assertEqual(self.evaluate(tn_obj.accumulator), 100.)
-
 
 @test_util.run_all_in_graph_and_eager_modes
 class TruePositivesTest(test.TestCase):
@@ -666,16 +539,6 @@ class TruePositivesTest(test.TestCase):
     result = tp_obj(y_true, y_pred, sample_weight=37.)
     self.assertAllClose([222., 111., 37.], self.evaluate(result))
 
-  def test_reset_states(self):
-    tp_obj = metrics.TruePositives()
-    model = _get_simple_sequential_model([tp_obj])
-    x = np.ones((100, 4))
-    y = np.ones((100, 1))
-    model.evaluate(x, y)
-    self.assertEqual(self.evaluate(tp_obj.accumulator), 100.)
-    model.evaluate(x, y)
-    self.assertEqual(self.evaluate(tp_obj.accumulator), 100.)
-
 
 @test_util.run_all_in_graph_and_eager_modes
 class PrecisionTest(test.TestCase):
@@ -683,7 +546,7 @@ class PrecisionTest(test.TestCase):
   def test_config(self):
     p_obj = metrics.Precision(name='my_precision', thresholds=[0.4, 0.9])
     self.assertEqual(p_obj.name, 'my_precision')
-    self.assertLen(p_obj.variables, 2)
+    self.assertEqual(len(p_obj.variables), 2)
     self.assertEqual([v.name for v in p_obj.variables],
                      ['true_positives:0', 'false_positives:0'])
     self.assertEqual(p_obj.thresholds, [0.4, 0.9])
@@ -788,18 +651,6 @@ class PrecisionTest(test.TestCase):
     self.assertArrayNear([expected_precision, 0], self.evaluate(p_obj.result()),
                          1e-3)
 
-  def test_reset_states(self):
-    p_obj = metrics.Precision()
-    model = _get_simple_sequential_model([p_obj])
-    x = np.concatenate((np.ones((50, 4)), np.ones((50, 4))))
-    y = np.concatenate((np.ones((50, 1)), np.zeros((50, 1))))
-    model.evaluate(x, y)
-    self.assertEqual(self.evaluate(p_obj.tp), 50.)
-    self.assertEqual(self.evaluate(p_obj.fp), 50.)
-    model.evaluate(x, y)
-    self.assertEqual(self.evaluate(p_obj.tp), 50.)
-    self.assertEqual(self.evaluate(p_obj.fp), 50.)
-
 
 @test_util.run_all_in_graph_and_eager_modes
 class RecallTest(test.TestCase):
@@ -807,7 +658,7 @@ class RecallTest(test.TestCase):
   def test_config(self):
     r_obj = metrics.Recall(name='my_recall', thresholds=[0.4, 0.9])
     self.assertEqual(r_obj.name, 'my_recall')
-    self.assertLen(r_obj.variables, 2)
+    self.assertEqual(len(r_obj.variables), 2)
     self.assertEqual([v.name for v in r_obj.variables],
                      ['true_positives:0', 'false_negatives:0'])
     self.assertEqual(r_obj.thresholds, [0.4, 0.9])
@@ -911,18 +762,6 @@ class RecallTest(test.TestCase):
     self.assertArrayNear([expected_recall, 0], self.evaluate(r_obj.result()),
                          1e-3)
 
-  def test_reset_states(self):
-    r_obj = metrics.Recall()
-    model = _get_simple_sequential_model([r_obj])
-    x = np.concatenate((np.ones((50, 4)), np.zeros((50, 4))))
-    y = np.concatenate((np.ones((50, 1)), np.ones((50, 1))))
-    model.evaluate(x, y)
-    self.assertEqual(self.evaluate(r_obj.tp), 50.)
-    self.assertEqual(self.evaluate(r_obj.fn), 50.)
-    model.evaluate(x, y)
-    self.assertEqual(self.evaluate(r_obj.tp), 50.)
-    self.assertEqual(self.evaluate(r_obj.fn), 50.)
-
 
 @test_util.run_all_in_graph_and_eager_modes
 class SensitivityAtSpecificityTest(test.TestCase, parameterized.TestCase):
@@ -1012,24 +851,6 @@ class SensitivityAtSpecificityTest(test.TestCase, parameterized.TestCase):
     with self.assertRaisesRegexp(ValueError, '`num_thresholds` must be > 0.'):
       metrics.SensitivityAtSpecificity(0.4, num_thresholds=-1)
 
-  def test_reset_states(self):
-    s_obj = metrics.SensitivityAtSpecificity(0.5, num_thresholds=1)
-    model = _get_simple_sequential_model([s_obj])
-    x = np.concatenate((np.ones((25, 4)), np.zeros((25, 4)), np.zeros((25, 4)),
-                        np.ones((25, 4))))
-    y = np.concatenate((np.ones((25, 1)), np.zeros((25, 1)), np.ones((25, 1)),
-                        np.zeros((25, 1))))
-    model.evaluate(x, y)
-    self.assertEqual(self.evaluate(s_obj.tp), 25.)
-    self.assertEqual(self.evaluate(s_obj.fp), 25.)
-    self.assertEqual(self.evaluate(s_obj.fn), 25.)
-    self.assertEqual(self.evaluate(s_obj.tn), 25.)
-    model.evaluate(x, y)
-    self.assertEqual(self.evaluate(s_obj.tp), 25.)
-    self.assertEqual(self.evaluate(s_obj.fp), 25.)
-    self.assertEqual(self.evaluate(s_obj.fn), 25.)
-    self.assertEqual(self.evaluate(s_obj.tn), 25.)
-
 
 @test_util.run_all_in_graph_and_eager_modes
 class SpecificityAtSensitivityTest(test.TestCase, parameterized.TestCase):
@@ -1119,24 +940,6 @@ class SpecificityAtSensitivityTest(test.TestCase, parameterized.TestCase):
     with self.assertRaisesRegexp(ValueError, '`num_thresholds` must be > 0.'):
       metrics.SpecificityAtSensitivity(0.4, num_thresholds=-1)
 
-  def test_reset_states(self):
-    s_obj = metrics.SpecificityAtSensitivity(0.5, num_thresholds=1)
-    model = _get_simple_sequential_model([s_obj])
-    x = np.concatenate((np.ones((25, 4)), np.zeros((25, 4)), np.zeros((25, 4)),
-                        np.ones((25, 4))))
-    y = np.concatenate((np.ones((25, 1)), np.zeros((25, 1)), np.ones((25, 1)),
-                        np.zeros((25, 1))))
-    model.evaluate(x, y)
-    self.assertEqual(self.evaluate(s_obj.tp), 25.)
-    self.assertEqual(self.evaluate(s_obj.fp), 25.)
-    self.assertEqual(self.evaluate(s_obj.fn), 25.)
-    self.assertEqual(self.evaluate(s_obj.tn), 25.)
-    model.evaluate(x, y)
-    self.assertEqual(self.evaluate(s_obj.tp), 25.)
-    self.assertEqual(self.evaluate(s_obj.fp), 25.)
-    self.assertEqual(self.evaluate(s_obj.fn), 25.)
-    self.assertEqual(self.evaluate(s_obj.tn), 25.)
-
 
 @test_util.run_all_in_graph_and_eager_modes
 class CosineProximityTest(test.TestCase):
@@ -1171,5 +974,125 @@ class CosineProximityTest(test.TestCase):
     result = cosine_obj(y_true, y_pred, sample_weight=sample_weight)
     self.assertAllClose(-0.59916, self.evaluate(result), atol=1e-5)
 
+
+def _get_model(compile_metrics):
+  model_layers = [
+      layers.Dense(3, activation='relu', kernel_initializer='ones'),
+      layers.Dense(1, activation='sigmoid', kernel_initializer='ones')]
+
+  model = testing_utils.get_model_from_layers(model_layers, input_shape=(4,))
+  model.compile(
+      loss='mae',
+      metrics=compile_metrics,
+      optimizer=RMSPropOptimizer(learning_rate=0.001),
+      run_eagerly=testing_utils.should_run_eagerly())
+  return model
+
+
+@keras_parameterized.run_with_all_model_types
+@keras_parameterized.run_all_keras_modes
+class ResetStatesTest(keras_parameterized.TestCase):
+
+  def test_reset_states_false_positives(self):
+    fp_obj = metrics.FalsePositives()
+    model = _get_model([fp_obj])
+    x = np.ones((100, 4))
+    y = np.zeros((100, 1))
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(fp_obj.accumulator), 100.)
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(fp_obj.accumulator), 100.)
+
+  def test_reset_states_false_negatives(self):
+    fn_obj = metrics.FalseNegatives()
+    model = _get_model([fn_obj])
+    x = np.zeros((100, 4))
+    y = np.ones((100, 1))
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(fn_obj.accumulator), 100.)
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(fn_obj.accumulator), 100.)
+
+  def test_reset_states_true_negatives(self):
+    tn_obj = metrics.TrueNegatives()
+    model = _get_model([tn_obj])
+    x = np.zeros((100, 4))
+    y = np.zeros((100, 1))
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(tn_obj.accumulator), 100.)
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(tn_obj.accumulator), 100.)
+
+  def test_reset_states_true_positives(self):
+    tp_obj = metrics.TruePositives()
+    model = _get_model([tp_obj])
+    x = np.ones((100, 4))
+    y = np.ones((100, 1))
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(tp_obj.accumulator), 100.)
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(tp_obj.accumulator), 100.)
+
+  def test_reset_states_precision(self):
+    p_obj = metrics.Precision()
+    model = _get_model([p_obj])
+    x = np.concatenate((np.ones((50, 4)), np.ones((50, 4))))
+    y = np.concatenate((np.ones((50, 1)), np.zeros((50, 1))))
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(p_obj.tp), 50.)
+    self.assertEqual(self.evaluate(p_obj.fp), 50.)
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(p_obj.tp), 50.)
+    self.assertEqual(self.evaluate(p_obj.fp), 50.)
+
+  def test_reset_states_recall(self):
+    r_obj = metrics.Recall()
+    model = _get_model([r_obj])
+    x = np.concatenate((np.ones((50, 4)), np.zeros((50, 4))))
+    y = np.concatenate((np.ones((50, 1)), np.ones((50, 1))))
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(r_obj.tp), 50.)
+    self.assertEqual(self.evaluate(r_obj.fn), 50.)
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(r_obj.tp), 50.)
+    self.assertEqual(self.evaluate(r_obj.fn), 50.)
+
+  def test_reset_states_sensitivity_at_specificity(self):
+    s_obj = metrics.SensitivityAtSpecificity(0.5, num_thresholds=1)
+    model = _get_model([s_obj])
+    x = np.concatenate((np.ones((25, 4)), np.zeros((25, 4)), np.zeros((25, 4)),
+                        np.ones((25, 4))))
+    y = np.concatenate((np.ones((25, 1)), np.zeros((25, 1)), np.ones((25, 1)),
+                        np.zeros((25, 1))))
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(s_obj.tp), 25.)
+    self.assertEqual(self.evaluate(s_obj.fp), 25.)
+    self.assertEqual(self.evaluate(s_obj.fn), 25.)
+    self.assertEqual(self.evaluate(s_obj.tn), 25.)
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(s_obj.tp), 25.)
+    self.assertEqual(self.evaluate(s_obj.fp), 25.)
+    self.assertEqual(self.evaluate(s_obj.fn), 25.)
+    self.assertEqual(self.evaluate(s_obj.tn), 25.)
+
+  def test_reset_states_specificity_at_sensitivity(self):
+    s_obj = metrics.SpecificityAtSensitivity(0.5, num_thresholds=1)
+    model = _get_model([s_obj])
+    x = np.concatenate((np.ones((25, 4)), np.zeros((25, 4)), np.zeros((25, 4)),
+                        np.ones((25, 4))))
+    y = np.concatenate((np.ones((25, 1)), np.zeros((25, 1)), np.ones((25, 1)),
+                        np.zeros((25, 1))))
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(s_obj.tp), 25.)
+    self.assertEqual(self.evaluate(s_obj.fp), 25.)
+    self.assertEqual(self.evaluate(s_obj.fn), 25.)
+    self.assertEqual(self.evaluate(s_obj.tn), 25.)
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(s_obj.tp), 25.)
+    self.assertEqual(self.evaluate(s_obj.fp), 25.)
+    self.assertEqual(self.evaluate(s_obj.fn), 25.)
+    self.assertEqual(self.evaluate(s_obj.tn), 25.)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/model_subclassing_test.py b/tensorflow/python/keras/model_subclassing_test.py
index ca7dded60d4285a9e42a16cd2e65d6de530d0f38..cf64e00d20cb34058ad872581a11fb174d3f2119 100644
--- a/tensorflow/python/keras/model_subclassing_test.py
+++ b/tensorflow/python/keras/model_subclassing_test.py
@@ -28,6 +28,8 @@ from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
+from tensorflow.python.keras import keras_parameterized
+from tensorflow.python.keras import testing_utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import init_ops
@@ -408,6 +410,158 @@ class ModelSubclassingTest(test.TestCase):
     x2 = array_ops.ones((num_samples, input_dim))
     model([x1, x2])
 
+  def test_summary(self):
+
+    class ToString(object):
+
+      def __init__(self):
+        self.contents = ''
+
+      def __call__(self, msg):
+        self.contents += msg + '\n'
+
+    # Single-io
+    model = SimpleTestModel(num_classes=4, use_bn=True, use_dp=True)
+    model._set_inputs(np.ones((3, 4)))  # need to build model first
+    print_fn = ToString()
+    model.summary(print_fn=print_fn)
+    self.assertTrue('Trainable params: 356' in print_fn.contents)
+
+    # Multi-io
+    model = MultiIOTestModel(num_classes=(5, 6), use_bn=True, use_dp=True)
+    model._set_inputs([np.ones((3, 4)),
+                       np.ones((3, 4))])  # need to build model first
+    print_fn = ToString()
+    model.summary(print_fn=print_fn)
+    self.assertTrue('Trainable params: 587' in print_fn.contents)
+
+  def test_no_dependency(self):
+    class Foo(keras.Model):
+
+      def __init__(self):
+        super(Foo, self).__init__()
+        self.isdep = keras.layers.Dense(1)
+        self.notdep = data_structures.NoDependency(keras.layers.Dense(2))
+        self.notdep_var = data_structures.NoDependency(
+            resource_variable_ops.ResourceVariable(1., name='notdep_var'))
+
+    m = Foo()
+    self.assertEqual([m.isdep, m.notdep], m.layers)
+    self.assertEqual(1, len(m._checkpoint_dependencies))
+    self.assertIs(m.isdep, m._checkpoint_dependencies[0].ref)
+    self.assertEqual('notdep_var:0', m.notdep_var.name)
+
+  def test_extra_variable(self):
+
+    class ExtraVar(keras.Model):
+
+      def __init__(self):
+        super(ExtraVar, self).__init__()
+        self.dense = keras.layers.Dense(1)
+        self.var = resource_variable_ops.ResourceVariable(1.)
+        self.not_trainable_var = resource_variable_ops.ResourceVariable(
+            2., trainable=False)
+
+      def call(self, inputs):
+        return self.dense(inputs + self.var)
+
+    m = ExtraVar()
+    self.assertTrue(m.trainable)
+    self.assertEqual([m.dense], m.layers)
+    self.assertEqual([m.var, m.not_trainable_var], m.variables)
+    self.assertEqual([m.var], m.trainable_variables)
+    self.assertEqual([m.not_trainable_var], m.non_trainable_variables)
+    m.trainable = False
+    self.assertEqual([m.var, m.not_trainable_var], m.variables)
+    self.assertEqual([], m.trainable_variables)
+    self.assertEqual([m.var, m.not_trainable_var], m.non_trainable_variables)
+    m.trainable = True
+
+    m(array_ops.ones([1, 1]))
+
+    self.assertEqual([m.dense.kernel, m.dense.bias], m.dense.variables)
+    self.assertEqual([m.dense.kernel, m.dense.bias], m.dense.weights)
+
+    self.assertEqual([m.dense.kernel, m.dense.bias, m.var, m.not_trainable_var],
+                     m.variables)
+    self.assertEqual([m.dense.kernel, m.dense.bias, m.var],
+                     m.trainable_variables)
+    self.assertEqual([m.not_trainable_var], m.non_trainable_variables)
+
+    m.dense.trainable = False
+    self.assertEqual(
+        [m.var, m.dense.kernel, m.dense.bias, m.not_trainable_var],
+        m.variables)
+    self.assertEqual([m.var], m.trainable_variables)
+    self.assertEqual([m.dense.kernel, m.dense.bias, m.not_trainable_var],
+                     m.non_trainable_variables)
+
+  def test_add_weight_in_model(self):
+
+    class MyModel(keras.Model):
+
+      def __init__(self):
+        super(MyModel, self).__init__()
+        self.b = self.add_weight('bias', (10,))
+        self.c = self.add_weight('bias2', (10,), trainable=False)
+
+      def call(self, inputs):
+        return inputs + self.b + self.c
+
+    x = ops.convert_to_tensor(np.ones((10, 10), 'float32'))
+    model = MyModel()
+    model(x)
+    self.assertEqual(1, len(model.trainable_weights))
+    self.assertEqual(1, len(model.non_trainable_weights))
+    self.assertEqual(2, len(model.weights))
+
+    class MyModelCustomBuild(keras.Model):
+
+      def build(self, input_shape):
+        self.b = self.add_weight('bias', (10,))
+        self.c = self.add_weight('bias2', (10,), trainable=False)
+
+      def call(self, inputs):
+        return inputs + self.b + self.c
+
+    x = ops.convert_to_tensor(np.ones((10, 10), 'float32'))
+    model = MyModelCustomBuild()
+    model(x)
+    self.assertEqual(1, len(model.trainable_weights))
+    self.assertEqual(1, len(model.non_trainable_weights))
+    self.assertEqual(2, len(model.weights))
+
+  def test_add_update_in_model(self):
+
+    class MyModel(keras.Model):
+
+      def __init__(self):
+        super(MyModel, self).__init__()
+        self.b = self.add_weight('bias', (10,))
+        self.c = self.add_weight('bias2', (10,))
+
+      def call(self, inputs):
+        # Unconditional
+        self.add_update(self.b.assign(self.b * 2))
+        # Conditional
+        self.add_update(self.c.assign(inputs[1, :]), inputs)
+        return inputs + self.b + self.c
+
+    x = ops.convert_to_tensor(np.ones((10, 10), 'float32'))
+    model = MyModel()
+    model(x)
+
+    if context.executing_eagerly():
+      self.assertEqual(0, len(model.updates))
+    else:
+      self.assertEqual(2, len(model.updates))
+      self.assertEqual(1, len(model.get_updates_for(None)))
+      self.assertEqual(1, len(model.get_updates_for(x)))
+
+
+@keras_parameterized.run_all_keras_modes
+class ModelSubclassCompiledTest(keras_parameterized.TestCase):
+
   def test_single_io_workflow_with_np_arrays(self):
     num_classes = 2
     num_samples = 100
@@ -419,7 +573,8 @@ class ModelSubclassingTest(test.TestCase):
     model.compile(
         loss='mse',
         optimizer=RMSPropOptimizer(learning_rate=0.001),
-        metrics=['acc', keras.metrics.CategoricalAccuracy()])
+        metrics=['acc', keras.metrics.CategoricalAccuracy()],
+        run_eagerly=testing_utils.should_run_eagerly())
 
     x = np.ones((num_samples, input_dim))
     y = np.zeros((num_samples, num_classes))
@@ -437,7 +592,8 @@ class ModelSubclassingTest(test.TestCase):
                              use_bn=True)
     model.compile(loss='mse',
                   optimizer=RMSPropOptimizer(learning_rate=0.001),
-                  metrics=['acc'])
+                  metrics=['acc'],
+                  run_eagerly=testing_utils.should_run_eagerly())
 
     x1 = np.ones((num_samples, input_dim))
     x2 = np.ones((num_samples, input_dim))
@@ -454,14 +610,16 @@ class ModelSubclassingTest(test.TestCase):
 
     with self.cached_session():
       model = SimpleTestModel(num_classes=num_classes, use_dp=True, use_bn=True)
-      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+      model.compile(
+          loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001),
+          run_eagerly=testing_utils.should_run_eagerly())
 
-      x = np.ones((num_samples, input_dim))
-      y = np.zeros((num_samples, num_classes))
+      x = np.ones((num_samples, input_dim), dtype=np.float32)
+      y = np.zeros((num_samples, num_classes), dtype=np.float32)
       dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
       dataset = dataset.repeat(100)
       dataset = dataset.batch(10)
-      iterator = dataset.make_one_shot_iterator()
+      iterator = dataset_ops.make_one_shot_iterator(dataset)
 
       model.fit(iterator, epochs=2, steps_per_epoch=10, verbose=0)
       _ = model.evaluate(iterator, steps=10, verbose=0)
@@ -484,7 +642,9 @@ class ModelSubclassingTest(test.TestCase):
     self.assertEqual(model.built, False)
     self.assertEqual(len(model.weights), 0)
 
-    model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+    model.compile(
+        loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001),
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch([x1, x2], [y1, y2])
 
     self.assertEqual(model.built, True)
@@ -514,7 +674,9 @@ class ModelSubclassingTest(test.TestCase):
     y = np.ones((num_samples, input_dim))
 
     model = BNNet()
-    model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+    model.compile(
+        loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001),
+        run_eagerly=testing_utils.should_run_eagerly())
     y_ref = model.predict(x)
 
     model.train_on_batch(x, y)
@@ -544,7 +706,9 @@ class ModelSubclassingTest(test.TestCase):
     x = np.ones((num_samples, input_dim))
     y = model.predict(x)
     self.assertEqual(np.sum(y), np.sum(x))
-    model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+    model.compile(
+        loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001),
+        run_eagerly=testing_utils.should_run_eagerly())
     loss = model.train_on_batch(x, y)
     self.assertGreater(loss, 0.1)
 
@@ -562,7 +726,9 @@ class ModelSubclassingTest(test.TestCase):
     y2 = np.zeros((num_samples, num_classes[1]))
 
     model = MultiIOTestModel(num_classes=num_classes, use_bn=True)
-    model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+    model.compile(
+        loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001),
+        run_eagerly=testing_utils.should_run_eagerly())
     model.fit([x1, x2], [y1, y2], epochs=2, batch_size=32, verbose=0)
     model.fit({'input_1': x1, 'input_2': x2},
               {'output_1': y1, 'output_2': y2},
@@ -571,7 +737,9 @@ class ModelSubclassingTest(test.TestCase):
               validation_data=([x1, x2], [y1, y2]))
 
     model = MultiIOTestModel(num_classes=num_classes, use_bn=True)
-    model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+    model.compile(
+        loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001),
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch([x1, x2], [y1, y2])
     model.train_on_batch({'input_1': x1, 'input_2': x2},
                          {'output_1': y1, 'output_2': y2})
@@ -589,7 +757,9 @@ class ModelSubclassingTest(test.TestCase):
     y2 = np.zeros((num_samples, num_classes[1]))
 
     model = MultiIOTestModel(num_classes=num_classes, use_bn=True)
-    model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+    model.compile(
+        loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001),
+        run_eagerly=testing_utils.should_run_eagerly())
     model.evaluate([x1, x2], [y1, y2])
     model.test_on_batch([x1, x2], [y1, y2])
 
@@ -611,7 +781,9 @@ class ModelSubclassingTest(test.TestCase):
     y2 = np.zeros((num_samples, num_classes[1]))
 
     model = MultiIOTestModel(num_classes=num_classes, use_bn=True)
-    model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+    model.compile(
+        loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001),
+        run_eagerly=testing_utils.should_run_eagerly())
     model.fit([x1, x2], [y1, y2], epochs=2, batch_size=32, verbose=0)
     y_ref_1, y_ref_2 = model.predict([x1, x2])
 
@@ -640,31 +812,6 @@ class ModelSubclassingTest(test.TestCase):
       self.assertAllClose(y_ref_1, y1, atol=1e-5)
       self.assertAllClose(y_ref_2, y2, atol=1e-5)
 
-  def test_summary(self):
-
-    class ToString(object):
-
-      def __init__(self):
-        self.contents = ''
-
-      def __call__(self, msg):
-        self.contents += msg + '\n'
-
-    # Single-io
-    model = SimpleTestModel(num_classes=4, use_bn=True, use_dp=True)
-    model._set_inputs(np.ones((3, 4)))  # need to build model first
-    print_fn = ToString()
-    model.summary(print_fn=print_fn)
-    self.assertTrue('Trainable params: 356' in print_fn.contents)
-
-    # Multi-io
-    model = MultiIOTestModel(num_classes=(5, 6), use_bn=True, use_dp=True)
-    model._set_inputs([np.ones((3, 4)),
-                       np.ones((3, 4))])  # need to build model first
-    print_fn = ToString()
-    model.summary(print_fn=print_fn)
-    self.assertTrue('Trainable params: 587' in print_fn.contents)
-
   def test_subclass_nested_in_subclass(self):
     num_classes = 2
     num_samples = 100
@@ -673,7 +820,8 @@ class ModelSubclassingTest(test.TestCase):
     model = NestedTestModel1(num_classes=num_classes)
     model.compile(loss='mse',
                   optimizer=RMSPropOptimizer(learning_rate=0.001),
-                  metrics=['acc'])
+                  metrics=['acc'],
+                  run_eagerly=testing_utils.should_run_eagerly())
 
     x = np.ones((num_samples, input_dim))
     y = np.zeros((num_samples, num_classes))
@@ -695,7 +843,8 @@ class ModelSubclassingTest(test.TestCase):
     model = NestedTestModel2(num_classes=num_classes)
     model.compile(loss='mse',
                   optimizer=RMSPropOptimizer(learning_rate=0.001),
-                  metrics=['acc'])
+                  metrics=['acc'],
+                  run_eagerly=testing_utils.should_run_eagerly())
 
     x = np.ones((num_samples, input_dim))
     y = np.zeros((num_samples, num_classes))
@@ -717,7 +866,8 @@ class ModelSubclassingTest(test.TestCase):
     model = get_nested_model_3(input_dim=input_dim, num_classes=num_classes)
     model.compile(loss='mse',
                   optimizer=RMSPropOptimizer(learning_rate=0.001),
-                  metrics=['acc'])
+                  metrics=['acc'],
+                  run_eagerly=testing_utils.should_run_eagerly())
 
     x = np.ones((num_samples, input_dim))
     y = np.zeros((num_samples, num_classes))
@@ -750,7 +900,8 @@ class ModelSubclassingTest(test.TestCase):
     model = keras.Sequential([Inner()])
     model.compile(loss='mse',
                   optimizer=RMSPropOptimizer(learning_rate=0.001),
-                  metrics=['acc'])
+                  metrics=['acc'],
+                  run_eagerly=testing_utils.should_run_eagerly())
 
     x = np.ones((num_samples, input_dim))
     y = np.zeros((num_samples, num_classes))
@@ -786,134 +937,12 @@ class ModelSubclassingTest(test.TestCase):
     x = np.ones((10, 10))
     y = model.predict(x)
     self.assertEqual(np.sum(y), np.sum(x))
-    model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+    model.compile(
+        loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001),
+        run_eagerly=testing_utils.should_run_eagerly())
     loss = model.train_on_batch(x, y)
     self.assertGreater(loss, 0.1)
 
-  def test_no_dependency(self):
-    class Foo(keras.Model):
-
-      def __init__(self):
-        super(Foo, self).__init__()
-        self.isdep = keras.layers.Dense(1)
-        self.notdep = data_structures.NoDependency(keras.layers.Dense(2))
-        self.notdep_var = data_structures.NoDependency(
-            resource_variable_ops.ResourceVariable(1., name='notdep_var'))
-
-    m = Foo()
-    self.assertEqual([m.isdep, m.notdep], m.layers)
-    self.assertEqual(1, len(m._checkpoint_dependencies))
-    self.assertIs(m.isdep, m._checkpoint_dependencies[0].ref)
-    self.assertEqual('notdep_var:0', m.notdep_var.name)
-
-  def test_extra_variable(self):
-
-    class ExtraVar(keras.Model):
-
-      def __init__(self):
-        super(ExtraVar, self).__init__()
-        self.dense = keras.layers.Dense(1)
-        self.var = resource_variable_ops.ResourceVariable(1.)
-        self.not_trainable_var = resource_variable_ops.ResourceVariable(
-            2., trainable=False)
-
-      def call(self, inputs):
-        return self.dense(inputs + self.var)
-
-    m = ExtraVar()
-    self.assertTrue(m.trainable)
-    self.assertEqual([m.dense], m.layers)
-    self.assertEqual([m.var, m.not_trainable_var], m.variables)
-    self.assertEqual([m.var], m.trainable_variables)
-    self.assertEqual([m.not_trainable_var], m.non_trainable_variables)
-    m.trainable = False
-    self.assertEqual([m.var, m.not_trainable_var], m.variables)
-    self.assertEqual([], m.trainable_variables)
-    self.assertEqual([m.var, m.not_trainable_var], m.non_trainable_variables)
-    m.trainable = True
-
-    m(array_ops.ones([1, 1]))
-
-    self.assertEqual([m.dense.kernel, m.dense.bias], m.dense.variables)
-    self.assertEqual([m.dense.kernel, m.dense.bias], m.dense.weights)
-
-    self.assertEqual([m.dense.kernel, m.dense.bias, m.var, m.not_trainable_var],
-                     m.variables)
-    self.assertEqual([m.dense.kernel, m.dense.bias, m.var],
-                     m.trainable_variables)
-    self.assertEqual([m.not_trainable_var], m.non_trainable_variables)
-
-    m.dense.trainable = False
-    self.assertEqual(
-        [m.var, m.dense.kernel, m.dense.bias, m.not_trainable_var],
-        m.variables)
-    self.assertEqual([m.var], m.trainable_variables)
-    self.assertEqual([m.dense.kernel, m.dense.bias, m.not_trainable_var],
-                     m.non_trainable_variables)
-
-  @test_util.run_in_graph_and_eager_modes
-  def test_add_weight_in_model(self):
-
-    class MyModel(keras.Model):
-
-      def __init__(self):
-        super(MyModel, self).__init__()
-        self.b = self.add_weight('bias', (10,))
-        self.c = self.add_weight('bias2', (10,), trainable=False)
-
-      def call(self, inputs):
-        return inputs + self.b + self.c
-
-    x = ops.convert_to_tensor(np.ones((10, 10), 'float32'))
-    model = MyModel()
-    model(x)
-    self.assertEqual(1, len(model.trainable_weights))
-    self.assertEqual(1, len(model.non_trainable_weights))
-    self.assertEqual(2, len(model.weights))
-
-    class MyModelCustomBuild(keras.Model):
-
-      def build(self, input_shape):
-        self.b = self.add_weight('bias', (10,))
-        self.c = self.add_weight('bias2', (10,), trainable=False)
-
-      def call(self, inputs):
-        return inputs + self.b + self.c
-
-    x = ops.convert_to_tensor(np.ones((10, 10), 'float32'))
-    model = MyModelCustomBuild()
-    model(x)
-    self.assertEqual(1, len(model.trainable_weights))
-    self.assertEqual(1, len(model.non_trainable_weights))
-    self.assertEqual(2, len(model.weights))
-
-  def test_add_update_in_model(self):
-
-    class MyModel(keras.Model):
-
-      def __init__(self):
-        super(MyModel, self).__init__()
-        self.b = self.add_weight('bias', (10,))
-        self.c = self.add_weight('bias2', (10,))
-
-      def call(self, inputs):
-        # Unconditional
-        self.add_update(self.b.assign(self.b * 2))
-        # Conditional
-        self.add_update(self.c.assign(inputs[1, :]), inputs)
-        return inputs + self.b + self.c
-
-    x = ops.convert_to_tensor(np.ones((10, 10), 'float32'))
-    model = MyModel()
-    model(x)
-
-    if context.executing_eagerly():
-      self.assertEqual(0, len(model.updates))
-    else:
-      self.assertEqual(2, len(model.updates))
-      self.assertEqual(1, len(model.get_updates_for(None)))
-      self.assertEqual(1, len(model.get_updates_for(x)))
-
 
 class GraphSpecificModelSubclassingTests(test.TestCase):
 
@@ -1083,9 +1112,9 @@ class TrainingMaskingModel(keras.Model):
     return self.dense1(x)
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class CustomCallSignatureTests(test.TestCase):
 
-  @test_util.run_in_graph_and_eager_modes
   def test_no_inputs_in_signature(self):
     model = CustomCallModel()
     first = array_ops.ones([2, 3])
@@ -1099,7 +1128,6 @@ class CustomCallSignatureTests(test.TestCase):
     output = model(first, second=second, training=False)
     self.assertAllClose(expected_output, self.evaluate(output))
 
-  @test_util.run_in_graph_and_eager_modes
   def test_training_args_call_build(self):
     input_dim = 2
 
@@ -1112,7 +1140,6 @@ class CustomCallSignatureTests(test.TestCase):
                                     'has been properly built.'))
     self.assertTrue(model.built, 'Model should be built after calling `build`.')
 
-  @test_util.run_in_graph_and_eager_modes
   def test_training_and_mask_args_call_build(self):
     input_dim = 2
 
@@ -1125,7 +1152,6 @@ class CustomCallSignatureTests(test.TestCase):
                                     'has been properly built.'))
     self.assertTrue(model.built, 'Model should be built after calling `build`.')
 
-  @test_util.run_in_graph_and_eager_modes
   def test_custom_call_kwargs_and_build(self):
     first_input_shape = (2, 3)
     second_input_shape = (2, 5)
@@ -1138,7 +1164,6 @@ class CustomCallSignatureTests(test.TestCase):
         ValueError, 'cannot build your model if it has positional'):
       model.build(input_shape=[first_input_shape, second_input_shape])
 
-  @test_util.run_in_graph_and_eager_modes
   def test_inputs_in_signature(self):
 
     class HasInputsAndOtherPositional(keras.Model):
@@ -1155,7 +1180,6 @@ class CustomCallSignatureTests(test.TestCase):
       x1, x2 = keras.Input((1, 1)), keras.Input((1, 1))
       model(x1, x2)
 
-  @test_util.run_in_graph_and_eager_modes
   def test_kwargs_in_signature(self):
 
     class HasKwargs(keras.Model):
@@ -1164,12 +1188,11 @@ class CustomCallSignatureTests(test.TestCase):
         return x
 
     model = HasKwargs()
-    arg = array_ops.ones([])
+    arg = array_ops.ones([1])
     model(arg, a=3)
     if not context.executing_eagerly():
       self.assertEqual(len(model.inputs), 1)
 
-  @test_util.run_in_graph_and_eager_modes
   def test_args_in_signature(self):
 
     class HasArgs(keras.Model):
@@ -1189,23 +1212,26 @@ class CustomCallSignatureTests(test.TestCase):
 
     class HasArgs(keras.Model):
 
-      def call(self, x, training=True, *args, **kwargs):
+      def call(self, x, training=True, *args, **kwargs):  # pylint:disable=keyword-arg-before-vararg
         return x
 
-    with context.graph_mode():
-      model = HasArgs()
-      x1, x2, x3 = keras.Input((1, 1)), keras.Input((1, 1)), keras.Input((1, 1))
-      with self.assertRaisesRegexp(
-          TypeError, 'may not accept both positional arguments and '):
-        model(x1, x2, x3, a=3)
+    model = HasArgs()
+    x1, x2, x3 = keras.Input((1, 1)), keras.Input((1, 1)), keras.Input((1, 1))
+    with self.assertRaisesRegexp(
+        TypeError, 'may not accept both positional arguments and '):
+      model(x1, x2, x3, a=3)
 
+  @test_util.assert_no_new_tensors
+  @test_util.assert_no_garbage_created
   def test_training_no_default(self):
+    if context.executing_eagerly():
+      self.skipTest('b/120997007')
 
-    with context.graph_mode():
-      model = TrainingNoDefaultModel()
-      arg = array_ops.ones([1, 1])
-      model(arg, True)
-      self.assertEqual(len(model.inputs), 1)
+    model = TrainingNoDefaultModel()
+
+    arg = array_ops.ones([1, 1])
+    model(arg, True)
+    self.assertEqual(len(model.inputs), 1)
 
   def test_training_no_default_with_positional(self):
 
@@ -1214,11 +1240,10 @@ class CustomCallSignatureTests(test.TestCase):
       def call(self, x, training, positional):
         return x
 
-    with context.graph_mode():
-      model = TrainingNoDefaultWithPositional()
-      x1, x2, x3 = keras.Input((1, 1)), keras.Input((1, 1)), keras.Input((1, 1))
-      with self.assertRaisesRegexp(TypeError, 'after a non-input'):
-        model(x1, x2, x3)
+    model = TrainingNoDefaultWithPositional()
+    x1, x2, x3 = keras.Input((1, 1)), keras.Input((1, 1)), keras.Input((1, 1))
+    with self.assertRaisesRegexp(TypeError, 'after a non-input'):
+      model(x1, x2, x3)
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/models.py b/tensorflow/python/keras/models.py
index 2637191bb75b357341376a703b2620243bd925bf..68d58bf66b4cf05b13c891cab6a7e3afed2981b3 100644
--- a/tensorflow/python/keras/models.py
+++ b/tensorflow/python/keras/models.py
@@ -31,8 +31,6 @@ from tensorflow.python.keras.engine.input_layer import InputLayer
 from tensorflow.python.keras.engine.network import Network
 from tensorflow.python.keras.utils import generic_utils
 from tensorflow.python.keras.utils.generic_utils import CustomObjectScope
-from tensorflow.python.training.checkpointable import base as checkpointable
-from tensorflow.python.training.checkpointable import data_structures
 from tensorflow.python.util.tf_export import tf_export
 
 # API entries importable from `keras.models`:
@@ -308,6 +306,10 @@ def _in_place_subclassed_model_reset(model):
     if isinstance(value, Layer):
       attributes_cache[name] = value
       assert value in model._layers
+      if hasattr(value, '_layers') and value._layers:
+        raise ValueError('We do not support the use of nested layers '
+                         'in `model_to_estimator` at this time. Found nested '
+                         'layer: %s' % value)
     elif isinstance(
         value,
         (list, tuple)) and name not in ('layers', '_layers', 'metrics',
@@ -322,7 +324,9 @@ def _in_place_subclassed_model_reset(model):
   # Replace layers on the model with fresh layers
   layers_to_names = {value: key for key, value in attributes_cache.items()}
   original_layers = model._layers[:]
-  model._layers = data_structures.NoDependency([])
+  setattr_tracking = model._setattr_tracking
+  model._setattr_tracking = False
+  model._layers = []
   for layer in original_layers:  # We preserve layer order.
     config = layer.get_config()
     # This will not work for nested subclassed models used as layers.
@@ -335,6 +339,7 @@ def _in_place_subclassed_model_reset(model):
     fresh_layer = layer.__class__.from_config(config)
     name = layers_to_names[layer]
     setattr(model, name, fresh_layer)
+    model._layers.append(fresh_layer)
 
   # Cache original model build attributes (in addition to layers)
   if (not hasattr(model, '_original_attributes_cache') or
@@ -367,12 +372,12 @@ def _in_place_subclassed_model_reset(model):
       ]
       for name in attributes_to_cache:
         attributes_cache[name] = getattr(model, name)
-  model._original_attributes_cache = data_structures.NoDependency(
-      attributes_cache)
+  model._original_attributes_cache = attributes_cache
   # Reset built state
   model.built = False
   model.inputs = None
   model.outputs = None
+  model._setattr_tracking = setattr_tracking
 
 
 def in_place_subclassed_model_state_restoration(model):
@@ -393,15 +398,15 @@ def in_place_subclassed_model_state_restoration(model):
     # back the previous attributes and track Layers by their original names
     # without adding dependencies on "utility" attributes which Models exempt
     # when they're constructed.
-    model._layers = data_structures.NoDependency([])
+    setattr_tracking = model._setattr_tracking
+    model._setattr_tracking = False
+    model._layers = []
     for name, value in model._original_attributes_cache.items():
-      if not isinstance(value, checkpointable.CheckpointableBase):
-        # If this value is not already checkpointable, it's probably that way
-        # for a reason; we don't want to start tracking data structures that the
-        # original Model didn't.
-        value = data_structures.NoDependency(value)
       setattr(model, name, value)
+      if isinstance(value, Layer):
+        model._layers.append(value)
     model._original_attributes_cache = None
+    model._setattr_tracking = setattr_tracking
   else:
     # Restore to the state of a never-called model.
     model.built = False
diff --git a/tensorflow/python/keras/models_test.py b/tensorflow/python/keras/models_test.py
index 8af3cc05f5db418709cec65f76810fdc0504487e..c466d94fed8f34e0ca9e25425f88d6028c806131 100644
--- a/tensorflow/python/keras/models_test.py
+++ b/tensorflow/python/keras/models_test.py
@@ -69,7 +69,7 @@ def sequential_model(add_input_layer, include_input_shape=True):
 
 class TestModelCloning(test.TestCase):
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only('b/120545219')
   def test_clone_sequential_model(self):
     with self.cached_session():
       val_a = np.random.random((10, 4))
@@ -84,28 +84,27 @@ class TestModelCloning(test.TestCase):
       # With placeholder creation
       new_model = keras.models.clone_model(model)
       # update ops from batch norm needs to be included
-      self.assertEquals(len(new_model.get_updates_for(new_model.inputs)), 2)
+      self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
       new_model.compile('rmsprop', 'mse')
       new_model.train_on_batch(val_a, val_out)
 
       # On top of new tensor
       input_a = keras.Input(shape=(4,))
       new_model = keras.models.clone_model(model, input_tensors=input_a)
-      self.assertEquals(len(new_model.get_updates_for(new_model.inputs)), 2)
+      self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
       new_model.compile('rmsprop', 'mse')
       new_model.train_on_batch(val_a, val_out)
 
       # On top of new, non-Keras tensor
       input_a = keras.backend.variable(val_a)
       new_model = keras.models.clone_model(model, input_tensors=input_a)
-      self.assertEquals(len(new_model.get_updates_for(new_model.inputs)), 2)
+      self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
       new_model.compile('rmsprop', 'mse')
       new_model.train_on_batch(None, val_out)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only('b/120545219')
   def test_clone_sequential_model_input_layer(self):
 
-    @test_util.run_deprecated_v1
     def test_input_layer(include_inputs):
       with self.cached_session():
         val_a = np.random.random((10, 4))
@@ -142,7 +141,7 @@ class TestModelCloning(test.TestCase):
     test_input_layer(True)
     test_input_layer(False)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only('b/120545219')
   def test_clone_functional_model(self):
     with self.cached_session():
       val_a = np.random.random((10, 4))
@@ -168,7 +167,7 @@ class TestModelCloning(test.TestCase):
     with self.cached_session():
       # With placeholder creation
       new_model = keras.models.clone_model(model)
-      self.assertEquals(len(new_model.get_updates_for(new_model.inputs)), 2)
+      self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
       new_model.compile('rmsprop', 'mse')
       new_model.train_on_batch([val_a, val_b], val_out)
 
@@ -177,7 +176,7 @@ class TestModelCloning(test.TestCase):
       input_b = keras.Input(shape=(4,), name='b')
       new_model = keras.models.clone_model(
           model, input_tensors=[input_a, input_b])
-      self.assertEquals(len(new_model.get_updates_for(new_model.inputs)), 2)
+      self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
       new_model.compile('rmsprop', 'mse')
       new_model.train_on_batch([val_a, val_b], val_out)
 
@@ -186,7 +185,7 @@ class TestModelCloning(test.TestCase):
       input_b = keras.backend.variable(val_b)
       new_model = keras.models.clone_model(
           model, input_tensors=[input_a, input_b])
-      self.assertEquals(len(new_model.get_updates_for(new_model.inputs)), 2)
+      self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
       new_model.compile('rmsprop', 'mse')
       new_model.train_on_batch(None, val_out)
 
@@ -318,6 +317,7 @@ class TestModelDeepCopy(test.TestCase):
                       model_copy.get_weights()[0]))
 
 
+@test_util.run_v1_only('b/120545219')
 class TestCloneAndBuildModel(test.TestCase):
 
   def test_clone_and_build_non_compiled_model(self):
@@ -365,7 +365,9 @@ class TestCloneAndBuildModel(test.TestCase):
 
     self.assertEqual('mse', model.loss)
     self.assertTrue(
-        isinstance(model.optimizer, keras.optimizers.RMSprop))
+        isinstance(model.optimizer,
+                   (keras.optimizers.RMSprop,
+                    keras.optimizer_v2.rmsprop.RMSprop)))
     self.assertEqual(['acc', metrics.categorical_accuracy],
                      model._compile_metrics)
 
@@ -402,7 +404,6 @@ class TestCloneAndBuildModel(test.TestCase):
       new_model.train_on_batch(inp, out)
       new_model.evaluate(inp, out)
 
-  @test_util.run_deprecated_v1
   def test_clone_and_build_compiled_sequential_model(self):
     with self.cached_session():
       model = keras.models.Sequential()
@@ -415,7 +416,6 @@ class TestCloneAndBuildModel(test.TestCase):
 
     self._clone_and_build_test_helper(model)
 
-  @test_util.run_deprecated_v1
   def test_clone_and_build_functional_model(self):
     with self.cached_session():
       input_a = keras.Input(shape=(4,))
@@ -432,7 +432,6 @@ class TestCloneAndBuildModel(test.TestCase):
 
     self._clone_and_build_test_helper(model)
 
-  @test_util.run_deprecated_v1
   def test_clone_and_build_subclassed_model(self):
     class SubclassedModel(keras.Model):
 
@@ -481,11 +480,9 @@ class TestCloneAndBuildModel(test.TestCase):
   def test_replace_tf_optimizer_iterations_variable(self):
     self.assert_optimizer_iterations_increases(adam.AdamOptimizer(0.01))
 
-  @test_util.run_deprecated_v1
   def test_replace_keras_optimizer_iterations_variable(self):
     self.assert_optimizer_iterations_increases('adam')
 
-  @test_util.run_deprecated_v1
   def test_clone_and_build_sequential_model_without_inputs_defined(self):
     with self.cached_session():
       model = sequential_model(False, False)
diff --git a/tensorflow/python/keras/optimizer_v2/BUILD b/tensorflow/python/keras/optimizer_v2/BUILD
index 6b805781f0b9e9b34ebd7bb80b4aa0075caf4db8..b8f01249419c595a735442310c735bc10648cba6 100644
--- a/tensorflow/python/keras/optimizer_v2/BUILD
+++ b/tensorflow/python/keras/optimizer_v2/BUILD
@@ -172,8 +172,9 @@ cuda_py_test(
 
 py_test(
     name = "optimizer_v2_test",
-    size = "medium",
+    size = "large",
     srcs = ["optimizer_v2_test.py"],
+    shard_count = 4,
     tags = [
         "no_windows",
     ],
@@ -189,6 +190,8 @@ py_test(
         "//tensorflow/python:state_ops",
         "//tensorflow/python:variables",
         "//tensorflow/python/eager:def_function",
+        "//tensorflow/python/keras",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
diff --git a/tensorflow/python/keras/optimizer_v2/adadelta.py b/tensorflow/python/keras/optimizer_v2/adadelta.py
index e1d7ecb558668ce5db479531898cee3fea701ebf..88ddc943249974260b62c188f8a101c8ba20a253 100644
--- a/tensorflow/python/keras/optimizer_v2/adadelta.py
+++ b/tensorflow/python/keras/optimizer_v2/adadelta.py
@@ -18,10 +18,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+
 from tensorflow.python.keras.optimizer_v2 import optimizer_v2
 from tensorflow.python.training import training_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.optimizers.Adadelta', v1=[])
 class Adadelta(optimizer_v2.OptimizerV2):
   r"""Optimizer that implements the Adadelta algorithm.
 
@@ -83,16 +87,27 @@ class Adadelta(optimizer_v2.OptimizerV2):
     @end_compatibility
     """
     super(Adadelta, self).__init__(name, **kwargs)
-    self._set_hyper('learning_rate', learning_rate)
+    self._set_hyper('learning_rate', kwargs.get('lr', learning_rate))
     self._set_hyper('decay', self._initial_decay)
     self._set_hyper('rho', rho)
     self._set_hyper('epsilon', epsilon)
 
   def _create_slots(self, var_list):
+    # Separate for-loops to respect the ordering of slot variables from v1.
     for v in var_list:
       self.add_slot(v, 'accum_grad')
+    for v in var_list:
       self.add_slot(v, 'accum_var')
 
+  def set_weights(self, weights):
+    params = self.weights
+    # Override set_weights for backward compatibility of Keras V1 optimizer
+    # since it does not include iteration at head of the weight list. Set
+    # iteration to 0.
+    if len(params) == len(weights) + 1:
+      weights = [np.array(0)] + weights
+    super(Adadelta, self).set_weights(weights)
+
   def _resource_apply_dense(self, grad, var):
     var_dtype = var.dtype.base_dtype
     lr_t = self._decayed_lr(var_dtype)
diff --git a/tensorflow/python/keras/optimizer_v2/adadelta_test.py b/tensorflow/python/keras/optimizer_v2/adadelta_test.py
index 6409f3ead44242fa6e4cab496cf462f104e1e6fc..c95af6a8ad5308c357d96532f6599342b16aa276 100644
--- a/tensorflow/python/keras/optimizer_v2/adadelta_test.py
+++ b/tensorflow/python/keras/optimizer_v2/adadelta_test.py
@@ -75,16 +75,16 @@ class AdadeltaOptimizerTest(test.TestCase):
             slot = [None] * 2
             slot_update = [None] * 2
             slot[0] = adadelta_opt.get_slot(var0, "accum_grad")
-            self.assertEquals(slot[0].get_shape(), var0.get_shape())
+            self.assertEqual(slot[0].get_shape(), var0.get_shape())
 
             slot_update[0] = adadelta_opt.get_slot(var0, "accum_var")
-            self.assertEquals(slot_update[0].get_shape(), var0.get_shape())
+            self.assertEqual(slot_update[0].get_shape(), var0.get_shape())
 
             slot[1] = adadelta_opt.get_slot(var1, "accum_grad")
-            self.assertEquals(slot[1].get_shape(), var1.get_shape())
+            self.assertEqual(slot[1].get_shape(), var1.get_shape())
 
             slot_update[1] = adadelta_opt.get_slot(var1, "accum_var")
-            self.assertEquals(slot_update[1].get_shape(), var1.get_shape())
+            self.assertEqual(slot_update[1].get_shape(), var1.get_shape())
 
           # Fetch params to validate initial values
           self.assertAllClose(var0_init, self.evaluate(var0))
@@ -153,8 +153,11 @@ class AdadeltaOptimizerTest(test.TestCase):
       with self.cached_session():
         var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
         x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
-        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
-        loss = pred * pred
+
+        def loss():
+          pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)  # pylint: disable=cell-var-from-loop
+          return pred * pred
+
         sgd_op = adadelta.Adadelta(1.0, 1.0, 1.0).minimize(
             loss, var_list=[var0])
         variables.global_variables_initializer().run()
@@ -165,6 +168,14 @@ class AdadeltaOptimizerTest(test.TestCase):
         # Validate updated params
         self.assertAllCloseAccordingToType([[-111, -138]], self.evaluate(var0))
 
+  def testConstructAdadeltaWithLR(self):
+    opt = adadelta.Adadelta(lr=1.0, rho=0.9, epsilon=1.)
+    self.assertEqual(opt.lr, 1.0)
+    opt_2 = adadelta.Adadelta(learning_rate=0.1, rho=0.9, epsilon=1., lr=1.0)
+    self.assertEqual(opt_2.lr, 1.0)
+    opt_3 = adadelta.Adadelta(learning_rate=0.1, rho=0.9, epsilon=1.)
+    self.assertEqual(opt_3.lr, 0.1)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/keras/optimizer_v2/adagrad.py b/tensorflow/python/keras/optimizer_v2/adagrad.py
index 0896f95f94d69e1822c21e27f95e9752005b86b8..ac55d2075a839e4ec5863e2b2ae8c22c9a4e645f 100644
--- a/tensorflow/python/keras/optimizer_v2/adagrad.py
+++ b/tensorflow/python/keras/optimizer_v2/adagrad.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+
 from tensorflow.python.framework import ops
 from tensorflow.python.keras.optimizer_v2 import optimizer_v2
 from tensorflow.python.ops import array_ops
@@ -25,8 +27,10 @@ from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.optimizers.Adagrad', v1=[])
 class Adagrad(optimizer_v2.OptimizerV2):
   r"""Optimizer that implements the Adagrad algorithm.
 
@@ -78,13 +82,13 @@ class Adagrad(optimizer_v2.OptimizerV2):
     functions.
     @end_compatibility
     """
-    if initial_accumulator_value <= 0.0:
-      raise ValueError('initial_accumulator_value must be positive: %s' %
+    if initial_accumulator_value < 0.0:
+      raise ValueError('initial_accumulator_value must be non-negative: %s' %
                        initial_accumulator_value)
     if epsilon < 1e-7:
       raise ValueError('epsilon must be larger than 1e-7: %s' % epsilon)
     super(Adagrad, self).__init__(name, **kwargs)
-    self._set_hyper('learning_rate', learning_rate)
+    self._set_hyper('learning_rate', kwargs.get('lr', learning_rate))
     self._set_hyper('decay', self._initial_decay)
     self._initial_accumulator_value = initial_accumulator_value
     self._set_hyper('epsilon', epsilon)
@@ -96,6 +100,38 @@ class Adagrad(optimizer_v2.OptimizerV2):
           self._initial_accumulator_value, dtype=dtype)
       self.add_slot(var, 'accumulator', init)
 
+  def set_weights(self, weights):
+    params = self.weights
+    # Override set_weights for backward compatibility of Keras V1 optimizer
+    # since it does not include iteration at head of the weight list. Set
+    # iteration to 0.
+    if len(params) == len(weights) + 1:
+      weights = [np.array(0)] + weights
+    super(Adagrad, self).set_weights(weights)
+
+  @classmethod
+  def from_config(cls, config, custom_objects=None):
+    """Creates an optimizer from its config.
+
+    This method is the reverse of `get_config`,
+    capable of instantiating the same optimizer from the config
+    dictionary.
+
+    Arguments:
+        config: A Python dictionary, typically the output of get_config.
+        custom_objects: A Python dictionary mapping names to additional Python
+          objects used to create this optimizer, such as a function used for a
+          hyperparameter.
+
+    Returns:
+        An optimizer instance.
+    """
+    if 'initial_accumulator_value' not in config:
+      config['initial_accumulator_value'] = 0.
+    if 'lr' in config:
+      config['learning_rate'] = config.pop('lr')
+    return cls(**config)
+
   def _resource_apply_dense(self, grad, var):
     var_dtype = var.dtype.base_dtype
     lr_t = self._decayed_lr(var_dtype)
diff --git a/tensorflow/python/keras/optimizer_v2/adagrad_test.py b/tensorflow/python/keras/optimizer_v2/adagrad_test.py
index 8004907bf52921e542314c4511b94eb5fd89b868..cf6f6a7832c56cd36d4b99ac88e26ce5c09ac7f6 100644
--- a/tensorflow/python/keras/optimizer_v2/adagrad_test.py
+++ b/tensorflow/python/keras/optimizer_v2/adagrad_test.py
@@ -167,8 +167,11 @@ class AdagradOptimizerTest(test.TestCase):
         var0 = resource_variable_ops.ResourceVariable(
             [[1.0, 2.0], [3.0, 4.0]], dtype=dtype)
         x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
-        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
-        loss = pred * pred
+
+        def loss():
+          pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)  # pylint: disable=cell-var-from-loop
+          return pred * pred
+
         sgd_op = adagrad.Adagrad(1.0).minimize(loss, var_list=[var0])
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
@@ -297,12 +300,12 @@ class AdagradOptimizerTest(test.TestCase):
       with self.cached_session():
         var_repeated = resource_variable_ops.ResourceVariable(
             [1.0, 2.0], dtype=dtype)
-        loss_repeated = math_ops.reduce_sum(
-            embedding_ops.embedding_lookup(var_repeated, [0, 0]))
+        loss_repeated = lambda: math_ops.reduce_sum(  # pylint: disable=g-long-lambda
+            embedding_ops.embedding_lookup(var_repeated, [0, 0]))  # pylint: disable=cell-var-from-loop
         var_aggregated = resource_variable_ops.ResourceVariable(
             [1.0, 2.0], dtype=dtype)
-        loss_aggregated = 2 * math_ops.reduce_sum(
-            embedding_ops.embedding_lookup(var_aggregated, [0]))
+        loss_aggregated = lambda: 2 * math_ops.reduce_sum(  # pylint: disable=g-long-lambda
+            embedding_ops.embedding_lookup(var_aggregated, [0]))  # pylint: disable=cell-var-from-loop
         update_op_repeated = adagrad.Adagrad(2.0).minimize(
             loss_repeated, var_list=[var_repeated])
         update_op_aggregated = adagrad.Adagrad(2.0).minimize(
@@ -372,9 +375,9 @@ class AdagradOptimizerTest(test.TestCase):
         ada_update2 = ada_opt.apply_gradients(
             zip([grads0, grads1], [var0, var1]))
         slot0 = ada_opt.get_slot(var0, "accumulator")
-        self.assertEquals(slot0.get_shape(), var0.get_shape())
+        self.assertEqual(slot0.get_shape(), var0.get_shape())
         slot1 = ada_opt.get_slot(var1, "accumulator")
-        self.assertEquals(slot1.get_shape(), var1.get_shape())
+        self.assertEqual(slot1.get_shape(), var1.get_shape())
         variables.global_variables_initializer().run()
 
         # Fetch params to validate initial values.
@@ -395,6 +398,14 @@ class AdagradOptimizerTest(test.TestCase):
         self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
         self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
 
+  def testConstructAdagradWithLR(self):
+    opt = adagrad.Adagrad(lr=1.0)
+    self.assertEqual(opt.lr, 1.0)
+    opt_2 = adagrad.Adagrad(learning_rate=0.1, lr=1.0)
+    self.assertEqual(opt_2.lr, 1.0)
+    opt_3 = adagrad.Adagrad(learning_rate=0.1)
+    self.assertEqual(opt_3.lr, 0.1)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/keras/optimizer_v2/adam.py b/tensorflow/python/keras/optimizer_v2/adam.py
index a3f12909aaf8d2b4800f03ea63a3381d10266240..873dadb31a40c8af3ec26c6b550fe0e2c3e3fa25 100644
--- a/tensorflow/python/keras/optimizer_v2/adam.py
+++ b/tensorflow/python/keras/optimizer_v2/adam.py
@@ -24,8 +24,10 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.training import training_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.optimizers.Adam', v1=[])
 class Adam(optimizer_v2.OptimizerV2):
   """Optimizer that implements the Adam algorithm.
 
@@ -127,21 +129,33 @@ class Adam(optimizer_v2.OptimizerV2):
     """
 
     super(Adam, self).__init__(name, **kwargs)
-    self._set_hyper('learning_rate', learning_rate)
+    self._set_hyper('learning_rate', kwargs.get('lr', learning_rate))
     self._set_hyper('decay', self._initial_decay)
     self._set_hyper('beta_1', beta_1)
     self._set_hyper('beta_2', beta_2)
     self._set_hyper('epsilon', epsilon)
-    # TODO(tanzheny): create op for resource_apply_adam_with_amsgrad
-    if amsgrad:
-      raise ValueError('Amsgrad is currently not supported.')
-    self._amsgrad = amsgrad
+    self.amsgrad = amsgrad
 
   def _create_slots(self, var_list):
     # Create slots for the first and second moments.
+    # Separate for-loops to respect the ordering of slot variables from v1.
     for var in var_list:
       self.add_slot(var, 'm')
+    for var in var_list:
       self.add_slot(var, 'v')
+    if self.amsgrad:
+      for var in var_list:
+        self.add_slot(var, 'vhat')
+
+  def set_weights(self, weights):
+    params = self.weights
+    # If the weights are generated by Keras V1 optimizer, it includes vhats
+    # even without amsgrad, i.e, V1 optimizer has 3x + 1 variables, while V2
+    # optimizer has 2x + 1 variables. Filter vhats out for compatibility.
+    num_vars = int((len(params) - 1) / 2)
+    if len(weights) == 3 * num_vars + 1:
+      weights = weights[:len(params)]
+    super(Adam, self).set_weights(weights)
 
   def _resource_apply_dense(self, grad, var):
     var_dtype = var.dtype.base_dtype
@@ -150,21 +164,38 @@ class Adam(optimizer_v2.OptimizerV2):
     v = self.get_slot(var, 'v')
     beta_1_t = self._get_hyper('beta_1', var_dtype)
     beta_2_t = self._get_hyper('beta_2', var_dtype)
+    epsilon = self._get_hyper('epsilon', var_dtype)
     local_step = math_ops.cast(self.iterations + 1, var_dtype)
     beta_1_power = math_ops.pow(beta_1_t, local_step)
     beta_2_power = math_ops.pow(beta_2_t, local_step)
-    return training_ops.resource_apply_adam(
-        var.handle,
-        m.handle,
-        v.handle,
-        beta_1_power,
-        beta_2_power,
-        lr_t,
-        beta_1_t,
-        beta_2_t,
-        self._get_hyper('epsilon', var_dtype),
-        grad,
-        use_locking=self._use_locking)
+    if not self.amsgrad:
+      return training_ops.resource_apply_adam(
+          var.handle,
+          m.handle,
+          v.handle,
+          beta_1_power,
+          beta_2_power,
+          lr_t,
+          beta_1_t,
+          beta_2_t,
+          epsilon,
+          grad,
+          use_locking=self._use_locking)
+    else:
+      vhat = self.get_slot(var, 'vhat')
+      return training_ops.resource_apply_adam_with_amsgrad(
+          var.handle,
+          m.handle,
+          v.handle,
+          vhat.handle,
+          beta_1_power,
+          beta_2_power,
+          lr_t,
+          beta_1_t,
+          beta_2_t,
+          epsilon,
+          grad,
+          use_locking=self._use_locking)
 
   def _resource_apply_sparse(self, grad, var, indices):
     var_dtype = var.dtype.base_dtype
@@ -191,10 +222,23 @@ class Adam(optimizer_v2.OptimizerV2):
     with ops.control_dependencies([v_t]):
       v_t = self._resource_scatter_add(v, indices, v_scaled_g_values)
 
-    v_sqrt = math_ops.sqrt(v_t)
-    var_update = state_ops.assign_sub(
-        var, lr * m_t / (v_sqrt + epsilon_t), use_locking=self._use_locking)
-    return control_flow_ops.group(*[var_update, m_t, v_t])
+    if not self.amsgrad:
+      v_sqrt = math_ops.sqrt(v_t)
+      var_update = state_ops.assign_sub(
+          var, lr * m_t / (v_sqrt + epsilon_t), use_locking=self._use_locking)
+      return control_flow_ops.group(*[var_update, m_t, v_t])
+    else:
+      v_hat = self.get_slot(var, 'vhat')
+      v_hat_t = math_ops.maximum(v_hat, v_t)
+      with ops.control_dependencies([v_hat_t]):
+        v_hat_t = state_ops.assign(
+            v_hat, v_hat_t, use_locking=self._use_locking)
+      v_hat_sqrt = math_ops.sqrt(v_hat_t)
+      var_update = state_ops.assign_sub(
+          var,
+          lr * m_t / (v_hat_sqrt + epsilon_t),
+          use_locking=self._use_locking)
+      return control_flow_ops.group(*[var_update, m_t, v_t, v_hat_t])
 
   def _resource_scatter_add(self, x, i, v):
     with ops.control_dependencies(
@@ -209,6 +253,6 @@ class Adam(optimizer_v2.OptimizerV2):
         'beta_1': self._serialize_hyperparameter('beta_1'),
         'beta_2': self._serialize_hyperparameter('beta_2'),
         'epsilon': self._serialize_hyperparameter('epsilon'),
-        'amsgrad': self._amsgrad,
+        'amsgrad': self.amsgrad,
     })
     return config
diff --git a/tensorflow/python/keras/optimizer_v2/adam_test.py b/tensorflow/python/keras/optimizer_v2/adam_test.py
index b1b17f8f088567e0be73b642c7deef1aa3282572..49a9de41cdf8fd6391c31b2e75b9eb116eeabfbd 100644
--- a/tensorflow/python/keras/optimizer_v2/adam_test.py
+++ b/tensorflow/python/keras/optimizer_v2/adam_test.py
@@ -25,6 +25,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
+from tensorflow.python.keras import optimizers
 from tensorflow.python.keras.optimizer_v2 import adam
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
@@ -51,6 +52,52 @@ def adam_update_numpy(param,
   return param_t, m_t, v_t
 
 
+def adam_update_numpy_amsgrad(param,
+                              g_t,
+                              t,
+                              m,
+                              v,
+                              vhat,
+                              lr=0.001,
+                              beta1=0.9,
+                              beta2=0.999,
+                              epsilon=1e-7):
+  lr_t = lr * np.sqrt(1 - beta2**(t + 1)) / (1 - beta1**(t + 1))
+
+  m_t = beta1 * m + (1 - beta1) * g_t
+  v_t = beta2 * v + (1 - beta2) * g_t * g_t
+  vhat_t = np.maximum(vhat, v_t)
+
+  param_t = param - lr_t * m_t / (np.sqrt(vhat_t) + epsilon)
+  return param_t, m_t, v_t, vhat_t
+
+
+def adam_sparse_update_numpy_amsgrad(param,
+                                     indices,
+                                     g_t,
+                                     t,
+                                     m,
+                                     v,
+                                     vhat,
+                                     lr=0.001,
+                                     beta1=0.9,
+                                     beta2=0.999,
+                                     epsilon=1e-7):
+  m_t, v_t, vhat_t, param_t = (np.copy(m), np.copy(v), np.copy(vhat),
+                               np.copy(param))
+  lr_t = lr * np.sqrt(1 - beta2**(t + 1)) / (1 - beta1**(t + 1))
+  m_t_slice = beta1 * m[indices] + (1 - beta1) * g_t
+  v_t_slice = beta2 * v[indices] + (1 - beta2) * g_t * g_t
+  m_t[indices] = m_t_slice
+  v_t[indices] = v_t_slice
+  v_hat_t = np.maximum(vhat_t, v_t)
+  v_hat_t_slice = v_hat_t[indices]
+  param_t_slice = param[indices] - (
+      lr_t * (m_t_slice / (np.sqrt(v_hat_t_slice) + epsilon)))
+  param_t[indices] = param_t_slice
+  return param_t, m_t, v_t, vhat_t
+
+
 def get_beta_accumulators(opt, dtype):
   local_step = math_ops.cast(opt.iterations + 1, dtype)
   beta_1_t = math_ops.cast(opt._get_hyper("beta_1"), dtype)
@@ -115,9 +162,9 @@ class AdamOptimizerTest(test.TestCase):
         # it (i.e. they have GPU kernels).
         var = variables.Variable([[1.0], [2.0]])
         indices = constant_op.constant([0, 1], dtype=index_dtype)
-        gathered_sum = math_ops.reduce_sum(array_ops.gather(var, indices))
+        g_sum = lambda: math_ops.reduce_sum(array_ops.gather(var, indices))  # pylint: disable=cell-var-from-loop
         optimizer = adam.Adam(3.0)
-        minimize_op = optimizer.minimize(gathered_sum, var_list=[var])
+        minimize_op = optimizer.minimize(g_sum, var_list=[var])
         variables.global_variables_initializer().run()
         minimize_op.run()
 
@@ -211,6 +258,100 @@ class AdamOptimizerTest(test.TestCase):
     with context.eager_mode():
       self.doTestBasic(use_callable_params=True)
 
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
+  def testBasicWithAmsgrad(self):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      with self.session(graph=ops.Graph()):
+        # Initialize variables for numpy implementation.
+        m0, v0, v0hat, m1, v1, v1hat = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = resource_variable_ops.ResourceVariable(
+            var0_np, name="var0_%d" % i)
+        var1 = resource_variable_ops.ResourceVariable(
+            var1_np, name="var1_%d" % i)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+
+        opt = adam.Adam(amsgrad=True)
+        if not context.executing_eagerly():
+          update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+
+        self.evaluate(variables.global_variables_initializer())
+        # Run 3 steps of Adam
+        for t in range(3):
+          beta_1_power, beta_2_power = get_beta_accumulators(opt, dtype)
+          self.assertAllCloseAccordingToType(0.9**(t + 1),
+                                             self.evaluate(beta_1_power))
+          self.assertAllCloseAccordingToType(0.999**(t + 1),
+                                             self.evaluate(beta_2_power))
+          if not context.executing_eagerly():
+            self.evaluate(update)
+          else:
+            opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+
+          var0_np, m0, v0, v0hat = adam_update_numpy_amsgrad(
+              var0_np, grads0_np, t, m0, v0, v0hat)
+          var1_np, m1, v1, v1hat = adam_update_numpy_amsgrad(
+              var1_np, grads1_np, t, m1, v1, v1hat)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testSparseWithAmsgrad(self):
+    # dtypes.half does not work on gpu + eager.
+    for dtype in [dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        m0 = np.array([[0.0], [0.0]])
+        v0 = np.array([[0.0], [0.0]])
+        v0hat = np.array([[0.0], [0.0]])
+        indices_np = np.array([1])
+        indices = constant_op.constant(indices_np, dtype=dtypes.int32)
+        var0_np = np.array([[1.0], [2.0]], dtype=dtype.as_numpy_dtype)
+        repeated_index_update_var = variables.Variable(var0_np, dtype=dtype)
+        aggregated_update_var = variables.Variable(var0_np, dtype=dtype)
+        grads0_np = np.array([[0.2]], dtype=dtype.as_numpy_dtype)
+        grad_repeated_index = ops.IndexedSlices(
+            constant_op.constant([0.1, 0.1], shape=[2, 1], dtype=dtype),
+            constant_op.constant([1, 1]), constant_op.constant([2, 1]))
+        grad_aggregated = ops.IndexedSlices(grads0_np, indices,
+                                            constant_op.constant([2, 1]))
+        opt_repeated = adam.Adam(amsgrad=True)
+        opt_aggregated = adam.Adam(amsgrad=True)
+        if not context.executing_eagerly():
+          repeated_update = opt_repeated.apply_gradients(
+              [(grad_repeated_index, repeated_index_update_var)])
+          aggregated_update = opt_aggregated.apply_gradients(
+              [(grad_aggregated, aggregated_update_var)])
+        self.evaluate(variables.global_variables_initializer())
+        self.assertAllClose(
+            self.evaluate(aggregated_update_var),
+            self.evaluate(repeated_index_update_var))
+        for t in range(3):
+          if not context.executing_eagerly():
+            self.evaluate(repeated_update)
+            self.evaluate(aggregated_update)
+          else:
+            opt_repeated.apply_gradients(
+                [(grad_repeated_index, repeated_index_update_var)])
+            opt_aggregated.apply_gradients(
+                [(grad_aggregated, aggregated_update_var)])
+
+          var0_np, m0, v0, v0hat = adam_sparse_update_numpy_amsgrad(
+              var0_np, indices_np, grads0_np, t, m0, v0, v0hat)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(
+              var0_np, self.evaluate(aggregated_update_var))
+          self.assertAllCloseAccordingToType(
+              self.evaluate(aggregated_update_var),
+              self.evaluate(repeated_index_update_var))
+
   @test_util.run_deprecated_v1
   def testBasicWithLearningRateDecay(self):
     for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
@@ -347,14 +488,28 @@ class AdamOptimizerTest(test.TestCase):
       v2 = resource_variable_ops.ResourceVariable(1.)
       opt = adam.Adam(1.)
       opt.minimize(lambda: v1 + v2, var_list=[v1, v2])
-      # There should be iteration, hyper variables, and two unique slot
-      # variables for v1 and v2 respectively.
-      self.assertEqual(10, len(set(opt.variables())))
-
-  def testAmsgradWithError(self):
-    with self.assertRaisesRegexp(ValueError,
-                                 "Amsgrad is currently not supported"):
-      adam.Adam(learning_rate=1., beta_1=0.9, beta_2=0.99, amsgrad=True)
+      # There should be iteration, and two unique slot variables for v1 and v2.
+      self.assertEqual(5, len(set(opt.variables())))
+      self.assertEqual(
+          self.evaluate(opt.variables()[0]), self.evaluate(opt.iterations))
+
+  def testSetWeightsFromV1AdamWithoutMinimize(self):
+    keras_v1_adam = optimizers.Adam()
+    keras_v2_adam = adam.Adam()
+    keras_v2_adam.set_weights(keras_v1_adam.get_weights())
+    keras_v1_iteration = keras_v1_adam.iterations
+    keras_v2_iteration = keras_v2_adam.iterations
+    self.evaluate(variables.global_variables_initializer())
+    self.assertEqual(
+        self.evaluate(keras_v1_iteration), self.evaluate(keras_v2_iteration))
+
+  def testConstructAdamWithLR(self):
+    opt = adam.Adam(lr=1.0)
+    self.assertEqual(opt.lr, 1.0)
+    opt_2 = adam.Adam(learning_rate=0.1, lr=1.0)
+    self.assertEqual(opt_2.lr, 1.0)
+    opt_3 = adam.Adam(learning_rate=0.1)
+    self.assertEqual(opt_3.lr, 0.1)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/keras/optimizer_v2/adamax.py b/tensorflow/python/keras/optimizer_v2/adamax.py
index ddd78584f852f24f9da6277888d1883bb44db327..9c826eb42a3faf142dd652a4a0764b66bcb79fb4 100644
--- a/tensorflow/python/keras/optimizer_v2/adamax.py
+++ b/tensorflow/python/keras/optimizer_v2/adamax.py
@@ -25,8 +25,10 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.training import training_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.optimizers.Adamax', v1=[])
 class Adamax(adam.Adam):
   """Optimizer that implements the Adamax algorithm.
 
diff --git a/tensorflow/python/keras/optimizer_v2/adamax_test.py b/tensorflow/python/keras/optimizer_v2/adamax_test.py
index 7521a6196e43d8e7dccf7c849db23bb13785ec03..339c0fe6e6dbc5d9fc90aa29b212b5e0c2a290f1 100644
--- a/tensorflow/python/keras/optimizer_v2/adamax_test.py
+++ b/tensorflow/python/keras/optimizer_v2/adamax_test.py
@@ -136,9 +136,9 @@ class AdamaxOptimizerTest(test.TestCase):
         # it (i.e. they have GPU kernels).
         var = variables.Variable([[1.0], [2.0]])
         indices = constant_op.constant([0, 1], dtype=index_dtype)
-        gathered_sum = math_ops.reduce_sum(array_ops.gather(var, indices))
+        g_sum = lambda: math_ops.reduce_sum(array_ops.gather(var, indices))  # pylint: disable=cell-var-from-loop
         optimizer = adamax.Adamax(3.0)
-        minimize_op = optimizer.minimize(gathered_sum, var_list=[var])
+        minimize_op = optimizer.minimize(g_sum, var_list=[var])
         variables.global_variables_initializer().run()
         minimize_op.run()
 
@@ -359,9 +359,16 @@ class AdamaxOptimizerTest(test.TestCase):
       v2 = resource_variable_ops.ResourceVariable(1.)
       opt = adamax.Adamax(1.)
       opt.minimize(lambda: v1 + v2, var_list=[v1, v2])
-      # There should be iteration, hyper variables, and two unique slot
-      # variables for v1 and v2 respectively.
-      self.assertEqual(10, len(set(opt.variables())))
+      # There should be iteration, and two unique slot variables for v1 and v2.
+      self.assertEqual(5, len(set(opt.variables())))
+
+  def testConstructAdamaxWithLR(self):
+    opt = adamax.Adamax(lr=1.0)
+    self.assertEqual(opt.lr, 1.0)
+    opt_2 = adamax.Adamax(learning_rate=0.1, lr=1.0)
+    self.assertEqual(opt_2.lr, 1.0)
+    opt_3 = adamax.Adamax(learning_rate=0.1)
+    self.assertEqual(opt_3.lr, 0.1)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/keras/optimizer_v2/ftrl.py b/tensorflow/python/keras/optimizer_v2/ftrl.py
index e278e352f551a12718f6b400b16f9d7e05d0c02e..7828b1791e9a9fab1bf01c60d0fcb435b4314be0 100644
--- a/tensorflow/python/keras/optimizer_v2/ftrl.py
+++ b/tensorflow/python/keras/optimizer_v2/ftrl.py
@@ -21,8 +21,10 @@ from tensorflow.python.keras.optimizer_v2 import optimizer_v2
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.training import training_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.optimizers.Ftrl', v1=[])
 class Ftrl(optimizer_v2.OptimizerV2):
   """Optimizer that implements the FTRL algorithm.
 
diff --git a/tensorflow/python/keras/optimizer_v2/ftrl_test.py b/tensorflow/python/keras/optimizer_v2/ftrl_test.py
index bec400e8cbba2654decaf520a24800095e4d16f5..f0f07e9d03f6db31f5e83efbbe6428688d944093 100644
--- a/tensorflow/python/keras/optimizer_v2/ftrl_test.py
+++ b/tensorflow/python/keras/optimizer_v2/ftrl_test.py
@@ -113,8 +113,11 @@ class FtrlOptimizerTest(test.TestCase):
       with self.cached_session():
         var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
         x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
-        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
-        loss = pred * pred
+
+        def loss():
+          pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)  # pylint: disable=cell-var-from-loop
+          return pred * pred
+
         sgd_op = ftrl.Ftrl(1.0).minimize(loss, var_list=[var0])
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
diff --git a/tensorflow/python/keras/optimizer_v2/gradient_descent.py b/tensorflow/python/keras/optimizer_v2/gradient_descent.py
index 03e4515e022d9aaeb1eb74e3d1f0888c5b288ec0..06db2f3b4cfe3a4ff9cc4f577e660a1830fa14a7 100644
--- a/tensorflow/python/keras/optimizer_v2/gradient_descent.py
+++ b/tensorflow/python/keras/optimizer_v2/gradient_descent.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -21,8 +21,10 @@ from tensorflow.python.framework import ops
 from tensorflow.python.keras.optimizer_v2 import optimizer_v2
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.training import training_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("keras.optimizers.SGD", v1=[])
 class SGD(optimizer_v2.OptimizerV2):
   """Stochastic gradient descent and momentum optimizer.
 
@@ -32,7 +34,7 @@ class SGD(optimizer_v2.OptimizerV2):
   gradient is evaluated at theta(t).
   ```
 
-  or Computes (if `use_nesterov = False`):
+  or Computes (if `nesterov = False`):
   ```
   v(t+1) = momentum * v(t) - learning_rate * gradient
   theta(t+1) = theta(t) + v(t+1)
@@ -75,7 +77,7 @@ class SGD(optimizer_v2.OptimizerV2):
       **kwargs: keyword arguments. Allowed to be {`decay`}
     """
     super(SGD, self).__init__(name, **kwargs)
-    self._set_hyper("learning_rate", learning_rate)
+    self._set_hyper("learning_rate", kwargs.get("lr", learning_rate))
     self._set_hyper("decay", self._initial_decay)
 
     self._momentum = False
@@ -85,7 +87,7 @@ class SGD(optimizer_v2.OptimizerV2):
       raise ValueError("`momentum` must be between [0, 1].")
     self._set_hyper("momentum", momentum)
 
-    self._nesterov = nesterov
+    self.nesterov = nesterov
 
   def _create_slots(self, var_list):
     if self._momentum:
@@ -97,14 +99,14 @@ class SGD(optimizer_v2.OptimizerV2):
     lr_t = self._decayed_lr(var_dtype)
     if self._momentum:
       momentum_var = self.get_slot(var, "momentum")
-      return training_ops.resource_apply_momentum(
+      return training_ops.resource_apply_keras_momentum(
           var.handle,
           momentum_var.handle,
           lr_t,
           grad,
           self._get_hyper("momentum", var_dtype),
           use_locking=self._use_locking,
-          use_nesterov=self._nesterov)
+          use_nesterov=self.nesterov)
     else:
       return training_ops.resource_apply_gradient_descent(
           var.handle, lr_t, grad, use_locking=self._use_locking)
@@ -124,7 +126,7 @@ class SGD(optimizer_v2.OptimizerV2):
     var_dtype = var.dtype.base_dtype
     lr_t = self._decayed_lr(var_dtype)
     momentum_var = self.get_slot(var, "momentum")
-    return training_ops.resource_sparse_apply_momentum(
+    return training_ops.resource_sparse_apply_keras_momentum(
         var.handle,
         momentum_var.handle,
         lr_t,
@@ -132,7 +134,7 @@ class SGD(optimizer_v2.OptimizerV2):
         indices,
         self._get_hyper("momentum", var_dtype),
         use_locking=self._use_locking,
-        use_nesterov=self._nesterov)
+        use_nesterov=self.nesterov)
 
   def get_config(self):
     config = super(SGD, self).get_config()
@@ -140,6 +142,6 @@ class SGD(optimizer_v2.OptimizerV2):
         "learning_rate": self._serialize_hyperparameter("learning_rate"),
         "decay": self._serialize_hyperparameter("decay"),
         "momentum": self._serialize_hyperparameter("momentum"),
-        "nesterov": self._nesterov,
+        "nesterov": self.nesterov,
     })
     return config
diff --git a/tensorflow/python/keras/optimizer_v2/gradient_descent_test.py b/tensorflow/python/keras/optimizer_v2/gradient_descent_test.py
index dae2a47735ef8cc3e988f4dff216b1462f75c3dc..9a4178db46981afb7fe841e5b8d2506db7692cfe 100644
--- a/tensorflow/python/keras/optimizer_v2/gradient_descent_test.py
+++ b/tensorflow/python/keras/optimizer_v2/gradient_descent_test.py
@@ -122,8 +122,6 @@ class GradientDescentOptimizerTest(test.TestCase):
         var1 = resource_variable_ops.ResourceVariable([3.0], dtype=dtype)
         x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
         loss = lambda: math_ops.matmul(var0, x) + var1  # pylint: disable=cell-var-from-loop
-        if not context.executing_eagerly():
-          loss = loss()
         sgd = gradient_descent.SGD(1.0)
         sgd_op = sgd.minimize(loss, [var0, var1])
         self.evaluate(variables.global_variables_initializer())
@@ -141,9 +139,12 @@ class GradientDescentOptimizerTest(test.TestCase):
         var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
         var1 = resource_variable_ops.ResourceVariable([3.0], dtype=dtype)
         x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
-        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
-        pred += var1
-        loss = pred * pred
+
+        def loss():
+          pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)  # pylint: disable=cell-var-from-loop
+          pred += var1  # pylint: disable=cell-var-from-loop
+          return pred * pred
+
         sgd_op = gradient_descent.SGD(1.0).minimize(loss, [var0, var1])
         self.evaluate(variables.global_variables_initializer())
         # Run 1 step of sgd
@@ -181,7 +182,8 @@ class GradientDescentOptimizerTest(test.TestCase):
         opt = gradient_descent.SGD(3.0)
         values = [1.0, 3.0]
         vars_ = [variables.Variable([v], dtype=dtype) for v in values]
-        grads_and_vars = opt.compute_gradients(vars_[0] + vars_[1], vars_)
+        loss = lambda: vars_[0] + vars_[1]  # pylint: disable=cell-var-from-loop
+        grads_and_vars = opt._compute_gradients(loss, vars_)
         self.evaluate(variables.global_variables_initializer())
         for grad, _ in grads_and_vars:
           self.assertAllCloseAccordingToType([1.0], self.evaluate(grad))
@@ -259,14 +261,20 @@ class GradientDescentOptimizerTest(test.TestCase):
       # be an EagerTensor once again, not a graph Tensor.
       self.assertEqual(float(step()), -1.0)
 
+  def testConstructSGDWithLR(self):
+    opt = gradient_descent.SGD(lr=1.0)
+    self.assertEqual(opt.lr, 1.0)
+    opt_2 = gradient_descent.SGD(learning_rate=0.1, lr=1.0)
+    self.assertEqual(opt_2.lr, 1.0)
+    opt_3 = gradient_descent.SGD(learning_rate=0.1)
+    self.assertEqual(opt_3.lr, 0.1)
+
 
 class MomentumOptimizerTest(test.TestCase):
 
   def _update_nesterov_momentum_numpy(self, var, accum, g, lr, momentum):
-    var = var + accum * lr * momentum
-    accum = accum * momentum + g
-    var = var - lr * accum
-    var = var - accum * lr * momentum
+    accum = accum * momentum - g * lr
+    var += (accum * momentum - g * lr)
     return var, accum
 
   @test_util.run_in_graph_and_eager_modes
@@ -291,9 +299,9 @@ class MomentumOptimizerTest(test.TestCase):
 
         # Check we have slots
         slot0 = mom_opt.get_slot(var0, "momentum")
-        self.assertEquals(slot0.get_shape(), var0.get_shape())
+        self.assertEqual(slot0.get_shape(), var0.get_shape())
         slot1 = mom_opt.get_slot(var1, "momentum")
-        self.assertEquals(slot1.get_shape(), var1.get_shape())
+        self.assertEqual(slot1.get_shape(), var1.get_shape())
 
         # Step 1: the momentum accumulators where 0. So we should see a normal
         # update: v -= grad * learning_rate
@@ -301,9 +309,9 @@ class MomentumOptimizerTest(test.TestCase):
         self.evaluate(mom_update)
         # Check that the momentum accumulators have been updated.
         self.assertAllCloseAccordingToType(
-            np.array([0.1, 0.1]), self.evaluate(slot0))
+            np.array([-0.2, -0.2]), self.evaluate(slot0))
         self.assertAllCloseAccordingToType(
-            np.array([0.01, 0.01]), self.evaluate(slot1))
+            np.array([-0.02, -0.02]), self.evaluate(slot1))
         # Check that the parameters have been updated.
         self.assertAllCloseAccordingToType(
             np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]),
@@ -317,11 +325,11 @@ class MomentumOptimizerTest(test.TestCase):
           mom_opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
         # Check that the momentum accumulators have been updated.
         self.assertAllCloseAccordingToType(
-            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]),
+            np.array([(0.9 * (-0.2) - 2.0 * 0.1), (0.9 * (-0.2) - 2.0 * 0.1)]),
             self.evaluate(slot0))
         self.assertAllCloseAccordingToType(
-            np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]),
-            self.evaluate(slot1))
+            np.array([(0.9 * (-0.02) - 2.0 * 0.01),
+                      (0.9 * (-0.02) - 2.0 * 0.01)]), self.evaluate(slot1))
         # Check that the parameters have been updated.
         self.assertAllCloseAccordingToType(
             np.array([
@@ -348,7 +356,7 @@ class MomentumOptimizerTest(test.TestCase):
         var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
         accum0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
         accum1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
-        loss = 5 * var0 * var0 + 3 * var1
+        loss = lambda: 5 * var0 * var0 + 3 * var1  # pylint: disable=cell-var-from-loop
         mom_op = gradient_descent.SGD(
             learning_rate=2.0, momentum=0.9, nesterov=True)
         opt_op = mom_op.minimize(loss, [var0, var1])
@@ -474,9 +482,9 @@ class MomentumOptimizerTest(test.TestCase):
         variables.global_variables_initializer().run()
         # Check we have slots
         slot0 = mom_opt.get_slot(var0, "momentum")
-        self.assertEquals(slot0.get_shape(), var0.get_shape())
+        self.assertEqual(slot0.get_shape(), var0.get_shape())
         slot1 = mom_opt.get_slot(var1, "momentum")
-        self.assertEquals(slot1.get_shape(), var1.get_shape())
+        self.assertEqual(slot1.get_shape(), var1.get_shape())
 
         # Fetch params to validate initial values
         self.assertAllClose([1.0, 2.0], self.evaluate(var0))
@@ -486,9 +494,9 @@ class MomentumOptimizerTest(test.TestCase):
         mom_update.run()
         # Check that the momentum accumulators have been updated.
         self.assertAllCloseAccordingToType(
-            np.array([0.1, 0.1]), self.evaluate(slot0))
+            np.array([-0.2, -0.2]), self.evaluate(slot0))
         self.assertAllCloseAccordingToType(
-            np.array([0.01, 0.01]), self.evaluate(slot1))
+            np.array([-0.02, -0.02]), self.evaluate(slot1))
         # Check that the parameters have been updated.
         self.assertAllCloseAccordingToType(
             np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]),
@@ -500,11 +508,11 @@ class MomentumOptimizerTest(test.TestCase):
         mom_update.run()
         # Check that the momentum accumulators have been updated.
         self.assertAllCloseAccordingToType(
-            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]),
+            np.array([(0.9 * (-0.2) - 2.0 * 0.1), (0.9 * (-0.2) - 2.0 * 0.1)]),
             self.evaluate(slot0))
         self.assertAllCloseAccordingToType(
-            np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]),
-            self.evaluate(slot1))
+            np.array([(0.9 * (-0.02) - 2.0 * 0.01),
+                      (0.9 * (-0.02) - 2.0 * 0.01)]), self.evaluate(slot1))
         # Check that the parameters have been updated.
         self.assertAllCloseAccordingToType(
             np.array([
@@ -536,9 +544,9 @@ class MomentumOptimizerTest(test.TestCase):
 
         # Check we have slots
         slot0 = mom_opt.get_slot(var0, "momentum")
-        self.assertEquals(slot0.get_shape(), var0.get_shape())
+        self.assertEqual(slot0.get_shape(), var0.get_shape())
         slot1 = mom_opt.get_slot(var1, "momentum")
-        self.assertEquals(slot1.get_shape(), var1.get_shape())
+        self.assertEqual(slot1.get_shape(), var1.get_shape())
 
         # Fetch params to validate initial values
         self.assertAllClose([0, 0], self.evaluate(var0)[0])
@@ -553,10 +561,10 @@ class MomentumOptimizerTest(test.TestCase):
             np.array([0, 0]),
             self.evaluate(slot0)[0])
         self.assertAllCloseAccordingToType(
-            np.array([.1, .1]),
+            np.array([-2.0 * .1, -2.0 * .1]),
             self.evaluate(slot0)[1])
         self.assertAllCloseAccordingToType(
-            np.array([.01, .01]),
+            np.array([-2.0 * .01, -2.0 * .01]),
             self.evaluate(slot1)[2])
         # Check that the parameters have been updated.
         self.assertAllCloseAccordingToType(
@@ -573,10 +581,11 @@ class MomentumOptimizerTest(test.TestCase):
         # Check that the momentum accumulators have been updated.
         self.assertAllClose(np.array([0, 0]), self.evaluate(slot0)[0])
         self.assertAllCloseAccordingToType(
-            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]),
+            np.array([(0.9 * (-0.2) - 2.0 * 0.1), (0.9 * (-0.2) - 2.0 * 0.1)]),
             self.evaluate(slot0)[1])
         self.assertAllCloseAccordingToType(
-            np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]),
+            np.array([(0.9 * (-0.02) - 2.0 * 0.01),
+                      (0.9 * (-0.02) - 2.0 * 0.01)]),
             self.evaluate(slot1)[2])
         # Check that the parameters have been updated.
         self.assertAllClose(np.array([0, 0]), self.evaluate(var0)[0])
@@ -609,9 +618,9 @@ class MomentumOptimizerTest(test.TestCase):
         variables.global_variables_initializer().run()
 
         slot0 = mom_opt.get_slot(var0, "momentum")
-        self.assertEquals(slot0.get_shape(), var0.get_shape())
+        self.assertEqual(slot0.get_shape(), var0.get_shape())
         slot1 = mom_opt.get_slot(var1, "momentum")
-        self.assertEquals(slot1.get_shape(), var1.get_shape())
+        self.assertEqual(slot1.get_shape(), var1.get_shape())
 
         # Fetch params to validate initial values
         self.assertAllClose([1.0, 2.0], self.evaluate(var0))
@@ -621,9 +630,9 @@ class MomentumOptimizerTest(test.TestCase):
         mom_update1.run()
         # Check that the momentum accumulators have been updated.
         self.assertAllCloseAccordingToType(
-            np.array([0.1, 0.1]), self.evaluate(slot0))
+            np.array([-0.2, -0.2]), self.evaluate(slot0))
         self.assertAllCloseAccordingToType(
-            np.array([0.01, 0.01]), self.evaluate(slot1))
+            np.array([-0.02, -0.02]), self.evaluate(slot1))
         # Check that the parameters have been updated.
         self.assertAllCloseAccordingToType(
             np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]),
@@ -635,11 +644,11 @@ class MomentumOptimizerTest(test.TestCase):
         mom_update2.run()
         # Check that the momentum accumulators have been updated.
         self.assertAllCloseAccordingToType(
-            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]),
+            np.array([(0.9 * (-0.2) - 2.0 * 0.1), (0.9 * (-0.2) - 2.0 * 0.1)]),
             self.evaluate(slot0))
         self.assertAllCloseAccordingToType(
-            np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]),
-            self.evaluate(slot1))
+            np.array([(0.9 * (-0.02) - 2.0 * 0.01),
+                      (0.9 * (-0.02) - 2.0 * 0.01)]), self.evaluate(slot1))
         # Check that the parameters have been updated.
         self.assertAllCloseAccordingToType(
             np.array([
@@ -678,12 +687,20 @@ class MomentumOptimizerTest(test.TestCase):
           opt3._get_hyper("momentum"))
       # self.assertEqual(
       #     self.evaluate(opt._get_hyper("decay")), opt3._get_hyper("decay"))
-      self.assertTrue(opt3._nesterov)
+      self.assertTrue(opt3.nesterov)
 
   def testNesterovWithoutMomentum(self):
     with self.assertRaisesRegexp(ValueError, "must be between"):
       gradient_descent.SGD(learning_rate=1.0, momentum=2.0)
 
+  def testConstructMomentumWithLR(self):
+    opt = gradient_descent.SGD(lr=1.0, momentum=0.9)
+    self.assertEqual(opt.lr, 1.0)
+    opt_2 = gradient_descent.SGD(learning_rate=0.1, momentum=0.9, lr=1.0)
+    self.assertEqual(opt_2.lr, 1.0)
+    opt_3 = gradient_descent.SGD(learning_rate=0.1, momentum=0.9)
+    self.assertEqual(opt_3.lr, 0.1)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/keras/optimizer_v2/nadam.py b/tensorflow/python/keras/optimizer_v2/nadam.py
index 00b095e0dc950c7e68414c1657847b891652a5ba..afa74c8de37665ea217fa55cbdea3dda86908f55 100644
--- a/tensorflow/python/keras/optimizer_v2/nadam.py
+++ b/tensorflow/python/keras/optimizer_v2/nadam.py
@@ -74,6 +74,9 @@ class Nadam(adam.Adam):
       **kwargs: keyword arguments. Allowed to be {`decay`}
     """
 
+    # Backwards compatiblity with keras NAdam optimizer.
+    if 'schedule_decay' in kwargs:
+      kwargs['decay'] = kwargs.pop('schedule_decay')
     # pylint: disable=useless-super-delegation
     super(Nadam, self).__init__(
         learning_rate=learning_rate,
diff --git a/tensorflow/python/keras/optimizer_v2/nadam_test.py b/tensorflow/python/keras/optimizer_v2/nadam_test.py
index d991e3117cad4530ffb1f3a4315b49dc46d26bfc..73568e81f0c6ae680226a123c0098e56a131e826 100644
--- a/tensorflow/python/keras/optimizer_v2/nadam_test.py
+++ b/tensorflow/python/keras/optimizer_v2/nadam_test.py
@@ -208,6 +208,18 @@ class NadamOptimizerTest(test.TestCase):
           self.assertAllCloseAccordingToType(var0_np, var0.eval())
           self.assertAllCloseAccordingToType(var1_np, var1.eval())
 
+  def testConstructNAdamWithLR(self):
+    opt = nadam.Nadam(lr=1.0)
+    self.assertEqual(opt.lr, 1.0)
+    opt_2 = nadam.Nadam(learning_rate=0.1, lr=1.0)
+    self.assertEqual(opt_2.lr, 1.0)
+    opt_3 = nadam.Nadam(learning_rate=0.1)
+    self.assertEqual(opt_3.lr, 0.1)
+
+  def testConstructNAdamWithScheduleDecay(self):
+    opt = nadam.Nadam(schedule_decay=0.2)
+    self.assertEqual(opt.decay, 0.2)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/keras/optimizer_v2/optimizer_v2.py b/tensorflow/python/keras/optimizer_v2/optimizer_v2.py
index 9c8fff0fe42501d2d2e43a4fb09f2d7ca3da1156..0e909d0d79c9b7238e9af42a0bdcd7f2d4d9f7c3 100644
--- a/tensorflow/python/keras/optimizer_v2/optimizer_v2.py
+++ b/tensorflow/python/keras/optimizer_v2/optimizer_v2.py
@@ -21,9 +21,12 @@ from __future__ import division
 from __future__ import print_function
 
 import abc
+import functools
 
 import six
 
+from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import distribution_strategy_context as distribute_ctx
 from tensorflow.python.distribute import reduce_util as ds_reduce_util
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
@@ -32,17 +35,40 @@ from tensorflow.python.framework import ops
 from tensorflow.python.keras import backend
 from tensorflow.python.keras import initializers
 from tensorflow.python.keras.engine import base_layer_utils
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import gradients
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import distribution_strategy_context
-from tensorflow.python.training import optimizer as optimizer_v1
+from tensorflow.python.training.checkpointable import base as checkpointable
 from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
+
+
+def _deduplicate_indexed_slices(values, indices):
+  """Sums `values` associated with any non-unique `indices`.
+
+  Args:
+    values: A `Tensor` with rank >= 1.
+    indices: A one-dimensional integer `Tensor`, indexing into the first
+      dimension of `values` (as in an IndexedSlices object).
+
+  Returns:
+    A tuple of (`summed_values`, `unique_indices`) where `unique_indices` is a
+    de-duplicated version of `indices` and `summed_values` contains the sum of
+    `values` slices associated with each unique index.
+  """
+  unique_indices, new_index_positions = array_ops.unique(indices)
+  summed_values = math_ops.unsorted_segment_sum(
+      values, new_index_positions,
+      array_ops.shape(unique_indices)[0])
+  return (summed_values, unique_indices)
 
 
 @six.add_metaclass(abc.ABCMeta)
-class OptimizerV2(optimizer_v1.Optimizer):
+@tf_export("keras.optimizers.Optimizer", v1=[])
+class OptimizerV2(checkpointable.CheckpointableBase):
   """Updated base class for optimizers.
 
   This class defines the API to add Ops to train a model.  You never use this
@@ -137,26 +163,30 @@ class OptimizerV2(optimizer_v1.Optimizer):
           _create_vars.
     """
     self._use_locking = True
-    super(OptimizerV2, self).__init__(self._use_locking, name)
+    self._name = name
     self._hyper = {}
     # dict: {variable name : {slot name : variable}}
     self._slots = {}
+    self._slot_names = []
     self._weights = []
 
+    # For implementing Checkpointable. Stores information about how to restore
+    # slot variables which have not yet been created
+    # (checkpointable._CheckpointPosition objects).
+    #  {slot_name :
+    #      {_var_key(variable_to_train): [checkpoint_position, ... ], ... },
+    #   ... }
+    self._deferred_slot_restorations = {}
+
     decay = kwargs.pop("decay", 0.0)
     if decay < 0.:
       raise ValueError("decay cannot be less than 0: {}".format(decay))
     self._initial_decay = decay
+    self.__dict__.update(kwargs)
 
     self._prepared = False
 
-  def minimize(self,
-               loss,
-               var_list,
-               aggregation_method=None,
-               colocate_gradients_with_ops=False,
-               name=None,
-               grad_loss=None):
+  def minimize(self, loss, var_list, grad_loss=None, name=None):
     """Add operations to minimize `loss` by updating `var_list`.
 
     This method simply combines calls `compute_gradients()` and
@@ -165,15 +195,11 @@ class OptimizerV2(optimizer_v1.Optimizer):
     of using this function.
 
     Args:
-      loss: A `Tensor` containing the value to minimize.
+      loss: A callable taking no arguments which returns the value to minimize.
       var_list: list or tuple of `Variable` objects to update to minimize
         `loss`.
-      aggregation_method: Specifies the method used to combine gradient terms.
-        Valid values are defined in the class `AggregationMethod`.
-      colocate_gradients_with_ops: If True, try colocating gradients with the
-        corresponding op.
-      name: Optional name for the returned operation.
       grad_loss: Optional. A `Tensor` holding the gradient computed for `loss`.
+      name: Optional name for the returned operation.
 
     Returns:
       An Operation that updates the variables in `var_list`.  If `global_step`
@@ -185,29 +211,16 @@ class OptimizerV2(optimizer_v1.Optimizer):
     @compatibility(eager)
     When eager execution is enabled, `loss` should be a Python function that
     takes no arguments and computes the value to be minimized. Minimization (and
-    gradient computation) is done with respect to the elements of `var_list` if
-    not None, else with respect to any trainable variables created during the
-    execution of the `loss` function. `gate_gradients`, `aggregation_method`,
-    `colocate_gradients_with_ops` and `grad_loss` are ignored when eager
-    execution is enabled.
+    gradient computation) is done with respect to the elements of `var_list`.
+    `grad_loss` is ignored when eager execution is enabled.
     @end_compatibility
     """
-    grads_and_vars = self.compute_gradients(
-        loss,
-        var_list=var_list,
-        aggregation_method=aggregation_method,
-        colocate_gradients_with_ops=colocate_gradients_with_ops,
-        grad_loss=grad_loss)
+    grads_and_vars = self._compute_gradients(
+        loss, var_list=var_list, grad_loss=grad_loss)
 
     return self.apply_gradients(grads_and_vars, name=name)
 
-  def compute_gradients(self,
-                        loss,
-                        var_list,
-                        aggregation_method=None,
-                        colocate_gradients_with_ops=False,
-                        grad_loss=None,
-                        stop_gradients=None):
+  def _compute_gradients(self, loss, var_list, grad_loss=None):
     """Compute gradients of `loss` for the variables in `var_list`.
 
     This is the first part of `minimize()`.  It returns a list
@@ -217,19 +230,11 @@ class OptimizerV2(optimizer_v1.Optimizer):
     given variable.
 
     Args:
-      loss: A Tensor containing the value to minimize or a callable taking no
-        arguments which returns the value to minimize. When eager execution is
-        enabled it must be a callable.
-      var_list: Optional list or tuple of `tf.Variable` to update to minimize
+      loss: A callable taking no arguments which returns the value to minimize.
+      var_list: List or tuple of `tf.Variable` to update to minimize
         `loss`.  Defaults to the list of variables collected in the graph under
         the key `GraphKeys.TRAINABLE_VARIABLES`.
-      aggregation_method: Specifies the method used to combine gradient terms.
-        Valid values are defined in the class `AggregationMethod`.
-      colocate_gradients_with_ops: If True, try colocating gradients with the
-        corresponding op.
       grad_loss: Optional. A `Tensor` holding the gradient computed for `loss`.
-      stop_gradients: Optional. A Tensor or list of tensors not to differentiate
-        through.
 
     Returns:
       A list of (gradient, variable) pairs. Variable is always present, but
@@ -238,36 +243,22 @@ class OptimizerV2(optimizer_v1.Optimizer):
     Raises:
       TypeError: If `var_list` contains anything else than `Variable` objects.
       ValueError: If some arguments are invalid, or var_list is None.
-      RuntimeError: If called with eager execution enabled and `loss` is
-        not callable.
-
-    @compatibility(eager)
-    When eager execution is enabled, `aggregation_method`, and
-    `colocate_gradients_with_ops` are ignored.
-    @end_compatibility
     """
     var_list = nest.flatten(var_list)
     # TODO(josh11b): Test that we handle weight decay in a reasonable way.
-    if callable(loss):
-      with backprop.GradientTape() as tape:
-        tape.watch(var_list)
-        loss_value = loss()
-      grads = tape.gradient(loss_value, var_list, grad_loss)
-    else:
-      if context.executing_eagerly():
-        raise RuntimeError("`loss` passed to Optimizer.compute_gradients "
-                           "should be a function when eager execution is "
-                           "enabled.")
-      self._assert_valid_dtypes([loss])
-      if grad_loss is not None:
-        self._assert_valid_dtypes([grad_loss])
-      grads = gradients.gradients(
-          loss,
-          var_list,
-          grad_ys=grad_loss,
-          aggregation_method=aggregation_method,
-          colocate_gradients_with_ops=colocate_gradients_with_ops,
-          stop_gradients=stop_gradients)
+    with backprop.GradientTape() as tape:
+      tape.watch(var_list)
+      loss_value = loss()
+      loss_value = self._scale_loss(loss_value)
+    grads = tape.gradient(loss_value, var_list, grad_loss)
+
+    if hasattr(self, "clipnorm"):
+      grads = [clip_ops.clip_by_norm(g, self.clipnorm) for g in grads]
+    if hasattr(self, "clipvalue"):
+      grads = [
+          clip_ops.clip_by_value(g, -self.clipvalue, self.clipvalue)
+          for g in grads
+      ]
 
     grads_and_vars = list(zip(grads, var_list))
     self._assert_valid_dtypes([
@@ -277,6 +268,46 @@ class OptimizerV2(optimizer_v1.Optimizer):
 
     return grads_and_vars
 
+  @staticmethod
+  def _scale_loss(loss_value):
+    if distribute_lib.get_loss_reduction() == ds_reduce_util.ReduceOp.MEAN:
+      num_replicas = \
+        distribute_ctx.get_distribution_strategy().num_replicas_in_sync
+      if num_replicas > 1:
+        loss_value *= (1. / num_replicas)
+    return loss_value
+
+  def get_gradients(self, loss, params):
+    """Returns gradients of `loss` with respect to `params`.
+
+    Arguments:
+      loss: Loss tensor.
+      params: List of variables.
+
+    Returns:
+      List of gradient tensors.
+
+    Raises:
+      ValueError: In case any gradient cannot be computed (e.g. if gradient
+        function not implemented).
+    """
+    loss = self._scale_loss(loss)
+    grads = gradients.gradients(loss, params)
+    if None in grads:
+      raise ValueError("An operation has `None` for gradient. "
+                       "Please make sure that all of your ops have a "
+                       "gradient defined (i.e. are differentiable). "
+                       "Common ops without gradient: "
+                       "K.argmax, K.round, K.eval.")
+    if hasattr(self, "clipnorm"):
+      grads = [clip_ops.clip_by_norm(g, self.clipnorm) for g in grads]
+    if hasattr(self, "clipvalue"):
+      grads = [
+          clip_ops.clip_by_value(g, -self.clipvalue, self.clipvalue)
+          for g in grads
+      ]
+    return grads
+
   def apply_gradients(self, grads_and_vars, name=None):
     """Apply gradients to variables.
 
@@ -299,10 +330,11 @@ class OptimizerV2(optimizer_v1.Optimizer):
     """
     grads_and_vars = _filter_grads(grads_and_vars)
     var_list = [v for (_, v) in grads_and_vars]
-    if distribution_strategy_context.has_distribution_strategy():
+    if distribute_ctx.has_distribution_strategy():
       reduced_grads = merge_grads(grads_and_vars)
       grads_and_vars = zip(reduced_grads, var_list)
 
+    self._prepare()
     with ops.init_scope():
       self._create_slots(var_list)
     update_ops = []
@@ -325,7 +357,6 @@ class OptimizerV2(optimizer_v1.Optimizer):
         return update_op
 
     with ops.name_scope(name, self._name) as name:
-      self._prepare()
       for grad, var in grads_and_vars:
         scope_name = ("" if ops.executing_eagerly_outside_functions() else
                       "_" + var.op.name)
@@ -339,7 +370,13 @@ class OptimizerV2(optimizer_v1.Optimizer):
       return apply_updates
 
   def get_updates(self, loss, params):
-    return [self.minimize(loss, params)]
+    grads = self.get_gradients(loss, params)
+    grads_and_vars = list(zip(grads, params))
+    self._assert_valid_dtypes([
+        v for g, v in grads_and_vars
+        if g is not None and v.dtype != dtypes.resource
+    ])
+    return [self.apply_gradients(grads_and_vars)]
 
   def _set_hyper(self, name, value):
     """set hyper `name` to value. value can be callable, tensor, numeric."""
@@ -387,18 +424,36 @@ class OptimizerV2(optimizer_v1.Optimizer):
     else:
       super(OptimizerV2, self).__setattr__(name, value)
 
+  def get_slot_names(self):
+    """A list of names for this optimizer's slots."""
+    return self._slot_names
+
   def add_slot(self, var, slot_name, initializer="zeros"):
+    """Add a new slot variable for `var`."""
+    if slot_name not in self._slot_names:
+      self._slot_names.append(slot_name)
     var_key = _var_key(var)
     slot_dict = self._slots.setdefault(var_key, {})
-    if slot_name not in slot_dict:
-      slot_key = _get_slot_key_from_var(var, slot_name)
-      weight = self.add_weight(
-          name=slot_key,
-          shape=var.shape,
+    weight = slot_dict.get(slot_name, None)
+    if weight is None:
+      if isinstance(initializer, six.string_types) or callable(initializer):
+        initializer = initializers.get(initializer)
+        initial_value = functools.partial(
+            initializer, shape=var.shape, dtype=var.dtype)
+      else:
+        initial_value = initializer
+      weight = tf_variables.Variable(
+          name="%s/%s" % (var._shared_name, slot_name),  # pylint: disable=protected-access
           dtype=var.dtype,
-          initializer=initializer)
+          trainable=False,
+          initial_value=initial_value)
+      backend.track_variable(weight)
       slot_dict[slot_name] = weight
+      self._restore_slot_variable(
+          slot_name=slot_name, variable=var,
+          slot_variable=weight)
       self._weights.append(weight)
+    return weight
 
   def get_slot(self, var, slot_name):
     var_key = _var_key(var)
@@ -426,7 +481,6 @@ class OptimizerV2(optimizer_v1.Optimizer):
             trainable=False,
             initializer=value,
             aggregation=tf_variables.VariableAggregation.ONLY_FIRST_REPLICA)
-        self._weights.append(self._hyper[name])
     self._prepared = True
 
   @property
@@ -456,7 +510,12 @@ class OptimizerV2(optimizer_v1.Optimizer):
     Returns:
         Python dictionary.
     """
-    return {"name": self._name}
+    config = {"name": self._name}
+    if hasattr(self, "clipnorm"):
+      config["clipnorm"] = self.clipnorm
+    if hasattr(self, "clipvalue"):
+      config["clipvalue"] = self.clipvalue
+    return config
 
   @classmethod
   def from_config(cls, config, custom_objects=None):
@@ -475,6 +534,8 @@ class OptimizerV2(optimizer_v1.Optimizer):
     Returns:
         An optimizer instance.
     """
+    if "lr" in config:
+      config["learning_rate"] = config.pop("lr")
     return cls(**config)
 
   def _serialize_hyperparameter(self, hyperparameter_name):
@@ -562,12 +623,178 @@ class OptimizerV2(optimizer_v1.Optimizer):
 
     return variable
 
+  def _assert_valid_dtypes(self, tensors):
+    """Asserts tensors are all valid types (see `_valid_dtypes`).
+
+    Args:
+      tensors: Tensors to check.
+
+    Raises:
+      ValueError: If any tensor is not a valid type.
+    """
+    valid_dtypes = self._valid_dtypes()
+    for t in tensors:
+      dtype = t.dtype.base_dtype
+      if dtype not in valid_dtypes:
+        raise ValueError("Invalid type %r for %s, expected: %s." %
+                         (dtype, t.name, [v for v in valid_dtypes]))
+
+  def _valid_dtypes(self):
+    """Valid types for loss, variables and gradients.
+
+    Subclasses should override to allow other float types.
+
+    Returns:
+      Valid types for loss, variables and gradients.
+    """
+    return set(
+        [dtypes.float16, dtypes.bfloat16, dtypes.float32, dtypes.float64])
+
+  def _call_if_callable(self, param):
+    """Call the function if param is callable."""
+    return param() if callable(param) else param
+
+  def _resource_apply_dense(self, grad, handle):
+    """Add ops to apply dense gradients to the variable `handle`.
+
+    Args:
+      grad: a `Tensor` representing the gradient.
+      handle: a `Tensor` of dtype `resource` which points to the variable to be
+        updated.
+
+    Returns:
+      An `Operation` which updates the value of the variable.
+    """
+    raise NotImplementedError()
+
+  def _resource_apply_sparse_duplicate_indices(self, grad, handle, indices):
+    """Add ops to apply sparse gradients to `handle`, with repeated indices.
+
+    Optimizers which override this method must deal with repeated indices. See
+    the docstring of `_apply_sparse_duplicate_indices` for details. By default
+    the correct behavior, to sum non-unique indices and their associated
+    gradients, is enforced by first pre-processing `grad` and `indices` and
+    passing them on to `_resource_apply_sparse`. Optimizers which deal correctly
+    with duplicate indices may instead override this method to avoid the
+    overhead of summing.
+
+    Args:
+      grad: a `Tensor` representing the gradient for the affected indices.
+      handle: a `Tensor` of dtype `resource` which points to the variable to be
+        updated.
+      indices: a `Tensor` of integral type representing the indices for which
+        the gradient is nonzero. Indices may be repeated.
+
+    Returns:
+      An `Operation` which updates the value of the variable.
+    """
+    summed_grad, unique_indices = _deduplicate_indexed_slices(
+        values=grad, indices=indices)
+    return self._resource_apply_sparse(summed_grad, handle, unique_indices)
+
+  def _resource_apply_sparse(self, grad, handle, indices):
+    """Add ops to apply sparse gradients to the variable `handle`.
+
+    Similar to `_apply_sparse`, the `indices` argument to this method has been
+    de-duplicated. Optimizers which deal correctly with non-unique indices may
+    instead override `_resource_apply_sparse_duplicate_indices` to avoid this
+    overhead.
+
+    Args:
+      grad: a `Tensor` representing the gradient for the affected indices.
+      handle: a `Tensor` of dtype `resource` which points to the variable to be
+        updated.
+      indices: a `Tensor` of integral type representing the indices for which
+        the gradient is nonzero. Indices are unique.
+
+    Returns:
+      An `Operation` which updates the value of the variable.
+    """
+    raise NotImplementedError()
+
+  # ---------------
+  # For implementing the checkpointable interface
+  # ---------------
+
+  def _restore_slot_variable(self, slot_name, variable, slot_variable):
+    """Restore a newly created slot variable's value."""
+    variable_key = _var_key(variable)
+    deferred_restorations = self._deferred_slot_restorations.get(
+        slot_name, {}).pop(variable_key, [])
+    # Iterate over restores, highest restore UID first to minimize the number
+    # of assignments.
+    deferred_restorations.sort(key=lambda position: position.restore_uid,
+                               reverse=True)
+    for checkpoint_position in deferred_restorations:
+      checkpoint_position.restore(slot_variable)
+
+  def _create_or_restore_slot_variable(
+      self, slot_variable_position, slot_name, variable):
+    """Restore a slot variable's value, possibly creating it.
+
+    Called when a variable which has an associated slot variable is created or
+    restored. When executing eagerly, we create the slot variable with a
+    restoring initializer.
+
+    No new variables are created when graph building. Instead,
+    _restore_slot_variable catches these after normal creation and adds restore
+    ops to the graph. This method is nonetheless important when graph building
+    for the case when a slot variable has already been created but `variable`
+    has just been added to a dependency graph (causing us to realize that the
+    slot variable needs to be restored).
+
+    Args:
+      slot_variable_position: A `checkpointable._CheckpointPosition` object
+        indicating the slot variable `Checkpointable` object to be restored.
+      slot_name: The name of this `Optimizer`'s slot to restore into.
+      variable: The variable object this slot is being created for.
+    """
+    variable_key = _var_key(variable)
+    slot_dict = self._slots.get(variable_key, {})
+    slot_variable = slot_dict.get(slot_name, None)
+    if (slot_variable is None and context.executing_eagerly() and
+        slot_variable_position.is_simple_variable()
+        # Defer slot variable creation if there is an active variable creator
+        # scope. Generally we'd like to eagerly create/restore slot variables
+        # when possible, but this may mean that scopes intended to catch
+        # `variable` also catch its eagerly created slot variable
+        # unintentionally (specifically make_template would add a dependency on
+        # a slot variable if not for this case). Deferring is mostly harmless
+        # (aside from double initialization), and makes variable creator scopes
+        # behave the same way they do when graph building.
+        and not ops.get_default_graph()._variable_creator_stack):  # pylint: disable=protected-access
+      initializer = checkpointable.CheckpointInitialValue(
+          checkpoint_position=slot_variable_position)
+      slot_variable = self.add_slot(
+          var=variable,
+          initializer=initializer,
+          slot_name=slot_name)
+      # Slot variables are not owned by any one object (because we don't want to
+      # save the slot variable if the optimizer is saved without the non-slot
+      # variable, or if the non-slot variable is saved without the optimizer;
+      # it's a dependency hypergraph with edges of the form (optimizer, non-slot
+      # variable, variable)). So we don't _track_ slot variables anywhere, and
+      # instead special-case this dependency and otherwise pretend it's a normal
+      # graph.
+    if slot_variable is not None:
+      # If we've either made this slot variable, or if we've pulled out an
+      # existing slot variable, we should restore it.
+      slot_variable_position.restore(slot_variable)
+    else:
+      # We didn't make the slot variable. Defer restoring until it gets created
+      # normally. We keep a list rather than the one with the highest restore
+      # UID in case slot variables have their own dependencies, in which case
+      # those could differ between restores.
+      self._deferred_slot_restorations.setdefault(
+          slot_name, {}).setdefault(variable_key, []).append(
+              slot_variable_position)
+
 
 def _filter_grads(grads_and_vars):
   """Filter out iterable with grad equal to None."""
   grads_and_vars = tuple(grads_and_vars)
   if not grads_and_vars:
-    raise ValueError("No variables provided.")
+    return grads_and_vars
   filtered = []
   vars_with_empty_grads = []
   for grad, var in grads_and_vars:
@@ -597,7 +824,7 @@ def merge_update_step(update_ops, local_step):
       incre_op = local_step.assign_add(1).op
     return incre_op
 
-  return distribution_strategy_context.get_replica_context().merge_call(
+  return distribute_ctx.get_replica_context().merge_call(
       merge_update_step_fn, args=(update_ops, local_step))
 
 
@@ -605,11 +832,11 @@ def merge_grads(grads_and_vars):
   """Merge gradients from different replicas."""
 
   def merge_grad_fn(strategy, grads_and_vars):
-    reduced_grads = strategy.batch_reduce(
-        ds_reduce_util.ReduceOp.MEAN, grads_and_vars)
+    reduced_grads = strategy.batch_reduce(ds_reduce_util.ReduceOp.SUM,
+                                          grads_and_vars)
     return reduced_grads
 
-  return distribution_strategy_context.get_replica_context().merge_call(
+  return distribute_ctx.get_replica_context().merge_call(
       merge_grad_fn, args=(grads_and_vars,))
 
 
@@ -628,7 +855,7 @@ def _var_key(var):
   """
 
   # pylint: disable=protected-access
-  if distribution_strategy_context.has_distribution_strategy() and hasattr(
+  if distribute_ctx.has_distribution_strategy() and hasattr(
       var, "_primary_var"):
     var = var._primary_var
   if hasattr(var, "op"):
diff --git a/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py b/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py
index 305267d73e5d852778333e827d0b1ed089502d02..8b2865e2aae7c4e5d148a87fd58e2a2b169a40ac 100644
--- a/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py
+++ b/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py
@@ -18,8 +18,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
+import tempfile
+
+from absl.testing import parameterized
 import numpy as np
 
+from tensorflow.python import keras
 from tensorflow.python.eager import context
 from tensorflow.python.eager import def_function
 from tensorflow.python.eager import function
@@ -29,19 +34,24 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras import backend
 from tensorflow.python.keras import callbacks
+from tensorflow.python.keras import optimizers
+from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.engine import input_layer
+from tensorflow.python.keras.engine import saving
 from tensorflow.python.keras.engine import sequential
 from tensorflow.python.keras.engine import training
 from tensorflow.python.keras.layers import core
 from tensorflow.python.keras.optimizer_v2 import adam
 from tensorflow.python.keras.optimizer_v2 import gradient_descent
+from tensorflow.python.keras.optimizer_v2 import optimizer_v2
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
-from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables
+from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
+from tensorflow.python.training import momentum
 
 
 class OptimizerTest(test.TestCase):
@@ -53,8 +63,6 @@ class OptimizerTest(test.TestCase):
         var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
         var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
         loss = lambda: 5 * var0 + 3 * var1  # pylint: disable=cell-var-from-loop
-        if not context.executing_eagerly():
-          loss = loss()
         sgd = gradient_descent.SGD(3.0)
 
         self.evaluate(variables.global_variables_initializer())
@@ -105,33 +113,6 @@ class OptimizerTest(test.TestCase):
       # var1 = [0., 1.] - 0.5 * [3, 3]
       self.assertAllClose([-1.5, -0.5], self.evaluate(var1))
 
-  @test_util.run_in_graph_and_eager_modes
-  def testAggregationMethod(self):
-    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.cached_session():
-        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
-        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
-        loss = lambda: 5 * var0 + 3 * var1  # pylint: disable=cell-var-from-loop
-        if not context.executing_eagerly():
-          loss = loss()
-        sgd = gradient_descent.SGD(3.0)
-
-        self.evaluate(variables.global_variables_initializer())
-        # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
-        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
-        # Run 1 step of sgd through optimizer
-        opt_op = sgd.minimize(
-            loss,
-            var_list=[var0, var1],
-            aggregation_method=gradients_impl.AggregationMethod
-            .EXPERIMENTAL_ACCUMULATE_N)
-        self.evaluate(variables.global_variables_initializer())
-        self.evaluate(opt_op)
-        # Validate updated params
-        self.assertAllClose([-14., -13.], self.evaluate(var0))
-        self.assertAllClose([-6., -5.], self.evaluate(var1))
-
   @test_util.run_in_graph_and_eager_modes
   def testPrecomputedGradient(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
@@ -139,8 +120,6 @@ class OptimizerTest(test.TestCase):
         var0 = variables.Variable([1.0, 2.0], dtype=dtype)
         var1 = variables.Variable([3.0, 4.0], dtype=dtype)
         loss = lambda: 5 * var0 + 3 * var1  # pylint: disable=cell-var-from-loop
-        if not context.executing_eagerly():
-          loss = loss()
         grad_loss = constant_op.constant([42, -42], dtype=dtype)
         sgd = gradient_descent.SGD(3.0)
 
@@ -165,8 +144,6 @@ class OptimizerTest(test.TestCase):
         var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
         var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
         loss = lambda: 5 * var0  # pylint: disable=cell-var-from-loop
-        if not context.executing_eagerly():
-          loss = loss()
         sgd_op = gradient_descent.SGD(3.0)
         with self.assertRaisesRegexp(ValueError, 'No gradients'):
           # var1 has no gradient
@@ -179,8 +156,6 @@ class OptimizerTest(test.TestCase):
         var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
         var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
         loss = lambda: constant_op.constant(5.0)
-        if not context.executing_eagerly():
-          loss = loss()
 
         sgd_op = gradient_descent.SGD(3.0)
         with self.assertRaisesRegexp(ValueError,
@@ -205,11 +180,9 @@ class OptimizerTest(test.TestCase):
         var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
         var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
         loss = lambda: 5 * var0 + 3 * var1  # pylint: disable=cell-var-from-loop
-        if not context.executing_eagerly():
-          loss = loss()
 
         sgd = gradient_descent.SGD(3.0)
-        grads_and_vars = sgd.compute_gradients(loss, [var0, var1])
+        grads_and_vars = sgd._compute_gradients(loss, [var0, var1])
         # Convert gradients to tf.Variables
         converted_grads = [
             resource_variable_ops.ResourceVariable(
@@ -248,7 +221,7 @@ class OptimizerTest(test.TestCase):
         return x * x
 
       sgd = gradient_descent.SGD(3.0)
-      grads_and_vars = sgd.compute_gradients(f, [x])
+      grads_and_vars = sgd._compute_gradients(f, [x])
       self.assertEqual(1, len(grads_and_vars))
       grad, x_as_var = grads_and_vars[0]
       self.assertIs(x, x_as_var)
@@ -267,8 +240,6 @@ class OptimizerTest(test.TestCase):
       var1 = variables.Variable([3.0, 4.0],
                                 constraint=constraint_0)
       loss = lambda: 5 * var0 + 3 * var1
-      if not context.executing_eagerly():  # pylint: disable=cell-var-from-loop
-        loss = loss()
       sgd = gradient_descent.SGD(3.0)
 
       self.evaluate(variables.global_variables_initializer())
@@ -327,6 +298,28 @@ class OptimizerTest(test.TestCase):
           self.evaluate(opt._get_hyper('learning_rate')),
           opt3._get_hyper('learning_rate'))
 
+  @test_util.run_in_graph_and_eager_modes
+  def testGradClipValue(self):
+    with self.cached_session():
+      var = resource_variable_ops.ResourceVariable([1.0, 2.0])
+      loss = lambda: 3 * var
+      opt = gradient_descent.SGD(learning_rate=1.0, clipvalue=1.0)
+      opt_op = opt.minimize(loss, [var])
+      self.evaluate(variables.global_variables_initializer())
+      self.evaluate(opt_op)
+      self.assertAllClose([0., 1.], self.evaluate(var))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testGradClipNorm(self):
+    with self.cached_session():
+      var = resource_variable_ops.ResourceVariable([1.0])
+      loss = lambda: 3 * var
+      opt = gradient_descent.SGD(learning_rate=1.0, clipnorm=1.0)
+      opt_op = opt.minimize(loss, [var])
+      self.evaluate(variables.global_variables_initializer())
+      self.evaluate(opt_op)
+      self.assertAllClose([0.], self.evaluate(var))
+
   @test_util.run_in_graph_and_eager_modes
   def testWeights(self):
     with self.cached_session():
@@ -476,6 +469,152 @@ class OptimizerTest(test.TestCase):
         float(backend.get_value(model.optimizer.lr)), 0.01, atol=1e-4)
 
 
+class OptimizersCompatibilityTest(test.TestCase, parameterized.TestCase):
+
+  # TODO(tanzheny): remove test_numeric after algorithm for Momentum, Adam and
+  # NAdam has been unified: currently these three algorithms behave differently.
+  @parameterized.named_parameters(
+      ('adadelta', 'adadelta', True, True), ('adagrad', 'adagrad', True, True),
+      ('adam', 'adam', True, True), ('adamax', 'adamax', True, True),
+      ('nadam', 'nadam', True, False), ('momentum', 'momentum', True, True),
+      ('sgd', 'sgd', False, True))
+  def testOptimizersCompatibility(self, opt_str, test_weights, test_numeric):
+    np.random.seed(1331)
+    with self.cached_session():
+      train_samples = 20
+      input_dim = 3
+      num_classes = 2
+      (x, y), _ = testing_utils.get_test_data(
+          train_samples=train_samples,
+          test_samples=10,
+          input_shape=(input_dim,),
+          num_classes=num_classes)
+      y = keras.utils.to_categorical(y)
+
+      num_hidden = 5
+      model = testing_utils.get_small_sequential_mlp(
+          num_hidden=num_hidden, num_classes=num_classes, input_dim=input_dim)
+
+      old_mode = os.environ.get('TF2_BEHAVIOR', None)
+      # Disable tf2 to create V1 optimizer.
+      disable_tf2()
+      if opt_str == 'momentum':
+        opt_v1 = optimizers.SGD(momentum=0.9)
+      else:
+        opt_v1 = optimizers.get(opt_str)
+
+      # Test compile and fit with v1 optimizer.
+      model.compile(opt_v1, loss='categorical_crossentropy', metrics=[])
+      model.fit(x, y, batch_size=5, epochs=1)
+      model_dir = tempfile.mkdtemp()
+      gfile.MakeDirs(model_dir)
+      file_name = os.path.join(model_dir, 'model.h5')
+      model.save(file_name)
+
+      enable_tf2()
+      # Test load and fit with v2 optimizer.
+      model_2 = saving.load_model(file_name)
+      opt_v2 = model_2.optimizer
+      self.assertIsInstance(opt_v2, optimizer_v2.OptimizerV2)
+      # set_weights is called inside load_model but exception is swallowed,
+      # this call checks the weights can be set correctly.
+      if test_weights:
+        opt_v2.set_weights(opt_v1.get_weights())
+      if test_numeric:
+        hist_1 = model.fit(x, y, batch_size=5, epochs=1, shuffle=False)
+        hist_2 = model_2.fit(x, y, batch_size=5, epochs=1, shuffle=False)
+        self.assertAllClose(model.get_weights(), model_2.get_weights())
+        self.assertAllClose(model.get_weights(), model_2.get_weights())
+        self.assertAllClose(hist_1.history['loss'], hist_2.history['loss'])
+
+      if old_mode is not None:
+        os.environ['TF2_BEHAVIOR'] = old_mode
+
+  def testNumericEquivalenceForNesterovMomentum(self):
+    np.random.seed(1331)
+    with self.cached_session():
+      train_samples = 20
+      input_dim = 3
+      num_classes = 2
+      (x, y), _ = testing_utils.get_test_data(
+          train_samples=train_samples,
+          test_samples=10,
+          input_shape=(input_dim,),
+          num_classes=num_classes)
+      y = keras.utils.to_categorical(y)
+
+      num_hidden = 5
+      model_k_v1 = testing_utils.get_small_sequential_mlp(
+          num_hidden=num_hidden, num_classes=num_classes, input_dim=input_dim)
+      model_k_v2 = testing_utils.get_small_sequential_mlp(
+          num_hidden=num_hidden, num_classes=num_classes, input_dim=input_dim)
+      model_k_v2.set_weights(model_k_v1.get_weights())
+      model_tf = testing_utils.get_small_sequential_mlp(
+          num_hidden=num_hidden, num_classes=num_classes, input_dim=input_dim)
+      model_tf.set_weights(model_k_v2.get_weights())
+
+      opt_k_v1 = optimizers.SGD(lr=0.001, momentum=0.9, nesterov=True)
+      opt_k_v2 = gradient_descent.SGD(momentum=0.9, nesterov=True)
+      opt_tf = momentum.MomentumOptimizer(
+          learning_rate=0.001, momentum=0.9, use_nesterov=True)
+
+      model_k_v1.compile(opt_k_v1, loss='categorical_crossentropy', metrics=[])
+      model_k_v2.compile(opt_k_v2, loss='categorical_crossentropy', metrics=[])
+      model_tf.compile(opt_tf, loss='categorical_crossentropy', metrics=[])
+
+      hist_k_v1 = model_k_v1.fit(x, y, batch_size=5, epochs=10, shuffle=False)
+      hist_k_v2 = model_k_v2.fit(x, y, batch_size=5, epochs=10, shuffle=False)
+      hist_tf = model_tf.fit(x, y, batch_size=5, epochs=10, shuffle=False)
+
+      self.assertAllClose(model_k_v1.get_weights(), model_tf.get_weights())
+      self.assertAllClose(model_k_v1.get_weights(), model_k_v2.get_weights())
+      self.assertAllClose(opt_k_v1.get_weights(), opt_k_v2.get_weights())
+      self.assertAllClose(hist_k_v1.history['loss'], hist_tf.history['loss'])
+      self.assertAllClose(hist_k_v1.history['loss'], hist_k_v2.history['loss'])
+
+  def testNumericEquivalenceForAmsgrad(self):
+    np.random.seed(1331)
+    with self.cached_session():
+      train_samples = 20
+      input_dim = 3
+      num_classes = 2
+      (x, y), _ = testing_utils.get_test_data(
+          train_samples=train_samples,
+          test_samples=10,
+          input_shape=(input_dim,),
+          num_classes=num_classes)
+      y = keras.utils.to_categorical(y)
+
+      num_hidden = 5
+      model_k_v1 = testing_utils.get_small_sequential_mlp(
+          num_hidden=num_hidden, num_classes=num_classes, input_dim=input_dim)
+      model_k_v2 = testing_utils.get_small_sequential_mlp(
+          num_hidden=num_hidden, num_classes=num_classes, input_dim=input_dim)
+      model_k_v2.set_weights(model_k_v1.get_weights())
+
+      opt_k_v1 = optimizers.Adam(amsgrad=True)
+      opt_k_v2 = adam.Adam(amsgrad=True)
+
+      model_k_v1.compile(opt_k_v1, loss='categorical_crossentropy', metrics=[])
+      model_k_v2.compile(opt_k_v2, loss='categorical_crossentropy', metrics=[])
+
+      hist_k_v1 = model_k_v1.fit(x, y, batch_size=5, epochs=10, shuffle=False)
+      hist_k_v2 = model_k_v2.fit(x, y, batch_size=5, epochs=10, shuffle=False)
+
+      self.assertAllClose(model_k_v1.get_weights(), model_k_v2.get_weights())
+      self.assertAllClose(opt_k_v1.get_weights(), opt_k_v2.get_weights())
+      self.assertAllClose(hist_k_v1.history['loss'], hist_k_v2.history['loss'])
+
+
+def disable_tf2():
+  if 'TF2_BEHAVIOR' in os.environ:
+    del os.environ['TF2_BEHAVIOR']
+
+
+def enable_tf2():
+  os.environ['TF2_BEHAVIOR'] = 'enabled'
+
+
 # Note: These tests are kept in a separate class to avoid bugs in some
 # distributions of Python that break AutoGraph which is used by tf.function.
 class OptimizerWithFunctionTest(test.TestCase):
diff --git a/tensorflow/python/keras/optimizer_v2/rmsprop.py b/tensorflow/python/keras/optimizer_v2/rmsprop.py
index 6a5b334fc46f6ae76f48cce29bc119cdc8f0eaf2..dbb5a37fd80c783a4f4f968488cc55919dc424a2 100644
--- a/tensorflow/python/keras/optimizer_v2/rmsprop.py
+++ b/tensorflow/python/keras/optimizer_v2/rmsprop.py
@@ -20,8 +20,10 @@ from __future__ import print_function
 from tensorflow.python.framework import ops
 from tensorflow.python.keras.optimizer_v2 import optimizer_v2
 from tensorflow.python.training import training_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("keras.optimizers.RMSprop", v1=[])
 class RMSprop(optimizer_v2.OptimizerV2):
   r"""Optimizer that implements the RMSprop algorithm.
 
@@ -91,7 +93,7 @@ class RMSprop(optimizer_v2.OptimizerV2):
       **kwargs: keyword arguments. Allowed to be {`decay`}
     """
     super(RMSprop, self).__init__(name, **kwargs)
-    self._set_hyper("learning_rate", learning_rate)
+    self._set_hyper("learning_rate", kwargs.get("lr", learning_rate))
     self._set_hyper("decay", self._initial_decay)
     self._set_hyper("rho", rho)
 
@@ -103,13 +105,13 @@ class RMSprop(optimizer_v2.OptimizerV2):
     self._set_hyper("momentum", momentum)
 
     self._set_hyper("epsilon", epsilon)
-    self._centered = centered
+    self.centered = centered
 
   def _create_slots(self, var_list):
     for var in var_list:
       self.add_slot(var, "rms")
       self.add_slot(var, "momentum")
-      if self._centered:
+      if self.centered:
         self.add_slot(var, "mg")
 
   def _resource_apply_dense(self, grad, var):
@@ -120,7 +122,7 @@ class RMSprop(optimizer_v2.OptimizerV2):
     rho = self._get_hyper("rho", var_dtype)
     momentum = self._get_hyper("momentum", var_dtype)
     epsilon = self._get_hyper("epsilon", var_dtype)
-    if self._centered:
+    if self.centered:
       mg = self.get_slot(var, "mg")
       return training_ops.resource_apply_centered_rms_prop(
           var.handle,
@@ -153,7 +155,7 @@ class RMSprop(optimizer_v2.OptimizerV2):
     rho = self._get_hyper("rho", var_dtype)
     momentum = self._get_hyper("momentum", var_dtype)
     epsilon = self._get_hyper("epsilon", var_dtype)
-    if self._centered:
+    if self.centered:
       mg = self.get_slot(var, "mg")
       return training_ops.resource_sparse_apply_centered_rms_prop(
           var.handle,
@@ -188,7 +190,7 @@ class RMSprop(optimizer_v2.OptimizerV2):
         "rho": self._serialize_hyperparameter("rho"),
         "momentum": self._serialize_hyperparameter("momentum"),
         "epsilon": self._serialize_hyperparameter("epsilon"),
-        "centered": self._centered,
+        "centered": self.centered,
     })
     return config
 
diff --git a/tensorflow/python/keras/optimizer_v2/rmsprop_test.py b/tensorflow/python/keras/optimizer_v2/rmsprop_test.py
index a8658a8550760a04c6031e26721038b88fad0ebd..4d61cfbbc52789db172445f9286fdb848c0a7bc6 100644
--- a/tensorflow/python/keras/optimizer_v2/rmsprop_test.py
+++ b/tensorflow/python/keras/optimizer_v2/rmsprop_test.py
@@ -233,8 +233,11 @@ class RMSpropOptimizerTest(test.TestCase):
       with self.cached_session():
         var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
         x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
-        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
-        loss = pred * pred
+
+        def loss():
+          pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)  # pylint: disable=cell-var-from-loop
+          return pred * pred
+
         sgd_op = rmsprop.RMSprop(
             learning_rate=1.0,
             rho=0.0,
@@ -258,8 +261,12 @@ class RMSpropOptimizerTest(test.TestCase):
       with self.cached_session():
         var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
         x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
-        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
-        loss = pred * pred
+
+        def loss():
+          pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)  # pylint: disable=cell-var-from-loop
+          return pred * pred
+
+        # loss = lambda: pred * pred  # pylint: disable=cell-var-from-loop
         sgd_op = rmsprop.RMSprop(
             learning_rate=1.0,
             rho=0.0,
@@ -405,6 +412,14 @@ class RMSpropOptimizerTest(test.TestCase):
                 (0.01 * 2.0 / math.sqrt(0.00001 * 0.9 + 1e-5 + 1.0))
             ]), self.evaluate(var1))
 
+  def testConstructRMSpropWithLR(self):
+    opt = rmsprop.RMSprop(lr=1.0)
+    self.assertEqual(opt.lr, 1.0)
+    opt_2 = rmsprop.RMSprop(learning_rate=0.1, lr=1.0)
+    self.assertEqual(opt_2.lr, 1.0)
+    opt_3 = rmsprop.RMSprop(learning_rate=0.1)
+    self.assertEqual(opt_3.lr, 0.1)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/keras/optimizers.py b/tensorflow/python/keras/optimizers.py
index 9c8020dc05abbd86bcaae01dc87b47ed0bd610d6..dda603fa2ec40c2178d0399e97786b4db001c1a5 100644
--- a/tensorflow/python/keras/optimizers.py
+++ b/tensorflow/python/keras/optimizers.py
@@ -23,6 +23,8 @@ import six
 from six.moves import zip  # pylint: disable=redefined-builtin
 
 from tensorflow.python import tf2
+from tensorflow.python.distribute import distribution_strategy_context
+from tensorflow.python.framework import ops
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras.optimizer_v2 import adadelta as adadelta_v2
 from tensorflow.python.keras.optimizer_v2 import adagrad as adagrad_v2
@@ -37,14 +39,13 @@ from tensorflow.python.keras.utils.generic_utils import serialize_keras_object
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
-from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.training import optimizer as tf_optimizer_module
 from tensorflow.python.training import training_util
 from tensorflow.python.training.checkpointable import base as checkpointable
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('keras.optimizers.Optimizer')
+@tf_export(v1=['keras.optimizers.Optimizer'])
 class Optimizer(object):
   """Abstract optimizer base class.
 
@@ -158,7 +159,7 @@ class Optimizer(object):
     return cls(**config)
 
 
-@tf_export('keras.optimizers.SGD')
+@tf_export(v1=['keras.optimizers.SGD'])
 class SGD(Optimizer):
   """Stochastic gradient descent optimizer.
 
@@ -223,7 +224,7 @@ class SGD(Optimizer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@tf_export('keras.optimizers.RMSprop')
+@tf_export(v1=['keras.optimizers.RMSprop'])
 class RMSprop(Optimizer):
   """RMSProp optimizer.
 
@@ -290,7 +291,7 @@ class RMSprop(Optimizer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@tf_export('keras.optimizers.Adagrad')
+@tf_export(v1=['keras.optimizers.Adagrad'])
 class Adagrad(Optimizer):
   """Adagrad optimizer.
 
@@ -357,7 +358,7 @@ class Adagrad(Optimizer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@tf_export('keras.optimizers.Adadelta')
+@tf_export(v1=['keras.optimizers.Adadelta'])
 class Adadelta(Optimizer):
   """Adadelta optimizer.
 
@@ -441,7 +442,7 @@ class Adadelta(Optimizer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@tf_export('keras.optimizers.Adam')
+@tf_export(v1=['keras.optimizers.Adam'])
 class Adam(Optimizer):
   """Adam optimizer.
 
@@ -482,7 +483,7 @@ class Adam(Optimizer):
 
   def get_updates(self, loss, params):
     grads = self.get_gradients(loss, params)
-    self.updates = [state_ops.assign_add(self.iterations, 1)]
+    self.updates = []
 
     lr = self.lr
     if self.initial_decay > 0:
@@ -490,7 +491,8 @@ class Adam(Optimizer):
           1. / (1. + self.decay * math_ops.cast(self.iterations,
                                                 K.dtype(self.decay))))
 
-    t = math_ops.cast(self.iterations, K.floatx()) + 1
+    with ops.control_dependencies([state_ops.assign_add(self.iterations, 1)]):
+      t = math_ops.cast(self.iterations, K.floatx())
     lr_t = lr * (
         K.sqrt(1. - math_ops.pow(self.beta_2, t)) /
         (1. - math_ops.pow(self.beta_1, t)))
@@ -537,7 +539,7 @@ class Adam(Optimizer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@tf_export('keras.optimizers.Adamax')
+@tf_export(v1=['keras.optimizers.Adamax'])
 class Adamax(Optimizer):
   """Adamax optimizer from Adam paper's Section 7.
 
@@ -825,6 +827,7 @@ def deserialize(config, custom_objects=None):
         'sgd': SGD,
         'tfoptimizer': TFOptimizer
     }
+
   # Make deserialization case-insensitive for built-in optimizers.
   if config['class_name'].lower() in all_classes:
     config['class_name'] = config['class_name'].lower()
diff --git a/tensorflow/python/keras/optimizers_test.py b/tensorflow/python/keras/optimizers_test.py
index d3cacb702c9e60f59b8484f66ee177febf711b56..18a20567ce9db90725a1cb05c34ae6baeacbcd7c 100644
--- a/tensorflow/python/keras/optimizers_test.py
+++ b/tensorflow/python/keras/optimizers_test.py
@@ -65,6 +65,15 @@ def _test_optimizer(optimizer, target=0.75):
   optim = keras.optimizers.deserialize(config)
   new_config = keras.optimizers.serialize(optim)
   new_config['class_name'] = new_config['class_name'].lower()
+  new_config['config'].pop('name', None)
+  if 'amsgrad' not in config['config']:
+    new_config['config'].pop('amsgrad', None)
+  if 'decay' in new_config['config'] and 'schedule_decay' in config['config']:
+    new_config['config']['schedule_decay'] = new_config['config'].pop('decay')
+  if 'momentum' not in config['config']:
+    new_config['config'].pop('momentum', None)
+  if 'centered' not in config['config']:
+    new_config['config'].pop('centered', None)
   assert config == new_config
 
   # Test constraints.
@@ -91,26 +100,22 @@ def _test_optimizer(optimizer, target=0.75):
 
 class KerasOptimizersTest(test.TestCase):
 
-  @test_util.run_deprecated_v1
   def test_sgd(self):
     with self.cached_session():
       _test_optimizer(keras.optimizers.SGD(lr=0.01,
                                            momentum=0.9,
                                            nesterov=True))
 
-  @test_util.run_deprecated_v1
   def test_rmsprop(self):
     with self.cached_session():
       _test_optimizer(keras.optimizers.RMSprop())
       _test_optimizer(keras.optimizers.RMSprop(decay=1e-3))
 
-  @test_util.run_deprecated_v1
   def test_adagrad(self):
     with self.cached_session():
       _test_optimizer(keras.optimizers.Adagrad())
       _test_optimizer(keras.optimizers.Adagrad(decay=1e-3))
 
-  @test_util.run_deprecated_v1
   def test_adadelta(self):
     with self.cached_session():
       _test_optimizer(keras.optimizers.Adadelta(), target=0.6)
@@ -119,32 +124,29 @@ class KerasOptimizersTest(test.TestCase):
       # the accuracy.
       _test_optimizer(keras.optimizers.Adadelta(decay=1e-3), target=0.4)
 
-  @test_util.run_deprecated_v1
   def test_adam(self):
     with self.cached_session():
       _test_optimizer(keras.optimizers.Adam())
-      _test_optimizer(keras.optimizers.Adam(decay=1e-3))
+      # Accuracy seems dependent on the seed initialization.
+      # TODO(b/121051441): fix test flakiness.
+      _test_optimizer(keras.optimizers.Adam(decay=1e-3), target=0.73)
       _test_optimizer(keras.optimizers.Adam(amsgrad=True))
 
-  @test_util.run_deprecated_v1
   def test_adamax(self):
     with self.cached_session():
       _test_optimizer(keras.optimizers.Adamax())
       _test_optimizer(keras.optimizers.Adamax(decay=1e-3))
 
-  @test_util.run_deprecated_v1
   def test_nadam(self):
     with self.cached_session():
       _test_optimizer(keras.optimizers.Nadam())
 
-  @test_util.run_deprecated_v1
   def test_clipnorm(self):
     with self.cached_session():
       _test_optimizer(keras.optimizers.SGD(lr=0.01,
                                            momentum=0.9,
                                            clipnorm=0.5))
 
-  @test_util.run_deprecated_v1
   def test_clipvalue(self):
     with self.cached_session():
       _test_optimizer(keras.optimizers.SGD(lr=0.01,
diff --git a/tensorflow/python/keras/saving/BUILD b/tensorflow/python/keras/saving/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..1ab7aca58eba4fe12b507b2cc53fd1892f62d6ee
--- /dev/null
+++ b/tensorflow/python/keras/saving/BUILD
@@ -0,0 +1,69 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Description:
+#   Keras saving and loading libraries.
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+package(default_visibility = ["//tensorflow:__subpackages__"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+py_library(
+    name = "saving",
+    srcs = ["__init__.py"],
+    deps = [":saved_model"],
+)
+
+py_library(
+    name = "saved_model",
+    srcs = ["saved_model.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:lib",
+        "//tensorflow/python:metrics",
+        "//tensorflow/python:mode_keys",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:saver",
+        "//tensorflow/python:util",
+        "//tensorflow/python/keras:engine",
+        "//tensorflow/python/saved_model",
+        "//tensorflow/python/saved_model/model_utils",
+    ],
+)
+
+py_test(
+    name = "saved_model_test",
+    size = "medium",
+    srcs = ["saved_model_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",  # TODO(b/119349471): Re-enable
+        "no_windows",
+    ],
+    deps = [
+        ":saved_model",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:mode_keys",
+        "//tensorflow/python/keras",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
diff --git a/tensorflow/python/keras/saving/__init__.py b/tensorflow/python/keras/saving/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ff9f3b74e8ff253506cde18e60a01bbc9fac3ff
--- /dev/null
+++ b/tensorflow/python/keras/saving/__init__.py
@@ -0,0 +1,21 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utils for saving a Keras Model or Estimator to the SavedModel format."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.keras.saving.saved_model import export
+from tensorflow.python.keras.saving.saved_model import load_from_saved_model
diff --git a/tensorflow/python/keras/saving/saved_model.py b/tensorflow/python/keras/saving/saved_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b83f321c222a325c2dec5da295e0e5789e28c32
--- /dev/null
+++ b/tensorflow/python/keras/saving/saved_model.py
@@ -0,0 +1,418 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=protected-access
+"""Utility functions to save/load keras Model to/from SavedModel."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import six
+
+from tensorflow.python.client import session
+from tensorflow.python.framework import ops
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras import models as models_lib
+from tensorflow.python.keras import optimizers
+from tensorflow.python.keras.engine import sequential
+from tensorflow.python.keras.engine import training_utils
+from tensorflow.python.keras.metrics import Metric
+from tensorflow.python.keras.models import model_from_json
+from tensorflow.python.keras.utils import metrics_utils
+from tensorflow.python.lib.io import file_io
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.saved_model import builder as saved_model_builder
+from tensorflow.python.saved_model import constants
+from tensorflow.python.saved_model import model_utils
+from tensorflow.python.saved_model import save as save_lib
+from tensorflow.python.saved_model import utils_impl as saved_model_utils
+from tensorflow.python.training import mode_keys
+from tensorflow.python.training import saver as saver_lib
+from tensorflow.python.training.checkpointable import util as checkpointable_utils
+from tensorflow.python.util import compat
+from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export('keras.experimental.export')
+def export(
+    model, saved_model_path, custom_objects=None, as_text=None,
+    input_signature=None, serving_only=False):
+  """Saves a `tf.keras.Model` into Tensorflow SavedModel format.
+
+  `save_model` generates new files/folders under the `saved_model_path` folder:
+  1) a checkpoint containing the model weights.
+  2) a saved_model.pb file containing the model's MetaGraphs. The prediction
+     graph is always exported. The evaluaton and training graphs are exported
+     if the following conditions are met:
+     - Evaluation: model loss is defined.
+     - Training: model is compiled with an optimizer defined under `tf.train`.
+       This is because `tf.keras.optimizers.Optimizer` instances cannot be
+       saved to checkpoints.
+  3) Model's json configuration, if model.get_config() has been implemented.
+     This file can be used to reload the model using
+     tf.keras.models.model_from_json(). Note that if any custom objects were
+     used, they should be passed to the `custom_object` argument when loading
+     the model.
+
+  Model limitations:
+  - Sequential and functional models can always be saved.
+  - Subclassed models can only be saved when `serving_only=True`. This is due to
+    the current implementation copying the model in order to export the training
+    and evaluation graphs. Because the topology of subclassed models cannot be
+    determined, the subclassed models cannot be cloned. Subclassed models will
+    be entirely exportable in the future.
+
+  Note that each mode is exported in separate graphs, so different modes do not
+  share variables. To use the train graph with evaluation or prediction graphs,
+  create a new checkpoint if variable values have been updated.
+
+  Example:
+
+  ```python
+  import tensorflow as tf
+
+  # Create a tf.keras model.
+  model = tf.keras.Sequential()
+  model.add(tf.keras.layers.Dense(1, input_shape=[10]))
+  model.summary()
+
+  # Save the tf.keras model in the SavedModel format.
+  saved_to_path = tf.keras.experimental.export(
+        model, '/tmp/my_simple_tf_keras_saved_model')
+
+  # Load the saved keras model back.
+  model_prime = tf.keras.experimental.load_from_saved_model(saved_to_path)
+  model_prime.summary()
+  ```
+
+  Args:
+    model: A `tf.keras.Model` to be saved. If the model is subclassed, the flag
+      `serving_only` must be set to True.
+    saved_model_path: a string specifying the path to the SavedModel directory.
+      The SavedModel will be saved to a timestamped folder created within this
+      directory.
+    custom_objects: Optional dictionary mapping string names to custom classes
+      or functions (e.g. custom loss functions).
+    as_text: whether to write the `SavedModel` proto in text format. Currently
+      unavailable in serving-only mode.
+    input_signature: A possibly nested sequence of `tf.TensorSpec` objects, used
+      to specify the expected model inputs. `input_signature`'s nested structure
+      should match the expected nested structure of the inputs to the model. If
+      this is not set, this function will attempt to infer the input shapes and
+      dtypes from the model. Note that if the model is subclassed, the tensor
+      inputs to the call function should be nested in the first argument (this
+      is a general requirement for using subclassed models with Keras functions
+      .fit(), .predict(), etc.).
+    serving_only: Export only the outputs produced from calling the model in
+      predict mode. The losses, optimizer, and other training configurations are
+      not saved. If the SavedModel will only be used for serving (rather than
+      retraining), or if the model is subclassed, this can be set to True.
+
+  Returns:
+    String path to the SavedModel folder, a subdirectory of `saved_model_path`.
+
+  Raises:
+    NotImplementedError: If the model is a subclassed model, and serving_only is
+      False.
+    ValueError: If the input signature cannot be inferred from the model.
+  """
+  export_dir = model_utils.get_timestamped_export_dir(saved_model_path)
+
+  if serving_only:
+    save_lib.save(
+        model, export_dir,
+        signatures=training_utils.trace_model_call(model, input_signature))
+  else:
+    _save_v1_format(model, export_dir, custom_objects, as_text, input_signature)
+
+  try:
+    _export_model_json(model, export_dir)
+  except NotImplementedError:
+    logging.warning('Skipped saving model JSON, subclassed model does not have '
+                    'get_config() defined.')
+
+  return export_dir
+
+
+def _export_model_json(model, saved_model_path):
+  """Saves model configuration as a json string under assets folder."""
+  model_json = model.to_json()
+  model_json_filepath = os.path.join(
+      saved_model_utils.get_or_create_assets_dir(saved_model_path),
+      compat.as_text(constants.SAVED_MODEL_FILENAME_JSON))
+  file_io.write_string_to_file(model_json_filepath, model_json)
+
+
+def _export_model_variables(model, saved_model_path):
+  """Saves model weights in checkpoint format under variables folder."""
+  saved_model_utils.get_or_create_variables_dir(saved_model_path)
+  checkpoint_prefix = saved_model_utils.get_variables_path(saved_model_path)
+  model.save_weights(checkpoint_prefix, save_format='tf', overwrite=True)
+  return checkpoint_prefix
+
+
+def _save_v1_format(model, path, custom_objects, as_text, input_signature):
+  """Exports model to v1 SavedModel format."""
+  if not model._is_graph_network:
+    if isinstance(model, sequential.Sequential):
+      # If input shape is not directly set in the model, the exported model
+      # will infer the expected shapes of the input from the model.
+      if not model.built and input_signature is None:
+        raise ValueError(
+            'Sequential model\'s input shape is unknown. Please build the '
+            'model, or use the input_signature argument to specify the '
+            'model inputs.')
+    else:
+      raise NotImplementedError(
+          'Subclassed models can only be exported for serving. Please set '
+          'argument serving_only=True.')
+
+  builder = saved_model_builder._SavedModelBuilder(path)
+
+  # Manually save variables to export them in an object-based checkpoint. This
+  # skips the `builder.add_meta_graph_and_variables()` step, which saves a
+  # named-based checkpoint.
+  # TODO(b/113134168): Add fn to Builder to save with object-based saver.
+  # TODO(b/113178242): This should only export the model json structure. Only
+  # one save is needed once the weights can be copied from the model to clone.
+  checkpoint_path = _export_model_variables(model, path)
+
+  # Export each mode. Use ModeKeys enums defined for `Estimator` to ensure that
+  # Keras models and `Estimator`s are exported with the same format.
+  # Every time a mode is exported, the code checks to see if new variables have
+  # been created (e.g. optimizer slot variables). If that is the case, the
+  # checkpoint is re-saved to include the new variables.
+  export_args = {'builder': builder,
+                 'model': model,
+                 'custom_objects': custom_objects,
+                 'checkpoint_path': checkpoint_path,
+                 'input_signature': input_signature}
+
+  has_saved_vars = False
+  if model.optimizer:
+    # TODO(kathywu): Verify this works with v2 optimizer.
+    if isinstance(model.optimizer, optimizers.TFOptimizer):
+      _export_mode(mode_keys.ModeKeys.TRAIN, has_saved_vars, **export_args)
+      has_saved_vars = True
+      _export_mode(mode_keys.ModeKeys.TEST, has_saved_vars, **export_args)
+    else:
+      logging.warning(
+          'Model was compiled with an optimizer, but the optimizer is not from '
+          '`tf.train` (e.g. `tf.train.AdagradOptimizer`). Only the serving '
+          'graph was exported. The train and evaluate graphs were not added to '
+          'the SavedModel.')
+  _export_mode(mode_keys.ModeKeys.PREDICT, has_saved_vars, **export_args)
+
+  builder.save(as_text)
+
+
+def _get_var_list(model):
+  """Returns list of all checkpointed saveable objects in the model."""
+  return checkpointable_utils.named_saveables(model)
+
+
+def create_placeholder(spec):
+  return K.placeholder(shape=spec.shape, dtype=spec.dtype, name=spec.name)
+
+
+def _export_mode(
+    mode, has_saved_vars, builder, model, custom_objects, checkpoint_path,
+    input_signature):
+  """Exports a model, and optionally saves new vars from the clone model.
+
+  Args:
+    mode: A `tf.estimator.ModeKeys` string.
+    has_saved_vars: A `boolean` indicating whether the SavedModel has already
+      exported variables.
+    builder: A `SavedModelBuilder` object.
+    model: A `tf.keras.Model` object.
+    custom_objects: A dictionary mapping string names to custom classes
+      or functions.
+    checkpoint_path: String path to checkpoint.
+    input_signature: Nested TensorSpec containing the expected inputs. Can be
+      `None`, in which case the signature will be inferred from the model.
+
+  Raises:
+    ValueError: If the train/eval mode is being exported, but the model does
+      not have an optimizer.
+  """
+  compile_clone = (mode != mode_keys.ModeKeys.PREDICT)
+  if compile_clone and not model.optimizer:
+    raise ValueError(
+        'Model does not have an optimizer. Cannot export mode %s' % mode)
+
+  model_graph = ops.get_default_graph()
+  with ops.Graph().as_default() as g:
+
+    K.set_learning_phase(mode == mode_keys.ModeKeys.TRAIN)
+
+    if input_signature is None:
+      input_tensors = None
+    else:
+      input_tensors = nest.map_structure(create_placeholder, input_signature)
+
+    # Clone the model into blank graph. This will create placeholders for inputs
+    # and targets.
+    clone = models_lib.clone_and_build_model(
+        model, input_tensors=input_tensors, custom_objects=custom_objects,
+        compile_clone=compile_clone)
+
+    # Make sure that iterations variable is added to the global step collection,
+    # to ensure that, when the SavedModel graph is loaded, the iterations
+    # variable is returned by `tf.train.get_global_step()`. This is required for
+    # compatibility with the SavedModelEstimator.
+    if compile_clone:
+      g.add_to_collection(ops.GraphKeys.GLOBAL_STEP, clone.optimizer.iterations)
+
+    # Extract update and train ops from train/test/predict functions.
+    train_op = None
+    if mode == mode_keys.ModeKeys.TRAIN:
+      clone._make_train_function()
+      train_op = clone.train_function.updates_op
+    elif mode == mode_keys.ModeKeys.TEST:
+      clone._make_test_function()
+    else:
+      clone._make_predict_function()
+    g.get_collection_ref(ops.GraphKeys.UPDATE_OPS).extend(clone.state_updates)
+
+    clone_var_list = checkpointable_utils.named_saveables(clone)
+
+    with session.Session().as_default():
+      if has_saved_vars:
+        # Confirm all variables in the clone have an entry in the checkpoint.
+        status = clone.load_weights(checkpoint_path)
+        status.assert_existing_objects_matched()
+      else:
+        # Confirm that variables between the clone and model match up exactly,
+        # not counting optimizer objects. Optimizer objects are ignored because
+        # if the model has not trained, the slot variables will not have been
+        # created yet.
+        # TODO(b/113179535): Replace with checkpointable equivalence.
+        _assert_same_non_optimizer_objects(model, model_graph, clone, g)
+
+        # TODO(b/113178242): Use value transfer for checkpointable objects.
+        clone.load_weights(checkpoint_path)
+
+        # Add graph and variables to SavedModel.
+        # TODO(b/113134168): Switch to add_meta_graph_and_variables.
+        clone.save_weights(checkpoint_path, save_format='tf', overwrite=True)
+        builder._has_saved_variables = True
+
+    # Add graph to the SavedModel builder.
+    builder.add_meta_graph(
+        model_utils.EXPORT_TAG_MAP[mode],
+        signature_def_map=_create_signature_def_map(clone, mode),
+        saver=saver_lib.Saver(clone_var_list),
+        init_op=variables.local_variables_initializer(),
+        train_op=train_op)
+    return None
+
+
+def _create_signature_def_map(model, mode):
+  """Creates a SignatureDef map from a Keras model."""
+  inputs_dict = {name: x for name, x in zip(model.input_names, model.inputs)}
+  if model.optimizer:
+    targets_dict = {x.name.split(':')[0]: x
+                    for x in model.targets if x is not None}
+    inputs_dict.update(targets_dict)
+  outputs_dict = {name: x
+                  for name, x in zip(model.output_names, model.outputs)}
+  metrics = metrics_utils.extract_model_metrics_as_v1_metrics(model)
+
+  # Add metric variables to the `LOCAL_VARIABLES` collection. Metric variables
+  # are by default not added to any collections. We are doing this here, so
+  # that metric variables get initialized.
+  local_vars = set(ops.get_collection(ops.GraphKeys.LOCAL_VARIABLES))
+  vars_to_add = set()
+  if metrics is not None:
+    for key, value in six.iteritems(metrics):
+      if isinstance(value, Metric):
+        vars_to_add.update(value.variables)
+        # Convert Metric instances to (value_tensor, update_op) tuple.
+        metrics[key] = (value.result(), value.updates[0])
+  # Remove variables that are in the local variables collection already.
+  vars_to_add = vars_to_add.difference(local_vars)
+  for v in vars_to_add:
+    ops.add_to_collection(ops.GraphKeys.LOCAL_VARIABLES, v)
+
+  export_outputs = model_utils.export_outputs_for_mode(
+      mode,
+      predictions=outputs_dict,
+      loss=model.total_loss if model.optimizer else None,
+      metrics=metrics)
+  return model_utils.build_all_signature_defs(
+      inputs_dict,
+      export_outputs=export_outputs,
+      serving_only=(mode == mode_keys.ModeKeys.PREDICT))
+
+
+def _assert_same_non_optimizer_objects(model, model_graph, clone, clone_graph):  # pylint: disable=unused-argument
+  """Asserts model and clone contain the same checkpointable objects."""
+
+  # TODO(fchollet, kathywu): make sure this works in eager mode.
+  return True
+
+
+@tf_export('keras.experimental.load_from_saved_model')
+def load_from_saved_model(saved_model_path):
+  """Loads a keras.Model from a SavedModel created by keras export().
+
+  This function reinstantiates model state by:
+  1) loading model topology from json (this will eventually come
+     from metagraph).
+  2) loading model weights from checkpoint.
+
+  Example:
+
+  ```python
+  import tensorflow as tf
+
+  # Create a tf.keras model.
+  model = tf.keras.Sequential()
+  model.add(tf.keras.layers.Dense(1, input_shape=[10]))
+  model.summary()
+
+  # Save the tf.keras model in the SavedModel format.
+  saved_to_path = tf.keras.experimental.export(
+        model, '/tmp/my_simple_tf_keras_saved_model')
+
+  # Load the saved keras model back.
+  model_prime = tf.keras.experimental.load_from_saved_model(saved_to_path)
+  model_prime.summary()
+  ```
+
+  Args:
+    saved_model_path: a string specifying the path to an existing SavedModel.
+
+  Returns:
+    a keras.Model instance.
+  """
+  # restore model topology from json string
+  model_json_filepath = os.path.join(
+      compat.as_bytes(saved_model_path),
+      compat.as_bytes(constants.ASSETS_DIRECTORY),
+      compat.as_bytes(constants.SAVED_MODEL_FILENAME_JSON))
+  model_json = file_io.read_file_to_string(model_json_filepath)
+  model = model_from_json(model_json)
+
+  # restore model weights
+  checkpoint_prefix = os.path.join(
+      compat.as_text(saved_model_path),
+      compat.as_text(constants.VARIABLES_DIRECTORY),
+      compat.as_text(constants.VARIABLES_FILENAME))
+  model.load_weights(checkpoint_prefix)
+  return model
diff --git a/tensorflow/python/keras/saving/saved_model_test.py b/tensorflow/python/keras/saving/saved_model_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..8063b8af4de91f73fcc9a00bb626a88a204b44cc
--- /dev/null
+++ b/tensorflow/python/keras/saving/saved_model_test.py
@@ -0,0 +1,539 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=protected-access
+"""Tests for saving/loading function for keras Model."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import shutil
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python import keras
+from tensorflow.python.client import session
+from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_spec
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras.engine import training
+from tensorflow.python.keras.saving import saved_model as keras_saved_model
+from tensorflow.python.keras.utils import tf_utils
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+from tensorflow.python.saved_model import loader_impl
+from tensorflow.python.saved_model import model_utils
+from tensorflow.python.saved_model import signature_constants
+from tensorflow.python.training import mode_keys
+from tensorflow.python.training import training as training_module
+
+
+class TestModelSavingandLoading(test.TestCase):
+
+  def _save_model_dir(self, dirname='saved_model'):
+    temp_dir = self.get_temp_dir()
+    self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
+    return os.path.join(temp_dir, dirname)
+
+  def test_saving_sequential_model(self):
+    with self.cached_session():
+      model = keras.models.Sequential()
+      model.add(keras.layers.Dense(2, input_shape=(3,)))
+      model.add(keras.layers.RepeatVector(3))
+      model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
+      model.compile(
+          loss=keras.losses.MSE,
+          optimizer=keras.optimizers.RMSprop(lr=0.0001),
+          metrics=[keras.metrics.categorical_accuracy],
+          sample_weight_mode='temporal')
+      x = np.random.random((1, 3))
+      y = np.random.random((1, 3, 3))
+      model.train_on_batch(x, y)
+
+      ref_y = model.predict(x)
+
+      temp_saved_model = self._save_model_dir()
+      output_path = keras_saved_model.export(model, temp_saved_model)
+
+      loaded_model = keras_saved_model.load_from_saved_model(output_path)
+      y = loaded_model.predict(x)
+      self.assertAllClose(ref_y, y, atol=1e-05)
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_saving_sequential_model_without_compile(self):
+    with self.cached_session():
+      model = keras.models.Sequential()
+      model.add(keras.layers.Dense(2, input_shape=(3,)))
+      model.add(keras.layers.RepeatVector(3))
+      model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
+
+      x = np.random.random((1, 3))
+      ref_y = model.predict(x)
+
+      temp_saved_model = self._save_model_dir()
+      output_path = keras_saved_model.export(model, temp_saved_model)
+      loaded_model = keras_saved_model.load_from_saved_model(output_path)
+
+      y = loaded_model.predict(x)
+      self.assertAllClose(ref_y, y, atol=1e-05)
+
+  def test_saving_functional_model(self):
+    with self.cached_session():
+      inputs = keras.layers.Input(shape=(3,))
+      x = keras.layers.Dense(2)(inputs)
+      output = keras.layers.Dense(3)(x)
+
+      model = keras.models.Model(inputs, output)
+      model.compile(
+          loss=keras.losses.MSE,
+          optimizer=keras.optimizers.RMSprop(lr=0.0001),
+          metrics=[keras.metrics.categorical_accuracy])
+      x = np.random.random((1, 3))
+      y = np.random.random((1, 3))
+      model.train_on_batch(x, y)
+
+      ref_y = model.predict(x)
+
+      temp_saved_model = self._save_model_dir()
+      output_path = keras_saved_model.export(model, temp_saved_model)
+      loaded_model = keras_saved_model.load_from_saved_model(output_path)
+
+      y = loaded_model.predict(x)
+      self.assertAllClose(ref_y, y, atol=1e-05)
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_saving_functional_model_without_compile(self):
+    with self.cached_session():
+      inputs = keras.layers.Input(shape=(3,))
+      x = keras.layers.Dense(2)(inputs)
+      output = keras.layers.Dense(3)(x)
+
+      model = keras.models.Model(inputs, output)
+
+      x = np.random.random((1, 3))
+      y = np.random.random((1, 3))
+
+      ref_y = model.predict(x)
+
+      temp_saved_model = self._save_model_dir()
+      output_path = keras_saved_model.export(model, temp_saved_model)
+      loaded_model = keras_saved_model.load_from_saved_model(output_path)
+
+      y = loaded_model.predict(x)
+      self.assertAllClose(ref_y, y, atol=1e-05)
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_saving_with_tf_optimizer(self):
+    with self.cached_session():
+      model = keras.models.Sequential()
+      model.add(keras.layers.Dense(2, input_shape=(3,)))
+      model.add(keras.layers.Dense(3))
+      model.compile(
+          loss='mse',
+          optimizer=training_module.RMSPropOptimizer(0.1),
+          metrics=['acc'])
+
+      x = np.random.random((1, 3))
+      y = np.random.random((1, 3))
+      model.train_on_batch(x, y)
+      ref_y = model.predict(x)
+
+      temp_saved_model = self._save_model_dir()
+      output_path = keras_saved_model.export(model, temp_saved_model)
+      loaded_model = keras_saved_model.load_from_saved_model(output_path)
+      loaded_model.compile(
+          loss='mse',
+          optimizer=training_module.RMSPropOptimizer(0.1),
+          metrics=['acc'])
+      y = loaded_model.predict(x)
+      self.assertAllClose(ref_y, y, atol=1e-05)
+
+      # test that new updates are the same with both models
+      x = np.random.random((1, 3))
+      y = np.random.random((1, 3))
+
+      ref_loss = model.train_on_batch(x, y)
+      loss = loaded_model.train_on_batch(x, y)
+      self.assertAllClose(ref_loss, loss, atol=1e-05)
+
+      ref_y = model.predict(x)
+      y = loaded_model.predict(x)
+      self.assertAllClose(ref_y, y, atol=1e-05)
+
+      # test saving/loading again
+      temp_saved_model2 = self._save_model_dir('saved_model_2')
+      output_path2 = keras_saved_model.export(
+          loaded_model, temp_saved_model2)
+      loaded_model = keras_saved_model.load_from_saved_model(output_path2)
+      y = loaded_model.predict(x)
+      self.assertAllClose(ref_y, y, atol=1e-05)
+
+  def test_saving_subclassed_model_raise_error(self):
+    # For now, saving subclassed model should raise an error. It should be
+    # avoided later with loading from SavedModel.pb.
+
+    class SubclassedModel(training.Model):
+
+      def __init__(self):
+        super(SubclassedModel, self).__init__()
+        self.layer1 = keras.layers.Dense(3)
+        self.layer2 = keras.layers.Dense(1)
+
+      def call(self, inp):
+        return self.layer2(self.layer1(inp))
+
+    model = SubclassedModel()
+
+    temp_saved_model = self._save_model_dir()
+    with self.assertRaises(NotImplementedError):
+      keras_saved_model.export(model, temp_saved_model)
+
+
+class LayerWithLearningPhase(keras.engine.base_layer.Layer):
+
+  def call(self, x):
+    phase = keras.backend.learning_phase()
+    output = tf_utils.smart_cond(
+        phase, lambda: x * 0, lambda: array_ops.identity(x))
+    if not context.executing_eagerly():
+      output._uses_learning_phase = True  # pylint: disable=protected-access
+    return output
+
+  def compute_output_shape(self, input_shape):
+    return input_shape
+
+
+def functional_model(uses_learning_phase=True):
+  inputs = keras.layers.Input(shape=(3,))
+  x = keras.layers.Dense(2)(inputs)
+  x = keras.layers.Dense(3)(x)
+  if uses_learning_phase:
+    x = LayerWithLearningPhase()(x)
+  return keras.models.Model(inputs, x)
+
+
+def sequential_model(uses_learning_phase=True):
+  model = keras.models.Sequential()
+  model.add(keras.layers.Dense(2, input_shape=(3,)))
+  model.add(keras.layers.Dense(3))
+  if uses_learning_phase:
+    model.add(LayerWithLearningPhase())
+  return model
+
+
+def sequential_model_without_input_shape(uses_learning_phase=True):
+  model = keras.models.Sequential()
+  model.add(keras.layers.Dense(2))
+  model.add(keras.layers.Dense(3))
+  if uses_learning_phase:
+    model.add(LayerWithLearningPhase())
+  return model
+
+
+class Subclassed(keras.models.Model):
+
+  def __init__(self):
+    super(Subclassed, self).__init__()
+    self.dense1 = keras.layers.Dense(2)
+    self.dense2 = keras.layers.Dense(3)
+
+  def call(self, inputs):
+    x = self.dense1(inputs)
+    x = self.dense2(x)
+    return x
+
+
+def subclassed_model():
+  return Subclassed()
+
+
+def load_model(sess, path, mode):
+  tags = model_utils.EXPORT_TAG_MAP[mode]
+  if mode == mode_keys.ModeKeys.PREDICT:
+    sig_def_key = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
+  else:
+    sig_def_key = mode
+
+  meta_graph_def = loader_impl.load(sess, tags, path)
+  inputs = {
+      k: sess.graph.get_tensor_by_name(v.name)
+      for k, v in meta_graph_def.signature_def[sig_def_key].inputs.items()}
+  outputs = {
+      k: sess.graph.get_tensor_by_name(v.name)
+      for k, v in meta_graph_def.signature_def[sig_def_key].outputs.items()}
+  return inputs, outputs, meta_graph_def
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class TestModelSavedModelExport(test.TestCase, parameterized.TestCase):
+
+  def _save_model_dir(self, dirname='saved_model'):
+    temp_dir = self.get_temp_dir()
+    self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
+    return os.path.join(temp_dir, dirname)
+
+  @parameterized.parameters(
+      {
+          'model_builder': functional_model,
+          'uses_learning_phase': True,
+          'optimizer': training_module.AdadeltaOptimizer(),
+          'train_before_export': True},
+      {
+          'model_builder': functional_model,
+          'uses_learning_phase': True,
+          'optimizer': training_module.AdadeltaOptimizer(),
+          'train_before_export': False},
+      {
+          'model_builder': functional_model,
+          'uses_learning_phase': False,
+          'optimizer': None,
+          'train_before_export': False},
+      {
+          'model_builder': sequential_model,
+          'uses_learning_phase': True,
+          'optimizer': training_module.AdadeltaOptimizer(),
+          'train_before_export': True},
+      {
+          'model_builder': sequential_model,
+          'uses_learning_phase': True,
+          'optimizer': training_module.AdadeltaOptimizer(),
+          'train_before_export': False},
+      {
+          'model_builder': sequential_model,
+          'uses_learning_phase': False,
+          'optimizer': None,
+          'train_before_export': False},
+      {
+          'model_builder': sequential_model_without_input_shape,
+          'uses_learning_phase': True,
+          'optimizer': training_module.AdadeltaOptimizer(),
+          'train_before_export': False})
+  def testSaveAndLoadSavedModelExport(
+      self, model_builder, uses_learning_phase, optimizer, train_before_export):
+    saved_model_path = self._save_model_dir()
+    with self.session(graph=ops.Graph()):
+      np.random.seed(130)
+      input_arr = np.random.random((1, 3))
+      target_arr = np.random.random((1, 3))
+
+      model = model_builder(uses_learning_phase)
+      if optimizer is not None:
+        model.compile(
+            loss='mse',
+            optimizer=optimizer,
+            metrics=['mae'])
+        if train_before_export:
+          model.train_on_batch(input_arr, target_arr)
+
+        ref_loss, ref_mae = model.evaluate(input_arr, target_arr)
+
+      ref_predict = model.predict(input_arr)
+
+      # Export SavedModel
+      output_path = keras_saved_model.export(model, saved_model_path)
+
+    input_name = model.input_names[0]
+    output_name = model.output_names[0]
+    target_name = output_name + '_target'
+
+    # Load predict graph, and test predictions
+    with session.Session(graph=ops.Graph()) as sess:
+      inputs, outputs, _ = load_model(sess, output_path,
+                                      mode_keys.ModeKeys.PREDICT)
+
+      predictions = sess.run(outputs[output_name],
+                             {inputs[input_name]: input_arr})
+      self.assertAllClose(ref_predict, predictions, atol=1e-05)
+
+    if optimizer:
+      # Load eval graph, and test predictions, loss and metric values
+      with session.Session(graph=ops.Graph()) as sess:
+        inputs, outputs, _ = load_model(sess, output_path,
+                                        mode_keys.ModeKeys.TEST)
+
+        # First obtain the loss and predictions, and run the metric update op by
+        # feeding in the inputs and targets.
+        loss, predictions, _ = sess.run(
+            (outputs['loss'], outputs['predictions/' + output_name],
+             outputs['metrics/mean_absolute_error/update_op']), {
+                 inputs[input_name]: input_arr,
+                 inputs[target_name]: target_arr
+             })
+
+        # The metric value should be run after the update op, to ensure that it
+        # reflects the correct value.
+        metric_value = sess.run(outputs['metrics/mean_absolute_error/value'])
+
+        self.assertEqual(int(train_before_export),
+                         sess.run(training_module.get_global_step()))
+        self.assertAllClose(ref_loss, loss, atol=1e-05)
+        self.assertAllClose(ref_mae, metric_value, atol=1e-05)
+        self.assertAllClose(ref_predict, predictions, atol=1e-05)
+
+      # Load train graph, and check for the train op, and prediction values
+      with session.Session(graph=ops.Graph()) as sess:
+        inputs, outputs, meta_graph_def = load_model(
+            sess, output_path, mode_keys.ModeKeys.TRAIN)
+        self.assertEqual(int(train_before_export),
+                         sess.run(training_module.get_global_step()))
+        self.assertIn('loss', outputs)
+        self.assertIn('metrics/mean_absolute_error/update_op', outputs)
+        self.assertIn('metrics/mean_absolute_error/value', outputs)
+        self.assertIn('predictions/' + output_name, outputs)
+
+        # Train for a step
+        train_op = loader_impl.get_train_op(meta_graph_def)
+        train_outputs, _ = sess.run(
+            [outputs, train_op], {inputs[input_name]: input_arr,
+                                  inputs[target_name]: target_arr})
+        self.assertEqual(int(train_before_export) + 1,
+                         sess.run(training_module.get_global_step()))
+
+        if uses_learning_phase:
+          self.assertAllClose(
+              [[0, 0, 0]], train_outputs['predictions/' + output_name],
+              atol=1e-05)
+        else:
+          self.assertNotAllClose(
+              [[0, 0, 0]], train_outputs['predictions/' + output_name],
+              atol=1e-05)
+
+  def testSaveAndLoadSavedModelWithCustomObject(self):
+    saved_model_path = self._save_model_dir()
+    with session.Session(graph=ops.Graph()) as sess:
+      def relu6(x):
+        return keras.backend.relu(x, max_value=6)
+      inputs = keras.layers.Input(shape=(1,))
+      outputs = keras.layers.Activation(relu6)(inputs)
+      model = keras.models.Model(inputs, outputs)
+      output_path = keras_saved_model.export(
+          model, saved_model_path, custom_objects={'relu6': relu6})
+    with session.Session(graph=ops.Graph()) as sess:
+      inputs, outputs, _ = load_model(sess, output_path,
+                                      mode_keys.ModeKeys.PREDICT)
+      input_name = model.input_names[0]
+      output_name = model.output_names[0]
+      predictions = sess.run(
+          outputs[output_name], {inputs[input_name]: [[7], [-3], [4]]})
+      self.assertAllEqual([[6], [0], [4]], predictions)
+
+  def testAssertModelCloneSameObjectsIgnoreOptimizer(self):
+    input_arr = np.random.random((1, 3))
+    target_arr = np.random.random((1, 3))
+
+    model_graph = ops.Graph()
+    clone_graph = ops.Graph()
+
+    # Create two models with the same layers but different optimizers.
+    with session.Session(graph=model_graph):
+      inputs = keras.layers.Input(shape=(3,))
+      x = keras.layers.Dense(2)(inputs)
+      x = keras.layers.Dense(3)(x)
+      model = keras.models.Model(inputs, x)
+
+      model.compile(loss='mse', optimizer=training_module.AdadeltaOptimizer())
+      model.train_on_batch(input_arr, target_arr)
+
+    with session.Session(graph=clone_graph):
+      inputs = keras.layers.Input(shape=(3,))
+      x = keras.layers.Dense(2)(inputs)
+      x = keras.layers.Dense(3)(x)
+      clone = keras.models.Model(inputs, x)
+      clone.compile(loss='mse', optimizer=keras.optimizers.RMSprop(lr=0.0001))
+      clone.train_on_batch(input_arr, target_arr)
+
+    keras_saved_model._assert_same_non_optimizer_objects(
+        model, model_graph, clone, clone_graph)
+
+  def testAssertModelCloneSameObjectsThrowError(self):
+    input_arr = np.random.random((1, 3))
+    target_arr = np.random.random((1, 3))
+
+    model_graph = ops.Graph()
+    clone_graph = ops.Graph()
+
+    # Create two models with the same layers but different optimizers.
+    with session.Session(graph=model_graph):
+      inputs = keras.layers.Input(shape=(3,))
+      x = keras.layers.Dense(2)(inputs)
+      x = keras.layers.Dense(3)(x)
+      model = keras.models.Model(inputs, x)
+
+      model.compile(loss='mse', optimizer=training_module.AdadeltaOptimizer())
+      model.train_on_batch(input_arr, target_arr)
+
+    with session.Session(graph=clone_graph):
+      inputs = keras.layers.Input(shape=(3,))
+      x = keras.layers.Dense(2)(inputs)
+      x = keras.layers.Dense(4)(x)
+      x = keras.layers.Dense(3)(x)
+      clone = keras.models.Model(inputs, x)
+      clone.compile(loss='mse', optimizer=keras.optimizers.RMSprop(lr=0.0001))
+      clone.train_on_batch(input_arr, target_arr)
+
+  def testSaveSequentialModelWithoutInputShapes(self):
+    model = sequential_model_without_input_shape(True)
+    # A Sequential model that hasn't been built should raise an error.
+    with self.assertRaisesRegexp(ValueError, 'Please build the model'):
+      keras_saved_model.export(model, '')
+
+    saved_model_path = self._save_model_dir()
+    output_path = keras_saved_model.export(
+        model, saved_model_path,
+        input_signature=tensor_spec.TensorSpec(shape=(10, 11, 12, 13, 14),
+                                               dtype=dtypes.float32,
+                                               name='spec_input'))
+
+    with session.Session(graph=ops.Graph()) as sess:
+      inputs, outputs, _ = load_model(sess, output_path,
+                                      mode_keys.ModeKeys.PREDICT)
+      self.assertEqual(5, inputs[next(iter(inputs.keys()))].shape.ndims)
+      self.assertEqual(5, outputs[next(iter(outputs.keys()))].shape.ndims)
+      self.assertEqual(3, outputs[next(iter(outputs.keys()))].shape[-1])
+
+  @parameterized.parameters(
+      {
+          'model_builder': sequential_model_without_input_shape,
+          'input_signature': [tensor_spec.TensorSpec(shape=[None, 3],
+                                                     dtype=dtypes.float32)]},
+      {
+          'model_builder': subclassed_model,
+          'input_signature': [tensor_spec.TensorSpec(shape=[None, 3],
+                                                     dtype=dtypes.float32)]})
+  def testServingOnly(self, model_builder, input_signature):
+    if context.executing_eagerly():
+      saved_model_path = self._save_model_dir()
+      input_arr = np.random.random((5, 3)).astype(np.float32)
+      model = model_builder()
+      ref_predict = model.predict(input_arr)
+
+      output_path = keras_saved_model.export(
+          model, saved_model_path, serving_only=True,
+          input_signature=input_signature)
+
+      # Load predict graph, and test predictions
+      with session.Session(graph=ops.Graph()) as sess:
+        inputs, outputs, _ = load_model(sess, output_path,
+                                        mode_keys.ModeKeys.PREDICT)
+        predictions = sess.run(outputs[next(iter(outputs.keys()))],
+                               {inputs[next(iter(inputs.keys()))]: input_arr})
+        self.assertAllClose(ref_predict, predictions, atol=1e-05)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/testing_utils.py b/tensorflow/python/keras/testing_utils.py
index d342131a521a90399090e48cf578f37c2a2e566c..fd062b0ab337aa6fa62a7603a36749cde315c3da 100644
--- a/tensorflow/python/keras/testing_utils.py
+++ b/tensorflow/python/keras/testing_utils.py
@@ -18,11 +18,15 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import threading
+
 import numpy as np
 
 from tensorflow.python import keras
+from tensorflow.python.eager import context
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.training.rmsprop import RMSPropOptimizer
+from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util import tf_inspect
 
 
@@ -73,9 +77,13 @@ def layer_test(layer_cls, kwargs=None, input_shape=None, input_dtype=None,
   Returns:
     The output data (Numpy array) returned by the layer, for additional
     checks to be done by the calling code.
+
+  Raises:
+    ValueError: if `input_shape is None`.
   """
   if input_data is None:
-    assert input_shape
+    if input_shape is None:
+      raise ValueError('input_shape is None')
     if not input_dtype:
       input_dtype = 'float32'
     input_data_shape = list(input_shape)
@@ -149,7 +157,15 @@ def layer_test(layer_cls, kwargs=None, input_shape=None, input_dtype=None,
     np.testing.assert_allclose(output, actual_output, rtol=1e-3)
 
   # test training mode (e.g. useful for dropout tests)
-  model.compile(RMSPropOptimizer(0.01), 'mse', weighted_metrics=['acc'])
+  # Rebuild the model to avoid the graph being reused between predict() and
+  # train(). This was causing some error for layer with Defun as it body.
+  # See b/120160788 for more details. This should be mitigated after 2.0.
+  model = keras.models.Model(x, layer(x))
+  if _thread_local_data.run_eagerly is not None:
+    model.compile(RMSPropOptimizer(0.01), 'mse', weighted_metrics=['acc'],
+                  run_eagerly=should_run_eagerly())
+  else:
+    model.compile(RMSPropOptimizer(0.01), 'mse', weighted_metrics=['acc'])
   model.train_on_batch(input_data, actual_output)
 
   # test as first layer in Sequential API
@@ -190,6 +206,74 @@ def layer_test(layer_cls, kwargs=None, input_shape=None, input_dtype=None,
   return actual_output
 
 
+_thread_local_data = threading.local()
+_thread_local_data.model_type = None
+_thread_local_data.run_eagerly = None
+
+
+@tf_contextlib.contextmanager
+def model_type_scope(value):
+  """Provides a scope within which the model type to test is equal to `value`.
+
+  The model type gets restored to its original value upon exiting the scope.
+
+  Arguments:
+     value: model type value
+
+  Yields:
+    The provided value.
+  """
+  previous_value = _thread_local_data.model_type
+  try:
+    _thread_local_data.model_type = value
+    yield value
+  finally:
+    # Restore model type to initial value.
+    _thread_local_data.model_type = previous_value
+
+
+@tf_contextlib.contextmanager
+def run_eagerly_scope(value):
+  """Provides a scope within which we compile models to run eagerly or not.
+
+  The boolean gets restored to its original value upon exiting the scope.
+
+  Arguments:
+     value: Bool specifying if we should run models eagerly in the active test.
+     Should be True or False.
+
+  Yields:
+    The provided value.
+  """
+  previous_value = _thread_local_data.run_eagerly
+  try:
+    _thread_local_data.run_eagerly = value
+    yield value
+  finally:
+    # Restore model type to initial value.
+    _thread_local_data.run_eagerly = previous_value
+
+
+def should_run_eagerly():
+  """Returns whether the models we are testing should be run eagerly."""
+  if _thread_local_data.run_eagerly is None:
+    raise ValueError('Cannot call `should_run_eagerly()` outside of a '
+                     '`run_eagerly_scope()` or `run_all_keras_modes` '
+                     'decorator.')
+
+  return _thread_local_data.run_eagerly and context.executing_eagerly()
+
+
+def get_model_type():
+  """Gets the model type that should be tested."""
+  if _thread_local_data.model_type is None:
+    raise ValueError('Cannot call `get_model_type()` outside of a '
+                     '`model_type_scope()` or `run_with_all_model_types` '
+                     'decorator.')
+
+  return _thread_local_data.model_type
+
+
 def get_small_sequential_mlp(num_hidden, num_classes, input_dim=None):
   model = keras.models.Sequential()
   if input_dim:
@@ -208,3 +292,337 @@ def get_small_functional_mlp(num_hidden, num_classes, input_dim):
   activation = 'sigmoid' if num_classes == 1 else 'softmax'
   outputs = keras.layers.Dense(num_classes, activation=activation)(outputs)
   return keras.Model(inputs, outputs)
+
+
+class _SmallSubclassMLP(keras.Model):
+  """A subclass model based small MLP."""
+
+  def __init__(self, num_hidden, num_classes):
+    super(_SmallSubclassMLP, self).__init__()
+    self.layer_a = keras.layers.Dense(num_hidden, activation='relu')
+    activation = 'sigmoid' if num_classes == 1 else 'softmax'
+    self.layer_b = keras.layers.Dense(num_classes, activation=activation)
+
+  def call(self, inputs, **kwargs):
+    x = self.layer_a(inputs)
+    return self.layer_b(x)
+
+
+class _SmallSubclassMLPCustomBuild(keras.Model):
+  """A subclass model small MLP that uses a custom build method."""
+
+  def __init__(self, num_hidden, num_classes):
+    super(_SmallSubclassMLPCustomBuild, self).__init__()
+    self.layer_a = None
+    self.layer_b = None
+    self.num_hidden = num_hidden
+    self.num_classes = num_classes
+
+  def build(self, input_shape):
+    self.layer_a = keras.layers.Dense(self.num_hidden, activation='relu')
+    activation = 'sigmoid' if self.num_classes == 1 else 'softmax'
+    self.layer_b = keras.layers.Dense(self.num_classes, activation=activation)
+
+  def call(self, inputs, **kwargs):
+    x = self.layer_a(inputs)
+    return self.layer_b(x)
+
+
+def get_small_subclass_mlp(num_hidden, num_classes):
+  return _SmallSubclassMLP(num_hidden, num_classes)
+
+
+def get_small_subclass_mlp_with_custom_build(num_hidden, num_classes):
+  return _SmallSubclassMLPCustomBuild(num_hidden, num_classes)
+
+
+def get_small_mlp(num_hidden, num_classes, input_dim):
+  """Get a small mlp of the model type specified by `get_model_type`."""
+  model_type = get_model_type()
+  if model_type == 'subclass':
+    return get_small_subclass_mlp(num_hidden, num_classes)
+  if model_type == 'subclass_custom_build':
+    return get_small_subclass_mlp_with_custom_build(num_hidden, num_classes)
+  if model_type == 'sequential':
+    return get_small_sequential_mlp(num_hidden, num_classes, input_dim)
+  if model_type == 'functional':
+    return get_small_functional_mlp(num_hidden, num_classes, input_dim)
+  raise ValueError('Unknown model type {}'.format(model_type))
+
+
+class _SubclassModel(keras.Model):
+  """A Keras subclass model."""
+
+  def __init__(self, layers):
+    super(_SubclassModel, self).__init__()
+    self.all_layers = layers
+
+  def call(self, inputs, **kwargs):
+    x = inputs
+    for layer in self.all_layers:
+      x = layer(x)
+    return x
+
+
+class _SubclassModelCustomBuild(keras.Model):
+  """A Keras subclass model that uses a custom build method."""
+
+  def __init__(self, layer_generating_func):
+    super(_SubclassModelCustomBuild, self).__init__()
+    self.all_layers = None
+    self._layer_generating_func = layer_generating_func
+
+  def build(self, input_shape):
+    layers = []
+    for layer in self._layer_generating_func():
+      layers.append(layer)
+    self.all_layers = layers
+
+  def call(self, inputs, **kwargs):
+    x = inputs
+    for layer in self.all_layers:
+      x = layer(x)
+    return x
+
+
+def get_model_from_layers(layers, input_shape=None):
+  """Builds a model from a sequence of layers."""
+  model_type = get_model_type()
+  if model_type == 'subclass':
+    return _SubclassModel(layers)
+
+  if model_type == 'subclass_custom_build':
+    layer_generating_func = lambda: layers
+    return _SubclassModelCustomBuild(layer_generating_func)
+
+  if model_type == 'sequential':
+    model = keras.models.Sequential()
+    if input_shape:
+      model.add(keras.layers.InputLayer(input_shape=input_shape))
+    for layer in layers:
+      model.add(layer)
+    return model
+
+  if model_type == 'functional':
+    if not input_shape:
+      raise ValueError('Cannot create a functional model from layers with no '
+                       'input shape.')
+    inputs = keras.Input(shape=input_shape)
+    outputs = inputs
+    for layer in layers:
+      outputs = layer(outputs)
+    return keras.Model(inputs, outputs)
+
+  raise ValueError('Unknown model type {}'.format(model_type))
+
+
+class _MultiIOSubclassModel(keras.Model):
+  """Multi IO Keras subclass model."""
+
+  def __init__(self, branch_a, branch_b, shared_input_branch=None,
+               shared_output_branch=None):
+    super(_MultiIOSubclassModel, self).__init__()
+    self._shared_input_branch = shared_input_branch
+    self._branch_a = branch_a
+    self._branch_b = branch_b
+    self._shared_output_branch = shared_output_branch
+
+  def call(self, inputs, **kwargs):
+    if self._shared_input_branch:
+      for layer in self._shared_input_branch:
+        inputs = layer(inputs)
+      a = inputs
+      b = inputs
+    else:
+      a, b = inputs
+
+    for layer in self._branch_a:
+      a = layer(a)
+    for layer in self._branch_b:
+      b = layer(b)
+    outs = [a, b]
+
+    if self._shared_output_branch:
+      for layer in self._shared_output_branch:
+        outs = layer(outs)
+
+    return outs
+
+
+class _MultiIOSubclassModelCustomBuild(keras.Model):
+  """Multi IO Keras subclass model that uses a custom build method."""
+
+  def __init__(self, branch_a_func, branch_b_func,
+               shared_input_branch_func=None,
+               shared_output_branch_func=None):
+    super(_MultiIOSubclassModelCustomBuild, self).__init__()
+    self._shared_input_branch_func = shared_input_branch_func
+    self._branch_a_func = branch_a_func
+    self._branch_b_func = branch_b_func
+    self._shared_output_branch_func = shared_output_branch_func
+
+    self._shared_input_branch = None
+    self._branch_a = None
+    self._branch_b = None
+    self._shared_output_branch = None
+
+  def build(self, input_shape):
+    if self._shared_input_branch_func():
+      self._shared_input_branch = self._shared_input_branch_func()
+    self._branch_a = self._branch_a_func()
+    self._branch_b = self._branch_b_func()
+
+    if self._shared_output_branch_func():
+      self._shared_output_branch = self._shared_output_branch_func()
+
+  def call(self, inputs, **kwargs):
+    if self._shared_input_branch:
+      for layer in self._shared_input_branch:
+        inputs = layer(inputs)
+      a = inputs
+      b = inputs
+    else:
+      a, b = inputs
+
+    for layer in self._branch_a:
+      a = layer(a)
+    for layer in self._branch_b:
+      b = layer(b)
+    outs = a, b
+
+    if self._shared_output_branch:
+      for layer in self._shared_output_branch:
+        outs = layer(outs)
+
+    return outs
+
+
+def get_multi_io_model(
+    branch_a,
+    branch_b,
+    shared_input_branch=None,
+    shared_output_branch=None):
+  """Builds a multi-io model that contains two branches.
+
+  The produced model will be of the type specified by `get_model_type`.
+
+  To build a two-input, two-output model:
+    Specify a list of layers for branch a and branch b, but do not specify any
+    shared input branch or shared output branch. The resulting model will apply
+    each branch to a different input, to produce two outputs.
+
+    The first value in branch_a must be the Keras 'Input' layer for branch a,
+    and the first value in branch_b must be the Keras 'Input' layer for
+    branch b.
+
+    example usage:
+    ```
+    branch_a = [Input(shape=(2,), name='a'), Dense(), Dense()]
+    branch_b = [Input(shape=(3,), name='b'), Dense(), Dense()]
+
+    model = get_multi_io_model(branch_a, branch_b)
+    ```
+
+  To build a two-input, one-output model:
+    Specify a list of layers for branch a and branch b, and specify a
+    shared output branch. The resulting model will apply
+    each branch to a different input. It will then apply the shared output
+    branch to a tuple containing the intermediate outputs of each branch,
+    to produce a single output. The first layer in the shared_output_branch
+    must be able to merge a tuple of two tensors.
+
+    The first value in branch_a must be the Keras 'Input' layer for branch a,
+    and the first value in branch_b must be the Keras 'Input' layer for
+    branch b.
+
+    example usage:
+    ```
+    input_branch_a = [Input(shape=(2,), name='a'), Dense(), Dense()]
+    input_branch_b = [Input(shape=(3,), name='b'), Dense(), Dense()]
+    shared_output_branch = [Concatenate(), Dense(), Dense()]
+
+    model = get_multi_io_model(input_branch_a, input_branch_b,
+                               shared_output_branch=shared_output_branch)
+    ```
+  To build a one-input, two-output model:
+    Specify a list of layers for branch a and branch b, and specify a
+    shared input branch. The resulting model will take one input, and apply
+    the shared input branch to it. It will then respectively apply each branch
+    to that intermediate result in parallel, to produce two outputs.
+
+    The first value in the shared_input_branch must be the Keras 'Input' layer
+    for the whole model. Branch a and branch b should not contain any Input
+    layers.
+
+    example usage:
+    ```
+    shared_input_branch = [Input(shape=(2,), name='in'), Dense(), Dense()]
+    output_branch_a = [Dense(), Dense()]
+    output_branch_b = [Dense(), Dense()]
+
+
+    model = get_multi_io_model(output__branch_a, output_branch_b,
+                               shared_input_branch=shared_input_branch)
+    ```
+
+  Args:
+    branch_a: A sequence of layers for branch a of the model.
+    branch_b: A sequence of layers for branch b of the model.
+    shared_input_branch: An optional sequence of layers to apply to a single
+      input, before applying both branches to that intermediate result. If set,
+      the model will take only one input instead of two. Defaults to None.
+    shared_output_branch: An optional sequence of layers to merge the
+      intermediate results produced by branch a and branch b. If set,
+      the model will produce only one output instead of two. Defaults to None.
+
+  Returns:
+    A multi-io model of the type specified by `get_model_type`, specified
+    by the different branches.
+  """
+  # Extract the functional inputs from the layer lists
+  if shared_input_branch:
+    inputs = shared_input_branch[0]
+    shared_input_branch = shared_input_branch[1:]
+  else:
+    inputs = branch_a[0], branch_b[0]
+    branch_a = branch_a[1:]
+    branch_b = branch_b[1:]
+
+  model_type = get_model_type()
+  if model_type == 'subclass':
+    return _MultiIOSubclassModel(branch_a, branch_b, shared_input_branch,
+                                 shared_output_branch)
+
+  if model_type == 'subclass_custom_build':
+    return _MultiIOSubclassModelCustomBuild((lambda: branch_a),
+                                            (lambda: branch_b),
+                                            (lambda: shared_input_branch),
+                                            (lambda: shared_output_branch))
+
+  if model_type == 'sequential':
+    raise ValueError('Cannot use `get_multi_io_model` to construct '
+                     'sequential models')
+
+  if model_type == 'functional':
+    if shared_input_branch:
+      a_and_b = inputs
+      for layer in shared_input_branch:
+        a_and_b = layer(a_and_b)
+      a = a_and_b
+      b = a_and_b
+    else:
+      a, b = inputs
+
+    for layer in branch_a:
+      a = layer(a)
+    for layer in branch_b:
+      b = layer(b)
+    outputs = a, b
+
+    if shared_output_branch:
+      for layer in shared_output_branch:
+        outputs = layer(outputs)
+
+    return keras.Model(inputs, outputs)
+
+  raise ValueError('Unknown model type {}'.format(model_type))
diff --git a/tensorflow/python/keras/utils/data_utils.py b/tensorflow/python/keras/utils/data_utils.py
index 01a9d61a84c8ceb5a251a80c9440c0ba6469e64f..d133e3fa8aeb0ee420bfa131b98401f617f1daae 100644
--- a/tensorflow/python/keras/utils/data_utils.py
+++ b/tensorflow/python/keras/utils/data_utils.py
@@ -596,9 +596,9 @@ class OrderedEnqueuer(SequenceEnqueuer):
         Function, a Function to initialize the pool
     """
     def pool_fn(seqs):
-      return multiprocessing.Pool(workers,
-                                  initializer=init_pool_generator,
-                                  initargs=(seqs, self.random_seed))
+      return multiprocessing.Pool(
+          workers, initializer=init_pool_generator, initargs=(seqs, None))
+
     return pool_fn
 
   def _wait_queue(self):
diff --git a/tensorflow/python/keras/utils/metrics_utils.py b/tensorflow/python/keras/utils/metrics_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..431d107091e90c8ecf7be38a465443aaede11936
--- /dev/null
+++ b/tensorflow/python/keras/utils/metrics_utils.py
@@ -0,0 +1,77 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=protected-access
+"""Utils related to keras metrics.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import six
+
+from tensorflow.python.keras import metrics
+from tensorflow.python.ops import metrics as metrics_module
+
+
+def extract_model_metrics_as_v1_metrics(model):
+  """Convert metrics from a Keras model to (value, update) ops.
+
+  This is used for converting Keras models to Estimators and SavedModels.
+
+  Args:
+    model: A `tf.keras.Model` object.
+
+  Returns:
+    Dictionary mapping metric names to tuples of (value, update) ops. May return
+    `None` if the model does not contain any metrics.
+  """
+  if not getattr(model, 'metrics', None):
+    return None
+
+  eval_metric_ops = {}
+
+  def get_metric_name(metric):
+    if isinstance(metric, metrics.Metric):
+      return metric.name
+    if callable(metric):
+      return metric.__name__
+    assert isinstance(metric, six.string_types)
+    return metric
+
+  # When each metric maps to an output
+  if isinstance(model.metrics, dict):
+    for i, output_name in enumerate(model.metrics.keys()):
+      # `metric` is the user given metric value in `compile`. This can be
+      # metric name (`acc`), metric function (binary_accuracy) or a metric
+      # object (BinaryAccuracy()).
+      metric = model.metrics[output_name]
+      metric_name = get_metric_name(metric)
+      # When some outputs use the same metric
+      if list(model.metrics.values()).count(metric_name) > 1:
+        metric_name += '_' + output_name
+      if isinstance(metric, metrics.Metric):
+        eval_metric_ops[metric_name] = metric
+      else:
+        eval_metric_ops[metric_name] = metrics_module.mean(
+            model.metrics_tensors[i - len(model.metrics)])
+  else:
+    for i, metric in enumerate(model.metrics):
+      metric_name = get_metric_name(metric)
+      if isinstance(metric, metrics.Metric):
+        eval_metric_ops[metric_name] = metric
+      else:
+        eval_metric_ops[metric_name] = metrics_module.mean(
+            model.metrics_tensors[i])
+  return eval_metric_ops
diff --git a/tensorflow/python/keras/utils/multi_gpu_utils_test.py b/tensorflow/python/keras/utils/multi_gpu_utils_test.py
index 1780ab65871b1cbb712c612ea252298aadefb265..8c1abd632484273a01fd99cbd72ee73b66e46f27 100644
--- a/tensorflow/python/keras/utils/multi_gpu_utils_test.py
+++ b/tensorflow/python/keras/utils/multi_gpu_utils_test.py
@@ -158,7 +158,7 @@ class TestMultiGPUModel(test.TestCase):
       dataset = data.Dataset.from_tensor_slices((x_train, y_train))
       dataset = dataset.repeat()
       dataset = dataset.batch(4)
-      iterator = dataset.make_one_shot_iterator()
+      iterator = data.make_one_shot_iterator(dataset)
 
       inputs, targets = iterator.get_next()
 
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 97ac21b8adfd29f520346e4a5861ab561d17feb9..bd5c103b38dc1561fbcb19b326052bd4f3c6f293 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -272,7 +272,7 @@ tf_py_test(
 
 cuda_py_test(
     name = "ctc_loss_op_test",
-    size = "small",
+    size = "medium",
     srcs = ["ctc_loss_op_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
@@ -1068,6 +1068,25 @@ tf_py_test(
     ],
 )
 
+tf_py_test(
+    name = "summary_ops_test",
+    size = "small",
+    srcs = ["summary_ops_test.py"],
+    additional_deps = [
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:lib",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:summary_ops_v2",
+        "//tensorflow/python:tensor_util",
+        "//tensorflow/python/eager:function",
+        "//tensorflow/python/eager:context",
+    ],
+)
+
 tf_py_test(
     name = "summary_v1_ops_test",
     size = "small",
@@ -1190,8 +1209,15 @@ tf_py_test(
     srcs = ["unicode_decode_op_test.py"],
     additional_deps = [
         "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python/ops/ragged:ragged_factory_ops",
+        "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
         "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python/ops/ragged:ragged",
+        "//tensorflow/python/ops/ragged:ragged_test_util",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:string_ops",
     ],
 )
@@ -1888,6 +1914,22 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "lu_op_test",
+    size = "small",
+    srcs = ["lu_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:linalg_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python/ops/linalg",
+    ],
+)
+
 cuda_py_test(
     name = "manip_ops_test",
     size = "small",
@@ -2073,12 +2115,13 @@ cuda_py_test(
         "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:gradients",
         "//tensorflow/python:nn_grad",
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:random_ops",
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
+        "//tensorflow/python/eager:backprop",
+        "//tensorflow/python:tf2",
     ],
 )
 
diff --git a/tensorflow/python/kernel_tests/atrous_convolution_test.py b/tensorflow/python/kernel_tests/atrous_convolution_test.py
index 6b16fca29d0277e0e5f1f52f6c4a48343a441f67..2fb8a37e2b94bd81409970eb3c485362a17634b6 100644
--- a/tensorflow/python/kernel_tests/atrous_convolution_test.py
+++ b/tensorflow/python/kernel_tests/atrous_convolution_test.py
@@ -110,6 +110,7 @@ class AtrousConvolutionTest(test.TestCase):
 
     add_check(check, y1, y2)
 
+  @test_util.run_v1_only("b/120545219")
   def test_unknown_spatial_dims_for_channel_last_format(self):
     x = array_ops.placeholder(dtypes.float32, [1, None, None, 10])
     w = array_ops.zeros([3, 3, 10, 20])
@@ -117,6 +118,7 @@ class AtrousConvolutionTest(test.TestCase):
         x, w, "VALID", dilation_rate=[2, 2], data_format="NHWC")
     self.assertEqual(y.shape.as_list(), [1, None, None, 20])
 
+  @test_util.run_v1_only("b/120545219")
   def test_unknown_spatial_dims_for_channel_first_format(self):
     x = array_ops.placeholder(dtypes.float32, [1, 10, None, None])
     w = array_ops.zeros([3, 3, 10, 20])
@@ -262,6 +264,7 @@ class AtrousConvolutionTest(test.TestCase):
     err_tolerance = 1e-3
     self.assertLess(err, err_tolerance)
 
+  @test_util.run_v1_only("b/120545219")
   def testGradient(self):
     with self.cached_session():
       for padding in ["SAME", "VALID"]:
diff --git a/tensorflow/python/kernel_tests/base64_ops_test.py b/tensorflow/python/kernel_tests/base64_ops_test.py
index bb903d827f20438396cc3fbdef2cc59883a27345..381f190b8df6d65afaa80654e3d98377a69b9ae3 100644
--- a/tensorflow/python/kernel_tests/base64_ops_test.py
+++ b/tensorflow/python/kernel_tests/base64_ops_test.py
@@ -31,6 +31,7 @@ from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
 
 
+@test_util.run_v1_only("b/120545219")
 class Base64OpsTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
diff --git a/tensorflow/python/kernel_tests/batch_matmul_op_test.py b/tensorflow/python/kernel_tests/batch_matmul_op_test.py
index a0ad8151b26e399b0a2bebfe89bca82f19249df3..c32a6c7e41759ac9abade06bb83be19a7392f2da 100644
--- a/tensorflow/python/kernel_tests/batch_matmul_op_test.py
+++ b/tensorflow/python/kernel_tests/batch_matmul_op_test.py
@@ -20,9 +20,10 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.python.framework import constant_op
+from tensorflow.python import tf2
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import gradient_checker
+from tensorflow.python.ops import gradient_checker_v2
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
@@ -105,36 +106,37 @@ class BatchMatmulOpTest(test.TestCase):
 
   def _testNonEmpty(self, dtype, adjoint_a, adjoint_b, use_static_shape):
 
-    def compareNonEmpty(self, a_shape, b_shape):
+    def CompareNonEmpty(self, a_shape, b_shape):
       self._compare(
           self._rand(a_shape, dtype),
           self._rand(b_shape, dtype), adjoint_a, adjoint_b, use_static_shape)
 
-    compareNonEmpty(self, [1, 2, 3], [1, 3, 5])
-    compareNonEmpty(self, [1, 2, 3], [1, 3, 1])
-    compareNonEmpty(self, [1, 1, 3], [1, 3, 5])
-    compareNonEmpty(self, [1, 2, 3], [1, 3, 5])
-    compareNonEmpty(self, [7, 1, 3], [7, 3, 5])
-    compareNonEmpty(self, [7, 2, 3], [7, 3, 1])
-    compareNonEmpty(self, [7, 2, 3], [7, 3, 5])
-    compareNonEmpty(self, [10, 64, 75], [10, 75, 30])
-    compareNonEmpty(self, [5, 7, 2, 3], [5, 7, 3, 5])
+    CompareNonEmpty(self, [1, 2, 3], [1, 3, 5])
+    CompareNonEmpty(self, [1, 2, 3], [1, 3, 1])
+    CompareNonEmpty(self, [1, 1, 3], [1, 3, 5])
+    CompareNonEmpty(self, [1, 2, 3], [1, 3, 5])
+    CompareNonEmpty(self, [7, 1, 3], [7, 3, 5])
+    CompareNonEmpty(self, [7, 2, 3], [7, 3, 1])
+    CompareNonEmpty(self, [7, 2, 3], [7, 3, 5])
+    CompareNonEmpty(self, [10, 64, 75], [10, 75, 30])
+    CompareNonEmpty(self, [5, 7, 2, 3], [5, 7, 3, 5])
 
   def _testEmpty(self, dtype, adjoint_a, adjoint_b, use_static_shape):
 
-    def compareEmpty(self, a_shape, b_shape):
+    def CompareEmpty(self, a_shape, b_shape):
       self._compare(
           np.zeros(a_shape).astype(dtype),
           np.zeros(b_shape).astype(dtype), adjoint_a, adjoint_b,
           use_static_shape)
 
-    compareEmpty(self, [0, 3, 2], [0, 2, 4])
-    compareEmpty(self, [3, 0, 2], [3, 2, 5])
-    compareEmpty(self, [3, 3, 2], [3, 2, 0])
+    CompareEmpty(self, [0, 3, 2], [0, 2, 4])
+    CompareEmpty(self, [3, 0, 2], [3, 2, 5])
+    CompareEmpty(self, [3, 3, 2], [3, 2, 0])
 
 
 def _GetBatchMatmulOpTest(dtype, adjoint_a, adjoint_b, use_static_shape):
 
+  @test_util.run_v1_only("b/120545219")
   def Test(self):
     np.random.seed(42)
     self._testNonEmpty(dtype, adjoint_a, adjoint_b, use_static_shape)
@@ -154,17 +156,13 @@ class BatchMatmulGradientTest(test.TestCase):
     y = y_in if not adjoint_b else y_in.reshape(y_t_shape)
     epsilon = np.finfo(x.dtype).eps
     delta = epsilon**(1.0 / 3.0)
+    def Loss(x, y):
+      z = math_ops.matmul(x, y, adjoint_a, adjoint_b)
+      return math_ops.reduce_sum(z)
     with self.cached_session(use_gpu=True):
-      inx = constant_op.constant(x)
-      iny = constant_op.constant(y)
-      z = math_ops.matmul(inx, iny, adjoint_a, adjoint_b)
-      loss = math_ops.reduce_sum(z)
-      ((x_jacob_t, x_jacob_n),
-       (y_jacob_t, y_jacob_n)) = gradient_checker.compute_gradient(
-           [inx, iny], [x.shape, y.shape],
-           loss, [1],
-           x_init_value=[x, y],
-           delta=delta)
+      ((x_jacob_t, y_jacob_t),
+       (x_jacob_n, y_jacob_n)) = gradient_checker_v2.compute_gradient(
+           Loss, [x, y], delta=delta)
       tol = 20 * delta
       self.assertAllClose(x_jacob_t, x_jacob_n, rtol=tol, atol=tol)
       self.assertAllClose(y_jacob_t, y_jacob_n, rtol=tol, atol=tol)
@@ -188,6 +186,7 @@ class BatchMatmulGradientTest(test.TestCase):
 
 def _GetBatchMatmulGradientTest(dtype, adjoint_a, adjoint_b):
 
+  @test_util.run_v1_only("b/120545219")
   def Test(self):
     self._compare(1, 2, 3, 5, dtype, adjoint_a, adjoint_b)
     self._compare(3, 4, 7, 10, dtype, adjoint_a, adjoint_b)
@@ -202,11 +201,12 @@ if __name__ == "__main__":
     for adjoint_a_ in False, True:
       for adjoint_b_ in False, True:
         name = "%s_%s_%s" % (dtype_.__name__, adjoint_a_, adjoint_b_)
-        for use_static_shape in True, False:
+        # TF2 does not support placeholders under eager so we skip it
+        for use_static_shape_ in set([True, tf2.enabled()]):
           setattr(BatchMatmulOpTest,
-                  "testBatchMatmulOp_" + name + ("_%s" % use_static_shape),
+                  "testBatchMatmulOp_" + name + ("_%s" % use_static_shape_),
                   _GetBatchMatmulOpTest(dtype_, adjoint_a_, adjoint_b_,
-                                        use_static_shape))
+                                        use_static_shape_))
         if dtype_ is not np.int32:
           setattr(BatchMatmulGradientTest, "testBatchMatmulGradient_" + name,
                   _GetBatchMatmulGradientTest(dtype_, adjoint_a_, adjoint_b_))
diff --git a/tensorflow/python/kernel_tests/batch_scatter_ops_test.py b/tensorflow/python/kernel_tests/batch_scatter_ops_test.py
index eefcdc508f0a651e439006883258eab3466e6780..f70fb93da9d51c1f9838f67977dbbd4aef65562e 100644
--- a/tensorflow/python/kernel_tests/batch_scatter_ops_test.py
+++ b/tensorflow/python/kernel_tests/batch_scatter_ops_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables
@@ -50,7 +51,8 @@ class ScatterTest(test.TestCase):
                         vtype,
                         itype,
                         repeat_indices=False,
-                        updates_are_scalar=False):
+                        updates_are_scalar=False,
+                        method=False):
     np.random.seed(8)
     with self.cached_session(use_gpu=False):
       for indices_shape in (2,), (3, 7), (3, 4, 7):
@@ -71,7 +73,10 @@ class ScatterTest(test.TestCase):
           # Scatter via tensorflow
           ref = variables.Variable(old)
           ref.initializer.run()
-          tf_scatter(ref, indices, updates).eval()
+          if method:
+            ref.batch_scatter_update(ops.IndexedSlices(indices, updates))
+          else:
+            tf_scatter(ref, indices, updates).eval()
           self.assertAllClose(ref.eval(), new)
 
   @test_util.run_deprecated_v1
diff --git a/tensorflow/python/kernel_tests/boosted_trees/quantile_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/quantile_ops_test.py
index 390672febeb839f98ee1c892706d8731f65bfa58..2b9863fb89bac80f6a2f012a3f25c23f993d03ad 100644
--- a/tensorflow/python/kernel_tests/boosted_trees/quantile_ops_test.py
+++ b/tensorflow/python/kernel_tests/boosted_trees/quantile_ops_test.py
@@ -82,7 +82,7 @@ class QuantileOpsTest(test_util.TensorFlowTestCase):
     self.max_elements = 1 << 16
     self.num_quantiles = constant_op.constant(3, dtype=dtypes.int64)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testBasicQuantileBucketsSingleResource(self):
     with self.cached_session() as sess:
       quantile_accumulator_handle = self.create_resource("floats", self.eps,
@@ -107,7 +107,7 @@ class QuantileOpsTest(test_util.TensorFlowTestCase):
       self.assertAllClose(self._feature_0_quantiles, quantiles[0].eval())
       self.assertAllClose(self._feature_1_quantiles, quantiles[1].eval())
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testBasicQuantileBucketsMultipleResources(self):
     with self.cached_session() as sess:
       quantile_accumulator_handle_0 = self.create_resource("float_0", self.eps,
@@ -142,7 +142,7 @@ class QuantileOpsTest(test_util.TensorFlowTestCase):
       self.assertAllClose(self._feature_0_quantiles, quantiles[0].eval())
       self.assertAllClose(self._feature_1_quantiles, quantiles[1].eval())
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testSaveRestoreAfterFlush(self):
     save_dir = os.path.join(self.get_temp_dir(), "save_restore")
     save_path = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")
@@ -175,7 +175,7 @@ class QuantileOpsTest(test_util.TensorFlowTestCase):
       self.assertAllClose(self._feature_0_boundaries, buckets[0].eval())
       self.assertAllClose(self._feature_1_boundaries, buckets[1].eval())
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testSaveRestoreBeforeFlush(self):
     save_dir = os.path.join(self.get_temp_dir(), "save_restore")
     save_path = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")
diff --git a/tensorflow/python/kernel_tests/checkpoint_ops_test.py b/tensorflow/python/kernel_tests/checkpoint_ops_test.py
index b8c8c9edb5ac4ee177f962ba584c6e00dd589ad1..a67461856808b064ff0de485d1fe28e79430c7fb 100644
--- a/tensorflow/python/kernel_tests/checkpoint_ops_test.py
+++ b/tensorflow/python/kernel_tests/checkpoint_ops_test.py
@@ -125,7 +125,7 @@ class LoadAndRemapMatrixTest(test.TestCase):
 
     save = saver.Saver([matrix])
     with self.cached_session() as sess:
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.bundle_file = os.path.join(test.get_temp_dir(), 'bundle_checkpoint')
       save.save(sess, self.bundle_file)
 
@@ -230,6 +230,7 @@ class LoadAndRemapMatrixTest(test.TestCase):
           np.reshape(initializing_values, (num_rows, num_cols)),
           self.evaluate(remapped_matrix))
 
+  @test_util.run_deprecated_v1
   def test_load_and_remap_invalid_remapping(self):
     """Tests that errors are raised when an ID maps to multiple new IDs.
 
@@ -261,6 +262,7 @@ class LoadAndRemapMatrixTest(test.TestCase):
     with self.cached_session(), self.assertRaises(errors.UnimplementedError):
       self.evaluate(remapped_matrix)
 
+  @test_util.run_deprecated_v1
   def test_load_and_remap_incorrect_initializing_values(self):
     """Tests that errors are raised with incorrect number of init values."""
     remapped_matrix = gen_checkpoint_ops.load_and_remap_matrix(
@@ -312,7 +314,7 @@ class LoadAndRemapMatrixWithMaxRowsTest(test.TestCase):
     with self.cached_session() as sess:
       ckpt_path = os.path.join(test.get_temp_dir(), 'temp_ckpt')
       save = saver.Saver([matrix])
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       save.save(sess, ckpt_path)
       num_rows, num_cols = np_value.shape
 
diff --git a/tensorflow/python/kernel_tests/cholesky_op_test.py b/tensorflow/python/kernel_tests/cholesky_op_test.py
index f3947236b1f71eebb6517abbfde25e4e5f9efcc5..a08cfe960d005451ab5a02aff02e90a0fbcb92a0 100644
--- a/tensorflow/python/kernel_tests/cholesky_op_test.py
+++ b/tensorflow/python/kernel_tests/cholesky_op_test.py
@@ -155,6 +155,7 @@ class CholeskyOpTest(test.TestCase):
           np.array([[[1., 2., 3.], [3., 4., 5.]], [[1., 2., 3.], [3., 4., 5.]]
                    ]))
 
+  @test_util.run_v1_only("b/120545219")
   def testWrongDimensions(self):
     tensor3 = constant_op.constant([1., 2.])
     with self.assertRaises(ValueError):
@@ -233,6 +234,7 @@ class CholeskyGradTest(test.TestCase):
     self.runFiniteDifferences(
         shapes, dtypes=(dtypes_lib.float64,), scalarTest=True)
 
+  @test_util.run_v1_only("b/120545219")
   def testTwoBlockMatrixComplexFloat(self):
     np.random.seed(0)
     shapes = self.getShapes([2 * self._backprop_block_size + 1])
diff --git a/tensorflow/python/kernel_tests/cond_v2_test.py b/tensorflow/python/kernel_tests/cond_v2_test.py
index 1f4b37ce2a4a2d88f0c0d306fd6bb46d0cc99633..8fe3ba41e27aa101fd4f2e3b41b0a0b226471047 100644
--- a/tensorflow/python/kernel_tests/cond_v2_test.py
+++ b/tensorflow/python/kernel_tests/cond_v2_test.py
@@ -170,8 +170,8 @@ class CondV2Test(test.TestCase):
         self.assertRegexpMatches(
             cond2_op.get_attr("else_branch").name, r"foo_cond_1_false_\d*")
 
+  @test_util.run_v1_only("b/120545219")
   def testDefunInCond(self):
-    self.skipTest("b/117293122")
     x = constant_op.constant(1.0, name="x")
     y = constant_op.constant(2.0, name="y")
 
@@ -190,9 +190,8 @@ class CondV2Test(test.TestCase):
     self._testCond(true_fn, false_fn, [x, y])
     self._testCond(true_fn, false_fn, [y])
 
+  @test_util.run_deprecated_v1
   def testNestedDefunInCond(self):
-    self.skipTest("b/117284369")
-
     x = constant_op.constant(1.0, name="x")
     y = constant_op.constant(2.0, name="y")
 
@@ -216,9 +215,8 @@ class CondV2Test(test.TestCase):
     self._testCond(true_fn, false_fn, [x, y])
     self._testCond(true_fn, false_fn, [y])
 
+  @test_util.run_deprecated_v1
   def testDoubleNestedDefunInCond(self):
-    self.skipTest("b/117284369")
-
     x = constant_op.constant(1.0, name="x")
     y = constant_op.constant(2.0, name="y")
 
@@ -778,6 +776,26 @@ class CondV2Test(test.TestCase):
     self.assertAllEqual(
         self.evaluate(output_t), [-5, -4, -3, -2, -1, 0, 1, 4, 9, 16])
 
+  @test_util.run_deprecated_v1
+  def testForwardPassRewrite(self):
+    x = constant_op.constant(1.0, name="x")
+    output = cond_v2.cond_v2(constant_op.constant(True),
+                             lambda: x * 2.0,
+                             lambda: x)
+    if_op = output.op.inputs[0].op
+    self.assertEqual(if_op.type, "If")
+    # pylint: disable=g-deprecated-assert
+    self.assertEqual(len(if_op.outputs), 1)
+
+    gradients_impl.gradients(output, x)
+    # if_op should have been rewritten to output 2.0 intermediate.
+    self.assertEqual(len(if_op.outputs), 2)
+
+    gradients_impl.gradients(output, x)
+    # Computing the gradient again shouldn't rewrite if_op again.
+    self.assertEqual(len(if_op.outputs), 2)
+    # pylint: enable=g-deprecated-assert
+
 
 class CondV2CollectionTest(test.TestCase):
 
diff --git a/tensorflow/python/kernel_tests/conditional_accumulator_test.py b/tensorflow/python/kernel_tests/conditional_accumulator_test.py
index 5847e4639bb37a82f7d5ec38f9eab434891da7e9..32a20587508b7b5b4f0eeda248f6bb0e55f34c1f 100644
--- a/tensorflow/python/kernel_tests/conditional_accumulator_test.py
+++ b/tensorflow/python/kernel_tests/conditional_accumulator_test.py
@@ -199,7 +199,7 @@ class ConditionalAccumulatorTest(test.TestCase):
           is_all_equal &= (val[i][j] == elems_ave[i][j])
       self.assertTrue(is_all_equal)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testAccumulatorWrongDynamicShape(self):
     with self.cached_session() as sess:
       q = data_flow_ops.ConditionalAccumulator(
@@ -321,7 +321,7 @@ class ConditionalAccumulatorTest(test.TestCase):
           shape=tensor_shape.TensorShape([1]),
           reduction_type="Invalid")
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testAccumulatorInvalidTakeGrad(self):
     with self.cached_session():
       q = data_flow_ops.ConditionalAccumulator(
@@ -408,7 +408,7 @@ class ConditionalAccumulatorTest(test.TestCase):
 
       set_global_step_op = q.set_global_step(new_global_step)
 
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       for _ in range(3):
         set_global_step_op.run()
         self.evaluate(inc_global_step)
@@ -435,7 +435,7 @@ class ConditionalAccumulatorTest(test.TestCase):
                                    if x >= ls) / sum(1 for x in local_steps
                                                      if x >= ls), val)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testParallelApplyGrad(self):
     with self.cached_session() as sess:
       q = data_flow_ops.ConditionalAccumulator(
@@ -461,7 +461,7 @@ class ConditionalAccumulatorTest(test.TestCase):
 
       self.assertEqual(val, sum(elems) / len(elems))
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testParallelTakeGrad(self):
     with self.cached_session() as sess:
       q = data_flow_ops.ConditionalAccumulator(
@@ -494,7 +494,7 @@ class ConditionalAccumulatorTest(test.TestCase):
 
       self.assertItemsEqual(elems, results)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testAccumulatorApplyAndBlockingTake(self):
     with self.cached_session() as sess:
       q = data_flow_ops.ConditionalAccumulator(
@@ -528,7 +528,7 @@ class ConditionalAccumulatorTest(test.TestCase):
     with self.assertRaisesOpError("was cancelled"):
       self.evaluate(takeg_op)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testAccumulatorCancel(self):
     with self.cached_session() as sess:
       q = data_flow_ops.ConditionalAccumulator(
diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
index f1efc5ce59d3c11afe48197e872c4dd92eaad1ea..39ceb0d7495678cc1d749f34804ee3287ba125ce 100644
--- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
+++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
@@ -33,6 +33,7 @@ from tensorflow.python.client import device_lib
 from tensorflow.python.client import session
 from tensorflow.python.eager import context
 from tensorflow.python.eager import function as eager_function
+from tensorflow.python.eager import wrap_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
@@ -43,6 +44,7 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import control_flow_util
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import gen_array_ops
@@ -129,6 +131,7 @@ def isum(s, maximum_iterations=None):
 @test_util.with_control_flow_v2
 class ControlFlowTest(test.TestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testRefIdentity(self):
     with self.cached_session():
       v = variables.VariableV1(7)
@@ -138,10 +141,10 @@ class ControlFlowTest(test.TestCase):
       v2 = control_flow_ops.with_dependencies([op], v)
 
       self.assertTrue(isinstance(v2, ops.Tensor))
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertEqual(9, self.evaluate(v2))
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testRefEnter(self):
     with self.cached_session():
       v = variables.VariableV1(7)
@@ -152,10 +155,10 @@ class ControlFlowTest(test.TestCase):
       op = state_ops.assign(enter_v, enter_nine)
       v2 = control_flow_ops.with_dependencies([op], enter_v)
       v3 = control_flow_ops.exit(v2)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertEqual(9, self.evaluate(v3))
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testRefSwitch(self):
     with self.cached_session():
       v = variables.VariableV1(7)
@@ -163,7 +166,7 @@ class ControlFlowTest(test.TestCase):
       p = constant_op.constant(True)
       v1 = control_flow_ops._SwitchRefOrTensor(v._ref(), p)  # pylint: disable=protected-access
       v2 = state_ops.assign(v1[1], 9)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertEqual(9, self.evaluate(v2))
 
   def testEnterMulExit(self):
@@ -193,6 +196,7 @@ class ControlFlowTest(test.TestCase):
           v, "frame2", is_constant=False)
       self.assertEqual(enter_v_non_constant.shape, None)
 
+  @test_util.run_v1_only("b/120545219")
   def testSwitchMergeIndexedSlices(self):
     with self.cached_session():
       values = constant_op.constant([1, 2, 3, 4, 5, 6])
@@ -202,11 +206,12 @@ class ControlFlowTest(test.TestCase):
       switch_op = control_flow_ops.switch(data, pred)
       merge_op = control_flow_ops.merge(switch_op)[0]
 
-      val = merge_op.values.eval()
-      ind = merge_op.indices.eval()
+      val = merge_op.values
+      ind = merge_op.indices
     self.assertAllEqual(np.arange(1, 7), val)
     self.assertAllEqual(np.arange(0, 12, 2), ind)
 
+  @test_util.run_v1_only("b/120545219")
   def testSwitchDeadBranch(self):
     with self.cached_session():
       data = constant_op.constant([1, 2, 3, 4, 5, 6], name="data")
@@ -219,6 +224,7 @@ class ControlFlowTest(test.TestCase):
           lambda e: "Retval[0] does not have value" in str(e)):
         self.evaluate(dead_branch)
 
+  @test_util.run_v1_only("b/120545219")
   def testSwitchMergeLess(self):
     with self.cached_session():
       data = constant_op.constant([1, 2, 3, 4, 5, 6], name="data")
@@ -231,6 +237,7 @@ class ControlFlowTest(test.TestCase):
       result = self.evaluate(merge_op)
     self.assertAllEqual(np.arange(1, 7), result)
 
+  @test_util.run_v1_only("b/120545219")
   def testSwitchMergeAddIdentity(self):
     with self.cached_session():
       data = constant_op.constant([1, 2, 3, 4, 5, 6], name="data")
@@ -244,6 +251,7 @@ class ControlFlowTest(test.TestCase):
       result = self.evaluate(merge_op)
     self.assertAllEqual(np.array([x + 1 for x in [1, 2, 3, 4, 5, 6]]), result)
 
+  @test_util.run_v1_only("b/120545219")
   def testSwitchMergeAddMul(self):
     with self.cached_session():
       data = constant_op.constant([1, 2, 3, 4, 5, 6], name="data")
@@ -258,6 +266,7 @@ class ControlFlowTest(test.TestCase):
       result = self.evaluate(merge_op)
     self.assertAllEqual(np.array([x * 5 for x in [1, 2, 3, 4, 5, 6]]), result)
 
+  @test_util.run_v1_only("b/120545219")
   def testLoop_false(self):
     with self.cached_session():
       false = ops.convert_to_tensor(False)
@@ -302,6 +311,7 @@ class ControlFlowTest(test.TestCase):
       result = self.evaluate(exit_i)
     self.assertAllEqual(10, result)
 
+  @test_util.run_v1_only("b/120545219")
   def testLoop_2(self):
     with self.cached_session():
       zero = constant_op.constant(0)
@@ -328,6 +338,7 @@ class ControlFlowTest(test.TestCase):
       result = self.evaluate(exit_i)
     self.assertAllEqual(10, result)
 
+  @test_util.run_v1_only("b/120545219")
   def testDifferentFrame(self):
     with self.cached_session():
       data = array_ops.placeholder(dtypes.float32, shape=[])
@@ -362,6 +373,7 @@ class ControlFlowTest(test.TestCase):
         lambda: math_ops.subtract(x, 1.))
     self.assertEqual(b.shape, tensor_shape.scalar())
 
+  @test_util.run_v1_only("b/120545219")
   def testFetchable(self):
     with self.cached_session() as sess:
       x = array_ops.placeholder(dtypes.float32)
@@ -378,6 +390,7 @@ class ControlFlowTest(test.TestCase):
               sess.run(t, feed_dict={x: 3})
 
   @test_util.disable_control_flow_v2("Not relevant")
+  @test_util.run_v1_only("b/120545219")
   def testFeedable(self):
     with self.cached_session() as sess:
       c = constant_op.constant(2)
@@ -395,6 +408,7 @@ class ControlFlowTest(test.TestCase):
             with self.assertRaisesRegexp(ValueError, "may not be fed"):
               sess.run(r, feed_dict={t: 3})
 
+  @test_util.run_v1_only("b/120545219")
   def testCondIndexedSlices(self):
     with self.cached_session():
       values = constant_op.constant(10)
@@ -405,11 +419,12 @@ class ControlFlowTest(test.TestCase):
       fn2 = lambda: ops.IndexedSlices(math_ops.subtract(x.values, 1), indices)
       r = control_flow_ops.cond(pred, fn1, fn2)
 
-      val = r.values.eval()
-      ind = r.indices.eval()
+      val = r.values
+      ind = r.indices
     self.assertAllEqual(11, val)
     self.assertAllEqual(0, ind)
 
+  @test_util.run_v1_only("b/120545219")
   def testCondSparseTensor(self):
     with self.cached_session():
       values = constant_op.constant([2.0, 4.0], name="values")
@@ -423,15 +438,16 @@ class ControlFlowTest(test.TestCase):
       fn2 = lambda: sparse_tensor.SparseTensor(
           indices, x.values - 1, dense_shape=shape)
       r = control_flow_ops.cond(pred, fn1, fn2)
-      self.assertAllEqual([3.0, 5.0], r.values.eval())
-      self.assertAllEqual([[1], [4]], r.indices.eval())
+      self.assertAllEqual([3.0, 5.0], r.values)
+      self.assertAllEqual([[1], [4]], r.indices)
       self.assertAllEqual(r.values.get_shape(), (2,))
 
+  @test_util.run_v1_only("b/120545219")
   def testCondResource(self):
 
     with self.cached_session():
       rv = resource_variable_ops.ResourceVariable(True)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       t = ops.convert_to_tensor(1.0)
 
       def case():
@@ -439,8 +455,10 @@ class ControlFlowTest(test.TestCase):
         with ops.control_dependencies([assign]):
           return array_ops.identity(t)
 
-      self.assertEqual(1.0, control_flow_ops.cond(rv, case, lambda: t).eval())
+      self.assertEqual(
+          1.0, self.evaluate(control_flow_ops.cond(rv, case, lambda: t)))
 
+  @test_util.run_v1_only("b/120545219")
   def testCondWithTensorArrayGrad(self):
     with self.cached_session() as sess:
       with ops.device(test.gpu_device_name()):
@@ -455,6 +473,7 @@ class ControlFlowTest(test.TestCase):
       self.assertAllEqual(sess.run(g, {pred: False}), [0.0, 0.0, 0.0])
 
   @test_util.disable_control_flow_v2("b/113293074")
+  @test_util.run_v1_only("b/120545219")
   def testCondIndexedSlicesDifferentTypes(self):
     with self.cached_session():
       values = constant_op.constant(10)
@@ -466,12 +485,13 @@ class ControlFlowTest(test.TestCase):
       fn2 = lambda: ops.IndexedSlices(math_ops.subtract(x.values, 1), i_64)
       r = control_flow_ops.cond(pred, fn1, fn2)
 
-      val = r.values.eval()
-      ind = r.indices.eval()
+      val = r.values
+      ind = r.indices
     self.assertAllEqual(11, val)
     self.assertAllEqual(0, ind)
     self.assertTrue(ind.dtype == np.int64)
 
+  @test_util.run_v1_only("b/120545219")
   def testCondColocation(self):
     with self.session(use_gpu=True):
       with ops.device("/cpu:0"):
@@ -547,8 +567,8 @@ class ControlFlowTest(test.TestCase):
 
     if not context.executing_eagerly():
       with self.cached_session():
-        variables.global_variables_initializer().run()
-        result = f().eval()
+        self.evaluate(variables.global_variables_initializer())
+        result = self.evaluate(f())
         self.assertEqual(True, result)
         # Only second cond result was fetched, so v1 assign shouldn't run.
         self.assertEqual(7, self.evaluate(v1))
@@ -576,6 +596,7 @@ class ControlFlowTest(test.TestCase):
         alive, count = body(i)
       self.assertAllEqual(4, self.evaluate(count))
 
+  @test_util.run_v1_only("b/120545219")
   def testCond_6(self):
     with self.cached_session():
       v1 = variables.Variable([7])
@@ -586,7 +607,7 @@ class ControlFlowTest(test.TestCase):
       fn2 = lambda: v1
       r = control_flow_ops.cond(pred, fn1, fn2)
 
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       result = self.evaluate(r)
       self.assertAllEqual(np.array([7]), result)
 
@@ -671,6 +692,7 @@ class ControlFlowTest(test.TestCase):
       test_result = self.evaluate(r)
       self.assertDictEqual({"a": {"c": 210}, "b": {"d": 210}}, test_result)
 
+  @test_util.run_v1_only("b/120545219")
   def testCheckNestedOutputStruct(self):
     with self.cached_session() as sess:
       x = constant_op.constant(10)
@@ -681,7 +703,8 @@ class ControlFlowTest(test.TestCase):
       v1_msg = "The two structures don't have the same nested structure"
       v2_msg = "Outputs of true_fn and false_fn must have the same structure"
       with self.assertRaisesRegexp(
-          ValueError, v2_msg if control_flow_ops.ENABLE_COND_V2 else v1_msg):
+          ValueError,
+          v2_msg if control_flow_util.ENABLE_CONTROL_FLOW_V2 else v1_msg):
         r = control_flow_ops.cond(pred, fn1, fn2)
         self.evaluate(r)
 
@@ -701,7 +724,7 @@ class ControlFlowTest(test.TestCase):
       self.assertAllEqual([2.0], self.evaluate(r))
 
   @test_util.disable_control_flow_v2("b/79881896 (control deps)")
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testCondWithControl(self):
     with self.cached_session():
       control_holder = array_ops.placeholder(dtypes.float32, shape=())
@@ -717,6 +740,7 @@ class ControlFlowTest(test.TestCase):
           lambda: constant_op.constant(1))
       self.assertEqual(5, self.evaluate(r))
 
+  @test_util.run_v1_only("b/120545219")
   def testUninitializedRefIdentity(self):
     with self.cached_session() as sess:
       v = gen_state_ops.variable(
@@ -771,6 +795,7 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.cond(pred, fn1, fn2)
       self.evaluate(r)
 
+  @test_util.run_v1_only("b/120545219")
   def testCondGrad_1(self):
     with self.cached_session():
       x = constant_op.constant(10.0, name="x")
@@ -831,20 +856,21 @@ class ControlFlowTest(test.TestCase):
       with ops.device("/cpu:1"):
         grad = gradients_impl.gradients(z, x)[0]
 
-      self.assertEqual(sess.run(grad, {pred: True, x: 1.0, y: 2.0}), 4.0)
-      self.assertEqual(sess.run(grad, {pred: False, x: 1.0, y: 2.0}), 0.0)
-
       with ops.device("/cpu:0"):
         grad_grad = gradients_impl.gradients(grad, x)[0]
 
+      self.assertEqual(sess.run(grad, {pred: True, x: 1.0, y: 2.0}), 4.0)
+      self.assertEqual(sess.run(grad, {pred: False, x: 1.0, y: 2.0}), 0.0)
+
       # v1 control flow gets None second derivative for some reason.
-      if not control_flow_ops.ENABLE_COND_V2:
+      if not control_flow_util.ENABLE_CONTROL_FLOW_V2:
         self.assertIsNone(grad_grad)
         return
 
       self.assertEqual(sess.run(grad_grad, {pred: True, x: 1.0, y: 2.0}), 0.0)
       self.assertEqual(sess.run(grad_grad, {pred: False, x: 1.0, y: 2.0}), 0.0)
 
+  @test_util.run_v1_only("b/120545219")
   def testNestedCond_Simple(self):
     with self.cached_session():
       x = constant_op.constant(0., name="X")
@@ -861,7 +887,7 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual(1.0, self.evaluate(result))
 
   @test_util.disable_control_flow_v2("b/113327884")
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testCondGrad_Gather(self):
     with self.cached_session() as sess:
       v1 = variables.Variable([1.0, 42.0])
@@ -871,7 +897,7 @@ class ControlFlowTest(test.TestCase):
       fn2 = lambda: array_ops.gather(v1, [1, 1])
       r = control_flow_ops.cond(pred, fn1, fn2)
       grad = gradients_impl.gradients(r, [v1])[0]
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       # Should just be [1, 1], but possibly a sparse representation
       gv, gi = sess.run([grad.values, grad.indices], feed_dict={c: 1})
       dense_gv = [
@@ -885,6 +911,7 @@ class ControlFlowTest(test.TestCase):
       ]
       self.assertAllEqual(dense_gv, [0.0, 2.0])
 
+  @test_util.run_v1_only("b/120545219")
   def testCondPredicateTensor(self):
     """Regression test for lowering predicate from non-first output of an op."""
 
@@ -895,9 +922,8 @@ class ControlFlowTest(test.TestCase):
     r = control_flow_ops.cond(foo()[1], lambda: 1.0, lambda: 2.0)
     self.assertEqual(self.evaluate(r), 1.0)
 
-  # TODO(b/117945658): reenable
   @test_util.run_in_graph_and_eager_modes
-  def DISABLED_testCondAutoControlDeps(self):
+  def testCondAutoControlDeps(self):
 
     def branch_fn():
       logging_ops.print_v2("A")
@@ -917,16 +943,16 @@ class ControlFlowTest(test.TestCase):
     if not context.executing_eagerly():
       with self.cached_session():
         with self.captureWritesToStream(sys.stderr) as printed:
-          self.assertEqual(build_cond().eval(), 10)
+          self.assertEqual(self.evaluate(build_cond()), 10)
         self.assertEqual(printed.contents(), "C\n")
 
         with self.captureWritesToStream(sys.stderr) as printed:
-          self.assertEqual(build_nested_cond().eval(), 10)
+          self.assertEqual(self.evaluate(build_nested_cond()), 10)
         self.assertEqual(printed.contents(), "C\n")
 
     # In defuns, all prints should execute in program order.
     # This doesn't work with legacy control flow.
-    if control_flow_ops.ENABLE_COND_V2:
+    if control_flow_util.ENABLE_CONTROL_FLOW_V2:
 
       @eager_function.defun
       def cond():
@@ -944,9 +970,28 @@ class ControlFlowTest(test.TestCase):
         self.assertEqual(self.evaluate(nested_cond()), 10)
       self.assertEqual(printed.contents(), "A\nB\nC\n")
 
-  # TODO(b/117945658): reenable
+    # wrap_function should prune.
+    def pruned_cond():
+      return build_cond()
+    pruned_cond = wrap_function.wrap_function(pruned_cond, [])
+
+    with self.captureWritesToStream(sys.stderr) as printed:
+      self.assertEqual(self.evaluate(pruned_cond()), 10)
+    self.assertEqual(printed.contents(), "C\n")
+
+    def pruned_nested_cond():
+      return build_nested_cond()
+    pruned_nested_cond = wrap_function.wrap_function(pruned_nested_cond, [])
+
+    with self.captureWritesToStream(sys.stderr) as printed:
+      self.assertEqual(self.evaluate(pruned_nested_cond()), 10)
+    self.assertEqual(printed.contents(), "C\n")
+
   @test_util.run_in_graph_and_eager_modes
-  def DISABLED_testWhileAutoControlDeps(self):
+  def testWhileAutoControlDeps(self):
+    # Legacy while_loop fails this test because it produces deprecation notices
+    # in stderr.
+    if not control_flow_util.ENABLE_CONTROL_FLOW_V2: return
 
     def cond(i, unused_x):
       logging_ops.print_v2("A")
@@ -965,40 +1010,56 @@ class ControlFlowTest(test.TestCase):
 
     def build_nested_while():
       return control_flow_ops.cond(
-          constant_op.constant(True), build_while, lambda: (0, 0))
+          constant_op.constant(True), build_while, lambda: [0, 0])
 
     # In v1 graph mode, pruning should make only "D" print.
     if not context.executing_eagerly():
       with self.cached_session():
         with self.captureWritesToStream(sys.stderr) as printed:
-          self.assertEqual(build_while()[0].eval(), 2)
+          self.assertEqual(self.evaluate(build_while()[0]), 2)
         self.assertEqual(printed.contents(), "D\nD\n")
 
         with self.captureWritesToStream(sys.stderr) as printed:
-          self.assertEqual(build_nested_while()[0].eval(), 2)
+          self.assertEqual(self.evaluate(build_nested_while()[0]), 2)
         self.assertEqual(printed.contents(), "D\nD\n")
 
     # In defuns, all prints should execute in program order.
-    # This doesn't work with legacy control flow.
-    if control_flow_ops.ENABLE_WHILE_V2:
+    @eager_function.defun
+    def while_loop():
+      return build_while()[0]
 
-      @eager_function.defun
-      def while_loop():
-        return build_while()[0]
+    with self.captureWritesToStream(sys.stderr) as printed:
+      self.assertEqual(self.evaluate(while_loop()), 2)
+    self.assertEqual(printed.contents(), "A\nB\nC\nD\nA\nB\nC\nD\nA\n")
 
+    @eager_function.defun
+    def nested_while_loop():
+      return build_nested_while()[0]
+
+    # TODO(b/117840611): calling nested_while_loop fails in eager
+    if not context.executing_eagerly():
       with self.captureWritesToStream(sys.stderr) as printed:
-        self.assertEqual(self.evaluate(while_loop()), 2)
+        self.assertEqual(self.evaluate(nested_while_loop()), 2)
       self.assertEqual(printed.contents(), "A\nB\nC\nD\nA\nB\nC\nD\nA\n")
 
-      @eager_function.defun
-      def nested_while_loop():
-        return build_nested_while()[0]
+    # wrap_function should prune.
+    def pruned_while():
+      return build_while()[0]
+    pruned_while = wrap_function.wrap_function(pruned_while, [])
 
-      # TODO(b/117840611): calling nested_while_loop fails in eager
-      if not context.executing_eagerly():
-        with self.captureWritesToStream(sys.stderr) as printed:
-          self.assertEqual(self.evaluate(nested_while_loop()), 2)
-        self.assertEqual(printed.contents(), "A\nB\nC\nD\nA\nB\nC\nD\nA\n")
+    with self.captureWritesToStream(sys.stderr) as printed:
+      self.assertEqual(self.evaluate(pruned_while()), 2)
+    self.assertEqual(printed.contents(), "D\nD\n")
+
+    def pruned_nested_while():
+      return build_nested_while()[0]
+    pruned_nested_while = wrap_function.wrap_function(pruned_nested_while, [])
+
+    # TODO(b/117840611): calling nested_while_loop fails in eager
+    if not context.executing_eagerly():
+      with self.captureWritesToStream(sys.stderr) as printed:
+        self.assertEqual(self.evaluate(pruned_nested_while()), 2)
+      self.assertEqual(printed.contents(), "D\nD\n")
 
   # Microbenchmark: 256,000 iterations/s.
   @test_util.disable_control_flow_v2("b/116630618 (Times out)")
@@ -1011,6 +1072,7 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual(10000, self.evaluate(r))
 
   @test_util.disable_control_flow_v2("b/79881896 (control deps)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileExternalControlDependencies(self):
     with self.cached_session():
       v = variables.Variable(0.0)
@@ -1023,10 +1085,11 @@ class ControlFlowTest(test.TestCase):
 
       result = control_flow_ops.while_loop(cond=lambda i: i < 2,
                                            body=body_fn, loop_vars=[1])
-      self.assertAllEqual(result.eval(), 2)
-      self.assertAllEqual(v.eval(), 1.0)
+      self.assertAllEqual(result, 2)
+      self.assertAllEqual(v.read_value(), 1.0)
 
   @test_util.disable_control_flow_v2("b/79881896 (control deps)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileExternalControlDependenciesNoInput(self):
     with self.cached_session():
       v = variables.Variable(0.0)
@@ -1040,10 +1103,10 @@ class ControlFlowTest(test.TestCase):
       result = control_flow_ops.while_loop(cond=lambda i: i < 5,
                                            body=body_fn, loop_vars=[0])
       self.evaluate(result)
-      self.assertAllEqual(v.eval(), 1.0)
+      self.assertAllEqual(self.evaluate(v), 1.0)
 
   @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testWhileWithRefs_1(self):
     with self.cached_session() as sess:
       x = variables.VariableV1(0)._ref()  # pylint: disable=protected-access
@@ -1058,7 +1121,7 @@ class ControlFlowTest(test.TestCase):
 
       r = control_flow_ops.while_loop(c, b, [i, x], parallel_iterations=5)
 
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       self.assertEqual(r[0].dtype, dtypes.int32)
       self.assertEqual(r[1].dtype, dtypes.int32_ref)
@@ -1080,6 +1143,7 @@ class ControlFlowTest(test.TestCase):
       r = isum(s, maximum_iterations=3)
       self.assertAllEqual([1 + 3, 2 + 3, 3 + 3, 4 + 3, 5 + 3], self.evaluate(r))
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileWithMaximumIterationsAndSingleArgument(self):
     with self.cached_session():
       r = control_flow_ops.while_loop(
@@ -1087,6 +1151,7 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual(1, self.evaluate(r))
 
   @test_util.disable_control_flow_v2("b/115776323 (max_iters)")
+  @test_util.run_v1_only("b/120545219")
   def testSingleNestedMaximumIterationsWhileLoopGradientInXLAContext(self):
     v = constant_op.constant(1.0)
 
@@ -1112,6 +1177,7 @@ class ControlFlowTest(test.TestCase):
     # Should execute without issue.
     self.assertEqual(3, self.evaluate(loop_execute))
 
+  @test_util.run_v1_only("b/120545219")
   def testInvalidMaximumIterationsWhileLoopGradientInXLAContext(self):
     v = constant_op.constant(1.0)
 
@@ -1133,7 +1199,7 @@ class ControlFlowTest(test.TestCase):
     gs = gradients_impl.gradients(loop_no_xla, v)
     self.evaluate(gs)  # This should execute without error.
 
-    if control_flow_ops.ENABLE_WHILE_V2:
+    if control_flow_util.ENABLE_CONTROL_FLOW_V2:
       xla_context = control_flow_ops.XLAControlFlowContext()
       xla_context.Enter()
       with self.assertRaisesRegexp(
@@ -1172,6 +1238,7 @@ class ControlFlowTest(test.TestCase):
           r"context '.*' \(currently defined in '.*'\)"):
         _ = gradients_impl.gradients(loop_with_maxiter, v)
 
+  @test_util.run_v1_only("b/120545219")
   def testInvalidMaximumIterationsFromSiblingContextWhileLoopInXLAContext(self):
     v = constant_op.constant(1.0)
 
@@ -1190,7 +1257,7 @@ class ControlFlowTest(test.TestCase):
           lambda i, x: (i + 1, v * x), (0, 1.0),
           maximum_iterations=max_iter_holder[0])
 
-    if control_flow_ops.ENABLE_WHILE_V2:
+    if control_flow_util.ENABLE_CONTROL_FLOW_V2:
       xla_context = control_flow_ops.XLAControlFlowContext()
       xla_context.Enter()
       with self.assertRaisesRegexp(
@@ -1215,6 +1282,7 @@ class ControlFlowTest(test.TestCase):
         _ = gradients_impl.gradients(loop, v)
 
   @test_util.disable_control_flow_v2("b/118457764")
+  @test_util.run_v1_only("b/120545219")
   def testNestedWhileLoopWithMaxItersFromOuterContextInXLAContext(self):
     v = constant_op.constant(1.0)
 
@@ -1301,7 +1369,7 @@ class ControlFlowTest(test.TestCase):
       d = ops.convert_to_tensor(100)
       r = control_flow_ops.while_loop(lambda i, m, c, o: math_ops.less(i, d),
                                       compute, [i, m, c, o])
-      result = r[3].eval()
+      result = r[3]
     self.assertAllEqual(10100, result)
 
   @test_util.run_deprecated_v1
@@ -1323,9 +1391,10 @@ class ControlFlowTest(test.TestCase):
       s = array_ops.size(x)
       r = control_flow_ops.while_loop(lambda i, m, c, o: math_ops.less(i, s),
                                       compute, [i, m, c, o])
-      result = r[3].eval()
+      result = r[3]
     self.assertAllEqual(42, result)
 
+  @test_util.run_v1_only("b/120545219")
   def testWhile_5(self):
     with self.cached_session():
 
@@ -1347,10 +1416,11 @@ class ControlFlowTest(test.TestCase):
                                           tensor_shape.unknown_shape(),
                                           tensor_shape.unknown_shape()
                                       ])
-      result = r[2].eval()
+      result = r[2]
     self.assertAllEqual(np.array([0, 1, 2, 3, 4, 5, 6]), result)
 
   @test_util.disable_control_flow_v2("b/116338794 (buffer_reuse)")
+  @test_util.run_v1_only("b/120545219")
   def testBufferForwarding(self):
     run_options = config_pb2.RunOptions(
         trace_level=config_pb2.RunOptions.FULL_TRACE)
@@ -1435,6 +1505,7 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.while_loop(c, b, [n], parallel_iterations=20)
       self.assertEqual([10000], self.evaluate(r))
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileShapeInference(self):
     with self.cached_session():
       i = constant_op.constant(0)
@@ -1461,6 +1532,7 @@ class ControlFlowTest(test.TestCase):
         r = control_flow_ops.while_loop(c, b, [i, m])
 
   @test_util.disable_control_flow_v2("b/116328420 (SparseTensor)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileShapeInferenceSparseTensor(self):
     with self.cached_session():
       values = constant_op.constant([2.0, 4.0], name="values")
@@ -1480,12 +1552,12 @@ class ControlFlowTest(test.TestCase):
         ]
 
       _, r = control_flow_ops.while_loop(c, b, [i, x])
-      self.assertEqual(r.dense_shape.get_shape()[0].value, 1)
+      self.assertEqual(r.dense_shape.get_shape()[0], 1)
 
       _, r = control_flow_ops.while_loop(
           c, b, [i, x],
           [i.get_shape(), tensor_shape.TensorShape([None])])
-      self.assertTrue(r.dense_shape.get_shape()[0].value is None)
+      self.assertEqual(r.dense_shape.get_shape().as_list(), [None])
 
       with self.assertRaisesRegexp(ValueError, "is not compatible with"):
         _, r = control_flow_ops.while_loop(
@@ -1493,7 +1565,7 @@ class ControlFlowTest(test.TestCase):
             [i.get_shape(), tensor_shape.TensorShape([5])])
 
   @test_util.disable_control_flow_v2("b/116282023 (IndexedSlices)")
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testWhileShapeInferenceIndexedSlices(self):
     with self.cached_session():
       values = constant_op.constant([[2.0, 4.0], [3.0, 5.0]], name="values")
@@ -1512,15 +1584,14 @@ class ControlFlowTest(test.TestCase):
         ]
 
       _, r = control_flow_ops.while_loop(c, b, [i, x])
-      self.assertEqual(r.dense_shape.get_shape()[0].value, 2)
+      self.assertEqual(r.dense_shape.get_shape()[0], 2)
       self.assertEqual(r.values.get_shape(), tensor_shape.TensorShape([2, 2]))
 
       _, r = control_flow_ops.while_loop(
           c, b, [i, x],
           [i.get_shape(), tensor_shape.TensorShape([None, 2])])
-      self.assertEqual(r.dense_shape.get_shape()[0].value, 2)
-      self.assertTrue(r.values.get_shape()[0].value is None)
-      self.assertEqual(r.values.get_shape()[1].value, 2)
+      self.assertEqual(r.dense_shape.get_shape()[0], 2)
+      self.assertEqual(r.values.get_shape().as_list(), [None, 2])
 
       with self.assertRaisesRegexp(ValueError, "is not compatible with"):
         _, r = control_flow_ops.while_loop(
@@ -1584,6 +1655,7 @@ class ControlFlowTest(test.TestCase):
     self._testNestedWhile_2(use_gpu=False)
     self._testNestedWhile_2(use_gpu=True)
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileWithControl_1(self):
     with self.cached_session():
       n = constant_op.constant(0)
@@ -1598,7 +1670,7 @@ class ControlFlowTest(test.TestCase):
 
       res = control_flow_ops.while_loop(
           condition, body, [n, r], parallel_iterations=1)
-      self.assertAllEqual(12, res[1].eval())
+      self.assertAllEqual(12, res[1])
 
   @test_util.run_deprecated_v1
   def testWhileWithControl_2(self):
@@ -1615,6 +1687,7 @@ class ControlFlowTest(test.TestCase):
           condition, body, [r], parallel_iterations=1)
       self.assertAllEqual(12, self.evaluate(res))
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileWithControl_3(self):
     with self.cached_session() as sess:
       b = array_ops.placeholder(dtypes.bool)
@@ -1624,6 +1697,7 @@ class ControlFlowTest(test.TestCase):
         r = control_flow_ops.while_loop(lambda x: x < 10, lambda x: x + c, [x0])
       self.assertEqual(10, sess.run(r, {b: True}))
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileWithControl_4(self):
     with self.cached_session() as sess:
       b = array_ops.placeholder(dtypes.bool)
@@ -1635,6 +1709,7 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual(10, sess.run(r, {b: True}))
 
   @test_util.disable_control_flow_v2("b/79881896 (control_deps)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileWithControl_5(self):
     with self.cached_session() as sess:
       b = array_ops.placeholder(dtypes.bool)
@@ -1663,6 +1738,7 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual(0, self.evaluate(loop))
 
   @test_util.disable_control_flow_v2("b/113324949 (ref vars)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileCondWithControl_1(self):
     with self.cached_session():
       v = variable_scope.get_variable(
@@ -1681,11 +1757,12 @@ class ControlFlowTest(test.TestCase):
             return i + 1
 
       r = control_flow_ops.while_loop(loop_condition, loop_body, (i0,))
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertEqual(4, self.evaluate(r))
       self.assertAllClose(65536.0, self.evaluate(v))
 
   @test_util.disable_control_flow_v2("b/113324949 (ref vars)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileCondExitControl(self):
 
     with self.cached_session():
@@ -1706,7 +1783,7 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.cond(
           constant_op.constant(False), lambda: constant_op.constant(1.0),
           false_branch)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertEqual(6.0, self.evaluate(r))
       self.assertEqual(99, self.evaluate(v))
 
@@ -1817,15 +1894,15 @@ class ControlFlowTest(test.TestCase):
       with ops.device("/cpu:1"):
         grad = gradients_impl.gradients(z, x_init)[0]
 
+      with ops.device("/cpu:0"):
+        grad_grad = gradients_impl.gradients(grad, x_init)[0]
+
       self.assertEqual(sess.run(grad, {pred: True}), 8.0)
       self.assertEqual(sess.run(grad, {pred: False}), 0.0)
 
-      if not control_flow_ops.ENABLE_WHILE_V2:
+      if not control_flow_util.ENABLE_CONTROL_FLOW_V2:
         return
 
-      with ops.device("/cpu:0"):
-        grad_grad = gradients_impl.gradients(grad, x_init)[0]
-
       self.assertEqual(sess.run(grad_grad, {pred: True}), 0.0)
       self.assertEqual(sess.run(grad_grad, {pred: False}), 0.0)
 
@@ -1849,12 +1926,13 @@ class ControlFlowTest(test.TestCase):
 
       r = control_flow_ops.while_loop(
           loop_iterator, loop_body, [n], parallel_iterations=1)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertEqual(3, self.evaluate(r))
       result = self.evaluate(select)
       self.assertAllClose(np.array([10.0, 10.0, 10.0]), result)
 
   @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileUpdateVariable_2(self):
     with self.cached_session():
       select1 = variables.Variable([3.0, 4.0, 5.0])
@@ -1874,7 +1952,7 @@ class ControlFlowTest(test.TestCase):
 
       r = control_flow_ops.while_loop(
           loop_iterator, loop_body, [n], parallel_iterations=1)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertEqual(3, self.evaluate(r))
       result1 = self.evaluate(select1)
       self.assertAllClose(np.array([10.0, 10.0, 10.0]), result1)
@@ -1882,7 +1960,7 @@ class ControlFlowTest(test.TestCase):
       self.assertAllClose(np.array([10.0, 10.0, 10.0]), result2)
 
   @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testWhileUpdateVariable_3(self):
     with self.cached_session():
       select = variables.Variable([3.0, 4.0, 5.0])
@@ -1900,17 +1978,17 @@ class ControlFlowTest(test.TestCase):
           loop_iterator,
           loop_body, [n, array_ops.identity(select)],
           parallel_iterations=1)
-      variables.global_variables_initializer().run()
-      result = r[1].eval()
+      self.evaluate(variables.global_variables_initializer())
+      result = r[1]
     self.assertAllClose(np.array([10.0, 10.0, 10.0]), result)
 
   @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testWhileUpdateVariable_4(self):
     with self.cached_session():
       var_a = variables.Variable(0, name="a")
       var_b = variables.Variable(0, name="b")
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       c = constant_op.constant(0, name="c")
       asn1 = state_ops.assign_add(var_a, 1, name="a_add")
@@ -1934,13 +2012,13 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual(10, self.evaluate(var_b))
 
   @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testWhileUpdateVariable_5(self):
     with self.cached_session():
       # Create some variables.
       var_a = variables.Variable(0, name="a")
       var_b = variables.Variable(0, name="b")
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       # Change condition to check var_b
       def pred(_):
@@ -1965,13 +2043,14 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual(10, self.evaluate(var_b))
 
   @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileUpdateVariable_6(self):
     with self.cached_session():
       # Create some variables.
       var_a = variables.Variable(0, name="a")
       var_b = variables.Variable(0, name="b")
       c = constant_op.constant(0)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       # Loop condition
       def pred(i):
@@ -1994,6 +2073,7 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual(55, self.evaluate(var_b))
       self.assertEqual(10, self.evaluate(var_a))
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileQueue_1(self):
     with self.cached_session():
       q = data_flow_ops.FIFOQueue(-1, dtypes.int32)
@@ -2010,9 +2090,21 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.while_loop(c, b, [i], parallel_iterations=1)
       self.assertEqual([10], self.evaluate(r))
       for i in xrange(10):
-        self.assertEqual([i], q.dequeue().eval())
+        self.assertEqual([i], self.evaluate(q.dequeue()))
+
+  @test_util.run_v1_only("b/120545219")
+  def testWhileTimeOut(self):
+    run_options = config_pb2.RunOptions(timeout_in_ms=1)
+    with self.cached_session() as sess:
+      n = constant_op.constant(0)
+      c = lambda x: True
+      b = lambda x: math_ops.add(x, 1)
+      r = control_flow_ops.while_loop(c, b, [n])
+      with self.assertRaises(errors_impl.DeadlineExceededError):
+        sess.run(r, options=run_options)
 
   @test_util.disable_control_flow_v2("b/117119329 (stack)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileStack_1(self):
     with self.cached_session():
       s = gen_data_flow_ops.stack_v2(-1, dtypes.int32, stack_name="foo")
@@ -2082,10 +2174,12 @@ class ControlFlowTest(test.TestCase):
       self.assertAllClose(1024.0, self.evaluate(r))
 
   @test_util.disable_control_flow_v2("b/116351701 (colocation)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileGrad_ColocateGradients(self):
     self._testWhileGrad_ColocateGradients(colocate=False)
     self._testWhileGrad_ColocateGradients(colocate=True)
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileGrad_Square(self):
     with self.cached_session():
       v = constant_op.constant(2.0, name="v")
@@ -2097,6 +2191,7 @@ class ControlFlowTest(test.TestCase):
       r = gradients_impl.gradients(r, v)[0]
       self.assertAllClose(1024.0, self.evaluate(r))
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileGrad_Shape(self):
     with self.cached_session():
       x = array_ops.placeholder(dtypes.float32, shape=[None])
@@ -2127,6 +2222,7 @@ class ControlFlowTest(test.TestCase):
       r = gradients_impl.gradients([r, y], x)[0]
       self.assertAllClose([2.0, 4.0], sess.run(r, feed_dict={x: [1.0, 2.0]}))
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileGrad_MultipleUses(self):
     with self.cached_session():
       v = constant_op.constant(2.0, name="v")
@@ -2138,6 +2234,7 @@ class ControlFlowTest(test.TestCase):
       r = gradients_impl.gradients(r, v)[0]
       self.assertEqual(524288.0, self.evaluate(r))
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileGrad_LoopAdd(self):
     with self.cached_session():
       v = constant_op.constant(2.0, name="v")
@@ -2201,6 +2298,7 @@ class ControlFlowTest(test.TestCase):
   def testNestedWhileCondWhileGradGpu(self):
     self._testNestedWhileCondWhileGrad(use_gpu=True)
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileGrad_Variable(self):
     with self.cached_session():
       a = variables.Variable(3.0)
@@ -2210,8 +2308,8 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.while_loop(c, b, [v], parallel_iterations=1)
 
       r = gradients_impl.gradients(r, a)
-      variables.global_variables_initializer().run()
-      self.assertAllClose(216.0, r[0].eval())
+      self.evaluate(variables.global_variables_initializer())
+      self.assertAllClose(216.0, r[0])
 
   @test_util.run_deprecated_v1
   def testWhileGrad_ResourceVariable(self):
@@ -2223,9 +2321,10 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.while_loop(c, b, [v], parallel_iterations=1)
 
       g = gradients_impl.gradients(r, a)
-      variables.global_variables_initializer().run()
-      self.assertAllClose(216.0, g[0].eval())
+      self.evaluate(variables.global_variables_initializer())
+      self.assertAllClose(216.0, g[0])
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileGradInCond(self):
 
     with self.cached_session():
@@ -2243,7 +2342,7 @@ class ControlFlowTest(test.TestCase):
       self.assertAllClose(9.0, r.eval(feed_dict={x: 1.0}))
 
   @test_util.disable_control_flow_v2("b/116340060")
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testGradInWhileWrtInitialLoopVal(self):
     with self.cached_session():
       x = array_ops.placeholder(dtypes.float32, shape=(), name="x")
@@ -2261,6 +2360,7 @@ class ControlFlowTest(test.TestCase):
           "loop invariants or wrt the input parameters to the loop body."):
         control_flow_ops.while_loop(lambda i, x: i < 3, body, [0, y])
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileGradInWhile(self):
     with self.cached_session():
       n = ops.convert_to_tensor(1.0, name="n")
@@ -2277,6 +2377,7 @@ class ControlFlowTest(test.TestCase):
                                       [tensor_shape.unknown_shape()])
       self.assertAllClose(9.0, r.eval(feed_dict={x: 1.0}))
 
+  @test_util.run_v1_only("b/120545219")
   def testCondGradInNestedWhiles(self):
 
     def outer_body(i, x):
@@ -2295,6 +2396,49 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual(i_val, 3)
       self.assertAllClose(x_val, 1.0)
 
+  def testNestedResourceAccess(self):
+    var = resource_variable_ops.ResourceVariable(constant_op.constant(3.0))
+
+    @eager_function.defun
+    def test_fn():
+      x = constant_op.constant(0.0)
+      r = control_flow_ops.while_loop(
+          # Outer loop condition
+          lambda i, y: i < 2,
+          # Outer loop body
+          lambda i, y: (i + 1, y + control_flow_ops.cond(
+              constant_op.constant(True),
+              # True branch
+              lambda: control_flow_ops.while_loop(
+                  # Inner loop condition
+                  lambda j, z: j < 3,
+                  # Inner loop body
+                  lambda j, z: (j + 1, z + math_ops.square(var)),
+                  # Inner initial loop value
+                  [0, y])[1],
+              # False branch
+              lambda: (0.0))),
+          # Outer initial loop value
+          [0, x])[1]
+
+      grad = gradients_impl.gradients(r, x)[0]
+      return r, grad
+
+    self.evaluate(variables.global_variables_initializer())
+    r, grad = self.evaluate(test_fn())
+    # 2 * 3 * 3^2
+    self.assertEqual(r, 81.0)
+    # v1 control flow gets the wrong answer!!!
+    # Gradient computation:
+    #   f(x) = x + 3^2
+    #   inner_loop(x) = f(f(f(x))) = x + 3*3^2 = x + 27
+    #   g(x) = x + inner_loop(x) = 2x + 27
+    #   outer_loop(x) = g(g(x)) = 4x + 81
+    #   outer_loop'(x) = 4
+    # Note that v1 control flow gets 4.0 as well if the cond is removed.
+    if control_flow_util.ENABLE_CONTROL_FLOW_V2:
+      self.assertEqual(grad, 4.0)
+
   def testWhile_NestedInput(self):
     with self.cached_session() as sess:
       named = collections.namedtuple("named", ("a", "b"))
@@ -2322,6 +2466,7 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual([100.0, 1.0, 102.0, 3.0, 4.0 + 100 * 2.0],
                        self.evaluate(r_flattened))
 
+  @test_util.run_v1_only("b/120545219")
   def testWhile_NestedBadArityFails(self):
     with self.cached_session():
       named = collections.namedtuple("named", ("a", "b"))
@@ -2338,6 +2483,7 @@ class ControlFlowTest(test.TestCase):
       with self.assertRaisesRegexp(ValueError, "the same number of elements"):
         control_flow_ops.while_loop(c, b, loop_vars)
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileGrad_ys_xs(self):
     with self.cached_session():
       x = constant_op.constant(3.0, name="x")
@@ -2353,13 +2499,13 @@ class ControlFlowTest(test.TestCase):
       rx, ry = control_flow_ops.while_loop(c, b, [x, y], parallel_iterations=1)
 
       r = gradients_impl.gradients([rx, ry], x)
-      self.assertAllClose(304.0, r[0].eval())
+      self.assertAllClose(304.0, r[0])
       r = gradients_impl.gradients([rx, ry], y)
-      self.assertAllClose(124.0, r[0].eval())
+      self.assertAllClose(124.0, r[0])
       r = gradients_impl.gradients([rx], x)
-      self.assertAllClose(295.0, r[0].eval())
+      self.assertAllClose(295.0, r[0])
       r = gradients_impl.gradients([rx], y)
-      self.assertAllClose(120.0, r[0].eval())
+      self.assertAllClose(120.0, r[0])
 
   @test_util.run_deprecated_v1
   def testWhileGrad_Dependency(self):
@@ -2377,11 +2523,12 @@ class ControlFlowTest(test.TestCase):
       ri, rx = control_flow_ops.while_loop(c, b, [i, x], parallel_iterations=1)
 
       r = gradients_impl.gradients([ri, rx], x)
-      self.assertAllClose(1024.0, r[0].eval())
+      self.assertAllClose(1024.0, r[0])
       r = gradients_impl.gradients([rx], x)
-      self.assertAllClose(1024.0, r[0].eval())
+      self.assertAllClose(1024.0, r[0])
 
   @test_util.disable_control_flow_v2("b/116355153 (back_prop flag)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileGrad_NoGradient(self):
     with self.cached_session():
       v = constant_op.constant(2.0, name="v")
@@ -2390,9 +2537,10 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.while_loop(c, b, [v], back_prop=False)
       r = math_ops.add(r, v)
       r = gradients_impl.gradients(r, v)
-      self.assertAllClose(1.0, r[0].eval())
+      self.assertAllClose(1.0, r[0])
 
   @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileGrad_NoDependency(self):
     with self.cached_session() as sess:
       variable = variables.Variable(array_ops.ones([2, 3]))
@@ -2410,7 +2558,7 @@ class ControlFlowTest(test.TestCase):
           cond=cond, body=body, loop_vars=loop_vars)
       cost = math_ops.reduce_sum(tensors[2])
       grad = gradients_impl.gradients(cost, [variable])
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertAllClose(np.ones([2, 3]), sess.run(grad[0]))
 
   @test_util.run_deprecated_v1
@@ -2433,6 +2581,7 @@ class ControlFlowTest(test.TestCase):
       grad = gradients_impl.gradients(cost, [c0])
       self.assertAllClose(0.0, sess.run(grad[0]))
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileGrad_SerialTwoLoops(self):
     with self.cached_session():
       i = constant_op.constant(0, name="i")
@@ -2449,8 +2598,9 @@ class ControlFlowTest(test.TestCase):
       _, rx = control_flow_ops.while_loop(c, b, [i, rx], parallel_iterations=1)
 
       r = gradients_impl.gradients([rx], x)
-      self.assertAllClose(1024.0, r[0].eval())
+      self.assertAllClose(1024.0, r[0])
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileGrad_ParallelTwoLoops(self):
     with self.cached_session():
       i = constant_op.constant(0, name="i")
@@ -2468,8 +2618,9 @@ class ControlFlowTest(test.TestCase):
       rx = math_ops.add(r1, r2)
 
       r = gradients_impl.gradients([rx], x)
-      self.assertAllClose(64.0, r[0].eval())
+      self.assertAllClose(64.0, r[0])
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileGrad_OneOutputWithControlDependencyOnSecond(self):
     with self.cached_session():
       i = constant_op.constant(0, name="i")
@@ -2513,6 +2664,7 @@ class ControlFlowTest(test.TestCase):
     self._testNestedWhileGrad_Simple(use_gpu=False)
     self._testNestedWhileGrad_Simple(use_gpu=True)
 
+  @test_util.run_v1_only("b/120545219")
   def testNestedWhileGrad_SerialInner(self):
     with self.cached_session():
       v = constant_op.constant(1.0)
@@ -2560,6 +2712,7 @@ class ControlFlowTest(test.TestCase):
       r = gradients_impl.gradients(r, v)[0]
       self.assertAllClose(512.0, self.evaluate(r))
 
+  @test_util.run_v1_only("b/120545219")
   def testNestedWhileGrad_ParallelIterations(self):
     # Make sure the stack pushes and pops of an inner loop are executed in
     # the sequential order of the iterations of its outer loop.
@@ -2580,7 +2733,7 @@ class ControlFlowTest(test.TestCase):
       train_op = optimizer.minimize(math_ops.reduce_mean(math_ops.square(res)))
       self.evaluate(variables.global_variables_initializer())
       self.evaluate(train_op)
-      self.assertAllClose(2.999, self.evaluate(var))
+      self.assertAllClose(2.999, var.read_value())
 
   def _testWhileCondGrad_Simple(self, use_gpu):
     with self.cached_session(use_gpu=use_gpu):
@@ -2641,13 +2794,15 @@ class ControlFlowTest(test.TestCase):
           [i0.get_shape(), tensor_shape.TensorShape([None, 2])])
       s = math_ops.reduce_sum(h)
 
-      self.evaluate(variables.global_variables_initializer())
       optimizer = gradient_descent.GradientDescentOptimizer(0.01)
       op = optimizer.minimize(s)
+
+      self.evaluate(variables.global_variables_initializer())
       self.evaluate(op)
       self.assertAllClose([[0.98000002, 1.98000002]], self.evaluate(x))
 
   @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileWithRefsWithGradients_1(self):
     with self.cached_session() as sess:
       x = variables.VariableV1(0.)._ref()  # pylint: disable=protected-access
@@ -2665,7 +2820,7 @@ class ControlFlowTest(test.TestCase):
       grad_ys = [variables.VariableV1(73)._ref()]  # pylint: disable=protected-access
       grad = gradients_impl.gradients([r[1]], [x], grad_ys=grad_ys)
 
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       self.assertEqual(r[0].dtype, dtypes.int32)
       self.assertEqual(r[1].dtype, dtypes.float32_ref)
@@ -2677,6 +2832,7 @@ class ControlFlowTest(test.TestCase):
     self.assertEqual(73, value_x_grad)
 
   @test_util.disable_control_flow_v2("b/116282023 (IndexedSlices)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileGrad_IndexedSlices(self):
     with self.cached_session():
       values = constant_op.constant([2.0, 4.0], name="values")
@@ -2699,7 +2855,7 @@ class ControlFlowTest(test.TestCase):
       self.assertAllClose(np.array([1024.0, 1024.0]), self.evaluate(r))
 
   @test_util.disable_control_flow_v2("b/116328420 (SparseTensor)")
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testWhileGrad_SparseTensor(self):
     with self.cached_session():
       values = constant_op.constant([2.0, 4.0], name="values")
@@ -2723,6 +2879,7 @@ class ControlFlowTest(test.TestCase):
       self.assertAllClose(np.array([1024.0, 1024.0]), self.evaluate(r))
 
   @test_util.disable_control_flow_v2("b/115920078 (gradients)")
+  @test_util.run_v1_only("b/120545219")
   def testCallGradInLoop(self):
     with self.cached_session() as sess:
       i0 = constant_op.constant(0)
@@ -2807,6 +2964,7 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual(32.0, self.evaluate(r))
 
   @test_util.run_deprecated_v1
+  @test_util.disable_control_flow_v2("b/118712257")
   def testWhileGrad_StopGradInside(self):
     with self.cached_session():
       x = constant_op.constant(3.0, name="x")
@@ -2827,6 +2985,7 @@ class ControlFlowTest(test.TestCase):
       self.assertAllClose(156.0, self.evaluate(r))
 
   @test_util.run_deprecated_v1
+  @test_util.disable_control_flow_v2("b/118712257")
   def testWhileGrad_StopGradInsideNoShape(self):
     with self.cached_session() as sess:
       x = array_ops.placeholder(dtypes.float32)
@@ -2860,7 +3019,7 @@ class ControlFlowTest(test.TestCase):
 
     result = functional_ops.scan(fn, np.array([1., 2., 3.], dtype=np.float32))
     grad_theta = gradients_impl.gradients(result, theta)
-    if not control_flow_ops.ENABLE_WHILE_V2:
+    if not control_flow_util.ENABLE_CONTROL_FLOW_V2:
       with self.assertRaisesRegexp(TypeError, "Second-order gradient"):
         gradients_impl.gradients(grad_theta, theta)
     grad_theta_stopped = array_ops.stop_gradient(grad_theta)
@@ -2903,6 +3062,7 @@ class ControlFlowTest(test.TestCase):
       self.assertAllClose([0., 0.], self.evaluate(dy_dq))
 
   @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileGradientWithNontrainablePath2(self):
     q = variables.Variable([7., 8.])
 
@@ -2921,6 +3081,7 @@ class ControlFlowTest(test.TestCase):
       self.assertAllClose([1., 1.], self.evaluate(dy_dq))
 
   @test_util.disable_control_flow_v2("b/115920078 (gradients)")
+  @test_util.run_v1_only("b/120545219")
   def testIssue16504(self):
     c = constant_op.constant(np.arange(100), dtype=dtypes.float32)
     w = variables.Variable(
@@ -2944,6 +3105,7 @@ class ControlFlowTest(test.TestCase):
     grad, = gradients_impl.gradients(w, c)
     self.assertIsNotNone(grad)
 
+  @test_util.run_v1_only("b/120545219")
   def testStopGradMultiFlows(self):
     with self.cached_session():
 
@@ -2967,9 +3129,10 @@ class ControlFlowTest(test.TestCase):
       grads = linalg_ops.norm(gradients_impl.gradients(r, vars_)[0])
       z = math_ops.add(r, array_ops.stop_gradient(math_ops.reduce_sum(grads)))
       result = gradients_impl.gradients(z, vars_)[0]
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertEqual(5.0, self.evaluate(result))
 
+  @test_util.run_v1_only("b/120545219")
   def testOneValueCond(self):
 
     with self.cached_session():
@@ -3003,6 +3166,7 @@ class ControlFlowTest(test.TestCase):
       self.assertAllClose(4.0, i.eval(feed_dict={d: 1}))
       self.assertAllClose(2.0 * math.sqrt(2), i.eval(feed_dict={d: 2}))
 
+  @test_util.run_v1_only("b/120545219")
   def testCase(self):
     with self.cached_session():
       x = constant_op.constant(1)
@@ -3017,14 +3181,14 @@ class ControlFlowTest(test.TestCase):
               x < y: f1,
               x > z: f2
           }, default=f3, exclusive=True)
-      self.assertAllEqual(r1.eval(), 17)
+      self.assertAllEqual(r1, 17)
 
       r2 = control_flow_ops.case([(y > z, f1), (y > x, f2)], default=f3)
-      self.assertAllEqual(r2.eval(), 23)
+      self.assertAllEqual(r2, 23)
 
       # Duplicate events can happen, first one is selected
       r3 = control_flow_ops.case([(x < y, f1), (x < y, f2)], default=f3)
-      self.assertAllEqual(r3.eval(), 17)
+      self.assertAllEqual(r3, 17)
 
       # Duplicate events cause an error if exclusive = True
       r4 = control_flow_ops.case(
@@ -3034,7 +3198,7 @@ class ControlFlowTest(test.TestCase):
 
       # Check that the default is called if none of the others are
       r5 = control_flow_ops.case({x > y: f1}, default=f3)
-      self.assertAllEqual(r5.eval(), -1)
+      self.assertAllEqual(r5, -1)
 
       ran_once = [False, False, False]
 
@@ -3053,8 +3217,9 @@ class ControlFlowTest(test.TestCase):
           [(x < y, break_run_twice(0)), (x > y, break_run_twice(1))],
           default=lambda: constant_op.constant(2))
 
-      self.assertAllEqual(r6.eval(), 0)
+      self.assertAllEqual(r6, 0)
 
+  @test_util.run_v1_only("b/120545219")
   def testCaseSideEffects(self):
     with self.cached_session() as sess:
       v0 = variables.Variable(-1)
@@ -3075,22 +3240,23 @@ class ControlFlowTest(test.TestCase):
       r2 = control_flow_ops.case(
           ((x > y, a), (x > y, b)), default=c, exclusive=True)
 
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertAllEqual(self.evaluate([v0, v1, v2]), [-1] * 3)
       self.assertEqual(2, self.evaluate(r2))
       self.assertAllEqual(self.evaluate([v0, v1, v2]), [-1, -1, 2])
 
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertAllEqual(self.evaluate([v0, v1, v2]), [-1] * 3)
       self.assertEqual(1, self.evaluate(r1))
       self.assertAllEqual(self.evaluate([v0, v1, v2]), [-1, 1, -1])
 
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertAllEqual(self.evaluate([v0, v1, v2]), [-1] * 3)
       self.assertEqual(0, self.evaluate(r0))
       self.assertAllEqual(self.evaluate([v0, v1, v2]), [0, -1, -1])
 
   @test_util.disable_control_flow_v2("b/113324949 (ref vars)")
+  @test_util.run_v1_only("b/120545219")
   def testOneOpCond(self):
     with self.cached_session():
       v = variables.Variable(0)
@@ -3107,7 +3273,7 @@ class ControlFlowTest(test.TestCase):
 
       i = control_flow_ops.cond(p, a, b)
       self.assertTrue(isinstance(i, ops.Tensor))
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       self.assertEqual(0, self.evaluate(v))
 
@@ -3119,6 +3285,7 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual(2, i.eval(feed_dict={c.name: 0}))
       self.assertEqual(2, self.evaluate(v))
 
+  @test_util.run_v1_only("b/120545219")
   def testWithOpsDependencies(self):
     with self.cached_session() as sess:
       v = variables.VariableV1(0.0)
@@ -3142,6 +3309,7 @@ class ControlFlowTest(test.TestCase):
     # Ensure that 'v' is initialized
     self.assertAllClose(0.0, real_v_val)
 
+  @test_util.run_v1_only("b/120545219")
   def testWithTensorDependencies(self):
     with self.cached_session():
       v = variables.VariableV1(0.0)
@@ -3168,6 +3336,7 @@ class ControlFlowTest(test.TestCase):
       # Ensure that 'v' is initialized
       self.assertAllClose(0.0, self.evaluate(v))
 
+  @test_util.run_v1_only("b/120545219")
   def testWithIndexedSlicesDependencies(self):
     with self.cached_session():
       v = variables.VariableV1(
@@ -3214,6 +3383,7 @@ class ControlFlowTest(test.TestCase):
         self.assertDeviceEqual("", with_vdef_dep.device)
         self.assertEqual([b"loc:@vdef"], with_vdef_dep.op.colocation_groups())
 
+  @test_util.run_v1_only("b/120545219")
   def testGroup(self):
     with self.cached_session() as sess:
       v1 = variables.VariableV1([0.0])
@@ -3233,6 +3403,7 @@ class ControlFlowTest(test.TestCase):
     self.assertAllClose([0.0], v1_val)
     self.assertAllClose([1.0], v2_val)
 
+  @test_util.run_v1_only("b/120545219")
   def testGroupEmpty(self):
     op = control_flow_ops.group()
     self.assertEqual(op.type, "NoOp")
@@ -3293,7 +3464,7 @@ class ControlFlowTest(test.TestCase):
     self.assertEqual([None, None], m.get_shape().as_list())
     self.assertEqual([], index.get_shape())
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testRefSelect(self):
     index = array_ops.placeholder(dtypes.int32)
 
@@ -3348,6 +3519,7 @@ class ControlFlowTest(test.TestCase):
       with self.assertRaises(ValueError):
         sess.run(tensor_list[0])
 
+  @test_util.run_v1_only("b/120545219")
   def testWhilePyFuncBasic(self):
 
     def func(x):
@@ -3359,8 +3531,9 @@ class ControlFlowTest(test.TestCase):
           lambda i, v: [i + 1, script_ops.py_func(func, [v], [dtypes.float32])[0]],
           [constant_op.constant(0), constant_op.constant(2.0, dtypes.float32)],
           [tensor_shape.unknown_shape(), tensor_shape.unknown_shape()])
-      self.assertEqual(r[1].eval(), 65536.0)
+      self.assertEqual(self.evaluate(r[1]), 65536.0)
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileFuncBasic(self):
 
     @function.Defun(dtypes.float32)
@@ -3374,17 +3547,17 @@ class ControlFlowTest(test.TestCase):
           [constant_op.constant(0), x],
           [tensor_shape.unknown_shape(),
            tensor_shape.unknown_shape()])
-      self.assertEqual(r[1].eval(), 65536.0)
-
-      r = gradients_impl.gradients(r, x)[0]
-      self.assertEqual(r.eval(), 524288.0)
+      grad = gradients_impl.gradients(r, x)[0]
+      self.assertEqual(self.evaluate(r[1]), 65536.0)
+      self.assertEqual(self.evaluate(grad), 524288.0)
       # while_v2 does not have stacks.
-      if not control_flow_ops.ENABLE_WHILE_V2:
+      if not control_flow_util.ENABLE_CONTROL_FLOW_V2:
         self.assertEqual(
             len([op for op in x.graph.get_operations() if op.type == "StackV2"
                 ]), 1)
 
 
+  @test_util.run_v1_only("b/120545219")
   def testQIntSwitchMerge(self):
     with self.cached_session(force_gpu=test.is_gpu_available()) as sess:
       constant_qint = constant_op.constant(np.array([42]), dtypes.qint8)
@@ -3393,6 +3566,7 @@ class ControlFlowTest(test.TestCase):
       result = control_flow_ops.merge([v_f, v_t])
       self.evaluate(result)
 
+  @test_util.run_v1_only("b/120545219")
   def testQIntRefSwitchMerge(self):
     with self.cached_session(use_gpu=test.is_gpu_available()) as sess:
       var_qint = gen_state_ops.variable(
@@ -3406,6 +3580,7 @@ class ControlFlowTest(test.TestCase):
       result = control_flow_ops.ref_merge([v_f, v_t])
       self.evaluate(result)
 
+  @test_util.run_v1_only("b/120545219")
   def testUInt64SwitchMerge(self):
     with self.cached_session(force_gpu=test.is_gpu_available()) as sess:
       constant_uint64 = constant_op.constant(np.array([42]), dtypes.uint64)
@@ -3453,6 +3628,7 @@ class ControlFlowContextCheckTest(test.TestCase):
         math_ops.less(1, 2), true_fn, lambda: constant_op.constant(0))
     return cond_tensor[0]
 
+  @test_util.run_v1_only("b/120545219")
   def testInvalidContext(self):
     # Accessing a while loop tensor outside of control flow is illegal.
     while_tensor = self._getWhileTensor()
@@ -3462,7 +3638,7 @@ class ControlFlowContextCheckTest(test.TestCase):
         "is in a while loop. See info log for more details."):
       math_ops.add(1, while_tensor)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testInvalidContextInCond(self):
     # Accessing a while loop tensor in cond is illegal.
     while_tensor = self._getWhileTensor()
@@ -3475,6 +3651,7 @@ class ControlFlowContextCheckTest(test.TestCase):
           math_ops.less(1, 2), lambda: math_ops.add(1, while_tensor),
           lambda: constant_op.constant(0))
 
+  @test_util.run_v1_only("b/120545219")
   def testInvalidContextInWhile(self):
     # Accessing a while loop tensor in a different while loop is illegal.
     while_tensor = self._getWhileTensor()
@@ -3509,6 +3686,7 @@ class ControlFlowContextCheckTest(test.TestCase):
 
     control_flow_ops.cond(math_ops.less(1, 2), branch_fn, branch_fn)
 
+  @test_util.run_v1_only("b/120545219")
   def testValidWhileContext(self):
     # Accessing a tensor in a nested while is OK.
     def body(_):
@@ -3517,6 +3695,7 @@ class ControlFlowContextCheckTest(test.TestCase):
 
     control_flow_ops.while_loop(lambda i: i < 5, body, [0])
 
+  @test_util.run_v1_only("b/120545219")
   def testValidNestedContexts(self):
     # Accessing a tensor from a cond context in a while context, all inside an
     # outer while context, is OK.
@@ -3531,7 +3710,7 @@ class ControlFlowContextCheckTest(test.TestCase):
 
     control_flow_ops.while_loop(lambda i: i < 5, body, [0])
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testInvalidNestedContexts(self):
     # Accessing a tensor from a while context in a different while context, all
     # inside a cond context, is illegal.
@@ -3550,6 +3729,7 @@ class ControlFlowContextCheckTest(test.TestCase):
 
 class TupleTest(test.TestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testTensors(self):
     for v1_first in [True, False]:
       with self.cached_session():
@@ -3580,7 +3760,7 @@ class TupleTest(test.TestCase):
           self.assertAllClose([30.0], self.evaluate(t2))
           self.assertAllClose([1.0], self.evaluate(v1))
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testIndexedSlices(self):
     for v1_first in [True, False]:
       with self.cached_session():
@@ -3733,7 +3913,7 @@ class WhileOpBenchmark(test.Benchmark):
     with session.Session() as sess, ops.device(default_device):
       # Get the initial id i, input x, and kernel.
       i, x, kernel = self._getInitVariables()
-      self.evaluate(variables.global_variables_initializer())
+      variables.global_variables_initializer().run()
 
       if static_unroll:
         for _ in xrange(steps):
@@ -3832,6 +4012,7 @@ class EagerTest(test.TestCase):
           isum(tensor, maximum_iterations=3).numpy(),
           [1 + 3, 2 + 3, 3 + 3, 4 + 3, 5 + 3])
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileWithMaximumIterationsAndSingleArgument(self):
     with context.eager_mode():
       tensor = constant_op.constant(0)
@@ -3854,6 +4035,7 @@ class EagerTest(test.TestCase):
       self.assertAllEqual(t1.numpy(), tup1.numpy())
       self.assertAllEqual(t2.numpy(), tup2.numpy())
 
+  @test_util.run_v1_only("b/120545219")
   def testCase(self):
     with context.eager_mode():
       x = constant_op.constant(1)
diff --git a/tensorflow/python/kernel_tests/control_flow_util_test.py b/tensorflow/python/kernel_tests/control_flow_util_test.py
index 762c445da05008a78fec1ec9e1cc7186e1539134..573f4b0d250ba5ff75118ed5738c3de2a8711a2f 100644
--- a/tensorflow/python/kernel_tests/control_flow_util_test.py
+++ b/tensorflow/python/kernel_tests/control_flow_util_test.py
@@ -22,6 +22,7 @@ from __future__ import print_function
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import control_flow_util
@@ -32,6 +33,7 @@ from tensorflow.python.platform import test
 
 class ControlFlowUtilTest(test.TestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testIsSwitch(self):
     switch_false, _ = control_flow_ops.switch(1, True)
     switch = switch_false.op
@@ -44,6 +46,7 @@ class ControlFlowUtilTest(test.TestCase):
 
     self.assertFalse(control_flow_util.IsSwitch(test_ops.int_output().op))
 
+  @test_util.run_v1_only("b/120545219")
   def testIsLoopEnter(self):
     enter = gen_control_flow_ops.enter(1, frame_name="name").op
     self.assertTrue(control_flow_util.IsLoopEnter(enter))
@@ -61,6 +64,7 @@ class ControlFlowUtilTest(test.TestCase):
 
     self.assertFalse(control_flow_util.IsLoopEnter(test_ops.int_output().op))
 
+  @test_util.run_v1_only("b/120545219")
   def testIsLoopExit(self):
     exit_op = control_flow_ops.exit(1).op
     self.assertTrue(control_flow_util.IsLoopExit(exit_op))
diff --git a/tensorflow/python/kernel_tests/control_flow_util_v2_test.py b/tensorflow/python/kernel_tests/control_flow_util_v2_test.py
index d0374a77005db4597ddbce76c1d2a3b9ac0e792d..08d3214e288bf873515f0b5a45ddf1e50ee1b281 100644
--- a/tensorflow/python/kernel_tests/control_flow_util_v2_test.py
+++ b/tensorflow/python/kernel_tests/control_flow_util_v2_test.py
@@ -23,6 +23,7 @@ from tensorflow.python.eager import function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import control_flow_util
 from tensorflow.python.ops import control_flow_util_v2
 from tensorflow.python.platform import test
 
@@ -30,14 +31,11 @@ from tensorflow.python.platform import test
 class ControlFlowUtilV2Test(test.TestCase):
 
   def setUp(self):
-    self._enable_cond_v2_old = control_flow_ops.ENABLE_COND_V2
-    self._enable_while_v2_old = control_flow_ops.ENABLE_WHILE_V2
-    control_flow_ops.ENABLE_COND_V2 = True
-    control_flow_ops.ENABLE_WHILE_V2 = True
+    self._enable_control_flow_v2_old = control_flow_util.ENABLE_CONTROL_FLOW_V2
+    control_flow_util.ENABLE_CONTROL_FLOW_V2 = True
 
   def tearDown(self):
-    control_flow_ops.ENABLE_COND_V2 = self._enable_cond_v2_old
-    control_flow_ops.ENABLE_WHILE_V2 = self._enable_while_v2_old
+    control_flow_util.ENABLE_CONTROL_FLOW_V2 = self._enable_control_flow_v2_old
 
   def _create_control_flow(self, expect_in_defun):
     """Helper method for testInDefun."""
diff --git a/tensorflow/python/kernel_tests/ctc_loss_op_test.py b/tensorflow/python/kernel_tests/ctc_loss_op_test.py
index e6b5835079ed67c495e1ccf315f4b515fedca8f8..352dedea4abc885d3f7765533b345e09ecec6dc9 100644
--- a/tensorflow/python/kernel_tests/ctc_loss_op_test.py
+++ b/tensorflow/python/kernel_tests/ctc_loss_op_test.py
@@ -106,7 +106,7 @@ class CTCLossTest(test.TestCase):
         with self.assertRaisesOpError(expected_err_re):
           self.evaluate([loss, grad])
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testBasic(self):
     """Test two batch entries."""
     # Input and ground truth from Alex Graves' implementation.
@@ -242,7 +242,6 @@ class CTCLossTest(test.TestCase):
 
     self._testCTCLoss(inputs, seq_lens, labels, loss_truth, grad_truth)
 
-  @test_util.run_deprecated_v1
   def test_time_major(self):
     """Testing time_major param.
 
@@ -272,7 +271,7 @@ class CTCLossTest(test.TestCase):
       (tf_loss, tf_loss_transposed) = self.evaluate([loss, loss_transposed])
       self.assertAllEqual(tf_loss, tf_loss_transposed)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testInvalidSecondGradient(self):
     inputs = np.random.randn(2, 2, 3).astype(np.float32)
     inputs_t = constant_op.constant(inputs)
@@ -289,7 +288,7 @@ class CTCLossTest(test.TestCase):
                                    "explicitly disabled"):
         _ = gradients_impl._hessian_vector_product(loss, [inputs_t], v)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testEmptyBatch(self):
     inputs = constant_op.constant([], dtype=dtypes.float32, shape=(1, 0, 2))
     sequence_lengths = constant_op.constant([], dtype=dtypes.int32)
@@ -306,7 +305,7 @@ class CTCLossTest(test.TestCase):
 
 class CTCLossTestV2(test.TestCase):
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testCtcLossV2(self):
     random_seed.set_random_seed(5)
 
@@ -351,7 +350,7 @@ class CTCLossTestV2(test.TestCase):
             logit_length=logit_length,
             blank_index=0))
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testCtcLossDenseIsSameAsCtcLoss(self):
     with ops.device("/GPU:0" if test.is_gpu_available() else "/CPU:0"):
       random_seed.set_random_seed(5)
@@ -405,7 +404,7 @@ class CTCLossTestV2(test.TestCase):
               rtol=2e-06,
               atol=2e-06)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testCtcLossDenseUniqueFastPathIsSameAsCtcLoss(self):
     random_seed.set_random_seed(5)
 
@@ -459,7 +458,7 @@ class CTCLossTestV2(test.TestCase):
             rtol=2e-06,
             atol=2e-06)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testCtcLossDenseWithBlankIndexIsSameAsCtcLoss(self):
     random_seed.set_random_seed(5)
 
@@ -516,7 +515,7 @@ class CTCLossTestV2(test.TestCase):
             rtol=2e-06,
             atol=2e-06)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testCtcLossDenseWithNegativeBlankIndexIsSameAsCtcLoss(self):
     with ops.device("/GPU:0" if test.is_gpu_available() else "/CPU:0"):
       random_seed.set_random_seed(5)
@@ -565,7 +564,6 @@ class CTCLossTestV2(test.TestCase):
               rtol=2e-06,
               atol=2e-06)
 
-  @test_util.run_deprecated_v1
   def testCollapseRepeated(self):
     collapsed, new_seq_lengths = ctc_ops.collapse_repeated(
         labels=[[1, 3, 3, 3, 0],
@@ -579,7 +577,6 @@ class CTCLossTestV2(test.TestCase):
          [1, 4, 0, 0],
          [4, 2, 9, 4]])
 
-  @test_util.run_deprecated_v1
   def testCollapseRepeatedPreservesDtypes(self):
     collapsed, new_seq_lengths = ctc_ops.collapse_repeated(
         labels=constant_op.constant(
@@ -597,7 +594,6 @@ class CTCLossTestV2(test.TestCase):
          [1, 4, 0, 0],
          [4, 2, 9, 4]])
 
-  @test_util.run_deprecated_v1
   def testCollapseRepeatedExtraPadding(self):
     collapsed, new_seq_lengths = ctc_ops.collapse_repeated(
         labels=[[1, 3, 3, 3, 0, 0, 0],
@@ -611,7 +607,6 @@ class CTCLossTestV2(test.TestCase):
          [1, 4, 0, 0],
          [4, 2, 9, 4]])
 
-  @test_util.run_deprecated_v1
   def testCollapseRepeatedFrontRepeats(self):
     collapsed, new_seq_lengths = ctc_ops.collapse_repeated(
         labels=[[1, 1, 1, 2, 2],
@@ -625,7 +620,6 @@ class CTCLossTestV2(test.TestCase):
          [1, 2],
          [1, 0]])
 
-  @test_util.run_deprecated_v1
   def testCollapseRepeatedAllLabelsTheSame(self):
     collapsed, new_seq_lengths = ctc_ops.collapse_repeated(
         labels=[[1, 1, 1, 1, 1],
@@ -658,7 +652,6 @@ class CTCLossTestV2(test.TestCase):
 
     self.assertAllEqual(padded_dense, new_dense)
 
-  @test_util.run_deprecated_v1
   def testUnique(self):
     labels = [
         [3, 4, 4, 3],
@@ -674,7 +667,6 @@ class CTCLossTestV2(test.TestCase):
         [0, 0, 0, 1],
     ], idx)
 
-  @test_util.run_deprecated_v1
   def testSumStates(self):
     idx = [
         [0, 1, 0, 1],
@@ -694,7 +686,6 @@ class CTCLossTestV2(test.TestCase):
          [1.8, 0.8, 0.0, 0.0]]
     ], sum_of_states)
 
-  @test_util.run_deprecated_v1
   def testStateToOlabel(self):
     labels = [
         [3, 4, 3, 4],
@@ -733,7 +724,6 @@ class CTCLossTestV2(test.TestCase):
          [22.0 + 23.0 + 24.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]],
     ])
 
-  @test_util.run_deprecated_v1
   def testStateToOlabelUnique(self):
     labels = [
         [3, 4, 3, 4],
@@ -811,7 +801,7 @@ class CTCLossTestV2(test.TestCase):
       x = random_ops.random_uniform([])
       fn = lambda accum, elem: accum + x * elem
       out = ctc_ops._scan(fn, constant_op.constant([0.0, 1.0, 2.0]), 23.0)
-      self.assertAllEqual(*sess.run([
+      self.assertAllClose(*sess.run([
           [23.0 + x * 0.0, 23.0 + x * 1.0, 23.0 + x * 3.0], out
       ]))
 
diff --git a/tensorflow/python/kernel_tests/cwise_ops_test.py b/tensorflow/python/kernel_tests/cwise_ops_test.py
index 9bb7d8b8b12baafe15fe9150e58c4e03749e7261..70f19f9d2f9d9155f5cc5e3458cb8cad8fb18064 100644
--- a/tensorflow/python/kernel_tests/cwise_ops_test.py
+++ b/tensorflow/python/kernel_tests/cwise_ops_test.py
@@ -887,7 +887,7 @@ class ComplexMakeRealImagTest(test.TestCase):
       tf_angle = math_ops.angle(inx)
       tf_angle_val = self.evaluate(tf_angle)
 
-    self.assertAllEqual(np_angle, tf_angle_val)
+    self.assertAllClose(np_angle, tf_angle_val)
     self.assertShapeEqual(np_angle, tf_angle)
 
   def testAngle64(self):
@@ -895,18 +895,14 @@ class ComplexMakeRealImagTest(test.TestCase):
     imag = (np.arange(-3, 3) / 5.).reshape([1, 3, 2]).astype(np.float32)
     cplx = real + 1j * imag
     self._compareAngle(cplx, use_gpu=False)
-    # TODO: Enable GPU tests for angle op after resolving
-    # build failures on GPU (See #10643 for context).
-    # self._compareAngle(cplx, use_gpu=True)
+    self._compareAngle(cplx, use_gpu=True)
 
   def testAngle(self):
     real = (np.arange(-3, 3) / 4.).reshape([1, 3, 2]).astype(np.float64)
     imag = (np.arange(-3, 3) / 5.).reshape([1, 3, 2]).astype(np.float64)
     cplx = real + 1j * imag
     self._compareAngle(cplx, use_gpu=False)
-    # TODO: Enable GPU tests for angle op after resolving
-    # build failures on GPU (See #10643 for context).
-    # self._compareAngle(cplx, use_gpu=True)
+    self._compareAngle(cplx, use_gpu=True)
 
   @test_util.run_deprecated_v1
   def testRealReal(self):
diff --git a/tensorflow/python/kernel_tests/denormal_test.py b/tensorflow/python/kernel_tests/denormal_test.py
index 80a3033ecc4d7e2ca1e25eea9e0525f1038d1023..d824e95f213acf5480be9bf2c431a4c4b89d106a 100644
--- a/tensorflow/python/kernel_tests/denormal_test.py
+++ b/tensorflow/python/kernel_tests/denormal_test.py
@@ -36,8 +36,9 @@ class DenormalTest(test.TestCase):
       self.assertEqual(tiny, tiny / 16 * 16)
 
   def _flushDenormalsTest(self, use_gpu, dtypes):
-    if platform.machine() == "ppc64le" or platform.machine() == "s390x":
-      # Disabled denormal_test on power/s390x platform
+    if platform.machine() == "ppc64le" or platform.machine(
+    ) == "s390x" or platform.machine() == "aarch64":
+      # Disabled denormal_test on power/s390x/aarch64 platform
       # Check relevant discussion - https://github.com/tensorflow/tensorflow/issues/11902
       return
     with self.cached_session(use_gpu=use_gpu):
diff --git a/tensorflow/python/kernel_tests/dense_update_ops_no_tsan_test.py b/tensorflow/python/kernel_tests/dense_update_ops_no_tsan_test.py
index 4f74e1e741233db75793fa5468262887b6c52686..a778bf231bb80eefd6f4d602662fe50f67817a4f 100644
--- a/tensorflow/python/kernel_tests/dense_update_ops_no_tsan_test.py
+++ b/tensorflow/python/kernel_tests/dense_update_ops_no_tsan_test.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
@@ -33,7 +32,6 @@ class AssignOpTest(test.TestCase):
   # NOTE(mrry): We exclude thess tests from the TSAN TAP target, because they
   #   contain benign and deliberate data races when multiple threads update
   #   the same parameters without a lock.
-  @test_util.run_deprecated_v1
   def testParallelUpdateWithoutLocking(self):
     with self.cached_session() as sess:
       ones_t = array_ops.fill([1024, 1024], 1.0)
@@ -42,7 +40,7 @@ class AssignOpTest(test.TestCase):
           state_ops.assign_add(
               p, ones_t, use_locking=False) for _ in range(20)
       ]
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       def run_add(add_op):
         self.evaluate(add_op)
@@ -61,7 +59,6 @@ class AssignOpTest(test.TestCase):
       self.assertTrue((vals >= ones).all())
       self.assertTrue((vals <= ones * 20).all())
 
-  @test_util.run_deprecated_v1
   def testParallelAssignWithoutLocking(self):
     with self.cached_session() as sess:
       ones_t = array_ops.fill([1024, 1024], float(1))
@@ -70,7 +67,7 @@ class AssignOpTest(test.TestCase):
           state_ops.assign(p, math_ops.multiply(ones_t, float(i)), False)
           for i in range(1, 21)
       ]
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       def run_assign(assign_op):
         self.evaluate(assign_op)
@@ -94,7 +91,6 @@ class AssignOpTest(test.TestCase):
   # contain non-benign but known data races between the variable assignment and
   # returning the output tensors. This issue will be resolved with the new
   # resource variables.
-  @test_util.run_deprecated_v1
   def testParallelUpdateWithLocking(self):
     with self.cached_session() as sess:
       zeros_t = array_ops.fill([1024, 1024], 0.0)
@@ -104,7 +100,7 @@ class AssignOpTest(test.TestCase):
           state_ops.assign_add(
               p, ones_t, use_locking=True) for _ in range(20)
       ]
-      p.initializer.run()
+      self.evaluate(p.initializer)
 
       def run_add(add_op):
         self.evaluate(add_op)
@@ -122,7 +118,6 @@ class AssignOpTest(test.TestCase):
       ones = np.ones((1024, 1024)).astype(np.float32)
       self.assertAllEqual(vals, ones * 20)
 
-  @test_util.run_deprecated_v1
   def testParallelAssignWithLocking(self):
     with self.cached_session() as sess:
       zeros_t = array_ops.fill([1024, 1024], 0.0)
@@ -133,7 +128,7 @@ class AssignOpTest(test.TestCase):
               p, math_ops.multiply(ones_t, float(i)), use_locking=True)
           for i in range(1, 21)
       ]
-      p.initializer.run()
+      self.evaluate(p.initializer)
 
       def run_assign(assign_op):
         self.evaluate(assign_op)
diff --git a/tensorflow/python/kernel_tests/dense_update_ops_test.py b/tensorflow/python/kernel_tests/dense_update_ops_test.py
index 309da88bef71d51ad638c6b9d599de8c460e33da..545de87ca10deb6c01ab889f331aa61dc815e19e 100644
--- a/tensorflow/python/kernel_tests/dense_update_ops_test.py
+++ b/tensorflow/python/kernel_tests/dense_update_ops_test.py
@@ -86,7 +86,7 @@ class AssignOpTest(test.TestCase):
   def testBasic(self):
     self._testTypes(np.arange(0, 20).reshape([4, 5]))
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testAssignNonStrictShapeChecking(self):
     with self.cached_session():
       data = array_ops.fill([1024, 1024], 0)
@@ -101,7 +101,7 @@ class AssignOpTest(test.TestCase):
       a2.op.run()
       self.assertAllEqual(p.eval(), self.evaluate(data2))
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testInitRequiredAssignAdd(self):
     with self.cached_session():
       p = variables.VariableV1(array_ops.fill([1024, 1024], 1), dtypes.int32)
@@ -109,7 +109,7 @@ class AssignOpTest(test.TestCase):
       with self.assertRaisesOpError("use uninitialized"):
         a.op.run()
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testInitRequiredAssignSub(self):
     with self.cached_session():
       p = variables.VariableV1(array_ops.fill([1024, 1024], 1), dtypes.int32)
diff --git a/tensorflow/python/kernel_tests/depthwise_conv_op_test.py b/tensorflow/python/kernel_tests/depthwise_conv_op_test.py
index f6d834c2f85e36e4fdd0f91b9d9a893992096793..5b1a47fb03563f3c104e0d0ca158a0918dcb39b6 100644
--- a/tensorflow/python/kernel_tests/depthwise_conv_op_test.py
+++ b/tensorflow/python/kernel_tests/depthwise_conv_op_test.py
@@ -24,6 +24,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import nn_impl
@@ -185,6 +186,7 @@ class DepthwiseConv2DTest(test.TestCase):
     self.assertShapeEqual(native_result, conv_native)
     self.assertShapeEqual(native_result, conv_interface)
 
+  @test_util.run_v1_only("b/120545219")
   def testDepthwiseConv2D(self):
     for index, (input_size, filter_size, _, stride,
                 padding) in enumerate(ConfigsToTest()):
@@ -428,6 +430,7 @@ class DepthwiseConv2DTest(test.TestCase):
           use_gpu, grouped_conv, err)
       self.assertLess(err, tolerance)
 
+  @test_util.run_v1_only("b/120545219")
   def testDepthwiseConv2DInputGrad(self):
     for index, (input_size, filter_size, output_size, stride,
                 padding) in enumerate(CheckGradConfigsToTest()):
@@ -477,6 +480,7 @@ class DepthwiseConv2DTest(test.TestCase):
             use_gpu=True,
             data_format="NCHW")
 
+  @test_util.run_v1_only("b/120545219")
   def testDepthwiseConv2DFilterGrad(self):
     for index, (input_size, filter_size, output_size, stride,
                 padding) in enumerate(CheckGradConfigsToTest()):
diff --git a/tensorflow/python/kernel_tests/determinant_op_test.py b/tensorflow/python/kernel_tests/determinant_op_test.py
index d6ef9e70b83ad70d470c6cbc55ce16e1924bbeef..dbfda385ed221cda8c42843326bccb08a10e0689 100644
--- a/tensorflow/python/kernel_tests/determinant_op_test.py
+++ b/tensorflow/python/kernel_tests/determinant_op_test.py
@@ -133,6 +133,7 @@ class DeterminantOpTest(test.TestCase):
     huge_matrix = np.array([[max_double, 0.0], [0.0, max_double]])
     self._compareDeterminant(huge_matrix)
 
+  @test_util.run_v1_only("b/120545219")
   def testNonSquareMatrix(self):
     # When the determinant of a non-square matrix is attempted we should return
     # an error
@@ -140,6 +141,7 @@ class DeterminantOpTest(test.TestCase):
       linalg_ops.matrix_determinant(
           np.array([[1., 2., 3.], [3., 5., 4.]]).astype(np.float32))
 
+  @test_util.run_v1_only("b/120545219")
   def testWrongDimensions(self):
     # The input to the determinant should be a 2-dimensional tensor.
     tensor1 = constant_op.constant([1., 2.])
@@ -150,6 +152,7 @@ class DeterminantOpTest(test.TestCase):
     self._compareDeterminant(np.empty([0, 2, 2]))
     self._compareDeterminant(np.empty([2, 0, 0]))
 
+  @test_util.run_v1_only("b/120545219")
   def testConcurrentExecutesWithoutError(self):
     with self.session(use_gpu=True) as sess:
       matrix1 = random_ops.random_normal([5, 5], seed=42)
diff --git a/tensorflow/python/kernel_tests/distributions/multinomial_test.py b/tensorflow/python/kernel_tests/distributions/multinomial_test.py
index b3f3416a52faf78c269c76839a3f5d7ac533bbab..187ddd4cf417a54acbdd7bcd5fc60459336f11c9 100644
--- a/tensorflow/python/kernel_tests/distributions/multinomial_test.py
+++ b/tensorflow/python/kernel_tests/distributions/multinomial_test.py
@@ -22,6 +22,7 @@ from tensorflow.python.eager import backprop
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.distributions import multinomial
@@ -33,6 +34,7 @@ class MultinomialTest(test.TestCase):
   def setUp(self):
     self._rng = np.random.RandomState(42)
 
+  @test_util.run_v1_only("b/120545219")
   def testSimpleShapes(self):
     with self.cached_session():
       p = [.1, .3, .6]
@@ -42,6 +44,7 @@ class MultinomialTest(test.TestCase):
       self.assertEqual(tensor_shape.TensorShape([3]), dist.event_shape)
       self.assertEqual(tensor_shape.TensorShape([]), dist.batch_shape)
 
+  @test_util.run_v1_only("b/120545219")
   def testComplexShapes(self):
     with self.cached_session():
       p = 0.5 * np.ones([3, 2, 2], dtype=np.float32)
@@ -52,6 +55,7 @@ class MultinomialTest(test.TestCase):
       self.assertEqual(tensor_shape.TensorShape([2]), dist.event_shape)
       self.assertEqual(tensor_shape.TensorShape([3, 2]), dist.batch_shape)
 
+  @test_util.run_v1_only("b/120545219")
   def testN(self):
     p = [[0.1, 0.2, 0.7], [0.2, 0.3, 0.5]]
     n = [[3.], [4]]
@@ -60,6 +64,7 @@ class MultinomialTest(test.TestCase):
       self.assertEqual((2, 1), dist.total_count.get_shape())
       self.assertAllClose(n, dist.total_count.eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testP(self):
     p = [[0.1, 0.2, 0.7]]
     with self.cached_session():
@@ -68,6 +73,7 @@ class MultinomialTest(test.TestCase):
       self.assertEqual((1, 3), dist.logits.get_shape())
       self.assertAllClose(p, dist.probs.eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testLogits(self):
     p = np.array([[0.1, 0.2, 0.7]], dtype=np.float32)
     logits = np.log(p) - 50.
@@ -78,6 +84,7 @@ class MultinomialTest(test.TestCase):
       self.assertAllClose(p, multinom.probs.eval())
       self.assertAllClose(logits, multinom.logits.eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testPmfUnderflow(self):
     logits = np.array([[-200, 0]], dtype=np.float32)
     with self.cached_session():
@@ -85,6 +92,7 @@ class MultinomialTest(test.TestCase):
       lp = dist.log_prob([1., 0.]).eval()[0]
       self.assertAllClose(-200, lp, atol=0, rtol=1e-6)
 
+  @test_util.run_v1_only("b/120545219")
   def testPmfandCountsAgree(self):
     p = [[0.1, 0.2, 0.7]]
     n = [[5.]]
@@ -97,6 +105,7 @@ class MultinomialTest(test.TestCase):
       with self.assertRaisesOpError("counts must sum to `self.total_count`"):
         dist.prob([3., 3, 0]).eval()
 
+  @test_util.run_v1_only("b/120545219")
   def testPmfNonIntegerCounts(self):
     p = [[0.1, 0.2, 0.7]]
     n = [[5.]]
@@ -157,6 +166,7 @@ class MultinomialTest(test.TestCase):
       self.assertAllClose([0.1, 0.9], self.evaluate(pmf))
       self.assertEqual((2), pmf.get_shape())
 
+  @test_util.run_v1_only("b/120545219")
   def testPmfCountsStretchedInBroadcastWhenSameRank(self):
     with self.cached_session():
       p = [[0.1, 0.9], [0.7, 0.3]]
@@ -165,6 +175,7 @@ class MultinomialTest(test.TestCase):
       self.assertAllClose(pmf.eval(), [0.1, 0.7])
       self.assertEqual((2), pmf.get_shape())
 
+  @test_util.run_v1_only("b/120545219")
   def testPmfCountsStretchedInBroadcastWhenLowerRank(self):
     with self.cached_session():
       p = [[0.1, 0.9], [0.7, 0.3]]
@@ -194,6 +205,7 @@ class MultinomialTest(test.TestCase):
       self.evaluate(pmf)
       self.assertEqual((4, 3), pmf.get_shape())
 
+  @test_util.run_v1_only("b/120545219")
   def testMultinomialMean(self):
     with self.cached_session():
       n = 5.
@@ -203,6 +215,7 @@ class MultinomialTest(test.TestCase):
       self.assertEqual((3,), dist.mean().get_shape())
       self.assertAllClose(expected_means, dist.mean().eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testMultinomialCovariance(self):
     with self.cached_session():
       n = 5.
@@ -214,6 +227,7 @@ class MultinomialTest(test.TestCase):
       self.assertEqual((3, 3), dist.covariance().get_shape())
       self.assertAllClose(expected_covariances, dist.covariance().eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testMultinomialCovarianceBatch(self):
     with self.cached_session():
       # Shape [2]
@@ -246,6 +260,7 @@ class MultinomialTest(test.TestCase):
       self.assertEqual((3, 5, 4, 4), covariance.get_shape())
       self.assertEqual((6, 3, 3, 3), covariance2.get_shape())
 
+  @test_util.run_v1_only("b/120545219")
   def testCovarianceFromSampling(self):
     # We will test mean, cov, var, stddev on a DirichletMultinomial constructed
     # via broadcast between alpha, n.
@@ -288,6 +303,7 @@ class MultinomialTest(test.TestCase):
       self.assertAllClose(sample_var_, analytic_var, atol=0.01, rtol=0.01)
       self.assertAllClose(sample_stddev_, analytic_stddev, atol=0.01, rtol=0.01)
 
+  @test_util.run_v1_only("b/120545219")
   def testSampleUnbiasedNonScalarBatch(self):
     with self.cached_session() as sess:
       dist = multinomial.Multinomial(
@@ -317,6 +333,7 @@ class MultinomialTest(test.TestCase):
       self.assertAllClose(
           actual_covariance_, sample_covariance_, atol=0., rtol=0.20)
 
+  @test_util.run_v1_only("b/120545219")
   def testSampleUnbiasedScalarBatch(self):
     with self.cached_session() as sess:
       dist = multinomial.Multinomial(
diff --git a/tensorflow/python/kernel_tests/embedding_ops_test.py b/tensorflow/python/kernel_tests/embedding_ops_test.py
index 6019245d0f8463b8f65624c14d340fafe66dea84..3ea2071e13a24fb804924081add2f2b41f314716 100644
--- a/tensorflow/python/kernel_tests/embedding_ops_test.py
+++ b/tensorflow/python/kernel_tests/embedding_ops_test.py
@@ -851,8 +851,9 @@ class SafeEmbeddingLookupSparseTest(test.TestCase):
       embedding_weights = self._random_weights()
       sparse_ids, sparse_weights = self._ids_and_weights_2d()
 
-      embedding_lookup_result = (embedding_ops.safe_embedding_lookup_sparse(
-          embedding_weights, sparse_ids, sparse_weights).eval())
+      embedding_lookup_result = (
+          embedding_ops.safe_embedding_lookup_sparse_v2(
+              embedding_weights, sparse_ids, sparse_weights).eval())
 
       self.assertAllClose(
           embedding_lookup_result,
@@ -865,8 +866,10 @@ class SafeEmbeddingLookupSparseTest(test.TestCase):
       embedding_weights = self._random_weights()
       sparse_ids, sparse_weights = self._ids_and_weights_2d()
 
-      embedding_lookup_result = (embedding_ops.safe_embedding_lookup_sparse(
-          embedding_weights, sparse_ids, sparse_weights, default_id=3).eval())
+      embedding_lookup_result = (
+          embedding_ops.safe_embedding_lookup_sparse_v2(
+              embedding_weights, sparse_ids, sparse_weights,
+              default_id=3).eval())
 
       self.assertAllClose(
           embedding_lookup_result,
@@ -880,8 +883,9 @@ class SafeEmbeddingLookupSparseTest(test.TestCase):
       embedding_weights = self._random_weights()
       sparse_ids, _ = self._ids_and_weights_2d()
 
-      embedding_lookup_result = (embedding_ops.safe_embedding_lookup_sparse(
-          embedding_weights, sparse_ids, None).eval())
+      embedding_lookup_result = (
+          embedding_ops.safe_embedding_lookup_sparse_v2(
+              embedding_weights, sparse_ids, None).eval())
 
       self.assertAllClose(
           embedding_lookup_result,
@@ -895,8 +899,9 @@ class SafeEmbeddingLookupSparseTest(test.TestCase):
       embedding_weights = self._random_weights(num_shards=3)
       sparse_ids, _ = self._ids_and_weights_2d()
 
-      embedding_lookup_result = (embedding_ops.safe_embedding_lookup_sparse(
-          embedding_weights, sparse_ids, None).eval())
+      embedding_lookup_result = (
+          embedding_ops.safe_embedding_lookup_sparse_v2(
+              embedding_weights, sparse_ids, None).eval())
 
       embedding_weights = list(itertools.chain(*embedding_weights))
       self.assertAllClose(embedding_lookup_result,
@@ -926,8 +931,9 @@ class SafeEmbeddingLookupSparseTest(test.TestCase):
       embedding_weights = self._random_weights()
       sparse_ids, sparse_weights = self._ids_and_weights_3d()
 
-      embedding_lookup_result = (embedding_ops.safe_embedding_lookup_sparse(
-          embedding_weights, sparse_ids, sparse_weights).eval())
+      embedding_lookup_result = (
+          embedding_ops.safe_embedding_lookup_sparse_v2(
+              embedding_weights, sparse_ids, sparse_weights).eval())
 
       self.assertAllClose(embedding_lookup_result, [[
           (1.0 * embedding_weights[0][0] + 2.0 * embedding_weights[0][1]) / 3.0,
@@ -940,8 +946,10 @@ class SafeEmbeddingLookupSparseTest(test.TestCase):
       embedding_weights = self._random_weights()
       sparse_ids, sparse_weights = self._ids_and_weights_3d()
 
-      embedding_lookup_result = (embedding_ops.safe_embedding_lookup_sparse(
-          embedding_weights, sparse_ids, sparse_weights, default_id=3).eval())
+      embedding_lookup_result = (
+          embedding_ops.safe_embedding_lookup_sparse_v2(
+              embedding_weights, sparse_ids, sparse_weights,
+              default_id=3).eval())
 
       self.assertAllClose(
           embedding_lookup_result,
@@ -957,8 +965,9 @@ class SafeEmbeddingLookupSparseTest(test.TestCase):
       embedding_weights = self._random_weights()
       sparse_ids, _ = self._ids_and_weights_3d()
 
-      embedding_lookup_result = (embedding_ops.safe_embedding_lookup_sparse(
-          embedding_weights, sparse_ids, None).eval())
+      embedding_lookup_result = (
+          embedding_ops.safe_embedding_lookup_sparse_v2(
+              embedding_weights, sparse_ids, None).eval())
 
       self.assertAllClose(embedding_lookup_result, [[(
           embedding_weights[0][0] + embedding_weights[0][1]) / 2.0, [0] * 4, [
@@ -974,8 +983,9 @@ class SafeEmbeddingLookupSparseTest(test.TestCase):
       embedding_weights = self._random_weights(num_shards=3)
       sparse_ids, _ = self._ids_and_weights_3d()
 
-      embedding_lookup_result = (embedding_ops.safe_embedding_lookup_sparse(
-          embedding_weights, sparse_ids, None).eval())
+      embedding_lookup_result = (
+          embedding_ops.safe_embedding_lookup_sparse_v2(
+              embedding_weights, sparse_ids, None).eval())
 
       embedding_weights = list(itertools.chain(*embedding_weights))
       self.assertAllClose(embedding_lookup_result, [[
diff --git a/tensorflow/python/kernel_tests/fifo_queue_test.py b/tensorflow/python/kernel_tests/fifo_queue_test.py
index 9655351a01e7e566c091b6a0b1b54ec154fffa4c..0579dddb70264199a53c140ab60ad2ddf9b00bb9 100644
--- a/tensorflow/python/kernel_tests/fifo_queue_test.py
+++ b/tensorflow/python/kernel_tests/fifo_queue_test.py
@@ -39,6 +39,7 @@ from tensorflow.python.platform import test
 from tensorflow.python.util import compat
 
 
+@test_util.run_v1_only("b/120545219")
 class FIFOQueueTest(test.TestCase):
 
   def testConstructor(self):
@@ -1423,6 +1424,7 @@ class FIFOQueueTest(test.TestCase):
         session.run([a, c])
 
 
+@test_util.run_v1_only("b/120545219")
 class FIFOQueueDictTest(test.TestCase):
 
   def testConstructor(self):
@@ -1583,6 +1585,7 @@ class FIFOQueueDictTest(test.TestCase):
       self.assertTrue([compat.as_bytes("dd"), compat.as_bytes("ee")], list(s))
 
 
+@test_util.run_v1_only("b/120545219")
 class FIFOQueueWithTimeoutTest(test.TestCase):
 
   def testDequeueWithTimeout(self):
@@ -1617,6 +1620,7 @@ class FIFOQueueWithTimeoutTest(test.TestCase):
       self.assertEqual(37, self.evaluate(dequeued_t))
 
 
+@test_util.run_v1_only("b/120545219")
 class QueueContainerTest(test.TestCase):
 
   def testContainer(self):
diff --git a/tensorflow/python/kernel_tests/functional_ops_test.py b/tensorflow/python/kernel_tests/functional_ops_test.py
index c489623fe56610fb6d05f7c3bed1ae3532e10eeb..0d6a3cbd3527ac409ddf5c1c851c8993f404d029 100644
--- a/tensorflow/python/kernel_tests/functional_ops_test.py
+++ b/tensorflow/python/kernel_tests/functional_ops_test.py
@@ -466,7 +466,7 @@ class FunctionalOpsTest(test.TestCase):
     loss = l0 + array_ops.stop_gradient(l1)
     grad = gradients_impl.gradients(ys=[loss], xs=[a, b])
     with self.test_session(use_gpu=True) as sess:
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.evaluate(grad)
 
   @test_util.run_in_graph_and_eager_modes
@@ -494,7 +494,7 @@ class FunctionalOpsTest(test.TestCase):
 
   @test_util.disable_control_flow_v2("b/119323354")
   @test_util.run_in_graph_and_eager_modes
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testMapEmptyScalar(self):
     map_return = functional_ops.map_fn(lambda x: 1, constant_op.constant([]))
     self.assertAllEqual([0], map_return.get_shape().dims)
@@ -503,7 +503,7 @@ class FunctionalOpsTest(test.TestCase):
   # TODO(akshayka): this test fails in eager: the iterable is of length 0 so
   # so the body of the while loop never executes
   @test_util.disable_control_flow_v2("b/119323354")
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testMapEmptyTensor(self):
     with self.cached_session():
       map_return = functional_ops.map_fn(lambda x: array_ops.zeros([3, 2]),
@@ -797,7 +797,7 @@ class FunctionalOpsTest(test.TestCase):
     self.assertAllEqual(Run(100., False), 5050.)
     self.assertAllEqual(Run(100., True), 5050.)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testWhileError(self):
     for use_gpu in (True, False):
       with ops.Graph().as_default() as g:
@@ -1027,7 +1027,7 @@ class FunctionalOpsTest(test.TestCase):
   def testForMLPWhile(self):
     self._testForMLP(True)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testForError(self):
 
     @function.Defun(dtypes.int32, dtypes.float32)
@@ -1233,7 +1233,7 @@ class PartitionedCallTest(test.TestCase):
       self.assertAllEqual(expected, result)
 
   # Use an invalid executor name to test the plumbing of the executor_type attr.
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testExecutorTypeAttrExecutorNotFound(self):
     @function.Defun(dtypes.int32)
     def AddFive(x):
diff --git a/tensorflow/python/kernel_tests/identity_op_py_test.py b/tensorflow/python/kernel_tests/identity_op_py_test.py
index 1a6794e896f71cb18a8315b1ed50b798cd170973..40ec9db4226a89305732683118f7f906db1ba965 100644
--- a/tensorflow/python/kernel_tests/identity_op_py_test.py
+++ b/tensorflow/python/kernel_tests/identity_op_py_test.py
@@ -62,7 +62,7 @@ class IdentityOpTest(test.TestCase):
       self.assertEquals(shape,
                         array_ops.identity(np.array(array_2x3)).get_shape())
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testRefIdentityShape(self):
     with self.cached_session():
       shape = [2, 3]
diff --git a/tensorflow/python/kernel_tests/linalg_grad_test.py b/tensorflow/python/kernel_tests/linalg_grad_test.py
index 28e1d7e1684e8dcfd7bc2a589c6e6c1a04ed3299..ff84221611813cf37537b843087faa70ae1d3e8e 100644
--- a/tensorflow/python/kernel_tests/linalg_grad_test.py
+++ b/tensorflow/python/kernel_tests/linalg_grad_test.py
@@ -61,6 +61,7 @@ class MatrixUnaryFunctorGradientTest(test_lib.TestCase):
 
 def _GetMatrixUnaryFunctorGradientTest(functor_, dtype_, shape_, **kwargs_):
 
+  @test_util.run_v1_only('b/120545219')
   def Test(self):
     with self.session(use_gpu=True):
       np.random.seed(1)
@@ -103,6 +104,7 @@ def _GetMatrixBinaryFunctorGradientTest(functor_,
                                         float32_tol_fudge=1.0,
                                         **kwargs_):
 
+  @test_util.run_v1_only('b/120545219')
   def Test(self):
     # TODO(rmlarsen): Debug illegal address bug on CUDA and re-enable
     # GPU test for matrix_solve.
diff --git a/tensorflow/python/kernel_tests/list_ops_test.py b/tensorflow/python/kernel_tests/list_ops_test.py
index 8df1156438924693211ce3401972ac9dacb2346f..489f6c9b00471e6c10a8a04830613e9c5b99661a 100644
--- a/tensorflow/python/kernel_tests/list_ops_test.py
+++ b/tensorflow/python/kernel_tests/list_ops_test.py
@@ -880,6 +880,222 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     self.assertAllEqual(self.evaluate(result_0), [6., 8.])
     self.assertAllEqual(self.evaluate(result_1), [10., 12.])
 
+  @test_util.run_deprecated_v1
+  def testSkipEagerConcatShapeInference(self):
+
+    def BuildTensor(element_shape):
+      l = list_ops.empty_tensor_list(
+          element_dtype=dtypes.float32, element_shape=element_shape)
+      return list_ops.tensor_list_concat(l, element_dtype=dtypes.float32)
+
+    self.assertIsNone(BuildTensor(None).shape.rank)
+    self.assertAllEqual(BuildTensor([None, 2, 3]).shape.as_list(), [None, 2, 3])
+    self.assertAllEqual(
+        BuildTensor([None, 2, None]).shape.as_list(), [None, 2, None])
+    self.assertAllEqual(BuildTensor([1, 2, 3]).shape.as_list(), [None, 2, 3])
+
+  def testConcatWithFullyDefinedElementShape(self):
+    l = list_ops.empty_tensor_list(
+        element_dtype=dtypes.float32, element_shape=[2, 2])
+    l = list_ops.tensor_list_push_back(l, [[0., 1.], [2., 3.]])
+    l = list_ops.tensor_list_push_back(l, [[4., 5.], [6., 7.]])
+    t = list_ops.tensor_list_concat(l, element_dtype=dtypes.float32)
+    self.assertAllEqual(
+        self.evaluate(t), [[0., 1.], [2., 3.], [4., 5.], [6., 7.]])
+
+  def testConcatWithNonFullyDefinedElementShape(self):
+    l = list_ops.empty_tensor_list(
+        element_dtype=dtypes.float32, element_shape=[None, 2])
+    l = list_ops.tensor_list_push_back(l, [[0., 1.]])
+    l = list_ops.tensor_list_push_back(l, [[2., 3.], [4., 5.]])
+    t = list_ops.tensor_list_concat(l, element_dtype=dtypes.float32)
+    self.assertAllEqual(self.evaluate(t), [[0., 1.], [2., 3.], [4., 5.]])
+
+  def testConcatWithMismatchingTensorShapesFails(self):
+    l = list_ops.empty_tensor_list(
+        element_dtype=dtypes.float32, element_shape=None)
+    l = list_ops.tensor_list_push_back(l, [[0., 1.]])
+    l = list_ops.tensor_list_push_back(l, [[2.], [4.]])
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        r"Tried to concat tensors with unequal shapes: "
+        r"\[2\] vs \[1\]"):
+      t = list_ops.tensor_list_concat(l, element_dtype=dtypes.float32)
+      self.evaluate(t)
+
+  def testConcatEmptyListWithFullyDefinedElementShape(self):
+    l = list_ops.empty_tensor_list(
+        element_dtype=dtypes.float32, element_shape=[5, 2])
+    t = list_ops.tensor_list_concat(l, element_dtype=dtypes.float32)
+    self.assertAllEqual(self.evaluate(t).shape, (0, 2))
+    l = list_ops.empty_tensor_list(
+        element_dtype=dtypes.float32, element_shape=[None, 2])
+    t = list_ops.tensor_list_concat(l, element_dtype=dtypes.float32)
+    self.assertAllEqual(self.evaluate(t).shape, (0, 2))
+
+  def testConcatEmptyListWithUnknownElementShapeFails(self):
+    l = list_ops.empty_tensor_list(
+        element_dtype=dtypes.float32, element_shape=None)
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        "All except the first dimension must be fully"
+        " defined when concating an empty tensor list"):
+      t = list_ops.tensor_list_concat(l, element_dtype=dtypes.float32)
+      self.evaluate(t)
+
+  def testConcatEmptyListWithPartiallyDefinedElementShapeFails(self):
+    l = list_ops.empty_tensor_list(
+        element_dtype=dtypes.float32, element_shape=[2, None])
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        "All except the first dimension must be fully"
+        " defined when concating an empty tensor list"):
+      t = list_ops.tensor_list_concat(l, element_dtype=dtypes.float32)
+      self.evaluate(t)
+
+  def testConcatListWithScalarElementShapeFails(self):
+    l = list_ops.empty_tensor_list(
+        element_dtype=dtypes.float32, element_shape=tensor_shape.scalar())
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        "Concat requires elements to be at least vectors, "
+        "found scalars instead"):
+      t = list_ops.tensor_list_concat(l, element_dtype=dtypes.float32)
+      self.evaluate(t)
+
+  def testConcatListWithScalarElementsFails(self):
+    l = list_ops.empty_tensor_list(
+        element_dtype=dtypes.float32, element_shape=None)
+    l1 = list_ops.tensor_list_push_back(l, 1.)
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError, "Concat saw a scalar shape at index 0"
+        " but requires at least vectors"):
+      t = list_ops.tensor_list_concat(l1, element_dtype=dtypes.float32)
+      self.evaluate(t)
+    l1 = list_ops.tensor_list_push_back(l, [1.])
+    l1 = list_ops.tensor_list_push_back(l1, 2.)
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError, "Concat saw a scalar shape at index 1"
+        " but requires at least vectors"):
+      t = list_ops.tensor_list_concat(l1, element_dtype=dtypes.float32)
+      self.evaluate(t)
+
+  def testEvenSplit(self):
+
+    def RunTest(input_tensor, lengths, expected_stacked_output):
+      l = list_ops.tensor_list_split(
+          input_tensor, element_shape=None, lengths=lengths)
+      self.assertAllEqual(
+          list_ops.tensor_list_stack(l, element_dtype=dtypes.float32),
+          expected_stacked_output)
+
+    RunTest([1., 2., 3.], [1, 1, 1], [[1.], [2.], [3.]])
+    RunTest([1., 2., 3., 4.], [2, 2], [[1., 2.], [3., 4.]])
+    RunTest([[1., 2.], [3., 4.]], [1, 1], [[[1., 2.]], [[3., 4.]]])
+
+  def testUnevenSplit(self):
+    l = list_ops.tensor_list_split([1., 2., 3., 4., 5],
+                                   element_shape=None,
+                                   lengths=[3, 2])
+    self.assertAllEqual(list_ops.tensor_list_length(l), 2)
+    self.assertAllEqual(
+        list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32),
+        [1., 2., 3.])
+    self.assertAllEqual(
+        list_ops.tensor_list_get_item(l, 1, element_dtype=dtypes.float32),
+        [4., 5.])
+
+  @test_util.run_deprecated_v1
+  def testSkipEagerSplitWithInvalidTensorShapeFails(self):
+    with self.cached_session():
+      tensor = array_ops.placeholder(dtype=dtypes.float32)
+      l = list_ops.tensor_list_split(tensor, element_shape=None, lengths=[1])
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r"Tensor must be at least a vector, but saw shape: \[\]"):
+        l.eval({tensor: 1})
+
+  @test_util.run_deprecated_v1
+  def testSkipEagerSplitWithInvalidLengthsShapeFails(self):
+    with self.cached_session():
+      lengths = array_ops.placeholder(dtype=dtypes.int64)
+      l = list_ops.tensor_list_split([1., 2.],
+                                     element_shape=None,
+                                     lengths=lengths)
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r"Expected lengths to be a vector, received shape: \[\]"):
+        l.eval({lengths: 1})
+
+  def testSplitWithInvalidLengthsFails(self):
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 r"Invalid value in lengths: -1"):
+      l = list_ops.tensor_list_split([1., 2.],
+                                     element_shape=None,
+                                     lengths=[1, -1])
+      self.evaluate(l)
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        r"Attempting to slice \[0, 3\] from tensor with length 2"):
+      l = list_ops.tensor_list_split([1., 2.], element_shape=None, lengths=[3])
+      self.evaluate(l)
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        r"Unused values in tensor. Length of tensor: 2 Values used: 1"):
+      l = list_ops.tensor_list_split([1., 2.], element_shape=None, lengths=[1])
+      self.evaluate(l)
+
+  @test_util.run_deprecated_v1
+  def testSkipEagerSplitWithScalarElementShapeFails(self):
+    with self.assertRaisesRegexp(ValueError,
+                                 r"Shapes must be equal rank, but are 1 and 0"):
+      l = list_ops.tensor_list_split([1., 2.], element_shape=[], lengths=[1, 1])
+    with self.cached_session():
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r"TensorListSplit requires element_shape to be at least of rank 1, "
+          r"but saw: \[\]"):
+        element_shape = array_ops.placeholder(dtype=dtypes.int32)
+        l = list_ops.tensor_list_split([1., 2.],
+                                       element_shape=element_shape,
+                                       lengths=[1, 1])
+        l.eval({element_shape: []})
+
+  def testEagerOnlySplitWithScalarElementShapeFails(self):
+    if context.executing_eagerly():
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r"TensorListSplit requires element_shape to be at least of rank 1, "
+          r"but saw: \[\]"):
+        list_ops.tensor_list_split([1., 2.], element_shape=[], lengths=[1, 1])
+
+  @test_util.run_deprecated_v1
+  def testSkipEagerSplitWithIncompatibleTensorShapeAndElementShapeFails(self):
+    with self.assertRaisesRegexp(ValueError,
+                                 r"Shapes must be equal rank, but are 2 and 1"):
+      l = list_ops.tensor_list_split([[1.], [2.]],
+                                     element_shape=[1],
+                                     lengths=[1, 1])
+
+    with self.cached_session():
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r"tensor shape \[2,1\] is not compatible with element_shape \[1\]"):
+        element_shape = array_ops.placeholder(dtype=dtypes.int32)
+        l = list_ops.tensor_list_split([[1.], [2.]],
+                                       element_shape=element_shape,
+                                       lengths=[1, 1])
+        l.eval({element_shape: [1]})
+
+  def testEagerOnlySplitWithIncompatibleTensorShapeAndElementShapeFails(self):
+    if context.executing_eagerly():
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r"tensor shape \[2,1\] is not compatible with element_shape \[1\]"):
+        list_ops.tensor_list_split([[1.], [2.]],
+                                   element_shape=[1],
+                                   lengths=[1, 1])
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/losses_test.py b/tensorflow/python/kernel_tests/losses_test.py
index abff61f81b08a131f7ae2e2bab81ba04530f36cf..4584a27e6227bf53e4de5f74730cc9b737214cd5 100644
--- a/tensorflow/python/kernel_tests/losses_test.py
+++ b/tensorflow/python/kernel_tests/losses_test.py
@@ -51,26 +51,26 @@ class AbsoluteDifferenceLossTest(test.TestCase):
         losses.absolute_difference(
             self._predictions, self._predictions, weights=None)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testAllCorrectNoLossWeight(self):
     loss = losses.absolute_difference(self._predictions, self._predictions)
     with self.cached_session():
       self.assertAlmostEqual(0.0, self.evaluate(loss), 3)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLoss(self):
     loss = losses.absolute_difference(self._labels, self._predictions)
     with self.cached_session():
       self.assertAlmostEqual(5.5, self.evaluate(loss), 3)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithPythonScalarWeight(self):
     weights = 2.3
     loss = losses.absolute_difference(self._labels, self._predictions, weights)
     with self.cached_session():
       self.assertAlmostEqual(5.5 * weights, self.evaluate(loss), 3)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithScalarTensorWeight(self):
     weights = 2.3
     loss = losses.absolute_difference(self._labels, self._predictions,
@@ -148,7 +148,7 @@ class SoftmaxCrossEntropyLossTest(test.TestCase):
       self.assertEquals(loss.op.name, 'softmax_cross_entropy_loss/value')
       self.assertAlmostEqual(loss.eval(), 10.0, 3)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithPythonScalarWeight(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
                                    [0.0, 0.0, 10.0]])
@@ -158,7 +158,7 @@ class SoftmaxCrossEntropyLossTest(test.TestCase):
       loss = losses.softmax_cross_entropy(labels, logits, weights)
       self.assertAlmostEqual(weights * 10.0, self.evaluate(loss), 3)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithScalarTensorWeight(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
                                    [0.0, 0.0, 10.0]])
@@ -311,7 +311,7 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
       self.assertEquals(loss.op.name, 'sparse_softmax_cross_entropy_loss/value')
       self.assertAlmostEqual(loss.eval(), 10.0, 3)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithPythonScalarWeight(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
                                    [0.0, 0.0, 10.0]])
@@ -321,7 +321,7 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
       loss = losses.sparse_softmax_cross_entropy(labels, logits, weights)
       self.assertAlmostEqual(weights * 10.0, self.evaluate(loss), 3)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithScalarTensorWeight(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
                                    [0.0, 0.0, 10.0]])
@@ -677,13 +677,13 @@ class LogLossTest(test.TestCase):
       with self.assertRaises(ValueError):
         losses.log_loss(self._labels, self._labels, weights=None)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testAllCorrectNoLossWeight(self):
     loss = losses.log_loss(self._labels, self._labels)
     with self.cached_session():
       self.assertAlmostEqual(0.0, self.evaluate(loss), 3)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testAllCorrectNoLossWeightWithPlaceholder(self):
     tf_predictions = array_ops.placeholder(
         dtypes.float32, shape=self._np_labels.shape)
@@ -692,14 +692,14 @@ class LogLossTest(test.TestCase):
       self.assertAlmostEqual(
           0.0, loss.eval(feed_dict={tf_predictions: self._np_labels}), 3)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLoss(self):
     loss = losses.log_loss(self._labels, self._predictions)
     with self.cached_session():
       self.assertAlmostEqual(-np.sum(self._expected_losses) / 6.0,
                              self.evaluate(loss), 3)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithPythonScalarWeight(self):
     weights = 2.3
     loss = losses.log_loss(self._labels, self._predictions, weights)
@@ -707,7 +707,7 @@ class LogLossTest(test.TestCase):
       self.assertAlmostEqual(weights * -np.sum(self._expected_losses) / 6.0,
                              self.evaluate(loss), 3)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithScalarTensorWeight(self):
     weights = 2.3
     loss = losses.log_loss(self._labels, self._predictions,
@@ -716,7 +716,7 @@ class LogLossTest(test.TestCase):
       self.assertAlmostEqual(weights * -np.sum(self._expected_losses) / 6.0,
                              self.evaluate(loss), 3)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithScalarTensorWeightAndPlaceholder(self):
     tf_predictions = array_ops.placeholder(
         dtypes.float32, shape=self._np_predictions.shape)
@@ -728,7 +728,7 @@ class LogLossTest(test.TestCase):
       self.assertAlmostEqual(weights * -np.sum(self._expected_losses) / 6.0,
                              loss, 3)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithScalarTensorWeightAndPlaceholderWithRankOnly(self):
     tf_predictions = array_ops.placeholder(dtypes.float32, shape=[None, None])
     weights = 2.3
@@ -788,7 +788,7 @@ class LogLossTest(test.TestCase):
       self.assertAlmostEqual(-np.sum(expected_losses) / 5.0,
                              self.evaluate(loss), 3)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithMeasurementSpecificWeightsWithPlaceholder(self):
     weights = np.array([3, 6, 5, 0, 4, 2]).reshape((2, 3))
     expected_losses = np.multiply(self._expected_losses, weights)
@@ -816,7 +816,7 @@ class LogLossTest(test.TestCase):
     with self.cached_session():
       self.assertAlmostEqual(-np.sum(expected_losses), self.evaluate(loss), 3)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithSampleSpecificWeightsMostZeroWithPlaceholder(self):
     weights = np.array([0, 0, 0, 0, 0, 2]).reshape((2, 3))
     expected_losses = np.multiply(self._expected_losses, weights)
@@ -955,26 +955,26 @@ class MeanSquaredErrorTest(test.TestCase):
           losses.mean_squared_error(predictions=constant_op.constant(0),
                                     labels=constant_op.constant(0)).eval())
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testAllCorrectNoLossWeight(self):
     loss = losses.mean_squared_error(self._predictions, self._predictions)
     with self.cached_session():
       self.assertAlmostEqual(0.0, self.evaluate(loss), 3)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLoss(self):
     loss = losses.mean_squared_error(self._labels, self._predictions)
     with self.cached_session():
       self.assertAlmostEqual(49.5, self.evaluate(loss), 3)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithPythonScalarWeight(self):
     weights = 2.3
     loss = losses.mean_squared_error(self._labels, self._predictions, weights)
     with self.cached_session():
       self.assertAlmostEqual(49.5 * weights, self.evaluate(loss), 3)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithScalarTensorWeight(self):
     weights = 2.3
     loss = losses.mean_squared_error(self._labels, self._predictions,
@@ -1068,12 +1068,12 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
       self.assertAlmostEqual(
           expected_loss, dynamic_inputs_op.eval(feed_dict=feed_dict), places=3)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testAllCorrectNoLossWeight(self):
     self._test_valid_weights(
         self._labels, self._labels, expected_loss=0.0)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLoss(self):
     self._test_valid_weights(
         self._labels, self._predictions,
@@ -1104,7 +1104,7 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
           np_grad = self.evaluate(grad)
           self.assertFalse(np.isnan(np_grad).any())
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithPythonScalarWeight(self):
     weight = 2.3
     self._test_valid_weights(
@@ -1112,7 +1112,7 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
         expected_loss=weight * np.sum(self._expected_losses),
         weights=weight)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithScalarTensorWeight(self):
     weights = 2.3
     loss = losses.mean_pairwise_squared_error(
@@ -1179,7 +1179,7 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
             weights_placeholder: weights,
         })
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testInvalid3dWeighted2x0(self):
     labels = np.array([
         [[1, 9, 2], [12, 11, 10], [9, 8, 7]],
diff --git a/tensorflow/python/kernel_tests/lu_op_test.py b/tensorflow/python/kernel_tests/lu_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..06deb0e1c82175c33b028e017a5f54cc2549253b
--- /dev/null
+++ b/tensorflow/python/kernel_tests/lu_op_test.py
@@ -0,0 +1,288 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tensorflow.ops.tf.Lu."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import functional_ops
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import benchmark
+from tensorflow.python.platform import test
+
+
+class LuOpTest(test.TestCase):
+
+  @property
+  def float_types(self):
+    return set((np.float64, np.float32, np.complex64, np.complex128))
+
+  def _verifyLuBase(self, x, lower, upper, perm, verification,
+                    output_idx_type):
+    lower_np, upper_np, perm_np, verification_np = self.evaluate(
+        [lower, upper, perm, verification])
+
+    self.assertAllClose(x, verification_np)
+    self.assertShapeEqual(x, lower)
+    self.assertShapeEqual(x, upper)
+
+    self.assertAllEqual(x.shape[:-1], perm.shape.as_list())
+
+    # Check dtypes are as expected.
+    self.assertEqual(x.dtype, lower_np.dtype)
+    self.assertEqual(x.dtype, upper_np.dtype)
+    self.assertEqual(output_idx_type.as_numpy_dtype, perm_np.dtype)
+
+    # Check that the permutation is valid.
+    if perm_np.shape[-1] > 0:
+      perm_reshaped = np.reshape(perm_np, (-1, perm_np.shape[-1]))
+      for perm_vector in perm_reshaped:
+        self.assertAllClose(np.arange(len(perm_vector)), np.sort(perm_vector))
+
+  def _verifyLu(self, x, output_idx_type=dtypes.int64):
+    # Verify that Px = LU.
+    with test_util.use_gpu():
+
+      lu, perm = linalg_ops.lu(x, output_idx_type=output_idx_type)
+
+      # Prepare the lower factor of shape num_rows x num_rows
+      lu_shape = np.array(lu.shape.as_list())
+      batch_shape = lu_shape[:-2]
+      num_rows = lu_shape[-2]
+      num_cols = lu_shape[-1]
+
+      lower = array_ops.matrix_band_part(lu, -1, 0)
+
+      if num_rows > num_cols:
+        eye = linalg_ops.eye(
+            num_rows, batch_shape=batch_shape, dtype=lower.dtype)
+        lower = array_ops.concat([lower, eye[..., num_cols:]], axis=-1)
+      elif num_rows < num_cols:
+        lower = lower[..., :num_rows]
+
+      # Fill the diagonal with ones.
+      ones_diag = array_ops.ones(
+          np.append(batch_shape, num_rows), dtype=lower.dtype)
+      lower = array_ops.matrix_set_diag(lower, ones_diag)
+
+      # Prepare the upper factor.
+      upper = array_ops.matrix_band_part(lu, 0, -1)
+
+      verification = math_ops.matmul(lower, upper)
+
+      # Permute the rows of product of the Cholesky factors.
+      if num_rows > 0:
+        # Reshape the product of the triangular factors and permutation indices
+        # to a single batch dimension. This makes it easy to apply
+        # invert_permutation and gather_nd ops.
+        perm_reshaped = array_ops.reshape(perm, [-1, num_rows])
+        verification_reshaped = array_ops.reshape(verification,
+                                                  [-1, num_rows, num_cols])
+        # Invert the permutation in each batch.
+        inv_perm_reshaped = functional_ops.map_fn(array_ops.invert_permutation,
+                                                  perm_reshaped)
+        batch_size = perm_reshaped.shape.as_list()[0]
+        # Prepare the batch indices with the same shape as the permutation.
+        # The corresponding batch index is paired with each of the `num_rows`
+        # permutation indices.
+        batch_indices = math_ops.cast(
+            array_ops.broadcast_to(
+                math_ops.range(batch_size)[:, None], perm_reshaped.shape),
+            dtype=output_idx_type)
+        permuted_verification_reshaped = array_ops.gather_nd(
+            verification_reshaped,
+            array_ops.stack([batch_indices, inv_perm_reshaped], axis=-1))
+
+        # Reshape the verification matrix back to the original shape.
+        verification = array_ops.reshape(permuted_verification_reshaped,
+                                         lu_shape)
+
+      self._verifyLuBase(x, lower, upper, perm, verification,
+                         output_idx_type)
+
+  def testBasic(self):
+    data = np.array([[4., -1., 2.], [-1., 6., 0], [10., 0., 5.]])
+
+    for dtype in (np.float32, np.float64):
+      for output_idx_type in (dtypes.int32, dtypes.int64):
+        self._verifyLu(data.astype(dtype), output_idx_type=output_idx_type)
+
+    for dtype in (np.complex64, np.complex128):
+      for output_idx_type in (dtypes.int32, dtypes.int64):
+        complex_data = np.tril(1j * data, -1).astype(dtype)
+        complex_data += np.triu(-1j * data, 1).astype(dtype)
+        complex_data += data
+        self._verifyLu(complex_data, output_idx_type=output_idx_type)
+
+  def testPivoting(self):
+    with test_util.use_gpu():
+      # This matrix triggers partial pivoting because the first diagonal entry
+      # is small.
+      data = np.array([[1e-9, 1., 0.], [1., 0., 0], [0., 1., 5]])
+      self._verifyLu(data.astype(np.float32))
+
+      for dtype in (np.float32, np.float64):
+        self._verifyLu(data.astype(dtype))
+        _, p = linalg_ops.lu(data)
+        p_val = self.evaluate([p])
+        # Make sure p_val is not the identity permutation.
+        self.assertNotAllClose(np.arange(3), p_val)
+
+      for dtype in (np.complex64, np.complex128):
+        complex_data = np.tril(1j * data, -1).astype(dtype)
+        complex_data += np.triu(-1j * data, 1).astype(dtype)
+        complex_data += data
+        self._verifyLu(complex_data)
+        _, p = linalg_ops.lu(data)
+        p_val = self.evaluate([p])
+        # Make sure p_val is not the identity permutation.
+        self.assertNotAllClose(np.arange(3), p_val)
+
+  def testInvalidMatrix(self):
+    # LU factorization gives an error when the input is singular.
+    # Note: A singular matrix may return without error but it won't be a valid
+    # factorization.
+    with test_util.use_gpu():
+      for dtype in self.float_types:
+        with self.assertRaises(errors.InvalidArgumentError):
+          self.evaluate(
+              linalg_ops.lu(
+                  np.array([[1., 2., 3.], [2., 4., 6.], [2., 3., 4.]],
+                           dtype=dtype)))
+        with self.assertRaises(errors.InvalidArgumentError):
+          self.evaluate(
+              linalg_ops.lu(
+                  np.array([[[1., 2., 3.], [2., 4., 6.], [1., 2., 3.]],
+                            [[1., 2., 3.], [3., 4., 5.], [5., 6., 7.]]],
+                           dtype=dtype)))
+
+  def testBatch(self):
+    simple_array = np.array([[[1., -1.], [2., 5.]]])  # shape (1, 2, 2)
+    self._verifyLu(simple_array)
+    self._verifyLu(np.vstack((simple_array, simple_array)))
+    odd_sized_array = np.array([[[4., -1., 2.], [-1., 6., 0], [2., 0., 5.]]])
+    self._verifyLu(np.vstack((odd_sized_array, odd_sized_array)))
+
+    batch_size = 200
+
+    # Generate random matrices.
+    np.random.seed(42)
+    matrices = np.random.rand(batch_size, 5, 5)
+    self._verifyLu(matrices)
+
+    # Generate random complex valued matrices.
+    np.random.seed(52)
+    matrices = np.random.rand(batch_size, 5,
+                              5) + 1j * np.random.rand(batch_size, 5, 5)
+    self._verifyLu(matrices)
+
+  def testLargeMatrix(self):
+    # Generate random matrices.
+    n = 500
+    np.random.seed(64)
+    data = np.random.rand(n, n)
+    self._verifyLu(data)
+
+    # Generate random complex valued matrices.
+    np.random.seed(129)
+    data = np.random.rand(n, n) + 1j * np.random.rand(n, n)
+    self._verifyLu(data)
+
+  @test_util.run_v1_only("b/120545219")
+  def testEmpty(self):
+    self._verifyLu(np.empty([0, 2, 2]))
+    self._verifyLu(np.empty([2, 0, 0]))
+
+  @test_util.run_deprecated_v1
+  def testConcurrentExecutesWithoutError(self):
+    with test_util.use_gpu():
+      matrix1 = random_ops.random_normal([5, 5], seed=42)
+      matrix2 = random_ops.random_normal([5, 5], seed=42)
+      lu1, p1 = linalg_ops.lu(matrix1)
+      lu2, p2 = linalg_ops.lu(matrix2)
+      lu1_val, p1_val, lu2_val, p2_val = self.evaluate([lu1, p1, lu2, p2])
+      self.assertAllEqual(lu1_val, lu2_val)
+      self.assertAllEqual(p1_val, p2_val)
+
+
+class LuBenchmark(test.Benchmark):
+  shapes = [
+      (4, 4),
+      (10, 10),
+      (16, 16),
+      (101, 101),
+      (256, 256),
+      (1000, 1000),
+      (1024, 1024),
+      (2048, 2048),
+      (4096, 4096),
+      (513, 2, 2),
+      (513, 8, 8),
+      (513, 256, 256),
+      (4, 513, 2, 2),
+  ]
+
+  def _GenerateMatrix(self, shape):
+    batch_shape = shape[:-2]
+    shape = shape[-2:]
+    assert shape[0] == shape[1]
+    n = shape[0]
+    matrix = np.ones(shape).astype(np.float32) / (2.0 * n) + np.diag(
+        np.ones(n).astype(np.float32))
+    return np.tile(matrix, batch_shape + (1, 1))
+
+  def benchmarkLuOp(self):
+    for shape in self.shapes:
+      with ops.Graph().as_default(), \
+          session.Session(config=benchmark.benchmark_config()) as sess, \
+          ops.device("/cpu:0"):
+        matrix = variables.Variable(self._GenerateMatrix(shape))
+        lu, p = linalg_ops.lu(matrix)
+        variables.global_variables_initializer().run()
+        self.run_op_benchmark(
+            sess,
+            control_flow_ops.group(lu, p),
+            min_iters=25,
+            name="lu_cpu_{shape}".format(shape=shape))
+
+      if test.is_gpu_available(True):
+        with ops.Graph().as_default(), \
+            session.Session(config=benchmark.benchmark_config()) as sess, \
+            ops.device("/device:GPU:0"):
+          matrix = variables.Variable(self._GenerateMatrix(shape))
+          lu, p = linalg_ops.lu(matrix)
+          variables.global_variables_initializer().run()
+          self.run_op_benchmark(
+              sess,
+              control_flow_ops.group(lu, p),
+              min_iters=25,
+              name="lu_gpu_{shape}".format(shape=shape))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/matmul_op_test.py b/tensorflow/python/kernel_tests/matmul_op_test.py
index 983f463f5e34faeb292ef445ec127d9f7c9879f2..d31ecbcd3f1d57386fa629cd533f5f698176ca76 100644
--- a/tensorflow/python/kernel_tests/matmul_op_test.py
+++ b/tensorflow/python/kernel_tests/matmul_op_test.py
@@ -21,11 +21,12 @@ from __future__ import print_function
 import operator
 import numpy as np
 
+from tensorflow.python import tf2
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import gradient_checker
+from tensorflow.python.ops import gradient_checker_v2
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variables
@@ -41,11 +42,9 @@ class MatVecTest(test_lib.TestCase):
   def testTwoByTwoCase(self):
     a = np.array([[1, 2], [3, 4]])
     b = np.array([5, 6])
-    with self.cached_session():
-      c = math_ops.matvec(a, b)
-      self.assertAllEqual((2,), c.shape)
-      c_ = self.evaluate(c)
-    self.assertAllEqual([5 + 2 * 6, 3 * 5 + 4 * 6], c_)
+    c = math_ops.matvec(a, b)
+    self.assertAllEqual((2,), c.shape)
+    self.assertAllEqual([5 + 2 * 6, 3 * 5 + 4 * 6], c)
 
 
 def _AddTest(test, op_name, testcase_name, fn):
@@ -85,7 +84,7 @@ def _GetMatMulTest(a_np_, b_np_, use_static_shape_, **kwargs_):
     # np.matrix(a_np_) * np.matrix(b_np_)
     effective_a_np = _GetTransposedMatrices(a_np_, "a", kwargs_)
     effective_b_np = _GetTransposedMatrices(b_np_, "b", kwargs_)
-    with self.session(use_gpu=use_gpu) as sess:
+    with self.cached_session() as sess, test_util.device(use_gpu):
       if use_static_shape_:
         a = constant_op.constant(effective_a_np)
         b = constant_op.constant(effective_b_np)
@@ -128,45 +127,45 @@ def _GetMatMulGradientTest(a_np_, b_np_, use_static_shape_, **kwargs_):
     epsilon = np.finfo(a_np_.dtype).eps
     delta = epsilon**(1.0 / 3.0)
     tol = 20 * delta
-    with self.session(use_gpu=True):
-      a = constant_op.constant(effective_a_np)
-      b = constant_op.constant(effective_b_np)
-      res = math_ops.matmul(a, b, **kwargs_)
-      for x, x_init in [a, effective_a_np], [b, effective_b_np]:
-        theoretical, numerical = gradient_checker.compute_gradient(
-            x,
-            x_init.shape,
-            res, [a_np_.shape[0], b_np_.shape[1]],
-            x_init_value=x_init,
-            delta=delta)
-        self.assertAllClose(theoretical, numerical, rtol=tol, atol=tol)
+    with self.session(), test_util.use_gpu():
+      theoretical, numerical = gradient_checker_v2.compute_gradient(
+          lambda x: math_ops.matmul(x, effective_b_np, **kwargs_),
+          [effective_a_np],
+          delta=delta)
+      self.assertAllClose(theoretical, numerical, rtol=tol, atol=tol)
+
+      theoretical, numerical = gradient_checker_v2.compute_gradient(
+          lambda x: math_ops.matmul(effective_a_np, x, **kwargs_),
+          [effective_b_np],
+          delta=delta)
+      self.assertAllClose(theoretical, numerical, rtol=tol, atol=tol)
 
   return Test
 
 
 class MatMulStatsTest(test_lib.TestCase):
 
+  @test_util.run_v1_only("Test requires a Graph and NodeDef inspection")
   def testSimpleStatistics(self):
-    g = ops.Graph()
-    with g.as_default():
-      a = variables.Variable(random_ops.random_normal([25, 16]))
-      b = variables.Variable(random_ops.random_normal([16, 9]))
-      math_ops.matmul(a, b)
-      for op in g.get_operations():
-        flops = ops.get_stats_for_node_def(g, op.node_def, "flops").value
-        if op.name == "MatMul":
-          self.assertEqual(7200, flops)
-
+    a = variables.Variable(random_ops.random_normal([25, 16]))
+    b = variables.Variable(random_ops.random_normal([16, 9]))
+    math_ops.matmul(a, b)
+    g = ops.get_default_graph()
+    for op in g.get_operations():
+      flops = ops.get_stats_for_node_def(g, op.node_def, "flops").value
+      if op.name == "MatMul":
+        self.assertEqual(7200, flops)
+
+  @test_util.run_v1_only("Test requires a Graph and NodeDef inspection")
   def testTransposedStatistics(self):
-    g = ops.Graph()
-    with g.as_default():
-      a = variables.Variable(random_ops.random_normal([16, 25]))
-      b = variables.Variable(random_ops.random_normal([16, 9]))
-      math_ops.matmul(a, b, transpose_a=True)
-      for op in g.get_operations():
-        flops = ops.get_stats_for_node_def(g, op.node_def, "flops").value
-        if op.name == "MatMul":
-          self.assertEqual(7200, flops)
+    a = variables.Variable(random_ops.random_normal([16, 25]))
+    b = variables.Variable(random_ops.random_normal([16, 9]))
+    math_ops.matmul(a, b, transpose_a=True)
+    g = ops.get_default_graph()
+    for op in g.get_operations():
+      flops = ops.get_stats_for_node_def(g, op.node_def, "flops").value
+      if op.name == "MatMul":
+        self.assertEqual(7200, flops)
 
 
 try:
@@ -194,43 +193,40 @@ except AttributeError:
 
 class MatMulInfixOperatorTest(test_lib.TestCase):
 
-  @test_util.run_deprecated_v1
   def testMismatchedShape(self):
-    with self.assertRaisesWithPredicateMatch(ValueError,
-                                             lambda e: "Shape must" in str(e)):
+    with self.assertRaisesRegexp(
+        Exception, "(Shape must be rank 2 but is rank 1|is not a matrix)"):
       infix_matmul(
           ops.convert_to_tensor([10.0, 20.0, 30.0]),
           ops.convert_to_tensor([[40.0, 50.0], [60.0, 70.0]]))
 
-  @test_util.run_deprecated_v1
   def testMismatchedDimensions(self):
-    with self.assertRaisesWithPredicateMatch(
-        ValueError, lambda e: "Dimensions must" in str(e)):
+    with self.assertRaisesRegexp(
+        Exception, "(Dimensions must be equal|Matrix size-incompatible)"):
       infix_matmul(
           ops.convert_to_tensor([[10.0, 20.0, 30.0]]),
           ops.convert_to_tensor([[40.0, 50.0], [60.0, 70.0]]))
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("Tensor.op is generally not applicable in TF 2")
   def testInfixMatmulIsTfMatmul(self):
     a = ops.convert_to_tensor([[10.0, 20.0, 30.0]])
     b = ops.convert_to_tensor([[40.0, 50.0], [60.0, 70.0], [80.0, 90.0]])
     c = infix_matmul(a, b)
     self.assertEqual(c.op.type, "MatMul")
 
-  @test_util.run_deprecated_v1
   def testInfixMatmulDoesDotProduct(self):
     a = ops.convert_to_tensor([[10.0, 20.0, 30.0]])
     b = ops.convert_to_tensor([[40.0, 50.0], [60.0, 70.0], [80.0, 90.0]])
     c = infix_matmul(a, b)
     d = math_ops.matmul(a, b)
-    with self.cached_session():
-      self.assertAllEqual(c.eval(), self.evaluate(d))
+    self.assertAllEqual(c, d)
 
 
 if __name__ == "__main__":
   sizes = [1, 3, 5]
   trans_options = [[False, False], [True, False], [False, True]]
-  for use_static_shape in [False, True]:
+  # TF2 does not support placeholders under eager so we skip it
+  for use_static_shape in set([True, tf2.enabled()]):
     for dtype in (np.int32, np.int64, np.float16, np.float32, np.float64,
                   np.complex64, np.complex128):
       if not use_static_shape and (dtype == np.int32 or dtype == np.int64):
diff --git a/tensorflow/python/kernel_tests/matrix_band_part_op_test.py b/tensorflow/python/kernel_tests/matrix_band_part_op_test.py
index 129ea40dfe67e916dad24bf4824e0f33ce084ff7..fdb7e4a1a4e54883afd66e6a856a977b61ff8aaf 100644
--- a/tensorflow/python/kernel_tests/matrix_band_part_op_test.py
+++ b/tensorflow/python/kernel_tests/matrix_band_part_op_test.py
@@ -23,6 +23,7 @@ from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes as dtypes_lib
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gradient_checker
@@ -44,6 +45,7 @@ class MatrixBandPartTest(test_lib.TestCase):
 
 def _GetMatrixBandPartTest(dtype_, batch_shape_, shape_):
 
+  @test_util.run_v1_only("b/120545219")
   def Test(self):
     mat = np.ones(shape_).astype(dtype_)
     batch_mat = np.tile(mat, batch_shape_ + (1, 1))
@@ -73,6 +75,7 @@ class MatrixBandPartGradTest(test_lib.TestCase):
 
 def _GetMatrixBandPartGradTest(dtype_, batch_shape_, shape_):
 
+  @test_util.run_v1_only("b/120545219")
   def Test(self):
     shape = batch_shape_ + shape_
     x = constant_op.constant(np.random.rand(*shape), dtype=dtype_)
diff --git a/tensorflow/python/kernel_tests/matrix_logarithm_op_test.py b/tensorflow/python/kernel_tests/matrix_logarithm_op_test.py
index b0bce6a1b9b2b3983b42c98e2249d6c88b1f54d2..682ac12adc6acef378ccbb256066cbd2b099e1b9 100644
--- a/tensorflow/python/kernel_tests/matrix_logarithm_op_test.py
+++ b/tensorflow/python/kernel_tests/matrix_logarithm_op_test.py
@@ -84,6 +84,7 @@ class LogarithmOpTest(test.TestCase):
     # Complex batch
     self._verifyLogarithmComplex(self._makeBatch(matrix1, matrix2))
 
+  @test_util.run_v1_only("b/120545219")
   def testNonSquareMatrix(self):
     # When the logarithm of a non-square matrix is attempted we should return
     # an error
@@ -91,6 +92,7 @@ class LogarithmOpTest(test.TestCase):
       gen_linalg_ops.matrix_logarithm(
           np.array([[1., 2., 3.], [3., 4., 5.]], dtype=np.complex64))
 
+  @test_util.run_v1_only("b/120545219")
   def testWrongDimensions(self):
     # The input to the logarithm should be at least a 2-dimensional tensor.
     tensor3 = constant_op.constant([1., 2.], dtype=dtypes.complex64)
@@ -121,6 +123,7 @@ class LogarithmOpTest(test.TestCase):
             size=np.prod(shape)).reshape(shape).astype(np.complex128)
         self._verifyLogarithmComplex(matrix)
 
+  @test_util.run_v1_only("b/120545219")
   def testConcurrentExecutesWithoutError(self):
     with self.session(use_gpu=True) as sess:
       matrix1 = math_ops.cast(
diff --git a/tensorflow/python/kernel_tests/matrix_solve_ls_op_test.py b/tensorflow/python/kernel_tests/matrix_solve_ls_op_test.py
index a6f5da9d3d7d4aef318c64812c4601ad02be8506..463477a6a2cb5cf174b461c1fbffd2024f7ce21e 100644
--- a/tensorflow/python/kernel_tests/matrix_solve_ls_op_test.py
+++ b/tensorflow/python/kernel_tests/matrix_solve_ls_op_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python import tf2
 from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -134,7 +135,7 @@ class MatrixSolveLsOpTest(test_lib.TestCase):
       self.assertEqual(np_ans.shape, tf_ans_val.shape)
       self.assertAllClose(np_ans, tf_ans_val, atol=2 * tol, rtol=2 * tol)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testWrongDimensions(self):
     # The matrix and right-hand sides should have the same number of rows.
     with self.session(use_gpu=True):
@@ -143,23 +144,26 @@ class MatrixSolveLsOpTest(test_lib.TestCase):
       with self.assertRaises(ValueError):
         linalg_ops.matrix_solve_ls(matrix, rhs)
 
-  @test_util.run_deprecated_v1
   def testEmpty(self):
     full = np.array([[1., 2.], [3., 4.], [5., 6.]])
     empty0 = np.empty([3, 0])
     empty1 = np.empty([0, 2])
     for fast in [True, False]:
       with self.cached_session(use_gpu=True):
-        tf_ans = linalg_ops.matrix_solve_ls(empty0, empty0, fast=fast).eval()
+        tf_ans = self.evaluate(
+            linalg_ops.matrix_solve_ls(empty0, empty0, fast=fast))
         self.assertEqual(tf_ans.shape, (0, 0))
-        tf_ans = linalg_ops.matrix_solve_ls(empty0, full, fast=fast).eval()
+        tf_ans = self.evaluate(
+            linalg_ops.matrix_solve_ls(empty0, full, fast=fast))
         self.assertEqual(tf_ans.shape, (0, 2))
-        tf_ans = linalg_ops.matrix_solve_ls(full, empty0, fast=fast).eval()
+        tf_ans = self.evaluate(
+            linalg_ops.matrix_solve_ls(full, empty0, fast=fast))
         self.assertEqual(tf_ans.shape, (2, 0))
-        tf_ans = linalg_ops.matrix_solve_ls(empty1, empty1, fast=fast).eval()
+        tf_ans = self.evaluate(
+            linalg_ops.matrix_solve_ls(empty1, empty1, fast=fast))
         self.assertEqual(tf_ans.shape, (2, 2))
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testBatchResultSize(self):
     # 3x3x3 matrices, 3x3x1 right-hand sides.
     matrix = np.array([1., 2., 3., 4., 5., 6., 7., 8., 9.] * 3).reshape(3, 3, 3)
@@ -350,7 +354,8 @@ class MatrixSolveLsBenchmark(test_lib.Benchmark):
 
 if __name__ == "__main__":
   for dtype_ in [np.float32, np.float64, np.complex64, np.complex128]:
-    for use_placeholder_ in [True, False]:
+    # TF2 does not support placeholders under eager so we skip it
+    for use_placeholder_ in set([False, not tf2.enabled()]):
       for fast_ in [True, False]:
         l2_regularizers = [0] if dtype_ == np.complex128 else [0, 0.1]
         for l2_regularizer_ in l2_regularizers:
diff --git a/tensorflow/python/kernel_tests/matrix_square_root_op_test.py b/tensorflow/python/kernel_tests/matrix_square_root_op_test.py
index 1e2109b8c41663a18d21eaeea75f3944ae38d5bb..3edb390c724b6c71cd8849efc2b22a579e87247f 100644
--- a/tensorflow/python/kernel_tests/matrix_square_root_op_test.py
+++ b/tensorflow/python/kernel_tests/matrix_square_root_op_test.py
@@ -90,17 +90,20 @@ class SquareRootOpTest(test.TestCase):
     self._verifySquareRootReal(np.empty([0, 2, 2]))
     self._verifySquareRootReal(np.empty([2, 0, 0]))
 
+  @test_util.run_v1_only("b/120545219")
   def testWrongDimensions(self):
     # The input to the square root should be at least a 2-dimensional tensor.
     tensor = constant_op.constant([1., 2.])
     with self.assertRaises(ValueError):
       gen_linalg_ops.matrix_square_root(tensor)
 
+  @test_util.run_v1_only("b/120545219")
   def testNotSquare(self):
     with self.assertRaises(ValueError):
       tensor = constant_op.constant([[1., 0., -1.], [-1., 1., 0.]])
       self.evaluate(gen_linalg_ops.matrix_square_root(tensor))
 
+  @test_util.run_v1_only("b/120545219")
   def testConcurrentExecutesWithoutError(self):
     with test_util.use_gpu():
       matrix1 = random_ops.random_normal([5, 5], seed=42)
diff --git a/tensorflow/python/kernel_tests/norm_op_test.py b/tensorflow/python/kernel_tests/norm_op_test.py
index 5ff0c58bf1bee6909d68420f89bcecf5afa490e6..20b9ad95c8be7aa59a2a1b70d59341e2f3ec8fa4 100644
--- a/tensorflow/python/kernel_tests/norm_op_test.py
+++ b/tensorflow/python/kernel_tests/norm_op_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.platform import test as test_lib
@@ -35,6 +36,7 @@ def _AddTest(test, test_name, fn):
 
 class NormOpTest(test_lib.TestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testBadOrder(self):
     matrix = [[0., 1.], [2., 3.]]
     for ord_ in "fro", -7, -1.1, 0:
@@ -52,6 +54,7 @@ class NormOpTest(test_lib.TestCase):
                                    "'ord' must be a supported matrix norm"):
         linalg_ops.norm(matrix, ord=ord_, axis=[-2, -1])
 
+  @test_util.run_v1_only("b/120545219")
   def testInvalidAxis(self):
     matrix = [[0., 1.], [2., 3.]]
     for axis_ in [], [1, 2, 3], [[1]], [[1], [2]], [3.1415], [1, 1]:
@@ -78,6 +81,7 @@ def _GetNormOpTest(dtype_, shape_, ord_, axis_, keep_dims_, use_static_shape_):
         tf_norm_val = sess.run(tf_norm, feed_dict={tf_matrix: matrix})
     self.assertAllClose(np_norm, tf_norm_val, rtol=1e-5, atol=1e-5)
 
+  @test_util.run_v1_only("b/120545219")
   def Test(self):
     is_matrix_norm = (isinstance(axis_, tuple) or
                       isinstance(axis_, list)) and len(axis_) == 2
diff --git a/tensorflow/python/kernel_tests/numerics_test.py b/tensorflow/python/kernel_tests/numerics_test.py
index 5751f3fe7670a2e0ca423e6deb526e14fc66dec9..f13f9d68062e7874222b5bc67d6fcc8378af0714 100644
--- a/tensorflow/python/kernel_tests/numerics_test.py
+++ b/tensorflow/python/kernel_tests/numerics_test.py
@@ -64,9 +64,9 @@ class VerifyTensorAllFiniteTest(test.TestCase):
         self.evaluate(t_verified)
 
 
+@test_util.run_v1_only("b/120545219")
 class NumericsTest(test.TestCase):
 
-  @test_util.run_deprecated_v1
   def testInf(self):
     with self.session(graph=ops.Graph()):
       t1 = constant_op.constant(1.0)
@@ -77,7 +77,6 @@ class NumericsTest(test.TestCase):
       with self.assertRaisesOpError("Inf"):
         self.evaluate(a)
 
-  @test_util.run_deprecated_v1
   def testNaN(self):
     with self.session(graph=ops.Graph()):
       t1 = constant_op.constant(0.0)
@@ -88,7 +87,6 @@ class NumericsTest(test.TestCase):
       with self.assertRaisesOpError("NaN"):
         self.evaluate(a)
 
-  @test_util.run_deprecated_v1
   def testBoth(self):
     with self.session(graph=ops.Graph()):
       t1 = constant_op.constant([1.0, 0.0])
@@ -107,7 +105,6 @@ class NumericsTest(test.TestCase):
       self.assertAllEqual(np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]), value)
       self.assertEqual([2, 3], checked.get_shape())
 
-  @test_util.run_deprecated_v1
   def testControlFlowCond(self):
     predicate = array_ops.placeholder(dtypes.bool, shape=[])
     _ = control_flow_ops.cond(predicate,
@@ -120,7 +117,6 @@ class NumericsTest(test.TestCase):
         r"or `tf.while_loop\(\)`\."):
       numerics.add_check_numerics_ops()
 
-  @test_util.run_deprecated_v1
   def testControlFlowWhile(self):
     predicate = array_ops.placeholder(dtypes.bool, shape=[])
     _ = control_flow_ops.while_loop(lambda _: predicate,
diff --git a/tensorflow/python/kernel_tests/padding_fifo_queue_test.py b/tensorflow/python/kernel_tests/padding_fifo_queue_test.py
index b4818360d57236e754dd5fb837365026d8dbc019..e3999695d0605f49d1440c3305f020e4871940a3 100644
--- a/tensorflow/python/kernel_tests/padding_fifo_queue_test.py
+++ b/tensorflow/python/kernel_tests/padding_fifo_queue_test.py
@@ -29,11 +29,13 @@ from tensorflow.python.framework import dtypes as dtypes_lib
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.platform import test
 
 
+@test_util.run_v1_only("b/120545219")
 class PaddingFIFOQueueTest(test.TestCase):
 
   def testConstructor(self):
@@ -1393,6 +1395,7 @@ class PaddingFIFOQueueTest(test.TestCase):
     with self.assertRaisesOpError("was cancelled"):
       self.evaluate(enqueue_many_op)
 
+  @test_util.run_deprecated_v1
   def testResetOfBlockingOperation(self):
     with self.cached_session() as sess:
       q_empty = data_flow_ops.PaddingFIFOQueue(5, dtypes_lib.float32, ((),))
diff --git a/tensorflow/python/kernel_tests/partitioned_variables_test.py b/tensorflow/python/kernel_tests/partitioned_variables_test.py
index 48655391fa70eb22cf56faddb3ae3734d40f91a0..edcbc2967e2fb14c8c2d3c6a3ae9b434876e02d5 100644
--- a/tensorflow/python/kernel_tests/partitioned_variables_test.py
+++ b/tensorflow/python/kernel_tests/partitioned_variables_test.py
@@ -323,26 +323,24 @@ class PartitionedVariablesTestCase(test.TestCase):
     for i in xrange(len(expected_specs)):
       self.assertEquals(expected_specs[i], slices[i]._save_slice_info.spec)
 
-  @test_util.run_deprecated_v1
   def testVecConstantInit(self):
     with self.cached_session():
       rnd_par = constant_op.constant([1, 2, 3, 4])
       vs = partitioned_variables.create_partitioned_variables([4], [4], rnd_par)
-      variables.global_variables_initializer().run()
-      val = array_ops.concat(vs, 0).eval()
+      self.evaluate(variables.global_variables_initializer())
+      val = array_ops.concat(vs, 0)
       rnd = self.evaluate(rnd_par)
       self.assertAllClose(rnd, val)
       self.assertEqual([dtypes.int32] * 4, [v.dtype.base_dtype for v in vs])
       self._TestSaveSpec(vs, ["4 0,1", "4 1,1", "4 2,1", "4 3,1"])
 
-  @test_util.run_deprecated_v1
   def testConstantInit(self):
     with self.cached_session():
       rnd_par = constant_op.constant([[1, 2, 3, 4], [5, 6, 7, 8]])
       vs = partitioned_variables.create_partitioned_variables([2, 4], [1, 2],
                                                               rnd_par)
-      variables.global_variables_initializer().run()
-      val = array_ops.concat(vs, 1).eval()
+      self.evaluate(variables.global_variables_initializer())
+      val = array_ops.concat(vs, 1)
       rnd = self.evaluate(rnd_par)
       self.assertAllClose(rnd, val)
       self.assertEqual([dtypes.int32] * 2, [v.dtype.base_dtype for v in vs])
@@ -356,7 +354,7 @@ class PartitionedVariablesTestCase(test.TestCase):
                                                                  rnd_par)
         vs2 = partitioned_variables.create_partitioned_variables([2, 4], [1, 2],
                                                                  rnd_par)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       var1_name = vs1[0]._save_slice_info.full_name
       var2_name = vs2[0]._save_slice_info.full_name
       self.assertEqual("hi/PartitionedVariable", var1_name)
@@ -376,7 +374,7 @@ class PartitionedVariablesTestCase(test.TestCase):
           vs, reuse=True, use_resource=use_resource):
         vs2 = partitioned_variables.create_partitioned_variables(
             [2, 4], [1, 2], rnd_par, dtype=dtypes.int32)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       var1_name = vs1[0]._save_slice_info.full_name
       var2_name = vs2[0]._save_slice_info.full_name
       self.assertEqual("hola/PartitionedVariable", var1_name)
@@ -393,7 +391,7 @@ class PartitionedVariablesTestCase(test.TestCase):
                                                                  rnd_par)
         vs2 = partitioned_variables.create_partitioned_variables([2, 4], [1, 2],
                                                                  rnd_par)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       var1_name = vs1[0]._save_slice_info.full_name
       var2_name = vs2[0]._save_slice_info.full_name
       # Currently, the name scope 'ola' has no effect.
@@ -408,18 +406,16 @@ class PartitionedVariablesTestCase(test.TestCase):
   def testName(self):
     self._testNameHelper(use_resource=False)
 
-  @test_util.run_deprecated_v1
   def testResourceName(self):
     self._testNameHelper(use_resource=True)
 
-  @test_util.run_deprecated_v1
   def testRandomInitValue(self):
     with self.cached_session():
       rnd = variables.Variable(random_ops.random_uniform([200, 40]))
       vs = partitioned_variables.create_partitioned_variables(
           rnd.get_shape(), [1, 10], rnd.initialized_value())
-      variables.global_variables_initializer().run()
-      val = array_ops.concat(vs, 1).eval()
+      self.evaluate(variables.global_variables_initializer())
+      val = array_ops.concat(vs, 1)
       rnd = self.evaluate(rnd)
       self.assertAllClose(rnd, val)
       self.assertEqual([dtypes.float32] * 10, [v.dtype.base_dtype for v in vs])
@@ -430,7 +426,6 @@ class PartitionedVariablesTestCase(test.TestCase):
           "200 40 0,200:36,4"
       ])
 
-  @test_util.run_deprecated_v1
   def testRandomInitUnevenPartitions(self):
     with self.cached_session():
       rnd = variables.Variable(
@@ -440,7 +435,7 @@ class PartitionedVariablesTestCase(test.TestCase):
               rnd.get_shape(), [1, i], rnd.initialized_value())
           for i in xrange(1, 10)
       ]
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       rnd_val = self.evaluate(rnd)
       # Only check the slice save specs for the first 5 tf.
       save_specs = [
@@ -462,33 +457,31 @@ class PartitionedVariablesTestCase(test.TestCase):
           ]
       ]
       for i, vs in enumerate(var_lists):
-        var_val = array_ops.concat(vs, 1).eval()
+        var_val = array_ops.concat(vs, 1)
         self.assertAllClose(rnd_val, var_val)
         self.assertEqual([dtypes.float64] * len(vs),
                          [v.dtype.base_dtype for v in vs])
         if i < len(save_specs):
           self._TestSaveSpec(vs, save_specs[i])
 
-  @test_util.run_deprecated_v1
   def testDegenerate(self):
     with self.cached_session():
       rnd = variables.Variable(random_ops.random_uniform([10, 43]))
       vs = partitioned_variables.create_partitioned_variables(
           rnd.get_shape(), [1, 1], rnd.initialized_value())
-      variables.global_variables_initializer().run()
-      val = array_ops.concat(vs, 0).eval()
+      self.evaluate(variables.global_variables_initializer())
+      val = array_ops.concat(vs, 0)
       rnd = self.evaluate(rnd)
       self.assertAllClose(rnd, val)
       self._TestSaveSpec(vs, ["10 43 0,10:0,43"])
 
-  @test_util.run_deprecated_v1
   def testSliceSizeOne(self):
     with self.cached_session():
       rnd = variables.Variable(random_ops.random_uniform([10, 43]))
       vs = partitioned_variables.create_partitioned_variables(
           rnd.get_shape(), [10, 1], rnd.initialized_value())
-      variables.global_variables_initializer().run()
-      val = array_ops.concat(vs, 0).eval()
+      self.evaluate(variables.global_variables_initializer())
+      val = array_ops.concat(vs, 0)
       rnd = self.evaluate(rnd)
       self.assertAllClose(rnd, val)
       self._TestSaveSpec(vs, [
@@ -497,7 +490,6 @@ class PartitionedVariablesTestCase(test.TestCase):
           "10 43 6,1:0,43", "10 43 7,1:0,43", "10 43 8,1:0,43", "10 43 9,1:0,43"
       ])
 
-  @test_util.run_deprecated_v1
   def testIotaInitializer(self):
     self.assertAllClose([0., 1., 2., 3.], _IotaInitializer([4]))
     self.assertAllClose([[0., 1.], [0., 10.], [0., 100.], [0., 1000.]],
@@ -505,11 +497,11 @@ class PartitionedVariablesTestCase(test.TestCase):
     with self.cached_session():
       vs = partitioned_variables.create_partitioned_variables([13, 5], [3, 1],
                                                               _IotaInitializer)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       slice0 = _IotaInitializer([5, 5])
       slice1 = _IotaInitializer([4, 5])
       slice2 = _IotaInitializer([4, 5])
-      val = array_ops.concat(vs, 0).eval()
+      val = array_ops.concat(vs, 0)
       self.assertAllClose(slice0 + slice1 + slice2, val)
       self._TestSaveSpec(vs, ["13 5 0,5:0,5", "13 5 5,4:0,5", "13 5 9,4:0,5"])
 
@@ -520,7 +512,7 @@ class PartitionedVariablesTestCase(test.TestCase):
     with self.cached_session():
       var0, var1 = partitioned_variables.create_partitioned_variables(
           [20, 12], [1, 2], init_ops.random_uniform_initializer())
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       val0, val1 = self.evaluate(var0).flatten(), self.evaluate(var1).flatten()
       self.assertTrue(np.linalg.norm(val0 - val1) > 1e-6)
     # Negative test that proves that slices have the same values if
@@ -528,7 +520,7 @@ class PartitionedVariablesTestCase(test.TestCase):
     with self.cached_session():
       var0, var1 = partitioned_variables.create_partitioned_variables(
           [20, 12], [1, 2], init_ops.random_uniform_initializer(seed=201))
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       val0, val1 = self.evaluate(var0).flatten(), self.evaluate(var1).flatten()
       self.assertAllClose(val0, val1)
 
@@ -607,8 +599,8 @@ class PartitionedVariablesTestCase(test.TestCase):
       self.assertTrue(
           c.op in concat_control_inputs,
           "var_x._concat() should get control dependencies from its scope.")
-      variables.global_variables_initializer().run()
-      self.assertAllClose(value.eval(), var_x.as_tensor().eval())
+      self.evaluate(variables.global_variables_initializer())
+      self.assertAllClose(value, var_x.as_tensor())
 
   def testMetaGraphSaveLoad(self):
     save_prefix = os.path.join(self.get_temp_dir(), "ckpt")
@@ -623,7 +615,7 @@ class PartitionedVariablesTestCase(test.TestCase):
         v0_part = v0._get_partitions()
         self.assertEqual(len(v0_list), 5)
         self.assertAllEqual(v0_part, (5, 1))
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
 
         save_graph.get_collection_ref("partvar").append(v0)
         saver = saver_lib.Saver()
diff --git a/tensorflow/python/kernel_tests/pooling_ops_3d_test.py b/tensorflow/python/kernel_tests/pooling_ops_3d_test.py
index 24fb51fc47774677ab9581beb78957b922257d0f..347e092dee3b964b3abba5fae2a46c80d80f79bf 100644
--- a/tensorflow/python/kernel_tests/pooling_ops_3d_test.py
+++ b/tensorflow/python/kernel_tests/pooling_ops_3d_test.py
@@ -253,6 +253,7 @@ class PoolingTest(test.TestCase):
         ksize = test_util.NHWCToNCHW(ksize)
         strides = test_util.NHWCToNCHW(strides)
         t = test_util.NHWCToNCHW(t)
+        output_sizes = test_util.NHWCToNCHW(output_sizes)
 
       t = pool_func(
           t,
diff --git a/tensorflow/python/kernel_tests/priority_queue_test.py b/tensorflow/python/kernel_tests/priority_queue_test.py
index 9be682ea52f5b46ce54a4da4ded04163c6c780b0..49ec7ee4836d40719971822aff9e063b7235dc8b 100644
--- a/tensorflow/python/kernel_tests/priority_queue_test.py
+++ b/tensorflow/python/kernel_tests/priority_queue_test.py
@@ -27,6 +27,7 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import data_flow_ops
 import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
@@ -35,6 +36,7 @@ from tensorflow.python.platform import test
 
 class PriorityQueueTest(test.TestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testRoundTripInsertReadOnceSorts(self):
     with self.cached_session() as sess:
       q = data_flow_ops.PriorityQueue(2000, (dtypes.string, dtypes.string), (
@@ -112,6 +114,7 @@ class PriorityQueueTest(test.TestCase):
         missed.remove((dv0, dv1))
       self.assertEqual(missed, set())
 
+  @test_util.run_v1_only("b/120545219")
   def testRoundTripFillsCapacityMultiThreadedEnqueueAndDequeue(self):
     with self.cached_session() as sess:
       q = data_flow_ops.PriorityQueue(10, (dtypes.int64), (()))
@@ -267,6 +270,7 @@ class PriorityQueueTest(test.TestCase):
         missed.remove((dv0, dv1))
       self.assertEqual(missed, set())
 
+  @test_util.run_v1_only("b/120545219")
   def testRoundTripInsertOnceReadOnceSorts(self):
     with self.cached_session() as sess:
       q = data_flow_ops.PriorityQueue(2000, (dtypes.string, dtypes.string), (
@@ -288,6 +292,7 @@ class PriorityQueueTest(test.TestCase):
       for e, dv0, dv1 in zip(deq_elem, deq_value_0, deq_value_1):
         self.assertTrue((dv0, dv1) in allowed[e])
 
+  @test_util.run_v1_only("b/120545219")
   def testRoundTripInsertOnceReadManySorts(self):
     with self.cached_session():
       q = data_flow_ops.PriorityQueue(2000, (dtypes.int64), (()))
@@ -296,6 +301,7 @@ class PriorityQueueTest(test.TestCase):
       deq_values = np.hstack((q.dequeue_many(100)[0].eval() for _ in range(10)))
       self.assertAllEqual(deq_values, sorted(elem))
 
+  @test_util.run_v1_only("b/120545219")
   def testRoundTripInsertOnceReadOnceLotsSorts(self):
     with self.cached_session():
       q = data_flow_ops.PriorityQueue(2000, (dtypes.int64), (()))
@@ -311,6 +317,7 @@ class PriorityQueueTest(test.TestCase):
       with self.assertRaises(TypeError):
         q.enqueue_many((["a", "b", "c"], ["a", "b", "c"])).run()
 
+  @test_util.run_v1_only("b/120545219")
   def testInsertingNonScalarFails(self):
     with self.cached_session() as sess:
       input_priority = array_ops.placeholder(dtypes.int64)
diff --git a/tensorflow/python/kernel_tests/py_func_test.py b/tensorflow/python/kernel_tests/py_func_test.py
index 1f3f02a9f01c220f0f755dc6057cf54d99f591ea..482633d539dfb0d1b0737846ba44ff3e0826ad43 100644
--- a/tensorflow/python/kernel_tests/py_func_test.py
+++ b/tensorflow/python/kernel_tests/py_func_test.py
@@ -102,6 +102,7 @@ class PyFuncTest(test.TestCase):
           script_ops.eager_py_func(np_func, [x, y], [dtypes.float32]))
       self.assertEqual(z[0], np_func(1.0, 2.0).astype(np.float32))
 
+  @test_util.run_v1_only("b/120545219")
   def testArray(self):
     with self.cached_session():
       x = constant_op.constant([1.0, 2.0], dtypes.float64)
@@ -168,6 +169,7 @@ class PyFuncTest(test.TestCase):
                              (dtypes.float64, dtypes.float64)))
       self.assertAllClose(y, [0.0, 1.0])
 
+  @test_util.run_v1_only("b/120545219")
   def testStrings(self):
 
     def read_fixed_length_numpy_strings():
@@ -185,6 +187,7 @@ class PyFuncTest(test.TestCase):
           script_ops.py_func(read_and_return_strings, [x, y], dtypes.string))
       self.assertAllEqual(z, [b"hello there", b"hi there"])
 
+  @test_util.run_v1_only("b/120545219")
   def testStringsAreConvertedToBytes(self):
 
     def read_fixed_length_numpy_strings():
@@ -202,6 +205,7 @@ class PyFuncTest(test.TestCase):
           script_ops.py_func(read_and_return_strings, [x, y], dtypes.string))
       self.assertAllEqual(z, [b"hello there", b"hi there"])
 
+  @test_util.run_v1_only("b/120545219")
   def testObjectArraysAreConvertedToBytes(self):
 
     def read_object_array():
@@ -217,12 +221,14 @@ class PyFuncTest(test.TestCase):
       z, = script_ops.py_func(read_and_return_strings, [x, y], [dtypes.string])
       self.assertListEqual(list(z.eval()), [b"hello there", b"hi ya"])
 
+  @test_util.run_v1_only("b/120545219")
   def testStringPadding(self):
     correct = [b"this", b"is", b"a", b"test"]
     with self.cached_session():
       s, = script_ops.py_func(lambda: [correct], [], [dtypes.string])
       self.assertAllEqual(s.eval(), correct)
 
+  @test_util.run_v1_only("b/120545219")
   def testStringPaddingAreConvertedToBytes(self):
     inp = ["this", "is", "a", "test"]
     correct = [b"this", b"is", b"a", b"test"]
@@ -230,6 +236,7 @@ class PyFuncTest(test.TestCase):
       s, = script_ops.py_func(lambda: [inp], [], [dtypes.string])
       self.assertAllEqual(s.eval(), correct)
 
+  @test_util.run_v1_only("b/120545219")
   def testLarge(self):
     with self.cached_session() as sess:
       x = array_ops.zeros([1000000], dtype=np.float32)
@@ -243,6 +250,7 @@ class PyFuncTest(test.TestCase):
       x = self.evaluate(script_ops.py_func(lambda: 42.0, [], dtypes.float64))
       self.assertAllClose(x, 42.0)
 
+  @test_util.run_v1_only("b/120545219")
   def testAlias(self):
     with self.cached_session():
       np_array = np.array([1.0, 2.0], dtype=np.float32)
@@ -251,6 +259,7 @@ class PyFuncTest(test.TestCase):
       value.op.run()
       self.assertAllEqual(np_array, [1.0, 2.0])
 
+  @test_util.run_v1_only("b/120545219")
   def testReturnUnicodeString(self):
     with self.cached_session():
       correct = u"你好 世界"
@@ -261,6 +270,7 @@ class PyFuncTest(test.TestCase):
       z, = script_ops.py_func(unicode_string, [], [dtypes.string])
       self.assertEqual(z.eval(), correct.encode("utf8"))
 
+  @test_util.run_v1_only("b/120545219")
   def testBadNumpyReturnType(self):
     with self.cached_session():
 
@@ -274,6 +284,7 @@ class PyFuncTest(test.TestCase):
                                    "Unsupported numpy type"):
         self.evaluate(y)
 
+  @test_util.run_v1_only("b/120545219")
   def testBadReturnType(self):
     with self.cached_session():
 
@@ -287,6 +298,7 @@ class PyFuncTest(test.TestCase):
                                    "Unsupported object type"):
         self.evaluate(z)
 
+  @test_util.run_v1_only("b/120545219")
   def testReturnInput(self):
     with self.cached_session():
 
@@ -321,6 +333,7 @@ class PyFuncTest(test.TestCase):
       self.assertEqual(self.evaluate(x), 0)
       self.assertEqual(self.evaluate(x), 0)
 
+  @test_util.run_v1_only("b/120545219")
   def testGradientFunction(self):
     # Input to tf.py_func is necessary, otherwise get_gradient_function()
     # returns None per default.
@@ -330,6 +343,7 @@ class PyFuncTest(test.TestCase):
     self.assertEqual(None, ops.get_gradient_function(x.op))
     self.assertEqual(None, ops.get_gradient_function(y.op))
 
+  @test_util.run_v1_only("b/120545219")
   def testCOrder(self):
     with self.cached_session():
       val = [[1, 2], [3, 4]]
@@ -337,6 +351,7 @@ class PyFuncTest(test.TestCase):
                               [dtypes.int64])
       self.assertAllEqual(val, self.evaluate(x))
 
+  @test_util.run_v1_only("b/120545219")
   def testParallel(self):
     # Tests that tf.py_func's can run in parallel if they release the GIL.
     with self.cached_session() as session:
@@ -382,6 +397,7 @@ class PyFuncTest(test.TestCase):
       self.assertIsNone(ret)
       self.assertAllEqual([3], s.value)
 
+  @test_util.run_v1_only("b/120545219")
   def testNoReturnValueStateless(self):
 
     def do_nothing(unused_x):
@@ -420,6 +436,7 @@ class PyFuncTest(test.TestCase):
     with self.assertRaisesWithPredicateMatch(tf_exp, expected_error_check):
       self.evaluate(f)
 
+  @test_util.run_v1_only("b/120545219")
   def testExceptionHandling(self):
     with self.cached_session():
       self._testExceptionHandling(ValueError, errors.InvalidArgumentError)
@@ -514,7 +531,7 @@ class PyFuncTest(test.TestCase):
       self.assertAllEqual(ret, [[3.0], [3.0], [3.0]])
 
   @test_util.run_in_graph_and_eager_modes
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testEagerExceptionHandling(self):
     with test_util.device(use_gpu=True):
       self._testExceptionHandling(
@@ -534,7 +551,7 @@ class PyFuncTest(test.TestCase):
       self._testExceptionHandling(WeirdError, errors.UnknownError, eager=True)
 
   @test_util.run_in_graph_and_eager_modes
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testEagerReturningVariableRaisesError(self):
     def return_variable():
       return resource_variable_ops.ResourceVariable(0.0)
@@ -558,6 +575,7 @@ class PyFuncTest(test.TestCase):
     dy_dx = tape.gradient(y, x)
     self.assertEqual(self.evaluate(dy_dx), 6.0)
 
+  @test_util.run_v1_only("b/120545219")
   def testEagerGradientGraph(self):
 
     def f(x):
@@ -568,6 +586,7 @@ class PyFuncTest(test.TestCase):
     dy_dx = gradients_impl.gradients(y, x)[0]
     self.assertEqual(self.evaluate(dy_dx), 6.0)
 
+  @test_util.run_v1_only("b/120545219")
   def testEagerGradientGraphTwoOutputs(self):
 
     def f(x, y):
@@ -597,6 +616,7 @@ class PyFuncTest(test.TestCase):
     self.assertEqual(self.evaluate(dz_dx), 6.0)
     self.assertEqual(self.evaluate(dz_dy), 8.0)
 
+  @test_util.run_v1_only("b/120545219")
   def testEagerGradientGraphMultipleArgs(self):
 
     def f(x, y):
@@ -610,6 +630,7 @@ class PyFuncTest(test.TestCase):
     self.assertEqual(self.evaluate(dz_dx), 6.0)
     self.assertEqual(self.evaluate(dz_dy), 8.0)
 
+  @test_util.run_v1_only("b/120545219")
   def testEagerGradientGraphLogHuber(self):
 
     def log_huber(x, m):
@@ -631,6 +652,7 @@ class PyFuncTest(test.TestCase):
       self.assertEqual(y, 1.0)
       self.assertEqual(dy_dx, 2.0)
 
+  @test_util.run_v1_only("b/120545219")
   def testEagerRespectsDevicePlacmentOfOp(self):
 
     def f(x):
diff --git a/tensorflow/python/kernel_tests/qr_op_test.py b/tensorflow/python/kernel_tests/qr_op_test.py
index 0f2537b3711338b5f244f4163620feeab290b6df..5adb95c7d60e88e43f6f171f6594c8542ef53143 100644
--- a/tensorflow/python/kernel_tests/qr_op_test.py
+++ b/tensorflow/python/kernel_tests/qr_op_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python import tf2
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
@@ -39,6 +40,7 @@ def _AddTest(test_class, op_name, testcase_name, fn):
 
 class QrOpTest(test.TestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testWrongDimensions(self):
     # The input to qr should be a tensor of at least rank 2.
     scalar = constant_op.constant(1.)
@@ -102,7 +104,7 @@ def _GetQrOpTest(dtype_, shape_, full_matrices_, use_static_shape_):
       tol = 1e-14
     # Tests that a ~= q*r.
     a_recon = math_ops.matmul(q, r)
-    self.assertAllClose(a_recon.eval(), a, rtol=tol, atol=tol)
+    self.assertAllClose(a_recon, a, rtol=tol, atol=tol)
 
   def CheckUnitary(self, x):
     # Tests that x[...,:,:]^H * x[...,:,:] is close to the identity.
@@ -112,8 +114,9 @@ def _GetQrOpTest(dtype_, shape_, full_matrices_, use_static_shape_):
       tol = 1e-5
     else:
       tol = 1e-14
-    self.assertAllClose(identity.eval(), self.evaluate(xx), atol=tol)
+    self.assertAllClose(identity, xx, atol=tol)
 
+  @test_util.run_v1_only("b/120545219")
   def Test(self):
     np.random.seed(1)
     x_np = np.random.uniform(
@@ -162,6 +165,7 @@ class QrGradOpTest(test.TestCase):
 
 def _GetQrGradOpTest(dtype_, shape_, full_matrices_):
 
+  @test_util.run_v1_only("b/120545219")
   def Test(self):
     np.random.seed(42)
     a = np.random.uniform(low=-1.0, high=1.0, size=shape_).astype(dtype_)
@@ -202,7 +206,8 @@ if __name__ == "__main__":
       for cols in 1, 2, 5, 10, 32, 100:
         for full_matrices in False, True:
           for batch_dims in [(), (3,)] + [(3, 2)] * (max(rows, cols) < 10):
-            for use_static_shape in True, False:
+            # TF2 does not support placeholders under eager so we skip it
+            for use_static_shape in set([True, tf2.enabled()]):
               shape = batch_dims + (rows, cols)
               name = "%s_%s_full_%s_static_%s" % (dtype.__name__,
                                                   "_".join(map(str, shape)),
diff --git a/tensorflow/python/kernel_tests/random/random_shuffle_queue_test.py b/tensorflow/python/kernel_tests/random/random_shuffle_queue_test.py
index ed4f5434d9fb9344c828682e7a15514aca7a0b33..dd814a22b4e59261b33e1a57fd9014147792858b 100644
--- a/tensorflow/python/kernel_tests/random/random_shuffle_queue_test.py
+++ b/tensorflow/python/kernel_tests/random/random_shuffle_queue_test.py
@@ -29,11 +29,13 @@ from tensorflow.python.framework import dtypes as dtypes_lib
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging
 
 
+@test_util.run_v1_only("b/120545219")
 class RandomShuffleQueueTest(test.TestCase):
 
   def setUp(self):
@@ -1415,6 +1417,7 @@ class RandomShuffleQueueTest(test.TestCase):
 
       self.assertItemsEqual(elem, results)
 
+  @test_util.run_v1_only("b/120545219")
   def testBigDequeueMany(self):
     with self.cached_session() as sess:
       q = data_flow_ops.RandomShuffleQueue(2, 0, dtypes_lib.int32, ((),))
diff --git a/tensorflow/python/kernel_tests/relu_op_test.py b/tensorflow/python/kernel_tests/relu_op_test.py
index 55e68f4884c8f4174784766f693492df4e8663ad..d4ba1ad77d5547ccb9fe4e2154d145751cf63514 100644
--- a/tensorflow/python/kernel_tests/relu_op_test.py
+++ b/tensorflow/python/kernel_tests/relu_op_test.py
@@ -21,15 +21,15 @@ from __future__ import print_function
 import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
+from tensorflow.python import tf2
 from tensorflow.python.compat import compat
+from tensorflow.python.eager import backprop
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import gradient_checker
-from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import gradient_checker_v2
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import random_ops
@@ -106,105 +106,101 @@ class ReluTest(test.TestCase):
 
   # The gradient test for ReLU is a bit tricky as the derivative is not well
   # defined at around zero and we want to avoid that in terms of input values.
-  @test_util.run_deprecated_v1
   def testGradientFloat32(self):
     with self.cached_session():
-      x = constant_op.constant(
-          [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
-          shape=[2, 5],
-          name="x")
-      y = nn_ops.relu(x, name="relu")
-      x_init = np.asarray(
+      x = np.asarray(
           [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
           dtype=np.float32,
           order="F")
-      err = gradient_checker.compute_gradient_error(
-          x, [2, 5], y, [2, 5], x_init_value=x_init)
+      err = gradient_checker_v2.max_error(
+          *gradient_checker_v2.compute_gradient(nn_ops.relu, [x]))
     print("relu (float32) gradient err = ", err)
     self.assertLess(err, 1e-4)
 
   # The gradient for fp16 is inaccurate due to the low-precision.
-  # Instead of relying on compute_gradient_error, we compare the fp16 analytical
-  # gradient against their fp32 counterpart.
-  @test_util.run_deprecated_v1
+  # We compare the fp16 analytical gradient against their fp32 counterpart.
   def testGradientFloat16(self):
-    with self.session(use_gpu=True) as sess:
-      # Randomly construct a 1D shape from [1, 40)
-      shape = random_ops.random_uniform(
-          [1], minval=1, maxval=40, dtype=dtypes.int32)
-
-      # Construct the fp32 graph and its gradient.
-      x = random_ops.random_uniform(shape, minval=-1, maxval=1, name="x")
-      y1 = nn_ops.relu(x, name="relu_fp32")
-      l1 = nn_ops.l2_loss(y1)
-      dx_f32 = gradients_impl.gradients(l1, x)
-
-      # Construct the fp16 graph and its gradient.
-      # It starts with the same x, in fp32. But before it reaches Relu, it is
-      # cast into fp16. So during backprop, the gradient computation is in fp16.
-      x2 = math_ops.cast(x, dtype=dtypes.float16, name="cast")
-      y2 = nn_ops.relu(x2, name="relu_fp16")
-      l2 = nn_ops.l2_loss(y2)
-      dx_f16 = gradients_impl.gradients(l2, x)
-
-      # Repeat the experiment for 100 times. All tensor shapes and its tensor
-      # values are randomly generated for each run.
-      for _ in xrange(100):
-        dx_f32_v, dx_f16_v = self.evaluate([dx_f32, dx_f16])
-        self.assertAllClose(dx_f32_v, dx_f16_v, atol=3e-4)
-
-  @test_util.run_deprecated_v1
+
+    def grad(x):
+      with backprop.GradientTape() as tape:
+        tape.watch(x)
+        y = nn_ops.l2_loss(nn_ops.relu(x))
+      return tape.gradient(y, x)
+
+    def f():
+      with test_util.use_gpu():
+        # Randomly construct a 1D shape from [1, 40)
+        shape = random_ops.random_uniform([1],
+                                          minval=1,
+                                          maxval=40,
+                                          dtype=dtypes.int32)
+        x32 = random_ops.random_uniform(shape, minval=-1, maxval=1)
+        x16 = math_ops.cast(x32, dtype=dtypes.float16)
+        return grad(x32), grad(x16)
+
+    # We're going to ensure that the fp16 and fp32 gradients
+    # are "close" to each other for ~100 random values.
+    #
+    # In TensorFlow 1.x, invoking f() (without eager execution enabled)
+    # would construct a graph. Instead of construct a graph with O(100) nodes,
+    # we construct a single graph to be executed ~100 times in a Session.
+    if not tf2.enabled():
+      d32_tensor, d16_tensor = f()
+      with self.cached_session() as sess:
+        f = lambda: sess.run([d32_tensor, d16_tensor])
+
+    # Repeat the experiment for 100 times. All tensor shapes and its tensor
+    # values are randomly generated for each run.
+    for _ in xrange(100):
+      d32, d16 = f()
+      self.assertAllClose(d32, d16, atol=3e-4)
+
   def testGradientFloat64(self):
     with self.cached_session():
-      x = constant_op.constant(
-          [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
-          shape=[2, 5],
-          dtype=dtypes.float64,
-          name="x")
-      y = nn_ops.relu(x, name="relu")
-      x_init = np.asarray(
+      x = np.asarray(
           [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
           dtype=np.float64,
           order="F")
-      err = gradient_checker.compute_gradient_error(
-          x, [2, 5], y, [2, 5], x_init_value=x_init)
+      err = gradient_checker_v2.max_error(
+          *gradient_checker_v2.compute_gradient(nn_ops.relu, [x]))
     print("relu (float64) gradient err = ", err)
     self.assertLess(err, 1e-10)
 
-  @test_util.run_deprecated_v1
   def testGradGradFloat32(self):
     with self.cached_session():
-      x = constant_op.constant(
-          [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
-          shape=[2, 5],
-          name="x")
-      y = nn_ops.relu(x, name="relu")
-      z = gradients_impl.gradients(y, x)
-      x_init = np.asarray(
+
+      def f(x):
+        assert x.dtype == dtypes.float32
+        with backprop.GradientTape() as tape:
+          tape.watch(x)
+          y = nn_ops.relu(x)
+        return tape.gradient(y, x)
+
+      x = np.asarray(
           [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
           dtype=np.float32,
           order="F")
-      err = gradient_checker.compute_gradient_error(
-          x, [2, 5], z[0], [2, 5], x_init_value=x_init)
+      err = gradient_checker_v2.max_error(
+          *gradient_checker_v2.compute_gradient(f, [x]))
     print("relu (float32) gradient of gradient err = ", err)
     self.assertLess(err, 1e-4)
 
-  @test_util.run_deprecated_v1
   def testGradGradFloat64(self):
     with self.cached_session():
-      x = constant_op.constant(
-          [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
-          shape=[2, 5],
-          dtype=dtypes.float64,
-          name="x")
-      y = nn_ops.relu(x, name="relu")
-      z = gradients_impl.gradients(y, x)
-      x_init = np.asarray(
+
+      def f(x):
+        assert x.dtype == dtypes.float64
+        with backprop.GradientTape() as tape:
+          tape.watch(x)
+          y = nn_ops.relu(x)
+        return tape.gradient(y, x)
+
+      x = np.asarray(
           [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
           dtype=np.float64,
           order="F")
-      err = gradient_checker.compute_gradient_error(
-          x, [2, 5], z[0], [2, 5], x_init_value=x_init)
+      err = gradient_checker_v2.max_error(
+          *gradient_checker_v2.compute_gradient(f, [x]))
     print("relu (float64) gradient of gradient err = ", err)
     self.assertLess(err, 1e-10)
 
@@ -258,38 +254,25 @@ class Relu6Test(test.TestCase):
   # The gradient test for ReLU6 is a bit tricky as the derivative is
   # not well defined at around zero and six and we want to avoid that
   # in terms of input values.
-  @test_util.run_deprecated_v1
   def testGradientFloat32(self):
     with self.cached_session():
-      x = constant_op.constant(
-          [-0.9, -0.7, -0.5, -0.3, -0.1, 6.1, 6.3, 6.5, 6.7, 6.9],
-          shape=[2, 5],
-          name="x")
-      y = nn_ops.relu6(x, name="relu6")
-      x_init = np.asarray(
+      x = np.asarray(
           [[-0.9, -0.7, -0.5, -0.3, -0.1], [6.1, 6.3, 6.5, 6.7, 6.9]],
           dtype=np.float32,
           order="F")
-      err = gradient_checker.compute_gradient_error(
-          x, [2, 5], y, [2, 5], x_init_value=x_init)
+      err = gradient_checker_v2.max_error(
+          *gradient_checker_v2.compute_gradient(nn_ops.relu6, [x]))
     print("relu6 (float32) gradient err = ", err)
     self.assertLess(err, 1e-4)
 
-  @test_util.run_deprecated_v1
   def testGradientFloat64(self):
     with self.cached_session():
-      x = constant_op.constant(
-          [-0.9, -0.7, -0.5, -0.3, -0.1, 6.1, 6.3, 6.5, 6.7, 6.9],
-          shape=[2, 5],
-          dtype=dtypes.float64,
-          name="x")
-      y = nn_ops.relu6(x, name="relu6")
-      x_init = np.asarray(
+      x = np.asarray(
           [[-0.9, -0.7, -0.5, -0.3, -0.1], [6.1, 6.3, 6.5, 6.7, 6.9]],
           dtype=np.float64,
           order="F")
-      err = gradient_checker.compute_gradient_error(
-          x, [2, 5], y, [2, 5], x_init_value=x_init)
+      err = gradient_checker_v2.max_error(
+          *gradient_checker_v2.compute_gradient(nn_ops.relu6, [x]))
     print("relu6 (float64) gradient err = ", err)
     self.assertLess(err, 1e-10)
 
@@ -333,77 +316,65 @@ class LeakyReluTest(test.TestCase):
   # The gradient test for Leaky ReLU is a bit tricky as the derivative is not
   # well defined at around zero and we want to avoid that in terms of input
   # values.
-  @test_util.run_deprecated_v1
   def testGradientFloat32(self):
-    with self.test_session():
-      x = constant_op.constant(
-          [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
-          shape=[2, 5],
-          name="x")
-      y = nn_ops.leaky_relu(x, alpha=0.1, name="leaky_relu")
-      x_init = np.asarray(
+    with self.cached_session():
+      x = np.asarray(
           [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
           dtype=np.float32,
           order="F")
-      err = gradient_checker.compute_gradient_error(
-          x, [2, 5], y, [2, 5], x_init_value=x_init)
+      err = gradient_checker_v2.max_error(
+          *gradient_checker_v2.compute_gradient(nn_ops.leaky_relu, [x]))
     print("leaky_relu (float32) gradient err = ", err)
     self.assertLess(err, 1e-4)
 
-  @test_util.run_deprecated_v1
   def testGradientFloat64(self):
-    with self.test_session():
-      x = constant_op.constant(
-          [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
-          shape=[2, 5],
-          dtype=dtypes.float64,
-          name="x")
-      y = nn_ops.leaky_relu(x, alpha=0.2, name="leaky_relu")
-      x_init = np.asarray(
+    with self.cached_session():
+      x = np.asarray(
           [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
           dtype=np.float64,
           order="F")
-      err = gradient_checker.compute_gradient_error(
-          x, [2, 5], y, [2, 5], x_init_value=x_init)
+      err = gradient_checker_v2.max_error(
+          *gradient_checker_v2.compute_gradient(nn_ops.leaky_relu, [x]))
     print("leaky_relu (float64) gradient err = ", err)
     self.assertLess(err, 1e-10)
 
-  @test_util.run_deprecated_v1
   def testGradGradFloat32(self):
     with compat.forward_compatibility_horizon(2018, 11, 2):
-      with self.test_session():
-        x = constant_op.constant(
-            [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
-            shape=[2, 5],
-            name="x")
-        y = nn_ops.leaky_relu(x, alpha=0.1, name="leaky_relu")
-        z = gradients_impl.gradients(y, x)
-        x_init = np.asarray(
+      with self.cached_session():
+
+        def f(x):
+          assert x.dtype == dtypes.float32
+          with backprop.GradientTape() as tape:
+            tape.watch(x)
+            y = nn_ops.leaky_relu(x)
+          return tape.gradient(y, x)
+
+        x = np.asarray(
             [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
             dtype=np.float32,
             order="F")
-        err = gradient_checker.compute_gradient_error(
-            x, [2, 5], z[0], [2, 5], x_init_value=x_init)
+        err = gradient_checker_v2.max_error(
+            *gradient_checker_v2.compute_gradient(f, [x]))
       print("leaky_relu (float32) gradient of gradient err = ", err)
       self.assertLess(err, 1e-4)
 
-  @test_util.run_deprecated_v1
   def testGradGradFloat64(self):
     with compat.forward_compatibility_horizon(2018, 11, 2):
-      with self.test_session():
-        x = constant_op.constant(
-            [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
-            shape=[2, 5],
-            dtype=dtypes.float64,
-            name="x")
-        y = nn_ops.leaky_relu(x, alpha=0.02, name="leaky_relu")
-        z = gradients_impl.gradients(y, x)
-        x_init = np.asarray(
+      with self.cached_session():
+
+        def f(x):
+          assert x.dtype == dtypes.float64
+          with backprop.GradientTape() as tape:
+            tape.watch(x)
+            y = nn_ops.leaky_relu(x)
+          return tape.gradient(y, x)
+
+        x = np.asarray(
             [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
             dtype=np.float64,
             order="F")
-        err = gradient_checker.compute_gradient_error(
-            x, [2, 5], z[0], [2, 5], x_init_value=x_init)
+        err = gradient_checker_v2.max_error(
+            *gradient_checker_v2.compute_gradient(f, [x]))
       print("leaky_relu (float64) gradient of gradient err = ", err)
       self.assertLess(err, 1e-10)
 
@@ -451,76 +422,75 @@ class EluTest(test.TestCase):
     for t in [np.float16, np.float32, np.float64]:
       self._testElu(np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t))
 
-  @test_util.run_deprecated_v1
   def testGradientFloat32(self):
     with self.cached_session():
       x_val = [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]]
-      x = constant_op.constant(x_val, name="x")
-      y = nn_ops.elu(x, name="elu")
-      x_init = np.asarray(x_val, dtype=np.float32, order="F")
-      err = gradient_checker.compute_gradient_error(
-          x, [2, 5], y, [2, 5], x_init_value=x_init)
+      x = np.asarray(x_val, dtype=np.float32, order="F")
+      err = gradient_checker_v2.max_error(
+          *gradient_checker_v2.compute_gradient(nn_ops.elu, [x]))
     print("elu (float32) gradient err = ", err)
     self.assertLess(err, 1e-4)
 
-  @test_util.run_deprecated_v1
   def testGradientFloat64(self):
     with self.cached_session():
       x_val = [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]]
-      x = constant_op.constant(x_val, dtype=dtypes.float64, name="x")
-      y = nn_ops.elu(x, name="elu")
-      x_init = np.asarray(x_val, dtype=np.float64, order="F")
-      err = gradient_checker.compute_gradient_error(
-          x, [2, 5], y, [2, 5], x_init_value=x_init)
+      x = np.asarray(x_val, dtype=np.float64, order="F")
+      err = gradient_checker_v2.max_error(
+          *gradient_checker_v2.compute_gradient(nn_ops.elu, [x]))
     print("elu (float64) gradient err = ", err)
     self.assertLess(err, 1e-6)
 
-  @test_util.run_deprecated_v1
   def testGradGrad(self):
     with self.cached_session():
-      x = array_ops.placeholder(dtype=dtypes.float32)
-      elu = nn_ops.elu(x)
-      g, = gradients_impl.gradients(elu, x)
-      gg, = gradients_impl.gradients(g, x)
 
-      for x_val in [-1, -0.5, 0.5, 1]:
-        err = np.abs(gg.eval(feed_dict={x: x_val}) - _elu_grad_grad(x_val))
+      def f(x):
+        with backprop.GradientTape(persistent=True) as tape:
+          tape.watch(x)
+          y = nn_ops.elu(x)
+          dy = tape.gradient(y, x)
+        return tape.gradient(dy, x)
+
+      for x in [-1., -0.5, 0.5, 1.]:
+        got = self.evaluate(f(constant_op.constant(x)))
+        want = _elu_grad_grad(x)
+        err = np.abs(got - want)
         self.assertLess(err, 1e-4)
 
-  @test_util.run_deprecated_v1
   def testGradGradFloat32(self):
     with self.cached_session():
-      x = constant_op.constant(
-          [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
-          shape=[2, 5],
-          name="x")
-      y = nn_ops.elu(x, name="elu")
-      z = gradients_impl.gradients(y, x)
-      x_init = np.asarray(
+
+      def f(x):
+        assert x.dtype == dtypes.float32
+        with backprop.GradientTape() as tape:
+          tape.watch(x)
+          y = nn_ops.elu(x)
+        return tape.gradient(y, x)
+
+      x = np.asarray(
           [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
           dtype=np.float32,
           order="F")
-      err = gradient_checker.compute_gradient_error(
-          x, [2, 5], z[0], [2, 5], x_init_value=x_init)
+      err = gradient_checker_v2.max_error(
+          *gradient_checker_v2.compute_gradient(f, [x]))
     print("elu (float32) gradient of gradient err = ", err)
     self.assertLess(err, 1e-4)
 
-  @test_util.run_deprecated_v1
   def testGradGradFloat64(self):
     with self.cached_session():
-      x = constant_op.constant(
-          [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
-          shape=[2, 5],
-          dtype=dtypes.float64,
-          name="x")
-      y = nn_ops.elu(x, name="elu")
-      z = gradients_impl.gradients(y, x)
-      x_init = np.asarray(
+
+      def f(x):
+        assert x.dtype == dtypes.float64
+        with backprop.GradientTape() as tape:
+          tape.watch(x)
+          y = nn_ops.elu(x)
+        return tape.gradient(y, x)
+
+      x = np.asarray(
           [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
           dtype=np.float64,
           order="F")
-      err = gradient_checker.compute_gradient_error(
-          x, [2, 5], z[0], [2, 5], x_init_value=x_init)
+      err = gradient_checker_v2.max_error(
+          *gradient_checker_v2.compute_gradient(f, [x]))
     print("elu (float64) gradient of gradient err = ", err)
     self.assertLess(err, 1e-6)
 
@@ -556,64 +526,59 @@ class SeluTest(test.TestCase):
         self._testSelu(
             np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t))
 
-  @test_util.run_deprecated_v1
   def testGradientFloat32(self):
     with self.cached_session():
       x_val = [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]]
-      x = constant_op.constant(x_val, name="x")
-      y = nn_ops.selu(x, name="selu")
-      x_init = np.asarray(x_val, dtype=np.float32, order="F")
-      err = gradient_checker.compute_gradient_error(
-          x, [2, 5], y, [2, 5], x_init_value=x_init)
+      x = np.asarray(x_val, dtype=np.float32, order="F")
+      err = gradient_checker_v2.max_error(
+          *gradient_checker_v2.compute_gradient(nn_ops.selu, [x]))
     print("selu (float32) gradient err = ", err)
     self.assertLess(err, 1e-4)
 
-  @test_util.run_deprecated_v1
   def testGradientFloat64(self):
     with self.cached_session():
       x_val = [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]]
-      x = constant_op.constant(x_val, dtype=dtypes.float64, name="x")
-      y = nn_ops.selu(x, name="selu")
-      x_init = np.asarray(x_val, dtype=np.float64, order="F")
-      err = gradient_checker.compute_gradient_error(
-          x, [2, 5], y, [2, 5], x_init_value=x_init)
+      x = np.asarray(x_val, dtype=np.float64, order="F")
+      err = gradient_checker_v2.max_error(
+          *gradient_checker_v2.compute_gradient(nn_ops.selu, [x]))
     print("selu (float64) gradient err = ", err)
     self.assertLess(err, 1e-6)
 
-  @test_util.run_deprecated_v1
   def testGradGradFloat32(self):
     with self.cached_session():
-      x = constant_op.constant(
-          [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
-          shape=[2, 5],
-          name="x")
-      y = nn_ops.selu(x, name="selu")
-      z = gradients_impl.gradients(y, x)
-      x_init = np.asarray(
+
+      def f(x):
+        assert x.dtype == dtypes.float32
+        with backprop.GradientTape() as tape:
+          tape.watch(x)
+          y = nn_ops.selu(x)
+        return tape.gradient(y, x)
+
+      x = np.asarray(
           [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
           dtype=np.float32,
           order="F")
-      err = gradient_checker.compute_gradient_error(
-          x, [2, 5], z[0], [2, 5], x_init_value=x_init)
+      err = gradient_checker_v2.max_error(
+          *gradient_checker_v2.compute_gradient(f, [x]))
     print("selu (float32) gradient of gradient err = ", err)
     self.assertLess(err, 1e-4)
 
-  @test_util.run_deprecated_v1
   def testGradGradFloat64(self):
     with self.cached_session():
-      x = constant_op.constant(
-          [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
-          shape=[2, 5],
-          dtype=dtypes.float64,
-          name="x")
-      y = nn_ops.selu(x, name="selu")
-      z = gradients_impl.gradients(y, x)
-      x_init = np.asarray(
+
+      def f(x):
+        assert x.dtype == dtypes.float64
+        with backprop.GradientTape() as tape:
+          tape.watch(x)
+          y = nn_ops.selu(x)
+        return tape.gradient(y, x)
+
+      x = np.asarray(
           [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
           dtype=np.float64,
           order="F")
-      err = gradient_checker.compute_gradient_error(
-          x, [2, 5], z[0], [2, 5], x_init_value=x_init)
+      err = gradient_checker_v2.max_error(
+          *gradient_checker_v2.compute_gradient(f, [x]))
     print("selu (float64) gradient of gradient err = ", err)
     self.assertLess(err, 1e-6)
 
diff --git a/tensorflow/python/kernel_tests/resource_variable_ops_test.py b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
index b57d9d47aa384cbf6e0d235cc19e198a98c05682..df7b68616522f58633da9a1df174e370a5e73144 100644
--- a/tensorflow/python/kernel_tests/resource_variable_ops_test.py
+++ b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
@@ -33,7 +33,10 @@ from tensorflow.python.framework import tensor_util
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import custom_gradient
+from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import list_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
@@ -585,6 +588,33 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
     v.load(2.0)
     self.assertEqual(2.0, self.evaluate(v.value()))
 
+  def testShapePassedToGradient(self):
+    with ops.Graph().as_default():
+      @custom_gradient.custom_gradient
+      def differentiable_scatter_update(handle, indices, values):
+        with ops.control_dependencies([
+            resource_variable_ops.resource_scatter_update(
+                handle, indices, values)]):
+          new_handle = array_ops.identity(handle)
+
+        def grad(dresult):
+          self.assertIsNotNone(
+              tensor_util.constant_value(dresult.dense_shape))
+          return [dresult, None, None]
+
+        return new_handle, grad
+
+      var = variable_scope.get_variable(
+          "foo", shape=[20], initializer=init_ops.zeros_initializer,
+          dtype=dtypes.float64, use_resource=True)
+
+      indices = math_ops.range(10)
+      updates = math_ops.range(9, -1, -1, dtype=dtypes.float64)
+      new_handle = differentiable_scatter_update(var.handle, indices, updates)
+      gathered = resource_variable_ops.resource_gather(
+          new_handle, indices, dtype=var.dtype)
+      gradients_impl.gradients([gathered], [updates])
+
   def testToFromProtoCachedValue(self):
     with ops.Graph().as_default():
       v_def = resource_variable_ops.ResourceVariable(
@@ -599,7 +629,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
           variable_def=other_v_def)
       self.assertTrue(other_v_prime._cached_value is not None)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testVariableDefInitializedInstances(self):
     with ops.Graph().as_default(), self.cached_session() as sess:
       v_def = resource_variable_ops.ResourceVariable(
@@ -659,7 +689,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
   def testToFromProto(self):
     with self.cached_session():
       v = resource_variable_ops.ResourceVariable(1.0)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       w = resource_variable_ops.ResourceVariable.from_proto(v.to_proto())
       self.assertEquals(2, math_ops.add(w, 1).eval())
@@ -704,7 +734,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
     self.assertEqual(0.0, self.evaluate(v.value()))
 
   @test_util.run_in_graph_and_eager_modes
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testDestroyResource(self):
     v = resource_variable_ops.ResourceVariable(3.0, name="var0")
     self.evaluate(variables.global_variables_initializer())
@@ -767,7 +797,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
   def testSharedName(self):
     with self.cached_session():
       v = resource_variable_ops.ResourceVariable(300.0, name="var4")
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       w = resource_variable_ops.var_handle_op(
           dtype=v.dtype.base_dtype, shape=v.get_shape(), shared_name="var4",
@@ -820,7 +850,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
       v.initializer.run(feed_dict={v.initial_value: 3.0})
       self.assertEqual(3.0, v.value().eval())
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testControlFlowInitialization(self):
     """Expects an error if an initializer is in a control-flow scope."""
 
@@ -924,6 +954,19 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
       state_ops.scatter_sub(v, [1], [3])
       self.assertAllEqual([1.0, -1.0], v.numpy())
 
+  def testScatterUpdateVariant(self):
+    with context.eager_mode():
+      v = resource_variable_ops.ResourceVariable([
+          list_ops.empty_tensor_list(
+              element_dtype=dtypes.float32, element_shape=[])
+      ])
+      v.scatter_update(
+          ops.IndexedSlices(
+              list_ops.tensor_list_from_tensor([1., 2.], element_shape=[]), 0))
+      self.assertAllEqual(
+          list_ops.tensor_list_get_item(v[0], 0, element_dtype=dtypes.float32),
+          1.)
+
   def testScatterNdAddStateOps(self):
     with context.eager_mode():
       v = resource_variable_ops.ResourceVariable(
@@ -957,7 +1000,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(self.evaluate(v.assign_add(1)), [1, 2, 3, 4])
 
   @test_util.run_in_graph_and_eager_modes
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testCopyToGraphUninitialized(self):
     v = resource_variable_ops.ResourceVariable([0, 1, 2, 3])
     copy_to_graph = ops.Graph()
diff --git a/tensorflow/python/kernel_tests/rnn_test.py b/tensorflow/python/kernel_tests/rnn_test.py
index 3bc457f8fb626e4906d78664fa09a75c371743e0..a49496e4ef15bc2772fe7abdac4d801b77787079 100644
--- a/tensorflow/python/kernel_tests/rnn_test.py
+++ b/tensorflow/python/kernel_tests/rnn_test.py
@@ -262,7 +262,7 @@ class RNNTest(test.TestCase):
       rnn.dynamic_rnn(cell, inputs, dtype=dtypes.float32, sequence_length=[4])
 
   @test_util.run_in_graph_and_eager_modes
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testTensorArrayStateIsAccepted(self):
     cell = TensorArrayStateRNNCell()
     in_eager_mode = context.executing_eagerly()
diff --git a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
index c1241ba87eeeb47982df0cbf6049cedb912e6a39..8510a08f0c96dd9ae08a2ca3e782cc7d28e86264 100644
--- a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
+++ b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
@@ -217,7 +217,7 @@ class StatefulScatterNdTest(test.TestCase):
   def testVariableRankAdd(self):
     self._VariableRankTests(_NumpyAdd, state_ops.scatter_nd_add)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testVariableRankSub(self):
     self._VariableRankTests(_NumpySub, state_ops.scatter_nd_sub)
 
@@ -235,7 +235,7 @@ class StatefulScatterNdTest(test.TestCase):
         self._VariableRankTest(
             np_scatter, tf_scatter, vtype, itype, repeat_indices=True)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testScatterRepeatIndices(self):
     """This tests scatter_add using indices that repeat."""
     self._ScatterRepeatIndicesTest(_NumpyAdd, state_ops.scatter_nd_add)
@@ -257,7 +257,7 @@ class StatefulScatterNdTest(test.TestCase):
   #     session.run([update0, update1])
   #     self.assertAllEqual([False, True], self.evaluate(var))
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testScatterOutOfRangeCpu(self):
     # TODO(simister): Re-enable once binary size increase due to
     # scatter_nd ops is under control.
@@ -294,7 +294,7 @@ class StatefulScatterNdTest(test.TestCase):
         state_ops.scatter_nd_update(ref, indices,
                                     updates).get_shape().as_list(), shape)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testResVarInvalidOutputShape(self):
     res = variables.Variable(
         initial_value=lambda: array_ops.zeros(shape=[], dtype=dtypes.float32),
@@ -509,7 +509,7 @@ class ScatterNdTest(test.TestCase):
         ValueError, "Indices and updates specified for empty output shape"):
       self.scatter_nd(indices, updates, shape)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testEmptyOutputShape2(self):
     indices = array_ops.placeholder(dtypes.int32, shape=None)
     updates = array_ops.placeholder(dtypes.int32, shape=None)
@@ -717,6 +717,7 @@ class ScatterNdTensorTest(test.TestCase):
     self.assertAllEqual(subbed,
                         constant_op.constant([1, -10, 1, -9, -8, 1, 1, -11]))
 
+  @test_util.run_v1_only("b/120545219")
   def testUpdateAddSubGradients(self):
 
     with self.cached_session():
diff --git a/tensorflow/python/kernel_tests/scatter_ops_test.py b/tensorflow/python/kernel_tests/scatter_ops_test.py
index 623c17d373cc7231d7191b715a77b6a3cf8701fc..ce7e0c04c861dcbeee85d496496b3e657b883e56 100644
--- a/tensorflow/python/kernel_tests/scatter_ops_test.py
+++ b/tensorflow/python/kernel_tests/scatter_ops_test.py
@@ -192,6 +192,10 @@ class ScatterTest(test.TestCase):
     if tf_scatter != state_ops.scatter_div:
       vtypes.append(np.int32)
 
+    if (tf_scatter == state_ops.scatter_min or
+        tf_scatter == state_ops.scatter_max):
+      vtypes.append(np.float16)
+
     for vtype in vtypes:
       for itype in (np.int32, np.int64):
         self._VariableRankTest(tf_scatter, vtype, itype, repeat_indices,
diff --git a/tensorflow/python/kernel_tests/self_adjoint_eig_op_test.py b/tensorflow/python/kernel_tests/self_adjoint_eig_op_test.py
index 42577f7e423c71cce5e112d1cde5bbca495f70ed..47b22ec29673f31c3216d4b4a39687a40bc95a95 100644
--- a/tensorflow/python/kernel_tests/self_adjoint_eig_op_test.py
+++ b/tensorflow/python/kernel_tests/self_adjoint_eig_op_test.py
@@ -22,8 +22,9 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes as dtypes_lib
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import gradient_checker
+from tensorflow.python.ops import gradient_checker_v2
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
@@ -39,6 +40,7 @@ def _AddTest(test_class, op_name, testcase_name, fn):
 
 class SelfAdjointEigTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testWrongDimensions(self):
     # The input to self_adjoint_eig should be a tensor of
     # at least rank 2.
@@ -49,6 +51,7 @@ class SelfAdjointEigTest(test.TestCase):
     with self.assertRaises(ValueError):
       linalg_ops.self_adjoint_eig(vector)
 
+  @test_util.run_deprecated_v1
   def testConcurrentExecutesWithoutError(self):
     all_ops = []
     with self.session(use_gpu=True) as sess:
@@ -161,7 +164,7 @@ def _GetSelfAdjointEigTest(dtype_, shape_, compute_v_):
             math_ops.matmul(tf_v, array_ops.matrix_diag(tf_e)),
             tf_v,
             adjoint_b=True)
-        self.assertAllClose(a_ev.eval(), a, atol=atol)
+        self.assertAllClose(self.evaluate(a_ev), a, atol=atol)
 
         # Compare to numpy.linalg.eigh.
         CompareEigenDecompositions(self, np_e, np_v, self.evaluate(tf_e),
@@ -169,7 +172,7 @@ def _GetSelfAdjointEigTest(dtype_, shape_, compute_v_):
       else:
         tf_e = linalg_ops.self_adjoint_eigvals(constant_op.constant(a))
         self.assertAllClose(
-            np.sort(np_e, -1), np.sort(tf_e.eval(), -1), atol=atol)
+            np.sort(np_e, -1), np.sort(self.evaluate(tf_e), -1), atol=atol)
 
   return Test
 
@@ -185,53 +188,51 @@ def _GetSelfAdjointEigGradTest(dtype_, shape_, compute_v_):
     n = shape_[-1]
     batch_shape = shape_[:-2]
     np_dtype = dtype_.as_numpy_dtype
-    a = np.random.uniform(
-        low=-1.0, high=1.0, size=n * n).reshape([n, n]).astype(np_dtype)
-    if dtype_.is_complex:
-      a += 1j * np.random.uniform(
+
+    def RandomInput():
+      a = np.random.uniform(
           low=-1.0, high=1.0, size=n * n).reshape([n, n]).astype(np_dtype)
-    a += np.conj(a.T)
-    a = np.tile(a, batch_shape + (1, 1))
+      if dtype_.is_complex:
+        a += 1j * np.random.uniform(
+            low=-1.0, high=1.0, size=n * n).reshape([n, n]).astype(np_dtype)
+      a += np.conj(a.T)
+      a = np.tile(a, batch_shape + (1, 1))
+      return a
+
     # Optimal stepsize for central difference is O(epsilon^{1/3}).
     epsilon = np.finfo(np_dtype).eps
     delta = 0.1 * epsilon**(1.0 / 3.0)
     # tolerance obtained by looking at actual differences using
     # np.linalg.norm(theoretical-numerical, np.inf) on -mavx build
+    # after discarding one random input sample
+    _ = RandomInput()
     if dtype_ in (dtypes_lib.float32, dtypes_lib.complex64):
       tol = 1e-2
     else:
       tol = 1e-7
     with self.session(use_gpu=True):
-      tf_a = constant_op.constant(a)
-      if compute_v_:
-        tf_e, tf_v = linalg_ops.self_adjoint_eig(tf_a)
+      def Compute(x):
+        e, v = linalg_ops.self_adjoint_eig(x)
         # (complex) Eigenvectors are only unique up to an arbitrary phase
         # We normalize the vectors such that the first component has phase 0.
-        top_rows = tf_v[..., 0:1, :]
-        if tf_a.dtype.is_complex:
+        top_rows = v[..., 0:1, :]
+        if dtype_.is_complex:
           angle = -math_ops.angle(top_rows)
           phase = math_ops.complex(math_ops.cos(angle), math_ops.sin(angle))
         else:
           phase = math_ops.sign(top_rows)
-        tf_v *= phase
-        outputs = [tf_e, tf_v]
+        v *= phase
+        return e, v
+
+      if compute_v_:
+        funcs = [lambda x: Compute(x)[0], lambda x: Compute(x)[1]]
       else:
-        tf_e = linalg_ops.self_adjoint_eigvals(tf_a)
-        outputs = [tf_e]
-      for b in outputs:
-        x_init = np.random.uniform(
-            low=-1.0, high=1.0, size=n * n).reshape([n, n]).astype(np_dtype)
-        if dtype_.is_complex:
-          x_init += 1j * np.random.uniform(
-              low=-1.0, high=1.0, size=n * n).reshape([n, n]).astype(np_dtype)
-        x_init += np.conj(x_init.T)
-        x_init = np.tile(x_init, batch_shape + (1, 1))
-        theoretical, numerical = gradient_checker.compute_gradient(
-            tf_a,
-            tf_a.get_shape().as_list(),
-            b,
-            b.get_shape().as_list(),
-            x_init_value=x_init,
+        funcs = [linalg_ops.self_adjoint_eigvals]
+
+      for f in funcs:
+        theoretical, numerical = gradient_checker_v2.compute_gradient(
+            f,
+            [RandomInput()],
             delta=delta)
         self.assertAllClose(theoretical, numerical, atol=tol, rtol=tol)
 
@@ -245,7 +246,7 @@ if __name__ == "__main__":
       for size in 1, 2, 5, 10:
         for batch_dims in [(), (3,)] + [(3, 2)] * (max(size, size) < 10):
           shape = batch_dims + (size, size)
-          name = "%s_%s_%s" % (dtype, "_".join(map(str, shape)), compute_v)
+          name = "%s_%s_%s" % (dtype.name, "_".join(map(str, shape)), compute_v)
           _AddTest(SelfAdjointEigTest, "SelfAdjointEig", name,
                    _GetSelfAdjointEigTest(dtype, shape, compute_v))
           _AddTest(SelfAdjointEigGradTest, "SelfAdjointEigGrad", name,
diff --git a/tensorflow/python/kernel_tests/session_ops_test.py b/tensorflow/python/kernel_tests/session_ops_test.py
index dc663cb091cb172b6ab68dd1686ea2c6270e3cc1..7d422278408207a3abcccf58921ec94b018a2cea 100644
--- a/tensorflow/python/kernel_tests/session_ops_test.py
+++ b/tensorflow/python/kernel_tests/session_ops_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import session_ops
@@ -30,6 +31,7 @@ from tensorflow.python.platform import test
 
 class SessionOpsTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testHandleBasic(self):
     with self.cached_session() as sess:
       # Return a handle.
@@ -44,6 +46,7 @@ class SessionOpsTest(test.TestCase):
       y = math_ops.multiply(x, 10)
       self.assertEqual(500, sess.run(y, feed_dict={f: h.handle}))
 
+  @test_util.run_deprecated_v1
   def testHandleEval(self):
     with self.cached_session() as sess:
       # Return a handle.
@@ -56,6 +59,7 @@ class SessionOpsTest(test.TestCase):
       # Get the tensor from its handle.
       self.assertEqual(50, h.eval())
 
+  @test_util.run_deprecated_v1
   def testHandleAndValue(self):
     with self.cached_session() as sess:
       # Return a handle and a value.
@@ -69,6 +73,7 @@ class SessionOpsTest(test.TestCase):
       self.assertEqual(50, h.eval())
       self.assertEqual(500, v)
 
+  @test_util.run_deprecated_v1
   def testHandleCond(self):
     with self.cached_session() as sess:
       # Return a handle and a value
@@ -89,6 +94,7 @@ class SessionOpsTest(test.TestCase):
 
       self.assertEqual(5000, result)
 
+  @test_util.run_deprecated_v1
   def testHandleForLoop(self):
     with self.cached_session() as sess:
       # Initialize a handle.
@@ -106,6 +112,7 @@ class SessionOpsTest(test.TestCase):
 
       self.assertEqual(100, h.eval())
 
+  @test_util.run_deprecated_v1
   def testHandleWhileLoop(self):
     with self.cached_session() as sess:
       # Initialize a handle.
@@ -126,6 +133,7 @@ class SessionOpsTest(test.TestCase):
 
       self.assertEqual(101, h.eval())
 
+  @test_util.run_deprecated_v1
   def testHandleMover(self):
     with self.cached_session() as sess:
       # Return a handle.
@@ -147,6 +155,7 @@ class SessionOpsTest(test.TestCase):
         h = self.evaluate(h)
         self.assertEqual(100, sess.run(y, feed_dict={f: h.handle}))
 
+  @test_util.run_deprecated_v1
   def testHandleDelete(self):
     with self.cached_session() as sess:
       # Return a handle.
@@ -156,6 +165,7 @@ class SessionOpsTest(test.TestCase):
       h = session_ops.get_session_handle(c)
       self.evaluate(h).delete()
 
+  @test_util.run_deprecated_v1
   def testHandleDeleteRaw(self):
     with self.cached_session() as sess:
       # Return a handle.
@@ -170,6 +180,7 @@ class SessionOpsTest(test.TestCase):
       f, x = session_ops.delete_session_tensor(raw_h)
       sess.run(x, feed_dict={f: raw_h})
 
+  @test_util.run_deprecated_v1
   def testMultiDevices(self):
     with self.cached_session() as sess:
       with ops.device(test.gpu_device_name()):
@@ -188,6 +199,7 @@ class SessionOpsTest(test.TestCase):
                      b_p: b_handle.handle})
       self.assertEqual(3.0, c_handle.eval())
 
+  @test_util.run_deprecated_v1
   def testHandleGC(self):
     with self.cached_session() as sess:
       # initial values live on CPU
@@ -212,6 +224,7 @@ class SessionOpsTest(test.TestCase):
             feed_dict={add_h1: one_handle.handle,
                        add_h2: x_handle.handle})
 
+  @test_util.run_deprecated_v1
   def testHandlePlacement(self):
     with self.cached_session() as sess:
       a = constant_op.constant(1.0)
@@ -232,6 +245,7 @@ class SessionOpsTest(test.TestCase):
                      b_p: b_handle.handle})
       self.assertEqual(3.0, c_handle.eval())
 
+  @test_util.run_deprecated_v1
   def testFeedOneHandleDirectly(self):
     with self.cached_session() as sess:
       a = constant_op.constant(10.0)
@@ -243,6 +257,7 @@ class SessionOpsTest(test.TestCase):
 
       self.assertAllClose(2500.0, sess.run(d, feed_dict={c: h_c}))
 
+  @test_util.run_deprecated_v1
   def testDirectHandleFeedOverlappingWithFetches(self):
     with self.cached_session() as sess:
       a = constant_op.constant(10.0)
@@ -269,6 +284,7 @@ class SessionOpsTest(test.TestCase):
       self.assertAllClose(50.0, c_val)
       self.assertAllClose(50.0, d_val)
 
+  @test_util.run_deprecated_v1
   def testFeedTwoHandlesDirectly(self):
     with self.cached_session() as sess:
       a = constant_op.constant(10.0)
@@ -283,6 +299,7 @@ class SessionOpsTest(test.TestCase):
       self.assertAllClose(48.0, sess.run(e, feed_dict={c: h_c, d: h_d}))
       self.assertAllClose(-48.0, sess.run(e, feed_dict={c: h_d, d: h_c}))
 
+  @test_util.run_deprecated_v1
   def testFeedHandleToVariableDirectly(self):
     with self.cached_session() as sess:
       a = variables.Variable(12.0)
diff --git a/tensorflow/python/kernel_tests/signal/reconstruction_ops_test.py b/tensorflow/python/kernel_tests/signal/reconstruction_ops_test.py
index d9b45f67c3bd02cd0bac0553d6070cfb6651bfc6..e0ce06418a457eee9a45b172f9cc5887d1167153 100644
--- a/tensorflow/python/kernel_tests/signal/reconstruction_ops_test.py
+++ b/tensorflow/python/kernel_tests/signal/reconstruction_ops_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
@@ -53,16 +54,77 @@ class ReconstructionOpsTest(test.TestCase):
                             "100000000000000"]
 
   def test_all_ones(self):
-    signal = constant_op.constant(np.ones((3, 5)), dtype=dtypes.int64)
+    signal = array_ops.ones([3, 5])
     reconstruction = reconstruction_ops.overlap_and_add(signal, 2)
 
-    with self.session(use_gpu=True) as sess:
+    self.assertEqual(reconstruction.shape.as_list(), [9])
+
+    with self.session(use_gpu=True):
       output = self.evaluate(reconstruction)
 
       expected_output = np.array([1, 1, 2, 2, 3, 2, 2, 1, 1])
 
       self.assertAllClose(output, expected_output)
 
+  @test_util.run_deprecated_v1
+  def test_unknown_shapes(self):
+    # This test uses placeholders and does not work in eager mode.
+    if context.executing_eagerly():
+      return
+
+    signal = array_ops.placeholder(dtype=dtypes.int32, shape=[None, None, None])
+    frame_step = array_ops.placeholder(dtype=dtypes.int32, shape=[])
+    reconstruction = reconstruction_ops.overlap_and_add(signal, frame_step)
+
+    self.assertEqual(reconstruction.shape.as_list(), [None, None])
+
+    with self.session(use_gpu=True) as sess:
+      output = sess.run(reconstruction,
+                        feed_dict={signal: np.ones([4, 3, 5]), frame_step: 2})
+
+      expected_output = np.array([[1, 1, 2, 2, 3, 2, 2, 1, 1]] * 4)
+
+      self.assertAllClose(output, expected_output)
+
+  @test_util.run_deprecated_v1
+  def test_unknown_rank(self):
+    # This test uses placeholders and does not work in eager mode.
+    if context.executing_eagerly():
+      return
+
+    signal = array_ops.placeholder(dtype=dtypes.int32, shape=None)
+    frame_step = array_ops.placeholder(dtype=dtypes.int32, shape=[])
+    reconstruction = reconstruction_ops.overlap_and_add(signal, frame_step)
+
+    self.assertEqual(reconstruction.shape, None)
+
+    with self.session(use_gpu=True) as sess:
+      output = sess.run(reconstruction,
+                        feed_dict={signal: np.ones([4, 3, 5]), frame_step: 2})
+
+      expected_output = np.array([[1, 1, 2, 2, 3, 2, 2, 1, 1]] * 4)
+
+      self.assertAllClose(output, expected_output)
+
+  @test_util.run_deprecated_v1
+  def test_fast_path(self):
+    # This test uses tensor names and does not work in eager mode.
+    if context.executing_eagerly():
+      return
+
+    signal = array_ops.ones([3, 5])
+    frame_step = 5
+    reconstruction = reconstruction_ops.overlap_and_add(signal, frame_step)
+
+    self.assertEqual(reconstruction.name, "overlap_and_add/fast_path:0")
+
+    with self.session(use_gpu=True) as sess:
+      output = self.evaluate(reconstruction)
+
+      expected_output = np.ones([15])
+
+      self.assertAllClose(output, expected_output)
+
   @test_util.run_deprecated_v1
   def test_simple(self):
     def make_input(frame_length, num_frames=3):
@@ -100,7 +162,7 @@ class ReconstructionOpsTest(test.TestCase):
                                   dtype=dtypes.int64)
     reconstruction = reconstruction_ops.overlap_and_add(signal, self.frame_hop)
 
-    with self.session(use_gpu=True) as sess:
+    with self.session(use_gpu=True):
       output = self.evaluate(reconstruction)
       string_output = [np.base_repr(x, self.bases[0]) for x in output]
 
@@ -110,7 +172,7 @@ class ReconstructionOpsTest(test.TestCase):
     signal = constant_op.constant(self.powers, dtype=dtypes.int64)
     reconstruction = reconstruction_ops.overlap_and_add(signal, self.frame_hop)
 
-    with self.session(use_gpu=True) as sess:
+    with self.session(use_gpu=True):
       output = self.evaluate(reconstruction)
 
       accumulator = True
@@ -126,7 +188,7 @@ class ReconstructionOpsTest(test.TestCase):
     signal = constant_op.constant(input_matrix, dtype=dtypes.float32)
     reconstruction = reconstruction_ops.overlap_and_add(signal, self.frame_hop)
 
-    with self.session(use_gpu=True) as sess:
+    with self.session(use_gpu=True):
       output = self.evaluate(reconstruction)
 
       string_output = [np.base_repr(int(x), self.bases[0]) for x in
diff --git a/tensorflow/python/kernel_tests/softmax_op_test.py b/tensorflow/python/kernel_tests/softmax_op_test.py
index 707b8a429f2be1fcce39516d368e2b7a05570652..a82492996a48448c3e5829ee6a8cede0bf20ad92 100644
--- a/tensorflow/python/kernel_tests/softmax_op_test.py
+++ b/tensorflow/python/kernel_tests/softmax_op_test.py
@@ -201,6 +201,15 @@ class SoftmaxTest(test.TestCase):
         use_gpu=False)
     self._testOverflow(use_gpu=False)
 
+  def testAlongNegativeDimension(self):
+    self._testSoftmax(
+        np.array([[[1., 1., 1., 1.], [1., 2., 3., 4.]],
+                  [[2., 3., 4., 5.], [6., 7., 8., 9.]],
+                  [[5., 4., 3., 2.], [1., 2., 3., 4.]]]).astype(np.float32),
+        dim=-2,
+        use_gpu=False)
+    self._testOverflow(use_gpu=False)
+
   def testShapeInference(self):
     op = nn_ops.softmax([[[1., 1., 1., 1.], [1., 2., 3., 4.]],
                          [[2., 3., 4., 5.], [6., 7., 8., 9.]],
diff --git a/tensorflow/python/kernel_tests/sparse_conditional_accumulator_test.py b/tensorflow/python/kernel_tests/sparse_conditional_accumulator_test.py
index 275c86e534940e10af282f9548fbb87a87a41a4d..4a967b656285a1094b8eef17fb0b7f41f83cd8e7 100644
--- a/tensorflow/python/kernel_tests/sparse_conditional_accumulator_test.py
+++ b/tensorflow/python/kernel_tests/sparse_conditional_accumulator_test.py
@@ -267,7 +267,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
       self.assertAllEqual(val.values, [[5, 5], [0, 20], [30, 0]])
       self.assertAllEqual(val.dense_shape, [-1, 2])
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testParallelApplyGradMean(self):
     with self.cached_session() as sess:
       q = data_flow_ops.SparseConditionalAccumulator(
@@ -299,7 +299,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
           np.array([[expected_val, 0], [0, expected_val]]).astype(np.float32),
           val, sess)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testParallelApplyGradSum(self):
     with self.cached_session() as sess:
       q = data_flow_ops.SparseConditionalAccumulator(
@@ -334,7 +334,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
           np.array([[expected_val, 0], [0, expected_val]]).astype(np.float32),
           val, sess)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testParallelTakeGrad(self):
     with self.cached_session() as sess:
       q = data_flow_ops.SparseConditionalAccumulator(
@@ -374,7 +374,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
         self._assertEqual_nparray(
             np.array([[0, 0], [elems[i], 0]]), results[i], sess)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testAccumulatorApplyAndBlockingTake(self):
     with self.cached_session() as sess:
       q = data_flow_ops.SparseConditionalAccumulator(
@@ -410,7 +410,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
     with self.assertRaisesOpError("was cancelled"):
       self.evaluate(takeg_op)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testAccumulatorCancel(self):
     with self.cached_session() as sess:
       q = data_flow_ops.SparseConditionalAccumulator(
@@ -430,7 +430,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
 
       takeg_thread.join()
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testNonVectorIndices(self):
     with self.cached_session():
       q = data_flow_ops.SparseConditionalAccumulator(
@@ -443,7 +443,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
             grad_indices=[[0, 1], [1, 0]],
             grad_values=np.array([1, 2]).astype(np.float32)).run()
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testZeroDimensionValues(self):
     with self.cached_session():
       q = data_flow_ops.SparseConditionalAccumulator(
@@ -454,7 +454,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
         q.apply_grad(
             grad_indices=[0], grad_values=np.array(1).astype(np.float32)).run()
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testWrongNonEmptyInputValues(self):
     with self.cached_session():
       q = data_flow_ops.SparseConditionalAccumulator(
@@ -466,7 +466,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
             grad_indices=[0, 1],
             grad_values=np.array([[0, 1, 1]]).astype(np.float32)).run()
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testDynamicNonVectorIndices(self):
     with self.cached_session() as sess:
       q = data_flow_ops.SparseConditionalAccumulator(
@@ -486,7 +486,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
                      x_values: np.array([1, 2]).astype(np.float32)
                  })
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testDynamicWrongNonEmptyInputValues(self):
     with self.cached_session() as sess:
       q = data_flow_ops.SparseConditionalAccumulator(
@@ -505,7 +505,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
                      x_values: np.array([[0, 1, 1]]).astype(np.float32)
                  })
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testEmptyShapeApply(self):
     with self.cached_session():
       q = data_flow_ops.SparseConditionalAccumulator(
@@ -531,7 +531,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
       q.apply_grad(grad_indices=[0], grad_values=[1.0], grad_shape=[]).run()
       q.apply_grad(grad_indices=[0], grad_values=[1.0]).run()
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testValidateShape(self):
     with self.cached_session() as sess:
       q = data_flow_ops.SparseConditionalAccumulator(
diff --git a/tensorflow/python/kernel_tests/sparse_ops_test.py b/tensorflow/python/kernel_tests/sparse_ops_test.py
index 75f65e625170f231f10cf0e0dbbbad5e1b7f941b..7598991489ce6019352e19cb6c50819d91085b0d 100644
--- a/tensorflow/python/kernel_tests/sparse_ops_test.py
+++ b/tensorflow/python/kernel_tests/sparse_ops_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
@@ -798,6 +799,19 @@ class SparseMathOpsTest(test_util.TensorFlowTestCase):
                                                result_tensor.values).eval()
     self.assertAllEqual(result_np, res_densified)
 
+  @test_util.run_deprecated_v1
+  def testCwiseShapeValidation(self):
+    # Test case for GitHub 24072.
+    with self.session(use_gpu=False):
+      a = array_ops.ones([3, 4, 1], dtype=dtypes.int32)
+      b = sparse_tensor.SparseTensor([[0, 0, 1, 0], [0, 0, 3, 0]], [10, 20],
+                                     [1, 1, 4, 2])
+      c = a * b
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          "broadcasts dense to sparse only; got incompatible shapes"):
+        c.eval()
+
   @test_util.run_deprecated_v1
   def testCwiseDivAndMul(self):
     np.random.seed(1618)
diff --git a/tensorflow/python/kernel_tests/stack_ops_test.py b/tensorflow/python/kernel_tests/stack_ops_test.py
index d50f3f468069df02490368e66ab0871a7f014560..1930d2484fdc986ba8c5ab50df55769aa4fdc45a 100644
--- a/tensorflow/python/kernel_tests/stack_ops_test.py
+++ b/tensorflow/python/kernel_tests/stack_ops_test.py
@@ -96,7 +96,7 @@ class StackOpTest(test.TestCase):
           c1, b1, [r, v], [r.get_shape(), tensor_shape.unknown_shape()])
       self.assertAllClose(np.ones(2000) * 10.0, self.evaluate(ry))
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testStackWhileSwap(self):
     self._testStackWhileSwap(use_gpu=False)
     self._testStackWhileSwap(use_gpu=True)
@@ -248,7 +248,7 @@ class StackOpRefTest(test.TestCase):
           c1, b1, [r, v], [r.get_shape(), tensor_shape.unknown_shape()])
       self.assertAllClose(np.ones(2000) * 10.0, self.evaluate(ry))
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testStackWhileSwap(self):
     self._testStackWhileSwap(use_gpu=False)
     self._testStackWhileSwap(use_gpu=True)
diff --git a/tensorflow/python/kernel_tests/summary_ops_test.py b/tensorflow/python/kernel_tests/summary_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd446eb40eb9ff1931a3eb4555f9dd81a77b659f
--- /dev/null
+++ b/tensorflow/python/kernel_tests/summary_ops_test.py
@@ -0,0 +1,267 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for V2 summary ops from summary_ops_v2."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.core.framework import summary_pb2
+from tensorflow.core.util import event_pb2
+from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.framework import test_util
+from tensorflow.python.lib.io import tf_record
+from tensorflow.python.ops import summary_ops_v2 as summary_ops
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import test
+
+
+class SummaryOpsTest(test_util.TensorFlowTestCase):
+
+  def testWrite(self):
+    logdir = self.get_temp_dir()
+    with context.eager_mode():
+      with summary_ops.create_file_writer(logdir).as_default():
+        output = summary_ops.write('tag', 42, step=12)
+        self.assertTrue(output.numpy())
+    events = events_from_logdir(logdir)
+    self.assertEqual(2, len(events))
+    self.assertEqual(12, events[1].step)
+    value = events[1].summary.value[0]
+    self.assertEqual('tag', value.tag)
+    self.assertEqual(42, to_numpy(value))
+
+  def testWrite_fromFunction(self):
+    logdir = self.get_temp_dir()
+    @def_function.function
+    def f():
+      with summary_ops.create_file_writer(logdir).as_default():
+        return summary_ops.write('tag', 42, step=12)
+    with context.eager_mode():
+      output = f()
+      self.assertTrue(output.numpy())
+    events = events_from_logdir(logdir)
+    self.assertEqual(2, len(events))
+    self.assertEqual(12, events[1].step)
+    value = events[1].summary.value[0]
+    self.assertEqual('tag', value.tag)
+    self.assertEqual(42, to_numpy(value))
+
+  def testWrite_metadata(self):
+    logdir = self.get_temp_dir()
+    metadata = summary_pb2.SummaryMetadata()
+    metadata.plugin_data.plugin_name = 'foo'
+    with context.eager_mode():
+      with summary_ops.create_file_writer(logdir).as_default():
+        summary_ops.write('obj', 0, 0, metadata=metadata)
+        summary_ops.write('bytes', 0, 0, metadata=metadata.SerializeToString())
+        m = constant_op.constant(metadata.SerializeToString())
+        summary_ops.write('string_tensor', 0, 0, metadata=m)
+    events = events_from_logdir(logdir)
+    self.assertEqual(4, len(events))
+    self.assertEqual(metadata, events[1].summary.value[0].metadata)
+    self.assertEqual(metadata, events[2].summary.value[0].metadata)
+    self.assertEqual(metadata, events[3].summary.value[0].metadata)
+
+  def testWrite_name(self):
+    @def_function.function
+    def f():
+      output = summary_ops.write('tag', 42, step=12, name='anonymous')
+      self.assertTrue(output.name.startswith('anonymous'))
+    f()
+
+  def testWrite_ndarray(self):
+    logdir = self.get_temp_dir()
+    with context.eager_mode():
+      with summary_ops.create_file_writer(logdir).as_default():
+        summary_ops.write('tag', [[1, 2], [3, 4]], step=12)
+    events = events_from_logdir(logdir)
+    value = events[1].summary.value[0]
+    self.assertAllEqual([[1, 2], [3, 4]], to_numpy(value))
+
+  def testWrite_tensor(self):
+    logdir = self.get_temp_dir()
+    with context.eager_mode():
+      t = constant_op.constant([[1, 2], [3, 4]])
+      with summary_ops.create_file_writer(logdir).as_default():
+        summary_ops.write('tag', t, step=12)
+      expected = t.numpy()
+    events = events_from_logdir(logdir)
+    value = events[1].summary.value[0]
+    self.assertAllEqual(expected, to_numpy(value))
+
+  def testWrite_tensor_fromFunction(self):
+    logdir = self.get_temp_dir()
+    @def_function.function
+    def f(t):
+      with summary_ops.create_file_writer(logdir).as_default():
+        summary_ops.write('tag', t, step=12)
+    with context.eager_mode():
+      t = constant_op.constant([[1, 2], [3, 4]])
+      f(t)
+      expected = t.numpy()
+    events = events_from_logdir(logdir)
+    value = events[1].summary.value[0]
+    self.assertAllEqual(expected, to_numpy(value))
+
+  def testWrite_stringTensor(self):
+    logdir = self.get_temp_dir()
+    with context.eager_mode():
+      with summary_ops.create_file_writer(logdir).as_default():
+        summary_ops.write('tag', [b'foo', b'bar'], step=12)
+    events = events_from_logdir(logdir)
+    value = events[1].summary.value[0]
+    self.assertAllEqual([b'foo', b'bar'], to_numpy(value))
+
+  @test_util.also_run_as_tf_function
+  def testWrite_noDefaultWriter(self):
+    with context.eager_mode():
+      self.assertFalse(summary_ops.write('tag', 42, step=0))
+
+  def testWrite_shouldRecordSummaries(self):
+    logdir = self.get_temp_dir()
+    with context.eager_mode():
+      with summary_ops.create_file_writer(logdir).as_default():
+        self.assertTrue(summary_ops.write('default_on', 1, step=0))
+        with summary_ops.always_record_summaries():
+          self.assertTrue(summary_ops.write('set_on', 1, step=0))
+        with summary_ops.never_record_summaries():
+          self.assertFalse(summary_ops.write('set_off', 1, step=0))
+    events = events_from_logdir(logdir)
+    self.assertEqual(3, len(events))
+    self.assertEqual('default_on', events[1].summary.value[0].tag)
+    self.assertEqual('set_on', events[2].summary.value[0].tag)
+
+  def testWrite_shouldRecordSummaries_fromFunction(self):
+    logdir = self.get_temp_dir()
+    @def_function.function
+    def f(tag_prefix):
+      with summary_ops.create_file_writer(logdir).as_default():
+        default_output = summary_ops.write(tag_prefix + '_default', 1, step=0)
+        with summary_ops.always_record_summaries():
+          on_output = summary_ops.write(tag_prefix + '_on', 1, step=0)
+        with summary_ops.never_record_summaries():
+          off_output = summary_ops.write(tag_prefix + '_off', 1, step=0)
+        return [default_output, on_output, off_output]
+    with context.eager_mode():
+      self.assertAllEqual([True, True, False], f('default'))
+      with summary_ops.always_record_summaries():
+        self.assertAllEqual([True, True, False], f('on'))
+      with summary_ops.never_record_summaries():
+        self.assertAllEqual([False, True, False], f('off'))
+    events = events_from_logdir(logdir)
+    self.assertEqual(6, len(events))
+    self.assertEqual('default_default', events[1].summary.value[0].tag)
+    self.assertEqual('default_on', events[2].summary.value[0].tag)
+    self.assertEqual('on_default', events[3].summary.value[0].tag)
+    self.assertEqual('on_on', events[4].summary.value[0].tag)
+    self.assertEqual('off_on', events[5].summary.value[0].tag)
+
+  @test_util.also_run_as_tf_function
+  def testSummaryScope(self):
+    with summary_ops.summary_scope('foo') as (tag, scope):
+      self.assertEqual('foo', tag)
+      self.assertEqual('foo/', scope)
+      with summary_ops.summary_scope('bar') as (tag, scope):
+        self.assertEqual('foo/bar', tag)
+        self.assertEqual('foo/bar/', scope)
+      with summary_ops.summary_scope('with/slash') as (tag, scope):
+        self.assertEqual('foo/with/slash', tag)
+        self.assertEqual('foo/with/slash/', scope)
+      with ops.name_scope(None):
+        with summary_ops.summary_scope('unnested') as (tag, scope):
+          self.assertEqual('unnested', tag)
+          self.assertEqual('unnested/', scope)
+
+  @test_util.also_run_as_tf_function
+  def testSummaryScope_defaultName(self):
+    with summary_ops.summary_scope(None) as (tag, scope):
+      self.assertEqual('summary', tag)
+      self.assertEqual('summary/', scope)
+    with summary_ops.summary_scope(None, 'backup') as (tag, scope):
+      self.assertEqual('backup', tag)
+      self.assertEqual('backup/', scope)
+
+  @test_util.also_run_as_tf_function
+  def testSummaryScope_handlesCharactersIllegalForScope(self):
+    with summary_ops.summary_scope('f?o?o') as (tag, scope):
+      self.assertEqual('f?o?o', tag)
+      self.assertEqual('foo/', scope)
+    # If all characters aren't legal for a scope name, use default name.
+    with summary_ops.summary_scope('???', 'backup') as (tag, scope):
+      self.assertEqual('???', tag)
+      self.assertEqual('backup/', scope)
+
+  @test_util.also_run_as_tf_function
+  def testSummaryScope_nameNotUniquifiedForTag(self):
+    constant_op.constant(0, name='foo')
+    with summary_ops.summary_scope('foo') as (tag, _):
+      self.assertEqual('foo', tag)
+    with summary_ops.summary_scope('foo') as (tag, _):
+      self.assertEqual('foo', tag)
+    with ops.name_scope('with'):
+      constant_op.constant(0, name='slash')
+    with summary_ops.summary_scope('with/slash') as (tag, _):
+      self.assertEqual('with/slash', tag)
+
+
+def events_from_file(filepath):
+  """Returns all events in a single event file.
+
+  Args:
+    filepath: Path to the event file.
+
+  Returns:
+    A list of all tf.Event protos in the event file.
+  """
+  records = list(tf_record.tf_record_iterator(filepath))
+  result = []
+  for r in records:
+    event = event_pb2.Event()
+    event.ParseFromString(r)
+    result.append(event)
+  return result
+
+
+def events_from_logdir(logdir):
+  """Returns all events in the single eventfile in logdir.
+
+  Args:
+    logdir: The directory in which the single event file is sought.
+
+  Returns:
+    A list of all tf.Event protos from the single event file.
+
+  Raises:
+    AssertionError: If logdir does not contain exactly one file.
+  """
+  assert gfile.Exists(logdir)
+  files = gfile.ListDirectory(logdir)
+  assert len(files) == 1, 'Found not exactly one file in logdir: %s' % files
+  return events_from_file(os.path.join(logdir, files[0]))
+
+
+def to_numpy(summary_value):
+  return tensor_util.MakeNdarray(summary_value.tensor)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/kernel_tests/svd_op_test.py b/tensorflow/python/kernel_tests/svd_op_test.py
index 97a280ef51c1c4330e405feea6a2efd07a78e399..cfa9f122d1fcee1748cd30bdc4212d81a5709ae6 100644
--- a/tensorflow/python/kernel_tests/svd_op_test.py
+++ b/tensorflow/python/kernel_tests/svd_op_test.py
@@ -20,7 +20,9 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python import tf2
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import linalg_ops
@@ -38,6 +40,7 @@ def _AddTest(test_class, op_name, testcase_name, fn):
 
 class SvdOpTest(test.TestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testWrongDimensions(self):
     # The input to svd should be a tensor of at least rank 2.
     scalar = constant_op.constant(1.)
@@ -49,6 +52,7 @@ class SvdOpTest(test.TestCase):
                                  "Shape must be at least rank 2 but is rank 1"):
       linalg_ops.svd(vector)
 
+  @test_util.run_v1_only("b/120545219")
   def testConcurrentExecutesWithoutError(self):
     with self.session(use_gpu=True) as sess:
       all_ops = []
@@ -117,14 +121,15 @@ def _GetSvdOpTest(dtype_, shape_, use_static_shape_, compute_uv_,
         diag_s = array_ops.concat([diag_s, zeros], a.ndim - 1)
     a_recon = math_ops.matmul(u, diag_s)
     a_recon = math_ops.matmul(a_recon, v, adjoint_b=True)
-    self.assertAllClose(a_recon.eval(), a, rtol=tol, atol=tol)
+    self.assertAllClose(a_recon, a, rtol=tol, atol=tol)
 
   def CheckUnitary(self, x, tol):
     # Tests that x[...,:,:]^H * x[...,:,:] is close to the identity.
     xx = math_ops.matmul(x, x, adjoint_a=True)
     identity = array_ops.matrix_band_part(array_ops.ones_like(xx), 0, 0)
-    self.assertAllClose(identity.eval(), self.evaluate(xx), atol=tol)
+    self.assertAllClose(identity, xx, atol=tol)
 
+  @test_util.run_v1_only("b/120545219")
   def Test(self):
     is_complex = dtype_ in (np.complex64, np.complex128)
     is_single = dtype_ in (np.float32, np.complex64)
@@ -213,6 +218,7 @@ def _GetSvdGradOpTest(dtype_, shape_, compute_uv_, full_matrices_):
     tf_v *= phase[..., :n]
     return tf_s, tf_u, tf_v
 
+  @test_util.run_v1_only("b/120545219")
   def Test(self):
     np.random.seed(42)
     a = np.random.uniform(low=-1.0, high=1.0, size=shape_).astype(dtype_)
@@ -263,7 +269,8 @@ if __name__ == "__main__":
           for cols in 1, 2, 5, 10, 32, 100:
             for batch_dims in [(), (3,)] + [(3, 2)] * (max(rows, cols) < 10):
               shape = batch_dims + (rows, cols)
-              for use_static_shape in True, False:
+              # TF2 does not support placeholders under eager so we skip it
+              for use_static_shape in set([True, tf2.enabled()]):
                 name = "%s_%s_static_shape_%s__compute_uv_%s_full_%s" % (
                     dtype.__name__, "_".join(map(str, shape)), use_static_shape,
                     compute_uv, full_matrices)
diff --git a/tensorflow/python/kernel_tests/tensor_array_ops_test.py b/tensorflow/python/kernel_tests/tensor_array_ops_test.py
index 76e90ffea3914ac6b22443e8d17265882e4a3cdd..147e7fde5793d4ac0b85696715aa7645f8e79bb2 100644
--- a/tensorflow/python/kernel_tests/tensor_array_ops_test.py
+++ b/tensorflow/python/kernel_tests/tensor_array_ops_test.py
@@ -32,6 +32,7 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import control_flow_util
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import gen_data_flow_ops
 from tensorflow.python.ops import gradients_impl
@@ -123,10 +124,7 @@ class TensorArrayTest(test.TestCase):
     self._testTensorArrayWritePack(dtypes.int64)
     self._testTensorArrayWritePack(dtypes.complex64)
     self._testTensorArrayWritePack(dtypes.complex128)
-    if not (test.is_gpu_available() and
-            tensor_array_ops.ENABLE_TENSOR_ARRAY_V2):
-      # TODO(b/119684648): Enable this.
-      self._testTensorArrayWritePack(dtypes.string)
+    self._testTensorArrayWritePack(dtypes.string)
 
   def testTensorArrayWritePack(self):
     self._testTensorArrayWritePackMaybeLegacy()
@@ -164,7 +162,6 @@ class TensorArrayTest(test.TestCase):
           convert([[4.0, 5.0], [104.0, 105.0], [204.0, 205.0], [6.0, 7.0],
                    [106.0, 107.0], [8.0, 9.0]]), c0)
 
-  @test_util.disable_control_flow_v2("b/118343594 (TensorArray.concat)")
   @test_util.run_deprecated_v1
   def testTensorArrayWriteConcat(self):
     self._testTensorArrayWriteConcat(dtypes.float32)
@@ -189,7 +186,7 @@ class TensorArrayTest(test.TestCase):
                           self.evaluate(ta.write(1, [[4.0, 5.0]]).concat()))
 
   @test_util.disable_control_flow_v2("b/118890905")
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/118890905")
   def testTensorArrayReadOrPackNotAllValuesAvailableFillsZeros(self):
     self._testTensorArrayReadOrPackNotAllValuesAvailableFillsZeros()
 
@@ -206,7 +203,7 @@ class TensorArrayTest(test.TestCase):
                         self.evaluate(ta.write(1, [[4.0, 5.0]]).concat()))
 
   @test_util.disable_control_flow_v2("b/118890905")
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/118890905")
   def testTensorArrayReadOrPackNotAllValuesAvailableInferShapeFillsZeros(self):
     self._testTensorArrayReadOrPackNotAllValuesAvailableInferShapeFillsZeros()
 
@@ -255,10 +252,7 @@ class TensorArrayTest(test.TestCase):
     self._testTensorArrayUnpackRead(dtypes.int64)
     self._testTensorArrayUnpackRead(dtypes.complex64)
     self._testTensorArrayUnpackRead(dtypes.complex128)
-    if not (test.is_gpu_available() and
-            tensor_array_ops.ENABLE_TENSOR_ARRAY_V2):
-      # TODO(b/119684648): Enable this.
-      self._testTensorArrayUnpackRead(dtypes.string)
+    self._testTensorArrayUnpackRead(dtypes.string)
 
   def testTensorArrayUnpackRead(self):
     self._testTensorArrayUnpackReadMaybeLegacy()
@@ -305,7 +299,6 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual(convert([]).reshape(0, 2), d1)
       self.assertAllEqual(convert([[3.0, 301.0]]), d2)
 
-  @test_util.disable_control_flow_v2("b/118343962 (TensorArray.split)")
   @test_util.run_deprecated_v1
   def testTensorArraySplitRead(self):
     self._testTensorArraySplitRead(dtypes.float32)
@@ -317,7 +310,7 @@ class TensorArrayTest(test.TestCase):
     self._testTensorArraySplitRead(dtypes.string)
 
   @test_util.disable_control_flow_v2("v2 does not support TensorArray.grad.")
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("v2 does not support TensorArray.grad.")
   def testSkipEagerTensorGradArrayWriteRead(self):
     with self.session(use_gpu=True) as session:
       ta = tensor_array_ops.TensorArray(
@@ -353,7 +346,7 @@ class TensorArrayTest(test.TestCase):
 
   @test_util.run_deprecated_v1
   def testSkipEagerTensorArrayGradGrad(self):
-    if not tensor_array_ops.ENABLE_TENSOR_ARRAY_V2:
+    if not control_flow_util.ENABLE_CONTROL_FLOW_V2:
       self.skipTest("Legacy TensorArray does not support double derivatives.")
     with self.test_session(use_gpu=True) as session:
       x = constant_op.constant(4.0)
@@ -372,7 +365,7 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual([2.0], session.run(g2))
 
   @test_util.disable_control_flow_v2("v2 does not support TensorArray.grad.")
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("v2 does not support TensorArray.grad.")
   def testSkipEagerTensorGradArrayDynamicWriteRead(self):
     with self.session(use_gpu=True) as session:
       ta = tensor_array_ops.TensorArray(
@@ -415,7 +408,7 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual(3, g_vs)
 
   @test_util.disable_control_flow_v2("v2 does not support TensorArray.grad.")
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("v2 does not support TensorArray.grad.")
   def testSkipEagerTensorGradAccessTwiceReceiveSameObject(self):
     with self.session(use_gpu=True) as session:
       ta = tensor_array_ops.TensorArray(
@@ -432,12 +425,11 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual(t_g_ta_0, t_g_ta_1)
       self.assertAllEqual([[4.0, 5.0]], d_r1_0)
 
-  @test_util.run_deprecated_v1
   def testTensorArrayWriteWrongIndexOrDataTypeFails(self):
     with self.session(use_gpu=True):
       ta = _make_ta(3, "foo", dtype=dtypes.float32)
       # Test writing the wrong datatype
-      if (tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 and
+      if (control_flow_util.ENABLE_CONTROL_FLOW_V2 and
           not context.executing_eagerly()):
         error_msg = ("Invalid data types; op elements string but list elements "
                      "float")
@@ -448,7 +440,7 @@ class TensorArrayTest(test.TestCase):
       with self.assertRaisesOpError(error_msg):
         self.evaluate(ta.write(0, "wrong_type_scalar").flow)
 
-      if (tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 and
+      if (control_flow_util.ENABLE_CONTROL_FLOW_V2 and
           not context.executing_eagerly()):
         error_msg = "Trying to modify element -1 in a list with 3 elements."
       else:
@@ -456,7 +448,7 @@ class TensorArrayTest(test.TestCase):
       with self.assertRaisesOpError(error_msg):
         self.evaluate(ta.write(-1, 3.0).flow)
 
-      if (tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 and
+      if (control_flow_util.ENABLE_CONTROL_FLOW_V2 and
           not context.executing_eagerly()):
         error_msg = "Trying to modify element 3 in a list with 3 elements"
       else:
@@ -466,7 +458,6 @@ class TensorArrayTest(test.TestCase):
       with self.assertRaisesOpError(error_msg):
         self.evaluate(ta.write(3, 3.0).flow)
 
-  @test_util.run_deprecated_v1
   def testTensorArrayReadWrongIndexOrDataTypeFails(self):
     with self.session(use_gpu=True):
       ta = _make_ta(3, "foo", dtype=dtypes.float32)
@@ -475,14 +466,14 @@ class TensorArrayTest(test.TestCase):
 
       # Test reading wrong datatype (only possible when constructing graphs).
       if (not context.executing_eagerly() and
-          not tensor_array_ops.ENABLE_TENSOR_ARRAY_V2):
+          not control_flow_util.ENABLE_CONTROL_FLOW_V2):
         r0_bad = gen_data_flow_ops.tensor_array_read_v3(
             handle=w0.handle, index=0, dtype=dtypes.float64, flow_in=w0.flow)
         with self.assertRaisesOpError(
             "TensorArray dtype is float but Op requested dtype double."):
           self.evaluate(r0_bad)
 
-      if (tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 and
+      if (control_flow_util.ENABLE_CONTROL_FLOW_V2 and
           not context.executing_eagerly()):
         error_msg = "Trying to access element -1 in a list with 3 elements."
       else:
@@ -491,7 +482,7 @@ class TensorArrayTest(test.TestCase):
       with self.assertRaisesOpError(error_msg):
         self.evaluate(ta.read(-1))
 
-      if (tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 and
+      if (control_flow_util.ENABLE_CONTROL_FLOW_V2 and
           not context.executing_eagerly()):
         error_msg = "Trying to access element 3 in a list with 3 elements."
       else:
@@ -501,7 +492,7 @@ class TensorArrayTest(test.TestCase):
         self.evaluate(ta.read(3))
 
   @test_util.disable_control_flow_v2("v2 allows multiple writes.")
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("v2 allows multiple writes.")
   def testSkipEagerTensorArrayWriteMultipleFails(self):
     with self.session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
@@ -512,8 +503,6 @@ class TensorArrayTest(test.TestCase):
           "it has already been written to."):
         self.evaluate(ta.write(2, 3.0).write(2, 3.0).flow)
 
-  @test_util.disable_control_flow_v2("b/118343594 (TensorArray.concat)")
-  @test_util.run_deprecated_v1
   def testTensorArrayConcatIncompatibleShapesFails(self):
     with self.session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
@@ -545,8 +534,6 @@ class TensorArrayTest(test.TestCase):
       with self.assertRaisesOpError("shape"):
         self.evaluate(w3.concat())
 
-  @test_util.disable_control_flow_v2("b/118343962 (TensorArray.split)")
-  @test_util.run_deprecated_v1
   def testTensorArraySplitIncompatibleShapesFails(self):
     with self.session(use_gpu=True):
       in_eager_mode = context.executing_eagerly()
@@ -559,22 +546,32 @@ class TensorArrayTest(test.TestCase):
           lengths = array_ops.placeholder(dtypes.int64)
           ta.split([1.0, 2.0, 3.0], lengths).flow.eval(feed_dict={lengths: 1})
 
-      with self.assertRaisesOpError(
-          r"Expected sum of lengths to be equal to values.shape\[0\], "
-          r"but sum of lengths is 1 and value's shape is: \[3\]"):
+      error_msg = ("Unused values in tensor. Length of tensor: 3 Values used: 1"
+                   if control_flow_util.ENABLE_CONTROL_FLOW_V2 and
+                   not in_eager_mode else
+                   r"Expected sum of lengths to be equal to values.shape\[0\], "
+                   r"but sum of lengths is 1 and value's shape is: \[3\]")
+      with self.assertRaisesOpError(error_msg):
         self.evaluate(ta.split([1.0, 2.0, 3.0], [1]).flow)
 
       ta = _make_ta(1, "baz")
-      with self.assertRaisesOpError(
-          r"Expected value to be at least a vector, but received shape: \[\]"):
-        self.evaluate(ta.split(1.0, [1]).flow)
+      if control_flow_util.ENABLE_CONTROL_FLOW_V2 and not in_eager_mode:
+        with self.assertRaisesRegexp(
+            ValueError, "Shape must be at least rank 1 but is rank 0"):
+          self.evaluate(ta.split(1.0, [1]).flow)
+      else:
+        with self.assertRaisesOpError(
+            r"Expected value to be at least a vector, but received shape: \[\]"
+        ):
+          self.evaluate(ta.split(1.0, [1]).flow)
 
-      ta = _make_ta(2, "buz")
-      with self.assertRaisesOpError(
-          r"TensorArray's size is not equal to the size of lengths "
-          r"\(2 vs. 1\), and the TensorArray is not marked as "
-          r"dynamically resizeable"):
-        self.evaluate(ta.split([1.0], [1]).flow)
+      if not control_flow_util.ENABLE_CONTROL_FLOW_V2 or in_eager_mode:
+        ta = _make_ta(2, "buz")
+        with self.assertRaisesOpError(
+            r"TensorArray's size is not equal to the size of lengths "
+            r"\(2 vs. 1\), and the TensorArray is not marked as "
+            r"dynamically resizeable"):
+          self.evaluate(ta.split([1.0], [1]).flow)
 
   def _testTensorArrayWriteGradientAddMultipleAdds(self, dtype):
     with self.cached_session(use_gpu=True):
@@ -611,14 +608,14 @@ class TensorArrayTest(test.TestCase):
         wb1_grad.flow.eval()
 
   @test_util.disable_control_flow_v2("v2 does not support TensorArray.grad.")
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("v2 does not support TensorArray.grad.")
   def testSkipEagerTensorArrayWriteGradientAddMultipleAdds(self):
     for dtype in (dtypes.int32, dtypes.int64, dtypes.float32, dtypes.float64,
                   dtypes.complex64, dtypes.complex128):
       self._testTensorArrayWriteGradientAddMultipleAdds(dtype)
 
   @test_util.disable_control_flow_v2("Low level legacy TA op test.")
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("Low level legacy TA op test.")
   def testSkipEagerTensorArrayGradWithShapeKnownElementShape(self):
     with self.session(use_gpu=True) as sess:
       ta = tensor_array_ops.TensorArray(
@@ -649,7 +646,7 @@ class TensorArrayTest(test.TestCase):
                           sess.run(read_value, feed_dict={value: fed_value}))
 
   @test_util.disable_control_flow_v2("Low level legacy TA op test.")
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("Low level legacy TA op test.")
   def testSkipEagerTensorArrayGradWithShapeUnknownElementShape(self):
     with self.session(use_gpu=True) as sess:
       ta = tensor_array_ops.TensorArray(
@@ -773,13 +770,12 @@ class TensorArrayTest(test.TestCase):
       self.assertAllClose([2.0 - 0.5 + 20.0, 3.0 + 1.5 + 30.0], grad_vals[0])
       self.assertAllEqual([4.0 + 40.0, 5.0 + 50.0], grad_vals[1])
 
-  @test_util.disable_control_flow_v2("b/118343594 (TensorArray.concat)")
   @test_util.run_deprecated_v1
   def testSkipEagerTensorArrayGradientWritePackConcatAndRead(self):
     self._testTensorArrayGradientWritePackConcatAndRead()
 
   @test_util.disable_control_flow_v2("v2 does not support clear_after_read.")
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("v2 does not support clear_after_read.")
   def testTensorArrayReadTwice(self):
     with self.session(use_gpu=True):
       value = constant_op.constant([[1.0, -1.0], [10.0, -10.0]])
@@ -837,7 +833,6 @@ class TensorArrayTest(test.TestCase):
   def testSkipEagerTensorArrayGradientUnpackRead(self):
     self._testTensorArrayGradientUnpackRead()
 
-  @test_util.disable_control_flow_v2("b/118343962 (TensorArray.split)")
   @test_util.run_deprecated_v1
   def testSkipEagerTensorArrayGradientSplitConcat(self):
     with self.session(use_gpu=True) as session:
@@ -960,7 +955,7 @@ class TensorArrayTest(test.TestCase):
         v0_grad = gradients_impl.gradients([vout], [v0], [grad_val])[0]
         state0_grad = gradients_impl.gradients([vout], [state0], [grad_val])[0]
         var_grad = gradients_impl.gradients([vout], [var], [grad_val])[0]
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
 
       state0_t, var_t, v0_t, vout_t, v0_grad_t, var_grad_t, state0_grad_t = (
           self.evaluate(
@@ -1005,22 +1000,8 @@ class TensorArrayTest(test.TestCase):
     # self._testWhileLoopWritePackGradients(
     #     dynamic_size=False, dtype=tf.int64)
 
-  @test_util.disable_control_flow_v2("Testing v1 while_loop with v2 TA")
-  @test_util.enable_tensor_array_v2
-  def testWhileLoopV1WithTensorArrayV2(self):
-    size = 3
-    ta = tensor_array_ops.TensorArray(
-        dtype=dtypes.int32, size=size, element_shape=tensor_shape.scalar())
-
-    def Body(counter, ta):
-      return counter + 1, ta.write(counter, counter)
-
-    _, ta = control_flow_ops.while_loop(lambda i, _: i < size, Body, [0, ta])
-
-    for i in range(size):
-      self.assertEqual(self.evaluate(ta.read(i)), i)
-
   @test_util.disable_control_flow_v2("b/117943489 (dynamic_size)")
+  @test_util.run_v1_only("b/117943489")
   def testSkipEagerWhileLoopDynamicWritePackGradients(self):
     self._testWhileLoopWritePackGradients(
         dynamic_size=True, dtype=dtypes.float32)
@@ -1168,7 +1149,6 @@ class TensorArrayTest(test.TestCase):
       with self.assertRaises(ValueError):
         w0.write(0, c2)
 
-  @test_util.disable_control_flow_v2("b/118343962 (TensorArray.split)")
   @test_util.run_deprecated_v1
   def testSkipEagerPartlyUnknownShape(self):
     with self.session(use_gpu=True):
@@ -1241,11 +1221,10 @@ class TensorArrayTest(test.TestCase):
         w1.write(4, c2)
 
   @test_util.disable_control_flow_v2("b/117943489 (dynamic_size)")
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/117943489")
   def testUnpackShape(self):
     self._testUnpackShape()
 
-  @test_util.disable_control_flow_v2("b/118343962 (TensorArray.split)")
   @test_util.run_deprecated_v1
   def testSplitShape(self):
     with self.session(use_gpu=True):
@@ -1273,9 +1252,10 @@ class TensorArrayTest(test.TestCase):
         self.assertEqual((2, 2), w0.read(1).get_shape())
       else:
         self.assertEqual(r0.get_shape().ndims, None)
-        self.assertEqual(
-            tensor_shape.TensorShape(
-                ta1.handle.op.get_attr("element_shape")).ndims, None)
+        if not control_flow_util.ENABLE_CONTROL_FLOW_V2:
+          self.assertEqual(
+              tensor_shape.TensorShape(
+                  ta1.handle.op.get_attr("element_shape")).ndims, None)
 
   @test_util.run_deprecated_v1
   def testSkipEagerWriteUnknownShape(self):
@@ -1321,12 +1301,12 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual(np.array([1.0, 1.0, 1.0]), self.evaluate(grad)[0])
 
   @test_util.disable_control_flow_v2("b/117943489")
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/117943489")
   def testSkipEagerTensorArrayUnpackDynamic(self):
     self._testTensorArrayUnpackDynamic()
 
-  @test_util.disable_control_flow_v2("b/118343594 (TensorArray.concat)")
-  @test_util.run_deprecated_v1
+  @test_util.disable_control_flow_v2("b/117943489")
+  @test_util.run_v1_only("b/117943489")
   def testSkipEagerTensorArraySplitDynamic(self):
     with self.session(use_gpu=True) as sess:
       ta = tensor_array_ops.TensorArray(
@@ -1349,11 +1329,11 @@ class TensorArrayTest(test.TestCase):
           "TensorArray has size zero, but element shape <unknown> is not "
           "fully defined. Currently only static shapes are supported when "
           "packing zero-size TensorArrays.")
-      with self.assertRaisesOpError(v2_msg if tensor_array_ops
-                                    .ENABLE_TENSOR_ARRAY_V2 else v1_msg):
+      with self.assertRaisesOpError(
+          v2_msg if control_flow_util.ENABLE_CONTROL_FLOW_V2 else v1_msg):
         ta.stack().eval()
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testSkipEagerTensorArrayEvalEmpty(self):
     self._testTensorArrayEvalEmpty()
 
@@ -1374,12 +1354,12 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual([0, 5], self.evaluate(concatenated).shape)
 
   @test_util.disable_control_flow_v2("b/117943489")
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/117943489")
   def testSkipEagerTensorArrayEvalEmptyWithDefault(self):
     self._testTensorArrayEvalEmptyWithDefault()
 
   @test_util.disable_control_flow_v2("b/117943489")
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/117943489")
   def testSkipEagerTensorArrayScatterReadAndGradients(self):
     with self.session(use_gpu=True) as session:
       ta = tensor_array_ops.TensorArray(
@@ -1407,7 +1387,7 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual([[2.0, 3.0], [4.0, 5.0]], grad_vals[0])
 
   @test_util.disable_control_flow_v2("b/117943286")
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/117943286")
   def testTensorArrayWriteGatherAndGradients(self):
     with self.session(use_gpu=True) as session:
       ta = tensor_array_ops.TensorArray(
@@ -1445,7 +1425,7 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual(expected_grad, grad_vals[0])
 
   @test_util.disable_control_flow_v2("colocate_with not supported in v2.")
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testSkipEagerTensorArrayGetsDeviceFromFirstWrite(self):
     with ops.device("/job:worker/task:0/cpu:0"):
       # this initial device will be ignored.
@@ -1495,7 +1475,7 @@ class TensorArrayTest(test.TestCase):
             [s for s in dev_stats[d] if "/TensorArray" in s.node_name])
 
   @test_util.disable_control_flow_v2("colocate_with not supported in v2.")
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testSkipEagerTensorArrayGetsDeviceFromFirstWriteInWhileLoop(self):
     with ops.device("/job:worker/task:0/cpu:0"):
       ta = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=2)
@@ -1526,7 +1506,7 @@ class TensorArrayTest(test.TestCase):
             [s for s in dev_stats[d] if "TensorArray" == s.node_name])
 
   @test_util.disable_control_flow_v2("colocate_with not supported in v2.")
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testSkipEagerTensorArrayDisabledColocateWithFirstWriteCall(self):
     with ops.device("/job:worker/task:0/cpu:0"):
       ta = tensor_array_ops.TensorArray(
@@ -1594,7 +1574,7 @@ class TensorArrayTest(test.TestCase):
       self.assertEqual(tensor_shape.scalar(), read1.get_shape())
 
       if not context.executing_eagerly():
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
 
       read0_v, read1_v, size0_v, size1_v = self.evaluate((read0, read1, size0,
                                                           size1))
diff --git a/tensorflow/python/kernel_tests/tensordot_op_test.py b/tensorflow/python/kernel_tests/tensordot_op_test.py
index 123c9b376c9de0b39b1b6a61548819501ec4bd59..febfe23b16d0a5b56102dd1c4c21d5cf16a0e1dc 100644
--- a/tensorflow/python/kernel_tests/tensordot_op_test.py
+++ b/tensorflow/python/kernel_tests/tensordot_op_test.py
@@ -20,9 +20,11 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python import tf2
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test as test_lib
@@ -39,6 +41,7 @@ def _add_test(test, test_name, fn):
 
 class TensordotTest(test_lib.TestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def test_invalid_shape(self):
     a = [[1, 2], [3, 4]]
     b = [[1, 2], [3, 4], [5, 6]]
@@ -62,6 +65,7 @@ class TensordotTest(test_lib.TestCase):
                 axes_ph: (a_axes, b_axes)
             })
 
+  @test_util.run_v1_only("b/120545219")
   def test_invalid_axes(self):
     a = [[1, 2], [3, 4]]
     b = [[1, 2], [3, 4]]
@@ -99,11 +103,12 @@ class TensordotTest(test_lib.TestCase):
 
         tf_a = array_ops.ones((3, 3), dtype=dtypes.float32)
         tf_b = constant_op.constant([2, 3, 1], dtype=dtypes.float32)[None, None]
-        tf_ans = math_ops.tensordot(tf_a, tf_b, axes_value).eval()
+        tf_ans = math_ops.tensordot(tf_a, tf_b, axes_value)
 
         self.assertAllEqual(tf_ans.shape, np_ans.shape)
         self.assertAllEqual(tf_ans, np_ans)
 
+  @test_util.run_v1_only("b/120545219")
   def test_partial_shape_inference(self):
     for axes in ([1], [0]), 1:
       a = array_ops.placeholder(dtypes.float32)
@@ -178,7 +183,7 @@ def _get_tensordot_tests(dtype_, rank_a_, rank_b_, num_dims_, dynamic_shape_):
                   axes: (a_dims_np, b_dims_np)
               })
         else:
-          tf_ans = math_ops.tensordot(a_np, b_np, (a_dims_np, b_dims_np)).eval()
+          tf_ans = math_ops.tensordot(a_np, b_np, (a_dims_np, b_dims_np))
       self.assertAllClose(tf_ans, np_ans, rtol=tol, atol=tol)
       self.assertAllEqual(tf_ans.shape, np_ans.shape)
 
@@ -208,7 +213,7 @@ def _get_tensordot_tests(dtype_, rank_a_, rank_b_, num_dims_, dynamic_shape_):
           c = math_ops.tensordot(a, b, axes=axes)
           tf_ans = sess.run(c, feed_dict={a: a_np, b: b_np})
         else:
-          tf_ans = math_ops.tensordot(a_np, b_np, axes=axes).eval()
+          tf_ans = math_ops.tensordot(a_np, b_np, axes=axes)
       self.assertAllClose(tf_ans, np_ans, rtol=tol, atol=tol)
       self.assertAllEqual(tf_ans.shape, np_ans.shape)
 
@@ -220,7 +225,8 @@ if __name__ == "__main__":
     for rank_a in 1, 2, 4, 5:
       for rank_b in 1, 2, 4, 5:
         for num_dims in range(0, min(rank_a, rank_b) + 1):
-          for dynamic_shape in False, True:
+          # TF2 does not support placeholders under eager so we skip it
+          for dynamic_shape in set([False, not tf2.enabled()]):
             for testcase in _get_tensordot_tests(dtype, rank_a, rank_b,
                                                  num_dims, dynamic_shape):
               name = "%s_%s_%s_%s_%s_%s" % (testcase.__name__, dtype.__name__,
diff --git a/tensorflow/python/kernel_tests/unicode_decode_op_test.py b/tensorflow/python/kernel_tests/unicode_decode_op_test.py
index c165021eea3eba54fbc77aa328acebaccd844a74..9a59f8a7acb8f87381399a556411d523a49d5d37 100644
--- a/tensorflow/python/kernel_tests/unicode_decode_op_test.py
+++ b/tensorflow/python/kernel_tests/unicode_decode_op_test.py
@@ -19,134 +19,686 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from absl.testing import parameterized
+import numpy as np
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import errors_impl as errors
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_string_ops
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_string_ops
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import test
 
 
-# Account for python2 and python3 execution of the test.
-def codepoint(s):
-  if isinstance(s, bytes):
-    return ord(s.decode("utf-8"))
-  elif isinstance(s, str):
-    return ord(s)
-
-
-class UnicodeDecodeTest(test.TestCase):
-
-  def testBatchDecode(self):
-    text = constant_op.constant(
-        ["仅今年前", "分享介面終於迎來更新"])
-    row_splits, utf8_text, offsets = gen_string_ops.unicode_decode_with_offsets(
-        text, "utf-8")
-
-    with self.test_session():
-      self.assertAllEqual([
-          codepoint("仅"),
-          codepoint("今"),
-          codepoint("年"),
-          codepoint("前"),
-          codepoint("分"),
-          codepoint("享"),
-          codepoint("介"),
-          codepoint("面"),
-          codepoint("終"),
-          codepoint("於"),
-          codepoint("迎"),
-          codepoint("來"),
-          codepoint("更"),
-          codepoint("新")
-      ],
-                          self.evaluate(utf8_text).tolist())
-      self.assertAllEqual([0, 4, 14], self.evaluate(row_splits).tolist())
-      self.assertAllEqual([0, 3, 6, 9, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27],
-                          self.evaluate(offsets).tolist())
-
-  def testBasicDecodeWithOffset(self):
-    text = constant_op.constant(["仅今年前"])
-    row_splits, utf8_text, starts = gen_string_ops.unicode_decode_with_offsets(
-        text, "utf-8")
-
-    with self.test_session():
-      self.assertAllEqual([
-          codepoint("仅"),
-          codepoint("今"),
-          codepoint("年"),
-          codepoint("前"),
-      ],
-                          self.evaluate(utf8_text).tolist())
-      self.assertAllEqual(self.evaluate(row_splits).tolist(), [0, 4])
-      self.assertAllEqual(self.evaluate(starts).tolist(), [0, 3, 6, 9])
-
-  @test_util.run_deprecated_v1
-  def testStrictError(self):
-    text = constant_op.constant([b"\xFEED"])
-    _, error, _ = gen_string_ops.unicode_decode_with_offsets(
-        text, "utf-8", errors="strict")
-
-    with self.assertRaises(errors.InvalidArgumentError):
-      with self.test_session():
-        self.evaluate(error)
-
-  def testReplaceOnError(self):
-    text = constant_op.constant([b"\xFE"])
-
-    _, utf8_text, _ = gen_string_ops.unicode_decode_with_offsets(
-        text, "utf-8", errors="replace")
-
-    with self.test_session():
-      self.assertAllEqual(self.evaluate(utf8_text).tolist(), [65533])
-
-  @test_util.run_deprecated_v1
-  def testBadReplacementChar(self):
-    text = constant_op.constant([b"\xFE"])
-    _, error, _ = gen_string_ops.unicode_decode_with_offsets(
-        text, "utf-8", errors="replace", replacement_char=11141111)
-
-    with self.assertRaises(errors.InvalidArgumentError):
-      with self.test_session():
-        self.evaluate(error)
-
-  def testIgnoreOnError(self):
-    text = constant_op.constant([b"\xFEhello"])
-
-    _, utf8_text, _ = gen_string_ops.unicode_decode_with_offsets(
-        text, "utf-8", errors="ignore")
-
-    with self.test_session():
-      self.assertAllEqual(self.evaluate(utf8_text).tolist(), [
-          codepoint("h"),
-          codepoint("e"),
-          codepoint("l"),
-          codepoint("l"),
-          codepoint("o")
-      ])
-
-  @test_util.run_deprecated_v1
-  def testBadErrorPolicy(self):
-    text = constant_op.constant(["hippopotamus"])
-
-    with self.assertRaises(ValueError):
-      _, _, _ = gen_string_ops.unicode_decode_with_offsets(
-          text, "utf-8", errors="oranguatan")
-
-  def testReplaceControlChars(self):
-    text = constant_op.constant(["\x02仅今年前"])
-    row_splits, utf8_text, _ = gen_string_ops.unicode_decode_with_offsets(
-        text, "utf-8", replace_control_characters=True)
-
-    with self.test_session():
-      self.assertAllEqual([
-          65533,
-          codepoint("仅"),
-          codepoint("今"),
-          codepoint("年"),
-          codepoint("前"),
-      ],
-                          self.evaluate(utf8_text).tolist())
-      self.assertAllEqual([0, 5], self.evaluate(row_splits).tolist())
+def _nested_encode(x, encoding):
+  """Encode each string in a nested list with `encoding`."""
+  if isinstance(x, list):
+    return [_nested_encode(v, encoding) for v in x]
+  else:
+    return x.encode(encoding)
+
+
+def _nested_codepoints(x):
+  """Replace each string in a nested list with a list of its codepoints."""
+  # Works for Python 2 and 3, and for both UCS2 and UCS4 builds
+  if isinstance(x, list):
+    return [_nested_codepoints(v) for v in x]
+  else:
+    b = list(x.encode("utf-32-be"))
+    if any(isinstance(c, str) for c in b):
+      b = [ord(c) for c in b]
+    return [(b0 << 24) + (b1 << 16) + (b2 << 8) + b3
+            for b0, b1, b2, b3 in zip(b[::4], b[1::4], b[2::4], b[3::4])]
+
+
+def _nested_offsets(x, encoding):
+  """Replace each string in a nested list with a list of start offsets."""
+  if isinstance(x, list):
+    return [_nested_offsets(v, encoding) for v in x]
+  else:
+    if not x:
+      return []
+    encoded_x = x.encode("utf-32-be")
+    encoded_chars = [encoded_x[i:i + 4] for i in range(0, len(encoded_x), 4)]
+    char_lens = [
+        len(c.decode("utf-32-be").encode(encoding)) for c in encoded_chars
+    ]
+    return [0] + np.cumsum(char_lens).tolist()[:-1]
+
+
+def _nested_splitchars(x, encoding):
+  """Replace each string in a nested list with a list of char substrings."""
+  if isinstance(x, list):
+    return [_nested_splitchars(v, encoding) for v in x]
+  else:
+    b = x.encode("utf-32-be")
+    chars = zip(b[::4], b[1::4], b[2::4], b[3::4])
+    if str is bytes:
+      return [b"".join(c).decode("utf-32-be").encode(encoding) for c in chars]
+    else:
+      return [bytes(c).decode("utf-32-be").encode(encoding) for c in chars]
+
+
+def _make_sparse_tensor(indices, values, dense_shape, dtype=np.int32):
+  return sparse_tensor.SparseTensorValue(
+      np.array(indices, np.int64), np.array(values, dtype),
+      np.array(dense_shape, np.int64))
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class UnicodeDecodeTest(ragged_test_util.RaggedTensorTestCase,
+                        parameterized.TestCase):
+
+  def testScalarDecode(self):
+    text = constant_op.constant(u"仅今年前".encode("utf-8"))
+    chars = ragged_string_ops.unicode_decode(text, "utf-8")
+    self.assertAllEqual(chars, [ord(c) for c in u"仅今年前"])
+
+  def testScalarDecodeWithOffset(self):
+    text = constant_op.constant(u"仅今年前".encode("utf-8"))
+    chars, starts = ragged_string_ops.unicode_decode_with_offsets(text, "utf-8")
+    self.assertAllEqual(chars, [ord(c) for c in u"仅今年前"])
+    self.assertAllEqual(starts, [0, 3, 6, 9])
+
+  def testVectorDecode(self):
+    text = constant_op.constant([u"仅今年前".encode("utf-8"), b"hello"])
+    chars = ragged_string_ops.unicode_decode(text, "utf-8")
+    expected_chars = [[ord(c) for c in u"仅今年前"],
+                      [ord(c) for c in u"hello"]]
+    self.assertRaggedEqual(chars, expected_chars)
+
+  def testVectorDecodeWithOffset(self):
+    text = constant_op.constant([u"仅今年前".encode("utf-8"), b"hello"])
+    chars, starts = ragged_string_ops.unicode_decode_with_offsets(text, "utf-8")
+    expected_chars = [[ord(c) for c in u"仅今年前"],
+                      [ord(c) for c in u"hello"]]
+    self.assertRaggedEqual(chars, expected_chars)
+    self.assertRaggedEqual(starts, [[0, 3, 6, 9], [0, 1, 2, 3, 4]])
+
+  @parameterized.parameters([
+      {"texts": u"仅今年前"},
+      {"texts": [u"G\xf6\xf6dnight", u"\U0001f60a"]},
+      {"texts": ["Hello", "world", "", u"👍"]},
+      {"texts": [["Hi", "there"], ["", u"\U0001f60a"]], "ragged_rank": 0},
+      {"texts": [["Hi", "there", ""], [u"😊"]], "ragged_rank": 1},
+      {"texts": [[[u"😊", u"🤠🧐"], []], [[u"🤓👻🤖"]]], "ragged_rank": 2},
+      {"texts": []}
+  ])  # pyformat: disable
+  def testBasicDecode(self, texts, ragged_rank=None):
+    input_tensor = ragged_factory_ops.constant_value(
+        _nested_encode(texts, "UTF-8"), ragged_rank=ragged_rank, dtype=bytes)
+    result = ragged_string_ops.unicode_decode(input_tensor, "UTF-8")
+    expected = _nested_codepoints(texts)
+    self.assertRaggedEqual(expected, result)
+
+  @parameterized.parameters([
+      {"texts": u"仅今年前"},
+      {"texts": [u"G\xf6\xf6dnight", u"\U0001f60a"]},
+      {"texts": ["Hello", "world", "", u"👍"]},
+      {"texts": [["Hi", "there"], ["", u"\U0001f60a"]], "ragged_rank": 0},
+      {"texts": [["Hi", "there", ""], [u"😊"]], "ragged_rank": 1},
+      {"texts": [[[u"😊", u"🤠🧐"], []], [[u"🤓👻🤖"]]], "ragged_rank": 2},
+      {"texts": []}
+  ])  # pyformat: disable
+  def testBasicDecodeWithOffsets(self, texts, ragged_rank=None):
+    input_tensor = ragged_factory_ops.constant_value(
+        _nested_encode(texts, "UTF-8"), ragged_rank=ragged_rank, dtype=bytes)
+    result = ragged_string_ops.unicode_decode_with_offsets(
+        input_tensor, "UTF-8")
+    expected_codepoints = _nested_codepoints(texts)
+    expected_offsets = _nested_offsets(texts, "UTF-8")
+    self.assertRaggedEqual(expected_codepoints, result[0])
+    self.assertRaggedEqual(expected_offsets, result[1])
+
+  def testDocstringExamples(self):
+    texts = [s.encode("utf8") for s in [u"G\xf6\xf6dnight", u"\U0001f60a"]]
+    codepoints1 = ragged_string_ops.unicode_decode(texts, "UTF-8")
+    codepoints2, offsets = ragged_string_ops.unicode_decode_with_offsets(
+        texts, "UTF-8")
+    self.assertRaggedEqual(
+        codepoints1, [[71, 246, 246, 100, 110, 105, 103, 104, 116], [128522]])
+    self.assertRaggedEqual(
+        codepoints2, [[71, 246, 246, 100, 110, 105, 103, 104, 116], [128522]])
+    self.assertRaggedEqual(offsets, [[0, 1, 3, 5, 6, 7, 8, 9, 10], [0]])
+
+  @parameterized.parameters([
+      dict(
+          texts=["Hello", "world", "", u"👍"],
+          expected=_make_sparse_tensor(
+              indices=[[0, 0], [0, 1], [0, 2], [0, 3], [0, 4], [1, 0], [1, 1],
+                       [1, 2], [1, 3], [1, 4], [3, 0]],
+              values=[72, 101, 108, 108, 111, 119, 111, 114, 108, 100, 128077],
+              dense_shape=[4, 5])),
+      dict(
+          texts=[["Hi", "there"], ["", u"\U0001f60a"]],
+          expected=_make_sparse_tensor(
+              indices=[[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 1, 1], [0, 1, 2],
+                       [0, 1, 3], [0, 1, 4], [1, 1, 0]],
+              values=[72, 105, 116, 104, 101, 114, 101, 128522],
+              dense_shape=[2, 2, 5])),
+      dict(
+          texts=[],
+          expected=_make_sparse_tensor(np.zeros([0, 2], np.int64), [], [0, 0])),
+  ])
+  def testDecodeWithSparseOutput(self, texts, expected):
+    input_tensor = np.array(_nested_encode(texts, "UTF-8"), dtype=bytes)
+    result = ragged_string_ops.unicode_decode(input_tensor, "UTF-8").to_sparse()
+    self.assertIsInstance(result, sparse_tensor.SparseTensor)
+    self.assertAllEqual(expected.indices, result.indices)
+    self.assertAllEqual(expected.values, result.values)
+    self.assertAllEqual(expected.dense_shape, result.dense_shape)
+
+  @parameterized.parameters([
+      dict(
+          texts=["Hello", "world", "", u"👍"],
+          expected=[[72, 101, 108, 108, 111], [119, 111, 114, 108, 100],
+                    [-1, -1, -1, -1, -1], [128077, -1, -1, -1, -1]]),
+      dict(
+          texts=[["Hi", "there"], ["", u"\U0001f60a"]],
+          expected=[[[72, 105, -1, -1, -1], [116, 104, 101, 114, 101]],
+                    [[-1, -1, -1, -1, -1], [128522, -1, -1, -1, -1]]],
+          ragged_rank=0),
+      dict(
+          texts=[["Hi", "there", ""], [u"😊"]],
+          expected=[[[72, 105, -1, -1, -1],
+                     [116, 104, 101, 114, 101],
+                     [-1, -1, -1, -1, -1]],
+                    [[128522, -1, -1, -1, -1],
+                     [-1, -1, -1, -1, -1],
+                     [-1, -1, -1, -1, -1]]]),
+      dict(
+          texts=[[[u"😊", u"🤠🧐"], []], [[u"🤓👻🤖"]]],
+          expected=[
+              [[[128522, -1, -1], [129312, 129488, -1]],
+               [[-1, -1, -1], [-1, -1, -1]]],
+              [[[129299, 128123, 129302], [-1, -1, -1]],
+               [[-1, -1, -1], [-1, -1, -1]]]]),
+      dict(texts=[], expected=np.zeros([0, 0], np.int64)),
+  ])  # pyformat: disable
+  def testDecodeWithPaddedOutput(self, texts, expected, ragged_rank=None):
+    input_tensor = ragged_factory_ops.constant_value(
+        _nested_encode(texts, "UTF-8"), ragged_rank=ragged_rank, dtype=bytes)
+    result = ragged_string_ops.unicode_decode(
+        input_tensor, "UTF-8").to_tensor(default_value=-1)
+    self.assertAllEqual(expected, result)
+
+  @parameterized.parameters([
+      dict(
+          input=[b"\xFE", b"hello", b"==\xFF==", b"world"],
+          input_encoding="UTF-8",
+          errors="replace",
+          expected=[[65533], [104, 101, 108, 108, 111],
+                    [61, 61, 65533, 61, 61], [119, 111, 114, 108, 100]]),
+      dict(
+          input=[b"\xFE", b"hello", b"==\xFF==", b"world"],
+          input_encoding="UTF-8",
+          errors="replace",
+          replacement_char=0,
+          expected=[[0], [104, 101, 108, 108, 111],
+                    [61, 61, 0, 61, 61], [119, 111, 114, 108, 100]]),
+      dict(
+          input=[b"\xFE", b"hello", b"==\xFF==", b"world"],
+          input_encoding="UTF-8",
+          errors="ignore",
+          expected=[[], [104, 101, 108, 108, 111],
+                    [61, 61, 61, 61], [119, 111, 114, 108, 100]]),
+      dict(
+          input=[b"\x00", b"hello", b"==\x01==", b"world"],
+          input_encoding="UTF-8",
+          replace_control_characters=True,
+          expected=[[65533], [104, 101, 108, 108, 111],
+                    [61, 61, 65533, 61, 61], [119, 111, 114, 108, 100]]),
+      dict(
+          input=[b"\x00", b"hello", b"==\x01==", b"world"],
+          input_encoding="UTF-8",
+          replace_control_characters=True,
+          replacement_char=0,
+          expected=[[0], [104, 101, 108, 108, 111],
+                    [61, 61, 0, 61, 61], [119, 111, 114, 108, 100]]),
+  ])  # pyformat: disable
+  def testErrorModes(self, expected=None, **args):
+    result = ragged_string_ops.unicode_decode(**args)
+    self.assertRaggedEqual(expected, result)
+
+  @parameterized.parameters([
+      dict(
+          input=[b"\xFE", b"hello", b"==\xFF==", b"world"],
+          input_encoding="UTF-8",
+          errors="replace",
+          expected=[[65533], [104, 101, 108, 108, 111],
+                    [61, 61, 65533, 61, 61], [119, 111, 114, 108, 100]],
+          expected_offsets=[[0], [0, 1, 2, 3, 4],
+                            [0, 1, 2, 3, 4], [0, 1, 2, 3, 4]]),
+      dict(
+          input=[b"\xFE", b"hello", b"==\xFF==", b"world"],
+          input_encoding="UTF-8",
+          errors="replace",
+          replacement_char=0,
+          expected=[[0], [104, 101, 108, 108, 111],
+                    [61, 61, 0, 61, 61], [119, 111, 114, 108, 100]],
+          expected_offsets=[[0], [0, 1, 2, 3, 4],
+                            [0, 1, 2, 3, 4], [0, 1, 2, 3, 4]]),
+      dict(
+          input=[b"\xFE", b"hello", b"==\xFF==", b"world"],
+          input_encoding="UTF-8",
+          errors="ignore",
+          expected=[[], [104, 101, 108, 108, 111],
+                    [61, 61, 61, 61], [119, 111, 114, 108, 100]],
+          expected_offsets=[[], [0, 1, 2, 3, 4],
+                            [0, 1, 3, 4], [0, 1, 2, 3, 4]]),
+      dict(
+          input=[b"\x00", b"hello", b"==\x01==", b"world"],
+          input_encoding="UTF-8",
+          replace_control_characters=True,
+          expected=[[65533], [104, 101, 108, 108, 111],
+                    [61, 61, 65533, 61, 61], [119, 111, 114, 108, 100]],
+          expected_offsets=[[0], [0, 1, 2, 3, 4],
+                            [0, 1, 2, 3, 4], [0, 1, 2, 3, 4]]),
+      dict(
+          input=[b"\x00", b"hello", b"==\x01==", b"world"],
+          input_encoding="UTF-8",
+          replace_control_characters=True,
+          replacement_char=0,
+          expected=[[0], [104, 101, 108, 108, 111],
+                    [61, 61, 0, 61, 61], [119, 111, 114, 108, 100]],
+          expected_offsets=[[0], [0, 1, 2, 3, 4],
+                            [0, 1, 2, 3, 4], [0, 1, 2, 3, 4]]),
+  ])  # pyformat: disable
+  def testErrorModesWithOffsets(self,
+                                expected=None,
+                                expected_offsets=None,
+                                **args):
+    result = ragged_string_ops.unicode_decode_with_offsets(**args)
+    self.assertRaggedEqual(result[0], expected)
+    self.assertRaggedEqual(result[1], expected_offsets)
+
+  @parameterized.parameters(
+      ("UTF-8", [u"こんにちは", u"你好", u"Hello"]),
+      ("UTF-16-BE", [u"こんにちは", u"你好", u"Hello"]),
+      ("UTF-32-BE", [u"こんにちは", u"你好", u"Hello"]),
+      ("US-ASCII", [u"Hello", "world"]),
+      ("ISO-8859-1", [u"ÀÈÓ", "AEO"]),
+      ("SHIFT-JIS", [u"Hello", u"こんにちは"]),
+  )
+  def testDecodeWithDifferentEncodings(self, encoding, texts):
+    expected = _nested_codepoints(texts)
+    input_tensor = constant_op.constant(_nested_encode(texts, encoding))
+    result = ragged_string_ops.unicode_decode(input_tensor, encoding)
+    self.assertRaggedEqual(expected, result)
+
+  @parameterized.parameters(
+      ("UTF-8", [u"こんにちは", u"你好", u"Hello"]),
+      ("UTF-16-BE", [u"こんにちは", u"你好", u"Hello"]),
+      ("UTF-32-BE", [u"こんにちは", u"你好", u"Hello"]),
+      ("US-ASCII", [u"Hello", "world"]),
+      ("ISO-8859-1", [u"ÀÈÓ", "AEO"]),
+      ("SHIFT-JIS", [u"Hello", u"こんにちは"]),
+  )
+  def testDecodeWithOffsetsWithDifferentEncodings(self, encoding, texts):
+    expected_codepoints = _nested_codepoints(texts)
+    expected_offsets = _nested_offsets(texts, encoding)
+    input_tensor = constant_op.constant(_nested_encode(texts, encoding))
+    result = ragged_string_ops.unicode_decode_with_offsets(
+        input_tensor, encoding)
+    self.assertRaggedEqual(expected_codepoints, result[0])
+    self.assertRaggedEqual(expected_offsets, result[1])
+
+  @parameterized.parameters([
+      dict(input=[b"\xFEED"],
+           errors="strict",
+           input_encoding="UTF-8",
+           exception=errors.InvalidArgumentError,
+           message="Invalid formatting on input string"),
+      dict(input="x",
+           input_encoding="UTF-8",
+           replacement_char=11141111,
+           exception=errors.InvalidArgumentError,
+           message="replacement_char out of unicode codepoint range"),
+      dict(input="x",
+           input_encoding="UTF-8",
+           errors="oranguatan",
+           exception=(ValueError, errors.InvalidArgumentError)),
+  ])  # pyformat: disable
+  def testExceptions(self, exception=None, message=None, **args):
+    with self.assertRaisesRegexp(exception, message):
+      self.evaluate(ragged_string_ops.unicode_decode(**args))
+
+  def testUnknownRankError(self):
+    if context.executing_eagerly():
+      return
+    s = array_ops.placeholder(dtypes.string)
+    message = "Rank of `input` must be statically known."
+    with self.assertRaisesRegexp(ValueError, message):
+      self.evaluate(ragged_string_ops.unicode_decode(s, input_encoding="UTF-8"))
+
+  @parameterized.parameters([
+      dict(
+          doc="Single string",
+          input=_nested_encode([u"仅今年前"], "utf-8"),
+          input_encoding="UTF-8",
+          expected_char_values=_nested_codepoints(u"仅今年前"),
+          expected_row_splits=[0, 4],
+          expected_char_to_byte_starts=[0, 3, 6, 9]),
+      dict(
+          doc="Multiple strings",
+          input=_nested_encode([u"仅今年前", u"你好"], "utf-8"),
+          input_encoding="UTF-8",
+          expected_char_values=_nested_codepoints(u"仅今年前你好"),
+          expected_row_splits=[0, 4, 6],
+          expected_char_to_byte_starts=[0, 3, 6, 9, 0, 3]),
+      dict(
+          doc="errors=replace",
+          input=b"=\xFE=",
+          input_encoding="UTF-8",
+          errors="replace",
+          expected_char_values=[61, 65533, 61],
+          expected_row_splits=[0, 3],
+          expected_char_to_byte_starts=[0, 1, 2]),
+      dict(
+          doc="errors=ignore",
+          input=b"=\xFE=",
+          input_encoding="UTF-8",
+          errors="ignore",
+          expected_char_values=[61, 61],
+          expected_row_splits=[0, 2],
+          expected_char_to_byte_starts=[0, 2]),
+  ])
+  def testDecodeGenOp(self,
+                      doc,
+                      expected_row_splits=None,
+                      expected_char_values=None,
+                      expected_char_to_byte_starts=None,
+                      **args):
+    """Test for the c++ interface (gen_string_ops.unicode_decode)."""
+    result = gen_string_ops.unicode_decode_with_offsets(**args)
+    self.assertAllEqual(expected_row_splits, result.row_splits)
+    self.assertAllEqual(expected_char_values, result.char_values)
+    self.assertAllEqual(expected_char_to_byte_starts,
+                        result.char_to_byte_starts)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class UnicodeSplitTest(ragged_test_util.RaggedTensorTestCase,
+                       parameterized.TestCase):
+
+  def testScalarSplit(self):
+    text = constant_op.constant(u"仅今年前".encode("UTF-8"))
+    chars = ragged_string_ops.unicode_split(text, "UTF-8")
+    self.assertAllEqual(chars, [c.encode("UTF-8") for c in u"仅今年前"])
+
+  def testScalarSplitWithOffset(self):
+    text = constant_op.constant(u"仅今年前".encode("UTF-8"))
+    chars, starts = ragged_string_ops.unicode_split_with_offsets(text, "UTF-8")
+    self.assertAllEqual(chars, [c.encode("UTF-8") for c in u"仅今年前"])
+    self.assertAllEqual(starts, [0, 3, 6, 9])
+
+  def testVectorSplit(self):
+    text = constant_op.constant([u"仅今年前".encode("UTF-8"), b"hello"])
+    chars = ragged_string_ops.unicode_split(text, "UTF-8")
+    expected_chars = [[c.encode("UTF-8") for c in u"仅今年前"],
+                      [c.encode("UTF-8") for c in u"hello"]]
+    self.assertRaggedEqual(chars, expected_chars)
+
+  def testVectorSplitWithOffset(self):
+    text = constant_op.constant([u"仅今年前".encode("UTF-8"), b"hello"])
+    chars, starts = ragged_string_ops.unicode_split_with_offsets(text, "UTF-8")
+    expected_chars = [[c.encode("UTF-8") for c in u"仅今年前"],
+                      [c.encode("UTF-8") for c in u"hello"]]
+    self.assertRaggedEqual(chars, expected_chars)
+    self.assertRaggedEqual(starts, [[0, 3, 6, 9], [0, 1, 2, 3, 4]])
+
+  @parameterized.parameters([
+      {"texts": u"仅今年前"},
+      {"texts": [u"G\xf6\xf6dnight", u"\U0001f60a"]},
+      {"texts": ["Hello", "world", "", u"👍"]},
+      {"texts": [["Hi", "there"], ["", u"\U0001f60a"]], "ragged_rank": 0},
+      {"texts": [["Hi", "there", ""], [u"😊"]], "ragged_rank": 1},
+      {"texts": [[[u"😊", u"🤠🧐"], []], [[u"🤓👻🤖"]]], "ragged_rank": 2},
+      {"texts": []}
+  ])  # pyformat: disable
+  def testBasicSplit(self, texts, ragged_rank=None):
+    input_tensor = ragged_factory_ops.constant_value(
+        _nested_encode(texts, "UTF-8"), ragged_rank=ragged_rank, dtype=bytes)
+    result = ragged_string_ops.unicode_split(input_tensor, "UTF-8")
+    expected = _nested_splitchars(texts, "UTF-8")
+    self.assertRaggedEqual(expected, result)
+
+  @parameterized.parameters([
+      {"texts": u"仅今年前"},
+      {"texts": [u"G\xf6\xf6dnight", u"\U0001f60a"]},
+      {"texts": ["Hello", "world", "", u"👍"]},
+      {"texts": [["Hi", "there"], ["", u"\U0001f60a"]], "ragged_rank": 0},
+      {"texts": [["Hi", "there", ""], [u"😊"]], "ragged_rank": 1},
+      {"texts": [[[u"😊", u"🤠🧐"], []], [[u"🤓👻🤖"]]], "ragged_rank": 2},
+      {"texts": []}
+  ])  # pyformat: disable
+  def testBasicSplitWithOffsets(self, texts, ragged_rank=None):
+    input_tensor = ragged_factory_ops.constant_value(
+        _nested_encode(texts, "UTF-8"), ragged_rank=ragged_rank, dtype=bytes)
+    result = ragged_string_ops.unicode_split_with_offsets(input_tensor, "UTF-8")
+    expected_codepoints = _nested_splitchars(texts, "UTF-8")
+    expected_offsets = _nested_offsets(texts, "UTF-8")
+    self.assertRaggedEqual(expected_codepoints, result[0])
+    self.assertRaggedEqual(expected_offsets, result[1])
+
+  def testDocstringExamples(self):
+    texts = [s.encode("utf8") for s in [u"G\xf6\xf6dnight", u"\U0001f60a"]]
+    codepoints1 = ragged_string_ops.unicode_split(texts, "UTF-8")
+    codepoints2, offsets = ragged_string_ops.unicode_split_with_offsets(
+        texts, "UTF-8")
+    self.assertRaggedEqual(
+        codepoints1,
+        [[b"G", b"\xc3\xb6", b"\xc3\xb6", b"d", b"n", b"i", b"g", b"h", b"t"],
+         [b"\xf0\x9f\x98\x8a"]])
+    self.assertRaggedEqual(
+        codepoints2,
+        [[b"G", b"\xc3\xb6", b"\xc3\xb6", b"d", b"n", b"i", b"g", b"h", b"t"],
+         [b"\xf0\x9f\x98\x8a"]])
+    self.assertRaggedEqual(offsets, [[0, 1, 3, 5, 6, 7, 8, 9, 10], [0]])
+
+  @parameterized.parameters([
+      dict(
+          texts=["Hello", "world", "", u"👍"],
+          expected=_make_sparse_tensor(
+              indices=[[0, 0], [0, 1], [0, 2], [0, 3], [0, 4], [1, 0], [1, 1],
+                       [1, 2], [1, 3], [1, 4], [3, 0]],
+              values=[b"H", b"e", b"l", b"l", b"o",
+                      b"w", b"o", b"r", b"l", b"d", b"\xf0\x9f\x91\x8d"],
+              dense_shape=[4, 5],
+              dtype=bytes)),
+      dict(
+          texts=[["Hi", "there"], ["", u"\U0001f60a"]],
+          expected=_make_sparse_tensor(
+              indices=[[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 1, 1], [0, 1, 2],
+                       [0, 1, 3], [0, 1, 4], [1, 1, 0]],
+              values=[b"H", b"i", b"t", b"h", b"e", b"r", b"e",
+                      b"\xf0\x9f\x98\x8a"],
+              dense_shape=[2, 2, 5],
+              dtype=bytes)),
+      dict(
+          texts=[],
+          expected=_make_sparse_tensor(
+              np.zeros([0, 2], np.int64), [], [0, 0], dtype=bytes)),
+  ])  # pyformat: disable
+  def testSplitWithSparseOutput(self, texts, expected):
+    input_tensor = np.array(_nested_encode(texts, "UTF-8"), dtype=bytes)
+    result = ragged_string_ops.unicode_split(input_tensor, "UTF-8").to_sparse()
+    self.assertIsInstance(result, sparse_tensor.SparseTensor)
+    self.assertAllEqual(expected.indices, result.indices)
+    self.assertAllEqual(expected.values, result.values)
+    self.assertAllEqual(expected.dense_shape, result.dense_shape)
+
+  @parameterized.parameters([
+      dict(
+          texts=["Hello", "world", "", u"👍"],
+          expected=[[b"H", b"e", b"l", b"l", b"o"],
+                    [b"w", b"o", b"r", b"l", b"d"],
+                    ["", "", "", "", ""],
+                    [b"\xf0\x9f\x91\x8d", "", "", "", ""]]),
+      dict(
+          texts=[["Hi", "there"], ["", u"\U0001f60a"]],
+          expected=[[[b"H", b"i", "", "", ""],
+                     [b"t", b"h", b"e", b"r", b"e"]],
+                    [["", "", "", "", ""],
+                     [b"\xf0\x9f\x98\x8a", "", "", "", ""]]],
+          ragged_rank=0),
+      dict(
+          texts=[["Hi", "there", ""], [u"😊"]],
+          expected=[[[b"H", b"i", "", "", ""],
+                     [b"t", b"h", b"e", b"r", b"e"],
+                     ["", "", "", "", ""]],
+                    [[b"\xf0\x9f\x98\x8a", "", "", "", ""],
+                     ["", "", "", "", ""],
+                     ["", "", "", "", ""]]]),
+      dict(
+          texts=[[[u"😊", u"🤠🧐"], []], [[u"🤓👻🤖"]]],
+          expected=[[[[b"\xf0\x9f\x98\x8a", "", ""],
+                      [b"\xf0\x9f\xa4\xa0", b"\xf0\x9f\xa7\x90", ""]],
+                     [["", "", ""],
+                      ["", "", ""]]],
+                    [[[b"\xf0\x9f\xa4\x93", b"\xf0\x9f\x91\xbb",
+                       b"\xf0\x9f\xa4\x96"],
+                      ["", "", ""]],
+                     [["", "", ""],
+                      ["", "", ""]]]]),
+      dict(texts=[], expected=np.zeros([0, 0], np.int64)),
+  ])  # pyformat: disable
+  def testSplitWithPaddedOutput(self, texts, expected, ragged_rank=None):
+    input_tensor = ragged_factory_ops.constant_value(
+        _nested_encode(texts, "UTF-8"), ragged_rank=ragged_rank, dtype=bytes)
+    result = ragged_string_ops.unicode_split(
+        input_tensor, "UTF-8").to_tensor(default_value="")
+    self.assertAllEqual(np.array(expected, dtype=bytes), result)
+
+  @parameterized.parameters([
+      dict(
+          input=[b"\xFE", b"hello", b"==\xFF==", b"world"],
+          input_encoding="UTF-8",
+          errors="replace",
+          expected=[[b"\xef\xbf\xbd"],
+                    [b"h", b"e", b"l", b"l", b"o"],
+                    [b"=", b"=", b"\xef\xbf\xbd", b"=", b"="],
+                    [b"w", b"o", b"r", b"l", b"d"]]),
+      dict(
+          input=[b"\xFE", b"hello", b"==\xFF==", b"world"],
+          input_encoding="UTF-8",
+          errors="replace",
+          replacement_char=0,
+          expected=[[b"\x00"],
+                    [b"h", b"e", b"l", b"l", b"o"],
+                    [b"=", b"=", b"\x00", b"=", b"="],
+                    [b"w", b"o", b"r", b"l", b"d"]]),
+      dict(
+          input=[b"\xFE", b"hello", b"==\xFF==", b"world"],
+          input_encoding="UTF-8",
+          errors="ignore",
+          expected=[[],
+                    [b"h", b"e", b"l", b"l", b"o"],
+                    [b"=", b"=", b"=", b"="],
+                    [b"w", b"o", b"r", b"l", b"d"]]),
+  ])  # pyformat: disable
+  def testErrorModes(self, expected=None, **args):
+    result = ragged_string_ops.unicode_split(**args)
+    self.assertRaggedEqual(expected, result)
+
+  @parameterized.parameters([
+      dict(
+          input=[b"\xFE", b"hello", b"==\xFF==", b"world"],
+          input_encoding="UTF-8",
+          errors="replace",
+          expected=[[b"\xef\xbf\xbd"],
+                    [b"h", b"e", b"l", b"l", b"o"],
+                    [b"=", b"=", b"\xef\xbf\xbd", b"=", b"="],
+                    [b"w", b"o", b"r", b"l", b"d"]],
+          expected_offsets=[[0], [0, 1, 2, 3, 4],
+                            [0, 1, 2, 3, 4], [0, 1, 2, 3, 4]]),
+      dict(
+          input=[b"\xFE", b"hello", b"==\xFF==", b"world"],
+          input_encoding="UTF-8",
+          errors="replace",
+          replacement_char=0,
+          expected=[[b"\x00"],
+                    [b"h", b"e", b"l", b"l", b"o"],
+                    [b"=", b"=", b"\x00", b"=", b"="],
+                    [b"w", b"o", b"r", b"l", b"d"]],
+          expected_offsets=[[0], [0, 1, 2, 3, 4],
+                            [0, 1, 2, 3, 4], [0, 1, 2, 3, 4]]),
+      dict(
+          input=[b"\xFE", b"hello", b"==\xFF==", b"world"],
+          input_encoding="UTF-8",
+          errors="ignore",
+          expected=[[],
+                    [b"h", b"e", b"l", b"l", b"o"],
+                    [b"=", b"=", b"=", b"="],
+                    [b"w", b"o", b"r", b"l", b"d"]],
+          expected_offsets=[[], [0, 1, 2, 3, 4],
+                            [0, 1, 3, 4], [0, 1, 2, 3, 4]]),
+  ])  # pyformat: disable
+  def testErrorModesWithOffsets(self,
+                                expected=None,
+                                expected_offsets=None,
+                                **args):
+    result = ragged_string_ops.unicode_split_with_offsets(**args)
+    self.assertRaggedEqual(expected, result[0])
+    self.assertRaggedEqual(expected_offsets, result[1])
+
+  @parameterized.parameters(
+      ("UTF-8", [u"こんにちは", u"你好", u"Hello"]),
+      ("UTF-16-BE", [u"こんにちは", u"你好", u"Hello"]),
+      ("UTF-32-BE", [u"こんにちは", u"你好", u"Hello"]),
+  )
+  def testSplitWithDifferentEncodings(self, encoding, texts):
+    expected = _nested_splitchars(texts, encoding)
+    input_tensor = constant_op.constant(_nested_encode(texts, encoding))
+    result = ragged_string_ops.unicode_split(input_tensor, encoding)
+    self.assertRaggedEqual(expected, result)
+
+  @parameterized.parameters(
+      ("UTF-8", [u"こんにちは", u"你好", u"Hello"]),
+      ("UTF-16-BE", [u"こんにちは", u"你好", u"Hello"]),
+      ("UTF-32-BE", [u"こんにちは", u"你好", u"Hello"]),
+  )
+  def testSplitWithOffsetsWithDifferentEncodings(self, encoding, texts):
+    expected_codepoints = _nested_splitchars(texts, encoding)
+    expected_offsets = _nested_offsets(texts, encoding)
+    input_tensor = constant_op.constant(_nested_encode(texts, encoding))
+    result = ragged_string_ops.unicode_split_with_offsets(
+        input_tensor, encoding)
+    self.assertRaggedEqual(expected_codepoints, result[0])
+    self.assertRaggedEqual(expected_offsets, result[1])
+
+  @parameterized.parameters([
+      dict(input=[b"\xFEED"],
+           errors="strict",
+           input_encoding="UTF-8",
+           exception=errors.InvalidArgumentError,
+           message="Invalid formatting on input string"),
+      dict(input="x",
+           input_encoding="UTF-8",
+           replacement_char=11141111,
+           exception=errors.InvalidArgumentError,
+           message="replacement_char out of unicode codepoint range"),
+      dict(input="x",
+           input_encoding="UTF-8",
+           errors="oranguatan",
+           exception=(ValueError, errors.InvalidArgumentError)),
+  ])  # pyformat: disable
+  def testExceptions(self, exception=None, message=None, **args):
+    with self.assertRaisesRegexp(exception, message):
+      self.evaluate(ragged_string_ops.unicode_split(**args))
+
+  def testUnknownRankError(self):
+    if context.executing_eagerly():
+      return
+    s = array_ops.placeholder(dtypes.string)
+    message = "Rank of `input` must be statically known."
+    with self.assertRaisesRegexp(ValueError, message):
+      self.evaluate(ragged_string_ops.unicode_decode(s, input_encoding="UTF-8"))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/unicode_encode_op_test.py b/tensorflow/python/kernel_tests/unicode_encode_op_test.py
index a5a5c2017c6fd7bb92a1e110a74ecff056d04a44..2f3cd8a6577e06fc4b3de81585d8b48231ae7076 100644
--- a/tensorflow/python/kernel_tests/unicode_encode_op_test.py
+++ b/tensorflow/python/kernel_tests/unicode_encode_op_test.py
@@ -23,14 +23,25 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import errors_impl as errors
-from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops.ragged import ragged_factory_ops
 from tensorflow.python.ops.ragged import ragged_string_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.ops.ragged import ragged_tensor_value
 from tensorflow.python.platform import test
 
 
 class UnicodeEncodeOpTest(test.TestCase, parameterized.TestCase):
 
+  def assertRaggedEqual(self, rt, expected):
+    with self.cached_session() as sess:
+      value = sess.run(rt)
+      if isinstance(value, np.ndarray):
+        value = value.tolist()
+      elif isinstance(value, ragged_tensor_value.RaggedTensorValue):
+        value = value.to_list()
+      self.assertEqual(value, expected)
+
   def testScalar(self):
     with self.cached_session():
       with self.assertRaises(ValueError):
@@ -53,97 +64,80 @@ class UnicodeEncodeOpTest(test.TestCase, parameterized.TestCase):
   @parameterized.parameters("UTF-8", "UTF-16-BE", "UTF-32-BE")
   def testStrictErrors(self, encoding):
     test_value = np.array([72, 101, 2147483647, -1, 111], np.int32)
-    with self.cached_session():
+    with self.cached_session() as session:
       with self.assertRaises(errors.InvalidArgumentError):
-        ragged_string_ops.unicode_encode(test_value, encoding, "strict").eval()
+        session.run(
+            ragged_string_ops.unicode_encode(test_value, encoding, "strict"))
 
   @parameterized.parameters("UTF-8", "UTF-16-BE", "UTF-32-BE")
+  @test_util.run_v1_only("b/120545219")
   def testIgnoreErrors(self, encoding):
     test_value = np.array([72, 101, 2147483647, -1, 111], np.int32)
     expected_value = u"Heo".encode(encoding)
     unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding,
                                                          "ignore")
-    with self.cached_session():
-      result = unicode_encode_op.eval()
+    with self.cached_session() as session:
+      result = session.run(unicode_encode_op)
       self.assertIsInstance(result, bytes)
       self.assertAllEqual(result, expected_value)
 
   @parameterized.parameters("UTF-8", "UTF-16-BE", "UTF-32-BE")
+  @test_util.run_v1_only("b/120545219")
   def testReplaceErrors(self, encoding):
     test_value = np.array([72, 101, 2147483647, -1, 111], np.int32)
     expected_value = u"He\U0000fffd\U0000fffdo".encode(encoding)
     unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding,
                                                          "replace")
-    with self.cached_session():
-      result = unicode_encode_op.eval()
-      self.assertIsInstance(result, bytes)
-      self.assertAllEqual(result, expected_value)
+    self.assertRaggedEqual(unicode_encode_op, expected_value)
 
     # Test custom replacement character
     test_value = np.array([72, 101, 2147483647, -1, 111], np.int32)
     expected_value = u"Heooo".encode(encoding)
     unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding,
                                                          "replace", 111)
-    with self.cached_session():
-      result = unicode_encode_op.eval()
-      self.assertIsInstance(result, bytes)
-      self.assertAllEqual(result, expected_value)
+    self.assertRaggedEqual(unicode_encode_op, expected_value)
 
     # Verify "replace" is default
     test_value = np.array([72, 101, 2147483647, -1, 111], np.int32)
     expected_value = u"He\U0000fffd\U0000fffdo".encode(encoding)
     unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding)
-    with self.cached_session():
-      result = unicode_encode_op.eval()
-      self.assertIsInstance(result, bytes)
-      self.assertAllEqual(result, expected_value)
+    self.assertRaggedEqual(unicode_encode_op, expected_value)
 
     # Replacement_char must be within range
     test_value = np.array([72, 101, 2147483647, -1, 111], np.int32)
     unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding,
                                                          "replace", 1114112)
-    with self.cached_session():
-      with self.assertRaises(errors.InvalidArgumentError):
-        unicode_encode_op.eval()
+    with self.assertRaises(errors.InvalidArgumentError):
+      self.evaluate(unicode_encode_op)
 
   # -- regular Tensor tests -- #
 
   @parameterized.parameters("UTF-8", "UTF-16-BE", "UTF-32-BE")
+  @test_util.run_v1_only("b/120545219")
   def testVector(self, encoding):
     test_value = np.array([72, 101, 108, 108, 111], np.int32)
     expected_value = u"Hello".encode(encoding)
     unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding)
-    with self.cached_session():
-      result = unicode_encode_op.eval()
-      self.assertIsInstance(result, bytes)
-      self.assertAllEqual(result, expected_value)
+    self.assertRaggedEqual(unicode_encode_op, expected_value)
 
     test_value = np.array([72, 101, 195, 195, 128516], np.int32)
     expected_value = u"He\xc3\xc3\U0001f604".encode(encoding)
     unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding)
-    with self.cached_session():
-      result = unicode_encode_op.eval()
-      self.assertIsInstance(result, bytes)
-      self.assertAllEqual(result, expected_value)
+    self.assertRaggedEqual(unicode_encode_op, expected_value)
 
     # Single character string
     test_value = np.array([72], np.int32)
     expected_value = u"H".encode(encoding)
     unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding)
-    with self.cached_session():
-      result = unicode_encode_op.eval()
-      self.assertIsInstance(result, bytes)
-      self.assertAllEqual(result, expected_value)
+    self.assertRaggedEqual(unicode_encode_op, expected_value)
 
     test_value = np.array([128516], np.int32)
     expected_value = u"\U0001f604".encode(encoding)
     unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding)
-    with self.cached_session():
-      result = unicode_encode_op.eval()
-      self.assertIsInstance(result, bytes)
-      self.assertAllEqual(result, expected_value)
+    self.assertRaggedEqual(unicode_encode_op, expected_value)
 
   @parameterized.parameters("UTF-8", "UTF-16-BE", "UTF-32-BE")
+  @test_util.run_v1_only("b/120545219")
   def testMatrix(self, encoding):
     test_value = np.array(
         [[72, 128516, 108, 108, 111], [87, 128516, 114, 108, 100]], np.int32)
@@ -151,12 +145,10 @@ class UnicodeEncodeOpTest(test.TestCase, parameterized.TestCase):
         u"H\U0001f604llo".encode(encoding), u"W\U0001f604rld".encode(encoding)
     ]
     unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding)
-    with self.cached_session():
-      result = unicode_encode_op.eval()
-      self.assertIsInstance(unicode_encode_op, ops.Tensor)
-      self.assertAllEqual(result, expected_value)
+    self.assertAllEqual(unicode_encode_op, expected_value)
 
   @parameterized.parameters("UTF-8", "UTF-16-BE", "UTF-32-BE")
+  @test_util.run_v1_only("b/120545219")
   def test3DimMatrix(self, encoding):
     test_value = constant_op.constant(
         [[[72, 101, 108, 108, 111], [87, 111, 114, 108, 100]],
@@ -166,12 +158,10 @@ class UnicodeEncodeOpTest(test.TestCase, parameterized.TestCase):
                       [u"fixed".encode(encoding), u"words".encode(encoding)],
                       [u"Hyper".encode(encoding), u"cube.".encode(encoding)]]
     unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding)
-    with self.cached_session():
-      result = unicode_encode_op.eval()
-      self.assertIsInstance(unicode_encode_op, ops.Tensor)
-      self.assertAllEqual(result, expected_value)
+    self.assertRaggedEqual(unicode_encode_op, expected_value)
 
   @parameterized.parameters("UTF-8", "UTF-16-BE", "UTF-32-BE")
+  @test_util.run_v1_only("b/120545219")
   def test4DimMatrix(self, encoding):
     test_value = constant_op.constant(
         [[[[72, 101, 108, 108, 111]], [[87, 111, 114, 108, 100]]],
@@ -184,14 +174,12 @@ class UnicodeEncodeOpTest(test.TestCase, parameterized.TestCase):
                       [[u"Hyper".encode(encoding)],
                        [u"cube.".encode(encoding)]]]
     unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding)
-    with self.cached_session():
-      result = unicode_encode_op.eval()
-      self.assertIsInstance(unicode_encode_op, ops.Tensor)
-      self.assertAllEqual(result, expected_value)
+    self.assertRaggedEqual(unicode_encode_op, expected_value)
 
   # -- Ragged Tensor tests -- #
 
   @parameterized.parameters("UTF-8", "UTF-16-BE", "UTF-32-BE")
+  @test_util.run_v1_only("b/120545219")
   def testRaggedMatrix(self, encoding):
     test_value = ragged_factory_ops.constant(
         [[72, 195, 108, 108, 111], [87, 128516, 114, 108, 100, 46]], np.int32)
@@ -199,12 +187,10 @@ class UnicodeEncodeOpTest(test.TestCase, parameterized.TestCase):
         u"H\xc3llo".encode(encoding), u"W\U0001f604rld.".encode(encoding)
     ]
     unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding)
-    with self.cached_session():
-      result = unicode_encode_op.eval()
-      self.assertIsInstance(unicode_encode_op, ops.Tensor)
-      self.assertAllEqual(result, expected_value)
+    self.assertRaggedEqual(unicode_encode_op, expected_value)
 
   @parameterized.parameters("UTF-8", "UTF-16-BE", "UTF-32-BE")
+  @test_util.run_v1_only("b/120545219")
   def test3DimMatrixWithRagged2ndDim(self, encoding):
     test_value = ragged_factory_ops.constant(
         [[[72, 101, 108, 108, 111], [87, 111, 114, 108, 100]],
@@ -218,12 +204,10 @@ class UnicodeEncodeOpTest(test.TestCase, parameterized.TestCase):
                           u"cube.".encode(encoding)
                       ]]
     unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding)
-    with self.cached_session():
-      result = unicode_encode_op.eval()
-      self.assertEqual(unicode_encode_op.ragged_rank, 1)
-      self.assertAllEqual(result.tolist(), expected_value)
+    self.assertRaggedEqual(unicode_encode_op, expected_value)
 
   @parameterized.parameters("UTF-8", "UTF-16-BE", "UTF-32-BE")
+  @test_util.run_v1_only("b/120545219")
   def test3DimMatrixWithRagged3rdDim(self, encoding):
     test_value = ragged_factory_ops.constant(
         [[[72, 101, 108, 108, 111], [87, 111, 114, 108, 100, 46]],
@@ -235,12 +219,10 @@ class UnicodeEncodeOpTest(test.TestCase, parameterized.TestCase):
                           u"w\xc3rry, be".encode(encoding)
                       ], [u"\U0001f604".encode(encoding), u"".encode(encoding)]]
     unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding)
-    with self.cached_session():
-      result = unicode_encode_op.eval()
-      self.assertEqual(unicode_encode_op.ragged_rank, 1)
-      self.assertAllEqual(result.tolist(), expected_value)
+    self.assertRaggedEqual(unicode_encode_op, expected_value)
 
   @parameterized.parameters("UTF-8", "UTF-16-BE", "UTF-32-BE")
+  @test_util.run_v1_only("b/120545219")
   def test3DimMatrixWithRagged2ndAnd3rdDim(self, encoding):
     test_value = ragged_factory_ops.constant(
         [[[72, 101, 108, 108, 111], [87, 111, 114, 108, 100, 46]], [],
@@ -248,12 +230,10 @@ class UnicodeEncodeOpTest(test.TestCase, parameterized.TestCase):
     expected_value = [[u"Hello".encode(encoding), u"World.".encode(encoding)],
                       [], [u"\U0001f604".encode(encoding)]]
     unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding)
-    with self.cached_session():
-      result = unicode_encode_op.eval()
-      self.assertEqual(unicode_encode_op.ragged_rank, 1)
-      self.assertAllEqual(result.tolist(), expected_value)
+    self.assertRaggedEqual(unicode_encode_op, expected_value)
 
   @parameterized.parameters("UTF-8", "UTF-16-BE", "UTF-32-BE")
+  @test_util.run_v1_only("b/120545219")
   def test4DimRaggedMatrix(self, encoding):
     test_value = ragged_factory_ops.constant(
         [[[[72, 101, 108, 108, 111], [87, 111, 114, 108, 100]]],
@@ -261,40 +241,30 @@ class UnicodeEncodeOpTest(test.TestCase, parameterized.TestCase):
     expected_value = [[[u"Hello".encode(encoding), u"World".encode(encoding)]],
                       [[u"".encode(encoding)], [u"Hype".encode(encoding)]]]
     unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding)
-    with self.cached_session():
-      result = unicode_encode_op.eval()
-      self.assertEqual(unicode_encode_op.ragged_rank, 2)
-      self.assertAllEqual(result.tolist(), expected_value)
+    self.assertRaggedEqual(unicode_encode_op, expected_value)
 
   @parameterized.parameters("UTF-8", "UTF-16-BE", "UTF-32-BE")
+  @test_util.run_v1_only("b/120545219")
   def testRaggedMatrixWithMultiDimensionInnerValues(self, encoding):
-    test_inner_values = constant_op.constant([[[72, 101, 108, 108, 111],
-                                               [87, 111, 114, 108, 100]],
-                                              [[102, 105, 120, 101, 100],
-                                               [119, 111, 114, 100, 115]],
-                                              [[72, 121, 112, 101, 114],
-                                               [99, 117, 98, 101, 46]]])
+    test_flat_values = constant_op.constant([[[72, 101, 108, 108, 111],
+                                              [87, 111, 114, 108, 100]],
+                                             [[102, 105, 120, 101, 100],
+                                              [119, 111, 114, 100, 115]],
+                                             [[72, 121, 112, 101, 114],
+                                              [99, 117, 98, 101, 46]]])
     test_row_splits = [
         constant_op.constant([0, 2, 3], dtype=np.int64),
         constant_op.constant([0, 1, 1, 3], dtype=np.int64)
     ]
-    test_value = ragged_factory_ops.from_nested_row_splits(test_inner_values,
-                                                           test_row_splits)
+    test_value = ragged_tensor.RaggedTensor.from_nested_row_splits(
+        test_flat_values, test_row_splits)
     expected_value = [[[[u"Hello".encode(encoding), u"World".encode(encoding)]],
                        []],
                       [[[u"fixed".encode(encoding), u"words".encode(encoding)],
                         [u"Hyper".encode(encoding),
                          u"cube.".encode(encoding)]]]]
     unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding)
-    with self.cached_session():
-      result = unicode_encode_op.eval()
-      self.assertEqual(unicode_encode_op.ragged_rank, 2)
-      self.assertAllEqual(result.tolist(), expected_value)
-      # These next two assertions don't necessarily need to be here as they test
-      # internal representations and we already verified the value is correct.
-      self.assertAllEqual(len(result.nested_row_splits), len(test_row_splits))
-      self.assertEqual(unicode_encode_op.inner_values.shape.ndims,
-                       test_inner_values.shape.ndims - 1)
+    self.assertRaggedEqual(unicode_encode_op, expected_value)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/variable_scope_test.py b/tensorflow/python/kernel_tests/variable_scope_test.py
index 44d4bd5e30fa802212e68b153f0616ed2ff2be3a..451eb3853062203a190def09f432f9d9e12f2edd 100644
--- a/tensorflow/python/kernel_tests/variable_scope_test.py
+++ b/tensorflow/python/kernel_tests/variable_scope_test.py
@@ -237,7 +237,8 @@ class VariableScopeTest(test.TestCase):
         _ = d2(x)
         self.assertEqual(len(d2.variables), 2)
         v3, v4 = d2.variables
-        self.assertAllEqual([v1, v2], [v3, v4])
+        self.assertEqual(v1, v3)
+        self.assertEqual(v2, v4)
       f()
 
   # TODO(mihaimaruseac): Not converted to use wrap_function because of
@@ -1684,7 +1685,7 @@ class VariableScopeWithCustomGetterTest(test.TestCase):
       with variable_scope.variable_creator_scope(creator_b):
         variable_scope.variable(1.0, name="one_name")
 
-    self.assertAllEqual(variable_names, ["forced_name"])
+    self.assertEqual(variable_names[0], "forced_name")
 
     called = [False]
 
diff --git a/tensorflow/python/kernel_tests/variables_test.py b/tensorflow/python/kernel_tests/variables_test.py
index 08d885e8a87cc20314a7e9e812fd498c7e9da417..07807e89d0e60bf5e053e75618112e266a3ca882 100644
--- a/tensorflow/python/kernel_tests/variables_test.py
+++ b/tensorflow/python/kernel_tests/variables_test.py
@@ -43,7 +43,7 @@ from tensorflow.python.util import compat
 
 class VariablesTestCase(test.TestCase):
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testInitialization(self):
     with self.cached_session():
       var0 = variables.VariableV1(0.0)
@@ -66,12 +66,12 @@ class VariablesTestCase(test.TestCase):
       with self.assertRaisesOpError("Attempting to use uninitialized value"):
         self.evaluate(var1)
 
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       self.assertAllClose(0.0, self.evaluate(var0))
       self.assertAllClose(1.1, self.evaluate(var1))
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testInitializationOrder(self):
     with self.cached_session():
       rnd = variables.Variable(random_ops.random_uniform([3, 6]), name="rnd")
@@ -96,11 +96,11 @@ class VariablesTestCase(test.TestCase):
       self.assertEqual([3, 6], depdep.get_shape())
       self.assertEqual([3, 6], depdep.shape)
 
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
-      self.assertAllClose(rnd.eval(), self.evaluate(dep))
-      self.assertAllClose(rnd.eval() + self.evaluate(dep) + 2.0,
-                          self.evaluate(depdep))
+      self.assertAllClose(self.evaluate(rnd), self.evaluate(dep))
+      self.assertAllClose(
+          self.evaluate(rnd) + self.evaluate(dep) + 2.0, self.evaluate(depdep))
 
   def testIterable(self):
     with self.assertRaisesRegexp(TypeError, "not iterable"):
@@ -117,7 +117,7 @@ class VariablesTestCase(test.TestCase):
       plus_one = var.assign_add(1.0)
       minus_one = var.assign_sub(2.0)
       four = var.assign(4.0)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertAllClose(0.0, self.evaluate(var))
 
       self.assertAllClose(1.0, self.evaluate(plus_one))
@@ -136,7 +136,7 @@ class VariablesTestCase(test.TestCase):
       plus_one = var.assign_add(1.0)
       minus_one = var.assign_sub(2.0)
       four = var.assign(4.0)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertAllClose(0.0, self.evaluate(var))
 
       self.evaluate(plus_one)
@@ -166,7 +166,7 @@ class VariablesTestCase(test.TestCase):
       var = variables.Variable(zero)
       count_up_to = var.count_up_to(3)
 
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertEqual(0, self.evaluate(var))
 
       self.assertEqual(0, self.evaluate(count_up_to))
@@ -194,7 +194,7 @@ class VariablesTestCase(test.TestCase):
   def testCountUpToInt64(self):
     self._countUpToTest(dtypes.int64)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testControlDepsNone(self):
     with self.cached_session():
       c = constant_op.constant(1.0)
@@ -208,7 +208,7 @@ class VariablesTestCase(test.TestCase):
       self.assertEqual([], var_x.value().op.control_inputs)
       self.assertEqual([], var_x._ref().op.control_inputs)  # pylint: disable=protected-access
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testControlFlow(self):
     with self.cached_session() as sess:
       v0 = variables.Variable(0, name="v0")
@@ -245,7 +245,7 @@ class VariablesTestCase(test.TestCase):
       self.evaluate(v0.initializer)
       self.evaluate(add)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testControlFlowInitialization(self):
     """Expects an error if an initializer is in a control-flow scope."""
     def cond(i, _):
@@ -264,10 +264,10 @@ class VariablesTestCase(test.TestCase):
     with self.cached_session():
       var_x = variables.Variable(2.0)
       var_y = variables.Variable(3.0)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertAllClose(2.0, self.evaluate(var_x))
       self.assertAllClose(3.0, self.evaluate(var_y))
-      self.assertAllClose(5.0, math_ops.add(var_x, var_y).eval())
+      self.assertAllClose(5.0, self.evaluate(math_ops.add(var_x, var_y)))
 
   @test_util.run_deprecated_v1
   def testZeroSizeVarSameAsConst(self):
@@ -277,9 +277,9 @@ class VariablesTestCase(test.TestCase):
       variable_mul = math_ops.matmul(zero_size_const, zero_size_var)
       const_mul = math_ops.matmul(
           zero_size_const, zero_size_const, transpose_b=True)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variable_output = self.evaluate(variable_mul)
-      self.assertAllClose(const_mul.eval(), variable_output)
+      self.assertAllClose(self.evaluate(const_mul), variable_output)
       self.assertAllClose([[0., 0.], [0., 0.]], variable_output)
 
   @test_util.run_deprecated_v1
@@ -372,7 +372,7 @@ class VariablesTestCase(test.TestCase):
       matmul = var_m.__matmul__([[10.0], [20.0]])
       rmatmul = var_m.__rmatmul__([[10.0], [20.0]])
 
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertAllClose([2.0], self.evaluate(add))
       self.assertAllClose([3.0], self.evaluate(radd))
       self.assertAllClose([1.0], self.evaluate(sub))
@@ -409,10 +409,10 @@ class VariablesTestCase(test.TestCase):
   def testSession(self):
     with self.cached_session() as sess:
       var = variables.Variable([1, 12])
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertAllClose([1, 12], self.evaluate(var))
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testColocation(self):
     with ops.device("/job:ps"):
       var = variables.VariableV1(0, name="v")
@@ -421,7 +421,7 @@ class VariablesTestCase(test.TestCase):
     self.assertDeviceEqual("/job:ps", assign_op.device)
     self.assertEqual([b"loc:@v"], assign_op.op.colocation_groups())
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testInitializerFunction(self):
     value = [[-42], [133.7]]
     shape = [2, 1]
@@ -431,7 +431,7 @@ class VariablesTestCase(test.TestCase):
       v1 = variables.Variable(initializer, dtype=dtypes.float32)
       self.assertEqual(shape, v1.get_shape())
       self.assertEqual(shape, v1.shape)
-      self.assertAllClose(value, v1.initial_value.eval())
+      self.assertAllClose(value, self.evaluate(v1.initial_value))
       with self.assertRaises(errors_impl.FailedPreconditionError):
         self.evaluate(v1)
 
@@ -439,11 +439,11 @@ class VariablesTestCase(test.TestCase):
           math_ops.negative(v1.initialized_value()), dtype=dtypes.float32)
       self.assertEqual(v1.get_shape(), v2.get_shape())
       self.assertEqual(v1.shape, v2.shape)
-      self.assertAllClose(np.negative(value), v2.initial_value.eval())
+      self.assertAllClose(np.negative(value), self.evaluate(v2.initial_value))
 
       with self.assertRaises(errors_impl.FailedPreconditionError):
         self.evaluate(v2)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertAllClose(np.negative(value), self.evaluate(v2))
 
   def testConstraintArg(self):
@@ -459,16 +459,16 @@ class VariablesTestCase(test.TestCase):
           lambda: constant_op.constant(1.),
           constraint=constraint)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testNoRefDataRace(self):
     with self.cached_session():
       a = variables.Variable([1, 2, 3], dtype=dtypes.float32)
       b = variables.Variable(a.initialized_value() + 2)
       c = variables.Variable(b.initialized_value() + 2)
-      variables.global_variables_initializer().run()
-      self.assertAllEqual(a.eval(), [1, 2, 3])
-      self.assertAllEqual(b.eval(), [3, 4, 5])
-      self.assertAllEqual(c.eval(), [5, 6, 7])
+      self.evaluate(variables.global_variables_initializer())
+      self.assertAllEqual(self.evaluate(a), [1, 2, 3])
+      self.assertAllEqual(self.evaluate(b), [3, 4, 5])
+      self.assertAllEqual(self.evaluate(c), [5, 6, 7])
 
   @test_util.run_deprecated_v1
   def testInitializerFunctionDevicePlacement(self):
@@ -489,7 +489,7 @@ class VariablesTestCase(test.TestCase):
       for i in v2.initializer.inputs:
         self.assertEqual(expected_group_v2, i.op.colocation_groups())
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testVariableDefInitializedInstances(self):
     with ops.Graph().as_default(), self.cached_session() as sess:
       v_def = variables.Variable(
@@ -503,7 +503,7 @@ class VariablesTestCase(test.TestCase):
       # initialized_value should not rerun the initializer_op if the variable
       # has already been initialized elsewhere.
       self.evaluate(v.assign(1.0))
-      self.assertEqual(1.0, v.initialized_value().eval())
+      self.assertEqual(1.0, self.evaluate(v.initialized_value()))
 
     v_def.ClearField("initial_value_name")
     with ops.Graph().as_default(), self.cached_session() as sess:
@@ -537,12 +537,12 @@ class VariablesTestCase(test.TestCase):
   def testLoad(self):
     with self.cached_session():
       var = variables.Variable(np.zeros((5, 5), np.float32))
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       var.load(np.ones((5, 5), np.float32))
 
       self.assertAllClose(np.ones((5, 5), np.float32), self.evaluate(var))
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testRepr(self):
     var = variables.VariableV1(np.zeros((5, 5), np.float32), name="noop")
     self.assertEqual(
@@ -573,10 +573,10 @@ class IsInitializedTest(test.TestCase):
       _ = v, w
       uninited = variables.report_uninitialized_variables()
       self.assertAllEqual(np.array([b"v", b"w"]), self.evaluate(uninited))
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertEqual(0, self.evaluate(uninited).size)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testVariableList(self):
     with ops.Graph().as_default(), self.cached_session() as sess:
       v = variables.VariableV1([1, 2], name="v")
@@ -601,20 +601,20 @@ class IsInitializedTest(test.TestCase):
       b = variables.Variable(array_ops.ones([2, 2]))
       objective = math_ops.reduce_sum(b + math_ops.matmul(
           a, a, transpose_a=True))
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       do_opt = gradient_descent.GradientDescentOptimizer(0.1).minimize(
           objective)
       self.evaluate([do_opt])
       self.assertAllClose([[0.9, 0.9], [0.9, 0.9]], self.evaluate(b))
 
 
+@test_util.run_v1_only("b/120545219")
 class ObsoleteIsInitializedTest(test.TestCase):
 
   def testNoVars(self):
     with ops.Graph().as_default():
       self.assertEqual(None, variables.assert_variables_initialized())
 
-  @test_util.run_deprecated_v1
   def testVariables(self):
     with ops.Graph().as_default(), self.cached_session() as sess:
       v = variables.VariableV1([1, 2])
@@ -623,10 +623,9 @@ class ObsoleteIsInitializedTest(test.TestCase):
       inited = variables.assert_variables_initialized()
       with self.assertRaisesOpError("Attempting to use uninitialized value"):
         self.evaluate(inited)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.evaluate(inited)
 
-  @test_util.run_deprecated_v1
   def testVariableList(self):
     with ops.Graph().as_default(), self.cached_session() as sess:
       v = variables.VariableV1([1, 2])
@@ -766,36 +765,36 @@ class PartitionedVariableTest(test.TestCase):
       assign_list = pv_1.assign([c_0, c_1])
       assign_part_value = pv_1.assign_add(assign_ones)
       assign_part_var = pv_1.assign_sub(pv_0)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
-      self.assertEqual([1.0], plus_delta[0].eval())
+      self.assertEqual([1.0], self.evaluate(plus_delta[0]))
       self.assertEqual([1.0], self.evaluate(v0))
-      self.assertEqual([3.0], plus_delta[1].eval())
+      self.assertEqual([3.0], self.evaluate(plus_delta[1]))
       self.assertEqual([3.0], self.evaluate(v1))
 
-      self.assertEqual([-2.0], minus_delta[0].eval())
+      self.assertEqual([-2.0], self.evaluate(minus_delta[0]))
       self.assertEqual([-2.0], self.evaluate(v0))
-      self.assertEqual([-1.0], minus_delta[1].eval())
+      self.assertEqual([-1.0], self.evaluate(minus_delta[1]))
       self.assertEqual([-1.0], self.evaluate(v1))
 
-      self.assertEqual([1.0], assign_ones[0].eval())
+      self.assertEqual([1.0], self.evaluate(assign_ones[0]))
       self.assertEqual([1.0], self.evaluate(v0))
-      self.assertEqual([1.0], assign_ones[1].eval())
+      self.assertEqual([1.0], self.evaluate(assign_ones[1]))
       self.assertEqual([1.0], self.evaluate(v1))
 
-      self.assertEqual([2.0], assign_list[0].eval())
+      self.assertEqual([2.0], self.evaluate(assign_list[0]))
       self.assertEqual([2.0], self.evaluate(v2))
-      self.assertEqual([3.0], assign_list[1].eval())
+      self.assertEqual([3.0], self.evaluate(assign_list[1]))
       self.assertEqual([3.0], self.evaluate(v3))
 
-      self.assertEqual([3.0], assign_part_value[0].eval())
+      self.assertEqual([3.0], self.evaluate(assign_part_value[0]))
       self.assertEqual([3.0], self.evaluate(v2))
-      self.assertEqual([4.0], assign_part_value[1].eval())
+      self.assertEqual([4.0], self.evaluate(assign_part_value[1]))
       self.assertEqual([4.0], self.evaluate(v3))
 
-      self.assertEqual([2.0], assign_part_var[0].eval())
+      self.assertEqual([2.0], self.evaluate(assign_part_var[0]))
       self.assertEqual([2.0], self.evaluate(v2))
-      self.assertEqual([3.0], assign_part_var[1].eval())
+      self.assertEqual([3.0], self.evaluate(assign_part_var[1]))
       self.assertEqual([3.0], self.evaluate(v3))
 
 
diff --git a/tensorflow/python/kernel_tests/while_v2_test.py b/tensorflow/python/kernel_tests/while_v2_test.py
index 09cbeb1a0d55f39330b4494ccc22c647313a8ae3..cae459a34e934cc804a56f5738202377a1227274 100644
--- a/tensorflow/python/kernel_tests/while_v2_test.py
+++ b/tensorflow/python/kernel_tests/while_v2_test.py
@@ -53,6 +53,7 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
       self.assertEqual(self.evaluate(ret), 16.)
       self.assertSequenceEqual(self.evaluate(grad), [32.])
 
+  @test_util.run_v1_only("b/120545219")
   def testReturnSameStructureTrue(self):
     x = constant_op.constant(2.)
     ret = while_loop_v2(
@@ -145,7 +146,7 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
       self.assertSequenceEqual(self.evaluate(grad), [32.])
       self.assertSequenceEqual(self.evaluate(grad_grad), [48.])
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testPruning(self):
     x = constant_op.constant(1)
 
@@ -252,7 +253,7 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
       if op.type == "While":
         while_op = op
 
-    body_graph = while_v2._get_body_graph(while_op)
+    body_graph = while_v2._get_graph(while_op, "body")
     # body_graph.inputs: [counter_arg, x_arg, tl_arg, *accumulators]
     x_input_t = body_graph.inputs[1]
     accumulator_count = len(
@@ -283,22 +284,26 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
         self.assertListEqual(actual_tensor_shape.as_list(), shape)
 
     def GetAccumulatorForInputAtIndex(while_op, idx):
-      body_graph = while_v2._get_body_graph(while_op)
+      body_graph = while_v2._get_graph(while_op, "body")
       y_input_t = body_graph.inputs[idx]
       push_back_node = [c for c in y_input_t.consumers()
                         if c.type == "TensorListPushBack"][0]
       output_idx = body_graph.outputs.index(push_back_node.outputs[0])
       return while_op.outputs[output_idx]
 
-    x = constant_op.constant(2.)
+    x = array_ops.placeholder(dtype=dtypes.float32, shape=shape)
     y = array_ops.placeholder(dtype=dtypes.float32, shape=shape)
 
     # Forward pass.
-    ret = while_loop_v2(
-        lambda v, u: v < 8.,
-        lambda v, u: (v * v, u), [x, y],
-        return_same_structure=False)
+    ret = while_loop_v2(lambda v, u: v < 8.,
+                        lambda v, u: (math_ops.pow(v, u), u),
+                        [x, y],
+                        return_same_structure=True)
     while_op = ret[0].op.inputs[0].op
+    # Gradient pass.
+    grad = gradients_impl.gradients(ret[0], x)
+    grad_while_op = grad[0].op.inputs[0].op
+
     # Get the TensorList output of While op containing the accumulated values
     # of y.
     # while_op.inputs: [counter_arg, x_arg, y_arg, *accumulators]
@@ -307,14 +312,14 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
                                            element_dtype=dtypes.float32)
     MatchShape(val.shape)
 
-    # Gradient pass.
-    grad = gradients_impl.gradients(ret[1], y)
-    grad_while_op = grad[0].op.inputs[0].op
+    # Take second derivative to generate intermediate grad_while_op outputs
+    gradients_impl.gradients(grad, x)
+
     # Get the TensorList output of gradient While op containing the accumulated
-    # values of grad_y.
+    # values of grad_x (note that grad_x is needed by the second derivative).
     # grad_while_op.inputs:
     # [counter_arg, total_iters_arg, grad_x_arg, grad_y_arg, *other_args]
-    grad_output = GetAccumulatorForInputAtIndex(grad_while_op, 3)
+    grad_output = GetAccumulatorForInputAtIndex(grad_while_op, 2)
     _, val = list_ops.tensor_list_pop_back(grad_output,
                                            element_dtype=dtypes.float32)
     MatchShape(val.shape)
@@ -357,14 +362,13 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
   @test_util.enable_control_flow_v2
   @test_util.run_deprecated_v1
   def testWhileAndTensorArray(self):
-    with self.cached_session() as sess:
-      param = constant_op.constant(2.0)
-      y0 = constant_op.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name="elems")
-      # map_fn uses TensorArray internally.
-      r = functional_ops.map_fn(lambda x: math_ops.multiply(x, param), y0)
-      self.assertAllClose([2.0, 4.0, 6.0, 8.0, 10.0, 12.0], self.evaluate(r))
-      r = gradients_impl.gradients(r, param)[0]
-      self.assertAllClose(21.0, self.evaluate(r))
+    param = constant_op.constant(2.0)
+    y0 = constant_op.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name="elems")
+    # map_fn uses TensorArray internally.
+    r = functional_ops.map_fn(lambda x: math_ops.multiply(x, param), y0)
+    grad = gradients_impl.gradients(r, param)[0]
+    self.assertAllClose([2.0, 4.0, 6.0, 8.0, 10.0, 12.0], self.evaluate(r))
+    self.assertAllClose(21.0, self.evaluate(grad))
 
   @test_util.run_deprecated_v1
   def testNestedWhile(self):
@@ -438,6 +442,26 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
     # grad = gradients_impl.gradients(output, [n])
     # self.assertEqual(self.evaluate(grad), 3.5)
 
+  @test_util.run_deprecated_v1
+  def testForwardPassRewrite(self):
+    x = constant_op.constant(1.0, name="x")
+    output = while_v2.while_loop(lambda x: x < 10.0,
+                                 lambda x: x * 2.0,
+                                 [x])[0]
+    while_op = output.op.inputs[0].op
+    self.assertEqual(while_op.type, "While")
+    # outputs = [loop_counter, x]
+    self.assertLen(while_op.outputs, 2)
+
+    gradients_impl.gradients(output, x)
+    # while_op should have been rewritten to output 2.0 intermediate.
+    # outputs = [loop_counter, x, 2.0_accumulator, x_accumulator]
+    self.assertLen(while_op.outputs, 4)
+
+    gradients_impl.gradients(output, x)
+    # Computing the gradient again shouldn't rewrite while_op again.
+    self.assertLen(while_op.outputs, 4)
+
 
 def ScalarShape():
   return ops.convert_to_tensor([], dtype=dtypes.int32)
diff --git a/tensorflow/python/layers/base.py b/tensorflow/python/layers/base.py
index bfe591f875556c9dbcf3001bec4fe836bca3593f..5354d437b481195f81dba8f4c1bbf3d12e67d1a7 100644
--- a/tensorflow/python/layers/base.py
+++ b/tensorflow/python/layers/base.py
@@ -26,6 +26,7 @@ from tensorflow.python.keras.engine import base_layer
 from tensorflow.python.keras.engine import base_layer_utils
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.ops import variables as tf_variables
+from tensorflow.python.training.checkpointable import base as checkpointable
 from tensorflow.python.util import function_utils
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_contextlib
@@ -551,6 +552,10 @@ class Layer(base_layer.Layer):
         setattr(result, k, copy.deepcopy(v, memo))
     return result
 
+  def __setattr__(self, value, name):
+    # By-pass the automatic dependency tracking performed by the parent Layer.
+    super(checkpointable.CheckpointableBase, self).__setattr__(value, name)
+
 
 def _add_elements_to_collection(elements, collection_list):
   if context.executing_eagerly():
diff --git a/tensorflow/python/layers/core_test.py b/tensorflow/python/layers/core_test.py
index cf6f0fbb7001d304fde9fbf29270ea29d352df22..b40a2682381ad50da67fe7499b75f4f862e00b3d 100644
--- a/tensorflow/python/layers/core_test.py
+++ b/tensorflow/python/layers/core_test.py
@@ -533,10 +533,13 @@ class FlattenTest(test.TestCase):
     self.assertEqual(y.get_shape().as_list(), [None, 6])
 
   @test_util.run_deprecated_v1
-  def testFlattenValueError(self):
+  def testFlatten0D(self):
     x = array_ops.placeholder(shape=(None,), dtype='float32')
-    with self.assertRaises(ValueError):
-      core_layers.Flatten()(x)
+    y = core_layers.Flatten()(x)
+    with self.cached_session() as sess:
+      np_output = sess.run(y, feed_dict={x: np.zeros((5,))})
+    self.assertEqual(list(np_output.shape), [5, 1])
+    self.assertEqual(y.shape.as_list(), [None, 1])
 
   @test_util.run_deprecated_v1
   def testFlattenUnknownAxes(self):
diff --git a/tensorflow/python/layers/normalization_test.py b/tensorflow/python/layers/normalization_test.py
index 07d8e40b75973d39e876220a215333284c3c65d1..6535f74129ae166d41675aad494be09bdd0f5cd3 100644
--- a/tensorflow/python/layers/normalization_test.py
+++ b/tensorflow/python/layers/normalization_test.py
@@ -38,6 +38,7 @@ from tensorflow.python.training import gradient_descent
 from tensorflow.python.training import saver as saver_lib
 
 
+@test_util.run_v1_only('b/120545219')
 class BNTest(test.TestCase):
 
   def _simple_model(self, image, fused, freeze_mode):
@@ -144,7 +145,6 @@ class BNTest(test.TestCase):
 
     return train_vars, loss_val
 
-  @test_util.run_deprecated_v1
   def testHalfPrecision(self):
     ref_vars, ref_loss = self._trainEvalSequence(
         dtype=dtypes.float32,
@@ -230,43 +230,33 @@ class BNTest(test.TestCase):
                                ckpt_b_use_gpu, use_gpu_test_a, use_gpu_test_b,
                                freeze_mode)
 
-  @test_util.run_deprecated_v1
   def testCheckpointFusedCPUAndFusedGPU(self):
     self._testCheckpointCrossDevice(True, False, True, True)
 
-  @test_util.run_deprecated_v1
   def testCheckpointFusedCPUAndFusedCPU(self):
     self._testCheckpointCrossDevice(True, False, True, False)
 
-  @test_util.run_deprecated_v1
   def testCheckpointFusedGPUAndFusedGPU(self):
     self._testCheckpointCrossDevice(True, True, True, True)
 
-  @test_util.run_deprecated_v1
   def testCheckpointNonFusedCPUAndNonFusedGPU(self):
     self._testCheckpointCrossDevice(False, False, False, True)
 
-  @test_util.run_deprecated_v1
   def testCheckpointNonFusedCPUAndNonFusedCPU(self):
     self._testCheckpointCrossDevice(False, False, False, False)
 
-  @test_util.run_deprecated_v1
   def testCheckpointNonFusedGPUAndNonFusedGPU(self):
     self._testCheckpointCrossDevice(False, True, False, True)
 
-  @test_util.run_deprecated_v1
   def testCheckpointNonFusedGPUAndFusedGPU(self):
     self._testCheckpointCrossDevice(False, True, True, True)
 
-  @test_util.run_deprecated_v1
   def testCheckpointNonFusedGPUAndFusedCPU(self):
     self._testCheckpointCrossDevice(False, True, True, False)
 
-  @test_util.run_deprecated_v1
   def testCheckpointNonFusedCPUAndFusedCPU(self):
     self._testCheckpointCrossDevice(False, False, True, False)
 
-  @test_util.run_deprecated_v1
   def testCreateBN(self):
     # Call layer.
     bn = normalization_layers.BatchNormalization(axis=1)
@@ -293,7 +283,6 @@ class BNTest(test.TestCase):
         ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES),
         bn.trainable_variables)
 
-  @test_util.run_deprecated_v1
   def testCreateFusedBNFloat16(self):
     # Call layer.
     bn = normalization_layers.BatchNormalization(axis=1, fused=True)
@@ -323,7 +312,6 @@ class BNTest(test.TestCase):
         ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES),
         bn.trainable_variables)
 
-  @test_util.run_deprecated_v1
   def test3DInputAxis1(self):
     epsilon = 1e-3
     bn = normalization_layers.BatchNormalization(
@@ -367,7 +355,6 @@ class BNTest(test.TestCase):
       self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1)
       self.assertAlmostEqual(np.std(normed_np_output), 1., places=1)
 
-  @test_util.run_deprecated_v1
   def test3DInputAxis2(self):
     epsilon = 1e-3
     bn = normalization_layers.BatchNormalization(
@@ -451,7 +438,6 @@ class BNTest(test.TestCase):
         self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1)
         self.assertAlmostEqual(np.std(normed_np_output), 1., places=1)
 
-  @test_util.run_deprecated_v1
   def test4DInputAxis2(self):
     epsilon = 1e-3
     bn = normalization_layers.BatchNormalization(
@@ -493,7 +479,6 @@ class BNTest(test.TestCase):
       self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1)
       self.assertAlmostEqual(np.std(normed_np_output), 1., places=1)
 
-  @test_util.run_deprecated_v1
   def test4DInputAxis3(self):
     epsilon = 1e-3
     bn = normalization_layers.BatchNormalization(
@@ -535,7 +520,6 @@ class BNTest(test.TestCase):
       self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1)
       self.assertAlmostEqual(np.std(normed_np_output), 1., places=1)
 
-  @test_util.run_deprecated_v1
   def test4DInputAxis3Fused(self):
     epsilon = 1e-3
     bn = normalization_layers.BatchNormalization(
@@ -619,7 +603,6 @@ class BNTest(test.TestCase):
         self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1)
         self.assertAlmostEqual(np.std(normed_np_output), 1., places=1)
 
-  @test_util.run_deprecated_v1
   def testNegativeAxis(self):
     epsilon = 1e-3
     bn = normalization_layers.BatchNormalization(
@@ -662,7 +645,6 @@ class BNTest(test.TestCase):
       self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1)
       self.assertAlmostEqual(np.std(normed_np_output), 1., places=1)
 
-  @test_util.run_deprecated_v1
   def testBooleanLearningPhase(self):
     epsilon = 1e-3
     bn = normalization_layers.BatchNormalization(
@@ -703,7 +685,6 @@ class BNTest(test.TestCase):
       self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1)
       self.assertAlmostEqual(np.std(normed_np_output), 1., places=1)
 
-  @test_util.run_deprecated_v1
   def testFunctionalNoReuse(self):
     inputs = variables.Variable(
         np.random.random((5, 4, 3, 6)), dtype=dtypes.float32)
@@ -756,7 +737,6 @@ class BNTest(test.TestCase):
       self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1)
       self.assertAlmostEqual(np.std(normed_np_output), 1., places=1)
 
-  @test_util.run_deprecated_v1
   def testFunctionalReuse(self):
     inputs1 = variables.Variable(
         np.random.random((5, 4, 3, 6)), dtype=dtypes.float32)
@@ -821,7 +801,6 @@ class BNTest(test.TestCase):
       self.assertAlmostEqual(np.mean(normed_np_output), 0., places=2)
       self.assertAlmostEqual(np.std(normed_np_output), 1., places=1)
 
-  @test_util.run_deprecated_v1
   def testFunctionalReuseFromScope(self):
     inputs = variables.Variable(
         np.random.random((5, 4, 3, 6)), dtype=dtypes.float32)
@@ -836,7 +815,6 @@ class BNTest(test.TestCase):
           inputs, axis=-1, momentum=0.9, epsilon=epsilon, training=training)
       self.assertEqual(len(variables.global_variables()), 5)
 
-  @test_util.run_deprecated_v1
   def testNoCenter(self):
     bn = normalization_layers.BatchNormalization(axis=1, center=False)
     inputs = random_ops.random_uniform((5, 4, 3), seed=1)
@@ -852,7 +830,6 @@ class BNTest(test.TestCase):
     self.assertEqual(len(bn.trainable_variables), 1)
     self.assertEqual(len(bn.non_trainable_variables), 2)
 
-  @test_util.run_deprecated_v1
   def testNoScale(self):
     bn = normalization_layers.BatchNormalization(axis=1, scale=False)
     inputs = random_ops.random_uniform((5, 4, 3), seed=1)
@@ -868,7 +845,6 @@ class BNTest(test.TestCase):
     self.assertEqual(len(bn.trainable_variables), 1)
     self.assertEqual(len(bn.non_trainable_variables), 2)
 
-  @test_util.run_deprecated_v1
   def testRegularizers(self):
     reg = lambda x: 0.1 * math_ops.reduce_sum(x)
     bn = normalization_layers.BatchNormalization(axis=1, beta_regularizer=reg)
@@ -894,7 +870,6 @@ class BNTest(test.TestCase):
     self.assertEqual(bn.gamma_constraint, g_constraint)
     self.assertEqual(bn.beta_constraint, b_constraint)
 
-  @test_util.run_deprecated_v1
   def testRenorm(self):
     shape = (4, 3)
     xt = array_ops.placeholder(dtypes.float32, shape)
@@ -953,7 +928,6 @@ class BNTest(test.TestCase):
         self.assertAllClose(y_train, yt_val_train, atol=1e-5)
         self.assertAllClose(y_test, yt_val_test, atol=1e-5)
 
-  @test_util.run_deprecated_v1
   def testAdjustment(self):
     shape = (4, 3)
     xt = array_ops.placeholder(dtypes.float32, shape)
@@ -998,7 +972,6 @@ class BNTest(test.TestCase):
         self.assertAllClose(y_train, yt_val_train, atol=1e-5)
         self.assertAllClose(y_test, yt_val_test, atol=1e-5)
 
-  @test_util.run_deprecated_v1
   def testRenormWithAdjustment(self):
     shape = (4, 3)
     xt = array_ops.placeholder(dtypes.float32, shape)
@@ -1069,7 +1042,6 @@ class BNTest(test.TestCase):
       normalization_layers.batch_normalization(
           inp, virtual_batch_size=-1)
 
-  @test_util.run_deprecated_v1
   def testGhostBNVirtualBatchFull(self):
     shape = [6, 5, 4, 3]
     inp = random_ops.random_uniform(shape, seed=1)
@@ -1095,7 +1067,6 @@ class BNTest(test.TestCase):
         inp, virtual_batch_size=3)
     self.assertListEqual(out.shape.as_list(), shape)
 
-  @test_util.run_deprecated_v1
   def testGhostBNUnknownBatchSize(self):
     np_shape = [10, 5, 4]
     tf_shape = [None, 5, 4]
@@ -1111,7 +1082,6 @@ class BNTest(test.TestCase):
 
       self.assertListEqual(list(y.shape), np_shape)
 
-  @test_util.run_deprecated_v1
   def testGhostBN2Dims(self):
     shape = [6, 2]
     virtual_batch_size = 3
@@ -1165,7 +1135,6 @@ class BNTest(test.TestCase):
         self.assertAllClose(y_train, y_val_train, atol=1e-5)
         self.assertAllClose(y_test, y_val_test, atol=1e-5)
 
-  @test_util.run_deprecated_v1
   def testGhostBN4DimsAxis3(self):
     shape = [6, 10, 10, 3]
     virtual_batch_size = 2
@@ -1219,7 +1188,6 @@ class BNTest(test.TestCase):
         self.assertAllClose(y_train, y_val_train, atol=1e-2)
         self.assertAllClose(y_test, y_val_test, atol=1e-2)
 
-  @test_util.run_deprecated_v1
   def testGhostBN4DimsAxis1(self):
     shape = [6, 3, 10, 10]
     virtual_batch_size = 2
@@ -1290,7 +1258,6 @@ class BNTest(test.TestCase):
       normalization_layers.batch_normalization(
           inp, axis=[1, 2, 1])   # duplicate
 
-  @test_util.run_deprecated_v1
   def test3DInputMultiAxis12(self):
     epsilon = 1e-3
     bn = normalization_layers.BatchNormalization(
@@ -1332,7 +1299,6 @@ class BNTest(test.TestCase):
       self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1)
       self.assertAlmostEqual(np.std(normed_np_output), 1., places=1)
 
-  @test_util.run_deprecated_v1
   def test5DInputMultiAxis123(self):
     epsilon = 1e-3
     bn = normalization_layers.BatchNormalization(
@@ -1374,7 +1340,6 @@ class BNTest(test.TestCase):
       self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1)
       self.assertAlmostEqual(np.std(normed_np_output), 1., places=1)
 
-  @test_util.run_deprecated_v1
   def testGhostBN5DimsMultiAxis14(self):
     shape = [6, 3, 10, 10, 4]
     virtual_batch_size = 3
diff --git a/tensorflow/python/lib/core/py_func.cc b/tensorflow/python/lib/core/py_func.cc
index 9364aec373df9575282ae9254bce50a307bf61a0..97bebe86177ee264ef00bc9b969b293389aa2122 100644
--- a/tensorflow/python/lib/core/py_func.cc
+++ b/tensorflow/python/lib/core/py_func.cc
@@ -302,15 +302,14 @@ Status DoCallPyFunc(PyCall* call, bool* out_log_on_error) {
 class NumpyTensorBuffer : public TensorBuffer {
  public:
   NumpyTensorBuffer(PyArrayObject* array, size_t len, void* data)
-      : array_(array), len_(len), data_(data) {}
+      : TensorBuffer(data), array_(array), len_(len) {}
 
   ~NumpyTensorBuffer() override {
     // Note: The session::run wrapper is responsible for freeing this while
     // holding the GIL.
-    DelayedNumpyDecref(data_, len_, array_);
+    DelayedNumpyDecref(data(), len_, array_);
   }
 
-  void* data() const override { return data_; }
   size_t size() const override { return len_; }
   TensorBuffer* root_buffer() override { return this; }
   void FillAllocationDescription(AllocationDescription* proto) const override {
@@ -329,7 +328,6 @@ class NumpyTensorBuffer : public TensorBuffer {
  private:
   PyArrayObject* array_;
   size_t len_;
-  void* data_;
 };
 
 Status PyObjectToString(PyObject* obj, string* str) {
diff --git a/tensorflow/python/lib/io/file_io.py b/tensorflow/python/lib/io/file_io.py
index 4caa5750bf6b180e97aacccb399274d6afda4ff3..ee55d89bffcbaca2a68cbb028ae9ca5157e6f6df 100644
--- a/tensorflow/python/lib/io/file_io.py
+++ b/tensorflow/python/lib/io/file_io.py
@@ -671,7 +671,7 @@ def walk(top, in_order=True):
 
 
 @tf_export("io.gfile.walk")
-def walk_v2(top, topdown, onerror=None):
+def walk_v2(top, topdown=True, onerror=None):
   """Recursive directory tree generator for directories.
 
   Args:
diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index 496d3855b56479f60b65f57f8d4184bd017d9e83..d4e35ca77b2b903ad7da6ad2ffeea0ba43b9f5a4 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -41,6 +41,7 @@ from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops.gen_array_ops import *
 from tensorflow.python.ops.gen_array_ops import reverse_v2 as reverse  # pylint: disable=unused-import
 from tensorflow.python.util import deprecation
+from tensorflow.python.util import dispatch
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 # pylint: enable=wildcard-import
@@ -55,6 +56,7 @@ _BaseSlice = slice
 
 
 @tf_export("identity")
+@dispatch.add_dispatch_support
 def identity(input, name=None):  # pylint: disable=redefined-builtin
   r"""Return a tensor with the same shape and contents as input.
 
@@ -76,11 +78,16 @@ def identity(input, name=None):  # pylint: disable=redefined-builtin
       return input._copy()  # pylint: disable=protected-access
     return input
   else:
-    return gen_array_ops.identity(input, name=name)
+    ret = gen_array_ops.identity(input, name=name)
+    # Propagate handle data for happier shape inference for resource variables.
+    if hasattr(input, "_handle_data"):
+      ret._handle_data = input._handle_data  # pylint: disable=protected-access
+    return ret
 
 
 # pylint: disable=redefined-builtin,protected-access
 @tf_export(v1=["expand_dims"])
+@dispatch.add_dispatch_support
 @deprecation.deprecated_args(None, "Use the `axis` argument instead", "dim")
 def expand_dims(input, axis=None, name=None, dim=None):
   """Inserts a dimension of 1 into a tensor's shape.
@@ -138,6 +145,7 @@ def expand_dims(input, axis=None, name=None, dim=None):
 
 
 @tf_export("expand_dims", v1=[])
+@dispatch.add_dispatch_support
 def expand_dims_v2(input, axis, name=None):
   """Inserts a dimension of 1 into a tensor's shape.
 
@@ -403,7 +411,7 @@ def size_internal(input, name=None, optimize=True, out_type=dtypes.int32):
       input, (sparse_tensor.SparseTensor, sparse_tensor.SparseTensorValue)):
     input = ops.convert_to_tensor(input)
     np_out_type = out_type.as_numpy_dtype
-    num_elements = np.prod(input._shape_tuple(), dtype=np_out_type)  # pylint: disable=protected-acces:
+    num_elements = np.prod(input._shape_tuple(), dtype=np_out_type)  # pylint: disable=protected-access
     return ops.convert_to_tensor(num_elements, dtype=out_type)
   with ops.name_scope(name, "Size", [input]) as name:
     if isinstance(input, (sparse_tensor.SparseTensor,
@@ -882,7 +890,7 @@ def _SliceHelperVar(var, slice_spec):
 
   """
 
-  return _slice_helper(var._AsTensor(), slice_spec, var)
+  return _slice_helper(var.value(), slice_spec, var)
 
 
 ops.Tensor._override_operator("__getitem__", _slice_helper)
@@ -940,6 +948,7 @@ def parallel_stack(values, name="parallel_stack"):
 
 
 @tf_export("stack")
+@dispatch.add_dispatch_support
 def stack(values, axis=0, name="stack"):
   """Stacks a list of rank-`R` tensors into one rank-`(R+1)` tensor.
 
@@ -1150,6 +1159,7 @@ def unstack(value, num=None, axis=0, name="unstack"):
 
 
 @tf_export("concat")
+@dispatch.add_dispatch_support
 def concat(values, axis, name="concat"):
   """Concatenates tensors along one dimension.
 
@@ -1327,6 +1337,7 @@ def boolean_mask(tensor, mask, name="boolean_mask", axis=None):
 
 
 @tf_export("boolean_mask", v1=[])
+@dispatch.add_dispatch_support
 def boolean_mask_v2(tensor, mask, axis=None, name="boolean_mask"):
   """Apply boolean mask to tensor.
 
@@ -1809,6 +1820,7 @@ def zeros(shape, dtype=dtypes.float32, name=None):
 
 
 @tf_export(v1=["zeros_like"])
+@dispatch.add_dispatch_support
 def zeros_like(tensor, dtype=None, name=None, optimize=True):
   """Creates a tensor with all elements set to zero.
 
@@ -1839,6 +1851,7 @@ def zeros_like(tensor, dtype=None, name=None, optimize=True):
 
 
 @tf_export("zeros_like", v1=[])
+@dispatch.add_dispatch_support
 def zeros_like_v2(
     input,  # pylint: disable=redefined-builtin
     dtype=None,
@@ -1898,6 +1911,7 @@ def zeros_like_impl(tensor, dtype, name, optimize=True):
 
 
 @tf_export(v1=["ones_like"])
+@dispatch.add_dispatch_support
 def ones_like(tensor, dtype=None, name=None, optimize=True):
   """Creates a tensor with all elements set to 1.
 
@@ -1928,6 +1942,7 @@ def ones_like(tensor, dtype=None, name=None, optimize=True):
 
 
 @tf_export("ones_like", v1=[])
+@dispatch.add_dispatch_support
 def ones_like_v2(
     input,  # pylint: disable=redefined-builtin
     dtype=None,
@@ -2642,7 +2657,7 @@ def required_space_to_batch_paddings(input_shape,
     return result_paddings, result_crops
 
 
-@tf_export("nn.space_to_batch", v1=["nn.space_to_batch", "space_to_batch"])
+@tf_export(v1=["nn.space_to_batch", "space_to_batch"])
 @deprecation.deprecated_endpoints("space_to_batch")
 def space_to_batch(input, paddings, block_size, name=None):  # pylint: disable=redefined-builtin
   result = space_to_batch_nd(
@@ -2657,7 +2672,15 @@ def space_to_batch(input, paddings, block_size, name=None):  # pylint: disable=r
 space_to_batch.__doc__ = gen_array_ops.space_to_batch.__doc__
 
 
-@tf_export("nn.space_to_depth", v1=["nn.space_to_depth", "space_to_depth"])
+@tf_export("space_to_batch", "nn.space_to_batch", v1=[])
+def space_to_batch_v2(input, block_shape, paddings, name=None):  # pylint: disable=redefined-builtin
+  return space_to_batch_nd(input, block_shape, paddings, name)
+
+
+space_to_batch_v2.__doc__ = gen_array_ops.space_to_batch_nd.__doc__
+
+
+@tf_export(v1=["nn.space_to_depth", "space_to_depth"])
 @deprecation.deprecated_endpoints("space_to_depth")
 def space_to_depth(input, block_size, name=None, data_format="NHWC"):  # pylint: disable=redefined-builtin
   return gen_array_ops.space_to_depth(input, block_size, data_format, name=name)
@@ -2666,7 +2689,15 @@ def space_to_depth(input, block_size, name=None, data_format="NHWC"):  # pylint:
 space_to_depth.__doc__ = gen_array_ops.space_to_depth.__doc__
 
 
-@tf_export("nn.depth_to_space", v1=["nn.depth_to_space", "depth_to_space"])
+@tf_export("nn.space_to_depth", v1=[])
+def space_to_depth_v2(input, block_size, data_format="NHWC", name=None):  # pylint: disable=redefined-builtin
+  return gen_array_ops.space_to_depth(input, block_size, data_format, name=name)
+
+
+space_to_depth_v2.__doc__ = gen_array_ops.space_to_depth.__doc__
+
+
+@tf_export(v1=["nn.depth_to_space", "depth_to_space"])
 @deprecation.deprecated_endpoints("depth_to_space")
 def depth_to_space(input, block_size, name=None, data_format="NHWC"):  # pylint: disable=redefined-builtin
   return gen_array_ops.depth_to_space(input, block_size, data_format, name=name)
@@ -2675,6 +2706,14 @@ def depth_to_space(input, block_size, name=None, data_format="NHWC"):  # pylint:
 depth_to_space.__doc__ = gen_array_ops.depth_to_space.__doc__
 
 
+@tf_export("nn.depth_to_space", v1=[])
+def depth_to_space_v2(input, block_size, data_format="NHWC", name=None):  # pylint: disable=redefined-builtin
+  return gen_array_ops.depth_to_space(input, block_size, data_format, name=name)
+
+
+depth_to_space_v2.__doc__ = gen_array_ops.depth_to_space.__doc__
+
+
 @tf_export(v1=["batch_to_space"])
 def batch_to_space(input, crops, block_size, name=None):  # pylint: disable=redefined-builtin
   result = batch_to_space_nd(
@@ -3114,6 +3153,7 @@ def squeeze_v2(input, axis=None, name=None):
 
 
 @tf_export("where")
+@dispatch.add_dispatch_support
 def where(condition, x=None, y=None, name=None):
   """Return the elements, either from `x` or `y`, depending on the `condition`.
 
@@ -3217,6 +3257,7 @@ reverse_sequence_v2.__doc__ = deprecation.rewrite_argument_docstring(
 
 
 @tf_export(v1=["gather"])
+@dispatch.add_dispatch_support
 def gather(params, indices, validate_indices=None, name=None, axis=0):
   del validate_indices
   if axis != 0:
@@ -3233,6 +3274,7 @@ def gather(params, indices, validate_indices=None, name=None, axis=0):
 
 
 @tf_export("gather", v1=[])
+@dispatch.add_dispatch_support
 def gather_v2(params, indices, validate_indices=None, axis=0, name=None):
   return gather(params, indices, validate_indices=validate_indices, name=name,
                 axis=axis)
@@ -3243,6 +3285,7 @@ gather.__doc__ = gather_v2.__doc__ = gen_array_ops.gather_v2.__doc__
 
 
 @tf_export("batch_gather")
+@dispatch.add_dispatch_support
 def batch_gather(params, indices, name=None):
   """Gather slices from `params` according to `indices` with leading batch dims.
 
diff --git a/tensorflow/python/ops/candidate_sampling_ops.py b/tensorflow/python/ops/candidate_sampling_ops.py
index c64000b65d4f8cf58ec5d7be66936d9b87e9a1c2..56f76a49d51bec99d35593041f3e72c2fcb580a4 100644
--- a/tensorflow/python/ops/candidate_sampling_ops.py
+++ b/tensorflow/python/ops/candidate_sampling_ops.py
@@ -151,7 +151,10 @@ def log_uniform_candidate_sampler(true_classes, num_true, num_sampled, unique,
       seed2=seed2, name=name)
 
 
-@tf_export('nn.learned_unigram_candidate_sampler')
+@tf_export(
+    'random.learned_unigram_candidate_sampler',
+    'nn.learned_unigram_candidate_sampler')
+@deprecation.deprecated_endpoints(['nn.learned_unigram_candidate_sampler'])
 def learned_unigram_candidate_sampler(true_classes, num_true, num_sampled,
                                       unique, range_max, seed=None, name=None):
   """Samples a set of classes from a distribution learned during training.
@@ -209,8 +212,7 @@ def learned_unigram_candidate_sampler(true_classes, num_true, num_sampled,
 
 
 @tf_export('random.fixed_unigram_candidate_sampler',
-           'nn.fixed_unigram_candidate_sampler',
-           v1=['nn.fixed_unigram_candidate_sampler'])
+           'nn.fixed_unigram_candidate_sampler')
 def fixed_unigram_candidate_sampler(true_classes,
                                     num_true,
                                     num_sampled,
@@ -302,8 +304,7 @@ def fixed_unigram_candidate_sampler(true_classes,
       unigrams=unigrams, seed=seed1, seed2=seed2, name=name)
 
 
-@tf_export('random.all_candidate_sampler', 'nn.all_candidate_sampler',
-           v1=['nn.all_candidate_sampler'])
+@tf_export('random.all_candidate_sampler', 'nn.all_candidate_sampler')
 def all_candidate_sampler(true_classes, num_true, num_sampled, unique,
                           seed=None, name=None):
   """Generate the set of all classes.
diff --git a/tensorflow/python/ops/clip_ops.py b/tensorflow/python/ops/clip_ops.py
index 82803ac351682b93e332430f9688e523ace7c93f..a237cfff826bf0fb4cacd0c25fe5d361e3d7b26e 100644
--- a/tensorflow/python/ops/clip_ops.py
+++ b/tensorflow/python/ops/clip_ops.py
@@ -31,10 +31,12 @@ from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import numerics
 from tensorflow.python.util import deprecation
+from tensorflow.python.util import dispatch
 from tensorflow.python.util.tf_export import tf_export
 
 
 @tf_export("clip_by_value")
+@dispatch.add_dispatch_support
 def clip_by_value(t, clip_value_min, clip_value_max,
                   name=None):
   """Clips tensor values to a specified min and max.
diff --git a/tensorflow/python/ops/cond_v2.py b/tensorflow/python/ops/cond_v2.py
index f7a95bd909ec8bac52a2412bcf813032ab38f9a0..7d09e32e241d55f064239bbfd4c4af45ac329c4b 100644
--- a/tensorflow/python/ops/cond_v2.py
+++ b/tensorflow/python/ops/cond_v2.py
@@ -61,7 +61,7 @@ def cond_v2(pred, true_fn, false_fn, name="cond"):
 
     # Automatic control dependencies are added in defuns, but not in v1
     # graphs. Propagate that behavior here.
-    add_control_dependencies = util.in_defun()
+    add_control_dependencies = ops.get_default_graph()._add_control_dependencies
     pred = ops.convert_to_tensor(pred)
 
     true_graph = func_graph_module.func_graph_from_py_func(
@@ -108,6 +108,46 @@ def _IfGrad(op, *grads):  # pylint: disable=invalid-name
   assert ([t.dtype for t in true_grad_graph.outputs] ==
           [t.dtype for t in false_grad_graph.outputs])
 
+  if (true_grad_graph.if_op_needs_rewrite or
+      false_grad_graph.if_op_needs_rewrite):
+    # Modify 'op' to output the intermediates needed by the grad functions. Note
+    # that all needed intermediates are wrapped in optionals. Each optional
+    # intermediate output will have a value iff its corresponding branch is
+    # taken.
+    # NOTE(skyewm): if there are any active sessions, this modification to `op`
+    # may make them unrunnable!
+
+    if control_flow_util.InXlaContext(ops.get_default_graph()):
+      # XLA does not yet support optionals, so output intermediates directly and
+      # make them match via FakeParams, which can be converted to zeros in XLA.
+      # TODO(skyewm,jpienaar): can XLA support optionals?
+      true_intermediates = true_grad_graph.xla_intermediates
+      false_intermediates = false_grad_graph.xla_intermediates
+      extra_true_outputs, extra_false_outputs = _make_intermediates_match_xla(
+          true_graph, false_graph, true_intermediates, false_intermediates)
+    else:
+      true_intermediates = true_grad_graph.wrapped_intermediates
+      false_intermediates = false_grad_graph.wrapped_intermediates
+      # Make outputs match by adding none optionals.
+      extra_true_outputs, extra_false_outputs = _make_intermediates_match(
+          true_graph, false_graph, true_intermediates, false_intermediates)
+
+    true_graph.outputs.extend(extra_true_outputs)
+    false_graph.outputs.extend(extra_false_outputs)
+    # TODO(skyewm): indicate it's an internal bug if this fails.
+    _check_same_outputs(true_graph, false_graph)
+
+    true_graph.name += "_rewritten"
+    false_graph.name += "_rewritten"
+
+    op._set_func_attr("then_branch", util.create_new_tf_function(true_graph))
+    op._set_func_attr("else_branch", util.create_new_tf_function(false_graph))
+    op._set_type_list_attr("Tout", true_graph.output_types)
+    op._set_shape_list_attr("output_shapes", true_graph.output_shapes)
+    op._add_outputs(
+        [t.dtype for t in extra_true_outputs],
+        [t.shape for t in extra_true_outputs])
+
   # Resolve references to forward graph tensors in grad graphs and ensure
   # they are in-scope, i.e., belong to one of outer graphs of the grad graph.
   true_grad_inputs = _resolve_grad_inputs(true_graph, true_grad_graph)
@@ -150,40 +190,6 @@ def _build_cond(pred, true_graph, false_graph, true_inputs, false_inputs,
   cond_inputs = _make_inputs_match(true_graph, false_graph,
                                    true_inputs, false_inputs)
 
-  # Add all intermediate tensors as function outputs so they're available for
-  # the gradient computation. Since the outputs of the two functions must match,
-  # we wrap all the intermediates in optionals. Each intermediate output will
-  # have a value iff its corresponding branch is taken.
-
-  true_intermediates = _get_intermediates(true_graph)
-  false_intermediates = _get_intermediates(false_graph)
-
-  # Save the original number of outputs to return to the caller.
-  num_cond_outputs = len(true_graph.outputs)
-
-  if control_flow_util.InXlaContext(ops.get_default_graph()):
-    # XLA does not yet support optionals, so output intermediates directly and
-    # make them match via FakeParams, which can be converted to zeros in XLA.
-    # TODO(skyewm,jpienaar): can XLA support optionals?
-    extra_true_outputs, extra_false_outputs = _make_intermediates_match_xla(
-        true_graph, false_graph, true_intermediates, false_intermediates)
-  else:
-    # Wrap intermediates in optionals.
-    wrapped_true_intermediates = _wrap_intermediates(true_graph,
-                                                     true_intermediates)
-    wrapped_false_intermediates = _wrap_intermediates(false_graph,
-                                                      false_intermediates)
-
-    # Make outputs match by adding none optionals.
-    extra_true_outputs, extra_false_outputs = _make_intermediates_match(
-        true_graph, false_graph,
-        wrapped_true_intermediates, wrapped_false_intermediates)
-
-  true_graph.outputs.extend(extra_true_outputs)
-  false_graph.outputs.extend(extra_false_outputs)
-  # TODO(skyewm): somehow indicate it's a bug if this fails.
-  _check_same_outputs(true_graph, false_graph)
-
   # Create the If op.
   tensors = gen_functional_ops._if(  # pylint: disable=protected-access
       pred,
@@ -210,8 +216,7 @@ def _build_cond(pred, true_graph, false_graph, true_inputs, false_inputs,
 
   # Prevent fetching since the variant outputs can't be fetched directly.
   if_op.graph.prevent_fetching(if_op)
-
-  return tensors[:num_cond_outputs]
+  return tensors
 
 
 def _get_func_graphs(if_op):
@@ -540,13 +545,37 @@ def _get_output_shapes(true_graph_outputs, false_graph_outputs):
 class _CondGradFuncGraph(util.CondBranchFuncGraph):
   """FuncGraph for the gradient function of the branch of an If op.
 
-  Handles unwrapping optional intermediate values that are captured by the
-  gradient computation.
+  Handles wrapping and unwrapping intermediate values that are captured by the
+  gradient computation in optionals.
+
+  Attributes:
+    if_op_needs_rewrite: True if any intermediates were captured, meaning the
+      forward If op needs to be written to output the wrapped intermediates.
   """
 
   def __init__(self, name, forward_graph):
     super(_CondGradFuncGraph, self).__init__(name, read_only_collections=False)
+    self.if_op_needs_rewrite = False
     self._forward_graph = forward_graph
+    # Maps from forward intermediate tensor -> the unwrapped captured
+    # intermediate.
+    self._indirect_captures = {}
+    # Maps unwrapped intermediate -> optional-wrapped intermediate in the
+    # forward graph.
+    self._wrapped_intermediates = collections.OrderedDict()
+    # Raw intermediates captured from the forward graph. Populated iff we're in
+    # an XLA context.
+    self._xla_intermediates = []
+
+  @property
+  def wrapped_intermediates(self):
+    """The optional-wrapped intermediates captured from the forward graph."""
+    return list(self._wrapped_intermediates.values())
+
+  @property
+  def xla_intermediates(self):
+    """Raw intermediates captured from the forward graph if XLA is enabled."""
+    return self._xla_intermediates
 
   def _capture_helper(self, tensor, name):
     if (tensor.graph is not self._forward_graph or
@@ -554,19 +583,43 @@ class _CondGradFuncGraph(util.CondBranchFuncGraph):
         tensor in self._forward_graph.outputs):
       return super(_CondGradFuncGraph, self)._capture_helper(tensor, name)
 
-    # 'tensor' is an intermediate in the forward graph. We find the corresonding
-    # optional tensor, which is output from the If op, and capture it as
-    # normal. We then unwrap the captured optional value to get the raw
-    # intermediate value.
-    for consumer in tensor.consumers():
-      if (consumer.type == "OptionalFromValue"
-          and consumer.outputs[0] in self._forward_graph.outputs):
-        optional = consumer.outputs[0]
-        captured_optional = super(_CondGradFuncGraph, self)._capture_helper(
-            optional, name)
-        return gen_dataset_ops.optional_get_value(
-            captured_optional, [tensor.dtype], [tensor.shape])[0]
-    raise ValueError(
-        "Couldn't find OptionalFromValue consumer for tensor '%s'.\n"
-        "This is an internal bug, please report at "
-        "https://github.com/tensorflow/tensorflow/issues." % tensor.name)
+    if control_flow_util.InXlaContext(ops.get_default_graph()):
+      # XLA does not yet support optionals, so capture intermediates directly.
+      # TODO(skyewm,jpienaar): can XLA support optionals?
+      if tensor not in self.captures:
+        self.xla_intermediates.append(tensor)
+        self.if_op_needs_rewrite = True
+      return super(_CondGradFuncGraph, self)._capture_helper(tensor, name)
+
+    captured_tensor = self._indirect_captures.get(tensor)
+    if captured_tensor is not None:
+      return captured_tensor
+
+    # 'tensor' is an uncaptured intermediate in the forward graph. We wrap it in
+    # an optional in the forward graph and capture the optional normally. We
+    # then unwrap the captured optional value in the gradient graph to get the
+    # raw intermediate value.
+
+    if tensor not in self._wrapped_intermediates:
+      # If the gradient has already been computed for this If op, 'tensor' may
+      # already be wrapped.
+      for consumer in tensor.consumers():
+        if (consumer.type == "OptionalFromValue"
+            and consumer.outputs[0] in self._forward_graph.outputs):
+          optional = consumer.outputs[0]
+          break
+      else:
+        # 'tensor' hasn't been wrapped, do it now.
+        with self._forward_graph.as_default():
+          optional = gen_dataset_ops.optional_from_value([tensor])
+        self.if_op_needs_rewrite = True
+
+      self._wrapped_intermediates[tensor] = optional
+
+    optional = self._wrapped_intermediates[tensor]
+    captured_optional = super(_CondGradFuncGraph, self)._capture_helper(
+        optional, name)
+    captured_tensor = gen_dataset_ops.optional_get_value(
+        captured_optional, [tensor.dtype], [tensor.shape])[0]
+    self._indirect_captures[tensor] = captured_tensor
+    return captured_tensor
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index b7e50c1dae5ac1dc0968a3badb8f017e6b0384e1..99216d7fb15ff865ba70d01995606c6a5e3ab7c4 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -24,13 +24,11 @@ from __future__ import print_function
 import abc
 import collections
 import functools
-import os
 
 import six
 
 from tensorflow.core.framework import attr_value_pb2
 from tensorflow.core.protobuf import control_flow_pb2
-from tensorflow.python import tf2
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -71,9 +69,6 @@ cond_v2 = LazyLoader("cond_v2", globals(),
 while_v2 = LazyLoader("while_v2", globals(),
                       "tensorflow.python.ops.while_v2")
 
-ENABLE_COND_V2 = tf2.enabled() or os.getenv("TF_ENABLE_COND_V2", "0") != "0"
-ENABLE_WHILE_V2 = tf2.enabled() or os.getenv("TF_ENABLE_WHILE_V2", "0") != "0"
-
 # We override the 'tuple' for a control flow op, so we keep python's
 # existing 'tuple' for later use in this module.
 _basetuple = tuple
@@ -2052,7 +2047,7 @@ def cond(pred,
   ```
 
   """
-  if ENABLE_COND_V2 and not context.executing_eagerly():
+  if util.ENABLE_CONTROL_FLOW_V2 and not context.executing_eagerly():
     return cond_v2.cond_v2(pred, true_fn, false_fn, name)
 
   # We needed to make true_fn/false_fn keyword arguments for
@@ -3487,7 +3482,7 @@ def while_loop(cond,
   ```
 
   """
-  if ENABLE_WHILE_V2 and not context.executing_eagerly():
+  if util.ENABLE_CONTROL_FLOW_V2 and not context.executing_eagerly():
     return while_v2.while_loop(
         cond,
         body,
diff --git a/tensorflow/python/ops/control_flow_ops_benchmark.py b/tensorflow/python/ops/control_flow_ops_benchmark.py
index 9ba5ff2c0f8af44e8536b49a3c0e7ef6bfae4d28..9dd1e6673b854c3cbc248f0e5a5be4c67d2bd72c 100644
--- a/tensorflow/python/ops/control_flow_ops_benchmark.py
+++ b/tensorflow/python/ops/control_flow_ops_benchmark.py
@@ -27,6 +27,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import control_flow_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import test
@@ -94,28 +95,28 @@ class CondWithManyIntermediatesBenchmark(test.Benchmark):
               iters=self.NUM_ITERS)
 
   def benchmark_cond_v1_defun(self):
-    old_val = control_flow_ops.ENABLE_COND_V2
-    control_flow_ops.ENABLE_COND_V2 = False
+    old_val = control_flow_util.ENABLE_CONTROL_FLOW_V2
+    control_flow_util.ENABLE_CONTROL_FLOW_V2 = False
     self._benchmark_defun()
-    control_flow_ops.ENABLE_COND_V2 = old_val
+    control_flow_util.ENABLE_CONTROL_FLOW_V2 = old_val
 
   def benchmark_cond_v2_defun(self):
-    old_val = control_flow_ops.ENABLE_COND_V2
-    control_flow_ops.ENABLE_COND_V2 = True
+    old_val = control_flow_util.ENABLE_CONTROL_FLOW_V2
+    control_flow_util.ENABLE_CONTROL_FLOW_V2 = True
     self._benchmark_defun()
-    control_flow_ops.ENABLE_COND_V2 = old_val
+    control_flow_util.ENABLE_CONTROL_FLOW_V2 = old_val
 
   def benchmark_cond_v1_graph(self):
-    old_val = control_flow_ops.ENABLE_COND_V2
-    control_flow_ops.ENABLE_COND_V2 = False
+    old_val = control_flow_util.ENABLE_CONTROL_FLOW_V2
+    control_flow_util.ENABLE_CONTROL_FLOW_V2 = False
     self._benchmark_graph()
-    control_flow_ops.ENABLE_COND_V2 = old_val
+    control_flow_util.ENABLE_CONTROL_FLOW_V2 = old_val
 
   def benchmark_cond_v2_graph(self):
-    old_val = control_flow_ops.ENABLE_COND_V2
-    control_flow_ops.ENABLE_COND_V2 = True
+    old_val = control_flow_util.ENABLE_CONTROL_FLOW_V2
+    control_flow_util.ENABLE_CONTROL_FLOW_V2 = True
     self._benchmark_graph()
-    control_flow_ops.ENABLE_COND_V2 = old_val
+    control_flow_util.ENABLE_CONTROL_FLOW_V2 = old_val
 
 if __name__ == "__main__":
   ops.enable_eager_execution()
diff --git a/tensorflow/python/ops/control_flow_ops_test.py b/tensorflow/python/ops/control_flow_ops_test.py
index c020189ad63cb251f849183a521c787de8e63609..f1dd4f529fc37c054a051d69f6aa1bec23c0805e 100644
--- a/tensorflow/python/ops/control_flow_ops_test.py
+++ b/tensorflow/python/ops/control_flow_ops_test.py
@@ -21,9 +21,10 @@ from __future__ import print_function
 import collections
 import numpy as np
 
+from tensorflow.python import tf2
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.framework import node_def_pb2
-from tensorflow.python.client import session
+from tensorflow.python.eager import def_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -126,56 +127,56 @@ class GroupTestCase(test_util.TensorFlowTestCase):
       node { name: "root" op: "NoOp" input: "^a" input: "^b" }
     """, self._StripGraph(gd))
 
+  @test_util.run_deprecated_v1
   def testPassingNonTensors(self):
-    with ops.Graph().as_default():
-      with self.assertRaises(TypeError):
-        control_flow_ops.group(1, 2)
+    with self.assertRaises(TypeError):
+      control_flow_ops.group(1, 2)
 
 
 class ShapeTestCase(test_util.TensorFlowTestCase):
 
   def testShape(self):
-    with ops.Graph().as_default():
-      tensor = constant_op.constant([1.0, 2.0])
-      self.assertEquals([2], tensor.get_shape())
-      self.assertEquals([2],
-                        control_flow_ops.with_dependencies(
-                            [constant_op.constant(1.0)], tensor).get_shape())
+    tensor = constant_op.constant([1.0, 2.0])
+    self.assertEquals([2], tensor.get_shape())
+    self.assertEquals([2],
+                      control_flow_ops.with_dependencies(
+                          [constant_op.constant(1.0)], tensor).get_shape())
 
 
 class WithDependenciesTestCase(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testTupleDependencies(self):
-    with ops.Graph().as_default():
-      counter = variable_scope.get_variable(
-          "my_counter", shape=[], initializer=init_ops.zeros_initializer())
-      increment_counter = state_ops.assign_add(counter, 1)
-      const_with_dep = control_flow_ops.with_dependencies(
-          (increment_counter, constant_op.constant(42)),
-          constant_op.constant(7))
-      with self.cached_session():
-        variables.global_variables_initializer().run()
-        self.assertEquals(0, self.evaluate(counter))
-        self.assertEquals(7, self.evaluate(const_with_dep))
-        self.assertEquals(1, self.evaluate(counter))
-
+    counter = variable_scope.get_variable(
+        "my_counter", shape=[], initializer=init_ops.zeros_initializer())
+    increment_counter = state_ops.assign_add(counter, 1)
+    const_with_dep = control_flow_ops.with_dependencies(
+        (increment_counter, constant_op.constant(42)),
+        constant_op.constant(7))
+
+    self.evaluate(variables.global_variables_initializer())
+    self.assertEquals(0, self.evaluate(counter))
+    self.assertEquals(7, self.evaluate(const_with_dep))
+    self.assertEquals(1, self.evaluate(counter))
+
+  @test_util.run_deprecated_v1
   def testListDependencies(self):
-    with ops.Graph().as_default():
-      counter = variable_scope.get_variable(
-          "my_counter", shape=[], initializer=init_ops.zeros_initializer())
-      increment_counter = state_ops.assign_add(counter, 1)
-      const_with_dep = control_flow_ops.with_dependencies(
-          [increment_counter, constant_op.constant(42)],
-          constant_op.constant(7))
-      with self.cached_session():
-        variables.global_variables_initializer().run()
-        self.assertEquals(0, self.evaluate(counter))
-        self.assertEquals(7, self.evaluate(const_with_dep))
-        self.assertEquals(1, self.evaluate(counter))
+    counter = variable_scope.get_variable(
+        "my_counter", shape=[], initializer=init_ops.zeros_initializer())
+    increment_counter = state_ops.assign_add(counter, 1)
+    const_with_dep = control_flow_ops.with_dependencies(
+        [increment_counter, constant_op.constant(42)],
+        constant_op.constant(7))
+
+    self.evaluate(variables.global_variables_initializer())
+    self.assertEquals(0, self.evaluate(counter))
+    self.assertEquals(7, self.evaluate(const_with_dep))
+    self.assertEquals(1, self.evaluate(counter))
 
 
 class SwitchTestCase(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testIndexedSlicesWithDenseShape(self):
     with self.cached_session():
       data = ops.IndexedSlices(
@@ -189,68 +190,64 @@ class SwitchTestCase(test_util.TensorFlowTestCase):
       self.assertAllEqual([1, 2, 3], switch_true.values.eval())
       self.assertAllEqual([0, 1], switch_true.indices.eval())
 
+  @test_util.run_deprecated_v1
   def testIndexedSlicesGradient(self):
-    with ops.Graph().as_default():
-      embedding_matrix = variable_scope.get_variable(
-          "embedding_matrix", [5, 5],
-          initializer=init_ops.random_normal_initializer())
-
-      def cond(it, _):
-        return it < 5
-
-      def body(it, cost):
-        embedding = embedding_ops.embedding_lookup(embedding_matrix + 0.0, [0])
-        cost += math_ops.reduce_sum(embedding)
-        return it + 1, cost
-
-      _, cost = control_flow_ops.while_loop(
-          cond, body, [constant_op.constant(0),
-                       constant_op.constant(0.0)])
-      optimizer = momentum.MomentumOptimizer(0.1, 0.9)
-      train_op = optimizer.minimize(cost)
-      with self.cached_session() as sess:
-        self.evaluate(variables.global_variables_initializer())
-        for _ in range(10):
-          self.evaluate([train_op])
+    embedding_matrix = variable_scope.get_variable(
+        "embedding_matrix", [5, 5],
+        initializer=init_ops.random_normal_initializer())
+
+    def cond(it, _):
+      return it < 5
+
+    def body(it, cost):
+      embedding = embedding_ops.embedding_lookup(embedding_matrix + 0.0, [0])
+      cost += math_ops.reduce_sum(embedding)
+      return it + 1, cost
+
+    _, cost = control_flow_ops.while_loop(
+        cond, body, [constant_op.constant(0),
+                     constant_op.constant(0.0)])
+    optimizer = momentum.MomentumOptimizer(0.1, 0.9)
+    train_op = optimizer.minimize(cost)
+    with self.cached_session():
+      self.evaluate(variables.global_variables_initializer())
+      for _ in range(10):
+        self.evaluate([train_op])
 
   def testResourceReadInLoop(self):
-    with ops.Graph().as_default():
-      embedding_matrix = variable_scope.get_variable(
-          "embedding_matrix",
-          initializer=[[2.0], [3.0]],
-          use_resource=True)
+    embedding_matrix = variable_scope.get_variable(
+        "embedding_matrix", initializer=[[2.0], [3.0]], use_resource=True)
 
-      def cond(it, _):
-        return it < 5
+    def cond(it, _):
+      return it < 5
 
-      def body(it, cost):
-        embedding = embedding_ops.embedding_lookup(embedding_matrix, [0])
-        cost += math_ops.reduce_sum(embedding)
-        return it + 1, cost
+    def body(it, cost):
+      embedding = embedding_ops.embedding_lookup(embedding_matrix, [0])
+      cost += math_ops.reduce_sum(embedding)
+      return it + 1, cost
 
-      _, cost = control_flow_ops.while_loop(
-          cond, body, [constant_op.constant(0),
-                       constant_op.constant(0.0)])
-      with self.cached_session() as sess:
-        self.evaluate(variables.global_variables_initializer())
-        self.assertAllEqual(10.0, self.evaluate(cost))
+    _, cost = control_flow_ops.while_loop(
+        cond, body, [constant_op.constant(0),
+                     constant_op.constant(0.0)])
+    with self.cached_session():
+      self.evaluate(variables.global_variables_initializer())
+      self.assertAllEqual(10.0, self.evaluate(cost))
 
   def doTestIndexedSlicesGradientInCondInWhileLoop(self, use_resource=False):
-    with ops.Graph().as_default():
-      embedding_matrix = variable_scope.get_variable(
-          "embedding_matrix", [5, 5],
-          initializer=init_ops.random_normal_initializer(),
-          use_resource=use_resource)
-
-      def cond(it, _):
-        return it < 5
-
-      def body(it, cost):
-        embedding = embedding_ops.embedding_lookup(embedding_matrix, [0])
-        cost = control_flow_ops.cond(
-            math_ops.equal(it, 3), lambda: math_ops.square(cost),
-            lambda: cost + math_ops.reduce_sum(embedding))
-        return it + 1, cost
+    embedding_matrix = variable_scope.get_variable(
+        "embedding_matrix", [5, 5],
+        initializer=init_ops.random_normal_initializer(),
+        use_resource=use_resource)
+
+    def cond(it, _):
+      return it < 5
+
+    def body(it, cost):
+      embedding = embedding_ops.embedding_lookup(embedding_matrix, [0])
+      cost = control_flow_ops.cond(
+          math_ops.equal(it, 3), lambda: math_ops.square(cost),
+          (lambda: cost + math_ops.reduce_sum(embedding)))
+      return it + 1, cost
 
       _, cost = control_flow_ops.while_loop(
           cond, body, [constant_op.constant(0),
@@ -268,7 +265,7 @@ class SwitchTestCase(test_util.TensorFlowTestCase):
       static_grads = math_ops.segment_sum(static_grads.values,
                                           static_grads.indices)
 
-      with self.cached_session() as sess:
+      with self.cached_session():
         self.evaluate(variables.global_variables_initializer())
         self.assertAllEqual(*self.evaluate([static_grads, dynamic_grads]))
 
@@ -278,6 +275,7 @@ class SwitchTestCase(test_util.TensorFlowTestCase):
   def testIndexedSlicesGradientInCondInWhileLoopResource(self):
     self.doTestIndexedSlicesGradientInCondInWhileLoop(use_resource=True)
 
+  @test_util.run_v1_only("b/120545219")
   def testIndexedSlicesWithShapeGradientInWhileLoop(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       with self.cached_session() as sess:
@@ -307,6 +305,7 @@ class SwitchTestCase(test_util.TensorFlowTestCase):
         self.assertEquals(o, 20)
         self.assertAllEqual(grad, [1] * num_steps)
 
+  @test_util.run_v1_only("b/120545219")
   def testIndexedSlicesWithDynamicShapeGradientInWhileLoop(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       with self.cached_session() as sess:
@@ -334,105 +333,94 @@ class SwitchTestCase(test_util.TensorFlowTestCase):
         self.assertEquals(o, 6)
         self.assertAllEqual(grad, [1] * 3)
 
+  @test_util.run_deprecated_v1
   def testGradientThroughSingleBranchOutsideOfContext(self):
-    with self.cached_session():
-      x = constant_op.constant(2.)
-      s = constant_op.constant(True)
-      x_false, x_true = control_flow_ops.switch(x, s)
-      grad_x_true = gradients_impl.gradients(x_true, x)[0]
-      grad_x_false = gradients_impl.gradients(x_false, x)[0]
-      self.assertEquals(grad_x_true.eval(), 1.)
-      self.assertEquals(grad_x_false.eval(), 0.)
+    x = constant_op.constant(2.)
+    s = constant_op.constant(True)
+    x_false, x_true = control_flow_ops.switch(x, s)
+    grad_x_true = gradients_impl.gradients(x_true, x)[0]
+    grad_x_false = gradients_impl.gradients(x_false, x)[0]
+    self.assertEquals(self.evaluate(grad_x_true), 1.)
+    self.assertEquals(self.evaluate(grad_x_false), 0.)
 
 
 class CondTest(test_util.TensorFlowTestCase):
 
   def testCondTrue(self):
-    with ops.Graph().as_default():
-      with session.Session():
-        x = constant_op.constant(2)
-        y = constant_op.constant(5)
-        z = control_flow_ops.cond(
-            math_ops.less(x, y), lambda: math_ops.multiply(x, 17),
-            lambda: math_ops.add(y, 23))
-        self.assertEquals(z.eval(), 34)
+    x = constant_op.constant(2)
+    y = constant_op.constant(5)
+    z = control_flow_ops.cond(
+        math_ops.less(
+            x,
+            y), lambda: math_ops.multiply(x, 17), lambda: math_ops.add(y, 23))
+    self.assertEquals(self.evaluate(z), 34)
 
   def testCondFalse(self):
-    with ops.Graph().as_default():
-      with session.Session():
-        x = constant_op.constant(2)
-        y = constant_op.constant(1)
-        z = control_flow_ops.cond(
-            math_ops.less(x, y), lambda: math_ops.multiply(x, 17),
-            lambda: math_ops.add(y, 23))
-        self.assertEquals(z.eval(), 24)
+    x = constant_op.constant(2)
+    y = constant_op.constant(1)
+    z = control_flow_ops.cond(
+        math_ops.less(
+            x,
+            y), lambda: math_ops.multiply(x, 17), lambda: math_ops.add(y, 23))
+    self.assertEquals(self.evaluate(z), 24)
 
   def testCondTrueLegacy(self):
-    with ops.Graph().as_default():
-      with session.Session():
-        x = constant_op.constant(2)
-        y = constant_op.constant(5)
-        z = control_flow_ops.cond(
-            math_ops.less(x, y), fn1=lambda: math_ops.multiply(x, 17),
-            fn2=lambda: math_ops.add(y, 23))
-        self.assertEquals(z.eval(), 34)
+    x = constant_op.constant(2)
+    y = constant_op.constant(5)
+    z = control_flow_ops.cond(
+        math_ops.less(x, y),
+        fn1=lambda: math_ops.multiply(x, 17),
+        fn2=lambda: math_ops.add(y, 23))
+    self.assertEquals(self.evaluate(z), 34)
 
   def testCondFalseLegacy(self):
-    with ops.Graph().as_default():
-      with session.Session():
-        x = constant_op.constant(2)
-        y = constant_op.constant(1)
-        z = control_flow_ops.cond(
-            math_ops.less(x, y), fn1=lambda: math_ops.multiply(x, 17),
-            fn2=lambda: math_ops.add(y, 23))
-        self.assertEquals(z.eval(), 24)
-
+    x = constant_op.constant(2)
+    y = constant_op.constant(1)
+    z = control_flow_ops.cond(
+        math_ops.less(x, y),
+        fn1=lambda: math_ops.multiply(x, 17),
+        fn2=lambda: math_ops.add(y, 23))
+    self.assertEquals(self.evaluate(z), 24)
+
+  @test_util.run_deprecated_v1
   def testCondModifyBoolPred(self):
     # This test in particular used to fail only when running in GPU, hence
     # use_gpu=True.
-    with ops.Graph().as_default():
-      with session.Session() as sess:
-        bool_var = variable_scope.get_variable("bool_var", dtype=dtypes.bool,
-                                               initializer=True)
-        cond_on_bool_var = control_flow_ops.cond(
-            pred=bool_var,
-            true_fn=lambda: state_ops.assign(bool_var, False),
-            false_fn=lambda: True)
-        self.evaluate(bool_var.initializer)
-        self.assertEquals(self.evaluate(cond_on_bool_var), False)
-        self.assertEquals(self.evaluate(cond_on_bool_var), True)
+    with test_util.use_gpu():
+      bool_var = variable_scope.get_variable(
+          "bool_var", dtype=dtypes.bool, initializer=True)
+      cond_on_bool_var = control_flow_ops.cond(
+          pred=bool_var,
+          true_fn=lambda: state_ops.assign(bool_var, False),
+          false_fn=lambda: True)
+      self.evaluate(bool_var.initializer)
+      self.assertEquals(self.evaluate(cond_on_bool_var), False)
+      self.assertEquals(self.evaluate(cond_on_bool_var), True)
 
   def testCondMissingArg1(self):
-    with ops.Graph().as_default():
-      with session.Session():
-        x = constant_op.constant(1)
-        with self.assertRaises(TypeError):
-          control_flow_ops.cond(True, false_fn=lambda: x)
+    x = constant_op.constant(1)
+    with self.assertRaises(TypeError):
+      control_flow_ops.cond(True, false_fn=lambda: x)
 
   def testCondMissingArg2(self):
-    with ops.Graph().as_default():
-      with session.Session():
-        x = constant_op.constant(1)
-        with self.assertRaises(TypeError):
-          control_flow_ops.cond(True, lambda: x)
+    x = constant_op.constant(1)
+    with self.assertRaises(TypeError):
+      control_flow_ops.cond(True, lambda: x)
 
   def testCondDuplicateArg1(self):
-    with ops.Graph().as_default():
-      with session.Session():
-        x = constant_op.constant(1)
-        with self.assertRaises(TypeError):
-          control_flow_ops.cond(True, lambda: x, lambda: x, fn1=lambda: x)
+    x = constant_op.constant(1)
+    with self.assertRaises(TypeError):
+      control_flow_ops.cond(True, lambda: x, lambda: x, fn1=lambda: x)
 
   def testCondDuplicateArg2(self):
-    with ops.Graph().as_default():
-      with session.Session():
-        x = constant_op.constant(1)
-        with self.assertRaises(TypeError):
-          control_flow_ops.cond(True, lambda: x, lambda: x, fn2=lambda: x)
+    x = constant_op.constant(1)
+    with self.assertRaises(TypeError):
+      control_flow_ops.cond(True, lambda: x, lambda: x, fn2=lambda: x)
 
 
 class ContextTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testCondContext(self):
     with self.cached_session() as sess:
       x = constant_op.constant(2)
@@ -462,12 +450,15 @@ class ContextTest(test_util.TensorFlowTestCase):
               control_flow_ops.WhileContext.from_proto(
                   control_flow_context.to_proto()).to_proto())
 
+  @test_util.run_deprecated_v1
   def testWhileContext(self):
     self._testWhileContextHelper()
 
+  @test_util.run_deprecated_v1
   def testWhileContextWithMaximumIterations(self):
     self._testWhileContextHelper(maximum_iterations=10)
 
+  @test_util.run_deprecated_v1
   def testControlContextImportScope(self):
     class NoABCControlFlowContext(control_flow_ops.ControlFlowContext):
       """A noop wrapper around `ControlFlowContext`.
@@ -574,7 +565,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
                                         strict=strict)
 
     with self.cached_session() as sess:
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       true_feed_dict = {condition: True}
       true_feed_dict.update(feed_dict)
       result_cond, result_case = sess.run([output_cond, output_case],
@@ -590,6 +581,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
       if check_cond:
         self.assertAllEqualNested(result_case, expected_value_false)
 
+  @test_util.run_deprecated_v1
   def test_int(self):
     shape = tensor_shape.TensorShape([])
     fn_true = lambda: 1
@@ -599,6 +591,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
     self._testShape(fn_true, fn_false, shape, strict=True)
     self._testReturnValues(fn_true, fn_false, 1, 2, strict=True)
 
+  @test_util.run_deprecated_v1
   def test_float(self):
     shape = tensor_shape.TensorShape([])
     fn_true = lambda: 1.0
@@ -606,12 +599,14 @@ class DataTypesTest(test_util.TensorFlowTestCase):
     self._testShape(fn_true, fn_false, shape)
     self._testReturnValues(fn_true, fn_false, 1.0, 2.0)
 
+  @test_util.run_deprecated_v1
   def test_noop(self):
     shape = tensor_shape.TensorShape(None)
     self._testShape(control_flow_ops.no_op, control_flow_ops.no_op, shape)
     self._testReturnValues(control_flow_ops.no_op, control_flow_ops.no_op,
                            True, False, check_cond=False)
 
+  @test_util.run_deprecated_v1
   def test_string(self):
     shape = tensor_shape.TensorShape([])
     fn_true = lambda: "abc"
@@ -619,6 +614,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
     self._testShape(fn_true, fn_false, shape)
     self._testReturnValues(fn_true, fn_false, b"abc", b"xyz")
 
+  @test_util.run_deprecated_v1
   def test_variable(self):
     shape = tensor_shape.TensorShape([])
     fn_true = lambda: variables.Variable(3.0)
@@ -626,6 +622,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
     self._testShape(fn_true, fn_false, shape)
     self._testReturnValues(fn_true, fn_false, 3.0, 4.0)
 
+  @test_util.run_v1_only("b/120553181")
   def test_none(self):
     fn_none = lambda: None
     fn_tensor = lambda: constant_op.constant(1)
@@ -636,6 +633,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
     with self.assertRaises(ValueError):
       control_flow_ops.cond(constant_op.constant(True), fn_tensor, fn_none)
 
+  @test_util.run_deprecated_v1
   def test_tensors(self):
 
     def _build_true_branch(dtype):
@@ -664,6 +662,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
                              (np.zeros([2, 2]), np.ones([3, 3])),
                              (np.ones([2, 2]), np.zeros([3, 3])))
 
+  @test_util.run_deprecated_v1
   def test_tensors_unknown_shape(self):
 
     def _build_true_branch(dtype):
@@ -692,6 +691,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
                              feed_dict={true_tensor: np.zeros([2, 2]),
                                         false_tensor: np.ones([2, 2])})
 
+  @test_util.run_deprecated_v1
   def test_sparse_tensors(self):
     shape = tensor_shape.TensorShape([None, None])
 
@@ -707,11 +707,14 @@ class DataTypesTest(test_util.TensorFlowTestCase):
                                              values=[1, 2], dense_shape=[3, 4])
     value2 = sparse_tensor.SparseTensorValue(indices=[[0, 0], [2, 1]],
                                              values=[3, 4], dense_shape=[3, 4])
-    self._testShape(true_fn, false_fn, shape)
-    self._testReturnValues(true_fn, false_fn, value1, value2)
+    # Non-strict cond is only available in v1
+    if not tf2.enabled():
+      self._testShape(true_fn, false_fn, shape)
+      self._testReturnValues(true_fn, false_fn, value1, value2)
     self._testShape(true_fn, false_fn, [shape], strict=True)
     self._testReturnValues(true_fn, false_fn, [value1], [value2], strict=True)
 
+  @test_util.run_deprecated_v1
   def test_tensors_with_partially_specified_shapes(self):
 
     def _build_branch(dtype, shape):
@@ -741,6 +744,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
                                         true_tensors[2]: np.ones([3, 3]),
                                         false_tensors[2]: np.ones([3, 3])})
 
+  @test_util.run_deprecated_v1
   def test_tensor_arrays(self):
     element_shape = tensor_shape.TensorShape([2])
     ta1 = _create_tensor_array(4, element_shape)
@@ -750,6 +754,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
     fn_false = lambda: ta2
     self._testShape(fn_true, fn_false, shape)
 
+  @test_util.run_deprecated_v1
   def test_tensor_array_reads(self):
     shape = tensor_shape.TensorShape([2])
     ta = _create_tensor_array(4, shape)
@@ -757,6 +762,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
     fn_false = lambda: ta.read(1)
     self._testShape(fn_true, fn_false, shape)
 
+  @test_util.run_deprecated_v1
   def test_list(self):
     shape = [tensor_shape.TensorShape([]), tensor_shape.TensorShape([]),
              tensor_shape.TensorShape([])]
@@ -765,6 +771,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
     self._testShape(fn_true, fn_false, shape)
     self._testReturnValues(fn_true, fn_false, [1, 2, 3.0], [3, 4, 5.0])
 
+  @test_util.run_v1_only("Non-strict cond is only available in v1")
   def test_non_strict(self):
     shape = tensor_shape.TensorShape([])
     fn_tensor = lambda: constant_op.constant(1)
@@ -777,6 +784,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
     self._testReturnValues(fn_tensor, fn_tuple, 1, 3)
     self._testReturnValues(fn_list, fn_tuple, 2, 3)
 
+  @test_util.run_v1_only("b/120553181")
   def test_singleton_strict(self):
     fn_tensor = lambda: constant_op.constant(1)
     fn_list = lambda: [constant_op.constant(2)]
@@ -798,36 +806,46 @@ class DataTypesTest(test_util.TensorFlowTestCase):
       control_flow_ops.case([(constant_op.constant(True), fn_list)], fn_tuple,
                             strict=True)
 
+  @test_util.run_deprecated_v1
   def test_singleton_list(self):
     shape = tensor_shape.TensorShape([])
     fn_true = lambda: [constant_op.constant(1)]
     fn_false = lambda: [constant_op.constant(3)]
-    self._testShape(fn_true, fn_false, shape)
-    self._testReturnValues(fn_true, fn_false, 1, 3)
+    # Non-strict cond is only available in v1
+    if not tf2.enabled():
+      self._testShape(fn_true, fn_false, shape)
+      self._testReturnValues(fn_true, fn_false, 1, 3)
     self._testShape(fn_true, fn_false, [shape], strict=True)
     self._testReturnValues(fn_true, fn_false, [1], [3], strict=True)
 
+  @test_util.run_deprecated_v1
   def test_singleton_tuple(self):
     shape = tensor_shape.TensorShape([])
     fn_true = lambda: (constant_op.constant(1),)
     fn_false = lambda: (constant_op.constant(3),)
-    self._testShape(fn_true, fn_false, shape)
-    self._testReturnValues(fn_true, fn_false, 1, 3)
+    # Non-strict cond is only available in v1
+    if not tf2.enabled():
+      self._testShape(fn_true, fn_false, shape)
+      self._testReturnValues(fn_true, fn_false, 1, 3)
     self._testShape(fn_true, fn_false, (shape,), strict=True)
     self._testReturnValues(fn_true, fn_false, (1,), (3,),
                            strict=True)
 
+  @test_util.run_deprecated_v1
   def test_singleton_namedtuple(self):
     shape = tensor_shape.TensorShape([])
     fn_true = lambda: SingletonTestTuple(constant_op.constant(1))
     fn_false = lambda: SingletonTestTuple(constant_op.constant(3))
-    self._testShape(fn_true, fn_false, shape)
-    self._testReturnValues(fn_true, fn_false, 1, 3)
+    # Non-strict cond is only available in v1
+    if not tf2.enabled():
+      self._testShape(fn_true, fn_false, shape)
+      self._testReturnValues(fn_true, fn_false, 1, 3)
     self._testShape(fn_true, fn_false, SingletonTestTuple(shape),
                     strict=True)
     self._testReturnValues(fn_true, fn_false, SingletonTestTuple(1),
                            SingletonTestTuple(3), strict=True)
 
+  @test_util.run_deprecated_v1
   def test_tuple(self):
     shape = (tensor_shape.TensorShape([]), tensor_shape.TensorShape([]))
     fn_true = lambda: (constant_op.constant(1), 2)
@@ -835,6 +853,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
     self._testShape(fn_true, fn_false, shape)
     self._testReturnValues(fn_true, fn_false, (1, 2), (3, 4))
 
+  @test_util.run_deprecated_v1
   def test_namedtuple(self):
     shape = TestTuple(tensor_shape.TensorShape([]),
                       tensor_shape.TensorShape([]))
@@ -843,6 +862,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
     self._testShape(fn_true, fn_false, shape)
     self._testReturnValues(fn_true, fn_false, TestTuple(1, 2), TestTuple(3, 4))
 
+  @test_util.run_deprecated_v1
   def test_nested(self):
     shape = [tensor_shape.TensorShape([]),
              TestTuple(tensor_shape.TensorShape([]),
@@ -868,6 +888,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
         [11, TestTuple(12, [13, 14]),
          np.ones([5, 5]), 16])
 
+  @test_util.run_deprecated_v1
   def test_cond_inside_while_loop(self):
 
     def body(i, matrix):
@@ -889,6 +910,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
 
 class CaseTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testCase_withDefault(self):
     x = array_ops.placeholder(dtype=dtypes.int32, shape=[])
     conditions = [(math_ops.equal(x, 1), lambda: constant_op.constant(2)),
@@ -900,6 +922,7 @@ class CaseTest(test_util.TensorFlowTestCase):
       self.assertEqual(sess.run(output, feed_dict={x: 2}), 4)
       self.assertEqual(sess.run(output, feed_dict={x: 3}), 6)
 
+  @test_util.run_deprecated_v1
   def testCase_multiple_matches_exclusive(self):
     x = array_ops.placeholder(dtype=dtypes.int32, shape=[])
     conditions = [(math_ops.equal(x, 1), lambda: constant_op.constant(2)),
@@ -913,6 +936,7 @@ class CaseTest(test_util.TensorFlowTestCase):
       with self.assertRaisesRegexp(errors.InvalidArgumentError, "Input error:"):
         sess.run(output, feed_dict={x: 2})
 
+  @test_util.run_deprecated_v1
   def testCase_multiple_matches_non_exclusive(self):
     x = array_ops.placeholder(dtype=dtypes.int32, shape=[])
     conditions = [(math_ops.equal(x, 1), lambda: constant_op.constant(2)),
@@ -925,6 +949,7 @@ class CaseTest(test_util.TensorFlowTestCase):
       self.assertEqual(sess.run(output, feed_dict={x: 2}), 4)
       self.assertEqual(sess.run(output, feed_dict={x: 3}), 8)
 
+  @test_util.run_deprecated_v1
   def testCase_withoutDefault(self):
     x = array_ops.placeholder(dtype=dtypes.int32, shape=[])
     conditions = [(math_ops.equal(x, 1), lambda: constant_op.constant(2)),
@@ -938,6 +963,7 @@ class CaseTest(test_util.TensorFlowTestCase):
       with self.assertRaisesRegexp(errors.InvalidArgumentError, "Input error:"):
         sess.run(output, feed_dict={x: 4})
 
+  @test_util.run_deprecated_v1
   def testCase_withoutDefault_oneCondition(self):
     x = array_ops.placeholder(dtype=dtypes.int32, shape=[])
     conditions = [(math_ops.equal(x, 1), lambda: constant_op.constant(2))]
@@ -979,6 +1005,7 @@ class WhileLoopTestCase(test_util.TensorFlowTestCase):
     # Expect a tuple since that is what the body returns.
     self.assertEqual(self.evaluate(r), (10,))
 
+  @test_util.run_deprecated_v1
   def testWhileLoopSameReturnShape_False(self):
     i = constant_op.constant(0)
     c = lambda i, _: math_ops.less(i, 10)
@@ -1004,6 +1031,7 @@ class WhileLoopTestCase(test_util.TensorFlowTestCase):
 
 class AssertTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testAssert(self):
     i = constant_op.constant(0)
     c = control_flow_ops.Assert(i < 10, [i, [10], [i + 1]])
@@ -1014,6 +1042,18 @@ class AssertTest(test_util.TensorFlowTestCase):
     with self.assertRaises(errors.InvalidArgumentError):
       self.evaluate(c)
 
+  @test_util.run_in_graph_and_eager_modes
+  def testAssertInFunction(self):
+
+    @def_function.function
+    def whiny(value):
+      control_flow_ops.Assert(value, ["Raised false"])
+      return constant_op.constant(5)
+
+    with self.assertRaises(errors.InvalidArgumentError):
+      self.evaluate(whiny(False))
+
+    self.assertAllEqual(whiny(True), 5)
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/ops/control_flow_util.py b/tensorflow/python/ops/control_flow_util.py
index cb628f4aa6441ec9cb03dfe873a79d06a66e37a1..1747f06109daa1e7092fd1bbbcd2e2cc5762fc6c 100644
--- a/tensorflow/python/ops/control_flow_util.py
+++ b/tensorflow/python/ops/control_flow_util.py
@@ -23,10 +23,18 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
 import traceback
 
+from tensorflow.python import tf2
 from tensorflow.python.platform import tf_logging as logging
 
+ENABLE_CONTROL_FLOW_V2 = (tf2.enabled() or
+                          os.getenv("TF_ENABLE_CONTROL_FLOW_V2", "0") != "0" or
+                          os.getenv("TF_ENABLE_COND_V2", "0") != "0" or
+                          os.getenv("TF_ENABLE_WHILE_V2", "0") != "0" or
+                          os.getenv("TF_ENABLE_TENSOR_ARRAY_V2", "0") != "0")
+
 
 def IsInXLAContext(op):
   try:
diff --git a/tensorflow/python/ops/ctc_ops.py b/tensorflow/python/ops/ctc_ops.py
index 3a7eb9355a66a213d3d60f103b818ef22fd839bd..45286f7c188a3e891b5bf3f332f546bed627e102 100644
--- a/tensorflow/python/ops/ctc_ops.py
+++ b/tensorflow/python/ops/ctc_ops.py
@@ -24,6 +24,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
 
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import functional_ops
@@ -1029,7 +1030,7 @@ def _scan(fn, elems, initial, reverse=False, inclusive=False, final_only=False):
   for the forward backward use case.
 
   Examples:
-    scan(lambda a, e: a + e, [1.0, 2.0, 3.0], 1.0) => [2.0, 3.0, 4.0]
+    scan(lambda a, e: a + e, [1.0, 2.0, 3.0], 1.0) => [2.0, 4.0, 7.0]
 
     Multiple accumulators:
       scan(lambda a, e: (a[0] + e, a[1] * e), [1.0, 2.0, 3.0], (0.0, 1.0))
@@ -1127,4 +1128,5 @@ def _scan(fn, elems, initial, reverse=False, inclusive=False, final_only=False):
 
 def _get_dim(tensor, i):
   """Get value of tensor shape[i] preferring static value if available."""
-  return tensor.shape[i].value or array_ops.shape(tensor)[i]
+  return tensor_shape.dimension_value(
+      tensor.shape[i]) or array_ops.shape(tensor)[i]
diff --git a/tensorflow/python/ops/custom_gradient.py b/tensorflow/python/ops/custom_gradient.py
index 1426e8851c5f2a379c750f34d34f60fe0674cdf8..d96601ac21c7d7d62423b65a2e43d08449e23129 100644
--- a/tensorflow/python/ops/custom_gradient.py
+++ b/tensorflow/python/ops/custom_gradient.py
@@ -236,6 +236,10 @@ def _graph_mode_decorator(f, *args, **kwargs):
   original_tensors = all_tensors
   with ops.get_default_graph().gradient_override_map({"IdentityN": name}):
     all_tensors = array_ops.identity_n(all_tensors)
+  # Propagate handle data for happier shape inference for resource variables.
+  for i, t in enumerate(original_tensors):
+    if t.dtype == dtypes.resource and hasattr(t, "_handle_data"):
+      all_tensors[i]._handle_data = t._handle_data  # pylint: disable=protected-access
   tape_lib.record_operation(
       f.__name__, all_tensors, original_tensors, tape_grad_fn)
   for ot, t in zip(original_tensors, all_tensors):
diff --git a/tensorflow/python/ops/data_flow_ops.py b/tensorflow/python/ops/data_flow_ops.py
index 2030332e4eaec8574010217d26ef6ac52dd988d5..1557bdf0eda90c26a97ce83239190dd6f9023a58 100644
--- a/tensorflow/python/ops/data_flow_ops.py
+++ b/tensorflow/python/ops/data_flow_ops.py
@@ -113,8 +113,9 @@ def _shape_common(s1, s2):
 
 
 # pylint: disable=protected-access
-@tf_export("io.QueueBase", v1=["io.QueueBase", "QueueBase"])
-@deprecation.deprecated_endpoints("QueueBase")
+@tf_export("queue.QueueBase",
+           v1=["queue.QueueBase", "io.QueueBase", "QueueBase"])
+@deprecation.deprecated_endpoints(["io.QueueBase", "QueueBase"])
 class QueueBase(object):
   """Base class for queue implementations.
 
@@ -616,8 +617,11 @@ def _shared_name(shared_name):
 
 
 @tf_export(
-    "io.RandomShuffleQueue", v1=["io.RandomShuffleQueue", "RandomShuffleQueue"])
-@deprecation.deprecated_endpoints("RandomShuffleQueue")
+    "queue.RandomShuffleQueue",
+    v1=["queue.RandomShuffleQueue",
+        "io.RandomShuffleQueue", "RandomShuffleQueue"])
+@deprecation.deprecated_endpoints(
+    ["io.RandomShuffleQueue", "RandomShuffleQueue"])
 class RandomShuffleQueue(QueueBase):
   """A queue implementation that dequeues elements in a random order.
 
@@ -702,7 +706,8 @@ class RandomShuffleQueue(QueueBase):
     super(RandomShuffleQueue, self).__init__(dtypes, shapes, names, queue_ref)
 
 
-@tf_export("FIFOQueue")
+@tf_export("queue.FIFOQueue", v1=["queue.FIFOQueue", "FIFOQueue"])
+@deprecation.deprecated_endpoints("FIFOQueue")
 class FIFOQueue(QueueBase):
   """A queue implementation that dequeues elements in first-in first-out order.
 
@@ -760,8 +765,9 @@ class FIFOQueue(QueueBase):
 
 
 @tf_export(
-    "io.PaddingFIFOQueue", v1=["io.PaddingFIFOQueue", "PaddingFIFOQueue"])
-@deprecation.deprecated_endpoints("PaddingFIFOQueue")
+    "queue.PaddingFIFOQueue",
+    v1=["queue.PaddingFIFOQueue", "io.PaddingFIFOQueue", "PaddingFIFOQueue"])
+@deprecation.deprecated_endpoints(["io.PaddingFIFOQueue", "PaddingFIFOQueue"])
 class PaddingFIFOQueue(QueueBase):
   """A FIFOQueue that supports batching variable-sized tensors by padding.
 
@@ -835,8 +841,9 @@ class PaddingFIFOQueue(QueueBase):
     super(PaddingFIFOQueue, self).__init__(dtypes, shapes, names, queue_ref)
 
 
-@tf_export("io.PriorityQueue", v1=["io.PriorityQueue", "PriorityQueue"])
-@deprecation.deprecated_endpoints("PriorityQueue")
+@tf_export("queue.PriorityQueue",
+           v1=["queue.PriorityQueue", "io.PriorityQueue", "PriorityQueue"])
+@deprecation.deprecated_endpoints(["io.PriorityQueue", "PriorityQueue"])
 class PriorityQueue(QueueBase):
   """A queue implementation that dequeues elements in prioritized order.
 
diff --git a/tensorflow/python/ops/embedding_ops.py b/tensorflow/python/ops/embedding_ops.py
index 9ce024ad9653e11be6410d8becf0d2c469bc018c..d0291e2095bdb6574c707c7458e4cc335fc4b825 100644
--- a/tensorflow/python/ops/embedding_ops.py
+++ b/tensorflow/python/ops/embedding_ops.py
@@ -247,7 +247,7 @@ def _embedding_lookup_and_transform(params,
       return ret
 
 
-@tf_export("nn.embedding_lookup")
+@tf_export(v1=["nn.embedding_lookup"])
 def embedding_lookup(
     params,
     ids,
@@ -316,7 +316,66 @@ def embedding_lookup(
       transform_fn=None)
 
 
-@tf_export("nn.embedding_lookup_sparse")
+@tf_export("nn.embedding_lookup", v1=[])
+def embedding_lookup_v2(
+    params,
+    ids,
+    partition_strategy="mod",
+    max_norm=None,
+    name=None):
+  """Looks up `ids` in a list of embedding tensors.
+
+  This function is used to perform parallel lookups on the list of
+  tensors in `params`.  It is a generalization of
+  `tf.gather`, where `params` is
+  interpreted as a partitioning of a large embedding tensor.  `params` may be
+  a `PartitionedVariable` as returned by using `tf.get_variable()` with a
+  partitioner.
+
+  If `len(params) > 1`, each element `id` of `ids` is partitioned between
+  the elements of `params` according to the `partition_strategy`.
+  In all strategies, if the id space does not evenly divide the number of
+  partitions, each of the first `(max_id + 1) % len(params)` partitions will
+  be assigned one more id.
+
+  If `partition_strategy` is `"mod"`, we assign each id to partition
+  `p = id % len(params)`. For instance,
+  13 ids are split across 5 partitions as:
+  `[[0, 5, 10], [1, 6, 11], [2, 7, 12], [3, 8], [4, 9]]`
+
+  If `partition_strategy` is `"div"`, we assign ids to partitions in a
+  contiguous manner. In this case, 13 ids are split across 5 partitions as:
+  `[[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10], [11, 12]]`
+
+  The results of the lookup are concatenated into a dense
+  tensor. The returned tensor has shape `shape(ids) + shape(params)[1:]`.
+
+  Args:
+    params: A single tensor representing the complete embedding tensor,
+      or a list of P tensors all of same shape except for the first dimension,
+      representing sharded embedding tensors.  Alternatively, a
+      `PartitionedVariable`, created by partitioning along dimension 0. Each
+      element must be appropriately sized for the given `partition_strategy`.
+    ids: A `Tensor` with type `int32` or `int64` containing the ids to be looked
+      up in `params`.
+    partition_strategy: A string specifying the partitioning strategy, relevant
+      if `len(params) > 1`. Currently `"div"` and `"mod"` are supported. Default
+      is `"mod"`.
+    max_norm: If not `None`, each embedding is clipped if its l2-norm is
+      larger than this value.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor` with the same type as the tensors in `params`.
+
+  Raises:
+    ValueError: If `params` is empty.
+  """
+  return embedding_lookup(params, ids, partition_strategy, name,
+                          max_norm=max_norm)
+
+
+@tf_export(v1=["nn.embedding_lookup_sparse"])
 def embedding_lookup_sparse(params,
                             sp_ids,
                             sp_weights,
@@ -491,7 +550,85 @@ def embedding_lookup_sparse(params,
     return embeddings
 
 
-@tf_export("nn.safe_embedding_lookup_sparse")
+@tf_export("nn.embedding_lookup_sparse", v1=[])
+def embedding_lookup_sparse_v2(params,
+                               sp_ids,
+                               sp_weights,
+                               partition_strategy="mod",
+                               combiner=None,
+                               max_norm=None,
+                               name=None):
+  return embedding_lookup_sparse_v2(
+      params, sp_ids, sp_weights, partition_strategy, name, combiner, max_norm)
+
+
+embedding_lookup_sparse_v2.__doc__ = embedding_lookup_sparse.__doc__
+
+
+@tf_export("nn.safe_embedding_lookup_sparse", v1=[])
+def safe_embedding_lookup_sparse_v2(embedding_weights,
+                                    sparse_ids,
+                                    sparse_weights=None,
+                                    combiner="mean",
+                                    default_id=None,
+                                    max_norm=None,
+                                    name=None):
+  """Lookup embedding results, accounting for invalid IDs and empty features.
+
+  The partitioned embedding in `embedding_weights` must all be the same shape
+  except for the first dimension. The first dimension is allowed to vary as the
+  vocabulary size is not necessarily a multiple of `P`.  `embedding_weights`
+  may be a `PartitionedVariable` as returned by using `tf.get_variable()` with a
+  partitioner.
+
+  Invalid IDs (< 0) are pruned from input IDs and weights, as well as any IDs
+  with non-positive weight. For an entry with no features, the embedding vector
+  for `default_id` is returned, or the 0-vector if `default_id` is not supplied.
+
+  The ids and weights may be multi-dimensional. Embeddings are always aggregated
+  along the last dimension.
+
+  Note: when doing embedding lookup on `embedding_weights`, "div" partition
+  strategy will be used. Support for other partition strategy will be added
+  later.
+
+  Args:
+    embedding_weights:  A list of `P` float `Tensor`s or values representing
+      partitioned embedding `Tensor`s.  Alternatively, a `PartitionedVariable`
+      created by partitioning along dimension 0.  The total unpartitioned shape
+      should be `[e_0, e_1, ..., e_m]`, where `e_0` represents the vocab size
+      and `e_1, ..., e_m` are the embedding dimensions.
+    sparse_ids: `SparseTensor` of shape `[d_0, d_1, ..., d_n]` containing the
+      ids. `d_0` is typically batch size.
+    sparse_weights: `SparseTensor` of same shape as `sparse_ids`, containing
+      float weights corresponding to `sparse_ids`, or `None` if all weights are
+      be assumed to be 1.0.
+    combiner: A string specifying how to combine embedding results for each
+      entry. Currently "mean", "sqrtn" and "sum" are supported, with "mean" the
+      default.
+    default_id: The id to use for an entry with no features.
+    max_norm: If not `None`, all embeddings are l2-normalized to max_norm before
+      combining.
+    name: A name for this operation (optional).
+
+  Returns:
+    Dense `Tensor` of shape `[d_0, d_1, ..., d_{n-1}, e_1, ..., e_m]`.
+
+  Raises:
+    ValueError: if `embedding_weights` is empty.
+  """
+  return safe_embedding_lookup_sparse(
+      embedding_weights,
+      sparse_ids,
+      sparse_weights=sparse_weights,
+      combiner=combiner,
+      default_id=default_id,
+      name=name,
+      partition_strategy="div",
+      max_norm=max_norm)
+
+
+@tf_export(v1=["nn.safe_embedding_lookup_sparse"])
 def safe_embedding_lookup_sparse(embedding_weights,
                                  sparse_ids,
                                  sparse_weights=None,
@@ -554,7 +691,10 @@ def safe_embedding_lookup_sparse(embedding_weights,
 
   dtype = sparse_weights.dtype if sparse_weights is not None else None
   embedding_weights = [
-      ops.convert_to_tensor(w, dtype=dtype) for w in embedding_weights
+      w if (isinstance(w, resource_variable_ops.ResourceVariable)
+            and dtype in (None, w.dtype))
+      else ops.convert_to_tensor(w, dtype=dtype)
+      for w in embedding_weights
   ]
 
   with ops.name_scope(name, 'embedding_lookup',
diff --git a/tensorflow/python/ops/functional_ops.py b/tensorflow/python/ops/functional_ops.py
index 57542e3c7baa0f4eb3dc53431c9a3060f0998c5b..df4be1d65a042f35eacfaae924af197600ece702 100644
--- a/tensorflow/python/ops/functional_ops.py
+++ b/tensorflow/python/ops/functional_ops.py
@@ -143,7 +143,8 @@ def foldl(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
         lambda i, a: i < n, compute, [i, a],
         parallel_iterations=parallel_iterations,
         back_prop=back_prop,
-        swap_memory=swap_memory)
+        swap_memory=swap_memory,
+        maximum_iterations=n)
 
     # TODO(akshayka): Remove the in_graph_mode check once caching devices are
     # supported in Eager
@@ -253,7 +254,8 @@ def foldr(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
         compute, [i, a],
         parallel_iterations=parallel_iterations,
         back_prop=back_prop,
-        swap_memory=swap_memory)
+        swap_memory=swap_memory,
+        maximum_iterations=n)
 
     # TODO(akshayka): Remove the in_graph_mode check once caching devices are
     # supported in Eager
diff --git a/tensorflow/python/ops/gradient_checker.py b/tensorflow/python/ops/gradient_checker.py
index 3926ff69003124d29ae1579151c5a05fa49e3dc3..683f78ce9b21c5a1b5d8b60017588ee8a09686f2 100644
--- a/tensorflow/python/ops/gradient_checker.py
+++ b/tensorflow/python/ops/gradient_checker.py
@@ -157,7 +157,8 @@ def _compute_numeric_jacobian(x, x_shape, x_data, y, y_shape, delta,
   # as delta. Convert to float32 here. Since numeric_jacobian is expected to
   # be the groundtruth to compare against, it shouldn't lose any information.
   if x.dtype == dtypes.bfloat16:
-    x = math_ops.cast(x, dtypes.float32)
+    x = math_ops.cast(x, dtypes.float32)  # TODO(wangpeng): Now that the new x
+            # is an output of the old x, isn't feeding to the new x a mistake?
   if y.dtype == dtypes.bfloat16:
     y = math_ops.cast(y, dtypes.float32)
   if x_data.dtype == dtypes.bfloat16.as_numpy_dtype:
@@ -300,7 +301,6 @@ def compute_gradient(x,
       as the initial value.
     delta: (optional) the amount of perturbation.
     init_targets: list of targets to run to initialize model params.
-      TODO(mrry): remove this argument.
     extra_feed_dict: dict that allows fixing specified tensor values
       during the Jacobian calculation.
 
@@ -310,6 +310,7 @@ def compute_gradient(x,
     where "x_size" is the number of elements in x and "y_size" is the
     number of elements in y. If x is a list, returns a list of two numpy arrays.
   """
+  # TODO(mrry): remove argument `init_targets`
   if extra_feed_dict is None:
     extra_feed_dict = {}
 
@@ -327,6 +328,16 @@ def compute_gradient(x,
     return ret
 
 
+def _compute_error(grad):
+  if isinstance(grad, tuple):
+    grad = [grad]
+  error = 0
+  for j_t, j_n in grad:
+    if j_t.size or j_n.size:  # Handle zero size tensors correctly
+      error = np.maximum(error, np.fabs(j_t - j_n).max())
+  return error
+
+
 @tf_export(v1=["test.compute_gradient_error"])
 def compute_gradient_error(x,
                            x_shape,
@@ -369,10 +380,4 @@ def compute_gradient_error(x,
   """
   grad = compute_gradient(x, x_shape, y, y_shape, x_init_value, delta,
                           init_targets, extra_feed_dict=extra_feed_dict)
-  if isinstance(grad, tuple):
-    grad = [grad]
-  error = 0
-  for j_t, j_n in grad:
-    if j_t.size or j_n.size:  # Handle zero size tensors correctly
-      error = np.maximum(error, np.fabs(j_t - j_n).max())
-  return error
+  return _compute_error(grad)
diff --git a/tensorflow/python/ops/gradient_checker_v2.py b/tensorflow/python/ops/gradient_checker_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d473eeb5f4f00087672da53c5fef3ab63bdbd08
--- /dev/null
+++ b/tensorflow/python/ops/gradient_checker_v2.py
@@ -0,0 +1,329 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Gradient checker for functions.
+
+The gradient checker verifies numerically that an function properly
+computes the gradients
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
+
+
+def _product(t):
+  if isinstance(t, int):
+    return t
+  else:
+    y = 1
+    for x in t:
+      y *= x
+    return y
+
+
+def _eval_indexed_slices(a):
+  """Converts IndexedSlices to IndexedSlicesValue with numpy indices/values.
+
+  When eager execution is enabled, converts IndexedSlices
+  to IndexedSlicesValue with numpy indices/values.
+
+  Args:
+    a: any value.
+
+  Returns:
+    If a is IndexedSlices and eager execution is enabled, calls numpy() on a's
+    fields. Otherwise returns a unchanged.
+  """
+  if isinstance(a, ops.IndexedSlices) and context.executing_eagerly():
+    return ops.IndexedSlicesValue(
+        indices=[x.numpy() for x in a.indices],
+        values=[x.numpy() for x in a.values],
+        dense_shape=a.dense_shape)
+  return a
+
+
+def _to_numpy(a):
+  """Converts Tensors and EagerTensors to numpy arrays.
+
+  Args:
+    a: any value.
+
+  Returns:
+    If a is EagerTensor or Tensor, returns the evaluation of a by calling
+    numpy() or run(). Otherwise returns a unchanged.
+  """
+  if isinstance(a, ops.EagerTensor):
+    return a.numpy()
+  if isinstance(a, ops.Tensor):
+    sess = ops.get_default_session()
+    return sess.run(a)
+  return a
+
+
+def _prepare(f, xs_dtypes):
+  """Return a function that executes 'f'.
+
+    In TF 2.x, this is the same as `f`.
+    In TF 1.x, returns a Python function that executes the graph defined by `f`
+    in a Session.
+
+  Args:
+    f: the function.
+    xs_dtypes: dtypes of f's arguments.
+
+  Returns:
+    a function that will be evaluated in both graph and eager mode
+  """
+  if context.executing_eagerly():
+
+    def decorated_eager(*xs_data):
+      return f(*map(ops.convert_to_tensor, xs_data))
+
+    return decorated_eager
+  xs = [array_ops.placeholder(x_dtype) for x_dtype in xs_dtypes]
+  y = f(*xs)
+  sess = ops.get_default_session()
+  def decorated_graph(*xs_data):
+    xs_data = [_to_numpy(a) for a in xs_data]
+    return sess.run(y, feed_dict=dict(zip(xs, xs_data)))
+  return decorated_graph
+
+
+def _compute_theoretical_jacobian(f, y_shape, y_dtype, xs, param):
+  """Computes the theoretical Jacobian for f regarding xs[param].
+
+  One can think of the relation among f, xs and y as y = f(xs).
+
+  Args:
+    f: the function.
+    y_shape: the shape of the result.
+    y_dtype: the dtype of the result.
+    xs: a list of tensors.
+    param: the index of the target parameter.
+
+  Returns:
+    A 2-d numpy array representing the Jacobian. It has "x_size" rows
+    and "y_size" columns where "x_size" is the number of elements in xs[param]
+    and "y_size" is the number of elements in the result.
+
+  Raises:
+    ValueError: If result is empty but the gradient is nonzero.
+  """
+  x = xs[param]
+  # Complex vectors are treated as vectors of twice as many reals.
+  x_shape = tuple(x.shape) + (2,) if x.dtype.is_complex else x.shape
+  y_factor = 2 if y_dtype.is_complex else 1
+
+  # To compute the jacobian, we treat x and y as one-dimensional vectors.
+  x_size = _product(x_shape)
+  x_val_size = _product(x_shape[1:])  # This is used for sparse gradients
+  y_size = _product(y_shape) * y_factor
+
+  # Allocate 2-D Jacobian, with x dimensions smashed into the first
+  # dimension and y dimensions smashed into the second.
+  jacobian = np.zeros((x_size, y_size), dtype=x.dtype.real_dtype.as_numpy_dtype)
+
+  # For each of the entry of dy, we set this to be 1 and
+  # everything else to be 0 and compute the gradients -- this will give us one
+  # one column of the Jacobian matrix.
+  dy_data = np.zeros(y_shape, dtype=y_dtype.as_numpy_dtype)
+  dy_data_flat = dy_data.ravel().view(y_dtype.real_dtype.as_numpy_dtype)
+  grad_fn_unprep = backprop.gradients_function(f, [param])
+  grad_fn = _prepare(lambda dy, *xs: grad_fn_unprep(*xs, dy=dy),
+                     [y_dtype] + [x.dtype for x in xs])
+  for col in range(y_size):
+    dy_data_flat[col] = 1
+    grad = _to_numpy(grad_fn(dy_data, *xs)[0])
+    grad = _eval_indexed_slices(grad)
+    dy_data_flat[col] = 0
+    if isinstance(grad, ops.IndexedSlicesValue):
+      for i, v in zip(grad.indices, grad.values):
+        r_begin = i * x_val_size
+        r_end = r_begin + x_val_size
+        jacobian[r_begin:r_end, col] += v.flat
+    else:
+      jacobian[:, col] = grad.ravel().view(jacobian.dtype)
+
+  # If the output is empty, run the gradients at least once and make sure
+  # they produce zeros.
+  if y_size == 0:  # don't use 'not y_size', because y_size may not be an int
+    grad = _to_numpy(grad_fn(dy_data, *xs)[0])
+    if grad.shape != x.shape:
+      raise ValueError("Empty gradient has wrong shape: expected %s, got %s" %
+                       (x.shape, grad.shape))
+    if np.any(grad):
+      raise ValueError("Empty tensor with nonzero gradients")
+
+  logging.vlog(1, "Theoretical Jacobian =\n%s", jacobian)
+  return jacobian
+
+
+def _compute_numeric_jacobian(f, y_size, y_dtype, xs, param,
+                              delta):
+  """Computes the numeric Jacobian for f regarding xs[param].
+
+  One can think of the relation among f, xs and y as y = f(xs).
+
+  Args:
+    f: the function.
+    y_size: the number of elements of the result.
+    y_dtype: the dtype of the result.
+    xs: a list of tensors.
+    param: the index of the target parameter.
+    delta: the amount of perturbation we give to the input.
+
+  Returns:
+    A 2-d numpy array representing the Jacobian. It has "x_size" rows
+    and "y_size" columns where "x_size" is the number of elements in xs[param]
+    and "y_size" is the number of elements in the result.
+  """
+  # bfloat16 doesn't have enough bits to represent high precision numbers such
+  # as delta. Convert to float32 here. Since numeric_jacobian is expected to
+  # be the groundtruth to compare against, it shouldn't lose any information.
+  x_shape = xs[param].shape
+  x_dtype = xs[param].dtype
+  if y_dtype == dtypes.bfloat16:
+    f = lambda *xs: math_ops.cast(f(*xs), dtypes.float32)
+    y_dtype = dtypes.float32
+
+  # To compute the jacobian, we treat x and y as one-dimensional vectors
+  x_size = _product(x_shape) * (2 if x_dtype.is_complex else 1)
+  y_size = y_size * (2 if y_dtype.is_complex else 1)
+  x_dtype = x_dtype.real_dtype.as_numpy_dtype
+  y_dtype = y_dtype.real_dtype.as_numpy_dtype
+
+  xs_dtypes = [x.dtype for x in xs]
+  # Converts xs to numpy arrays to do in-place perturbation.
+  # Calls asarray() to avoid copying in ravel() later.
+  xs = [np.asarray(_to_numpy(x)) for x in xs]
+  x = xs[param]
+
+  # Make sure we have the right types
+  scale = np.asarray(2 * delta, dtype=y_dtype)[()]
+
+  jacobian = np.zeros((x_size, y_size), dtype=x_dtype)
+  # For each of the entry of x, we slightly perturbs this by adding and
+  # subtracting a delta and then compute difference between the outputs. This
+  # will give us one row of the Jacobian matrix.
+
+  f = _prepare(f, xs_dtypes)
+  for row in range(x_size):
+    original = x.ravel().view(x_dtype)[row]
+    x.ravel().view(x_dtype)[row] += delta
+    y_pos = _to_numpy(f(*xs))
+    x.ravel().view(x_dtype)[row] = original
+    x.ravel().view(x_dtype)[row] -= delta
+    y_neg = _to_numpy(f(*xs))
+    x.ravel().view(x_dtype)[row] = original
+    diff = (y_pos - y_neg) / scale
+    jacobian[row, :] = diff.ravel().view(y_dtype)
+
+  logging.vlog(1, "Numeric Jacobian =\n%s", jacobian)
+  return jacobian
+
+
+def _compute_gradient(f,
+                      y_shape,
+                      y_dtype,
+                      xs,
+                      param,
+                      delta):
+  """Computes the theoretical and numerical jacobian."""
+  x = xs[param]
+  t = x.dtype
+  allowed_types = [dtypes.float16, dtypes.bfloat16, dtypes.float32,
+                   dtypes.float64, dtypes.complex64, dtypes.complex128]
+  assert t.base_dtype in allowed_types, ("Cannot compute gradient for"
+                                         "unsupported type %s of argument %s" %
+                                         (t.name, param))
+  t2 = y_dtype
+  assert t2.base_dtype in allowed_types, ("Cannot compute gradient for"
+                                          "unsupported type %s of y" % t2.name)
+  y_size = _product(y_shape)
+  jacob_t = _compute_theoretical_jacobian(f, y_shape, y_dtype,
+                                          xs, param)
+  jacob_n = _compute_numeric_jacobian(f, y_size, y_dtype, xs,
+                                      param, delta)
+  return jacob_t, jacob_n
+
+
+def _compute_gradient_list(f, xs, delta):
+  """Compute gradients for a list of x values."""
+  # convert xs to tensors so that dtype and shape have uniform types
+  xs = list(map(ops.convert_to_tensor, xs))
+  # run the function to get info of the result
+  xs_dtypes = [x.dtype for x in xs]
+  f_temp = _prepare(f, xs_dtypes)
+  y = f_temp(*xs)
+  return zip(*[_compute_gradient(f, y.shape, dtypes.as_dtype(y.dtype),
+                                 xs, i, delta) for i in range(len(xs))])
+
+
+@tf_export("test.compute_gradient", v1=[])
+def compute_gradient(f, x, delta=1e-3):
+  """Computes the theoretical and numeric Jacobian of f.
+
+  With y = f(x), computes the theoretical and numeric Jacobian dy/dx.
+
+  Args:
+    f: the function.
+    x: a list of tensors.
+    delta: (optional) perturbation used to compute numeric Jacobian.
+
+  Returns:
+    A pair of lists, where the first is a list of 2-d numpy arrays representing
+    the theoretical Jacobians for each argument, and the second list is the
+    numerical ones. Each 2-d array has "x_size" rows
+    and "y_size" columns where "x_size" is the number of elements in the
+    corresponding argument and "y_size" is the number of elements in f(x).
+
+  Raises:
+    ValueError: If result is empty but the gradient is nonzero.
+  """
+  if not isinstance(x, list):
+    raise ValueError(
+        "`x` must be a list of Tensors (arguments to `f`), not a %s" % type(x))
+  return _compute_gradient_list(f, x, delta)
+
+
+def max_error(grad1, grad2):
+  """Computes maximum elementwise gap.
+
+  Computes the maximum elementwise gap between two lists of tensors of the same
+  shape.
+
+  Args:
+    grad1: a lists of tensors.
+    grad2: a lists of tensors with the same shape as grad1.
+
+  Returns:
+    The maximum elementwise gap between the two.
+  """
+  error = 0
+  for j_t, j_n in zip(grad1, grad2):
+    if j_t.size or j_n.size:  # Handle zero size tensors correctly
+      error = np.maximum(error, np.fabs(j_t - j_n).max())
+  return error
diff --git a/tensorflow/python/ops/gradient_checker_v2_test.py b/tensorflow/python/ops/gradient_checker_v2_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..191b2b6568104b7cf49aa2844f7929284c00d74d
--- /dev/null
+++ b/tensorflow/python/ops/gradient_checker_v2_test.py
@@ -0,0 +1,300 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for compute_gradient.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.eager import backprop
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import custom_gradient
+from tensorflow.python.ops import \
+gradient_checker_v2 as gradient_checker
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+# needs this to register gradient for SoftmaxCrossEntropyWithLogits:
+import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
+from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging
+
+
+def _random_complex(shape, dtype):
+  data = np.random.random_sample(shape).astype(dtype.as_numpy_dtype)
+  if dtype.is_complex:
+    data.imag = np.random.random_sample(shape)
+  return data
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class GradientCheckerTest(test.TestCase):
+
+  def testAddSimple(self):
+    size = (2, 3)
+    x1 = constant_op.constant(2.0, shape=size, name="x1")
+    x2 = constant_op.constant(3.0, shape=size, name="x2")
+    error = gradient_checker.max_error(*gradient_checker.compute_gradient(
+        lambda x1: math_ops.add(x1, x2), [x1]))
+    tf_logging.info("x1 error = %f", error)
+    assert error < 1e-4
+
+  def testAddCustomized(self):
+    size = (2, 3)
+    x1 = constant_op.constant(
+        2.0, shape=size, dtype=dtypes.float64, name="x1")
+    x2 = np.asarray(np.arange(6, dtype=np.float64).reshape(2, 3))
+    # checkint gradients for x2 using a special delta
+    error = gradient_checker.max_error(*gradient_checker.compute_gradient(
+        lambda x2: math_ops.add(x1, x2),
+        [x2], delta=1e-2))
+    tf_logging.info("x2 error = %f", error)
+    assert error < 1e-10
+
+  def testGather(self):
+    def f(params):
+      index_values = [1, 3]
+      indices = constant_op.constant(index_values, name="i")
+      return array_ops.gather(params, indices, name="y")
+    p_shape = (4, 2)
+    p_size = 8
+    params = constant_op.constant(
+        np.arange(p_size).astype(np.float), shape=p_shape, name="p")
+    error = gradient_checker.max_error(*gradient_checker.compute_gradient(
+        f, [params]))
+    tf_logging.info("gather error = %f", error)
+    assert error < 1e-4
+
+  def testNestedGather(self):
+    def f(params):
+      index_values = [1, 3, 5, 6]
+      indices = constant_op.constant(index_values, name="i")
+      y = array_ops.gather(params, indices, name="y")
+      index_values2 = [0, 2]
+      indices2 = constant_op.constant(index_values2, name="i2")
+      return array_ops.gather(y, indices2, name="y2")
+    p_shape = (8, 2)
+    p_size = 16
+    params = constant_op.constant(
+        np.arange(p_size).astype(np.float), shape=p_shape, name="p")
+    error = gradient_checker.max_error(*gradient_checker.compute_gradient(
+        f, [params]))
+    tf_logging.info("nested gather error = %f", error)
+    assert error < 1e-4
+
+  def testComplexMul(self):
+    c = constant_op.constant(5 + 7j, dtype=dtypes.complex64)
+    def f(x):
+      return c * x
+    x_shape = c.shape
+    x_dtype = c.dtype
+    x = constant_op.constant(_random_complex(x_shape, x_dtype))
+    analytical, numerical = gradient_checker.compute_gradient(
+        f, [x])
+    correct = np.array([[5, 7], [-7, 5]])
+    self.assertAllEqual(correct, analytical[0])
+    self.assertAllClose(correct, numerical[0], rtol=1e-4)
+    x = constant_op.constant(_random_complex(x_shape, x_dtype))
+    self.assertLess(
+        gradient_checker.max_error(*gradient_checker.compute_gradient(
+            f, [x])), 3e-4)
+
+  def testComplexConj(self):
+    def f(x):
+      return math_ops.conj(x)
+    x_shape = ()
+    x_dtype = dtypes.complex64
+    x = constant_op.constant(_random_complex(x_shape, x_dtype))
+    analytical, numerical = gradient_checker.compute_gradient(
+        f, [x])
+    correct = np.array([[1, 0], [0, -1]])
+    self.assertAllEqual(correct, analytical[0])
+    self.assertAllClose(correct, numerical[0], rtol=2e-5)
+    x = constant_op.constant(_random_complex(x_shape, x_dtype))
+    self.assertLess(
+        gradient_checker.max_error(*gradient_checker.compute_gradient(
+            f, [x])), 2e-5)
+
+  def testEmptySucceeds(self):
+    def f(x):
+      return array_ops.identity(x)
+    x = constant_op.constant(np.random.random_sample((0, 3)),
+                             dtype=dtypes.float32)
+    for grad in gradient_checker.compute_gradient(f, [x]):
+      self.assertEqual(grad[0].shape, (0, 0))
+    error = gradient_checker.max_error(*gradient_checker.compute_gradient(
+        f, [x]))
+    self.assertEqual(error, 0)
+
+  def testEmptyFails(self):
+    @custom_gradient.custom_gradient
+    def id_bad_grad(x):
+      y = array_ops.identity(x)
+      def grad_fn(dy):
+        # dx = constant_op.constant(np.zeros((1, 4)), dtype=dtypes.float32)
+        dx = array_ops.transpose(dy)
+        return dx
+      return y, grad_fn
+    def f(x):
+      return id_bad_grad(x)
+    x = constant_op.constant(np.random.random_sample((0, 3)),
+                             dtype=dtypes.float32)
+    bad = r"Empty gradient has wrong shape: expected \(0, 3\), got \(3, 0\)"
+    with self.assertRaisesRegexp(ValueError, bad):
+      gradient_checker.compute_gradient(f, [x])
+
+  def testNaNGradFails(self):
+    @custom_gradient.custom_gradient
+    def id_nan_grad(x):
+      y = array_ops.identity(x)
+      def grad_fn(dy):
+        dx = np.nan * dy
+        # dx = dy
+        return dx
+      return y, grad_fn
+    def f(x):
+      return id_nan_grad(x)
+    x = constant_op.constant(np.random.random_sample((1, 1)),
+                             dtype=dtypes.float32)
+    error = gradient_checker.max_error(*gradient_checker.compute_gradient(
+        f, [x]))
+    # Typical test would assert error < max_err, so assert this test would
+    # raise AssertionError, since NaN is not < 1.0.
+    with self.assertRaisesRegexp(AssertionError, "False is not true"):
+      self.assertTrue(error < 1.0)
+
+  def testGradGrad(self):
+
+    def f(x):
+      with backprop.GradientTape() as tape:
+        tape.watch(x)
+        y = math_ops.square(x)
+        z = math_ops.square(y)
+      return tape.gradient(z, x)
+
+    analytical, numerical = gradient_checker.compute_gradient(f, [2.0])
+    self.assertAllEqual([[[48.]]], analytical)
+    self.assertAllClose([[[48.]]], numerical, rtol=1e-4)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class MiniMNISTTest(test.TestCase):
+
+  # Gradient checker for MNIST.
+  def _BuildAndTestMiniMNIST(self, param_index, tag):
+    # Fix seed to avoid occasional flakiness
+    np.random.seed(6)
+
+    # Hyperparameters
+    batch = 3
+    inputs = 16
+    features = 32
+    classes = 10
+
+    # Define the parameters
+    inp_data = np.random.random_sample(inputs * batch)
+    hidden_weight_data = np.random.randn(inputs * features) / np.sqrt(inputs)
+    hidden_bias_data = np.random.random_sample(features)
+    sm_weight_data = np.random.randn(features * classes) / np.sqrt(features)
+    sm_bias_data = np.random.random_sample(classes)
+
+    # special care for labels since they need to be normalized per batch
+    label_data = np.random.random(batch * classes).reshape((batch, classes))
+    s = label_data.sum(axis=1)
+    label_data /= s[:, None]
+
+    # We treat the inputs as "parameters" here
+    inp = constant_op.constant(
+        inp_data.tolist(),
+        shape=[batch, inputs],
+        dtype=dtypes.float64,
+        name="inp")
+    hidden_weight = constant_op.constant(
+        hidden_weight_data.tolist(),
+        shape=[inputs, features],
+        dtype=dtypes.float64,
+        name="hidden_weight")
+    hidden_bias = constant_op.constant(
+        hidden_bias_data.tolist(),
+        shape=[features],
+        dtype=dtypes.float64,
+        name="hidden_bias")
+    softmax_weight = constant_op.constant(
+        sm_weight_data.tolist(),
+        shape=[features, classes],
+        dtype=dtypes.float64,
+        name="softmax_weight")
+    softmax_bias = constant_op.constant(
+        sm_bias_data.tolist(),
+        shape=[classes],
+        dtype=dtypes.float64,
+        name="softmax_bias")
+
+    # List all the parameter so that we can test them one at a time
+    all_params = [
+        inp, hidden_weight, hidden_bias, softmax_weight, softmax_bias
+    ]
+
+    # Now, Building MNIST
+    def f(inp, hidden_weight, hidden_bias, softmax_weight, softmax_bias):
+      features = nn_ops.relu(
+          nn_ops.xw_plus_b(inp, hidden_weight, hidden_bias), name="features")
+      logits = nn_ops.xw_plus_b(
+          features, softmax_weight, softmax_bias, name="logits")
+      labels = constant_op.constant(
+          label_data.tolist(),
+          shape=[batch, classes],
+          dtype=dtypes.float64,
+          name="labels")
+      cost = nn_ops.softmax_cross_entropy_with_logits(
+          labels=labels, logits=logits, name="cost")
+      return cost
+
+    def f_restricted(x):
+      xs = all_params
+      i = param_index
+      # use x for the i-th parameter
+      xs = xs[0:i]+[x]+xs[i+1:]
+      return f(*xs)
+    # Test the gradients.
+    err = gradient_checker.max_error(*gradient_checker.compute_gradient(
+        f_restricted, [all_params[param_index]], delta=1e-5))
+
+    tf_logging.info("Mini MNIST: %s gradient error = %g", tag, err)
+    return err
+
+  def testInputGradient(self):
+    self.assertLess(self._BuildAndTestMiniMNIST(0, "input"), 1e-8)
+
+  def testHiddenWeightGradient(self):
+    self.assertLess(self._BuildAndTestMiniMNIST(1, "hidden_weight"), 1e-8)
+
+  def testHiddenBiasGradient(self):
+    self.assertLess(self._BuildAndTestMiniMNIST(2, "hidden_bias"), 1e-8)
+
+  def testSoftmaxWeightGradient(self):
+    self.assertLess(self._BuildAndTestMiniMNIST(3, "softmax_weight"), 1e-8)
+
+  def testSoftmaxBiasGradient(self):
+    self.assertLess(self._BuildAndTestMiniMNIST(4, "softmax_bias"), 1e-8)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/ops/gradients_impl.py b/tensorflow/python/ops/gradients_impl.py
index 8cc4d926c7f0ff392e243db6d6c028b43c9f9a31..0a70d6ee61e64f94c41c1f1d0a5b6c3610b45c04 100644
--- a/tensorflow/python/ops/gradients_impl.py
+++ b/tensorflow/python/ops/gradients_impl.py
@@ -297,8 +297,12 @@ def _DefaultGradYs(grad_ys,
   return new_grad_ys
 
 
-def IsTrainable(tensor):
-  dtype = dtypes.as_dtype(tensor.dtype)
+def IsTrainable(tensor_or_dtype):
+  if isinstance(tensor_or_dtype, ops.Tensor):
+    dtype = tensor_or_dtype.dtype
+  else:
+    dtype = tensor_or_dtype
+  dtype = dtypes.as_dtype(dtype)
   return dtype.base_dtype in (dtypes.float16, dtypes.float32, dtypes.float64,
                               dtypes.complex64, dtypes.complex128,
                               dtypes.resource, dtypes.variant)
@@ -322,6 +326,10 @@ def _VerifyGeneratedGradients(grads, op):
     ValueError: if sizes of gradients and inputs don't match.
     TypeError: if type of any gradient is not valid for its input.
   """
+  # While ops have inputs added to them during the gradient computation, so we
+  # skip the below check. See while_v2 for details.
+  if op.type == "While": return
+
   if len(grads) != len(op.inputs):
     raise ValueError("Num gradients %d generated for op %s do not match num "
                      "inputs %d" % (len(grads), op.node_def, len(op.inputs)))
diff --git a/tensorflow/python/ops/gradients_test.py b/tensorflow/python/ops/gradients_test.py
index a9058c4a341dda3a19a7f5390da1455981ee5d4c..c53afef63bc1d2fc1ba1927c687f7ecad4eb46a4 100644
--- a/tensorflow/python/ops/gradients_test.py
+++ b/tensorflow/python/ops/gradients_test.py
@@ -158,6 +158,7 @@ class GradientsTest(test_util.TensorFlowTestCase):
       grads = gradients.gradients(z, [x])
       self.assertTrue(all(x is not None for x in grads))
 
+  @test_util.run_v1_only("b/120545219")
   def testBoundaryContinue(self):
     # Test that we differentiate both 'x' and 'y' correctly when x is a
     # predecessor of y.
@@ -169,6 +170,7 @@ class GradientsTest(test_util.TensorFlowTestCase):
       self.assertTrue(all(x is not None for x in grads))
       self.assertEqual(6.0, grads[0].eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testAggregationMethodAccumulateN(self):
     with self.cached_session():
       x = constant(1.0)
@@ -182,6 +184,7 @@ class GradientsTest(test_util.TensorFlowTestCase):
       self.assertEqual(20.0, grads[0].eval())
       self.assertEqual(10.0, grads[1].eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testAggregationMethodAddN(self):
     with self.cached_session():
       x = constant(1.0)
@@ -193,6 +196,7 @@ class GradientsTest(test_util.TensorFlowTestCase):
       self.assertEqual(20.0, grads[0].eval())
       self.assertEqual(10.0, grads[1].eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testAggregationMethodTree(self):
     with self.cached_session():
       x = constant(1.0)
@@ -239,6 +243,7 @@ class GradientsTest(test_util.TensorFlowTestCase):
             [dx, dy], feed_dict={x: [1.0], dy.indices: [0], dy.values: [2.0]})
       self.assertEqual(vdx, vdy)
 
+  @test_util.run_v1_only("b/120545219")
   def testNonDifferentiableSwitchInWhileLoop(self):
     with ops.Graph().as_default():
       v = array_ops.placeholder(dtypes.float32, [])
@@ -270,6 +275,7 @@ class GradientsTest(test_util.TensorFlowTestCase):
       gradient = gradients.gradients(graph.as_graph_element(var), var)
       self.assertIsNotNone(gradient)
 
+  @test_util.run_v1_only("b/120545219")
   def testVariableRefGradient(self):
     with ops.Graph().as_default():
       init = constant_op.constant(100.0)
@@ -277,6 +283,7 @@ class GradientsTest(test_util.TensorFlowTestCase):
       gradient = gradients.gradients(var._ref(), var)
       self.assertIsNotNone(gradient)
 
+  @test_util.run_v1_only("b/120545219")
   def testDependentYs(self):
     with self.cached_session():
       x = constant_op.constant(3.0)
@@ -292,6 +299,7 @@ class GradientsTest(test_util.TensorFlowTestCase):
       g = gradients.gradients([z, z2], x)
       self.assertAllClose(17502.0, g[0].eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testPartialDerivatives(self):
     with self.cached_session():
       x = constant_op.constant(1.)
@@ -302,6 +310,7 @@ class GradientsTest(test_util.TensorFlowTestCase):
       partialg = gradients.gradients(z, [x, y], stop_gradients=[x, y])
       self.assertEqual([1.0, 1.0], [g.eval() for g in partialg])
 
+  @test_util.run_v1_only("b/120545219")
   def testStopGradients(self):
     def _MakeGraph(rng, stop_gradients=()):
       def _FunctionOf(xs, k=3):
@@ -606,6 +615,7 @@ class PreventGradientTest(test_util.TensorFlowTestCase):
 
 class HessianVectorProductTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testHessianVectorProduct(self):
     # Manually compute the Hessian explicitly for a low-dimensional problem
     # and check that HessianVectorProduct matches multiplication by the
@@ -634,6 +644,7 @@ class HessianVectorProductTest(test_util.TensorFlowTestCase):
 
 class HessianTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testHessian1D(self):
     # Manually compute the Hessian explicitly for a low-dimensional problem
     # and check that `hessian` matches. Specifically, the Hessian of
@@ -651,6 +662,7 @@ class HessianTest(test_util.TensorFlowTestCase):
       hess_actual = self.evaluate(hess)
     self.assertAllClose(hess_value, hess_actual)
 
+  @test_util.run_v1_only("b/120545219")
   def testHessian1D_multi(self):
     # Test the computation of the hessian with respect to multiple tensors
     m = 4
@@ -671,6 +683,7 @@ class HessianTest(test_util.TensorFlowTestCase):
     for hess_value, hess_actual in zip(hess_values, hessians_actual):
       self.assertAllClose(hess_value, hess_actual)
 
+  @test_util.run_v1_only("b/120545219")
   def testHessianInvalidDimension(self):
     for shape in [(10, 10), None]:
       with self.cached_session(use_gpu=True):
@@ -679,6 +692,7 @@ class HessianTest(test_util.TensorFlowTestCase):
         with self.assertRaises(ValueError):
           gradients.hessians(x, x)
 
+  @test_util.run_v1_only("b/120545219")
   def testHessian2D_square_matrix(self):
     # Manually compute the Hessian explicitly for a low-dimensional problem
     # and check that `hessian` matches. Specifically, the Hessian of
@@ -700,6 +714,7 @@ class HessianTest(test_util.TensorFlowTestCase):
     self.assertAllEqual((m, m, m, m), hess_actual.shape)
     self.assertAllClose(hess_value, hess_actual.reshape((m * m, m * m)))
 
+  @test_util.run_v1_only("b/120545219")
   def testHessian2D_non_square_matrix(self):
     m = 3
     n = 4
@@ -722,6 +737,7 @@ class HessianTest(test_util.TensorFlowTestCase):
 
 class IndexedSlicesToTensorTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testIndexedSlicesToTensor(self):
     with self.cached_session():
       np_val = np.random.rand(4, 4, 4, 4).astype(np.float32)
@@ -731,6 +747,7 @@ class IndexedSlicesToTensorTest(test_util.TensorFlowTestCase):
       c_dense = math_ops.multiply(c_sparse, 1.0)
       self.assertAllClose(np_val, self.evaluate(c_dense))
 
+  @test_util.run_v1_only("b/120545219")
   def testIndexedSlicesToTensorList(self):
     with self.cached_session():
       numpy_list = []
@@ -747,6 +764,7 @@ class IndexedSlicesToTensorTest(test_util.TensorFlowTestCase):
       packed_sparse = array_ops.stack(sparse_list)
       self.assertAllClose(packed_dense.eval(), self.evaluate(packed_sparse))
 
+  @test_util.run_v1_only("b/120545219")
   def testInt64Indices(self):
     with self.cached_session():
       np_val = np.random.rand(4, 4, 4, 4).astype(np.float32)
@@ -759,6 +777,7 @@ class IndexedSlicesToTensorTest(test_util.TensorFlowTestCase):
       c_dense = math_ops.multiply(c_sparse, 1.0)
       self.assertAllClose(np_val, self.evaluate(c_dense))
 
+  @test_util.run_v1_only("b/120545219")
   def testWarnings(self):
     # TODO(gunan) Reenable after this issue is fixed:
     # https://github.com/google/protobuf/issues/2812
@@ -802,6 +821,7 @@ class IndexedSlicesToTensorTest(test_util.TensorFlowTestCase):
 
 class OnlyRealGradientsTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testRealOnly(self):
     x = constant_op.constant(7+3j, dtype=dtypes.complex64)
     y = math_ops.square(x)
@@ -814,6 +834,7 @@ class OnlyRealGradientsTest(test_util.TensorFlowTestCase):
 
 class ResourceCondTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testBasic(self):
     gamma = resource_variable_ops.ResourceVariable(
         np.random.random((3,)),
@@ -943,6 +964,7 @@ class CustomGradientTest(test_util.TensorFlowTestCase):
       self.assertEqual(6., math_ops.reduce_sum(dx).numpy())
       self.assertEqual(8., math_ops.reduce_sum(dw).numpy())
 
+  @test_util.run_v1_only("b/120545219")
   def testCustomGradientErrorsWithNonResourceVariables(self):
 
     def F(x, use_resource=False):
@@ -993,6 +1015,7 @@ class CustomGradientTest(test_util.TensorFlowTestCase):
       # Smoke test to ensure numpy inputs are accepted
       F(x)
 
+  @test_util.run_v1_only("b/120545219")
   def testRVGradientsDynamicCond(self):
     with self.cached_session():
       alpha = resource_variable_ops.ResourceVariable(
@@ -1004,7 +1027,7 @@ class CustomGradientTest(test_util.TensorFlowTestCase):
           conditional, lambda: alpha * 2, lambda: alpha * 3)
 
       g, = gradients_impl.gradients(output, alpha)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertAllEqual(g.eval(), [2.0])
       self.assertAllEqual(g.eval(feed_dict={conditional: False}), [3.0])
 
diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py
index 229393c970386a942ab4cff1afb02bb742455618..24d049b726fb93401d916d60c0d37fe85de30719 100644
--- a/tensorflow/python/ops/image_ops_impl.py
+++ b/tensorflow/python/ops/image_ops_impl.py
@@ -2046,9 +2046,8 @@ def sample_distorted_bounding_box_v2(image_size,
       3-D with shape `[batch, N, 4]` describing the N bounding boxes
       associated with the image.
     seed: An optional `int`. Defaults to `0`.
-      If either `seed` or `seed2` are set to non-zero, the random number
-      generator is seeded by the given `seed`.  Otherwise, it is seeded by a
-      random seed.
+      If `seed` is set to non-zero, the random number generator is seeded by
+      the given `seed`.  Otherwise, it is seeded by a random seed.
     min_object_covered: A Tensor of type `float32`. Defaults to `0.1`.
       The cropped area of the image must contain at least this
       fraction of any bounding box supplied. The value of this parameter should
diff --git a/tensorflow/python/ops/init_ops.py b/tensorflow/python/ops/init_ops.py
index 03d2201a9ae7aa91f7679e2940831194ba716494..c0a4bcd51dd10f352366b74955241e5f97133130 100644
--- a/tensorflow/python/ops/init_ops.py
+++ b/tensorflow/python/ops/init_ops.py
@@ -55,6 +55,15 @@ class Initializer(object):
   """
 
   def __call__(self, shape, dtype=None, partition_info=None):
+    """Returns a tensor object initialized as specified by the initializer.
+
+    Args:
+      shape: Shape of the tensor.
+      dtype: Optional dtype of the tensor. If not provided use the initializer
+        dtype.
+      partition_info: Optional information about the possible partitioning of a
+        tensor.
+    """
     raise NotImplementedError
 
   def get_config(self):
@@ -143,7 +152,8 @@ class Constant(Initializer):
     value: A Python scalar, list or tuple of values, or a N-dimensional numpy
       array. All elements of the initialized variable will be set to the
       corresponding value in the `value` argument.
-    dtype: The data type.
+    dtype: Default data type, used if no `dtype` argument is provided when
+      calling the initializer.
     verify_shape: Boolean that enables verification of the shape of `value`. If
       `True`, the initializer will throw an error if the shape of `value` is not
       compatible with the shape of the initialized tensor.
@@ -239,7 +249,8 @@ class RandomUniform(Initializer):
     seed: A Python integer. Used to create random seeds. See
       `tf.set_random_seed`
       for behavior.
-    dtype: The data type.
+    dtype: Default data type, used if no `dtype` argument is provided when
+      calling the initializer.
   """
 
   def __init__(self, minval=0, maxval=None, seed=None, dtype=dtypes.float32):
@@ -275,7 +286,8 @@ class RandomNormal(Initializer):
     seed: A Python integer. Used to create random seeds. See
       `tf.set_random_seed`
       for behavior.
-    dtype: The data type. Only floating point types are supported.
+    dtype: Default data type, used if no `dtype` argument is provided when
+      calling the initializer. Only floating point types are supported.
   """
 
   def __init__(self, mean=0.0, stddev=1.0, seed=None, dtype=dtypes.float32):
@@ -316,7 +328,8 @@ class TruncatedNormal(Initializer):
     seed: A Python integer. Used to create random seeds. See
       `tf.set_random_seed`
       for behavior.
-    dtype: The data type. Only floating point types are supported.
+    dtype: Default data type, used if no `dtype` argument is provided when
+      calling the initializer. Only floating point types are supported.
   """
 
   def __init__(self, mean=0.0, stddev=1.0, seed=None, dtype=dtypes.float32):
@@ -369,8 +382,9 @@ class UniformUnitScaling(Initializer):
     seed: A Python integer. Used to create random seeds. See
       `tf.set_random_seed`
       for behavior.
-    dtype: The data type. Only floating point types are supported.
-    
+    dtype: Default data type, used if no `dtype` argument is provided when
+      calling the initializer. Only floating point types are supported.
+
   References:
       [Sussillo et al., 2014](https://arxiv.org/abs/1412.6558)
       ([pdf](http://arxiv.org/pdf/1412.6558.pdf))
@@ -437,7 +451,8 @@ class VarianceScaling(Initializer):
     seed: A Python integer. Used to create random seeds. See
       `tf.set_random_seed`
       for behavior.
-    dtype: The data type. Only floating point types are supported.
+    dtype: Default data type, used if no `dtype` argument is provided when
+      calling the initializer. Only floating point types are supported.
 
   Raises:
     ValueError: In case of an invalid value for the "scale", mode" or
@@ -483,7 +498,7 @@ class VarianceScaling(Initializer):
     else:
       scale /= max(1., (fan_in + fan_out) / 2.)
     if self.distribution == "normal" or self.distribution == "truncated_normal":
-      # constant taken from scipy.stats.truncnorm.std(a=-2, b=2, loc=0., scale=1.)
+    # constant taken from scipy.stats.truncnorm.std(a=-2, b=2, loc=0., scale=1.)
       stddev = math.sqrt(scale) / .87962566103423978
       return random_ops.truncated_normal(
           shape, 0.0, stddev, dtype, seed=self.seed)
@@ -534,8 +549,9 @@ class Orthogonal(Initializer):
     seed: A Python integer. Used to create random seeds. See
       `tf.set_random_seed`
       for behavior.
-    dtype: The data type.
-  
+    dtype: Default data type, used if no `dtype` argument is provided when
+      calling the initializer. Only floating point types are supported.
+
   References:
       [Saxe et al., 2014](https://openreview.net/forum?id=_wzZwKpTDF_9C)
       ([pdf](https://arxiv.org/pdf/1312.6120.pdf))
@@ -592,8 +608,9 @@ class ConvolutionDeltaOrthogonal(Initializer):
       `gain` after applying this convolution.
     seed: A Python integer. Used to create random seeds. See
       `tf.set_random_seed` for behavior.
-    dtype: The data type.
-    
+    dtype: Default data type, used if no `dtype` argument is provided when
+      calling the initializer. Only floating point types are supported.
+
   References:
       [Xiao et al., 2018](http://proceedings.mlr.press/v80/xiao18a.html)
       ([pdf](http://proceedings.mlr.press/v80/xiao18a/xiao18a.pdf))
@@ -652,8 +669,9 @@ class ConvolutionOrthogonal(Initializer):
       `gain` after applying this convolution.
     seed: A Python integer. Used to create random seeds. See
       `tf.set_random_seed` for behavior.
-    dtype: The data type.
-    
+    dtype: Default data type, used if no `dtype` argument is provided when
+      calling the initializer. Only floating point types are supported.
+
   References:
       [Xiao et al., 2018](http://proceedings.mlr.press/v80/xiao18a.html)
       ([pdf](http://proceedings.mlr.press/v80/xiao18a/xiao18a.pdf))
@@ -721,8 +739,9 @@ class ConvolutionOrthogonal2D(ConvolutionOrthogonal):
       a factor of `gain`.
     seed: A Python integer. Used to create random seeds. See
       `tf.set_random_seed` for behavior.
-    dtype: The data type.
-    
+    dtype: Default data type, used if no `dtype` argument is provided when
+      calling the initializer. Only floating point types are supported.
+
   References:
       [Xiao et al., 2018](http://proceedings.mlr.press/v80/xiao18a.html)
       ([pdf](http://proceedings.mlr.press/v80/xiao18a/xiao18a.pdf))
@@ -862,8 +881,9 @@ class ConvolutionOrthogonal1D(ConvolutionOrthogonal):
     seed: A Python integer. Used to create random seeds. See
       `tf.set_random_seed`
       for behavior.
-    dtype: The data type.
-    
+    dtype: Default data type, used if no `dtype` argument is provided when
+      calling the initializer. Only floating point types are supported.
+
   References:
       [Xiao et al., 2018](http://proceedings.mlr.press/v80/xiao18a.html)
       ([pdf](http://proceedings.mlr.press/v80/xiao18a/xiao18a.pdf))
@@ -982,8 +1002,9 @@ class ConvolutionOrthogonal3D(ConvolutionOrthogonal):
       `gain` after applying this convolution.
     seed: A Python integer. Used to create random seeds. See
       `tf.set_random_seed` for behavior.
-    dtype: The data type.
-    
+    dtype: Default data type, used if no `dtype` argument is provided when
+      calling the initializer. Only floating point types are supported.
+
   References:
       [Xiao et al., 2018](http://proceedings.mlr.press/v80/xiao18a.html)
       ([pdf](http://proceedings.mlr.press/v80/xiao18a/xiao18a.pdf))
@@ -1132,7 +1153,8 @@ class Identity(Initializer):
 
   Args:
     gain: Multiplicative factor to apply to the identity matrix.
-    dtype: The type of the output.
+    dtype: Default data type, used if no `dtype` argument is provided when
+      calling the initializer. Only floating point types are supported.
   """
 
   def __init__(self, gain=1.0, dtype=dtypes.float32):
@@ -1170,9 +1192,10 @@ class GlorotUniform(VarianceScaling):
     seed: A Python integer. Used to create random seeds. See
       `tf.set_random_seed`
       for behavior.
-    dtype: The data type. Only floating point types are supported.
+    dtype: Default data type, used if no `dtype` argument is provided when
+      calling the initializer. Only floating point types are supported.
 
-  References: 
+  References:
       [Glorot et al., 2010](http://proceedings.mlr.press/v9/glorot10a.html)
       ([pdf](http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf))
   """
@@ -1208,9 +1231,10 @@ class GlorotNormal(VarianceScaling):
   Args:
     seed: A Python integer. Used to create random seeds. See
       `tf.set_random_seed` for behavior.
-    dtype: The data type. Only floating point types are supported.
+    dtype: Default data type, used if no `dtype` argument is provided when
+      calling the initializer. Only floating point types are supported.
 
-  References: 
+  References:
       [Glorot et al., 2010](http://proceedings.mlr.press/v9/glorot10a.html)
       ([pdf](http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf))
   """
@@ -1264,7 +1288,7 @@ def lecun_normal(seed=None):
       An initializer.
 
   References:
-      - Self-Normalizing Neural Networks, 
+      - Self-Normalizing Neural Networks,
       [Klambauer et al., 2017](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks)
       ([pdf](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks.pdf))
       - Efficient Backprop,
@@ -1289,7 +1313,7 @@ def lecun_uniform(seed=None):
       An initializer.
 
   References:
-      - Self-Normalizing Neural Networks, 
+      - Self-Normalizing Neural Networks,
       [Klambauer et al., 2017](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks)
       ([pdf](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks.pdf))
       - Efficient Backprop,
diff --git a/tensorflow/python/ops/linalg/linalg_impl.py b/tensorflow/python/ops/linalg/linalg_impl.py
index 2c9476a9bd32c3df3a6a2750eae8184de544d411..df2bd887cdde6f651db572c2bdfebd2bc0170716 100644
--- a/tensorflow/python/ops/linalg/linalg_impl.py
+++ b/tensorflow/python/ops/linalg/linalg_impl.py
@@ -44,6 +44,7 @@ einsum = special_math_ops.einsum
 eye = linalg_ops.eye
 inv = linalg_ops.matrix_inverse
 logm = gen_linalg_ops.matrix_logarithm
+lu = gen_linalg_ops.lu
 tf_export('linalg.logm')(logm)
 lstsq = linalg_ops.matrix_solve_ls
 norm = linalg_ops.norm
diff --git a/tensorflow/python/ops/linalg/linear_operator.py b/tensorflow/python/ops/linalg/linear_operator.py
index 8efafda3a1e7424442163a76aca95d14af4b8a70..6be81f4b34191414d3c4c00ac7158bfa1539ef27 100644
--- a/tensorflow/python/ops/linalg/linear_operator.py
+++ b/tensorflow/python/ops/linalg/linear_operator.py
@@ -381,7 +381,10 @@ class LinearOperator(object):
       `Dimension` object.
     """
     # Derived classes get this "for free" once .shape is implemented.
-    return self.shape[-1]
+    if self.shape.rank is None:
+      return tensor_shape.Dimension(None)
+    else:
+      return self.shape.dims[-1]
 
   def domain_dimension_tensor(self, name="domain_dimension_tensor"):
     """Dimension (in the sense of vector spaces) of the domain of this operator.
diff --git a/tensorflow/python/ops/list_ops.py b/tensorflow/python/ops/list_ops.py
index 89ff48ebd9eeee9b665b466ec348781e3176141d..dbaae886d43e46ac193d1e7f28a6367192d2a640 100644
--- a/tensorflow/python/ops/list_ops.py
+++ b/tensorflow/python/ops/list_ops.py
@@ -30,7 +30,7 @@ from tensorflow.python.ops.gen_list_ops import *
 # pylint: enable=wildcard-import
 
 
-ops.NotDifferentiable("TensorListConcat")
+ops.NotDifferentiable("TensorListConcatLists")
 ops.NotDifferentiable("TensorListElementShape")
 ops.NotDifferentiable("TensorListLength")
 ops.NotDifferentiable("TensorListPushBackBatch")
@@ -65,6 +65,21 @@ def tensor_list_from_tensor(tensor, element_shape, name=None):
       name=name)
 
 
+def tensor_list_concat(input_handle, element_dtype, name=None):
+  # Ignore the lengths output of TensorListConcat. It is only used during
+  # gradient computation.
+  return gen_list_ops.tensor_list_concat(
+      input_handle=input_handle, element_dtype=element_dtype, name=name)[0]
+
+
+def tensor_list_split(tensor, element_shape, lengths, name=None):
+  return gen_list_ops.tensor_list_split(
+      tensor=tensor,
+      element_shape=_build_element_shape(element_shape),
+      lengths=lengths,
+      name=name)
+
+
 @ops.RegisterGradient("TensorListPushBack")
 def _PushBackGrad(op, dresult):
   return gen_list_ops.tensor_list_pop_back(
@@ -86,6 +101,25 @@ def _TensorListStackGrad(unused_op, dtensor):
   return tensor_list_from_tensor(dtensor, element_shape=dtensor.shape[1:])
 
 
+@ops.RegisterGradient("TensorListConcat")
+def _TensorListConcatGrad(op, dtensor, unused_dlengths):
+  # TODO(srbs): We lose the element_shape information in tensor_list_concat.
+  # Consider providing that as an output of TensorListConcat?
+  if dtensor.shape.rank is None:
+    element_shape = None
+  else:
+    element_shape = [None] + dtensor.shape.as_list()[1:]
+  return tensor_list_split(
+      dtensor,
+      element_shape=_build_element_shape(element_shape),
+      lengths=op.outputs[1])
+
+
+@ops.RegisterGradient("TensorListSplit")
+def _TensorListSplitGrad(op, dlist):
+  return tensor_list_concat(dlist, element_dtype=op.inputs[0].dtype), None, None
+
+
 @ops.RegisterGradient("TensorListFromTensor")
 def _TensorListFromTensorGrad(op, dlist):
   """Gradient for TensorListFromTensor."""
diff --git a/tensorflow/python/ops/lookup_ops.py b/tensorflow/python/ops/lookup_ops.py
index 397d56ef40936c02d879c719027ceb5cfd10d93a..e96c93c15c27ebbdf833c6b97dd9f2ce8c0e4faa 100644
--- a/tensorflow/python/ops/lookup_ops.py
+++ b/tensorflow/python/ops/lookup_ops.py
@@ -39,6 +39,7 @@ from tensorflow.python.ops import string_ops
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.gen_lookup_ops import *
 # pylint: enable=wildcard-import
+from tensorflow.python.training.checkpointable import base as checkpointable_base
 from tensorflow.python.training.checkpointable import tracking as checkpointable
 from tensorflow.python.util import compat
 from tensorflow.python.util.deprecation import deprecated
@@ -160,7 +161,9 @@ class InitializableLookupTableBase(LookupInterface):
     self._default_value = ops.convert_to_tensor(
         default_value, dtype=self._value_dtype)
     self._default_value.get_shape().merge_with(tensor_shape.scalar())
-    self._initializer = initializer
+    if isinstance(initializer, checkpointable_base.CheckpointableBase):
+      self._initializer = self._track_checkpointable(
+          initializer, "_initializer")
     self._resource_handle = self.create_resource()
     self._init_op = self.initialize()
 
@@ -309,7 +312,7 @@ class HashTable(InitializableLookupTableBase):
     return exported_keys, exported_values
 
 
-class TableInitializerBase(object):
+class TableInitializerBase(checkpointable_base.CheckpointableBase):
   """Base class for lookup table initializers."""
 
   def __init__(self, key_dtype, value_dtype):
@@ -522,12 +525,14 @@ class TextFileInitializer(TableInitializerBase):
     if (vocab_size is not None) and (vocab_size <= 0):
       raise ValueError("Invalid vocab_size %s." % vocab_size)
 
-    self._filename = filename
     self._key_index = key_index
     self._value_index = value_index
     self._vocab_size = vocab_size
     self._delimiter = delimiter
     self._name = name
+    self._filename = self._track_checkpointable(
+        checkpointable.TrackableAsset(filename),
+        "_filename")
 
     super(TextFileInitializer, self).__init__(key_dtype, value_dtype)
 
@@ -943,7 +948,7 @@ def index_table_from_file(vocabulary_file=None,
   `[vocabulary size, vocabulary size + num_oov_buckets - 1]`.
 
   The underlying table must be initialized by calling
-  `tf.tables_initializer.run()` or `table.init.run()` once.
+  `session.run(tf.tables_initializer)` or `session.run(table.init)` once.
 
   To specify multi-column vocabulary files, use key_column_index and
   value_column_index and delimiter.
@@ -1072,7 +1077,7 @@ def index_table_from_tensor(vocabulary_list,
   `[vocabulary list size, vocabulary list size + num_oov_buckets - 1]`.
 
   The underlying table must be initialized by calling
-  `tf.tables_initializer.run()` or `table.init.run()` once.
+  `session.run(tf.tables_initializer)` or `session.run(table.init)` once.
 
   Elements in `vocabulary_list` cannot have duplicates, otherwise when executing
   the table initializer op, it will throw a `FailedPreconditionError`.
@@ -1174,7 +1179,7 @@ def index_to_string_table_from_file(vocabulary_file,
   (an out-of-vocabulary entry) is assigned the `default_value`
 
   The underlying table must be initialized by calling
-  `tf.tables_initializer.run()` or `table.init.run()` once.
+  `session.run(tf.tables_initializer)` or `session.run(table.init)` once.
 
   To specify multi-column vocabulary files, use key_column_index and
   value_column_index and delimiter.
@@ -1271,7 +1276,7 @@ def index_to_string_table_from_tensor(vocabulary_list,
   (an out-of-vocabulary entry) is assigned the `default_value`
 
   The underlying table must be initialized by calling
-  `tf.tables_initializer.run()` or `table.init.run()` once.
+  `session.run(tf.tables_initializer)` or `session.run(table.init)` once.
 
   Elements in `vocabulary_list` cannot have duplicates, otherwise when executing
   the table initializer op, it will throw a `FailedPreconditionError`.
diff --git a/tensorflow/python/ops/losses/losses_impl.py b/tensorflow/python/ops/losses/losses_impl.py
index 9e9de62e6cad1053a631cf6f935dea3063bf9e78..20397612bca9a9b81d9816ac1626ce15024d45f6 100644
--- a/tensorflow/python/ops/losses/losses_impl.py
+++ b/tensorflow/python/ops/losses/losses_impl.py
@@ -38,9 +38,10 @@ class ReductionV2(object):
   """Types of loss reduction.
 
   Contains the following values:
-  `NONE`: Un-reduced weighted losses with the same shape as input.
-  `SUM`: Scalar sum of weighted losses.
-  `SUM_OVER_BATCH_SIZE`: Scalar `SUM` divided by number of elements in losses.
+
+  * `NONE`: Un-reduced weighted losses with the same shape as input.
+  * `SUM`: Scalar sum of weighted losses.
+  * `SUM_OVER_BATCH_SIZE`: Scalar `SUM` divided by number of elements in losses.
   """
 
   NONE = "none"
@@ -62,13 +63,14 @@ class Reduction(object):
   """Types of loss reduction.
 
   Contains the following values:
-  `NONE`: Un-reduced weighted losses with the same shape as input.
-  `SUM`: Scalar sum of weighted losses.
-  `MEAN`: Scalar `SUM` divided by sum of weights. DEPRECATED.
-  `SUM_OVER_BATCH_SIZE`: Scalar `SUM` divided by number of elements in losses.
-  `SUM_OVER_NONZERO_WEIGHTS`: Scalar `SUM` divided by number of non-zero
+
+  * `NONE`: Un-reduced weighted losses with the same shape as input.
+  * `SUM`: Scalar sum of weighted losses.
+  * `MEAN`: Scalar `SUM` divided by sum of weights. DEPRECATED.
+  * `SUM_OVER_BATCH_SIZE`: Scalar `SUM` divided by number of elements in losses.
+  * `SUM_OVER_NONZERO_WEIGHTS`: Scalar `SUM` divided by number of non-zero
      weights. DEPRECATED.
-  `SUM_BY_NONZERO_WEIGHTS`: Same as `SUM_OVER_NONZERO_WEIGHTS`.
+  * `SUM_BY_NONZERO_WEIGHTS`: Same as `SUM_OVER_NONZERO_WEIGHTS`.
   """
 
   NONE = "none"
diff --git a/tensorflow/python/ops/math_grad.py b/tensorflow/python/ops/math_grad.py
index 35278d9680408aa44c81ec3276e61cd382a58c57..c7ec1c57d1b07232e2bdb05fc30f5456b792890f 100644
--- a/tensorflow/python/ops/math_grad.py
+++ b/tensorflow/python/ops/math_grad.py
@@ -1041,11 +1041,12 @@ def _PowGrad(op, grad):
   # Avoid false singularity at x = 0
   if x.dtype.is_complex:
     # real(x) < 0 is fine for the complex case
-    log_x = array_ops.where(
-        math_ops.not_equal(x, 0), math_ops.log(x), array_ops.zeros_like(x))
+    mask = math_ops.not_equal(x, 0)
   else:
     # There's no sensible real value to return if x < 0, so return 0
-    log_x = array_ops.where(x > 0, math_ops.log(x), array_ops.zeros_like(x))
+    mask = x > 0
+  safe_x = array_ops.where(mask, x, array_ops.ones_like(x))
+  log_x = array_ops.where(mask, math_ops.log(safe_x), array_ops.zeros_like(x))
   gy = array_ops.reshape(math_ops.reduce_sum(grad * z * log_x, ry), sy)
   return gx, gy
 
diff --git a/tensorflow/python/ops/math_grad_test.py b/tensorflow/python/ops/math_grad_test.py
index 88aa48271f3b22f334aa209851148793b0bf0adf..f415e65787d406e59725ec866845b0ab50f44d76 100644
--- a/tensorflow/python/ops/math_grad_test.py
+++ b/tensorflow/python/ops/math_grad_test.py
@@ -20,6 +20,9 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
+from tensorflow.python.eager import execution_callbacks
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -30,6 +33,8 @@ from tensorflow.python.ops import gradients
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
+RAISE = execution_callbacks.ExecutionCallback.RAISE
+
 
 class SquaredDifferenceOpTest(test.TestCase):
 
@@ -370,5 +375,25 @@ class XdivyTest(test.TestCase):
       self.assertAllClose(zero, xdivy_ygrad)
 
 
+@test_util.run_all_in_graph_and_eager_modes
+class PowGradTest(test.TestCase):
+
+  def test_zero_grad_tf_gradients(self):
+    if context.executing_eagerly():
+      self.skipTest("tf.gradients not supported in eager.")
+
+    x = constant_op.constant([-1., 0., 1.])
+    g = self.evaluate(gradients.gradients(math_ops.pow(x, 2), x)[0])
+    self.assertAllClose([-2., 0., 2.], g)
+
+  def test_zero_grad_tape(self):
+    with execution_callbacks.errstate(inf_or_nan=RAISE):
+      x = constant_op.constant([-1, 0., 1.])
+      with backprop.GradientTape() as tape:
+        tape.watch(x)
+        g = tape.gradient(math_ops.pow(x, 2), x)
+      g = self.evaluate(g)
+      self.assertAllClose([-2., 0., 2.], g)
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index c3feb1870d8f1dbcc70abc5862ad06d5cb354b71..1467678f2943a6400836cb8bd77f7e6f661ce516 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -43,6 +43,7 @@ from tensorflow.python.ops.gen_math_ops import *
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import compat
 from tensorflow.python.util import deprecation
+from tensorflow.python.util import dispatch
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 
@@ -165,6 +166,7 @@ def argmin_v2(input,
 # pylint: disable=anomalous-backslash-in-string,protected-access
 # pylint: disable=g-docstring-has-escape
 @tf_export("math.abs", "abs")
+@dispatch.add_dispatch_support
 def abs(x, name=None):  # pylint: disable=redefined-builtin
   r"""Computes the absolute value of a tensor.
 
@@ -189,22 +191,10 @@ def abs(x, name=None):  # pylint: disable=redefined-builtin
       of type `float32` or `float64`, respectively.
   """
   with ops.name_scope(name, "Abs", [x]) as name:
-    if isinstance(x, sparse_tensor.SparseTensor):
-      if x.values.dtype.is_complex:
-        x_abs = gen_math_ops.complex_abs(
-            x.values, Tout=x.values.dtype.real_dtype, name=name)
-        return sparse_tensor.SparseTensor(
-            indices=x.indices, values=x_abs, dense_shape=x.dense_shape)
-      x_abs = gen_math_ops._abs(x.values, name=name)
-      return sparse_tensor.SparseTensor(
-          indices=x.indices, values=x_abs, dense_shape=x.dense_shape)
-    else:
-      x = ops.convert_to_tensor(x, name="x")
-      if x.dtype.is_complex:
-        return gen_math_ops.complex_abs(x, Tout=x.dtype.real_dtype, name=name)
-      return gen_math_ops._abs(x, name=name)
-
-
+    x = ops.convert_to_tensor(x, name="x")
+    if x.dtype.is_complex:
+      return gen_math_ops.complex_abs(x, Tout=x.dtype.real_dtype, name=name)
+    return gen_math_ops._abs(x, name=name)
 # pylint: enable=g-docstring-has-escape
 
 
@@ -240,6 +230,7 @@ class DivideDelegateWithName(object):
 
 
 @tf_export("math.divide", "divide")
+@dispatch.add_dispatch_support
 def divide(x, y, name=None):
   """Computes Python style division of `x` by `y`."""
 
@@ -252,6 +243,7 @@ def divide(x, y, name=None):
 
 
 @tf_export("math.multiply", "multiply")
+@dispatch.add_dispatch_support
 def multiply(x, y, name=None):
   return gen_math_ops.mul(x, y, name)
 
@@ -272,6 +264,7 @@ _mul.__doc__ = (
 
 
 @tf_export("math.subtract", "subtract")
+@dispatch.add_dispatch_support
 def subtract(x, y, name=None):
   return gen_math_ops.sub(x, y, name)
 
@@ -291,31 +284,7 @@ _sub.__doc__ = (
     gen_math_ops.sub.__doc__ + ("" if _sub.__doc__ is None else _sub.__doc__))
 
 
-# pylint: disable=g-docstring-has-escape
-@tf_export("math.negative", "negative")
-def negative(x, name=None):
-  """Computes numerical negative value element-wise.
-
-  I.e., \\(y = -x\\).
-
-  Args:
-    x: A `Tensor` or `SparseTensor`. Must be one of the following types: `half`,
-      `float32`, `float64`, `int32`, `int64`, `complex64`, `complex128`.
-    name: A name for the operation (optional).
-
-  Returns:
-    A `Tensor` or `SparseTensor`, respectively. Has the same type as `x`.
-  """
-  with ops.name_scope(name, "Neg", [x]) as name:
-    if isinstance(x, sparse_tensor.SparseTensor):
-      x_neg = gen_math_ops.neg(x.values, name=name)
-      return sparse_tensor.SparseTensor(
-          indices=x.indices, values=x_neg, dense_shape=x.dense_shape)
-    else:
-      return gen_math_ops.neg(x, name=name)
-
-
-# pylint: enable=g-docstring-has-escape
+negative = gen_math_ops.neg
 
 
 # pylint: disable=g-docstring-has-escape
@@ -341,105 +310,6 @@ def _neg(x, name=None):
 # pylint: enable=g-docstring-has-escape
 
 
-@tf_export("math.sign", "sign")
-def sign(x, name=None):
-  """Returns an element-wise indication of the sign of a number.
-
-  `y = sign(x) = -1` if `x < 0`; 0 if `x == 0` or `tf.is_nan(x)`; 1 if `x > 0`.
-
-  Zero is returned for NaN inputs.
-
-  For complex numbers, `y = sign(x) = x / |x|` if `x != 0`, otherwise `y = 0`.
-
-  Args:
-    x: A `Tensor` or `SparseTensor`. Must be one of the following types: `half`,
-      `float32`, `float64`, `int32`, `int64`, `complex64`, `complex128`.
-    name: A name for the operation (optional).
-
-  Returns:
-    A `Tensor` or `SparseTensor`, respectively. Has the same type as `x`.
-
-  @compatibility(numpy)
-  Equivalent to numpy.sign except for the behavior for input values of NaN.
-  @end_compatibility
-  """
-  with ops.name_scope(name, "Sign", [x]) as name:
-    if isinstance(x, sparse_tensor.SparseTensor):
-      x_sign = gen_math_ops.sign(x.values, name=name)
-      return sparse_tensor.SparseTensor(
-          indices=x.indices, values=x_sign, dense_shape=x.dense_shape)
-    else:
-      return gen_math_ops.sign(x, name=name)
-
-
-@tf_export("math.square", "square")
-def square(x, name=None):
-  r"""Computes square of x element-wise.
-
-  I.e., \\(y = x * x = x^2\\).
-
-  Args:
-    x: A `Tensor` or `SparseTensor`. Must be one of the following types: `half`,
-      `float32`, `float64`, `int32`, `int64`, `complex64`, `complex128`.
-    name: A name for the operation (optional).
-
-  Returns:
-    A `Tensor` or `SparseTensor`. Has the same type as `x`.
-  """
-  with ops.name_scope(name, "Square", [x]) as name:
-    if isinstance(x, sparse_tensor.SparseTensor):
-      x_square = gen_math_ops.square(x.values, name=name)
-      return sparse_tensor.SparseTensor(
-          indices=x.indices, values=x_square, dense_shape=x.dense_shape)
-    else:
-      return gen_math_ops.square(x, name=name)
-
-
-@tf_export("math.sqrt", "sqrt")
-def sqrt(x, name=None):
-  r"""Computes square root of x element-wise.
-
-  I.e., \\(y = \sqrt{x} = x^{1/2}\\).
-
-  Args:
-    x: A `Tensor` or `SparseTensor`. Must be one of the following types: `half`,
-      `float32`, `float64`, `complex64`, `complex128`.
-    name: A name for the operation (optional).
-
-  Returns:
-    A `Tensor` or `SparseTensor`, respectively. Has the same type as `x`.
-  """
-  with ops.name_scope(name, "Sqrt", [x]) as name:
-    if isinstance(x, sparse_tensor.SparseTensor):
-      x_sqrt = gen_math_ops.sqrt(x.values, name=name)
-      return sparse_tensor.SparseTensor(
-          indices=x.indices, values=x_sqrt, dense_shape=x.dense_shape)
-    else:
-      return gen_math_ops.sqrt(x, name=name)
-
-
-@tf_export("math.erf", v1=["math.erf", "erf"])
-@deprecation.deprecated_endpoints("erf")
-def erf(x, name=None):
-  """Computes the Gauss error function of `x` element-wise.
-
-  Args:
-    x: A `Tensor` or `SparseTensor`. Must be one of the following types: `half`,
-      `float32`, `float64`.
-    name: A name for the operation (optional).
-
-  Returns:
-    A `Tensor` or `SparseTensor`, respectively. Has the same type as `x`.
-  """
-  with ops.name_scope(name, "Erf", [x]) as name:
-    if isinstance(x, sparse_tensor.SparseTensor):
-      x_erf = gen_math_ops.erf(x.values, name=name)
-      return sparse_tensor.SparseTensor(
-          indices=x.indices, values=x_erf, dense_shape=x.dense_shape)
-    else:
-      return gen_math_ops.erf(x, name=name)
-
-
 @tf_export(v1=["math.scalar_mul", "scalar_mul"])
 def scalar_mul(scalar, x, name=None):
   """Multiplies a scalar times a `Tensor` or `IndexedSlices` object.
@@ -480,6 +350,7 @@ def scalar_mul_v2(scalar, x, name=None):
 
 
 @tf_export("math.pow", "pow")
+@dispatch.add_dispatch_support
 def pow(x, y, name=None):  # pylint: disable=redefined-builtin
   r"""Computes the power of one value to another.
 
@@ -508,6 +379,7 @@ def pow(x, y, name=None):  # pylint: disable=redefined-builtin
 
 # pylint: disable=redefined-builtin,redefined-outer-name
 @tf_export("dtypes.complex", "complex")
+@dispatch.add_dispatch_support
 def complex(real, imag, name=None):
   r"""Converts two real numbers to a complex number.
 
@@ -551,6 +423,7 @@ def complex(real, imag, name=None):
 
 @tf_export("math.real", v1=["math.real", "real"])
 @deprecation.deprecated_endpoints("real")
+@dispatch.add_dispatch_support
 def real(input, name=None):
   r"""Returns the real part of a complex (or real) tensor.
 
@@ -583,6 +456,7 @@ def real(input, name=None):
 
 @tf_export("math.imag", v1=["math.imag", "imag"])
 @deprecation.deprecated_endpoints("imag")
+@dispatch.add_dispatch_support
 def imag(input, name=None):
   r"""Returns the imaginary part of a complex (or real) tensor.
 
@@ -614,6 +488,7 @@ def imag(input, name=None):
 
 @tf_export("math.angle", v1=["math.angle", "angle"])
 @deprecation.deprecated_endpoints("angle")
+@dispatch.add_dispatch_support
 def angle(input, name=None):
   r"""Returns the element-wise argument of a complex (or real) tensor.
 
@@ -653,6 +528,7 @@ def angle(input, name=None):
 
 
 @tf_export("math.round", "round")
+@dispatch.add_dispatch_support
 def round(x, name=None):  # pylint: disable=redefined-builtin
   """Rounds the values of a tensor to the nearest integer, element-wise.
 
@@ -680,6 +556,7 @@ def round(x, name=None):  # pylint: disable=redefined-builtin
 
 
 @tf_export("dtypes.cast", "cast")
+@dispatch.add_dispatch_support
 def cast(x, dtype, name=None):
   """Casts a tensor to a new type.
 
@@ -743,6 +620,7 @@ def cast(x, dtype, name=None):
 
 
 @tf_export("dtypes.saturate_cast", "saturate_cast")
+@dispatch.add_dispatch_support
 def saturate_cast(value, dtype, name=None):
   """Performs a safe saturating cast of `value` to `dtype`.
 
@@ -1068,6 +946,7 @@ def _div_python2(x, y, name=None):
 
 
 @tf_export("math.truediv", "truediv")
+@dispatch.add_dispatch_support
 def truediv(x, y, name=None):
   """Divides x / y elementwise (using Python 3 division operator semantics).
 
@@ -1125,6 +1004,7 @@ def div(x, y, name=None):
 
 
 @tf_export("div_no_nan")
+@dispatch.add_dispatch_support
 def div_no_nan(x, y, name=None):
   """Computes an unsafe divide which returns 0 if the y is zero.
 
@@ -1154,6 +1034,7 @@ mod = gen_math_ops.floor_mod
 # TODO(aselle): Deprecate this once all internal functionality uses
 # tf.truncatediv
 @tf_export("math.floordiv", v1=["math.floordiv", "floordiv"])
+@dispatch.add_dispatch_support
 @deprecation.deprecated_endpoints("floordiv")
 def floordiv(x, y, name=None):
   """Divides `x / y` elementwise, rounding toward the most negative integer.
@@ -1183,16 +1064,11 @@ def floordiv(x, y, name=None):
 
 
 realdiv = gen_math_ops.real_div
-tf_export("realdiv")(realdiv)
 truncatediv = gen_math_ops.truncate_div
-tf_export("truncatediv")(truncatediv)
 # TODO(aselle): Rename this to floordiv when we can.
 floor_div = gen_math_ops.floor_div
-tf_export("floor_div")(floor_div)
 truncatemod = gen_math_ops.truncate_mod
-tf_export("truncatemod")(truncatemod)
 floormod = gen_math_ops.floor_mod
-tf_export("floormod", "mod")(floormod)
 
 
 def _mul_dispatch(x, y, name=None):
@@ -1228,6 +1104,7 @@ _OverrideBinaryOperatorHelper(pow, "pow")
 
 
 @tf_export("math.logical_xor", v1=["math.logical_xor", "logical_xor"])
+@dispatch.add_dispatch_support
 @deprecation.deprecated_endpoints("logical_xor")
 def logical_xor(x, y, name="LogicalXor"):
   """x ^ y = (x | y) & ~(x & y)."""
@@ -1410,6 +1287,7 @@ def reduce_sum_v1(input_tensor,
 
 
 @tf_export("math.reduce_sum", "reduce_sum", v1=[])
+@dispatch.add_dispatch_support
 def reduce_sum(input_tensor, axis=None, keepdims=False, name=None):
   """Computes the sum of elements across dimensions of a tensor.
 
@@ -1587,7 +1465,7 @@ def count_nonzero_v2(input,  # pylint: disable=redefined-builtin
     return cast(
         reduce_sum(
             # int64 reduction happens on GPU
-            to_int64(gen_math_ops.not_equal(input, zero)),
+            cast(gen_math_ops.not_equal(input, zero), dtypes.int64),
             axis=axis,
             keepdims=keepdims),
         dtype=dtype)
@@ -1657,6 +1535,7 @@ def reduce_mean_v1(input_tensor,
 
 
 @tf_export("math.reduce_mean", "reduce_mean", v1=[])
+@dispatch.add_dispatch_support
 def reduce_mean(input_tensor, axis=None, keepdims=False, name=None):
   """Computes the mean of elements across dimensions of a tensor.
 
@@ -1757,7 +1636,7 @@ def reduce_variance(input_tensor, axis=None, keepdims=False, name=None):
   name = name if name else "reduce_variance"
   with ops.name_scope(name):
     means = reduce_mean(input_tensor, axis=axis, keepdims=True)
-    squared_deviations = square(input_tensor - means)
+    squared_deviations = gen_math_ops.square(input_tensor - means)
     return reduce_mean(squared_deviations, axis=axis, keepdims=keepdims)
 
 
@@ -1804,10 +1683,11 @@ def reduce_std(input_tensor, axis=None, keepdims=False, name=None):
   name = name if name else "reduce_std"
   with ops.name_scope(name):
     variance = reduce_variance(input_tensor, axis=axis, keepdims=keepdims)
-    return sqrt(variance)
+    return gen_math_ops.sqrt(variance)
 
 
 @tf_export("math.reduce_prod", "reduce_prod", v1=[])
+@dispatch.add_dispatch_support
 def reduce_prod(input_tensor, axis=None, keepdims=False, name=None):
   """Computes the product of elements across dimensions of a tensor.
 
@@ -1929,6 +1809,7 @@ def reduce_min_v1(input_tensor,
 
 
 @tf_export("math.reduce_min", "reduce_min", v1=[])
+@dispatch.add_dispatch_support
 def reduce_min(input_tensor, axis=None, keepdims=False, name=None):
   """Computes the minimum of elements across dimensions of a tensor.
 
@@ -2007,6 +1888,7 @@ def reduce_max_v1(input_tensor,
 
 
 @tf_export("math.reduce_max", "reduce_max", v1=[])
+@dispatch.add_dispatch_support
 def reduce_max(input_tensor, axis=None, keepdims=False, name=None):
   """Computes the maximum of elements across dimensions of a tensor.
 
@@ -2094,6 +1976,7 @@ def reduce_all_v1(input_tensor,
 
 
 @tf_export("reduce_all", "math.reduce_all", v1=[])
+@dispatch.add_dispatch_support
 def reduce_all(input_tensor, axis=None, keepdims=False, name=None):
   """Computes the "logical and" of elements across dimensions of a tensor.
 
@@ -2190,6 +2073,7 @@ def reduce_any_v1(input_tensor,
 
 
 @tf_export("math.reduce_any", "reduce_any", v1=[])
+@dispatch.add_dispatch_support
 def reduce_any(input_tensor, axis=None, keepdims=False, name=None):
   """Computes the "logical or" of elements across dimensions of a tensor.
 
@@ -2671,7 +2555,8 @@ def matvec(a,
 
 _OverrideBinaryOperatorHelper(matmul, "matmul")
 
-sparse_matmul = gen_math_ops.sparse_mat_mul
+sparse_matmul = deprecation.deprecated(None, "Use `tf.linalg.matmul` instead")(
+    gen_math_ops.sparse_mat_mul)
 tf_export(v1=["sparse_matmul"])(sparse_matmul)
 
 
@@ -2751,9 +2636,12 @@ def _as_indexed_slices_list(inputs, optimize=True):
 
 
 @tf_export("math.add_n", "add_n")
+@dispatch.add_dispatch_support
 def add_n(inputs, name=None):
   """Adds all input tensors element-wise.
 
+  Converts `IndexedSlices` objects into dense tensors prior to adding.
+
   Args:
     inputs: A list of `Tensor` or `IndexedSlices` objects, each with same shape
       and type.
@@ -2776,7 +2664,7 @@ def add_n(inputs, name=None):
 
   if len(inputs) == 1:
     if isinstance(inputs[0], ops.IndexedSlices):
-      values = inputs[0].values
+      values = ops.convert_to_tensor(inputs[0])
     else:
       values = inputs[0]
     if name:
@@ -2896,6 +2784,7 @@ def sigmoid(x, name=None):
 
 
 @tf_export("math.log_sigmoid", v1=["math.log_sigmoid", "log_sigmoid"])
+@dispatch.add_dispatch_support
 @deprecation.deprecated_endpoints("log_sigmoid")
 def log_sigmoid(x, name=None):
   """Computes log sigmoid of `x` element-wise.
@@ -2915,27 +2804,6 @@ def log_sigmoid(x, name=None):
     return gen_math_ops.neg(gen_nn_ops.softplus(-x), name=name)
 
 
-@tf_export("math.tanh", "nn.tanh", "tanh")
-def tanh(x, name=None):
-  """Computes hyperbolic tangent of `x` element-wise.
-
-  Args:
-    x: A Tensor or SparseTensor with type `float16`, `float32`, `double`,
-      `complex64`, or `complex128`.
-    name: A name for the operation (optional).
-
-  Returns:
-    A Tensor or SparseTensor respectively with the same type as `x`.
-  """
-  with ops.name_scope(name, "Tanh", [x]) as name:
-    if isinstance(x, sparse_tensor.SparseTensor):
-      x_tanh = gen_math_ops.tanh(x.values, name=name)
-      return sparse_tensor.SparseTensor(
-          indices=x.indices, values=x_tanh, dense_shape=x.dense_shape)
-    else:
-      return gen_math_ops.tanh(x, name=name)
-
-
 @tf_export("math.bincount", v1=[])
 def bincount(arr,
              weights=None,
@@ -3126,6 +2994,7 @@ def cumprod(x, axis=0, exclusive=False, reverse=False, name=None):
 
 
 @tf_export("math.conj", v1=["math.conj", "conj"])
+@dispatch.add_dispatch_support
 @deprecation.deprecated_endpoints("conj")
 def conj(x, name=None):
   r"""Returns the complex conjugate of a complex number.
@@ -3230,6 +3099,7 @@ def _unsorted_segment_N(data, segment_ids, num_segments):
     "math.unsorted_segment_mean",
     v1=["math.unsorted_segment_mean", "unsorted_segment_mean"])
 @deprecation.deprecated_endpoints("unsorted_segment_mean")
+@dispatch.add_dispatch_support
 def unsorted_segment_mean(data, segment_ids, num_segments, name=None):
   r"""Computes the mean along segments of a tensor.
 
@@ -3275,6 +3145,7 @@ def unsorted_segment_mean(data, segment_ids, num_segments, name=None):
     "math.unsorted_segment_sqrt_n",
     v1=["math.unsorted_segment_sqrt_n", "unsorted_segment_sqrt_n"])
 @deprecation.deprecated_endpoints("unsorted_segment_sqrt_n")
+@dispatch.add_dispatch_support
 def unsorted_segment_sqrt_n(data, segment_ids, num_segments, name=None):
   r"""Computes the sum along segments of a tensor divided by the sqrt(N).
 
@@ -3762,63 +3633,3 @@ def polyval(coeffs, x, name=None):
     for c in coeffs[1:]:
       p = c + p * x
     return p
-
-
-@tf_export("math.bessel_i0e")
-def bessel_i0e(x, name=None):
-  """Computes the Bessel i0e function of `x` element-wise.
-
-  Exponentially scaled modified Bessel function of order 0 defined as
-  `bessel_i0e(x) = exp(-abs(x)) bessel_i0(x)`.
-
-  This function is faster and numerically stabler than `bessel_i0(x)`.
-
-  Args:
-    x: A `Tensor` or `SparseTensor`. Must be one of the following types: `half`,
-      `float32`, `float64`.
-    name: A name for the operation (optional).
-
-  Returns:
-    A `Tensor` or `SparseTensor`, respectively. Has the same type as `x`.
-
-  @compatibility(scipy)
-  Equivalent to scipy.special.i0e
-  @end_compatibility
-  """
-  with ops.name_scope(name, "bessel_i0e", [x]) as name:
-    if isinstance(x, sparse_tensor.SparseTensor):
-      x_i0e = gen_math_ops.bessel_i0e(x.values, name=name)
-      return sparse_tensor.SparseTensor(
-          indices=x.indices, values=x_i0e, dense_shape=x.dense_shape)
-    else:
-      return gen_math_ops.bessel_i0e(x, name=name)
-
-
-@tf_export("math.bessel_i1e")
-def bessel_i1e(x, name=None):
-  """Computes the Bessel i1e function of `x` element-wise.
-
-  Exponentially scaled modified Bessel function of order 1 defined as
-  `bessel_i1e(x) = exp(-abs(x)) bessel_i1(x)`.
-
-  This function is faster and numerically stabler than `bessel_i1(x)`.
-
-  Args:
-    x: A `Tensor` or `SparseTensor`. Must be one of the following types: `half`,
-      `float32`, `float64`.
-    name: A name for the operation (optional).
-
-  Returns:
-    A `Tensor` or `SparseTensor`, respectively. Has the same type as `x`.
-
-  @compatibility(scipy)
-  Equivalent to scipy.special.i1e
-  @end_compatibility
-  """
-  with ops.name_scope(name, "bessel_i1e", [x]) as name:
-    if isinstance(x, sparse_tensor.SparseTensor):
-      x_i1e = gen_math_ops.bessel_i1e(x.values, name=name)
-      return sparse_tensor.SparseTensor(
-          indices=x.indices, values=x_i1e, dense_shape=x.dense_shape)
-    else:
-      return gen_math_ops.bessel_i1e(x, name=name)
diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py
index add1621a56b8386f1dfb68bb3a95c25dd458b2e0..4de56fce0ad4a0532d8d68668a91485a6e415514 100644
--- a/tensorflow/python/ops/math_ops_test.py
+++ b/tensorflow/python/ops/math_ops_test.py
@@ -223,7 +223,7 @@ class SquaredDifferenceTest(test_util.TensorFlowTestCase):
 
   @test_util.run_in_graph_and_eager_modes
   def testSquaredDifference(self):
-    for dtype in [np.int32, np.float16]:
+    for dtype in [np.float16, np.float32, np.float64, np.int32, np.int64]:
       x = np.array([[1, 2, 3], [4, 5, 6]], dtype=dtype)
       y = np.array([-3, -2, -1], dtype=dtype)
       z = (x - y) * (x - y)
@@ -231,6 +231,17 @@ class SquaredDifferenceTest(test_util.TensorFlowTestCase):
         z_tf = self.evaluate(math_ops.squared_difference(x, y))
         self.assertAllClose(z, z_tf)
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testComplexSquaredDifference(self):
+    for dtype in [np.complex64, np.complex128]:
+      x = np.array([[1 + 3j, 2 + 2j, 3 + 1j], [4 - 1j, 5 - 2j, 6 - 3j]],
+                   dtype=dtype)
+      y = np.array([-3 + 1j, -2 + 2j, -1 + 3j], dtype=dtype)
+      z = np.conj(x - y) * (x - y)
+      with test_util.device(use_gpu=False):
+        z_tf = self.evaluate(math_ops.squared_difference(x, y))
+        self.assertAllClose(z, z_tf)
+
 
 class ApproximateEqualTest(test_util.TensorFlowTestCase):
 
@@ -392,6 +403,18 @@ class AddNTest(test_util.TensorFlowTestCase):
                             [g.eval() for g in add_n_grad])
 
 
+  @test_util.run_deprecated_v1
+  def testIndexedSlices(self):
+    slc = ops.IndexedSlices(
+        array_ops.constant([1, 2], shape=[1, 2]), array_ops.constant([1]),
+        array_ops.constant([2, 2]))
+    slc_as_dense = np.array([[0, 0], [1, 2]])
+    with self.test_session(use_gpu=True):
+      # add_n currently always converts IndexedSlices to dense
+      self.assertAllEqual(slc_as_dense, math_ops.add_n([slc]).eval())
+      self.assertAllEqual(2 * slc_as_dense, math_ops.add_n([slc, slc]).eval())
+
+
 class DivAndModTest(test_util.TensorFlowTestCase):
   # TODO(aselle): Test more types before exposing new division operators.
 
diff --git a/tensorflow/python/ops/metrics_impl.py b/tensorflow/python/ops/metrics_impl.py
index cb421990112a2d9a0e4e77066cadb43763dbabe1..ec39b1790e340a0d194dea8ab3419ca78fc9d126 100644
--- a/tensorflow/python/ops/metrics_impl.py
+++ b/tensorflow/python/ops/metrics_impl.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -34,7 +35,6 @@ from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import weights_broadcast_ops
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.util.deprecation import deprecated
 from tensorflow.python.util.tf_export import tf_export
 
diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py
index 26bd13eaee464ed303cb0043edff191f245f61db..48dcab4842864b7322610e4328c1771f95ee352d 100644
--- a/tensorflow/python/ops/nn_impl.py
+++ b/tensorflow/python/ops/nn_impl.py
@@ -262,7 +262,7 @@ def weighted_cross_entropy_with_logits(targets, logits, pos_weight, name=None):
         name=name)
 
 
-@tf_export("nn.relu_layer")
+@tf_export(v1=["nn.relu_layer"])
 def relu_layer(x, weights, biases, name=None):
   """Computes Relu(x * weight + biases).
 
@@ -1462,7 +1462,111 @@ def _compute_sampled_logits(weights,
     return out_logits, out_labels
 
 
-@tf_export("nn.nce_loss")
+@tf_export("nn.nce_loss", v1=[])
+def nce_loss_v2(weights,
+                biases,
+                labels,
+                inputs,
+                num_sampled,
+                num_classes,
+                num_true=1,
+                sampled_values=None,
+                remove_accidental_hits=False,
+                name="nce_loss"):
+  """Computes and returns the noise-contrastive estimation training loss.
+
+  See [Noise-contrastive estimation: A new estimation principle for
+  unnormalized statistical
+  models](http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf).
+  Also see our [Candidate Sampling Algorithms
+  Reference](https://www.tensorflow.org/extras/candidate_sampling.pdf)
+
+  A common use case is to use this method for training, and calculate the full
+  sigmoid loss for evaluation or inference as in the following example:
+
+  ```python
+  if mode == "train":
+    loss = tf.nn.nce_loss(
+        weights=weights,
+        biases=biases,
+        labels=labels,
+        inputs=inputs,
+        ...)
+  elif mode == "eval":
+    logits = tf.matmul(inputs, tf.transpose(weights))
+    logits = tf.nn.bias_add(logits, biases)
+    labels_one_hot = tf.one_hot(labels, n_classes)
+    loss = tf.nn.sigmoid_cross_entropy_with_logits(
+        labels=labels_one_hot,
+        logits=logits)
+    loss = tf.reduce_sum(loss, axis=1)
+  ```
+
+  Note: when doing embedding lookup on `weights` and `bias`, "div" partition
+  strategy will be used. Support for other partition strategy will be added
+  later.
+
+  Note: By default this uses a log-uniform (Zipfian) distribution for sampling,
+  so your labels must be sorted in order of decreasing frequency to achieve
+  good results.  For more details, see
+  `tf.nn.log_uniform_candidate_sampler`.
+
+  Note: In the case where `num_true` > 1, we assign to each target class
+  the target probability 1 / `num_true` so that the target probabilities
+  sum to 1 per-example.
+
+  Note: It would be useful to allow a variable number of target classes per
+  example.  We hope to provide this functionality in a future release.
+  For now, if you have a variable number of target classes, you can pad them
+  out to a constant number by either repeating them or by padding
+  with an otherwise unused class.
+
+  Args:
+    weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
+      objects whose concatenation along dimension 0 has shape [num_classes,
+      dim].  The (possibly-partitioned) class embeddings.
+    biases: A `Tensor` of shape `[num_classes]`.  The class biases.
+    labels: A `Tensor` of type `int64` and shape `[batch_size, num_true]`. The
+      target classes.
+    inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward activations of
+      the input network.
+    num_sampled: An `int`.  The number of negative classes to randomly sample
+      per batch. This single sample of negative classes is evaluated for each
+      element in the batch.
+    num_classes: An `int`. The number of possible classes.
+    num_true: An `int`.  The number of target classes per training example.
+    sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
+      `sampled_expected_count`) returned by a `*_candidate_sampler` function.
+      (if None, we default to `log_uniform_candidate_sampler`)
+    remove_accidental_hits:  A `bool`.  Whether to remove "accidental hits"
+      where a sampled class equals one of the target classes.  If set to `True`,
+      this is a "Sampled Logistic" loss instead of NCE, and we are learning to
+      generate log-odds instead of log probabilities.  See our [Candidate
+      Sampling Algorithms Reference]
+        (https://www.tensorflow.org/extras/candidate_sampling.pdf). Default is
+          False.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `batch_size` 1-D tensor of per-example NCE losses.
+  """
+  # TODO(yuefengz): get partition_strategy from either variables or distribution
+  # strategies.
+  return nce_loss(
+      weights,
+      biases,
+      labels,
+      inputs,
+      num_sampled,
+      num_classes,
+      num_true=num_true,
+      sampled_values=sampled_values,
+      remove_accidental_hits=remove_accidental_hits,
+      partition_strategy="div",
+      name=name)
+
+
+@tf_export(v1=["nn.nce_loss"])
 def nce_loss(weights,
              biases,
              labels,
@@ -1573,7 +1677,98 @@ def nce_loss(weights,
   return _sum_rows(sampled_losses)
 
 
-@tf_export("nn.sampled_softmax_loss")
+@tf_export("nn.sampled_softmax_loss", v1=[])
+def sampled_softmax_loss_v2(weights,
+                            biases,
+                            labels,
+                            inputs,
+                            num_sampled,
+                            num_classes,
+                            num_true=1,
+                            sampled_values=None,
+                            remove_accidental_hits=True,
+                            seed=None,
+                            name="sampled_softmax_loss"):
+  """Computes and returns the sampled softmax training loss.
+
+  This is a faster way to train a softmax classifier over a huge number of
+  classes.
+
+  This operation is for training only.  It is generally an underestimate of
+  the full softmax loss.
+
+  A common use case is to use this method for training, and calculate the full
+  sigmoid loss for evaluation or inference as in the following example:
+
+  ```python
+  if mode == "train":
+    loss = tf.nn.sampled_softmax_loss(
+        weights=weights,
+        biases=biases,
+        labels=labels,
+        inputs=inputs,
+        ...)
+  elif mode == "eval":
+    logits = tf.matmul(inputs, tf.transpose(weights))
+    logits = tf.nn.bias_add(logits, biases)
+    labels_one_hot = tf.one_hot(labels, n_classes)
+    loss = tf.nn.softmax_cross_entropy_with_logits_v2(
+        labels=labels_one_hot,
+        logits=logits)
+  ```
+
+  See our [Candidate Sampling Algorithms Reference]
+  (https://www.tensorflow.org/extras/candidate_sampling.pdf)
+
+  Also see Section 3 of [Jean et al., 2014](http://arxiv.org/abs/1412.2007)
+  ([pdf](http://arxiv.org/pdf/1412.2007.pdf)) for the math.
+
+  Note: when doing embedding lookup on `weights` and `bias`, "div" partition
+  strategy will be used. Support for other partition strategy will be added
+  later.
+
+  Args:
+    weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
+      objects whose concatenation along dimension 0 has shape [num_classes,
+      dim].  The (possibly-sharded) class embeddings.
+    biases: A `Tensor` of shape `[num_classes]`.  The class biases.
+    labels: A `Tensor` of type `int64` and shape `[batch_size, num_true]`. The
+      target classes.  Note that this format differs from the `labels` argument
+      of `nn.softmax_cross_entropy_with_logits_v2`.
+    inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward activations of
+      the input network.
+    num_sampled: An `int`.  The number of classes to randomly sample per batch.
+    num_classes: An `int`. The number of possible classes.
+    num_true: An `int`.  The number of target classes per training example.
+    sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
+      `sampled_expected_count`) returned by a `*_candidate_sampler` function.
+      (if None, we default to `log_uniform_candidate_sampler`)
+    remove_accidental_hits:  A `bool`.  whether to remove "accidental hits"
+      where a sampled class equals one of the target classes.  Default is True.
+    seed: random seed for candidate sampling. Default to None, which doesn't set
+      the op-level random seed for candidate sampling.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `batch_size` 1-D tensor of per-example sampled softmax losses.
+
+  """
+  return sampled_softmax_loss(
+      weights,
+      biases,
+      labels,
+      inputs,
+      num_sampled,
+      num_classes,
+      num_true=num_true,
+      sampled_values=sampled_values,
+      remove_accidental_hits=remove_accidental_hits,
+      partition_strategy="div",
+      name=name,
+      seed=seed)
+
+
+@tf_export(v1=["nn.sampled_softmax_loss"])
 def sampled_softmax_loss(weights,
                          biases,
                          labels,
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index 74e88b765339240fefd8b7cc29a96bd5e311aa2d..6f2d2c15bd40109b79e7497c6b279fd8edf23bd7 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -36,13 +36,14 @@ from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
-
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.gen_nn_ops import *
 # pylint: enable=wildcard-import
-
 from tensorflow.python.util import deprecation
+from tensorflow.python.util.deprecation import deprecated_args
+from tensorflow.python.util.deprecation import deprecated_argument_lookup
+
 from tensorflow.python.util.tf_export import tf_export
 
 # Aliases for some automatically-generated names.
@@ -2168,6 +2169,14 @@ def _softmax(logits, compute_op, dim=-1, name=None):
   # If dim is not the last dimension, we have to do a transpose so that we can
   # still perform softmax on its last dimension.
 
+  # In case dim is negative (and is not last dimension -1), add shape.ndims
+  ndims = array_ops.rank(logits)
+  if not isinstance(dim, ops.Tensor):
+    if dim < 0:
+      dim += ndims
+  else:
+    dim = array_ops.where(math_ops.less(dim, 0), dim + ndims, dim)
+
   # Swap logits' dimension of dim and its last dimension.
   input_rank = array_ops.rank(logits)
   dim_axis = dim % shape.ndims
@@ -2308,9 +2317,8 @@ def _ensure_xent_args(name, sentinel, labels, logits):
     raise ValueError("Both labels and logits must be provided.")
 
 
-@tf_export("nn.softmax_cross_entropy_with_logits",
-           v1=["nn.softmax_cross_entropy_with_logits_v2"])
-def softmax_cross_entropy_with_logits_v2(labels, logits, dim=-1, name=None):
+@tf_export("nn.softmax_cross_entropy_with_logits", v1=[])
+def softmax_cross_entropy_with_logits_v2(labels, logits, axis=-1, name=None):
   """Computes softmax cross entropy between `logits` and `labels`.
 
   Measures the probability error in discrete classification tasks in which the
@@ -2332,7 +2340,7 @@ def softmax_cross_entropy_with_logits_v2(labels, logits, dim=-1, name=None):
 
   A common use case is to have logits and labels of shape
   `[batch_size, num_classes]`, but higher dimensions are supported, with
-  the `dim` argument specifying the class dimension.
+  the `axis` argument specifying the class dimension.
 
   `logits` and `labels` must have the same dtype (either `float16`, `float32`,
   or `float64`).
@@ -2350,8 +2358,64 @@ def softmax_cross_entropy_with_logits_v2(labels, logits, dim=-1, name=None):
       `[batch_size, num_classes]`, each row of `labels[i]` must be a valid
       probability distribution.
     logits: Unscaled log probabilities.
-    dim: The class dimension. Defaulted to -1 which is the last dimension.
+    axis: The class dimension. Defaulted to -1 which is the last dimension.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor` that contains the softmax cross entropy loss. Its type is the
+    same as `logits` and its shape is the same as `labels` except that it does
+    not have the last dimension of `labels`.
+  """
+  return softmax_cross_entropy_with_logits_v2_helper(
+      labels=labels, logits=logits, axis=axis, name=name)
+
+
+@tf_export(v1=["nn.softmax_cross_entropy_with_logits_v2"])
+@deprecated_args(None, "dim is deprecated, use axis instead", "dim")
+def softmax_cross_entropy_with_logits_v2_helper(
+    labels, logits, axis=None, name=None, dim=None):
+  """Computes softmax cross entropy between `logits` and `labels`.
+
+  Measures the probability error in discrete classification tasks in which the
+  classes are mutually exclusive (each entry is in exactly one class).  For
+  example, each CIFAR-10 image is labeled with one and only one label: an image
+  can be a dog or a truck, but not both.
+
+  **NOTE:**  While the classes are mutually exclusive, their probabilities
+  need not be.  All that is required is that each row of `labels` is
+  a valid probability distribution.  If they are not, the computation of the
+  gradient will be incorrect.
+
+  If using exclusive `labels` (wherein one and only
+  one class is true at a time), see `sparse_softmax_cross_entropy_with_logits`.
+
+  **WARNING:** This op expects unscaled logits, since it performs a `softmax`
+  on `logits` internally for efficiency.  Do not call this op with the
+  output of `softmax`, as it will produce incorrect results.
+
+  A common use case is to have logits and labels of shape
+  `[batch_size, num_classes]`, but higher dimensions are supported, with
+  the `axis` argument specifying the class dimension.
+
+  `logits` and `labels` must have the same dtype (either `float16`, `float32`,
+  or `float64`).
+
+  Backpropagation will happen into both `logits` and `labels`.  To disallow
+  backpropagation into `labels`, pass label tensors through `tf.stop_gradient`
+  before feeding it to this function.
+
+  **Note that to avoid confusion, it is required to pass only named arguments to
+  this function.**
+
+  Args:
+    labels: Each vector along the class dimension should hold a valid
+      probability distribution e.g. for the case in which labels are of shape
+      `[batch_size, num_classes]`, each row of `labels[i]` must be a valid
+      probability distribution.
+    logits: Unscaled log probabilities.
+    axis: The class dimension. Defaulted to -1 which is the last dimension.
     name: A name for the operation (optional).
+    dim: Deprecated alias for axis.
 
   Returns:
     A `Tensor` that contains the softmax cross entropy loss. Its type is the
@@ -2361,6 +2425,10 @@ def softmax_cross_entropy_with_logits_v2(labels, logits, dim=-1, name=None):
   # TODO(pcmurray) Raise an error when the labels do not sum to 1. Note: This
   # could break users who call this with bad labels, but disregard the bad
   # results.
+  axis = deprecated_argument_lookup("axis", axis, "dim", dim)
+  del dim
+  if axis is None:
+    axis = -1
 
   with ops.name_scope(name, "softmax_cross_entropy_with_logits",
                       [logits, labels]) as name:
@@ -2377,7 +2445,7 @@ def softmax_cross_entropy_with_logits_v2(labels, logits, dim=-1, name=None):
     shape = logits.get_shape()
 
     # Move the dim to the end if dim is not the last dimension.
-    if dim is not -1:
+    if axis != -1:
 
       def _move_dim_to_end(tensor, dim_index, rank):
         return array_ops.transpose(
@@ -2387,8 +2455,8 @@ def softmax_cross_entropy_with_logits_v2(labels, logits, dim=-1, name=None):
                 math_ops.range(dim_index + 1, rank), [dim_index]
             ], 0))
 
-      precise_logits = _move_dim_to_end(precise_logits, dim, input_rank)
-      labels = _move_dim_to_end(labels, dim, input_rank)
+      precise_logits = _move_dim_to_end(precise_logits, axis, input_rank)
+      labels = _move_dim_to_end(labels, axis, input_rank)
 
     input_shape = array_ops.shape(precise_logits)
 
@@ -2402,7 +2470,7 @@ def softmax_cross_entropy_with_logits_v2(labels, logits, dim=-1, name=None):
     cost, unused_backprop = gen_nn_ops.softmax_cross_entropy_with_logits(
         precise_logits, labels, name=name)
 
-    # The output cost shape should be the input minus dim.
+    # The output cost shape should be the input minus axis.
     output_shape = array_ops.slice(input_shape, [0],
                                    [math_ops.subtract(input_rank, 1)])
     cost = array_ops.reshape(cost, output_shape)
@@ -2412,7 +2480,7 @@ def softmax_cross_entropy_with_logits_v2(labels, logits, dim=-1, name=None):
     if not context.executing_eagerly(
     ) and shape is not None and shape.dims is not None:
       shape = shape.as_list()
-      del shape[dim]
+      del shape[axis]
       cost.set_shape(shape)
 
     if convert_to_float32:
@@ -2490,7 +2558,7 @@ def softmax_cross_entropy_with_logits(
     labels = array_ops.stop_gradient(labels, name="labels_stop_gradient")
 
   return softmax_cross_entropy_with_logits_v2(
-      labels=labels, logits=logits, dim=dim, name=name)
+      labels=labels, logits=logits, axis=dim, name=name)
 
 
 @tf_export("nn.sparse_softmax_cross_entropy_with_logits")
@@ -2517,8 +2585,9 @@ def sparse_softmax_cross_entropy_with_logits(
   on `logits` internally for efficiency.  Do not call this op with the
   output of `softmax`, as it will produce incorrect results.
 
-  A common use case is to have logits and labels of shape
-  `[batch_size, num_classes]`, but higher dimensions are supported, in which
+  A common use case is to have logits of shape
+  `[batch_size, num_classes]` and have labels of shape
+  `[batch_size]`, but higher dimensions are supported, in which
   case the `dim`-th dimension is assumed to be of size `num_classes`.
   `logits` must have the dtype of `float16`, `float32`, or `float64`, and
   `labels` must have the dtype of `int32` or `int64`.
@@ -2857,11 +2926,15 @@ def _get_noise_shape(x, noise_shape):
 
 
 @tf_export(v1=["nn.dropout"])
-def dropout(x, keep_prob, noise_shape=None, seed=None, name=None):  # pylint: disable=invalid-name
+@deprecation.deprecated_args(None, "Please use `rate` instead of `keep_prob`. "
+                             "Rate should be set to `rate = 1 - keep_prob`.",
+                             "keep_prob")
+def dropout(x, keep_prob=None, noise_shape=None, seed=None, name=None,
+            rate=None):  # pylint: disable=invalid-name
   """Computes dropout.
 
-  With probability `keep_prob`, outputs the input element scaled up by
-  `1 / keep_prob`, otherwise outputs `0`.  The scaling is so that the expected
+  For each element of `x`, with probability `rate`, outputs `0`, and otherwise
+  scales up the input by `1 / (1-rate)`. The scaling is such that the expected
   sum is unchanged.
 
   By default, each element is kept or dropped independently.  If `noise_shape`
@@ -2874,48 +2947,34 @@ def dropout(x, keep_prob, noise_shape=None, seed=None, name=None):  # pylint: di
 
   Args:
     x: A floating point tensor.
-    keep_prob: A scalar `Tensor` with the same type as x. The probability
-      that each element is kept.
+    keep_prob: (deprecated) A deprecated alias for `(1-rate)`.
     noise_shape: A 1-D `Tensor` of type `int32`, representing the
       shape for randomly generated keep/drop flags.
     seed: A Python integer. Used to create random seeds. See
-      `tf.set_random_seed`
-      for behavior.
+      `tf.set_random_seed` for behavior.
     name: A name for this operation (optional).
+    rate: A scalar `Tensor` with the same type as `x`. The probability that each
+      element of `x` is discarded.
 
   Returns:
     A Tensor of the same shape of `x`.
 
   Raises:
-    ValueError: If `keep_prob` is not in `(0, 1]` or if `x` is not a floating
+    ValueError: If `rate` is not in `[0, 1)` or if `x` is not a floating
       point tensor.
   """
-  with ops.name_scope(name, "dropout", [x]) as name:
-    x = ops.convert_to_tensor(x, name="x")
-    if not x.dtype.is_floating:
-      raise ValueError("x has to be a floating point tensor since it's going to"
-                       " be scaled. Got a %s tensor instead." % x.dtype)
-    if isinstance(keep_prob, numbers.Real) and not 0 < keep_prob <= 1:
-      raise ValueError("keep_prob must be a scalar tensor or a float in the "
-                       "range (0, 1], got %g" % keep_prob)
-
-    # Early return if nothing needs to be dropped.
-    if isinstance(keep_prob, float) and keep_prob == 1:
-      return x
-    if context.executing_eagerly():
-      if isinstance(keep_prob, ops.EagerTensor):
-        if keep_prob.numpy() == 1:
-          return x
-    else:
-      keep_prob = ops.convert_to_tensor(
-          keep_prob, dtype=x.dtype, name="keep_prob")
-      keep_prob.get_shape().assert_is_compatible_with(tensor_shape.scalar())
+  try:
+    keep = 1. - keep_prob if keep_prob is not None else None
+  except TypeError:
+    raise ValueError("keep_prob must be a floating point number or Tensor "
+                     "(got %r)" % keep_prob)
 
-      # Do nothing if we know keep_prob == 1
-      if tensor_util.constant_value(keep_prob) == 1:
-        return x
+  rate = deprecation.deprecated_argument_lookup(
+      "rate", rate,
+      "keep_prob", keep)
 
-    rate = 1 - keep_prob
+  if rate is None:
+    raise ValueError("You must provide a rate to dropout.")
 
   return dropout_v2(x, rate, noise_shape=noise_shape, seed=seed, name=name)
 
@@ -2960,12 +3019,12 @@ def dropout_v2(x, rate, noise_shape=None, seed=None, name=None):  # pylint: disa
     if not x.dtype.is_floating:
       raise ValueError("x has to be a floating point tensor since it's going to"
                        " be scaled. Got a %s tensor instead." % x.dtype)
-    if isinstance(rate, numbers.Real) and not 0 <= rate < 1:
+    if isinstance(rate, numbers.Real) and not (rate >= 0 and rate < 1):
       raise ValueError("rate must be a scalar tensor or a float in the "
                        "range [0, 1), got %g" % rate)
 
     # Early return if nothing needs to be dropped.
-    if isinstance(rate, float) and rate == 0:
+    if isinstance(rate, numbers.Real) and rate == 0:
       return x
     if context.executing_eagerly():
       if isinstance(rate, ops.EagerTensor):
@@ -2989,7 +3048,7 @@ def dropout_v2(x, rate, noise_shape=None, seed=None, name=None):  # pylint: disa
         noise_shape, seed=seed, dtype=x.dtype)
     # 0. if [keep_prob, 1.0) and 1. if [1.0, 1.0 + keep_prob)
     binary_tensor = math_ops.floor(random_tensor)
-    ret = math_ops.div(x, keep_prob) * binary_tensor
+    ret = math_ops.divide(x, keep_prob) * binary_tensor
     if not context.executing_eagerly():
       ret.set_shape(x.get_shape())
     return ret
@@ -3725,7 +3784,7 @@ def erosion2d_v2(value,
             name=name))
 
 
-@tf_export("math.in_top_k", "nn.in_top_k")
+@tf_export(v1=["math.in_top_k", "nn.in_top_k"])
 def in_top_k(predictions, targets, k, name=None):
   r"""Says whether the targets are in the top `K` predictions.
 
@@ -3759,8 +3818,15 @@ def in_top_k(predictions, targets, k, name=None):
     return gen_nn_ops.in_top_kv2(predictions, targets, k, name=name)
 
 
+@tf_export("math.in_top_k", "nn.in_top_k", v1=[])
+def in_top_k_v2(targets, predictions, k, name=None):
+  return in_top_k(predictions, targets, k, name)
+
+
+in_top_k_v2.__doc__ = in_top_k.__doc__
+
+
 tf_export(v1=["nn.quantized_avg_pool"])(gen_nn_ops.quantized_avg_pool)
 tf_export(v1=["nn.quantized_conv2d"])(gen_nn_ops.quantized_conv2d)
 tf_export(v1=["nn.quantized_relu_x"])(gen_nn_ops.quantized_relu_x)
 tf_export(v1=["nn.quantized_max_pool"])(gen_nn_ops.quantized_max_pool)
-
diff --git a/tensorflow/python/ops/nn_test.py b/tensorflow/python/ops/nn_test.py
index a7ed834c72368b38ad18b0459b8ef6a57edc0a97..82fab741830fddd4ee0ba5c8e2644702ec199b4d 100644
--- a/tensorflow/python/ops/nn_test.py
+++ b/tensorflow/python/ops/nn_test.py
@@ -450,6 +450,18 @@ class DropoutTest(test_lib.TestCase):
     with self.assertRaises(ValueError):
       nn_ops.dropout(t, array_ops.placeholder(dtypes.float32, shape=[2]))
 
+  @test_util.run_deprecated_v1
+  def testInvalidRate(self):
+    x_dim = 40
+    y_dim = 30
+    t = constant_op.constant(1.0, shape=[x_dim, y_dim], dtype=dtypes.float32)
+    with self.assertRaises(ValueError):
+      nn_ops.dropout_v2(t, -1.0)
+    with self.assertRaises(ValueError):
+      nn_ops.dropout_v2(t, 1.1)
+    with self.assertRaises(ValueError):
+      nn_ops.dropout_v2(t, [0.0, 1.0])
+
   @test_util.run_deprecated_v1
   def testShapedDropoutShapeError(self):
     # Runs shaped dropout and verifies an error is thrown on misshapen noise.
@@ -471,12 +483,13 @@ class DropoutTest(test_lib.TestCase):
     _ = nn_ops.dropout(t, keep_prob, noise_shape=[x_dim, 1])
     _ = nn_ops.dropout(t, keep_prob, noise_shape=[1, 1])
 
-  @test_util.run_deprecated_v1
   def testNoDropoutFast(self):
     x = array_ops.zeros((5,))
-    for p in 1, constant_op.constant(1.0):
-      y = nn_ops.dropout(x, keep_prob=p)
-      self.assertTrue(x is y)
+    y = nn_ops.dropout(x, keep_prob=1)
+    self.assertTrue(x is y)
+
+    y = nn_ops.dropout_v2(x, rate=0)
+    self.assertTrue(x is y)
 
   def testDropoutWithIntegerInputs(self):
     x = constant_op.constant([1, 1, 1, 1, 1])
@@ -790,7 +803,7 @@ class ComputeSampledLogitsTest(test_lib.TestCase):
     exp_nce_loss = np.sum(
         _SigmoidCrossEntropyWithLogits(exp_logits, exp_labels), 1)
 
-    got_nce_loss = nn_impl.nce_loss(
+    got_nce_loss = nn_impl.nce_loss_v2(
         weights=constant_op.constant(weights),
         biases=constant_op.constant(biases),
         labels=constant_op.constant(labels, shape=(batch_size, 1)),
@@ -798,15 +811,14 @@ class ComputeSampledLogitsTest(test_lib.TestCase):
         num_sampled=4,
         num_classes=num_classes,
         num_true=1,
-        sampled_values=sampled_vals,
-        partition_strategy="div")
+        sampled_values=sampled_vals)
 
     self.assertAllClose(exp_nce_loss, self.evaluate(got_nce_loss), 1e-4)
 
     # Test with sharded weights and sharded biases.
     weight_shards, bias_shards = self._ShardTestEmbeddings(
         weights, biases, num_shards=3)
-    got_nce_loss = nn_impl.nce_loss(
+    got_nce_loss = nn_impl.nce_loss_v2(
         weights=[constant_op.constant(shard) for shard in weight_shards],
         biases=[constant_op.constant(shard) for shard in bias_shards],
         labels=constant_op.constant(labels, shape=(batch_size, 1)),
@@ -814,8 +826,7 @@ class ComputeSampledLogitsTest(test_lib.TestCase):
         num_sampled=4,
         num_classes=num_classes,
         num_true=1,
-        sampled_values=sampled_vals,
-        partition_strategy="div")
+        sampled_values=sampled_vals)
 
     self.assertAllClose(exp_nce_loss, self.evaluate(got_nce_loss), 1e-4)
 
@@ -846,7 +857,7 @@ class ComputeSampledLogitsTest(test_lib.TestCase):
     exp_sampled_softmax_loss = _SoftmaxCrossEntropyWithLogits(
         exp_logits, exp_labels)
 
-    got_sampled_softmax_loss = nn_impl.sampled_softmax_loss(
+    got_sampled_softmax_loss = nn_impl.sampled_softmax_loss_v2(
         weights=constant_op.constant(weights),
         biases=constant_op.constant(biases),
         labels=constant_op.constant(labels, shape=(batch_size, 1)),
@@ -855,8 +866,7 @@ class ComputeSampledLogitsTest(test_lib.TestCase):
         num_classes=num_classes,
         num_true=1,
         sampled_values=sampled_vals,
-        remove_accidental_hits=False,
-        partition_strategy="div")
+        remove_accidental_hits=False)
 
     self.assertAllClose(exp_sampled_softmax_loss,
                         self.evaluate(got_sampled_softmax_loss), 1e-4)
@@ -864,7 +874,7 @@ class ComputeSampledLogitsTest(test_lib.TestCase):
     # Test with sharded weights and sharded biases.
     weight_shards, bias_shards = self._ShardTestEmbeddings(
         weights, biases, num_shards=3)
-    got_sampled_softmax_loss = nn_impl.sampled_softmax_loss(
+    got_sampled_softmax_loss = nn_impl.sampled_softmax_loss_v2(
         weights=[constant_op.constant(shard) for shard in weight_shards],
         biases=[constant_op.constant(shard) for shard in bias_shards],
         labels=constant_op.constant(labels, shape=(batch_size, 1)),
@@ -873,8 +883,7 @@ class ComputeSampledLogitsTest(test_lib.TestCase):
         num_classes=num_classes,
         num_true=1,
         sampled_values=sampled_vals,
-        remove_accidental_hits=False,
-        partition_strategy="div")
+        remove_accidental_hits=False)
 
     self.assertAllClose(exp_sampled_softmax_loss,
                         self.evaluate(got_sampled_softmax_loss), 1e-4)
@@ -915,7 +924,7 @@ class ComputeSampledLogitsTest(test_lib.TestCase):
     sampled_vals_bf16 = (sampled, true_exp_bf16, sampled_exp_bf16)
 
     got_sampled_softmax_loss = math_ops.cast(
-        nn_impl.sampled_softmax_loss(
+        nn_impl.sampled_softmax_loss_v2(
             weights=constant_op.constant(weights, dtype=dtypes.bfloat16),
             biases=constant_op.constant(biases, dtype=dtypes.bfloat16),
             labels=constant_op.constant(
@@ -925,8 +934,7 @@ class ComputeSampledLogitsTest(test_lib.TestCase):
             num_classes=num_classes,
             num_true=1,
             sampled_values=sampled_vals_bf16,
-            remove_accidental_hits=False,
-            partition_strategy="div"), dtypes.float32)
+            remove_accidental_hits=False), dtypes.float32)
 
     self.assertAllClose(exp_sampled_softmax_loss,
                         self.evaluate(got_sampled_softmax_loss), 1e-1)
diff --git a/tensorflow/python/ops/parallel_for/control_flow_ops_test.py b/tensorflow/python/ops/parallel_for/control_flow_ops_test.py
index c2484766aac3be75057def40c67cb0ee397fa1fa..933bddd8ccaa830a394c8d69e4f1b33311315c99 100644
--- a/tensorflow/python/ops/parallel_for/control_flow_ops_test.py
+++ b/tensorflow/python/ops/parallel_for/control_flow_ops_test.py
@@ -927,7 +927,10 @@ class NNTest(PForTest):
               outputs[1] = constant_op.constant(0.)
               outputs[2] = constant_op.constant(0.)
             loss = nn.l2_loss(outputs[0])
-          gradients = g.gradient(loss, [x1, scale, offset])
+          if is_training:
+            gradients = g.gradient(loss, [x1, scale, offset])
+          else:
+            gradients = [constant_op.constant(0.)] * 3
           return outputs + gradients
 
         # pylint: enable=cell-var-from-loop
@@ -1133,7 +1136,7 @@ class TensorArrayTest(PForTest):
     # y = x * x. Hence dy/dx = 2 * x.
     actual_grad = 2.0 * x
     with session.Session() as sess:
-      actual_grad, computed_grad = self.evaluate([t1, actual_grad])
+      actual_grad, computed_grad = sess.run([t1, actual_grad])
       self.assertAllClose(actual_grad, computed_grad)
 
 
@@ -1287,7 +1290,7 @@ class ControlFlowTest(PForTest):
     expected_output = array_ops.transpose(expected_output, [1, 0])
 
     with session.Session() as sess:
-      out, expected = self.evaluate([out, expected_output])
+      out, expected = sess.run([out, expected_output])
       self.assertAllClose(expected, out)
 
   def test_tensor_array_as_loop_variable(self):
@@ -1475,7 +1478,7 @@ class Benchmarks(test.Benchmark):
     sess = session.Session()
     with sess:
       init = variables.global_variables_initializer()
-      self.evaluate(init)
+      sess.run(init)
       run_fn = sess.make_callable(targets)
       run_fn()  # Warm up
       begin = time.time()
diff --git a/tensorflow/python/ops/ragged/BUILD b/tensorflow/python/ops/ragged/BUILD
index e335c5cb6f324c3142080443e8783269dccb1fe0..89b8c4a2b305e7cd584d8bc215ae30490572f2e4 100644
--- a/tensorflow/python/ops/ragged/BUILD
+++ b/tensorflow/python/ops/ragged/BUILD
@@ -1,3 +1,5 @@
+load("//tensorflow:tensorflow.bzl", "py_test")
+
 package(
     default_visibility = [
         "//intelligence/datum/prensor:__pkg__",
@@ -11,8 +13,6 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-load("//tensorflow:tensorflow.bzl", "py_test")
-
 #-------------------------------------------------------------------------------
 # RaggedTensor
 #-------------------------------------------------------------------------------
@@ -25,7 +25,7 @@ py_library(
     deps = [
         ":ragged_array_ops",
         ":ragged_conversion_ops",
-        ":ragged_elementwise_ops",
+        ":ragged_dispatch",
         ":ragged_factory_ops",
         ":ragged_functional_ops",
         ":ragged_getitem",
@@ -48,7 +48,6 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":ragged_conversion_ops",
-        ":ragged_factory_ops",
         ":ragged_functional_ops",
         ":ragged_math_ops",
         ":ragged_tensor",
@@ -82,6 +81,7 @@ py_library(
         "//tensorflow/python:ragged_conversion_ops_gen",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:util",
     ],
 )
 
@@ -95,6 +95,7 @@ py_library(
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:tensor_util",
+        "//tensorflow/python:util",
         "//tensorflow/python/ops/ragged:ragged_tensor",
         "//tensorflow/python/ops/ragged:ragged_tensor_value",
         "//third_party/py/numpy",
@@ -110,6 +111,7 @@ py_library(
         ":ragged_tensor",
         ":ragged_util",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:util",
     ],
 )
 
@@ -147,24 +149,6 @@ py_library(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:ragged_math_ops_gen",
         "//tensorflow/python:tensor_util",
-    ],
-)
-
-py_library(
-    name = "ragged_elementwise_ops",
-    srcs = ["ragged_elementwise_ops.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":ragged_factory_ops",
-        ":ragged_tensor",
-        ":ragged_tensor_shape",
-        ":ragged_util",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:clip_ops",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:parsing_ops",
-        "//tensorflow/python:string_ops",
         "//tensorflow/python:util",
     ],
 )
@@ -174,9 +158,9 @@ py_library(
     srcs = ["ragged_operators.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged_elementwise_ops",
         ":ragged_getitem",
         ":ragged_tensor",
+        "//tensorflow/python:math_ops",
         "//tensorflow/python:util",
     ],
 )
@@ -191,7 +175,11 @@ py_library(
         ":ragged_factory_ops",
         ":ragged_tensor",
         "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:string_ops",
         "//tensorflow/python:util",
     ],
 )
@@ -202,9 +190,13 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":ragged_tensor_value",
+        ":ragged_util",
+        ":segment_id_ops",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:ragged_conversion_ops_gen",
         "//tensorflow/python:session",
         "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:util",
     ],
 )
 
@@ -219,10 +211,11 @@ py_library(
         ":ragged_tensor",
         ":ragged_util",
         "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
-        "//tensorflow/python:tensor_shape",
         "//tensorflow/python:tensor_util",
     ],
 )
@@ -231,7 +224,10 @@ py_library(
     name = "ragged_tensor_value",
     srcs = ["ragged_tensor_value.py"],
     srcs_version = "PY2AND3",
-    deps = ["//third_party/py/numpy"],
+    deps = [
+        "//tensorflow/python:util",
+        "//third_party/py/numpy",
+    ],
 )
 
 py_library(
@@ -260,6 +256,7 @@ py_library(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python:tensor_util",
+        "//tensorflow/python:util",
     ],
 )
 
@@ -268,27 +265,63 @@ py_library(
     srcs = ["ragged_map_ops.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged_array_ops",
-        ":ragged_factory_ops",
         ":ragged_tensor",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
         "//tensorflow/python:platform",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:tensor_array_ops",
+        "//tensorflow/python:tensor_shape",
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/eager:context",
     ],
 )
 
+py_library(
+    name = "ragged_dispatch",
+    srcs = ["ragged_dispatch.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_array_ops",
+        ":ragged_math_ops",
+        ":ragged_tensor",
+        ":ragged_tensor_shape",
+        ":ragged_util",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:clip_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:string_ops",
+        "//tensorflow/python:util",
+        "//tensorflow/python:variables",
+        "//third_party/py/numpy",
+    ],
+)
+
 #-------------------------------------------------------------------------------
 # RaggedTensor Tests
 #-------------------------------------------------------------------------------
 
+py_library(
+    name = "ragged_test_util",
+    srcs = ["ragged_test_util.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_tensor",
+        ":ragged_tensor_value",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_test(
     name = "ragged_tensor_test",
     size = "medium",
@@ -298,13 +331,20 @@ py_test(
         "no_windows",
     ],
     deps = [
-        ":ragged",
+        ":ragged",  # fixdeps: keep
+        ":ragged_factory_ops",
+        ":ragged_math_ops",
+        ":ragged_tensor",
+        ":ragged_tensor_value",
+        ":ragged_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
+        "//tensorflow/python/eager:context",
         "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
     ],
@@ -316,9 +356,9 @@ py_test(
     srcs = ["ragged_eager_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged",
+        ":ragged_factory_ops",
+        ":ragged_test_util",
         "//tensorflow/python:framework_ops",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
         "@absl_py//absl/testing:parameterized",
     ],
@@ -329,7 +369,8 @@ py_test(
     srcs = ["ragged_range_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged",
+        ":ragged_math_ops",
+        ":ragged_test_util",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
@@ -341,7 +382,9 @@ py_test(
     srcs = ["ragged_tensor_bounding_shape_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged",
+        ":ragged_factory_ops",
+        ":ragged_tensor",
+        ":ragged_test_util",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
     ],
@@ -352,7 +395,10 @@ py_test(
     srcs = ["ragged_row_lengths_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged",
+        ":ragged_factory_ops",
+        ":ragged_tensor",
+        ":ragged_test_util",
+        "//tensorflow/python:errors",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
         "@absl_py//absl/testing:parameterized",
@@ -364,7 +410,9 @@ py_test(
     srcs = ["ragged_gather_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged",
+        ":ragged_array_ops",
+        ":ragged_factory_ops",
+        ":ragged_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
@@ -372,6 +420,7 @@ py_test(
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
+        "//tensorflow/python/eager:context",
     ],
 )
 
@@ -380,12 +429,16 @@ py_test(
     srcs = ["ragged_batch_gather_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged",
+        ":ragged_array_ops",
+        ":ragged_factory_ops",
+        ":ragged_tensor",
+        ":ragged_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
+        "//tensorflow/python/eager:context",
         "@absl_py//absl/testing:parameterized",
     ],
 )
@@ -395,11 +448,15 @@ py_test(
     srcs = ["ragged_gather_nd_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged",
+        ":ragged_array_ops",
+        ":ragged_factory_ops",
+        ":ragged_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
+        "//tensorflow/python/eager:context",
         "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
     ],
@@ -410,7 +467,8 @@ py_test(
     srcs = ["ragged_row_splits_to_segment_ids_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged",
+        ":ragged_test_util",
+        ":segment_id_ops",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
@@ -422,7 +480,8 @@ py_test(
     srcs = ["ragged_segment_ids_to_row_splits_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged",
+        ":ragged_test_util",
+        ":segment_id_ops",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
@@ -434,7 +493,8 @@ py_test(
     srcs = ["ragged_from_tensor_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged",
+        ":ragged_tensor",
+        ":ragged_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:framework_test_lib",
@@ -452,13 +512,19 @@ py_test(
         "no_windows",
     ],
     deps = [
-        ":ragged",
+        ":ragged",  # fixdeps: keep
+        ":ragged_factory_ops",
+        ":ragged_functional_ops",
+        ":ragged_tensor",
+        ":ragged_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:gradients_impl",
+        "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
+        "//tensorflow/python/eager:context",
     ],
 )
 
@@ -467,13 +533,15 @@ py_test(
     srcs = ["ragged_from_sparse_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged",
+        ":ragged_tensor",
+        ":ragged_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
         "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python/eager:context",
     ],
 )
 
@@ -482,7 +550,8 @@ py_test(
     srcs = ["ragged_to_tensor_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged",
+        ":ragged_factory_ops",
+        ":ragged_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:framework_test_lib",
@@ -496,7 +565,10 @@ py_test(
     srcs = ["ragged_segment_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged",
+        ":ragged_factory_ops",
+        ":ragged_math_ops",
+        ":ragged_tensor",
+        ":ragged_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:errors",
@@ -511,23 +583,29 @@ py_test(
     srcs = ["ragged_reduce_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged",
+        ":ragged_factory_ops",
+        ":ragged_math_ops",
+        ":ragged_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
+        "//tensorflow/python/eager:context",
         "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
     ],
 )
 
 py_test(
-    name = "ragged_map_inner_values_op_test",
-    srcs = ["ragged_map_inner_values_op_test.py"],
+    name = "ragged_map_flat_values_op_test",
+    srcs = ["ragged_map_flat_values_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged",
+        ":ragged_factory_ops",
+        ":ragged_functional_ops",
+        ":ragged_tensor",
+        ":ragged_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
@@ -535,7 +613,6 @@ py_test(
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
-        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -544,8 +621,10 @@ py_test(
     srcs = ["ragged_const_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":ragged",
         ":ragged_factory_ops",
         ":ragged_tensor",
+        ":ragged_test_util",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
@@ -561,7 +640,10 @@ py_test(
         "no_windows",
     ],
     deps = [
-        ":ragged",
+        ":ragged_factory_ops",
+        ":ragged_tensor",
+        ":ragged_tensor_value",
+        ":ragged_test_util",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
         "//third_party/py/numpy",
@@ -574,7 +656,9 @@ py_test(
     srcs = ["convert_to_tensor_or_ragged_tensor_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged",
+        ":ragged_factory_ops",
+        ":ragged_tensor",
+        ":ragged_test_util",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_test_lib",
@@ -589,12 +673,15 @@ py_test(
     srcs = ["ragged_boolean_mask_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged",
+        ":ragged_array_ops",
+        ":ragged_factory_ops",
+        ":ragged_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
+        "//tensorflow/python/eager:context",
         "@absl_py//absl/testing:parameterized",
     ],
 )
@@ -604,13 +691,16 @@ py_test(
     srcs = ["ragged_concat_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged",
+        ":ragged_array_ops",
+        ":ragged_factory_ops",
+        ":ragged_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
+        "//tensorflow/python/eager:context",
         "@absl_py//absl/testing:parameterized",
     ],
 )
@@ -620,7 +710,9 @@ py_test(
     srcs = ["ragged_stack_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged",
+        ":ragged_array_ops",
+        ":ragged_factory_ops",
+        ":ragged_test_util",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
@@ -635,6 +727,7 @@ py_test(
     deps = [
         ":ragged_array_ops",
         ":ragged_factory_ops",
+        ":ragged_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
@@ -649,6 +742,7 @@ py_test(
     srcs = ["ragged_util_test.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":ragged_test_util",
         ":ragged_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
@@ -664,7 +758,9 @@ py_test(
     srcs = ["ragged_expand_dims_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged",
+        ":ragged_array_ops",
+        ":ragged_factory_ops",
+        ":ragged_test_util",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
         "@absl_py//absl/testing:parameterized",
@@ -676,7 +772,9 @@ py_test(
     srcs = ["ragged_where_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged",
+        ":ragged_array_ops",
+        ":ragged_factory_ops",
+        ":ragged_test_util",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
         "@absl_py//absl/testing:parameterized",
@@ -684,17 +782,26 @@ py_test(
 )
 
 py_test(
-    name = "ragged_elementwise_ops_test",
-    srcs = ["ragged_elementwise_ops_test.py"],
+    name = "ragged_dispatch_test",
+    srcs = ["ragged_dispatch_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged",
+        ":ragged",  # fixdeps: keep
+        ":ragged_factory_ops",
+        ":ragged_tensor",
+        ":ragged_test_util",
         "//tensorflow/python:array_ops",
+        "//tensorflow/python:clip_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:parsing_ops",
         "//tensorflow/python:platform_test",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:string_ops",
+        "//tensorflow/python/eager:context",
         "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
     ],
@@ -705,7 +812,9 @@ py_test(
     srcs = ["ragged_operators_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged",
+        ":ragged",  # fixdeps: keep
+        ":ragged_factory_ops",
+        ":ragged_test_util",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
     ],
@@ -717,7 +826,13 @@ py_test(
     srcs = ["ragged_map_fn_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged",
+        ":ragged",  # fixdeps: keep
+        ":ragged_factory_ops",
+        ":ragged_functional_ops",
+        ":ragged_map_ops",
+        ":ragged_math_ops",
+        ":ragged_tensor",
+        ":ragged_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_test_lib",
@@ -725,6 +840,7 @@ py_test(
         "//tensorflow/python:platform_test",
         "//tensorflow/python:string_ops",
         "//tensorflow/python/keras:backend",
+        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
     ],
 )
@@ -734,9 +850,15 @@ py_test(
     srcs = ["ragged_tensor_shape_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged",
+        ":ragged",  # fixdeps: keep
+        ":ragged_factory_ops",
+        ":ragged_tensor",
+        ":ragged_tensor_shape",
+        ":ragged_test_util",
+        "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
+        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
     ],
 )
diff --git a/tensorflow/python/ops/ragged/__init__.py b/tensorflow/python/ops/ragged/__init__.py
index 1b2a7be95fc4347634d360bbb006a462bf5d41ef..7806f5697852fa69cea46e930fa37a3477c8e380 100644
--- a/tensorflow/python/ops/ragged/__init__.py
+++ b/tensorflow/python/ops/ragged/__init__.py
@@ -1,236 +1,47 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """Ragged Tensors.
 
-This package defines the [`RaggedTensor`](ragged/RaggedTensor.md) class, which
-represents tensors with non-uniform shapes.  In particular, each `RaggedTensor`
+This package defines ops for manipulating ragged tensors (`tf.RaggedTensor`),
+which are tensors with non-uniform shapes.  In particular, each `RaggedTensor`
 has one or more *ragged dimensions*, which are dimensions whose slices may have
 different lengths.  For example, the inner (column) dimension of
 `rt=[[3, 1, 4, 1], [], [5, 9, 2], [6], []]` is ragged, since the column slices
 (`rt[0, :]`, ..., `rt[4, :]`) have different lengths.  For a more detailed
-description of ragged tensors, see the [`RaggedTensor`](ragged/RaggedTensor.md)
-class documentation.
-
-## RaggedTensor Operations
-
-This package also defines a collection of operations for manipulating
-ragged tensors.
-
-### RaggedTensor Versions of Standard Tensor Operations
-
-Many of the operations defined by this package are analogous to
-[`Tensor`](https://www.tensorflow.org/api_docs/python/tf/Tensor)
-operations, but they accept `RaggedTensor`s as input and can return
-`RaggedTensor`s as output.  For example, `ragged.add` performs elementwise
-addition just like `tf.add`, but can be used on `RaggedTensor`s.
-
-These `RaggedTensor` versions of the standard `Tensor` operations can also be
-used with standard `Tensors`; and for the most part, they will return the same
-value that the standard `Tensor` operation would return.  However, there are
-a few notable exceptions:
-
-* For [`ragged.stack(...)`](ragged/stack.md) and
-  [`ragged.concat(...)`](ragged/concat.md), the input tensors are not required
-  to have matching shapes.  In the returned tensor, all dimensions up to the
-  `axis` dimension will be ragged.
-
-### Ragged-Tensor Specific Operations
-
-The following operations are specific to ragged tensors:
-
-* **Factory ops**:
-  [`constant(...)`](ragged/constant.md),
-  [`from_row_splits(...)`](ragged/from_row_splits.md),
-  [`from_row_lengths(...)`](ragged/from_row_lengths.md),
-  [`from_row_starts(...)`](ragged/from_row_starts.md),
-  [`from_row_limits(...)`](ragged/from_row_limits.md),
-  [`from_value_rowids(...)`](ragged/from_value_rowids.md),
-  [`from_nested_row_splits(...)`](ragged/from_nested_row_splits.md),
-  [`from_nested_value_rowids(...)`](ragged/from_nested_value_rowids.md).
-
-* **Conversion ops**:
-  [`from_tensor(...)`](ragged/from_tensor.md),
-  [`to_tensor(...)`](ragged/to_tensor.md),
-  [`from_sparse(...)`](ragged/from_sparse.md),
-  [`to_sparse(...)`](ragged/to_sparse.md),
-  [`from_variant(...)`](ragged/from_variant.md),
-  [`to_variant(...)`](ragged/to_variant.md),
-  [`convert_to_tensor_or_ragged_tensor(...)`](
-  ragged/convert_to_tensor_or_ragged_tensor.md).
-
-* **Shape ops**:
-  [`row_splits(...)`](ragged/row_splits.md),
-  [`row_lengths(...)`](ragged/row_lengths.md),
-  [`row_starts(...)`](ragged/row_starts.md),
-  [`row_limits(...)`](ragged/row_limits.md),
-  [`value_rowids(...)`](ragged/value_rowids.md),
-  [`nrows(...)`](ragged/nrows.md),
-  [`nested_row_splits(...)`](ragged/nested_row_splits.md),
-  [`row_splits_to_segment_ids(...)`](ragged/row_splits_to_segment_ids.md),
-  [`segment_ids_to_row_splits(...)`](ragged/segment_ids_to_row_splits.md),
-  [`bounding_shape(...)`](ragged/bounding_shape.md).
-
-* **Functional ops**:
-  [`map_inner_values(...)`](ragged/map_inner_values.md),
-  [`make_elementwise_op(...)`](ragged/make_elementwise_op.md).
-
-
-<!-- Ragged Classes & related helper functions -->
-@@RaggedTensor
-@@RaggedTensorType
-@@RaggedTensorValue
-@@is_ragged
-
-<!-- Factory Ops -->
-@@constant
-@@constant_value
-@@from_row_splits
-@@from_row_lengths
-@@from_row_starts
-@@from_row_limits
-@@from_value_rowids
-@@from_nested_row_splits
-@@from_nested_value_rowids
-@@convert_to_tensor_or_ragged_tensor
-
-<!-- Conversion Ops -->
-@@from_tensor
-@@to_tensor
-@@from_sparse
-@@to_sparse
-@@row_splits_to_segment_ids
-@@segment_ids_to_row_splits
-
-<!-- Array Ops -->
-@@row_splits
-@@row_lengths
-@@row_starts
-@@row_limits
-@@value_rowids
-@@nrows
-@@nested_row_splits
-@@bounding_shape
-@@gather
-@@batch_gather
-@@gather_nd
-@@boolean_mask
-@@concat
-@@stack
-@@tile
-@@expand_dims
-@@where
-
-<!-- Math Ops -->
-@@range
-
-@@segment_sum
-@@segment_prod
-@@segment_min
-@@segment_max
-@@segment_mean
-@@segment_sqrt_n
-
-@@reduce_sum
-@@reduce_prod
-@@reduce_min
-@@reduce_max
-@@reduce_mean
-@@reduce_all
-@@reduce_any
-
-<!-- Functional Ops -->
-@@map_inner_values
-@@map_fn
-
-<!-- Elementwise Ops -->
-@@make_elementwise_op
-
-<!-- Shape & broadcasting -->
-@@RaggedTensorDynamicShape
-@@broadcast_to
-@@broadcast_dynamic_shape
-
-<!-- Symbols from  ragged_elementwise_ops._symbols_to_export are whitelisted -->
+description of ragged tensors, see the `tf.RaggedTensor` class documentation
+and the [Ragged Tensor Guide](/guides/ragged_tensor).
 """
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.ops.ragged import ragged_array_ops
+from tensorflow.python.ops.ragged import ragged_conversion_ops
+from tensorflow.python.ops.ragged import ragged_dispatch
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_functional_ops
+from tensorflow.python.ops.ragged import ragged_getitem
+from tensorflow.python.ops.ragged import ragged_map_ops
+from tensorflow.python.ops.ragged import ragged_math_ops
 from tensorflow.python.ops.ragged import ragged_operators
 from tensorflow.python.ops.ragged import ragged_string_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.ops.ragged import ragged_tensor_shape
+from tensorflow.python.ops.ragged import ragged_tensor_value
+from tensorflow.python.ops.ragged import segment_id_ops
 
-from tensorflow.python.ops.ragged.ragged_array_ops import batch_gather
-from tensorflow.python.ops.ragged.ragged_array_ops import boolean_mask
-from tensorflow.python.ops.ragged.ragged_array_ops import bounding_shape
-from tensorflow.python.ops.ragged.ragged_array_ops import concat
-from tensorflow.python.ops.ragged.ragged_array_ops import expand_dims
-from tensorflow.python.ops.ragged.ragged_array_ops import gather
-from tensorflow.python.ops.ragged.ragged_array_ops import gather_nd
-from tensorflow.python.ops.ragged.ragged_array_ops import nrows
-from tensorflow.python.ops.ragged.ragged_array_ops import row_lengths
-from tensorflow.python.ops.ragged.ragged_array_ops import row_limits
-from tensorflow.python.ops.ragged.ragged_array_ops import row_starts
-from tensorflow.python.ops.ragged.ragged_array_ops import stack
-from tensorflow.python.ops.ragged.ragged_array_ops import tile
-from tensorflow.python.ops.ragged.ragged_array_ops import value_rowids
-from tensorflow.python.ops.ragged.ragged_array_ops import where
-
-from tensorflow.python.ops.ragged.ragged_conversion_ops import from_sparse
-from tensorflow.python.ops.ragged.ragged_conversion_ops import from_tensor
-from tensorflow.python.ops.ragged.ragged_conversion_ops import to_sparse
-from tensorflow.python.ops.ragged.ragged_conversion_ops import to_tensor
-
-# pylint: disable=protected-access, wildcard-import
-from tensorflow.python.ops.ragged.ragged_elementwise_ops import *
-from tensorflow.python.ops.ragged.ragged_elementwise_ops import _symbols_to_export as _elementwise_ops
-# pylint: enable=protected-access, wildcard-import
-
-from tensorflow.python.ops.ragged.ragged_factory_ops import constant
-from tensorflow.python.ops.ragged.ragged_factory_ops import constant_value
-from tensorflow.python.ops.ragged.ragged_factory_ops import convert_to_tensor_or_ragged_tensor
-from tensorflow.python.ops.ragged.ragged_factory_ops import from_nested_row_splits
-from tensorflow.python.ops.ragged.ragged_factory_ops import from_nested_value_rowids
-from tensorflow.python.ops.ragged.ragged_factory_ops import from_row_lengths
-from tensorflow.python.ops.ragged.ragged_factory_ops import from_row_limits
-from tensorflow.python.ops.ragged.ragged_factory_ops import from_row_splits
-from tensorflow.python.ops.ragged.ragged_factory_ops import from_row_starts
-from tensorflow.python.ops.ragged.ragged_factory_ops import from_value_rowids
-
-from tensorflow.python.ops.ragged.ragged_functional_ops import map_inner_values
-
-from tensorflow.python.ops.ragged.ragged_map_ops import map_fn
-
-from tensorflow.python.ops.ragged.ragged_math_ops import range  # pylint: disable=redefined-builtin
-
-from tensorflow.python.ops.ragged.ragged_math_ops import reduce_all
-from tensorflow.python.ops.ragged.ragged_math_ops import reduce_any
-from tensorflow.python.ops.ragged.ragged_math_ops import reduce_max
-from tensorflow.python.ops.ragged.ragged_math_ops import reduce_mean
-from tensorflow.python.ops.ragged.ragged_math_ops import reduce_min
-from tensorflow.python.ops.ragged.ragged_math_ops import reduce_prod
-from tensorflow.python.ops.ragged.ragged_math_ops import reduce_sum
-
-from tensorflow.python.ops.ragged.ragged_math_ops import segment_max
-from tensorflow.python.ops.ragged.ragged_math_ops import segment_mean
-from tensorflow.python.ops.ragged.ragged_math_ops import segment_min
-from tensorflow.python.ops.ragged.ragged_math_ops import segment_prod
-from tensorflow.python.ops.ragged.ragged_math_ops import segment_sqrt_n
-from tensorflow.python.ops.ragged.ragged_math_ops import segment_sum
-
-from tensorflow.python.ops.ragged.ragged_tensor import is_ragged
-from tensorflow.python.ops.ragged.ragged_tensor import RaggedTensor
-from tensorflow.python.ops.ragged.ragged_tensor import RaggedTensorType
-
-from tensorflow.python.ops.ragged.ragged_tensor_shape import broadcast_dynamic_shape
-from tensorflow.python.ops.ragged.ragged_tensor_shape import broadcast_to
-from tensorflow.python.ops.ragged.ragged_tensor_shape import RaggedTensorDynamicShape
-
-from tensorflow.python.ops.ragged.ragged_tensor_value import RaggedTensorValue
-
-from tensorflow.python.ops.ragged.segment_id_ops import row_splits_to_segment_ids
-from tensorflow.python.ops.ragged.segment_id_ops import segment_ids_to_row_splits
-
-from tensorflow.python.util import all_util as _all_util
-
-# Any symbol that is not referenced (with "@@name") in the module docstring
-# above, or included in the "_elementwise_ops" whitelist, will be removed.
-_all_util.remove_undocumented(__name__, _elementwise_ops)
+# Add a list of the ops that support Ragged Tensors.
+__doc__ += ragged_dispatch.ragged_op_list()  # pylint: disable=redefined-builtin
diff --git a/tensorflow/python/ops/ragged/convert_to_tensor_or_ragged_tensor_op_test.py b/tensorflow/python/ops/ragged/convert_to_tensor_or_ragged_tensor_op_test.py
index ef3464f2437b7c46838b07e68e531f8a227d7905..be1ccd9c727d18cd00445f442583d92dad7a8f73 100644
--- a/tensorflow/python/ops/ragged/convert_to_tensor_or_ragged_tensor_op_test.py
+++ b/tensorflow/python/ops/ragged/convert_to_tensor_or_ragged_tensor_op_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for ragged.convert_to_tensor_or_ragged_tensor."""
+"""Tests for ragged_tensor.convert_to_tensor_or_ragged."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -24,12 +24,15 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
 
-class RaggedConvertToTensorOrRaggedTensorTest(test_util.TensorFlowTestCase,
-                                              parameterized.TestCase):
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedConvertToTensorOrRaggedTensorTest(
+    ragged_test_util.RaggedTensorTestCase, parameterized.TestCase):
 
   #=============================================================================
   # Tests where the 'value' param is a RaggedTensor
@@ -40,8 +43,8 @@ class RaggedConvertToTensorOrRaggedTensorTest(test_util.TensorFlowTestCase,
       dict(pylist=[[1, 2], [3]], preferred_dtype=dtypes.string),
   ])
   def testConvertRaggedTensor(self, pylist, dtype=None, preferred_dtype=None):
-    rt = ragged.constant(pylist)
-    converted = ragged.convert_to_tensor_or_ragged_tensor(
+    rt = ragged_factory_ops.constant(pylist)
+    converted = ragged_tensor.convert_to_tensor_or_ragged_tensor(
         rt, dtype, preferred_dtype)
     self.assertIs(converted, rt)
 
@@ -62,35 +65,40 @@ class RaggedConvertToTensorOrRaggedTensorTest(test_util.TensorFlowTestCase,
                                    message,
                                    dtype=None,
                                    preferred_dtype=None):
-    rt = ragged.constant(pylist)
+    rt = ragged_factory_ops.constant(pylist)
 
     with self.assertRaisesRegexp(ValueError, message):
-      ragged.convert_to_tensor_or_ragged_tensor(rt, dtype, preferred_dtype)
+      ragged_tensor.convert_to_tensor_or_ragged_tensor(rt, dtype,
+                                                       preferred_dtype)
 
   #=============================================================================
   # Tests where the 'value' param is a RaggedTensorValue
   #=============================================================================
-  @parameterized.parameters([
-      dict(
-          value=ragged.constant_value([[1, 2], [3]], dtype=np.int32),
-          expected_dtype=dtypes.int32),
-      dict(
-          value=ragged.constant_value([[b'a', b'b'], [b'c']]),
-          expected_dtype=dtypes.string),
-      dict(
-          value=ragged.constant_value([[1, 2], [3]], dtype=np.int32),
-          dtype=dtypes.float32,
-          expected_dtype=dtypes.float32),
-      dict(
-          value=ragged.constant_value([[1, 2], [3]], dtype=np.int32),
-          preferred_dtype=dtypes.float32,
-          expected_dtype=dtypes.float32),
-      dict(
-          value=ragged.constant_value([[1, 2], [3]], dtype=np.int32),
-          preferred_dtype=dtypes.string,
-          expected_dtype=dtypes.int32),
-  ])
-  @test_util.run_deprecated_v1
+  @parameterized.parameters(
+      [
+          dict(
+              value=ragged_factory_ops.constant_value([[1, 2], [3]],
+                                                      dtype=np.int32),
+              expected_dtype=dtypes.int32),
+          dict(
+              value=ragged_factory_ops.constant_value([[b'a', b'b'], [b'c']]),
+              expected_dtype=dtypes.string),
+          dict(
+              value=ragged_factory_ops.constant_value([[1, 2], [3]],
+                                                      dtype=np.int32),
+              dtype=dtypes.float32,
+              expected_dtype=dtypes.float32),
+          dict(
+              value=ragged_factory_ops.constant_value([[1, 2], [3]],
+                                                      dtype=np.int32),
+              preferred_dtype=dtypes.float32,
+              expected_dtype=dtypes.float32),
+          dict(
+              value=ragged_factory_ops.constant_value([[1, 2], [3]],
+                                                      dtype=np.int32),
+              preferred_dtype=dtypes.string,
+              expected_dtype=dtypes.int32),
+      ])
   def testConvertRaggedTensorValue(self,
                                    value,
                                    dtype=None,
@@ -98,16 +106,16 @@ class RaggedConvertToTensorOrRaggedTensorTest(test_util.TensorFlowTestCase,
                                    expected_dtype=None):
     if expected_dtype is None:
       expected_dtype = value.dtype if dtype is None else dtype
-    converted = ragged.convert_to_tensor_or_ragged_tensor(
+    converted = ragged_tensor.convert_to_tensor_or_ragged_tensor(
         value, dtype, preferred_dtype)
     self.assertEqual(value.ragged_rank, converted.ragged_rank)
     self.assertEqual(dtypes.as_dtype(expected_dtype), converted.dtype)
-    with self.test_session():
-      self.assertEqual(value.tolist(), self.evaluate(converted).tolist())
+    self.assertEqual(value.to_list(), self.eval_to_list(converted))
 
   @parameterized.parameters([
       dict(
-          value=ragged.constant_value([['a', 'b'], ['c']], dtype=str),
+          value=ragged_factory_ops.constant_value([['a', 'b'], ['c']],
+                                                  dtype=str),
           dtype=dtypes.int32,
           message=r"invalid literal for int\(\) with base 10: 'a'"),
   ])
@@ -117,7 +125,8 @@ class RaggedConvertToTensorOrRaggedTensorTest(test_util.TensorFlowTestCase,
                                         dtype=None,
                                         preferred_dtype=None):
     with self.assertRaisesRegexp(ValueError, message):
-      ragged.convert_to_tensor_or_ragged_tensor(value, dtype, preferred_dtype)
+      ragged_tensor.convert_to_tensor_or_ragged_tensor(value, dtype,
+                                                       preferred_dtype)
 
   #=============================================================================
   # Tests where the 'value' param is a Tensor
@@ -129,10 +138,9 @@ class RaggedConvertToTensorOrRaggedTensorTest(test_util.TensorFlowTestCase,
   ])
   def testConvertTensor(self, pylist, dtype=None, preferred_dtype=None):
     tensor = constant_op.constant(pylist)
-    converted = ragged.convert_to_tensor_or_ragged_tensor(
+    converted = ragged_tensor.convert_to_tensor_or_ragged_tensor(
         tensor, dtype, preferred_dtype)
-    with self.test_session():
-      self.assertIs(tensor, converted)
+    self.assertIs(tensor, converted)
 
   @parameterized.parameters([
       dict(
@@ -146,7 +154,6 @@ class RaggedConvertToTensorOrRaggedTensorTest(test_util.TensorFlowTestCase,
           message=('Tensor conversion requested dtype string for '
                    'Tensor with dtype int32')),
   ])
-  @test_util.run_deprecated_v1
   def testConvertTensorError(self,
                              pylist,
                              message,
@@ -154,7 +161,8 @@ class RaggedConvertToTensorOrRaggedTensorTest(test_util.TensorFlowTestCase,
                              preferred_dtype=None):
     tensor = constant_op.constant(pylist)
     with self.assertRaisesRegexp(ValueError, message):
-      ragged.convert_to_tensor_or_ragged_tensor(tensor, dtype, preferred_dtype)
+      ragged_tensor.convert_to_tensor_or_ragged_tensor(tensor, dtype,
+                                                       preferred_dtype)
 
   #=============================================================================
   # Tests where the 'value' param is a np.array
@@ -186,11 +194,10 @@ class RaggedConvertToTensorOrRaggedTensorTest(test_util.TensorFlowTestCase,
                             expected_dtype=None):
     if expected_dtype is None:
       expected_dtype = value.dtype if dtype is None else dtype
-    converted = ragged.convert_to_tensor_or_ragged_tensor(
+    converted = ragged_tensor.convert_to_tensor_or_ragged_tensor(
         value, dtype, preferred_dtype)
     self.assertEqual(dtypes.as_dtype(expected_dtype), converted.dtype)
-    with self.test_session():
-      self.assertAllEqual(value, converted)
+    self.assertAllEqual(value, converted)
 
   @parameterized.parameters([
       dict(
@@ -204,7 +211,8 @@ class RaggedConvertToTensorOrRaggedTensorTest(test_util.TensorFlowTestCase,
                                  dtype=None,
                                  preferred_dtype=None):
     with self.assertRaisesRegexp(ValueError, message):
-      ragged.convert_to_tensor_or_ragged_tensor(value, dtype, preferred_dtype)
+      ragged_tensor.convert_to_tensor_or_ragged_tensor(value, dtype,
+                                                       preferred_dtype)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_array_ops.py b/tensorflow/python/ops/ragged/ragged_array_ops.py
index 603e39d1dcf7e05af63f51d5e1292bfb1a53d1d1..8ba8c53212f250dd48e5ac6485000494e9726f38 100644
--- a/tensorflow/python/ops/ragged/ragged_array_ops.py
+++ b/tensorflow/python/ops/ragged/ragged_array_ops.py
@@ -27,288 +27,18 @@ from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import gen_ragged_array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.ragged import ragged_conversion_ops
-from tensorflow.python.ops.ragged import ragged_factory_ops
 from tensorflow.python.ops.ragged import ragged_functional_ops
 from tensorflow.python.ops.ragged import ragged_math_ops
 from tensorflow.python.ops.ragged import ragged_tensor
 from tensorflow.python.ops.ragged import ragged_util
 from tensorflow.python.ops.ragged import segment_id_ops
 
-#===============================================================================
-# Row Partitioning
-#===============================================================================
-
-
-def value_rowids(rt_input, name=None):
-  """Returns the row indices for the `values` in the given ragged tensor.
-
-  `value_rowids(rt)` corresponds one-to-one with the outermost dimension of
-  `rt.values`, and specifies the row containing each value.  In particular,
-  the row `rt[row]` consists of the values `rt.values[j]` where
-  `value_rowids(rt)[j] == row`.
-
-  Args:
-    rt_input: The RaggedTensor whose row indices should be returned.
-    name: A name prefix for the returned tensor (optional).
-
-  Returns:
-    A 1-D `int64` `Tensor` with shape `self.values.shape[:1]`.
-    The returned tensor is nonnegative, and is sorted in ascending order.
-
-  #### Example:
-    ```python
-    >>> rt = ragged.constant([[3, 1, 4, 1], [], [5, 9, 2], [6], []])
-    >>> rt.values.eval()
-    [3, 1, 4, 1, 5, 9, 2, 6]
-    >>> ragged.value_rowids(rt).eval()
-    [0, 0, 0, 0, 2, 2, 2, 3]  # corresponds 1:1 with rt.values
-    ```
-  """
-  if not ragged_tensor.is_ragged(rt_input):
-    raise TypeError(
-        'rt_input expected RaggedTensor, got %s' % type(rt_input).__name__)
-  if (isinstance(rt_input, ragged_tensor.RaggedTensor) and
-      rt_input.cached_value_rowids is not None):
-    return rt_input.cached_value_rowids
-
-  with ops.name_scope(name, 'RaggedValueRowIds', [rt_input]):
-    return segment_id_ops.row_splits_to_segment_ids(rt_input.row_splits)
-
-
-def nrows(rt_input, out_type=dtypes.int64, name=None):
-  """Returns the number of rows in the given potentially ragged tensor.
-
-  I.e., the size of the outermost dimension of the tensor.
-
-  Args:
-    rt_input: The potentially ragged tensor whose number of rows should be
-      returned.
-    out_type: `dtype` for the returned tensor.
-    name: A name prefix for the returned tensor (optional).
-
-  Returns:
-    A scalar `Tensor` with dtype `out_type`.
-
-  #### Example:
-    ```python
-    >>> rt = ragged.constant([[3, 1, 4, 1], [], [5, 9, 2], [6], []])
-    >>> ragged.nrows(rt).eval()  # rt has 5 rows.
-    5
-    ```
-  """
-  if (isinstance(rt_input, ragged_tensor.RaggedTensor) and
-      rt_input.cached_nrows is not None):
-    return rt_input.cached_nrows
-
-  with ops.name_scope(name, 'RaggedNRows', [rt_input]):
-    if ragged_tensor.is_ragged(rt_input):
-      return array_ops.shape(rt_input.row_splits, out_type=out_type)[0] - 1
-    else:
-      return array_ops.shape(rt_input, out_type=out_type)[0]
-
-
-def row_starts(rt_input, name=None):
-  """Returns the start indices for rows in the given ragged tensor.
-
-  These indices specify where the values for each row begin in
-  `rt_input.values`.  `ragged.row_starts(rt_input)` is equal to
-  `rt_input.row_splits[:-1]`.
-
-  Args:
-    rt_input: The RaggedTensor whose row starts should be returned.
-    name: A name prefix for the returned tensor (optional).
-
-  Returns:
-    A 1-D Tensor of int64 with shape `[nrows]`.
-    The returned tensor is nonnegative, and is sorted in ascending order.
-
-  #### Example:
-    ```python
-    >>> rt = ragged.constant([[3, 1, 4, 1], [], [5, 9, 2], [6], []])
-    >>> ragged.values(rt).eval()
-    [3, 1, 4, 1, 5, 9, 2, 6]
-    >>> ragged.row_starts(rt).eval()  # indices of row starts in ragged.values
-    [0, 4, 4, 7, 8]
-    ```
-  """
-  if not ragged_tensor.is_ragged(rt_input):
-    raise TypeError(
-        'rt_input expected RaggedTensor, got %s' % type(rt_input).__name__)
-  with ops.name_scope(name, 'RaggedRowStarts', [rt_input]):
-    return rt_input.row_splits[:-1]
-
-
-def row_limits(rt_input, name=None):
-  """Returns the limit indices for rows in the given ragged tensor.
-
-  These indices specify where the values for each row end in
-  `rt_input.values`.  `ragged.row_limits(rt_input)` is equal to
-  `rt_input.row_splits[:-1]`.
-
-  Args:
-    rt_input: The RaggedTensor whose row limits should be returned.
-    name: A name prefix for the returned tensor (optional).
-
-  Returns:
-    A 1-D Tensor of int64 with shape `[nrows]`.
-    The returned tensor is nonnegative, and is sorted in ascending order.
-
-  #### Example:
-    ```python
-    >>> rt = ragged.constant([[3, 1, 4, 1], [], [5, 9, 2], [6], []])
-    >>> ragged.values(rt).eval()
-    [3, 1, 4, 1, 5, 9, 2, 6]
-    >>> ragged.row_limits(rt).eval()  # indices of row limits in ragged.values
-    [4, 4, 7, 8, 8]
-    ```
-  """
-  if not ragged_tensor.is_ragged(rt_input):
-    raise TypeError(
-        'rt_input expected RaggedTensor, got %s' % type(rt_input).__name__)
-  with ops.name_scope(name, 'RaggedRowLimits', [rt_input]):
-    return rt_input.row_splits[1:]
-
-
-def row_lengths(rt_input, axis=1, name=None):
-  """Returns the lengths of the rows in the given potentially ragged tensor.
-
-  `ragged.row_lengths(rt_input)[i]` indicates the number of values in the
-  `i`th row of `rt_input`.
-
-  Args:
-    rt_input: The potentially ragged tensor whose row lengths should be
-      returned.  Must have at least `axis+1` dimensions.
-    axis: An integer constant indicating the axis whose row lengths should be
-      returned.
-    name: A name prefix for the returned tensor (optional).
-
-  Returns:
-    A potentially Tensor of int64 with shape `rt_input.shape[:axis]`.
-
-  Raises:
-    ValueError: If rt_input is a scalar, or `axis` is out of bounds.
-
-  #### Example:
-    ```python
-    >>> rt = ragged.constant([[[3, 1, 4], [1]], [], [[5, 9], [2]], [[6]], []])
-    >>> ragged.row_lengths(rt).eval()  # lengths of rows in rt
-    [2, 0, 2, 1, 0]
-    >>> ragged.row_lengths(rt, axis=2).eval()  # lengths of axis=2 rows.
-    [[3, 1], [], [2, 1], [1], []]
-    ```
-  """
-  if (isinstance(rt_input, ragged_tensor.RaggedTensor) and
-      rt_input.cached_row_lengths is not None):
-    return rt_input.cached_row_lengths
-
-  with ops.name_scope(name, 'RaggedRowLengths', [rt_input]):
-    rt_input = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
-        rt_input, name='rt_input')
-    ndims = rt_input.shape.ndims
-    if ndims is not None:
-      if ndims == 0:
-        raise ValueError('rt_input may not be a scalar.')
-      elif not -ndims <= axis < ndims:
-        raise ValueError('axis=%d out of bounds: expected %d<=axis<%d.' %
-                         (axis, -ndims, ndims))
-    if ragged_tensor.is_ragged(rt_input):
-      axis = ragged_util.get_positive_axis(axis, rt_input.shape.ndims)
-      if axis == 0:
-        return nrows(rt_input)
-      elif axis == 1:
-        splits = rt_input.row_splits
-        return splits[1:] - splits[:-1]
-      else:
-        return rt_input.with_values(row_lengths(rt_input.values, axis - 1))
-    else:
-      shape = array_ops.shape(rt_input, out_type=dtypes.int64)
-      return array_ops.ones(shape[:axis], dtypes.int64) * shape[axis]
-
-
-def nested_row_lengths(rt_input, name=None):
-  """Returns a tuple containing the row_lengths for all ragged dimensions.
-
-  `nested_row_lengths(rt)` is a tuple containing the `row_lengths` tensors for
-  all ragged dimensions in `rt`, ordered from outermost to innermost.
-
-  Args:
-    rt_input: A potentially ragged tensor.
-    name: A name prefix for the returned tensors (optional).
-
-  Returns:
-    A `tuple` of 1-D `int64` `Tensors`.  The length of the tuple is equal to
-    `rt_input.ragged_rank`.
-  """
-  with ops.name_scope(name, 'RaggedNestedRowLengths', [rt_input]):
-    rt_nested_row_lengths = []
-    while isinstance(rt_input, ragged_tensor.RaggedTensor):
-      rt_nested_row_lengths.append(row_lengths(rt_input))
-      rt_input = rt_input.values
-    return tuple(rt_nested_row_lengths)
-
-
-#===============================================================================
-# Bounding Shape
-#===============================================================================
-def bounding_shape(rt_input, axis=None, name=None):
-  """Returns the tight bounding box shape for a potentially ragged tensor.
-
-  Args:
-    rt_input: A potentially ragged tensor.
-    axis: An integer scalar or vector indicating which axes to return the
-      bounding box for.  If not specified, then the full bounding box is
-      returned.
-    name: A name prefix for the returned tensor (optional).
-
-  Returns:
-    An int64 `Tensor`.  If `axis` is not specified, then `output`
-    is a vector with `output.shape=[rt_input.shape.ndims]`.  If `axis` is a
-    scalar, then the `output` is a scalar.  If `axis` is a vector, then
-    `output` is a vector, where `output[i]` is the bounding size for
-    dimension `axis[i]`.
-
-  #### Example:
-    ```python
-    >>> rt = ragged.constant([[1, 2, 3, 4], [5], [], [6, 7, 8, 9], [10]])
-    >>> ragged.bounding_shape(rt).eval().tolist()
-    [5, 4]
-    ```
-  """
-  with ops.name_scope(name, 'RaggedBoundingBox', [rt_input, axis]):
-    rt_input = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
-        rt_input, name='rt_input')
-    if not ragged_tensor.is_ragged(rt_input):
-      bbox = array_ops.shape(rt_input)
-      return bbox if axis is None else array_ops.gather(bbox, axis)
-
-    nested_splits = rt_input.nested_row_splits
-    rt_inner_values = rt_input.inner_values
-
-    # Optimized special cases for when axis=0 or axis=1:
-    if isinstance(axis, int):
-      if axis == 0:
-        return array_ops.shape(nested_splits[0], out_type=dtypes.int64)[0] - 1
-      elif axis == 1:
-        return math_ops.maximum(math_ops.reduce_max(row_lengths(rt_input)), 0)
-
-    splits_shape = array_ops.shape(rt_input.row_splits, out_type=dtypes.int64)
-    inner_values_shape = array_ops.shape(rt_inner_values, out_type=dtypes.int64)
-
-    ragged_dimensions = array_ops.stack([splits_shape[0] - 1] + [
-        math_ops.maximum(math_ops.reduce_max(splits[1:] - splits[:-1]), 0)
-        for splits in nested_splits
-    ])
-    inner_dimensions = inner_values_shape[1:]
-
-    bbox = array_ops.concat([ragged_dimensions, inner_dimensions], axis=0)
-    return bbox if axis is None else array_ops.gather(bbox, axis)
-
 
 #===============================================================================
 # ragged_gather
 #===============================================================================
 # TODO(edloper): Add an `axis` argument
-def gather(params, indices, name=None):
+def gather(params, indices, validate_indices=None, axis=0, name=None):
   """Gathers ragged slices from `params` axis `0` according to `indices`.
 
   Returns `RaggedTensor` output, such that:
@@ -328,16 +58,16 @@ def gather(params, indices, name=None):
   ```python
   >>> params = tf.constant(['a', 'b', 'c', 'd', 'e'])
   >>> indices = tf.constant([3, 1, 2, 1, 0])
-  >>> ragged_params = ragged.constant([['a', 'b', 'c'], ['d'], [], ['e']])
-  >>> ragged_indices = ragged.constant([[3, 1, 2], [1], [], [0]])
+  >>> ragged_params = tf.ragged.constant([['a', 'b', 'c'], ['d'], [], ['e']])
+  >>> ragged_indices = tf.ragged.constant([[3, 1, 2], [1], [], [0]])
 
-  >>> print ragged.gather(params, ragged_indices).eval().tolist()
+  >>> print ragged.gather(params, ragged_indices)
   [['d', 'b', 'c'], ['b'], [], ['a']]
 
-  >>> print ragged.gather(ragged_params, indices).eval().tolist()
+  >>> print ragged.gather(ragged_params, indices)
   [['e'], ['d'], [], ['d'], ['a', 'b', 'c']]
 
-  >>> print ragged.gather(ragged_params, ragged_indices).eval().tolist()
+  >>> print ragged.gather(ragged_params, ragged_indices)
   [[['e'], ['d'], []], [['d']], [], [['a', 'b', 'c']]]
   ```
 
@@ -347,6 +77,8 @@ def gather(params, indices, name=None):
     indices: The potentially ragged tensor indicating which values to gather.
       Must have dtype `int32` or `int64`.  Values must be in the range `[0,
       params.shape[0]]`.
+    validate_indices: Ignored.
+    axis: Must be zero.
     name: A name for the operation (optional).
 
   Returns:
@@ -357,10 +89,13 @@ def gather(params, indices, name=None):
   Raises:
     ValueError: If indices.shape.ndims is not known statically.
   """
+  del validate_indices
+  if not isinstance(axis, int) or axis != 0:
+    raise ValueError('axis>0 is not supported for ragged gather yet.')
   with ops.name_scope(name, 'RaggedGather', [params, indices]):
-    params = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
+    params = ragged_tensor.convert_to_tensor_or_ragged_tensor(
         params, name='params')
-    indices = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
+    indices = ragged_tensor.convert_to_tensor_or_ragged_tensor(
         indices, name='indices')
 
     if ragged_tensor.is_ragged(indices):
@@ -375,13 +110,13 @@ def gather(params, indices, name=None):
 
     result = gen_ragged_array_ops.ragged_gather(
         indices=indices,
-        params_dense_values=params.inner_values,
+        params_dense_values=params.flat_values,
         params_nested_splits=params.nested_row_splits,
         OUTPUT_RAGGED_RANK=indices.shape.ndims + len(params.nested_row_splits) -
         1)
 
     # Compose the RaggedTensor from splits & values.
-    return ragged_factory_ops.from_nested_row_splits(
+    return ragged_tensor.RaggedTensor.from_nested_row_splits(
         result.output_dense_values, result.output_nested_splits)
 
 
@@ -414,8 +149,8 @@ def batch_gather(params, indices, name=None):
 
   #### Example:
     ```python
-    >>> params = ragged.constant([['a', 'b', 'c'], ['d'], [], ['e']])
-    >>> indices = ragged.constant([[1, 2, 0], [], [], [0, 0]])
+    >>> params = tf.ragged.constant([['a', 'b', 'c'], ['d'], [], ['e']])
+    >>> indices = tf.ragged.constant([[1, 2, 0], [], [], [0, 0]])
     >>> ragged.batch_gather(params, indices)
     [['b', 'c', 'a'], [], [], ['e', 'e']]
     ```
@@ -424,9 +159,9 @@ def batch_gather(params, indices, name=None):
     return array_ops.batch_gather(params, indices, name)
 
   with ops.name_scope(name, 'RaggedBatchGather', [params, indices]):
-    params = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
+    params = ragged_tensor.convert_to_tensor_or_ragged_tensor(
         params, name='params')
-    indices = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
+    indices = ragged_tensor.convert_to_tensor_or_ragged_tensor(
         indices, name='indices')
     indices_ndims = indices.shape.ndims
     if indices_ndims is None:
@@ -443,7 +178,7 @@ def batch_gather(params, indices, name=None):
                            'not match params shape')
         checks = [check_ops.assert_equal(params.row_splits, indices.row_splits)]
         with ops.control_dependencies(checks):
-          return ragged_factory_ops.from_row_splits(
+          return ragged_tensor.RaggedTensor.from_row_splits(
               batch_gather(params.values, indices.values), indices.row_splits)
 
       # Otherwise, indices is a 2D ragged tensor with 1 ragged dimension.
@@ -457,11 +192,11 @@ def batch_gather(params, indices, name=None):
 
         # Adjust indices from within-batch to global (in params.values), and
         # then use ragged.gather to gather them.
-        num_indices = row_lengths(indices)
-        params_starts = row_starts(params)
+        num_indices = indices.row_lengths()
+        params_starts = params.row_starts()
         adjustments = ragged_util.repeat(params_starts, num_indices, axis=0)
         adjusted_index_values = math_ops.to_int64(indices.values) + adjustments
-        return ragged_factory_ops.from_row_splits(
+        return ragged_tensor.RaggedTensor.from_row_splits(
             gather(params.values, adjusted_index_values), indices.row_splits)
 
     else:  # params is a RaggedTensor and indices is a Tensor.
@@ -469,7 +204,7 @@ def batch_gather(params, indices, name=None):
         return gather(params, indices)
       elif indices_ndims == 2:
         # Adjust indices from batch-local to global (in params.values)
-        adjustments = array_ops.expand_dims(row_starts(params), 1)
+        adjustments = array_ops.expand_dims(params.row_starts(), 1)
         adjusted_indices = math_ops.to_int64(indices) + adjustments
         return gather(params.values, adjusted_indices)
       else:
@@ -527,9 +262,9 @@ def gather_nd(params, indices, name=None):
 
   with ops.name_scope(name, 'RaggedGatherNd', [params, indices]):
 
-    params = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
+    params = ragged_tensor.convert_to_tensor_or_ragged_tensor(
         params, name='params')
-    indices = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
+    indices = ragged_tensor.convert_to_tensor_or_ragged_tensor(
         indices, name='indices')
     indices_shape = indices.shape
     indices_ndims = indices_shape.ndims
@@ -543,7 +278,7 @@ def gather_nd(params, indices, name=None):
 
     # `index_size` is the "n" in "gather_nd" -- i.e., the number of dimensions
     # that each index slices into.
-    index_size = indices_shape[-1].value
+    index_size = tensor_shape.dimension_value(indices_shape[-1])
     if index_size is None:
       raise ValueError('indices.shape[-1] must be statically known.')
 
@@ -555,8 +290,7 @@ def gather_nd(params, indices, name=None):
       if indices_is_dense:
         indices = ragged_conversion_ops.from_tensor(
             indices, ragged_rank=indices_ndims - 2)
-      result = indices.with_inner_values(
-          gather_nd(params, indices.inner_values))
+      result = indices.with_flat_values(gather_nd(params, indices.flat_values))
       if (indices_is_dense and ragged_tensor.is_ragged(result) and
           result.ragged_rank == indices_ndims - 2):
         result = ragged_conversion_ops.to_tensor(result)
@@ -570,7 +304,7 @@ def gather_nd(params, indices, name=None):
     # Handle corner case: An empty index tuple selects the entire `params`
     # value.  So if `index_size` is zero, then tile `params`.
     if index_size == 0:
-      params_ndims = params.ragged_rank + array_ops.rank(params.inner_values)
+      params_ndims = params.ragged_rank + array_ops.rank(params.flat_values)
       for dim in range(indices_ndims - 1):
         params = expand_dims(params, axis=0)
       multiples = array_ops.concat([
@@ -608,7 +342,7 @@ def gather_nd(params, indices, name=None):
           return array_ops.gather_nd(flattened_params, flattened_index_tuples)
 
         flattened_index_tuples = array_ops.gather(
-            row_starts(flattened_params), flattened_index_tuples)
+            flattened_params.row_starts(), flattened_index_tuples)
         flattened_index_tuples += indices[..., dim]
         flattened_params = flattened_params.values
 
@@ -704,9 +438,8 @@ def boolean_mask(data, mask, keepdims=False, name=None):
   """
   with ops.name_scope(name, 'RaggedMask', [data, mask]):
     # Convert inputs to tensors.
-    data = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
-        data, name='data')
-    mask = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
+    data = ragged_tensor.convert_to_tensor_or_ragged_tensor(data, name='data')
+    mask = ragged_tensor.convert_to_tensor_or_ragged_tensor(
         mask, dtypes.bool, name='mask')
 
     # Get static rank of mask.
@@ -737,7 +470,7 @@ def boolean_mask(data, mask, keepdims=False, name=None):
           else:
             # Count the number of True mask values in each row to find the
             # lengths of the filtered rows; then convert to splits.
-            int_mask = ragged_functional_ops.map_inner_values(
+            int_mask = ragged_functional_ops.map_flat_values(
                 math_ops.cast, mask, dtype=dtypes.int64)
             masked_row_lengths = ragged_math_ops.reduce_sum(int_mask, axis=1)
             splits.append(ragged_util.lengths_to_splits(masked_row_lengths))
@@ -749,7 +482,7 @@ def boolean_mask(data, mask, keepdims=False, name=None):
 
         # Add the ragged `splits` back to the result.
         if keepdims:
-          masked_values = ragged_factory_ops.from_nested_row_splits(
+          masked_values = ragged_tensor.RaggedTensor.from_nested_row_splits(
               masked_values, splits)
 
         return masked_values
@@ -760,7 +493,7 @@ def boolean_mask(data, mask, keepdims=False, name=None):
       # Get the masked splits: first get the length of each row, then filter
       # out the rows that we are deleting, and convert that filtered set of
       # masks back to a splits tensor.
-      lengths = row_lengths(data)
+      lengths = data.row_lengths()
       masked_lengths = array_ops.boolean_mask(lengths, mask)
       masked_splits = ragged_util.lengths_to_splits(masked_lengths)
 
@@ -772,7 +505,8 @@ def boolean_mask(data, mask, keepdims=False, name=None):
       segment_mask = array_ops.gather(mask, segment_ids)
       masked_values = boolean_mask(data.values, segment_mask, keepdims=False)
 
-      return ragged_factory_ops.from_row_splits(masked_values, masked_splits)
+      return ragged_tensor.RaggedTensor.from_row_splits(masked_values,
+                                                        masked_splits)
 
     # If mask is non-ragged and has rank>1, then convert it to be ragged,
     # with a ragged rank matching data.
@@ -793,7 +527,7 @@ def boolean_mask(data, mask, keepdims=False, name=None):
         # and values to get the innermost ragged tensor.
         masked_lengths = math_ops.count_nonzero(mask, axis=-1)
         flattened_masked_lengths = array_ops.reshape(masked_lengths, [-1])
-        masked_values = ragged_factory_ops.from_row_lengths(
+        masked_values = ragged_tensor.RaggedTensor.from_row_lengths(
             masked_values, flattened_masked_lengths)
 
         # Wrap remaining ragged dimensions.
@@ -803,7 +537,7 @@ def boolean_mask(data, mask, keepdims=False, name=None):
           for dim in range(mask.shape.ndims - 3, -1, -1):
             elt_size = mask_shape[dim + 1]
             masked_splits = math_ops.range(split_size[dim]) * elt_size
-            masked_values = ragged_factory_ops.from_row_splits(
+            masked_values = ragged_tensor.RaggedTensor.from_row_splits(
                 masked_values, masked_splits)
 
       return masked_values
@@ -812,86 +546,86 @@ def boolean_mask(data, mask, keepdims=False, name=None):
 #===============================================================================
 # Concatenation and Stacking
 #===============================================================================
-def concat(rt_inputs, axis, name=None):
+def concat(values, axis, name=None):
   """Concatenates potentially ragged tensors along one dimension.
 
   Given a list of tensors with the same rank `K` (`K >= axis`), returns a
   rank-`K` `RaggedTensor` `result` such that `result[i0...iaxis]` is the
-  concatenation of `[rt[i0...iaxis] for rt in rt_inputs]`.
+  concatenation of `[rt[i0...iaxis] for rt in values]`.
 
   Args:
-    rt_inputs: A list of potentially ragged tensors.  May not be empty. All
-      `rt_inputs` must have the same rank and the same dtype; but unlike
+    values: A list of potentially ragged tensors.  May not be empty. All
+      `values` must have the same rank and the same dtype; but unlike
       `tf.concat`, they can have arbitrary shapes.
     axis: A python integer, indicating the dimension along which to concatenate.
       (Note: Unlike `tf.concat`, the `axis` parameter must be statically known.)
         Negative values are supported only if the rank of at least one
-        `rt_inputs` value is statically known.
+        `values` value is statically known.
     name: A name prefix for the returned tensor (optional).
 
   Returns:
     A `RaggedTensor` with rank `K`.
-    `result.ragged_rank=max(axis, max(rt.ragged_rank for rt in rt_inputs]))`.
+    `result.ragged_rank=max(axis, max(rt.ragged_rank for rt in values]))`.
 
   Raises:
-    ValueError: If `rt_inputs` is empty, if `axis` is out of bounds or if
+    ValueError: If `values` is empty, if `axis` is out of bounds or if
       the input tensors have different ranks.
 
   #### Example:
     ```python
-    >>> t1 = ragged.constant([[1, 2], [3, 4, 5]])
-    >>> t2 = ragged.constant([[6], [7, 8, 9]])
+    >>> t1 = tf.ragged.constant([[1, 2], [3, 4, 5]])
+    >>> t2 = tf.ragged.constant([[6], [7, 8, 9]])
     >>> ragged.concat([t1, t2], axis=0)
     [[1, 2], [3, 4, 5], [6], [7, 8, 9]]
     >>> ragged.concat([t1, t2], axis=1)
     [[1, 2, 6], [3, 4, 5, 7, 8, 9]]
     ```
   """
-  if not isinstance(rt_inputs, (list, tuple)):
-    rt_inputs = [rt_inputs]
-  with ops.name_scope(name, 'RaggedConcat', rt_inputs):
-    return _ragged_stack_concat_helper(rt_inputs, axis, stack_values=False)
+  if not isinstance(values, (list, tuple)):
+    values = [values]
+  with ops.name_scope(name, 'RaggedConcat', values):
+    return _ragged_stack_concat_helper(values, axis, stack_values=False)
 
 
-def stack(rt_inputs, axis, name=None):
+def stack(values, axis=0, name=None):
   """Stacks potentially ragged tensors along one dimension.
 
   Given a list of tensors with the same rank `K` (`K >= axis`), returns a
   rank-`K+1` `RaggedTensor` `result` such that `result[i0...iaxis]` is the
-  list `[rt[i0...iaxis] for rt in rt_inputs]`.
+  list `[rt[i0...iaxis] for rt in values]`.
 
   Args:
-    rt_inputs: A list of potentially ragged tensors.  May not be empty. All
-      `rt_inputs` must have the same rank and the same dtype; but unlike
+    values: A list of potentially ragged tensors.  May not be empty. All
+      `values` must have the same rank and the same dtype; but unlike
       `tf.concat`, they can have arbitrary shapes.
     axis: A python integer, indicating the dimension along which to stack.
       (Note: Unlike `tf.stack`, the `axis` parameter must be statically known.)
         Negative values are supported only if the rank of at least one
-        `rt_inputs` value is statically known.
+        `values` value is statically known.
     name: A name prefix for the returned tensor (optional).
 
   Returns:
     A `RaggedTensor` with rank `K+1`.
-    `result.ragged_rank=max(axis, max(rt.ragged_rank for rt in rt_inputs]))`.
+    `result.ragged_rank=max(axis, max(rt.ragged_rank for rt in values]))`.
 
   Raises:
-    ValueError: If `rt_inputs` is empty, if `axis` is out of bounds or if
+    ValueError: If `values` is empty, if `axis` is out of bounds or if
       the input tensors have different ranks.
 
   #### Example:
     ```python
-    >>> t1 = ragged.constant([[1, 2], [3, 4, 5]])
-    >>> t2 = ragged.constant([[6], [7, 8, 9]])
+    >>> t1 = tf.ragged.constant([[1, 2], [3, 4, 5]])
+    >>> t2 = tf.ragged.constant([[6], [7, 8, 9]])
     >>> ragged.stack([t1, t2], axis=0)
     [[[1, 2], [3, 4, 5]], [[6], [7, 9, 0]]]
     >>> ragged.stack([t1, t2], axis=1)
     [[[1, 2], [6]], [[3, 4, 5], [7, 8, 9]]]
     ```
   """
-  if not isinstance(rt_inputs, (list, tuple)):
-    rt_inputs = [rt_inputs]
-  with ops.name_scope(name, 'RaggedConcat', rt_inputs):
-    return _ragged_stack_concat_helper(rt_inputs, axis, stack_values=True)
+  if not isinstance(values, (list, tuple)):
+    values = [values]
+  with ops.name_scope(name, 'RaggedConcat', values):
+    return _ragged_stack_concat_helper(values, axis, stack_values=True)
 
 
 def _ragged_stack_concat_helper(rt_inputs, axis, stack_values):
@@ -914,7 +648,7 @@ def _ragged_stack_concat_helper(rt_inputs, axis, stack_values):
 
   # Convert input tensors.
   rt_inputs = [
-      ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
+      ragged_tensor.convert_to_tensor_or_ragged_tensor(
           rt_input, name='rt_input') for rt_input in rt_inputs
   ]
 
@@ -965,7 +699,7 @@ def _ragged_stack_concat_helper(rt_inputs, axis, stack_values):
     values = [rt.values for rt in rt_inputs]
     splits = [[rt_input.row_splits] for rt_input in rt_inputs]
     with ops.control_dependencies(ragged_util.assert_splits_match(splits)):
-      return ragged_factory_ops.from_row_splits(
+      return ragged_tensor.RaggedTensor.from_row_splits(
           _ragged_stack_concat_helper(values, axis - 1, stack_values),
           splits[0][0])
 
@@ -982,8 +716,8 @@ def _ragged_stack_concat_axis_0(rt_inputs, stack_values):
     A RaggedTensor.
   """
   # Concatenate the inner values together.
-  inner_values = [rt.inner_values for rt in rt_inputs]
-  concatenated_inner_values = array_ops.concat(inner_values, axis=0)
+  flat_values = [rt.flat_values for rt in rt_inputs]
+  concatenated_flat_values = array_ops.concat(flat_values, axis=0)
 
   # Concatenate the splits together for each ragged dimension (adjusting
   # split offsets as necessary).
@@ -997,12 +731,12 @@ def _ragged_stack_concat_axis_0(rt_inputs, stack_values):
 
   # If we are performing a stack operation, then add another splits.
   if stack_values:
-    stack_lengths = array_ops.stack([nrows(rt) for rt in rt_inputs])
+    stack_lengths = array_ops.stack([_nrows(rt) for rt in rt_inputs])
     stack_splits = ragged_util.lengths_to_splits(stack_lengths)
     concatenated_nested_splits.insert(0, stack_splits)
 
-  return ragged_factory_ops.from_nested_row_splits(concatenated_inner_values,
-                                                   concatenated_nested_splits)
+  return ragged_tensor.RaggedTensor.from_nested_row_splits(
+      concatenated_flat_values, concatenated_nested_splits)
 
 
 def _ragged_stack_concat_axis_1(rt_inputs, stack_values):
@@ -1018,10 +752,10 @@ def _ragged_stack_concat_axis_1(rt_inputs, stack_values):
   """
   num_inputs = len(rt_inputs)
 
-  rt_nrows = nrows(rt_inputs[0])
+  rt_nrows = _nrows(rt_inputs[0])
   nrows_msg = 'Input tensors have incompatible shapes.'
   nrows_checks = [
-      check_ops.assert_equal(nrows(rt), rt_nrows, message=nrows_msg)
+      check_ops.assert_equal(_nrows(rt), rt_nrows, message=nrows_msg)
       for rt in rt_inputs[1:]
   ]
 
@@ -1045,14 +779,15 @@ def _ragged_stack_concat_axis_1(rt_inputs, stack_values):
       # Add a new splits tensor to group together the values.
       stack_splits = math_ops.range(0, rt_nrows * num_inputs + 1, num_inputs)
       _copy_row_shape(rt_inputs, stack_splits)
-      return ragged_factory_ops.from_row_splits(permuted_rt, stack_splits)
+      return ragged_tensor.RaggedTensor.from_row_splits(permuted_rt,
+                                                        stack_splits)
     else:
       # Merge together adjacent rows by dropping the row-split indices that
       # separate them.
       concat_splits = permuted_rt.row_splits[::num_inputs]
       _copy_row_shape(rt_inputs, concat_splits)
-      return ragged_factory_ops.from_row_splits(permuted_rt.values,
-                                                concat_splits)
+      return ragged_tensor.RaggedTensor.from_row_splits(permuted_rt.values,
+                                                        concat_splits)
 
 
 def _copy_row_shape(rt_inputs, splits):
@@ -1065,53 +800,53 @@ def _copy_row_shape(rt_inputs, splits):
 #===============================================================================
 # Tiling
 #===============================================================================
-def tile(rt_input, multiples, name=None):
+def tile(input, multiples, name=None):  # pylint: disable=redefined-builtin
   """Constructs a `RaggedTensor` by tiling a given `RaggedTensor`.
 
-  The values of `rt_input` are replicated `multiples[i]` times along the
+  The values of `input` are replicated `multiples[i]` times along the
   `i`th dimension (for each dimension `i`).  For every dimension `axis` in
-  `rt_input`, the length of each output element in that dimension is the
+  `input`, the length of each output element in that dimension is the
   length of corresponding input element multiplied by `multiples[axis]`.
 
   Args:
-    rt_input: A `RaggedTensor`.
+    input: A `RaggedTensor`.
     multiples: A 1-D integer `Tensor`.  Length must be the same as the number of
-      dimensions in `rt_input`.
+      dimensions in `input`.
     name: A name for the operation (optional).
 
   Returns:
-    A `RaggedTensor` with the same type, rank, and ragged_rank as `rt_input`.
+    A `RaggedTensor` with the same type, rank, and ragged_rank as `input`.
 
   #### Example:
     ```python
-    >>> rt = ragged.constant([[1, 2], [3]])
-    >>> ragged.tile(rt, [3, 2]).eval().tolist()
+    >>> rt = tf.ragged.constant([[1, 2], [3]])
+    >>> ragged.tile(rt, [3, 2])
     [[1, 2, 1, 2], [3, 3], [1, 2, 1, 2], [3, 3], [1, 2, 1, 2], [3, 3]]
     ```
   """
-  with ops.name_scope(name, 'RaggedTile', [rt_input, multiples]):
-    rt_input = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
-        rt_input, name='rt_input')
+  with ops.name_scope(name, 'RaggedTile', [input, multiples]):
+    input = ragged_tensor.convert_to_tensor_or_ragged_tensor(
+        input, name='input')
     multiples = ragged_util.convert_to_int_tensor(
         multiples, name='multiples', dtype=dtypes.int64)
     multiples.shape.assert_has_rank(1)
-    if not ragged_tensor.is_ragged(rt_input):
-      return array_ops.tile(rt_input, multiples, name)
+    if not ragged_tensor.is_ragged(input):
+      return array_ops.tile(input, multiples, name)
 
     # If the constant value of `multiples` is available, then we can use it
     # to skip tiling dimensions where `multiples=1`.
     const_multiples = tensor_util.constant_value(multiples)
 
-    return ragged_factory_ops.from_nested_row_splits(
-        _tile_ragged_values(rt_input, multiples, const_multiples),
-        _tile_ragged_splits(rt_input, multiples, const_multiples))
+    return ragged_tensor.RaggedTensor.from_nested_row_splits(
+        _tile_ragged_values(input, multiples, const_multiples),
+        _tile_ragged_splits(input, multiples, const_multiples))
 
 
 def _tile_ragged_values(rt_input, multiples, const_multiples=None):
-  """Builds inner_values tensor for a tiled `RaggedTensor`.
+  """Builds flat_values tensor for a tiled `RaggedTensor`.
 
   Returns a tensor that repeats the values in
-  `rt_input.inner_values` in the
+  `rt_input.flat_values` in the
   appropriate pattern to construct a `RaggedTensor` that tiles `rt_input` as
   specified by `multiples`.
 
@@ -1123,19 +858,19 @@ def _tile_ragged_values(rt_input, multiples, const_multiples=None):
       dimensions where `multiples=1`.
 
   Returns:
-    A `Tensor` with the same type and rank as `rt_input.inner_values`.
+    A `Tensor` with the same type and rank as `rt_input.flat_values`.
 
   #### Example:
     ```python
-    >>> rt = ragged.constant([[1, 2], [3]])
-    >>> _tile_ragged_values(rt, [3, 2]).eval().tolist()
+    >>> rt = tf.ragged.constant([[1, 2], [3]])
+    >>> _tile_ragged_values(rt, [3, 2])
     [1, 2, 1, 2, 3, 3, 1, 2, 1, 2, 3, 3, 1, 2, 1, 2, 3, 3]
     ```
   """
   ragged_rank = rt_input.ragged_rank
   nested_splits = rt_input.nested_row_splits
 
-  # Pointers to the values in `rt_input.inner_values`.
+  # Pointers to the values in `rt_input.flat_values`.
   inner_value_ids = math_ops.range(nested_splits[-1][-1])
 
   # For each ragged dimension (working from the innermost to outermost),
@@ -1158,9 +893,9 @@ def _tile_ragged_values(rt_input, multiples, const_multiples=None):
     prev_splits = splits
 
   # Gather the tiled inner values.
-  ragged_tiled_values = array_ops.gather(rt_input.inner_values, inner_value_ids)
+  ragged_tiled_values = array_ops.gather(rt_input.flat_values, inner_value_ids)
 
-  # Tile the inner_values for the uniform dimensions (i.e., for `axis=0` plus
+  # Tile the flat_values for the uniform dimensions (i.e., for `axis=0` plus
   # `axis=range(ragged_rank, rank)`).
   inner_repeats = array_ops.concat([multiples[:1], multiples[ragged_rank + 1:]],
                                    axis=0)
@@ -1186,8 +921,8 @@ def _tile_ragged_splits(rt_input, multiples, const_multiples=None):
 
   #### Example:
     ```python
-    >>> rt = ragged.constant([[1, 2], [3]])
-    >>> _tile_ragged_splits(rt, [3, 2]).eval().tolist()
+    >>> rt = tf.ragged.constant([[1, 2], [3]])
+    >>> _tile_ragged_splits(rt, [3, 2])
     [0, 4, 6, 10, 12, 16, 18]
     ```
   """
@@ -1240,26 +975,26 @@ def _tile_ragged_splits(rt_input, multiples, const_multiples=None):
 #===============================================================================
 
 
-def expand_dims(rt_input, axis, name=None):
+def expand_dims(input, axis, name=None):  # pylint: disable=redefined-builtin
   """Inserts a dimension with shape 1 into a potentially ragged tensor's shape.
 
-  Given a potentially ragged tenor `rt_input`, this operation inserts a
-  dimension with size 1 at the dimension `axis` of `rt_input`'s shape.
+  Given a potentially ragged tenor `input`, this operation inserts a
+  dimension with size 1 at the dimension `axis` of `input`'s shape.
 
-  * If `rt_input` is a `Tensor`, then this is equivalent to
+  * If `input` is a `Tensor`, then this is equivalent to
     `tf.expand_dims`.
-  * If `rt_input` is ragged, and `axis=0`, then the new dimension will be
+  * If `input` is ragged, and `axis=0`, then the new dimension will be
     uniform; but the previously outermost dimension will become ragged.
-  * If `rt_input` is ragged, and `0 < axis < rt_input.ragged_rank`, then the
+  * If `input` is ragged, and `0 < axis < input.ragged_rank`, then the
     new dimension will be ragged.
-  * If `rt_input` is ragged, and axis >= rt_input.ragged_rank`, then the new
+  * If `input` is ragged, and axis >= input.ragged_rank`, then the new
     dimension will be uniform.
 
   The following table gives some examples showing how `ragged.expand_dims`
   impacts the shapes of different input tensors.  Ragged dimensions are
   indicated by enclosing them in parentheses.
 
-  rt_input.shape          | axis | result.shape
+  input.shape             | axis | result.shape
   ----------------------- | ---- | -----------------------------
   `[D1, D2]`              |  `0` | `[1, D1, D2]`
   `[D1, D2]`              |  `1` | `[D1, 1, D2]`
@@ -1271,55 +1006,55 @@ def expand_dims(rt_input, axis, name=None):
   `[D1, (D2), (D3), D4]`  |  `4` | `[D1, (D2), (D3), D4, 1]`
 
   Args:
-    rt_input: The potentially tensor that should be expanded with a new
+    input: The potentially tensor that should be expanded with a new
       dimension.
     axis: An integer constant indicating where the new dimension should be
       inserted.
     name: A name for the operation (optional).
 
   Returns:
-    A tensor with the same values as `rt_input`, with an added dimension of
+    A tensor with the same values as `input`, with an added dimension of
     size 1 at `axis`.
 
   #### Examples:
     ```python
-    >>> rt = ragged.constant([[1, 2], [3]])
+    >>> rt = tf.ragged.constant([[1, 2], [3]])
     >>> print rt.shape
     TensorShape([2, None])
 
     >>> expanded = ragged.expand_dims(rt, axis=0)
-    >>> print(expanded.shape, expanded.eval().tolist())
+    >>> print(expanded.shape, expanded)
     TensorShape([1, None, None]) [[[1, 2], [3]]]
 
     >>> expanded = ragged.expand_dims(rt, axis=1)
-    >>> print(expanded.shape, expanded.eval().tolist())
+    >>> print(expanded.shape, expanded)
     TensorShape([2, None, None]) [[[1, 2]], [[3]]]
 
     >>> expanded = ragged.expand_dims(rt, axis=2)
-    >>> print(expanded.shape, expanded.eval().tolist())
+    >>> print(expanded.shape, expanded)
     TensorShape([2, None, 1]) [[[1], [2]], [[3]]]
     ```
   """
-  with ops.name_scope(name, 'RaggedExpandDims', [rt_input]):
-    rt_input = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
-        rt_input, name='rt_input')
+  with ops.name_scope(name, 'RaggedExpandDims', [input]):
+    input = ragged_tensor.convert_to_tensor_or_ragged_tensor(
+        input, name='input')
 
-    if not ragged_tensor.is_ragged(rt_input):
-      return array_ops.expand_dims(rt_input, axis)
+    if not ragged_tensor.is_ragged(input):
+      return array_ops.expand_dims(input, axis)
 
-    ndims = None if rt_input.shape.ndims is None else rt_input.shape.ndims + 1
+    ndims = None if input.shape.ndims is None else input.shape.ndims + 1
     axis = ragged_util.get_positive_axis(axis, ndims)
     if axis == 0:
-      values = rt_input
-      splits = array_ops.stack([0, nrows(rt_input)])
+      values = input
+      splits = array_ops.stack([0, input.nrows()])
     elif axis == 1:
-      values = rt_input
-      splits = math_ops.range(nrows(rt_input) + 1)
+      values = input
+      splits = math_ops.range(input.nrows() + 1)
     else:
-      values = expand_dims(rt_input.values, axis - 1)
-      splits = rt_input.row_splits
+      values = expand_dims(input.values, axis - 1)
+      splits = input.row_splits
 
-    return ragged_factory_ops.from_row_splits(values, splits)
+    return ragged_tensor.RaggedTensor.from_row_splits(values, splits)
 
 
 #===============================================================================
@@ -1374,21 +1109,23 @@ def where(condition, x=None, y=None, name=None):
   #### Examples:
     ```python
     >>> # Coordinates where condition is true.
-    >>> condition = ragged.constant_value([[True, False, True], [False, True]])
+    >>> condition = tf.ragged.constant_value(
+    ...     [[True, False, True], [False, True]])
     >>> ragged.where(condition)
     [[0, 0], [0, 2], [1, 1]]
 
     >>> # Elementwise selection between x and y, based on condition.
-    >>> condition = ragged.constant_value([[True, False, True], [False, True]])
-    >>> x=ragged.constant_value([['A', 'B', 'C'], ['D', 'E']])
-    >>> y=ragged.constant_value([['a', 'b', 'c'], ['d', 'e']])
+    >>> condition = tf.ragged.constant_value(
+    ...     [[True, False, True], [False, True]])
+    >>> x = tf.ragged.constant_value([['A', 'B', 'C'], ['D', 'E']])
+    >>> y = tf.ragged.constant_value([['a', 'b', 'c'], ['d', 'e']])
     >>> ragged.where(condition, x, y)
     [['A', 'b', 'C'], ['d', 'E']]
 
     >>> # Row selection between x and y, based on condition.
     >>> condition = [True, False]
-    >>> x=ragged.constant_value([['A', 'B', 'C'], ['D', 'E']])
-    >>> y=ragged.constant_value([['a', 'b', 'c'], ['d', 'e']])
+    >>> x = tf.ragged.constant_value([['A', 'B', 'C'], ['D', 'E']])
+    >>> y = tf.ragged.constant_value([['a', 'b', 'c'], ['d', 'e']])
     >>> ragged.where(condition, x, y)
     [['A', 'B', 'C'], ['d', 'e']]
     ```
@@ -1396,13 +1133,13 @@ def where(condition, x=None, y=None, name=None):
   if (x is None) != (y is None):
     raise ValueError('x and y must be either both None or both non-None')
   with ops.name_scope('RaggedWhere', name, [condition, x, y]):
-    condition = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
+    condition = ragged_tensor.convert_to_tensor_or_ragged_tensor(
         condition, name='condition')
     if x is None:
       return _coordinate_where(condition)
     else:
-      x = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(x, name='x')
-      y = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(y, name='y')
+      x = ragged_tensor.convert_to_tensor_or_ragged_tensor(x, name='x')
+      y = ragged_tensor.convert_to_tensor_or_ragged_tensor(y, name='y')
       return _elementwise_where(condition, x, y)
 
 
@@ -1416,15 +1153,15 @@ def _elementwise_where(condition, x, y):
     return array_ops.where(condition, x, y)
 
   elif condition_is_ragged and x_is_ragged and y_is_ragged:
-    return ragged_functional_ops.map_inner_values(array_ops.where, condition, x,
-                                                  y)
+    return ragged_functional_ops.map_flat_values(array_ops.where, condition, x,
+                                                 y)
   elif not condition_is_ragged:
     # Concatenate x and y, and then use `gather` to assemble the selected rows.
     condition.shape.assert_has_rank(1)
-    x_nrows = nrows(x)
+    x_nrows = _nrows(x)
     x_and_y = concat([x, y], axis=0)
     indices = array_ops.where(condition, math_ops.range(x_nrows),
-                              x_nrows + math_ops.range(nrows(y)))
+                              x_nrows + math_ops.range(_nrows(y)))
     return gather(x_and_y, indices)
 
   else:
@@ -1441,7 +1178,7 @@ def _coordinate_where(condition):
 
   # Convert the first index in each coordinate to a row index and column index.
   first_index = selected_coords[:, 0]
-  selected_rows = array_ops.gather(value_rowids(condition), first_index)
+  selected_rows = array_ops.gather(condition.value_rowids(), first_index)
   selected_row_starts = array_ops.gather(condition.row_splits, selected_rows)
   selected_cols = first_index - selected_row_starts
 
@@ -1477,3 +1214,11 @@ def _concat_ragged_splits(splits_list):
     pieces.append(splits[1:] + splits_offset)
     splits_offset += splits[-1]
   return array_ops.concat(pieces, axis=0)
+
+
+def _nrows(rt_input, out_type=dtypes.int64, name=None):
+  if isinstance(rt_input, ragged_tensor.RaggedTensor):
+    return rt_input.nrows(out_type=out_type, name=name)
+  else:
+    with ops.name_scope(name, 'RaggedNRows', [rt_input]):
+      return array_ops.shape(rt_input, out_type=out_type)[0]
diff --git a/tensorflow/python/ops/ragged/ragged_batch_gather_op_test.py b/tensorflow/python/ops/ragged/ragged_batch_gather_op_test.py
index d9d840500cb7be5edee3a885b6a1a6cd4119151b..431d350db8a5a266113df9a03e39a90643893d79 100644
--- a/tensorflow/python/ops/ragged/ragged_batch_gather_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_batch_gather_op_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for tf.ragged.batch_gather."""
+"""Tests for ragged_array_ops.batch_gather."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -20,15 +20,20 @@ from __future__ import print_function
 
 from absl.testing import parameterized
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_array_ops
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
 
-class RaggedBatchGatherOpTest(test_util.TensorFlowTestCase,
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedBatchGatherOpTest(ragged_test_util.RaggedTensorTestCase,
                               parameterized.TestCase):
 
   @parameterized.parameters([
@@ -37,10 +42,12 @@ class RaggedBatchGatherOpTest(test_util.TensorFlowTestCase,
       #=========================================================================
       dict(
           descr='Docstring example',
-          params=ragged.constant_value([['a', 'b', 'c'], ['d'], [], ['e']]),
-          indices=ragged.constant_value([[1, 2, 0], [], [], [0, 0]]),
-          expected=ragged.constant_value([[b'b', b'c', b'a'], [], [],
-                                          [b'e', b'e']])),
+          params=ragged_factory_ops.constant_value([['a', 'b', 'c'], ['d'], [],
+                                                    ['e']]),
+          indices=ragged_factory_ops.constant_value([[1, 2, 0], [], [], [0,
+                                                                         0]]),
+          expected=ragged_factory_ops.constant_value([[b'b', b'c', b'a'], [],
+                                                      [], [b'e', b'e']])),
       #=========================================================================
       # 0 Batch Dimensions
       #=========================================================================
@@ -51,9 +58,10 @@ class RaggedBatchGatherOpTest(test_util.TensorFlowTestCase,
           expected=[b'd', b'c']),
       dict(
           descr='params: [P1, (P2)], indices: [I], result: [I, (P2)]',
-          params=ragged.constant_value([['a', 'b'], [], ['c'], ['d', 'e']]),
+          params=ragged_factory_ops.constant_value([['a', 'b'], [], ['c'],
+                                                    ['d', 'e']]),
           indices=[3, 2],
-          expected=ragged.constant_value([[b'd', b'e'], [b'c']])),
+          expected=ragged_factory_ops.constant_value([[b'd', b'e'], [b'c']])),
       #=========================================================================
       # 1 Batch Dimension
       #=========================================================================
@@ -64,22 +72,24 @@ class RaggedBatchGatherOpTest(test_util.TensorFlowTestCase,
           expected=[[b'c', b'a'], [b'd', b'e'], [b'h', b'g']]),
       dict(
           descr='params: [B1, (P1)], indices: [B1, I], result: [B1, I]',
-          params=ragged.constant_value([['a', 'b', 'c'], ['d', 'e'], ['g']]),
+          params=ragged_factory_ops.constant_value([['a', 'b', 'c'], ['d', 'e'],
+                                                    ['g']]),
           indices=[[2, 0], [0, 1], [0, 0]],
           expected=[[b'c', b'a'], [b'd', b'e'], [b'g', b'g']]),
       dict(
           descr='params: [B1, P1], indices: [B1, (I)], result: [B1, (I)]',
           params=[['a', 'b', 'c'], ['d', 'e', 'f'], ['g', 'h', 'i']],
-          indices=ragged.constant_value([[2, 0, 2], [0], [1]]),
-          expected=ragged.constant_value([[b'c', b'a', b'c'], [b'd'], [b'h']])),
+          indices=ragged_factory_ops.constant_value([[2, 0, 2], [0], [1]]),
+          expected=ragged_factory_ops.constant_value([[b'c', b'a', b'c'],
+                                                      [b'd'], [b'h']])),
       dict(
           descr=('params: [B1, (P1), (P2), P3], indices: [B1, I], '
                  'result: [B1, I, (P2), P3]'),
-          params=ragged.constant_value(
+          params=ragged_factory_ops.constant_value(
               [[[['a']], [['b'], ['c']]], [[['d'], ['e']], [['f']]], [[['g']]]],
               ragged_rank=2),
           indices=[[1, 0], [0, 1], [0, 0]],
-          expected=ragged.constant_value(
+          expected=ragged_factory_ops.constant_value(
               [[[[b'b'], [b'c']], [[b'a']]], [[[b'd'], [b'e']], [[b'f']]],
                [[[b'g']], [[b'g']]]],
               ragged_rank=2)),
@@ -95,31 +105,31 @@ class RaggedBatchGatherOpTest(test_util.TensorFlowTestCase,
       dict(
           descr=('params: [B1, (B2), P1], indices: [B1, (B2), I], '
                  'result: [B1, (B2), I]'),
-          params=ragged.constant_value(
+          params=ragged_factory_ops.constant_value(
               [[['a', 'b', 'c'], ['d', 'e', 'f']], [['g', 'h', 'i']]],
               ragged_rank=1),
-          indices=ragged.constant_value([[[2, 0], [0, 1]], [[1, 0]]],
-                                        ragged_rank=1),
-          expected=ragged.constant_value(
+          indices=ragged_factory_ops.constant_value(
+              [[[2, 0], [0, 1]], [[1, 0]]], ragged_rank=1),
+          expected=ragged_factory_ops.constant_value(
               [[[b'c', b'a'], [b'd', b'e']], [[b'h', b'g']]], ragged_rank=1)),
       dict(
           descr=('params: [B1, (B2), (P1)], indices: [B1, (B2), I], '
                  'result: [B1, (B2), I]'),
-          params=ragged.constant_value([[['a', 'b', 'c'], ['d']], [['e', 'f']]],
-                                       ragged_rank=2),
-          indices=ragged.constant_value([[[2, 0], [0, 0]], [[1, 0]]],
-                                        ragged_rank=1),
-          expected=ragged.constant_value(
+          params=ragged_factory_ops.constant_value(
+              [[['a', 'b', 'c'], ['d']], [['e', 'f']]], ragged_rank=2),
+          indices=ragged_factory_ops.constant_value(
+              [[[2, 0], [0, 0]], [[1, 0]]], ragged_rank=1),
+          expected=ragged_factory_ops.constant_value(
               [[[b'c', b'a'], [b'd', b'd']], [[b'f', b'e']]], ragged_rank=1)),
       dict(
           descr=('params: [B1, (B2), P1], indices: [B1, (B2), (I)], '
                  'result: [B1, (B2), (I)]'),
-          params=ragged.constant_value(
+          params=ragged_factory_ops.constant_value(
               [[['a', 'b', 'c'], ['d', 'e', 'f']], [['g', 'h', 'i']]],
               ragged_rank=1),
-          indices=ragged.constant_value([[[2, 1, 0], [0]], [[1, 1]]],
-                                        ragged_rank=2),
-          expected=ragged.constant_value(
+          indices=ragged_factory_ops.constant_value(
+              [[[2, 1, 0], [0]], [[1, 1]]], ragged_rank=2),
+          expected=ragged_factory_ops.constant_value(
               [[[b'c', b'b', b'a'], [b'd']], [[b'h', b'h']]], ragged_rank=2)),
       #=========================================================================
       # 3 Batch Dimensions
@@ -128,74 +138,77 @@ class RaggedBatchGatherOpTest(test_util.TensorFlowTestCase,
           descr=(
               'params: [B1, (B2), (B3), (P1)], indices: [B1, (B2), (B3), I], '
               'result: [B1, (B2), (B3), I]'),
-          params=ragged.constant_value(
+          params=ragged_factory_ops.constant_value(
               [[[['a', 'b', 'c'], ['d']], [['e', 'f']]]], ragged_rank=3),
-          indices=ragged.constant_value([[[[2, 0], [0, 0]], [[1, 0]]]],
-                                        ragged_rank=2),
-          expected=ragged.constant_value(
+          indices=ragged_factory_ops.constant_value(
+              [[[[2, 0], [0, 0]], [[1, 0]]]], ragged_rank=2),
+          expected=ragged_factory_ops.constant_value(
               [[[[b'c', b'a'], [b'd', b'd']], [[b'f', b'e']]]], ragged_rank=2)),
   ])
-  @test_util.run_deprecated_v1
   def testRaggedBatchGather(self, descr, params, indices, expected):
-    result = ragged.batch_gather(params, indices)
-    self.assertEqual(
-        getattr(result, 'ragged_rank', 0), getattr(expected, 'ragged_rank', 0))
-    with self.test_session():
-      if hasattr(expected, 'tolist'):
-        expected = expected.tolist()
-      self.assertEqual(result.eval().tolist(), expected)
+    result = ragged_array_ops.batch_gather(params, indices)
+    self.assertRaggedEqual(result, expected)
 
-  @test_util.run_deprecated_v1
   def testRaggedBatchGatherUnknownRankError(self):
+    if context.executing_eagerly():
+      return
     params = [['a', 'b'], ['c', 'd']]
     indices = array_ops.placeholder(dtypes.int32, shape=None)
-    ragged_indices = ragged.from_row_splits(indices, [0, 2, 4])
+    ragged_indices = ragged_tensor.RaggedTensor.from_row_splits(
+        indices, [0, 2, 4])
 
     with self.assertRaisesRegexp(
         ValueError, 'batch_gather does not allow indices with unknown shape.'):
-      ragged.batch_gather(params, indices)
+      ragged_array_ops.batch_gather(params, indices)
 
     with self.assertRaisesRegexp(
         ValueError, 'batch_gather does not allow indices with unknown shape.'):
-      ragged.batch_gather(params, ragged_indices)
+      ragged_array_ops.batch_gather(params, ragged_indices)
 
-  @parameterized.parameters([
-      dict(
-          params=ragged.constant([['a'], ['b'], ['c']]),
-          indices=ragged.constant([[0], [0]]),
-          message='Dimensions 3 and 2 are not compatible'),
-      dict(
-          params=[[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
-          indices=ragged.constant([[[0, 0], [0, 0, 0]], [[0]]]),
-          message='batch shape from indices does not match params shape'),
-      dict(
-          params=ragged.constant([[[0, 0], [0, 0, 0]], [[0]]]),
-          indices=ragged.constant([[[0, 0]], [[0, 0, 0]], [[0]]]),
-          message='Dimensions must be equal, but are 3 and 4'),
-      dict(
-          params=ragged.constant([[[0, 0], [0, 0, 0]], [[0]], [[0]]]),
-          indices=ragged.constant([[[0, 0]], [[0, 0, 0]], [[0]]]),
-          error=errors.InvalidArgumentError,
-          message='Condition x == y did not hold element-wise'),
-      dict(
-          params=ragged.constant(['a', 'b', 'c']),
-          indices=ragged.constant([[0], [0]]),
-          message='batch shape from indices does not match params shape'),
-      dict(params=ragged.constant_value([['a']]),
-           indices=0,
-           message='indices.rank must be at least 1.'),
-      dict(params=ragged.constant_value([['a']]),
-           indices=[[[0]]],
-           message='batch shape from indices does not match params shape'),
-  ])
-  @test_util.run_deprecated_v1
+  @parameterized.parameters(
+      [
+          dict(
+              params=ragged_factory_ops.constant_value([['a'], ['b'], ['c']]),
+              indices=ragged_factory_ops.constant_value([[0], [0]]),
+              message='Dimensions 3 and 2 are not compatible'),
+          dict(
+              params=[[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
+              indices=ragged_factory_ops.constant_value([[[0, 0], [0, 0, 0]],
+                                                         [[0]]]),
+              message='batch shape from indices does not match params shape'),
+          dict(  # rank mismatch
+              params=ragged_factory_ops.constant_value([[[0, 0], [0, 0, 0]],
+                                                        [[0]]]),
+              indices=ragged_factory_ops.constant_value([[[0, 0]], [[0, 0, 0]],
+                                                         [[0]]]),
+              error=(ValueError, errors.InvalidArgumentError)),
+          dict(
+              params=ragged_factory_ops.constant_value([[[0, 0], [0, 0, 0]],
+                                                        [[0]], [[0]]]),
+              indices=ragged_factory_ops.constant_value([[[0, 0]], [[0, 0, 0]],
+                                                         [[0]]]),
+              error=errors.InvalidArgumentError,
+              message='.*Condition x == y did not hold.*'),
+          dict(
+              params=ragged_factory_ops.constant_value(['a', 'b', 'c']),
+              indices=ragged_factory_ops.constant_value([[0], [0]]),
+              message='batch shape from indices does not match params shape'),
+          dict(
+              params=ragged_factory_ops.constant_value([['a']]),
+              indices=0,
+              message='indices.rank must be at least 1.'),
+          dict(
+              params=ragged_factory_ops.constant_value([['a']]),
+              indices=[[[0]]],
+              message='batch shape from indices does not match params shape'),
+      ])
   def testRaggedBatchGatherStaticError(self,
                                        params,
                                        indices,
-                                       message,
+                                       message=None,
                                        error=ValueError):
     with self.assertRaisesRegexp(error, message):
-      ragged.batch_gather(params, indices)
+      ragged_array_ops.batch_gather(params, indices)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_boolean_mask_op_test.py b/tensorflow/python/ops/ragged/ragged_boolean_mask_op_test.py
index d939d9d63419217826cfc3e6db0c7a3464255953..19f7d216d22e84958743bf771ecd346cd6b55b83 100644
--- a/tensorflow/python/ops/ragged/ragged_boolean_mask_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_boolean_mask_op_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for ragged.boolean_mask."""
+"""Tests for ragged_array_ops.boolean_mask."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -20,15 +20,19 @@ from __future__ import print_function
 
 from absl.testing import parameterized
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_array_ops
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
 
-class RaggedBooleanMaskOpTest(test_util.TensorFlowTestCase,
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedBooleanMaskOpTest(ragged_test_util.RaggedTensorTestCase,
                               parameterized.TestCase):
   # Define short constants for true & false, so the data & mask can be lined
   # up in the examples below.  This makes it easier to read the examples, to
@@ -51,25 +55,25 @@ class RaggedBooleanMaskOpTest(test_util.TensorFlowTestCase,
           data=[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
           mask=[[T, F, T], [F, F, F], [T, F, F]],
           keepdims=True,
-          expected=ragged.constant_value([[1, 3], [], [7]])),
+          expected=ragged_factory_ops.constant_value([[1, 3], [], [7]])),
       dict(
           descr='Docstring example 3',
-          data=ragged.constant_value([[1, 2, 3], [4], [5, 6]]),
-          mask=ragged.constant_value([[F, F, T], [F], [T, T]]),
+          data=ragged_factory_ops.constant_value([[1, 2, 3], [4], [5, 6]]),
+          mask=ragged_factory_ops.constant_value([[F, F, T], [F], [T, T]]),
           keepdims=False,
           expected=[3, 5, 6]),
       dict(
           descr='Docstring example 4',
-          data=ragged.constant_value([[1, 2, 3], [4], [5, 6]]),
-          mask=ragged.constant_value([[F, F, T], [F], [T, T]]),
+          data=ragged_factory_ops.constant_value([[1, 2, 3], [4], [5, 6]]),
+          mask=ragged_factory_ops.constant_value([[F, F, T], [F], [T, T]]),
           keepdims=True,
-          expected=ragged.constant_value([[3], [], [5, 6]])),
+          expected=ragged_factory_ops.constant_value([[3], [], [5, 6]])),
       dict(
           descr='Docstring example 5',
-          data=ragged.constant_value([[1, 2, 3], [4], [5, 6]]),
+          data=ragged_factory_ops.constant_value([[1, 2, 3], [4], [5, 6]]),
           mask=[True, False, True],
           keepdims=False,
-          expected=ragged.constant_value([[1, 2, 3], [5, 6]])),
+          expected=ragged_factory_ops.constant_value([[1, 2, 3], [5, 6]])),
       #=========================================================================
       # Uniform data and uniform mask.
       #=========================================================================
@@ -90,7 +94,8 @@ class RaggedBooleanMaskOpTest(test_util.TensorFlowTestCase,
           data=[[1, 2, 3], [4, 5, 6], [7, 8, 9], [0, 1, 2], [3, 4, 5]],
           mask=[[F, F, F], [T, F, T], [T, T, T], [F, F, F], [T, T, F]],
           keepdims=True,
-          expected=ragged.constant_value([[], [4, 6], [7, 8, 9], [], [3, 4]])),
+          expected=ragged_factory_ops.constant_value(
+              [[], [4, 6], [7, 8, 9], [], [3, 4]])),
       dict(
           descr='data.shape=[3, 2, 2]; mask.shape=[3]; keepdims=True',
           data=[[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[2, 4], [6, 8]]],
@@ -108,8 +113,9 @@ class RaggedBooleanMaskOpTest(test_util.TensorFlowTestCase,
           data=[[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[2, 4], [6, 8]]],
           mask=[[T, F], [T, T], [F, F]],
           keepdims=True,
-          expected=ragged.constant_value([[[1, 2]], [[5, 6], [7, 8]], []],
-                                         ragged_rank=1)),
+          expected=ragged_factory_ops.constant_value(
+              [[[1, 2]], [[5, 6], [7, 8]], []],
+              ragged_rank=1)),
       dict(
           descr='data.shape=[3, 2, 2]; mask.shape=[3, 2]; keepdims=False',
           data=[[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[2, 4], [6, 8]]],
@@ -121,7 +127,7 @@ class RaggedBooleanMaskOpTest(test_util.TensorFlowTestCase,
           data=[[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[2, 4], [6, 8]]],
           mask=[[[T, T], [F, T]], [[F, F], [F, F]], [[T, F], [T, T]]],
           keepdims=True,
-          expected=ragged.constant_value(
+          expected=ragged_factory_ops.constant_value(
               [[[1, 2], [4]], [[], []], [[2], [6, 8]]])),
       dict(
           descr='data.shape=mask.shape=[2, 2, 2, 2]; keepdims=True',
@@ -130,7 +136,7 @@ class RaggedBooleanMaskOpTest(test_util.TensorFlowTestCase,
           mask=[[[[T, T], [F, F]], [[T, F], [F, F]]],
                 [[[F, F], [F, F]], [[T, T], [T, F]]]],
           keepdims=True,
-          expected=ragged.constant_value(
+          expected=ragged_factory_ops.constant_value(
               [[[[1, 2], []], [[5], []]], [[[], []], [[1, 3], [5]]]])),
       dict(
           descr='data.shape=mask.shape=[2, 2, 2, 2]; keepdims=False',
@@ -146,63 +152,64 @@ class RaggedBooleanMaskOpTest(test_util.TensorFlowTestCase,
       #=========================================================================
       dict(
           descr='data.shape=[5, (D2)]; mask.shape=[5, (D2)]',
-          data=ragged.constant_value(
+          data=ragged_factory_ops.constant_value(
               [[1, 2], [3, 4, 5, 6], [7, 8, 9], [], [1, 2, 3]]),
-          mask=ragged.constant_value(
+          mask=ragged_factory_ops.constant_value(
               [[F, F], [F, T, F, T], [F, F, F], [], [T, F, T]]),
           keepdims=True,
-          expected=ragged.constant_value([[], [4, 6], [], [], [1, 3]])),
+          expected=ragged_factory_ops.constant_value(
+              [[], [4, 6], [], [], [1, 3]])),
       dict(
           descr='data.shape=[3, (D2), (D3)]; mask.shape=[3, (D2)]',
-          data=ragged.constant_value(
+          data=ragged_factory_ops.constant_value(
               [[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[2, 4], [6, 8]]]),
-          mask=ragged.constant_value([[T, F], [T, T], [F, F]]),
+          mask=ragged_factory_ops.constant_value([[T, F], [T, T], [F, F]]),
           keepdims=True,
-          expected=ragged.constant_value(
+          expected=ragged_factory_ops.constant_value(
               [[[1, 2]], [[5, 6], [7, 8]], []])),
       dict(
           descr='data.shape=[3, (D2), (D3)]; mask.shape=[3, (D2)]',
-          data=ragged.constant_value(
+          data=ragged_factory_ops.constant_value(
               [[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[2, 4], [6, 8]]]),
-          mask=ragged.constant_value([[T, F], [T, T], [F, F]]),
+          mask=ragged_factory_ops.constant_value([[T, F], [T, T], [F, F]]),
           keepdims=False,
-          expected=ragged.constant_value([[1, 2], [5, 6], [7, 8]])),
+          expected=ragged_factory_ops.constant_value([[1, 2], [5, 6], [7, 8]])),
       dict(
           descr='data.shape=[3, (D2), D3]; mask.shape=[3, (D2)]',
-          data=ragged.constant_value(
+          data=ragged_factory_ops.constant_value(
               [[[1, 2], [3, 4]], [[5, 6], [7, 8], [2, 4]], [[6, 8]]],
               ragged_rank=1),
-          mask=ragged.constant_value([[T, F], [T, T, F], [F]]),
+          mask=ragged_factory_ops.constant_value([[T, F], [T, T, F], [F]]),
           keepdims=True,
-          expected=ragged.constant_value(
+          expected=ragged_factory_ops.constant_value(
               [[[1, 2]], [[5, 6], [7, 8]], []],
               ragged_rank=1)),
       dict(
           descr='data.shape=[3, (D2), D3]; mask.shape=[3, (D2)]',
-          data=ragged.constant_value(
+          data=ragged_factory_ops.constant_value(
               [[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[2, 4], [6, 8]]],
               ragged_rank=1),
-          mask=ragged.constant_value([[T, F], [T, T], [F, F]]),
+          mask=ragged_factory_ops.constant_value([[T, F], [T, T], [F, F]]),
           keepdims=False,
           expected=[[1, 2], [5, 6], [7, 8]]),
       dict(
           descr='data.shape=[3, (D2), (D3)]; mask.shape=[3, (D2), (D3)]',
-          data=ragged.constant_value(
+          data=ragged_factory_ops.constant_value(
               [[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[2, 4]]]),
-          mask=ragged.constant_value(
+          mask=ragged_factory_ops.constant_value(
               [[[T, T], [F, T]], [[F, F], [F, F]], [[T, F]]]),
           keepdims=True,
-          expected=ragged.constant_value(
+          expected=ragged_factory_ops.constant_value(
               [[[1, 2], [4]], [[], []], [[2]]])),
       dict(
           descr=('data.shape=[3, (D2), (D3), (D4)]; '
                  'mask.shape=[3, (D2), (D3), (D4)]'),
-          data=ragged.constant_value(
+          data=ragged_factory_ops.constant_value(
               [[[[1, 2], [3, 4]], [[5, 6]]], [[[2, 4], [6, 8]]]]),
-          mask=ragged.constant_value(
+          mask=ragged_factory_ops.constant_value(
               [[[[T, T], [F, F]], [[T, F]]], [[[F, F], [T, T]]]]),
           keepdims=True,
-          expected=ragged.constant_value(
+          expected=ragged_factory_ops.constant_value(
               [[[[1, 2], []], [[5]]], [[[], [6, 8]]]])),
 
       #=========================================================================
@@ -211,142 +218,132 @@ class RaggedBooleanMaskOpTest(test_util.TensorFlowTestCase,
       dict(
           descr='data.shape=[2, 3]; mask.shape=[2, (3)]',
           data=[[1, 2, 3], [4, 5, 6]],
-          mask=ragged.constant_value([[T, F, F], [F, T, T]]),
+          mask=ragged_factory_ops.constant_value([[T, F, F], [F, T, T]]),
           keepdims=True,
-          expected=ragged.constant_value([[1], [5, 6]])),
+          expected=ragged_factory_ops.constant_value([[1], [5, 6]])),
       dict(
           descr='data.shape=[2, 3, 2]; mask.shape=[2, (3)]',
           data=[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 0], [2, 4]]],
-          mask=ragged.constant_value([[T, F, F], [F, T, T]]),
+          mask=ragged_factory_ops.constant_value([[T, F, F], [F, T, T]]),
           keepdims=True,
-          expected=ragged.constant_value(
+          expected=ragged_factory_ops.constant_value(
               [[[1, 2]], [[9, 0], [2, 4]]],
               ragged_rank=1)),
       dict(
           descr='data.shape=[2, 3, 2]; mask.shape=[2, (3), 2]',
           data=[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 0], [2, 4]]],
-          mask=ragged.constant_value(
+          mask=ragged_factory_ops.constant_value(
               [[[T, F], [F, F], [T, T]], [[T, F], [F, T], [F, F]]],
               ragged_rank=1),
           keepdims=True,
-          expected=ragged.constant_value([[[1], [], [5, 6]], [[7], [0], []]])),
+          expected=ragged_factory_ops.constant_value(
+              [[[1], [], [5, 6]], [[7], [0], []]])),
 
       #=========================================================================
       # Ragged data and uniform mask.
       #=========================================================================
       dict(
           descr='data.shape=[4, (D2)]; mask.shape=[4]',
-          data=ragged.constant_value([[1, 2, 3], [4], [], [5, 6]]),
+          data=ragged_factory_ops.constant_value([[1, 2, 3], [4], [], [5, 6]]),
           mask=[T, F, T, F],
           keepdims=False,
-          expected=ragged.constant_value([[1, 2, 3], []])),
+          expected=ragged_factory_ops.constant_value([[1, 2, 3], []])),
       dict(
           descr='data.shape=[4, (D2), (D3)]; mask.shape=[4]',
-          data=ragged.constant_value(
+          data=ragged_factory_ops.constant_value(
               [[[1, 2, 3]], [[4], []], [[5, 6]], []]),
           mask=[T, F, T, T],
           keepdims=False,
-          expected=ragged.constant_value([[[1, 2, 3]], [[5, 6]], []])),
+          expected=ragged_factory_ops.constant_value(
+              [[[1, 2, 3]], [[5, 6]], []])),
       dict(
           descr='data.shape=[4, (D2), 2]; mask.shape=[4]',
-          data=ragged.constant_value(
+          data=ragged_factory_ops.constant_value(
               [[[1, 2], [3, 4]], [], [[5, 6]], [[7, 8], [9, 0], [1, 2]]],
               ragged_rank=1),
           mask=[T, F, F, T],
           keepdims=False,
-          expected=ragged.constant_value(
+          expected=ragged_factory_ops.constant_value(
               [[[1, 2], [3, 4]], [[7, 8], [9, 0], [1, 2]]],
               ragged_rank=1)),
       dict(
           descr='data.shape=[4, (D2), 2]; mask.shape=[4]',
-          data=ragged.constant_value(
+          data=ragged_factory_ops.constant_value(
               [[[1, 2], [3, 4]], [], [[5, 6]], [[7, 8], [9, 0], [1, 2]]],
               ragged_rank=1),
           mask=[T, F, F, T],
           keepdims=True,
-          expected=ragged.constant_value(
+          expected=ragged_factory_ops.constant_value(
               [[[1, 2], [3, 4]], [[7, 8], [9, 0], [1, 2]]],
               ragged_rank=1)),
       dict(
           descr='data.shape=[1, (2)]; mask.shape=[1, 2]',
-          data=ragged.constant_value([[1, 2]]),
+          data=ragged_factory_ops.constant_value([[1, 2]]),
           mask=[[T, F]],
           keepdims=True,
-          expected=ragged.constant_value([[1]])),
+          expected=ragged_factory_ops.constant_value([[1]])),
       dict(
           descr='data.shape=[2, (2), (D3)]; mask.shape=[2, 2]',
-          data=ragged.constant_value([[[1], [2, 3]], [[], [4, 5, 6]]]),
+          data=ragged_factory_ops.constant_value(
+              [[[1], [2, 3]], [[], [4, 5, 6]]]),
           mask=[[T, F], [T, T]],
           keepdims=True,
-          expected=ragged.constant_value([[[1]], [[], [4, 5, 6]]])),
+          expected=ragged_factory_ops.constant_value([[[1]], [[], [4, 5, 6]]])),
       dict(
           descr='data.shape=[2, (2), 3]; mask.shape=[2, 2]',
-          data=ragged.constant_value(
+          data=ragged_factory_ops.constant_value(
               [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [2, 4, 6]]],
               ragged_rank=1),
           mask=[[T, F], [T, T]],
           keepdims=True,
-          expected=ragged.constant_value(
+          expected=ragged_factory_ops.constant_value(
               [[[1, 2, 3]], [[7, 8, 9], [2, 4, 6]]],
               ragged_rank=1)),
       dict(
           descr='data.shape=[2, (2), 3]; mask.shape=[2, 2, 3]',
-          data=ragged.constant_value(
+          data=ragged_factory_ops.constant_value(
               [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [2, 4, 6]]],
               ragged_rank=1),
           mask=[[[T, F, F], [T, F, T]], [[T, F, T], [F, F, F]]],
           keepdims=True,
-          expected=ragged.constant_value([[[1], [4, 6]], [[7, 9], []]])),
+          expected=ragged_factory_ops.constant_value(
+              [[[1], [4, 6]], [[7, 9], []]])),
   ])  # pyformat: disable
-  @test_util.run_deprecated_v1
   def testBooleanMask(self, descr, data, mask, keepdims, expected):
-    actual = ragged.boolean_mask(data, mask, keepdims=keepdims)
-    self.assertEqual(
-        getattr(actual, 'ragged_rank', 0), getattr(expected, 'ragged_rank', 0))
-    with self.test_session():
-      if isinstance(expected, ragged.RaggedTensorValue):
-        expected = expected.tolist()
-      self.assertEqual(actual.eval().tolist(), expected)
+    actual = ragged_array_ops.boolean_mask(data, mask, keepdims=keepdims)
+    self.assertRaggedEqual(actual, expected)
 
-  @test_util.run_deprecated_v1
   def testErrors(self):
-    self.assertRaisesRegexp(ValueError,
-                            r'mask\.shape\.ndims must be kown statically',
-                            ragged.boolean_mask, [[1, 2]],
-                            array_ops.placeholder(dtypes.bool))
+    if not context.executing_eagerly():
+      self.assertRaisesRegexp(ValueError,
+                              r'mask\.shape\.ndims must be kown statically',
+                              ragged_array_ops.boolean_mask, [[1, 2]],
+                              array_ops.placeholder(dtypes.bool))
 
-    self.assertRaisesRegexp(TypeError,
-                            "Expected bool, got 0 of type 'int' instead.",
-                            ragged.boolean_mask, [[1, 2]], [[0, 1]])
+    self.assertRaises(TypeError, ragged_array_ops.boolean_mask, [[1, 2]],
+                      [[0, 1]])
     self.assertRaisesRegexp(
         ValueError, 'Tensor conversion requested dtype bool for '
-        'RaggedTensor with dtype int32', ragged.boolean_mask,
-        ragged.constant([[1, 2]]), ragged.constant([[0, 0]]))
+        'RaggedTensor with dtype int32', ragged_array_ops.boolean_mask,
+        ragged_factory_ops.constant([[1, 2]]),
+        ragged_factory_ops.constant([[0, 0]]))
 
     self.assertRaisesRegexp(
         ValueError, r'Shapes \(1, 2\) and \(1, 3\) are incompatible',
-        ragged.boolean_mask, [[1, 2]], [[True, False, True]])
-
-    # self.assertRaisesRegexp(ValueError,
-    #                         r'data=.* is non-ragged but mask=.* is ragged',
-    #                         ragged.boolean_mask, [[1, 2]],
-    #                         ragged.constant([[True, False]]))
-
-    # self.assertRaisesRegexp(
-    #     ValueError, r'data=.* is ragged but mask=.* is non-ragged',
-    #     ragged.boolean_mask, ragged.constant([[1, 2]]), [[True, False]])
+        ragged_array_ops.boolean_mask, [[1, 2]], [[True, False, True]])
 
     self.assertRaisesRegexp(errors.InvalidArgumentError,
                             r'Inputs must have identical ragged splits',
-                            ragged.boolean_mask, ragged.constant([[1, 2]]),
-                            ragged.constant([[True, False, True]]))
+                            ragged_array_ops.boolean_mask,
+                            ragged_factory_ops.constant([[1, 2]]),
+                            ragged_factory_ops.constant([[True, False, True]]))
 
     self.assertRaisesRegexp(ValueError, 'mask cannot be scalar',
-                            ragged.boolean_mask, [[1, 2]], True)
+                            ragged_array_ops.boolean_mask, [[1, 2]], True)
 
-    self.assertRaisesRegexp(ValueError,
-                            'mask cannot be scalar', ragged.boolean_mask,
-                            ragged.constant([[1, 2]]), True)
+    self.assertRaisesRegexp(ValueError, 'mask cannot be scalar',
+                            ragged_array_ops.boolean_mask,
+                            ragged_factory_ops.constant([[1, 2]]), True)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_concat_op_test.py b/tensorflow/python/ops/ragged/ragged_concat_op_test.py
index 3699f90f46b658576a8c479aa222e35995764202..254afdaa21b489f0c3ea4191b0b02990fd7334cf 100644
--- a/tensorflow/python/ops/ragged/ragged_concat_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_concat_op_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for ragged.concat."""
+"""Tests for ragged_array_ops.concat."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -20,22 +20,27 @@ from __future__ import print_function
 
 from absl.testing import parameterized
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_array_ops
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
 
-class RaggedConcatOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedConcatOpTest(ragged_test_util.RaggedTensorTestCase,
+                         parameterized.TestCase):
 
   def _rt_inputs_to_tensors(self, rt_inputs, ragged_ranks=None):
     if ragged_ranks is None:
       ragged_ranks = [None] * len(rt_inputs)
-    return [
-        ragged.constant(rt_input, ragged_rank=rrank)
+    return [  # pylint: disable=g-long-ternary
+        ragged_factory_ops.constant(rt_input, ragged_rank=rrank)
         if rrank != 0 else constant_op.constant(rt_input)
         for (rt_input, rrank) in zip(rt_inputs, ragged_ranks)
     ]
@@ -221,7 +226,6 @@ class RaggedConcatOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
           axis=0,
           expected=[[b'a00', b'a01'], [], [b'a20', b'a21']]),
   )   # pyformat: disable
-  @test_util.run_deprecated_v1
   def testRaggedConcat(self,
                        descr,
                        rt_inputs,
@@ -231,13 +235,12 @@ class RaggedConcatOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
                        expected_ragged_rank=None,
                        expected_shape=None):
     rt_inputs = self._rt_inputs_to_tensors(rt_inputs, ragged_ranks)
-    concatenated = ragged.concat(rt_inputs, axis)
+    concatenated = ragged_array_ops.concat(rt_inputs, axis)
     if expected_ragged_rank is not None:
       self.assertEqual(concatenated.ragged_rank, expected_ragged_rank)
     if expected_shape is not None:
       self.assertEqual(concatenated.shape.as_list(), expected_shape)
-    with self.test_session():
-      self.assertEqual(concatenated.eval().tolist(), expected)
+    self.assertRaggedEqual(concatenated, expected)
 
   @parameterized.parameters(
       dict(
@@ -264,13 +267,17 @@ class RaggedConcatOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
           ragged_ranks=(0, 0),
           rt_inputs=([[1, 2]], [[3, 4], [5, 6]]),
           axis=1,
-          error=ValueError,
-          message='Dimension 0 in both shapes must be equal'),
+          error=(ValueError, errors.InvalidArgumentError)),
   )
-  @test_util.run_deprecated_v1
-  def testStaticError(self, rt_inputs, axis, error, message, ragged_ranks=None):
+  def testStaticError(self,
+                      rt_inputs,
+                      axis,
+                      error,
+                      message=None,
+                      ragged_ranks=None):
     rt_inputs = self._rt_inputs_to_tensors(rt_inputs, ragged_ranks)
-    self.assertRaisesRegexp(error, message, ragged.concat, rt_inputs, axis)
+    self.assertRaisesRegexp(error, message, ragged_array_ops.concat, rt_inputs,
+                            axis)
 
   @parameterized.parameters([
       dict(
@@ -280,27 +287,28 @@ class RaggedConcatOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
           error=errors.InvalidArgumentError,
           message='Input tensors have incompatible shapes'),
   ])
-  @test_util.run_deprecated_v1
   def testRuntimeError(self, rt_inputs, axis, error, message,
                        ragged_ranks=None):
+    if context.executing_eagerly():
+      return
     rt_inputs = [
         array_ops.placeholder_with_default(rt, shape=None) for rt in rt_inputs
     ]
-    concatenated = ragged.concat(rt_inputs, axis)
-    with self.test_session():
-      self.assertRaisesRegexp(error, message, concatenated.eval)
+    concatenated = ragged_array_ops.concat(rt_inputs, axis)
+    with self.assertRaisesRegexp(error, message):
+      self.evaluate(concatenated)
 
-  @test_util.run_deprecated_v1
   def testNegativeAxisWithUnknownRankError(self):
+    if context.executing_eagerly():
+      return
     rt_inputs = [
         array_ops.placeholder(dtypes.int64),
         array_ops.placeholder(dtypes.int64)
     ]
     self.assertRaisesRegexp(
         ValueError, r'axis may only be negative if ndims is statically known.',
-        ragged.concat, rt_inputs, -1)
+        ragged_array_ops.concat, rt_inputs, -1)
 
-  @test_util.run_deprecated_v1
   def testSingleTensorInput(self):
     """Tests ragged_concat with a single tensor input.
 
@@ -308,10 +316,9 @@ class RaggedConcatOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     also pass in a single value (as with tf.concat), in which case it simply
     returns that tensor.  This test exercises that path.
     """
-    rt_inputs = ragged.constant([[1, 2], [3, 4]])
-    concatenated = ragged.concat(rt_inputs, 0)
-    with self.test_session():
-      self.assertEqual(concatenated.eval().tolist(), [[1, 2], [3, 4]])
+    rt_inputs = ragged_factory_ops.constant([[1, 2], [3, 4]])
+    concatenated = ragged_array_ops.concat(rt_inputs, 0)
+    self.assertRaggedEqual(concatenated, [[1, 2], [3, 4]])
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_const_op_test.py b/tensorflow/python/ops/ragged/ragged_const_op_test.py
index 2505b23912a80a154d2a06441ac7ae5e20610e23..29a9bdf53db650ef3a075d564e056751f1f018bb 100644
--- a/tensorflow/python/ops/ragged/ragged_const_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_const_op_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for ragged.constant."""
+"""Tests for ragged_factory_ops.constant."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -20,15 +20,18 @@ from __future__ import print_function
 
 from absl.testing import parameterized
 
-
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import ragged
 from tensorflow.python.ops.ragged import ragged_factory_ops
 from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
 
-class RaggedConstOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedConstOpTest(ragged_test_util.RaggedTensorTestCase,
+                        parameterized.TestCase):
 
   @parameterized.parameters(
       #=========================================================================
@@ -133,7 +136,6 @@ class RaggedConstOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
       dict(pylist=[[b'a', b'b'], [b'c'], [b'd', b'e', b'f']],
            dtype=dtypes.string),
   )
-  @test_util.run_deprecated_v1
   def testRaggedConst(self,
                       pylist,
                       dtype=None,
@@ -176,23 +178,14 @@ class RaggedConstOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     # If inner_shape was explicitly specified, check it.
     if inner_shape is not None:
       if isinstance(rt, ragged_tensor.RaggedTensor):
-        self.assertEqual(rt.inner_values.shape.as_list()[1:], list(inner_shape))
+        self.assertEqual(rt.flat_values.shape.as_list()[1:], list(inner_shape))
       else:
         self.assertEqual(rt.shape.as_list(), list(inner_shape))
 
     if expected_shape is not None:
       self.assertEqual(tuple(rt.shape.as_list()), expected_shape)
 
-    with self.test_session():
-      result = self.evaluate(rt)
-      if rt.shape.ndims > 0:
-        self.assertEqual(result.tolist(), pylist)
-        if expected_shape is not None:
-          self.assertEqual(result.shape, expected_shape)
-      else:
-        self.assertEqual(result, pylist)
-        if expected_shape is not None:
-          self.assertEqual((), expected_shape)
+    self.assertRaggedEqual(rt, pylist)
 
   @parameterized.parameters(
       dict(
@@ -236,11 +229,7 @@ class RaggedConstOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
           exception=ValueError,
           message='pylist has scalar values depth 2, but ragged_rank=2 '
           'requires scalar value depth greater than 2'),
-      dict(
-          pylist=[1, 2, 3],
-          inner_shape=(1, 1),
-          exception=TypeError,
-          message='Expected Tensor\'s shape'),
+      dict(pylist=[1, 2, 3], inner_shape=(1, 1), exception=TypeError),
       dict(
           pylist=[[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
           inner_shape=(2, 2),
@@ -259,7 +248,6 @@ class RaggedConstOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
           exception=ValueError,
           message='inner values have inconsistent shape'),
   )
-  @test_util.run_deprecated_v1
   def testRaggedConstError(self,
                            pylist,
                            dtype=None,
@@ -308,9 +296,9 @@ class RaggedConstOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
                                   message=None):
     """Tests for the _find_scalar_and_max_depth helper function."""
     if exception is not None:
-      self.assertRaisesRegexp(
-          exception, message,
-          ragged_factory_ops._find_scalar_and_max_depth, pylist)
+      self.assertRaisesRegexp(exception, message,
+                              ragged_factory_ops._find_scalar_and_max_depth,
+                              pylist)
     else:
       self.assertEqual(
           ragged_factory_ops._find_scalar_and_max_depth(pylist),
@@ -360,11 +348,11 @@ class RaggedConstOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     if exception is not None:
       self.assertRaisesRegexp(
           exception, message,
-          ragged_factory_ops._default_inner_shape_for_pylist, pylist,
+          ragged.ragged_factory_ops._default_inner_shape_for_pylist, pylist,
           ragged_rank)
     else:
       self.assertEqual(
-          ragged_factory_ops._default_inner_shape_for_pylist(
+          ragged.ragged_factory_ops._default_inner_shape_for_pylist(
               pylist, ragged_rank), inner_shape)
 
 
diff --git a/tensorflow/python/ops/ragged/ragged_constant_value_op_test.py b/tensorflow/python/ops/ragged/ragged_constant_value_op_test.py
index d80518930dbb74b5e044269df73002e68c0df2d2..7f474594b415cfd3e3b3e2b03df3bb84225cbdf2 100644
--- a/tensorflow/python/ops/ragged/ragged_constant_value_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_constant_value_op_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for ragged.constant_value."""
+"""Tests for ragged_factory_ops.constant_value."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -22,11 +22,14 @@ from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python.framework import test_util
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_tensor_value
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
 
-class RaggedConstantValueOpTest(test_util.TensorFlowTestCase,
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedConstantValueOpTest(ragged_test_util.RaggedTensorTestCase,
                                 parameterized.TestCase):
 
   @parameterized.parameters(
@@ -144,8 +147,8 @@ class RaggedConstantValueOpTest(test_util.TensorFlowTestCase,
                        inner_shape=None,
                        expected_shape=None,
                        expected_dtype=None):
-    """Tests that `ragged_value(pylist).tolist() == pylist`."""
-    rt = ragged.constant_value(
+    """Tests that `ragged_value(pylist).to_list() == pylist`."""
+    rt = ragged_factory_ops.constant_value(
         pylist, dtype=dtype, ragged_rank=ragged_rank, inner_shape=inner_shape)
 
     # If dtype was explicitly specified, check it.
@@ -156,15 +159,15 @@ class RaggedConstantValueOpTest(test_util.TensorFlowTestCase,
 
     # If ragged_rank was explicitly specified, check it.
     if ragged_rank is not None:
-      if isinstance(rt, ragged.RaggedTensorValue):
+      if isinstance(rt, ragged_tensor_value.RaggedTensorValue):
         self.assertEqual(rt.ragged_rank, ragged_rank)
       else:
         self.assertEqual(0, ragged_rank)
 
     # If inner_shape was explicitly specified, check it.
     if inner_shape is not None:
-      if isinstance(rt, ragged.RaggedTensorValue):
-        self.assertEqual(rt.inner_values.shape[1:], inner_shape)
+      if isinstance(rt, ragged_tensor_value.RaggedTensorValue):
+        self.assertEqual(rt.flat_values.shape[1:], inner_shape)
       else:
         self.assertEqual(rt.shape, inner_shape)
 
@@ -172,7 +175,10 @@ class RaggedConstantValueOpTest(test_util.TensorFlowTestCase,
       self.assertEqual(tuple(rt.shape), expected_shape)
 
     if rt.shape:
-      self.assertEqual(rt.tolist(), pylist)
+      if isinstance(rt, ragged_tensor_value.RaggedTensorValue):
+        self.assertEqual(rt.to_list(), pylist)
+      else:
+        self.assertEqual(rt.tolist(), pylist)
       if expected_shape is not None:
         self.assertEqual(rt.shape, expected_shape)
     else:
@@ -252,11 +258,11 @@ class RaggedConstantValueOpTest(test_util.TensorFlowTestCase,
                             inner_shape=None,
                             exception=None,
                             message=None):
-    """Tests that `ragged.constant_value()` raises an expected exception."""
+    """Tests that `constant_value()` raises an expected exception."""
     self.assertRaisesRegexp(
         exception,
         message,
-        ragged.constant_value,
+        ragged_factory_ops.constant_value,
         pylist,
         dtype=dtype,
         ragged_rank=ragged_rank,
diff --git a/tensorflow/python/ops/ragged/ragged_conversion_ops.py b/tensorflow/python/ops/ragged/ragged_conversion_ops.py
index 83212e49cf71c245d85b8216792ac0cfc97741dd..854c5b303c81d089baf78119ca8525a51e7a83c4 100644
--- a/tensorflow/python/ops/ragged/ragged_conversion_ops.py
+++ b/tensorflow/python/ops/ragged/ragged_conversion_ops.py
@@ -18,407 +18,27 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import gen_ragged_conversion_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops.ragged import ragged_factory_ops
 from tensorflow.python.ops.ragged import ragged_tensor
-from tensorflow.python.ops.ragged import ragged_util
 
 
-#===============================================================================
-# RaggedTensor <-> Tensor conversion
-#===============================================================================
 def from_tensor(tensor, lengths=None, padding=None, ragged_rank=1, name=None):
-  """Converts a `Tensor` into a `RaggedTensor`.
-
-  The set of absent/default values may be specified using a vector of lengths
-  or a padding value (but not both).  If `lengths` is specified, then the
-  output tensor will satisfy `output[row] = tensor[row][:lengths[row]]`.
-  If `padding` is specified, then any row *suffix* consisting entirely of
-  `padding` will be excluded from the returned `RaggedTensor`.  If neither
-  `lengths` nor `padding` is specified, then the returned `RaggedTensor` will
-  have no absent/default values.
-
-  Examples:
-
-  ```python
-  >>> dt = tf.constant([[5, 7, 0], [0, 3, 0], [6, 0, 0]])
-  >>> ragged.from_tensor(dt).eval().tolist()
-  [[5, 7, 0], [0, 3, 0], [6, 0, 0]]
-  >>> ragged.from_tensor(dt, lengths=[2, 0, 3]).eval().tolist()
-  [[5, 7], [], [6, 0, 0]]
-  >>> ragged.from_tensor(dt, padding=0).eval().tolist()
-  [[5, 7], [0, 3], [6]]
-  ```
-
-  Args:
-    tensor: The `Tensor` to convert.  Must have rank `ragged_rank + 1` or
-      higher.
-    lengths: An optional set of row lengths, specified using a 1-D integer
-      `Tensor` whose length is equal to `tensor.shape[0]` (the number of rows in
-      `tensor`).  If specified, then `output[row]` will contain
-      `tensor[row][:lengths[row]]`.  Negative lengths are treated as zero.
-    padding: An optional padding value.  If specified, then any row suffix
-      consisting entirely of `padding` will be excluded from the returned
-      RaggedTensor.  `padding` is a `Tensor` with the same dtype as `tensor`
-      and with `shape=tensor.shape[ragged_rank + 1:]`.
-    ragged_rank: Integer specifying the ragged rank for the returned
-      `RaggedTensor`.  Must be greater than zero.
-    name: A name prefix for the returned tensors (optional).
-
-  Returns:
-    A `RaggedTensor` with the specified `ragged_rank`.  The shape of the
-    returned ragged tensor is compatible with the shape of `tensor`.
-  Raises:
-    ValueError: If both `lengths` and `padding` are specified.
-  """
-  if lengths is not None and padding is not None:
-    raise ValueError('Specify lengths or padding, but not both')
-  if not isinstance(ragged_rank, int):
-    raise TypeError('ragged_rank expected int, got %r' % ragged_rank)
-  if ragged_rank <= 0:
-    raise ValueError('ragged_rank must be greater than 0; got %s' % ragged_rank)
-
-  with ops.name_scope(name, 'RaggedFromTensor', [tensor, lengths, padding]):
-    tensor = ops.convert_to_tensor(tensor, name='tensor')
-    tensor.shape.with_rank_at_least(ragged_rank + 1)
-    input_shape = array_ops.shape(tensor, out_type=dtypes.int64)
-    ncols = input_shape[1]
-
-    # Handle ragged_rank>1 via recursion:
-    # If the output should have multiple ragged dimensions, then first
-    # flatten the tensor to eliminate all but the last ragged dimension,
-    # and recursively convert that flattened tensor.  Then add on the splits
-    # for the dimensions that we flattened out.
-    if ragged_rank > 1:
-      # Flatten `tensor` to eliminate all but the last ragged dimension.
-      new_shape = array_ops.concat(
-          [constant_op.constant([-1], dtypes.int64), input_shape[ragged_rank:]],
-          axis=0)
-      flattened = array_ops.reshape(tensor, new_shape)
-      # Recursively convert the flattened tensor.
-      values = from_tensor(flattened, lengths, padding)
-      # The total number of elements in each  dimension.  E.g., if
-      # input_shape=[3, 4, 5, 6], then dim[2] has 3*4*5 elements in total.
-      dim_size = math_ops.cumprod(input_shape)
-      # Construct splits tensors for the dimensions that were flattened.
-      new_splits = [
-          math_ops.range(0, dim_size[dim - 1] + 1) * input_shape[dim]
-          for dim in range(1, ragged_rank)
-      ]
-      return ragged_factory_ops.from_nested_row_splits(values, new_splits)
-
-    # If padding was specified, then use it to find row lengths.
-    if padding is not None:
-      padding = ops.convert_to_tensor(
-          padding, name='padding', dtype=tensor.dtype)
-      padding.shape.assert_is_compatible_with(tensor.shape[2:])
-
-      # Find places where the padding is equal to the tensor.  (This will
-      # broadcast `padding` across the outermost 2 dimensions of `tensor`,
-      # so `has_default_value.shape = tensor.shape`.)
-      has_default_value = math_ops.equal(padding, tensor)
-
-      # If the padding isn't a scalar, then require that all values in the
-      # padding match each item in the tensor.  After this block of code,
-      # `has_default.shape = tensor.shape[:2]`.  (Unfortunately, we can't just
-      # use reduce_all for both cases, becaue when you pass an empty `axis`
-      # list to reduce_all, it reduces all axes; but we want it to reduce no
-      # axes -- i.e., to be a no-op.)
-      tensor_rank = array_ops.rank(tensor)
-      reduce_axis = math_ops.range(2, tensor_rank)
-      has_default = control_flow_ops.cond(
-          tensor_rank > 2,
-          lambda: math_ops.reduce_all(has_default_value, axis=reduce_axis),
-          lambda: has_default_value)
-      has_default.set_shape(tensor_shape.TensorShape([None, None]))
-      has_default.set_shape(tensor.shape[:2])
-
-      # Use has_default it to find the length of each row: for each non-default
-      # item in a row, calculate the length that the row needs to have to
-      # include that item; and then take the max of those values (across each
-      # row).
-      has_nondefault = math_ops.logical_not(has_default)
-      has_nondefault = math_ops.cast(has_nondefault, dtypes.int64)
-      length_for_nondefault_value = (
-          has_nondefault * array_ops.expand_dims(
-              math_ops.range(1, ncols + 1), 0))
-      lengths = math_ops.reduce_max(length_for_nondefault_value, axis=1)
-
-    # If we have lengths (either directly supplied, or computed from paddings),
-    # then use those to construct splits; and then use masking to get the
-    # corresponding values.
-    if lengths is not None:
-      lengths = ragged_util.convert_to_int_tensor(lengths, 'lengths',
-                                                  dtypes.int64)
-      lengths.shape.assert_has_rank(1)
-      lengths = math_ops.minimum(lengths, ncols)
-      lengths = math_ops.maximum(lengths, 0)
-      limits = math_ops.cumsum(lengths)
-      splits = array_ops.concat(
-          [array_ops.zeros([1], dtypes.int64), limits], axis=0)
-      mask = array_ops.sequence_mask(lengths, maxlen=ncols)
-      values = array_ops.boolean_mask(tensor, mask)
-      return ragged_factory_ops.from_row_splits(values, splits)
-
-    # If neither padding nor lengths were specified, then create a splits
-    # vector that contains no default values, and reshape the input tensor
-    # to form the values for the RaggedTensor.
-    nrows = input_shape[0]
-    nvals = nrows * ncols
-    splits = math_ops.range(nrows + 1) * ncols
-    values_shape = array_ops.concat([[nvals], input_shape[2:]], axis=0)
-    values = array_ops.reshape(tensor, values_shape)
-    return ragged_factory_ops.from_row_splits(values, splits)
+  if ragged_tensor.is_ragged(tensor):
+    return tensor
+  else:
+    return ragged_tensor.RaggedTensor.from_tensor(tensor, lengths, padding,
+                                                  ragged_rank, name)
 
 
 def to_tensor(rt_input, default_value=None, name=None):
-  """Converts a `RaggedTensor` into a `Tensor`.
-
-  Example:
-
-  ```python
-  >>> rt = ragged.constant([[9, 8, 7], [], [6, 5], [4]])
-  >>> print ragged.to_tensor(rt).eval()
-  [[9 8 7]
-   [0 0 0]
-   [6 5 0]
-   [4 0 0]]
-  ```
-
-  Args:
-    rt_input: The input `RaggedTensor`.
-    default_value: Value to set for indices not specified in `rt_input`.
-      Defaults to zero.  `default_value` must be broadcastable to
-      `rt_input.shape[rt_input.ragged_rank + 1:]`.
-    name: A name prefix for the returned tensors (optional).
-
-  Returns:
-    A `Tensor` with shape `ragged.bounding_shape(rt_input)` and the
-    values specified by the non-empty values in `rt_input`.  Empty values are
-    assigned `default_value`.
-  """
-  with ops.name_scope(name, 'RaggedToTensor', [rt_input, default_value]):
-    rt_input = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
-        rt_input, name='rt_input')
-    if not ragged_tensor.is_ragged(rt_input):
-      return rt_input  # already dense
-    if default_value is not None:
-      default_value = ops.convert_to_tensor(
-          default_value, name='default_value', dtype=rt_input.dtype)
-
-    # If ragged_rank > 1, then recursively convert the ragged values into a
-    # `Tensor` before we proceed.
-    values = rt_input.values
-    if ragged_tensor.is_ragged(values):
-      values = to_tensor(values, default_value)
-
-    # Tile the default value, if necessary.
-    if default_value is not None:
-      if values.shape.ndims is not None:
-        default_value.shape.with_rank_at_most(values.shape.ndims - 1)
-      if (values.shape.ndims is None or default_value.shape.ndims is None or
-          values.shape.ndims != default_value.shape.ndims + 1):
-        value_shape = array_ops.shape(values)[1:]
-        default_value = array_ops.broadcast_to(default_value, value_shape)
-      default_value.shape.assert_is_compatible_with(values.shape[1:])
-
-    # Get the expected dense shape ([nrows, ncols] + value_shape).
-    rt_row_lengths = [rt_input.row_splits[1:] - rt_input.row_splits[:-1]]
-    nrows = array_ops.shape(rt_input.row_splits, out_type=dtypes.int64)[0] - 1
-    ncols = math_ops.maximum(math_ops.reduce_max(rt_row_lengths), 0)
-    values_shape = array_ops.shape(values, out_type=dtypes.int64)
-    value_shape = values_shape[1:]
-    nvals = values_shape[0]
-
-    # Build a default value if none was supplied.
-    if default_value is None:
-      default_value = array_ops.zeros(value_shape, dtype=values.dtype)
-    default_value.shape.assert_is_compatible_with(values.shape[1:])
-    default_value.set_shape(values.shape[1:])
-
-    # Get the row start indices, and expand to shape=[nrows, 1].
-    starts = array_ops.expand_dims(rt_input.row_splits[:-1], 1)
-
-    # Get the row limit indices, and expand to shape=[nrows, 1].
-    limits = array_ops.expand_dims(rt_input.row_splits[1:], 1)
-
-    # Get the column indices, and expand to shape=[1, ncols].
-    columns = array_ops.expand_dims(math_ops.range(0, ncols), 0)
-
-    # Build a list containing the values plus the default value.  We will use
-    # tf.gather to collect values from this list for the `Tensor` (using
-    # nvals as the index for the default value).
-    values_and_default = array_ops.concat(
-        [values, array_ops.stack([default_value])], axis=0)
-
-    # Construct a matrix "indices" pointing into values_and_default.  I.e.,
-    # output[r, c] = values_and_default[indices[r, c].
-    nondefault_index = starts + columns
-    has_value = nondefault_index < limits
-    default_index = array_ops.fill(array_ops.stack([nrows, ncols]), nvals)
-    indices = array_ops.where(has_value, nondefault_index, default_index)
-
-    # Gather the results into a `Tensor`.
-    return array_ops.gather(values_and_default, indices)
+  if ragged_tensor.is_ragged(rt_input):
+    return rt_input.to_tensor(default_value, name)
+  else:
+    return rt_input
 
 
-#===============================================================================
-# RaggedTensor <-> SparseTensor conversion
-#===============================================================================
 def to_sparse(rt_input, name=None):
-  """Converts a `RaggedTensor` into a sparse tensor.
-
-  Example:
-
-  ```python
-  >>> rt = ragged.constant([[1, 2, 3], [4], [], [5, 6]])
-  >>> ragged.to_sparse(rt).eval()
-  SparseTensorValue(indices=[[0, 0], [0, 1], [0, 2], [1, 0], [3, 0], [3, 1]],
-                    values=[1, 2, 3, 4, 5, 6],
-                    dense_shape=[4, 3])
-  ```
-
-  Args:
-    rt_input: The input `RaggedTensor`.
-    name: A name prefix for the returned tensors (optional).
-
-  Returns:
-    A SparseTensor with the same values as `rt_input`.
-  """
-  if not ragged_tensor.is_ragged(rt_input):
-    raise TypeError('Expected RaggedTensor, got %s' % type(rt_input).__name__)
-  with ops.name_scope(name, 'RaggedToSparse', [rt_input]):
-    rt_input = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
-        rt_input, name='rt_input')
-    result = gen_ragged_conversion_ops.ragged_tensor_to_sparse(
-        rt_input.nested_row_splits, rt_input.inner_values, name=name)
-    return sparse_tensor.SparseTensor(
-        result.sparse_indices, result.sparse_values, result.sparse_dense_shape)
-
-
-@ops.RegisterGradient('RaggedTensorToSparse')
-def _ragged_tensor_to_sparse_gradient(op, unused_sparse_indices_grad,
-                                      sparse_values_grad,
-                                      unused_sparse_shape_grad):
-  """Gradient for ragged.to_sparse."""
-  op_inputs_nested_row_splits = op.inputs[:-1]
-  op_inputs_inner_values = op.inputs[-1]
-
-  # No gradient for the RaggedTensor's nested_row_splits.
-  nested_row_splits_gradient = [None] * len(op_inputs_nested_row_splits)
-
-  # Gradient for the RaggedTensor's inner_values is formed by reshaping
-  # the gradient for the SparseTensor's values.
-  inner_values_shape = array_ops.shape(op_inputs_inner_values)
-  inner_values_gradient = array_ops.reshape(sparse_values_grad,
-                                            inner_values_shape)
-
-  return nested_row_splits_gradient + [inner_values_gradient]
+  return rt_input.to_sparse(name)
 
 
 def from_sparse(st_input, name=None):
-  """Converts a 2D `SparseTensor` to a `RaggedTensor`.
-
-  Each row of the `output` `RaggedTensor` will contain the explicit values from
-  the same row in `st_input`.  `st_input` must be ragged-right.  If not it is
-  not ragged-right, then an error will be generated.
-
-  Example:
-
-  ```python
-  >>> st = SparseTensor(indices=[[0, 1], [0, 2], [0, 3], [1, 0], [3, 0]],
-  ...                   values=[1, 2, 3, 4, 5],
-  ...                   dense_shape=[4, 3])
-  >>> ragged.from_sparse(st).eval().tolist()
-  [[1, 2, 3], [4], [], [5]]
-  ```
-
-  Currently, only two-dimensional `SparseTensors` are supported.
-
-  Args:
-    st_input: The sparse tensor to convert.  Must have rank 2.
-    name: A name prefix for the returned tensors (optional).
-
-  Returns:
-    A `RaggedTensor` with the same values as `st_input`.
-    `output.ragged_rank = rank(st_input) - 1`.
-    `output.shape = [st_input.dense_shape[0], None]`.
-  Raises:
-    ValueError: If the number of dimensions in `st_input` is not known
-      statically, or is not two.
-  """
-  if not sparse_tensor.is_sparse(st_input):
-    raise TypeError('Expected SparseTensor, got %s' % type(st_input).__name__)
-  with ops.name_scope(name, 'RaggedFromSparse', [st_input]):
-    st_input = sparse_tensor.convert_to_tensor_or_sparse_tensor(
-        st_input, name='rt_input')
-
-    static_rank_from_dense_shape = (
-        None if st_input.dense_shape.shape.ndims is None
-        else st_input.dense_shape.shape.dims[0].value)
-    static_rank_from_indices = (
-        None if st_input.indices.shape.ndims is None
-        else st_input.indices.shape.dims[1].value)
-
-    if static_rank_from_dense_shape != 2 and static_rank_from_indices != 2:
-      raise ValueError('rank(st_input) must be 2')
-
-    with ops.control_dependencies(
-        _assert_sparse_indices_are_ragged_right(st_input.indices)):
-      # Treat sparse row indices as segment ids to generate a splits tensor that
-      # we can pair with the sparse tensor values.  (Ignore sparse column
-      # indices.)
-      segment_ids = st_input.indices[:, 0]
-      num_segments = st_input.dense_shape[0]
-      return ragged_factory_ops.from_value_rowids(st_input.values, segment_ids,
-                                                  num_segments)
-
-
-def _assert_sparse_indices_are_ragged_right(indices):
-  """Checks that the given SparseTensor.indices tensor is ragged-right.
-
-  Example: `indices = [[0, 0], [0, 1], [2, 0], [3, 1]]` is not ragged right
-  because the entry `[3, 1]` skips a cell.
-
-  Args:
-    indices: The SparseTensor indices to check.
-
-  Returns:
-    A list of control dependency op tensors.
-  """
-  index_prefix = indices[:, :-1]
-  index_suffix = indices[:, -1]
-
-  # Check whether each index is starting a new row in the innermost dimension
-  # (prefix[i] != prefix[i-1]) or continuing a row (prefix[i] == prefix[i-1]).
-  # (Note: this skips the first index; we will check that separately below.)
-  index_prefix_changed = math_ops.reduce_any(
-      math_ops.not_equal(index_prefix[1:], index_prefix[:-1]), axis=1)
-
-  # Check two cases:
-  #   * For indices that start a new row: index_suffix[i] must be zero.
-  #   * For indices that continue a row: index_suffix[i] must be equal to
-  #     index_suffix[i-1]+1.
-  index_ok = array_ops.where(
-      index_prefix_changed, math_ops.equal(index_suffix[1:], 0),
-      math_ops.equal(index_suffix[1:], index_suffix[:-1] + 1))
-
-  # Also check that the very first index didn't skip any cells.  The first
-  # index starts a new row (by definition), so its suffix should be zero.
-  sparse_indices_are_ragged_right = math_ops.logical_and(
-      math_ops.reduce_all(math_ops.equal(index_suffix[:1], 0)),
-      math_ops.reduce_all(index_ok))
-
-  message = [
-      'SparseTensor is not right-ragged',
-      'SparseTensor.indices =', indices
-  ]
-  return [control_flow_ops.Assert(sparse_indices_are_ragged_right, message)]
+  return ragged_tensor.RaggedTensor.from_sparse(st_input, name)
diff --git a/tensorflow/python/ops/ragged/ragged_dispatch.py b/tensorflow/python/ops/ragged/ragged_dispatch.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc64f9cc9ed0c673dc6ba7b921e1a9d7d2a5d376
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_dispatch.py
@@ -0,0 +1,522 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Operator dispatch for RaggedTensors."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import numpy as np
+
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import parsing_ops
+from tensorflow.python.ops import string_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.ops.ragged import ragged_array_ops
+from tensorflow.python.ops.ragged import ragged_math_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.ops.ragged import ragged_tensor_shape
+from tensorflow.python.ops.ragged import ragged_util
+from tensorflow.python.util import dispatch
+from tensorflow.python.util import tf_decorator
+from tensorflow.python.util import tf_export
+from tensorflow.python.util import tf_inspect
+
+# @TODO(edloper): Set this to True in the CL that exports RaggedTensors.
+_UPDATE_DOCSTRINGS = False
+
+# Information about an argument to an operation: The name of the argument, its
+# position in the argument list, and a boolean flag indicating whether it
+# expects a list of tensors.
+_ArgInfo = collections.namedtuple('ArgInfo', ['name', 'position', 'is_list'])
+
+
+def _get_arg_infos(func, arg_names):
+  """Returns an `_ArgInfo` for each argument of `func` specified by `arg_names`.
+
+  Args:
+    func: The function whose arguments should be described.
+    arg_names: The names of the arguments to get info for.
+
+  Returns:
+    A tuple of `_ArgInfo`s.
+  """
+  arg_infos = []
+
+  # Inspect the func's argspec to find the position of each arg.
+  arg_spec = tf_inspect.getargspec(func)
+  for argname in arg_names:
+    assert isinstance(argname, str)
+    is_list = argname.startswith('[') and argname.endswith(']')
+    if is_list:
+      argname = argname[1:-1]
+    if argname not in arg_spec.args:
+      raise ValueError('Argument %r not found function in %s.  Args=%s' %
+                       (argname, func, arg_spec.args))
+    arg_infos.append(_ArgInfo(argname, arg_spec.args.index(argname), is_list))
+  return arg_infos
+
+
+def _is_convertible_to_tensor(value):
+  """Returns true if `value` is convertible to a `Tensor`."""
+  if value is None:
+    return True
+  if isinstance(value,
+                (ops.Tensor, variables.Variable, np.ndarray, int, float, str)):
+    return True
+  elif isinstance(value, (sparse_tensor.SparseTensor,)):
+    return False
+  else:
+    try:
+      ops.convert_to_tensor(value)
+      return True
+    except (TypeError, ValueError):
+      return False
+
+
+class UnaryRaggedElementwiseDispatcher(dispatch.OpDispatcher):
+  """OpDispatcher for unary ops that map a base op across ragged values."""
+
+  def __init__(self, original_op, arg_is_list=False):
+    self._original_op = original_op
+    self._arg_is_list = arg_is_list
+    arg_names = tf_inspect.getfullargspec(original_op)[0]
+    self._x = arg_names[0]
+    if _UPDATE_DOCSTRINGS:
+      original_op.__doc__ = (
+          original_op.__doc__.rstrip() + '\n\n' +
+          '    `{x}` may be a `tf.RaggedTensor`.\n'.format(x=self._x))
+
+  def handle(self, args, kwargs):
+    if args:
+      x, args = args[0], args[1:]
+    else:
+      kwargs = kwargs.copy()
+      x = kwargs.pop(self._x, None)
+    if x is None:
+      return self.NOT_SUPPORTED
+    if self._arg_is_list:
+      found_ragged = False
+      for elt in x:
+        if ragged_tensor.is_ragged(elt):
+          found_ragged = True
+        elif not _is_convertible_to_tensor(elt):
+          return self.NOT_SUPPORTED
+      if found_ragged:
+        nested_splits_lists = [
+            elt.nested_row_splits for elt in x if ragged_tensor.is_ragged(elt)
+        ]
+        flat_values = [
+            elt.flat_values if ragged_tensor.is_ragged(elt) else elt
+            for elt in x
+        ]
+        with ops.control_dependencies(
+            ragged_util.assert_splits_match(nested_splits_lists)):
+          return ragged_tensor.RaggedTensor.from_nested_row_splits(
+              self._original_op(flat_values, *args, **kwargs),
+              nested_splits_lists[0])
+      else:
+        return self.NOT_SUPPORTED
+    else:
+      found_ragged = ragged_tensor.is_ragged(x)
+      if found_ragged:
+        mapped_values = self._original_op(x.flat_values, *args, **kwargs)
+        return x.with_flat_values(mapped_values)
+      else:
+        return self.NOT_SUPPORTED
+
+
+class BinaryRaggedElementwiseDispatcher(dispatch.OpDispatcher):
+  """OpDispatcher for binary ops that map a base op across ragged values.
+
+  Supports broadcasting.
+  """
+
+  def __init__(self, original_op):
+    self._original_op = original_op
+    arg_names = tf_inspect.getfullargspec(original_op)[0]
+    self._x = arg_names[0]
+    self._y = arg_names[1]
+    if _UPDATE_DOCSTRINGS:
+      original_op.__doc__ = (
+          original_op.__doc__.rstrip() + '\n\n' +
+          '    `{x}` and `{y}` may be a `tf.RaggedTensor`.\n'.format(
+              x=self._x, y=self._y))
+
+  def handle(self, args, kwargs):
+    # Extract the binary args.
+    if len(args) > 1:
+      x = args[0]
+      y = args[1]
+      args = args[2:]
+    elif args:
+      kwargs = kwargs.copy()
+      x = args[0]
+      y = kwargs.pop(self._y, None)
+      args = args[1:]
+    else:
+      kwargs = kwargs.copy()
+      x = kwargs.pop(self._x, None)
+      y = kwargs.pop(self._y, None)
+
+    # Bail if we don't have at least one ragged argument.
+    x_is_ragged = ragged_tensor.is_ragged(x)
+    y_is_ragged = ragged_tensor.is_ragged(y)
+    if not (x_is_ragged or y_is_ragged):
+      return self.NOT_SUPPORTED
+
+    # Convert args to tensors.  Bail if conversion fails.
+    try:
+      if not x_is_ragged:
+        x = ops.convert_to_tensor(x, name=self._x, preferred_dtype=y.dtype)
+      if not y_is_ragged:
+        y = ops.convert_to_tensor(y, name=self._y, preferred_dtype=x.dtype)
+    except (TypeError, ValueError):
+      return self.NOT_SUPPORTED
+
+    if ((x_is_ragged and y_is_ragged) or
+        (x_is_ragged and x.flat_values.shape.ndims <= y.shape.ndims) or
+        (y_is_ragged and y.flat_values.shape.ndims <= x.shape.ndims)):
+      bcast_shape = ragged_tensor_shape.broadcast_dynamic_shape(
+          ragged_tensor_shape.RaggedTensorDynamicShape.from_tensor(x),
+          ragged_tensor_shape.RaggedTensorDynamicShape.from_tensor(y))
+      x = ragged_tensor_shape.broadcast_to(
+          x, bcast_shape, broadcast_inner_dimensions=False)
+      y = ragged_tensor_shape.broadcast_to(
+          y, bcast_shape, broadcast_inner_dimensions=False)
+
+    x_values = x.flat_values if ragged_tensor.is_ragged(x) else x
+    y_values = y.flat_values if ragged_tensor.is_ragged(y) else y
+    mapped_values = self._original_op(x_values, y_values, *args, **kwargs)
+    if ragged_tensor.is_ragged(x):
+      return x.with_flat_values(mapped_values)
+    else:
+      return y.with_flat_values(mapped_values)
+
+
+class RaggedDispatcher(dispatch.OpDispatcher):
+  """OpDispatcher for ragged ops.
+
+  Dispatches to a wrapped op-handler if at least one of the `tensor_args`
+  arguments is a RaggedTensor or a RaggedTensorValue; and all of the
+  `tensor_args` arguments are convertible to Tensor or RaggedTensor.
+  """
+
+  def __init__(self, original_op, ragged_op, ragged_args):
+    op_arg_names = tf_inspect.getfullargspec(original_op)[0]
+    ragged_arg_names = tf_inspect.getfullargspec(ragged_op)[0]
+    if op_arg_names != ragged_arg_names:
+      raise AssertionError(
+          'Signature must exactly match when overriding %s with %s: %s vs %s' %
+          (original_op, ragged_op, op_arg_names, ragged_arg_names))
+    self._ragged_op = ragged_op
+    self._ragged_args = _get_arg_infos(ragged_op, ragged_args)
+    if _UPDATE_DOCSTRINGS:
+      arg_list = ' and '.join('`%s`' % arg for arg in ragged_args)
+      original_op.__doc__ = (
+          original_op.__doc__.rstrip() + '\n\n' +
+          '    {0} may be a `tf.RaggedTensor`.\n'.format(arg_list))
+
+  def handle(self, args, kwargs):
+    if self.is_supported(args, kwargs):
+      return self._ragged_op(*args, **kwargs)
+    else:
+      return self.NOT_SUPPORTED
+
+  def is_supported(self, args, kwargs):
+    found_ragged = False
+    for arg_info in self._ragged_args:
+      if arg_info.position < len(args):
+        arg = args[arg_info.position]
+      else:
+        arg = kwargs.get(arg_info.name, None)
+
+      if arg_info.is_list:
+        if not isinstance(arg, (list, tuple)):
+          return False
+        for elt in arg:
+          if ragged_tensor.is_ragged(elt):
+            found_ragged = True
+          elif not _is_convertible_to_tensor(elt):
+            return False
+      else:
+        if ragged_tensor.is_ragged(arg):
+          found_ragged = True
+        elif not _is_convertible_to_tensor(arg):
+          return False
+    return found_ragged
+
+
+def ragged_dispatch(original_op, tensor_args):
+
+  def decorator(ragged_op):
+    dispatch.RaggedDispatcher(original_op, ragged_op,
+                              tensor_args).register(original_op)
+    return ragged_op
+
+  return decorator
+
+
+_UNARY_ELEMENTWISE_OPS = [
+    array_ops.check_numerics,
+    array_ops.identity,
+    array_ops.ones_like,
+    array_ops.ones_like_v2,
+    array_ops.zeros_like,
+    array_ops.zeros_like_v2,
+    clip_ops.clip_by_value,
+    math_ops.abs,
+    math_ops.acos,
+    math_ops.acosh,
+    math_ops.angle,
+    math_ops.asin,
+    math_ops.asinh,
+    math_ops.atan,
+    math_ops.atanh,
+    math_ops.cast,
+    math_ops.ceil,
+    math_ops.conj,
+    math_ops.cos,
+    math_ops.cosh,
+    math_ops.digamma,
+    math_ops.erf,
+    math_ops.erfc,
+    math_ops.exp,
+    math_ops.expm1,
+    math_ops.floor,
+    math_ops.imag,
+    math_ops.is_finite,
+    math_ops.is_inf,
+    math_ops.is_nan,
+    math_ops.lgamma,
+    math_ops.log,
+    math_ops.log1p,
+    math_ops.log_sigmoid,
+    math_ops.logical_not,
+    math_ops.negative,
+    math_ops.real,
+    math_ops.reciprocal,
+    math_ops.rint,
+    math_ops.round,
+    math_ops.rsqrt,
+    math_ops.saturate_cast,
+    math_ops.sign,
+    math_ops.sin,
+    math_ops.sinh,
+    math_ops.sqrt,
+    math_ops.square,
+    math_ops.tan,
+    parsing_ops.decode_compressed,
+    string_ops.string_to_number,
+    string_ops.string_to_hash_bucket,
+    string_ops.as_string,
+    string_ops.decode_base64,
+    string_ops.encode_base64,
+    string_ops.regex_full_match,
+    string_ops.regex_replace,
+    string_ops.string_strip,
+    string_ops.string_to_hash_bucket,
+    string_ops.string_to_hash_bucket_fast,
+    string_ops.string_to_hash_bucket_strong,
+    string_ops.substr,
+    string_ops.substr_v2,
+    string_ops.string_length,
+    string_ops.string_length_v2,
+    string_ops.unicode_script,
+]
+
+_UNARY_LIST_ELEMENTWISE_OPS = [
+    math_ops.add_n,
+    string_ops.string_join,
+]
+
+_BINARY_ELEMENTWISE_OPS = [
+    math_ops.add,
+    math_ops.atan2,
+    math_ops.complex,
+    math_ops.div_no_nan,
+    math_ops.divide,
+    math_ops.equal,
+    math_ops.floordiv,
+    math_ops.floormod,
+    math_ops.greater,
+    math_ops.greater_equal,
+    math_ops.less,
+    math_ops.less_equal,
+    math_ops.logical_and,
+    math_ops.logical_or,
+    math_ops.logical_xor,
+    math_ops.maximum,
+    math_ops.minimum,
+    math_ops.multiply,
+    math_ops.not_equal,
+    math_ops.pow,
+    math_ops.realdiv,
+    math_ops.squared_difference,
+    math_ops.subtract,
+    math_ops.truediv,
+    math_ops.truncatediv,
+    math_ops.truncatemod,
+]
+
+
+# We don't need to register a separate delegation handler for these v1 ops,
+# since they delegate to the v2 ops (which already have a handler).  But we
+# still want to include them in the ragged_op_list() output.
+_V1_OPS_THAT_DELEGATE_TO_V2_OPS = [
+    math_ops.reduce_sum,
+    math_ops.reduce_prod,
+    math_ops.reduce_min,
+    math_ops.reduce_max,
+    math_ops.reduce_mean,
+    math_ops.reduce_any,
+    math_ops.reduce_all,
+]
+
+
+def _ragged_gather_v1(params, indices, validate_indices=None, name=None,
+                      axis=0):
+  return ragged_array_ops.gather(
+      params=params,
+      indices=indices,
+      validate_indices=validate_indices,
+      axis=axis,
+      name=name)
+
+
+def _ragged_expand_dims_v1(input, axis=None, name=None, dim=None):  # pylint: disable=redefined-builtin
+  if dim is not None:
+    axis = dim
+  return ragged_array_ops.expand_dims(input=input, axis=axis, name=name)
+
+
+# (original_op, ragged_op, ragged_args)
+_RAGGED_DISPATCH_OPS = [
+    (array_ops.batch_gather, ragged_array_ops.batch_gather,
+     ['params', 'indices']),
+    (array_ops.concat, ragged_array_ops.concat, ['[values]']),
+    (array_ops.expand_dims, _ragged_expand_dims_v1, ['input']),
+    (array_ops.expand_dims_v2, ragged_array_ops.expand_dims, ['input']),
+    (array_ops.gather, _ragged_gather_v1, ['params', 'indices']),
+    (array_ops.gather_v2, ragged_array_ops.gather, ['params', 'indices']),
+    (array_ops.gather_nd, ragged_array_ops.gather_nd, ['params', 'indices']),
+    (array_ops.stack, ragged_array_ops.stack, ['[values]']),
+    (array_ops.tile, ragged_array_ops.tile, ['input']),
+    (array_ops.where, ragged_array_ops.where, ['condition', 'x', 'y']),
+    (math_ops.unsorted_segment_sum, ragged_math_ops.segment_sum,
+     ['data', 'segment_ids']),
+    (math_ops.unsorted_segment_prod, ragged_math_ops.segment_prod,
+     ['data', 'segment_ids']),
+    (math_ops.unsorted_segment_min, ragged_math_ops.segment_min,
+     ['data', 'segment_ids']),
+    (math_ops.unsorted_segment_max, ragged_math_ops.segment_max,
+     ['data', 'segment_ids']),
+    (math_ops.unsorted_segment_mean, ragged_math_ops.segment_mean,
+     ['data', 'segment_ids']),
+    (math_ops.unsorted_segment_sqrt_n, ragged_math_ops.segment_sqrt_n,
+     ['data', 'segment_ids']),
+    (math_ops.reduce_sum, ragged_math_ops.reduce_sum, ['input_tensor']),
+    (math_ops.reduce_prod, ragged_math_ops.reduce_prod, ['input_tensor']),
+    (math_ops.reduce_min, ragged_math_ops.reduce_min, ['input_tensor']),
+    (math_ops.reduce_max, ragged_math_ops.reduce_max, ['input_tensor']),
+    (math_ops.reduce_mean, ragged_math_ops.reduce_mean, ['input_tensor']),
+    (math_ops.reduce_any, ragged_math_ops.reduce_any, ['input_tensor']),
+    (math_ops.reduce_all, ragged_math_ops.reduce_all, ['input_tensor']),
+]
+
+
+def register_dispatchers():
+  """Constructs & registers OpDispatchers for ragged ops."""
+
+  op_list = (
+      _UNARY_ELEMENTWISE_OPS + _UNARY_LIST_ELEMENTWISE_OPS +
+      _BINARY_ELEMENTWISE_OPS + [x[0] for x in _RAGGED_DISPATCH_OPS])
+  for op in op_list:
+    _, undecorated_op = tf_decorator.unwrap(op)
+    if not hasattr(undecorated_op, tf_export.API_ATTRS['tensorflow'].names):
+      raise AssertionError('Expected %s to be an exported symbol '
+                           '(while adding a RaggedTensor dispatcher)')
+
+  for op in _UNARY_ELEMENTWISE_OPS:
+    UnaryRaggedElementwiseDispatcher(op).register(op)
+
+  for op in _UNARY_LIST_ELEMENTWISE_OPS:
+    UnaryRaggedElementwiseDispatcher(op, True).register(op)
+
+  for op in _BINARY_ELEMENTWISE_OPS:
+    BinaryRaggedElementwiseDispatcher(op).register(op)
+
+  for (original_op, ragged_op, args) in _RAGGED_DISPATCH_OPS:
+    RaggedDispatcher(original_op, ragged_op, args).register(original_op)
+
+
+def _ragged_op_signature(op, ragged_args):
+  """Returns a signature for the given op, marking ragged args in bold."""
+  op_name = tf_export.get_canonical_name_for_symbol(op)
+  argspec = tf_inspect.getfullargspec(op)
+  arg_names = argspec.args
+
+  # Mark ragged arguments in bold.
+  for pos in ragged_args:
+    arg_names[pos] = '**' + arg_names[pos] + '**'
+
+  # Add argument defaults.
+  for pos in range(-1, -len(argspec.defaults) - 1, -1):
+    arg_names[pos] += '=`{!r}`'.format(argspec.defaults[pos])
+
+  # Add varargs and keyword args
+  if argspec.varargs:
+    arg_names.append('*' + argspec.varargs)
+  if argspec.varkw:
+    arg_names.append('**' + argspec.varkw)
+
+  return '* `tf.{}`({})'.format(op_name, ', '.join(arg_names))
+
+
+def _op_is_in_tf_version(op, version):
+  if version == 1:
+    return (tf_export.get_v1_names(tf_decorator.unwrap(op)[1]) or
+            op in _V1_OPS_THAT_DELEGATE_TO_V2_OPS)
+  elif version == 2:
+    return tf_export.get_v2_names(tf_decorator.unwrap(op)[1])
+  else:
+    raise ValueError('Expected version 1 or 2.')
+
+
+def ragged_op_list(tf_version=1):
+  """Returns a string listing operators that have dispathers registered."""
+  lines = []
+  for op in _UNARY_ELEMENTWISE_OPS + _UNARY_LIST_ELEMENTWISE_OPS:
+    if _op_is_in_tf_version(op, tf_version):
+      lines.append(_ragged_op_signature(op, [0]))
+  for op in _BINARY_ELEMENTWISE_OPS:
+    if _op_is_in_tf_version(op, tf_version):
+      lines.append(_ragged_op_signature(op, [0, 1]))
+  for op, _, ragged_args in _RAGGED_DISPATCH_OPS:
+    if _op_is_in_tf_version(op, tf_version):
+      arginfos = _get_arg_infos(op, ragged_args)
+      ragged_args = [arginfo.position for arginfo in arginfos]
+      lines.append(_ragged_op_signature(op, ragged_args))
+  return ('\n\n### Additional ops that support `RaggedTensor`\n\n'
+          'Arguments that accept `RaggedTensor`s are marked in **bold**.\n\n' +
+          '\n'.join(sorted(lines)) + 'n')
+
+
+register_dispatchers()
diff --git a/tensorflow/python/ops/ragged/ragged_dispatch_test.py b/tensorflow/python/ops/ragged/ragged_dispatch_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d70470f05a292e09def389505779b92041f2e99
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_dispatch_test.py
@@ -0,0 +1,687 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for RaggedTensor operator dispatch."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import parsing_ops
+from tensorflow.python.ops import string_ops
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.platform import googletest
+
+# Constants listing various op types to test.  Each operation
+# should be included in at least one list below, or tested separately if
+# necessary (e.g., because it expects additional arguments).
+UNARY_FLOAT_OPS = [
+    math_ops.abs,
+    math_ops.acos,
+    math_ops.acosh,
+    math_ops.angle,
+    math_ops.asin,
+    math_ops.asinh,
+    math_ops.atan,
+    math_ops.atanh,
+    math_ops.ceil,
+    math_ops.conj,
+    math_ops.cos,
+    math_ops.cosh,
+    math_ops.digamma,
+    math_ops.erf,
+    math_ops.erfc,
+    math_ops.exp,
+    math_ops.expm1,
+    math_ops.floor,
+    math_ops.imag,
+    math_ops.is_finite,
+    math_ops.is_inf,
+    math_ops.is_nan,
+    math_ops.lgamma,
+    math_ops.log,
+    math_ops.log1p,
+    math_ops.log_sigmoid,
+    math_ops.negative,
+    math_ops.real,
+    math_ops.reciprocal,
+    math_ops.rint,
+    math_ops.round,
+    math_ops.rsqrt,
+    math_ops.sign,
+    math_ops.sin,
+    math_ops.sinh,
+    math_ops.sqrt,
+    math_ops.square,
+    math_ops.tan,
+    array_ops.identity,
+    array_ops.ones_like,
+    array_ops.zeros_like,
+]
+UNARY_BOOL_OPS = [
+    math_ops.logical_not,
+]
+UNARY_STRING_OPS = [
+    string_ops.decode_base64,
+    string_ops.encode_base64,
+    string_ops.string_strip,
+    parsing_ops.decode_compressed,
+]
+BINARY_FLOAT_OPS = [
+    math_ops.add,
+    math_ops.atan2,
+    math_ops.complex,
+    math_ops.div_no_nan,
+    math_ops.divide,
+    math_ops.equal,
+    math_ops.floordiv,
+    math_ops.floormod,
+    math_ops.greater,
+    math_ops.greater_equal,
+    math_ops.less,
+    math_ops.less_equal,
+    math_ops.maximum,
+    math_ops.minimum,
+    math_ops.multiply,
+    math_ops.not_equal,
+    math_ops.pow,
+    math_ops.realdiv,
+    math_ops.squared_difference,
+    math_ops.subtract,
+    math_ops.truediv,
+]
+BINARY_BOOL_OPS = [
+    math_ops.logical_and,
+    math_ops.logical_or,
+    math_ops.logical_xor,
+]
+UNARY_INT_OPS = [
+    string_ops.unicode_script,
+]
+BINARY_INT_OPS = [
+    math_ops.truncatediv,
+    math_ops.truncatemod,
+]
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedElementwiseOpsTest(ragged_test_util.RaggedTensorTestCase,
+                               parameterized.TestCase):
+
+  def assertSameShape(self, x, y):
+    """Checks that x and y have the same shape (including ragged shapes)."""
+    if isinstance(x, ragged_tensor.RaggedTensor):
+      self.assertIsInstance(y, ragged_tensor.RaggedTensor)
+      self.assertEqual(x.ragged_rank, y.ragged_rank)
+      for (x_splits, y_splits) in zip(x.nested_row_splits, y.nested_row_splits):
+        self.assertAllEqual(x_splits, y_splits)
+      self.assertAllEqual(
+          array_ops.shape(x.flat_values), array_ops.shape(y.flat_values))
+    else:
+      self.assertIsInstance(y, ops.Tensor)
+      self.assertAllEqual(array_ops.shape(x), array_ops.shape(y))
+
+  @parameterized.parameters(
+      #=========================================================================
+      # Test different input shapes.
+      #=========================================================================
+      [
+          # 0-dimensional input
+          {'x': 12},
+          # 1-dimensional input
+          {'x': [1, -2, 3]},
+          # 2-dimensional input
+          {'x': [[-2, 3], [-3, 4]]},
+          {'x': ragged_factory_ops.constant_value(
+              [[-2, 3], [-3]], ragged_rank=1)},
+          # 3-dimensional inputs
+          {'x': [[[-2, 3], [3, 4]], [[7, 6], [5, 4]]]},
+          {'x': ragged_factory_ops.constant_value(
+              [[[-2, 3], [3, 4]], [[7, 6]]],
+              ragged_rank=1)},
+          {'x': ragged_factory_ops.constant_value(
+              [[[-2, 3, 4], []], [[7, 6]], []],
+              ragged_rank=2)},
+          ] +
+      #=========================================================================
+      # Test each unary op.
+      #=========================================================================
+      [{'x': ragged_factory_ops.constant_value([[-2.0, 3.0], [-3.0]]), 'op': op}
+       for op in UNARY_FLOAT_OPS] +
+      [{'x': ragged_factory_ops.constant_value([[True, False], [True]]),
+        'op': op}
+       for op in UNARY_BOOL_OPS] +
+      [{'x': ragged_factory_ops.constant_value([[18, 512], [12412]], np.int32),
+        'op': op}
+       for op in UNARY_INT_OPS] +
+      [{'x': ragged_factory_ops.constant_value([['abcd', 'efgh'],
+                                                ['aabbccdd']]),
+        'op': op}
+       for op in UNARY_STRING_OPS] +
+      [
+          {'op': clip_ops.clip_by_value,
+           'x': ragged_factory_ops.constant_value([[-2.0, 3.0], [-3.0]]),
+           'clip_value_min': 0.1, 'clip_value_max': 4.0},
+          {'op': math_ops.cast,
+           'x': ragged_factory_ops.constant_value([[-2.0, 3.0], [-3.0]]),
+           'dtype': dtypes.int32},
+          {'op': math_ops.saturate_cast,
+           'x': ragged_factory_ops.constant_value([[-2.0, 3.0], [-3.0]]),
+           'dtype': dtypes.int32},
+          {'op': string_ops.string_to_hash_bucket,
+           'x': ragged_factory_ops.constant_value(
+               [['abcd', 'efgh'], ['aabbccdd']]),
+           'num_buckets': 1000},
+          {'op': string_ops.string_to_hash_bucket_fast,
+           'x': ragged_factory_ops.constant_value(
+               [['abcd', 'efgh'], ['aabbccdd']]),
+           'num_buckets': 1000},
+          {'op': string_ops.string_to_hash_bucket_strong,
+           'x': ragged_factory_ops.constant_value(
+               [['abcd', 'efgh'], ['aabbccdd']]),
+           'num_buckets': 1000,
+           'key': [1231, 12512]},
+          {'op': string_ops.string_to_number,
+           'x': ragged_factory_ops.constant_value([['-2.0', '3.0'], ['-3.0']])},
+          {'op': string_ops.regex_full_match,
+           'x': ragged_factory_ops.constant_value([['hello', '123'], ['1+1']]),
+           'pattern': r'\w+'},
+          {'op': string_ops.regex_replace,
+           'x': ragged_factory_ops.constant_value([['hello', '123'], ['1+1']]),
+           'pattern': r'\d',
+           'rewrite': '#'},
+          {'op': string_ops.substr,
+           'x': ragged_factory_ops.constant_value([['hello', '123'], ['1+1']]),
+           'pos': 2, 'len': 3},
+          {'op': array_ops.check_numerics,
+           'x': ragged_factory_ops.constant_value([[-2.0, 3.0], [-3.0]]),
+           'message': 'check-numerics'},
+      ]
+      )  # pyformat: disable
+  def testUnaryElementwiseOp(self, x, op=math_ops.abs, **extra_args):
+    x = ragged_tensor.convert_to_tensor_or_ragged_tensor(x)
+    result = op(x, **extra_args)
+
+    # Run the wrapped op on the dense values, for comparison.
+    dense_x = x.flat_values if isinstance(x, ragged_tensor.RaggedTensor) else x
+    expected_flat_values = array_ops.reshape(op(dense_x, **extra_args), [-1])
+
+    # Check that the result has the expected shape.
+    self.assertSameShape(x, result)
+
+    # Check that the result has the expected (flattened) values.
+    if isinstance(result, ragged_tensor.RaggedTensor):
+      result_flat_values = array_ops.reshape(result.flat_values, [-1])
+    else:
+      result_flat_values = array_ops.reshape(result, [-1])
+    self.assertAllEqual(expected_flat_values, result_flat_values)
+
+  @parameterized.parameters(
+      [
+          #=====================================================================
+          # Without broadcasting -- i.e., shapes match exactly.
+          #=====================================================================
+          # Shapes: x:(), y:()
+          {'x': 12,
+           'y': 8},
+          # Shapes: x:(3,), y:(3,)
+          {'x': [7, 8, 9],
+           'y': [1, -2, 3]},
+          # Shapes: x:(2, 2), y:(2, 2)
+          {'x': [[-2, 3], [-3, -4]],
+           'y': [[1, 2], [3, 4]]},
+          # Shapes: x:(2, None), y:(2, None)
+          {'x': ragged_factory_ops.constant_value([[-2, 3], [-3]]),
+           'y': ragged_factory_ops.constant_value([[5, 6], [7]])},
+          # Shapes: x:(2, 2, 2), y:(2, 2, 2)
+          {'x': [[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
+           'y': [[[9, 3], [3, 4]], [[5, 2], [7, 6]]]},
+          # Shapes: x:(2, None, None), y: (2, None, None)
+          {'x': ragged_factory_ops.constant_value(
+              [[[1, 2], [3], [4]], [[], [5, 7, 8]]]),
+           'y': ragged_factory_ops.constant_value(
+               [[[3, 8], [2], [5]], [[], [1, 9, 8]]])},
+          # Shapes: x:(2, None, 2), y: (2, None, 2)
+          {'x': ragged_factory_ops.constant_value(
+              [[[1, 2]], [[3, 4], [5, 6], [7, 8]]],
+              ragged_rank=1),
+           'y': ragged_factory_ops.constant_value(
+               [[[9, 3]], [[5, 2], [3, 4], [7, 6]]],
+               ragged_rank=1)},
+
+          #=====================================================================
+          # With broadcasting
+          #=====================================================================
+          # Shapes: x:(), y:(3,)
+          {'x': 12,                                 # Broadcast () -> (3,)
+           'y': [1, -2, 3]},
+          # Shapes: x:(1,), y:(3,)
+          {'x': [12],                               # Broadcast (1,) -> (3,)
+           'y': [1, -2, 3]},
+          # Shapes: x:(), y:(2, 2)
+          {'x': 12,                                 # Broadcast () -> (2, 2)
+           'y': [[1, 2], [3, 4]]},
+          # Shapes: x:(1,), y:(2, 2)
+          {'x': 12,                                 # Broadcast (1,) -> (2, 2)
+           'y': [[1, 2], [3, 4]]},
+          # Shapes: x:(2, 1), y:(2, 2)
+          {'x': [[10], [20]],                       # Broadcast (2, 1) -> (2, 2)
+           'y': [[1, 2], [3, 4]]},
+          # Shapes: x:(), y:(2, None)
+          {'x': 10,                                 # Broadcast () -> (2, None)
+           'y': ragged_factory_ops.constant_value(
+               [[1, 2], [3]], dtype=np.int32)},
+          # TODO(edloper): Add tests for more advanced broadcasting, once we add
+          # support for it.
+
+          #=====================================================================
+          # Keyword Args
+          #=====================================================================
+          {'x': ragged_factory_ops.constant_value(
+              [[[1, 2], [3], [4]], [[], [5, 7, 8]]]),
+           'y': ragged_factory_ops.constant_value(
+               [[[3, 8], [2], [5]], [[], [1, 9, 8]]]),
+           'use_kwargs': ('x', 'y')},
+          {'x': ragged_factory_ops.constant_value(
+              [[[1, 2]], [[3, 4], [5, 6], [7, 8]]],
+              ragged_rank=1),
+           'y': ragged_factory_ops.constant_value(
+               [[[9, 3]], [[5, 2], [3, 4], [7, 6]]],
+               ragged_rank=1),
+           'use_kwargs': ('x', 'y')},
+          {'x': ragged_factory_ops.constant_value(
+              [[[1, 2]], [[3, 4], [5, 6], [7, 8]]],
+              ragged_rank=1),
+           'y': ragged_factory_ops.constant_value(
+               [[[9, 3]], [[5, 2], [3, 4], [7, 6]]],
+               ragged_rank=1),
+           'use_kwargs': ('x',)},
+      ] +
+      #=========================================================================
+      # Test each unary op.
+      #=========================================================================
+      [{'x': ragged_factory_ops.constant_value([[-2.0, 3.0], [-3.0]]),
+        'y': ragged_factory_ops.constant_value([[5.0, 1.0], [12.0]]),
+        'op': op}
+       for op in BINARY_FLOAT_OPS] +
+      [{'x': ragged_factory_ops.constant_value([[-2, 3], [-3]]),
+        'y': ragged_factory_ops.constant_value([[5, 1], [12]]),
+        'op': op}
+       for op in BINARY_INT_OPS] +
+      [{'x': ragged_factory_ops.constant_value([[True, True], [False]]),
+        'y': ragged_factory_ops.constant_value([[False, True], [False]]),
+        'op': op}
+       for op in BINARY_BOOL_OPS]
+      )  # pyformat: disable
+  def testBinaryElementwiseOp(self, x, y, op=math_ops.add, **extra_args):
+    use_kwargs = extra_args.pop('use_kwargs', ())
+    x = ragged_tensor.convert_to_tensor_or_ragged_tensor(x)
+    y = ragged_tensor.convert_to_tensor_or_ragged_tensor(y)
+    if 'x' in use_kwargs and 'y' in use_kwargs:
+      result = op(x=x, y=y, **extra_args)
+    elif 'y' in use_kwargs:
+      result = op(x, y=y, **extra_args)
+    else:
+      result = op(x, y, **extra_args)
+
+    # Run the wrapped op on the dense values, for comparison.
+    dense_x = x.flat_values if isinstance(x, ragged_tensor.RaggedTensor) else x
+    dense_y = y.flat_values if isinstance(y, ragged_tensor.RaggedTensor) else y
+    expected_flat_values = array_ops.reshape(
+        op(dense_x, dense_y, **extra_args), [-1])
+
+    # Check that the result has the expected shape.
+    self.assertSameShape(y, result)
+
+    # Check that the result has the expected (flattened) values.
+    if isinstance(result, ragged_tensor.RaggedTensor):
+      result_flat_values = array_ops.reshape(result.flat_values, [-1])
+    else:
+      result_flat_values = array_ops.reshape(result, [-1])
+    self.assertAllEqual(expected_flat_values, result_flat_values)
+
+  @parameterized.parameters(
+      [
+          {'inputs': (12, 8, 3)},
+          {'inputs': ([1, 2, 3], [7, 8, 9], [3, 6, 9])},
+          {'inputs': ([[1, 2]], [[3, 4]], [[5, 6]])},
+          {'inputs': (ragged_factory_ops.constant_value([[1, 3], [-3]]),
+                      ragged_factory_ops.constant_value([[4, 7], [88]]),
+                      ragged_factory_ops.constant_value([[2, 9], [12]]))},
+          {'inputs': (ragged_factory_ops.constant_value(
+              [[[1, 3], [-3]], [[1]]]),
+                      ragged_factory_ops.constant_value(
+                          [[[4, 7], [88]], [[2]]]),
+                      ragged_factory_ops.constant_value(
+                          [[[2, 9], [12]], [[8]]]))},
+          {'inputs': (
+              ragged_factory_ops.constant_value([[[1, 3], [3, 4]], [[1, 5]]],
+                                                ragged_rank=1),
+              ragged_factory_ops.constant_value([[[4, 7], [1, 2]], [[2, 2]]],
+                                                ragged_rank=1),
+              ragged_factory_ops.constant_value([[[2, 9], [5, 2]], [[8, 0]]],
+                                                ragged_rank=1))},
+          {'inputs': (
+              ragged_factory_ops.constant_value([[[1, 3], [-3]], [[1]]]),
+              ragged_factory_ops.constant_value([[[4, 7], [88]], [[2]]]),
+              ragged_factory_ops.constant_value([[[2, 9], [12]], [[8]]])),
+           'use_kwargs': True},
+      ] + [
+          {'op': math_ops.add_n,
+           'inputs': (ragged_factory_ops.constant_value([[1, 3], [-3]]),
+                      ragged_factory_ops.constant_value([[4, 7], [88]]),
+                      ragged_factory_ops.constant_value([[2, 9], [12]]))},
+          {'op': string_ops.string_join,
+           'inputs': (
+               ragged_factory_ops.constant_value([['a', 'b'], ['c']]),
+               ragged_factory_ops.constant_value([['foo', 'bar'], ['baz']]),
+               ragged_factory_ops.constant_value([['2', '9'], ['12']]))},
+      ])  # pyformat: disable
+  def testListValuedElementwiseOp(self, inputs, op=math_ops.add_n,
+                                  **extra_args):
+    use_kwargs = extra_args.pop('use_kwargs', False)
+    inputs = [
+        ragged_tensor.convert_to_tensor_or_ragged_tensor(x) for x in inputs
+    ]
+    if use_kwargs:
+      result = op(inputs=inputs, **extra_args)
+    else:
+      result = op(inputs, **extra_args)
+
+    # Run the wrapped op on the dense values, for comparison.
+    dense_inputs = [
+        x.flat_values if isinstance(x, ragged_tensor.RaggedTensor) else x
+        for x in inputs
+    ]
+    expected_flat_values = array_ops.reshape(
+        op(dense_inputs, **extra_args), [-1])
+
+    # Check that the result has the expected shape.
+    self.assertSameShape(inputs[0], result)
+
+    # Check that the result has the expected (flattened) values.
+    if isinstance(result, ragged_tensor.RaggedTensor):
+      result_flat_values = array_ops.reshape(result.flat_values, [-1])
+    else:
+      result_flat_values = array_ops.reshape(result, [-1])
+    self.assertAllEqual(expected_flat_values, result_flat_values)
+
+  def testElementwiseOpUnknownRankError(self):
+    if context.executing_eagerly():
+      return
+    x = ragged_factory_ops.constant([[1, 2], [3]])
+    y = ragged_tensor.RaggedTensor.from_row_splits(
+        array_ops.placeholder_with_default([1, 2, 3], shape=None), x.row_splits)
+    with self.assertRaisesRegexp(ValueError,
+                                 r'Unable to broadcast: unknown rank'):
+      math_ops.add(x, y)
+
+  @parameterized.parameters([
+      dict(
+          x=ragged_factory_ops.constant_value([[1, 2], [3]]),
+          y=[[10]],
+          expected=[[11, 12], [13]]),
+      dict(
+          x=ragged_factory_ops.constant_value([[[1, 2], [3, 4]], [[5]]],
+                                              ragged_rank=2),
+          y=ragged_factory_ops.constant_value([[[10], [20]], [[30]]],
+                                              ragged_rank=1),
+          expected=[[[11, 12], [23, 24]], [[35]]]),
+      dict(
+          x=ragged_factory_ops.constant_value([[[1]]]),
+          y=ragged_factory_ops.constant_value([[1]]),
+          expected=[[[2]]]),
+  ])
+  def testElementwiseOpBroadcast(self, x, y, expected):
+    x = ragged_tensor.convert_to_tensor_or_ragged_tensor(x, dtype=dtypes.int32)
+    y = ragged_tensor.convert_to_tensor_or_ragged_tensor(y, dtype=dtypes.int32)
+    result = x + y
+    self.assertRaggedEqual(result, expected)
+
+  def testElementwiseOpShapeMismatch(self):
+    x = ragged_factory_ops.constant([[1, 2, 3], [4, 5]])
+    y = ragged_factory_ops.constant([[1, 2, 3], [4, 5, 6]])
+    with self.assertRaises(errors.InvalidArgumentError):
+      self.evaluate(math_ops.add(x, y))
+
+  def testBinaryOpSparseAndRagged(self):
+    x = ragged_factory_ops.constant([[1, 2, 3], [4, 5]])
+    y = sparse_tensor.SparseTensor([[0, 0], [0, 1], [2, 0]], [1, 2, 3], [3, 2])
+    with self.assertRaises((TypeError, ValueError)):
+      self.evaluate(math_ops.add(x, y))
+
+    with self.assertRaises((TypeError, ValueError)):
+      self.evaluate(math_ops.add_n([x, y]))
+
+  @parameterized.parameters([
+      dict(
+          op=array_ops.batch_gather,
+          args=(ragged_factory_ops.constant_value([[5, 6, 7], [8, 9]]),
+                ragged_factory_ops.constant_value([[2, 1, 0], [1]])),
+          expected=ragged_factory_ops.constant_value([[7, 6, 5], [9]])),
+      dict(
+          op=array_ops.concat,
+          args=([
+              ragged_factory_ops.constant_value([[1, 2, 3], [4]],
+                                                dtype=np.int32),
+              np.array([[5, 6]], dtype=np.int32)
+          ],),
+          kwargs={'axis': 0},
+          expected=ragged_factory_ops.constant_value([[1, 2, 3], [4], [5, 6]])),
+      dict(
+          op=array_ops.expand_dims,
+          kwargs={
+              'input': ragged_factory_ops.constant_value([[1, 2], [3]]),
+              'axis': 0
+          },
+          expected=ragged_factory_ops.constant_value([[[1, 2], [3]]])),
+      dict(
+          op=array_ops.expand_dims_v2,
+          kwargs={
+              'input': ragged_factory_ops.constant_value([[1, 2], [3]]),
+              'axis': -1
+          },
+          expected=ragged_factory_ops.constant_value([[[1], [2]], [[3]]],
+                                                     ragged_rank=1),
+      ),
+      dict(
+          op=array_ops.gather,
+          kwargs={
+              'params': ragged_factory_ops.constant_value([[1, 2], [3]]),
+              'indices': [1, 0, 1]
+          },
+          expected=ragged_factory_ops.constant_value([[3], [1, 2], [3]])),
+      dict(
+          op=array_ops.gather_v2,
+          kwargs={
+              'params': ragged_factory_ops.constant_value([[1, 2], [3]]),
+              'indices': ragged_factory_ops.constant_value([[1, 0], [1]])
+          },
+          expected=ragged_factory_ops.constant_value([[[3], [1, 2]], [[3]]])),
+      dict(
+          op=array_ops.gather_nd,
+          kwargs={
+              'params': ragged_factory_ops.constant_value([[7, 8], [9]]),
+              'indices': [[0, 1], [1, 0], [0, 0]]
+          },
+          expected=ragged_factory_ops.constant_value([8, 9, 7])),
+      dict(
+          op=array_ops.stack,
+          args=([
+              ragged_factory_ops.constant_value([[1, 2, 3], [4]],
+                                                dtype=np.int32),
+              np.array([[5, 6]], dtype=np.int32)
+          ],),
+          expected=ragged_factory_ops.constant_value([[[1, 2, 3], [4]],
+                                                      [[5, 6]]])),
+      dict(
+          op=array_ops.tile,
+          args=([
+              ragged_factory_ops.constant_value([[1, 2], [3]], dtype=np.int32),
+              [2, 3]
+          ]),
+          expected=ragged_factory_ops.constant_value([[1, 2, 1, 2, 1, 2],
+                                                      [3, 3, 3],
+                                                      [1, 2, 1, 2, 1, 2],
+                                                      [3, 3, 3]])),
+      dict(
+          op=array_ops.where,
+          args=(ragged_factory_ops.constant_value([[True, False], [True]]),
+                ragged_factory_ops.constant_value([[b'A', b'B'], [b'C']]),
+                ragged_factory_ops.constant_value([[b'a', b'b'], [b'c']])),
+          expected=ragged_factory_ops.constant_value([[b'A', b'b'], [b'C']])),
+      dict(
+          op=array_ops.where,
+          args=(ragged_factory_ops.constant_value([[True, False], [True]]),),
+          expected=[[0, 0], [1, 0]]),
+      dict(
+          op=math_ops.unsorted_segment_sum,
+          kwargs={
+              'data': ragged_factory_ops.constant_value([[1, 2], [3]]),
+              'segment_ids': ragged_factory_ops.constant_value([[0, 2], [0]]),
+              'num_segments': 3
+          },
+          expected=[4, 0, 2]),
+      dict(
+          op=math_ops.unsorted_segment_prod,
+          kwargs={
+              'data': ragged_factory_ops.constant_value([[1, 2], [3]]),
+              'segment_ids': ragged_factory_ops.constant_value([[0, 2], [0]]),
+              'num_segments': 3
+          },
+          expected=[3, 1, 2]),
+      dict(
+          op=math_ops.unsorted_segment_min,
+          kwargs={
+              'data': ragged_factory_ops.constant_value([[1, 2], [3]]),
+              'segment_ids': ragged_factory_ops.constant_value([[0, 1], [0]]),
+              'num_segments': 2
+          },
+          expected=[1, 2]),
+      dict(
+          op=math_ops.unsorted_segment_max,
+          kwargs={
+              'data': ragged_factory_ops.constant_value([[1, 2], [3]]),
+              'segment_ids': ragged_factory_ops.constant_value([[0, 1], [0]]),
+              'num_segments': 2
+          },
+          expected=[3, 2]),
+      dict(
+          op=math_ops.unsorted_segment_mean,
+          kwargs={
+              'data': ragged_factory_ops.constant_value([[1, 2], [3]]),
+              'segment_ids': ragged_factory_ops.constant_value([[0, 1], [0]]),
+              'num_segments': 2
+          },
+          expected=[2, 2]),
+      dict(
+          op=math_ops.unsorted_segment_sqrt_n,
+          kwargs={
+              'data':
+                  ragged_factory_ops.constant_value([[1.0, 2.0],
+                                                     [3.0, 4.0, 6.0]]),
+              'segment_ids':
+                  ragged_factory_ops.constant_value([[0, 1], [0, 0, 0]]),
+              'num_segments':
+                  2
+          },
+          expected=[7.0, 2.0]),
+      dict(
+          op=math_ops.reduce_sum,
+          kwargs={
+              'input_tensor':
+                  ragged_factory_ops.constant_value([[1, 2], [3, 4, 5]]),
+              'axis':
+                  1
+          },
+          expected=[3, 12]),
+      dict(
+          op=math_ops.reduce_prod,
+          kwargs={
+              'input_tensor':
+                  ragged_factory_ops.constant_value([[1, 2], [3, 4, 5]]),
+              'axis':
+                  1
+          },
+          expected=[2, 60]),
+      dict(
+          op=math_ops.reduce_min,
+          kwargs={
+              'input_tensor':
+                  ragged_factory_ops.constant_value([[1, 2], [3, 4, 5]]),
+              'axis':
+                  1
+          },
+          expected=[1, 3]),
+      dict(
+          op=math_ops.reduce_max,
+          kwargs={
+              'input_tensor':
+                  ragged_factory_ops.constant_value([[1, 2], [3, 4, 5]]),
+              'axis':
+                  1
+          },
+          expected=[2, 5]),
+      dict(
+          op=math_ops.reduce_mean,
+          kwargs={
+              'input_tensor':
+                  ragged_factory_ops.constant_value([[1, 3], [3, 4, 5]]),
+              'axis':
+                  1
+          },
+          expected=[2, 4]),
+      dict(
+          op=math_ops.reduce_any,
+          kwargs={
+              'input_tensor':
+                  ragged_factory_ops.constant_value([[True, False],
+                                                     [True, True, True]]),
+              'axis':
+                  1
+          },
+          expected=[True, True]),
+      dict(
+          op=math_ops.reduce_all,
+          kwargs={
+              'input_tensor':
+                  ragged_factory_ops.constant_value([[True, False],
+                                                     [True, True, True]]),
+              'axis':
+                  1
+          },
+          expected=[False, True]),
+  ])
+  def testRaggedDispatch(self, op, expected, args=(), kwargs=None):
+    if kwargs is None: kwargs = {}
+    result = op(*args, **kwargs)
+    self.assertRaggedEqual(result, expected)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_eager_test.py b/tensorflow/python/ops/ragged/ragged_eager_test.py
index 731ff742aa18bfa45c68813d5e19f4dbe2307cdb..86f01aace00d3b67bcaa78d4091d32fdab3242d7 100644
--- a/tensorflow/python/ops/ragged/ragged_eager_test.py
+++ b/tensorflow/python/ops/ragged/ragged_eager_test.py
@@ -17,17 +17,17 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-import sys
 
 from absl.testing import parameterized
 
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import test_util
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
 
-class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
+class RaggedTensorTest(ragged_test_util.RaggedTensorTestCase,
+                       parameterized.TestCase):
 
   @parameterized.parameters([
       dict(pylist=[[b'a', b'b'], [b'c']]),
@@ -35,22 +35,16 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
       dict(pylist=[[[1, 2], [3, 4]], [[5, 6], [], [7, 8]]], ragged_rank=1),
   ])
   def testRaggedTensorToList(self, pylist, ragged_rank=None):
-    rt = ragged.constant(pylist, ragged_rank)
-    self.assertEqual(rt.tolist(), pylist)
-
-  expected = "RaggedTensor([['a', 'b'], ['c']])"
-  if sys.version_info[0] == 3:
-    expected = "RaggedTensor([[b'a', b'b'], [b'c']])"
+    rt = ragged_factory_ops.constant(pylist, ragged_rank)
+    self.assertRaggedEqual(rt, pylist)
 
   @parameterized.parameters([
-      dict(pylist=[['a', 'b'], ['c']],
-           expected=expected),
-      dict(pylist=[[[1, 2], [3]], [[4, 5, 6], [], [7]]],
-           expected='RaggedTensor([[[1, 2], [3]], [[4, 5, 6], [], [7]]])'),
+      dict(pylist=[[b'a', b'b'], [b'c']]),
+      dict(pylist=[[[1, 2], [3]], [[4, 5, 6], [], [7]]]),
   ])
-  def testRaggedTensorStr(self, pylist, expected):
-    rt = ragged.constant(pylist)
-    self.assertEqual(str(rt), expected)
+  def testRaggedTensorStr(self, pylist):
+    rt = ragged_factory_ops.constant(pylist)
+    self.assertEqual(str(rt), '<tf.RaggedTensor %s>' % pylist)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_elementwise_ops.py b/tensorflow/python/ops/ragged/ragged_elementwise_ops.py
deleted file mode 100644
index a497500a6ad3e706f65f9a63a65122ecc14db17e..0000000000000000000000000000000000000000
--- a/tensorflow/python/ops/ragged/ragged_elementwise_ops.py
+++ /dev/null
@@ -1,384 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Elementwise operations for RaggedTensors."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import collections
-
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import clip_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import parsing_ops
-from tensorflow.python.ops import string_ops
-from tensorflow.python.ops.ragged import ragged_factory_ops
-from tensorflow.python.ops.ragged import ragged_tensor
-from tensorflow.python.ops.ragged import ragged_tensor_shape
-from tensorflow.python.util import tf_decorator
-from tensorflow.python.util import tf_export
-from tensorflow.python.util import tf_inspect
-
-# Information about an argument to an operation: The name of the argument, its
-# position in the argument list, and a boolean flag indicating whether it
-# expects a list of tensors.
-_ArgInfo = collections.namedtuple('ArgInfo', ['name', 'position', 'is_list'])
-
-
-def make_elementwise_op(op, *elementwise_args):
-  """Returns a ragged-tensor version of the elementwise operation `op`.
-
-  The returned operation will:
-
-  1. Broadcast the elementwise arguments to have a compatible shape.
-     An exception is raised if the tensors not broadcast-compatible.
-  2. Call `op`, substituting the dense values of the broadcasted tensor for
-     each elementwise argument.
-  3. Return a potentially ragged tensor constructed from the output of `op`
-     and the broadcasted tensors' nested row splits.
-
-  For example, you can construct a ragged-tensor version of the standard
-  operation `tf.add` by calling `make_elementwise_op(tf.add, 'x', 'y')`.
-
-  Args:
-    op: The operation to wrap.
-    *elementwise_args: The names of arguments to `op` that are treated as
-      elementwise.  Arguments that take a list of tensors should have their
-      names wrapped in square brackets (e.g. "[inputs]").
-
-  Raises:
-    ValueError: If any name specified in `elementwise_args` is not the name
-      of an argument to `op`.
-  """
-  elementwise_arg_infos = _get_arg_infos(op, elementwise_args)
-
-  def ragged_op(*args, **kwargs):
-    """Ragged version of `op`."""
-    args = list(args)
-
-    # Collect all of the elementwise arguments, and put them in a single
-    # dict whose values are the (potentially ragged) tensors that need to
-    # be broadcast to a common shape.  The keys of this dict are tuples
-    # (argkey, index), where argkey is an int for poitional args or a string
-    # for keyword args; and index is None for non-list args and the index of the
-    # tensor for list args.
-    elementwise_args = {}
-    for (name, position, is_list) in elementwise_arg_infos.values():
-      if position < len(args):
-        if is_list:
-          args[position] = list(args[position])
-          for (index, arg) in enumerate(args[position]):
-            elementwise_args[position, index] = arg
-        else:
-          elementwise_args[position, None] = args[position]
-      elif name in kwargs:
-        if is_list:
-          kwargs[name] = list(kwargs[name])
-          for (i, arg) in enumerate(kwargs[name]):
-            elementwise_args[name, i] = arg
-        else:
-          elementwise_args[name, None] = kwargs[name]
-
-    with ops.name_scope(None, op.__name__, elementwise_args.values()):
-      # Convert all inputs to tensors or ragged tensors.
-      for ((key, index), tensor) in elementwise_args.items():
-        argname = elementwise_arg_infos[key].name
-        converted = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
-            tensor, name=argname)
-        elementwise_args[key, index] = converted
-
-      # Broadcast tensors to have compatible shapes.
-      broadcast_args, result_splits, broadcast_check_ops = \
-          _broadcast_elementwise_args(elementwise_args)
-
-      # Replace tensor arguments with their dense values.
-      for ((key, index), tensor) in broadcast_args.items():
-        if ragged_tensor.is_ragged(tensor):
-          if isinstance(key, int) and index is None:
-            args[key] = tensor.inner_values
-          elif isinstance(key, int) and index is not None:
-            args[key][index] = tensor.inner_values
-          elif isinstance(key, str) and index is None:
-            kwargs[key] = tensor.inner_values
-          else:
-            assert isinstance(key, str) and index is not None
-            kwargs[key][index] = tensor.inner_values
-
-      # Call the elementwise op on the broadcasted dense values.
-      with ops.control_dependencies(broadcast_check_ops):
-        result_values = op(*args, **kwargs)
-
-      # Restore any ragged dimensions that we stripped off, and return the
-      # result.
-      return ragged_factory_ops.from_nested_row_splits(result_values,
-                                                       result_splits)
-
-  # Construct the docstring.
-  op_name = tf_export.get_canonical_name_for_symbol(op)
-  assert op_name is not None, op
-  argnames = ', '.join('`%s`' % s.strip('[]') for s in elementwise_args)
-  docstring = _ELEMENTWISE_DOCSTRING % dict(op_name=op_name, argnames=argnames)
-
-  # Update name, docstring, signature, etc., for the wrapper, and return it.
-  return tf_decorator.make_decorator(op, ragged_op, decorator_doc=docstring)
-
-
-_ELEMENTWISE_DOCSTRING = """\
-Ragged version of the elementwise operation `tf.%(op_name)s`.
-
-  The following elementwise arguments may be ragged or dense:
-  %(argnames)s.
-  These arguments will be broadcast to a compatible shape if necessary.
-  """
-
-
-def _get_arg_infos(func, elementwise_args):
-  """Returns `_ArgInfo`s for each `func` arg specified by `elementwise_args`.
-
-  Args:
-    func: The function whose arguments should be described.
-    elementwise_args: The names of the arguments to get info for.
-
-  Returns:
-    A dictionary that maps both names and positions of arguments to
-    `_ArgInfo` tuples.
-  """
-  arg_infos = {}
-
-  # Inspect the func's argspec to find the position of each arg.
-  arg_spec = tf_inspect.getargspec(func)
-  for argname in elementwise_args:
-    assert isinstance(argname, str)
-    is_list = argname.startswith('[') and argname.endswith(']')
-    if is_list:
-      argname = argname[1:-1]
-    assert argname in arg_spec.args, (func, argname, arg_spec.args)
-    arg_info = _ArgInfo(argname, arg_spec.args.index(argname), is_list)
-    arg_infos[arg_info.name] = arg_info
-    arg_infos[arg_info.position] = arg_info
-  return arg_infos
-
-
-def _broadcast_elementwise_args(elementwise_args):
-  """Broadcasts the values of `elementwise_args` to have compatible shapes.
-
-  Args:
-    elementwise_args: A dictionary whose keys are potentially ragged tensors.
-
-  Returns:
-    A tuple `(broadcast_args, broadcast_splits, checks)` where:
-
-    * `broadcast_args` is a dictionary with the same keys as
-      `elementwise_args`, mapping to broadcasted tensors.
-    * `broadcast_splits` is the broadcasted nested row splits.
-    * `checks` is a possibly empty tuple of assertion operations that should
-      be added as control dependencies.
-
-  Raises:
-    ValueError: If broadcasting fails.
-  """
-  # No elementwise arguments were used: nothing to do!
-  if not elementwise_args:
-    return elementwise_args, (), ()
-
-  # A single elementwise argument was used: no broadcasting necessary.
-  if len(elementwise_args) == 1:
-    arg = list(elementwise_args.values())[0]
-    if ragged_tensor.is_ragged(arg):
-      return elementwise_args, arg.nested_row_splits, ()
-    else:
-      return elementwise_args, (), ()
-
-  # Multiple elementwise arguments.
-  else:
-    is_ragged = [ragged_tensor.is_ragged(t) for t in elementwise_args.values()]
-    if not any(is_ragged):
-      return elementwise_args, (), ()
-
-    # If we have a single ragged tensor plus a set of scalars, then we can
-    # rely on the underlying elementwise op to do broadcasting.
-    if (sum(is_ragged) == 1 and
-        all((ragged_tensor.is_ragged(t) or t.shape.ndims == 0)
-            for t in elementwise_args.values())):
-      nested_splits_lists = [
-          t.nested_row_splits
-          for t in elementwise_args.values()
-          if ragged_tensor.is_ragged(t)][0]
-      return elementwise_args, nested_splits_lists, ()
-
-    else:
-      # Get the shapes of all the elementwise arguments.
-      shapes = [ragged_tensor_shape.RaggedTensorDynamicShape.from_tensor(t)
-                for t in elementwise_args.values()]
-
-      # Broadcast the shapes to all have the same rank (the max rank).
-      ranks = [t.shape.ndims for t in elementwise_args.values()]
-      if any(rank is None for rank in ranks):
-        raise ValueError('Unable to broadcast: unknown rank')
-      broadcast_rank = max(ranks)
-      shapes = [shape.broadcast_to_rank(broadcast_rank) for shape in shapes]
-
-      # For each dimension, broadcast the shapes to be compatible.
-      for axis in range(broadcast_rank):
-        # For each i, broadcast shape[i+1] to be compatible with shape[i]; and
-        # then finally broadcast shape[0] to be compatible with shape[-1].
-        for i in range(len(shapes)):
-          j = (i + 1) % len(shapes)
-          dim_size = shapes[i].dimension_size(axis)
-          shapes[j] = shapes[j].broadcast_dimension(axis, dim_size)
-      broadcast_shape = shapes[0]
-
-      # Broadcast every elementwise arg to the shape that we calculated.
-      elementwise_args = dict([
-          (key, ragged_tensor_shape.broadcast_to(t, broadcast_shape, False))
-          for (key, t) in elementwise_args.items()])
-      nested_splits_lists = list(elementwise_args.values())[0].nested_row_splits
-      return elementwise_args, nested_splits_lists, ()
-
-
-# A list of symbols that should be exported in the "ragged" package.
-_symbols_to_export = []
-
-
-def _add_elementwise_ops_to_this_module(specs, verbose=False):
-  """Adds ragged versions of the given ops to this module.
-
-  Args:
-    specs: A list of tuples containing the arguments for `make_elementwise_op`.
-    verbose: If true, then display each op that gets added.
-  """
-  for spec in specs:
-    original_op = spec[0]
-    ragged_op = make_elementwise_op(*spec)
-    canonical_name = tf_export.get_canonical_name_for_symbol(original_op)
-    if '.' not in canonical_name:
-      op_name = canonical_name
-    else:
-      op_name = original_op.__name__
-    if verbose:
-      print('Adding ragged_elementwise_op: tf.ragged.%s (based on tf.%s)' %
-            (op_name, canonical_name))
-    globals()[op_name] = ragged_op
-    _symbols_to_export.append(op_name)
-
-
-# A list of tuples containing arguments for `make_elementwise_op`, for each
-# elementwise operation that should have a ragged version built.  Each tuple
-# contains a standard `Tensor` operation, and the names of any arguments
-# that are processed in elementwise fashion.
-_TF_ELEMENTWISE_OPS = [
-    # Unary math operations.
-    (clip_ops.clip_by_value, 't'),
-    (math_ops.abs, 'x'),
-    (math_ops.acos, 'x'),
-    (math_ops.acosh, 'x'),
-    (math_ops.angle, 'input'),
-    (math_ops.asin, 'x'),
-    (math_ops.asinh, 'x'),
-    (math_ops.atan, 'x'),
-    (math_ops.atanh, 'x'),
-    (math_ops.cast, 'x'),
-    (math_ops.ceil, 'x'),
-    (math_ops.conj, 'x'),
-    (math_ops.cos, 'x'),
-    (math_ops.cosh, 'x'),
-    (math_ops.digamma, 'x'),
-    (math_ops.erf, 'x'),
-    (math_ops.erfc, 'x'),
-    (math_ops.exp, 'x'),
-    (math_ops.expm1, 'x'),
-    (math_ops.floor, 'x'),
-    (math_ops.imag, 'input'),
-    (math_ops.is_finite, 'x'),
-    (math_ops.is_inf, 'x'),
-    (math_ops.is_nan, 'x'),
-    (math_ops.lgamma, 'x'),
-    (math_ops.log, 'x'),
-    (math_ops.log1p, 'x'),
-    (math_ops.log_sigmoid, 'x'),
-    (math_ops.logical_not, 'x'),
-    (math_ops.negative, 'x'),
-    (math_ops.real, 'input'),
-    (math_ops.reciprocal, 'x'),
-    (math_ops.rint, 'x'),
-    (math_ops.round, 'x'),
-    (math_ops.rsqrt, 'x'),
-    (math_ops.saturate_cast, 'value'),
-    (math_ops.sign, 'x'),
-    (math_ops.sin, 'x'),
-    (math_ops.sinh, 'x'),
-    (math_ops.sqrt, 'x'),
-    (math_ops.square, 'x'),
-    (math_ops.tan, 'x'),
-
-    # Binary math operations
-    (math_ops.add, 'x', 'y'),
-    (math_ops.atan2, 'y', 'x'),
-    (math_ops.complex, 'real', 'imag'),
-    (math_ops.div, 'x', 'y'),
-    (math_ops.div_no_nan, 'x', 'y'),
-    (math_ops.divide, 'x', 'y'),
-    (math_ops.equal, 'x', 'y'),
-    (math_ops.floordiv, 'x', 'y'),
-    (math_ops.floormod, 'x', 'y'),
-    (math_ops.greater, 'x', 'y'),
-    (math_ops.greater_equal, 'x', 'y'),
-    (math_ops.less, 'x', 'y'),
-    (math_ops.less_equal, 'x', 'y'),
-    (math_ops.logical_and, 'x', 'y'),
-    (math_ops.logical_or, 'x', 'y'),
-    (math_ops.logical_xor, 'x', 'y'),
-    (math_ops.maximum, 'x', 'y'),
-    (math_ops.minimum, 'x', 'y'),
-    (math_ops.multiply, 'x', 'y'),
-    (math_ops.not_equal, 'x', 'y'),
-    (math_ops.pow, 'x', 'y'),
-    (math_ops.realdiv, 'x', 'y'),
-    (math_ops.squared_difference, 'x', 'y'),
-    (math_ops.subtract, 'x', 'y'),
-    (math_ops.truediv, 'x', 'y'),
-    (math_ops.truncatediv, 'x', 'y'),
-    (math_ops.truncatemod, 'x', 'y'),
-
-    # N-ary math operations
-    (math_ops.add_n, '[inputs]'),
-
-    # String operations
-    (string_ops.as_string, 'input'),
-    (string_ops.decode_base64, 'input'),
-    (string_ops.encode_base64, 'input'),
-    (string_ops.regex_full_match, 'input'),
-    (string_ops.regex_replace, 'input'),
-    (string_ops.string_join, '[inputs]'),
-    (string_ops.string_strip, 'input'),
-    (string_ops.string_to_hash_bucket, 'input'),
-    (string_ops.string_to_hash_bucket_fast, 'input'),
-    (string_ops.string_to_hash_bucket_strong, 'input'),
-    (string_ops.substr, 'input'),
-    (string_ops.unicode_script, 'input'),
-
-    # Array ops
-    (array_ops.check_numerics, 'tensor'),
-    (array_ops.identity, 'input'),
-    (array_ops.ones_like, 'tensor'),
-    (array_ops.zeros_like, 'tensor'),
-
-    # Parsing ops
-    (parsing_ops.decode_compressed, 'bytes'),
-    (parsing_ops.string_to_number, 'string_tensor'),
-]
-_add_elementwise_ops_to_this_module(_TF_ELEMENTWISE_OPS)
diff --git a/tensorflow/python/ops/ragged/ragged_elementwise_ops_test.py b/tensorflow/python/ops/ragged/ragged_elementwise_ops_test.py
deleted file mode 100644
index 305a96df9cca4db28e4bea4d93df73a7eb722a93..0000000000000000000000000000000000000000
--- a/tensorflow/python/ops/ragged/ragged_elementwise_ops_test.py
+++ /dev/null
@@ -1,443 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for ragged.elementwise_ops."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from absl.testing import parameterized
-import numpy as np
-
-
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import test_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import ragged
-from tensorflow.python.platform import googletest
-
-# Constants listing various op types to test.  Each elementwise operation
-# should be included in at least one list below, or tested separately if
-# necessary (e.g., because it expects additional arguments).
-UNARY_FLOAT_OPS = [
-    ragged.abs,
-    ragged.acos,
-    ragged.acosh,
-    ragged.angle,
-    ragged.asin,
-    ragged.asinh,
-    ragged.atan,
-    ragged.atanh,
-    ragged.ceil,
-    ragged.conj,
-    ragged.cos,
-    ragged.cosh,
-    ragged.digamma,
-    ragged.erf,
-    ragged.erfc,
-    ragged.exp,
-    ragged.expm1,
-    ragged.floor,
-    ragged.imag,
-    ragged.is_finite,
-    ragged.is_inf,
-    ragged.is_nan,
-    ragged.lgamma,
-    ragged.log,
-    ragged.log1p,
-    ragged.log_sigmoid,
-    ragged.negative,
-    ragged.real,
-    ragged.reciprocal,
-    ragged.rint,
-    ragged.round,
-    ragged.rsqrt,
-    ragged.sign,
-    ragged.sin,
-    ragged.sinh,
-    ragged.sqrt,
-    ragged.square,
-    ragged.tan,
-    ragged.as_string,
-    ragged.identity,
-    ragged.ones_like,
-    ragged.zeros_like,
-]
-UNARY_BOOL_OPS = [
-    ragged.logical_not,
-]
-UNARY_STRING_OPS = [
-    ragged.decode_base64,
-    ragged.encode_base64,
-    ragged.string_strip,
-    ragged.decode_compressed,
-]
-BINARY_FLOAT_OPS = [
-    ragged.add,
-    ragged.atan2,
-    ragged.complex,
-    ragged.div,
-    ragged.div_no_nan,
-    ragged.divide,
-    ragged.equal,
-    ragged.floordiv,
-    ragged.floormod,
-    ragged.greater,
-    ragged.greater_equal,
-    ragged.less,
-    ragged.less_equal,
-    ragged.maximum,
-    ragged.minimum,
-    ragged.multiply,
-    ragged.not_equal,
-    ragged.pow,
-    ragged.realdiv,
-    ragged.squared_difference,
-    ragged.subtract,
-    ragged.truediv,
-]
-BINARY_BOOL_OPS = [
-    ragged.logical_and,
-    ragged.logical_or,
-    ragged.logical_xor,
-]
-UNARY_INT_OPS = [
-    ragged.unicode_script,
-]
-BINARY_INT_OPS = [
-    ragged.truncatediv,
-    ragged.truncatemod,
-]
-
-
-class RaggedElementwiseOpsTest(test_util.TensorFlowTestCase,
-                               parameterized.TestCase):
-
-  def assertSameShape(self, x, y):
-    """Checks that x and y have the same shape (including ragged shapes)."""
-    if isinstance(x, ragged.RaggedTensor):
-      self.assertIsInstance(y, ragged.RaggedTensor)
-      self.assertEqual(x.ragged_rank, y.ragged_rank)
-      for (x_splits, y_splits) in zip(x.nested_row_splits, y.nested_row_splits):
-        self.assertAllEqual(x_splits, y_splits)
-      self.assertAllEqual(
-          array_ops.shape(x.inner_values), array_ops.shape(y.inner_values))
-    else:
-      self.assertIsInstance(y, ops.Tensor)
-      self.assertAllEqual(array_ops.shape(x), array_ops.shape(y))
-
-  @parameterized.parameters(
-      #=========================================================================
-      # Test different input shapes.
-      #=========================================================================
-      [
-          # 0-dimensional input
-          {'x': 12},
-          # 1-dimensional input
-          {'x': [1, -2, 3]},
-          # 2-dimensional input
-          {'x': [[-2, 3], [-3, 4]]},
-          {'x': ragged.constant_value([[-2, 3], [-3]], ragged_rank=1)},
-          # 3-dimensional inputs
-          {'x': [[[-2, 3], [3, 4]], [[7, 6], [5, 4]]]},
-          {'x': ragged.constant_value([[[-2, 3], [3, 4]], [[7, 6]]],
-                                      ragged_rank=1)},
-          {'x': ragged.constant_value([[[-2, 3, 4], []], [[7, 6]], []],
-                                      ragged_rank=2)},
-          ] +
-      #=========================================================================
-      # Test each unary op.
-      #=========================================================================
-      [{'x': ragged.constant_value([[-2.0, 3.0], [-3.0]]), 'op': op}
-       for op in UNARY_FLOAT_OPS] +
-      [{'x': ragged.constant_value([[True, False], [True]]), 'op': op}
-       for op in UNARY_BOOL_OPS] +
-      [{'x': ragged.constant_value([[18, 512], [12412]], np.int32), 'op': op}
-       for op in UNARY_INT_OPS] +
-      [{'x': ragged.constant_value([['abcd', 'efgh'], ['aabbccdd']]), 'op': op}
-       for op in UNARY_STRING_OPS] +
-      [
-          {'op': ragged.clip_by_value,
-           'x': ragged.constant_value([[-2.0, 3.0], [-3.0]]),
-           'clip_value_min': 0.1, 'clip_value_max': 4.0},
-          {'op': ragged.cast,
-           'x': ragged.constant_value([[-2.0, 3.0], [-3.0]]),
-           'dtype': dtypes.int32},
-          {'op': ragged.saturate_cast,
-           'x': ragged.constant_value([[-2.0, 3.0], [-3.0]]),
-           'dtype': dtypes.int32},
-          {'op': ragged.string_to_hash_bucket,
-           'x': ragged.constant_value([['abcd', 'efgh'], ['aabbccdd']]),
-           'num_buckets': 1000},
-          {'op': ragged.string_to_hash_bucket_fast,
-           'x': ragged.constant_value([['abcd', 'efgh'], ['aabbccdd']]),
-           'num_buckets': 1000},
-          {'op': ragged.string_to_hash_bucket_strong,
-           'x': ragged.constant_value([['abcd', 'efgh'], ['aabbccdd']]),
-           'num_buckets': 1000,
-           'key': [1231, 12512]},
-          {'op': ragged.string_to_number,
-           'x': ragged.constant_value([['-2.0', '3.0'], ['-3.0']])},
-          {'op': ragged.regex_full_match,
-           'x': ragged.constant_value([['hello', '123'], ['1+1']]),
-           'pattern': r'\w+'},
-          {'op': ragged.regex_replace,
-           'x': ragged.constant_value([['hello', '123'], ['1+1']]),
-           'pattern': r'\d',
-           'rewrite': '#'},
-          {'op': ragged.substr,
-           'x': ragged.constant_value([['hello', '123'], ['1+1']]),
-           'pos': 2, 'len': 3},
-          {'op': ragged.check_numerics,
-           'x': ragged.constant_value([[-2.0, 3.0], [-3.0]]),
-           'message': 'check-numerics'},
-      ]
-      )  # pyformat: disable
-  def testUnaryOp(self, x, op=ragged.abs, **extra_args):
-    x = ragged.convert_to_tensor_or_ragged_tensor(x)
-    result = op(x, **extra_args)
-
-    # Run the wrapped op on the dense values, for comparison.
-    dense_x = x.inner_values if isinstance(x, ragged.RaggedTensor) else x
-    expected_flat_values = array_ops.reshape(
-        op.__wrapped__(dense_x, **extra_args), [-1])
-
-    with self.test_session():
-      # Check that the result has the expected shape.
-      self.assertSameShape(x, result)
-
-      # Check that the result has the expected (flattened) values.
-      if isinstance(result, ragged.RaggedTensor):
-        result_flat_values = array_ops.reshape(result.inner_values, [-1])
-      else:
-        result_flat_values = array_ops.reshape(result, [-1])
-      self.assertAllEqual(expected_flat_values, result_flat_values)
-
-  @parameterized.parameters(
-      [
-          #=====================================================================
-          # Without broadcasting -- i.e., shapes match exactly.
-          #=====================================================================
-          # Shapes: x:(), y:()
-          {'x': 12,
-           'y': 8},
-          # Shapes: x:(3,), y:(3,)
-          {'x': [7, 8, 9],
-           'y': [1, -2, 3]},
-          # Shapes: x:(2, 2), y:(2, 2)
-          {'x': [[-2, 3], [-3, -4]],
-           'y': [[1, 2], [3, 4]]},
-          # Shapes: x:(2, None), y:(2, None)
-          {'x': ragged.constant_value([[-2, 3], [-3]]),
-           'y': ragged.constant_value([[5, 6], [7]])},
-          # Shapes: x:(2, 2, 2), y:(2, 2, 2)
-          {'x': [[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
-           'y': [[[9, 3], [3, 4]], [[5, 2], [7, 6]]]},
-          # Shapes: x:(2, None, None), y: (2, None, None)
-          {'x': ragged.constant_value([[[1, 2], [3], [4]], [[], [5, 7, 8]]]),
-           'y': ragged.constant_value([[[3, 8], [2], [5]], [[], [1, 9, 8]]])},
-          # Shapes: x:(2, None, 2), y: (2, None, 2)
-          {'x': ragged.constant_value([[[1, 2]], [[3, 4], [5, 6], [7, 8]]],
-                                      ragged_rank=1),
-           'y': ragged.constant_value([[[9, 3]], [[5, 2], [3, 4], [7, 6]]],
-                                      ragged_rank=1)},
-
-          #=====================================================================
-          # With broadcasting
-          #=====================================================================
-          # Shapes: x:(), y:(3,)
-          {'x': 12,                                 # Broadcast () -> (3,)
-           'y': [1, -2, 3]},
-          # Shapes: x:(1,), y:(3,)
-          {'x': [12],                               # Broadcast (1,) -> (3,)
-           'y': [1, -2, 3]},
-          # Shapes: x:(), y:(2, 2)
-          {'x': 12,                                 # Broadcast () -> (2, 2)
-           'y': [[1, 2], [3, 4]]},
-          # Shapes: x:(1,), y:(2, 2)
-          {'x': 12,                                 # Broadcast (1,) -> (2, 2)
-           'y': [[1, 2], [3, 4]]},
-          # Shapes: x:(2, 1), y:(2, 2)
-          {'x': [[10], [20]],                       # Broadcast (2, 1) -> (2, 2)
-           'y': [[1, 2], [3, 4]]},
-          # Shapes: x:(), y:(2, None)
-          {'x': 10,                                 # Broadcast () -> (2, None)
-           'y': ragged.constant_value([[1, 2], [3]], dtype=np.int32)},
-          # TODO(edloper): Add tests for more advanced broadcasting, once we add
-          # support for it.
-
-          #=====================================================================
-          # Keyword Args
-          #=====================================================================
-          {'x': ragged.constant_value([[[1, 2], [3], [4]], [[], [5, 7, 8]]]),
-           'y': ragged.constant_value([[[3, 8], [2], [5]], [[], [1, 9, 8]]]),
-           'use_kwargs': True},
-          {'x': ragged.constant_value([[[1, 2]], [[3, 4], [5, 6], [7, 8]]],
-                                      ragged_rank=1),
-           'y': ragged.constant_value([[[9, 3]], [[5, 2], [3, 4], [7, 6]]],
-                                      ragged_rank=1),
-           'use_kwargs': True},
-      ] +
-      #=========================================================================
-      # Test each unary op.
-      #=========================================================================
-      [{'x': ragged.constant_value([[-2.0, 3.0], [-3.0]]),
-        'y': ragged.constant_value([[5.0, 1.0], [12.0]]),
-        'op': op}
-       for op in BINARY_FLOAT_OPS] +
-      [{'x': ragged.constant_value([[-2, 3], [-3]]),
-        'y': ragged.constant_value([[5, 1], [12]]),
-        'op': op}
-       for op in BINARY_INT_OPS] +
-      [{'x': ragged.constant_value([[True, True], [False]]),
-        'y': ragged.constant_value([[False, True], [False]]),
-        'op': op}
-       for op in BINARY_BOOL_OPS] +
-      [
-      ]
-      )  # pyformat: disable
-  def testBinaryOp(self, x, y, op=ragged.add, **extra_args):
-    use_kwargs = extra_args.pop('use_kwargs', False)
-    x = ragged.convert_to_tensor_or_ragged_tensor(x)
-    y = ragged.convert_to_tensor_or_ragged_tensor(y)
-    if use_kwargs:
-      result = op(x=x, y=y, **extra_args)
-    else:
-      result = op(x, y, **extra_args)
-
-    # Run the wrapped op on the dense values, for comparison.
-    dense_x = x.inner_values if isinstance(x, ragged.RaggedTensor) else x
-    dense_y = y.inner_values if isinstance(y, ragged.RaggedTensor) else y
-    expected_flat_values = array_ops.reshape(
-        op.__wrapped__(dense_x, dense_y, **extra_args), [-1])
-
-    with self.test_session():
-      # Check that the result has the expected shape.
-      self.assertSameShape(y, result)
-
-      # Check that the result has the expected (flattened) values.
-      if isinstance(result, ragged.RaggedTensor):
-        result_flat_values = array_ops.reshape(result.inner_values, [-1])
-      else:
-        result_flat_values = array_ops.reshape(result, [-1])
-      self.assertAllEqual(expected_flat_values, result_flat_values)
-
-  @parameterized.parameters(
-      [
-          {'inputs': (12, 8, 3)},
-          {'inputs': ([1, 2, 3], [7, 8, 9], [3, 6, 9])},
-          {'inputs': ([[1, 2]], [[3, 4]], [[5, 6]])},
-          {'inputs': (ragged.constant_value([[1, 3], [-3]]),
-                      ragged.constant_value([[4, 7], [88]]),
-                      ragged.constant_value([[2, 9], [12]]))},
-          {'inputs': (ragged.constant_value([[[1, 3], [-3]], [[1]]]),
-                      ragged.constant_value([[[4, 7], [88]], [[2]]]),
-                      ragged.constant_value([[[2, 9], [12]], [[8]]]))},
-          {'inputs': (ragged.constant_value([[[1, 3], [3, 4]], [[1, 5]]],
-                                            ragged_rank=1),
-                      ragged.constant_value([[[4, 7], [1, 2]], [[2, 2]]],
-                                            ragged_rank=1),
-                      ragged.constant_value([[[2, 9], [5, 2]], [[8, 0]]],
-                                            ragged_rank=1))},
-          {'inputs': (ragged.constant_value([[[1, 3], [-3]], [[1]]]),
-                      ragged.constant_value([[[4, 7], [88]], [[2]]]),
-                      ragged.constant_value([[[2, 9], [12]], [[8]]])),
-           'use_kwargs': True},
-      ] + [
-          {'op': ragged.add_n,
-           'inputs': (ragged.constant_value([[1, 3], [-3]]),
-                      ragged.constant_value([[4, 7], [88]]),
-                      ragged.constant_value([[2, 9], [12]]))},
-          {'op': ragged.string_join,
-           'inputs': (ragged.constant_value([['a', 'b'], ['c']]),
-                      ragged.constant_value([['foo', 'bar'], ['baz']]),
-                      ragged.constant_value([['2', '9'], ['12']]))},
-      ])  # pyformat: disable
-  def testListValuedOp(self, inputs, op=ragged.add_n, **extra_args):
-    use_kwargs = extra_args.pop('use_kwargs', False)
-    inputs = [ragged.convert_to_tensor_or_ragged_tensor(x) for x in inputs]
-    if use_kwargs:
-      result = op(inputs=inputs, **extra_args)
-    else:
-      result = op(inputs, **extra_args)
-
-    # Run the wrapped op on the dense values, for comparison.
-    dense_inputs = [
-        x.inner_values if isinstance(x, ragged.RaggedTensor) else x
-        for x in inputs
-    ]
-    expected_flat_values = array_ops.reshape(
-        op.__wrapped__(dense_inputs, **extra_args), [-1])
-
-    with self.test_session():
-      # Check that the result has the expected shape.
-      self.assertSameShape(inputs[0], result)
-
-      # Check that the result has the expected (flattened) values.
-      if isinstance(result, ragged.RaggedTensor):
-        result_flat_values = array_ops.reshape(result.inner_values, [-1])
-      else:
-        result_flat_values = array_ops.reshape(result, [-1])
-      self.assertAllEqual(expected_flat_values, result_flat_values)
-
-  @test_util.run_deprecated_v1
-  def testUnknownRankError(self):
-    x = ragged.constant([[1, 2], [3]])
-    y = ragged.from_row_splits(
-        array_ops.placeholder_with_default([1, 2, 3], shape=None), x.row_splits)
-    with self.assertRaisesRegexp(
-        ValueError, r'Unable to broadcast: unknown rank'):
-      ragged.add(x, y)
-
-  @parameterized.parameters([
-      dict(
-          x=ragged.constant_value([[1, 2], [3]]),
-          y=[[10]],
-          expected=[[11, 12], [13]]),
-      dict(
-          x=ragged.constant_value([[[1, 2], [3, 4]], [[5]]], ragged_rank=2),
-          y=ragged.constant_value([[[10], [20]], [[30]]], ragged_rank=1),
-          expected=[[[11, 12], [23, 24]], [[35]]]),
-      dict(
-          x=ragged.constant_value([[[1]]]),
-          y=ragged.constant_value([[1]]),
-          expected=[[[2]]]),
-  ])
-  def testBroadcastAdd(self, x, y, expected):
-    x = ragged.convert_to_tensor_or_ragged_tensor(x, dtype=dtypes.int32)
-    y = ragged.convert_to_tensor_or_ragged_tensor(y, dtype=dtypes.int32)
-    result = x + y
-    with self.cached_session():
-      self.assertEqual(result.eval().tolist(), expected)
-
-  def testShapeMismatch(self):
-    x = ragged.constant([[1, 2, 3], [4, 5]])
-    y = ragged.constant([[1, 2, 3], [4, 5, 6]])
-    with self.assertRaisesRegexp(errors.InvalidArgumentError,
-                                 'Incompatible shapes'):
-      with self.cached_session():
-        ragged.add(x, y).eval()
-
-  def testDocstring(self):
-    self.assertRegexpMatches(
-        ragged.add.__doc__,
-        'Ragged version of the elementwise operation `tf.math.add`')
-    self.assertEqual(ragged.add.__name__, 'add')
-
-
-if __name__ == '__main__':
-  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_expand_dims_op_test.py b/tensorflow/python/ops/ragged/ragged_expand_dims_op_test.py
index 3ff66973b6f4968d3a1ca2080edf362b4f1cc609..c747bb304964b1fade5ddd701375a9e91de89c9e 100644
--- a/tensorflow/python/ops/ragged/ragged_expand_dims_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_expand_dims_op_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for ragged.expand_dims."""
+"""Tests for ragged_array_ops.expand_dims."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -21,16 +21,19 @@ from __future__ import print_function
 from absl.testing import parameterized
 
 from tensorflow.python.framework import test_util
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_array_ops
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
 
-class RaggedExpandDimsOpTest(test_util.TensorFlowTestCase,
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedExpandDimsOpTest(ragged_test_util.RaggedTensorTestCase,
                              parameterized.TestCase):
 
   # An example 4-d ragged tensor with shape [3, (D2), (D3), 2], and the
   # expected result calling for expand_dims on each axis.  c.f. the table of
-  # expected result shapes in the ragged.expand_dims docstring.
+  # expected result shapes in the ragged_array_ops.expand_dims docstring.
   EXAMPLE4D = [[[[1, 1], [2, 2]], [[3, 3]]],
                [],
                [[], [[4, 4], [5, 5], [6, 6]]]]  # pyformat: disable
@@ -105,21 +108,19 @@ class RaggedExpandDimsOpTest(test_util.TensorFlowTestCase,
            expected=EXAMPLE4D_EXPAND_AXIS[4],
            expected_shape=[3, None, None, 2, 1]),
   ])  # pyformat: disable
-  @test_util.run_deprecated_v1
   def testRaggedExpandDims(self,
                            rt_input,
                            axis,
                            expected,
                            ragged_rank=None,
                            expected_shape=None):
-    rt = ragged.constant(rt_input, ragged_rank=ragged_rank)
-    expanded = ragged.expand_dims(rt, axis=axis)
+    rt = ragged_factory_ops.constant(rt_input, ragged_rank=ragged_rank)
+    expanded = ragged_array_ops.expand_dims(rt, axis=axis)
     self.assertEqual(expanded.shape.ndims, rt.shape.ndims + 1)
     if expected_shape is not None:
       self.assertEqual(expanded.shape.as_list(), expected_shape)
 
-    with self.test_session():
-      self.assertEqual(expanded.eval().tolist(), expected)
+    self.assertRaggedEqual(expanded, expected)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_factory_ops.py b/tensorflow/python/ops/ragged/ragged_factory_ops.py
index d1f301bc58f12831e3d8c9da2cfc494bbd5294a5..8cda98765bb1759f156693e759de73f1e2acad6c 100644
--- a/tensorflow/python/ops/ragged/ragged_factory_ops.py
+++ b/tensorflow/python/ops/ragged/ragged_factory_ops.py
@@ -21,18 +21,16 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.ragged import ragged_tensor
 from tensorflow.python.ops.ragged import ragged_tensor_value
+from tensorflow.python.util.tf_export import tf_export
 
 
 #===============================================================================
 # Op to construct a constant RaggedTensor from a nested Python list.
 #===============================================================================
+@tf_export("ragged.constant")
 def constant(pylist, dtype=None, ragged_rank=None, inner_shape=None, name=None):
   """Constructs a constant RaggedTensor from a nested Python list.
 
@@ -56,8 +54,8 @@ def constant(pylist, dtype=None, ragged_rank=None, inner_shape=None, name=None):
       `pylist`.
     ragged_rank: An integer specifying the ragged rank of the returned
       `RaggedTensor`.  Must be nonnegative and less than `K`. Defaults to
-      `max(0, K - 1)` if `inner_shape` is not specified.  Defaults to
-      `max(0, K - 1 - len(inner_shape))` if `inner_shape` is specified.
+      `max(0, K - 1)` if `inner_shape` is not specified.  Defaults to `max(0, K
+      - 1 - len(inner_shape))` if `inner_shape` is specified.
     inner_shape: A tuple of integers specifying the shape for individual inner
       values in the returned `RaggedTensor`.  Defaults to `()` if `ragged_rank`
       is not specified.  If `ragged_rank` is specified, then a default is chosen
@@ -72,17 +70,19 @@ def constant(pylist, dtype=None, ragged_rank=None, inner_shape=None, name=None):
     ValueError: If the scalar values in `pylist` have inconsistent nesting
       depth; or if ragged_rank or inner_shape are incompatible with `pylist`.
   """
-  with ops.name_scope(name, 'RaggedConstant'):
-    return _constant_value(from_row_splits, constant_op.constant, pylist, dtype,
-                           ragged_rank, inner_shape)
+  with ops.name_scope(name, "RaggedConstant"):
+    return _constant_value(ragged_tensor.RaggedTensor.from_row_splits,
+                           constant_op.constant, pylist, dtype, ragged_rank,
+                           inner_shape)
 
 
+@tf_export(v1=["ragged.constant_value"])
 def constant_value(pylist, dtype=None, ragged_rank=None, inner_shape=None):
   """Constructs a RaggedTensorValue from a nested Python list.
 
-  > Warning: This function returns a `RaggedTensorValue`, not a `RaggedTensor`.
-  > If you wish to construct a constant `RaggedTensor`, use
-  > [`ragged.constant(...)`](constant.md) instead.
+  Warning: This function returns a `RaggedTensorValue`, not a `RaggedTensor`.
+  If you wish to construct a constant `RaggedTensor`, use
+  [`ragged.constant(...)`](constant.md) instead.
 
   Example:
 
@@ -153,29 +153,29 @@ def _constant_value(ragged_factory, inner_factory, pylist, dtype, ragged_rank,
       depth; or if ragged_rank or inner_shape are incompatible with `pylist`.
   """
   if ragged_tensor.is_ragged(pylist):
-    raise TypeError('pylist may not be a RaggedTensor or RaggedTensorValue.')
+    raise TypeError("pylist may not be a RaggedTensor or RaggedTensorValue.")
 
   if not isinstance(pylist, (list, tuple)):
     # Scalar value
     if ragged_rank is not None and ragged_rank != 0:
-      raise ValueError('Invalid pylist=%r: incompatible with ragged_rank=%d' %
+      raise ValueError("Invalid pylist=%r: incompatible with ragged_rank=%d" %
                        (pylist, ragged_rank))
     if inner_shape is not None and inner_shape:
       raise ValueError(
-          'Invalid pylist=%r: incompatible with dim(inner_shape)=%d' %
+          "Invalid pylist=%r: incompatible with dim(inner_shape)=%d" %
           (pylist, len(inner_shape)))
     return inner_factory(pylist, dtype, ())
 
   if ragged_rank is not None and ragged_rank < 0:
     raise ValueError(
-        'Invalid ragged_rank=%r: must be nonnegative' % ragged_rank)
+        "Invalid ragged_rank=%r: must be nonnegative" % ragged_rank)
 
   # Find the depth of scalar values in `pylist`.
   scalar_depth, max_depth = _find_scalar_and_max_depth(pylist)
   if scalar_depth is not None:
     if max_depth > scalar_depth:
-      raise ValueError('Invalid pylist=%r: empty list nesting is greater '
-                       'than scalar value nesting' % pylist)
+      raise ValueError("Invalid pylist=%r: empty list nesting is greater "
+                       "than scalar value nesting" % pylist)
 
   # If both inner_shape and ragged_rank were specified, then check that
   # they are compatible with pylist.
@@ -184,8 +184,8 @@ def _constant_value(ragged_factory, inner_factory, pylist, dtype, ragged_rank,
     if ((scalar_depth is not None and expected_depth != scalar_depth) or
         (scalar_depth is None and expected_depth < max_depth)):
       raise ValueError(
-          'Invalid pylist=%r: incompatible with ragged_rank=%d '
-          'and dim(inner_shape)=%d' % (pylist, ragged_rank, len(inner_shape)))
+          "Invalid pylist=%r: incompatible with ragged_rank=%d "
+          "and dim(inner_shape)=%d" % (pylist, ragged_rank, len(inner_shape)))
 
   # Check if the result is a `Tensor`.
   if (ragged_rank == 0 or
@@ -221,7 +221,7 @@ def _constant_value(ragged_factory, inner_factory, pylist, dtype, ragged_rank,
     values = concatenated_values
 
   values = inner_factory(
-      values, dtype=dtype, shape=(len(values),) + inner_shape, name='values')
+      values, dtype=dtype, shape=(len(values),) + inner_shape, name="values")
   for row_splits in reversed(nested_splits):
     values = ragged_factory(values, row_splits)
   return values
@@ -249,7 +249,7 @@ def _find_scalar_and_max_depth(pylist):
       child_scalar_depth, child_max_depth = _find_scalar_and_max_depth(child)
       if child_scalar_depth is not None:
         if scalar_depth is not None and scalar_depth != child_scalar_depth + 1:
-          raise ValueError('all scalar values must have the same nesting depth')
+          raise ValueError("all scalar values must have the same nesting depth")
         scalar_depth = child_scalar_depth + 1
       max_depth = max(max_depth, child_max_depth + 1)
     return (scalar_depth, max_depth)
@@ -273,436 +273,24 @@ def _default_inner_shape_for_pylist(pylist, ragged_rank):
     """Checks that `item` has a consistent shape matching `shape`."""
     is_nested = isinstance(item, (list, tuple))
     if is_nested != bool(shape):
-      raise ValueError('inner values have inconsistent shape')
+      raise ValueError("inner values have inconsistent shape")
     if is_nested:
       if shape[0] != len(item):
-        raise ValueError('inner values have inconsistent shape')
+        raise ValueError("inner values have inconsistent shape")
       for child in item:
         check_inner_shape(child, shape[1:])
 
   # Collapse the ragged layers to get the list of inner values.
-  inner_values = pylist
+  flat_values = pylist
   for dim in range(ragged_rank):
-    if not all(isinstance(v, (list, tuple)) for v in inner_values):
-      raise ValueError('pylist has scalar values depth %d, but ragged_rank=%d '
-                       'requires scalar value depth greater than %d' %
+    if not all(isinstance(v, (list, tuple)) for v in flat_values):
+      raise ValueError("pylist has scalar values depth %d, but ragged_rank=%d "
+                       "requires scalar value depth greater than %d" %
                        (dim + 1, ragged_rank, ragged_rank))
-    inner_values = sum((list(v) for v in inner_values), [])
+    flat_values = sum((list(v) for v in flat_values), [])
 
   # Compute the inner shape looking only at the leftmost elements; and then
   # use check_inner_shape to verify that other elements have the same shape.
-  inner_shape = get_inner_shape(inner_values)
-  check_inner_shape(inner_values, inner_shape)
+  inner_shape = get_inner_shape(flat_values)
+  check_inner_shape(flat_values, inner_shape)
   return inner_shape[1:]
-
-
-#===============================================================================
-# Convert value -> tensor
-#===============================================================================
-def convert_to_tensor_or_ragged_tensor(value,
-                                       dtype=None,
-                                       preferred_dtype=None,
-                                       name=None):
-  """Converts value to a `RaggedTensor` or `Tensor`.
-
-  * If `value` is a `RaggedTensor`, then return it as-is.
-  * If `value` is a `RaggedTensorValue`, return a corresponding constant
-    `RaggedTensor`.
-  * Otherwise, use `convert_to_tensor` to convert `value` to a `Tensor`.
-
-  Args:
-    value: A `RaggedTensor`, a `RaggedTensorValue`, or an object whose type has
-      a registered `Tensor` conversion function.
-    dtype: Optional element type for the returned tensor.  If missing the type
-      is inferred from the type of `value`.
-    preferred_dtype: Optional element type for the returned tensor, used when
-      dtype is None.  This argument has no effect if `value` is already a
-      tensor, or when conversion is not possible.
-    name: Optional name to use if a new `Tensor` is created.
-
-  Returns:
-    A `Tensor` or `RaggedTensor`.
-  """
-  if isinstance(value, ragged_tensor.RaggedTensor):
-    if dtype and not dtype.is_compatible_with(value.dtype):
-      raise ValueError('Tensor conversion requested dtype %s for '
-                       'RaggedTensor with dtype %s: %r' %
-                       (dtype.name, value.dtype.name, value))
-    return value
-  elif isinstance(value, ragged_tensor_value.RaggedTensorValue):
-    with ops.name_scope(name, 'ConvertToTensorOrRaggedTensor', []):
-      inner_values = ops.convert_to_tensor(
-          value=value.inner_values,
-          dtype=dtype,
-          preferred_dtype=preferred_dtype,
-          name='inner_values')
-      return from_nested_row_splits(inner_values, value.nested_row_splits)
-  else:
-    return ops.convert_to_tensor(
-        value=value, dtype=dtype, preferred_dtype=preferred_dtype, name=name)
-
-
-#===============================================================================
-# Ops to construct RaggedTensor from row-partitioned values.
-#===============================================================================
-
-
-def from_value_rowids(values, value_rowids, nrows=None, name=None):
-  """Creates a `RaggedTensor` with rows partitioned by `value_rowids`.
-
-  The returned `RaggedTensor` corresponds with the python list defined by:
-
-  ```python
-  result = [[values[i] for i in range(len(values)) if value_rowids[i] == row]
-            for row in range(nrows)]
-  ```
-
-  Warning: currently, this needs to cast value_rowids to int64 before
-  converting, since `tf.bincount` only supports `int32`.
-
-  Args:
-    values: A potentially ragged tensor with shape `[nvals, ...]`.
-    value_rowids: A 1-D int64 tensor with shape `[nvals]`, which corresponds
-      one-to-one with `values`, and specifies each value's row index.  Must be
-      nonnegative, and must be sorted in ascending order.
-    nrows: An int64 scalar specifying the number of rows.  This should be
-      specified if the `RaggedTensor` may containing empty training rows.  Must
-      be greater than `value_rowids[-1]` (or zero if `value_rowids` is empty).
-      Defaults to `value_rowids[-1]` (or zero if `value_rowids` is empty).
-    name: A name prefix for the RaggedTensor (optional).
-
-  Returns:
-    A `RaggedTensor`.  `result.rank = values.rank + 1`.
-    `result.ragged_rank = values.ragged_rank + 1`.
-
-  Raises:
-    ValueError: If `nrows` is incompatible with `value_rowids`.
-
-  #### Example:
-    ```python
-    >>> rt = ragged.from_value_rowids(
-    ...     values=[3, 1, 4, 1, 5, 9, 2, 6],
-    ...     value_rowids=[0, 0, 0, 0, 2, 2, 2, 3],
-    ...     nrows=5)
-    >>> rt.eval().tolist()
-    [[3, 1, 4, 1], [], [5, 9, 2], [6], []]
-    ```
-  """
-  with ops.name_scope(name, 'RaggedFromValueRowIds',
-                      [values, value_rowids, nrows]):
-    values = convert_to_tensor_or_ragged_tensor(values, name='values')
-    value_rowids = ops.convert_to_tensor(
-        value_rowids, dtypes.int64, name='value_rowids')
-    if nrows is None:
-      const_rowids = tensor_util.constant_value(value_rowids)
-      if const_rowids is None:
-        nrows = array_ops.concat([value_rowids[-1:], [-1]], axis=0)[0] + 1
-        const_nrows = None
-      else:
-        const_nrows = const_rowids[-1] + 1 if const_rowids.size > 0 else 0
-        nrows = ops.convert_to_tensor(const_nrows, dtypes.int64, name='nrows')
-    else:
-      nrows = ops.convert_to_tensor(nrows, dtypes.int64, 'nrows')
-      const_nrows = tensor_util.constant_value(nrows)
-      if const_nrows is not None:
-        if const_nrows < 0:
-          raise ValueError('Expected nrows >= 0; got %d' % const_nrows)
-        const_rowids = tensor_util.constant_value(value_rowids)
-        if const_rowids is not None and const_rowids.size > 0:
-          if not const_nrows >= const_rowids[-1] + 1:
-            raise ValueError(
-                'Expected nrows >= value_rowids[-1] + 1; got nrows=%d, '
-                'value_rowids[-1]=%d' % (const_nrows, const_rowids[-1]))
-
-    value_rowids.shape.assert_has_rank(1)
-    nrows.shape.assert_has_rank(0)
-    values.shape[:1].assert_is_compatible_with(value_rowids.shape)
-
-    # Convert value_rowids & nrows to row_splits.
-    # Note: we don't use segment_ids_to_row_splits() here because we want
-    # to save the intermediate value `row_lengths`, so we can cache it.
-    # TODO(b/116708836) Upgrade bincount to accept int64 so we can skip the cast
-    # (Remove the warning in the docstring when we do.)
-    value_rowids_int32 = math_ops.cast(value_rowids, dtypes.int32)
-    nrows_int32 = math_ops.cast(nrows, dtypes.int32)
-    row_lengths = math_ops.bincount(
-        value_rowids_int32,
-        minlength=nrows_int32,
-        maxlength=nrows_int32,
-        dtype=dtypes.int64)
-    row_splits = array_ops.concat([[0], math_ops.cumsum(row_lengths)], axis=0)
-    if const_nrows is not None:
-      row_lengths.set_shape([const_nrows])
-      row_splits.set_shape([const_nrows + 1])
-
-    return ragged_tensor.RaggedTensor(
-        values,
-        row_splits,
-        cached_row_lengths=row_lengths,
-        cached_value_rowids=value_rowids,
-        cached_nrows=nrows,
-        internal=True)
-
-
-def from_row_splits(values, row_splits, name=None):
-  """Creates a `RaggedTensor` with rows partitioned by `row_splits`.
-
-  The returned `RaggedTensor` corresponds with the python list defined by:
-
-  ```python
-  result = [values[row_splits[i]:row_splits[i + 1]]
-            for i in range(len(row_splits) - 1)]
-  ```
-
-  Args:
-    values: A potentially ragged tensor with shape `[nvals, ...]`.
-    row_splits: A 1-D int64 tensor with shape `[nrows+1]`.  Must not be empty,
-      and must be sorted in ascending order.  `row_splits[0]` must be zero and
-      `row_splits[-1]` must be `nvals`.
-    name: A name prefix for the RaggedTensor (optional).
-
-  Returns:
-    A `RaggedTensor`.  `result.rank = values.rank + 1`.
-    `result.ragged_rank = values.ragged_rank + 1`.
-
-  Raises:
-    ValueError: If `row_splits` is an empty list.
-
-  #### Example:
-    ```python
-    >>> rt = ragged.from_row_splits(
-    ...     values=[3, 1, 4, 1, 5, 9, 2, 6],
-    ...     row_splits=[0, 4, 4, 7, 8, 8])
-    >>> rt.eval().tolist()
-    [[3, 1, 4, 1], [], [5, 9, 2], [6], []]
-    ```
-  """
-  if isinstance(row_splits, (list, tuple)) and not row_splits:
-    raise ValueError('row_splits tensor may not be empty.')
-  with ops.name_scope(name, 'RaggedFromRowSplits', [values, row_splits]):
-    values = convert_to_tensor_or_ragged_tensor(values, name='values')
-    row_splits = ops.convert_to_tensor(row_splits, dtypes.int64, 'row_splits')
-    row_splits.shape.assert_has_rank(1)
-    return ragged_tensor.RaggedTensor(
-        values=values, row_splits=row_splits, internal=True)
-
-
-def from_row_lengths(values, row_lengths, name=None):
-  """Creates a `RaggedTensor` with rows partitioned by `row_lengths`.
-
-  The returned `RaggedTensor` corresponds with the python list defined by:
-
-  ```python
-  result = [[values.pop(0) for i in range(length)]
-            for length in row_lengths]
-  ```
-
-  Args:
-    values: A potentially ragged tensor with shape `[nvals, ...]`.
-    row_lengths: A 1-D int64 tensor with shape `[nrows]`.  Must be nonnegative.
-      `sum(row_lengths)` must be `nvals`.
-    name: A name prefix for the RaggedTensor (optional).
-
-  Returns:
-    A `RaggedTensor`.  `result.rank = values.rank + 1`.
-    `result.ragged_rank = values.ragged_rank + 1`.
-
-  #### Example:
-    ```python
-    >>> rt = ragged.from_row_lengths(
-    ...     values=[3, 1, 4, 1, 5, 9, 2, 6],
-    ...     row_lengths=[4, 0, 3, 1, 0])
-    >>> rt.eval().tolist()
-    [[3, 1, 4, 1], [], [5, 9, 2], [6], []]
-    ```
-  """
-  with ops.name_scope(name, 'RaggedFromRowLengths', [values, row_lengths]):
-    values = convert_to_tensor_or_ragged_tensor(values, name='values')
-    row_lengths = ops.convert_to_tensor(row_lengths, dtypes.int64,
-                                        'row_lengths')
-    row_lengths.shape.assert_has_rank(1)
-    row_limits = math_ops.cumsum(row_lengths)
-    row_splits = array_ops.concat([[0], row_limits], axis=0)
-    return ragged_tensor.RaggedTensor(
-        values=values,
-        row_splits=row_splits,
-        cached_row_lengths=row_lengths,
-        internal=True)
-
-
-def from_row_starts(values, row_starts, name=None):
-  """Creates a `RaggedTensor` with rows partitioned by `row_starts`.
-
-  Equivalent to: `from_row_splits(values, concat([row_starts, nvals]))`.
-
-  Args:
-    values: A potentially ragged tensor with shape `[nvals, ...]`.
-    row_starts: A 1-D int64 tensor with shape `[nrows]`.  Must be nonnegative
-      and sorted in ascending order.  If `nrows>0`, then `row_starts[0]` must be
-      zero.
-    name: A name prefix for the RaggedTensor (optional).
-
-  Returns:
-    A `RaggedTensor`.  `result.rank = values.rank + 1`.
-    `result.ragged_rank = values.ragged_rank + 1`.
-
-  #### Example:
-    ```python
-    >>> rt = ragged.from_row_starts(
-    ...     values=[3, 1, 4, 1, 5, 9, 2, 6],
-    ...     row_starts=[0, 4, 4, 7, 8])
-    >>> rt.eval().tolist()
-    [[3, 1, 4, 1], [], [5, 9, 2], [6], []]
-    ```
-  """
-  with ops.name_scope(name, 'RaggedFromRowStarts', [values, row_starts]):
-    values = convert_to_tensor_or_ragged_tensor(values, name='values')
-    row_starts = ops.convert_to_tensor(row_starts, dtypes.int64, 'row_starts')
-    row_starts.shape.assert_has_rank(1)
-    nvals = array_ops.shape(values, out_type=dtypes.int64)[:1]
-    row_splits = array_ops.concat([row_starts, nvals], axis=0)
-    return ragged_tensor.RaggedTensor(
-        values=values, row_splits=row_splits, internal=True)
-
-
-def from_row_limits(values, row_limits, name=None):
-  """Creates a `RaggedTensor` with rows partitioned by `row_limits`.
-
-  Equivalent to: `from_row_splits(values, concat([0, row_limits]))`.
-
-  Args:
-    values: A potentially ragged tensor with shape `[nvals, ...]`.
-    row_limits: A 1-D int64 tensor with shape `[nrows]`.  Must be sorted in
-      ascending order.  If `nrows>0`, then `row_limits[-1]` must be `nvals`.
-    name: A name prefix for the RaggedTensor (optional).
-
-  Returns:
-    A `RaggedTensor`.  `result.rank = values.rank + 1`.
-    `result.ragged_rank = values.ragged_rank + 1`.
-
-  #### Example:
-    ```python
-    >>> rt = ragged.from_row_limits(
-    ...     values=[3, 1, 4, 1, 5, 9, 2, 6],
-    ...     row_limits=[4, 4, 7, 8, 8])
-    >>> rt.eval().tolist()
-    [[3, 1, 4, 1], [], [5, 9, 2], [6], []]
-    ```
-  """
-  with ops.name_scope(name, 'RaggedFromRowLimits', [values, row_limits]):
-    values = convert_to_tensor_or_ragged_tensor(values, name='values')
-    row_limits = ops.convert_to_tensor(row_limits, dtypes.int64, 'row_limits')
-    row_limits.shape.assert_has_rank(1)
-    zero = array_ops.zeros([1], dtypes.int64)
-    row_splits = array_ops.concat([zero, row_limits], axis=0)
-    return ragged_tensor.RaggedTensor(
-        values=values, row_splits=row_splits, internal=True)
-
-
-def from_nested_value_rowids(inner_values,
-                             nested_value_rowids,
-                             nested_nrows=None,
-                             name=None):
-  """Creates a `RaggedTensor` from a nested list of `value_rowids` tensors.
-
-  Equivalent to:
-
-  ```python
-  result = inner_values
-  for (value_rowids, nrows) in reversed(zip(nested_value_rowids, nested_nrows)):
-    result = from_value_rowids(result, value_rowids, nrows)
-  ```
-
-  Args:
-    inner_values: A potentially ragged tensor.
-    nested_value_rowids: A list of 1-D int64 tensors.  The `i`th tensor is used
-      as the `value_rowids` for the `i`th ragged dimension.
-    nested_nrows: A list of int64 scalars.  The `i`th scalar is used as the
-      `nrows` for the `i`th ragged dimension.
-    name: A name prefix for the RaggedTensor (optional).
-
-  Returns:
-    A `RaggedTensor` (or `inner_values` if `nested_value_rowids` is empty).
-
-  Raises:
-    ValueError: If `len(nested_values_rowids) != len(nested_nrows)`.
-  """
-  if isinstance(nested_value_rowids, ops.Tensor):
-    raise TypeError('nested_value_rowids must be a list of Tensors')
-  if nested_nrows is None:
-    nested_nrows = [None] * len(nested_value_rowids)
-  else:
-    if isinstance(nested_nrows, ops.Tensor):
-      raise TypeError('nested_nrows must be a list of Tensors')
-    if len(nested_nrows) != len(nested_value_rowids):
-      raise ValueError('nested_nrows must have the same length as '
-                       'nested_value_rowids')
-
-  with ops.name_scope(
-      name, 'RaggedFromNestedValueRowIds',
-      [inner_values] + list(nested_value_rowids) + list(nested_nrows)):
-    result = inner_values
-    for value_rowids, nrows in reversed(
-        list(zip(nested_value_rowids, nested_nrows))):
-      result = from_value_rowids(result, value_rowids, nrows)
-    return result
-
-
-def from_nested_row_splits(inner_values, nested_row_splits, name=None):
-  """Creates a `RaggedTensor` from a nested list of `row_splits` tensors.
-
-  Equivalent to:
-
-  ```python
-  result = inner_values
-  for row_splits in reversed(nested_row_splits):
-    result = from_row_splits(result, row_splits)
-  ```
-
-  Args:
-    inner_values: A potentially ragged tensor.
-    nested_row_splits: A list of 1-D int64 tensors.  The `i`th tensor is used as
-      the `row_splits` for the `i`th ragged dimension.
-    name: A name prefix for the RaggedTensor (optional).
-
-  Returns:
-    A `RaggedTensor` (or `inner_values` if `nested_row_splits` is empty).
-  """
-  if isinstance(nested_row_splits, ops.Tensor):
-    raise TypeError('nested_row_splits must be a list of Tensors')
-  with ops.name_scope(name, 'RaggedFromNestedRowSplits',
-                      [inner_values] + list(nested_row_splits)):
-    result = inner_values
-    for splits in reversed(nested_row_splits):
-      result = from_row_splits(result, splits)
-    return result
-
-
-def from_nested_row_lengths(inner_values, nested_row_lengths, name=None):
-  """Creates a `RaggedTensor` from a nested list of `row_lengths` tensors.
-
-  Equivalent to:
-
-  ```python
-  result = inner_values
-  for row_lengths in reversed(nested_row_lengths):
-    result = from_row_lengths(result, row_lengths)
-  ```
-
-  Args:
-    inner_values: A potentially ragged tensor.
-    nested_row_lengths: A list of 1-D int64 tensors.  The `i`th tensor is used
-      as the `row_lengths` for the `i`th ragged dimension.
-    name: A name prefix for the RaggedTensor (optional).
-
-  Returns:
-    A `RaggedTensor` (or `inner_values` if `nested_row_lengths` is empty).
-  """
-  if isinstance(nested_row_lengths, ops.Tensor):
-    raise TypeError('nested_row_lengths must be a list of Tensors')
-  with ops.name_scope(name, 'RaggedFromNestedRowlengths',
-                      [inner_values] + list(nested_row_lengths)):
-    result = inner_values
-    for lengths in reversed(nested_row_lengths):
-      result = from_row_lengths(result, lengths)
-    return result
diff --git a/tensorflow/python/ops/ragged/ragged_from_sparse_op_test.py b/tensorflow/python/ops/ragged/ragged_from_sparse_op_test.py
index 3c0db9e8fb6cac8de232aa61fe95be5cc4080360..c6998e274bed1bae78a156751785c7bb10a90abd 100644
--- a/tensorflow/python/ops/ragged/ragged_from_sparse_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_from_sparse_op_test.py
@@ -12,77 +12,77 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for ragged.from_sparse."""
+"""Tests for RaggedTensor.from_sparse."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.ops.ragged.ragged_tensor import RaggedTensor
 from tensorflow.python.platform import googletest
 
 
-class RaggedTensorToSparseOpTest(test_util.TensorFlowTestCase):
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedTensorToSparseOpTest(ragged_test_util.RaggedTensorTestCase):
 
-  @test_util.run_deprecated_v1
   def testDocStringExample(self):
     st = sparse_tensor.SparseTensor(
         indices=[[0, 0], [0, 1], [0, 2], [1, 0], [3, 0]],
         values=[1, 2, 3, 4, 5],
         dense_shape=[4, 3])
-    rt = ragged.from_sparse(st)
+    rt = RaggedTensor.from_sparse(st)
 
-    with self.test_session():
-      self.assertEqual(rt.eval().tolist(), [[1, 2, 3], [4], [], [5]])
+    self.assertRaggedEqual(rt, [[1, 2, 3], [4], [], [5]])
 
-  @test_util.run_deprecated_v1
   def testEmpty(self):
     st = sparse_tensor.SparseTensor(
         indices=array_ops.zeros([0, 2], dtype=dtypes.int64),
         values=[],
         dense_shape=[4, 3])
-    rt = ragged.from_sparse(st)
+    rt = RaggedTensor.from_sparse(st)
 
-    with self.test_session():
-      self.assertEqual(rt.eval().tolist(), [[], [], [], []])
+    self.assertRaggedEqual(rt, [[], [], [], []])
 
-  @test_util.run_deprecated_v1
   def testBadSparseTensorRank(self):
     st1 = sparse_tensor.SparseTensor(indices=[[0]], values=[0], dense_shape=[3])
+    self.assertRaisesRegexp(ValueError, r'rank\(st_input\) must be 2',
+                            RaggedTensor.from_sparse, st1)
+
     st2 = sparse_tensor.SparseTensor(
         indices=[[0, 0, 0]], values=[0], dense_shape=[3, 3, 3])
-    st3 = sparse_tensor.SparseTensor(
-        indices=array_ops.placeholder(dtypes.int64),
-        values=[0],
-        dense_shape=array_ops.placeholder(dtypes.int64))
-    self.assertRaisesRegexp(ValueError, r'rank\(st_input\) must be 2',
-                            ragged.from_sparse, st1)
-    self.assertRaisesRegexp(ValueError, r'rank\(st_input\) must be 2',
-                            ragged.from_sparse, st2)
     self.assertRaisesRegexp(ValueError, r'rank\(st_input\) must be 2',
-                            ragged.from_sparse, st3)
+                            RaggedTensor.from_sparse, st2)
+
+    if not context.executing_eagerly():
+      st3 = sparse_tensor.SparseTensor(
+          indices=array_ops.placeholder(dtypes.int64),
+          values=[0],
+          dense_shape=array_ops.placeholder(dtypes.int64))
+      self.assertRaisesRegexp(ValueError, r'rank\(st_input\) must be 2',
+                              RaggedTensor.from_sparse, st3)
 
-  @test_util.run_deprecated_v1
   def testGoodPartialSparseTensorRank(self):
-    st1 = sparse_tensor.SparseTensor(
-        indices=[[0, 0]],
-        values=[0],
-        dense_shape=array_ops.placeholder(dtypes.int64))
-    st2 = sparse_tensor.SparseTensor(
-        indices=array_ops.placeholder(dtypes.int64),
-        values=[0],
-        dense_shape=[4, 3])
+    if not context.executing_eagerly():
+      st1 = sparse_tensor.SparseTensor(
+          indices=[[0, 0]],
+          values=[0],
+          dense_shape=array_ops.placeholder(dtypes.int64))
+      st2 = sparse_tensor.SparseTensor(
+          indices=array_ops.placeholder(dtypes.int64),
+          values=[0],
+          dense_shape=[4, 3])
 
-    # Shouldn't throw ValueError
-    ragged.from_sparse(st1)
-    ragged.from_sparse(st2)
+      # Shouldn't throw ValueError
+      RaggedTensor.from_sparse(st1)
+      RaggedTensor.from_sparse(st2)
 
-  @test_util.run_deprecated_v1
   def testNonRaggedSparseTensor(self):
     # "index_suffix" means the value of the innermost dimension of the index
     # (i.e., indices[i][-1]).
@@ -92,22 +92,21 @@ class RaggedTensorToSparseOpTest(test_util.TensorFlowTestCase):
     # index_suffix of first index is not zero.
     st1 = sparse_tensor.SparseTensor(
         indices=[[0, 1], [0, 2], [2, 0]], values=[1, 2, 3], dense_shape=[3, 3])
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 r'.*SparseTensor is not right-ragged'):
+      self.evaluate(RaggedTensor.from_sparse(st1))
     # index_suffix of an index that starts a new row is not zero.
     st2 = sparse_tensor.SparseTensor(
         indices=[[0, 0], [0, 1], [2, 1]], values=[1, 2, 3], dense_shape=[3, 3])
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 r'.*SparseTensor is not right-ragged'):
+      self.evaluate(RaggedTensor.from_sparse(st2))
     # index_suffix of an index that continues a row skips a cell.
     st3 = sparse_tensor.SparseTensor(
         indices=[[0, 1], [0, 1], [0, 3]], values=[1, 2, 3], dense_shape=[3, 3])
-    rt1 = ragged.from_sparse(st1)
-    rt2 = ragged.from_sparse(st2)
-    rt3 = ragged.from_sparse(st3)
-    with self.test_session():
-      self.assertRaisesRegexp(errors.InvalidArgumentError,
-                              r'.*SparseTensor is not right-ragged', rt1.eval)
-      self.assertRaisesRegexp(errors.InvalidArgumentError,
-                              r'.*SparseTensor is not right-ragged', rt2.eval)
-      self.assertRaisesRegexp(errors.InvalidArgumentError,
-                              r'.*SparseTensor is not right-ragged', rt3.eval)
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 r'.*SparseTensor is not right-ragged'):
+      self.evaluate(RaggedTensor.from_sparse(st3))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_from_tensor_op_test.py b/tensorflow/python/ops/ragged/ragged_from_tensor_op_test.py
index 1d8a00cc18df3521235eccee73dc0361d6652fe1..68d3953f4cdf31458fc75397522b3f9fc8960098 100644
--- a/tensorflow/python/ops/ragged/ragged_from_tensor_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_from_tensor_op_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for ragged.from_tensor."""
+"""Tests for RaggedTensor.from_tensor."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -24,29 +24,26 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.ops.ragged.ragged_tensor import RaggedTensor
 from tensorflow.python.platform import googletest
 
 
-class RaggedFromTensorOpTest(test_util.TensorFlowTestCase,
-                             parameterized.TestCase):
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedTensorToSparseOpTest(ragged_test_util.RaggedTensorTestCase,
+                                 parameterized.TestCase):
 
-  @test_util.run_deprecated_v1
   def testDocStringExamples(self):
-    # The examples from ragged.from_tensor.__doc__.
+    # The examples from RaggedTensor.from_tensor.__doc__.
     dt = constant_op.constant([[5, 7, 0], [0, 3, 0], [6, 0, 0]])
-    with self.test_session():
-      self.assertEqual(
-          ragged.from_tensor(dt).eval().tolist(),
-          [[5, 7, 0], [0, 3, 0], [6, 0, 0]])
+    self.assertRaggedEqual(
+        RaggedTensor.from_tensor(dt), [[5, 7, 0], [0, 3, 0], [6, 0, 0]])
 
-      self.assertEqual(
-          ragged.from_tensor(dt, lengths=[1, 0, 3]).eval().tolist(),
-          [[5], [], [6, 0, 0]])
+    self.assertRaggedEqual(
+        RaggedTensor.from_tensor(dt, lengths=[1, 0, 3]), [[5], [], [6, 0, 0]])
 
-      self.assertEqual(
-          ragged.from_tensor(dt, padding=0).eval().tolist(),
-          [[5, 7], [0, 3], [6]])
+    self.assertRaggedEqual(
+        RaggedTensor.from_tensor(dt, padding=0), [[5, 7], [0, 3], [6]])
 
   @parameterized.parameters(
       # 2D test cases, no length or padding.
@@ -263,7 +260,6 @@ class RaggedFromTensorOpTest(test_util.TensorFlowTestCase,
                        [[[5, 6], [7]], [[0, 8], []]]]
       },
   )  # pyformat: disable
-  @test_util.run_deprecated_v1
   def testRaggedFromTensor(self,
                            tensor,
                            expected,
@@ -271,30 +267,27 @@ class RaggedFromTensorOpTest(test_util.TensorFlowTestCase,
                            padding=None,
                            ragged_rank=1):
     dt = constant_op.constant(tensor)
-    rt = ragged.from_tensor(dt, lengths, padding, ragged_rank)
-    self.assertEqual(type(rt), ragged.RaggedTensor)
+    rt = RaggedTensor.from_tensor(dt, lengths, padding, ragged_rank)
+    self.assertEqual(type(rt), RaggedTensor)
     self.assertEqual(rt.ragged_rank, ragged_rank)
     self.assertTrue(
         dt.shape.is_compatible_with(rt.shape),
         '%s is incompatible with %s' % (dt.shape, rt.shape))
-    with self.test_session():
-      self.assertEqual(rt.eval().tolist(), expected)
+    self.assertRaggedEqual(rt, expected)
 
-  @test_util.run_deprecated_v1
   def testHighDimensions(self):
     # Use distinct prime numbers for all dimension shapes in this test, so
     # we can see any errors that are caused by mixing up dimension sizes.
     dt = array_ops.reshape(
         math_ops.range(3 * 5 * 7 * 11 * 13 * 17), [3, 5, 7, 11, 13, 17])
     for ragged_rank in range(1, 4):
-      rt = ragged.from_tensor(dt, ragged_rank=ragged_rank)
-      self.assertEqual(type(rt), ragged.RaggedTensor)
+      rt = RaggedTensor.from_tensor(dt, ragged_rank=ragged_rank)
+      self.assertEqual(type(rt), RaggedTensor)
       self.assertEqual(rt.ragged_rank, ragged_rank)
       self.assertTrue(
           dt.shape.is_compatible_with(rt.shape),
           '%s is incompatible with %s' % (dt.shape, rt.shape))
-      with self.test_session():
-        self.assertEqual(rt.eval().tolist(), self.evaluate(dt).tolist())
+      self.assertRaggedEqual(rt, self.evaluate(dt).tolist())
 
   @parameterized.parameters(
       # With no padding or lengths
@@ -398,15 +391,13 @@ class RaggedFromTensorOpTest(test_util.TensorFlowTestCase,
           'expected': [[], []]
       },
   )
-  @test_util.run_deprecated_v1
   def testEmpty(self, dt_shape, expected, lengths=None, padding=None):
     dt = array_ops.zeros(dt_shape)
-    rt = ragged.from_tensor(dt, lengths, padding)
-    self.assertEqual(type(rt), ragged.RaggedTensor)
+    rt = RaggedTensor.from_tensor(dt, lengths, padding)
+    self.assertEqual(type(rt), RaggedTensor)
     self.assertEqual(rt.ragged_rank, 1)
     self.assertTrue(dt.shape.is_compatible_with(rt.shape))
-    with self.test_session():
-      self.assertEqual(rt.eval().tolist(), expected)
+    self.assertRaggedEqual(rt, expected)
 
   @parameterized.parameters(
       {
@@ -423,7 +414,7 @@ class RaggedFromTensorOpTest(test_util.TensorFlowTestCase,
       {
           'tensor': [[1]],
           'padding': 'a',
-          'error': (TypeError, "Expected int32, got 'a'.*")
+          'error': (TypeError, '.*')
       },
       {
           'tensor': [[1]],
@@ -451,7 +442,6 @@ class RaggedFromTensorOpTest(test_util.TensorFlowTestCase,
           'error': (ValueError, r'ragged_rank must be greater than 0; got -1')
       },
   )
-  @test_util.run_deprecated_v1
   def testErrors(self,
                  tensor,
                  lengths=None,
@@ -459,8 +449,8 @@ class RaggedFromTensorOpTest(test_util.TensorFlowTestCase,
                  ragged_rank=1,
                  error=None):
     dt = constant_op.constant(tensor)
-    self.assertRaisesRegexp(error[0], error[1], ragged.from_tensor, dt, lengths,
-                            padding, ragged_rank)
+    self.assertRaisesRegexp(error[0], error[1], RaggedTensor.from_tensor, dt,
+                            lengths, padding, ragged_rank)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_functional_ops.py b/tensorflow/python/ops/ragged/ragged_functional_ops.py
index 6b71d88435c91d1c130c1c24a033ebcf4a7959cb..b6937a1c37940339f8ea451392b42718095c7e33 100644
--- a/tensorflow/python/ops/ragged/ragged_functional_ops.py
+++ b/tensorflow/python/ops/ragged/ragged_functional_ops.py
@@ -19,40 +19,41 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import ops
-from tensorflow.python.ops.ragged import ragged_factory_ops
 from tensorflow.python.ops.ragged import ragged_tensor
 from tensorflow.python.ops.ragged import ragged_util
+from tensorflow.python.util.tf_export import tf_export
 
 
-def map_inner_values(op, *args, **kwargs):
-  """Applies `op` to the inner values of one or more RaggedTensors.
+@tf_export("ragged.map_flat_values")
+def map_flat_values(op, *args, **kwargs):
+  """Applies `op` to the values of one or more RaggedTensors.
 
-  Replaces any `RaggedTensor` in `args` or `kwargs` with its `inner_values`
+  Replaces any `RaggedTensor` in `args` or `kwargs` with its `flat_values`
   tensor, and then calls `op`.  Returns a `RaggedTensor` that is constructed
-  from the input `RaggedTensor`s' `splits` and the value returned by
+  from the input `RaggedTensor`s' `nested_row_splits` and the value returned by
   the `op`.
 
   If the input arguments contain multiple `RaggedTensor`s, then they must have
-  identical `splits`.
+  identical `nested_row_splits`.
 
   Examples:
 
   ```python
   >>> rt = ragged.constant([[1, 2, 3], [], [4, 5], [6]])
-  >>> ragged.map_inner_values(tf.ones_like, rt).eval().tolist()
+  >>> ragged.map_flat_values(tf.ones_like, rt).eval().tolist()
   [[1, 1, 1], [], [1, 1], [1]]
-  >>> ragged.map_inner_values(tf.multiply, rt, rt).eval().tolist()
+  >>> ragged.map_flat_values(tf.multiply, rt, rt).eval().tolist()
   [[1, 4, 9], [], [16, 25], [36]]
-  >>> ragged.map_inner_values(tf.add, rt, 5).eval().tolist()
+  >>> ragged.map_flat_values(tf.add, rt, 5).eval().tolist()
   [[6, 7, 8], [], [9, 10], [11]]
   ```
 
   Args:
-    op: The operation that should be applied to the RaggedTensor `inner_values`.
+    op: The operation that should be applied to the RaggedTensor `flat_values`.
       `op` is typically an element-wise operation (such as math_ops.add), but
       any operation that preserves the size of the outermost dimension can be
       used.  I.e., `shape[0]` of the value returned by `op` must match
-      `shape[0]` of the `RaggedTensor`s' `inner_values` tensors.
+      `shape[0]` of the `RaggedTensor`s' `flat_values` tensors.
     *args: Arguments for `op`.
     **kwargs: Keyword arguments for `op`.
 
@@ -66,8 +67,8 @@ def map_inner_values(op, *args, **kwargs):
   # Replace RaggedTensors with their values; and collect the splits tensors
   # from each RaggedTensor.
   nested_splits_lists = []
-  inner_args = _replace_ragged_with_inner_values(args, nested_splits_lists)
-  inner_kwargs = _replace_ragged_with_inner_values(kwargs, nested_splits_lists)
+  inner_args = _replace_ragged_with_flat_values(args, nested_splits_lists)
+  inner_kwargs = _replace_ragged_with_flat_values(kwargs, nested_splits_lists)
   if not nested_splits_lists:
     return op(*args, **kwargs)
 
@@ -75,15 +76,15 @@ def map_inner_values(op, *args, **kwargs):
       ragged_util.assert_splits_match(nested_splits_lists)):
     # Delegate to op, and then compose the result from the transformed values
     # and the splits.
-    return ragged_factory_ops.from_nested_row_splits(
+    return ragged_tensor.RaggedTensor.from_nested_row_splits(
         op(*inner_args, **inner_kwargs), nested_splits_lists[0])
 
 
-def _replace_ragged_with_inner_values(value, nested_splits_lists):
-  """Replace RaggedTensors with their inner_values, and record their splits.
+def _replace_ragged_with_flat_values(value, nested_splits_lists):
+  """Replace RaggedTensors with their flat_values, and record their splits.
 
   Returns a copy of `value`, with any nested `RaggedTensor`s replaced by their
-  `inner_values` tensor.  Looks inside lists, tuples, and dicts.
+  `flat_values` tensor.  Looks inside lists, tuples, and dicts.
 
   Appends each `RaggedTensor`'s `nested_splits` to `nested_splits_lists`.
 
@@ -97,13 +98,13 @@ def _replace_ragged_with_inner_values(value, nested_splits_lists):
   """
   # Base case
   if ragged_tensor.is_ragged(value):
-    value = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(value)
+    value = ragged_tensor.convert_to_tensor_or_ragged_tensor(value)
     nested_splits_lists.append(value.nested_row_splits)
-    return value.inner_values
+    return value.flat_values
 
   # Recursion cases
   def recurse(v):
-    return _replace_ragged_with_inner_values(v, nested_splits_lists)
+    return _replace_ragged_with_flat_values(v, nested_splits_lists)
 
   if isinstance(value, list):
     return [recurse(v) for v in value]
diff --git a/tensorflow/python/ops/ragged/ragged_gather_nd_op_test.py b/tensorflow/python/ops/ragged/ragged_gather_nd_op_test.py
index 62c6819374ab7bce0c8b83092636fb48ba241712..d4bffeb401656b02a48a36eb0383850656506fc4 100644
--- a/tensorflow/python/ops/ragged/ragged_gather_nd_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_gather_nd_op_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for tf.ragged.gather_nd."""
+"""Tests for ragged_array_ops.gather_nd."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -21,14 +21,19 @@ from __future__ import print_function
 from absl.testing import parameterized
 import numpy as np
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_array_ops
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
 
-class RaggedGatherNdOpTest(test_util.TensorFlowTestCase,
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedGatherNdOpTest(ragged_test_util.RaggedTensorTestCase,
                            parameterized.TestCase):
 
   DOCSTRING_PARAMS = [[['000', '001'], ['010']],
@@ -41,18 +46,19 @@ class RaggedGatherNdOpTest(test_util.TensorFlowTestCase,
       #=========================================================================
       dict(
           descr='Docstring example 1',
-          params=ragged.constant_value(DOCSTRING_PARAMS),
+          params=ragged_factory_ops.constant_value(DOCSTRING_PARAMS),
           indices=[[2], [0]],
-          expected=ragged.constant_value([[[], [b'210']],
-                                          [[b'000', b'001'], [b'010']]])),
+          expected=ragged_factory_ops.constant_value(
+              [[[], [b'210']], [[b'000', b'001'], [b'010']]])),
       dict(
           descr='Docstring example 2',
-          params=ragged.constant_value(DOCSTRING_PARAMS),
+          params=ragged_factory_ops.constant_value(DOCSTRING_PARAMS),
           indices=[[2, 1], [0, 0]],
-          expected=ragged.constant_value([[b'210'], [b'000', b'001']])),
+          expected=ragged_factory_ops.constant_value(
+              [[b'210'], [b'000', b'001']])),
       dict(
           descr='Docstring example 3',
-          params=ragged.constant_value(DOCSTRING_PARAMS),
+          params=ragged_factory_ops.constant_value(DOCSTRING_PARAMS),
           indices=[[0, 0, 1], [1, 1, 2]],
           expected=[b'001', b'112']),
       #=========================================================================
@@ -60,175 +66,179 @@ class RaggedGatherNdOpTest(test_util.TensorFlowTestCase,
       #=========================================================================
       dict(
           descr='params: [B1, (B2)], indices: [0], result: [B1, (B2)]',
-          params=ragged.constant_value([['a', 'b', 'c'], ['d']]),
+          params=ragged_factory_ops.constant_value([['a', 'b', 'c'], ['d']]),
           indices=np.zeros([0], dtype=np.int32),
-          expected=ragged.constant_value([[b'a', b'b', b'c'], [b'd']])),
+          expected=ragged_factory_ops.constant_value(
+              [[b'a', b'b', b'c'], [b'd']])),
       dict(
           descr='params: [B1, (B2)], indices: [A1, 0], result: [A1, B1, (B2)]',
-          params=ragged.constant_value([['a', 'b', 'c'], ['d']]),
+          params=ragged_factory_ops.constant_value([['a', 'b', 'c'], ['d']]),
           indices=np.zeros([3, 0], dtype=np.int32),
-          expected=ragged.constant_value([[[b'a', b'b', b'c'], [b'd']],
-                                          [[b'a', b'b', b'c'], [b'd']],
-                                          [[b'a', b'b', b'c'], [b'd']]])),
+          expected=ragged_factory_ops.constant_value(
+              [[[b'a', b'b', b'c'], [b'd']],
+               [[b'a', b'b', b'c'], [b'd']],
+               [[b'a', b'b', b'c'], [b'd']]])),
       dict(
           descr=('params: [B1, (B2)], indices: [A1, A2, 0], '
                  'result: [A1, A2, B1, (B2)]'),
-          params=ragged.constant_value([['a', 'b', 'c'], ['d']]),
+          params=ragged_factory_ops.constant_value([['a', 'b', 'c'], ['d']]),
           indices=np.zeros([1, 3, 0], dtype=np.int32),
-          expected=ragged.constant_value([[[[b'a', b'b', b'c'], [b'd']],
-                                           [[b'a', b'b', b'c'], [b'd']],
-                                           [[b'a', b'b', b'c'], [b'd']]]])),
+          expected=ragged_factory_ops.constant_value(
+              [[[[b'a', b'b', b'c'], [b'd']],
+                [[b'a', b'b', b'c'], [b'd']],
+                [[b'a', b'b', b'c'], [b'd']]]])),
       dict(
           descr='params: [B1], indices: [A1, (A2), 0], result: [A1, (A2), B1]',
           params=['a'],
-          indices=ragged.constant_value([[[], []], [[]]],
-                                        ragged_rank=1,
-                                        dtype=np.int32),
-          expected=ragged.constant_value([[[b'a'], [b'a']], [[b'a']]],
-                                         ragged_rank=1)),
+          indices=ragged_factory_ops.constant_value(
+              [[[], []], [[]]],
+              ragged_rank=1,
+              dtype=np.int32),
+          expected=ragged_factory_ops.constant_value(
+              [[[b'a'], [b'a']], [[b'a']]],
+              ragged_rank=1)),
       #=========================================================================
       # Indices with 1 value (selects row from params)
       #=========================================================================
       dict(
           descr='params: [B1, (B2)], indices: [A1, 1], result: [A1, (B2)]',
-          params=ragged.constant_value([['a', 'b', 'c'], ['d']]),
+          params=ragged_factory_ops.constant_value([['a', 'b', 'c'], ['d']]),
           indices=[[1], [0]],
-          expected=ragged.constant_value([[b'd'], [b'a', b'b', b'c']])),
+          expected=ragged_factory_ops.constant_value(
+              [[b'd'], [b'a', b'b', b'c']])),
       dict(
           descr=('params: [B1, (B2), (B3)], indices: [A1, 1], '
                  'result: [A1, (B2), (B3)]'),
-          params=ragged.constant_value([[['a', 'b', 'c'], ['d']],
-                                        [['e', 'f']]]),
+          params=ragged_factory_ops.constant_value(
+              [[['a', 'b', 'c'], ['d']], [['e', 'f']]]),
           indices=[[1], [1]],
-          expected=ragged.constant_value([[[b'e', b'f']], [[b'e', b'f']]])),
+          expected=ragged_factory_ops.constant_value(
+              [[[b'e', b'f']], [[b'e', b'f']]])),
       dict(
           descr=('params: [B1, B2, B3], indices: [A1, (A2), 1], '
                  'result: [A1, (A2), B2, B3]'),
           params=[[['a']], [['b']]],
-          indices=ragged.constant_value([[[0]]], ragged_rank=1),
-          expected=ragged.constant_value([[[[b'a']]]], ragged_rank=1)),
+          indices=ragged_factory_ops.constant_value([[[0]]], ragged_rank=1),
+          expected=ragged_factory_ops.constant_value(
+              [[[[b'a']]]], ragged_rank=1)),
       #=========================================================================
       # Indices with 2 values (selects row & col from params)
       #=========================================================================
       dict(
           descr='params: [B1, (B2)], indices: [A1, 2], result: [A1]',
-          params=ragged.constant_value([['a', 'b', 'c'], ['d']]),
+          params=ragged_factory_ops.constant_value([['a', 'b', 'c'], ['d']]),
           indices=[[1, 0], [0, 0], [0, 2]],
-          expected=ragged.constant_value([b'd', b'a', b'c'])),
+          expected=ragged_factory_ops.constant_value([b'd', b'a', b'c'])),
       dict(
           descr=('params: [B1, (B2), (B3)], indices: [A1, 2], '
                  'result: [A1, (B3)]'),
-          params=ragged.constant_value([[['a', 'b', 'c'], ['d']],
-                                        [['e', 'f']]]),
+          params=ragged_factory_ops.constant_value(
+              [[['a', 'b', 'c'], ['d']], [['e', 'f']]]),
           indices=[[1, 0], [0, 1], [0, 0]],
-          expected=ragged.constant_value([[b'e', b'f'], [b'd'],
-                                          [b'a', b'b', b'c']])),
+          expected=ragged_factory_ops.constant_value(
+              [[b'e', b'f'], [b'd'], [b'a', b'b', b'c']])),
       dict(
           descr=('params: [B1, (B2), (B3)], indices: [A1, A2, 2], '
                  'result: [A1, (A2), (B3)]'),
-          params=ragged.constant_value([[['a', 'b', 'c'], ['d']],
-                                        [['e', 'f']]]),
+          params=ragged_factory_ops.constant_value(
+              [[['a', 'b', 'c'], ['d']], [['e', 'f']]]),
           indices=[[[1, 0], [0, 1], [0, 0]]],
-          expected=ragged.constant_value([[[b'e', b'f'], [b'd'],
-                                           [b'a', b'b', b'c']]])),
+          expected=ragged_factory_ops.constant_value(
+              [[[b'e', b'f'], [b'd'], [b'a', b'b', b'c']]])),
       dict(
           descr=('params: [B1, (B2), B3], indices: [A1, A2, 2], '
                  'result: [A1, A2, B3]'),
-          params=ragged.constant_value([[['a', 'b'], ['c', 'd']],
-                                        [['e', 'f']]],
-                                       ragged_rank=1),
+          params=ragged_factory_ops.constant_value(
+              [[['a', 'b'], ['c', 'd']],
+               [['e', 'f']]],
+              ragged_rank=1),
           indices=[[[1, 0], [0, 1], [0, 0]]],
           expected=[[[b'e', b'f'], [b'c', b'd'], [b'a', b'b']]]),
       dict(
           descr=('params: [B1, (B2), B3], indices: [A1, A2, A3, 2], '
                  'result: [A1, A2, A3, B3]'),
-          params=ragged.constant_value([[['a', 'b'], ['c', 'd']],
-                                        [['e', 'f']]],
-                                       ragged_rank=1),
+          params=ragged_factory_ops.constant_value(
+              [[['a', 'b'], ['c', 'd']],
+               [['e', 'f']]],
+              ragged_rank=1),
           indices=[[[[1, 0], [0, 1], [0, 0]]]],
           expected=[[[[b'e', b'f'], [b'c', b'd'], [b'a', b'b']]]]),
       dict(
           descr=('params: [B1, (B2), (B3)], indices: [A1, (A2), 2], '
                  'result: [A1, (A2), (B3)]'),
-          params=ragged.constant_value([[['a', 'b', 'c'], ['d']],
-                                        [['e', 'f']]]),
-          indices=ragged.constant_value([[[1, 0], [0, 1]], [[0, 0]]],
-                                        ragged_rank=1),
-          expected=ragged.constant_value([[[b'e', b'f'], [b'd']],
-                                          [[b'a', b'b', b'c']]])),
+          params=ragged_factory_ops.constant_value(
+              [[['a', 'b', 'c'], ['d']], [['e', 'f']]]),
+          indices=ragged_factory_ops.constant_value(
+              [[[1, 0], [0, 1]], [[0, 0]]],
+              ragged_rank=1),
+          expected=ragged_factory_ops.constant_value(
+              [[[b'e', b'f'], [b'd']], [[b'a', b'b', b'c']]])),
       #=========================================================================
       # Indices with 3 values
       #=========================================================================
       dict(
           descr=('params: [B1, (B2), (B3)], indices: [A1, 3], '
                  'result: [A1]'),
-          params=ragged.constant_value([[['a', 'b', 'c'], ['d']],
-                                        [['e', 'f']]]),
+          params=ragged_factory_ops.constant_value(
+              [[['a', 'b', 'c'], ['d']], [['e', 'f']]]),
           indices=[[1, 0, 1], [0, 0, 0], [0, 1, 0]],
           expected=[b'f', b'a', b'd']),
       dict(
           descr=('params: [B1, (B2), B3], indices: [A1, 3], '
                  'result: [A1]'),
-          params=ragged.constant_value([[['a', 'b'], ['c', 'd']],
-                                        [['e', 'f']]],
-                                       ragged_rank=1),
+          params=ragged_factory_ops.constant_value(
+              [[['a', 'b'], ['c', 'd']], [['e', 'f']]],
+              ragged_rank=1),
           indices=[[1, 0, 1], [0, 0, 0], [0, 1, 1]],
           expected=[b'f', b'a', b'd']),
       dict(
           descr=('params: [B1, (B2), (B3), B4], indices: [A1, 3], '
                  'result: [A1, B4]'),
-          params=ragged.constant_value([[[['a', 'b'], ['c', 'd']],
-                                         [['e', 'f']]]],
-                                       ragged_rank=2),
+          params=ragged_factory_ops.constant_value(
+              [[[['a', 'b'], ['c', 'd']], [['e', 'f']]]],
+              ragged_rank=2),
           indices=[[0, 0, 1], [0, 0, 0], [0, 1, 0]],
           expected=[[b'c', b'd'], [b'a', b'b'], [b'e', b'f']]),
   ])  # pyformat: disable
-  @test_util.run_deprecated_v1
   def testRaggedGatherNd(self, descr, params, indices, expected):
-    result = ragged.gather_nd(params, indices)
-    self.assertEqual(
-        getattr(result, 'ragged_rank', 0), getattr(expected, 'ragged_rank', 0))
-    with self.test_session() as sess:
-      if hasattr(expected, 'tolist'):
-        expected = expected.tolist()
-      self.assertEqual(self.evaluate(result).tolist(), expected)
+    result = ragged_array_ops.gather_nd(params, indices)
+    self.assertRaggedEqual(result, expected)
 
-  @test_util.run_deprecated_v1
   def testRaggedGatherNdUnknownRankError(self):
-    params = ragged.constant([['a', 'b'], ['c', 'd']])
+    if context.executing_eagerly():
+      return
+    params = ragged_factory_ops.constant([['a', 'b'], ['c', 'd']])
     indices1 = array_ops.placeholder(dtypes.int32, shape=None)
     indices2 = array_ops.placeholder(dtypes.int32, shape=[None])
 
     with self.assertRaisesRegexp(ValueError,
                                  'indices.rank be statically known.'):
-      ragged.gather_nd(params, indices1)
+      ragged_array_ops.gather_nd(params, indices1)
     with self.assertRaisesRegexp(
         ValueError, r'indices.shape\[-1\] must be statically known.'):
-      ragged.gather_nd(params, indices2)
+      ragged_array_ops.gather_nd(params, indices2)
 
   @parameterized.parameters([
       dict(
           params=['a'],
           indices=0,
-          message='Shape must be at least rank 1 but is rank 0'
-          " for 'GatherNd'"),
+          error=(ValueError, errors.InvalidArgumentError)),
       dict(
-          params=ragged.constant_value([['a']]),
+          params=ragged_factory_ops.constant_value([['a']]),
           indices=0,
           message='indices.rank must be at least 1.'),
       dict(
           params=['a', 'b', 'c'],
-          indices=ragged.constant([[0]]),
+          indices=ragged_factory_ops.constant_value([[0]]),
           message='The innermost dimension of indices may not be ragged'),
   ])
-  @test_util.run_deprecated_v1
   def testRaggedGatherNdStaticError(self,
                                     params,
                                     indices,
-                                    message,
+                                    message=None,
                                     error=ValueError):
     with self.assertRaisesRegexp(error, message):
-      ragged.gather_nd(params, indices)
+      ragged_array_ops.gather_nd(params, indices)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_gather_op_test.py b/tensorflow/python/ops/ragged/ragged_gather_op_test.py
index 76c90cdfeeb8a1c18a68abac794068b5eb8b739a..9914b56448868b21058cdb50cda17d63676c4f23 100644
--- a/tensorflow/python/ops/ragged/ragged_gather_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_gather_op_test.py
@@ -12,142 +12,132 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for ragged.gather."""
+"""Tests for ragged_array_ops.gather."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_array_ops
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
 
-class RaggedTensorOpsTest(test_util.TensorFlowTestCase):
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedGatherOpTest(ragged_test_util.RaggedTensorTestCase):
 
-  @test_util.run_deprecated_v1
   def testDocStringExamples(self):
     params = constant_op.constant(['a', 'b', 'c', 'd', 'e'])
     indices = constant_op.constant([3, 1, 2, 1, 0])
-    ragged_params = ragged.constant([['a', 'b', 'c'], ['d'], [], ['e']])
-    ragged_indices = ragged.constant([[3, 1, 2], [1], [], [0]])
-    with self.test_session():
-      self.assertEqual(
-          ragged.gather(params, ragged_indices).eval().tolist(),
-          [[b'd', b'b', b'c'], [b'b'], [], [b'a']])
-      self.assertEqual(
-          ragged.gather(ragged_params, indices).eval().tolist(),
-          [[b'e'], [b'd'], [], [b'd'], [b'a', b'b', b'c']])
-      self.assertEqual(
-          ragged.gather(ragged_params, ragged_indices).eval().tolist(),
-          [[[b'e'], [b'd'], []], [[b'd']], [], [[b'a', b'b', b'c']]])
-
-  @test_util.run_deprecated_v1
+    ragged_params = ragged_factory_ops.constant([['a', 'b', 'c'], ['d'], [],
+                                                 ['e']])
+    ragged_indices = ragged_factory_ops.constant([[3, 1, 2], [1], [], [0]])
+    self.assertRaggedEqual(
+        ragged_array_ops.gather(params, ragged_indices),
+        [[b'd', b'b', b'c'], [b'b'], [], [b'a']])
+    self.assertRaggedEqual(
+        ragged_array_ops.gather(ragged_params, indices),
+        [[b'e'], [b'd'], [], [b'd'], [b'a', b'b', b'c']])
+    self.assertRaggedEqual(
+        ragged_array_ops.gather(ragged_params, ragged_indices),
+        [[[b'e'], [b'd'], []], [[b'd']], [], [[b'a', b'b', b'c']]])
+
   def testTensorParamsAndTensorIndices(self):
     params = ['a', 'b', 'c', 'd', 'e']
     indices = [2, 0, 2, 1]
-    with self.test_session():
-      self.assertEqual(
-          ragged.gather(params, indices).eval().tolist(),
-          [b'c', b'a', b'c', b'b'])
-      self.assertEqual(type(ragged.gather(params, indices)), ops.Tensor)
+    self.assertRaggedEqual(
+        ragged_array_ops.gather(params, indices), [b'c', b'a', b'c', b'b'])
+    self.assertIsInstance(ragged_array_ops.gather(params, indices), ops.Tensor)
 
-  @test_util.run_deprecated_v1
   def testRaggedParamsAndTensorIndices(self):
-    params = ragged.constant([['a', 'b'], ['c', 'd', 'e'], ['f'], [], ['g']])
+    params = ragged_factory_ops.constant([['a', 'b'], ['c', 'd', 'e'], ['f'],
+                                          [], ['g']])
     indices = [2, 0, 2, 1]
-    with self.test_session():
-      self.assertEqual(
-          ragged.gather(params, indices).eval().tolist(),
-          [[b'f'], [b'a', b'b'], [b'f'], [b'c', b'd', b'e']])
+    self.assertRaggedEqual(
+        ragged_array_ops.gather(params, indices),
+        [[b'f'], [b'a', b'b'], [b'f'], [b'c', b'd', b'e']])
 
-  @test_util.run_deprecated_v1
   def testTensorParamsAndRaggedIndices(self):
     params = ['a', 'b', 'c', 'd', 'e']
-    indices = ragged.constant([[2, 1], [1, 2, 0], [3]])
-    with self.test_session():
-      self.assertEqual(
-          ragged.gather(params, indices).eval().tolist(),
-          [[b'c', b'b'], [b'b', b'c', b'a'], [b'd']])
+    indices = ragged_factory_ops.constant([[2, 1], [1, 2, 0], [3]])
+    self.assertRaggedEqual(
+        ragged_array_ops.gather(params, indices),
+        [[b'c', b'b'], [b'b', b'c', b'a'], [b'd']])
 
-  @test_util.run_deprecated_v1
   def testRaggedParamsAndRaggedIndices(self):
-    params = ragged.constant([['a', 'b'], ['c', 'd', 'e'], ['f'], [], ['g']])
-    indices = ragged.constant([[2, 1], [1, 2, 0], [3]])
-    with self.test_session():
-      self.assertEqual(
-          ragged.gather(params, indices).eval().tolist(),
-          [[[b'f'], [b'c', b'd', b'e']],                # [[p[2], p[1]      ],
-           [[b'c', b'd', b'e'], [b'f'], [b'a', b'b']],  #  [p[1], p[2], p[0]],
-           [[]]]                                        #  [p[3]            ]]
-      )  # pyformat: disable
-
-  @test_util.run_deprecated_v1
+    params = ragged_factory_ops.constant([['a', 'b'], ['c', 'd', 'e'], ['f'],
+                                          [], ['g']])
+    indices = ragged_factory_ops.constant([[2, 1], [1, 2, 0], [3]])
+    self.assertRaggedEqual(
+        ragged_array_ops.gather(params, indices),
+        [[[b'f'], [b'c', b'd', b'e']],                # [[p[2], p[1]      ],
+         [[b'c', b'd', b'e'], [b'f'], [b'a', b'b']],  #  [p[1], p[2], p[0]],
+         [[]]]                                        #  [p[3]            ]]
+    )  # pyformat: disable
+
   def testRaggedParamsAndScalarIndices(self):
-    params = ragged.constant([['a', 'b'], ['c', 'd', 'e'], ['f'], [], ['g']])
+    params = ragged_factory_ops.constant([['a', 'b'], ['c', 'd', 'e'], ['f'],
+                                          [], ['g']])
     indices = 1
-    with self.test_session():
-      self.assertEqual(
-          ragged.gather(params, indices).eval().tolist(), [b'c', b'd', b'e'])
+    self.assertRaggedEqual(
+        ragged_array_ops.gather(params, indices), [b'c', b'd', b'e'])
 
-  @test_util.run_deprecated_v1
   def test3DRaggedParamsAnd2DTensorIndices(self):
-    params = ragged.constant([[['a', 'b'], []], [['c', 'd'], ['e'], ['f']],
-                              [['g']]])
+    params = ragged_factory_ops.constant([[['a', 'b'], []],
+                                          [['c', 'd'], ['e'], ['f']], [['g']]])
     indices = [[1, 2], [0, 1], [2, 2]]
-    with self.test_session():
-      self.assertEqual(
-          ragged.gather(params, indices).eval().tolist(),
-          [[[[b'c', b'd'], [b'e'], [b'f']], [[b'g']]],            # [[p1, p2],
-           [[[b'a', b'b'], []], [[b'c', b'd'], [b'e'], [b'f']]],  #  [p0, p1],
-           [[[b'g']], [[b'g']]]]                                  #  [p2, p2]]
-      )  # pyformat: disable
-
-  @test_util.run_deprecated_v1
+    self.assertRaggedEqual(
+        ragged_array_ops.gather(params, indices),
+        [[[[b'c', b'd'], [b'e'], [b'f']], [[b'g']]],            # [[p1, p2],
+         [[[b'a', b'b'], []], [[b'c', b'd'], [b'e'], [b'f']]],  #  [p0, p1],
+         [[[b'g']], [[b'g']]]]                                  #  [p2, p2]]
+    )  # pyformat: disable
+
   def testTensorParamsAnd4DRaggedIndices(self):
-    indices = ragged.constant(
+    indices = ragged_factory_ops.constant(
         [[[[3, 4], [0, 6]], []], [[[2, 1], [1, 0]], [[2, 5]], [[2, 3]]],
          [[[1, 0]]]],  # pyformat: disable
         ragged_rank=2,
         inner_shape=(2,))
     params = ['a', 'b', 'c', 'd', 'e', 'f', 'g']
-    with self.test_session():
-      self.assertEqual(
-          ragged.gather(params, indices).eval().tolist(),
-          [[[[b'd', b'e'], [b'a', b'g']], []],
-           [[[b'c', b'b'], [b'b', b'a']], [[b'c', b'f']], [[b'c', b'd']]],
-           [[[b'b', b'a']]]])  # pyformat: disable
-
-  @test_util.run_deprecated_v1
+    self.assertRaggedEqual(
+        ragged_array_ops.gather(params, indices),
+        [[[[b'd', b'e'], [b'a', b'g']], []],
+         [[[b'c', b'b'], [b'b', b'a']], [[b'c', b'f']], [[b'c', b'd']]],
+         [[[b'b', b'a']]]])  # pyformat: disable
+
   def testOutOfBoundsError(self):
     tensor_params = ['a', 'b', 'c']
     tensor_indices = [0, 1, 2]
-    ragged_params = ragged.constant([['a', 'b'], ['c']])
-    ragged_indices = ragged.constant([[0, 3]])
-    with self.test_session():
-      self.assertRaisesRegexp(errors.InvalidArgumentError,
-                              r'indices\[1\] = 3 is not in \[0, 3\)',
-                              ragged.gather(tensor_params, ragged_indices).eval)
-      self.assertRaisesRegexp(errors.InvalidArgumentError,
-                              r'indices\[2\] = 2 is not in \[0, 2\)',
-                              ragged.gather(ragged_params, tensor_indices).eval)
-      self.assertRaisesRegexp(errors.InvalidArgumentError,
-                              r'indices\[1\] = 3 is not in \[0, 2\)',
-                              ragged.gather(ragged_params, ragged_indices).eval)
-
-  @test_util.run_deprecated_v1
+    ragged_params = ragged_factory_ops.constant([['a', 'b'], ['c']])
+    ragged_indices = ragged_factory_ops.constant([[0, 3]])
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 r'indices\[1\] = 3 is not in \[0, 3\)'):
+      self.evaluate(ragged_array_ops.gather(tensor_params, ragged_indices))
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 r'indices\[2\] = 2 is not in \[0, 2\)'):
+      self.evaluate(ragged_array_ops.gather(ragged_params, tensor_indices))
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 r'indices\[1\] = 3 is not in \[0, 2\)'):
+      self.evaluate(ragged_array_ops.gather(ragged_params, ragged_indices))
+
   def testUnknownIndicesRankError(self):
-    params = ragged.constant([], ragged_rank=1)
+    if context.executing_eagerly():
+      return
+    params = ragged_factory_ops.constant([], ragged_rank=1)
     indices = constant_op.constant([0], dtype=dtypes.int64)
     indices = array_ops.placeholder_with_default(indices, None)
     self.assertRaisesRegexp(ValueError,
                             r'indices\.shape\.ndims must be known statically',
-                            ragged.gather, params, indices)
+                            ragged_array_ops.gather, params, indices)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_getitem.py b/tensorflow/python/ops/ragged/ragged_getitem.py
index 9821695046c577627298c413fcfc7716b71f8019..001a400596597bb0efb9b847184abd54e757f1d5 100644
--- a/tensorflow/python/ops/ragged/ragged_getitem.py
+++ b/tensorflow/python/ops/ragged/ragged_getitem.py
@@ -24,7 +24,6 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.ragged import ragged_array_ops
-from tensorflow.python.ops.ragged import ragged_factory_ops
 from tensorflow.python.ops.ragged import ragged_math_ops
 from tensorflow.python.ops.ragged import ragged_tensor
 
@@ -39,7 +38,7 @@ def ragged_tensor_getitem(self, key):
   IndexError; (2) use a default value; or (3) skip that value and return a
   tensor with fewer rows than we started with.  Following the guiding
   principles of Python ("In the face of ambiguity, refuse the temptation to
-  guess" <go/pep20>), we simply disallow this operation.
+  guess"), we simply disallow this operation.
 
   Any dimensions added by `array_ops.newaxis` will be ragged if the following
   dimension is ragged.
@@ -137,8 +136,8 @@ def _ragged_getitem(rt_input, key_list):
   if row_key is array_ops.newaxis:
     inner_rt = _ragged_getitem(rt_input, inner_keys)
     nsplits = array_ops.shape(inner_rt.row_splits, out_type=dtypes.int64)[0]
-    return ragged_factory_ops.from_row_splits(inner_rt,
-                                              array_ops.stack([0, nsplits - 1]))
+    return ragged_tensor.RaggedTensor.from_row_splits(
+        inner_rt, array_ops.stack([0, nsplits - 1]))
 
   # Slicing a range of rows: first slice the outer dimension, and then
   # call `_ragged_getitem_inner_dimensions` to handle the inner keys.
@@ -184,7 +183,7 @@ def _slice_ragged_row_dimension(rt_input, row_key):
         axis=0)
     values_start = new_splits[0]
     values_limit = new_splits[-1]
-    return ragged_factory_ops.from_row_splits(
+    return ragged_tensor.RaggedTensor.from_row_splits(
         rt_input.values[values_start:values_limit], new_splits - values_start)
 
   # If there is a slice step (aka a strided slice), then use ragged_gather to
@@ -225,7 +224,8 @@ def _ragged_getitem_inner_dimensions(rt_input, key_list):
   if column_key is array_ops.newaxis:
     inner_rt = _ragged_getitem_inner_dimensions(rt_input, key_list[1:])
     nsplits = array_ops.shape(inner_rt.row_splits, out_type=dtypes.int64)[0]
-    return ragged_factory_ops.from_row_splits(inner_rt, math_ops.range(nsplits))
+    return ragged_tensor.RaggedTensor.from_row_splits(inner_rt,
+                                                      math_ops.range(nsplits))
 
   # Slicing a range of columns in a ragged inner dimension.  We use a
   # recursive call to process the values, and then assemble a RaggedTensor
@@ -239,7 +239,7 @@ def _ragged_getitem_inner_dimensions(rt_input, key_list):
     else:
       # Nontrivial slice: use ragged_gather to extract the indicated slice as
       # a new RaggedTensor (inner_rt), and then recursively process its values.
-      # The splits can be taken from ragged.row_splits(inner_rt).
+      # The splits can be taken from inner_rt.row_splits().
       inner_rt_starts = rt_input.row_splits[:-1]
       inner_rt_limits = rt_input.row_splits[1:]
       if column_key.start is not None and column_key.start != 0:
diff --git a/tensorflow/python/ops/ragged/ragged_map_inner_values_op_test.py b/tensorflow/python/ops/ragged/ragged_map_flat_values_op_test.py
similarity index 59%
rename from tensorflow/python/ops/ragged/ragged_map_inner_values_op_test.py
rename to tensorflow/python/ops/ragged/ragged_map_flat_values_op_test.py
index b5802cb82d9440632ef4dc3ce6198875e056e1fe..e9a7cdf6c06269f3e9c879911631b2c089be23d5 100644
--- a/tensorflow/python/ops/ragged/ragged_map_inner_values_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_map_flat_values_op_test.py
@@ -12,26 +12,27 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for ragged.map_inner_values."""
+"""Tests for ragged_functional_ops.map_flat_values."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from absl.testing import parameterized
-
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_functional_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
 
-class RaggedMapInnerValuesOpTest(test_util.TensorFlowTestCase,
-                                 parameterized.TestCase):
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedMapInnerValuesOpTest(ragged_test_util.RaggedTensorTestCase):
 
   def assertRaggedMapInnerValuesReturns(self,
                                         op,
@@ -39,85 +40,74 @@ class RaggedMapInnerValuesOpTest(test_util.TensorFlowTestCase,
                                         args=(),
                                         kwargs=None):
     kwargs = kwargs or {}
-    result = ragged.map_inner_values(op, *args, **kwargs)
-    with self.test_session():
-      self.assertEqual(result.eval().tolist(), expected)
+    result = ragged_functional_ops.map_flat_values(op, *args, **kwargs)
+    self.assertRaggedEqual(result, expected)
 
-  @test_util.run_deprecated_v1
   def testDocStringExamples(self):
     """Test the examples in apply_op_to_ragged_values.__doc__."""
-    rt = ragged.constant([[1, 2, 3], [], [4, 5], [6]])
-    v1 = ragged.map_inner_values(array_ops.ones_like, rt)
-    v2 = ragged.map_inner_values(math_ops.multiply, rt, rt)
-    v3 = ragged.map_inner_values(math_ops.add, rt, 5)
-    with self.test_session():
-      self.assertEqual(v1.eval().tolist(), [[1, 1, 1], [], [1, 1], [1]])
-      self.assertEqual(v2.eval().tolist(), [[1, 4, 9], [], [16, 25], [36]])
-      self.assertEqual(v3.eval().tolist(), [[6, 7, 8], [], [9, 10], [11]])
-
-  @test_util.run_deprecated_v1
+    rt = ragged_factory_ops.constant([[1, 2, 3], [], [4, 5], [6]])
+    v1 = ragged_functional_ops.map_flat_values(array_ops.ones_like, rt)
+    v2 = ragged_functional_ops.map_flat_values(math_ops.multiply, rt, rt)
+    v3 = ragged_functional_ops.map_flat_values(math_ops.add, rt, 5)
+    self.assertRaggedEqual(v1, [[1, 1, 1], [], [1, 1], [1]])
+    self.assertRaggedEqual(v2, [[1, 4, 9], [], [16, 25], [36]])
+    self.assertRaggedEqual(v3, [[6, 7, 8], [], [9, 10], [11]])
+
   def testOpWithSingleRaggedTensorArg(self):
-    tensor = ragged.constant([[1, 2, 3], [], [4, 5]])
+    tensor = ragged_factory_ops.constant([[1, 2, 3], [], [4, 5]])
     self.assertRaggedMapInnerValuesReturns(
         op=array_ops.zeros_like,
         args=(tensor,),
         expected=[[0, 0, 0], [], [0, 0]])
 
-  @test_util.run_deprecated_v1
   def testOpWithTwoRaggedTensorArgs(self):
-    x = ragged.constant([[3, 1, 4], [], [1, 5]])
-    y = ragged.constant([[1, 2, 3], [], [4, 5]])
+    x = ragged_factory_ops.constant([[3, 1, 4], [], [1, 5]])
+    y = ragged_factory_ops.constant([[1, 2, 3], [], [4, 5]])
     self.assertRaggedMapInnerValuesReturns(
         op=math_ops.multiply, args=(x, y), expected=[[3, 2, 12], [], [4, 25]])
 
-  @test_util.run_deprecated_v1
   def testOpWithRaggedTensorAndScalarArgs(self):
-    y = ragged.constant([[1, 2, 3], [], [4, 5]])
+    y = ragged_factory_ops.constant([[1, 2, 3], [], [4, 5]])
     self.assertRaggedMapInnerValuesReturns(
         op=math_ops.multiply, args=(5, y), expected=[[5, 10, 15], [], [20, 25]])
 
-  @test_util.run_deprecated_v1
   def testOpWithThreeRaggedTensorArgs(self):
-    condition = ragged.constant(
+    condition = ragged_factory_ops.constant(
         [[True, True, False], [], [True, False]])  # pyformat: disable
-    x = ragged.constant([['a', 'b', 'c'], [], ['d', 'e']])
-    y = ragged.constant([['A', 'B', 'C'], [], ['D', 'E']])
+    x = ragged_factory_ops.constant([['a', 'b', 'c'], [], ['d', 'e']])
+    y = ragged_factory_ops.constant([['A', 'B', 'C'], [], ['D', 'E']])
     self.assertRaggedMapInnerValuesReturns(
         op=array_ops.where,
         args=(condition, x, y),
         expected=[[b'a', b'b', b'C'], [], [b'd', b'E']])
 
-  @test_util.run_deprecated_v1
   def testOpWithRaggedTensorListArg(self):
-    x = ragged.constant([[1, 2, 3], [], [4, 5]])
-    y = ragged.constant([[10, 20, 30], [], [40, 50]])
+    x = ragged_factory_ops.constant([[1, 2, 3], [], [4, 5]])
+    y = ragged_factory_ops.constant([[10, 20, 30], [], [40, 50]])
     self.assertRaggedMapInnerValuesReturns(
         op=math_ops.add_n,
         args=([x, y, x],),
         expected=[[12, 24, 36], [], [48, 60]])
 
-  @test_util.run_deprecated_v1
   def testOpWithKeywordArgs(self):
-    x = ragged.constant([[3, 1, 4], [], [1, 5]])
-    y = ragged.constant([[1, 2, 3], [], [4, 5]])
+    x = ragged_factory_ops.constant([[3, 1, 4], [], [1, 5]])
+    y = ragged_factory_ops.constant([[1, 2, 3], [], [4, 5]])
     self.assertRaggedMapInnerValuesReturns(
         op=math_ops.multiply,
         kwargs=dict(x=x, y=y),
         expected=[[3, 2, 12], [], [4, 25]])
 
-  @test_util.run_deprecated_v1
   def testOpWithMixedPositionalAndKeywordArgs(self):
-    x = ragged.constant([[3, 1, 4], [], [1, 5]])
-    y = ragged.constant([[1, 2, 3], [], [4, 5]])
+    x = ragged_factory_ops.constant([[3, 1, 4], [], [1, 5]])
+    y = ragged_factory_ops.constant([[1, 2, 3], [], [4, 5]])
     self.assertRaggedMapInnerValuesReturns(
         op=math_ops.multiply,
         args=(x,),
         kwargs=dict(y=y),
         expected=[[3, 2, 12], [], [4, 25]])
 
-  @test_util.run_deprecated_v1
   def testNonElementWiseOp(self):
-    x = ragged.constant(
+    x = ragged_factory_ops.constant(
         [[[3, 1, 4], [1, 5, 9], [2, 6, 5]], [], [[3, 5, 8], [9, 7, 9]]],
         ragged_rank=1)
     self.assertRaggedMapInnerValuesReturns(
@@ -128,27 +118,26 @@ class RaggedMapInnerValuesOpTest(test_util.TensorFlowTestCase,
         },
         expected=[[8, 15, 13], [], [16, 25]])
 
-  @test_util.run_deprecated_v1
   def testOpWithRaggedRankGreaterThanOne(self):
     # ragged_rank=0
     x0 = [3, 1, 4, 1, 5, 9, 2, 6, 5]
     y0 = [1, 2, 3, 4, 5, 6, 7, 8, 9]
-    with self.test_session():
-      self.assertEqual(
-          math_ops.multiply(x0, y0).eval().tolist(),
-          [3, 2, 12, 4, 25, 54, 14, 48, 45])
+    self.assertRaggedEqual(
+        math_ops.multiply(x0, y0), [3, 2, 12, 4, 25, 54, 14, 48, 45])
 
     # ragged_rank=1
-    x1 = ragged.constant([[3, 1, 4], [], [1, 5], [9, 2], [6, 5]])
-    y1 = ragged.constant([[1, 2, 3], [], [4, 5], [6, 7], [8, 9]])
+    x1 = ragged_factory_ops.constant([[3, 1, 4], [], [1, 5], [9, 2], [6, 5]])
+    y1 = ragged_factory_ops.constant([[1, 2, 3], [], [4, 5], [6, 7], [8, 9]])
     self.assertRaggedMapInnerValuesReturns(
         op=math_ops.multiply,
         args=(x1, y1),
         expected=[[3, 2, 12], [], [4, 25], [54, 14], [48, 45]])
 
     # ragged_rank=2
-    x2 = ragged.constant([[[3, 1, 4]], [], [[], [1, 5]], [[9, 2], [6, 5]]])
-    y2 = ragged.constant([[[1, 2, 3]], [], [[], [4, 5]], [[6, 7], [8, 9]]])
+    x2 = ragged_factory_ops.constant([[[3, 1, 4]], [], [[], [1, 5]],
+                                      [[9, 2], [6, 5]]])
+    y2 = ragged_factory_ops.constant([[[1, 2, 3]], [], [[], [4, 5]],
+                                      [[6, 7], [8, 9]]])
     self.assertRaggedMapInnerValuesReturns(
         op=math_ops.multiply,
         args=(x2, y2),
@@ -159,10 +148,10 @@ class RaggedMapInnerValuesOpTest(test_util.TensorFlowTestCase,
                  ])  # pyformat: disable
 
     # ragged_rank=3
-    x3 = ragged.constant([[[[3, 1, 4]], []], [], [[[], [1, 5]]],
-                          [[[9, 2], [6, 5]]]])
-    y3 = ragged.constant([[[[1, 2, 3]], []], [], [[[], [4, 5]]],
-                          [[[6, 7], [8, 9]]]])
+    x3 = ragged_factory_ops.constant([[[[3, 1, 4]], []], [], [[[], [1, 5]]],
+                                      [[[9, 2], [6, 5]]]])
+    y3 = ragged_factory_ops.constant([[[[1, 2, 3]], []], [], [[[], [4, 5]]],
+                                      [[[6, 7], [8, 9]]]])
     self.assertRaggedMapInnerValuesReturns(
         op=math_ops.multiply,
         args=(x3, y3),
@@ -173,16 +162,14 @@ class RaggedMapInnerValuesOpTest(test_util.TensorFlowTestCase,
             [[[54, 14], [48, 45]]]    # row 3
         ])  # pyformat: disable
 
-  @test_util.run_deprecated_v1
   def testOpWithRaggedRankThree(self):
-    x = ragged.constant([[[3, 1, 4]], [], [[], [1, 5]]])
-    y = ragged.constant([[[1, 2, 3]], [], [[], [4, 5]]])
+    x = ragged_factory_ops.constant([[[3, 1, 4]], [], [[], [1, 5]]])
+    y = ragged_factory_ops.constant([[[1, 2, 3]], [], [[], [4, 5]]])
     self.assertRaggedMapInnerValuesReturns(
         op=math_ops.multiply,
         args=(x, y),
         expected=[[[3, 2, 12]], [], [[], [4, 25]]])
 
-  @test_util.run_deprecated_v1
   def testOpWithInnerValuesOnly(self):
     x = constant_op.constant([[1, 2], [3, 4], [5, 6]])
     y = constant_op.constant(2)
@@ -190,33 +177,30 @@ class RaggedMapInnerValuesOpTest(test_util.TensorFlowTestCase,
         op=math_ops.multiply, args=(x, y), expected=[[2, 4], [6, 8], [10, 12]])
 
   def testRaggedTensorSplitsRaggedRankMismatchError(self):
-    x = ragged.constant([[3, 1, 4], [], [1, 5]])
-    y = ragged.constant([[[3, 1, 4], []], [], [[1, 5]]])
-    self.assertRaisesRegexp(ValueError,
-                            r'Inputs must have identical ragged splits.*',
-                            ragged.map_inner_values, math_ops.add, x, y)
+    x = ragged_factory_ops.constant([[3, 1, 4], [], [1, 5]])
+    y = ragged_factory_ops.constant([[[3, 1, 4], []], [], [[1, 5]]])
+    self.assertRaisesRegexp(
+        ValueError, r'Inputs must have identical ragged splits.*',
+        ragged_functional_ops.map_flat_values, math_ops.add, x, y)
 
   def testRaggedTensorSplitsValueMismatchError(self):
-    x = ragged.constant([[3, 1, 4], [], [1, 5]])
-    y = ragged.constant([[1], [2, 3], [4, 5]])
+    x = ragged_factory_ops.constant([[3, 1, 4], [], [1, 5]])
+    y = ragged_factory_ops.constant([[1], [2, 3], [4, 5]])
     self.assertRaisesRegexp(errors.InvalidArgumentError,
                             r'Inputs must have identical ragged splits.*',
-                            ragged.map_inner_values, math_ops.add, x, y)
+                            ragged_functional_ops.map_flat_values, math_ops.add,
+                            x, y)
 
-  @test_util.run_deprecated_v1
   def testRaggedTensorSplitsMismatchErrorAtRuntime(self):
     splits1 = array_ops.placeholder_with_default(
         constant_op.constant([0, 3, 3, 5], dtypes.int64), None)
     splits2 = array_ops.placeholder_with_default(
         constant_op.constant([0, 1, 3, 5], dtypes.int64), None)
-    x = ragged.from_row_splits([3, 1, 4, 1, 5], splits1)
-    y = ragged.from_row_splits([1, 2, 3, 4, 5], splits2)
-    result = ragged.map_inner_values(math_ops.add, x, y)
-    with self.test_session():
-      self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          r'\[Inputs must have identical ragged splits\] '
-          r'\[Condition x == y did not hold element-wise:\].*', result.eval)
+    x = ragged_tensor.RaggedTensor.from_row_splits([3, 1, 4, 1, 5], splits1)
+    y = ragged_tensor.RaggedTensor.from_row_splits([1, 2, 3, 4, 5], splits2)
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 r'.*Inputs must have identical ragged splits'):
+      self.evaluate(ragged_functional_ops.map_flat_values(math_ops.add, x, y))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_map_fn_op_test.py b/tensorflow/python/ops/ragged/ragged_map_fn_op_test.py
index 7a8603c949a5ea97d0b3be3b4301d8265b3ba9bd..15206404b2a54e2660113755f392eec190e148f9 100644
--- a/tensorflow/python/ops/ragged/ragged_map_fn_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_map_fn_op_test.py
@@ -12,24 +12,34 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for ragged.map_fn."""
+"""Tests for ragged_map_ops.map_fn."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 from absl.testing import parameterized
+import numpy as np
 
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras import backend
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops as mo
-from tensorflow.python.ops import ragged
 from tensorflow.python.ops import string_ops
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_functional_ops
+from tensorflow.python.ops.ragged import ragged_map_ops
+from tensorflow.python.ops.ragged import ragged_math_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
 
-class RaggedMapOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedMapOpTest(ragged_test_util.RaggedTensorTestCase,
+                      parameterized.TestCase):
+
   @parameterized.parameters([
       # The following test sets map over a RaggedTensor and apply a
       # transformation that returns with shape:
@@ -52,61 +62,62 @@ class RaggedMapOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
           elems=[[1, 2, 3], [4, 5], [6, 7]],
           expected_output=[[2, 6], [4.5, 9], [6.5, 13]],
           dtype=dtypes.float32,
+          expected_ragged_rank=0,
       ),
       # [d1, (d2)] -> [d1, (d2)]
       dict(
-          fn=lambda x: x+1,
+          fn=lambda x: x + np.int64(1),
           elems=[[1, 2, 3], [4, 5], [6, 7]],
           expected_output=[[2, 3, 4], [5, 6], [7, 8]],
           dtype=dtypes.int64,
-          result_dtype=ragged.RaggedTensorType(dtype=dtypes.int64,
-                                               ragged_rank=1),
+          result_dtype=ragged_tensor.RaggedTensorType(
+              dtype=dtypes.int64, ragged_rank=1),
       ),
       # [d1, (d2), d3] -> [d1, (d2), d3]
       dict(
-          fn=lambda x: x+1,
+          fn=lambda x: x + np.int64(1),
           elems=[[[1, 2], [3, 4]], [], [[5, 6], [7, 8], [9, 0]]],
           elems_ragged_rank=1,
           expected_ragged_rank=1,
-          result_dtype=ragged.RaggedTensorType(dtype=dtypes.int64,
-                                               ragged_rank=1),
+          result_dtype=ragged_tensor.RaggedTensorType(
+              dtype=dtypes.int64, ragged_rank=1),
           expected_output=[[[2, 3], [4, 5]], [], [[6, 7], [8, 9], [10, 1]]],
       ),
       # [d1, (d2)] -> [d1, (d2), (d3)]
       dict(
-          fn=lambda x: ragged.from_row_starts(x, [0]),
+          fn=lambda x: ragged_tensor.RaggedTensor.from_row_starts(x, [0]),
           elems=[[1, 2, 3], [4, 5], [6, 7]],
           expected_output=[[[1, 2, 3]], [[4, 5]], [[6, 7]]],
-          result_dtype=ragged.RaggedTensorType(dtype=dtypes.int64,
-                                               ragged_rank=2),
+          result_dtype=ragged_tensor.RaggedTensorType(
+              dtype=dtypes.int64, ragged_rank=2),
       ),
       # [d1, (d2), (d3)] -> [d1, (d2), (d3)]
       dict(
-          fn=lambda x: ragged.map_inner_values(mo.add, x, 1),
+          fn=lambda x: ragged_functional_ops.map_flat_values(mo.add, x, 1),
           elems=[[[1, 2, 3]], [[4, 5], [6, 7]]],
           expected_output=[[[2, 3, 4]], [[5, 6], [7, 8]]],
-          result_dtype=ragged.RaggedTensorType(dtype=dtypes.int64,
-                                               ragged_rank=2),
+          result_dtype=ragged_tensor.RaggedTensorType(
+              dtype=dtypes.int64, ragged_rank=2),
       ),
       # [d1, (d2), (d3)] -> [d1, (d2)]
       dict(
-          fn=lambda x: ragged.reduce_sum(x, axis=1),
+          fn=lambda x: ragged_math_ops.reduce_sum(x, axis=1),
           elems=[[[1, 2, 3]], [[4, 5], [6, 7]]],
           expected_output=[[6], [9, 13]],
-          result_dtype=ragged.RaggedTensorType(dtype=dtypes.int64,
-                                               ragged_rank=1),
+          result_dtype=ragged_tensor.RaggedTensorType(
+              dtype=dtypes.int64, ragged_rank=1),
       ),
       # [d1, (d2), (d3)] -> [d1, (d3)]
       dict(
-          fn=lambda x: ragged.reduce_sum(x, axis=0),
+          fn=lambda x: ragged_math_ops.reduce_sum(x, axis=0),
           elems=[[[1, 2, 3]], [[4, 5], [6, 7]]],
           expected_output=[[1, 2, 3], [10, 12]],
-          result_dtype=ragged.RaggedTensorType(dtype=dtypes.int64,
-                                               ragged_rank=1),
+          result_dtype=ragged_tensor.RaggedTensorType(
+              dtype=dtypes.int64, ragged_rank=1),
       ),
       # [d1, (d2), (d3)] -> [d1]
       dict(
-          fn=ragged.reduce_sum,
+          fn=ragged_math_ops.reduce_sum,
           elems=[[[1, 2, 3]], [[4, 5], [6, 7]]],
           expected_output=[6, 22],
           result_dtype=dtypes.int64,
@@ -116,31 +127,29 @@ class RaggedMapOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
           fn=mo.range,
           elems=[4, 0, 2],
           expected_output=[[0, 1, 2, 3], [], [0, 1]],
-          result_dtype=ragged.RaggedTensorType(dtype=dtypes.int64,
-                                               ragged_rank=1),
+          result_dtype=ragged_tensor.RaggedTensorType(
+              dtype=dtypes.int64, ragged_rank=1),
       ),
       # [d1] -> [d1, (d2), (d3)]
       dict(
-          fn=lambda x: ragged.range(mo.range(x)),
+          fn=lambda x: ragged_math_ops.range(mo.range(x)),
           elems=[5, 0, 3],
-          expected_output=[
-              [[], [0], [0, 1], [0, 1, 2], [0, 1, 2, 3]], [], [[], [0], [0, 1]]
-          ],
-          result_dtype=ragged.RaggedTensorType(dtype=dtypes.int64,
-                                               ragged_rank=2),
+          expected_output=[[[], [0], [0, 1], [0, 1, 2], [0, 1, 2, 3]], [],
+                           [[], [0], [0, 1]]],
+          result_dtype=ragged_tensor.RaggedTensorType(
+              dtype=dtypes.int64, ragged_rank=2),
       ),
       # [d1, (d2), (d3), (d4a), (d5)] ->  [d1, (d2), (d3), (d4b), (d5)]
       dict(
-          fn=lambda x: ragged.add(x, 1),
+          fn=lambda x: x + np.int64(1),
           elems=[[[[[1, 2, 3]], [[4], [5]]]], [[[[6, 7]]], [[[8], []]]]],
-          expected_output=[[[[[2, 3, 4]], [[5], [6]]]],
-                           [[[[7, 8]]], [[[9], []]]]],
-          result_dtype=ragged.RaggedTensorType(dtype=dtypes.int64,
-                                               ragged_rank=4),
+          expected_output=[[[[[2, 3, 4]], [[5], [6]]]], [[[[7, 8]]], [[[9],
+                                                                       []]]]],
+          result_dtype=ragged_tensor.RaggedTensorType(
+              dtype=dtypes.int64, ragged_rank=4),
       ),
   ])
 
-  @test_util.run_deprecated_v1
   def testRaggedMap(
       self,
       fn,
@@ -153,135 +162,135 @@ class RaggedMapOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
       result_dtype=None,
       infer_shape=False,
   ):
-    elems = ragged.constant(elems, dtype, elems_ragged_rank)
-    output = ragged.map_fn(
+    elems = ragged_factory_ops.constant(elems, dtype, elems_ragged_rank)
+    output = ragged_map_ops.map_fn(
         fn=fn, elems=elems, dtype=result_dtype, infer_shape=infer_shape)
 
-    expected_rt = ragged.constant(
+    expected_rt = ragged_factory_ops.constant(
         expected_output, ragged_rank=expected_ragged_rank)
-    with self.test_session():
-      if ragged.is_ragged(expected_output):
-        self.assertEqual(output.ragged_rank, expected_rt.ragged_rank)
-      output_values = self.evaluate(output)
-      self.assertAllEqual(expected_output, output_values.tolist())
+    self.assertRaggedEqual(expected_rt, output)
 
-  @test_util.run_deprecated_v1
   def testRaggedMapOnStructure(self):
-    batman = ragged.constant([[1, 2, 3], [4], [5, 6, 7]])
+    batman = ragged_factory_ops.constant([[1, 2, 3], [4], [5, 6, 7]])
     # [[10, 20, 30], [40], [50, 60, 70]]
-    robin = ragged.map_inner_values(mo.multiply, batman, 10)
+    robin = ragged_functional_ops.map_flat_values(mo.multiply, batman, 10)
 
     features = {'batman': batman, 'robin': robin}
 
     def _reduce_sum_from_all(f):
       return mo.reduce_sum(f['batman']) + mo.reduce_sum(f['robin'])
 
-    output = ragged.map_fn(
+    output = ragged_map_ops.map_fn(
         fn=_reduce_sum_from_all,
         elems=features,
         dtype=dtypes.int32,
     )
 
-    with self.test_session():
-      self.assertAllEqual(output.eval().tolist(), [66, 44, 198])
+    self.assertRaggedEqual(output, [66, 44, 198])
 
   # Test mapping over a dict of RTs can produce a dict of RTs.
-  @test_util.run_deprecated_v1
   def testRaggedMapOnStructure_RaggedOutputs(self):
-    batman = ragged.constant([[1, 2, 3], [4], [5, 6, 7]])
+    batman = ragged_factory_ops.constant([[1, 2, 3], [4], [5, 6, 7]])
     # [[10, 20, 30], [40], [50, 60, 70]]
-    robin = ragged.map_inner_values(mo.multiply, batman, 10)
+    robin = ragged_functional_ops.map_flat_values(mo.multiply, batman, 10)
 
     features = {'batman': batman, 'robin': robin}
 
     def _increment(f):
       return {
-          'batman': ragged.add(f['batman'], 1),
-          'robin': ragged.add(f['robin'], 1),
+          'batman': f['batman'] + 1,
+          'robin': f['robin'] + 1,
       }
 
-    output = ragged.map_fn(
+    output = ragged_map_ops.map_fn(
         fn=_increment,
         elems=features,
         infer_shape=False,
         dtype={
             'batman':
-                ragged.RaggedTensorType(dtype=dtypes.int32, ragged_rank=1),
+                ragged_tensor.RaggedTensorType(
+                    dtype=dtypes.int32, ragged_rank=1),
             'robin':
-                ragged.RaggedTensorType(dtype=dtypes.int32, ragged_rank=1)
+                ragged_tensor.RaggedTensorType(
+                    dtype=dtypes.int32, ragged_rank=1)
         },
     )
 
-    with self.test_session():
-      self.assertAllEqual(output['batman'].eval().tolist(),
-                          [[2, 3, 4], [5], [6, 7, 8]])
-      self.assertAllEqual(output['robin'].eval().tolist(),
-                          [[11, 21, 31], [41], [51, 61, 71]])
+    self.assertRaggedEqual(output['batman'], [[2, 3, 4], [5], [6, 7, 8]])
+    self.assertRaggedEqual(output['robin'], [[11, 21, 31], [41], [51, 61, 71]])
 
-  @test_util.run_deprecated_v1
   def testZip(self):
-    x = ragged.constant([[10, 20], [30, 40], [50, 60], [70], [80, 90, 100]],
-                        dtypes.int64)
-    y = array_ops.expand_dims(
-        mo.range(ragged.nrows(x), dtype=dtypes.int64), axis=1)
+    x = ragged_factory_ops.constant(
+        [[10, 20], [30, 40], [50, 60], [70], [80, 90, 100]], dtypes.int64)
+    y = array_ops.expand_dims(mo.range(x.nrows(), dtype=dtypes.int64), axis=1)
 
     def _zip(foo):
       y_val, x_val = foo
       bar = backend.tile(y_val, array_ops.shape(x_val))
       return array_ops.stack([bar, x_val], axis=1)
 
-    output = ragged.map_fn(
+    output = ragged_map_ops.map_fn(
         _zip, (y, x),
-        dtype=ragged.RaggedTensorType(dtype=dtypes.int64, ragged_rank=1),
+        dtype=ragged_tensor.RaggedTensorType(dtype=dtypes.int64, ragged_rank=1),
         infer_shape=False)
 
-    with self.test_session():
-      result = self.evaluate(output).tolist()
-      self.assertAllEqual(
-          result, [[[0, 10], [0, 20]], [[1, 30], [1, 40]], [[2, 50], [2, 60]],
-                   [[3, 70]], [[4, 80], [4, 90], [4, 100]]])
+    self.assertRaggedEqual(
+        output, [[[0, 10], [0, 20]], [[1, 30], [1, 40]], [[2, 50], [2, 60]],
+                 [[3, 70]], [[4, 80], [4, 90], [4, 100]]])
 
-  @test_util.run_deprecated_v1
   def testBatchGather(self):
-    tokens = ragged.constant([['hello', '.', 'there'], ['merhaba'],
-                              ['bonjour', '.', 'ca va', '?']])
-    indices = ragged.constant([[0, 2], [0], [0, 2]])
+    tokens = ragged_factory_ops.constant([['hello', '.', 'there'], ['merhaba'],
+                                          ['bonjour', '.', 'ca va', '?']])
+    indices = ragged_factory_ops.constant([[0, 2], [0], [0, 2]])
 
     def gather(x):
       tokens_val, indices_val = x
       return array_ops.gather(tokens_val, indices_val)
 
     data = tokens, indices
-    out = ragged.map_fn(
+    out = ragged_map_ops.map_fn(
         gather,
         data,
-        dtype=ragged.RaggedTensorType(dtype=dtypes.string, ragged_rank=1),
+        dtype=ragged_tensor.RaggedTensorType(
+            dtype=dtypes.string, ragged_rank=1),
         infer_shape=False)
 
-    with self.test_session():
-      self.assertAllEqual(
-          self.evaluate(out).tolist(),
-          [[b'hello', b'there'], [b'merhaba'], [b'bonjour', b'ca va']])
+    self.assertRaggedEqual(
+        out, [[b'hello', b'there'], [b'merhaba'], [b'bonjour', b'ca va']])
 
   def testMismatchRaggedRank(self):
-    elems = ragged.constant([[[1, 2, 3]], [[4, 5], [6, 7]]])
-    fn = lambda x: ragged.reduce_sum(x, axis=0)
+    elems = ragged_factory_ops.constant([[[1, 2, 3]], [[4, 5], [6, 7]]])
+    fn = lambda x: ragged_math_ops.reduce_sum(x, axis=0)
     with self.assertRaisesWithLiteralMatch(
         ValueError, r'The declared ragged rank (23) mismatches the result (1)'):
-      _ = ragged.map_fn(
+      _ = ragged_map_ops.map_fn(
           fn,
           elems,
-          dtype=ragged.RaggedTensorType(dtype=dtypes.int64, ragged_rank=23))
+          dtype=ragged_tensor.RaggedTensorType(
+              dtype=dtypes.int64, ragged_rank=23))
 
   def testMismatchRaggedRank2(self):
-    elems = ragged.constant([[1, 2, 3], [4, 5], [6, 7]])
-    fn = lambda x: ragged.from_row_starts(x, [0])
+    elems = ragged_factory_ops.constant([[1, 2, 3], [4, 5], [6, 7]])
+    fn = lambda x: ragged_tensor.RaggedTensor.from_row_starts(x, [0])
     with self.assertRaisesWithLiteralMatch(
         ValueError, r'The declared ragged rank (10) mismatches the result (1)'):
-      _ = ragged.map_fn(
+      _ = ragged_map_ops.map_fn(
           fn,
           elems,
-          dtype=ragged.RaggedTensorType(dtype=dtypes.int64, ragged_rank=10))
+          dtype=ragged_tensor.RaggedTensorType(
+              dtype=dtypes.int64, ragged_rank=10))
+
+  def testMapOnSparseTensor(self):
+    s = sparse_tensor.SparseTensor(
+        indices=[[0, 0], [0, 1], [1, 0], [1, 1]],
+        values=[0, 5, 0, 4],
+        dense_shape=[2, 2],
+    )
+    t2 = ragged_tensor.RaggedTensor.from_sparse(s)
+    id_t2 = ragged_map_ops.map_fn(
+        lambda x: x, t2,
+    )
+    self.assertRaggedEqual(id_t2, [[0, 5], [0, 4]])
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_map_ops.py b/tensorflow/python/ops/ragged/ragged_map_ops.py
index fafa23b8dcbbf128723c1b8e51611a958087fdeb..fbe188bd1a305c1b366461528139bfcbb85b6367 100644
--- a/tensorflow/python/ops/ragged/ragged_map_ops.py
+++ b/tensorflow/python/ops/ragged/ragged_map_ops.py
@@ -27,12 +27,12 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variable_scope as vs
-from tensorflow.python.ops.ragged import ragged_array_ops
-from tensorflow.python.ops.ragged import ragged_factory_ops
 from tensorflow.python.ops.ragged import ragged_tensor
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import nest
@@ -216,8 +216,8 @@ def map_fn(fn,
         varscope_caching_device_was_none = True
 
     elems_flat = [
-        ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
-            elem, name="elem") for elem in elems_flat
+        ragged_tensor.convert_to_tensor_or_ragged_tensor(elem, name="elem")
+        for elem in elems_flat
     ]
 
     # We can either infer the output, or we can assume that it will be the same
@@ -226,7 +226,7 @@ def map_fn(fn,
 
     # Find the number of iterations, n may be known statically.
     if isinstance(elems_flat[0], ragged_tensor.RaggedTensor):
-      n = ragged_array_ops.nrows(elems_flat[0], out_type=dtypes.int32)
+      n = elems_flat[0].nrows(out_type=dtypes.int32)
     else:
       static_shape = elems_flat[0].shape
       if static_shape.ndims is not None and static_shape.ndims < 1:
@@ -236,8 +236,10 @@ def map_fn(fn,
         else:
           raise ValueError(
               "elements in elems must be 1+ dimensional Tensors, not scalars")
-      n = static_shape[0].value or array_ops.shape(elems_flat[0])[0]
+      n = (tensor_shape.dimension_value(static_shape[0]) or
+           array_ops.shape(elems_flat[0])[0])
 
+    n = math_ops.cast(n, dtype=dtypes.int32)
     # Create a flat list of TAs.
 
     # Flatten the dtype structure to a list.
@@ -254,7 +256,7 @@ def map_fn(fn,
         for t in dtype_components_flat
     ]
 
-    i = constant_op.constant(0)
+    i = constant_op.constant(0, dtype=dtypes.int32)
 
     def compute(i, tas):
       """The loop body of map_fn.
@@ -334,7 +336,7 @@ def map_fn(fn,
 class _RaggedTensorComponents(
     collections.namedtuple(
         "_RaggedTensorComponents",
-        ["inner_values", "nested_row_lengths", "outer_row_length"])):
+        ["flat_values", "nested_row_lengths", "outer_row_length"])):
   """A namedtuple of components which represent a `RaggedTensor`.
 
   _RaggedTensorComponents is a list of components which can be used to create a
@@ -344,7 +346,7 @@ class _RaggedTensorComponents(
 
   The following are a list of components for a `RaggedTensor`:
 
-  inner_values: The flat and inner values of a RaggedTensor. This could be
+  flat_values: The flat and inner values of a RaggedTensor. This could be
     a `Tensor`, a `TensorArray`, or a data type.
   nested_row_lengths: a tuple containing the row lengths of each rank. The
     elements of the tuple could be `Tensor`s or `TensorArray`s.
@@ -357,12 +359,12 @@ class _RaggedTensorComponents(
 
 
 def _concat_ragged_tensor_components(rt_ta):
-  inner_values = rt_ta.inner_values.concat()
+  flat_values = rt_ta.flat_values.concat()
   nested_row_lengths = tuple(
       row_lengths_ta.concat() for row_lengths_ta in rt_ta.nested_row_lengths)
   outer_row_length = rt_ta.outer_row_length.concat()
   return _RaggedTensorComponents(
-      inner_values=inner_values,
+      flat_values=flat_values,
       nested_row_lengths=nested_row_lengths,
       outer_row_length=outer_row_length)
 
@@ -374,17 +376,17 @@ def _maybe_decompose_tensor(rt):
 
   # The three component pieces we need:
   # - inner values
-  inner_values = rt.inner_values
+  flat_values = rt.flat_values
 
   # - row_splits of the RT
   splits = rt.nested_row_splits
   nested_row_lengths = tuple(split[1:] - split[:-1] for split in splits)
 
   # - outer row length
-  outer_row_length = array_ops.expand_dims(ragged_array_ops.nrows(rt), axis=0)
+  outer_row_length = array_ops.expand_dims(rt.nrows(), axis=0)
 
   return _RaggedTensorComponents(
-      inner_values=inner_values,
+      flat_values=flat_values,
       nested_row_lengths=nested_row_lengths,
       outer_row_length=outer_row_length,
   )
@@ -395,11 +397,12 @@ def _maybe_recompose_tensor(t):
   if not isinstance(t, _RaggedTensorComponents):
     return t
 
-  values = t.inner_values
+  values = t.flat_values
   nested_row_lengths = tuple(t.nested_row_lengths)
   for nested_row_length in reversed(nested_row_lengths):
-    values = ragged_factory_ops.from_row_lengths(values, nested_row_length)
-  return ragged_factory_ops.from_row_lengths(values, t.outer_row_length)
+    values = ragged_tensor.RaggedTensor.from_row_lengths(
+        values, nested_row_length)
+  return ragged_tensor.RaggedTensor.from_row_lengths(values, t.outer_row_length)
 
 
 def _maybe_decompose_dtype(d):
@@ -408,7 +411,7 @@ def _maybe_decompose_dtype(d):
     return d
 
   result = _RaggedTensorComponents(
-      inner_values=d.dtype,
+      flat_values=d.dtype,
       nested_row_lengths=tuple(dtypes.int64 for i in range(d.ragged_rank - 1)),
       outer_row_length=dtypes.int64,
   )
@@ -435,10 +438,13 @@ def _convert_declared(fn_output_flat, output_declared):
               "The declared ragged rank (%d) mismatches the result (1)" %
               declared.ragged_rank)
 
-        row_length = array_ops.expand_dims(
-            ragged_array_ops.nrows(current), axis=0)
+        if isinstance(current, ragged_tensor.RaggedTensor):
+          nrows = current.nrows()
+        else:
+          nrows = array_ops.shape(current, out_type=dtypes.int64)[0]
+        row_length = array_ops.expand_dims(nrows, axis=0)
         rt = _RaggedTensorComponents(
-            inner_values=current,
+            flat_values=current,
             nested_row_lengths=(),
             outer_row_length=row_length)
         yield rt
diff --git a/tensorflow/python/ops/ragged/ragged_math_ops.py b/tensorflow/python/ops/ragged/ragged_math_ops.py
index 857b8dbfa361901108bf88949ac167a277991e36..02e927b6991f8d86176c347442a2f49cfdf4ce92 100644
--- a/tensorflow/python/ops/ragged/ragged_math_ops.py
+++ b/tensorflow/python/ops/ragged/ragged_math_ops.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
@@ -25,17 +27,18 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import gen_ragged_math_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops.ragged import ragged_factory_ops
 from tensorflow.python.ops.ragged import ragged_functional_ops
 from tensorflow.python.ops.ragged import ragged_tensor
 from tensorflow.python.ops.ragged import ragged_util
 from tensorflow.python.ops.ragged import segment_id_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
 #===============================================================================
 # ragged.range
 #===============================================================================
 # pylint: disable=redefined-builtin
+@tf_export('ragged.range')
 def range(starts, limits=None, deltas=1, dtype=None, name=None):
   """Returns a `RaggedTensor` containing the specified sequences of numbers.
 
@@ -97,8 +100,8 @@ def range(starts, limits=None, deltas=1, dtype=None, name=None):
           [dtypes.int32, dtypes.int64, dtypes.float32, dtypes.float64])
 
     result = gen_ragged_math_ops.ragged_range(starts, limits, deltas, name=name)
-    return ragged_factory_ops.from_row_splits(result.rt_dense_values,
-                                              result.rt_nested_splits)
+    return ragged_tensor.RaggedTensor.from_row_splits(result.rt_dense_values,
+                                                      result.rt_nested_splits)
 
 
 def _infer_matching_dtype(tensors, dtype_hierarchy):
@@ -143,8 +146,11 @@ Computes the %(combination)s along segments of a RaggedTensor.
 """
 
 
-def _ragged_segment_aggregate(unsorted_segment_op, data, segment_ids,
-                              num_segments, name=None):
+def _ragged_segment_aggregate(unsorted_segment_op,
+                              data,
+                              segment_ids,
+                              num_segments,
+                              name=None):
   """Aggregates along segments of a RaggedTensor using `unsorted_segment_op`.
 
   Returns a RaggedTensor `output` with `num_segments` rows, where the row
@@ -181,9 +187,8 @@ def _ragged_segment_aggregate(unsorted_segment_op, data, segment_ids,
 
   with ops.name_scope(name, 'RaggedSegment',
                       [data, segment_ids, num_segments]) as name:
-    data = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
-        data, name='data')
-    segment_ids = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
+    data = ragged_tensor.convert_to_tensor_or_ragged_tensor(data, name='data')
+    segment_ids = ragged_tensor.convert_to_tensor_or_ragged_tensor(
         segment_ids, name='segment_ids')
 
     if ragged_tensor.is_ragged(segment_ids):
@@ -212,12 +217,11 @@ def _ragged_segment_aggregate(unsorted_segment_op, data, segment_ids,
     assert output_row_lengths.dtype == dtypes.int64
 
     # Build the splits tensor for the output RaggedTensor.
-    output_splits = array_ops.concat(
-        [
-            array_ops.zeros([1], dtypes.int64),
-            math_ops.cumsum(output_row_lengths)
-        ],
-        axis=0)
+    output_splits = array_ops.concat([
+        array_ops.zeros([1], dtypes.int64),
+        math_ops.cumsum(output_row_lengths)
+    ],
+                                     axis=0)
 
     # For each row in `data`, find the start & limit position where that row's
     # values will be aggregated in output.values.
@@ -234,7 +238,8 @@ def _ragged_segment_aggregate(unsorted_segment_op, data, segment_ids,
     output_values = _ragged_segment_aggregate(unsorted_segment_op, data.values,
                                               data_val_to_out_val_index,
                                               output_splits[-1])
-    return ragged_factory_ops.from_row_splits(output_values, output_splits)
+    return ragged_tensor.RaggedTensor.from_row_splits(output_values,
+                                                      output_splits)
 
 
 def segment_sum(data, segment_ids, num_segments, name=None):
@@ -266,28 +271,32 @@ def segment_max(data, segment_ids, num_segments, name=None):
 
 
 def segment_mean(data, segment_ids, num_segments, name=None):
-  # For docs, see: _RAGGED_SEGMENT_DOCSTRING
+  """For docs, see: _RAGGED_SEGMENT_DOCSTRING."""
   with ops.name_scope(name, 'RaggedSegmentMean',
                       [data, segment_ids, num_segments]):
     total = segment_sum(data, segment_ids, num_segments)
-    ones = ragged_factory_ops.from_nested_row_splits(
-        array_ops.ones_like(data.inner_values), data.nested_row_splits)
+    ones = ragged_tensor.RaggedTensor.from_nested_row_splits(
+        array_ops.ones_like(data.flat_values), data.nested_row_splits)
     count = segment_sum(ones, segment_ids, num_segments)
-    return ragged_factory_ops.from_nested_row_splits(
-        total.inner_values / count.inner_values, total.nested_row_splits)
+    if ragged_tensor.is_ragged(total):
+      return total.with_flat_values(total.flat_values / count.flat_values)
+    else:
+      return total / count
 
 
 def segment_sqrt_n(data, segment_ids, num_segments, name=None):
-  # For docs, see: _RAGGED_SEGMENT_DOCSTRING
+  """For docs, see: _RAGGED_SEGMENT_DOCSTRING."""
   with ops.name_scope(name, 'RaggedSegmentSqrtN',
                       [data, segment_ids, num_segments]):
     total = segment_sum(data, segment_ids, num_segments)
-    ones = ragged_factory_ops.from_nested_row_splits(
-        array_ops.ones_like(data.inner_values), data.nested_row_splits)
+    ones = ragged_tensor.RaggedTensor.from_nested_row_splits(
+        array_ops.ones_like(data.flat_values), data.nested_row_splits)
     count = segment_sum(ones, segment_ids, num_segments)
-    return ragged_factory_ops.from_nested_row_splits(
-        total.inner_values / math_ops.sqrt(count.inner_values),
-        total.nested_row_splits)
+    if ragged_tensor.is_ragged(total):
+      return total.with_flat_values(
+          total.flat_values / math_ops.sqrt(count.flat_values))
+    else:
+      return total / math_ops.sqrt(count)
 
 
 def _set_ragged_segment_docstring(func, combination, combined):
@@ -311,7 +320,7 @@ _set_ragged_segment_docstring(segment_sqrt_n, 'sum divided by sqrt(N)',
 _RAGGED_REDUCE_DOCSTRING = """\
 Computes the %(combination)s of elements across dimensions of a `RaggedTensor`.
 
-  Reduces `rt_input` along the dimensions given in `axis` by taking the
+  Reduces `input_tensor` along the dimensions given in `axis` by taking the
   %(combination)s of values.  If a reduced dimension has no elements for
   some index, then the value for that index will be %(default)s.
 
@@ -319,18 +328,18 @@ Computes the %(combination)s of elements across dimensions of a `RaggedTensor`.
   `axis` is not specified, then all dimensions are reduced, and a scalar
   value is returned.
   Args:
-    rt_input: A `RaggedTensor` containing the values to be %(combined)s.
+    input_tensor: A `RaggedTensor` containing the values to be %(combined)s.
     axis: The dimensions to reduce.  May be `None` (to reduce all axes), an
       `int` (to reduce a single axis), a `list` or `tuple` of `int` (to reduce
       a given set of axes), or a `Tensor` with a constant value.  Must be in
-      the range `[0, rt_input.rank]`.
+      the range `[0, input_tensor.rank]`.
     name: A name prefix for the returned tensor (optional).
   Returns:
     A `RaggedTensor` containing the %(combined)s values.  The returned tensor
     has the same dtype as `data`, and its shape is given by removing the
-    dimensions specified in `axis` from `rt_input.shape`.  The `ragged_rank`
+    dimensions specified in `axis` from `input_tensor.shape`.  The `ragged_rank`
     of the returned tensor is given by substracting any ragged dimensions
-    specified in `axis` from `rt_input.ragged_rank`.
+    specified in `axis` from `input_tensor.ragged_rank`.
   Raises:
     ValueError: If `axis` contains a `Tensor` whose value is not constant.
   ####Example:
@@ -387,7 +396,11 @@ _RAGGED_REDUCE_ANY_EXAMPLE = """
 """
 
 
-def _ragged_reduce_aggregate(reduce_op, unsorted_segment_op, rt_input, axis,
+def _ragged_reduce_aggregate(reduce_op,
+                             unsorted_segment_op,
+                             rt_input,
+                             axis,
+                             keepdims,
                              name=None):
   """Aggregates across axes of a RaggedTensor using the given `Tensor` ops.
 
@@ -412,6 +425,7 @@ def _ragged_reduce_aggregate(reduce_op, unsorted_segment_op, rt_input, axis,
       `int` (to reduce a single axis), a `list` or `tuple` of `int` (to reduce a
       given set of axes), or a `Tensor` with a constant value.  Must be in the
       range `[0, rt_input.rank)`.
+    keepdims: If true, retains reduced dimensions with length 1.
     name: A name prefix for the returned tensor (optional).
 
   Returns:
@@ -426,14 +440,19 @@ def _ragged_reduce_aggregate(reduce_op, unsorted_segment_op, rt_input, axis,
   if not ragged_tensor.is_ragged(rt_input):
     return reduce_op(rt_input, axis, name=name)
 
+  if keepdims:
+    raise ValueError('keepdims=True is not supported for RaggedTensors.')
+
   if isinstance(axis, ops.Tensor):
     axis = tensor_util.constant_value(axis)
     if axis is None:
       raise ValueError('axis must be known at graph construction time.')
+    if isinstance(axis, np.ndarray):
+      axis = axis.tolist()
 
   # When reducing all axes, just ignore splits & reduce the inner values.
   if axis is None:
-    return reduce_op(rt_input.inner_values, None, name=name)
+    return reduce_op(rt_input.flat_values, None, name=name)
 
   with ops.name_scope(name, 'RaggedReduce', [rt_input, axis]):
     if isinstance(axis, (tuple, list)):
@@ -448,15 +467,15 @@ def _ragged_reduce_aggregate(reduce_op, unsorted_segment_op, rt_input, axis,
         # once will probably require a nontrivial c++ op.
         axis = sorted(axis)
         inner_reduced = _ragged_reduce_aggregate(reduce_op, unsorted_segment_op,
-                                                 rt_input, axis[-1])
+                                                 rt_input, axis[-1], keepdims)
         return _ragged_reduce_aggregate(reduce_op, unsorted_segment_op,
-                                        inner_reduced, axis[:-1])
+                                        inner_reduced, axis[:-1], keepdims)
 
-    axis = ragged_util.get_positive_axis(axis, rt_input.shape.ndims)
-
-    rt_input = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
+    rt_input = ragged_tensor.convert_to_tensor_or_ragged_tensor(
         rt_input, name='rt_input')
 
+    axis = ragged_util.get_positive_axis(axis, rt_input.shape.ndims)
+
     if axis == 0:
       # out[i_1, i_2, ..., i_N] = sum_{j} rt_input[j, i_1, i_2, ..., i_N]
       row_lengths = rt_input.row_splits[1:] - rt_input.row_splits[:-1]
@@ -476,69 +495,74 @@ def _ragged_reduce_aggregate(reduce_op, unsorted_segment_op, rt_input, axis,
       #     sum_{j} rt_input [i_0, ..., i_[axis-1], j, i_axis+1], ..., i_N]
       return rt_input.with_values(
           _ragged_reduce_aggregate(reduce_op, unsorted_segment_op,
-                                   rt_input.values, axis - 1))
+                                   rt_input.values, axis - 1, keepdims))
 
 
-def reduce_sum(rt_input, axis=None, name=None):
+def reduce_sum(input_tensor, axis=None, keepdims=None, name=None):
   """For docs, see: _RAGGED_REDUCE_DOCSTRING."""
   return _ragged_reduce_aggregate(math_ops.reduce_sum,
-                                  math_ops.unsorted_segment_sum, rt_input, axis,
-                                  name or 'RaggedReduceSum')
+                                  math_ops.unsorted_segment_sum, input_tensor,
+                                  axis, keepdims, name or 'RaggedReduceSum')
 
 
-def reduce_prod(rt_input, axis=None, name=None):
+def reduce_prod(input_tensor, axis=None, keepdims=None, name=None):
   """For docs, see: _RAGGED_REDUCE_DOCSTRING."""
   return _ragged_reduce_aggregate(math_ops.reduce_prod,
-                                  math_ops.unsorted_segment_prod, rt_input,
-                                  axis, name or 'RaggedReduceProd')
+                                  math_ops.unsorted_segment_prod, input_tensor,
+                                  axis, keepdims, name or 'RaggedReduceProd')
 
 
-def reduce_min(rt_input, axis=None, name=None):
+def reduce_min(input_tensor, axis=None, keepdims=None, name=None):
   """For docs, see: _RAGGED_REDUCE_DOCSTRING."""
   return _ragged_reduce_aggregate(math_ops.reduce_min,
-                                  math_ops.unsorted_segment_min, rt_input, axis,
-                                  name or 'RaggedReduceMin')
+                                  math_ops.unsorted_segment_min, input_tensor,
+                                  axis, keepdims, name or 'RaggedReduceMin')
 
 
-def reduce_max(rt_input, axis=None, name=None):
+def reduce_max(input_tensor, axis=None, keepdims=None, name=None):
   """For docs, see: _RAGGED_REDUCE_DOCSTRING."""
   return _ragged_reduce_aggregate(math_ops.reduce_max,
-                                  math_ops.unsorted_segment_max, rt_input, axis,
-                                  name or 'RaggedReduceMax')
+                                  math_ops.unsorted_segment_max, input_tensor,
+                                  axis, keepdims, name or 'RaggedReduceMax')
 
 
-def reduce_mean(rt_input, axis=None, name=None):
+def reduce_mean(input_tensor, axis=None, keepdims=None, name=None):
   """For docs, see: _RAGGED_REDUCE_DOCSTRING."""
-  with ops.name_scope(name, 'RaggedReduceMean', [rt_input, axis]):
-    total = reduce_sum(rt_input, axis)
-    if ragged_tensor.is_ragged(rt_input):
-      ones = ragged_factory_ops.from_nested_row_splits(
-          array_ops.ones_like(rt_input.inner_values),
-          rt_input.nested_row_splits)
+  with ops.name_scope(name, 'RaggedReduceMean', [input_tensor, axis]):
+    total = reduce_sum(input_tensor, axis, keepdims)
+    if ragged_tensor.is_ragged(input_tensor):
+      ones = ragged_tensor.RaggedTensor.from_nested_row_splits(
+          array_ops.ones_like(input_tensor.flat_values),
+          input_tensor.nested_row_splits)
     else:
-      ones = array_ops.ones_like(rt_input)
-    count = reduce_sum(ones, axis)
+      ones = array_ops.ones_like(input_tensor)
+    count = reduce_sum(ones, axis, keepdims)
     if ragged_tensor.is_ragged(total):
-      return ragged_factory_ops.from_nested_row_splits(
-          total.inner_values / count.inner_values, total.nested_row_splits)
+      return ragged_tensor.RaggedTensor.from_nested_row_splits(
+          total.flat_values / count.flat_values, total.nested_row_splits)
     else:
       return total / count
 
 
-def _cast(rt_input, dtype):
-  return ragged_functional_ops.map_inner_values(math_ops.cast, rt_input, dtype)
+def _cast(input_tensor, dtype):
+  return ragged_functional_ops.map_flat_values(math_ops.cast, input_tensor,
+                                               dtype)
 
 
-def reduce_all(rt_input, axis=None, name=None):
+def reduce_all(input_tensor, axis=None, keepdims=None, name=None):
   """For docs, see: _RAGGED_REDUCE_DOCSTRING."""
-  with ops.name_scope(name, 'RaggedReduceAll', [rt_input, axis]):
-    return _cast(reduce_prod(_cast(rt_input, dtypes.int32), axis), dtypes.bool)
+  with ops.name_scope(name, 'RaggedReduceAll', [input_tensor, axis]):
+    return _cast(
+        reduce_prod(_cast(input_tensor, dtypes.int32), axis, keepdims),
+        dtypes.bool)
 
 
-def reduce_any(rt_input, axis=None, name=None):
+def reduce_any(input_tensor, axis=None, keepdims=None, name=None):
   """For docs, see: _RAGGED_REDUCE_DOCSTRING."""
-  with ops.name_scope(name, 'RaggedReduceAny', [rt_input, axis]):
-    return _cast(reduce_sum(_cast(rt_input, dtypes.int32), axis), dtypes.bool)
+  with ops.name_scope(name, 'RaggedReduceAny', [input_tensor, axis]):
+    return _cast(
+        reduce_sum(_cast(input_tensor, dtypes.int32), axis, keepdims),
+        dtypes.bool)
 
 
 def _set_ragged_reduce_docstring(func, combination, combined, default, example):
@@ -554,9 +578,11 @@ _set_ragged_reduce_docstring(reduce_sum, 'sum', 'summed', '0',
 _set_ragged_reduce_docstring(reduce_prod, 'product', 'multiplied', '1',
                              _RAGGED_REDUCE_PROD_EXAMPLE)
 _set_ragged_reduce_docstring(reduce_min, 'minimum', 'minimized',
-                             '`rt_input.dtype.min`', _RAGGED_REDUCE_MIN_EXAMPLE)
+                             '`input_tensor.dtype.min`',
+                             _RAGGED_REDUCE_MIN_EXAMPLE)
 _set_ragged_reduce_docstring(reduce_max, 'maximum', 'maximized',
-                             '`rt_input.dtype.max`', _RAGGED_REDUCE_MAX_EXAMPLE)
+                             '`input_tensor.dtype.max`',
+                             _RAGGED_REDUCE_MAX_EXAMPLE)
 _set_ragged_reduce_docstring(reduce_mean, 'mean', 'averaged', 'NaN',
                              _RAGGED_REDUCE_MEAN_EXAMPLE)
 
diff --git a/tensorflow/python/ops/ragged/ragged_operators.py b/tensorflow/python/ops/ragged/ragged_operators.py
index 223ba0d2e7f050650a0849fdb4987afb38cebd2e..7654fa22b1e3a6d783a7a3295bca2d1a0b2ea757 100644
--- a/tensorflow/python/ops/ragged/ragged_operators.py
+++ b/tensorflow/python/ops/ragged/ragged_operators.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.ops.ragged import ragged_elementwise_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.ragged import ragged_getitem
 from tensorflow.python.ops.ragged import ragged_tensor
 from tensorflow.python.util import tf_decorator
@@ -33,40 +33,39 @@ def _right(operator):
 ragged_tensor.RaggedTensor.__getitem__ = ragged_getitem.ragged_tensor_getitem
 
 # Ordering operators
-ragged_tensor.RaggedTensor.__ge__ = ragged_elementwise_ops.greater_equal
-ragged_tensor.RaggedTensor.__gt__ = ragged_elementwise_ops.greater
-ragged_tensor.RaggedTensor.__le__ = ragged_elementwise_ops.less_equal
-ragged_tensor.RaggedTensor.__lt__ = ragged_elementwise_ops.less
+ragged_tensor.RaggedTensor.__ge__ = math_ops.greater_equal
+ragged_tensor.RaggedTensor.__gt__ = math_ops.greater
+ragged_tensor.RaggedTensor.__le__ = math_ops.less_equal
+ragged_tensor.RaggedTensor.__lt__ = math_ops.less
 
 # Logical operators
-ragged_tensor.RaggedTensor.__and__ = ragged_elementwise_ops.logical_and
-ragged_tensor.RaggedTensor.__rand__ = _right(ragged_elementwise_ops.logical_and)
-ragged_tensor.RaggedTensor.__invert__ = ragged_elementwise_ops.logical_not
-ragged_tensor.RaggedTensor.__ror__ = _right(ragged_elementwise_ops.logical_or)
-ragged_tensor.RaggedTensor.__or__ = ragged_elementwise_ops.logical_or
-ragged_tensor.RaggedTensor.__xor__ = ragged_elementwise_ops.logical_xor
-ragged_tensor.RaggedTensor.__rxor__ = _right(ragged_elementwise_ops.logical_xor)
+ragged_tensor.RaggedTensor.__and__ = math_ops.logical_and
+ragged_tensor.RaggedTensor.__rand__ = _right(math_ops.logical_and)
+ragged_tensor.RaggedTensor.__invert__ = math_ops.logical_not
+ragged_tensor.RaggedTensor.__ror__ = _right(math_ops.logical_or)
+ragged_tensor.RaggedTensor.__or__ = math_ops.logical_or
+ragged_tensor.RaggedTensor.__xor__ = math_ops.logical_xor
+ragged_tensor.RaggedTensor.__rxor__ = _right(math_ops.logical_xor)
 
 # Arithmetic operators
-ragged_tensor.RaggedTensor.__abs__ = ragged_elementwise_ops.abs
-ragged_tensor.RaggedTensor.__add__ = ragged_elementwise_ops.add
-ragged_tensor.RaggedTensor.__radd__ = _right(ragged_elementwise_ops.add)
-ragged_tensor.RaggedTensor.__div__ = ragged_elementwise_ops.div
-ragged_tensor.RaggedTensor.__rdiv__ = _right(ragged_elementwise_ops.div)
-ragged_tensor.RaggedTensor.__floordiv__ = ragged_elementwise_ops.floordiv
-ragged_tensor.RaggedTensor.__rfloordiv__ = _right(
-    ragged_elementwise_ops.floordiv)
-ragged_tensor.RaggedTensor.__mod__ = ragged_elementwise_ops.floormod
-ragged_tensor.RaggedTensor.__rmod__ = _right(ragged_elementwise_ops.floormod)
-ragged_tensor.RaggedTensor.__mul__ = ragged_elementwise_ops.multiply
-ragged_tensor.RaggedTensor.__rmul__ = _right(ragged_elementwise_ops.multiply)
-ragged_tensor.RaggedTensor.__neg__ = ragged_elementwise_ops.negative
-ragged_tensor.RaggedTensor.__pow__ = ragged_elementwise_ops.pow
-ragged_tensor.RaggedTensor.__rpow__ = _right(ragged_elementwise_ops.pow)
-ragged_tensor.RaggedTensor.__sub__ = ragged_elementwise_ops.subtract
-ragged_tensor.RaggedTensor.__rsub__ = _right(ragged_elementwise_ops.subtract)
-ragged_tensor.RaggedTensor.__truediv__ = ragged_elementwise_ops.truediv
-ragged_tensor.RaggedTensor.__rtruediv__ = _right(ragged_elementwise_ops.truediv)
+ragged_tensor.RaggedTensor.__abs__ = math_ops.abs
+ragged_tensor.RaggedTensor.__add__ = math_ops.add
+ragged_tensor.RaggedTensor.__radd__ = _right(math_ops.add)
+ragged_tensor.RaggedTensor.__div__ = math_ops.div
+ragged_tensor.RaggedTensor.__rdiv__ = _right(math_ops.div)
+ragged_tensor.RaggedTensor.__floordiv__ = math_ops.floordiv
+ragged_tensor.RaggedTensor.__rfloordiv__ = _right(math_ops.floordiv)
+ragged_tensor.RaggedTensor.__mod__ = math_ops.floormod
+ragged_tensor.RaggedTensor.__rmod__ = _right(math_ops.floormod)
+ragged_tensor.RaggedTensor.__mul__ = math_ops.multiply
+ragged_tensor.RaggedTensor.__rmul__ = _right(math_ops.multiply)
+ragged_tensor.RaggedTensor.__neg__ = math_ops.negative
+ragged_tensor.RaggedTensor.__pow__ = math_ops.pow
+ragged_tensor.RaggedTensor.__rpow__ = _right(math_ops.pow)
+ragged_tensor.RaggedTensor.__sub__ = math_ops.subtract
+ragged_tensor.RaggedTensor.__rsub__ = _right(math_ops.subtract)
+ragged_tensor.RaggedTensor.__truediv__ = math_ops.truediv
+ragged_tensor.RaggedTensor.__rtruediv__ = _right(math_ops.truediv)
 
 
 # Dummy methods
diff --git a/tensorflow/python/ops/ragged/ragged_operators_test.py b/tensorflow/python/ops/ragged/ragged_operators_test.py
index 7fe8159d82215071fb151174b5c1722c54f56966..d1c6b902f2fa223b3fabfb4184e8ebb004b16a40 100644
--- a/tensorflow/python/ops/ragged/ragged_operators_test.py
+++ b/tensorflow/python/ops/ragged/ragged_operators_test.py
@@ -19,85 +19,75 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import test_util
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
 
-class RaggedElementwiseOpsTest(test_util.TensorFlowTestCase):
-  # @TODO(edloper): Test right-handed versions of operators once we add
-  # broadcasting support for elementwise ops.
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedElementwiseOpsTest(ragged_test_util.RaggedTensorTestCase):
 
-  @test_util.run_deprecated_v1
   def testOrderingOperators(self):
-    x = ragged.constant([[1, 5], [3]])
-    y = ragged.constant([[4, 5], [1]])
-    with self.test_session():
-      self.assertEqual((x > y).eval().tolist(), [[False, False], [True]])
-      self.assertEqual((x >= y).eval().tolist(), [[False, True], [True]])
-      self.assertEqual((x < y).eval().tolist(), [[True, False], [False]])
-      self.assertEqual((x <= y).eval().tolist(), [[True, True], [False]])
-
-  def assertEqual(self, a, b):
-    if a != b:
-      print('%30s %s' % (b, a))
-
-  @test_util.run_deprecated_v1
+    x = ragged_factory_ops.constant([[1, 5], [3]])
+    y = ragged_factory_ops.constant([[4, 5], [1]])
+    self.assertRaggedEqual((x > y), [[False, False], [True]])
+    self.assertRaggedEqual((x >= y), [[False, True], [True]])
+    self.assertRaggedEqual((x < y), [[True, False], [False]])
+    self.assertRaggedEqual((x <= y), [[True, True], [False]])
+
   def testArithmeticOperators(self):
-    x = ragged.constant([[1.0, -2.0], [8.0]])
-    y = ragged.constant([[4.0, 4.0], [2.0]])
-    with self.test_session():
-      self.assertEqual(abs(x).eval().tolist(), [[1.0, 2.0], [8.0]])
+    x = ragged_factory_ops.constant([[1.0, -2.0], [8.0]])
+    y = ragged_factory_ops.constant([[4.0, 4.0], [2.0]])
+    self.assertRaggedEqual(abs(x), [[1.0, 2.0], [8.0]])
 
-      self.assertEqual((-x).eval().tolist(), [[-1.0, 2.0], [-8.0]])
+    self.assertRaggedEqual((-x), [[-1.0, 2.0], [-8.0]])
 
-      self.assertEqual((x + y).eval().tolist(), [[5.0, 2.0], [10.0]])
-      self.assertEqual((3.0 + y).eval().tolist(), [[7.0, 7.0], [5.0]])
-      self.assertEqual((x + 3.0).eval().tolist(), [[4.0, 1.0], [11.0]])
+    self.assertRaggedEqual((x + y), [[5.0, 2.0], [10.0]])
+    self.assertRaggedEqual((3.0 + y), [[7.0, 7.0], [5.0]])
+    self.assertRaggedEqual((x + 3.0), [[4.0, 1.0], [11.0]])
 
-      self.assertEqual((x - y).eval().tolist(), [[-3.0, -6.0], [6.0]])
-      self.assertEqual((3.0 - y).eval().tolist(), [[-1.0, -1.0], [1.0]])
-      self.assertEqual((x + 3.0).eval().tolist(), [[4.0, 1.0], [11.0]])
+    self.assertRaggedEqual((x - y), [[-3.0, -6.0], [6.0]])
+    self.assertRaggedEqual((3.0 - y), [[-1.0, -1.0], [1.0]])
+    self.assertRaggedEqual((x + 3.0), [[4.0, 1.0], [11.0]])
 
-      self.assertEqual((x * y).eval().tolist(), [[4.0, -8.0], [16.0]])
-      self.assertEqual((3.0 * y).eval().tolist(), [[12.0, 12.0], [6.0]])
-      self.assertEqual((x * 3.0).eval().tolist(), [[3.0, -6.0], [24.0]])
+    self.assertRaggedEqual((x * y), [[4.0, -8.0], [16.0]])
+    self.assertRaggedEqual((3.0 * y), [[12.0, 12.0], [6.0]])
+    self.assertRaggedEqual((x * 3.0), [[3.0, -6.0], [24.0]])
 
-      self.assertEqual((x / y).eval().tolist(), [[0.25, -0.5], [4.0]])
-      self.assertEqual((y / x).eval().tolist(), [[4.0, -2.0], [0.25]])
-      self.assertEqual((2.0 / y).eval().tolist(), [[0.5, 0.5], [1.0]])
-      self.assertEqual((x / 2.0).eval().tolist(), [[0.5, -1.0], [4.0]])
+    self.assertRaggedEqual((x / y), [[0.25, -0.5], [4.0]])
+    self.assertRaggedEqual((y / x), [[4.0, -2.0], [0.25]])
+    self.assertRaggedEqual((2.0 / y), [[0.5, 0.5], [1.0]])
+    self.assertRaggedEqual((x / 2.0), [[0.5, -1.0], [4.0]])
 
-      self.assertEqual((x // y).eval().tolist(), [[0.0, -1.0], [4.0]])
-      self.assertEqual((y // x).eval().tolist(), [[4.0, -2.0], [0.0]])
-      self.assertEqual((2.0 // y).eval().tolist(), [[0.0, 0.0], [1.0]])
-      self.assertEqual((x // 2.0).eval().tolist(), [[0.0, -1.0], [4.0]])
+    self.assertRaggedEqual((x // y), [[0.0, -1.0], [4.0]])
+    self.assertRaggedEqual((y // x), [[4.0, -2.0], [0.0]])
+    self.assertRaggedEqual((2.0 // y), [[0.0, 0.0], [1.0]])
+    self.assertRaggedEqual((x // 2.0), [[0.0, -1.0], [4.0]])
 
-      self.assertEqual((x % y).eval().tolist(), [[1.0, 2.0], [0.0]])
-      self.assertEqual((y % x).eval().tolist(), [[0.0, -0.0], [2.0]])
-      self.assertEqual((2.0 % y).eval().tolist(), [[2.0, 2.0], [0.0]])
-      self.assertEqual((x % 2.0).eval().tolist(), [[1.0, 0.0], [0.0]])
+    self.assertRaggedEqual((x % y), [[1.0, 2.0], [0.0]])
+    self.assertRaggedEqual((y % x), [[0.0, -0.0], [2.0]])
+    self.assertRaggedEqual((2.0 % y), [[2.0, 2.0], [0.0]])
+    self.assertRaggedEqual((x % 2.0), [[1.0, 0.0], [0.0]])
 
-  @test_util.run_deprecated_v1
   def testLogicalOperators(self):
-    a = ragged.constant([[True, True], [False]])
-    b = ragged.constant([[True, False], [False]])
-    with self.test_session():
-      self.assertEqual((~a).eval().tolist(), [[False, False], [True]])
+    a = ragged_factory_ops.constant([[True, True], [False]])
+    b = ragged_factory_ops.constant([[True, False], [False]])
+    self.assertRaggedEqual((~a), [[False, False], [True]])
 
-      self.assertEqual((a & b).eval().tolist(), [[True, False], [False]])
-      self.assertEqual((a & True).eval().tolist(), [[True, True], [False]])
-      self.assertEqual((True & b).eval().tolist(), [[True, False], [False]])
+    self.assertRaggedEqual((a & b), [[True, False], [False]])
+    self.assertRaggedEqual((a & True), [[True, True], [False]])
+    self.assertRaggedEqual((True & b), [[True, False], [False]])
 
-      self.assertEqual((a | b).eval().tolist(), [[True, True], [False]])
-      self.assertEqual((a | False).eval().tolist(), [[True, True], [False]])
-      self.assertEqual((False | b).eval().tolist(), [[True, False], [False]])
+    self.assertRaggedEqual((a | b), [[True, True], [False]])
+    self.assertRaggedEqual((a | False), [[True, True], [False]])
+    self.assertRaggedEqual((False | b), [[True, False], [False]])
 
-      self.assertEqual((a ^ b).eval().tolist(), [[False, True], [False]])
-      self.assertEqual((a ^ True).eval().tolist(), [[False, False], [True]])
-      self.assertEqual((True ^ b).eval().tolist(), [[False, True], [True]])
+    self.assertRaggedEqual((a ^ b), [[False, True], [False]])
+    self.assertRaggedEqual((a ^ True), [[False, False], [True]])
+    self.assertRaggedEqual((True ^ b), [[False, True], [True]])
 
   def testDummyOperators(self):
-    a = ragged.constant([[True, True], [False]])
+    a = ragged_factory_ops.constant([[True, True], [False]])
     with self.assertRaisesRegexp(TypeError,
                                  'RaggedTensor may not be used as a boolean.'):
       bool(a)
diff --git a/tensorflow/python/ops/ragged/ragged_range_op_test.py b/tensorflow/python/ops/ragged/ragged_range_op_test.py
index 644423ecb7ffe67ef1316b5c62cbd89e387959e8..afe5866cff5002791a84a051f1a9fd1a9da06fb1 100644
--- a/tensorflow/python/ops/ragged/ragged_range_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_range_op_test.py
@@ -20,113 +20,108 @@ from __future__ import print_function
 
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_math_ops
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
 
-class RaggedRangeOpTest(test_util.TensorFlowTestCase):
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedRangeOpTest(ragged_test_util.RaggedTensorTestCase):
 
-  @test_util.run_deprecated_v1
   def testDocStringExamples(self):
     """Examples from ragged_range.__doc__."""
-    with self.test_session():
-      rt1 = ragged.range([3, 5, 2]).eval().tolist()
-      self.assertEqual(rt1, [[0, 1, 2], [0, 1, 2, 3, 4], [0, 1]])
+    rt1 = ragged_math_ops.range([3, 5, 2])
+    self.assertRaggedEqual(rt1, [[0, 1, 2], [0, 1, 2, 3, 4], [0, 1]])
 
-      rt2 = ragged.range([0, 5, 8], [3, 3, 12]).eval().tolist()
-      self.assertEqual(rt2, [[0, 1, 2], [], [8, 9, 10, 11]])
+    rt2 = ragged_math_ops.range([0, 5, 8], [3, 3, 12])
+    self.assertRaggedEqual(rt2, [[0, 1, 2], [], [8, 9, 10, 11]])
 
-      rt3 = ragged.range([0, 5, 8], [3, 3, 12], 2).eval().tolist()
-      self.assertEqual(rt3, [[0, 2], [], [8, 10]])
+    rt3 = ragged_math_ops.range([0, 5, 8], [3, 3, 12], 2)
+    self.assertRaggedEqual(rt3, [[0, 2], [], [8, 10]])
 
-  @test_util.run_deprecated_v1
   def testBasicRanges(self):
-    with self.test_session():
-      # Specify limits only.
-      self.assertEqual(
-          ragged.range([0, 3, 5]).eval().tolist(),
-          [list(range(0)), list(range(3)), list(range(5))])
-
-      # Specify starts and limits.
-      self.assertEqual(
-          ragged.range([0, 3, 5], [2, 3, 10]).eval().tolist(),
-          [list(range(0, 2)), list(range(3, 3)), list(range(5, 10))])
-
-      # Specify starts, limits, and deltas.
-      self.assertEqual(
-          ragged.range([0, 3, 5], [4, 4, 15], [2, 3, 4]).eval().tolist(),
-          [list(range(0, 4, 2)), list(range(3, 4, 3)),
-           list(range(5, 15, 4))])
-
-  @test_util.run_deprecated_v1
+    # Specify limits only.
+    self.assertRaggedEqual(
+        ragged_math_ops.range([0, 3, 5]),
+        [list(range(0)), list(range(3)),
+         list(range(5))])
+
+    # Specify starts and limits.
+    self.assertRaggedEqual(
+        ragged_math_ops.range([0, 3, 5], [2, 3, 10]),
+        [list(range(0, 2)),
+         list(range(3, 3)),
+         list(range(5, 10))])
+
+    # Specify starts, limits, and deltas.
+    self.assertRaggedEqual(
+        ragged_math_ops.range([0, 3, 5], [4, 4, 15], [2, 3, 4]),
+        [list(range(0, 4, 2)),
+         list(range(3, 4, 3)),
+         list(range(5, 15, 4))])
+
   def testFloatRanges(self):
-    with self.test_session():
-      expected = [[0.0, 0.4, 0.8, 1.2, 1.6, 2.0, 2.4, 2.8, 3.2, 3.6], [3.0],
-                  [5.0, 7.2, 9.4, 11.6, 13.8]]
-      actual = ragged.range([0.0, 3.0, 5.0], [3.9, 4.0, 15.0],
-                            [0.4, 1.5, 2.2]).eval().tolist()
-      self.assertEqual(expected, [[round(v, 5) for v in row] for row in actual])
-
-  @test_util.run_deprecated_v1
+    expected = [[0.0, 0.4, 0.8, 1.2, 1.6, 2.0, 2.4, 2.8, 3.2, 3.6], [3.0],
+                [5.0, 7.2, 9.4, 11.6, 13.8]]
+    actual = ragged_math_ops.range([0.0, 3.0, 5.0], [3.9, 4.0, 15.0],
+                                   [0.4, 1.5, 2.2])
+    self.assertEqual(
+        expected,
+        [[round(v, 5) for v in row] for row in self.eval_to_list(actual)])
+
   def testNegativeDeltas(self):
-    with self.test_session():
-      self.assertEqual(
-          ragged.range([0, 3, 5], limits=0, deltas=-1).eval().tolist(),
-          [list(range(0, 0, -1)), list(range(3, 0, -1)),
-           list(range(5, 0, -1))])
-
-      self.assertEqual(
-          ragged.range([0, -3, 5], limits=0, deltas=[-1, 1,
-                                                     -2]).eval().tolist(),
-          [list(range(0, 0, -1)), list(range(-3, 0, 1)),
-           list(range(5, 0, -2))])
-
-  @test_util.run_deprecated_v1
+    self.assertRaggedEqual(
+        ragged_math_ops.range([0, 3, 5], limits=0, deltas=-1),
+        [list(range(0, 0, -1)),
+         list(range(3, 0, -1)),
+         list(range(5, 0, -1))])
+
+    self.assertRaggedEqual(
+        ragged_math_ops.range([0, -3, 5], limits=0, deltas=[-1, 1, -2]),
+        [list(range(0, 0, -1)),
+         list(range(-3, 0, 1)),
+         list(range(5, 0, -2))])
+
   def testBroadcast(self):
-    with self.test_session():
-      # Specify starts and limits, broadcast deltas.
-      self.assertEqual(
-          ragged.range([0, 3, 5], [4, 4, 15], 3).eval().tolist(),
-          [list(range(0, 4, 3)), list(range(3, 4, 3)),
-           list(range(5, 15, 3))])
-
-      # Broadcast all arguments.
-      self.assertEqual(
-          ragged.range(0, 5, 1).eval().tolist(), [list(range(0, 5, 1))])
-
-  @test_util.run_deprecated_v1
+    # Specify starts and limits, broadcast deltas.
+    self.assertRaggedEqual(
+        ragged_math_ops.range([0, 3, 5], [4, 4, 15], 3),
+        [list(range(0, 4, 3)),
+         list(range(3, 4, 3)),
+         list(range(5, 15, 3))])
+
+    # Broadcast all arguments.
+    self.assertRaggedEqual(
+        ragged_math_ops.range(0, 5, 1), [list(range(0, 5, 1))])
+
   def testEmptyRanges(self):
-    rt1 = ragged.range([0, 5, 3], [0, 3, 5])
-    rt2 = ragged.range([0, 5, 5], [0, 3, 5], -1)
-    with self.test_session():
-      self.assertEqual(rt1.eval().tolist(), [[], [], [3, 4]])
-      self.assertEqual(rt2.eval().tolist(), [[], [5, 4], []])
+    rt1 = ragged_math_ops.range([0, 5, 3], [0, 3, 5])
+    rt2 = ragged_math_ops.range([0, 5, 5], [0, 3, 5], -1)
+    self.assertRaggedEqual(rt1, [[], [], [3, 4]])
+    self.assertRaggedEqual(rt2, [[], [5, 4], []])
 
-  @test_util.run_deprecated_v1
   def testShapeFnErrors(self):
-    with self.test_session():
-      self.assertRaisesRegexp(ValueError, r'Shape must be at most rank 1.*',
-                              ragged.range, [[0]], 5)
-      self.assertRaisesRegexp(ValueError, r'Shape must be at most rank 1.*',
-                              ragged.range, 0, [[5]])
-      self.assertRaisesRegexp(ValueError, r'Shape must be at most rank 1.*',
-                              ragged.range, 0, 5, [[0]])
-      self.assertRaisesRegexp(ValueError, r'Dimensions must be equal.*',
-                              ragged.range, [0], [1, 2])
-
-  @test_util.run_deprecated_v1
+    self.assertRaises((ValueError, errors.InvalidArgumentError),
+                      ragged_math_ops.range, [[0]], 5)
+    self.assertRaises((ValueError, errors.InvalidArgumentError),
+                      ragged_math_ops.range, 0, [[5]])
+    self.assertRaises((ValueError, errors.InvalidArgumentError),
+                      ragged_math_ops.range, 0, 5, [[0]])
+    self.assertRaises((ValueError, errors.InvalidArgumentError),
+                      ragged_math_ops.range, [0], [1, 2])
+
   def testKernelErrors(self):
-    with self.test_session():
-      self.assertRaisesRegexp(errors.InvalidArgumentError,
-                              r'Requires delta != 0',
-                              ragged.range(0, 0, 0).eval)
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 r'Requires delta != 0'):
+      self.evaluate(ragged_math_ops.range(0, 0, 0))
 
-  @test_util.run_deprecated_v1
   def testShape(self):
-    self.assertEqual(ragged.range(0, 0, 0).shape.as_list(), [1, None])
-    self.assertEqual(ragged.range([1, 2, 3]).shape.as_list(), [3, None])
-    self.assertEqual(
-        ragged.range([1, 2, 3], [4, 5, 6]).shape.as_list(), [3, None])
+    self.assertRaggedEqual(
+        ragged_math_ops.range(0, 0, 1).shape.as_list(), [1, None])
+    self.assertRaggedEqual(
+        ragged_math_ops.range([1, 2, 3]).shape.as_list(), [3, None])
+    self.assertRaggedEqual(
+        ragged_math_ops.range([1, 2, 3], [4, 5, 6]).shape.as_list(), [3, None])
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_reduce_op_test.py b/tensorflow/python/ops/ragged/ragged_reduce_op_test.py
index 9f51d59ba3cb0ddb004b0350216ae9414d323282..a9fa378eebc01e97390c48f5aaeebee7e9791359 100644
--- a/tensorflow/python/ops/ragged/ragged_reduce_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_reduce_op_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for ragged.reduce_<AGGREGATE> ops."""
+"""Tests for ragged_math_ops.reduce_<AGGREGATE> ops."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -21,11 +21,14 @@ from __future__ import print_function
 from absl.testing import parameterized
 import numpy as np
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_math_ops
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
 _MAX_INT32 = dtypes.int32.max
@@ -37,7 +40,9 @@ def mean(*values):
   return 1.0 * sum(values) / len(values)
 
 
-class RaggedReduceOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedReduceOpsTest(ragged_test_util.RaggedTensorTestCase,
+                          parameterized.TestCase):
 
   @parameterized.parameters(
       #=========================================================================
@@ -48,88 +53,88 @@ class RaggedReduceOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
       #    [2, 6   ]]
       #=========================================================================
       dict(
-          ragged_reduce_op=ragged.reduce_sum,
+          ragged_reduce_op=ragged_math_ops.reduce_sum,
           rt_input=[[3, 1, 4], [1, 5], [9], [2, 6]],
           axis=0,
           expected=[15, 12, 4]  # = [3+1+9+2, 1+5+6, 4]
       ),
       dict(
-          ragged_reduce_op=ragged.reduce_sum,
+          ragged_reduce_op=ragged_math_ops.reduce_sum,
           rt_input=[[3, 1, 4], [1, 5], [9], [2, 6]],
           axis=-2,
           expected=[15, 12, 4]  # = [3+1+9+2, 1+5+6, 4]
       ),
       dict(
-          ragged_reduce_op=ragged.reduce_sum,
+          ragged_reduce_op=ragged_math_ops.reduce_sum,
           rt_input=[[3, 1, 4], [1, 5], [9], [2, 6]],
           axis=1,
           expected=[8, 6, 9, 8]  # = [3+1+4, 1+5, 9, 2+6]
       ),
       dict(
-          ragged_reduce_op=ragged.reduce_sum,
+          ragged_reduce_op=ragged_math_ops.reduce_sum,
           rt_input=[[3, 1, 4], [1, 5], [9], [2, 6]],
           axis=-1,
           expected=[8, 6, 9, 8]  # = [3+1+4, 1+5, 9, 2+6]
       ),
       dict(
-          ragged_reduce_op=ragged.reduce_prod,
+          ragged_reduce_op=ragged_math_ops.reduce_prod,
           rt_input=[[3, 1, 4], [1, 5], [9], [2, 6]],
           axis=0,
           expected=[54, 30, 4]  # = [3*1*9*2, 1*5*6, 4]
       ),
       dict(
-          ragged_reduce_op=ragged.reduce_prod,
+          ragged_reduce_op=ragged_math_ops.reduce_prod,
           rt_input=[[3, 1, 4], [1, 5], [9], [2, 6]],
           axis=1,
           expected=[12, 5, 9, 12]  # = [3*1*4, 1*5, 9, 2*6]
       ),
       dict(
-          ragged_reduce_op=ragged.reduce_min,
+          ragged_reduce_op=ragged_math_ops.reduce_min,
           rt_input=[[3, 1, 4], [1, 5], [9], [2, 6]],
           axis=0,
           expected=[1, 1, 4]  # = [min(3, 1, 9, 2), min(1, 5, 6), 4]
       ),
       dict(
-          ragged_reduce_op=ragged.reduce_min,
+          ragged_reduce_op=ragged_math_ops.reduce_min,
           rt_input=[[3, 1, 4], [1, 5], [9], [2, 6]],
           axis=1,
           expected=[1, 1, 9, 2]  # = [min(3, 1, 4), min(1, 5), 9, min(2, 6)]
       ),
       dict(
-          ragged_reduce_op=ragged.reduce_max,
+          ragged_reduce_op=ragged_math_ops.reduce_max,
           rt_input=[[3, 1, 4], [1, 5], [9], [2, 6]],
           axis=0,
           expected=[9, 6, 4]  # = [max(3, 1, 9, 2), max(1, 5, 6), 4]
       ),
       dict(
-          ragged_reduce_op=ragged.reduce_max,
+          ragged_reduce_op=ragged_math_ops.reduce_max,
           rt_input=[[3, 1, 4], [1, 5], [9], [2, 6]],
           axis=1,
           expected=[4, 5, 9, 6]  # = [max(3, 1, 4), max(1, 5), 9, max(2, 6)]
       ),
       dict(
-          ragged_reduce_op=ragged.reduce_mean,
+          ragged_reduce_op=ragged_math_ops.reduce_mean,
           rt_input=[[3, 1, 4], [1, 5], [9], [2, 6]],
           axis=0,
           expected=[3.75, 4, 4]  # = [mean(3, 1, 9, 2), mean(1, 5, 6), 4]
       ),
       dict(
-          ragged_reduce_op=ragged.reduce_any,
+          ragged_reduce_op=ragged_math_ops.reduce_any,
           rt_input=[[True, True], [True, True, False, True], [False, True]],
           axis=0,
           expected=[True, True, False, True]),
       dict(
-          ragged_reduce_op=ragged.reduce_any,
+          ragged_reduce_op=ragged_math_ops.reduce_any,
           rt_input=[[True, True], [True, True, False, True], [False, True]],
           axis=1,
           expected=[True, True, True]),
       dict(
-          ragged_reduce_op=ragged.reduce_all,
+          ragged_reduce_op=ragged_math_ops.reduce_all,
           rt_input=[[True, True], [True, True, False, True], [False, True]],
           axis=0,
           expected=[False, True, False, True]),
       dict(
-          ragged_reduce_op=ragged.reduce_all,
+          ragged_reduce_op=ragged_math_ops.reduce_all,
           rt_input=[[True, True], [True, True, False, True], [False, True]],
           axis=1,
           expected=[True, False, False]),
@@ -146,53 +151,53 @@ class RaggedReduceOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
 
       # axis=None
       dict(
-          ragged_reduce_op=ragged.reduce_sum,
+          ragged_reduce_op=ragged_math_ops.reduce_sum,
           rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
           axis=None,
           expected=0 + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9),
       dict(
-          ragged_reduce_op=ragged.reduce_prod,
+          ragged_reduce_op=ragged_math_ops.reduce_prod,
           rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
           axis=None,
           expected=0 * 1 * 2 * 3 * 4 * 5 * 6 * 7 * 8 * 9),
       dict(
-          ragged_reduce_op=ragged.reduce_min,
+          ragged_reduce_op=ragged_math_ops.reduce_min,
           rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
           axis=None,
           expected=min(0, 1, 2, 3, 4, 5, 6, 7, 8, 9)),
       dict(
-          ragged_reduce_op=ragged.reduce_max,
+          ragged_reduce_op=ragged_math_ops.reduce_max,
           rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
           axis=None,
           expected=max(0, 1, 2, 3, 4, 5, 6, 7, 8, 9)),
       dict(
-          ragged_reduce_op=ragged.reduce_mean,
+          ragged_reduce_op=ragged_math_ops.reduce_mean,
           rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
           axis=None,
           expected=mean(0, 1, 2, 3, 4, 5, 6, 7, 8, 9)),
       # axis=0
       dict(
-          ragged_reduce_op=ragged.reduce_sum,
+          ragged_reduce_op=ragged_math_ops.reduce_sum,
           rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
           axis=0,
           expected=[0 + 4 + 5 + 7 + 8, 1 + 6 + 9, 2, 3]),
       dict(
-          ragged_reduce_op=ragged.reduce_prod,
+          ragged_reduce_op=ragged_math_ops.reduce_prod,
           rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
           axis=0,
           expected=[0 * 4 * 5 * 7 * 8, 1 * 6 * 9, 2, 3]),
       dict(
-          ragged_reduce_op=ragged.reduce_min,
+          ragged_reduce_op=ragged_math_ops.reduce_min,
           rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
           axis=0,
           expected=[min(0, 4, 5, 7, 8), min(1, 6, 9), 2, 3]),
       dict(
-          ragged_reduce_op=ragged.reduce_max,
+          ragged_reduce_op=ragged_math_ops.reduce_max,
           rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
           axis=0,
           expected=[max(0, 4, 5, 7, 8), max(1, 6, 9), 2, 3]),
       dict(
-          ragged_reduce_op=ragged.reduce_mean,
+          ragged_reduce_op=ragged_math_ops.reduce_mean,
           rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
           axis=0,
           expected=[mean(0, 4, 5, 7, 8),
@@ -201,24 +206,24 @@ class RaggedReduceOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
       # Note: we don't test mean here because it gives a NaN, and this will
       # cause assertEqual to fail (since NaN != NaN).  See testMeanNan().
       dict(
-          ragged_reduce_op=ragged.reduce_sum,
+          ragged_reduce_op=ragged_math_ops.reduce_sum,
           rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
           axis=1,
           expected=[0 + 1 + 2 + 3, 4, 0, 5 + 6, 7, 8 + 9]),
       dict(
-          ragged_reduce_op=ragged.reduce_prod,
+          ragged_reduce_op=ragged_math_ops.reduce_prod,
           rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
           axis=1,
           expected=[0 * 1 * 2 * 3, 4, 1, 5 * 6, 7, 8 * 9]),
       dict(
-          ragged_reduce_op=ragged.reduce_min,
+          ragged_reduce_op=ragged_math_ops.reduce_min,
           rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
           axis=1,
           expected=[min(0, 1, 2, 3), 4, _MAX_INT32,
                     min(5, 6), 7,
                     min(8, 9)]),
       dict(
-          ragged_reduce_op=ragged.reduce_max,
+          ragged_reduce_op=ragged_math_ops.reduce_max,
           rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
           axis=1,
           expected=[max(0, 1, 2, 3), 4, _MIN_INT32,
@@ -233,47 +238,47 @@ class RaggedReduceOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
       #  [[9   ]                ]]
       #=========================================================================
       dict(
-          ragged_reduce_op=ragged.reduce_sum,
+          ragged_reduce_op=ragged_math_ops.reduce_sum,
           rt_input=[[[1, 2], [], [3, 4, 5]], [[6, 7], [], [8]], [], [[9]]],
           axis=[],
           expected=[[[1, 2], [], [3, 4, 5]], [[6, 7], [], [8]], [], [[9]]]),
       dict(
-          ragged_reduce_op=ragged.reduce_sum,
+          ragged_reduce_op=ragged_math_ops.reduce_sum,
           rt_input=[[[1, 2], [], [3, 4, 5]], [[6, 7], [], [8]], [], [[9]]],
           axis=None,
           expected=sum([1, 2, 3, 4, 5, 6, 7, 8, 9])),
       dict(
-          ragged_reduce_op=ragged.reduce_sum,
+          ragged_reduce_op=ragged_math_ops.reduce_sum,
           rt_input=[[[1, 2], [], [3, 4, 5]], [[6, 7], [], [8]], [], [[9]]],
           axis=0,
           expected=[[1 + 6 + 9, 2 + 7], [], [3 + 8, 4, 5]]),
       dict(
-          ragged_reduce_op=ragged.reduce_sum,
+          ragged_reduce_op=ragged_math_ops.reduce_sum,
           rt_input=[[[1, 2], [], [3, 4, 5]], [[6, 7], [], [8]], [], [[9]]],
           axis=1,
           expected=[[1 + 3, 2 + 4, 5], [6 + 8, 7], [], [9]]),
       dict(
-          ragged_reduce_op=ragged.reduce_sum,
+          ragged_reduce_op=ragged_math_ops.reduce_sum,
           rt_input=[[[1, 2], [], [3, 4, 5]], [[6, 7], [], [8]], [], [[9]]],
           axis=2,
           expected=[[1 + 2, 0, 3 + 4 + 5], [6 + 7, 0, 8], [], [9]]),
       dict(
-          ragged_reduce_op=ragged.reduce_sum,
+          ragged_reduce_op=ragged_math_ops.reduce_sum,
           rt_input=[[[1, 2], [], [3, 4, 5]], [[6, 7], [], [8]], [], [[9]]],
           axis=[0, 1],
           expected=[1 + 3 + 6 + 8 + 9, 2 + 4 + 7, 5]),
       dict(
-          ragged_reduce_op=ragged.reduce_sum,
+          ragged_reduce_op=ragged_math_ops.reduce_sum,
           rt_input=[[[1, 2], [], [3, 4, 5]], [[6, 7], [], [8]], [], [[9]]],
           axis=[0, 2],
           expected=[1 + 6 + 9 + 2 + 7, 0, 3 + 8 + 4 + 5]),
       dict(
-          ragged_reduce_op=ragged.reduce_sum,
+          ragged_reduce_op=ragged_math_ops.reduce_sum,
           rt_input=[[[1, 2], [], [3, 4, 5]], [[6, 7], [], [8]], [], [[9]]],
           axis=[1, 2],
           expected=[1 + 2 + 3 + 4 + 5, 6 + 7 + 8, 0, 9]),
       dict(
-          ragged_reduce_op=ragged.reduce_sum,
+          ragged_reduce_op=ragged_math_ops.reduce_sum,
           rt_input=[[[1, 2], [], [3, 4, 5]], [[6, 7], [], [8]], [], [[9]]],
           axis=[0, 1, 2],
           expected=sum([1, 2, 3, 4, 5, 6, 7, 8, 9])),
@@ -285,62 +290,56 @@ class RaggedReduceOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
       #  [[9   ]          ]]
       #=========================================================================
       dict(
-          ragged_reduce_op=ragged.reduce_mean,
+          ragged_reduce_op=ragged_math_ops.reduce_mean,
           rt_input=[[[1, 2], [3, 4, 5]], [[6, 7], [8]], [[9]]],
           axis=0,
           expected=[[mean(1, 6, 9), mean(2, 7)], [mean(3, 8), 4, 5]]),
       dict(
-          ragged_reduce_op=ragged.reduce_mean,
+          ragged_reduce_op=ragged_math_ops.reduce_mean,
           rt_input=[[[1, 2], [3, 4, 5]], [[6, 7], [8]], [[9]]],
           axis=1,
           expected=[[mean(1, 3), mean(2, 4), 5], [mean(6, 8), 7], [9]]),
       dict(
-          ragged_reduce_op=ragged.reduce_mean,
+          ragged_reduce_op=ragged_math_ops.reduce_mean,
           rt_input=[[[1, 2], [3, 4, 5]], [[6, 7], [8]], [[9]]],
           axis=2,
           expected=[[mean(1, 2), mean(3, 4, 5)], [mean(6, 7), 8], [9]]),
   )
-  @test_util.run_deprecated_v1
   def testReduce(self, ragged_reduce_op, rt_input, axis, expected):
-    rt_input = ragged.constant(rt_input)
+    rt_input = ragged_factory_ops.constant(rt_input)
     reduced = ragged_reduce_op(rt_input, axis)
-    with self.test_session():
-      self.assertEqual(reduced.eval().tolist(), expected)
+    self.assertRaggedEqual(reduced, expected)
 
   def assertEqualWithNan(self, actual, expected):
     """Like assertEqual, but NaN==NaN."""
     self.assertTrue(
         ((actual == expected) | (np.isnan(actual) & np.isnan(expected))).all())
 
-  @test_util.run_deprecated_v1
   def testMeanNan(self):
     rt_as_list = [[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]]
     expected = (
         np.array([0 + 1 + 2 + 3, 4, 0, 5 + 6, 7, 8 + 9]) / np.array(
             [4, 1, 0, 2, 1, 2]))
-    rt_input = ragged.constant(rt_as_list)
-    reduced = ragged.reduce_mean(rt_input, axis=1)
-    with self.test_session():
-      self.assertEqualWithNan(reduced.eval(), expected)
+    rt_input = ragged_factory_ops.constant(rt_as_list)
+    reduced = ragged_math_ops.reduce_mean(rt_input, axis=1)
+    self.assertEqualWithNan(self.evaluate(reduced), expected)
 
-  @test_util.run_deprecated_v1
   def testMeanWithTensorInputs(self):
     tensor = [[1.0, 2.0, 3.0], [10.0, 20.0, 30.0]]
     expected = [2.0, 20.0]
-    reduced = ragged.reduce_mean(tensor, axis=1)
-    with self.test_session():
-      self.assertAllEqual(reduced.eval(), expected)
+    reduced = ragged_math_ops.reduce_mean(tensor, axis=1)
+    self.assertRaggedEqual(reduced, expected)
 
-  @test_util.run_deprecated_v1
   def testErrors(self):
-    rt_input = ragged.constant([[1, 2, 3], [4, 5]])
+    rt_input = ragged_factory_ops.constant([[1, 2, 3], [4, 5]])
     axis = array_ops.placeholder_with_default(constant_op.constant([0]), None)
-    self.assertRaisesRegexp(ValueError,
-                            r'axis must be known at graph construction time.',
-                            ragged.reduce_sum, rt_input, axis)
-    self.assertRaisesRegexp(TypeError,
-                            r'axis must be an int; got str.*',
-                            ragged.reduce_sum, rt_input, ['x'])
+
+    if not context.executing_eagerly():
+      self.assertRaisesRegexp(
+          ValueError, r'axis must be known at graph construction time.',
+          ragged_math_ops.reduce_sum, rt_input, axis)
+    self.assertRaisesRegexp(TypeError, r'axis must be an int; got str.*',
+                            ragged_math_ops.reduce_sum, rt_input, ['x'])
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_row_lengths_op_test.py b/tensorflow/python/ops/ragged/ragged_row_lengths_op_test.py
index 4a705be48487302d5de27a587eb771efc528bb16..8f8089c9bf351be819c1e6ece0cc0165da1de5fb 100644
--- a/tensorflow/python/ops/ragged/ragged_row_lengths_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_row_lengths_op_test.py
@@ -20,12 +20,17 @@ from __future__ import print_function
 
 from absl.testing import parameterized
 
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
 
-class RaggedRowLengthsOp(test_util.TensorFlowTestCase, parameterized.TestCase):
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedRowLengthsOp(ragged_test_util.RaggedTensorTestCase,
+                         parameterized.TestCase):
 
   @parameterized.parameters([
       # Docstring Example
@@ -37,24 +42,6 @@ class RaggedRowLengthsOp(test_util.TensorFlowTestCase, parameterized.TestCase):
           axis=2,
           expected=[[3, 1], [], [2, 1], [1], []]),
 
-      # 1D tensor
-      dict(
-          rt_input=[1, 2, 3, 4, 5],
-          ragged_rank=0,
-          axis=0,
-          expected=5),
-
-      # 2D Tensor (0 ragged dimensions)
-      dict(
-          rt_input=[[1, 2], [3, 4], [5, 6], [7, 8]],
-          ragged_rank=0,
-          expected=[2, 2, 2, 2]),
-      dict(
-          rt_input=[[1, 2], [3, 4], [5, 6], [7, 8]],
-          ragged_rank=0,
-          axis=0,
-          expected=4),
-
       # 2D Tensor (1 ragged dimension)
       dict(
           rt_input=[['a'], ['b', 'c', 'd'], ['e'], [], ['f']],
@@ -79,24 +66,6 @@ class RaggedRowLengthsOp(test_util.TensorFlowTestCase, parameterized.TestCase):
           axis=0,
           expected=0),
 
-      # 3D Tensor (0 ragged dimensions)
-      dict(
-          rt_input=[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]],
-          ragged_rank=0,
-          axis=0,
-          expected=2),
-      dict(
-          rt_input=[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]],
-          ragged_rank=0,
-          axis=1,
-          expected=[3, 3]),
-      dict(
-          rt_input=[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]],
-          ragged_rank=0,
-          axis=2,
-          expected=[[2, 2, 2], [2, 2, 2]],
-          expected_ragged_rank=0),
-
       # 3D Tensor (1 ragged dimension)
       dict(
           rt_input=[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10]]],
@@ -143,42 +112,35 @@ class RaggedRowLengthsOp(test_util.TensorFlowTestCase, parameterized.TestCase):
           expected=[[2, 3, 0], [4, 1]],
           expected_ragged_rank=1),
   ])  # pyformat: disable
-  @test_util.run_deprecated_v1
   def testRowLengths(self,
                      rt_input,
                      expected,
                      axis=1,
                      ragged_rank=None,
                      expected_ragged_rank=None):
-    rt = ragged.constant(rt_input, ragged_rank=ragged_rank)
-    lengths = ragged.row_lengths(rt, axis)
-    with self.test_session():
-      self.assertEqual(lengths.eval().tolist(), expected)
-      if expected_ragged_rank is not None:
-        if isinstance(lengths, ragged.RaggedTensor):
-          self.assertEqual(lengths.ragged_rank, expected_ragged_rank)
-        else:
-          self.assertEqual(0, expected_ragged_rank)
+    rt = ragged_factory_ops.constant(rt_input, ragged_rank=ragged_rank)
+    lengths = rt.row_lengths(axis)
+    self.assertRaggedEqual(lengths, expected)
+    if expected_ragged_rank is not None:
+      if isinstance(lengths, ragged_tensor.RaggedTensor):
+        self.assertEqual(lengths.ragged_rank, expected_ragged_rank)
+      else:
+        self.assertEqual(0, expected_ragged_rank)
 
   @parameterized.parameters([
-      dict(
-          rt_input=10,
-          exception=ValueError,
-          message='rt_input may not be a scalar.'),
-      dict(
-          rt_input=[10, 20],
-          axis=1,
-          exception=ValueError,
-          message='axis=1 out of bounds: expected -1<=axis<1.'),
-      dict(
+      dict(  # axis=2 out of bounds: expected -2<=axis<2.
+          rt_input=[[10, 20], [30]],
+          axis=2,
+          exception=(ValueError, errors.InvalidArgumentError)),
+      dict(  # axis=-3 out of bounds: expected -2<=axis<2.
           rt_input=[[2, 3, 0], [4, 1, 2]],
           axis=-3,
-          exception=ValueError,
-          message='axis=-3 out of bounds: expected -2<=axis<2.'),
+          exception=(ValueError, errors.InvalidArgumentError)),
   ])
-  def testErrors(self, rt_input, exception, message, axis=1):
+  def testErrors(self, rt_input, exception, message=None, axis=1):
+    rt = ragged_factory_ops.constant(rt_input)
     with self.assertRaisesRegexp(exception, message):
-      ragged.row_lengths(rt_input, axis)
+      rt.row_lengths(axis)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_row_splits_to_segment_ids_op_test.py b/tensorflow/python/ops/ragged/ragged_row_splits_to_segment_ids_op_test.py
index 7f5f4e91bdea1ce686ca03663ef5c1985ffc62bf..5384f3ac09df6ce6a2cb9fc19409afd84b09fcc1 100644
--- a/tensorflow/python/ops/ragged/ragged_row_splits_to_segment_ids_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_row_splits_to_segment_ids_op_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for the ragged.row_splits_to_segment_ids() op."""
+"""Tests for the segment_id_ops.row_splits_to_segment_ids() op."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -20,39 +20,36 @@ from __future__ import print_function
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import test_util
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.ops.ragged import segment_id_ops
 from tensorflow.python.platform import googletest
 
 
-class RaggedSplitsToSegmentIdsOpTest(test_util.TensorFlowTestCase):
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedSplitsToSegmentIdsOpTest(ragged_test_util.RaggedTensorTestCase):
 
-  @test_util.run_deprecated_v1
   def testDocStringExample(self):
     splits = [0, 3, 3, 5, 6, 9]
     expected = [0, 0, 0, 2, 2, 3, 4, 4, 4]
-    segment_ids = ragged.row_splits_to_segment_ids(splits)
-    with self.test_session():
-      self.assertEqual(segment_ids.eval().tolist(), expected)
+    segment_ids = segment_id_ops.row_splits_to_segment_ids(splits)
+    self.assertAllEqual(segment_ids, expected)
 
-  @test_util.run_deprecated_v1
   def testEmptySplits(self):
     # Note: the splits for an empty ragged tensor contains a single zero.
-    segment_ids = ragged.row_splits_to_segment_ids([0])
-    with self.test_session():
-      self.assertEqual(segment_ids.eval().tolist(), [])
+    segment_ids = segment_id_ops.row_splits_to_segment_ids([0])
+    self.assertAllEqual(segment_ids, [])
 
-  @test_util.run_deprecated_v1
   def testErrors(self):
     self.assertRaisesRegexp(ValueError, r'Invalid row_splits: \[\]',
-                            ragged.row_splits_to_segment_ids, [])
+                            segment_id_ops.row_splits_to_segment_ids, [])
     self.assertRaisesRegexp(
         ValueError, r'Tensor conversion requested dtype int64 for '
-        'Tensor with dtype float32', ragged.row_splits_to_segment_ids,
+        'Tensor with dtype float32', segment_id_ops.row_splits_to_segment_ids,
         constant_op.constant([0.5]))
     self.assertRaisesRegexp(ValueError, r'Shape \(\) must have rank 1',
-                            ragged.row_splits_to_segment_ids, 0)
+                            segment_id_ops.row_splits_to_segment_ids, 0)
     self.assertRaisesRegexp(ValueError, r'Shape \(1, 1\) must have rank 1',
-                            ragged.row_splits_to_segment_ids, [[0]])
+                            segment_id_ops.row_splits_to_segment_ids, [[0]])
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_segment_ids_to_row_splits_op_test.py b/tensorflow/python/ops/ragged/ragged_segment_ids_to_row_splits_op_test.py
index 7e52f2d844bc2652e330d84e84a89dacd03d02d6..73ee42a19dc204a006d41e8280efb6228be055ef 100644
--- a/tensorflow/python/ops/ragged/ragged_segment_ids_to_row_splits_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_segment_ids_to_row_splits_op_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for the ragged.segment_ids_to_row_splits() op."""
+"""Tests for the segment_id_ops.segment_ids_to_row_splits() op."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -20,58 +20,52 @@ from __future__ import print_function
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import test_util
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.ops.ragged import segment_id_ops
 from tensorflow.python.platform import googletest
 
 
-class RaggedSplitsToSegmentIdsOpTest(test_util.TensorFlowTestCase):
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedSplitsToSegmentIdsOpTest(ragged_test_util.RaggedTensorTestCase):
 
-  @test_util.run_deprecated_v1
   def testDocStringExample(self):
     segment_ids = [0, 0, 0, 2, 2, 3, 4, 4, 4]
     expected = [0, 3, 3, 5, 6, 9]
-    splits = ragged.segment_ids_to_row_splits(segment_ids)
-    with self.test_session():
-      self.assertEqual(splits.eval().tolist(), expected)
+    splits = segment_id_ops.segment_ids_to_row_splits(segment_ids)
+    self.assertAllEqual(splits, expected)
 
-  @test_util.run_deprecated_v1
   def testEmptySegmentIds(self):
     # Note: the splits for an empty ragged tensor contains a single zero.
-    segment_ids = ragged.segment_ids_to_row_splits([])
-    with self.test_session():
-      self.assertEqual(segment_ids.eval().tolist(), [0])
+    segment_ids = segment_id_ops.segment_ids_to_row_splits([])
+    self.assertAllEqual(segment_ids, [0])
 
   def testErrors(self):
     self.assertRaisesRegexp(TypeError,
                             r'segment_ids must be an integer tensor.*',
-                            ragged.segment_ids_to_row_splits,
+                            segment_id_ops.segment_ids_to_row_splits,
                             constant_op.constant([0.5]))
     self.assertRaisesRegexp(ValueError, r'Shape \(\) must have rank 1',
-                            ragged.segment_ids_to_row_splits, 0)
+                            segment_id_ops.segment_ids_to_row_splits, 0)
     self.assertRaisesRegexp(ValueError, r'Shape \(1, 1\) must have rank 1',
-                            ragged.segment_ids_to_row_splits, [[0]])
+                            segment_id_ops.segment_ids_to_row_splits, [[0]])
 
-  @test_util.run_deprecated_v1
   def testNumSegments(self):
     segment_ids = [0, 0, 0, 2, 2, 3, 4, 4, 4]
     num_segments = 7
     expected = [0, 3, 3, 5, 6, 9, 9, 9]
-    splits = ragged.segment_ids_to_row_splits(segment_ids, num_segments)
-    with self.test_session():
-      self.assertEqual(splits.eval().tolist(), expected)
+    splits = segment_id_ops.segment_ids_to_row_splits(segment_ids, num_segments)
+    self.assertAllEqual(splits, expected)
 
-  @test_util.run_deprecated_v1
   def testUnsortedSegmentIds(self):
     # Segment ids are not required to be sorted.
     segment_ids = [0, 4, 3, 2, 4, 4, 2, 0, 0]
-    splits1 = ragged.segment_ids_to_row_splits(segment_ids)
+    splits1 = segment_id_ops.segment_ids_to_row_splits(segment_ids)
     expected1 = [0, 3, 3, 5, 6, 9]
 
-    splits2 = ragged.segment_ids_to_row_splits(segment_ids, 7)
+    splits2 = segment_id_ops.segment_ids_to_row_splits(segment_ids, 7)
     expected2 = [0, 3, 3, 5, 6, 9, 9, 9]
-    with self.test_session():
-      self.assertEqual(splits1.eval().tolist(), expected1)
-      self.assertEqual(splits2.eval().tolist(), expected2)
+    self.assertAllEqual(splits1, expected1)
+    self.assertAllEqual(splits2, expected2)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_segment_op_test.py b/tensorflow/python/ops/ragged/ragged_segment_op_test.py
index 9e4877ae3e67e1a5d9b11b39b1146aebc7880171..435ce87e00d56e6fa34ecfcaa6cb72bbb8c3cfe8 100644
--- a/tensorflow/python/ops/ragged/ragged_segment_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_segment_op_test.py
@@ -25,7 +25,10 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_math_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
 
@@ -45,21 +48,10 @@ def sqrt_n(values):
   return 1.0 * sum(values) / math.sqrt(len(values))
 
 
-class RaggedSegmentOpsTest(test_util.TensorFlowTestCase,
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedSegmentOpsTest(ragged_test_util.RaggedTensorTestCase,
                            parameterized.TestCase):
 
-  def assertNestedListAmostEqual(self, lhs, rhs, places=7, context='value'):
-    self.assertEqual(type(lhs), type(rhs))
-    if isinstance(lhs, (list, tuple)):
-      self.assertEqual(len(lhs), len(rhs), 'Length differs for %s' % context)
-      for i in range(len(lhs)):
-        self.assertNestedListAmostEqual(lhs[i], rhs[i], places,
-                                        '%s[%s]' % (context, i))
-    else:
-      self.assertAlmostEqual(
-          lhs, rhs, places,
-          '%s != %s within %s places at %s' % (lhs, rhs, places, context))
-
   def expected_value(self, data, segment_ids, num_segments, combiner):
     """Find the expected value for a call to ragged_segment_<aggregate>.
 
@@ -71,7 +63,7 @@ class RaggedSegmentOpsTest(test_util.TensorFlowTestCase,
     Returns:
       The expected value, as a nested Python list.
     """
-    self.assertEqual(len(data), len(segment_ids))
+    self.assertLen(data, len(segment_ids))
 
     # Build an empty (num_segments x ncols) "grouped" matrix
     ncols = max(len(row) for row in data)
@@ -89,147 +81,140 @@ class RaggedSegmentOpsTest(test_util.TensorFlowTestCase,
             for grouped_row in grouped]
 
   @parameterized.parameters(
-      (ragged.segment_sum, sum, [0, 0, 1, 1, 2, 2]),
-      (ragged.segment_sum, sum, [0, 0, 0, 1, 1, 1]),
-      (ragged.segment_sum, sum, [5, 4, 3, 2, 1, 0]),
-      (ragged.segment_sum, sum, [0, 0, 0, 10, 10, 10]),
-      (ragged.segment_prod, prod, [0, 0, 1, 1, 2, 2]),
-      (ragged.segment_prod, prod, [0, 0, 0, 1, 1, 1]),
-      (ragged.segment_prod, prod, [5, 4, 3, 2, 1, 0]),
-      (ragged.segment_prod, prod, [0, 0, 0, 10, 10, 10]),
-      (ragged.segment_min, min, [0, 0, 1, 1, 2, 2]),
-      (ragged.segment_min, min, [0, 0, 0, 1, 1, 1]),
-      (ragged.segment_min, min, [5, 4, 3, 2, 1, 0]),
-      (ragged.segment_min, min, [0, 0, 0, 10, 10, 10]),
-      (ragged.segment_max, max, [0, 0, 1, 1, 2, 2]),
-      (ragged.segment_max, max, [0, 0, 0, 1, 1, 1]),
-      (ragged.segment_max, max, [5, 4, 3, 2, 1, 0]),
-      (ragged.segment_max, max, [0, 0, 0, 10, 10, 10]),
-      (ragged.segment_mean, mean, [0, 0, 1, 1, 2, 2]),
-      (ragged.segment_mean, mean, [0, 0, 0, 1, 1, 1]),
-      (ragged.segment_mean, mean, [5, 4, 3, 2, 1, 0]),
-      (ragged.segment_mean, mean, [0, 0, 0, 10, 10, 10]),
+      (ragged_math_ops.segment_sum, sum, [0, 0, 1, 1, 2, 2]),
+      (ragged_math_ops.segment_sum, sum, [0, 0, 0, 1, 1, 1]),
+      (ragged_math_ops.segment_sum, sum, [5, 4, 3, 2, 1, 0]),
+      (ragged_math_ops.segment_sum, sum, [0, 0, 0, 10, 10, 10]),
+      (ragged_math_ops.segment_prod, prod, [0, 0, 1, 1, 2, 2]),
+      (ragged_math_ops.segment_prod, prod, [0, 0, 0, 1, 1, 1]),
+      (ragged_math_ops.segment_prod, prod, [5, 4, 3, 2, 1, 0]),
+      (ragged_math_ops.segment_prod, prod, [0, 0, 0, 10, 10, 10]),
+      (ragged_math_ops.segment_min, min, [0, 0, 1, 1, 2, 2]),
+      (ragged_math_ops.segment_min, min, [0, 0, 0, 1, 1, 1]),
+      (ragged_math_ops.segment_min, min, [5, 4, 3, 2, 1, 0]),
+      (ragged_math_ops.segment_min, min, [0, 0, 0, 10, 10, 10]),
+      (ragged_math_ops.segment_max, max, [0, 0, 1, 1, 2, 2]),
+      (ragged_math_ops.segment_max, max, [0, 0, 0, 1, 1, 1]),
+      (ragged_math_ops.segment_max, max, [5, 4, 3, 2, 1, 0]),
+      (ragged_math_ops.segment_max, max, [0, 0, 0, 10, 10, 10]),
+      (ragged_math_ops.segment_mean, mean, [0, 0, 1, 1, 2, 2]),
+      (ragged_math_ops.segment_mean, mean, [0, 0, 0, 1, 1, 1]),
+      (ragged_math_ops.segment_mean, mean, [5, 4, 3, 2, 1, 0]),
+      (ragged_math_ops.segment_mean, mean, [0, 0, 0, 10, 10, 10]),
   )
-  @test_util.run_deprecated_v1
   def testRaggedSegment_Int(self, segment_op, combiner, segment_ids):
     rt_as_list = [[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]]
-    rt = ragged.constant(rt_as_list)
+    rt = ragged_factory_ops.constant(rt_as_list)
     num_segments = max(segment_ids) + 1
     expected = self.expected_value(rt_as_list, segment_ids, num_segments,
                                    combiner)
 
     segmented = segment_op(rt, segment_ids, num_segments)
-    self.assertListEqual(self.evaluate(segmented).tolist(), expected)
+    self.assertRaggedEqual(segmented, expected)
 
   @parameterized.parameters(
-      (ragged.segment_sum, sum, [0, 0, 1, 1, 2, 2]),
-      (ragged.segment_sum, sum, [0, 0, 0, 1, 1, 1]),
-      (ragged.segment_sum, sum, [5, 4, 3, 2, 1, 0]),
-      (ragged.segment_sum, sum, [0, 0, 0, 10, 10, 10]),
-      (ragged.segment_prod, prod, [0, 0, 1, 1, 2, 2]),
-      (ragged.segment_prod, prod, [0, 0, 0, 1, 1, 1]),
-      (ragged.segment_prod, prod, [5, 4, 3, 2, 1, 0]),
-      (ragged.segment_prod, prod, [0, 0, 0, 10, 10, 10]),
-      (ragged.segment_min, min, [0, 0, 1, 1, 2, 2]),
-      (ragged.segment_min, min, [0, 0, 0, 1, 1, 1]),
-      (ragged.segment_min, min, [5, 4, 3, 2, 1, 0]),
-      (ragged.segment_min, min, [0, 0, 0, 10, 10, 10]),
-      (ragged.segment_max, max, [0, 0, 1, 1, 2, 2]),
-      (ragged.segment_max, max, [0, 0, 0, 1, 1, 1]),
-      (ragged.segment_max, max, [5, 4, 3, 2, 1, 0]),
-      (ragged.segment_max, max, [0, 0, 0, 10, 10, 10]),
-      (ragged.segment_mean, mean, [0, 0, 1, 1, 2, 2]),
-      (ragged.segment_mean, mean, [0, 0, 0, 1, 1, 1]),
-      (ragged.segment_mean, mean, [5, 4, 3, 2, 1, 0]),
-      (ragged.segment_mean, mean, [0, 0, 0, 10, 10, 10]),
-      (ragged.segment_sqrt_n, sqrt_n, [0, 0, 1, 1, 2, 2]),
-      (ragged.segment_sqrt_n, sqrt_n, [0, 0, 0, 1, 1, 1]),
-      (ragged.segment_sqrt_n, sqrt_n, [5, 4, 3, 2, 1, 0]),
-      (ragged.segment_sqrt_n, sqrt_n, [0, 0, 0, 10, 10, 10]),
+      (ragged_math_ops.segment_sum, sum, [0, 0, 1, 1, 2, 2]),
+      (ragged_math_ops.segment_sum, sum, [0, 0, 0, 1, 1, 1]),
+      (ragged_math_ops.segment_sum, sum, [5, 4, 3, 2, 1, 0]),
+      (ragged_math_ops.segment_sum, sum, [0, 0, 0, 10, 10, 10]),
+      (ragged_math_ops.segment_prod, prod, [0, 0, 1, 1, 2, 2]),
+      (ragged_math_ops.segment_prod, prod, [0, 0, 0, 1, 1, 1]),
+      (ragged_math_ops.segment_prod, prod, [5, 4, 3, 2, 1, 0]),
+      (ragged_math_ops.segment_prod, prod, [0, 0, 0, 10, 10, 10]),
+      (ragged_math_ops.segment_min, min, [0, 0, 1, 1, 2, 2]),
+      (ragged_math_ops.segment_min, min, [0, 0, 0, 1, 1, 1]),
+      (ragged_math_ops.segment_min, min, [5, 4, 3, 2, 1, 0]),
+      (ragged_math_ops.segment_min, min, [0, 0, 0, 10, 10, 10]),
+      (ragged_math_ops.segment_max, max, [0, 0, 1, 1, 2, 2]),
+      (ragged_math_ops.segment_max, max, [0, 0, 0, 1, 1, 1]),
+      (ragged_math_ops.segment_max, max, [5, 4, 3, 2, 1, 0]),
+      (ragged_math_ops.segment_max, max, [0, 0, 0, 10, 10, 10]),
+      (ragged_math_ops.segment_mean, mean, [0, 0, 1, 1, 2, 2]),
+      (ragged_math_ops.segment_mean, mean, [0, 0, 0, 1, 1, 1]),
+      (ragged_math_ops.segment_mean, mean, [5, 4, 3, 2, 1, 0]),
+      (ragged_math_ops.segment_mean, mean, [0, 0, 0, 10, 10, 10]),
+      (ragged_math_ops.segment_sqrt_n, sqrt_n, [0, 0, 1, 1, 2, 2]),
+      (ragged_math_ops.segment_sqrt_n, sqrt_n, [0, 0, 0, 1, 1, 1]),
+      (ragged_math_ops.segment_sqrt_n, sqrt_n, [5, 4, 3, 2, 1, 0]),
+      (ragged_math_ops.segment_sqrt_n, sqrt_n, [0, 0, 0, 10, 10, 10]),
   )
-  @test_util.run_deprecated_v1
   def testRaggedSegment_Float(self, segment_op, combiner, segment_ids):
     rt_as_list = [[0., 1., 2., 3.], [4.], [], [5., 6.], [7.], [8., 9.]]
-    rt = ragged.constant(rt_as_list)
+    rt = ragged_factory_ops.constant(rt_as_list)
     num_segments = max(segment_ids) + 1
     expected = self.expected_value(rt_as_list, segment_ids, num_segments,
                                    combiner)
 
     segmented = segment_op(rt, segment_ids, num_segments)
-    self.assertNestedListAmostEqual(
-        self.evaluate(segmented).tolist(), expected, places=5)
+    self.assertRaggedAlmostEqual(segmented, expected, places=5)
 
-  @test_util.run_deprecated_v1
   def testRaggedRankTwo(self):
-    rt = ragged.constant([
+    rt = ragged_factory_ops.constant([
         [[111, 112, 113, 114], [121],],  # row 0
         [],                              # row 1
         [[], [321, 322], [331]],         # row 2
         [[411, 412]]                     # row 3
     ])  # pyformat: disable
     segment_ids1 = [0, 2, 2, 2]
-    segmented1 = ragged.segment_sum(rt, segment_ids1, 3)
+    segmented1 = ragged_math_ops.segment_sum(rt, segment_ids1, 3)
     expected1 = [[[111, 112, 113, 114], [121]],     # row 0
                  [],                                # row 1
                  [[411, 412], [321, 322], [331]]    # row 2
                 ]  # pyformat: disable
-    self.assertEqual(self.evaluate(segmented1).tolist(), expected1)
+    self.assertRaggedEqual(segmented1, expected1)
 
     segment_ids2 = [1, 2, 1, 1]
-    segmented2 = ragged.segment_sum(rt, segment_ids2, 3)
+    segmented2 = ragged_math_ops.segment_sum(rt, segment_ids2, 3)
     expected2 = [[],
                  [[111+411, 112+412, 113, 114], [121+321, 322], [331]],
                  []]  # pyformat: disable
-    self.assertEqual(self.evaluate(segmented2).tolist(), expected2)
+    self.assertRaggedEqual(segmented2, expected2)
 
-  @test_util.run_deprecated_v1
   def testRaggedSegmentIds(self):
-    rt = ragged.constant([
+    rt = ragged_factory_ops.constant([
         [[111, 112, 113, 114], [121],],  # row 0
         [],                              # row 1
         [[], [321, 322], [331]],         # row 2
         [[411, 412]]                     # row 3
     ])  # pyformat: disable
-    segment_ids = ragged.constant([[1, 2], [], [1, 1, 2], [2]])
-    segmented = ragged.segment_sum(rt, segment_ids, 3)
+    segment_ids = ragged_factory_ops.constant([[1, 2], [], [1, 1, 2], [2]])
+    segmented = ragged_math_ops.segment_sum(rt, segment_ids, 3)
     expected = [[],
                 [111+321, 112+322, 113, 114],
                 [121+331+411, 412]]  # pyformat: disable
-    self.assertEqual(self.evaluate(segmented).tolist(), expected)
+    self.assertRaggedEqual(segmented, expected)
 
   def testShapeMismatchError1(self):
     dt = constant_op.constant([1, 2, 3, 4, 5, 6])
-    segment_ids = ragged.constant([[1, 2], []])
+    segment_ids = ragged_factory_ops.constant([[1, 2], []])
     self.assertRaisesRegexp(
         ValueError, 'segment_ids.shape must be a prefix of data.shape, '
-        'but segment_ids is ragged and data is not.', ragged.segment_sum, dt,
-        segment_ids, 3)
+        'but segment_ids is ragged and data is not.',
+        ragged_math_ops.segment_sum, dt, segment_ids, 3)
 
-  @test_util.run_deprecated_v1
   def testShapeMismatchError2(self):
-    rt = ragged.constant([
+    rt = ragged_factory_ops.constant([
         [[111, 112, 113, 114], [121]],  # row 0
         [],                             # row 1
         [[], [321, 322], [331]],        # row 2
         [[411, 412]]                    # row 3
     ])  # pyformat: disable
-    segment_ids = ragged.constant([[1, 2], [1], [1, 1, 2], [2]])
+    segment_ids = ragged_factory_ops.constant([[1, 2], [1], [1, 1, 2], [2]])
 
     # Error is raised at graph-building time if we can detect it then.
     self.assertRaisesRegexp(
         errors.InvalidArgumentError,
         'segment_ids.shape must be a prefix of data.shape.*',
-        ragged.segment_sum, rt, segment_ids, 3)
+        ragged_math_ops.segment_sum, rt, segment_ids, 3)
 
     # Otherwise, error is raised when we run the graph.
-    segment_ids2 = ragged.from_row_splits(
+    segment_ids2 = ragged_tensor.RaggedTensor.from_row_splits(
         array_ops.placeholder_with_default(segment_ids.values, None),
         array_ops.placeholder_with_default(segment_ids.row_splits, None))
-    segmented2 = ragged.segment_sum(rt, segment_ids2, 3)
-    with self.cached_session():
-      self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          'segment_ids.shape must be a prefix of data.shape.*', segmented2.eval)
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        'segment_ids.shape must be a prefix of data.shape.*'):
+      self.evaluate(ragged_math_ops.segment_sum(rt, segment_ids2, 3))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_stack_op_test.py b/tensorflow/python/ops/ragged/ragged_stack_op_test.py
index 43434716942fb59452271870b380544f15ea0e74..f9c825168e64a4c9f8f0df572e396ca01dc8de51 100644
--- a/tensorflow/python/ops/ragged/ragged_stack_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_stack_op_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for ragged.stack."""
+"""Tests for ragged_array_ops.stack."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -22,11 +22,15 @@ from absl.testing import parameterized
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import test_util
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_array_ops
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
 
-class RaggedStackOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedStackOpTest(ragged_test_util.RaggedTensorTestCase,
+                        parameterized.TestCase):
 
   @parameterized.parameters(
       dict(
@@ -265,7 +269,6 @@ class RaggedStackOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
           axis=0,
           expected=[[[b'a00', b'a01'], [], [b'a20', b'a21']]]),
   )   # pyformat: disable
-  @test_util.run_deprecated_v1
   def testRaggedStack(self,
                       descr,
                       rt_inputs,
@@ -277,17 +280,16 @@ class RaggedStackOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     if ragged_ranks is None:
       ragged_ranks = [None] * len(rt_inputs)
     rt_inputs = [
-        ragged.constant(rt_input, ragged_rank=rrank)
+        ragged_factory_ops.constant(rt_input, ragged_rank=rrank)  # pylint: disable=g-long-ternary
         if rrank != 0 else constant_op.constant(rt_input)
         for (rt_input, rrank) in zip(rt_inputs, ragged_ranks)
     ]
-    stacked = ragged.stack(rt_inputs, axis)
+    stacked = ragged_array_ops.stack(rt_inputs, axis)
     if expected_ragged_rank is not None:
       self.assertEqual(stacked.ragged_rank, expected_ragged_rank)
     if expected_shape is not None:
       self.assertEqual(stacked.shape.as_list(), expected_shape)
-    with self.test_session():
-      self.assertEqual(stacked.eval().tolist(), expected)
+    self.assertRaggedEqual(stacked, expected)
 
   @parameterized.parameters(
       dict(
@@ -312,9 +314,9 @@ class RaggedStackOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
           message='axis=3 out of bounds: expected -3<=axis<3'),
   )
   def testError(self, rt_inputs, axis, error, message):
-    self.assertRaisesRegexp(error, message, ragged.stack, rt_inputs, axis)
+    self.assertRaisesRegexp(error, message, ragged_array_ops.stack, rt_inputs,
+                            axis)
 
-  @test_util.run_deprecated_v1
   def testSingleTensorInput(self):
     """Tests ragged_stack with a single tensor input.
 
@@ -322,10 +324,9 @@ class RaggedStackOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     also pass in a single value (as with tf.stack), in which case it is
     equivalent to expand_dims(axis=0).  This test exercises that path.
     """
-    rt_inputs = ragged.constant([[1, 2], [3, 4]])
-    stacked = ragged.stack(rt_inputs, 0)
-    with self.test_session():
-      self.assertEqual(stacked.eval().tolist(), [[[1, 2], [3, 4]]])
+    rt_inputs = ragged_factory_ops.constant([[1, 2], [3, 4]])
+    stacked = ragged_array_ops.stack(rt_inputs, 0)
+    self.assertRaggedEqual(stacked, [[[1, 2], [3, 4]]])
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_string_ops.py b/tensorflow/python/ops/ragged/ragged_string_ops.py
index cdcdbdff07b12e4875ab8ff38ff62d3110a76e79..80216376f30ec0ff4b6ce14a0681a43361483093 100644
--- a/tensorflow/python/ops/ragged/ragged_string_ops.py
+++ b/tensorflow/python/ops/ragged/ragged_string_ops.py
@@ -22,24 +22,27 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_string_ops
+from tensorflow.python.ops.ragged import ragged_array_ops
 from tensorflow.python.ops.ragged import ragged_conversion_ops
-from tensorflow.python.ops.ragged import ragged_factory_ops
 from tensorflow.python.ops.ragged import ragged_tensor
 from tensorflow.python.util.tf_export import tf_export
 
 
 # pylint: disable=redefined-builtin
 @tf_export("strings.unicode_encode")
-def unicode_encode(input, output_encoding, errors="replace",
-                   replacement_char=65533, name=None):
+def unicode_encode(input,
+                   output_encoding,
+                   errors="replace",
+                   replacement_char=65533,
+                   name=None):
   r"""Encodes each sequence of Unicode code points in `input` into a string.
 
   `result[i1...iN]` is the string formed by concatenating the Unicode
   codepoints `input[1...iN, :]`, encoded using `output_encoding`.
 
   Args:
-    input: An `N+1` dimensional potentially ragged integer tensor with
-        shape `[D1...DN, num_chars]`.
+    input: An `N+1` dimensional potentially ragged integer tensor with shape
+      `[D1...DN, num_chars]`.
     output_encoding: Unicode encoding that should be used to encode each
       codepoint sequence.  Can be `"UTF-8"`, `"UTF-16-BE"`, or `"UTF-32-BE"`.
     errors: Specifies the response when an invalid codepoint is encountered
@@ -65,16 +68,16 @@ def unicode_encode(input, output_encoding, errors="replace",
     ```
   """
   with ops.name_scope(name, "UnicodeEncode", [input]):
-    input_tensor = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(input)
+    input_tensor = ragged_tensor.convert_to_tensor_or_ragged_tensor(input)
     if input_tensor.shape.ndims is None:
       raise ValueError("Rank of input_tensor must be statically known.")
     if ragged_tensor.is_ragged(input_tensor):
-      if input_tensor.inner_values.shape.ndims > 1:
-        # If the inner_values of our ragged tensor is multi-dimensional, we can
+      if input_tensor.flat_values.shape.ndims > 1:
+        # If the flat_values of our ragged tensor is multi-dimensional, we can
         # process it separately and our output will have the same nested splits
         # as our input.
-        return input_tensor.with_inner_values(
-            unicode_encode(input_tensor.inner_values, output_encoding, errors,
+        return input_tensor.with_flat_values(
+            unicode_encode(input_tensor.flat_values, output_encoding, errors,
                            replacement_char))
       elif input_tensor.ragged_rank > 1:
         # Recursively process the values of the ragged tensor.
@@ -82,7 +85,7 @@ def unicode_encode(input, output_encoding, errors="replace",
             unicode_encode(input_tensor.values, output_encoding, errors,
                            replacement_char))
       else:
-        # Our ragged tensor is of the correct shape (rank 1 inner_values tensor
+        # Our ragged tensor is of the correct shape (rank 1 flat_values tensor
         # with ragged_rank of 1) so we can process it as normal.
         return gen_string_ops.unicode_encode(
             input_values=input_tensor.values,
@@ -93,8 +96,9 @@ def unicode_encode(input, output_encoding, errors="replace",
     else:
       if input_tensor.shape.ndims == 2:
         # The input tensor is of the correct 2-D shape, it's just not ragged.
-        return unicode_encode(ragged_conversion_ops.from_tensor(input_tensor),
-                              output_encoding, errors, replacement_char)
+        return unicode_encode(
+            ragged_conversion_ops.from_tensor(input_tensor), output_encoding,
+            errors, replacement_char)
       elif input_tensor.shape.ndims > 2:
         # We need to initially flatten the input tensor to 2-D, and then can
         # reshape the output of our processed flattened tensor.
@@ -110,10 +114,289 @@ def unicode_encode(input, output_encoding, errors="replace",
         # Our input tensor is rank 1, so we create a ragged tensor with an added
         # dimension to create the correct input shape & type, and then remove
         # the additional dimension from the output and return the string scalar.
-        ragged_input_tensor = ragged_factory_ops.from_row_splits(
+        ragged_input_tensor = ragged_tensor.RaggedTensor.from_row_splits(
             input_tensor,
-            array_ops.stack([0, array_ops.shape(input_tensor,
-                                                out_type=dtypes.int64)[0]]))
+            array_ops.stack(
+                [0, array_ops.shape(input_tensor, out_type=dtypes.int64)[0]]))
         output_tensor = unicode_encode(ragged_input_tensor, output_encoding,
                                        errors, replacement_char)
         return array_ops.reshape(output_tensor, [])
+
+
+# pylint: disable=redefined-builtin
+@tf_export("strings.unicode_decode")
+def unicode_decode(input,
+                   input_encoding,
+                   errors="replace",
+                   replacement_char=0xFFFD,
+                   replace_control_characters=False,
+                   name=None):
+  r"""Decodes each string in `input` into a sequence of Unicode code points.
+
+  `result[i1...iN, j]` is the Unicode codepoint for the `j`th character in
+  `input[i1...iN]`, when decoded using `input_encoding`.
+
+  Args:
+    input: An `N` dimensional potentially ragged `string` tensor with shape
+      `[D1...DN]`.  `N` must be statically known.
+    input_encoding: String name for the unicode encoding that should be used to
+      decode each string.
+    errors: Specifies the response when an input string can't be converted
+      using the indicated encoding. One of:
+      * `'strict'`: Raise an exception for any illegal substrings.
+      * `'replace'`: Replace illegal substrings with `replacement_char`.
+      * `'ignore'`: Skip illegal substrings.
+    replacement_char: The replacement codepoint to be used in place of invalid
+      substrings in `input` when `errors='replace'`; and in place of C0 control
+      characters in `input` when `replace_control_characters=True`.
+    replace_control_characters: Whether to replace the C0 control characters
+      `(U+0000 - U+001F)` with the `replacement_char`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `N+1` dimensional `int32` tensor with shape `[D1...DN, (num_chars)]`.
+    The returned tensor is a `tf.Tensor` if `input` is a scalar, or a
+    `tf.RaggedTensor` otherwise.
+
+  #### Example:
+    ```python
+    >>> input = [s.encode('utf8') for s in (u'G\xf6\xf6dnight', u'\U0001f60a')]
+    >>> tf.strings.unicode_decode(input, 'UTF-8').tolist()
+    [[71, 246, 246, 100, 110, 105, 103, 104, 116], [128522]]
+    ```
+  """
+  with ops.name_scope(name, "UnicodeDecode", [input]):
+    return _unicode_decode(input, input_encoding, errors, replacement_char,
+                           replace_control_characters, with_offsets=False)
+
+
+@tf_export("strings.unicode_decode_with_offsets")
+def unicode_decode_with_offsets(input,
+                                input_encoding,
+                                errors="replace",
+                                replacement_char=0xFFFD,
+                                replace_control_characters=False,
+                                name=None):
+  r"""Decodes each string into a sequence of code points with start offsets.
+
+  This op is similar to `tf.strings.decode(...)`, but it also returns the
+  start offset for each character in its respective string.  This information
+  can be used to align the characters with the original byte sequence.
+
+  Returns a tuple `(codepoints, start_offsets)` where:
+
+  * `codepoints[i1...iN, j]` is the Unicode codepoint for the `j`th character
+    in `input[i1...iN]`, when decoded using `input_encoding`.
+  * `start_offsets[i1...iN, j]` is the start byte offset for the `j`th
+    character in `input[i1...iN]`, when decoded using `input_encoding`.
+
+  Args:
+    input: An `N` dimensional potentially ragged `string` tensor with shape
+      `[D1...DN]`.  `N` must be statically known.
+    input_encoding: String name for the unicode encoding that should be used to
+      decode each string.
+    errors: Specifies the response when an input string can't be converted
+      using the indicated encoding. One of:
+      * `'strict'`: Raise an exception for any illegal substrings.
+      * `'replace'`: Replace illegal substrings with `replacement_char`.
+      * `'ignore'`: Skip illegal substrings.
+    replacement_char: The replacement codepoint to be used in place of invalid
+      substrings in `input` when `errors='replace'`; and in place of C0 control
+      characters in `input` when `replace_control_characters=True`.
+    replace_control_characters: Whether to replace the C0 control characters
+      `(U+0000 - U+001F)` with the `replacement_char`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A tuple of `N+1` dimensional tensors `(codepoints, start_offsets)`.
+
+    * `codepoints` is an `int32` tensor with shape `[D1...DN, (num_chars)]`.
+    * `offsets` is an `int64` tensor with shape `[D1...DN, (num_chars)]`.
+
+    The returned tensors are `tf.Tensor`s if `input` is a scalar, or
+    `tf.RaggedTensor`s otherwise.
+
+  #### Example:
+    ```python
+    >>> input = [s.encode('utf8') for s in (u'G\xf6\xf6dnight', u'\U0001f60a')]
+    >>> result = tf.strings.unicode_decode_with_offsets(input, 'UTF-8')
+    >>> result[0].tolist()  # codepoints
+    [[71, 246, 246, 100, 110, 105, 103, 104, 116], [128522]]
+    >>> result[1].tolist()  # offsets
+   [[0, 1, 3, 5, 6, 7, 8, 9, 10], [0]]
+    ```
+  """
+  with ops.name_scope(name, "UnicodeDecodeWithOffsets", [input]):
+    return _unicode_decode(input, input_encoding, errors, replacement_char,
+                           replace_control_characters, with_offsets=True)
+
+
+@tf_export("strings.unicode_split")
+def unicode_split(input,
+                  input_encoding,
+                  errors="replace",
+                  replacement_char=0xFFFD,
+                  name=None):
+  r"""Splits each string in `input` into a sequence of Unicode code points.
+
+  `result[i1...iN, j]` is the substring of `input[i1...iN] that encodes its
+  `j`th character, when decoded using `input_encoding`.
+
+  Args:
+    input: An `N` dimensional potentially ragged `string` tensor with shape
+      `[D1...DN]`.  `N` must be statically known.
+    input_encoding: String name for the unicode encoding that should be used to
+      decode each string.
+    errors: Specifies the response when an input string can't be converted
+      using the indicated encoding. One of:
+      * `'strict'`: Raise an exception for any illegal substrings.
+      * `'replace'`: Replace illegal substrings with `replacement_char`.
+      * `'ignore'`: Skip illegal substrings.
+    replacement_char: The replacement codepoint to be used in place of invalid
+      substrings in `input` when `errors='replace'`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `N+1` dimensional `int32` tensor with shape `[D1...DN, (num_chars)]`.
+    The returned tensor is a `tf.Tensor` if `input` is a scalar, or a
+    `tf.RaggedTensor` otherwise.
+
+  #### Example:
+    ```python
+    >>> input = [s.encode('utf8') for s in (u'G\xf6\xf6dnight', u'\U0001f60a')]
+    >>> tf.strings.unicode_split(input, 'UTF-8').tolist()
+    [['G', '\xc3\xb6', '\xc3\xb6', 'd', 'n', 'i', 'g', 'h', 't'],
+     ['\xf0\x9f\x98\x8a']]
+    ```
+  """
+  with ops.name_scope(name, "UnicodeSplit", [input]):
+    codepoints = _unicode_decode(input, input_encoding, errors,
+                                 replacement_char, False, with_offsets=False)
+    return unicode_encode(
+        ragged_array_ops.expand_dims(codepoints, -1),
+        output_encoding=input_encoding,
+        errors=errors,
+        replacement_char=replacement_char)
+
+
+@tf_export("strings.unicode_split_with_offsets")
+def unicode_split_with_offsets(input,
+                               input_encoding,
+                               errors="replace",
+                               replacement_char=0xFFFD,
+                               name=None):
+  r"""Splits each string into a sequence of code points with start offsets.
+
+  This op is similar to `tf.strings.decode(...)`, but it also returns the
+  start offset for each character in its respective string.  This information
+  can be used to align the characters with the original byte sequence.
+
+  Returns a tuple `(chars, start_offsets)` where:
+
+  * `chars[i1...iN, j]` is the substring of `input[i1...iN] that encodes its
+    `j`th character, when decoded using `input_encoding`.
+  * `start_offsets[i1...iN, j]` is the start byte offset for the `j`th
+    character in `input[i1...iN]`, when decoded using `input_encoding`.
+
+  Args:
+    input: An `N` dimensional potentially ragged `string` tensor with shape
+      `[D1...DN]`.  `N` must be statically known.
+    input_encoding: String name for the unicode encoding that should be used to
+      decode each string.
+    errors: Specifies the response when an input string can't be converted
+      using the indicated encoding. One of:
+      * `'strict'`: Raise an exception for any illegal substrings.
+      * `'replace'`: Replace illegal substrings with `replacement_char`.
+      * `'ignore'`: Skip illegal substrings.
+    replacement_char: The replacement codepoint to be used in place of invalid
+      substrings in `input` when `errors='replace'`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A tuple of `N+1` dimensional tensors `(codepoints, start_offsets)`.
+
+    * `codepoints` is an `int32` tensor with shape `[D1...DN, (num_chars)]`.
+    * `offsets` is an `int64` tensor with shape `[D1...DN, (num_chars)]`.
+
+    The returned tensors are `tf.Tensor`s if `input` is a scalar, or
+    `tf.RaggedTensor`s otherwise.
+
+  #### Example:
+    ```python
+    >>> input = [s.encode('utf8') for s in (u'G\xf6\xf6dnight', u'\U0001f60a')]
+    >>> result = tf.strings.unicode_split_with_offsets(input, 'UTF-8')
+    >>> result[0].tolist()  # character substrings
+    [['G', '\xc3\xb6', '\xc3\xb6', 'd', 'n', 'i', 'g', 'h', 't'],
+     ['\xf0\x9f\x98\x8a']]
+    >>> result[1].tolist()  # offsets
+   [[0, 1, 3, 5, 6, 7, 8, 9, 10], [0]]
+    ```
+  """
+  with ops.name_scope(name, "UnicodeSplitWithOffsets", [input]):
+    codepoints, offsets = _unicode_decode(input, input_encoding, errors,
+                                          replacement_char, False,
+                                          with_offsets=True)
+    chars = unicode_encode(
+        ragged_array_ops.expand_dims(codepoints, -1),
+        output_encoding=input_encoding,
+        errors=errors,
+        replacement_char=replacement_char)
+    return chars, offsets
+
+
+def _unicode_decode(input, input_encoding, errors, replacement_char,
+                    replace_control_characters, with_offsets):
+  """Decodes each string into a sequence of codepoints."""
+  input = ragged_tensor.convert_to_tensor_or_ragged_tensor(input, name="input")
+  input_ndims = input.shape.ndims
+  if input_ndims is None:
+    raise ValueError("Rank of `input` must be statically known.")
+
+  if input_ndims > 1:
+    # Convert to a ragged tensor with ragged_rank = input_ndims - 1.
+    if not ragged_tensor.is_ragged(input):
+      input = ragged_conversion_ops.from_tensor(
+          input, ragged_rank=input_ndims - 1)
+    elif input.ragged_rank < input_ndims - 1:
+      input = input.with_flat_values(
+          ragged_conversion_ops.from_tensor(
+              input.flat_values,
+              ragged_rank=input_ndims - input.ragged_rank + 1))
+
+  # Reshape the input to a flat vector, and apply the gen_string_ops op.
+  if ragged_tensor.is_ragged(input):
+    flat_input = array_ops.reshape(input.flat_values, [-1])
+  else:
+    flat_input = array_ops.reshape(input, [-1])
+
+  if with_offsets:
+    decode_op = gen_string_ops.unicode_decode_with_offsets
+  else:
+    decode_op = gen_string_ops.unicode_decode
+  flat_result = decode_op(
+      input=flat_input,
+      input_encoding=input_encoding,
+      errors=errors,
+      replacement_char=replacement_char,
+      replace_control_characters=replace_control_characters)
+
+  if input_ndims == 0:
+    codepoints = flat_result.char_values
+    if with_offsets:
+      offsets = flat_result.char_to_byte_starts
+  else:
+    codepoints = ragged_tensor.RaggedTensor.from_row_splits(
+        flat_result.char_values, flat_result.row_splits)
+    if input_ndims > 1:
+      codepoints = input.with_flat_values(codepoints)
+    if with_offsets:
+      offsets = ragged_tensor.RaggedTensor.from_row_splits(
+          flat_result.char_to_byte_starts, flat_result.row_splits)
+      if input_ndims > 1:
+        offsets = input.with_flat_values(offsets)
+
+  if with_offsets:
+    return codepoints, offsets
+  else:
+    return codepoints
+
diff --git a/tensorflow/python/ops/ragged/ragged_tensor.py b/tensorflow/python/ops/ragged/ragged_tensor.py
index 90f0dafd99a426f5a3e38a93d6b7fcff7de95c7c..fd334e6cc713d3cc3e94a84e9f7f7bdc813e0a7b 100644
--- a/tensorflow/python/ops/ragged/ragged_tensor.py
+++ b/tensorflow/python/ops/ragged/ragged_tensor.py
@@ -19,9 +19,20 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.client import session
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_ragged_conversion_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.ragged import ragged_tensor_value
+from tensorflow.python.ops.ragged import ragged_util
+from tensorflow.python.ops.ragged import segment_id_ops
+from tensorflow.python.util.tf_export import tf_export
 
 # pylint: disable=protected-access
 _eval_using_default_session = ops._eval_using_default_session
@@ -33,8 +44,9 @@ _eval_using_default_session = ops._eval_using_default_session
 #===============================================================================
 
 
+@tf_export("RaggedTensor")
 class RaggedTensor(object):
-  """Represents a ragged tensor (go/ragged).
+  """Represents a ragged tensor.
 
   A `RaggedTensor` is a tensor with one or more *ragged dimensions*, which are
   dimensions whose slices may have different lengths.  For example, the inner
@@ -64,7 +76,7 @@ class RaggedTensor(object):
   a 3-D `RaggedTensor` that stores the fixed-size word embedding for each
   word in a sentence, for each sentence in a batch, could be written as
   `[num_sentences, (num_words), embedding_size]`.  The parentheses around
-  `(num_words)` indicate that that dimension is ragged, and that the length
+  `(num_words)` indicate that dimension is ragged, and that the length
   of each element list in that dimension may vary for each item.
 
   ### Component Tensors
@@ -84,10 +96,10 @@ class RaggedTensor(object):
   Example:
 
   ```python
-  >>> rt = ragged.from_row_splits(values=[3, 1, 4, 1, 5, 9, 2, 6],
-  ...                             row_splits=[0, 4, 4, 7, 8, 8])
-  >>> rt.tolist()
-  [[3, 1, 4, 1], [], [5, 9, 2], [6], []]
+  >>> print(tf.RaggedTensor.from_row_splits(
+  ...     values=[3, 1, 4, 1, 5, 9, 2, 6],
+  ...     row_splits=[0, 4, 4, 7, 8, 8]))
+  <tf.RaggedTensor [[3, 1, 4, 1], [], [5, 9, 2], [6], []]>
   ```
 
   ### Alternative Row-Partitioning Schemes
@@ -116,13 +128,12 @@ class RaggedTensor(object):
 
   ```python
   >>> values = [3, 1, 4, 1, 5, 9, 2, 6]
-  >>> rt1 = ragged.from_row_splits(values, row_splits=[0, 4, 4, 7, 8, 8])
-  >>> rt2 = ragged.from_row_lengths(values, row_lengths=[4, 0, 3, 1, 0])
-  >>> rt3 = ragged.from_value_rowids(values,
-  ...                                value_rowids=[0, 0, 0, 0, 2, 2, 2, 3],
-  ...                                nrows=5)
-  >>> rt4 = ragged.from_row_starts(values, row_starts=[0, 4, 4, 7, 8])
-  >>> rt5 = ragged.from_row_limits(values, row_limits=[4, 4, 7, 8, 8])
+  >>> rt1 = RaggedTensor.from_row_splits(values, row_splits=[0, 4, 4, 7, 8, 8])
+  >>> rt2 = RaggedTensor.from_row_lengths(values, row_lengths=[4, 0, 3, 1, 0])
+  >>> rt3 = RaggedTensor.from_value_rowids(
+  ...     values, value_rowids=[0, 0, 0, 0, 2, 2, 2, 3], nrows=5)
+  >>> rt4 = RaggedTensor.from_row_starts(values, row_starts=[0, 4, 4, 7, 8])
+  >>> rt5 = RaggedTensor.from_row_limits(values, row_limits=[4, 4, 7, 8, 8])
   ```
 
   ### Multiple Ragged Dimensions
@@ -132,24 +143,24 @@ class RaggedTensor(object):
   adds a single ragged dimension.
 
   ```python
-  >>> inner_rt = ragged.from_row_splits(  # =rt1 from above
+  >>> inner_rt = RaggedTensor.from_row_splits(  # =rt1 from above
   ...     values=[3, 1, 4, 1, 5, 9, 2, 6], row_splits=[0, 4, 4, 7, 8, 8])
-  >>> outer_rt = ragged.from_row_splits(
+  >>> outer_rt = RaggedTensor.from_row_splits(
   ...     values=inner_rt, row_splits=[0, 3, 3, 5])
-  >>> print outer_rt.tolist()
+  >>> print outer_rt.to_list()
   [[[3, 1, 4, 1], [], [5, 9, 2]], [], [[6], []]]
   >>> print outer_rt.ragged_rank
   2
   ```
 
-  The factory function `ragged.from_nested_row_splits` may be used to
+  The factory function `RaggedTensor.from_nested_row_splits` may be used to
   construct a `RaggedTensor` with multiple ragged dimensions directly, by
   providing a list of `row_splits` tensors:
 
   ```python
-  >>> ragged.from_nested_row_splits(
-  ...     inner_values=[3, 1, 4, 1, 5, 9, 2, 6],
-  ...     nested_row_splits=([0, 3, 3, 5], [0, 4, 4, 7, 8, 8])).tolist()
+  >>> RaggedTensor.from_nested_row_splits(
+  ...     flat_values=[3, 1, 4, 1, 5, 9, 2, 6],
+  ...     nested_row_splits=([0, 3, 3, 5], [0, 4, 4, 7, 8, 8])).to_list()
   [[[3, 1, 4, 1], [], [5, 9, 2]], [], [[6], []]]
   ```
 
@@ -159,12 +170,13 @@ class RaggedTensor(object):
   by using a multidimensional `Tensor` for `values`.
 
   ```python
-  >>> rt = ragged.from_row_splits(values=tf.ones([5, 3]), row_splits=[0, 2, 5])
-  >>> print rt.tolist()
+  >>> rt = RaggedTensor.from_row_splits(values=tf.ones([5, 3]),
+  ..                                    row_splits=[0, 2, 5])
+  >>> print rt.to_list()
   [[[1, 1, 1], [1, 1, 1]],
    [[1, 1, 1], [1, 1, 1], [1, 1, 1]]]
-   >>> print rt.shape.as_list()
-   [2, None, 3]
+   >>> print rt.shape
+   (2, ?, 3)
   ```
 
   ### RaggedTensor Shape Restrictions
@@ -181,31 +193,6 @@ class RaggedTensor(object):
   dimension followed by a ragged dimension.
   """
 
-  #=============================================================================
-  # Implementation notes
-  #=============================================================================
-  # Currently, the RaggedTensor class uses a single row-partitioning scheme
-  # (row_splits).
-  #
-  # We are considering adding value_rowids+nvals as a secondary
-  # row-partitioning scheme.  This change would not impact the functional
-  # interface of the RaggedTensor class, but it would impact the efficiency
-  # of several operations.  In particular:
-  #
-  #   * The functions `ragged.value_rowids` and `ragged.nrows` would always
-  #     return pre-existing tensors; they would not need to add any ops to
-  #     the graph.
-  #
-  #   * The `RaggedTensor` constructor would construct all row-partitioning
-  #     tensors (row_splits, value_rowids, and nvals).  In eager mode, this
-  #     would mean that conversion operations would occur whenever a
-  #     `RaggedTensor` is constructed.  But in graph mode, the converted
-  #     row-partitioning tensors would only be evaluated if they are used.
-  #
-  # Since this change impacts efficiency but not functionality, we would like
-  # to perform additional profiling with real-world use cases before we
-  # decide whether to make this change.
-
   #=============================================================================
   # Constructor (private)
   #=============================================================================
@@ -221,13 +208,14 @@ class RaggedTensor(object):
     This constructor is private -- please use one of the following ops to
     build `RaggedTensor`s:
 
-      * [`ragged.from_row_lengths()`](from_row_lengths.md)
-      * [`ragged.from_value_rowids()`](from_value_rowids.md)
-      * [`ragged.from_row_splits()`](from_row_splits.md)
-      * [`ragged.from_row_starts()`](from_row_starts.md)
-      * [`ragged.from_row_limits()`](from_row_limits.md)
-      * [`ragged.from_nested_row_splits()`](from_nested_row_splits.md)
-      * [`ragged.from_nested_value_rowids()`](from_nested_value_rowids.md)
+      * `tf.RaggedTensor.from_row_lengths`
+      * `tf.RaggedTensor.from_value_rowids`
+      * `tf.RaggedTensor.from_row_splits`
+      * `tf.RaggedTensor.from_row_starts`
+      * `tf.RaggedTensor.from_row_limits`
+      * `tf.RaggedTensor.from_nested_row_splits`
+      * `tf.RaggedTensor.from_nested_row_lengths`
+      * `tf.RaggedTensor.from_nested_value_rowids`
 
     Args:
       values: A potentially ragged tensor of any dtype and shape `[nvals, ...]`.
@@ -248,7 +236,7 @@ class RaggedTensor(object):
     if not internal:
       raise ValueError("RaggedTensor constructor is private; please use one "
                        "of the factory methods instead (e.g., "
-                       "ragged.from_row_lengths())")
+                       "RaggedTensor.from_row_lengths())")
 
     # Validate the arguments.
     if not isinstance(values, (RaggedTensor, ops.Tensor)):
@@ -272,6 +260,364 @@ class RaggedTensor(object):
     self._cached_value_rowids = cached_value_rowids
     self._cached_nrows = cached_nrows
 
+  #=============================================================================
+  # Factory Methods
+  #=============================================================================
+
+  @classmethod
+  def from_value_rowids(cls, values, value_rowids, nrows=None, name=None):
+    """Creates a `RaggedTensor` with rows partitioned by `value_rowids`.
+
+    The returned `RaggedTensor` corresponds with the python list defined by:
+
+    ```python
+    result = [[values[i] for i in range(len(values)) if value_rowids[i] == row]
+              for row in range(nrows)]
+    ```
+
+    Warning: currently, this needs to cast value_rowids to int64 before
+    converting, since `tf.bincount` only supports `int32`.
+
+    Args:
+      values: A potentially ragged tensor with shape `[nvals, ...]`.
+      value_rowids: A 1-D int64 tensor with shape `[nvals]`, which corresponds
+        one-to-one with `values`, and specifies each value's row index.  Must be
+        nonnegative, and must be sorted in ascending order.
+      nrows: An int64 scalar specifying the number of rows.  This should be
+        specified if the `RaggedTensor` may containing empty training rows. Must
+        be greater than `value_rowids[-1]` (or zero if `value_rowids` is empty).
+        Defaults to `value_rowids[-1]` (or zero if `value_rowids` is empty).
+      name: A name prefix for the RaggedTensor (optional).
+
+    Returns:
+      A `RaggedTensor`.  `result.rank = values.rank + 1`.
+      `result.ragged_rank = values.ragged_rank + 1`.
+
+    Raises:
+      ValueError: If `nrows` is incompatible with `value_rowids`.
+
+    #### Example:
+      ```python
+      >>> print(tf.RaggedTensor.from_value_rowids(
+      ...     values=[3, 1, 4, 1, 5, 9, 2, 6],
+      ...     value_rowids=[0, 0, 0, 0, 2, 2, 2, 3],
+      ...     nrows=5))
+      <tf.RaggedTensor [[3, 1, 4, 1], [], [5, 9, 2], [6], []]>
+      ```
+    """
+    with ops.name_scope(name, "RaggedFromValueRowIds",
+                        [values, value_rowids, nrows]):
+      values = convert_to_tensor_or_ragged_tensor(values, name="values")
+      value_rowids = ops.convert_to_tensor(
+          value_rowids, dtypes.int64, name="value_rowids")
+      if nrows is None:
+        const_rowids = tensor_util.constant_value(value_rowids)
+        if const_rowids is None:
+          nrows = array_ops.concat([value_rowids[-1:], [-1]], axis=0)[0] + 1
+          const_nrows = None
+        else:
+          const_nrows = const_rowids[-1] + 1 if const_rowids.size > 0 else 0
+          nrows = ops.convert_to_tensor(const_nrows, dtypes.int64, name="nrows")
+      else:
+        nrows = ops.convert_to_tensor(nrows, dtypes.int64, "nrows")
+        const_nrows = tensor_util.constant_value(nrows)
+        if const_nrows is not None:
+          if const_nrows < 0:
+            raise ValueError("Expected nrows >= 0; got %d" % const_nrows)
+          const_rowids = tensor_util.constant_value(value_rowids)
+          if const_rowids is not None and const_rowids.size > 0:
+            if not const_nrows >= const_rowids[-1] + 1:
+              raise ValueError(
+                  "Expected nrows >= value_rowids[-1] + 1; got nrows=%d, "
+                  "value_rowids[-1]=%d" % (const_nrows, const_rowids[-1]))
+
+      value_rowids.shape.assert_has_rank(1)
+      nrows.shape.assert_has_rank(0)
+      values.shape[:1].assert_is_compatible_with(value_rowids.shape)
+
+      # Convert value_rowids & nrows to row_splits.
+      # Note: we don't use segment_ids_to_row_splits() here because we want
+      # to save the intermediate value `row_lengths`, so we can cache it.
+      # TODO(b/116708836) Upgrade bincount to accept int64 so we can skip the
+      # cast (Remove the warning in the docstring when we do.)
+      value_rowids_int32 = math_ops.cast(value_rowids, dtypes.int32)
+      nrows_int32 = math_ops.cast(nrows, dtypes.int32)
+      row_lengths = math_ops.bincount(
+          value_rowids_int32,
+          minlength=nrows_int32,
+          maxlength=nrows_int32,
+          dtype=dtypes.int64)
+      row_splits = array_ops.concat([[0], math_ops.cumsum(row_lengths)], axis=0)
+      if const_nrows is not None:
+        row_lengths.set_shape([const_nrows])
+        row_splits.set_shape([const_nrows + 1])
+
+      return cls(
+          values,
+          row_splits,
+          cached_row_lengths=row_lengths,
+          cached_value_rowids=value_rowids,
+          cached_nrows=nrows,
+          internal=True)
+
+  @classmethod
+  def from_row_splits(cls, values, row_splits, name=None):
+    """Creates a `RaggedTensor` with rows partitioned by `row_splits`.
+
+    The returned `RaggedTensor` corresponds with the python list defined by:
+
+    ```python
+    result = [values[row_splits[i]:row_splits[i + 1]]
+              for i in range(len(row_splits) - 1)]
+    ```
+
+    Args:
+      values: A potentially ragged tensor with shape `[nvals, ...]`.
+      row_splits: A 1-D int64 tensor with shape `[nrows+1]`.  Must not be empty,
+        and must be sorted in ascending order.  `row_splits[0]` must be zero and
+        `row_splits[-1]` must be `nvals`.
+      name: A name prefix for the RaggedTensor (optional).
+
+    Returns:
+      A `RaggedTensor`.  `result.rank = values.rank + 1`.
+      `result.ragged_rank = values.ragged_rank + 1`.
+
+    Raises:
+      ValueError: If `row_splits` is an empty list.
+
+    #### Example:
+      ```python
+      >>> print(tf.RaggedTensor.from_row_splits(
+      ...     values=[3, 1, 4, 1, 5, 9, 2, 6],
+      ...     row_splits=[0, 4, 4, 7, 8, 8]))
+      <tf.RaggedTensor [[3, 1, 4, 1], [], [5, 9, 2], [6], []]>
+      ```
+    """
+    if isinstance(row_splits, (list, tuple)) and not row_splits:
+      raise ValueError("row_splits tensor may not be empty.")
+    with ops.name_scope(name, "RaggedFromRowSplits", [values, row_splits]):
+      values = convert_to_tensor_or_ragged_tensor(values, name="values")
+      row_splits = ops.convert_to_tensor(row_splits, dtypes.int64, "row_splits")
+      row_splits.shape.assert_has_rank(1)
+      return cls(values=values, row_splits=row_splits, internal=True)
+
+  @classmethod
+  def from_row_lengths(cls, values, row_lengths, name=None):
+    """Creates a `RaggedTensor` with rows partitioned by `row_lengths`.
+
+    The returned `RaggedTensor` corresponds with the python list defined by:
+
+    ```python
+    result = [[values.pop(0) for i in range(length)]
+              for length in row_lengths]
+    ```
+
+    Args:
+      values: A potentially ragged tensor with shape `[nvals, ...]`.
+      row_lengths: A 1-D int64 tensor with shape `[nrows]`.  Must be
+        nonnegative.  `sum(row_lengths)` must be `nvals`.
+      name: A name prefix for the RaggedTensor (optional).
+
+    Returns:
+      A `RaggedTensor`.  `result.rank = values.rank + 1`.
+      `result.ragged_rank = values.ragged_rank + 1`.
+
+    #### Example:
+      ```python
+      >>> print(tf.RaggedTensor.from_row_lengths(
+      ...     values=[3, 1, 4, 1, 5, 9, 2, 6],
+      ...     row_lengths=[4, 0, 3, 1, 0]))
+      <tf.RaggedTensor [[3, 1, 4, 1], [], [5, 9, 2], [6], []])>
+      ```
+    """
+    with ops.name_scope(name, "RaggedFromRowLengths", [values, row_lengths]):
+      values = convert_to_tensor_or_ragged_tensor(values, name="values")
+      row_lengths = ops.convert_to_tensor(row_lengths, dtypes.int64,
+                                          "row_lengths")
+      row_lengths.shape.assert_has_rank(1)
+      row_limits = math_ops.cumsum(row_lengths)
+      row_splits = array_ops.concat([[0], row_limits], axis=0)
+      return cls(
+          values=values,
+          row_splits=row_splits,
+          cached_row_lengths=row_lengths,
+          internal=True)
+
+  @classmethod
+  def from_row_starts(cls, values, row_starts, name=None):
+    """Creates a `RaggedTensor` with rows partitioned by `row_starts`.
+
+    Equivalent to: `from_row_splits(values, concat([row_starts, nvals]))`.
+
+    Args:
+      values: A potentially ragged tensor with shape `[nvals, ...]`.
+      row_starts: A 1-D int64 tensor with shape `[nrows]`.  Must be nonnegative
+        and sorted in ascending order.  If `nrows>0`, then `row_starts[0]` must
+        be zero.
+      name: A name prefix for the RaggedTensor (optional).
+
+    Returns:
+      A `RaggedTensor`.  `result.rank = values.rank + 1`.
+      `result.ragged_rank = values.ragged_rank + 1`.
+
+    #### Example:
+      ```python
+      >>> print(tf.RaggedTensor.from_row_starts(
+      ...     values=[3, 1, 4, 1, 5, 9, 2, 6],
+      ...     row_starts=[0, 4, 4, 7, 8]))
+      <tf.RaggedTensor [[3, 1, 4, 1], [], [5, 9, 2], [6], []]>
+      ```
+    """
+    with ops.name_scope(name, "RaggedFromRowStarts", [values, row_starts]):
+      values = convert_to_tensor_or_ragged_tensor(values, name="values")
+      row_starts = ops.convert_to_tensor(row_starts, dtypes.int64, "row_starts")
+      row_starts.shape.assert_has_rank(1)
+      nvals = array_ops.shape(values, out_type=dtypes.int64)[:1]
+      row_splits = array_ops.concat([row_starts, nvals], axis=0)
+      return cls(values=values, row_splits=row_splits, internal=True)
+
+  @classmethod
+  def from_row_limits(cls, values, row_limits, name=None):
+    """Creates a `RaggedTensor` with rows partitioned by `row_limits`.
+
+    Equivalent to: `from_row_splits(values, concat([0, row_limits]))`.
+
+    Args:
+      values: A potentially ragged tensor with shape `[nvals, ...]`.
+      row_limits: A 1-D int64 tensor with shape `[nrows]`.  Must be sorted in
+        ascending order.  If `nrows>0`, then `row_limits[-1]` must be `nvals`.
+      name: A name prefix for the RaggedTensor (optional).
+
+    Returns:
+      A `RaggedTensor`.  `result.rank = values.rank + 1`.
+      `result.ragged_rank = values.ragged_rank + 1`.
+
+    #### Example:
+      ```python
+      >>> print(tf.RaggedTensor.from_row_limits(
+      ...     values=[3, 1, 4, 1, 5, 9, 2, 6],
+      ...     row_limits=[4, 4, 7, 8, 8]))
+      <tf.RaggedTensor [[3, 1, 4, 1], [], [5, 9, 2], [6], []]>
+      ```
+    """
+    with ops.name_scope(name, "RaggedFromRowLimits", [values, row_limits]):
+      values = convert_to_tensor_or_ragged_tensor(values, name="values")
+      row_limits = ops.convert_to_tensor(row_limits, dtypes.int64, "row_limits")
+      row_limits.shape.assert_has_rank(1)
+      zero = array_ops.zeros([1], dtypes.int64)
+      row_splits = array_ops.concat([zero, row_limits], axis=0)
+      return cls(values=values, row_splits=row_splits, internal=True)
+
+  @classmethod
+  def from_nested_value_rowids(cls,
+                               flat_values,
+                               nested_value_rowids,
+                               nested_nrows=None,
+                               name=None):
+    """Creates a `RaggedTensor` from a nested list of `value_rowids` tensors.
+
+    Equivalent to:
+
+    ```python
+    result = flat_values
+    for (rowids, nrows) in reversed(zip(nested_value_rowids, nested_nrows)):
+      result = from_value_rowids(result, rowids, nrows)
+    ```
+
+    Args:
+      flat_values: A potentially ragged tensor.
+      nested_value_rowids: A list of 1-D int64 tensors.  The `i`th tensor is
+        used as the `value_rowids` for the `i`th ragged dimension.
+      nested_nrows: A list of int64 scalars.  The `i`th scalar is used as the
+        `nrows` for the `i`th ragged dimension.
+      name: A name prefix for the RaggedTensor (optional).
+
+    Returns:
+      A `RaggedTensor` (or `flat_values` if `nested_value_rowids` is empty).
+
+    Raises:
+      ValueError: If `len(nested_values_rowids) != len(nested_nrows)`.
+    """
+    if isinstance(nested_value_rowids, ops.Tensor):
+      raise TypeError("nested_value_rowids must be a list of Tensors")
+    if nested_nrows is None:
+      nested_nrows = [None] * len(nested_value_rowids)
+    else:
+      if isinstance(nested_nrows, ops.Tensor):
+        raise TypeError("nested_nrows must be a list of Tensors")
+      if len(nested_nrows) != len(nested_value_rowids):
+        raise ValueError("nested_nrows must have the same length as "
+                         "nested_value_rowids")
+
+    with ops.name_scope(
+        name, "RaggedFromNestedValueRowIds",
+        [flat_values] + list(nested_value_rowids) + list(nested_nrows)):
+      result = flat_values
+      for value_rowids, nrows in reversed(
+          list(zip(nested_value_rowids, nested_nrows))):
+        result = cls.from_value_rowids(result, value_rowids, nrows)
+      return result
+
+  @classmethod
+  def from_nested_row_splits(cls, flat_values, nested_row_splits, name=None):
+    """Creates a `RaggedTensor` from a nested list of `row_splits` tensors.
+
+    Equivalent to:
+
+    ```python
+    result = flat_values
+    for row_splits in reversed(nested_row_splits):
+      result = from_row_splits(result, row_splits)
+    ```
+
+    Args:
+      flat_values: A potentially ragged tensor.
+      nested_row_splits: A list of 1-D int64 tensors.  The `i`th tensor is used
+        as the `row_splits` for the `i`th ragged dimension.
+      name: A name prefix for the RaggedTensor (optional).
+
+    Returns:
+      A `RaggedTensor` (or `flat_values` if `nested_row_splits` is empty).
+    """
+    if isinstance(nested_row_splits, ops.Tensor):
+      raise TypeError("nested_row_splits must be a list of Tensors")
+    with ops.name_scope(name, "RaggedFromNestedRowSplits",
+                        [flat_values] + list(nested_row_splits)):
+      result = flat_values
+      for splits in reversed(nested_row_splits):
+        result = cls.from_row_splits(result, splits)
+      return result
+
+  @classmethod
+  def from_nested_row_lengths(cls, flat_values, nested_row_lengths, name=None):
+    """Creates a `RaggedTensor` from a nested list of `row_lengths` tensors.
+
+    Equivalent to:
+
+    ```python
+    result = flat_values
+    for row_lengths in reversed(nested_row_lengths):
+      result = from_row_lengths(result, row_lengths)
+    ```
+
+    Args:
+      flat_values: A potentially ragged tensor.
+      nested_row_lengths: A list of 1-D int64 tensors.  The `i`th tensor is used
+        as the `row_lengths` for the `i`th ragged dimension.
+      name: A name prefix for the RaggedTensor (optional).
+
+    Returns:
+      A `RaggedTensor` (or `flat_values` if `nested_row_lengths` is empty).
+    """
+    if isinstance(nested_row_lengths, ops.Tensor):
+      raise TypeError("nested_row_lengths must be a list of Tensors")
+    with ops.name_scope(name, "RaggedFromNestedRowlengths",
+                        [flat_values] + list(nested_row_lengths)):
+      result = flat_values
+      for lengths in reversed(nested_row_lengths):
+        result = cls.from_row_lengths(result, lengths)
+      return result
+
   #=============================================================================
   # Accessors
   #=============================================================================
@@ -334,8 +680,8 @@ class RaggedTensor(object):
     #### Example:
       ```python
       >>> rt = ragged.constant([[3, 1, 4, 1], [], [5, 9, 2], [6], []])
-      >>> rt.values.eval()
-      [3, 1, 4, 1, 5, 9, 2, 6]
+      >>> print rt.values
+      tf.Tensor([3, 1, 4, 1, 5, 9, 2, 6])
       ```
     """
     return self._values
@@ -357,26 +703,24 @@ class RaggedTensor(object):
     #### Example:
       ```python
       >>> rt = ragged.constant([[3, 1, 4, 1], [], [5, 9, 2], [6], []])
-      >>> rt.values.eval()
-      [3, 1, 4, 1, 5, 9, 2, 6]
-      >>> rt.row_splits.eval()  # indices of row splits in ragged.values
-      [0, 4, 4, 7, 8, 8]
+      >>> print rt.row_splits  # indices of row splits in rt.values
+      tf.Tensor([0, 4, 4, 7, 8, 8])
       ```
     """
     return self._row_splits
 
   @property
-  def inner_values(self):
+  def flat_values(self):
     """The innermost `values` tensor for this ragged tensor.
 
-    Concretely, if `rt.values` is a `Tensor`, then `rt.inner_values` is
-    `rt.values`; otherwise, `rt.inner_values` is `rt.values.inner_values`.
+    Concretely, if `rt.values` is a `Tensor`, then `rt.flat_values` is
+    `rt.values`; otherwise, `rt.flat_values` is `rt.values.flat_values`.
 
-    Conceptually, `inner_values` is the tensor formed by flattening the
+    Conceptually, `flat_values` is the tensor formed by flattening the
     outermost dimension and all of the ragged dimensions into a single
     dimension.
 
-    `rt.inner_values.shape = [nvals] + rt.shape[rt.ragged_rank + 1:]`
+    `rt.flat_values.shape = [nvals] + rt.shape[rt.ragged_rank + 1:]`
     (where `nvals` is the number of items in the flattened dimensions).
 
     Returns:
@@ -386,8 +730,8 @@ class RaggedTensor(object):
 
       ```python
       >>> rt = ragged.constant([[[3, 1, 4, 1], [], [5, 9, 2]], [], [[6], []]])
-      >>> ragged.inner_values(rt).eval()
-      [3, 1, 4, 1, 5, 9, 2, 6]
+      >>> print rt.flat_values()
+      tf.Tensor([3, 1, 4, 1, 5, 9, 2, 6])
       ```
     """
     rt_values = self.values
@@ -413,8 +757,8 @@ class RaggedTensor(object):
 
       ```python
       >>> rt = ragged.constant([[[[3, 1, 4, 1], [], [5, 9, 2]], [], [[6], []]]])
-      >>> for i, splits in enumerate(ragged.nested_row_splits(rt)):
-      ...   print('Splits for dimension %d: %s' % (i+1, splits.eval()))
+      >>> for i, splits in enumerate(rt.nested_row_splits()):
+      ...   print('Splits for dimension %d: %s' % (i+1, splits))
       Splits for dimension 1: [0, 1]
       Splits for dimension 2: [0, 3, 3, 5]
       Splits for dimension 3: [0, 4, 4, 7, 8, 8]
@@ -428,38 +772,220 @@ class RaggedTensor(object):
       rt_values = rt_values.values
     return tuple(rt_nested_splits)
 
-  @property
-  def cached_value_rowids(self):
-    """The row lengths for this `RaggedTensor`, or `None`.
+  def value_rowids(self, name=None):
+    """Returns the row indices for the `values` in this ragged tensor.
+
+    `rt.value_rowids()` corresponds one-to-one with the outermost dimension of
+    `rt.values`, and specifies the row containing each value.  In particular,
+    the row `rt[row]` consists of the values `rt.values[j]` where
+    `rt.value_rowids()[j] == row`.
+
+    Args:
+      name: A name prefix for the returned tensor (optional).
 
     Returns:
-      The `value_rowids` tensor that was used to construct this `RaggedTensor`
-      if it was constructed using
-      [`ragged.from_value_rowids`](from_value_rowids.md); or `None` otherwise.
+      A 1-D `int64` `Tensor` with shape `self.values.shape[:1]`.
+      The returned tensor is nonnegative, and is sorted in ascending order.
+
+    #### Example:
+      ```python
+      >>> rt = ragged.constant([[3, 1, 4, 1], [], [5, 9, 2], [6], []])
+      >>> rt.values
+      tf.Tensor([3, 1, 4, 1, 5, 9, 2, 6])
+      >>> rt.value_rowids()
+      tf.Tensor([0, 0, 0, 0, 2, 2, 2, 3])  # corresponds 1:1 with rt.values
+      ```
     """
-    return self._cached_value_rowids
+    if self._cached_value_rowids is not None:
+      return self._cached_value_rowids
 
-  @property
-  def cached_nrows(self):
-    """The row lengths for this `RaggedTensor`, or `None`.
+    with ops.name_scope(name, "RaggedValueRowIds", [self]):
+      return segment_id_ops.row_splits_to_segment_ids(self.row_splits)
+
+  def nrows(self, out_type=dtypes.int64, name=None):
+    """Returns the number of rows in this ragged tensor.
+
+    I.e., the size of the outermost dimension of the tensor.
+
+    Args:
+      out_type: `dtype` for the returned tensor.
+      name: A name prefix for the returned tensor (optional).
 
     Returns:
-      The `nrows` tensor that was used to construct this `RaggedTensor`
-      if it was constructed using
-      [`ragged.from_value_rowids`](from_value_rowids.md); or `None` otherwise.
+      A scalar `Tensor` with dtype `out_type`.
+
+    #### Example:
+      ```python
+      >>> rt = ragged.constant([[3, 1, 4, 1], [], [5, 9, 2], [6], []])
+      >>> rt.nrows()  # rt has 5 rows.
+      5
+      ```
     """
-    return self._cached_nrows
+    if self._cached_nrows is not None:
+      return self._cached_nrows
 
-  @property
-  def cached_row_lengths(self):
-    """The row lengths for this `RaggedTensor`, or `None`.
+    with ops.name_scope(name, "RaggedNRows", [self]):
+      return array_ops.shape(self.row_splits, out_type=out_type)[0] - 1
+
+  def row_starts(self, name=None):
+    """Returns the start indices for rows in this ragged tensor.
+
+    These indices specify where the values for each row begin in
+    `self.values`.  `rt.row_starts()` is equal to `rt.row_splits[:-1]`.
+
+    Args:
+      name: A name prefix for the returned tensor (optional).
+
+    Returns:
+      A 1-D Tensor of int64 with shape `[nrows]`.
+      The returned tensor is nonnegative, and is sorted in ascending order.
+
+    #### Example:
+      ```python
+      >>> rt = ragged.constant([[3, 1, 4, 1], [], [5, 9, 2], [6], []])
+      >>> rt.values
+      tf.Tensor([3, 1, 4, 1, 5, 9, 2, 6])
+      >>> rt.row_starts()  # indices of row starts in rt.values
+      tf.Tensor([0, 4, 4, 7, 8])
+      ```
+    """
+    with ops.name_scope(name, "RaggedRowStarts", [self]):
+      return self.row_splits[:-1]
+
+  def row_limits(self, name=None):
+    """Returns the limit indices for rows in this ragged tensor.
+
+    These indices specify where the values for each row end in
+    `self.values`.  `rt.row_limits(self)` is equal to `rt.row_splits[:-1]`.
+
+    Args:
+      name: A name prefix for the returned tensor (optional).
+
+    Returns:
+      A 1-D Tensor of int64 with shape `[nrows]`.
+      The returned tensor is nonnegative, and is sorted in ascending order.
+
+    #### Example:
+      ```python
+      >>> rt = ragged.constant([[3, 1, 4, 1], [], [5, 9, 2], [6], []])
+      >>> rt.values
+      tf.Tensor([3, 1, 4, 1, 5, 9, 2, 6])
+      >>> rt.row_limits()  # indices of row limits in rt.values
+      tf.Tensor([4, 4, 7, 8, 8])
+      ```
+    """
+    with ops.name_scope(name, "RaggedRowLimits", [self]):
+      return self.row_splits[1:]
+
+  def row_lengths(self, axis=1, name=None):
+    """Returns the lengths of the rows in this ragged tensor.
+
+    `rt.row_lengths()[i]` indicates the number of values in the
+    `i`th row of `rt`.
+
+    Args:
+      axis: An integer constant indicating the axis whose row lengths should be
+        returned.
+      name: A name prefix for the returned tensor (optional).
+
+    Returns:
+      A potentially ragged Tensor of int64 with shape `self.shape[:axis]`.
+
+    Raises:
+      ValueError: If `axis` is out of bounds.
+
+    #### Example:
+      ```python
+      >>> rt = ragged.constant([[[3, 1, 4], [1]], [], [[5, 9], [2]], [[6]], []])
+      >>> rt.row_lengths(rt)  # lengths of rows in rt
+      tf.Tensor([2, 0, 2, 1, 0])
+      >>> rt.row_lengths(axis=2)  # lengths of axis=2 rows.
+      <tf.RaggedTensor [[3, 1], [], [2, 1], [1], []]>
+      ```
+    """
+    if self._cached_row_lengths is not None:
+      return self._cached_row_lengths
+
+    with ops.name_scope(name, "RaggedRowLengths", [self]):
+      axis = ragged_util.get_positive_axis(axis, self.shape.ndims)
+      if axis == 0:
+        return self.nrows()
+      elif axis == 1:
+        splits = self.row_splits
+        return splits[1:] - splits[:-1]
+      elif isinstance(self.values, RaggedTensor):
+        return self.with_values(self.values.row_lengths(axis - 1))
+      else:
+        shape = array_ops.shape(self.values, out_type=dtypes.int64)
+        return self.with_values(
+            array_ops.ones(shape[:axis - 1], dtypes.int64) * shape[axis - 1])
+
+  def nested_row_lengths(self, name=None):
+    """Returns a tuple containing the row_lengths for all ragged dimensions.
+
+    `rtnested_row_lengths()` is a tuple containing the `row_lengths` tensors for
+    all ragged dimensions in `rt`, ordered from outermost to innermost.
+
+    Args:
+      name: A name prefix for the returned tensors (optional).
+
+    Returns:
+      A `tuple` of 1-D `int64` `Tensors`.  The length of the tuple is equal to
+      `self.ragged_rank`.
+    """
+    with ops.name_scope(name, "RaggedNestedRowLengths", [self]):
+      rt_nested_row_lengths = []
+      rt = self
+      while isinstance(rt, RaggedTensor):
+        rt_nested_row_lengths.append(rt.row_lengths())
+        rt = rt.values
+      return tuple(rt_nested_row_lengths)
+
+  def bounding_shape(self, axis=None, name=None):
+    """Returns the tight bounding box shape for this `RaggedTensor`.
+
+    Args:
+      axis: An integer scalar or vector indicating which axes to return the
+        bounding box for.  If not specified, then the full bounding box is
+        returned.
+      name: A name prefix for the returned tensor (optional).
 
     Returns:
-      The `row_lengths` tensor that was used to construct this `RaggedTensor`
-      if it was constructed using
-      [`ragged.from_row_lengths`](from_row_lengths.md); or `None` otherwise.
+      An int64 `Tensor`.  If `axis` is not specified, then `output`
+      is a vector with `output.shape=[self.shape.ndims]`.  If `axis` is a
+      scalar, then the `output` is a scalar.  If `axis` is a vector, then
+      `output` is a vector, where `output[i]` is the bounding size for
+      dimension `axis[i]`.
+
+    #### Example:
+      ```python
+      >>> rt = ragged.constant([[1, 2, 3, 4], [5], [], [6, 7, 8, 9], [10]])
+      >>> rt.bounding_shape()
+      [5, 4]
+      ```
     """
-    return self._cached_row_lengths
+    with ops.name_scope(name, "RaggedBoundingBox", [self, axis]):
+      nested_splits = self.nested_row_splits
+      rt_flat_values = self.flat_values
+
+      # Optimized special cases for when axis=0 or axis=1:
+      if isinstance(axis, int):
+        if axis == 0:
+          return array_ops.shape(nested_splits[0], out_type=dtypes.int64)[0] - 1
+        elif axis == 1:
+          return math_ops.maximum(math_ops.reduce_max(self.row_lengths()), 0)
+
+      splits_shape = array_ops.shape(self.row_splits, out_type=dtypes.int64)
+      flat_values_shape = array_ops.shape(rt_flat_values, out_type=dtypes.int64)
+
+      ragged_dimensions = array_ops.stack([splits_shape[0] - 1] + [
+          math_ops.maximum(math_ops.reduce_max(splits[1:] - splits[:-1]), 0)
+          for splits in nested_splits
+      ])
+      inner_dimensions = flat_values_shape[1:]
+
+      bbox = array_ops.concat([ragged_dimensions, inner_dimensions], axis=0)
+      return bbox if axis is None else array_ops.gather(bbox, axis)
 
   #=============================================================================
   # Transformation
@@ -481,7 +1007,7 @@ class RaggedTensor(object):
       `result.ragged_rank = 1 + new_values.ragged_rank`
     """
     new_values.shape.with_rank_at_least(1)
-    self.values.shape[0].assert_is_compatible_with(new_values.shape[0])
+    self.values.shape[:1].assert_is_compatible_with(new_values.shape[:1])
     return RaggedTensor(
         new_values,
         self._row_splits,
@@ -490,16 +1016,16 @@ class RaggedTensor(object):
         self._cached_nrows,
         internal=True)
 
-  def with_inner_values(self, new_values):
-    """Returns a copy of `self` with `inner_values` replaced by `new_value`.
+  def with_flat_values(self, new_values):
+    """Returns a copy of `self` with `flat_values` replaced by `new_value`.
 
     Preserves cached row-partitioning tensors such as `self.cached_nrows` and
     `self.cached_value_rowids` if they have values.
 
     Args:
       new_values: Potentially ragged tensor that should replace
-      `self.inner_values`.  Must have `rank > 0`, and must have the same
-      number of rows as `self.inner_values`.
+      `self.flat_values`.  Must have `rank > 0`, and must have the same
+      number of rows as `self.flat_values`.
 
     Returns:
       A `RaggedTensor`.
@@ -509,46 +1035,369 @@ class RaggedTensor(object):
     if isinstance(self._values, ops.Tensor):
       return self.with_values(new_values)
     else:
-      return self.with_values(self.values.with_inner_values(new_values))
+      return self.with_values(self.values.with_flat_values(new_values))
+
+  #=============================================================================
+  # Tensor Type Conversions
+  #=============================================================================
+
+  @classmethod
+  def from_tensor(cls,
+                  tensor,
+                  lengths=None,
+                  padding=None,
+                  ragged_rank=1,
+                  name=None):
+    """Converts a `tf.Tensor` into a `RaggedTensor`.
+
+    The set of absent/default values may be specified using a vector of lengths
+    or a padding value (but not both).  If `lengths` is specified, then the
+    output tensor will satisfy `output[row] = tensor[row][:lengths[row]]`.
+    If `padding` is specified, then any row *suffix* consisting entirely of
+    `padding` will be excluded from the returned `RaggedTensor`.  If neither
+    `lengths` nor `padding` is specified, then the returned `RaggedTensor` will
+    have no absent/default values.
+
+    Examples:
+
+    ```python
+    >>> dt = tf.constant([[5, 7, 0], [0, 3, 0], [6, 0, 0]])
+    >>> tf.RaggedTensor.from_tensor(dt)
+    <tf.RaggedTensor [[5, 7, 0], [0, 3, 0], [6, 0, 0]]>
+    >>> tf.RaggedTensor.from_tensor(dt, lengths=[2, 0, 3])
+    <tf.RaggedTensor [[5, 7], [], [6, 0, 0]]>
+    >>> tf.RaggedTensor.from_tensor(dt, padding=0)
+    <tf.RaggedTensor [[5, 7], [0, 3], [6]]>
+    ```
+
+    Args:
+      tensor: The `Tensor` to convert.  Must have rank `ragged_rank + 1` or
+        higher.
+      lengths: An optional set of row lengths, specified using a 1-D integer
+        `Tensor` whose length is equal to `tensor.shape[0]` (the number of rows
+        in `tensor`).  If specified, then `output[row]` will contain
+        `tensor[row][:lengths[row]]`.  Negative lengths are treated as zero.
+      padding: An optional padding value.  If specified, then any row suffix
+        consisting entirely of `padding` will be excluded from the returned
+        RaggedTensor.  `padding` is a `Tensor` with the same dtype as `tensor`
+        and with `shape=tensor.shape[ragged_rank + 1:]`.
+      ragged_rank: Integer specifying the ragged rank for the returned
+        `RaggedTensor`.  Must be greater than zero.
+      name: A name prefix for the returned tensors (optional).
+
+    Returns:
+      A `RaggedTensor` with the specified `ragged_rank`.  The shape of the
+      returned ragged tensor is compatible with the shape of `tensor`.
+    Raises:
+      ValueError: If both `lengths` and `padding` are specified.
+    """
+    if lengths is not None and padding is not None:
+      raise ValueError("Specify lengths or padding, but not both")
+    if not isinstance(ragged_rank, int):
+      raise TypeError("ragged_rank expected int, got %r" % ragged_rank)
+    if ragged_rank <= 0:
+      raise ValueError(
+          "ragged_rank must be greater than 0; got %s" % ragged_rank)
+
+    with ops.name_scope(name, "RaggedFromTensor", [tensor, lengths, padding]):
+      tensor = ops.convert_to_tensor(tensor, name="tensor")
+      tensor.shape.with_rank_at_least(ragged_rank + 1)
+      input_shape = array_ops.shape(tensor, out_type=dtypes.int64)
+      ncols = input_shape[1]
+
+      # Handle ragged_rank>1 via recursion:
+      # If the output should have multiple ragged dimensions, then first
+      # flatten the tensor to eliminate all but the last ragged dimension,
+      # and recursively convert that flattened tensor.  Then add on the splits
+      # for the dimensions that we flattened out.
+      if ragged_rank > 1:
+        # Flatten `tensor` to eliminate all but the last ragged dimension.
+        new_shape = array_ops.concat([
+            constant_op.constant([-1], dtypes.int64), input_shape[ragged_rank:]
+        ],
+                                     axis=0)
+        flattened = array_ops.reshape(tensor, new_shape)
+        # Recursively convert the flattened tensor.
+        values = cls.from_tensor(flattened, lengths, padding)
+        # The total number of elements in each  dimension.  E.g., if
+        # input_shape=[3, 4, 5, 6], then dim[2] has 3*4*5 elements in total.
+        dim_size = math_ops.cumprod(input_shape)
+        # Construct splits tensors for the dimensions that were flattened.
+        new_splits = [
+            math_ops.range(0, dim_size[dim - 1] + 1) * input_shape[dim]
+            for dim in range(1, ragged_rank)
+        ]
+        return cls.from_nested_row_splits(values, new_splits)
+
+      # If padding was specified, then use it to find row lengths.
+      if padding is not None:
+        padding = ops.convert_to_tensor(
+            padding, name="padding", dtype=tensor.dtype)
+        padding.shape.assert_is_compatible_with(tensor.shape[2:])
+
+        # Find places where the padding is equal to the tensor.  (This will
+        # broadcast `padding` across the outermost 2 dimensions of `tensor`,
+        # so `has_default_value.shape = tensor.shape`.)
+        has_default_value = math_ops.equal(padding, tensor)
+
+        # If the padding isn't a scalar, then require that all values in the
+        # padding match each item in the tensor.  After this block of code,
+        # `has_default.shape = tensor.shape[:2]`.  (Unfortunately, we can't just
+        # use reduce_all for both cases, becaue when you pass an empty `axis`
+        # list to reduce_all, it reduces all axes; but we want it to reduce no
+        # axes -- i.e., to be a no-op.)
+        tensor_rank = array_ops.rank(tensor)
+        reduce_axis = math_ops.range(2, tensor_rank)
+        has_default = control_flow_ops.cond(
+            tensor_rank > 2,
+            lambda: math_ops.reduce_all(has_default_value, axis=reduce_axis),
+            lambda: has_default_value)
+        has_default.set_shape(tensor_shape.TensorShape([None, None]))
+        has_default.set_shape(tensor.shape[:2])
+
+        # Use has_default it to find the length of each row: for each
+        # non-default item in a row, calculate the length that the row needs to
+        # have to include that item; and then take the max of those values
+        # (across each row).
+        has_nondefault = math_ops.logical_not(has_default)
+        has_nondefault = math_ops.cast(has_nondefault, dtypes.int64)
+        length_for_nondefault_value = (
+            has_nondefault * array_ops.expand_dims(
+                math_ops.range(1, ncols + 1), 0))
+        lengths = math_ops.reduce_max(length_for_nondefault_value, axis=1)
+
+      # If we have lengths (either directly supplied, or computed from
+      # paddings), then use those to construct splits; and then use masking
+      # to get the corresponding values.
+      if lengths is not None:
+        lengths = ragged_util.convert_to_int_tensor(lengths, "lengths",
+                                                    dtypes.int64)
+        lengths.shape.assert_has_rank(1)
+        lengths = math_ops.minimum(lengths, ncols)
+        lengths = math_ops.maximum(lengths, 0)
+        limits = math_ops.cumsum(lengths)
+        splits = array_ops.concat([array_ops.zeros([1], dtypes.int64), limits],
+                                  axis=0)
+        mask = array_ops.sequence_mask(lengths, maxlen=ncols)
+        values = array_ops.boolean_mask(tensor, mask)
+        return cls.from_row_splits(values, splits)
+
+      # If neither padding nor lengths were specified, then create a splits
+      # vector that contains no default values, and reshape the input tensor
+      # to form the values for the RaggedTensor.
+      nrows = input_shape[0]
+      nvals = nrows * ncols
+      splits = math_ops.range(nrows + 1) * ncols
+      values_shape = array_ops.concat([[nvals], input_shape[2:]], axis=0)
+      values = array_ops.reshape(tensor, values_shape)
+      return cls.from_row_splits(values, splits)
+
+  def to_tensor(self, default_value=None, name=None):
+    """Converts this `RaggedTensor` into a `tf.Tensor`.
+
+    Example:
+
+    ```python
+    >>> rt = ragged.constant([[9, 8, 7], [], [6, 5], [4]])
+    >>> print rt.to_tensor()
+    [[9 8 7]
+     [0 0 0]
+     [6 5 0]
+     [4 0 0]]
+    ```
+
+    Args:
+      default_value: Value to set for indices not specified in `self`. Defaults
+        to zero.  `default_value` must be broadcastable to
+        `self.shape[self.ragged_rank + 1:]`.
+      name: A name prefix for the returned tensors (optional).
+
+    Returns:
+      A `Tensor` with shape `ragged.bounding_shape(self)` and the
+      values specified by the non-empty values in `self`.  Empty values are
+      assigned `default_value`.
+    """
+    with ops.name_scope(name, "RaggedToTensor", [self, default_value]):
+      if default_value is not None:
+        default_value = ops.convert_to_tensor(
+            default_value, name="default_value", dtype=self.dtype)
+
+      # If ragged_rank > 1, then recursively convert the ragged values into a
+      # `Tensor` before we proceed.
+      values = self.values
+      if is_ragged(values):
+        values = values.to_tensor(default_value)
+
+      # Tile the default value, if necessary.
+      if default_value is not None:
+        if values.shape.ndims is not None:
+          default_value.shape.with_rank_at_most(values.shape.ndims - 1)
+        if (values.shape.ndims is None or default_value.shape.ndims is None or
+            values.shape.ndims != default_value.shape.ndims + 1):
+          value_shape = array_ops.shape(values)[1:]
+          default_value = array_ops.broadcast_to(default_value, value_shape)
+        default_value.shape.assert_is_compatible_with(values.shape[1:])
+
+      # Get the expected dense shape ([nrows, ncols] + value_shape).
+      rt_row_lengths = [self.row_splits[1:] - self.row_splits[:-1]]
+      nrows = array_ops.shape(self.row_splits, out_type=dtypes.int64)[0] - 1
+      ncols = math_ops.maximum(math_ops.reduce_max(rt_row_lengths), 0)
+      values_shape = array_ops.shape(values, out_type=dtypes.int64)
+      value_shape = values_shape[1:]
+      nvals = values_shape[0]
+
+      # Build a default value if none was supplied.
+      if default_value is None:
+        default_value = array_ops.zeros(value_shape, dtype=values.dtype)
+      default_value.shape.assert_is_compatible_with(values.shape[1:])
+      default_value.set_shape(values.shape[1:])
+
+      # Get the row start indices, and expand to shape=[nrows, 1].
+      starts = array_ops.expand_dims(self.row_splits[:-1], 1)
+
+      # Get the row limit indices, and expand to shape=[nrows, 1].
+      limits = array_ops.expand_dims(self.row_splits[1:], 1)
+
+      # Get the column indices, and expand to shape=[1, ncols].
+      columns = array_ops.expand_dims(math_ops.range(0, ncols), 0)
+
+      # Build a list containing the values plus the default value.  We will use
+      # tf.gather to collect values from this list for the `Tensor` (using
+      # nvals as the index for the default value).
+      values_and_default = array_ops.concat(
+          [values, array_ops.stack([default_value])], axis=0)
+
+      # Construct a matrix "indices" pointing into values_and_default.  I.e.,
+      # output[r, c] = values_and_default[indices[r, c].
+      nondefault_index = starts + columns
+      has_value = nondefault_index < limits
+      default_index = array_ops.fill(array_ops.stack([nrows, ncols]), nvals)
+      indices = array_ops.where(has_value, nondefault_index, default_index)
+
+      # Gather the results into a `Tensor`.
+      return array_ops.gather(values_and_default, indices)
+
+  @classmethod
+  def from_sparse(cls, st_input, name=None):
+    """Converts a 2D `tf.SparseTensor` to a `RaggedTensor`.
+
+    Each row of the `output` `RaggedTensor` will contain the explicit values
+    from the same row in `st_input`.  `st_input` must be ragged-right.  If not
+    it is not ragged-right, then an error will be generated.
+
+    Example:
+
+    ```python
+    >>> st = SparseTensor(indices=[[0, 1], [0, 2], [0, 3], [1, 0], [3, 0]],
+    ...                   values=[1, 2, 3, 4, 5],
+    ...                   dense_shape=[4, 3])
+    >>> rt.RaggedTensor.from_sparse(st).eval().tolist()
+    [[1, 2, 3], [4], [], [5]]
+    ```
+
+    Currently, only two-dimensional `SparseTensors` are supported.
+
+    Args:
+      st_input: The sparse tensor to convert.  Must have rank 2.
+      name: A name prefix for the returned tensors (optional).
+
+    Returns:
+      A `RaggedTensor` with the same values as `st_input`.
+      `output.ragged_rank = rank(st_input) - 1`.
+      `output.shape = [st_input.dense_shape[0], None]`.
+    Raises:
+      ValueError: If the number of dimensions in `st_input` is not known
+        statically, or is not two.
+    """
+    if not sparse_tensor.is_sparse(st_input):
+      raise TypeError("Expected SparseTensor, got %s" % type(st_input).__name__)
+    with ops.name_scope(name, "RaggedFromSparse", [st_input]):
+      st_input = sparse_tensor.convert_to_tensor_or_sparse_tensor(
+          st_input, name="st_input")
+
+      if st_input.dense_shape.shape.ndims is None:
+        static_rank_from_dense_shape = None
+      else:
+        static_rank_from_dense_shape = st_input.dense_shape.shape.dims[0].value
+
+      if st_input.indices.shape.ndims is None:
+        static_rank_from_indices = None
+      else:
+        static_rank_from_indices = st_input.indices.shape.dims[1].value
+
+      if static_rank_from_dense_shape != 2 and static_rank_from_indices != 2:
+        raise ValueError("rank(st_input) must be 2")
+
+      with ops.control_dependencies(
+          _assert_sparse_indices_are_ragged_right(st_input.indices)):
+        # Treat sparse row indices as segment ids to generate a splits tensor
+        # thta we can pair with the sparse tensor values.  (Ignore sparse column
+        # indices.)
+        segment_ids = st_input.indices[:, 0]
+        num_segments = st_input.dense_shape[0]
+        return cls.from_value_rowids(st_input.values, segment_ids, num_segments)
+
+  def to_sparse(self, name=None):
+    """Converts this `RaggedTensor` into a `tf.SparseTensor`.
+
+    Example:
+
+    ```python
+    >>> rt = ragged.constant([[1, 2, 3], [4], [], [5, 6]])
+    >>> rt.to_sparse().eval()
+    SparseTensorValue(indices=[[0, 0], [0, 1], [0, 2], [1, 0], [3, 0], [3, 1]],
+                      values=[1, 2, 3, 4, 5, 6],
+                      dense_shape=[4, 3])
+    ```
+
+    Args:
+      name: A name prefix for the returned tensors (optional).
+
+    Returns:
+      A SparseTensor with the same values as `self`.
+    """
+    with ops.name_scope(name, "RaggedToSparse", [self]):
+      result = gen_ragged_conversion_ops.ragged_tensor_to_sparse(
+          self.nested_row_splits, self.flat_values, name=name)
+      return sparse_tensor.SparseTensor(result.sparse_indices,
+                                        result.sparse_values,
+                                        result.sparse_dense_shape)
 
   #=============================================================================
   # String Encoding
   #=============================================================================
   def __str__(self):
     if self._is_eager():
-      return "RaggedTensor(%s)" % self.tolist()
+      return "<tf.RaggedTensor %s>" % self.to_list()
     else:
       return self.__repr__()
 
   def __repr__(self):
-    return "RaggedTensor(values=%s, row_splits=%s)" % (self._values,
-                                                       self._row_splits)
+    return "tf.RaggedTensor(values=%s, row_splits=%s)" % (self._values,
+                                                          self._row_splits)
 
   #=============================================================================
   # Eager Execution Mode
   #=============================================================================
 
-  def tolist(self):
+  def to_list(self):
     """Returns a nested Python `list` with the values for this `RaggedTensor`.
 
-    If a `RaggedTensor` `rt` was constructed in graph execution mode, then
-    `rt.tolist()` is equivalent to `rt.eval().tolist()`.
-
-    If a `RaggedTensor` `rt` was constructed in eager execution mode, then
-    `rt.tolist()` builds the Python list based on `rt`'s `EagerTensor`
-    components.
+    Requires that `rt` was constructed in eager execution mode.
 
     Returns:
       A nested Python `list`.
     """
     if self._is_eager():
-      return self._eager_value().tolist()
+      return self._eager_value().to_list()
     else:
-      return self.eval().tolist()
+      raise ValueError("RaggedTensor.to_list() is only supported in eager "
+                       "mode; in graph mode, evaluate the RaggedTensor first "
+                       "and then use RaggedTensorValue.to_list().")
 
   def _eager_value(self):
     """Returns a RaggedTensorValue for self.  Requires self._is_eager()=true."""
-    value = self.inner_values.numpy()
+    value = self.flat_values.numpy()
     for row_splits in reversed(self.nested_row_splits):
       value = ragged_tensor_value.RaggedTensorValue(value, row_splits.numpy())
     return value
@@ -562,24 +1411,6 @@ class RaggedTensor(object):
       rt = rt.values
     return isinstance(rt, ops.EagerTensor)
 
-  #=============================================================================
-  # Evaluation
-  #=============================================================================
-  def eval(self, feed_dict=None, session=None):  # pylint: disable=redefined-outer-name
-    """Evaluates this ragged tensor in a `Session`.
-
-    Args:
-      feed_dict: A dictionary that maps `Tensor` objects to feed values. See
-        `tf.Session.run` for a description of the valid feed values.
-      session: The `Session` to be used to evaluate this ragged tensor. If none,
-        the default session will be used.
-
-    Returns:
-      A `RaggedTensorValue` object.
-    """
-    return _eval_using_default_session(self, feed_dict,
-                                       self._as_graph_element().graph, session)
-
   #=============================================================================
   # Indexing & Slicing
   #=============================================================================
@@ -613,6 +1444,53 @@ def is_ragged(value):
                     (RaggedTensor, ragged_tensor_value.RaggedTensorValue))
 
 
+#===============================================================================
+# Convert value -> tensor
+#===============================================================================
+def convert_to_tensor_or_ragged_tensor(value,
+                                       dtype=None,
+                                       preferred_dtype=None,
+                                       name=None):
+  """Converts value to a `RaggedTensor` or `Tensor`.
+
+  * If `value` is a `RaggedTensor`, then return it as-is.
+  * If `value` is a `RaggedTensorValue`, return a corresponding constant
+    `RaggedTensor`.
+  * Otherwise, use `convert_to_tensor` to convert `value` to a `Tensor`.
+
+  Args:
+    value: A `RaggedTensor`, a `RaggedTensorValue`, or an object whose type has
+      a registered `Tensor` conversion function.
+    dtype: Optional element type for the returned tensor.  If missing the type
+      is inferred from the type of `value`.
+    preferred_dtype: Optional element type for the returned tensor, used when
+      dtype is None.  This argument has no effect if `value` is already a
+      tensor, or when conversion is not possible.
+    name: Optional name to use if a new `Tensor` is created.
+
+  Returns:
+    A `Tensor` or `RaggedTensor`.
+  """
+  if isinstance(value, RaggedTensor):
+    if dtype and not dtype.is_compatible_with(value.dtype):
+      raise ValueError("Tensor conversion requested dtype %s for "
+                       "RaggedTensor with dtype %s: %r" %
+                       (dtype.name, value.dtype.name, value))
+    return value
+  elif isinstance(value, ragged_tensor_value.RaggedTensorValue):
+    with ops.name_scope(name, "ConvertToTensorOrRaggedTensor", []):
+      flat_values = ops.convert_to_tensor(
+          value=value.flat_values,
+          dtype=dtype,
+          preferred_dtype=preferred_dtype,
+          name="flat_values")
+      return RaggedTensor.from_nested_row_splits(flat_values,
+                                                 value.nested_row_splits)
+  else:
+    return ops.convert_to_tensor(
+        value=value, dtype=dtype, preferred_dtype=preferred_dtype, name=name)
+
+
 #===============================================================================
 # Register RaggedTensor for use with session.run.
 #===============================================================================
@@ -625,18 +1503,18 @@ def _ragged_tensor_value_from_components(components):
 
 
 def _ragged_tensor_session_fetch(rt):
-  components = rt.nested_row_splits + (rt.inner_values,)
+  components = rt.nested_row_splits + (rt.flat_values,)
   return (components, _ragged_tensor_value_from_components)
 
 
 def _ragged_tensor_session_feed(feed_key, feed_val):
-  key_components = feed_key.nested_row_splits + (feed_key.inner_values,)
-  val_components = feed_val.nested_row_splits + (feed_val.inner_values,)
+  key_components = feed_key.nested_row_splits + (feed_key.flat_values,)
+  val_components = feed_val.nested_row_splits + (feed_val.flat_values,)
   return zip(key_components, val_components)
 
 
 def _ragged_tensor_session_feed_for_partial_run(feed_key):
-  return feed_key.nested_row_splits + (feed_key.inner_values,)
+  return feed_key.nested_row_splits + (feed_key.flat_values,)
 
 
 session.register_session_run_conversion_functions(
@@ -644,6 +1522,9 @@ session.register_session_run_conversion_functions(
     _ragged_tensor_session_feed_for_partial_run)
 
 
+#===============================================================================
+# RaggedTensorType
+#===============================================================================
 class RaggedTensorType(object):
   """Encoding of a static type for a `RaggedTensor`.
 
@@ -663,3 +1544,67 @@ class RaggedTensorType(object):
 
   dtype = property(lambda self: self._dtype)
   ragged_rank = property(lambda self: self._ragged_rank)
+
+
+#===============================================================================
+# Helper Functions
+#===============================================================================
+def _assert_sparse_indices_are_ragged_right(indices):
+  """Checks that the given SparseTensor.indices tensor is ragged-right.
+
+  Example: `indices = [[0, 0], [0, 1], [2, 0], [3, 1]]` is not ragged right
+  because the entry `[3, 1]` skips a cell.
+
+  Args:
+    indices: The SparseTensor indices to check.
+
+  Returns:
+    A list of control dependency op tensors.
+  """
+  index_prefix = indices[:, :-1]
+  index_suffix = indices[:, -1]
+
+  # Check whether each index is starting a new row in the innermost dimension
+  # (prefix[i] != prefix[i-1]) or continuing a row (prefix[i] == prefix[i-1]).
+  # (Note: this skips the first index; we will check that separately below.)
+  index_prefix_changed = math_ops.reduce_any(
+      math_ops.not_equal(index_prefix[1:], index_prefix[:-1]), axis=1)
+
+  # Check two cases:
+  #   * For indices that start a new row: index_suffix[i] must be zero.
+  #   * For indices that continue a row: index_suffix[i] must be equal to
+  #     index_suffix[i-1]+1.
+  index_ok = array_ops.where(
+      index_prefix_changed, math_ops.equal(index_suffix[1:], 0),
+      math_ops.equal(index_suffix[1:], index_suffix[:-1] + 1))
+
+  # Also check that the very first index didn't skip any cells.  The first
+  # index starts a new row (by definition), so its suffix should be zero.
+  sparse_indices_are_ragged_right = math_ops.logical_and(
+      math_ops.reduce_all(math_ops.equal(index_suffix[:1], 0)),
+      math_ops.reduce_all(index_ok))
+
+  message = [
+      "SparseTensor is not right-ragged", "SparseTensor.indices =", indices
+  ]
+  return [control_flow_ops.Assert(sparse_indices_are_ragged_right, message)]
+
+
+@ops.RegisterGradient("RaggedTensorToSparse")
+def _ragged_tensor_to_sparse_gradient(op, unused_sparse_indices_grad,
+                                      sparse_values_grad,
+                                      unused_sparse_shape_grad):
+  """Gradient for RaggedTensorToSparse."""
+  op_inputs_nested_row_splits = op.inputs[:-1]
+  op_inputs_flat_values = op.inputs[-1]
+
+  # No gradient for the RaggedTensor's nested_row_splits.
+  nested_row_splits_gradient = [None] * len(op_inputs_nested_row_splits)
+
+  # Gradient for the RaggedTensor's flat_values is formed by reshaping
+  # the gradient for the SparseTensor's values.
+  flat_values_shape = array_ops.shape(op_inputs_flat_values)
+  flat_values_gradient = array_ops.reshape(sparse_values_grad,
+                                           flat_values_shape)
+
+  return nested_row_splits_gradient + [flat_values_gradient]
diff --git a/tensorflow/python/ops/ragged/ragged_tensor_bounding_shape_op_test.py b/tensorflow/python/ops/ragged/ragged_tensor_bounding_shape_op_test.py
index befe30f0e10ce59d9a485a4d19048d4ed68f48d2..025a221626cd580d07b8993e59328e798e830cfa 100644
--- a/tensorflow/python/ops/ragged/ragged_tensor_bounding_shape_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_tensor_bounding_shape_op_test.py
@@ -19,48 +19,45 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import test_util
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
 
-class RaggedTensorBoundingShapeOp(test_util.TensorFlowTestCase):
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedTensorBoundingShapeOp(ragged_test_util.RaggedTensorTestCase):
 
   def testDocStringExample(self):
     # This is the example from ragged.bounding_shape.__doc__.
-    rt = ragged.constant([[1, 2, 3, 4], [5], [], [6, 7, 8, 9], [10]])
-    self.assertEqual(self.evaluate(ragged.bounding_shape(rt)).tolist(), [5, 4])
+    rt = ragged_factory_ops.constant([[1, 2, 3, 4], [5], [], [6, 7, 8, 9],
+                                      [10]])
+    self.assertRaggedEqual(rt.bounding_shape(), [5, 4])
 
   def test2DRaggedTensorWithOneRaggedDimension(self):
     values = ['a', 'b', 'c', 'd', 'e', 'f', 'g']
-    rt1 = ragged.from_row_splits(values, [0, 2, 5, 6, 6, 7])
-    rt2 = ragged.from_row_splits(values, [0, 7])
-    rt3 = ragged.from_row_splits(values, [0, 0, 7, 7])
-    self.assertEqual(self.evaluate(ragged.bounding_shape(rt1)).tolist(), [5, 3])
-    self.assertEqual(self.evaluate(ragged.bounding_shape(rt2)).tolist(), [1, 7])
-    self.assertEqual(self.evaluate(ragged.bounding_shape(rt3)).tolist(), [3, 7])
+    rt1 = ragged_tensor.RaggedTensor.from_row_splits(values, [0, 2, 5, 6, 6, 7])
+    rt2 = ragged_tensor.RaggedTensor.from_row_splits(values, [0, 7])
+    rt3 = ragged_tensor.RaggedTensor.from_row_splits(values, [0, 0, 7, 7])
+    self.assertRaggedEqual(rt1.bounding_shape(), [5, 3])
+    self.assertRaggedEqual(rt2.bounding_shape(), [1, 7])
+    self.assertRaggedEqual(rt3.bounding_shape(), [3, 7])
 
   def test3DRaggedTensorWithOneRaggedDimension(self):
     values = [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9], [10, 11], [12, 13]]
-    rt1 = ragged.from_row_splits(values, [0, 2, 5, 6, 6, 7])
-    rt2 = ragged.from_row_splits(values, [0, 7])
-    rt3 = ragged.from_row_splits(values, [0, 0, 7, 7])
-    self.assertEqual(
-        self.evaluate(ragged.bounding_shape(rt1)).tolist(), [5, 3, 2])
-    self.assertEqual(
-        self.evaluate(ragged.bounding_shape(rt2)).tolist(), [1, 7, 2])
-    self.assertEqual(
-        self.evaluate(ragged.bounding_shape(rt3)).tolist(), [3, 7, 2])
-
-  def testNonRaggedTensor(self):
-    dt = [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]]
-    self.assertEqual(self.evaluate(ragged.bounding_shape(dt)).tolist(), [4, 3])
+    rt1 = ragged_tensor.RaggedTensor.from_row_splits(values, [0, 2, 5, 6, 6, 7])
+    rt2 = ragged_tensor.RaggedTensor.from_row_splits(values, [0, 7])
+    rt3 = ragged_tensor.RaggedTensor.from_row_splits(values, [0, 0, 7, 7])
+    self.assertRaggedEqual(rt1.bounding_shape(), [5, 3, 2])
+    self.assertRaggedEqual(rt2.bounding_shape(), [1, 7, 2])
+    self.assertRaggedEqual(rt3.bounding_shape(), [3, 7, 2])
 
   def testExplicitAxisOptimizations(self):
-    rt = ragged.from_row_splits(b'a b c d e f g'.split(), [0, 2, 5, 6, 6, 7])
-    self.assertEqual(self.evaluate(ragged.bounding_shape(rt, 0)).tolist(), 5)
-    self.assertEqual(self.evaluate(ragged.bounding_shape(rt, 1)).tolist(), 3)
-    self.assertEqual(
-        self.evaluate(ragged.bounding_shape(rt, [1, 0])).tolist(), [3, 5])
+    rt = ragged_tensor.RaggedTensor.from_row_splits(b'a b c d e f g'.split(),
+                                                    [0, 2, 5, 6, 6, 7])
+    self.assertRaggedEqual(rt.bounding_shape(0), 5)
+    self.assertRaggedEqual(rt.bounding_shape(1), 3)
+    self.assertRaggedEqual(rt.bounding_shape([1, 0]), [3, 5])
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_tensor_shape.py b/tensorflow/python/ops/ragged/ragged_tensor_shape.py
index 9129b4b10b4c7f477fcc67612abb9e9bc788f225..706881da74a46137171d4d4771b82e652d4ad4c8 100644
--- a/tensorflow/python/ops/ragged/ragged_tensor_shape.py
+++ b/tensorflow/python/ops/ragged/ragged_tensor_shape.py
@@ -21,13 +21,13 @@ from __future__ import print_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.ragged import ragged_array_ops
 from tensorflow.python.ops.ragged import ragged_conversion_ops
-from tensorflow.python.ops.ragged import ragged_factory_ops
 from tensorflow.python.ops.ragged import ragged_tensor
 from tensorflow.python.ops.ragged import ragged_util
 
@@ -55,7 +55,7 @@ class RaggedTensorDynamicShape(object):
       be ragged.
 
     * "Inner dimensions" are dimensions that are encoded using a
-      `RaggedTensor`'s `inner_values`.  Inner dimensions are always uniform.
+      `RaggedTensor`'s `flat_values`.  Inner dimensions are always uniform.
 
   The sizes of partitioned dimensions are recorded using `partitioned_dim_sizes`
   and `inner_dim_sizes`:
@@ -161,15 +161,15 @@ class RaggedTensorDynamicShape(object):
   def from_tensor(cls, rt_input):
     """Constructs a ragged shape for a potentially ragged tensor."""
     with ops.name_scope(None, 'RaggedTensorDynamicShapeFromTensor', [rt_input]):
-      rt_input = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(rt_input)
+      rt_input = ragged_tensor.convert_to_tensor_or_ragged_tensor(rt_input)
       if not ragged_tensor.is_ragged(rt_input):
         return cls([], array_ops.shape(rt_input))
       else:
-        partitioned_dim_sizes = ((ragged_array_ops.nrows(rt_input),) +
-                                 ragged_array_ops.nested_row_lengths(rt_input))
+        partitioned_dim_sizes = (
+            (rt_input.nrows(),) + rt_input.nested_row_lengths())
         return RaggedTensorDynamicShape(
             partitioned_dim_sizes,
-            array_ops.shape(rt_input.inner_values)[1:])
+            array_ops.shape(rt_input.flat_values)[1:])
 
   def dimension_size(self, axis):
     """Returns the size of slices across the specified dimension."""
@@ -197,7 +197,7 @@ class RaggedTensorDynamicShape(object):
   @property
   def rank(self):
     """The number of dimensions in this shape, or None if unknown."""
-    inner_ndims = self._inner_dim_sizes.shape[0].value
+    inner_ndims = tensor_shape.dimension_value(self._inner_dim_sizes.shape[0])
     if inner_ndims is None:
       return None
     else:
@@ -229,7 +229,7 @@ class RaggedTensorDynamicShape(object):
   @property
   def num_inner_dimensions(self):
     """The number of inner dimensions, or `None` if not statically known."""
-    return self._inner_dim_sizes.shape[0].value
+    return tensor_shape.dimension_value(self._inner_dim_sizes.shape[0])
 
   def broadcast_to_rank(self, rank):
     """Adds leading size-1 dimensions to broadcast `self` to the given rank.
@@ -456,7 +456,7 @@ def broadcast_to(rt_input, shape, broadcast_inner_dimensions=True):
   """
   if not isinstance(shape, RaggedTensorDynamicShape):
     raise TypeError('shape must be a RaggedTensorDynamicShape')
-  rt_input = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(rt_input)
+  rt_input = ragged_tensor.convert_to_tensor_or_ragged_tensor(rt_input)
 
   # Broadcasting to a uniform shape.
   if shape.num_partitioned_dimensions == 0:
@@ -497,17 +497,20 @@ def _broadcast_to_ragged_shape(rt_input, dst_shape, broadcast_inner_dimensions):
       rt_input = array_ops.reshape(
           rt_input, array_ops.concat([[-1], dst_shape.inner_dim_sizes], axis=0))
     for _ in range(dst_shape.rank - rt_input.shape.ndims):
-      rt_input = ragged_factory_ops.from_row_lengths(
-          rt_input, [ragged_array_ops.nrows(rt_input)])
+      if ragged_tensor.is_ragged(rt_input):
+        nrows = rt_input.nrows()
+      else:
+        nrows = array_ops.shape(rt_input, out_type=dtypes.int64)[0]
+      rt_input = ragged_tensor.RaggedTensor.from_row_lengths(rt_input, [nrows])
 
   # Add ragged dimensions to match dst_shape.
   if ragged_tensor.is_ragged(rt_input):
     inner_rank_diff = (
-        rt_input.inner_values.shape.ndims - 1 - dst_shape.num_inner_dimensions)
+        rt_input.flat_values.shape.ndims - 1 - dst_shape.num_inner_dimensions)
     if inner_rank_diff > 0:
-      rt_input = rt_input.with_inner_values(
+      rt_input = rt_input.with_flat_values(
           ragged_conversion_ops.from_tensor(
-              rt_input.inner_values, ragged_rank=inner_rank_diff))
+              rt_input.flat_values, ragged_rank=inner_rank_diff))
   else:
     rt_input = ragged_conversion_ops.from_tensor(
         rt_input, ragged_rank=dst_shape.num_partitioned_dimensions - 1)
@@ -528,9 +531,9 @@ def _broadcast_to_ragged_shape(rt_input, dst_shape, broadcast_inner_dimensions):
     rt_input = ragged_array_ops.tile(rt_input, multiples)
 
   if broadcast_inner_dimensions:
-    rt_input = rt_input.with_inner_values(
+    rt_input = rt_input.with_flat_values(
         array_ops.reshape(
-            rt_input.inner_values,
+            rt_input.flat_values,
             array_ops.concat([[-1], dst_shape.inner_dim_sizes], axis=0)))
 
   # Do broadcasting for dimensions that become ragged.  We must do these from
@@ -555,7 +558,7 @@ def _ragged_tile_axis(rt_input, axis, repeats):
         _ragged_tile_axis(rt_input.values, axis - 1, repeats))
   else:
     src_row_splits = rt_input.nested_row_splits
-    src_row_lengths = ragged_array_ops.nested_row_lengths(rt_input)
+    src_row_lengths = rt_input.nested_row_lengths()
     splits = src_row_splits[0]
 
     dst_row_lengths = [repeats]
@@ -563,8 +566,7 @@ def _ragged_tile_axis(rt_input, axis, repeats):
       dst_row_lengths.append(
           ragged_util.repeat_ranges(src_row_lengths[i], splits, repeats))
       splits = array_ops.gather(src_row_splits[i], splits)
-    dst_values = ragged_util.repeat_ranges(rt_input.inner_values, splits,
+    dst_values = ragged_util.repeat_ranges(rt_input.flat_values, splits,
                                            repeats)
-    return ragged_factory_ops.from_nested_row_lengths(dst_values,
-                                                      dst_row_lengths)
-
+    return ragged_tensor.RaggedTensor.from_nested_row_lengths(
+        dst_values, dst_row_lengths)
diff --git a/tensorflow/python/ops/ragged/ragged_tensor_shape_test.py b/tensorflow/python/ops/ragged/ragged_tensor_shape_test.py
index 9c2dd260503e7ae678a9306a92078398ecebd15e..bc0139cffd846662fe2df990a0eaa511cd7f0f63 100644
--- a/tensorflow/python/ops/ragged/ragged_tensor_shape_test.py
+++ b/tensorflow/python/ops/ragged/ragged_tensor_shape_test.py
@@ -23,27 +23,31 @@ import numpy as np
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.ops.ragged import ragged_tensor_shape
+from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.ops.ragged.ragged_tensor_shape import RaggedTensorDynamicShape
 from tensorflow.python.platform import googletest
 
 
-class RaggedTensorShapeTest(test_util.TensorFlowTestCase,
-                            parameterized.TestCase):
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedTensorBoundingShapeOp(ragged_test_util.RaggedTensorTestCase,
+                                  parameterized.TestCase):
 
   def assertShapeEq(self, x, y):
-    assert isinstance(x, ragged.RaggedTensorDynamicShape)
-    assert isinstance(y, ragged.RaggedTensorDynamicShape)
+    assert isinstance(x, RaggedTensorDynamicShape)
+    assert isinstance(y, RaggedTensorDynamicShape)
     x_partitioned_dim_sizes = [
-        splits.eval().tolist()  #
+        self.eval_to_list(splits)  #
         for splits in x.partitioned_dim_sizes
     ]
     y_partitioned_dim_sizes = [
-        splits.eval().tolist()  #
+        self.eval_to_list(splits)  #
         for splits in y.partitioned_dim_sizes
     ]
     self.assertEqual(x_partitioned_dim_sizes, y_partitioned_dim_sizes)
-    self.assertEqual(x.inner_dim_sizes.eval().tolist(),
-                     y.inner_dim_sizes.eval().tolist())
+    self.assertAllEqual(x.inner_dim_sizes, y.inner_dim_sizes)
 
   @parameterized.parameters([
       dict(value='x', expected_dim_sizes=[]),
@@ -53,41 +57,41 @@ class RaggedTensorShapeTest(test_util.TensorFlowTestCase,
           value=[[['a', 'b', 'c'], ['d', 'e', 'f']]],
           expected_dim_sizes=[1, 2, 3]),
       dict(
-          value=ragged.constant_value([['a', 'b', 'c'], ['d', 'e']]),
+          value=ragged_factory_ops.constant_value([['a', 'b', 'c'], ['d',
+                                                                     'e']]),
           expected_dim_sizes=[2, [3, 2]]),
       dict(
-          value=ragged.constant_value([[['a', 'b', 'c'], ['d', 'e']]]),
+          value=ragged_factory_ops.constant_value([[['a', 'b', 'c'], ['d',
+                                                                      'e']]]),
           expected_dim_sizes=[1, [2], [3, 2]]),
       dict(
-          value=ragged.constant_value([[['a', 'b', 'c'], ['d', 'e', 'f']]],
-                                      ragged_rank=1),
+          value=ragged_factory_ops.constant_value(
+              [[['a', 'b', 'c'], ['d', 'e', 'f']]], ragged_rank=1),
           expected_dim_sizes=[1, [2], 3]),
       dict(
-          value=ragged.constant_value([[[[1], [2]], [[3], [4]]],
-                                       [[[5], [6]]]], ragged_rank=1),
+          value=ragged_factory_ops.constant_value(
+              [[[[1], [2]], [[3], [4]]], [[[5], [6]]]], ragged_rank=1),
           expected_dim_sizes=[2, [2, 1], 2, 1]),
       dict(
-          value=ragged.constant_value([[10, 20], [30]]),
+          value=ragged_factory_ops.constant_value([[10, 20], [30]]),
           expected_dim_sizes=[2, [2, 1]]),
       # Docstring examples:
       dict(value=[[1, 2, 3], [4, 5, 6]], expected_dim_sizes=[2, 3]),
       dict(
-          value=ragged.constant_value([[1, 2], [], [3, 4, 5]]),
+          value=ragged_factory_ops.constant_value([[1, 2], [], [3, 4, 5]]),
           expected_dim_sizes=[3, [2, 0, 3]]),
       dict(
-          value=ragged.constant_value([[[1, 2], [3, 4]], [[5, 6]]],
-                                      ragged_rank=1),
+          value=ragged_factory_ops.constant_value([[[1, 2], [3, 4]], [[5, 6]]],
+                                                  ragged_rank=1),
           expected_dim_sizes=[2, [2, 1], 2]),
       dict(
-          value=ragged.constant_value([[[1, 2], [3]], [[4, 5]]]),
+          value=ragged_factory_ops.constant_value([[[1, 2], [3]], [[4, 5]]]),
           expected_dim_sizes=[2, [2, 1], [2, 1, 2]]),
   ])
   def testFromTensor(self, value, expected_dim_sizes):
-    shape = ragged.RaggedTensorDynamicShape.from_tensor(value)
-    expected = ragged.RaggedTensorDynamicShape.from_dim_sizes(
-        expected_dim_sizes)
-    with self.cached_session():
-      self.assertShapeEq(shape, expected)
+    shape = RaggedTensorDynamicShape.from_tensor(value)
+    expected = RaggedTensorDynamicShape.from_dim_sizes(expected_dim_sizes)
+    self.assertShapeEq(shape, expected)
 
   @parameterized.parameters([
       dict(dim_sizes=[], rank=0, expected_dim_sizes=[]),
@@ -106,13 +110,11 @@ class RaggedTensorShapeTest(test_util.TensorFlowTestCase,
           expected_dim_sizes=[1, 3, [3, 2, 4], 2, 3]),
   ])
   def testBroadcastToRank(self, dim_sizes, rank, expected_dim_sizes):
-    shape = ragged.RaggedTensorDynamicShape.from_dim_sizes(dim_sizes)
-    expected = ragged.RaggedTensorDynamicShape.from_dim_sizes(
-        expected_dim_sizes)
+    shape = RaggedTensorDynamicShape.from_dim_sizes(dim_sizes)
+    expected = RaggedTensorDynamicShape.from_dim_sizes(expected_dim_sizes)
     broadcasted_shape = shape.broadcast_to_rank(rank)
-    with self.cached_session():
-      self.assertShapeEq(broadcasted_shape, expected)
-      self.assertEqual(broadcasted_shape.rank, rank)
+    self.assertShapeEq(broadcasted_shape, expected)
+    self.assertEqual(broadcasted_shape.rank, rank)
 
   @parameterized.parameters([
       #=========================================================================
@@ -298,22 +300,19 @@ class RaggedTensorShapeTest(test_util.TensorFlowTestCase,
         original_dim_sizes[axis] should be equal to `1` or `row_length`.
       broadcast_dim_sizes: THe dimension sizes after broadcasting.
     """
-    original_shape = ragged.RaggedTensorDynamicShape.from_dim_sizes(
-        original_dim_sizes)
-    broadcast_shape = ragged.RaggedTensorDynamicShape.from_dim_sizes(
-        broadcast_dim_sizes)
-    self.assertEqual(original_shape.rank, broadcast_shape.rank)
-    with self.cached_session():
-      # shape[axis].value == 1 and row_length > 1:
-      bcast1 = original_shape.broadcast_dimension(axis, row_length)
-      # shape[axis].value > 1 and row_length == shape[axis].value:
-      bcast2 = broadcast_shape.broadcast_dimension(axis, row_length)
-      # shape[axis].value > 1 and row_length == 1:
-      bcast3 = broadcast_shape.broadcast_dimension(axis, 1)
-
-      self.assertShapeEq(bcast1, broadcast_shape)
-      self.assertShapeEq(bcast2, broadcast_shape)
-      self.assertShapeEq(bcast3, broadcast_shape)
+    original_shape = RaggedTensorDynamicShape.from_dim_sizes(original_dim_sizes)
+    bcast_shape = RaggedTensorDynamicShape.from_dim_sizes(broadcast_dim_sizes)
+    self.assertEqual(original_shape.rank, bcast_shape.rank)
+    # shape[axis].value == 1 and row_length > 1:
+    bcast1 = original_shape.broadcast_dimension(axis, row_length)
+    # shape[axis].value > 1 and row_length == shape[axis].value:
+    bcast2 = bcast_shape.broadcast_dimension(axis, row_length)
+    # shape[axis].value > 1 and row_length == 1:
+    bcast3 = bcast_shape.broadcast_dimension(axis, 1)
+
+    self.assertShapeEq(bcast1, bcast_shape)
+    self.assertShapeEq(bcast2, bcast_shape)
+    self.assertShapeEq(bcast3, bcast_shape)
 
   @parameterized.parameters(
       [
@@ -371,116 +370,121 @@ class RaggedTensorShapeTest(test_util.TensorFlowTestCase,
               expected_dims=[2, (2, 1), 2, (2, 1, 2, 1, 2, 1)]),
       ])
   def testBroadcastDynamicShape(self, x_dims, y_dims, expected_dims):
-    x_shape = ragged.RaggedTensorDynamicShape.from_dim_sizes(x_dims)
-    y_shape = ragged.RaggedTensorDynamicShape.from_dim_sizes(y_dims)
-    expected = ragged.RaggedTensorDynamicShape.from_dim_sizes(expected_dims)
-    result1 = ragged.broadcast_dynamic_shape(x_shape, y_shape)
-    result2 = ragged.broadcast_dynamic_shape(y_shape, x_shape)
-    with self.cached_session():
-      self.assertShapeEq(expected, result1)
-      self.assertShapeEq(expected, result2)
+    x_shape = RaggedTensorDynamicShape.from_dim_sizes(x_dims)
+    y_shape = RaggedTensorDynamicShape.from_dim_sizes(y_dims)
+    expected = RaggedTensorDynamicShape.from_dim_sizes(expected_dims)
+    result1 = ragged_tensor_shape.broadcast_dynamic_shape(x_shape, y_shape)
+    result2 = ragged_tensor_shape.broadcast_dynamic_shape(y_shape, x_shape)
+    self.assertShapeEq(expected, result1)
+    self.assertShapeEq(expected, result2)
 
   def testRepr(self):
-    shape = ragged.RaggedTensorDynamicShape.from_dim_sizes([2, (2, 1), 2, 1])
+    shape = RaggedTensorDynamicShape.from_dim_sizes([2, (2, 1), 2, 1])
     self.assertRegexpMatches(
         repr(shape),
         r'RaggedTensorDynamicShape\('
         r'partitioned_dim_sizes=\(<[^>]+>, <[^>]+>\), '
         r'inner_dim_sizes=<[^>]+>\)')
 
-  @parameterized.parameters([
-      dict(
-          x=[[10], [20], [30]],  # shape=[3, 1]
-          dim_sizes=[3, 2],
-          expected=[[10, 10], [20, 20], [30, 30]]),
-      dict(
-          x=[[10], [20], [30]],  # shape=[3, 1]
-          dim_sizes=[3, [3, 0, 2]],
-          expected=ragged.constant_value([[10, 10, 10], [], [30, 30]],
-                                         dtype=np.int32)),
-      dict(
-          x=[[[1, 2, 3]], [[4, 5, 6]]],  # shape = [2, 1, 3]
-          dim_sizes=[2, [2, 3], 3],
-          expected=ragged.constant_value(
-              [[[1, 2, 3], [1, 2, 3]], [[4, 5, 6], [4, 5, 6], [4, 5, 6]]],
-              dtype=np.int32,
-              ragged_rank=1)),
-      dict(
-          x=[[[1]], [[2]]],  # shape = [2, 1, 1]
-          dim_sizes=[2, [2, 3], [0, 2, 1, 2, 0]],
-          expected=ragged.constant_value([[[], [1, 1]], [[2], [2, 2], []]],
-                                         dtype=np.int32,
-                                         ragged_rank=2)),
-      dict(
-          x=10,
-          dim_sizes=[3, [3, 0, 2]],
-          expected=ragged.constant_value([[10, 10, 10], [], [10, 10]])),
-  ])
+  @parameterized.parameters(
+      [
+          dict(
+              x=[[10], [20], [30]],  # shape=[3, 1]
+              dim_sizes=[3, 2],
+              expected=[[10, 10], [20, 20], [30, 30]]),
+          dict(
+              x=[[10], [20], [30]],  # shape=[3, 1]
+              dim_sizes=[3, [3, 0, 2]],
+              expected=ragged_factory_ops.constant_value(
+                  [[10, 10, 10], [], [30, 30]], dtype=np.int32)),
+          dict(
+              x=[[[1, 2, 3]], [[4, 5, 6]]],  # shape = [2, 1, 3]
+              dim_sizes=[2, [2, 3], 3],
+              expected=ragged_factory_ops.constant_value(
+                  [[[1, 2, 3], [1, 2, 3]], [[4, 5, 6], [4, 5, 6], [4, 5, 6]]],
+                  dtype=np.int32,
+                  ragged_rank=1)),
+          dict(
+              x=[[[1]], [[2]]],  # shape = [2, 1, 1]
+              dim_sizes=[2, [2, 3], [0, 2, 1, 2, 0]],
+              expected=ragged_factory_ops.constant_value(
+                  [[[], [1, 1]], [[2], [2, 2], []]],
+                  dtype=np.int32,
+                  ragged_rank=2)),
+          dict(
+              x=10,
+              dim_sizes=[3, [3, 0, 2]],
+              expected=ragged_factory_ops.constant_value([[10, 10, 10], [],
+                                                          [10, 10]])),
+      ])
   def testRaggedBroadcastTo(self, x, dim_sizes, expected):
-    shape = ragged.RaggedTensorDynamicShape.from_dim_sizes(dim_sizes)
-    result = ragged.broadcast_to(x, shape)
-    with self.cached_session():
-      self.assertEqual(
-          getattr(result, 'ragged_rank', 0), getattr(expected, 'ragged_rank',
-                                                     0))
-      if hasattr(expected, 'tolist'):
-        expected = expected.tolist()
-      self.assertEqual(result.eval().tolist(), expected)
+    shape = RaggedTensorDynamicShape.from_dim_sizes(dim_sizes)
+    result = ragged_tensor_shape.broadcast_to(x, shape)
+    self.assertEqual(
+        getattr(result, 'ragged_rank', 0), getattr(expected, 'ragged_rank', 0))
+    self.assertRaggedEqual(result, expected)
 
-  @parameterized.parameters([
-      dict(
-          doc='x.shape=[3, (D1)]; y.shape=[3, 1]; bcast.shape=[3, (D1)]',
-          x=ragged.constant_value([[1, 2, 3], [], [4, 5]], dtype=np.int32),
-          y=[[10], [20], [30]],
-          expected=ragged.constant_value([[11, 12, 13], [], [34, 35]])),
-      dict(
-          doc='x.shape=[3, (D1)]; y.shape=[]; bcast.shape=[3, (D1)]',
-          x=ragged.constant_value([[1, 2, 3], [], [4, 5]], dtype=np.int32),
-          y=10,
-          expected=ragged.constant_value([[11, 12, 13], [], [14, 15]])),
-      dict(
-          doc='x.shape=[1, (D1)]; y.shape=[3, 1]; bcast.shape=[3, (D1)]',
-          x=ragged.constant_value([[1, 2, 3]], dtype=np.int32),
-          y=[[10], [20], [30]],
-          expected=ragged.constant_value(
-              [[11, 12, 13], [21, 22, 23], [31, 32, 33]], dtype=np.int32)),
-      dict(
-          doc=('x.shape=[2, (D1), 1]; y.shape=[1, (D2)]; '
-               'bcast.shape=[2, (D1), (D2)]'),
-          x=ragged.constant_value([[[1], [2], [3]], [[4]]], ragged_rank=1),
-          y=ragged.constant_value([[10, 20, 30]]),
-          expected=ragged.constant_value([[[11, 21, 31], [12, 22, 32],
-                                           [13, 23, 33]], [[14, 24, 34]]])),
-      dict(
-          doc=('x.shape=[2, (D1), 1]; y.shape=[1, 1, 4]; '
-               'bcast.shape=[2, (D1), 4]'),
-          x=ragged.constant_value([[[10], [20]], [[30]]], ragged_rank=1),
-          y=[[[1, 2, 3, 4]]],
-          expected=ragged.constant_value(
-              [[[11, 12, 13, 14], [21, 22, 23, 24]], [[31, 32, 33, 34]]],
-              ragged_rank=1)),
-      dict(
-          doc=('x.shape=[2, (D1), 2, 1]; y.shape=[2, (D2)]; '
-               'bcast.shape=[2, (D1), (2), (D2)'),
-          x=ragged.constant_value([[[[1], [2]], [[3], [4]]],
-                                   [[[5], [6]]]],
-                                  ragged_rank=1),
-          y=ragged.constant_value([[10, 20], [30]]),
-          expected=ragged.constant_value(
-              [[[[11, 21], [32]], [[13, 23], [34]]],
-               [[[15, 25], [36]]]])),
-  ])
+  @parameterized.parameters(
+      [
+          dict(
+              doc='x.shape=[3, (D1)]; y.shape=[3, 1]; bcast.shape=[3, (D1)]',
+              x=ragged_factory_ops.constant_value([[1, 2, 3], [], [4, 5]],
+                                                  dtype=np.int32),
+              y=[[10], [20], [30]],
+              expected=ragged_factory_ops.constant_value([[11, 12, 13], [],
+                                                          [34, 35]])),
+          dict(
+              doc='x.shape=[3, (D1)]; y.shape=[]; bcast.shape=[3, (D1)]',
+              x=ragged_factory_ops.constant_value([[1, 2, 3], [], [4, 5]],
+                                                  dtype=np.int32),
+              y=10,
+              expected=ragged_factory_ops.constant_value([[11, 12, 13], [],
+                                                          [14, 15]])),
+          dict(
+              doc='x.shape=[1, (D1)]; y.shape=[3, 1]; bcast.shape=[3, (D1)]',
+              x=ragged_factory_ops.constant_value([[1, 2, 3]], dtype=np.int32),
+              y=[[10], [20], [30]],
+              expected=ragged_factory_ops.constant_value(
+                  [[11, 12, 13], [21, 22, 23], [31, 32, 33]], dtype=np.int32)),
+          dict(
+              doc=('x.shape=[2, (D1), 1]; y.shape=[1, (D2)]; '
+                   'bcast.shape=[2, (D1), (D2)]'),
+              x=ragged_factory_ops.constant_value([[[1], [2], [3]], [[4]]],
+                                                  ragged_rank=1),
+              y=ragged_factory_ops.constant_value([[10, 20, 30]]),
+              expected=ragged_factory_ops.constant_value([[[11, 21, 31],
+                                                           [12, 22, 32],
+                                                           [13, 23, 33]],
+                                                          [[14, 24, 34]]])),
+          dict(
+              doc=('x.shape=[2, (D1), 1]; y.shape=[1, 1, 4]; '
+                   'bcast.shape=[2, (D1), 4]'),
+              x=ragged_factory_ops.constant_value([[[10], [20]], [[30]]],
+                                                  ragged_rank=1),
+              y=[[[1, 2, 3, 4]]],
+              expected=ragged_factory_ops.constant_value(
+                  [[[11, 12, 13, 14], [21, 22, 23, 24]], [[31, 32, 33, 34]]],
+                  ragged_rank=1)),
+          dict(
+              doc=('x.shape=[2, (D1), 2, 1]; y.shape=[2, (D2)]; '
+                   'bcast.shape=[2, (D1), (2), (D2)'),
+              x=ragged_factory_ops.constant_value(
+                  [[[[1], [2]], [[3], [4]]], [[[5], [6]]]], ragged_rank=1),
+              y=ragged_factory_ops.constant_value([[10, 20], [30]]),
+              expected=ragged_factory_ops.constant_value([[[[11, 21], [32]],
+                                                           [[13, 23], [34]]],
+                                                          [[[15, 25], [36]]]])),
+      ])
   def testRaggedAddWithBroadcasting(self, x, y, expected, doc):
     expected_rrank = getattr(expected, 'ragged_rank', 0)
-    x = ragged.convert_to_tensor_or_ragged_tensor(x, dtype=dtypes.int32)
-    y = ragged.convert_to_tensor_or_ragged_tensor(y, dtype=dtypes.int32)
+    x = ragged_tensor.convert_to_tensor_or_ragged_tensor(x, dtype=dtypes.int32)
+    y = ragged_tensor.convert_to_tensor_or_ragged_tensor(y, dtype=dtypes.int32)
     result = x + y
     result_rrank = getattr(result, 'ragged_rank', 0)
     self.assertEqual(expected_rrank, result_rrank)
     if hasattr(expected, 'tolist'):
       expected = expected.tolist()
-    with self.cached_session():
-      self.assertEqual(result.eval().tolist(), expected)
+    self.assertRaggedEqual(result, expected)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_tensor_test.py b/tensorflow/python/ops/ragged/ragged_tensor_test.py
index 608fbd6e5b7595ca013c1f7edb801839119d9aa2..89691b015d76dbd35d0a9f5db2f2a0ab431147b1 100644
--- a/tensorflow/python/ops/ragged/ragged_tensor_test.py
+++ b/tensorflow/python/ops/ragged/ragged_tensor_test.py
@@ -19,17 +19,22 @@ from __future__ import division
 from __future__ import print_function
 
 import re
-import sys
 
 from absl.testing import parameterized
 import numpy as np
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_math_ops
+from tensorflow.python.ops.ragged import ragged_tensor_value
+from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.ops.ragged.ragged_tensor import RaggedTensor
 from tensorflow.python.platform import googletest
 
 
@@ -103,63 +108,62 @@ EXAMPLE_RAGGED_TENSOR_4D = [
 EXAMPLE_RAGGED_TENSOR_4D_SPLITS1 = [0, 2, 2, 3, 4]
 EXAMPLE_RAGGED_TENSOR_4D_SPLITS2 = [0, 3, 6, 9, 10]
 EXAMPLE_RAGGED_TENSOR_4D_VALUES = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10],
-                                   [11, 12], [13, 14], [15, 16], [17,
-                                                                  18], [19, 20]]
+                                   [11, 12], [13, 14], [15, 16], [17, 18],
+                                   [19, 20]]
 
 
-class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedTensorTest(ragged_test_util.RaggedTensorTestCase,
+                       parameterized.TestCase):
   longMessage = True  # Property in unittest.Testcase. pylint: disable=invalid-name
 
   #=============================================================================
   # RaggedTensor class docstring examples
   #=============================================================================
 
-  @test_util.run_deprecated_v1
   def testClassDocStringExamples(self):
     # From section: "Component Tensors"
-    rt = ragged.from_row_splits(
+    rt = RaggedTensor.from_row_splits(
         values=[3, 1, 4, 1, 5, 9, 2, 6], row_splits=[0, 4, 4, 7, 8, 8])
-    self.assertEqual(
-        self.evaluate(rt).tolist(), [[3, 1, 4, 1], [], [5, 9, 2], [6], []])
+    self.assertRaggedEqual(rt, [[3, 1, 4, 1], [], [5, 9, 2], [6], []])
     del rt
 
     # From section: "Alternative Row-Partitioning Schemes"
     values = [3, 1, 4, 1, 5, 9, 2, 6]
-    rt1 = ragged.from_row_splits(values, row_splits=[0, 4, 4, 7, 8, 8])
-    rt2 = ragged.from_row_lengths(values, row_lengths=[4, 0, 3, 1, 0])
-    rt3 = ragged.from_value_rowids(
+    rt1 = RaggedTensor.from_row_splits(values, row_splits=[0, 4, 4, 7, 8, 8])
+    rt2 = RaggedTensor.from_row_lengths(values, row_lengths=[4, 0, 3, 1, 0])
+    rt3 = RaggedTensor.from_value_rowids(
         values, value_rowids=[0, 0, 0, 0, 2, 2, 2, 3], nrows=5)
-    rt4 = ragged.from_row_starts(values, row_starts=[0, 4, 4, 7, 8])
-    rt5 = ragged.from_row_limits(values, row_limits=[4, 4, 7, 8, 8])
+    rt4 = RaggedTensor.from_row_starts(values, row_starts=[0, 4, 4, 7, 8])
+    rt5 = RaggedTensor.from_row_limits(values, row_limits=[4, 4, 7, 8, 8])
     for rt in (rt1, rt2, rt3, rt4, rt5):
-      self.assertEqual(
-          self.evaluate(rt).tolist(), [[3, 1, 4, 1], [], [5, 9, 2], [6], []])
+      self.assertRaggedEqual(rt, [[3, 1, 4, 1], [], [5, 9, 2], [6], []])
     del rt1, rt2, rt3, rt4, rt5
 
     # From section: "Multiple Ragged Dimensions"
-    inner_rt = ragged.from_row_splits(
+    inner_rt = RaggedTensor.from_row_splits(
         values=[3, 1, 4, 1, 5, 9, 2, 6], row_splits=[0, 4, 4, 7, 8, 8])
-    outer_rt = ragged.from_row_splits(values=inner_rt, row_splits=[0, 3, 3, 5])
+    outer_rt = RaggedTensor.from_row_splits(
+        values=inner_rt, row_splits=[0, 3, 3, 5])
     self.assertEqual(outer_rt.ragged_rank, 2)
     self.assertEqual(
-        self.evaluate(outer_rt).tolist(),
+        self.eval_to_list(outer_rt),
         [[[3, 1, 4, 1], [], [5, 9, 2]], [], [[6], []]])
     del inner_rt, outer_rt
 
     # From section: "Multiple Ragged Dimensions"
-    rt = ragged.from_nested_row_splits(
-        inner_values=[3, 1, 4, 1, 5, 9, 2, 6],
+    rt = RaggedTensor.from_nested_row_splits(
+        flat_values=[3, 1, 4, 1, 5, 9, 2, 6],
         nested_row_splits=([0, 3, 3, 5], [0, 4, 4, 7, 8, 8]))
     self.assertEqual(
-        self.evaluate(rt).tolist(),
-        [[[3, 1, 4, 1], [], [5, 9, 2]], [], [[6], []]])
+        self.eval_to_list(rt), [[[3, 1, 4, 1], [], [5, 9, 2]], [], [[6], []]])
     del rt
 
     # From section: "Uniform Inner Dimensions"
-    rt = ragged.from_row_splits(
+    rt = RaggedTensor.from_row_splits(
         values=array_ops.ones([5, 3]), row_splits=[0, 2, 5])
     self.assertEqual(
-        self.evaluate(rt).tolist(),
+        self.eval_to_list(rt),
         [[[1, 1, 1], [1, 1, 1]], [[1, 1, 1], [1, 1, 1], [1, 1, 1]]])
     self.assertEqual(rt.shape.as_list(), [2, None, 3])
     del rt
@@ -174,18 +178,19 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     splits2 = np.array([0, 3, 5], dtype=np.int64)
 
     # Test construction of a RaggedTensorValue with ragged_rank=1.
-    rt_value = ragged.RaggedTensorValue(values, splits)
+    rt_value = ragged_tensor_value.RaggedTensorValue(values, splits)
     self.assertEqual(rt_value.row_splits.dtype, np.int64)
     self.assertEqual(rt_value.shape, (5, None))
     self.assertEqual(len(rt_value.nested_row_splits), 1)
     self.assertAllEqual(splits, rt_value.row_splits)
     self.assertAllEqual(values, rt_value.values)
     self.assertAllEqual(splits, rt_value.nested_row_splits[0])
-    self.assertAllEqual(values, rt_value.inner_values)
+    self.assertAllEqual(values, rt_value.flat_values)
 
     # Test construction of a RaggedTensorValue with ragged_rank=2.
-    rt_value = ragged.RaggedTensorValue(
-        values=ragged.RaggedTensorValue(values, splits), row_splits=splits2)
+    rt_value = ragged_tensor_value.RaggedTensorValue(
+        values=ragged_tensor_value.RaggedTensorValue(values, splits),
+        row_splits=splits2)
     self.assertEqual(rt_value.row_splits.dtype, np.int64)
     self.assertEqual(rt_value.shape, (2, None, None))
     self.assertEqual(len(rt_value.nested_row_splits), 2)
@@ -194,21 +199,19 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     self.assertAllEqual(splits2, rt_value.nested_row_splits[0])
     self.assertAllEqual(splits, rt_value.nested_row_splits[1])
     self.assertAllEqual(values, rt_value.values.values)
-    self.assertAllEqual(values, rt_value.inner_values)
+    self.assertAllEqual(values, rt_value.flat_values)
 
   #=============================================================================
   # RaggedTensor Constructor (private)
   #=============================================================================
 
-  @test_util.run_deprecated_v1
   def testRaggedTensorConstruction(self):
     values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
     row_splits = constant_op.constant([0, 2, 2, 5, 6, 7], dtypes.int64)
-    rt = ragged.RaggedTensor(
-        values=values, row_splits=row_splits, internal=True)
+    rt = RaggedTensor(values=values, row_splits=row_splits, internal=True)
 
     self.assertEqual(
-        self.evaluate(rt).tolist(),
+        self.eval_to_list(rt),
         [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g']])
 
   def testRaggedTensorConstructionErrors(self):
@@ -217,117 +220,118 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
 
     with self.assertRaisesRegexp(ValueError,
                                  'RaggedTensor constructor is private'):
-      ragged.RaggedTensor(values=values, row_splits=row_splits)
+      RaggedTensor(values=values, row_splits=row_splits)
 
     with self.assertRaisesRegexp(TypeError,
                                  'values must be a Tensor or RaggedTensor'):
-      ragged.RaggedTensor(values=range(7), row_splits=row_splits, internal=True)
+      RaggedTensor(values=range(7), row_splits=row_splits, internal=True)
 
     with self.assertRaisesRegexp(TypeError,
                                  'Row-partitioning argument must be a Tensor'):
-      ragged.RaggedTensor(
-          values=values, row_splits=[0, 2, 2, 5, 6, 7], internal=True)
+      RaggedTensor(values=values, row_splits=[0, 2, 2, 5, 6, 7], internal=True)
 
     with self.assertRaisesRegexp(ValueError,
                                  r'Shape \(6, 1\) must have rank 1'):
-      ragged.RaggedTensor(
+      RaggedTensor(
           values=values,
           row_splits=array_ops.expand_dims(row_splits, 1),
           internal=True)
 
     with self.assertRaisesRegexp(TypeError,
                                  'Cached value must be a Tensor or None.'):
-      ragged.RaggedTensor(values=values, row_splits=row_splits,
-                          cached_row_lengths=[2, 3, 4], internal=True)
+      RaggedTensor(
+          values=values,
+          row_splits=row_splits,
+          cached_row_lengths=[2, 3, 4],
+          internal=True)
 
 
 #=============================================================================
 # RaggedTensor Factory Ops
 #=============================================================================
 
-  @test_util.run_deprecated_v1
   def testFromValueRowIdsWithDerivedNRows(self):
     # nrows is known at graph creation time.
     values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
     value_rowids = constant_op.constant([0, 0, 2, 2, 2, 3, 4], dtypes.int64)
 
-    rt = ragged.from_value_rowids(values, value_rowids)
+    rt = RaggedTensor.from_value_rowids(values, value_rowids)
     self.assertEqual(rt.dtype, dtypes.string)
     self.assertEqual(rt.shape.as_list(), [5, None])
     self.assertEqual(rt.ragged_rank, 1)
 
     rt_values = rt.values
-    rt_value_rowids = ragged.value_rowids(rt)
-    rt_nrows = ragged.nrows(rt)
+    rt_value_rowids = rt.value_rowids()
+    rt_nrows = rt.nrows()
 
     self.assertIs(rt_values, values)
     self.assertIs(rt_value_rowids, value_rowids)  # cached_value_rowids
     self.assertAllEqual(rt_value_rowids, value_rowids)
-    self.assertEqual(self.evaluate(rt_nrows), 5)
+    self.assertEqual(self.eval_to_list(rt_nrows), 5)
     self.assertEqual(
-        self.evaluate(rt).tolist(),
+        self.eval_to_list(rt),
         [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g']])
 
-  @test_util.run_deprecated_v1
   def testFromValueRowIdsWithDerivedNRowsDynamic(self):
     # nrows is not known at graph creation time.
     values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
     value_rowids = constant_op.constant([0, 0, 2, 2, 2, 3, 4], dtypes.int64)
     value_rowids = array_ops.placeholder_with_default(value_rowids, shape=None)
 
-    rt = ragged.from_value_rowids(values, value_rowids)
+    rt = RaggedTensor.from_value_rowids(values, value_rowids)
     self.assertEqual(rt.dtype, dtypes.string)
-    self.assertEqual(rt.shape.as_list(), [None, None])
+    if context.executing_eagerly():
+      self.assertEqual(rt.shape.as_list(), [5, None])
+    else:
+      self.assertEqual(rt.shape.as_list(), [None, None])
     self.assertEqual(rt.ragged_rank, 1)
 
     rt_values = rt.values
-    rt_value_rowids = ragged.value_rowids(rt)
-    rt_nrows = ragged.nrows(rt)
+    rt_value_rowids = rt.value_rowids()
+    rt_nrows = rt.nrows()
 
     self.assertIs(rt_values, values)
     self.assertIs(rt_value_rowids, value_rowids)  # cached_value_rowids
     self.assertAllEqual(rt_value_rowids, value_rowids)
-    self.assertEqual(self.evaluate(rt_nrows), 5)
+    self.assertEqual(self.eval_to_list(rt_nrows), 5)
     self.assertEqual(
-        self.evaluate(rt).tolist(),
+        self.eval_to_list(rt),
         [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g']])
 
-  @test_util.run_deprecated_v1
   def testFromValueRowIdsWithExplicitNRows(self):
     values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
     value_rowids = constant_op.constant([0, 0, 2, 2, 2, 3, 4], dtypes.int64)
     nrows = constant_op.constant(7, dtypes.int64)
 
-    rt = ragged.from_value_rowids(values, value_rowids, nrows)
+    rt = RaggedTensor.from_value_rowids(values, value_rowids, nrows)
     self.assertEqual(rt.dtype, dtypes.string)
     self.assertEqual(rt.shape.as_list(), [7, None])
     self.assertEqual(rt.ragged_rank, 1)
 
     rt_values = rt.values
-    rt_value_rowids = ragged.value_rowids(rt)
-    rt_nrows = ragged.nrows(rt)
+    rt_value_rowids = rt.value_rowids()
+    rt_nrows = rt.nrows()
 
     self.assertIs(rt_values, values)
     self.assertIs(rt_value_rowids, value_rowids)  # cached_value_rowids
     self.assertIs(rt_nrows, nrows)  # cached_nrows
     self.assertEqual(
-        self.evaluate(rt).tolist(),
+        self.eval_to_list(rt),
         [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g'], [], []])
 
-  @test_util.run_deprecated_v1
   def testFromValueRowIdsWithExplicitNRowsEqualToDefault(self):
     values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
     value_rowids = constant_op.constant([0, 0, 2, 2, 2, 3, 4], dtypes.int64)
     nrows = constant_op.constant(5, dtypes.int64)
 
-    rt = ragged.from_value_rowids(values, value_rowids, nrows)
+    rt = RaggedTensor.from_value_rowids(values, value_rowids, nrows)
     self.assertEqual(rt.dtype, dtypes.string)
     self.assertEqual(rt.shape.as_list(), [5, None])
     self.assertEqual(rt.ragged_rank, 1)
 
     rt_values = rt.values
-    rt_value_rowids = ragged.value_rowids(rt)
-    rt_nrows = ragged.nrows(rt)
+    rt_value_rowids = rt.value_rowids()
+    rt_nrows = rt.nrows()
 
     self.assertIs(rt_values, values)
     self.assertIs(rt_value_rowids, value_rowids)  # cached_value_rowids
@@ -335,112 +339,106 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     self.assertAllEqual(rt_value_rowids, value_rowids)
     self.assertAllEqual(rt_nrows, nrows)
     self.assertEqual(
-        self.evaluate(rt).tolist(),
+        self.eval_to_list(rt),
         [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g']])
 
-  @test_util.run_deprecated_v1
   def testFromValueRowIdsWithEmptyValues(self):
-    rt = ragged.from_value_rowids([], [])
-    rt_nrows = ragged.nrows(rt)
+    rt = RaggedTensor.from_value_rowids([], [])
+    rt_nrows = rt.nrows()
     self.assertEqual(rt.dtype, dtypes.float32)
     self.assertEqual(rt.shape.as_list(), [0, None])
     self.assertEqual(rt.ragged_rank, 1)
     self.assertEqual(rt.values.shape.as_list(), [0])
-    self.assertEqual(ragged.value_rowids(rt).shape.as_list(), [0])
-    self.assertEqual(self.evaluate(rt_nrows).tolist(), 0)
-    self.assertEqual(self.evaluate(rt).tolist(), [])
+    self.assertEqual(rt.value_rowids().shape.as_list(), [0])
+    self.assertEqual(self.eval_to_list(rt_nrows), 0)
+    self.assertEqual(self.eval_to_list(rt), [])
 
-  @test_util.run_deprecated_v1
   def testFromRowSplits(self):
     values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
     row_splits = constant_op.constant([0, 2, 2, 5, 6, 7], dtypes.int64)
 
-    rt = ragged.from_row_splits(values, row_splits)
+    rt = RaggedTensor.from_row_splits(values, row_splits)
     self.assertEqual(rt.dtype, dtypes.string)
     self.assertEqual(rt.shape.as_list(), [5, None])
     self.assertEqual(rt.ragged_rank, 1)
 
     rt_values = rt.values
     rt_row_splits = rt.row_splits
-    rt_nrows = ragged.nrows(rt)
+    rt_nrows = rt.nrows()
 
     self.assertIs(rt_values, values)
     self.assertIs(rt_row_splits, row_splits)
-    self.assertEqual(self.evaluate(rt_nrows), 5)
+    self.assertEqual(self.eval_to_list(rt_nrows), 5)
     self.assertEqual(
-        self.evaluate(rt).tolist(),
+        self.eval_to_list(rt),
         [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g']])
 
   def testFromRowSplitsWithEmptySplits(self):
     err_msg = 'row_splits tensor may not be empty'
     with self.assertRaisesRegexp(ValueError, err_msg):
-      ragged.from_row_splits([], [])
+      RaggedTensor.from_row_splits([], [])
 
-  @test_util.run_deprecated_v1
   def testFromRowStarts(self):
     values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
     row_starts = constant_op.constant([0, 2, 2, 5, 6], dtypes.int64)
 
-    rt = ragged.from_row_starts(values, row_starts)
+    rt = RaggedTensor.from_row_starts(values, row_starts)
     self.assertEqual(rt.dtype, dtypes.string)
     self.assertEqual(rt.shape.as_list(), [5, None])
     self.assertEqual(rt.ragged_rank, 1)
 
     rt_values = rt.values
-    rt_row_starts = ragged.row_starts(rt)
-    rt_nrows = ragged.nrows(rt)
+    rt_row_starts = rt.row_starts()
+    rt_nrows = rt.nrows()
 
     self.assertIs(rt_values, values)
-    self.assertEqual(self.evaluate(rt_nrows), 5)
+    self.assertEqual(self.eval_to_list(rt_nrows), 5)
     self.assertAllEqual(rt_row_starts, row_starts)
     self.assertEqual(
-        self.evaluate(rt).tolist(),
+        self.eval_to_list(rt),
         [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g']])
 
-  @test_util.run_deprecated_v1
   def testFromRowLimits(self):
     values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
     row_limits = constant_op.constant([2, 2, 5, 6, 7], dtypes.int64)
 
-    rt = ragged.from_row_limits(values, row_limits)
+    rt = RaggedTensor.from_row_limits(values, row_limits)
     self.assertEqual(rt.dtype, dtypes.string)
     self.assertEqual(rt.shape.as_list(), [5, None])
     self.assertEqual(rt.ragged_rank, 1)
 
     rt_values = rt.values
-    rt_row_limits = ragged.row_limits(rt)
-    rt_nrows = ragged.nrows(rt)
+    rt_row_limits = rt.row_limits()
+    rt_nrows = rt.nrows()
 
     self.assertIs(rt_values, values)
-    self.assertEqual(self.evaluate(rt_nrows), 5)
+    self.assertEqual(self.eval_to_list(rt_nrows), 5)
     self.assertAllEqual(rt_row_limits, row_limits)
     self.assertEqual(
-        self.evaluate(rt).tolist(),
+        self.eval_to_list(rt),
         [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g']])
 
-  @test_util.run_deprecated_v1
   def testFromRowLengths(self):
     values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
     row_lengths = constant_op.constant([2, 0, 3, 1, 1], dtypes.int64)
 
-    rt = ragged.from_row_lengths(values, row_lengths)
+    rt = RaggedTensor.from_row_lengths(values, row_lengths)
     self.assertEqual(rt.dtype, dtypes.string)
     self.assertEqual(rt.shape.as_list(), [5, None])
     self.assertEqual(rt.ragged_rank, 1)
 
     rt_values = rt.values
-    rt_row_lengths = ragged.row_lengths(rt)
-    rt_nrows = ragged.nrows(rt)
+    rt_row_lengths = rt.row_lengths()
+    rt_nrows = rt.nrows()
 
     self.assertIs(rt_values, values)
     self.assertIs(rt_row_lengths, row_lengths)  # cached_nrows
-    self.assertEqual(self.evaluate(rt_nrows), 5)
+    self.assertEqual(self.eval_to_list(rt_nrows), 5)
     self.assertAllEqual(rt_row_lengths, row_lengths)
     self.assertEqual(
-        self.evaluate(rt).tolist(),
+        self.eval_to_list(rt),
         [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g']])
 
-  @test_util.run_deprecated_v1
   def testFromNestedValueRowIdsWithDerivedNRows(self):
     values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
     nested_value_rowids = [
@@ -448,24 +446,23 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
         constant_op.constant([0, 0, 2, 2, 2, 3, 4], dtypes.int64)
     ]
 
-    rt = ragged.from_nested_value_rowids(values, nested_value_rowids)
+    rt = RaggedTensor.from_nested_value_rowids(values, nested_value_rowids)
     self.assertEqual(rt.dtype, dtypes.string)
     self.assertEqual(rt.shape.as_list(), [4, None, None])
     self.assertEqual(rt.ragged_rank, 2)
 
     rt_values = rt.values
-    rt_value_rowids = ragged.value_rowids(rt)
+    rt_value_rowids = rt.value_rowids()
     rt_values_values = rt_values.values
-    rt_values_value_rowids = ragged.value_rowids(rt_values)
+    rt_values_value_rowids = rt_values.value_rowids()
 
     self.assertIs(rt_values_values, values)
     self.assertAllEqual(rt_value_rowids, nested_value_rowids[0])
     self.assertAllEqual(rt_values_value_rowids, nested_value_rowids[1])
     self.assertEqual(
-        self.evaluate(rt).tolist(),
+        self.eval_to_list(rt),
         [[[b'a', b'b'], []], [[b'c', b'd', b'e']], [], [[b'f'], [b'g']]])
 
-  @test_util.run_deprecated_v1
   def testFromNestedValueRowIdsWithExplicitNRows(self):
     values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
     nested_value_rowids = [
@@ -477,17 +474,18 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
         constant_op.constant(6, dtypes.int64)
     ]
 
-    rt = ragged.from_nested_value_rowids(values, nested_value_rowids, nrows)
+    rt = RaggedTensor.from_nested_value_rowids(values, nested_value_rowids,
+                                               nrows)
     self.assertEqual(rt.dtype, dtypes.string)
     self.assertEqual(rt.shape.as_list(), [6, None, None])
     self.assertEqual(rt.ragged_rank, 2)
 
     rt_values = rt.values
-    rt_value_rowids = ragged.value_rowids(rt)
-    rt_nrows = ragged.nrows(rt)
+    rt_value_rowids = rt.value_rowids()
+    rt_nrows = rt.nrows()
     rt_values_values = rt_values.values
-    rt_values_value_rowids = ragged.value_rowids(rt_values)
-    rt_values_nrows = ragged.nrows(rt_values)
+    rt_values_value_rowids = rt_values.value_rowids()
+    rt_values_nrows = rt_values.nrows()
 
     self.assertIs(rt_values_values, values)
     self.assertAllEqual(rt_value_rowids, nested_value_rowids[0])
@@ -495,9 +493,8 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     self.assertAllEqual(rt_nrows, nrows[0])
     self.assertAllEqual(rt_values_nrows, nrows[1])
     self.assertEqual(
-        self.evaluate(rt).tolist(),
-        [[[b'a', b'b'], []], [[b'c', b'd', b'e']], [], [[b'f'], [b'g'], []], [],
-         []])
+        self.eval_to_list(rt), [[[b'a', b'b'], []], [[b'c', b'd', b'e']], [],
+                                [[b'f'], [b'g'], []], [], []])
 
   def testFromNestedValueRowIdsWithExplicitNRowsMismatch(self):
     values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
@@ -509,28 +506,26 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     with self.assertRaisesRegexp(
         ValueError, 'nested_nrows must have the same '
         'length as nested_value_rowids'):
-      ragged.from_nested_value_rowids(values, nested_value_rowids, nrows)
+      RaggedTensor.from_nested_value_rowids(values, nested_value_rowids, nrows)
 
   def testFromNestedValueRowIdsWithNonListInput(self):
     with self.assertRaisesRegexp(
         TypeError, 'nested_value_rowids must be a list of Tensors'):
-      ragged.from_nested_value_rowids([1, 2, 3],
-                                      constant_op.constant(
-                                          [[0, 1, 2], [0, 1, 2]], dtypes.int64))
+      RaggedTensor.from_nested_value_rowids(
+          [1, 2, 3], constant_op.constant([[0, 1, 2], [0, 1, 2]], dtypes.int64))
     with self.assertRaisesRegexp(TypeError,
                                  'nested_nrows must be a list of Tensors'):
-      ragged.from_nested_value_rowids([1, 2, 3], [[0, 1, 2], [0, 1, 2]],
-                                      constant_op.constant([3, 3]))
+      RaggedTensor.from_nested_value_rowids([1, 2, 3], [[0, 1, 2], [0, 1, 2]],
+                                            constant_op.constant([3, 3]))
 
-  @test_util.run_deprecated_v1
   def testFromNestedRowSplits(self):
-    inner_values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
+    flat_values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
     nested_row_splits = [
         constant_op.constant([0, 2, 3, 3, 5], dtypes.int64),
         constant_op.constant([0, 2, 2, 5, 6, 7], dtypes.int64)
     ]
 
-    rt = ragged.from_nested_row_splits(inner_values, nested_row_splits)
+    rt = RaggedTensor.from_nested_row_splits(flat_values, nested_row_splits)
     self.assertEqual(rt.dtype, dtypes.string)
     self.assertEqual(rt.shape.as_list(), [4, None, None])
     self.assertEqual(rt.ragged_rank, 2)
@@ -540,19 +535,18 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     rt_values_values = rt_values.values
     rt_values_row_splits = rt_values.row_splits
 
-    self.assertIs(rt_values_values, inner_values)
+    self.assertIs(rt_values_values, flat_values)
     self.assertIs(rt_row_splits, nested_row_splits[0])
     self.assertIs(rt_values_row_splits, nested_row_splits[1])
     self.assertEqual(
-        self.evaluate(rt).tolist(),
+        self.eval_to_list(rt),
         [[[b'a', b'b'], []], [[b'c', b'd', b'e']], [], [[b'f'], [b'g']]])
 
   def testFromNestedRowSplitsWithNonListInput(self):
     with self.assertRaisesRegexp(TypeError,
                                  'nested_row_splits must be a list of Tensors'):
-      ragged.from_nested_row_splits([1, 2],
-                                    constant_op.constant([[0, 1, 2], [0, 1, 2]],
-                                                         dtypes.int64))
+      RaggedTensor.from_nested_row_splits(
+          [1, 2], constant_op.constant([[0, 1, 2], [0, 1, 2]], dtypes.int64))
 
   def testFromValueRowIdsWithBadNRows(self):
     values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
@@ -560,7 +554,7 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     nrows = constant_op.constant(5, dtypes.int64)
 
     with self.assertRaisesRegexp(ValueError, r'Expected nrows >= 0; got -2'):
-      ragged.from_value_rowids(
+      RaggedTensor.from_value_rowids(
           values=values,
           value_rowids=array_ops.placeholder_with_default(value_rowids, None),
           nrows=-2)
@@ -568,113 +562,94 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     with self.assertRaisesRegexp(
         ValueError, r'Expected nrows >= value_rowids\[-1\] \+ 1; got nrows=2, '
         r'value_rowids\[-1\]=4'):
-      ragged.from_value_rowids(
+      RaggedTensor.from_value_rowids(
           values=values, value_rowids=value_rowids, nrows=2)
 
     with self.assertRaisesRegexp(
         ValueError, r'Expected nrows >= value_rowids\[-1\] \+ 1; got nrows=4, '
         r'value_rowids\[-1\]=4'):
-      ragged.from_value_rowids(
+      RaggedTensor.from_value_rowids(
           values=values, value_rowids=value_rowids, nrows=4)
 
     with self.assertRaisesRegexp(ValueError,
                                  r'Shape \(7, 1\) must have rank 1'):
-      ragged.from_value_rowids(
+      RaggedTensor.from_value_rowids(
           values=values,
           value_rowids=array_ops.expand_dims(value_rowids, 1),
           nrows=nrows)
 
     with self.assertRaisesRegexp(ValueError, r'Shape \(1,\) must have rank 0'):
-      ragged.from_value_rowids(
+      RaggedTensor.from_value_rowids(
           values=values,
           value_rowids=value_rowids,
           nrows=array_ops.expand_dims(nrows, 0))
 
-  @test_util.run_deprecated_v1
   def testGraphMismatch(self):
-    with ops.Graph().as_default():
-      values = constant_op.constant([1, 2, 3])
-    with ops.Graph().as_default():
-      splits = constant_op.constant([0, 2, 3])
-    self.assertRaisesRegexp(ValueError, '.* must be from the same graph as .*',
-                            ragged.from_row_splits, values, splits)
+    if not context.executing_eagerly():
+      with ops.Graph().as_default():
+        values = constant_op.constant([1, 2, 3], dtypes.int64)
+      with ops.Graph().as_default():
+        splits = constant_op.constant([0, 2, 3], dtypes.int64)
+      self.assertRaisesRegexp(ValueError,
+                              '.* must be from the same graph as .*',
+                              RaggedTensor.from_row_splits, values, splits)
 
   #=============================================================================
   # Ragged Value & Row-Partitioning Tensor Accessors
   #=============================================================================
 
-  @test_util.run_deprecated_v1
   def testRaggedTensorAccessors_2d(self):
     values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
     row_splits = constant_op.constant([0, 2, 2, 5, 6, 7], dtypes.int64)
     value_rowids = constant_op.constant([0, 0, 2, 2, 2, 3, 4], dtypes.int64)
-    rt1 = ragged.from_row_splits(values, row_splits)
-    rt2 = ragged.from_value_rowids(values, value_rowids)
+    rt1 = RaggedTensor.from_row_splits(values, row_splits)
+    rt2 = RaggedTensor.from_value_rowids(values, value_rowids)
 
     for rt in [rt1, rt2]:
-      self.assertEqual(
-          self.evaluate(rt).tolist(),
-          [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g']])
-      self.assertEqual(
-          self.evaluate(rt.values).tolist(),
-          [b'a', b'b', b'c', b'd', b'e', b'f', b'g'])
+      self.assertRaggedEqual(
+          rt, [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g']])
+      self.assertAllEqual(rt.values, [b'a', b'b', b'c', b'd', b'e', b'f', b'g'])
       self.assertEqual(rt.values.shape.dims[0].value, 7)
-      self.assertEqual(
-          self.evaluate(ragged.value_rowids(rt)).tolist(),
-          [0, 0, 2, 2, 2, 3, 4])
-      self.assertEqual(self.evaluate(ragged.nrows(rt)).tolist(), 5)
-      self.assertEqual(
-          self.evaluate(rt.row_splits).tolist(), [0, 2, 2, 5, 6, 7])
-      self.assertEqual(
-          self.evaluate(ragged.row_starts(rt)).tolist(), [0, 2, 2, 5, 6])
-      self.assertEqual(
-          self.evaluate(ragged.row_limits(rt)).tolist(), [2, 2, 5, 6, 7])
-      self.assertEqual(
-          self.evaluate(ragged.row_lengths(rt)).tolist(), [2, 0, 3, 1, 1])
-      self.assertEqual(
-          self.evaluate(rt.inner_values).tolist(),
-          [b'a', b'b', b'c', b'd', b'e', b'f', b'g'])
-      self.assertEqual(
-          [self.evaluate(s).tolist() for s in rt.nested_row_splits],
-          [[0, 2, 2, 5, 6, 7]])
+      self.assertAllEqual(rt.value_rowids(), [0, 0, 2, 2, 2, 3, 4])
+      self.assertAllEqual(rt.nrows(), 5)
+      self.assertAllEqual(rt.row_splits, [0, 2, 2, 5, 6, 7])
+      self.assertAllEqual(rt.row_starts(), [0, 2, 2, 5, 6])
+      self.assertAllEqual(rt.row_limits(), [2, 2, 5, 6, 7])
+      self.assertAllEqual(rt.row_lengths(), [2, 0, 3, 1, 1])
+      self.assertAllEqual(rt.flat_values,
+                          [b'a', b'b', b'c', b'd', b'e', b'f', b'g'])
+      self.assertLen(rt.nested_row_splits, 1)
+      self.assertAllEqual(rt.nested_row_splits[0], [0, 2, 2, 5, 6, 7])
 
-  @test_util.run_deprecated_v1
   def testRaggedTensorAccessors_3d_with_ragged_rank_1(self):
     values = [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9], [10, 11], [12, 13]]
     row_splits = constant_op.constant([0, 2, 2, 5, 6, 7], dtypes.int64)
     value_rowids = constant_op.constant([0, 0, 2, 2, 2, 3, 4], dtypes.int64)
-    rt1 = ragged.from_row_splits(values, row_splits)
-    rt2 = ragged.from_value_rowids(values, value_rowids)
+    rt1 = RaggedTensor.from_row_splits(values, row_splits)
+    rt2 = RaggedTensor.from_value_rowids(values, value_rowids)
 
     for rt in [rt1, rt2]:
       self.assertEqual(
-          self.evaluate(rt).tolist(),
+          self.eval_to_list(rt),
           [[[0, 1], [2, 3]], [], [[4, 5], [6, 7], [8, 9]], [[10, 11]],
            [[12, 13]]])
       self.assertEqual(
-          self.evaluate(rt.values).tolist(),
+          self.eval_to_list(rt.values),
           [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9], [10, 11], [12, 13]])
       self.assertEqual(rt.values.shape.dims[0].value, 7)
       self.assertEqual(
-          self.evaluate(ragged.value_rowids(rt)).tolist(),
-          [0, 0, 2, 2, 2, 3, 4])
-      self.assertEqual(self.evaluate(ragged.nrows(rt)).tolist(), 5)
-      self.assertEqual(
-          self.evaluate(rt.row_splits).tolist(), [0, 2, 2, 5, 6, 7])
-      self.assertEqual(
-          self.evaluate(ragged.row_starts(rt)).tolist(), [0, 2, 2, 5, 6])
-      self.assertEqual(
-          self.evaluate(ragged.row_limits(rt)).tolist(), [2, 2, 5, 6, 7])
+          self.eval_to_list(rt.value_rowids()), [0, 0, 2, 2, 2, 3, 4])
+      self.assertEqual(self.eval_to_list(rt.nrows()), 5)
+      self.assertEqual(self.eval_to_list(rt.row_splits), [0, 2, 2, 5, 6, 7])
+      self.assertEqual(self.eval_to_list(rt.row_starts()), [0, 2, 2, 5, 6])
+      self.assertEqual(self.eval_to_list(rt.row_limits()), [2, 2, 5, 6, 7])
+      self.assertEqual(self.eval_to_list(rt.row_lengths()), [2, 0, 3, 1, 1])
       self.assertEqual(
-          self.evaluate(ragged.row_lengths(rt)).tolist(), [2, 0, 3, 1, 1])
-      self.assertEqual(
-          self.evaluate(rt.inner_values).tolist(),
+          self.eval_to_list(rt.flat_values),
           [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9], [10, 11], [12, 13]])
-      self.assertEqual(
-          [self.evaluate(s).tolist() for s in rt.nested_row_splits],
-          [[0, 2, 2, 5, 6, 7]])
+      self.assertEqual([self.eval_to_list(s) for s in rt.nested_row_splits],
+                       [[0, 2, 2, 5, 6, 7]])
 
-  @test_util.run_deprecated_v1
   def testRaggedTensorAccessors_3d_with_ragged_rank_2(self):
     values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
     nested_row_splits = [
@@ -685,73 +660,59 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
         constant_op.constant([0, 0, 1, 3, 3], dtypes.int64),
         constant_op.constant([0, 0, 2, 2, 2, 3, 4], dtypes.int64)
     ]
-    rt1 = ragged.from_nested_row_splits(values, nested_row_splits)
-    rt2 = ragged.from_nested_value_rowids(values, nested_value_rowids)
+    rt1 = RaggedTensor.from_nested_row_splits(values, nested_row_splits)
+    rt2 = RaggedTensor.from_nested_value_rowids(values, nested_value_rowids)
 
     for rt in [rt1, rt2]:
       self.assertEqual(
-          self.evaluate(rt).tolist(),
+          self.eval_to_list(rt),
           [[[b'a', b'b'], []], [[b'c', b'd', b'e']], [], [[b'f'], [b'g']]])
       self.assertEqual(
-          self.evaluate(rt.values).tolist(),
+          self.eval_to_list(rt.values),
           [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g']])
       self.assertEqual(rt.values.shape.dims[0].value, 5)
+      self.assertEqual(self.eval_to_list(rt.value_rowids()), [0, 0, 1, 3, 3])
+      self.assertEqual(self.eval_to_list(rt.nrows()), 4)
+      self.assertEqual(self.eval_to_list(rt.row_splits), [0, 2, 3, 3, 5])
+      self.assertEqual(self.eval_to_list(rt.row_starts()), [0, 2, 3, 3])
+      self.assertEqual(self.eval_to_list(rt.row_limits()), [2, 3, 3, 5])
+      self.assertEqual(self.eval_to_list(rt.row_lengths()), [2, 1, 0, 2])
       self.assertEqual(
-          self.evaluate(ragged.value_rowids(rt)).tolist(), [0, 0, 1, 3, 3])
-      self.assertEqual(self.evaluate(ragged.nrows(rt)).tolist(), 4)
-      self.assertEqual(self.evaluate(rt.row_splits).tolist(), [0, 2, 3, 3, 5])
-      self.assertEqual(
-          self.evaluate(ragged.row_starts(rt)).tolist(), [0, 2, 3, 3])
-      self.assertEqual(
-          self.evaluate(ragged.row_limits(rt)).tolist(), [2, 3, 3, 5])
-      self.assertEqual(
-          self.evaluate(ragged.row_lengths(rt)).tolist(), [2, 1, 0, 2])
-      self.assertEqual(
-          self.evaluate(rt.inner_values).tolist(),
+          self.eval_to_list(rt.flat_values),
           [b'a', b'b', b'c', b'd', b'e', b'f', b'g'])
-      self.assertEqual(
-          [self.evaluate(s).tolist() for s in rt.nested_row_splits],
-          [[0, 2, 3, 3, 5], [0, 2, 2, 5, 6, 7]])
-
-  def testNRowsWithTensorInput(self):
-    dt = constant_op.constant([[1, 2, 3], [4, 5, 6]])
-    nrows = ragged.nrows(dt)
-    self.assertEqual(self.evaluate(nrows), 2)
-
-  def testRowLengthsWithTensorInput(self):
-    dt = constant_op.constant([[1, 2, 3], [4, 5, 6]])
-    row_lengths = ragged.row_lengths(dt)
-    self.assertEqual(self.evaluate(row_lengths).tolist(), [3, 3])
+      self.assertEqual([self.eval_to_list(s) for s in rt.nested_row_splits],
+                       [[0, 2, 3, 3, 5], [0, 2, 2, 5, 6, 7]])
 
   #=============================================================================
   # RaggedTensor.shape
   #=============================================================================
 
-  @test_util.run_deprecated_v1
   def testShape(self):
     """Tests for RaggedTensor.shape."""
-    rt1 = ragged.from_row_splits(b'a b c d e f g'.split(), [0, 2, 5, 6, 6, 7])
+    rt1 = RaggedTensor.from_row_splits(b'a b c d e f g'.split(),
+                                       [0, 2, 5, 6, 6, 7])
     self.assertEqual(rt1.shape.as_list(), [5, None])
 
-    rt2 = ragged.from_row_splits(
+    rt2 = RaggedTensor.from_row_splits(
         [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14]],
         [0, 2, 5, 6, 6, 7])
     self.assertEqual(rt2.shape.as_list(), [5, None, 2])
 
-    rt3 = ragged.from_row_splits(
+    rt3 = RaggedTensor.from_row_splits(
         [[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[9, 10], [11, 12]]], [0, 2, 2, 3])
     self.assertEqual(rt3.shape.as_list(), [3, None, 2, 2])
 
-    rt4 = ragged.from_row_splits(rt3, [0, 1, 3, 3])
+    rt4 = RaggedTensor.from_row_splits(rt3, [0, 1, 3, 3])
     self.assertEqual(rt4.shape.as_list(), [3, None, None, 2, 2])
 
-    rt5 = ragged.from_row_splits(
-        array_ops.placeholder(dtype=dtypes.string), [0, 2, 3, 5])
-    self.assertEqual(rt5.shape.ndims, None)
+    if not context.executing_eagerly():
+      rt5 = RaggedTensor.from_row_splits(
+          array_ops.placeholder(dtype=dtypes.string), [0, 2, 3, 5])
+      self.assertEqual(rt5.shape.ndims, None)
 
-    rt6 = ragged.from_row_splits([1, 2, 3],
-                                 array_ops.placeholder(dtype=dtypes.int64))
-    self.assertEqual(rt6.shape.as_list(), [None, None])
+      rt6 = RaggedTensor.from_row_splits(
+          [1, 2, 3], array_ops.placeholder(dtype=dtypes.int64))
+      self.assertEqual(rt6.shape.as_list(), [None, None])
 
   #=============================================================================
   # RaggedTensor.__getitem__
@@ -777,15 +738,9 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     """
     tensor_slice_spec1 = _make_tensor_slice_spec(slice_spec, True)
     tensor_slice_spec2 = _make_tensor_slice_spec(slice_spec, False)
-    value1 = self.evaluate(rt.__getitem__(slice_spec))
-    value2 = self.evaluate(rt.__getitem__(tensor_slice_spec1))
-    value3 = self.evaluate(rt.__getitem__(tensor_slice_spec2))
-    if hasattr(value1, 'tolist'):
-      value1 = value1.tolist()
-    if hasattr(value2, 'tolist'):
-      value2 = value2.tolist()
-    if hasattr(value3, 'tolist'):
-      value3 = value3.tolist()
+    value1 = self.eval_to_list(rt.__getitem__(slice_spec))
+    value2 = self.eval_to_list(rt.__getitem__(tensor_slice_spec1))
+    value3 = self.eval_to_list(rt.__getitem__(tensor_slice_spec2))
     self.assertEqual(value1, expected, 'slice_spec=%s' % (slice_spec,))
     self.assertEqual(value2, expected, 'slice_spec=%s' % (slice_spec,))
     self.assertEqual(value3, expected, 'slice_spec=%s' % (slice_spec,))
@@ -861,23 +816,26 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
       (SLICE_BUILDER[:, -2:], [row[-2:] for row in EXAMPLE_RAGGED_TENSOR_2D]),
       # TODO(edloper): Add tests for strided slices, once support is added.
   )
-  @test_util.run_deprecated_v1
   def testRaggedTensorGetItemWithRaggedRank1(self, slice_spec, expected):
     """Test that rt.__getitem__(slice_spec) == expected."""
     # Ragged tensor
-    rt = ragged.from_row_splits(EXAMPLE_RAGGED_TENSOR_2D_VALUES,
-                                EXAMPLE_RAGGED_TENSOR_2D_SPLITS)
+    rt = RaggedTensor.from_row_splits(EXAMPLE_RAGGED_TENSOR_2D_VALUES,
+                                      EXAMPLE_RAGGED_TENSOR_2D_SPLITS)
 
-    self.assertEqual(self.evaluate(rt).tolist(), EXAMPLE_RAGGED_TENSOR_2D)
+    self.assertEqual(self.eval_to_list(rt), EXAMPLE_RAGGED_TENSOR_2D)
     self._TestGetItem(rt, slice_spec, expected)
 
   # pylint: disable=invalid-slice-index
   @parameterized.parameters(
       # Tests for out-of-bound errors
-      (SLICE_BUILDER[5], ValueError, '.*out of bounds.*'),
-      (SLICE_BUILDER[-6], ValueError, '.*out of bounds.*'),
-      (SLICE_BUILDER[0, 2], ValueError, '.*out of bounds.*'),
-      (SLICE_BUILDER[3, 0], ValueError, '.*out of bounds.*'),
+      (SLICE_BUILDER[5],
+       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
+      (SLICE_BUILDER[-6],
+       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
+      (SLICE_BUILDER[0, 2],
+       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
+      (SLICE_BUILDER[3, 0],
+       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
 
       # Indexing into an inner ragged dimension
       (SLICE_BUILDER[:, 3], ValueError,
@@ -889,8 +847,8 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
 
       # Tests for type errors
       (SLICE_BUILDER[0.5], TypeError, re.escape(array_ops._SLICE_TYPE_ERROR)),
-      (SLICE_BUILDER[1:3:0.5], TypeError,
-       re.escape(array_ops._SLICE_TYPE_ERROR)),
+      (SLICE_BUILDER[1:3:0.5], TypeError, re.escape(
+          array_ops._SLICE_TYPE_ERROR)),
       (SLICE_BUILDER[:, 1:3:0.5], TypeError,
        'slice strides must be integers or None'),
       (SLICE_BUILDER[:, 0.5:1.5], TypeError,
@@ -903,17 +861,14 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
       (SLICE_BUILDER[..., 0, 0, 0], IndexError,
        'Too many indices for RaggedTensor'),
   )
-  @test_util.run_deprecated_v1
   def testRaggedTensorGetItemErrorsWithRaggedRank1(self, slice_spec, expected,
                                                    message):
     """Test that rt.__getitem__(slice_spec) == expected."""
     # Ragged tensor
-    rt = ragged.from_row_splits(EXAMPLE_RAGGED_TENSOR_2D_VALUES,
-                                EXAMPLE_RAGGED_TENSOR_2D_SPLITS)
-    # if sys.version_info[0] == 3:
-    #   message = 'must be str, not int'
+    rt = RaggedTensor.from_row_splits(EXAMPLE_RAGGED_TENSOR_2D_VALUES,
+                                      EXAMPLE_RAGGED_TENSOR_2D_SPLITS)
 
-    self.assertEqual(self.evaluate(rt).tolist(), EXAMPLE_RAGGED_TENSOR_2D)
+    self.assertEqual(self.eval_to_list(rt), EXAMPLE_RAGGED_TENSOR_2D)
     self._TestGetItemException(rt, slice_spec, expected, message)
 
   @parameterized.parameters(
@@ -982,13 +937,12 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
       # TODO(edloper): Add tests slicing inner ragged dimensions, one support
       # is added.
   )
-  @test_util.run_deprecated_v1
   def testRaggedTensorGetItemWithRaggedRank2(self, slice_spec, expected):
     """Test that rt.__getitem__(slice_spec) == expected."""
-    rt = ragged.from_nested_row_splits(
+    rt = RaggedTensor.from_nested_row_splits(
         EXAMPLE_RAGGED_TENSOR_4D_VALUES,
         [EXAMPLE_RAGGED_TENSOR_4D_SPLITS1, EXAMPLE_RAGGED_TENSOR_4D_SPLITS2])
-    self.assertEqual(self.evaluate(rt).tolist(), EXAMPLE_RAGGED_TENSOR_4D)
+    self.assertEqual(self.eval_to_list(rt), EXAMPLE_RAGGED_TENSOR_4D)
     self._TestGetItem(rt, slice_spec, expected)
 
   @parameterized.parameters(
@@ -999,19 +953,22 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
        'Cannot index into an inner ragged dimension.'),
 
       # Test for out-of-bounds errors.
-      (SLICE_BUILDER[1, 0], ValueError, '.*out of bounds.*'),
-      (SLICE_BUILDER[0, 0, 3], ValueError, '.*out of bounds.*'),
-      (SLICE_BUILDER[5], ValueError, '.*out of bounds.*'),
-      (SLICE_BUILDER[0, 5], ValueError, '.*out of bounds.*'),
+      (SLICE_BUILDER[1, 0],
+       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
+      (SLICE_BUILDER[0, 0, 3],
+       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
+      (SLICE_BUILDER[5],
+       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
+      (SLICE_BUILDER[0, 5],
+       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
   )
-  @test_util.run_deprecated_v1
   def testRaggedTensorGetItemErrorsWithRaggedRank2(self, slice_spec, expected,
                                                    message):
     """Test that rt.__getitem__(slice_spec) == expected."""
-    rt = ragged.from_nested_row_splits(
+    rt = RaggedTensor.from_nested_row_splits(
         EXAMPLE_RAGGED_TENSOR_4D_VALUES,
         [EXAMPLE_RAGGED_TENSOR_4D_SPLITS1, EXAMPLE_RAGGED_TENSOR_4D_SPLITS2])
-    self.assertEqual(self.evaluate(rt).tolist(), EXAMPLE_RAGGED_TENSOR_4D)
+    self.assertEqual(self.eval_to_list(rt), EXAMPLE_RAGGED_TENSOR_4D)
     self._TestGetItemException(rt, slice_spec, expected, message)
 
   @parameterized.parameters(
@@ -1019,21 +976,21 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
       (SLICE_BUILDER[2:], []),
       (SLICE_BUILDER[:-3], []),
   )
-  @test_util.run_deprecated_v1
   def testRaggedTensorGetItemWithEmptyTensor(self, slice_spec, expected):
     """Test that rt.__getitem__(slice_spec) == expected."""
-    rt = ragged.from_row_splits([], [0])
+    rt = RaggedTensor.from_row_splits([], [0])
     self._TestGetItem(rt, slice_spec, expected)
 
   @parameterized.parameters(
-      (SLICE_BUILDER[0], ValueError, '.*out of bounds.*'),
-      (SLICE_BUILDER[-1], ValueError, '.*out of bounds.*'),
+      (SLICE_BUILDER[0],
+       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
+      (SLICE_BUILDER[-1],
+       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
   )
-  @test_util.run_deprecated_v1
   def testRaggedTensorGetItemErrorsWithEmptyTensor(self, slice_spec, expected,
                                                    message):
     """Test that rt.__getitem__(slice_spec) == expected."""
-    rt = ragged.from_row_splits([], [0])
+    rt = RaggedTensor.from_row_splits([], [0])
     self._TestGetItemException(rt, slice_spec, expected, message)
 
   @parameterized.parameters(
@@ -1045,7 +1002,6 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
       (SLICE_BUILDER[0, 1], EXAMPLE_RAGGED_TENSOR_2D[0][1]),
       (SLICE_BUILDER[-3, 0], EXAMPLE_RAGGED_TENSOR_2D[-3][0]),
   )
-  @test_util.run_deprecated_v1
   def testRaggedTensorGetItemWithPlaceholderShapes(self, slice_spec, expected):
     """Test that rt.__getitem__(slice_spec) == expected."""
     # Intentionally use an unknown shape for `splits`, to force the code path
@@ -1053,29 +1009,28 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     splits = constant_op.constant(
         EXAMPLE_RAGGED_TENSOR_2D_SPLITS, dtype=dtypes.int64)
     splits = array_ops.placeholder_with_default(splits, None)
-    rt = ragged.from_row_splits(EXAMPLE_RAGGED_TENSOR_2D_VALUES, splits)
-    self.assertEqual(self.evaluate(rt).tolist(), EXAMPLE_RAGGED_TENSOR_2D)
+    rt = RaggedTensor.from_row_splits(EXAMPLE_RAGGED_TENSOR_2D_VALUES, splits)
+    self.assertEqual(self.eval_to_list(rt), EXAMPLE_RAGGED_TENSOR_2D)
     self._TestGetItem(rt, slice_spec, expected)
 
   @parameterized.parameters(
       (SLICE_BUILDER[..., 2], ValueError,
        'Ellipsis not supported for unknown shape RaggedTensors'),)
-  @test_util.run_deprecated_v1
   def testRaggedTensorGetItemErrorsWithPlaceholderShapes(
       self, slice_spec, expected, message):
     """Test that rt.__getitem__(slice_spec) == expected."""
-    # Intentionally use an unknown shape for `values`.
-    values = array_ops.placeholder_with_default([0], None)
-    rt = ragged.from_row_splits(values, [0, 1])
-    self._TestGetItemException(rt, slice_spec, expected, message)
+    if not context.executing_eagerly():
+      # Intentionally use an unknown shape for `values`.
+      values = array_ops.placeholder_with_default([0], None)
+      rt = RaggedTensor.from_row_splits(values, [0, 1])
+      self._TestGetItemException(rt, slice_spec, expected, message)
 
-  @test_util.run_deprecated_v1
   def testGetItemNewAxis(self):
     # rt: [[[['a', 'b'], ['c', 'd']], [], [['e', 'f']]], []]
     splits1 = [0, 3, 3]
     splits2 = [0, 2, 2, 3]
     values = constant_op.constant([['a', 'b'], ['c', 'd'], ['e', 'f']])
-    rt = ragged.from_nested_row_splits(values, [splits1, splits2])
+    rt = RaggedTensor.from_nested_row_splits(values, [splits1, splits2])
     rt_newaxis0 = rt[array_ops.newaxis]
     rt_newaxis1 = rt[:, array_ops.newaxis]
     rt_newaxis2 = rt[:, :, array_ops.newaxis]
@@ -1083,22 +1038,22 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     rt_newaxis4 = rt[:, :, :, :, array_ops.newaxis]
 
     self.assertEqual(
-        self.evaluate(rt).tolist(),
+        self.eval_to_list(rt),
         [[[[b'a', b'b'], [b'c', b'd']], [], [[b'e', b'f']]], []])
     self.assertEqual(
-        self.evaluate(rt_newaxis0).tolist(),
+        self.eval_to_list(rt_newaxis0),
         [[[[[b'a', b'b'], [b'c', b'd']], [], [[b'e', b'f']]], []]])
     self.assertEqual(
-        self.evaluate(rt_newaxis1).tolist(),
+        self.eval_to_list(rt_newaxis1),
         [[[[[b'a', b'b'], [b'c', b'd']], [], [[b'e', b'f']]]], [[]]])
     self.assertEqual(
-        self.evaluate(rt_newaxis2).tolist(),
+        self.eval_to_list(rt_newaxis2),
         [[[[[b'a', b'b'], [b'c', b'd']]], [[]], [[[b'e', b'f']]]], []])
     self.assertEqual(
-        self.evaluate(rt_newaxis3).tolist(),
+        self.eval_to_list(rt_newaxis3),
         [[[[[b'a', b'b']], [[b'c', b'd']]], [], [[[b'e', b'f']]]], []])
     self.assertEqual(
-        self.evaluate(rt_newaxis4).tolist(),
+        self.eval_to_list(rt_newaxis4),
         [[[[[b'a'], [b'b']], [[b'c'], [b'd']]], [], [[[b'e'], [b'f']]]], []])
 
     self.assertEqual(rt.ragged_rank, 2)
@@ -1117,126 +1072,140 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
   #=============================================================================
   # RaggedTensor.__str__
   #=============================================================================
-  @test_util.run_deprecated_v1
   def testRaggedTensorStr(self):
-    rt1 = ragged.from_row_splits(b'a b c d e f g'.split(), [0, 2, 5, 6, 6, 7])
-    expected1 = ('RaggedTensor(values=Tensor("RaggedFromRowSplits/values:0", '
-                 'shape=(7,), dtype=string), row_splits='
-                 'Tensor("RaggedFromRowSplits/row_splits:0", '
-                 'shape=(6,), dtype=int64))')
-    self.assertEqual(str(rt1), expected1)
-    self.assertEqual(repr(rt1), expected1)
+    values = [b'a', b'b', b'c', b'd', b'e', b'f', b'g']
+    row_splits = [0, 2, 5, 6, 6, 7]
+    rt = RaggedTensor.from_row_splits(values, row_splits)
+    if context.executing_eagerly():
+      expected_str = '<tf.RaggedTensor {}>'.format([[b'a', b'b'],
+                                                    [b'c', b'd', b'e'], [b'f'],
+                                                    [], [b'g']])
+      expected_repr = (
+          'tf.RaggedTensor(values=tf.Tensor([{}], shape=(7,), dtype=string), '
+          'row_splits=tf.Tensor([{}], shape=(6,), dtype=int64))'.format(
+              ' '.join(repr(x) for x in values), ' '.join(
+                  repr(x) for x in row_splits)))
+      self.assertEqual(str(rt), expected_str)
+      self.assertEqual(repr(rt), expected_repr)
+    else:
+      expected_repr = (
+          'tf.RaggedTensor(values=Tensor("RaggedFromRowSplits/values:0", '
+          'shape=(7,), dtype=string), row_splits='
+          'Tensor("RaggedFromRowSplits/row_splits:0", '
+          'shape=(6,), dtype=int64))')
+      self.assertEqual(repr(rt), expected_repr)
+      self.assertEqual(str(rt), expected_repr)
 
   def testRaggedTensorValueStr(self):
-    rt = ragged.RaggedTensorValue(
-        values=np.array(b'a b c d e f g'.split()),
-        row_splits=np.array([0, 2, 5, 6, 6, 7], dtype=np.int64))
-    if sys.version_info[0] == 2:
-      self.assertEqual(' '.join(str(rt).split()),
-                       (r"<RaggedTensorValue [['a', 'b'], ['c', 'd', 'e'], "
-                        "['f'], [], ['g']]>"))
-      self.assertEqual(
-          ' '.join(repr(rt).split()),
-          (r"RaggedTensorValue(values=array(['a', 'b', 'c', 'd', "
-           "'e', 'f', 'g'], dtype='|S1'), row_splits=array([0, 2, 5,"
-           ' 6, 6, 7]))'))
-    else:
-      self.assertEqual(
-          ' '.join(str(rt).split()),
-          (r"<RaggedTensorValue [[b'a', b'b'], [b'c', b'd', b'e'], "
-           "[b'f'], [], [b'g']]>"))
-      self.assertEqual(
-          ' '.join(repr(rt).split()),
-          (r"RaggedTensorValue(values=array([b'a', b'b', b'c', b'd', "
-           "b'e', b'f', b'g'], dtype='|S1'), row_splits=array([0, 2, 5,"
-           ' 6, 6, 7]))'))
+    values = [b'a', b'b', b'c', b'd', b'e', b'f', b'g']
+    row_splits = [0, 2, 5, 6, 6, 7]
+    rt = ragged_tensor_value.RaggedTensorValue(
+        np.array(values), np.array(row_splits, dtype=np.int64))
+    expected_str = '<tf.RaggedTensorValue {}>'.format([[b'a', b'b'],
+                                                       [b'c', b'd', b'e'],
+                                                       [b'f'], [], [b'g']])
+    expected_repr = ("tf.RaggedTensorValue(values=array({}, dtype='|S1'), "
+                     'row_splits=array({}))'.format(values, row_splits))
+    self.assertEqual(' '.join(str(rt).split()), expected_str)
+    self.assertEqual(' '.join(repr(rt).split()), expected_repr)
 
   #=============================================================================
-  # RaggedTensor.with_values() and RaggedTensor.with_inner_values().
+  # RaggedTensor.with_values() and RaggedTensor.with_flat_values().
   #=============================================================================
 
-  @test_util.run_deprecated_v1
   def testWithValues(self):
-    rt1 = ragged.constant([[1, 2], [3, 4, 5], [6], [], [7]])
-    rt2 = ragged.constant([[[1, 2], [3, 4, 5]], [[6]], [], [[], [7]]])
+    rt1 = ragged_factory_ops.constant([[1, 2], [3, 4, 5], [6], [], [7]])
+    rt2 = ragged_factory_ops.constant([[[1, 2], [3, 4, 5]], [[6]], [], [[],
+                                                                        [7]]])
 
     rt1_plus_10 = rt1.with_values(rt1.values + 10)
-    rt2_times_10 = rt2.with_inner_values(rt2.inner_values * 10)
+    rt2_times_10 = rt2.with_flat_values(rt2.flat_values * 10)
     rt1_expanded = rt1.with_values(array_ops.expand_dims(rt1.values, axis=1))
 
     self.assertEqual(
-        self.evaluate(rt1_plus_10).tolist(),
+        self.eval_to_list(rt1_plus_10),
         [[11, 12], [13, 14, 15], [16], [], [17]])
     self.assertEqual(
-        self.evaluate(rt2_times_10).tolist(),
+        self.eval_to_list(rt2_times_10),
         [[[10, 20], [30, 40, 50]], [[60]], [], [[], [70]]])
     self.assertEqual(
-        self.evaluate(rt1_expanded).tolist(),
+        self.eval_to_list(rt1_expanded),
         [[[1], [2]], [[3], [4], [5]], [[6]], [], [[7]]])
 
   #=============================================================================
   # Session.run
   #=============================================================================
-  @test_util.run_deprecated_v1
   def testSessionRun(self):
-    rt1 = ragged.constant([[1, 2, 3], [4]])
-    rt2 = ragged.constant([[[], [1, 2]], [[3]]])
+    if context.executing_eagerly():
+      return
+
+    rt1 = ragged_factory_ops.constant([[1, 2, 3], [4]])
+    rt2 = ragged_factory_ops.constant([[[], [1, 2]], [[3]]])
     with self.test_session() as session:
       result = session.run({'rt1': rt1, 'rt2': rt2})
       self.assertCountEqual(sorted(result.keys()), ['rt1', 'rt2'])
-      self.assertEqual(result['rt1'].tolist(), [[1, 2, 3], [4]])
-      self.assertEqual(result['rt2'].tolist(), [[[], [1, 2]], [[3]]])
+      self.assertEqual(result['rt1'].to_list(), [[1, 2, 3], [4]])
+      self.assertEqual(result['rt2'].to_list(), [[[], [1, 2]], [[3]]])
 
-  @test_util.run_deprecated_v1
   def testSessionRunFeed(self):
-    rt1 = ragged.from_row_splits(
+    if context.executing_eagerly():
+      return
+
+    rt1 = RaggedTensor.from_row_splits(
         array_ops.placeholder(dtypes.int32),
         array_ops.placeholder(dtypes.int64))
-    rt2 = ragged.from_nested_row_splits(
-        array_ops.placeholder(dtypes.int32),
-        [array_ops.placeholder(dtypes.int64),
-         array_ops.placeholder(dtypes.int64)])
+    rt2 = RaggedTensor.from_nested_row_splits(
+        array_ops.placeholder(dtypes.int32), [
+            array_ops.placeholder(dtypes.int64),
+            array_ops.placeholder(dtypes.int64)
+        ])
 
-    rt1_feed_val = ragged.constant_value([[1, 2, 3], [4]])
-    rt2_feed_val = ragged.constant_value([[[], [1, 2]], [[3]]])
+    rt1_feed_val = ragged_factory_ops.constant_value([[1, 2, 3], [4]])
+    rt2_feed_val = ragged_factory_ops.constant_value([[[], [1, 2]], [[3]]])
 
     with self.test_session() as session:
-      result = session.run({'rt1': rt1, 'rt2': rt2},
-                           feed_dict={rt1: rt1_feed_val,
-                                      rt2: rt2_feed_val})
+      result = session.run({
+          'rt1': rt1,
+          'rt2': rt2
+      },
+                           feed_dict={
+                               rt1: rt1_feed_val,
+                               rt2: rt2_feed_val
+                           })
       self.assertCountEqual(sorted(result.keys()), ['rt1', 'rt2'])
-      self.assertEqual(result['rt1'].tolist(), [[1, 2, 3], [4]])
-      self.assertEqual(result['rt2'].tolist(), [[[], [1, 2]], [[3]]])
+      self.assertEqual(result['rt1'].to_list(), [[1, 2, 3], [4]])
+      self.assertEqual(result['rt2'].to_list(), [[[], [1, 2]], [[3]]])
 
-  @test_util.run_deprecated_v1
   def testSessionPartialRunFeed(self):
+    if context.executing_eagerly():
+      return
+
     # Placeholder inputs.
-    a = ragged.from_row_splits(
+    a = RaggedTensor.from_row_splits(
         array_ops.placeholder(dtypes.int32, shape=[None], name='a.values'),
         array_ops.placeholder(dtypes.int64, name='a.row_splits'))
-    b = ragged.from_row_splits(
+    b = RaggedTensor.from_row_splits(
         array_ops.placeholder(dtypes.int32, shape=[None], name='b.values'),
         array_ops.placeholder(dtypes.int64, name='b.row_splits'))
     c = array_ops.placeholder(dtypes.int32, shape=[], name='c')
 
     # Feed values for placeholder inputs.
-    a_val = ragged.constant_value([[1, 2, 3], [4]])
-    b_val = ragged.constant_value([[5, 4, 3], [2]])
+    a_val = ragged_factory_ops.constant_value([[1, 2, 3], [4]])
+    b_val = ragged_factory_ops.constant_value([[5, 4, 3], [2]])
     c_val = 3
 
     # Compute some values.
-    r1 = ragged.reduce_sum(a * b, axis=1)
-    r2 = ragged.reduce_sum(a + c, axis=1)
+    r1 = ragged_math_ops.reduce_sum(a * b, axis=1)
+    r2 = ragged_math_ops.reduce_sum(a + c, axis=1)
 
     with self.test_session() as session:
       handle = session.partial_run_setup([r1, r2], [a, b, c])
 
       res1 = session.partial_run(handle, r1, feed_dict={a: a_val, b: b_val})
-      self.assertEqual(res1.tolist(), [22, 8])
+      self.assertAllEqual(res1, [22, 8])
 
       res2 = session.partial_run(handle, r2, feed_dict={c: c_val})
-      self.assertEqual(res2.tolist(), [15, 7])
-
+      self.assertAllEqual(res2, [15, 7])
 
 if __name__ == '__main__':
   googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_tensor_value.py b/tensorflow/python/ops/ragged/ragged_tensor_value.py
index 39d3249c991674a090d2dab4da8fb385b7463f13..c5e498e95fb5bca7ba2d5496a8af33bd8b8eb0fd 100644
--- a/tensorflow/python/ops/ragged/ragged_tensor_value.py
+++ b/tensorflow/python/ops/ragged/ragged_tensor_value.py
@@ -20,11 +20,17 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python.util.tf_export import tf_export
 
+
+@tf_export(v1=["ragged.RaggedTensorValue"])
 class RaggedTensorValue(object):
   """Represents the value of a `RaggedTensor`.
 
-  See `RaggedTensor` for a description of ragged tensors.
+  Warning: `RaggedTensorValue` should only be used in graph mode; in
+  eager mode, the `tf.RaggedTensor` class contains its value directly.
+
+  See `tf.RaggedTensor` for a description of ragged tensors.
   """
 
   def __init__(self, values, row_splits):
@@ -53,7 +59,7 @@ class RaggedTensorValue(object):
       doc="""The numpy dtype of values in this tensor.""")
 
   @property
-  def inner_values(self):
+  def flat_values(self):
     """The innermost `values` array for this ragged tensor value."""
     rt_values = self.values
     while isinstance(rt_values, RaggedTensorValue):
@@ -82,15 +88,18 @@ class RaggedTensorValue(object):
     return (self._row_splits.shape[0] - 1,) + (None,) + self._values.shape[1:]
 
   def __str__(self):
-    return "<RaggedTensorValue %s>" % self.tolist()
+    return "<tf.RaggedTensorValue %s>" % self.to_list()
 
   def __repr__(self):
-    return "RaggedTensorValue(values=%r, row_splits=%r)" % (self._values,
-                                                            self._row_splits)
+    return "tf.RaggedTensorValue(values=%r, row_splits=%r)" % (self._values,
+                                                               self._row_splits)
 
-  def tolist(self):
+  def to_list(self):
     """Returns this ragged tensor value as a nested Python list."""
-    values_as_list = self._values.tolist()
+    if isinstance(self._values, RaggedTensorValue):
+      values_as_list = self._values.to_list()
+    else:
+      values_as_list = self._values.tolist()
     return [
         values_as_list[self._row_splits[i]:self._row_splits[i + 1]]
         for i in range(len(self._row_splits) - 1)
diff --git a/tensorflow/python/ops/ragged/ragged_test_util.py b/tensorflow/python/ops/ragged/ragged_test_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..dcbab3021ecb483641e9376ec0cdfefa36fdd704
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_test_util.py
@@ -0,0 +1,96 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# pylint: disable=invalid-name
+"""Test utils for tensorflow RaggedTensors."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.ops.ragged import ragged_tensor_value
+
+
+class RaggedTensorTestCase(test_util.TensorFlowTestCase):
+  """Base class for RaggedTensor test cases."""
+
+  def _GetPyList(self, a):
+    """Converts a to a nested python list."""
+    if isinstance(a, ragged_tensor.RaggedTensor):
+      return self.evaluate(a).to_list()
+    elif isinstance(a, ops.Tensor):
+      a = self.evaluate(a)
+      return a.tolist() if isinstance(a, np.ndarray) else a
+    elif isinstance(a, np.ndarray):
+      return a.tolist()
+    elif isinstance(a, ragged_tensor_value.RaggedTensorValue):
+      return a.to_list()
+    else:
+      return np.array(a).tolist()
+
+  def assertRaggedEqual(self, a, b):
+    """Asserts that two potentially ragged tensors are equal."""
+    a_list = self._GetPyList(a)
+    b_list = self._GetPyList(b)
+    self.assertEqual(a_list, b_list)
+
+    if not (isinstance(a, (list, tuple)) or isinstance(b, (list, tuple))):
+      a_ragged_rank = a.ragged_rank if ragged_tensor.is_ragged(a) else 0
+      b_ragged_rank = b.ragged_rank if ragged_tensor.is_ragged(b) else 0
+      self.assertEqual(a_ragged_rank, b_ragged_rank)
+
+  def assertRaggedAlmostEqual(self, a, b, places=7):
+    a_list = self._GetPyList(a)
+    b_list = self._GetPyList(b)
+    self.assertNestedListAlmostEqual(a_list, b_list, places, context='value')
+
+    if not (isinstance(a, (list, tuple)) or isinstance(b, (list, tuple))):
+      a_ragged_rank = a.ragged_rank if ragged_tensor.is_ragged(a) else 0
+      b_ragged_rank = b.ragged_rank if ragged_tensor.is_ragged(b) else 0
+      self.assertEqual(a_ragged_rank, b_ragged_rank)
+
+  def assertNestedListAlmostEqual(self, a, b, places=7, context='value'):
+    self.assertEqual(type(a), type(b))
+    if isinstance(a, (list, tuple)):
+      self.assertLen(a, len(b), 'Length differs for %s' % context)
+      for i in range(len(a)):
+        self.assertNestedListAlmostEqual(a[i], b[i], places,
+                                         '%s[%s]' % (context, i))
+    else:
+      self.assertAlmostEqual(
+          a, b, places,
+          '%s != %s within %s places at %s' % (a, b, places, context))
+
+  def eval_to_list(self, tensor):
+    value = self.evaluate(tensor)
+    if ragged_tensor.is_ragged(value):
+      return value.to_list()
+    elif isinstance(value, np.ndarray):
+      return value.tolist()
+    else:
+      return value
+
+  def _eval_tensor(self, tensor):
+    if ragged_tensor.is_ragged(tensor):
+      return ragged_tensor_value.RaggedTensorValue(
+          self._eval_tensor(tensor.values),
+          self._eval_tensor(tensor.row_splits))
+    else:
+      return test_util.TensorFlowTestCase._eval_tensor(self, tensor)
diff --git a/tensorflow/python/ops/ragged/ragged_tile_op_test.py b/tensorflow/python/ops/ragged/ragged_tile_op_test.py
index f335b15dd1577c32dd8ab907f35ae65b66b3d00e..8c03b166531c3ce07d7543677e70529413b37648 100644
--- a/tensorflow/python/ops/ragged/ragged_tile_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_tile_op_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for ragged.tile."""
+"""Tests for ragged_array_ops.tile."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -26,10 +26,13 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops.ragged import ragged_array_ops
 from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
 
-class RaggedTileOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedTileOpTest(ragged_test_util.RaggedTensorTestCase,
+                       parameterized.TestCase):
 
   @parameterized.parameters([
       #=========================================================================
@@ -181,7 +184,6 @@ class RaggedTileOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
                     [[[5], [6]]]]),
 
   ])  # pyformat: disable
-  @test_util.run_deprecated_v1
   def testRaggedTile(self,
                      descr,
                      rt_input,
@@ -207,10 +209,8 @@ class RaggedTileOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
       self.assertEqual(tiled.shape.ndims, rt.shape.ndims)
       if multiples_tensor is const_multiples:
         self.assertEqual(tiled.shape.as_list(), expected_shape)
-      with self.test_session():
-        self.assertEqual(tiled.eval().tolist(), expected)
+      self.assertRaggedEqual(tiled, expected)
 
-  @test_util.run_deprecated_v1
   def testRaggedTileWithTensorInput(self):
     # When the input is a `Tensor`, ragged_tile just delegates to tf.tile.
     dt = constant_op.constant([[1, 2], [3, 4]])
@@ -218,8 +218,7 @@ class RaggedTileOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     expected = [[1, 2, 1, 2], [3, 4, 3, 4],
                 [1, 2, 1, 2], [3, 4, 3, 4],
                 [1, 2, 1, 2], [3, 4, 3, 4]]  # pyformat: disable
-    with self.test_session():
-      self.assertEqual(tiled.eval().tolist(), expected)
+    self.assertRaggedEqual(tiled, expected)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_to_sparse_op_test.py b/tensorflow/python/ops/ragged/ragged_to_sparse_op_test.py
index 69b31ad0e976cfb06264360cb27b7be8ff9fcf4a..92959a98bfeaa27f0db697656d51cf7e46b10327 100644
--- a/tensorflow/python/ops/ragged/ragged_to_sparse_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_to_sparse_op_test.py
@@ -18,183 +18,182 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_functional_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
 
-class RaggedTensorToSparseOpTest(test_util.TensorFlowTestCase):
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedTensorToSparseOpTest(ragged_test_util.RaggedTensorTestCase):
 
-  @test_util.run_deprecated_v1
   def testDocStringExample(self):
-    rt = ragged.constant([[1, 2, 3], [4], [], [5, 6]])
-    st = ragged.to_sparse(rt)
-    expected = ('SparseTensorValue(indices='
-                'array([[0, 0], [0, 1], [0, 2], [1, 0], [3, 0], [3, 1]]), '
-                'values=array([1, 2, 3, 4, 5, 6], dtype=int32), '
-                'dense_shape=array([4, 3]))')
-    with self.test_session():
-      self.assertEqual(' '.join(repr(st.eval()).split()), expected)
-
-  @test_util.run_deprecated_v1
+    rt = ragged_factory_ops.constant([[1, 2, 3], [4], [], [5, 6]])
+    st = self.evaluate(rt.to_sparse())
+    self.assertAllEqual(st.indices,
+                        [[0, 0], [0, 1], [0, 2], [1, 0], [3, 0], [3, 1]])
+    self.assertAllEqual(st.values, [1, 2, 3, 4, 5, 6])
+    self.assertAllEqual(st.dense_shape, [4, 3])
+
   def test2DRaggedTensorWithOneRaggedDimension(self):
-    rt = ragged.constant([['a', 'b'], ['c', 'd', 'e'], ['f'], [], ['g']])
-    with self.test_session():
-      st = ragged.to_sparse(rt).eval()
-      self.assertAllEqual(
-          st.indices, [[0, 0], [0, 1], [1, 0], [1, 1], [1, 2], [2, 0], [4, 0]])
-      self.assertAllEqual(st.values, b'a b c d e f g'.split())
-      self.assertAllEqual(st.dense_shape, [5, 3])
-
-  @test_util.run_deprecated_v1
+    rt = ragged_factory_ops.constant([['a', 'b'], ['c', 'd', 'e'], ['f'], [],
+                                      ['g']])
+    st = self.evaluate(rt.to_sparse())
+    self.assertAllEqual(
+        st.indices, [[0, 0], [0, 1], [1, 0], [1, 1], [1, 2], [2, 0], [4, 0]])
+    self.assertAllEqual(st.values, b'a b c d e f g'.split())
+    self.assertAllEqual(st.dense_shape, [5, 3])
+
   def test3DRaggedTensorWithOneRaggedDimension(self):
-    rt = ragged.constant([[[1, 2], [3, 4]], [[5, 6], [7, 8], [9, 10]],
-                          [[11, 12]], [], [[13, 14]]],
-                         ragged_rank=1)
-    with self.test_session():
-      st = ragged.to_sparse(rt).eval()
-      self.assertAllEqual(
-          st.indices, [[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 1, 1], [1, 0, 0],
-                       [1, 0, 1], [1, 1, 0], [1, 1, 1], [1, 2, 0], [1, 2, 1],
-                       [2, 0, 0], [2, 0, 1], [4, 0, 0], [4, 0, 1]])
-      self.assertAllEqual(st.values,
-                          [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])
-      self.assertAllEqual(st.dense_shape, [5, 3, 2])
-
-  @test_util.run_deprecated_v1
+    rt = ragged_factory_ops.constant(
+        [[[1, 2], [3, 4]], [[5, 6], [7, 8], [9, 10]], [[11, 12]], [], [[13, 14]]
+        ],
+        ragged_rank=1)
+    st = self.evaluate(rt.to_sparse())
+    self.assertAllEqual(st.indices,
+                        [[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 1, 1], [1, 0, 0],
+                         [1, 0, 1], [1, 1, 0], [1, 1, 1], [1, 2, 0], [1, 2, 1],
+                         [2, 0, 0], [2, 0, 1], [4, 0, 0], [4, 0, 1]])
+    self.assertAllEqual(st.values,
+                        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])
+    self.assertAllEqual(st.dense_shape, [5, 3, 2])
+
   def test4DRaggedTensorWithOneRaggedDimension(self):
-    rt = ragged.constant(
+    rt = ragged_factory_ops.constant(
         [[[[1, 2], [3, 4]], [[5, 6], [7, 8]]], [], [[[9, 10], [11, 12]]]],
         ragged_rank=1)
-    with self.test_session():
-      st = ragged.to_sparse(rt).eval()
-      self.assertAllEqual(st.values, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
-      self.assertAllEqual(
-          st.indices,
-          [
-              [0, 0, 0, 0],  # index for value=1
-              [0, 0, 0, 1],  # index for value=2
-              [0, 0, 1, 0],  # index for value=3
-              [0, 0, 1, 1],  # index for value=4
-              [0, 1, 0, 0],  # index for value=5
-              [0, 1, 0, 1],  # index for value=6
-              [0, 1, 1, 0],  # index for value=7
-              [0, 1, 1, 1],  # index for value=8
-              [2, 0, 0, 0],  # index for value=9
-              [2, 0, 0, 1],  # index for value=10
-              [2, 0, 1, 0],  # index for value=11
-              [2, 0, 1, 1],  # index for value=12
-          ])
-      self.assertAllEqual(st.dense_shape, [3, 2, 2, 2])
-
-  @test_util.run_deprecated_v1
+    st = self.evaluate(rt.to_sparse())
+    self.assertAllEqual(st.values, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
+    self.assertAllEqual(
+        st.indices,
+        [
+            [0, 0, 0, 0],  # index for value=1
+            [0, 0, 0, 1],  # index for value=2
+            [0, 0, 1, 0],  # index for value=3
+            [0, 0, 1, 1],  # index for value=4
+            [0, 1, 0, 0],  # index for value=5
+            [0, 1, 0, 1],  # index for value=6
+            [0, 1, 1, 0],  # index for value=7
+            [0, 1, 1, 1],  # index for value=8
+            [2, 0, 0, 0],  # index for value=9
+            [2, 0, 0, 1],  # index for value=10
+            [2, 0, 1, 0],  # index for value=11
+            [2, 0, 1, 1],  # index for value=12
+        ])
+    self.assertAllEqual(st.dense_shape, [3, 2, 2, 2])
+
   def test4DRaggedTensorWithTwoRaggedDimensions(self):
-    rt = ragged.constant([[[[1, 2], [3, 4]], [[5, 6], [7, 8], [9, 10]]],
-                          [[[11, 12]], [], [[13, 14]]], []],
-                         ragged_rank=2)
-    with self.test_session():
-      st = ragged.to_sparse(rt).eval()
-      self.assertAllEqual(
-          st.indices,
-          [
-              [0, 0, 0, 0],  # index for value=1
-              [0, 0, 0, 1],  # index for value=2
-              [0, 0, 1, 0],  # index for value=3
-              [0, 0, 1, 1],  # index for value=4
-              [0, 1, 0, 0],  # index for value=5
-              [0, 1, 0, 1],  # index for value=6
-              [0, 1, 1, 0],  # index for value=7
-              [0, 1, 1, 1],  # index for value=8
-              [0, 1, 2, 0],  # index for value=9
-              [0, 1, 2, 1],  # index for value=10
-              [1, 0, 0, 0],  # index for value=11
-              [1, 0, 0, 1],  # index for value=12
-              [1, 2, 0, 0],  # index for value=13
-              [1, 2, 0, 1],  # index for value=14
-          ])
-      self.assertAllEqual(st.values,
-                          [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])
-      self.assertAllEqual(st.dense_shape, [3, 3, 3, 2])
+    rt = ragged_factory_ops.constant(
+        [[[[1, 2], [3, 4]], [[5, 6], [7, 8], [9, 10]]],
+         [[[11, 12]], [], [[13, 14]]], []],
+        ragged_rank=2)
+    st = self.evaluate(rt.to_sparse())
+    self.assertAllEqual(
+        st.indices,
+        [
+            [0, 0, 0, 0],  # index for value=1
+            [0, 0, 0, 1],  # index for value=2
+            [0, 0, 1, 0],  # index for value=3
+            [0, 0, 1, 1],  # index for value=4
+            [0, 1, 0, 0],  # index for value=5
+            [0, 1, 0, 1],  # index for value=6
+            [0, 1, 1, 0],  # index for value=7
+            [0, 1, 1, 1],  # index for value=8
+            [0, 1, 2, 0],  # index for value=9
+            [0, 1, 2, 1],  # index for value=10
+            [1, 0, 0, 0],  # index for value=11
+            [1, 0, 0, 1],  # index for value=12
+            [1, 2, 0, 0],  # index for value=13
+            [1, 2, 0, 1],  # index for value=14
+        ])
+    self.assertAllEqual(st.values,
+                        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])
+    self.assertAllEqual(st.dense_shape, [3, 3, 3, 2])
 
   def testShape(self):
-    rt = ragged.constant([[1, 2], [3, 4, 5], [6], [], [7]])
-    st = ragged.to_sparse(rt)
+    rt = ragged_factory_ops.constant([[1, 2], [3, 4, 5], [6], [], [7]])
+    st = rt.to_sparse()
     self.assertEqual(st.indices.shape.as_list(), [7, 2])
     self.assertEqual(st.values.shape.as_list(), [7])
     self.assertEqual(st.dense_shape.shape.as_list(), [2])
 
-    rt = ragged.constant([[[1, 2]], [], [[3, 4]], []], ragged_rank=1)
-    st = ragged.to_sparse(rt)
+    rt = ragged_factory_ops.constant([[[1, 2]], [], [[3, 4]], []],
+                                     ragged_rank=1)
+    st = rt.to_sparse()
     self.assertEqual(st.indices.shape.as_list(), [4, 3])
     self.assertEqual(st.values.shape.as_list(), [4])
     self.assertEqual(st.dense_shape.shape.as_list(), [3])
 
-    rt = ragged.constant([[[1], [2, 3, 4, 5, 6, 7]], [[]]])
-    st = ragged.to_sparse(rt)
+    rt = ragged_factory_ops.constant([[[1], [2, 3, 4, 5, 6, 7]], [[]]])
+    st = rt.to_sparse()
     self.assertEqual(st.indices.shape.as_list(), [7, 3])
     self.assertEqual(st.values.shape.as_list(), [7])
     self.assertEqual(st.dense_shape.shape.as_list(), [3])
 
-  @test_util.run_deprecated_v1
   def testKernelErrors(self):
     # An empty vector, defined using a placeholder to ensure that we can't
     # determine that it's invalid at graph-construction time.
     empty_vector = array_ops.placeholder_with_default(
         array_ops.zeros([0], dtypes.int64), shape=None)
 
-    bad_rt1 = ragged.from_row_splits(row_splits=[2, 3], values=[1, 2, 3])
-    with self.test_session():
-      bad_split0_error = r'First value of ragged splits must be 0.*'
-      self.assertRaisesRegexp(errors.InvalidArgumentError, bad_split0_error,
-                              ragged.to_sparse(bad_rt1).eval)
+    bad_rt1 = ragged_tensor.RaggedTensor.from_row_splits(
+        row_splits=[2, 3], values=[1, 2, 3])
+    bad_split0 = r'First value of ragged splits must be 0.*'
+    with self.assertRaisesRegexp(errors.InvalidArgumentError, bad_split0):
+      self.evaluate(bad_rt1.to_sparse())
 
-    bad_rt2 = ragged.from_row_splits(row_splits=[0, 5], values=empty_vector)
-    bad_rt3 = ragged.from_row_splits(
+    bad_rt2 = ragged_tensor.RaggedTensor.from_row_splits(
+        row_splits=[0, 5], values=empty_vector)
+    bad_rt3 = ragged_tensor.RaggedTensor.from_row_splits(
         row_splits=[0, 1],
-        values=ragged.from_row_splits(row_splits=[0, 5], values=empty_vector))
-    with self.test_session():
-      split_mismatch1_error = r'Final value of ragged splits must match.*'
-      for rt in [bad_rt2, bad_rt3]:
-        self.assertRaisesRegexp(errors.InvalidArgumentError,
-                                split_mismatch1_error,
-                                ragged.to_sparse(rt).eval)
-
-    bad_rt4 = ragged.from_row_splits(
+        values=ragged_tensor.RaggedTensor.from_row_splits(
+            row_splits=[0, 5], values=empty_vector))
+    split_mismatch1_error = r'Final value of ragged splits must match.*'
+    for rt in [bad_rt2, bad_rt3]:
+      with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                   split_mismatch1_error):
+        self.evaluate(rt.to_sparse())
+
+    bad_rt4 = ragged_tensor.RaggedTensor.from_row_splits(
         row_splits=[0, 5],
-        values=ragged.from_row_splits(row_splits=[0], values=empty_vector))
-    with self.test_session():
-      split_mismatch2_error = r'Final value of ragged splits must match.*'
-      self.assertRaisesRegexp(errors.InvalidArgumentError,
-                              split_mismatch2_error,
-                              ragged.to_sparse(bad_rt4).eval)
-
-    bad_rt5 = ragged.from_row_splits(row_splits=empty_vector, values=[])
-    with self.test_session():
-      empty_splits_error = (r'ragged splits may not be empty.*')
-      self.assertRaisesRegexp(errors.InvalidArgumentError, empty_splits_error,
-                              ragged.to_sparse(bad_rt5).eval)
-
-  @test_util.run_deprecated_v1
+        values=ragged_tensor.RaggedTensor.from_row_splits(
+            row_splits=[0], values=empty_vector))
+    split_mismatch2_error = r'Final value of ragged splits must match.*'
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 split_mismatch2_error):
+      self.evaluate(bad_rt4.to_sparse())
+
+    bad_rt5 = ragged_tensor.RaggedTensor.from_row_splits(
+        row_splits=empty_vector, values=[])
+    empty_splits_error = (r'ragged splits may not be empty.*')
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 empty_splits_error):
+      self.evaluate(bad_rt5.to_sparse())
+
   def testGradient(self):
+    if context.executing_eagerly():
+      return
     # rt1.shape == rt2.shape == [2, (D2), (D3), 2].
-    rt1 = ragged.constant([[[[1.0, 2.0], [3.0, 4.0]], [[5.0, 6.0]]]],
-                          ragged_rank=2)
-    rt2 = ragged.constant([[[[9.0, 8.0], [7.0, 6.0]], [[5.0, 4.0]]]],
-                          ragged_rank=2)
-    rt = ragged.map_inner_values(math_ops.add, rt1, rt2 * 2.0)
-    st = ragged.to_sparse(rt)
-
-    g1, g2 = gradients_impl.gradients(st.values, [rt1.inner_values,
-                                                  rt2.inner_values])
+    rt1 = ragged_factory_ops.constant(
+        [[[[1.0, 2.0], [3.0, 4.0]], [[5.0, 6.0]]]], ragged_rank=2)
+    rt2 = ragged_factory_ops.constant(
+        [[[[9.0, 8.0], [7.0, 6.0]], [[5.0, 4.0]]]], ragged_rank=2)
+    rt = ragged_functional_ops.map_flat_values(math_ops.add, rt1, rt2 * 2.0)
+    st = rt.to_sparse()
+
+    g1, g2 = gradients_impl.gradients(st.values,
+                                      [rt1.flat_values, rt2.flat_values])
     print(g1, g2)
-    with self.test_session():
-      self.assertEqual(g1.eval().tolist(), [[1.0, 1.0], [1.0, 1.0], [1.0, 1.0]])
-      self.assertEqual(g2.eval().tolist(), [[2.0, 2.0], [2.0, 2.0], [2.0, 2.0]])
+    self.assertRaggedEqual(g1, [[1.0, 1.0], [1.0, 1.0], [1.0, 1.0]])
+    self.assertRaggedEqual(g2, [[2.0, 2.0], [2.0, 2.0], [2.0, 2.0]])
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_to_tensor_op_test.py b/tensorflow/python/ops/ragged/ragged_to_tensor_op_test.py
index 77499b9cb3cd067e926c2436b547a8c562c96e48..ac75456813fc05f1ce74d2f5ea09283fec22de90 100644
--- a/tensorflow/python/ops/ragged/ragged_to_tensor_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_to_tensor_op_test.py
@@ -23,24 +23,20 @@ from absl.testing import parameterized
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
 
-class RaggedTensorToTensorOpTest(test_util.TensorFlowTestCase,
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedTensorToTensorOpTest(ragged_test_util.RaggedTensorTestCase,
                                  parameterized.TestCase):
 
-  @test_util.run_deprecated_v1
   def testDocStringExamples(self):
     """Example from ragged_to_tensor.__doc__."""
-    rt = ragged.constant([[9, 8, 7], [], [6, 5], [4]])
-    dt = ragged.to_tensor(rt)
-    with self.test_session():
-      self.assertEqual(str(dt.eval()),
-                       '[[9 8 7]\n'
-                       ' [0 0 0]\n'
-                       ' [6 5 0]\n'
-                       ' [4 0 0]]')  # pyformat: disable
+    rt = ragged_factory_ops.constant([[9, 8, 7], [], [6, 5], [4]])
+    dt = rt.to_tensor()
+    self.assertAllEqual(dt, [[9, 8, 7], [0, 0, 0], [6, 5, 0], [4, 0, 0]])
 
   @parameterized.parameters(
       {
@@ -98,23 +94,21 @@ class RaggedTensorToTensorOpTest(test_util.TensorFlowTestCase,
           'expected': [[[[1], [2]], [[9], [9]], [[3], [9]]]],
       },
   )
-  @test_util.run_deprecated_v1
   def testRaggedTensorToTensor(self,
                                rt_input,
                                expected,
                                ragged_rank=None,
                                default=None,
                                expected_shape=None):
-    rt = ragged.constant(rt_input, ragged_rank=ragged_rank)
-    dt = ragged.to_tensor(rt, default)
-    self.assertEqual(type(dt), ops.Tensor)
+    rt = ragged_factory_ops.constant(rt_input, ragged_rank=ragged_rank)
+    dt = rt.to_tensor(default)
+    self.assertIsInstance(dt, ops.Tensor)
     self.assertEqual(rt.dtype, dt.dtype)
     self.assertTrue(dt.shape.is_compatible_with(rt.shape))
-    with self.test_session():
-      self.assertEqual(dt.eval().tolist(), expected)
-      if expected_shape is not None:
-        dt_shape = array_ops.shape(dt)
-        self.assertEqual(dt_shape.eval().tolist(), expected_shape)
+    self.assertAllEqual(self.eval_to_list(dt), expected)
+    if expected_shape is not None:
+      dt_shape = array_ops.shape(dt)
+      self.assertAllEqual(dt_shape, expected_shape)
 
   @parameterized.parameters(
       {
@@ -131,14 +125,13 @@ class RaggedTensorToTensorOpTest(test_util.TensorFlowTestCase,
       {
           'rt_input': [[1, 2, 3]],
           'default': 'a',
-          'error': (TypeError, "Expected int32, got 'a' of type 'str' instead"),
+          'error': (TypeError, '.*'),
       },
   )
-  @test_util.run_deprecated_v1
   def testError(self, rt_input, default, error, ragged_rank=None):
-    rt = ragged.constant(rt_input, ragged_rank=ragged_rank)
+    rt = ragged_factory_ops.constant(rt_input, ragged_rank=ragged_rank)
     with self.assertRaisesRegexp(error[0], error[1]):
-      ragged.to_tensor(rt, default)
+      rt.to_tensor(default)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_util_test.py b/tensorflow/python/ops/ragged/ragged_util_test.py
index c24ea65353104f78f9f4e3e90b0c73edb923c7e2..ab5436a91cc8440373798c65bdac3648319316f3 100644
--- a/tensorflow/python/ops/ragged/ragged_util_test.py
+++ b/tensorflow/python/ops/ragged/ragged_util_test.py
@@ -24,6 +24,7 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.ops.ragged import ragged_util
 from tensorflow.python.platform import googletest
 
@@ -41,7 +42,9 @@ TENSOR_4D = [[[[('%d%d%d%d' % (i, j, k, l)).encode('utf-8')
              for i in range(4)]
 
 
-class RaggedRepeatTest(test_util.TensorFlowTestCase, parameterized.TestCase):
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedUtilTest(ragged_test_util.RaggedTensorTestCase,
+                     parameterized.TestCase):
 
   @parameterized.parameters([
       # Docstring examples
@@ -89,8 +92,7 @@ class RaggedRepeatTest(test_util.TensorFlowTestCase, parameterized.TestCase):
   ])
   def testRepeat(self, data, repeats, expected, axis=None):
     result = ragged_util.repeat(data, repeats, axis)
-    with self.test_session():
-      self.assertEqual(result.eval().tolist(), expected)
+    self.assertAllEqual(result, expected)
 
   @parameterized.parameters([
       dict(mode=mode, **args)
@@ -155,8 +157,7 @@ class RaggedRepeatTest(test_util.TensorFlowTestCase, parameterized.TestCase):
       repeats = array_ops.placeholder_with_default(repeats, None)
 
     result = ragged_util.repeat(data, repeats, axis)
-    with self.test_session():
-      self.assertEqual(result.eval().tolist(), expected.tolist())
+    self.assertAllEqual(result, expected)
 
   @parameterized.parameters([
       dict(
diff --git a/tensorflow/python/ops/ragged/ragged_where_op_test.py b/tensorflow/python/ops/ragged/ragged_where_op_test.py
index de83a54977101f7d1fd1cd45d3aa013d817e6aa0..3dd95658265de90a71f59ab4ae7c38ad80579cec 100644
--- a/tensorflow/python/ops/ragged/ragged_where_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_where_op_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for ragged.where."""
+"""Tests for ragged_array_ops.where."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -21,29 +21,39 @@ from __future__ import print_function
 from absl.testing import parameterized
 
 from tensorflow.python.framework import test_util
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_array_ops
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
 
-class RaggedWhereOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedWhereOpTest(ragged_test_util.RaggedTensorTestCase,
+                        parameterized.TestCase):
 
   @parameterized.parameters([
       #=========================================================================
       # Docstring Examples
       #=========================================================================
       dict(  # shape=[D1, (D2)]
-          condition=ragged.constant_value([[True, False, True], [False, True]]),
+          condition=ragged_factory_ops.constant_value(
+              [[True, False, True], [False, True]]),
           expected=[[0, 0], [0, 2], [1, 1]]),
       dict(  # shape=[D1, (D2)]
-          condition=ragged.constant_value([[True, False, True], [False, True]]),
-          x=ragged.constant_value([['A', 'B', 'C'], ['D', 'E']]),
-          y=ragged.constant_value([['a', 'b', 'c'], ['d', 'e']]),
-          expected=ragged.constant_value([[b'A', b'b', b'C'], [b'd', b'E']])),
+          condition=ragged_factory_ops.constant_value(
+              [[True, False, True], [False, True]]),
+          x=ragged_factory_ops.constant_value(
+              [['A', 'B', 'C'], ['D', 'E']]),
+          y=ragged_factory_ops.constant_value(
+              [['a', 'b', 'c'], ['d', 'e']]),
+          expected=ragged_factory_ops.constant_value(
+              [[b'A', b'b', b'C'], [b'd', b'E']])),
       dict(  # shape=[D1, (D2)]
-          condition=ragged.constant_value([True, False]),
-          x=ragged.constant_value([['A', 'B', 'C'], ['D', 'E']]),
-          y=ragged.constant_value([['a', 'b', 'c'], ['d', 'e']]),
-          expected=ragged.constant_value([[b'A', b'B', b'C'], [b'd', b'e']])),
+          condition=ragged_factory_ops.constant_value([True, False]),
+          x=ragged_factory_ops.constant_value([['A', 'B', 'C'], ['D', 'E']]),
+          y=ragged_factory_ops.constant_value([['a', 'b', 'c'], ['d', 'e']]),
+          expected=ragged_factory_ops.constant_value(
+              [[b'A', b'B', b'C'], [b'd', b'e']])),
       #=========================================================================
       # Coordinate-retrieval mode
       #=========================================================================
@@ -54,24 +64,25 @@ class RaggedWhereOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
           condition=[[True, False], [False, True]],
           expected=[[0, 0], [1, 1]]),
       dict(  # shape=[D1, (D2)]
-          condition=ragged.constant_value([[True, False, True], [False, True]]),
+          condition=ragged_factory_ops.constant_value(
+              [[True, False, True], [False, True]]),
           expected=[[0, 0], [0, 2], [1, 1]]),
       dict(  # shape=[D1, (D2), (D3)]
-          condition=ragged.constant_value([
+          condition=ragged_factory_ops.constant_value([
               [[True, False, True], [False, True]],
               [[True], [], [False], [False, True, False]]
           ]),
           expected=[[0, 0, 0], [0, 0, 2], [0, 1, 1],
                     [1, 0, 0], [1, 3, 1]]),
       dict(  # shape=[D1, (D2), D3]
-          condition=ragged.constant_value([
+          condition=ragged_factory_ops.constant_value([
               [[True, False], [False, True]],
               [[True, False], [False, False], [True, False], [False, True]]
           ], ragged_rank=1),
           expected=[[0, 0, 0], [0, 1, 1],
                     [1, 0, 0], [1, 2, 0], [1, 3, 1]]),
       dict(  # shape=[D1, (D2), (D3), (D4)]
-          condition=ragged.constant_value([
+          condition=ragged_factory_ops.constant_value([
               [[[], [True]]],
               [[[True, False, True], [False, True]],
                [[True], [], [False], [False, True, False]]]
@@ -98,44 +109,46 @@ class RaggedWhereOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
           y=[['a', 'b'], ['d', 'e']],
           expected=[[b'A', b'b'], [b'd', b'E']]),
       dict(  # shape=[D1, (D2)]
-          condition=ragged.constant_value([[True, False, True], [False, True]]),
-          x=ragged.constant_value([['A', 'B', 'C'], ['D', 'E']]),
-          y=ragged.constant_value([['a', 'b', 'c'], ['d', 'e']]),
-          expected=ragged.constant_value([[b'A', b'b', b'C'], [b'd', b'E']])),
+          condition=ragged_factory_ops.constant_value(
+              [[True, False, True], [False, True]]),
+          x=ragged_factory_ops.constant_value([['A', 'B', 'C'], ['D', 'E']]),
+          y=ragged_factory_ops.constant_value([['a', 'b', 'c'], ['d', 'e']]),
+          expected=ragged_factory_ops.constant_value(
+              [[b'A', b'b', b'C'], [b'd', b'E']])),
       dict(  # shape=[D1, (D2), D3]
-          condition=ragged.constant_value([
+          condition=ragged_factory_ops.constant_value([
               [[True, False], [False, True]],
               [[True, False], [False, False], [True, False], [False, True]]
           ], ragged_rank=1),
-          x=ragged.constant_value([
+          x=ragged_factory_ops.constant_value([
               [['A', 'B'], ['C', 'D']],
               [['E', 'F'], ['G', 'H'], ['I', 'J'], ['K', 'L']]
           ], ragged_rank=1),
-          y=ragged.constant_value([
+          y=ragged_factory_ops.constant_value([
               [['a', 'b'], ['c', 'd']],
               [['e', 'f'], ['g', 'h'], ['i', 'j'], ['k', 'l']]
           ], ragged_rank=1),
-          expected=ragged.constant_value([
+          expected=ragged_factory_ops.constant_value([
               [[b'A', b'b'], [b'c', b'D']],
               [[b'E', b'f'], [b'g', b'h'], [b'I', b'j'], [b'k', b'L']]
           ], ragged_rank=1)),
       dict(  # shape=[D1, (D2), (D3), (D4)]
-          condition=ragged.constant_value([
+          condition=ragged_factory_ops.constant_value([
               [[[], [True]]],
               [[[True, False, True], [False, True]],
                [[True], [], [False], [False, True, False]]]
           ]),
-          x=ragged.constant_value([
+          x=ragged_factory_ops.constant_value([
               [[[], ['A']]],
               [[['B', 'C', 'D'], ['E', 'F']],
                [['G'], [], ['H'], ['I', 'J', 'K']]]
           ]),
-          y=ragged.constant_value([
+          y=ragged_factory_ops.constant_value([
               [[[], ['a']]],
               [[['b', 'c', 'd'], ['e', 'f']],
                [['g'], [], ['h'], ['i', 'j', 'k']]]
           ]),
-          expected=ragged.constant_value([
+          expected=ragged_factory_ops.constant_value([
               [[[], [b'A']]],
               [[[b'B', b'c', b'D'], [b'e', b'F']],
                [[b'G'], [], [b'h'], [b'i', b'J', b'k']]]
@@ -151,32 +164,26 @@ class RaggedWhereOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
           expected=[[b'A', b'B'], [b'c', b'd'], [b'E', b'F']]),
       dict(  # shape=[D1, (D2)]
           condition=[True, False, True],
-          x=ragged.constant_value([['A', 'B', 'C'], ['D', 'E'], ['F', 'G']]),
-          y=ragged.constant_value([['a', 'b'], ['c'], ['d', 'e']]),
-          expected=ragged.constant_value([[b'A', b'B', b'C'], [b'c'],
-                                          [b'F', b'G']])),
+          x=ragged_factory_ops.constant_value(
+              [['A', 'B', 'C'], ['D', 'E'], ['F', 'G']]),
+          y=ragged_factory_ops.constant_value(
+              [['a', 'b'], ['c'], ['d', 'e']]),
+          expected=ragged_factory_ops.constant_value(
+              [[b'A', b'B', b'C'], [b'c'], [b'F', b'G']])),
       dict(  # shape=[D1, (D2), (D3), (D4)]
-          condition=ragged.constant_value([True, False]),
-          x=ragged.constant_value([
+          condition=ragged_factory_ops.constant_value([True, False]),
+          x=ragged_factory_ops.constant_value([
               [[[], ['A']]],
               [[['B', 'C', 'D'], ['E', 'F']],
                [['G'], [], ['H'], ['I', 'J', 'K']]]
           ]),
-          y=ragged.constant_value([[[['a']]], [[['b']]]]),
-          expected=ragged.constant_value([[[[], [b'A']]], [[[b'b']]]])),
+          y=ragged_factory_ops.constant_value([[[['a']]], [[['b']]]]),
+          expected=ragged_factory_ops.constant_value(
+              [[[[], [b'A']]], [[[b'b']]]])),
   ])   # pyformat: disable
-  @test_util.run_deprecated_v1
   def testRaggedWhere(self, condition, expected, x=None, y=None):
-    result = ragged.where(condition, x, y)
-    self.assertEqual(
-        getattr(result, 'ragged_rank', 0), getattr(expected, 'ragged_rank', 0))
-    with self.test_session():
-      result_value = self.evaluate(result)
-      if hasattr(result_value, 'tolist'):
-        result_value = result_value.tolist()
-      if hasattr(expected, 'tolist'):
-        expected = expected.tolist()
-      self.assertEqual(result_value, expected)
+    result = ragged_array_ops.where(condition, x, y)
+    self.assertRaggedEqual(result, expected)
 
   @parameterized.parameters([
       dict(
@@ -185,15 +192,16 @@ class RaggedWhereOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
           error=ValueError,
           message='x and y must be either both None or both non-None'),
       dict(
-          condition=ragged.constant_value([[True, False, True], [False, True]]),
-          x=ragged.constant_value([['A', 'B', 'C'], ['D', 'E']]),
+          condition=ragged_factory_ops.constant_value([[True, False, True],
+                                                       [False, True]]),
+          x=ragged_factory_ops.constant_value([['A', 'B', 'C'], ['D', 'E']]),
           y=[['a', 'b'], ['d', 'e']],
           error=ValueError,
           message='Input shapes do not match.'),
   ])
   def testRaggedWhereErrors(self, condition, error, message, x=None, y=None):
     with self.assertRaisesRegexp(error, message):
-      ragged.where(condition, x, y)
+      ragged_array_ops.where(condition, x, y)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/segment_id_ops.py b/tensorflow/python/ops/ragged/segment_id_ops.py
index fa2970c3e75af36d3f042ab23ab70c8d2cdb36ca..42dc13223b67e2505578baefb783bc81182ec150 100644
--- a/tensorflow/python/ops/ragged/segment_id_ops.py
+++ b/tensorflow/python/ops/ragged/segment_id_ops.py
@@ -25,12 +25,14 @@ from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.ragged import ragged_util
+from tensorflow.python.util.tf_export import tf_export
 
 
 # For background on "segments" and "segment ids", see:
 # https://www.tensorflow.org/api_guides/python/math_ops#Segmentation
+@tf_export("ragged.row_splits_to_segment_ids")
 def row_splits_to_segment_ids(splits, name=None):
-  """Generates the segmentation corresponding to a RaggedTensor `splits` vector.
+  """Generates the segmentation corresponding to a RaggedTensor `row_splits`.
 
   Returns an integer vector `segment_ids`, where `segment_ids[i] == j` if
   `splits[j] <= i < splits[j+1]`.  Example:
@@ -63,8 +65,9 @@ def row_splits_to_segment_ids(splits, name=None):
 
 # For background on "segments" and "segment ids", see:
 # https://www.tensorflow.org/api_guides/python/math_ops#Segmentation
+@tf_export("ragged.segment_ids_to_row_splits")
 def segment_ids_to_row_splits(segment_ids, num_segments=None, name=None):
-  """Generates the RaggedTensor `splits` vector corresponding to a segmentation.
+  """Generates the RaggedTensor `row_splits` corresponding to a segmentation.
 
   Returns an integer vector `splits`, where `splits[0] = 0` and
   `splits[i] = splits[i-1] + count(segment_ids==i)`.  Example:
diff --git a/tensorflow/python/ops/random_ops.py b/tensorflow/python/ops/random_ops.py
index f2df87cf2d06c22e5995d220b058e50aa89b54df..62e2f6d1025bb9802a5b2a09a4dbffbe15921ace 100644
--- a/tensorflow/python/ops/random_ops.py
+++ b/tensorflow/python/ops/random_ops.py
@@ -357,9 +357,9 @@ def multinomial(logits, num_samples, seed=None, name=None, output_dtype=None):
     return multinomial_categorical_impl(logits, num_samples, output_dtype, seed)
 
 
-@tf_export("random.categorical", v1=[])
+@tf_export("random.categorical")
 def categorical(logits, num_samples, dtype=None, seed=None, name=None):
-  """Draws samples from a multinomial distribution.
+  """Draws samples from a categorical distribution.
 
   Example:
 
diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py
index 1066b357b43bb60d5e5b078846fcd82e12e941c3..6104cfa7ffe74499c465400bce1212fd36fad0a2 100644
--- a/tensorflow/python/ops/resource_variable_ops.py
+++ b/tensorflow/python/ops/resource_variable_ops.py
@@ -43,6 +43,7 @@ from tensorflow.python.ops.gen_resource_variable_ops import *
 # pylint: enable=wildcard-import
 from tensorflow.python.training.checkpointable import base as checkpointable
 from tensorflow.python.util import compat
+from tensorflow.python.util.deprecation import deprecated
 
 
 def get_resource_handle_data(graph_op):
@@ -685,6 +686,7 @@ class ResourceVariable(variables.RefVariable):
     raise NotImplementedError(
         "numpy() is only available when eager execution is enabled.")
 
+  @deprecated(None, "Prefer Dataset.range instead.")
   def count_up_to(self, limit):
     """Increments this variable until it reaches `limit`.
 
@@ -807,9 +809,6 @@ class ResourceVariable(variables.RefVariable):
     return ResourceVariable(
         variable_def=variable_def, import_scope=import_scope)
 
-  def _AsTensor(self):
-    return self.value()
-
   def _ref(self):
     """Unsupported."""
     raise NotImplementedError("ResourceVariable does not implement _ref()")
diff --git a/tensorflow/python/ops/signal/reconstruction_ops.py b/tensorflow/python/ops/signal/reconstruction_ops.py
index 0fc7fec23933d600c89513fb39d3a45856a8618b..4eaab4e0a0cd7958d56c9af3ccf2c5f69b35ee9b 100644
--- a/tensorflow/python/ops/signal/reconstruction_ops.py
+++ b/tensorflow/python/ops/signal/reconstruction_ops.py
@@ -18,46 +18,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops.signal import shape_ops
-from tensorflow.python.ops.signal import util_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
-def _shuffle_to_front(input_tensor, k):
-  """Shuffles the last `k` indices of `input_tensor` to the front.
-
-  Transposes `input_tensor` to have the last `k` indices at the front. The input
-  may have arbitrary rank and unknown shape.
-
-  Args:
-    input_tensor: A `Tensor` of arbitrary rank and unknown shape.
-    k: A scalar `Tensor` specifying how many indices to shuffle.
-
-  Returns:
-    A transposed version of `input_tensor` with `k` indices shuffled to the
-    front.
-
-  Raises:
-    ValueError: If `input_tensor` is not at least rank `k` or `k` is not scalar.
-  """
-  k = ops.convert_to_tensor(k, name="k")
-  k.shape.with_rank(0)
-  k_static = tensor_util.constant_value(k)
-  if k_static is not None:
-    input_tensor.shape.with_rank_at_least(k_static)
-
-  rank = array_ops.rank(input_tensor)
-  outer_indices, inner_indices = array_ops.split(math_ops.range(rank),
-                                                 [rank - k, k])
-  permutation = array_ops.concat([inner_indices, outer_indices], 0)
-
-  return array_ops.transpose(input_tensor, perm=permutation)
-
-
 @tf_export("signal.overlap_and_add")
 def overlap_and_add(signal, frame_step, name=None):
   """Reconstructs a signal from a framed representation.
@@ -80,8 +48,8 @@ def overlap_and_add(signal, frame_step, name=None):
     frames of `signal`'s inner-most two dimensions.
 
   Raises:
-    ValueError: If `signal`'s rank is less than 2, `frame_step` is not a scalar
-      integer or `frame_step` is greater than `frame_length`.
+    ValueError: If `signal`'s rank is less than 2, or `frame_step` is not a
+      scalar integer.
   """
   with ops.name_scope(name, "overlap_and_add", [signal, frame_step]):
     signal = ops.convert_to_tensor(signal, name="signal")
@@ -97,56 +65,91 @@ def overlap_and_add(signal, frame_step, name=None):
     # All dimensions that are not part of the overlap-and-add. Can be empty for
     # rank 2 inputs.
     outer_dimensions = signal_shape[:-2]
+    outer_rank = array_ops.size(outer_dimensions)
+
+    def full_shape(inner_shape):
+      return array_ops.concat([outer_dimensions, inner_shape], 0)
 
-    # If frame_length and frame_step are known at graph construction time, check
-    # frame_step is less than or equal to frame_length.
-    frame_step_static = tensor_util.constant_value(frame_step)
-    if (frame_step_static is not None and signal.shape.ndims is not None and
-        signal.shape.dims[-1].value is not None):
-      if frame_step_static > signal.shape.dims[-1].value:
-        raise ValueError(
-            "frame_step (%d) must be less than or equal to "
-            "frame_length (%d)" % (
-                frame_step_static, signal.shape.dims[-1].value))
-      # If frame_length is equal to frame_step, there's no overlap so just
-      # reshape the tensor.
-      if frame_step_static == signal.shape.dims[-1].value:
-        return array_ops.reshape(signal, array_ops.concat(
-            [outer_dimensions, [-1]], 0))
-
-    signal_rank = array_ops.rank(signal)
-    frames = signal_shape[-2]
     frame_length = signal_shape[-1]
+    frames = signal_shape[-2]
 
-    subframe_length = util_ops.gcd(frame_length, frame_step)
-    subframe_step = frame_step // subframe_length
-    subframes_per_frame = frame_length // subframe_length
-    output_size = frame_step * (frames - 1) + frame_length
-    output_subframes = output_size // subframe_length
-
-    # To avoid overlap-adding sample-by-sample, we overlap-add at the "subframe"
-    # level, where a subframe is gcd(frame_length, frame_step). Reshape signal
-    # from [..., frames, frame_length] into [..., subframes, subframe_length].
-    subframe_shape = array_ops.concat(
-        [outer_dimensions, [-1, subframe_length]], 0)
-    subframe_signal = array_ops.reshape(signal, subframe_shape)
-
-    # Now we shuffle the last [subframes, subframe_length] dimensions to the
-    # front.
-    # TODO(rjryan): Add an axis argument to unsorted_segment_sum so we can
-    # avoid this pair of transposes.
-    subframe_signal = _shuffle_to_front(subframe_signal, 2)
-
-    # Use unsorted_segment_sum to add overlapping subframes together.
-    segment_ids = array_ops.reshape(shape_ops.frame(
-        math_ops.range(output_subframes), subframes_per_frame, subframe_step,
-        pad_end=False), [-1])
-    result = math_ops.unsorted_segment_sum(subframe_signal, segment_ids,
-                                           num_segments=output_subframes)
-
-    # result is a [subframes, subframe_length, ...outer_dimensions] tensor. We
-    # return a [...outer_dimensions, output_size] tensor with a transpose and
-    # reshape.
-    result_shape = array_ops.concat([outer_dimensions, [output_size]], 0)
-    return array_ops.reshape(_shuffle_to_front(result, signal_rank - 2),
-                             result_shape)
+    # Compute output length.
+    output_length = frame_length + frame_step * (frames - 1)
+
+    # If frame_length is equal to frame_step, there's no overlap so just
+    # reshape the tensor.
+    frame_step_static = tensor_util.constant_value(frame_step)
+    if (frame_step_static is not None and signal.shape.dims is not None and
+        frame_step_static == signal.shape.dims[-1].value):
+      output_shape = full_shape([output_length])
+      return array_ops.reshape(signal, output_shape, name="fast_path")
+
+    # The following code is documented using this example:
+    #
+    # frame_step = 2
+    # signal.shape = (3, 5)
+    # a b c d e
+    # f g h i j
+    # k l m n o
+
+    # Compute the number of segments, per frame.
+    segments = -(-frame_length // frame_step)  # Divide and round up.
+
+    # Pad the frame_length dimension to a multiple of the frame step.
+    # Pad the frames dimension by `segments` so that signal.shape = (6, 6)
+    # a b c d e 0
+    # f g h i j 0
+    # k l m n o 0
+    # 0 0 0 0 0 0
+    # 0 0 0 0 0 0
+    # 0 0 0 0 0 0
+    paddings = [[0, segments], [0, segments * frame_step - frame_length]]
+    outer_paddings = array_ops.zeros([outer_rank, 2], dtypes.int32)
+    paddings = array_ops.concat([outer_paddings, paddings], 0)
+    signal = array_ops.pad(signal, paddings)
+
+    # Reshape so that signal.shape = (3, 6, 2)
+    # ab cd e0
+    # fg hi j0
+    # kl mn o0
+    # 00 00 00
+    # 00 00 00
+    # 00 00 00
+    shape = full_shape([frames + segments, segments, frame_step])
+    signal = array_ops.reshape(signal, shape)
+
+    # Transpose dimensions so that signal.shape = (3, 6, 2)
+    # ab fg kl 00 00 00
+    # cd hi mn 00 00 00
+    # e0 j0 o0 00 00 00
+    perm = array_ops.concat(
+        [math_ops.range(outer_rank), outer_rank + [1, 0, 2]], 0)
+    signal = array_ops.transpose(signal, perm)
+
+    # Reshape so that signal.shape = (18, 2)
+    # ab fg kl 00 00 00 cd hi mn 00 00 00 e0 j0 o0 00 00 00
+    shape = full_shape([(frames + segments) * segments, frame_step])
+    signal = array_ops.reshape(signal, shape)
+
+    # Truncate so that signal.shape = (15, 2)
+    # ab fg kl 00 00 00 cd hi mn 00 00 00 e0 j0 o0
+    signal = signal[..., :(frames + segments - 1) * segments, :]
+
+    # Reshape so that signal.shape = (3, 5, 2)
+    # ab fg kl 00 00
+    # 00 cd hi mn 00
+    # 00 00 e0 j0 o0
+    shape = full_shape([segments, (frames + segments - 1), frame_step])
+    signal = array_ops.reshape(signal, shape)
+
+    # Now, reduce over the columns, to achieve the desired sum.
+    signal = math_ops.reduce_sum(signal, -3)
+
+    # Flatten the array.
+    shape = full_shape([(frames + segments - 1) * frame_step])
+    signal = array_ops.reshape(signal, shape)
+
+    # Truncate to final length.
+    signal = signal[..., :output_length]
+
+    return signal
diff --git a/tensorflow/python/ops/sparse_ops.py b/tensorflow/python/ops/sparse_ops.py
index 245080cb2678cc5c8001fa094286cc90f67e663e..097b485a115fb8153f77d0ad24c63b872fb2e8ca 100644
--- a/tensorflow/python/ops/sparse_ops.py
+++ b/tensorflow/python/ops/sparse_ops.py
@@ -44,6 +44,9 @@ from tensorflow.python.ops.gen_sparse_ops import *
 # pylint: enable=wildcard-import
 from tensorflow.python.util import compat
 from tensorflow.python.util import deprecation
+from tensorflow.python.util import dispatch
+from tensorflow.python.util import tf_inspect
+from tensorflow.python.util.tf_export import get_canonical_name_for_symbol
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -296,7 +299,7 @@ def sparse_concat(axis,
 
 
 @tf_export("sparse.concat", v1=[])
-def sparse_concat_v2(axis, sp_inputs, expand_nonconcat_dim=False, name=None):  # pylint: disable=missing-docstring
+def sparse_concat_v2(axis, sp_inputs, expand_nonconcat_dims=False, name=None):  # pylint: disable=missing-docstring
   sp_inputs = _convert_to_sparse_tensors(sp_inputs)
 
   if len(sp_inputs) == 1:  # Degenerate case of one tensor.
@@ -306,7 +309,7 @@ def sparse_concat_v2(axis, sp_inputs, expand_nonconcat_dim=False, name=None):  #
   vals = [sp_input.values for sp_input in sp_inputs]
   shapes = [sp_input.dense_shape for sp_input in sp_inputs]
 
-  if expand_nonconcat_dim:
+  if expand_nonconcat_dims:
     max_shape = math_ops.reduce_max(
         array_ops.concat(
             [array_ops.reshape(shape, [1, -1]) for shape in shapes], 0), 0)
@@ -1090,6 +1093,9 @@ def sparse_reduce_max_v2(
 @deprecation.deprecated_endpoints("sparse_reduce_max")
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
+@deprecation.deprecated_args(
+    None, "reduction_axes is deprecated, use axis instead",
+    "reduction_axes")
 def sparse_reduce_max(sp_input, axis=None, keepdims=None,
                       reduction_axes=None, keep_dims=None):
   """Computes the max of elements across dimensions of a SparseTensor.
@@ -1138,7 +1144,7 @@ def sparse_reduce_max(sp_input, axis=None, keepdims=None,
     axis: The dimensions to reduce; list or scalar. If `None` (the
       default), reduces all dimensions.
     keepdims: If true, retain reduced dimensions with length 1.
-    reduction_axes: Deprecated name of axis.
+    reduction_axes: Deprecated name of `axis`.
     keep_dims:  Deprecated alias for `keepdims`.
 
   Returns:
@@ -1276,6 +1282,9 @@ def sparse_reduce_sum_v2(
 @deprecation.deprecated_endpoints("sparse_reduce_sum")
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
+@deprecation.deprecated_args(
+    None, "reduction_axes is deprecated, use axis instead",
+    "reduction_axes")
 def sparse_reduce_sum(sp_input, axis=None, keepdims=None,
                       reduction_axes=None, keep_dims=None):
   """Computes the sum of elements across dimensions of a SparseTensor.
@@ -1311,7 +1320,7 @@ def sparse_reduce_sum(sp_input, axis=None, keepdims=None,
     axis: The dimensions to reduce; list or scalar. If `None` (the
       default), reduces all dimensions.
     keepdims: If true, retain reduced dimensions with length 1.
-    reduction_axes: Deprecated name of axis.
+    reduction_axes: Deprecated name of `axis`.
     keep_dims: Deprecated alias for `keepdims`.
 
   Returns:
@@ -1422,7 +1431,7 @@ def sparse_tensor_to_dense(sp_input,
   """
   sp_input = _convert_to_sparse_tensor(sp_input)
 
-  return sparse_to_dense(
+  return gen_sparse_ops.sparse_to_dense(
       sp_input.indices,
       sp_input.dense_shape,
       sp_input.values,
@@ -2673,3 +2682,48 @@ def _take_many_sparse_from_tensors_map(sparse_map_op,
   output_shape.set_shape([rank])
 
   return sparse_tensor.SparseTensor(output_indices, output_values, output_shape)
+
+
+class _UnaryMapValueDispatcher(dispatch.OpDispatcher):
+  """OpDispatcher for unary ops that maps base function across sparse values."""
+
+  def __init__(self, original_func):
+    self._original_func = original_func
+    func_name = get_canonical_name_for_symbol(original_func)
+    arg_names = tf_inspect.getfullargspec(original_func)[0]
+    self._x = arg_names[0]
+    original_func.__doc__ = (
+        original_func.__doc__.rstrip() + "\n\n" +
+        ("    If `{x}` is a `SparseTensor`, returns\n"
+         "    `SparseTensor({x}.indices, tf.{func}({x}.values, ...), "
+         "{x}.dense_shape)`").format(x=self._x, func=func_name))
+
+  def handle(self, args, kwargs):
+    if args:
+      x, args = args[0], args[1:]
+    else:
+      kwargs = kwargs.copy()
+      x = kwargs.pop(self._x, None)
+    if isinstance(x, sparse_tensor.SparseTensor):
+      return sparse_tensor.SparseTensor(
+          indices=x.indices,
+          values=self._original_func(x.values, *args, **kwargs),
+          dense_shape=x.dense_shape)
+    else:
+      return self.NOT_SUPPORTED
+
+
+_UNARY_OPS = [
+    # TODO(b/120307967) Add dispatchers for additional TensorFlow ops.
+    math_ops.abs,
+    math_ops.negative,
+    math_ops.sign,
+    math_ops.square,
+    math_ops.sqrt,
+    math_ops.erf,
+    math_ops.tanh,
+    math_ops.bessel_i0e,
+    math_ops.bessel_i1e,
+]
+for unary_op in _UNARY_OPS:
+  _UnaryMapValueDispatcher(unary_op).register(unary_op)
diff --git a/tensorflow/python/ops/sparse_ops_test.py b/tensorflow/python/ops/sparse_ops_test.py
index 4ee1569249b5ccd3b38de7bb6c2bb5bce761c513..031069a0f017c5d7e80999d2aa6a3e5fd2cf10e6 100644
--- a/tensorflow/python/ops/sparse_ops_test.py
+++ b/tensorflow/python/ops/sparse_ops_test.py
@@ -18,18 +18,20 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.platform import googletest
 
 
 @test_util.run_all_in_graph_and_eager_modes
-class SparseOpsTest(test_util.TensorFlowTestCase):
+class SparseOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
 
   def testSparseEye(self):
     def test_one(n, m, as_tensors):
@@ -77,5 +79,23 @@ class SparseOpsTest(test_util.TensorFlowTestCase):
           d = sparse_ops.sparse_to_dense(s.indices, s.dense_shape, s.values)
           self.assertAllEqual(self.evaluate(d), expected_after)
 
+  @parameterized.parameters([
+      (math_ops.abs, [1.0, -1.0, 3.0, -4.0], [1.0, 1.0, 3.0, 4.0]),
+      (math_ops.negative, [1.0, -1.0, 3.0, -4.0], [-1.0, 1.0, -3.0, 4.0]),
+      (math_ops.sign, [3.0, -2.0, 0.0, -4.0], [1.0, -1.0, 0.0, -1.0]),
+      (math_ops.square, [1.0, -1.0, 3.0, -4.0], [1.0, 1.0, 9.0, 16.0]),
+  ])
+  def testUnarySparseDispatch(self, op, values, expected):
+    st = sparse_tensor.SparseTensor(
+        indices=[[0, 0], [0, 1], [2, 0], [2, 4]],
+        values=values,
+        dense_shape=[3, 6])
+    result = op(st)
+    result_value = self.evaluate(result)
+    self.assertAllEqual(result_value.indices, st.indices)
+    self.assertAllEqual(result_value.values, expected)
+    self.assertAllEqual(result_value.dense_shape, st.dense_shape)
+
+
 if __name__ == '__main__':
   googletest.main()
diff --git a/tensorflow/python/ops/standard_ops.py b/tensorflow/python/ops/standard_ops.py
index c614d072badbdf7927d6c889288e1cf4e8d988ef..ba3bd094923abf2929d9e64e9f9bdb7d60cf4c80 100644
--- a/tensorflow/python/ops/standard_ops.py
+++ b/tensorflow/python/ops/standard_ops.py
@@ -22,6 +22,8 @@ from __future__ import print_function
 
 import sys as _sys
 
+from tensorflow.python import autograph
+
 # pylint: disable=g-bad-import-order
 # Imports the following modules so that @RegisterGradient get executed.
 from tensorflow.python.ops import array_grad
@@ -69,6 +71,8 @@ from tensorflow.python.ops.math_ops import *
 from tensorflow.python.ops.numerics import *
 from tensorflow.python.ops.parsing_ops import *
 from tensorflow.python.ops.partitioned_variables import *
+from tensorflow.python.ops.ragged import ragged_dispatch as _ragged_dispatch
+from tensorflow.python.ops.ragged import ragged_operators as _ragged_operators
 from tensorflow.python.ops.random_ops import *
 from tensorflow.python.ops.script_ops import py_func
 from tensorflow.python.ops.session_ops import *
@@ -100,3 +104,7 @@ from tensorflow.python.ops.variable_scope import *
 from tensorflow.python.ops.variables import *
 # pylint: enable=wildcard-import
 # pylint: enable=g-bad-import-order
+
+
+# These modules were imported to set up RaggedTensor operators and dispatchers:
+del _ragged_dispatch, _ragged_operators
diff --git a/tensorflow/python/ops/state_ops.py b/tensorflow/python/ops/state_ops.py
index 76684f89f8ac9347486a115c12e0b4f5ff49ba30..71aaceee272f6e0acd8b8e860fb501eaed4bd61b 100644
--- a/tensorflow/python/ops/state_ops.py
+++ b/tensorflow/python/ops/state_ops.py
@@ -32,6 +32,8 @@ from tensorflow.python.ops import gen_state_ops
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.gen_state_ops import *
 # pylint: enable=wildcard-import
+from tensorflow.python.util import deprecation
+from tensorflow.python.util.deprecation import deprecated
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -223,6 +225,7 @@ def assign(ref, value, validate_shape=None, use_locking=None, name=None):
 
 
 @tf_export(v1=["count_up_to"])
+@deprecated(None, "Prefer Dataset.range instead.")
 def count_up_to(ref, limit, name=None):
   r"""Increments 'ref' until it reaches 'limit'.
 
@@ -595,7 +598,9 @@ def scatter_nd_sub(ref, indices, updates, use_locking=False, name=None):
       name=name))
 
 
-@tf_export("batch_scatter_update")
+@tf_export(v1=["batch_scatter_update"])
+@deprecation.deprecated(
+    "2018-11-29", "Use the batch_scatter_update method of Variable instead.")
 def batch_scatter_update(ref, indices, updates, use_locking=True, name=None):
   """Generalization of `tf.scatter_update` to axis different than 0.
 
diff --git a/tensorflow/python/ops/string_ops.py b/tensorflow/python/ops/string_ops.py
index b6b329c48653d75c94176ca2557a14c599757d17..9967f48060c2aefd7c5fe789c82f935751efc45c 100644
--- a/tensorflow/python/ops/string_ops.py
+++ b/tensorflow/python/ops/string_ops.py
@@ -38,6 +38,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.gen_string_ops import *
 from tensorflow.python.util import compat as util_compat
 from tensorflow.python.util import deprecation
+from tensorflow.python.util import dispatch
 from tensorflow.python.util.tf_export import tf_export
 # pylint: enable=g-bad-import-order
 # pylint: enable=wildcard-import
@@ -45,6 +46,7 @@ from tensorflow.python.util.tf_export import tf_export
 
 # pylint: disable=redefined-builtin
 @tf_export("strings.regex_full_match")
+@dispatch.add_dispatch_support
 def regex_full_match(input, pattern, name=None):
   r"""Match elements of `input` with regex `pattern`.
 
@@ -76,6 +78,7 @@ regex_full_match.__doc__ = gen_string_ops.regex_full_match.__doc__
 @tf_export(
     "strings.regex_replace", v1=["strings.regex_replace", "regex_replace"])
 @deprecation.deprecated_endpoints("regex_replace")
+@dispatch.add_dispatch_support
 def regex_replace(input, pattern, rewrite, replace_global=True, name=None):
   r"""Replace elements of `input` matching regex `pattern` with `rewrite`.
 
@@ -350,10 +353,13 @@ reduce_join.__doc__ = reduce_join.__doc__.replace("tf.reduce_join(",
 # This wrapper provides backwards compatibility for code that predates the
 # unit argument and that passed 'name' as a positional argument.
 @tf_export(v1=["strings.length"])
+@dispatch.add_dispatch_support
 def string_length(input, name=None, unit="BYTE"):
   return gen_string_ops.string_length(input, unit=unit, name=name)
 
+
 @tf_export("strings.length", v1=[])
+@dispatch.add_dispatch_support
 def string_length_v2(input, unit="BYTE", name=None):
   return string_length(input, name, unit)
 
@@ -361,7 +367,7 @@ def string_length_v2(input, unit="BYTE", name=None):
 string_length.__doc__ = gen_string_ops.string_length.__doc__
 
 
-@tf_export("substr")
+@tf_export(v1=["substr"])
 @deprecation.deprecated(None, "Use `tf.strings.substr` instead of `tf.substr`.")
 def substr_deprecated(input, pos, len, name=None, unit="BYTE"):
   return substr(input, pos, len, name=name, unit=unit)
@@ -370,16 +376,19 @@ substr_deprecated.__doc__ = gen_string_ops.substr.__doc__
 
 
 @tf_export(v1=["strings.substr"])
+@dispatch.add_dispatch_support
 def substr(input, pos, len, name=None, unit="BYTE"):
   return gen_string_ops.substr(input, pos, len, unit=unit, name=name)
 
+substr.__doc__ = gen_string_ops.substr.__doc__
+
 
 @tf_export("strings.substr", v1=[])
+@dispatch.add_dispatch_support
 def substr_v2(input, pos, len, unit="BYTE", name=None):
-  return substr(input, pos, len, name=name, unit=unit)
-
+  return gen_string_ops.substr(input, pos, len, unit=unit, name=name)
 
-substr.__doc__ = gen_string_ops.substr.__doc__
+substr_v2.__doc__ = gen_string_ops.substr.__doc__
 
 
 ops.NotDifferentiable("RegexReplace")
@@ -395,6 +404,7 @@ ops.NotDifferentiable("DecodeBase64")
 
 
 @tf_export("strings.to_number", v1=[])
+@dispatch.add_dispatch_support
 def string_to_number(input, out_type=dtypes.float32, name=None):
   r"""Converts each string in the input Tensor to the specified numeric type.
 
@@ -418,6 +428,7 @@ tf_export(v1=["strings.to_number", "string_to_number"])(
 
 
 @tf_export("strings.to_hash_bucket", v1=[])
+@dispatch.add_dispatch_support
 def string_to_hash_bucket(input, num_buckets, name=None):
   # pylint: disable=line-too-long
   r"""Converts each string in the input Tensor to its hash mod by a number of buckets.
diff --git a/tensorflow/python/ops/summary_op_util.py b/tensorflow/python/ops/summary_op_util.py
index c72a9aefc3fa53d2a94a5f84a44f728208d82915..93d8d50842ba681688e6d42890445ab4e6879124 100644
--- a/tensorflow/python/ops/summary_op_util.py
+++ b/tensorflow/python/ops/summary_op_util.py
@@ -21,10 +21,10 @@ from __future__ import print_function
 import contextlib
 import re
 
+from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.platform import tf_logging
-from tensorflow.python.training import distribution_strategy_context
 
 
 def collect(val, collections, default_collections):
diff --git a/tensorflow/python/ops/summary_ops_v2.py b/tensorflow/python/ops/summary_ops_v2.py
index 3f99b9f8773b3d26cf334044e0d127bf7443bfea..168cb975548095be4648a9e705deb797241363c7 100644
--- a/tensorflow/python/ops/summary_ops_v2.py
+++ b/tensorflow/python/ops/summary_ops_v2.py
@@ -58,14 +58,31 @@ _RUN_NAME_PATTERNS = re.compile(r"^[^\x00-\x1F<>]{0,512}$")
 _USER_NAME_PATTERNS = re.compile(r"^[a-z]([-a-z0-9]{0,29}[a-z0-9])?$", re.I)
 
 
-def should_record_summaries():
-  """Returns boolean Tensor which is true if summaries should be recorded."""
+def _should_record_summaries_internal():
+  """Returns boolean Tensor if summaries should/shouldn't be recorded, or None.
+  """
   global _SHOULD_RECORD_SUMMARIES
   key = ops.get_default_graph()._graph_key  # pylint: disable=protected-access
-  should = _SHOULD_RECORD_SUMMARIES.setdefault(key, False)
+  should = _SHOULD_RECORD_SUMMARIES.get(key)
   return should() if callable(should) else should
 
 
+def _should_record_summaries_v2():
+  """Returns boolean Tensor which is true if summaries should be recorded.
+
+  If no recording status has been set, this defaults to True, unlike the public
+  should_record_summaries().
+  """
+  result = _should_record_summaries_internal()
+  return True if result is None else result
+
+
+def should_record_summaries():
+  """Returns boolean Tensor which is true if summaries should be recorded."""
+  result = _should_record_summaries_internal()
+  return False if result is None else result
+
+
 @tf_contextlib.contextmanager
 def _record_summaries(boolean=True):
   """Sets summary recording on or off per the provided boolean value.
@@ -86,7 +103,7 @@ def _record_summaries(boolean=True):
   # TODO(nickfelt): make this threadlocal
   global _SHOULD_RECORD_SUMMARIES
   key = ops.get_default_graph()._graph_key  # pylint: disable=protected-access
-  old = _SHOULD_RECORD_SUMMARIES.setdefault(key, False)
+  old = _SHOULD_RECORD_SUMMARIES.setdefault(key, None)
   try:
     _SHOULD_RECORD_SUMMARIES[key] = boolean
     yield
@@ -370,6 +387,98 @@ def summary_writer_initializer_op():
   return _SUMMARY_WRITER_INIT_OP.setdefault(key, [])
 
 
+_INVALID_SCOPE_CHARACTERS = re.compile(r"[^-_/.A-Za-z0-9]")
+
+
+@tf_export("summary.summary_scope", v1=[])
+@tf_contextlib.contextmanager
+def summary_scope(name, default_name="summary", values=None):
+  """A context manager for use when defining a custom summary op.
+
+  This behaves similarly to `tf.name_scope`, except that it returns a generated
+  summary tag in addition to the scope name. The tag is structurally similar to
+  the scope name - derived from the user-provided name, prefixed with enclosing
+  name scopes if any - but we relax the constraint that it be uniquified, as
+  well as the character set limitation (so the user-provided name can contain
+  characters not legal for scope names; in the scope name these are removed).
+
+  This makes the summary tag more predictable and consistent for the user.
+
+  For example, to define a new summary op called `my_op`:
+
+  ```python
+  def my_op(name, my_value, step):
+    with tf.summary.summary_scope(name, "MyOp", [my_value]) as (tag, scope):
+      my_value = tf.convert_to_tensor(my_value)
+      return tf.summary.write(tag, my_value, step=step)
+  ```
+
+  Args:
+    name: string name for the summary.
+    default_name: Optional; if provided, used as default name of the summary.
+    values: Optional; passed as `values` parameter to name_scope.
+
+  Yields:
+    A tuple `(tag, scope)` as described above.
+  """
+  name = name or default_name
+  current_scope = ops.get_name_scope()
+  tag = current_scope + "/" + name if current_scope else name
+  # Strip illegal characters from the scope name, and if that leaves nothing,
+  # use None instead so we pick up the default name.
+  name = _INVALID_SCOPE_CHARACTERS.sub("", name) or None
+  with ops.name_scope(name, default_name, values) as scope:
+    yield tag, scope
+
+
+@tf_export("summary.write", v1=[])
+def write(tag, tensor, step, metadata=None, name=None):
+  """Writes a generic summary to the default SummaryWriter if one exists.
+
+  This exists primarily to support the definition of type-specific summary ops
+  like scalar() and image(), and is not intended for direct use unless defining
+  a new type-specific summary op.
+
+  Args:
+    tag: string tag used to identify the summary (e.g. in TensorBoard), usually
+      generated with `tf.summary.summary_scope`
+    tensor: the Tensor holding the summary data to write
+    step: `int64`-castable monotic step value for this summary
+    metadata: Optional SummaryMetadata, as a proto or serialized bytes
+    name: Optional string name for this op.
+
+  Returns:
+    True on success, or false if no summary was written because no default
+    summary writer was available.
+  """
+  with ops.name_scope(name, "write_summary") as scope:
+    if context.context().summary_writer_resource is None:
+      return constant_op.constant(False)
+    if metadata is None:
+      serialized_metadata = constant_op.constant(b"")
+    elif hasattr(metadata, "SerializeToString"):
+      serialized_metadata = constant_op.constant(metadata.SerializeToString())
+    else:
+      serialized_metadata = metadata
+
+    def record():
+      """Record the actual summary and return True."""
+      # Note the identity to move the tensor to the CPU.
+      with ops.device("cpu:0"):
+        write_summary_op = gen_summary_ops.write_summary(
+            context.context().summary_writer_resource,
+            step,
+            array_ops.identity(tensor),
+            tag,
+            serialized_metadata,
+            name=scope)
+        with ops.control_dependencies([write_summary_op]):
+          return constant_op.constant(True)
+
+    return smart_cond.smart_cond(
+        _should_record_summaries_v2(), record, _nothing, name="summary_cond")
+
+
 def summary_writer_function(name, tensor, function, family=None):
   """Helper function to write summaries.
 
diff --git a/tensorflow/python/ops/tensor_array_ops.py b/tensorflow/python/ops/tensor_array_ops.py
index e3375ad0abe0edb93977a0b52e6143f5911bccdb..85333ee6b561c2c593eed3b12caff419eb7c1c84 100644
--- a/tensorflow/python/ops/tensor_array_ops.py
+++ b/tensorflow/python/ops/tensor_array_ops.py
@@ -20,10 +20,8 @@ from __future__ import division
 from __future__ import print_function
 
 import contextlib
-import os
 import weakref
 
-from tensorflow.python import tf2
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -32,6 +30,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_util
 from tensorflow.python.ops import gen_control_flow_ops
 from tensorflow.python.ops import gen_data_flow_ops
 from tensorflow.python.ops import list_ops
@@ -40,10 +39,6 @@ from tensorflow.python.util import tf_should_use
 from tensorflow.python.util.tf_export import tf_export
 
 
-ENABLE_TENSOR_ARRAY_V2 = (
-    tf2.enabled() or os.getenv("TF_ENABLE_TENSOR_ARRAY_V2") is not None)
-
-
 # _GraphTensorArray accesses many of the hidden generated ops, but is in
 # fact built to wrap these methods.
 # pylint: disable=protected-access
@@ -586,7 +581,11 @@ class _GraphTensorArrayV2(object):
 
   def concat(self, name=None):
     """See TensorArray."""
-    raise NotImplementedError("TensorArray.concat")
+    value = list_ops.tensor_list_concat(
+        input_handle=self._flow, element_dtype=self._dtype, name=name)
+    if self._element_shape and self._element_shape[0].dims is not None:
+      value.set_shape([None] + self._element_shape[0].dims[1:])
+    return value
 
   @tf_should_use.should_use_result
   def unstack(self, value, name=None):
@@ -630,7 +629,30 @@ class _GraphTensorArrayV2(object):
   @tf_should_use.should_use_result
   def split(self, value, lengths, name=None):
     """See TensorArray."""
-    raise NotImplementedError("TensorArray.split")
+    with ops.name_scope(name, "TensorArraySplit", [self._flow, value, lengths]):
+      value = ops.convert_to_tensor(value, name="value")
+      lengths_64 = math_ops.to_int64(lengths)
+      if self._infer_shape and not context.executing_eagerly():
+        clengths = tensor_util.constant_value(lengths_64)
+        if value.shape.dims is not None:
+          if clengths is not None and clengths.max() == clengths.min():
+            self._merge_element_shape(
+                tensor_shape.TensorShape([clengths[0]]).concatenate(
+                    value.shape[1:]))
+      flow_out = list_ops.tensor_list_split(
+          tensor=value,
+          lengths=lengths_64,
+          element_shape=self._element_shape[0] if self._element_shape else None,
+          name=name)
+      ta = TensorArray(
+          dtype=self._dtype,
+          handle=self.handle,
+          flow=flow_out,
+          colocate_with_first_write_call=self._colocate_with_first_write_call)
+      ta._infer_shape = self._infer_shape
+      ta._element_shape = self._element_shape
+      ta._colocate_with = self._colocate_with
+      return ta
 
   def size(self, name=None):
     """See TensorArray."""
@@ -986,7 +1008,7 @@ class TensorArray(object):
     if context.executing_eagerly():
       implementation = _EagerTensorArray
     else:
-      if ENABLE_TENSOR_ARRAY_V2:
+      if control_flow_util.ENABLE_CONTROL_FLOW_V2:
         implementation = _GraphTensorArrayV2
       else:
         implementation = _GraphTensorArray
diff --git a/tensorflow/python/ops/variables.py b/tensorflow/python/ops/variables.py
index 4824c92a5adfbdf2a06a1dc82164523b85f0e890..d01b95666b3241b7c9e9a4caf3b6d6c375ff19fe 100644
--- a/tensorflow/python/ops/variables.py
+++ b/tensorflow/python/ops/variables.py
@@ -487,6 +487,10 @@ class Variable(six.with_metaclass(VariableMetaclass,
     """
     raise NotImplementedError
 
+  @deprecated(
+      None,
+      "Use Variable.read_value. Variables in 2.X are initialized "
+      "automatically both in eager and graph (inside tf.defun) contexts.")
   def initialized_value(self):
     """Returns the value of the initialized variable.
 
@@ -506,7 +510,10 @@ class Variable(six.with_metaclass(VariableMetaclass,
       A `Tensor` holding the value of this variable after its initializer
       has run.
     """
-    raise NotImplementedError
+    with ops.init_scope():
+      return control_flow_ops.cond(is_variable_initialized(self),
+                                   self.read_value,
+                                   lambda: self.initial_value)
 
   @property
   def initial_value(self):
@@ -637,37 +644,84 @@ class Variable(six.with_metaclass(VariableMetaclass,
     """
     raise NotImplementedError
 
+  def batch_scatter_update(self, sparse_delta, use_locking=False, name=None):
+    """Assigns `IndexedSlices` to this variable batch-wise.
+
+    Analogous to `batch_gather`. This assumes that this variable and the
+    sparse_delta IndexedSlices have a series of leading dimensions that are the
+    same for all of them, and the updates are performed on the last dimension of
+    indices. In other words, the dimensions should be the following:
+
+    `num_prefix_dims = sparse_delta.indices.ndims - 1`
+    `batch_dim = num_prefix_dims + 1`
+    `sparse_delta.updates.shape = sparse_delta.indices.shape + var.shape[
+         batch_dim:]`
+
+    where
+
+    `sparse_delta.updates.shape[:num_prefix_dims]`
+    `== sparse_delta.indices.shape[:num_prefix_dims]`
+    `== var.shape[:num_prefix_dims]`
+
+    And the operation performed can be expressed as:
+
+    `var[i_1, ..., i_n,
+         sparse_delta.indices[i_1, ..., i_n, j]] = sparse_delta.updates[
+            i_1, ..., i_n, j]`
+
+    When sparse_delta.indices is a 1D tensor, this operation is equivalent to
+    `scatter_update`.
+
+    To avoid this operation one can looping over the first `ndims` of the
+    variable and using `scatter_update` on the subtensors that result of slicing
+    the first dimension. This is a valid option for `ndims = 1`, but less
+    efficient than this implementation.
+
+    Args:
+      sparse_delta: `IndexedSlices` to be assigned to this variable.
+      use_locking: If `True`, use locking during the operation.
+      name: the name of the operation.
+
+    Returns:
+      A `Tensor` that will hold the new value of this variable after
+      the scattered subtraction has completed.
+
+    Raises:
+      ValueError: if `sparse_delta` is not an `IndexedSlices`.
+    """
+    raise NotImplementedError
+
   def scatter_nd_sub(self, indices, updates, name=None):
     """Applies sparse subtraction to individual values or slices in a Variable.
 
-    `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+    Assuming the variable has rank `P` and `indices` is a `Tensor` of rank `Q`.
 
-    `indices` must be integer tensor, containing indices into `ref`.
+    `indices` must be integer tensor, containing indices into self.
     It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
 
     The innermost dimension of `indices` (with length `K`) corresponds to
     indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
-    dimension of `ref`.
+    dimension of self.
 
     `updates` is `Tensor` of rank `Q-1+P-K` with shape:
 
     ```
-    [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+    [d_0, ..., d_{Q-2}, self.shape[K], ..., self.shape[P-1]].
     ```
 
     For example, say we want to add 4 scattered elements to a rank-1 tensor to
     8 elements. In Python, that update would look like this:
 
     ```python
-        ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+        v = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
         indices = tf.constant([[4], [3], [1] ,[7]])
         updates = tf.constant([9, 10, 11, 12])
-        op = ref.scatter_nd_sub(indices, updates)
+        op = v.scatter_nd_sub(indices, updates)
         with tf.Session() as sess:
           print sess.run(op)
     ```
 
-    The resulting update to ref would look like this:
+    The resulting update to v would look like this:
 
         [1, -9, 3, -6, -6, 6, 7, -4]
 
@@ -691,34 +745,34 @@ class Variable(six.with_metaclass(VariableMetaclass,
   def scatter_nd_add(self, indices, updates, name=None):
     """Applies sparse addition to individual values or slices in a Variable.
 
-    `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+    The Variable has rank `P` and `indices` is a `Tensor` of rank `Q`.
 
-    `indices` must be integer tensor, containing indices into `ref`.
+    `indices` must be integer tensor, containing indices into self.
     It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
 
     The innermost dimension of `indices` (with length `K`) corresponds to
     indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
-    dimension of `ref`.
+    dimension of self.
 
     `updates` is `Tensor` of rank `Q-1+P-K` with shape:
 
     ```
-    [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+    [d_0, ..., d_{Q-2}, self.shape[K], ..., self.shape[P-1]].
     ```
 
     For example, say we want to add 4 scattered elements to a rank-1 tensor to
     8 elements. In Python, that update would look like this:
 
     ```python
-        ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+        v = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
         indices = tf.constant([[4], [3], [1] ,[7]])
         updates = tf.constant([9, 10, 11, 12])
-        add = ref.scatter_nd_add(indices, updates)
+        add = v.scatter_nd_add(indices, updates)
         with tf.Session() as sess:
           print sess.run(add)
     ```
 
-    The resulting update to ref would look like this:
+    The resulting update to v would look like this:
 
         [1, 13, 3, 14, 14, 6, 7, 20]
 
@@ -742,34 +796,34 @@ class Variable(six.with_metaclass(VariableMetaclass,
   def scatter_nd_update(self, indices, updates, name=None):
     """Applies sparse assignment to individual values or slices in a Variable.
 
-    `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+    The Variable has rank `P` and `indices` is a `Tensor` of rank `Q`.
 
-    `indices` must be integer tensor, containing indices into `ref`.
+    `indices` must be integer tensor, containing indices into self.
     It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
 
     The innermost dimension of `indices` (with length `K`) corresponds to
     indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
-    dimension of `ref`.
+    dimension of self.
 
     `updates` is `Tensor` of rank `Q-1+P-K` with shape:
 
     ```
-    [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+    [d_0, ..., d_{Q-2}, self.shape[K], ..., self.shape[P-1]].
     ```
 
     For example, say we want to add 4 scattered elements to a rank-1 tensor to
     8 elements. In Python, that update would look like this:
 
     ```python
-        ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+        v = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
         indices = tf.constant([[4], [3], [1] ,[7]])
         updates = tf.constant([9, 10, 11, 12])
-        op = ref.scatter_nd_assign(indices, updates)
+        op = v.scatter_nd_assign(indices, updates)
         with tf.Session() as sess:
           print sess.run(op)
     ```
 
-    The resulting update to ref would look like this:
+    The resulting update to v would look like this:
 
         [1, 11, 3, 10, 9, 6, 7, 12]
 
@@ -790,6 +844,7 @@ class Variable(six.with_metaclass(VariableMetaclass,
     """
     raise NotImplementedError
 
+  @deprecated(None, "Prefer Dataset.range instead.")
   def count_up_to(self, limit):
     """Increments this variable until it reaches `limit`.
 
@@ -812,6 +867,9 @@ class Variable(six.with_metaclass(VariableMetaclass,
     """
     raise NotImplementedError
 
+  @deprecated(
+      None,
+      "Prefer Variable.assign which has equivalent behavior in 2.X.")
   def load(self, value, session=None):
     """Load new value into this variable.
 
@@ -845,7 +903,15 @@ class Variable(six.with_metaclass(VariableMetaclass,
     Raises:
         ValueError: Session is not passed and no default session
     """
-    raise NotImplementedError
+    if context.executing_eagerly():
+      self.assign(value)
+    else:
+      session = session or ops.get_default_session()
+      if session is None:
+        raise ValueError(
+            "Either session argument should be provided or default session "
+            "should be established")
+      session.run(self.initializer, {self.initializer.inputs[1]: value})
 
   # Conversion to tensor.
   @staticmethod
@@ -884,7 +950,7 @@ class Variable(six.with_metaclass(VariableMetaclass,
 
     def _run_op(a, *args, **kwargs):
       # pylint: disable=protected-access
-      return tensor_oper(a._AsTensor(), *args, **kwargs)
+      return tensor_oper(a.value(), *args, **kwargs)
 
     functools.update_wrapper(_run_op, tensor_oper)
     setattr(cls, operator, _run_op)
@@ -915,6 +981,18 @@ class Variable(six.with_metaclass(VariableMetaclass,
     """The name of this variable."""
     raise NotImplementedError
 
+  @property
+  def _shared_name(self):
+    """The shared name of the variable.
+
+      Unlike name(), shared_name doesn't have ":0" suffix. It is user-specified
+      name with name scope prefix.
+
+    Returns:
+      variable name.
+    """
+    return self.name[:self.name.index(":")]
+
   @property
   def initializer(self):
     """The initializer operation for this variable."""
@@ -950,8 +1028,8 @@ class Variable(six.with_metaclass(VariableMetaclass,
     raise NotImplementedError
 
   def get_shape(self):
-    """Alias of Variable.shape."""
-    raise NotImplementedError
+    """Alias of `Variable.shape`."""
+    return self.shape
 
   def to_proto(self, export_scope=None):
     """Converts a `Variable` to a `VariableDef` protocol buffer.
@@ -1555,16 +1633,6 @@ class RefVariable(VariableV1):
     """Conversion function for Graph.as_graph_element()."""
     return self._variable
 
-  def _AsTensor(self):  # pylint: disable=invalid-name
-    """Converts this variable to a Tensor.
-
-    See `tf.Variable.value`.
-
-    Returns:
-      A `Tensor` containing the value of the variable.
-    """
-    return self._snapshot
-
   def value(self):
     """Returns the last snapshot of this variable.
 
@@ -1656,30 +1724,6 @@ class RefVariable(VariableV1):
     """
     return self._variable.eval(session=session)
 
-  def initialized_value(self):
-    """Returns the value of the initialized variable.
-
-    You should use this instead of the variable itself to initialize another
-    variable with a value that depends on the value of this variable.
-
-    ```python
-    # Initialize 'v' with a random tensor.
-    v = tf.Variable(tf.truncated_normal([10, 40]))
-    # Use `initialized_value` to guarantee that `v` has been
-    # initialized before its value is used to initialize `w`.
-    # The random values are picked only once.
-    w = tf.Variable(v.initialized_value() * 2.0)
-    ```
-
-    Returns:
-      A `Tensor` holding the value of this variable after its initializer
-      has run.
-    """
-    with ops.init_scope():
-      return control_flow_ops.cond(is_variable_initialized(self),
-                                   self.read_value,
-                                   lambda: self.initial_value)
-
   @property
   def initial_value(self):
     """Returns the Tensor used as the initial value for the variable.
@@ -1842,6 +1886,55 @@ class RefVariable(VariableV1):
         use_locking=use_locking,
         name=name)
 
+  def batch_scatter_update(self, sparse_delta, use_locking=False, name=None):
+    """Assigns `IndexedSlices` to this variable batch-wise.
+
+    Analogous to `batch_gather`. This assumes that this variable and the
+    sparse_delta IndexedSlices have a series of leading dimensions that are the
+    same for all of them, and the updates are performed on the last dimension of
+    indices. In other words, the dimensions should be the following:
+
+    `num_prefix_dims = sparse_delta.indices.ndims - 1`
+    `batch_dim = num_prefix_dims + 1`
+    `sparse_delta.updates.shape = sparse_delta.indices.shape + var.shape[
+         batch_dim:]`
+
+    where
+
+    `sparse_delta.updates.shape[:num_prefix_dims]`
+    `== sparse_delta.indices.shape[:num_prefix_dims]`
+    `== var.shape[:num_prefix_dims]`
+
+    And the operation performed can be expressed as:
+
+    `var[i_1, ..., i_n,
+         sparse_delta.indices[i_1, ..., i_n, j]] = sparse_delta.updates[
+            i_1, ..., i_n, j]`
+
+    When sparse_delta.indices is a 1D tensor, this operation is equivalent to
+    `scatter_update`.
+
+    To avoid this operation one can looping over the first `ndims` of the
+    variable and using `scatter_update` on the subtensors that result of slicing
+    the first dimension. This is a valid option for `ndims = 1`, but less
+    efficient than this implementation.
+
+    Args:
+      sparse_delta: `IndexedSlices` to be assigned to this variable.
+      use_locking: If `True`, use locking during the operation.
+      name: the name of the operation.
+
+    Returns:
+      A `Tensor` that will hold the new value of this variable after
+      the scattered subtraction has completed.
+
+    Raises:
+      ValueError: if `sparse_delta` is not an `IndexedSlices`.
+    """
+    return state_ops.batch_scatter_update(
+        self, sparse_delta.indices, sparse_delta.values,
+        use_locking=use_locking, name=name)
+
   def scatter_nd_sub(self, indices, updates, name=None):
     """Applies sparse subtraction to individual values or slices in a Variable.
 
@@ -2021,6 +2114,7 @@ class RefVariable(VariableV1):
                                               new_axis_mask=new_axis_mask,
                                               shrink_axis_mask=shrink_axis_mask)
 
+  @deprecated(None, "Prefer Dataset.range instead.")
   def count_up_to(self, limit):
     """Increments this variable until it reaches `limit`.
 
@@ -2043,49 +2137,6 @@ class RefVariable(VariableV1):
     """
     return state_ops.count_up_to(self._variable, limit=limit)
 
-  def load(self, value, session=None):
-    """Load new value into this variable.
-
-    Writes new value to variable's memory. Doesn't add ops to the graph.
-
-    This convenience method requires a session where the graph
-    containing this variable has been launched. If no session is
-    passed, the default session is used.  See `tf.Session` for more
-    information on launching a graph and on sessions.
-
-    ```python
-    v = tf.Variable([1, 2])
-    init = tf.global_variables_initializer()
-
-    with tf.Session() as sess:
-        sess.run(init)
-        # Usage passing the session explicitly.
-        v.load([2, 3], sess)
-        print(v.eval(sess)) # prints [2 3]
-        # Usage with the default session.  The 'with' block
-        # above makes 'sess' the default session.
-        v.load([3, 4], sess)
-        print(v.eval()) # prints [3 4]
-    ```
-
-    Args:
-        value: New variable value
-        session: The session to use to evaluate this variable. If
-          none, the default session is used.
-
-    Raises:
-        ValueError: Session is not passed and no default session
-    """
-    if context.executing_eagerly():
-      self.assign(value)
-    else:
-      session = session or ops.get_default_session()
-      if session is None:
-        raise ValueError(
-            "Either session argument should be provided or default session "
-            "should be established")
-      session.run(self._initializer_op, {self._initializer_op.inputs[1]: value})
-
   # Conversion to tensor.
   @staticmethod
   def _TensorConversionFunction(v, dtype=None, name=None, as_ref=False):  # pylint: disable=invalid-name
@@ -2242,18 +2293,6 @@ class RefVariable(VariableV1):
     """The name of this variable."""
     return self._variable.name
 
-  @property
-  def _shared_name(self):
-    """The shared name of the variable.
-
-      Unlike name(), shared_name doesn't have ":0" suffix. It is user-specified
-      name with name scope prefix.
-
-    Returns:
-      variable name.
-    """
-    return self.name[:-2]
-
   @property
   def initializer(self):
     """The initializer operation for this variable."""
@@ -2288,10 +2327,6 @@ class RefVariable(VariableV1):
     """
     return self._variable.get_shape()
 
-  def get_shape(self):
-    """Alias of Variable.shape."""
-    return self.shape
-
   def to_proto(self, export_scope=None):
     """Converts a `Variable` to a `VariableDef` protocol buffer.
 
diff --git a/tensorflow/python/ops/while_v2.py b/tensorflow/python/ops/while_v2.py
index 59ca29e3bad97ae16fe9f8bd1f518b264885e6cd..3e5a8fcdfac5c7134112ff14f0a59664d2deb207 100644
--- a/tensorflow/python/ops/while_v2.py
+++ b/tensorflow/python/ops/while_v2.py
@@ -23,6 +23,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.core.framework import attr_value_pb2
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import func_graph as func_graph_module
@@ -51,13 +52,6 @@ from tensorflow.python.util import nest
 # to them and then pass those in as data inputs. This should probably be
 # handled in the CapturingGraph itself.
 
-# Op types that output a resource tensor representing a TensorArray handle.
-TENSOR_ARRAY_HANDLE_OPS = (
-    "TensorArrayV3",
-    "TensorArrayGradV3",
-    "TensorArrayGradWithShape",
-)
-
 
 def while_loop(cond,
                body,
@@ -105,7 +99,7 @@ def while_loop(cond,
 
     # Automatic control dependencies are added in defuns, but not in v1
     # graphs. Propagate that behavior here.
-    add_control_dependencies = util.in_defun()
+    add_control_dependencies = ops.get_default_graph()._add_control_dependencies
 
     # Build a `cond` wrapper that can handle the extra counter loop_var.
     def wrapped_cond(loop_counter, *args):
@@ -200,30 +194,6 @@ def while_loop(cond,
             " this as a loop variable." % str(external_capture))
         cond_graph.capture(external_capture)
 
-    # Export all tensors in the loop body that may be needed for gradient
-    # computation. We do this by accumulating the intermediate values in
-    # TensorLists.
-    intermediate_tensors = _get_intermediates(body_graph)
-
-    for intermediate_tensor in intermediate_tensors:
-      tensor_list = list_ops.empty_tensor_list(
-          element_dtype=intermediate_tensor.dtype,
-          element_shape=intermediate_tensor.shape,
-          max_num_elements=maximum_iterations)
-      loop_vars.append(tensor_list)
-      with cond_graph.as_default():
-        # Add a placeholder to cond_graph's inputs corresponding to the
-        # tensor_list.
-        cond_graph.capture(tensor_list)
-      with body_graph.as_default():
-        # Push the intermediate tensor to the tensor list. This captures the
-        # `tensor_list` as well.
-        appended_tensor_list = list_ops.tensor_list_push_back(
-            tensor_list,
-            intermediate_tensor)
-        # Add this modified tensor list to the list of outputs.
-        body_graph.outputs.append(appended_tensor_list)
-
     # Make sure that the shapes of the loop outputs are compatible with the
     # shape invariants, or the shapes of the loop vars if the invariants are not
     # specified.
@@ -272,33 +242,32 @@ def while_loop(cond,
 @ops.RegisterGradient("While")
 def _WhileGrad(op, *grads):  # pylint: disable=invalid-name
   """The gradient of a While op produced by while_loop."""
-  body_graph = _get_body_graph(op)
-
-  # Set the incoming gradient of TensorArray handles to None. The gradient
-  # implementation currently assumes all resource tensors correspond to float32
-  # ResourceVariables, which can lead to runtime shape errors when used with a
-  # TensorArray. This is a workaround until TensorArrays are reimplemented with
-  # TensorLists instead of resources.
-  # Also set the incoming gradient of non-trainable inputs to None. It is
-  # possible that we receive non-None gradients for non-trainable types in
-  # nested while loops because we accumulate outputs of the inner while as
-  # variant tensors which are trainable and hence receive zeros_like tensors in
-  # the gradient pass. The non-trainable tensors then receive the popped zeros
-  # tensor from this zeros variant. The gradient for the loop vars corresponding
-  # to these tensors is None or zeros (this happens only if the loop var is
-  # accumulated as well) in _grad_fn so we reset these.
+  cond_graph = _get_graph(op, "cond")
+  body_graph = _get_graph(op, "body")
+  orig_num_params = len(body_graph.outputs)
+
+  maximum_iterations = op.get_attr(
+      "_maximum_iterations") if _is_in_xla_context() else None
+  assert not _is_in_xla_context() or maximum_iterations is not None
+
+  # Set the incoming gradient of non-trainable inputs to None. It is possible
+  # that we receive non-None gradients for non-trainable types in nested while
+  # loops because we accumulate outputs of the inner while as variant tensors
+  # which are trainable and hence receive zeros_like tensors in the gradient
+  # pass. The non-trainable tensors then receive the popped zeros tensor from
+  # this zeros variant. The gradient for the loop vars corresponding to these
+  # tensors is None or zeros (this happens only if the loop var is accumulated
+  # as well) in _grad_fn so we reset these.
   # TODO(b/118712257): Remove the IsTrainable filter once we can handle None
   # output grads in _grad_fn.
   grads = [
-      None if _is_tensor_array_handle(output) or
-      not gradients_impl.IsTrainable(output) else grad
-      for grad, output in zip(grads, op.outputs)
+      None if not _is_trainable(output) else grad
+      for grad, output in zip(grads, body_graph.outputs)
   ]
 
   # Ensure that all non-resource trainable outputs have incoming gradients.
-  assert all(g is not None or o.dtype == dtypes.resource or
-             not gradients_impl.IsTrainable(o)
-             for o, g in zip(op.outputs, grads)
+  assert all(g is not None or o.dtype == dtypes.resource or not _is_trainable(o)
+             for o, g in zip(body_graph.outputs, grads)
             ), "All trainable loop vars must receive incoming gradients."
   # We compute the gradient for the sub-graph between trainable ys and xs
   # with non-None incoming gradients. We later pad the None's to the list of
@@ -307,32 +276,36 @@ def _WhileGrad(op, *grads):  # pylint: disable=invalid-name
       body_graph.outputs, body_graph.inputs, grads) if grad is not None])
 
   body_grad_graph, args = _create_grad_func(
-      ys, xs, non_none_grads, body_graph,
-      util.unique_grad_fn_name(body_graph.name), op)
+      ys, xs, non_none_grads, cond_graph, body_graph,
+      util.unique_grad_fn_name(body_graph.name), op, maximum_iterations)
 
-  intermediate_tensors = _get_intermediates(body_grad_graph)
+  if body_grad_graph.while_op_needs_rewrite:
+    # Modify 'op' to output the intermediate accumulators needed by the grad
+    # function.
+    # NOTE(skyewm): if there are any active sessions, this modification to `op`
+    # may make them unrunnable!
 
-  maximum_iterations = op.get_attr(
-      "_maximum_iterations") if _is_in_xla_context() else None
-  assert not _is_in_xla_context() or maximum_iterations is not None
-  for intermediate_tensor in intermediate_tensors:
-    tensor_list = list_ops.empty_tensor_list(
-        element_dtype=intermediate_tensor.dtype,
-        element_shape=intermediate_tensor.shape,
-        max_num_elements=maximum_iterations)
-
-    with body_grad_graph.as_default():
-      tensor_list_ph = body_grad_graph.capture(tensor_list, whitelisted=True)
-      # Push the intermediate tensor to the tensor list.
-      appended_tensor_list = list_ops.tensor_list_push_back(tensor_list_ph,
-                                                            intermediate_tensor)
-      # Add this modified tensor list to the list of outputs.
-      body_grad_graph.outputs.append(appended_tensor_list)
+    cond_graph.name += "_rewritten"
+    body_graph.name += "_rewritten"
+
+    new_inputs = body_grad_graph.empty_tensor_lists
+    new_outputs = body_graph.outputs[orig_num_params:]
+
+    op._set_func_attr("cond", util.create_new_tf_function(cond_graph))
+    op._set_func_attr("body", util.create_new_tf_function(body_graph))
+    op._set_type_list_attr("T", body_graph.output_types)
+    op._set_shape_list_attr("output_shapes", body_graph.output_shapes)
+    op._add_while_inputs(new_inputs)
+    op._add_outputs([t.dtype for t in new_outputs],
+                    [t.shape for t in new_outputs])
+    _copy_handle_data(new_outputs, op.outputs[orig_num_params:])
+
+  captured_inputs = _resolve_grad_captures(body_graph, body_grad_graph, op)
+  loop_vars = args + captured_inputs
 
   def grad_cond(counter, max_iters, *unused_args):
     return counter < max_iters
 
-  loop_vars = args + body_grad_graph.external_captures
   grad_cond_name = util.unique_grad_fn_name(op.get_attr("cond").name)
   cond_grad_graph = func_graph_module.func_graph_from_py_func(
       grad_cond_name, grad_cond, loop_vars, {},
@@ -354,8 +327,7 @@ def _WhileGrad(op, *grads):  # pylint: disable=invalid-name
   # See comment in while_loop.
   outputs = [array_ops.identity(t) for t in outputs]
 
-  # Set None as the output gradient for tensors with None input gradient
-  # e.g. TensorArray handles.
+  # Set None as the output gradient for tensors with None input gradient.
   # outputs[0] is the loop counter.
   # outputs[1] is the total number of loop iterations.
   index = 2
@@ -369,6 +341,24 @@ def _WhileGrad(op, *grads):  # pylint: disable=invalid-name
   return none_padded_outputs
 
 
+def _is_trainable(tensor):
+  """Returns whether the given tensor is trainable."""
+  if not gradients_impl.IsTrainable(tensor):
+    return False
+
+  # Special case: untrainable accumulator output. The gradients algorithm
+  # doesn't know about tensor lists of untrainable elements. In theory the
+  # tensor list gradient functions should return None as appropriate, but
+  # because we can't return None from the gradient function we filter out
+  # untrainable accumulator output here to avoid computing the gradient at all.
+  if tensor.op.type == "TensorListPopBack" and tensor.value_index == 0:
+    assert tensor.dtype == dtypes.variant
+    element_type = tensor.op.get_attr("element_dtype")
+    return gradients_impl.IsTrainable(element_type)
+
+  return True
+
+
 def _validate_and_convert_to_tensor(maximum_iterations):
   """Checks that `maximum_iterations` is valid.
 
@@ -410,20 +400,21 @@ def _validate_and_convert_to_tensor(maximum_iterations):
 
 
 # TODO(srbs): Pull this into common utils for cond_v2 and while_v2.
-def _get_body_graph(while_op):
-  """Returns `FuncGraph` for the while body.
+def _get_graph(while_op, func_attr_name):
+  """Returns `FuncGraph` for the given function attribute.
 
   Args:
     while_op: The While Operation.
+    func_attr_name: string
 
   Returns:
-    `FuncGraph` for the while body.
+    `FuncGraph`
   """
   # TODO(srbs): Handle TensorShapeProto in function_def_to_graph.input_shapes.
   input_shapes = [
       tensor_shape.TensorShape(s) for s in while_op.get_attr("output_shapes")
   ]
-  func_name = while_op.get_attr("body").name
+  func_name = while_op.get_attr(func_attr_name).name
   fdef = while_op.graph._get_function(func_name).definition
   # `while_op.graph` may not be the same as `ops.get_default_graph()` e.g.
   # if the `while_op` is in the body of another if/while/defun. We build the
@@ -436,7 +427,8 @@ def _get_body_graph(while_op):
   return func_graph
 
 
-def _create_grad_func(ys, xs, grads, func_graph, name, while_op):
+def _create_grad_func(ys, xs, grads, cond_graph, body_graph, name, while_op,
+                      max_iters):
   """Builds and returns the gradient FuncGraph of `func_graph` and its args.
 
   The returned grad_func_graph must be called with the returned
@@ -446,9 +438,11 @@ def _create_grad_func(ys, xs, grads, func_graph, name, while_op):
     ys: A `Tensor` or list of tensors to be differentiated.
     xs: A `Tensor` or list of tensors to be used for differentiation.
     grads: The incoming grads for `ys`.
-    func_graph: FuncGraph for the forward body function.
+    cond_graph: FuncGraph for the forward cond function.
+    body_graph: FuncGraph for the forward body function.
     name: Name of the returned gradient function.
     while_op: The forward While op.
+    max_iters: the maximum number of iterations, or None if no limit.
 
   Returns:
     2-tuple of (grad_func_graph, args).
@@ -464,9 +458,10 @@ def _create_grad_func(ys, xs, grads, func_graph, name, while_op):
   # `external_captures`.
   grad_func_graph = func_graph_module.func_graph_from_py_func(
       name,
-      lambda *args: _grad_fn(ys, xs, args, func_graph),
+      lambda *args: _grad_fn(ys, xs, args, body_graph),
       args, {},
-      func_graph=_WhileBodyGradFuncGraph(name, func_graph))
+      func_graph=_WhileBodyGradFuncGraph(name, cond_graph, body_graph,
+                                         max_iters))
 
   # Add the popped accumulators to the list of outputs.
   for internal_capture in grad_func_graph.internal_captures:
@@ -506,10 +501,11 @@ def _grad_fn(ys, xs, args, func_graph):
   # Build the gradient graph. Note that this builds the gradient computation of
   # func_graph in the current graph, which requires capturing tensors from
   # func_graph. The captured func_graph tensors are resolved to external tensors
-  # in _resolve_grad_inputs.
+  # after the forward While op has been rewritten in _resolve_grad_captures.
   # TODO(srbs): Mark GradientsHelper as public?
   grad_outs = gradients_impl._GradientsHelper(
-      ys, xs, grad_ys=grad_ys, src_graph=func_graph)
+      ys, xs, grad_ys=grad_ys, src_graph=func_graph,
+      unconnected_gradients="zero")
 
   # TODO(b/118712257): Handle the case when grad_outs has None's e.g. when there
   # is a tf.StopGradient in the loop body.
@@ -519,43 +515,45 @@ def _grad_fn(ys, xs, args, func_graph):
   return [counter + 1, total_iters] + grad_outs
 
 
-def _get_intermediates(func_graph):
-  """Returns all tensors in `func_graph` that should be accumulated."""
-  # We currently accumulate output tensors of most ops in the function and rely
-  # on the pruning pass to get rid of the unused accumulators at runtime.
-  # However, this can bloat the GraphDef and make debugging harder so we perform
-  # some optimizations.
-  #
-  # Optimization we currently perform:
-  # 1. We do not accumulate tensors which already have an accumulator
-  #    in the loop body.
-  # 2. We do not accumulate outputs of Identity nodes. When building the
-  #    FuncGraph, we add an Identity node for each output (see
-  #    `AutomaticControlDependencies.mark_as_return`). Accumulating outputs
-  #    of all these nodes bloats the GraphDef quite a bit so we remove those.
-  #    Since the gradient of an Identity node does not rely on its forward op's
-  #    input this is safe to do.
-  #
-  # Other possible optimizations:
-  # 1. Only accumulate tensors that will be required by the backward pass.
-  #    This will require running the gradient pass and hence would increase the
-  #    graph building time for the forward pass.
-  # 2. Do not accumulate Const nodes created inside the loop body.
-  # 3. Do not accumulate inputs that are passed as-is, e.g. loop invariants.
-  # TODO(srbs): 2 and 3 may be hard optimizations for the runtime optimizer
-  # since it requires knowledge of the while loop semantics. If so, consider
-  # doing those here.
-  intermediates = []
-
-  for op in func_graph.get_operations():
-    if op.type == "Identity":
-      continue
-    for o in op.outputs:
-      if (o != func_graph.inputs[0] and  # Loop counter.
-          o.dtype != dtypes.resource and  # Do not accumulate resource tensors.
-          _get_accumulator(o) is None):  # Has existing accumulator.
-        intermediates.append(o)
-  return intermediates
+def _resolve_grad_captures(body_graph, body_grad_graph, while_op):
+  """Returns the tensors to pass as captured inputs to `body_grad_graph`.
+
+  `body_grad_graph` may have external references to:
+  1. Its outer graph containing the input gradients. These are left as-is.
+  2. Accumulators captured from the forward-pass graph. These should have been
+     added as `while_op` outputs after the gradient graph was built. We replace
+     these with the corresponding output of `while_op`, i.e. a tensor in
+     `body_graph.outer_graph`. In the case of nested control flow or functions,
+     the gradient logic handling `body_grad_graph.outer_graph` will make sure
+     the tensor from `body_graph.outer_graph` is also correctly captured.
+
+  Args:
+    body_graph: FuncGraph. The forward-pass body function.
+    body_grad_graph: FuncGraph. The body gradients function.
+    while_op: The forward-pass While Operation calling `body_graph`.
+
+  Returns:
+    A list of input tensors to be passed as the captured inputs to
+      `body_grad_graph`.
+  """
+  new_capture_inputs = []
+  for t in body_grad_graph.external_captures:
+    # All values captured by gradient computation should be from the forward
+    # graph or a captured resource variable (note that input gradients are
+    # regular non-captured inputs).
+    if t.graph == body_graph:
+      # Captured accumulator
+      t = while_op.outputs[t.graph.outputs.index(t)]
+      # Note: We rely on the capturing logic of the gradient While op graph to
+      # correctly capture the tensors in `body_graph.outer_graph`. Both cond_v2
+      # and while_v2 handle this while building their gradient functions.
+      assert t.graph == body_graph.outer_graph
+    else:
+      # Captured resource variable
+      assert t.dtype == dtypes.resource
+
+    new_capture_inputs.append(t)
+  return new_capture_inputs
 
 
 def _get_accumulator(tensor):
@@ -629,9 +627,10 @@ class _WhileBodyGradFuncGraph(util.WhileBodyFuncGraph):
      b. Lookup the corresponding resource tensor in the forward outer graph and
         try to capture that.
   2. If the tensor is not of resource type:
-     a. Find the accumulator for that tensor.
-     b. Capture the forward While op output tensor corresponding to the
-        accumulator in this FuncGraph.
+     a. Create an accumulator for that tensor and output it from the forward
+        pass. Note this also requires adding it as an input to the forward pass.
+     b. Capture the accumulator from the forward pass in this FuncGraph. This
+        will later be resolved to the correct output of the forward While op.
      c. Pop a value from the captured placeholder and use it as the captured
         value for the forward pass tensor.
 
@@ -645,16 +644,25 @@ class _WhileBodyGradFuncGraph(util.WhileBodyFuncGraph):
   tensor.
 
   Attributes:
-    popped_tensor_lists: Dict from the captured accumulator placeholder to the
+    while_op_needs_rewrite: True if any non-resource intermediates were
+      captured, meaning the forward While op needs to be rewritten to output the
+      corresponding accumulators.
+    empty_tensor_lists: list of EmptyTensorList tensors to be used as initial
+      input to the new accumulators in the forward graph.
+    popped_tensor_lists: dict from the captured accumulator placeholder to the
       TensorList obtained after popping the intermediate tensor from it. The
       values of this dict need to be added to the list of outputs.
   """
 
-  def __init__(self, name, forward_graph):
+  def __init__(self, name, forward_cond_graph, forward_body_graph, max_iters):
     super(_WhileBodyGradFuncGraph, self).__init__(name)
+    self.empty_tensor_lists = []
     self.popped_tensor_lists = {}
     # FuncGraph for the body of the forward While op.
-    self._forward_graph = forward_graph
+    self._forward_graph = forward_body_graph
+    # FuncGraph for the cond of the forward While op.
+    self._forward_cond_graph = forward_cond_graph
+    self._maximum_iterations = max_iters
     # Dict from forward intermediate tensor to its indirectly captured tensor
     # in this graph. Indirect capturing happens in two ways:
     # 1. For non-resource tensors we capture their accumulators from the forward
@@ -663,13 +671,10 @@ class _WhileBodyGradFuncGraph(util.WhileBodyFuncGraph):
     # 2. For resource tensors we directly capture their corresponding tensor
     #    in the forward outer graph.
     self._indirect_captures = {}
-    # Dict from forward graph tensor to its corresponding tensor in
-    # `forward_graph.outer_graph`. For a non-resource tensor the value is the
-    # forward While op's "output" corresponding its accumulator. For a resource
-    # tensor it is the While op's "input" for the resource. Note: We disallow
-    # creation of resources inside the while loop so if a resource tensor exists
-    # inside while loop it must be a loop input.
-    self._inner_to_outer_tensor = {}
+
+  @property
+  def while_op_needs_rewrite(self):
+    return self.empty_tensor_lists
 
   def capture(self, tensor, name=None, whitelisted=False):
     """Selectively captures external tensors.
@@ -708,10 +713,6 @@ class _WhileBodyGradFuncGraph(util.WhileBodyFuncGraph):
 
     captured_tensor = self._indirect_captures.get(tensor)
     if captured_tensor is not None:
-      # For GradientTape housekeeping.
-      assert self._inner_to_outer_tensor[tensor] in self.captures
-      super(_WhileBodyGradFuncGraph, self)._capture_helper(
-          self._inner_to_outer_tensor[tensor], name)
       return captured_tensor
 
     if tensor.dtype == dtypes.resource:
@@ -736,36 +737,47 @@ class _WhileBodyGradFuncGraph(util.WhileBodyFuncGraph):
           index], "Resource tensors must be loop invariants %s." % str(
               self._forward_graph._while.inputs[index])
       tensor_in_outer_graph = self._forward_graph._while.inputs[index]
-      self._inner_to_outer_tensor[tensor] = tensor_in_outer_graph
       self._indirect_captures[tensor] = self.capture(
           tensor_in_outer_graph, whitelisted=True)
       return self._indirect_captures[tensor]
 
-    assert tensor not in self._inner_to_outer_tensor
-
-    accumulator = None
-
-    # Find the TensorList that was used to accumulate the tensors of this
-    # intermediate tensor.
+    # Create or find an existing accumulator output for `tensor` in the forward
+    # graph, and fetch from this accumulator in the gradient graph to get the
+    # raw intermediate value.
     accumulator = _get_accumulator(tensor)
     if accumulator is None:
-      raise ValueError("Reference to un-accumulated intermediate tensor: ",
-                       tensor.name)
-    assert accumulator.graph == self._forward_graph
-    # Get the While op output corresponding to the accumulator.
-    accumulator = self._forward_graph._while.outputs[self._forward_graph.outputs
-                                                     .index(accumulator)]
-
-    assert accumulator.graph == self._forward_graph.outer_graph
-    self._inner_to_outer_tensor[tensor] = accumulator
-
-    # Capture the `accumulator`.
-    accumulator_ph = super(_WhileBodyGradFuncGraph, self)._capture_helper(
+      # Create the initial empty tensor list.
+      with self._forward_graph.outer_graph.as_default():
+        tensor_list = list_ops.empty_tensor_list(
+            element_dtype=tensor.dtype, element_shape=tensor.shape,
+            max_num_elements=self._maximum_iterations)
+      self.empty_tensor_lists.append(tensor_list)
+
+      # Push the intermediate tensor to the tensor list. This captures
+      # `tensor_list`.
+      with self._forward_graph.as_default():
+        accumulator = list_ops.tensor_list_push_back(tensor_list, tensor)
+      # Add the modified tensor list to the list of outputs. This output will be
+      # all the accumulated values.
+      self._forward_graph.outputs.append(accumulator)
+
+      # Capture in the cond graph as well so the forward cond and body inputs
+      # match.
+      with self._forward_cond_graph.as_default():
+        self._forward_cond_graph.capture(tensor_list)
+
+    # Capture the accumulator tensor list in the gradient graph directly from
+    # the forward graph -- we'll later modify this to capture the final list
+    # output by the forward While op instead.
+    captured_accumulator = super(_WhileBodyGradFuncGraph, self)._capture_helper(
         accumulator, name)
+
+    # Pop the intermediate value from the tensor list in the gradient graph.
     new_tensor_list, captured_tensor = list_ops.tensor_list_pop_back(
-        accumulator_ph, element_dtype=tensor.dtype)
+        captured_accumulator, element_dtype=tensor.dtype)
+
     self._indirect_captures[tensor] = captured_tensor
-    self.popped_tensor_lists[accumulator_ph] = new_tensor_list
+    self.popped_tensor_lists[captured_accumulator] = new_tensor_list
     return captured_tensor
 
 
@@ -828,28 +840,6 @@ def _graph_name(graph):
   return "Base"
 
 
-def _is_tensor_array_handle(tensor):
-  """Returns whether tensor is a TensorArray handle."""
-  if tensor.dtype != dtypes.resource:
-    return False
-
-  if tensor.op.type == "While":
-    # We assume that any resource outputs of a While op correspond to a captured
-    # resource input (as opposed to a loop variable specified by the user).
-    # NOTE(skyewm): we could actually check this, but I can't think of when you
-    # would have a resource loop variable.
-    tensor = tensor.op.inputs[tensor.value_index]
-
-  # TODO(b/118452219): add test coverage for this.
-  tensor = func_graph_module.maybe_captured(tensor)
-
-  if isinstance(tensor, ops.EagerTensor):
-    # Eager execution doesn't quite support legacy tensorarray
-    return False
-
-  return tensor.op.type in TENSOR_ARRAY_HANDLE_OPS
-
-
 def _pack_sequence_as(structure_with_tas, loop_vars):
   """Like `nest.pack_sequence_as` but also replaces flows with TensorArrays."""
 
diff --git a/tensorflow/python/platform/googletest.py b/tensorflow/python/platform/googletest.py
index 4d34c508da8ef6a98db3f88ccb8bc77e6026aeaa..fe4b0d0d3767346f4300450f01d56a62e625cca4 100644
--- a/tensorflow/python/platform/googletest.py
+++ b/tensorflow/python/platform/googletest.py
@@ -112,6 +112,9 @@ def GetTempDir():
                               os.path.basename(tf_inspect.getfile(first_frame)))
       temp_dir = tempfile.mkdtemp(prefix=temp_dir.rstrip('.py'))
 
+    # Make sure we have the correct path separators.
+    temp_dir = temp_dir.replace('/', os.sep)
+
     def delete_temp_dir(dirname=temp_dir):
       try:
         file_io.delete_recursively(dirname)
@@ -119,6 +122,7 @@ def GetTempDir():
         logging.error('Error removing %s: %s', dirname, e)
 
     atexit.register(delete_temp_dir)
+
     _googletest_temp_dir = temp_dir
 
   return _googletest_temp_dir
@@ -142,7 +146,7 @@ def StatefulSessionAvailable():
   return False
 
 
-@tf_export('test.StubOutForTesting')
+@tf_export(v1=['test.StubOutForTesting'])
 class StubOutForTesting(object):
   """Support class for stubbing methods out for unit testing.
 
diff --git a/tensorflow/python/profiler/internal/run_metadata_test.py b/tensorflow/python/profiler/internal/run_metadata_test.py
index a8859f845b3889325f0d86e8e9be80bb63ac6449..f96d721f46e162ee6753377569aacb439cd591d5 100644
--- a/tensorflow/python/profiler/internal/run_metadata_test.py
+++ b/tensorflow/python/profiler/internal/run_metadata_test.py
@@ -169,7 +169,7 @@ class RunMetadataTest(test.TestCase):
     ret = _extract_node(run_meta, 'MatMul:MatMul')
     self.assertEqual(len(ret), 0)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only('b/120545219')
   def testLoopCPU(self):
     ops.reset_default_graph()
     with ops.device('/cpu:0'):
diff --git a/tensorflow/python/profiler/model_analyzer_test.py b/tensorflow/python/profiler/model_analyzer_test.py
index 8648f0b5148ecc6afcf0afe49ff91fe7c255e700..1c7c15be4fe5920ff06241175aff57bc52ac338e 100644
--- a/tensorflow/python/profiler/model_analyzer_test.py
+++ b/tensorflow/python/profiler/model_analyzer_test.py
@@ -76,6 +76,7 @@ class PrintModelAnalysisTest(test.TestCase):
                          '  ScalarW (1, 1/1 params)\n',
                          lib.CheckAndRemoveDoc(f.read()))
 
+  @test_util.run_v1_only('b/120545219')
   def testSelectEverythingDetail(self):
     ops.reset_default_graph()
     dev = '/device:GPU:0' if test.is_gpu_available() else '/device:CPU:0'
@@ -203,6 +204,7 @@ class PrintModelAnalysisTest(test.TestCase):
             lib.CheckAndRemoveDoc(f.read())[0:80])
         # pylint: enable=line-too-long
 
+  @test_util.run_v1_only('b/120545219')
   def testComplexCodeView(self):
     ops.reset_default_graph()
     outfile = os.path.join(test.get_temp_dir(), 'dump')
@@ -619,6 +621,7 @@ class PrintModelAnalysisTest(test.TestCase):
           else:
             self.assertEqual(len(gfile.ListDirectory(profile_dir)), 0)
 
+  @test_util.run_v1_only('b/120545219')
   def testAutoProfiling(self):
     ops.reset_default_graph()
     time_dir = os.path.join(test.get_temp_dir(), 'time')
@@ -706,6 +709,7 @@ class PrintModelAnalysisTest(test.TestCase):
                       exception_str)
       self.assertTrue(mat is None)
 
+  @test_util.run_v1_only('b/120545219')
   def testTrackPersistentBytes(self):
     ops.reset_default_graph()
     a = array_ops.constant(np.ones((100, 100)))
diff --git a/tensorflow/python/profiler/pprof_profiler_test.py b/tensorflow/python/profiler/pprof_profiler_test.py
index 120a0d0eaa6588fe06a49a229ce396a7c7ff6f06..3f5bd9e79be2254779e4b64507ef91baec3db49c 100644
--- a/tensorflow/python/profiler/pprof_profiler_test.py
+++ b/tensorflow/python/profiler/pprof_profiler_test.py
@@ -136,7 +136,7 @@ comment: 9
       profile.ParseFromString(profile_contents)
       self.assertEquals(expected_proto, str(profile))
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only('b/120545219')
   def testProfileWithWhileLoop(self):
     options = config_pb2.RunOptions()
     options.trace_level = config_pb2.RunOptions.FULL_TRACE
diff --git a/tensorflow/python/saved_model/BUILD b/tensorflow/python/saved_model/BUILD
index 31b5a59da2060e124daf3f89451364d2d01c8a52..71d9e34592b42829e3ec7787f07fd7bec5113ca7 100644
--- a/tensorflow/python/saved_model/BUILD
+++ b/tensorflow/python/saved_model/BUILD
@@ -12,6 +12,8 @@ licenses(["notice"])  # Apache 2.0
 exports_files(["LICENSE"])
 
 load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
+load("//tensorflow/core:platform/default/build_config.bzl", "tf_additional_all_protos")
 
 py_library(
     name = "saved_model",
@@ -21,6 +23,7 @@ py_library(
     deps = [
         ":builder",
         ":constants",
+        ":load",
         ":loader",
         ":main_op",
         ":save",
@@ -89,7 +92,7 @@ py_library(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:lib",
         "//tensorflow/python:platform",
-        "//tensorflow/python:training",
+        "//tensorflow/python:saver",
         "//tensorflow/python:util",
         "//tensorflow/python:variables",
     ],
@@ -168,14 +171,15 @@ py_test(
         ":signature_def_utils",
         ":tag_constants",
         "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:lib",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:saver_test_utils",
+        "//tensorflow/python:session",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:test_ops",
         "//tensorflow/python:training",
@@ -266,6 +270,14 @@ py_test(
     ],
 )
 
+tf_proto_library(
+    name = "saved_object_graph",
+    srcs = ["saved_object_graph.proto"],
+    cc_api_version = 2,
+    protodeps = tf_additional_all_protos(),
+    visibility = ["//tensorflow:internal"],
+)
+
 py_library(
     name = "save",
     srcs = [
@@ -273,16 +285,31 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
-        ":loader",
+        ":builder",
+        ":constants",
+        ":function_serialization",
+        ":saved_object_graph_py",
         ":signature_constants",
         ":signature_def_utils",
+        ":tag_constants",
         ":utils",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:lib",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:tensor_spec",
         "//tensorflow/python:util",
+        "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:def_function",
-        "//tensorflow/python/eager:test",
+        "//tensorflow/python/eager:function",
+        "//tensorflow/python/training/checkpointable:base",
+        "//tensorflow/python/training/checkpointable:tracking",
+        "//tensorflow/python/training/checkpointable:util",
     ],
 )
 
@@ -291,13 +318,70 @@ py_test(
     srcs = ["save_test.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":loader",
         ":save",
         ":signature_constants",
         ":tag_constants",
         "//tensorflow/python/eager:def_function",
+        "//tensorflow/python/eager:test",
         "@absl_py//absl/testing:parameterized",
     ],
 )
 
-# -----------------------------------------------------------------------------
-# Google-internal targets.  These must be at the end for syncrepo.
+py_library(
+    name = "load",
+    srcs = [
+        "load.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":constants",
+        ":function_deserialization",
+        ":loader",
+        ":saved_object_graph_py",
+        ":utils",
+        "//tensorflow/python:function",
+        "//tensorflow/python:lib",
+        "//tensorflow/python:util",
+        "//tensorflow/python/training/checkpointable:tracking",
+    ],
+)
+
+py_test(
+    name = "load_test",
+    srcs = ["load_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":load",
+        ":save",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:lib",
+        "//tensorflow/python:tensor_spec",
+        "//tensorflow/python/eager:def_function",
+        "//tensorflow/python/eager:test",
+        "//tensorflow/python/training/checkpointable:tracking",
+    ],
+)
+
+py_library(
+    name = "function_serialization",
+    srcs = [
+        "function_serialization.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":saved_object_graph_py",
+        "//tensorflow/python/eager:def_function",
+        "//tensorflow/python/eager:function",
+    ],
+)
+
+py_library(
+    name = "function_deserialization",
+    srcs = [
+        "function_deserialization.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = ["//tensorflow/python/eager:def_function"],
+)
diff --git a/tensorflow/python/saved_model/builder_impl.py b/tensorflow/python/saved_model/builder_impl.py
index fc84979b6ecb25e44bf7b8eeb5a670573fc21a40..f37d283a2a2cbb50faf62f1ae24cd69bd0f29d74 100644
--- a/tensorflow/python/saved_model/builder_impl.py
+++ b/tensorflow/python/saved_model/builder_impl.py
@@ -108,26 +108,6 @@ class _SavedModelBuilder(object):
     # weights.
     self._has_saved_variables = False
 
-  def _copy_assets_to_destination_dir(self, asset_filename_map):
-    """Copy all assets from source path to destination path."""
-    assets_destination_dir = saved_model_utils.get_or_create_assets_dir(
-        self._export_dir)
-
-    # Copy each asset from source path to destination path.
-    for asset_basename, asset_source_filepath in asset_filename_map.items():
-      asset_destination_filepath = os.path.join(
-          compat.as_bytes(assets_destination_dir),
-          compat.as_bytes(asset_basename))
-
-      # Only copy the asset file to the destination if it does not already
-      # exist. This is to ensure that an asset with the same name defined as
-      # part of multiple graphs is only copied the first time.
-      if not file_io.file_exists(asset_destination_filepath):
-        file_io.copy(asset_source_filepath, asset_destination_filepath)
-
-    tf_logging.info("Assets written to: %s",
-                    compat.as_text(assets_destination_dir))
-
   def _save_and_write_assets(self, meta_graph_def, assets_list=None):
     """Saves asset to the meta graph and writes asset files to disk.
 
@@ -145,7 +125,7 @@ class _SavedModelBuilder(object):
       return
 
     # Copy assets from source path to destination path.
-    self._copy_assets_to_destination_dir(asset_filename_map)
+    copy_assets_to_destination_dir(asset_filename_map, self._export_dir)
 
   def _tag_and_add_meta_graph(self, meta_graph_def, tags, signature_def_map):
     """Tags the meta graph def and adds it to the SavedModel.
@@ -470,7 +450,7 @@ class SavedModelBuilder(_SavedModelBuilder):
       return
 
     # Copy assets from source path to destination path.
-    self._copy_assets_to_destination_dir(asset_filename_map)
+    copy_assets_to_destination_dir(asset_filename_map, self._export_dir)
 
   def _maybe_add_main_op(self, main_op):
     """Adds main op to the SavedModel.
@@ -656,7 +636,7 @@ def _maybe_save_assets(write_fn, assets_to_add=None):
     if not asset_source_filepath:
       raise ValueError("Invalid asset filepath tensor %s" % asset_tensor)
 
-    asset_filename = _get_asset_filename_to_add(
+    asset_filename = get_asset_filename_to_add(
         asset_source_filepath, asset_filename_map)
 
     # Call the passed-in function that builds AssetFileDef proto and adds it
@@ -675,7 +655,7 @@ def _maybe_save_assets(write_fn, assets_to_add=None):
   return asset_filename_map
 
 
-def _get_asset_filename_to_add(asset_filepath, asset_filename_map):
+def get_asset_filename_to_add(asset_filepath, asset_filename_map):
   """Get a unique basename to add to the SavedModel if this file is unseen.
 
   Assets come from users as full paths, and we save them out to the
@@ -762,6 +742,27 @@ def _add_asset_to_metagraph(meta_graph_def, asset_filename, asset_tensor):
   asset_proto.tensor_info.name = asset_tensor.name
 
 
+def copy_assets_to_destination_dir(asset_filename_map, destination_dir):
+  """Copy all assets from source path to destination path."""
+  assets_destination_dir = saved_model_utils.get_or_create_assets_dir(
+      destination_dir)
+
+  # Copy each asset from source path to destination path.
+  for asset_basename, asset_source_filepath in asset_filename_map.items():
+    asset_destination_filepath = os.path.join(
+        compat.as_bytes(assets_destination_dir),
+        compat.as_bytes(asset_basename))
+
+    # Only copy the asset file to the destination if it does not already
+    # exist. This is to ensure that an asset with the same name defined as
+    # part of multiple graphs is only copied the first time.
+    if not file_io.file_exists(asset_destination_filepath):
+      file_io.copy(asset_source_filepath, asset_destination_filepath)
+
+  tf_logging.info("Assets written to: %s",
+                  compat.as_text(assets_destination_dir))
+
+
 def _add_asset_to_collection(asset_filename, asset_tensor):
   """Builds an asset proto and adds it to the asset collection of the graph.
 
diff --git a/tensorflow/python/saved_model/constants.py b/tensorflow/python/saved_model/constants.py
index 0cd5588a8f2a8ecd95b21ab75552ddf866d96d90..90511a409ed7eb34bede12f1ce9d665e0f1cc913 100644
--- a/tensorflow/python/saved_model/constants.py
+++ b/tensorflow/python/saved_model/constants.py
@@ -29,6 +29,9 @@ tf_export(
         "saved_model.ASSETS_DIRECTORY", "saved_model.constants.ASSETS_DIRECTORY"
     ]).export_constant(__name__, "ASSETS_DIRECTORY")
 
+# Subdirectory name containing unmanaged files from higher-level APIs.
+EXTRA_ASSETS_DIRECTORY = "assets.extra"
+
 # CollectionDef key containing SavedModel assets.
 ASSETS_KEY = "saved_model_assets"
 tf_export(
diff --git a/tensorflow/python/saved_model/function_deserialization.py b/tensorflow/python/saved_model/function_deserialization.py
new file mode 100644
index 0000000000000000000000000000000000000000..46bd69ad03143719f8a04d8ffec6be3d6b4037af
--- /dev/null
+++ b/tensorflow/python/saved_model/function_deserialization.py
@@ -0,0 +1,46 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tools for deserializing PolymorphicFunctions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.eager import def_function
+
+
+def recreate_polymorphic_function(
+    saved_polymorphic_function, defined_functions):
+  """Creates a PolymorphicFunction which runs restored function definitions."""
+  @def_function.function
+  def restored_function(*args):
+    """Calls a restored function."""
+    # Try calling each function, return a value from the first one whose
+    # signature matches.
+    # TODO(allenl): Consider re-populating the function cache directly.
+    # TODO(allenl): Functions saved with input_signatures should revive with
+    # input_signatures.
+    for monomorphic_function in saved_polymorphic_function.monomorphic_function:
+      try:
+        # TODO(allenl): Passing an explicit name here prevents invalid name
+        # errors. We should replace this with something based on the actual
+        # Python function name.
+        return defined_functions[monomorphic_function.concrete_function](
+            *args, name="imported_function")
+      except ValueError:
+        continue
+    raise AssertionError(
+        "Could not find matching function to call for arguments: %s" % (args,))
+  return restored_function
diff --git a/tensorflow/python/saved_model/function_serialization.py b/tensorflow/python/saved_model/function_serialization.py
new file mode 100644
index 0000000000000000000000000000000000000000..69f34f0fd6d8be2249811356afe1b72a7ec6d5b2
--- /dev/null
+++ b/tensorflow/python/saved_model/function_serialization.py
@@ -0,0 +1,78 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tools for serializing PolymorphicFunctions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.eager import def_function
+from tensorflow.python.eager import function as defun_lib
+from tensorflow.python.saved_model import saved_object_graph_pb2
+
+
+def _serialize_polymorphic_function(polymorphic_function):
+  monomorphic_functions = []
+  for concrete_function in list_all_concrete_functions(polymorphic_function):
+    monomorphic_functions.append(
+        saved_object_graph_pb2.SavedMonomorphicFunction(
+            concrete_function=concrete_function.name))
+  saved_polymorphic_function = saved_object_graph_pb2.SavedPolymorphicFunction(
+      monomorphic_function=monomorphic_functions)
+  return saved_polymorphic_function
+
+
+def list_all_concrete_functions(polymorphic_function):
+  """Given a polymorphic function, returns all of its concrete functions."""
+  concrete_functions = []
+  for signature in polymorphic_function._cached_input_signatures:  # pylint: disable=protected-access
+    if any(isinstance(arg, defun_lib.UnknownArgument) for arg in signature):
+      continue
+    concrete_function = polymorphic_function.get_concrete_function(*signature)
+    concrete_functions.append(concrete_function)
+  return concrete_functions
+
+
+def list_all_polymorphic_functions(checkpointable_object):
+  """Given a checkpointable object, returns all of its polymorphic functions."""
+  polymorphic_functions = dict()
+  for attribute_name in dir(checkpointable_object):
+    try:
+      attribute_value = getattr(checkpointable_object, attribute_name, None)
+    except:  # pylint: disable=bare-except
+      # We really don't want to throw an exception just because some object's
+      # attribute accessor is broken.
+      attribute_value = None
+    # TODO(allenl): Consider de-duplicating functions which are referenced
+    # from multiple attributes.
+    if isinstance(attribute_value, def_function.PolymorphicFunction):
+      polymorphic_functions[attribute_name] = attribute_value
+  return polymorphic_functions
+
+
+def add_polymorphic_functions_to_object_graph_proto(checkpointable_objects,
+                                                    saved_object_graph):
+  """Finds PolymorphicFunctions attached to objects and saves them."""
+  existing_objects = list(zip(checkpointable_objects, saved_object_graph.nodes))
+  for obj, obj_proto in existing_objects:
+    for name, polymorphic_function in list_all_polymorphic_functions(
+        obj).items():
+      function_node_id = len(saved_object_graph.nodes)
+      function_node = saved_object_graph.nodes.add()
+      function_node.function.CopyFrom(
+          _serialize_polymorphic_function(polymorphic_function))
+      reference = obj_proto.children.add()
+      reference.node_id = function_node_id
+      reference.local_name = name
diff --git a/tensorflow/python/saved_model/load.py b/tensorflow/python/saved_model/load.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d9f60c69dd251e1f9c00ed6e96a01c0da9ae327
--- /dev/null
+++ b/tensorflow/python/saved_model/load.py
@@ -0,0 +1,126 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Import a checkpointable object from a SavedModel."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.python.framework import function as function_lib
+from tensorflow.python.lib.io import file_io
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.saved_model import constants
+from tensorflow.python.saved_model import function_deserialization
+from tensorflow.python.saved_model import loader_impl
+from tensorflow.python.saved_model import saved_object_graph_pb2
+from tensorflow.python.saved_model import utils_impl as saved_model_utils
+from tensorflow.python.training.checkpointable import tracking
+from tensorflow.python.training.checkpointable import util
+from tensorflow.python.util import compat
+
+
+class _Loader(object):
+  """Helper class to load an object-based SavedModel."""
+
+  def __init__(self, object_graph_proto, saved_model_proto, export_dir):
+    meta_graph = saved_model_proto.meta_graphs[0]
+    self._asset_file_def = meta_graph.asset_file_def
+    self._proto = object_graph_proto
+    self._export_dir = export_dir
+    self._defined_functions = {}
+    for defined_function in function_lib.from_library(
+        meta_graph.graph_def.library):
+      # TODO(allenl): Do we need to do name mapping here? Not quite sure what
+      # happens when loaded names collide with existing names.
+      defined_function.add_to_graph(None)
+      self._defined_functions[defined_function.name] = defined_function
+    self._load_all()
+    self._restore_checkpoint()
+
+  def _load_all(self):
+    self._nodes = [self._recreate(proto) for proto in self._proto.nodes]
+    # After creating the objects, construct the edges between the objects.
+    for obj, object_proto in zip(self._nodes, self._proto.nodes):
+      for reference in object_proto.children:
+        setattr(obj, reference.local_name, self._nodes[reference.node_id])
+
+  def _restore_checkpoint(self):
+    variables_path = saved_model_utils.get_variables_path(self._export_dir)
+    saver = util.CheckpointableSaver(self.get(0))
+    saver.restore(variables_path).assert_consumed()
+
+  def get(self, node_id):
+    return self._nodes[node_id]
+
+  def _recreate(self, proto):
+    """Creates a Python object from a SavedObject protocol buffer."""
+    factory = {
+        "user_object": lambda: self._recreate_user_object(proto.user_object),
+        "asset": lambda: self._recreate_asset(proto.asset),
+        "function": lambda: self._recreate_function(proto.function),
+        "variable": lambda: self._recreate_variable(proto.variable),
+    }
+    kind = proto.WhichOneof("kind")
+    if kind not in factory:
+      raise ValueError("Unknown SavedObject type: %r" % kind)
+    return factory[kind]()
+
+  def _recreate_user_object(self, proto):
+    del proto
+    return tracking.Checkpointable()
+
+  def _recreate_asset(self, proto):
+    filename = os.path.join(
+        saved_model_utils.get_assets_dir(self._export_dir),
+        self._asset_file_def[proto.asset_file_def_index].filename)
+    return tracking.TrackableAsset(filename)
+
+  def _recreate_function(self, proto):
+    return function_deserialization.recreate_polymorphic_function(
+        proto, self._defined_functions)
+
+  def _recreate_variable(self, proto):
+    # TODO(andresp): Can we use the checkpointed value as initializer?
+    dummy_value = init_ops.Zeros(dtype=proto.dtype)(shape=proto.shape)
+    return variables.Variable(dummy_value)
+
+
+def _load_saved_object_graph_proto(filename):
+  with file_io.FileIO(filename, "rb") as f:
+    contents = f.read()
+    return saved_object_graph_pb2.SavedObjectGraph.FromString(contents)
+
+
+def load(export_dir):
+  """Load a SavedModel from `export_dir`."""
+  saved_model_proto = loader_impl.parse_saved_model(export_dir)
+  object_graph_filename = os.path.join(
+      compat.as_bytes(export_dir),
+      compat.as_bytes(constants.EXTRA_ASSETS_DIRECTORY),
+      compat.as_bytes("object_graph.pb"))
+  if file_io.file_exists(object_graph_filename):
+    object_graph_proto = _load_saved_object_graph_proto(object_graph_filename)
+    loader = _Loader(object_graph_proto,
+                     saved_model_proto,
+                     export_dir)
+    root = loader.get(0)
+  else:
+    raise NotImplementedError(
+        "Currently only SavedModels exported with `tf.saved_model.save` may be "
+        "imported. Other SavedModels may eventually be supported via load().")
+  return root
diff --git a/tensorflow/python/saved_model/load_test.py b/tensorflow/python/saved_model/load_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba88668f8c7310503ce5bc088c08658c5aa5e8da
--- /dev/null
+++ b/tensorflow/python/saved_model/load_test.py
@@ -0,0 +1,134 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for checkpointable object SavedModel loading."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import tempfile
+
+from tensorflow.python.eager import def_function
+from tensorflow.python.eager import test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import tensor_spec
+from tensorflow.python.lib.io import file_io
+from tensorflow.python.ops import variables
+from tensorflow.python.saved_model import load
+from tensorflow.python.saved_model import save
+from tensorflow.python.training.checkpointable import tracking
+
+
+class LoadTest(test.TestCase):
+
+  def test_structure_import(self):
+    root = tracking.Checkpointable()
+    root.f = def_function.function(
+        lambda x: 2. * x,
+        input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
+    root.dep_one = tracking.Checkpointable()
+    root.dep_two = tracking.Checkpointable()
+    root.dep_two.dep = tracking.Checkpointable()
+    root.dep_three = root.dep_two.dep
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    save.save(root, save_dir)
+    imported = load.load(save_dir)
+    self.assertIs(imported.dep_three, imported.dep_two.dep)
+    self.assertIsNot(imported.dep_one, imported.dep_two)
+    self.assertEqual(4., imported.f(constant_op.constant(2.)).numpy())
+
+  def test_variables(self):
+    root = tracking.Checkpointable()
+    root.f = def_function.function(
+        lambda x: 2. * x,
+        input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
+    root.v1 = variables.Variable(1.)
+    root.v2 = variables.Variable(2.)
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    save.save(root, save_dir)
+    imported = load.load(save_dir)
+    self.assertEquals(imported.v1.numpy(), 1.0)
+    self.assertEquals(imported.v2.numpy(), 2.0)
+
+  def _make_asset(self, contents):
+    filename = tempfile.mktemp(prefix=self.get_temp_dir())
+    with open(filename, "w") as f:
+      f.write(contents)
+    return filename
+
+  def test_assets_import(self):
+    file1 = self._make_asset("contents 1")
+    file2 = self._make_asset("contents 2")
+
+    root = tracking.Checkpointable()
+    root.f = def_function.function(
+        lambda x: 2. * x,
+        input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
+    root.asset1 = tracking.TrackableAsset(file1)
+    root.asset2 = tracking.TrackableAsset(file2)
+
+    save_dir = os.path.join(self.get_temp_dir(), "save_dir")
+    save.save(root, save_dir)
+
+    file_io.delete_file(file1)
+    file_io.delete_file(file2)
+    load_dir = os.path.join(self.get_temp_dir(), "load_dir")
+    file_io.rename(save_dir, load_dir)
+
+    imported = load.load(load_dir)
+    with open(imported.asset1.asset_path.numpy(), "r") as f:
+      self.assertEquals("contents 1", f.read())
+    with open(imported.asset2.asset_path.numpy(), "r") as f:
+      self.assertEquals("contents 2", f.read())
+
+  def test_assets_dedup(self):
+    vocab = self._make_asset("contents")
+    root = tracking.Checkpointable()
+    root.f = def_function.function(
+        lambda x: 2. * x,
+        input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
+
+    root.asset1 = tracking.TrackableAsset(vocab)
+    root.asset2 = tracking.TrackableAsset(vocab)
+
+    export_dir = os.path.join(self.get_temp_dir(), "save_dir")
+    save.save(root, export_dir)
+    imported = load.load(export_dir)
+
+    self.assertEqual(imported.asset1.asset_path.numpy(),
+                     imported.asset2.asset_path.numpy())
+
+  def test_only_implicit_signatures(self):
+    def func(x):
+      return 2 * x
+
+    root = tracking.Checkpointable()
+    root.f = def_function.function(func)
+
+    # Add two traces.
+    root.f(constant_op.constant(1.))
+    root.f(constant_op.constant(1))
+
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    save.save(root, save_dir, signatures=dict())
+    imported = load.load(save_dir)
+
+    self.assertEqual(4., imported.f(constant_op.constant(2.)).numpy())
+    self.assertEqual(14, imported.f(constant_op.constant(7)).numpy())
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/saved_model/loader_impl.py b/tensorflow/python/saved_model/loader_impl.py
index 6bf39a2c676b476580e205096aeb935cf59a467d..e5be03aae4905f4465ac87590da610a7d46e2ae4 100644
--- a/tensorflow/python/saved_model/loader_impl.py
+++ b/tensorflow/python/saved_model/loader_impl.py
@@ -39,7 +39,7 @@ from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
-def _parse_saved_model(export_dir):
+def parse_saved_model(export_dir):
   """Reads the savedmodel.pb or savedmodel.pbtxt file containing `SavedModel`.
 
   Args:
@@ -83,6 +83,11 @@ def _parse_saved_model(export_dir):
                    constants.SAVED_MODEL_FILENAME_PB))
 
 
+# TODO(b/120594573): Make this symbol also available as private, so that
+# tensorflow_transform and tensorflow_estimator do not break.
+_parse_saved_model = parse_saved_model
+
+
 def _get_asset_tensors(export_dir, meta_graph_def_to_load, import_scope=None):
   """Gets the asset tensors, if defined in the meta graph def to load.
 
@@ -276,7 +281,7 @@ class SavedModelLoader(object):
     """
     self._export_dir = export_dir
     self._variables_path = saved_model_utils.get_variables_path(export_dir)
-    self._saved_model = _parse_saved_model(export_dir)
+    self._saved_model = parse_saved_model(export_dir)
 
   @property
   def export_dir(self):
diff --git a/tensorflow/python/saved_model/loader_test.py b/tensorflow/python/saved_model/loader_test.py
index 3b7f0b250e7fd8fec560f4496508ac63394d07da..3e27c0801cd43eb43d1e0636f8aac1b1bc054485 100644
--- a/tensorflow/python/saved_model/loader_test.py
+++ b/tensorflow/python/saved_model/loader_test.py
@@ -94,7 +94,7 @@ class SavedModelLoaderTest(test.TestCase, parameterized.TestCase):
     super(SavedModelLoaderTest, self).tearDown()
     shutil.rmtree(test.get_temp_dir(), ignore_errors=True)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def test_load_function(self, builder_cls):
     self.export_simple_graph(builder_cls)
     loader = loader_impl.SavedModelLoader(SIMPLE_ADD_SAVED_MODEL)
@@ -110,7 +110,7 @@ class SavedModelLoaderTest(test.TestCase, parameterized.TestCase):
       self.assertEqual(5, sess.graph.get_tensor_by_name("x:0").eval())
       self.assertEqual(7, sess.graph.get_tensor_by_name("y:0").eval())
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def test_load_graph(self, builder_cls):
     self.export_simple_graph(builder_cls)
     loader = loader_impl.SavedModelLoader(SIMPLE_ADD_SAVED_MODEL)
@@ -130,7 +130,7 @@ class SavedModelLoaderTest(test.TestCase, parameterized.TestCase):
       with self.assertRaises(errors.FailedPreconditionError):
         self.evaluate(y)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def test_load_with_import_scope(self, builder_cls):
     self.export_graph_with_main_op(builder_cls)
     loader = loader_impl.SavedModelLoader(SAVED_MODEL_WITH_MAIN_OP)
@@ -179,7 +179,7 @@ class SavedModelLoaderTest(test.TestCase, parameterized.TestCase):
       loader.restore_variables(sess, tf_saver.Saver())
       self.assertEqual(55, self.evaluate(z))
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def test_run_init_op(self, builder_cls):
     self.export_graph_with_main_op(builder_cls)
     loader = loader_impl.SavedModelLoader(SAVED_MODEL_WITH_MAIN_OP)
@@ -212,7 +212,7 @@ class SavedModelLoaderTest(test.TestCase, parameterized.TestCase):
     with self.assertRaises(RuntimeError):
       loader.get_meta_graph_def_from_tags(["not_a_graph"])
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def test_load_saved_model_with_no_variables(self, builder_cls):
     """Test that SavedModel runs saver when there appear to be no variables.
 
diff --git a/tensorflow/python/saved_model/model_utils/BUILD b/tensorflow/python/saved_model/model_utils/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..192a610fd244c0d8950764cdfbf53fb62bd32698
--- /dev/null
+++ b/tensorflow/python/saved_model/model_utils/BUILD
@@ -0,0 +1,100 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Description:
+#   Keras saving and loading libraries.
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+package(default_visibility = ["//tensorflow:__subpackages__"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+py_library(
+    name = "model_utils",
+    srcs = ["__init__.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":export_output",
+        ":export_utils",
+    ],
+)
+
+py_library(
+    name = "export_output",
+    srcs = ["export_output.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python/saved_model:signature_def_utils",
+    ],
+)
+
+py_test(
+    name = "export_output_test",
+    srcs = ["export_output_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":export_output",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python/keras",
+        "//tensorflow/python/saved_model:signature_constants",
+    ],
+)
+
+py_library(
+    name = "export_utils",
+    srcs = ["export_utils.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":export_output",
+        "//tensorflow/python:mode_keys",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:util",
+        "//tensorflow/python/saved_model:signature_constants",
+        "//tensorflow/python/saved_model:signature_def_utils",
+        "//tensorflow/python/saved_model:tag_constants",
+    ],
+)
+
+py_test(
+    name = "export_test",
+    srcs = ["export_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":export_utils",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python/keras",
+        "//tensorflow/python/saved_model:signature_constants",
+        "//tensorflow/python/saved_model:signature_def_utils",
+    ],
+)
diff --git a/tensorflow/python/saved_model/model_utils/__init__.py b/tensorflow/python/saved_model/model_utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..84540badb4b100ab649b4653d9d84b5ebe922cf1
--- /dev/null
+++ b/tensorflow/python/saved_model/model_utils/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utils for saving a Keras Model or Estimator to the SavedModel format."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=wildcard-import
+from tensorflow.python.saved_model.model_utils.export_output import *
+from tensorflow.python.saved_model.model_utils.export_utils import build_all_signature_defs
+from tensorflow.python.saved_model.model_utils.export_utils import export_outputs_for_mode
+from tensorflow.python.saved_model.model_utils.export_utils import EXPORT_TAG_MAP
+from tensorflow.python.saved_model.model_utils.export_utils import get_export_outputs
+from tensorflow.python.saved_model.model_utils.export_utils import get_temp_export_dir
+from tensorflow.python.saved_model.model_utils.export_utils import get_timestamped_export_dir
+# pylint: enable=wildcard-import
diff --git a/tensorflow/python/saved_model/model_utils/export_output.py b/tensorflow/python/saved_model/model_utils/export_output.py
new file mode 100644
index 0000000000000000000000000000000000000000..b571bad067ebd0cbfdd3bfd94ee76d002d5f1880
--- /dev/null
+++ b/tensorflow/python/saved_model/model_utils/export_output.py
@@ -0,0 +1,407 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Classes for different types of export output."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+
+import six
+
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.saved_model import signature_def_utils
+
+
+class ExportOutput(object):
+  """Represents an output of a model that can be served.
+
+  These typically correspond to model heads.
+  """
+
+  __metaclass__ = abc.ABCMeta
+
+  _SEPARATOR_CHAR = '/'
+
+  @abc.abstractmethod
+  def as_signature_def(self, receiver_tensors):
+    """Generate a SignatureDef proto for inclusion in a MetaGraphDef.
+
+    The SignatureDef will specify outputs as described in this ExportOutput,
+    and will use the provided receiver_tensors as inputs.
+
+    Args:
+      receiver_tensors: a `Tensor`, or a dict of string to `Tensor`, specifying
+        input nodes that will be fed.
+    """
+    pass
+
+  def _check_output_key(self, key, error_label):
+    # For multi-head models, the key can be a tuple.
+    if isinstance(key, tuple):
+      key = self._SEPARATOR_CHAR.join(key)
+
+    if not isinstance(key, six.string_types):
+      raise ValueError(
+          '{} output key must be a string; got {}.'.format(error_label, key))
+    return key
+
+  def _wrap_and_check_outputs(
+      self, outputs, single_output_default_name, error_label=None):
+    """Wraps raw tensors as dicts and checks type.
+
+    Note that we create a new dict here so that we can overwrite the keys
+    if necessary.
+
+    Args:
+      outputs: A `Tensor` or a dict of string to `Tensor`.
+      single_output_default_name: A string key for use in the output dict
+        if the provided `outputs` is a raw tensor.
+      error_label: descriptive string for use in error messages. If none,
+        single_output_default_name will be used.
+
+    Returns:
+      A dict of tensors
+
+    Raises:
+      ValueError: if the outputs dict keys are not strings or tuples of strings
+        or the values are not Tensors.
+    """
+    if not isinstance(outputs, dict):
+      outputs = {single_output_default_name: outputs}
+
+    output_dict = {}
+    for key, value in outputs.items():
+      error_name = error_label or single_output_default_name
+      key = self._check_output_key(key, error_name)
+      if not isinstance(value, ops.Tensor):
+        raise ValueError(
+            '{} output value must be a Tensor; got {}.'.format(
+                error_name, value))
+
+      output_dict[key] = value
+    return output_dict
+
+
+class ClassificationOutput(ExportOutput):
+  """Represents the output of a classification head.
+
+  Either classes or scores or both must be set.
+
+  The classes `Tensor` must provide string labels, not integer class IDs.
+
+  If only classes is set, it is interpreted as providing top-k results in
+  descending order.
+
+  If only scores is set, it is interpreted as providing a score for every class
+  in order of class ID.
+
+  If both classes and scores are set, they are interpreted as zipped, so each
+  score corresponds to the class at the same index.  Clients should not depend
+  on the order of the entries.
+  """
+
+  def __init__(self, scores=None, classes=None):
+    """Constructor for `ClassificationOutput`.
+
+    Args:
+      scores: A float `Tensor` giving scores (sometimes but not always
+          interpretable as probabilities) for each class.  May be `None`, but
+          only if `classes` is set.  Interpretation varies-- see class doc.
+      classes: A string `Tensor` giving predicted class labels.  May be `None`,
+          but only if `scores` is set.  Interpretation varies-- see class doc.
+
+    Raises:
+      ValueError: if neither classes nor scores is set, or one of them is not a
+          `Tensor` with the correct dtype.
+    """
+    if (scores is not None
+        and not (isinstance(scores, ops.Tensor)
+                 and scores.dtype.is_floating)):
+      raise ValueError('Classification scores must be a float32 Tensor; '
+                       'got {}'.format(scores))
+    if (classes is not None
+        and not (isinstance(classes, ops.Tensor)
+                 and dtypes.as_dtype(classes.dtype) == dtypes.string)):
+      raise ValueError('Classification classes must be a string Tensor; '
+                       'got {}'.format(classes))
+    if scores is None and classes is None:
+      raise ValueError('At least one of scores and classes must be set.')
+
+    self._scores = scores
+    self._classes = classes
+
+  @property
+  def scores(self):
+    return self._scores
+
+  @property
+  def classes(self):
+    return self._classes
+
+  def as_signature_def(self, receiver_tensors):
+    if len(receiver_tensors) != 1:
+      raise ValueError('Classification input must be a single string Tensor; '
+                       'got {}'.format(receiver_tensors))
+    (_, examples), = receiver_tensors.items()
+    if dtypes.as_dtype(examples.dtype) != dtypes.string:
+      raise ValueError('Classification input must be a single string Tensor; '
+                       'got {}'.format(receiver_tensors))
+    return signature_def_utils.classification_signature_def(
+        examples, self.classes, self.scores)
+
+
+class RegressionOutput(ExportOutput):
+  """Represents the output of a regression head."""
+
+  def __init__(self, value):
+    """Constructor for `RegressionOutput`.
+
+    Args:
+      value: a float `Tensor` giving the predicted values.  Required.
+
+    Raises:
+      ValueError: if the value is not a `Tensor` with dtype tf.float32.
+    """
+    if not (isinstance(value, ops.Tensor) and value.dtype.is_floating):
+      raise ValueError('Regression output value must be a float32 Tensor; '
+                       'got {}'.format(value))
+    self._value = value
+
+  @property
+  def value(self):
+    return self._value
+
+  def as_signature_def(self, receiver_tensors):
+    if len(receiver_tensors) != 1:
+      raise ValueError('Regression input must be a single string Tensor; '
+                       'got {}'.format(receiver_tensors))
+    (_, examples), = receiver_tensors.items()
+    if dtypes.as_dtype(examples.dtype) != dtypes.string:
+      raise ValueError('Regression input must be a single string Tensor; '
+                       'got {}'.format(receiver_tensors))
+    return signature_def_utils.regression_signature_def(examples, self.value)
+
+
+class PredictOutput(ExportOutput):
+  """Represents the output of a generic prediction head.
+
+  A generic prediction need not be either a classification or a regression.
+
+  Named outputs must be provided as a dict from string to `Tensor`,
+  """
+  _SINGLE_OUTPUT_DEFAULT_NAME = 'output'
+
+  def __init__(self, outputs):
+    """Constructor for PredictOutput.
+
+    Args:
+      outputs: A `Tensor` or a dict of string to `Tensor` representing the
+        predictions.
+
+    Raises:
+      ValueError: if the outputs is not dict, or any of its keys are not
+          strings, or any of its values are not `Tensor`s.
+    """
+
+    self._outputs = self._wrap_and_check_outputs(
+        outputs, self._SINGLE_OUTPUT_DEFAULT_NAME, error_label='Prediction')
+
+  @property
+  def outputs(self):
+    return self._outputs
+
+  def as_signature_def(self, receiver_tensors):
+    return signature_def_utils.predict_signature_def(receiver_tensors,
+                                                     self.outputs)
+
+
+class _SupervisedOutput(ExportOutput):
+  """Represents the output of a supervised training or eval process."""
+  __metaclass__ = abc.ABCMeta
+
+  LOSS_NAME = 'loss'
+  PREDICTIONS_NAME = 'predictions'
+  METRICS_NAME = 'metrics'
+
+  METRIC_VALUE_SUFFIX = 'value'
+  METRIC_UPDATE_SUFFIX = 'update_op'
+
+  _loss = None
+  _predictions = None
+  _metrics = None
+
+  def __init__(self, loss=None, predictions=None, metrics=None):
+    """Constructor for SupervisedOutput (ie, Train or Eval output).
+
+    Args:
+      loss: dict of Tensors or single Tensor representing calculated loss.
+      predictions: dict of Tensors or single Tensor representing model
+        predictions.
+      metrics: Dict of metric results keyed by name.
+        The values of the dict can be one of the following:
+        (1) instance of `Metric` class.
+        (2) (metric_value, update_op) tuples, or a single tuple.
+        metric_value must be a Tensor, and update_op must be a Tensor or Op.
+
+    Raises:
+      ValueError: if any of the outputs' dict keys are not strings or tuples of
+        strings or the values are not Tensors (or Operations in the case of
+        update_op).
+    """
+
+    if loss is not None:
+      loss_dict = self._wrap_and_check_outputs(loss, self.LOSS_NAME)
+      self._loss = self._prefix_output_keys(loss_dict, self.LOSS_NAME)
+    if predictions is not None:
+      pred_dict = self._wrap_and_check_outputs(
+          predictions, self.PREDICTIONS_NAME)
+      self._predictions = self._prefix_output_keys(
+          pred_dict, self.PREDICTIONS_NAME)
+    if metrics is not None:
+      self._metrics = self._wrap_and_check_metrics(metrics)
+
+  def _prefix_output_keys(self, output_dict, output_name):
+    """Prepend output_name to the output_dict keys if it doesn't exist.
+
+    This produces predictable prefixes for the pre-determined outputs
+    of SupervisedOutput.
+
+    Args:
+      output_dict: dict of string to Tensor, assumed valid.
+      output_name: prefix string to prepend to existing keys.
+
+    Returns:
+      dict with updated keys and existing values.
+    """
+
+    new_outputs = {}
+    for key, val in output_dict.items():
+      key = self._prefix_key(key, output_name)
+      new_outputs[key] = val
+    return new_outputs
+
+  def _prefix_key(self, key, output_name):
+    if key.find(output_name) != 0:
+      key = output_name + self._SEPARATOR_CHAR + key
+    return key
+
+  def _wrap_and_check_metrics(self, metrics):
+    """Handle the saving of metrics.
+
+    Metrics is either a tuple of (value, update_op), or a dict of such tuples.
+    Here, we separate out the tuples and create a dict with names to tensors.
+
+    Args:
+      metrics: Dict of metric results keyed by name.
+        The values of the dict can be one of the following:
+        (1) instance of `Metric` class.
+        (2) (metric_value, update_op) tuples, or a single tuple.
+        metric_value must be a Tensor, and update_op must be a Tensor or Op.
+
+    Returns:
+      dict of output_names to tensors
+
+    Raises:
+      ValueError: if the dict key is not a string, or the metric values or ops
+        are not tensors.
+    """
+    if not isinstance(metrics, dict):
+      metrics = {self.METRICS_NAME: metrics}
+
+    outputs = {}
+    for key, value in metrics.items():
+      if isinstance(value, tuple):
+        metric_val, metric_op = value
+      else:  # value is a keras.Metrics object
+        metric_val = value.result()
+        assert len(value.updates) == 1  # We expect only one update op.
+        metric_op = value.updates[0]
+      key = self._check_output_key(key, self.METRICS_NAME)
+      key = self._prefix_key(key, self.METRICS_NAME)
+
+      val_name = key + self._SEPARATOR_CHAR + self.METRIC_VALUE_SUFFIX
+      op_name = key + self._SEPARATOR_CHAR + self.METRIC_UPDATE_SUFFIX
+      if not isinstance(metric_val, ops.Tensor):
+        raise ValueError(
+            '{} output value must be a Tensor; got {}.'.format(
+                key, metric_val))
+      if (not isinstance(metric_op, ops.Tensor) and
+          not isinstance(metric_op, ops.Operation)):
+        raise ValueError(
+            '{} update_op must be a Tensor or Operation; got {}.'.format(
+                key, metric_op))
+
+      # We must wrap any ops in a Tensor before export, as the SignatureDef
+      # proto expects tensors only. See b/109740581
+      metric_op_tensor = metric_op
+      if isinstance(metric_op, ops.Operation):
+        with ops.control_dependencies([metric_op]):
+          metric_op_tensor = constant_op.constant([], name='metric_op_wrapper')
+
+      outputs[val_name] = metric_val
+      outputs[op_name] = metric_op_tensor
+
+    return outputs
+
+  @property
+  def loss(self):
+    return self._loss
+
+  @property
+  def predictions(self):
+    return self._predictions
+
+  @property
+  def metrics(self):
+    return self._metrics
+
+  @abc.abstractmethod
+  def _get_signature_def_fn(self):
+    """Returns a function that produces a SignatureDef given desired outputs."""
+    pass
+
+  def as_signature_def(self, receiver_tensors):
+    signature_def_fn = self._get_signature_def_fn()
+    return signature_def_fn(
+        receiver_tensors, self.loss, self.predictions, self.metrics)
+
+
+class TrainOutput(_SupervisedOutput):
+  """Represents the output of a supervised training process.
+
+  This class generates the appropriate signature def for exporting
+  training output by type-checking and wrapping loss, predictions, and metrics
+  values.
+  """
+
+  def _get_signature_def_fn(self):
+    return signature_def_utils.supervised_train_signature_def
+
+
+class EvalOutput(_SupervisedOutput):
+  """Represents the output of a supervised eval process.
+
+  This class generates the appropriate signature def for exporting
+  eval output by type-checking and wrapping loss, predictions, and metrics
+  values.
+  """
+
+  def _get_signature_def_fn(self):
+    return signature_def_utils.supervised_eval_signature_def
diff --git a/tensorflow/python/saved_model/model_utils/export_output_test.py b/tensorflow/python/saved_model/model_utils/export_output_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..5262e9fa1e959f0845f9783fdb3fd3ed1a739b46
--- /dev/null
+++ b/tensorflow/python/saved_model/model_utils/export_output_test.py
@@ -0,0 +1,405 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for export."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.core.framework import tensor_shape_pb2
+from tensorflow.core.framework import types_pb2
+from tensorflow.core.protobuf import meta_graph_pb2
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.keras import metrics as metrics_module
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.platform import test
+from tensorflow.python.saved_model import signature_constants
+from tensorflow.python.saved_model.model_utils import export_output as export_output_lib
+
+
+class ExportOutputTest(test.TestCase):
+
+  def test_regress_value_must_be_float(self):
+    with context.graph_mode():
+      value = array_ops.placeholder(dtypes.string, 1, name='output-tensor-1')
+      with self.assertRaisesRegexp(
+          ValueError, 'Regression output value must be a float32 Tensor'):
+        export_output_lib.RegressionOutput(value)
+
+  def test_classify_classes_must_be_strings(self):
+    with context.graph_mode():
+      classes = array_ops.placeholder(dtypes.float32, 1, name='output-tensor-1')
+      with self.assertRaisesRegexp(
+          ValueError, 'Classification classes must be a string Tensor'):
+        export_output_lib.ClassificationOutput(classes=classes)
+
+  def test_classify_scores_must_be_float(self):
+    with context.graph_mode():
+      scores = array_ops.placeholder(dtypes.string, 1, name='output-tensor-1')
+      with self.assertRaisesRegexp(
+          ValueError, 'Classification scores must be a float32 Tensor'):
+        export_output_lib.ClassificationOutput(scores=scores)
+
+  def test_classify_requires_classes_or_scores(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'At least one of scores and classes must be set.'):
+      export_output_lib.ClassificationOutput()
+
+  def test_build_standardized_signature_def_regression(self):
+    with context.graph_mode():
+      input_tensors = {
+          'input-1':
+              array_ops.placeholder(
+                  dtypes.string, 1, name='input-tensor-1')
+      }
+      value = array_ops.placeholder(dtypes.float32, 1, name='output-tensor-1')
+
+      export_output = export_output_lib.RegressionOutput(value)
+      actual_signature_def = export_output.as_signature_def(input_tensors)
+
+      expected_signature_def = meta_graph_pb2.SignatureDef()
+      shape = tensor_shape_pb2.TensorShapeProto(
+          dim=[tensor_shape_pb2.TensorShapeProto.Dim(size=1)])
+      dtype_float = types_pb2.DataType.Value('DT_FLOAT')
+      dtype_string = types_pb2.DataType.Value('DT_STRING')
+      expected_signature_def.inputs[
+          signature_constants.REGRESS_INPUTS].CopyFrom(
+              meta_graph_pb2.TensorInfo(name='input-tensor-1:0',
+                                        dtype=dtype_string,
+                                        tensor_shape=shape))
+      expected_signature_def.outputs[
+          signature_constants.REGRESS_OUTPUTS].CopyFrom(
+              meta_graph_pb2.TensorInfo(name='output-tensor-1:0',
+                                        dtype=dtype_float,
+                                        tensor_shape=shape))
+
+      expected_signature_def.method_name = (
+          signature_constants.REGRESS_METHOD_NAME)
+      self.assertEqual(actual_signature_def, expected_signature_def)
+
+  def test_build_standardized_signature_def_classify_classes_only(self):
+    """Tests classification with one output tensor."""
+    with context.graph_mode():
+      input_tensors = {
+          'input-1':
+              array_ops.placeholder(
+                  dtypes.string, 1, name='input-tensor-1')
+      }
+      classes = array_ops.placeholder(dtypes.string, 1, name='output-tensor-1')
+
+      export_output = export_output_lib.ClassificationOutput(classes=classes)
+      actual_signature_def = export_output.as_signature_def(input_tensors)
+
+      expected_signature_def = meta_graph_pb2.SignatureDef()
+      shape = tensor_shape_pb2.TensorShapeProto(
+          dim=[tensor_shape_pb2.TensorShapeProto.Dim(size=1)])
+      dtype_string = types_pb2.DataType.Value('DT_STRING')
+      expected_signature_def.inputs[
+          signature_constants.CLASSIFY_INPUTS].CopyFrom(
+              meta_graph_pb2.TensorInfo(name='input-tensor-1:0',
+                                        dtype=dtype_string,
+                                        tensor_shape=shape))
+      expected_signature_def.outputs[
+          signature_constants.CLASSIFY_OUTPUT_CLASSES].CopyFrom(
+              meta_graph_pb2.TensorInfo(name='output-tensor-1:0',
+                                        dtype=dtype_string,
+                                        tensor_shape=shape))
+
+      expected_signature_def.method_name = (
+          signature_constants.CLASSIFY_METHOD_NAME)
+      self.assertEqual(actual_signature_def, expected_signature_def)
+
+  def test_build_standardized_signature_def_classify_both(self):
+    """Tests multiple output tensors that include classes and scores."""
+    with context.graph_mode():
+      input_tensors = {
+          'input-1':
+              array_ops.placeholder(
+                  dtypes.string, 1, name='input-tensor-1')
+      }
+      classes = array_ops.placeholder(dtypes.string, 1,
+                                      name='output-tensor-classes')
+      scores = array_ops.placeholder(dtypes.float32, 1,
+                                     name='output-tensor-scores')
+
+      export_output = export_output_lib.ClassificationOutput(
+          scores=scores, classes=classes)
+      actual_signature_def = export_output.as_signature_def(input_tensors)
+
+      expected_signature_def = meta_graph_pb2.SignatureDef()
+      shape = tensor_shape_pb2.TensorShapeProto(
+          dim=[tensor_shape_pb2.TensorShapeProto.Dim(size=1)])
+      dtype_float = types_pb2.DataType.Value('DT_FLOAT')
+      dtype_string = types_pb2.DataType.Value('DT_STRING')
+      expected_signature_def.inputs[
+          signature_constants.CLASSIFY_INPUTS].CopyFrom(
+              meta_graph_pb2.TensorInfo(name='input-tensor-1:0',
+                                        dtype=dtype_string,
+                                        tensor_shape=shape))
+      expected_signature_def.outputs[
+          signature_constants.CLASSIFY_OUTPUT_CLASSES].CopyFrom(
+              meta_graph_pb2.TensorInfo(name='output-tensor-classes:0',
+                                        dtype=dtype_string,
+                                        tensor_shape=shape))
+      expected_signature_def.outputs[
+          signature_constants.CLASSIFY_OUTPUT_SCORES].CopyFrom(
+              meta_graph_pb2.TensorInfo(name='output-tensor-scores:0',
+                                        dtype=dtype_float,
+                                        tensor_shape=shape))
+
+      expected_signature_def.method_name = (
+          signature_constants.CLASSIFY_METHOD_NAME)
+      self.assertEqual(actual_signature_def, expected_signature_def)
+
+  def test_build_standardized_signature_def_classify_scores_only(self):
+    """Tests classification without classes tensor."""
+    with context.graph_mode():
+      input_tensors = {
+          'input-1':
+              array_ops.placeholder(
+                  dtypes.string, 1, name='input-tensor-1')
+      }
+
+      scores = array_ops.placeholder(dtypes.float32, 1,
+                                     name='output-tensor-scores')
+
+      export_output = export_output_lib.ClassificationOutput(
+          scores=scores)
+      actual_signature_def = export_output.as_signature_def(input_tensors)
+
+      expected_signature_def = meta_graph_pb2.SignatureDef()
+      shape = tensor_shape_pb2.TensorShapeProto(
+          dim=[tensor_shape_pb2.TensorShapeProto.Dim(size=1)])
+      dtype_float = types_pb2.DataType.Value('DT_FLOAT')
+      dtype_string = types_pb2.DataType.Value('DT_STRING')
+      expected_signature_def.inputs[
+          signature_constants.CLASSIFY_INPUTS].CopyFrom(
+              meta_graph_pb2.TensorInfo(name='input-tensor-1:0',
+                                        dtype=dtype_string,
+                                        tensor_shape=shape))
+      expected_signature_def.outputs[
+          signature_constants.CLASSIFY_OUTPUT_SCORES].CopyFrom(
+              meta_graph_pb2.TensorInfo(name='output-tensor-scores:0',
+                                        dtype=dtype_float,
+                                        tensor_shape=shape))
+
+      expected_signature_def.method_name = (
+          signature_constants.CLASSIFY_METHOD_NAME)
+      self.assertEqual(actual_signature_def, expected_signature_def)
+
+  def test_predict_outputs_valid(self):
+    """Tests that no errors are raised when provided outputs are valid."""
+    outputs = {
+        'output0': constant_op.constant([0]),
+        u'output1': constant_op.constant(['foo']),
+    }
+    export_output_lib.PredictOutput(outputs)
+
+    # Single Tensor is OK too
+    export_output_lib.PredictOutput(constant_op.constant([0]))
+
+  def test_predict_outputs_invalid(self):
+    with self.assertRaisesRegexp(
+        ValueError,
+        'Prediction output key must be a string'):
+      export_output_lib.PredictOutput({1: constant_op.constant([0])})
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        'Prediction output value must be a Tensor'):
+      export_output_lib.PredictOutput({
+          'prediction1': sparse_tensor.SparseTensor(
+              indices=[[0, 0]], values=[1], dense_shape=[1, 1]),
+      })
+
+
+class MockSupervisedOutput(export_output_lib._SupervisedOutput):
+  """So that we can test the abstract class methods directly."""
+
+  def _get_signature_def_fn(self):
+    pass
+
+
+class SupervisedOutputTest(test.TestCase):
+
+  def test_supervised_outputs_valid(self):
+    """Tests that no errors are raised when provided outputs are valid."""
+    with context.graph_mode():
+      loss = {'my_loss': constant_op.constant([0])}
+      predictions = {u'output1': constant_op.constant(['foo'])}
+      metric_obj = metrics_module.Mean()
+      metric_obj.update_state(constant_op.constant([0]))
+      metrics = {
+          'metrics': metric_obj,
+          'metrics2': (constant_op.constant([0]), constant_op.constant([10]))
+      }
+
+      outputter = MockSupervisedOutput(loss, predictions, metrics)
+      self.assertEqual(outputter.loss['loss/my_loss'], loss['my_loss'])
+      self.assertEqual(
+          outputter.predictions['predictions/output1'], predictions['output1'])
+      self.assertEqual(outputter.metrics['metrics/update_op'].name,
+                       'metric_op_wrapper:0')
+      self.assertEqual(
+          outputter.metrics['metrics2/update_op'], metrics['metrics2'][1])
+
+      # Single Tensor is OK too
+      outputter = MockSupervisedOutput(
+          loss['my_loss'], predictions['output1'], metrics['metrics'])
+      self.assertEqual(outputter.loss, {'loss': loss['my_loss']})
+      self.assertEqual(
+          outputter.predictions, {'predictions': predictions['output1']})
+      self.assertEqual(outputter.metrics['metrics/update_op'].name,
+                       'metric_op_wrapper_1:0')
+
+  def test_supervised_outputs_none(self):
+    outputter = MockSupervisedOutput(
+        constant_op.constant([0]), None, None)
+    self.assertEqual(len(outputter.loss), 1)
+    self.assertEqual(outputter.predictions, None)
+    self.assertEqual(outputter.metrics, None)
+
+  def test_supervised_outputs_invalid(self):
+    with self.assertRaisesRegexp(ValueError, 'predictions output value must'):
+      MockSupervisedOutput(constant_op.constant([0]), [3], None)
+    with self.assertRaisesRegexp(ValueError, 'loss output value must'):
+      MockSupervisedOutput('str', None, None)
+    with self.assertRaisesRegexp(ValueError, 'metrics output value must'):
+      MockSupervisedOutput(None, None, (15.3, 4))
+    with self.assertRaisesRegexp(ValueError, 'loss output key must'):
+      MockSupervisedOutput({25: 'Tensor'}, None, None)
+
+  def test_supervised_outputs_tuples(self):
+    """Tests that no errors are raised when provided outputs are valid."""
+    with context.graph_mode():
+      loss = {('my', 'loss'): constant_op.constant([0])}
+      predictions = {(u'output1', '2'): constant_op.constant(['foo'])}
+      metric_obj = metrics_module.Mean()
+      metric_obj.update_state(constant_op.constant([0]))
+      metrics = {
+          ('metrics', '1'):
+              metric_obj,
+          ('metrics', '2'): (constant_op.constant([0]),
+                             constant_op.constant([10]))
+      }
+
+      outputter = MockSupervisedOutput(loss, predictions, metrics)
+      self.assertEqual(set(outputter.loss.keys()), set(['loss/my/loss']))
+      self.assertEqual(set(outputter.predictions.keys()),
+                       set(['predictions/output1/2']))
+      self.assertEqual(
+          set(outputter.metrics.keys()),
+          set([
+              'metrics/1/value', 'metrics/1/update_op', 'metrics/2/value',
+              'metrics/2/update_op'
+          ]))
+
+  def test_supervised_outputs_no_prepend(self):
+    """Tests that no errors are raised when provided outputs are valid."""
+    with context.graph_mode():
+      loss = {'loss': constant_op.constant([0])}
+      predictions = {u'predictions': constant_op.constant(['foo'])}
+      metric_obj = metrics_module.Mean()
+      metric_obj.update_state(constant_op.constant([0]))
+      metrics = {
+          'metrics_1': metric_obj,
+          'metrics_2': (constant_op.constant([0]), constant_op.constant([10]))
+      }
+
+      outputter = MockSupervisedOutput(loss, predictions, metrics)
+      self.assertEqual(set(outputter.loss.keys()), set(['loss']))
+      self.assertEqual(set(outputter.predictions.keys()), set(['predictions']))
+      self.assertEqual(
+          set(outputter.metrics.keys()),
+          set([
+              'metrics_1/value', 'metrics_1/update_op', 'metrics_2/update_op',
+              'metrics_2/value'
+          ]))
+
+  def test_train_signature_def(self):
+    with context.graph_mode():
+      loss = {'my_loss': constant_op.constant([0])}
+      predictions = {u'output1': constant_op.constant(['foo'])}
+      metric_obj = metrics_module.Mean()
+      metric_obj.update_state(constant_op.constant([0]))
+      metrics = {
+          'metrics_1': metric_obj,
+          'metrics_2': (constant_op.constant([0]), constant_op.constant([10]))
+      }
+
+      outputter = export_output_lib.TrainOutput(loss, predictions, metrics)
+
+      receiver = {u'features': constant_op.constant(100, shape=(100, 2)),
+                  'labels': constant_op.constant(100, shape=(100, 1))}
+      sig_def = outputter.as_signature_def(receiver)
+
+      self.assertTrue('loss/my_loss' in sig_def.outputs)
+      self.assertTrue('metrics_1/value' in sig_def.outputs)
+      self.assertTrue('metrics_2/value' in sig_def.outputs)
+      self.assertTrue('predictions/output1' in sig_def.outputs)
+      self.assertTrue('features' in sig_def.inputs)
+
+  def test_eval_signature_def(self):
+    with context.graph_mode():
+      loss = {'my_loss': constant_op.constant([0])}
+      predictions = {u'output1': constant_op.constant(['foo'])}
+
+      outputter = export_output_lib.EvalOutput(loss, predictions, None)
+
+      receiver = {u'features': constant_op.constant(100, shape=(100, 2)),
+                  'labels': constant_op.constant(100, shape=(100, 1))}
+      sig_def = outputter.as_signature_def(receiver)
+
+      self.assertTrue('loss/my_loss' in sig_def.outputs)
+      self.assertFalse('metrics/value' in sig_def.outputs)
+      self.assertTrue('predictions/output1' in sig_def.outputs)
+      self.assertTrue('features' in sig_def.inputs)
+
+  def test_metric_op_is_tensor(self):
+    """Tests that ops.Operation is wrapped by a tensor for metric_ops."""
+    with context.graph_mode():
+      loss = {'my_loss': constant_op.constant([0])}
+      predictions = {u'output1': constant_op.constant(['foo'])}
+      metric_obj = metrics_module.Mean()
+      metric_obj.update_state(constant_op.constant([0]))
+      metrics = {
+          'metrics_1': metric_obj,
+          'metrics_2': (constant_op.constant([0]), control_flow_ops.no_op())
+      }
+
+      outputter = MockSupervisedOutput(loss, predictions, metrics)
+
+      self.assertTrue(outputter.metrics['metrics_1/update_op'].name.startswith(
+          'metric_op_wrapper'))
+      self.assertTrue(
+          isinstance(outputter.metrics['metrics_1/update_op'], ops.Tensor))
+      self.assertTrue(
+          isinstance(outputter.metrics['metrics_1/value'], ops.Tensor))
+
+      self.assertEqual(outputter.metrics['metrics_2/value'],
+                       metrics['metrics_2'][0])
+      self.assertTrue(outputter.metrics['metrics_2/update_op'].name.startswith(
+          'metric_op_wrapper'))
+      self.assertTrue(
+          isinstance(outputter.metrics['metrics_2/update_op'], ops.Tensor))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/saved_model/model_utils/export_test.py b/tensorflow/python/saved_model/model_utils/export_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..776bfff886aeba5d6fc08e14329be39ade8d6061
--- /dev/null
+++ b/tensorflow/python/saved_model/model_utils/export_test.py
@@ -0,0 +1,257 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for export utils."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import tempfile
+import time
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+from tensorflow.python.saved_model import signature_constants
+from tensorflow.python.saved_model import signature_def_utils
+from tensorflow.python.saved_model.model_utils import export_output
+from tensorflow.python.saved_model.model_utils import export_utils
+
+
+class LabeledTensorMock(object):
+  """Mock class emulating LabeledTensor."""
+
+  def __init__(self):
+    self.tensor = constant_op.constant([1])
+
+
+def _convert_labeled_tensor_mock_to_tensor(value, *args, **kwargs):
+  return ops.internal_convert_to_tensor(value.tensor, *args, **kwargs)
+
+
+ops.register_tensor_conversion_function(LabeledTensorMock,
+                                        _convert_labeled_tensor_mock_to_tensor)
+
+
+class ExportTest(test_util.TensorFlowTestCase):
+
+  def test_build_all_signature_defs_without_receiver_alternatives(self):
+    with context.graph_mode():
+      receiver_tensor = array_ops.placeholder(dtypes.string)
+      output_1 = constant_op.constant([1.])
+      output_2 = constant_op.constant(["2"])
+      output_3 = constant_op.constant(["3"])
+      export_outputs = {
+          signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
+              export_output.RegressionOutput(value=output_1),
+          "head-2": export_output.ClassificationOutput(classes=output_2),
+          "head-3": export_output.PredictOutput(outputs={
+              "some_output_3": output_3
+          }),
+      }
+
+      signature_defs = export_utils.build_all_signature_defs(
+          receiver_tensor, export_outputs)
+
+      expected_signature_defs = {
+          "serving_default":
+              signature_def_utils.regression_signature_def(receiver_tensor,
+                                                           output_1),
+          "head-2":
+              signature_def_utils.classification_signature_def(receiver_tensor,
+                                                               output_2, None),
+          "head-3":
+              signature_def_utils.predict_signature_def({
+                  "input": receiver_tensor
+              }, {"some_output_3": output_3})
+      }
+
+      self.assertDictEqual(expected_signature_defs, signature_defs)
+
+  def test_build_all_signature_defs_with_dict_alternatives(self):
+    with context.graph_mode():
+      receiver_tensor = array_ops.placeholder(dtypes.string)
+      receiver_tensors_alternative_1 = {
+          "foo": array_ops.placeholder(dtypes.int64),
+          "bar": array_ops.sparse_placeholder(dtypes.float32)}
+      receiver_tensors_alternatives = {"other": receiver_tensors_alternative_1}
+      output_1 = constant_op.constant([1.])
+      output_2 = constant_op.constant(["2"])
+      output_3 = constant_op.constant(["3"])
+      export_outputs = {
+          signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
+              export_output.RegressionOutput(value=output_1),
+          "head-2": export_output.ClassificationOutput(classes=output_2),
+          "head-3": export_output.PredictOutput(outputs={
+              "some_output_3": output_3
+          }),
+      }
+
+      signature_defs = export_utils.build_all_signature_defs(
+          receiver_tensor, export_outputs, receiver_tensors_alternatives)
+
+      expected_signature_defs = {
+          "serving_default":
+              signature_def_utils.regression_signature_def(
+                  receiver_tensor,
+                  output_1),
+          "head-2":
+              signature_def_utils.classification_signature_def(
+                  receiver_tensor,
+                  output_2, None),
+          "head-3":
+              signature_def_utils.predict_signature_def(
+                  {"input": receiver_tensor},
+                  {"some_output_3": output_3}),
+          "other:head-3":
+              signature_def_utils.predict_signature_def(
+                  receiver_tensors_alternative_1,
+                  {"some_output_3": output_3})
+
+          # Note that the alternatives 'other:serving_default' and
+          # 'other:head-2' are invalid, because regession and classification
+          # signatures must take a single string input.  Here we verify that
+          # these invalid signatures are not included in the export_utils.
+      }
+
+      self.assertDictEqual(expected_signature_defs, signature_defs)
+
+  def test_build_all_signature_defs_with_single_alternatives(self):
+    with context.graph_mode():
+      receiver_tensor = array_ops.placeholder(dtypes.string)
+      receiver_tensors_alternative_1 = array_ops.placeholder(dtypes.int64)
+      receiver_tensors_alternative_2 = array_ops.sparse_placeholder(
+          dtypes.float32)
+      # Note we are passing single Tensors as values of
+      # receiver_tensors_alternatives, where normally that is a dict.
+      # In this case a dict will be created using the default receiver tensor
+      # name "input".
+      receiver_tensors_alternatives = {"other1": receiver_tensors_alternative_1,
+                                       "other2": receiver_tensors_alternative_2}
+      output_1 = constant_op.constant([1.])
+      output_2 = constant_op.constant(["2"])
+      output_3 = constant_op.constant(["3"])
+      export_outputs = {
+          signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
+              export_output.RegressionOutput(value=output_1),
+          "head-2": export_output.ClassificationOutput(classes=output_2),
+          "head-3": export_output.PredictOutput(outputs={
+              "some_output_3": output_3
+          }),
+      }
+
+    signature_defs = export_utils.build_all_signature_defs(
+        receiver_tensor, export_outputs, receiver_tensors_alternatives)
+
+    expected_signature_defs = {
+        "serving_default":
+            signature_def_utils.regression_signature_def(
+                receiver_tensor,
+                output_1),
+        "head-2":
+            signature_def_utils.classification_signature_def(
+                receiver_tensor,
+                output_2, None),
+        "head-3":
+            signature_def_utils.predict_signature_def(
+                {"input": receiver_tensor},
+                {"some_output_3": output_3}),
+        "other1:head-3":
+            signature_def_utils.predict_signature_def(
+                {"input": receiver_tensors_alternative_1},
+                {"some_output_3": output_3}),
+        "other2:head-3":
+            signature_def_utils.predict_signature_def(
+                {"input": receiver_tensors_alternative_2},
+                {"some_output_3": output_3})
+
+        # Note that the alternatives 'other:serving_default' and 'other:head-2'
+        # are invalid, because regession and classification signatures must take
+        # a single string input.  Here we verify that these invalid signatures
+        # are not included in the export_utils.
+    }
+
+    self.assertDictEqual(expected_signature_defs, signature_defs)
+
+  def test_build_all_signature_defs_export_outputs_required(self):
+    receiver_tensor = constant_op.constant(["11"])
+
+    with self.assertRaises(ValueError) as e:
+      export_utils.build_all_signature_defs(receiver_tensor, None)
+
+    self.assertTrue(str(e.exception).startswith(
+        "export_outputs must be a dict"))
+
+  def test_get_timestamped_export_dir(self):
+    export_dir_base = tempfile.mkdtemp() + "export/"
+    export_dir_1 = export_utils.get_timestamped_export_dir(
+        export_dir_base)
+    time.sleep(2)
+    export_dir_2 = export_utils.get_timestamped_export_dir(
+        export_dir_base)
+    time.sleep(2)
+    export_dir_3 = export_utils.get_timestamped_export_dir(
+        export_dir_base)
+
+    # Export directories should be named using a timestamp that is seconds
+    # since epoch.  Such a timestamp is 10 digits long.
+    time_1 = os.path.basename(export_dir_1)
+    self.assertEqual(10, len(time_1))
+    time_2 = os.path.basename(export_dir_2)
+    self.assertEqual(10, len(time_2))
+    time_3 = os.path.basename(export_dir_3)
+    self.assertEqual(10, len(time_3))
+
+    self.assertTrue(int(time_1) < int(time_2))
+    self.assertTrue(int(time_2) < int(time_3))
+
+  def test_build_all_signature_defs_serving_only(self):
+    with context.graph_mode():
+      receiver_tensor = {"input": array_ops.placeholder(dtypes.string)}
+      output_1 = constant_op.constant([1.])
+      export_outputs = {
+          signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
+              export_output.PredictOutput(outputs=output_1),
+          "train": export_output.TrainOutput(loss=output_1),
+      }
+
+      signature_defs = export_utils.build_all_signature_defs(
+          receiver_tensor, export_outputs)
+
+      expected_signature_defs = {
+          "serving_default": signature_def_utils.predict_signature_def(
+              receiver_tensor, {"output": output_1})
+      }
+
+      self.assertDictEqual(expected_signature_defs, signature_defs)
+
+      signature_defs = export_utils.build_all_signature_defs(
+          receiver_tensor, export_outputs, serving_only=False)
+
+      expected_signature_defs.update({
+          "train": signature_def_utils.supervised_train_signature_def(
+              receiver_tensor, loss={"loss": output_1})
+      })
+
+      self.assertDictEqual(expected_signature_defs, signature_defs)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/saved_model/model_utils/export_utils.py b/tensorflow/python/saved_model/model_utils/export_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f8933758d92199ca1cbdd4a6f046a90e5a97f21
--- /dev/null
+++ b/tensorflow/python/saved_model/model_utils/export_utils.py
@@ -0,0 +1,340 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for creating SavedModels."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import os
+import time
+
+import six
+
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.saved_model import signature_constants
+from tensorflow.python.saved_model import signature_def_utils
+from tensorflow.python.saved_model import tag_constants
+from tensorflow.python.saved_model.model_utils import export_output as export_output_lib
+from tensorflow.python.training import mode_keys
+from tensorflow.python.util import compat
+
+
+# Mapping of the modes to appropriate MetaGraph tags in the SavedModel.
+EXPORT_TAG_MAP = {
+    mode_keys.ModeKeys.PREDICT: [tag_constants.SERVING],
+    mode_keys.ModeKeys.TRAIN: [tag_constants.TRAINING],
+    mode_keys.ModeKeys.TEST: [tag_constants.EVAL],
+}
+
+
+_SINGLE_FEATURE_DEFAULT_NAME = 'feature'
+_SINGLE_RECEIVER_DEFAULT_NAME = 'input'
+_SINGLE_LABEL_DEFAULT_NAME = 'label'
+
+### Below utilities are specific to SavedModel exports.
+
+
+def build_all_signature_defs(receiver_tensors,
+                             export_outputs,
+                             receiver_tensors_alternatives=None,
+                             serving_only=True):
+  """Build `SignatureDef`s for all export outputs.
+
+  Args:
+    receiver_tensors: a `Tensor`, or a dict of string to `Tensor`, specifying
+      input nodes where this receiver expects to be fed by default.  Typically,
+      this is a single placeholder expecting serialized `tf.Example` protos.
+    export_outputs: a dict of ExportOutput instances, each of which has
+      an as_signature_def instance method that will be called to retrieve
+      the signature_def for all export output tensors.
+    receiver_tensors_alternatives: a dict of string to additional
+      groups of receiver tensors, each of which may be a `Tensor` or a dict of
+      string to `Tensor`.  These named receiver tensor alternatives generate
+      additional serving signatures, which may be used to feed inputs at
+      different points within the input receiver subgraph.  A typical usage is
+      to allow feeding raw feature `Tensor`s *downstream* of the
+      tf.parse_example() op.  Defaults to None.
+    serving_only: boolean; if true, resulting signature defs will only include
+      valid serving signatures. If false, all requested signatures will be
+      returned.
+
+  Returns:
+    signature_def representing all passed args.
+
+  Raises:
+    ValueError: if export_outputs is not a dict
+  """
+  if not isinstance(receiver_tensors, dict):
+    receiver_tensors = {_SINGLE_RECEIVER_DEFAULT_NAME: receiver_tensors}
+  if export_outputs is None or not isinstance(export_outputs, dict):
+    raise ValueError('export_outputs must be a dict and not'
+                     '{}'.format(type(export_outputs)))
+
+  signature_def_map = {}
+  excluded_signatures = {}
+  for output_key, export_output in export_outputs.items():
+    signature_name = '{}'.format(output_key or 'None')
+    try:
+      signature = export_output.as_signature_def(receiver_tensors)
+      signature_def_map[signature_name] = signature
+    except ValueError as e:
+      excluded_signatures[signature_name] = str(e)
+
+  if receiver_tensors_alternatives:
+    for receiver_name, receiver_tensors_alt in (
+        six.iteritems(receiver_tensors_alternatives)):
+      if not isinstance(receiver_tensors_alt, dict):
+        receiver_tensors_alt = {
+            _SINGLE_RECEIVER_DEFAULT_NAME: receiver_tensors_alt
+        }
+      for output_key, export_output in export_outputs.items():
+        signature_name = '{}:{}'.format(receiver_name or 'None', output_key or
+                                        'None')
+        try:
+          signature = export_output.as_signature_def(receiver_tensors_alt)
+          signature_def_map[signature_name] = signature
+        except ValueError as e:
+          excluded_signatures[signature_name] = str(e)
+
+  _log_signature_report(signature_def_map, excluded_signatures)
+
+  # The above calls to export_output_lib.as_signature_def should return only
+  # valid signatures; if there is a validity problem, they raise a ValueError,
+  # in which case we exclude that signature from signature_def_map above.
+  # The is_valid_signature check ensures that the signatures produced are
+  # valid for serving, and acts as an additional sanity check for export
+  # signatures produced for serving. We skip this check for training and eval
+  # signatures, which are not intended for serving.
+  if serving_only:
+    signature_def_map = {
+        k: v
+        for k, v in signature_def_map.items()
+        if signature_def_utils.is_valid_signature(v)
+    }
+  return signature_def_map
+
+
+_FRIENDLY_METHOD_NAMES = {
+    signature_constants.CLASSIFY_METHOD_NAME: 'Classify',
+    signature_constants.REGRESS_METHOD_NAME: 'Regress',
+    signature_constants.PREDICT_METHOD_NAME: 'Predict',
+    signature_constants.SUPERVISED_TRAIN_METHOD_NAME: 'Train',
+    signature_constants.SUPERVISED_EVAL_METHOD_NAME: 'Eval',
+}
+
+
+def _log_signature_report(signature_def_map, excluded_signatures):
+  """Log a report of which signatures were produced."""
+  sig_names_by_method_name = collections.defaultdict(list)
+
+  # We'll collect whatever method_names are present, but also we want to make
+  # sure to output a line for each of the three standard methods even if they
+  # have no signatures.
+  for method_name in _FRIENDLY_METHOD_NAMES:
+    sig_names_by_method_name[method_name] = []
+
+  for signature_name, sig in signature_def_map.items():
+    sig_names_by_method_name[sig.method_name].append(signature_name)
+
+  # TODO(b/67733540): consider printing the full signatures, not just names
+  for method_name, sig_names in sig_names_by_method_name.items():
+    if method_name in _FRIENDLY_METHOD_NAMES:
+      method_name = _FRIENDLY_METHOD_NAMES[method_name]
+    logging.info('Signatures INCLUDED in export for {}: {}'.format(
+        method_name, sig_names if sig_names else 'None'))
+
+  if excluded_signatures:
+    logging.info('Signatures EXCLUDED from export because they cannot be '
+                 'be served via TensorFlow Serving APIs:')
+    for signature_name, message in excluded_signatures.items():
+      logging.info('\'{}\' : {}'.format(signature_name, message))
+
+  if not signature_def_map:
+    logging.warn('Export includes no signatures!')
+  elif (signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY not in
+        signature_def_map):
+    logging.warn('Export includes no default signature!')
+
+
+# When we create a timestamped directory, there is a small chance that the
+# directory already exists because another process is also creating these
+# directories. In this case we just wait one second to get a new timestamp and
+# try again. If this fails several times in a row, then something is seriously
+# wrong.
+MAX_DIRECTORY_CREATION_ATTEMPTS = 10
+
+
+def get_timestamped_export_dir(export_dir_base):
+  """Builds a path to a new subdirectory within the base directory.
+
+  Each export is written into a new subdirectory named using the
+  current time.  This guarantees monotonically increasing version
+  numbers even across multiple runs of the pipeline.
+  The timestamp used is the number of seconds since epoch UTC.
+
+  Args:
+    export_dir_base: A string containing a directory to write the exported
+        graph and checkpoints.
+  Returns:
+    The full path of the new subdirectory (which is not actually created yet).
+
+  Raises:
+    RuntimeError: if repeated attempts fail to obtain a unique timestamped
+      directory name.
+  """
+  attempts = 0
+  while attempts < MAX_DIRECTORY_CREATION_ATTEMPTS:
+    timestamp = int(time.time())
+
+    result_dir = os.path.join(
+        compat.as_bytes(export_dir_base), compat.as_bytes(str(timestamp)))
+    if not gfile.Exists(result_dir):
+      # Collisions are still possible (though extremely unlikely): this
+      # directory is not actually created yet, but it will be almost
+      # instantly on return from this function.
+      return result_dir
+    time.sleep(1)
+    attempts += 1
+    logging.warn('Directory {} already exists; retrying (attempt {}/{})'.format(
+        result_dir, attempts, MAX_DIRECTORY_CREATION_ATTEMPTS))
+  raise RuntimeError('Failed to obtain a unique export directory name after '
+                     '{} attempts.'.format(MAX_DIRECTORY_CREATION_ATTEMPTS))
+
+
+def get_temp_export_dir(timestamped_export_dir):
+  """Builds a directory name based on the argument but starting with 'temp-'.
+
+  This relies on the fact that TensorFlow Serving ignores subdirectories of
+  the base directory that can't be parsed as integers.
+
+  Args:
+    timestamped_export_dir: the name of the eventual export directory, e.g.
+      /foo/bar/<timestamp>
+
+  Returns:
+    A sister directory prefixed with 'temp-', e.g. /foo/bar/temp-<timestamp>.
+  """
+  (dirname, basename) = os.path.split(timestamped_export_dir)
+  temp_export_dir = os.path.join(
+      compat.as_bytes(dirname), compat.as_bytes('temp-{}'.format(basename)))
+  return temp_export_dir
+
+
+def export_outputs_for_mode(
+    mode, serving_export_outputs=None, predictions=None, loss=None,
+    metrics=None):
+  """Util function for constructing a `ExportOutput` dict given a mode.
+
+  The returned dict can be directly passed to `build_all_signature_defs` helper
+  function as the `export_outputs` argument, used for generating a SignatureDef
+  map.
+
+  Args:
+    mode: A `ModeKeys` specifying the mode.
+    serving_export_outputs: Describes the output signatures to be exported to
+      `SavedModel` and used during serving. Should be a dict or None.
+    predictions: A dict of Tensors or single Tensor representing model
+        predictions. This argument is only used if serving_export_outputs is not
+        set.
+    loss: A dict of Tensors or single Tensor representing calculated loss.
+    metrics: A dict of (metric_value, update_op) tuples, or a single tuple.
+      metric_value must be a Tensor, and update_op must be a Tensor or Op
+
+  Returns:
+    Dictionary mapping the a key to an `tf.estimator.export.ExportOutput` object
+    The key is the expected SignatureDef key for the mode.
+
+  Raises:
+    ValueError: if an appropriate ExportOutput cannot be found for the mode.
+  """
+  # TODO(b/113185250): move all model export helper functions into an util file.
+  if mode == mode_keys.ModeKeys.PREDICT:
+    return get_export_outputs(serving_export_outputs, predictions)
+  elif mode == mode_keys.ModeKeys.TRAIN:
+    return {mode: export_output_lib.TrainOutput(
+        loss=loss, predictions=predictions, metrics=metrics)}
+  elif mode == mode_keys.ModeKeys.TEST:
+    return {mode: export_output_lib.EvalOutput(
+        loss=loss, predictions=predictions, metrics=metrics)}
+  else:
+    raise ValueError(
+        'Export output type not found for mode: {}'.format(mode))
+
+
+def get_export_outputs(export_outputs, predictions):
+  """Validate export_outputs or create default export_outputs.
+
+  Args:
+    export_outputs: Describes the output signatures to be exported to
+      `SavedModel` and used during serving. Should be a dict or None.
+    predictions:  Predictions `Tensor` or dict of `Tensor`.
+
+  Returns:
+    Valid export_outputs dict
+
+  Raises:
+    TypeError: if export_outputs is not a dict or its values are not
+      ExportOutput instances.
+  """
+  if export_outputs is None:
+    default_output = export_output_lib.PredictOutput(predictions)
+    export_outputs = {
+        signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: default_output}
+
+  if not isinstance(export_outputs, dict):
+    raise TypeError('export_outputs must be dict, given: {}'.format(
+        export_outputs))
+  for v in six.itervalues(export_outputs):
+    if not isinstance(v, export_output_lib.ExportOutput):
+      raise TypeError(
+          'Values in export_outputs must be ExportOutput objects. '
+          'Given: {}'.format(export_outputs))
+
+  _maybe_add_default_serving_output(export_outputs)
+
+  return export_outputs
+
+
+def _maybe_add_default_serving_output(export_outputs):
+  """Add a default serving output to the export_outputs if not present.
+
+  Args:
+    export_outputs: Describes the output signatures to be exported to
+      `SavedModel` and used during serving. Should be a dict.
+
+  Returns:
+    export_outputs dict with default serving signature added if necessary
+
+  Raises:
+    ValueError: if multiple export_outputs were provided without a default
+      serving key.
+  """
+  if len(export_outputs) == 1:
+    (key, value), = export_outputs.items()
+    if key != signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
+      export_outputs[
+          signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY] = value
+  if len(export_outputs) > 1:
+    if (signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
+        not in export_outputs):
+      raise ValueError(
+          'Multiple export_outputs were provided, but none of them is '
+          'specified as the default.  Do this by naming one of them with '
+          'signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY.')
+
+  return export_outputs
diff --git a/tensorflow/python/saved_model/save.py b/tensorflow/python/saved_model/save.py
index d52251e49fb2a21b595fefcd805ef51b4bde200d..6c2d5e6f2bb840251ce8c4ba4b1df9186ab9fd71 100644
--- a/tensorflow/python/saved_model/save.py
+++ b/tensorflow/python/saved_model/save.py
@@ -19,51 +19,38 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
+import functools
 import os
 
+from tensorflow.core.protobuf import meta_graph_pb2
 from tensorflow.core.protobuf import saved_model_pb2
 from tensorflow.python.eager import context
 from tensorflow.python.eager import def_function
-from tensorflow.python.eager import function
+from tensorflow.python.eager import function as defun
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_spec
 from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.saved_model import builder_impl
 from tensorflow.python.saved_model import constants
+from tensorflow.python.saved_model import function_serialization
+from tensorflow.python.saved_model import saved_object_graph_pb2
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.saved_model import signature_def_utils
 from tensorflow.python.saved_model import tag_constants
 from tensorflow.python.saved_model import utils_impl
 from tensorflow.python.training.checkpointable import base
+from tensorflow.python.training.checkpointable import tracking
 from tensorflow.python.training.checkpointable import util
 from tensorflow.python.util import compat
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 
-
-def _check_for_functional_keras_model(root):
-  """Makes an export signature for `root` if it's a functional Keras Model."""
-  # If nothing is decorated yet but this is a functional Keras Model (duck
-  # typed), we'll try to make a signature ourselves.
-  try:
-    inputs = root.inputs
-    input_names = root.input_names
-  except AttributeError:
-    return None
-  input_signature = []
-  for input_tensor, input_name in zip(inputs, input_names):
-    input_signature.append(tensor_spec.TensorSpec(
-        shape=input_tensor.shape, dtype=input_tensor.dtype,
-        name=input_name))
-
-  @def_function.function(input_signature=input_signature)
-  def _wrapped_model(*args):
-    outputs_list = nest.flatten(root(inputs=list(args)))
-    return {name: output for name, output
-            in zip(root.output_names, outputs_list)}
-  return _wrapped_model
+DEFAULT_SIGNATURE_ATTR = "_default_save_signature"
 
 
 def _find_function_to_export(root):
@@ -85,7 +72,7 @@ def _find_function_to_export(root):
       exported_function = attribute_value
       previous_attribute_name = attribute_name
   if exported_function is None:
-    exported_function = _check_for_functional_keras_model(root)
+    exported_function = getattr(root, DEFAULT_SIGNATURE_ATTR, None)
   if exported_function is None:
     raise ValueError(
         ("Exporting an object with no tf.saved_model.save(..., signatures=...) "
@@ -105,7 +92,7 @@ def _canonicalize_signatures(signatures):
         signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: signatures}
   concrete_signatures = {}
   for serving_key, signature_function in signatures.items():
-    if isinstance(signature_function, (function.PolymorphicFunction,
+    if isinstance(signature_function, (defun.PolymorphicFunction,
                                        def_function.PolymorphicFunction)):
       input_signature = signature_function._input_signature  # pylint: disable=protected-access
       if input_signature is None:
@@ -116,7 +103,7 @@ def _canonicalize_signatures(signatures):
              "converted to concrete functions using "
              "`f.get_concrete_function(...)`.").format(signature_function))
       signature_function = signature_function.get_concrete_function()
-    elif not isinstance(signature_function, function.Function):
+    elif not isinstance(signature_function, defun.Function):
       raise ValueError(
           ("Expected a TensorFlow function to generate a signature for, but "
            "got {}. Python functions may be decorated with "
@@ -173,64 +160,65 @@ def _tensor_dict_to_tensorinfo(tensor_dict):
           for key, value in tensor_dict.items()}
 
 
-def _map_captured_resources_to_created_resources(
+def _map_captures_to_created_tensors(
     original_captures, resource_map):
-  """Maps eager resources captured by a function to Graph resources for export.
+  """Maps eager tensors captured by a function to Graph resources for export.
 
   Args:
-    original_captures: A dictionary mapping from resource tensors captured by
-      the function to interior placeholders for those resources (inside the
-      function body).
+    original_captures: A dictionary mapping from tensors captured by the
+      function to interior placeholders for those tensors (inside the function
+      body).
     resource_map: A dictionary mapping from resource tensors owned by the eager
       context to resource tensors in the exported graph.
 
   Returns:
-    A dictionary mapping from interior placeholders in the function body to
-    exterior stand-in resource tensors which belong to the exported graph.
+    A list of stand-in tensors which belong to the exported graph, corresponding
+    to the function's captures.
 
   Raises:
     AssertionError: If the function references a resource which is not part of
       `resource_map`.
   """
-  export_captures = {}
+  export_captures = []
   for exterior, interior in original_captures.items():
     mapped_resource = resource_map.get(exterior, None)
     if mapped_resource is None:
-      raise AssertionError(
-          ("Tried to export a function which references untracked stateful "
-           "object {}. Stateful TensorFlow objects (e.g. tf.Variable) must "
-           "be tracked by the main object. Objects may be tracked by "
-           "assigning them to an attribute of another tracked object, or to "
-           "an attribute of the main object directly.")
-          .format(interior))
-    export_captures[interior] = mapped_resource
+      if exterior.dtype == dtypes.resource:
+        raise AssertionError(
+            ("Tried to export a function which references untracked stateful "
+             "object {}. Stateful TensorFlow objects (e.g. tf.Variable) must "
+             "be tracked by the main object. Objects may be tracked by "
+             "assigning them to an attribute of another tracked object, or to "
+             "an attribute of the main object directly.")
+            .format(interior))
+      else:
+        # This is a captured Tensor, but it's not a resource. We'll just add it
+        # to the graph as a constant.
+        mapped_resource = constant_op.constant(exterior.numpy())
+    export_captures.append(mapped_resource)
   return export_captures
 
 
-def _map_function_inputs_to_created_inputs(
-    function_inputs, export_captures, signature_key, function_name):
-  """Creates exterior placeholders in the exported graph for function inputs.
+def _map_function_arguments_to_created_inputs(
+    function_arguments, signature_key, function_name):
+  """Creates exterior placeholders in the exported graph for function arguments.
 
   Functions have two types of inputs: tensors captured from the outside (eager)
   context, and arguments to the function which we expect to receive from the
-  user at each call. `_map_captured_resources_to_created_resources` replaces
+  user at each call. `_map_captures_to_created_tensors` replaces
   captured tensors with stand-ins (typically these are resource dtype tensors
   associated with variables). `_map_function_inputs_to_created_inputs` runs over
-  every input, either captured or argument. For captures, it uses the mapped
-  resource from `export_captures`. For arguments, it creates a new placeholder
-  which will belong to the exported graph rather than the function body.
+  every argument, creating a new placeholder for each which will belong to the
+  exported graph rather than the function body.
 
   Args:
-    function_inputs: A list of all placeholders in the function body.
-    export_captures: A dictionary mapping from interior placeholders in the
-      function body to exterior stand-in resource tensors which belong to the
-      exported graph (see `_map_captured_resources_to_created_resources`).
+    function_arguments: A list of argument placeholders in the function body.
     signature_key: The name of the signature being exported, for error messages.
     function_name: The name of the function, for error messages.
 
   Returns:
     A tuple of (mapped_inputs, exterior_placeholders)
-      mapped_inputs: A list with entries corresponding to `function_inputs`
+      mapped_inputs: A list with entries corresponding to `function_arguments`
         containing all of the inputs of the function gathered from the exported
         graph (both captured resources and arguments).
       exterior_argument_placeholders: A dictionary mapping from argument names
@@ -248,12 +236,7 @@ def _map_function_inputs_to_created_inputs(
   # MetaGraph.
   exterior_argument_placeholders = {}
   mapped_inputs = []
-  for placeholder in function_inputs:
-    mapped_resource_tensor = export_captures.get(placeholder, None)
-    if mapped_resource_tensor is not None:
-      # This is a captured resource.
-      mapped_inputs.append(mapped_resource_tensor)
-      continue
+  for placeholder in function_arguments:
     # `export_captures` contains an exhaustive set of captures, so if we don't
     # find the input there then we now know we have an argument.
     user_input_name = compat.as_str_any(
@@ -286,6 +269,20 @@ def _map_function_inputs_to_created_inputs(
   return mapped_inputs, exterior_argument_placeholders
 
 
+def _call_function_with_mapped_captures(function, args, resource_map):
+  """Calls `function` in the exported graph, using mapped resource captures."""
+  export_captures = _map_captures_to_created_tensors(
+      function.graph.captures, resource_map)
+  mapped_inputs = args + export_captures
+  # Calls the function quite directly, since we have new captured resource
+  # tensors we need to feed in which weren't part of the original function
+  # definition.
+  # pylint: disable=protected-access
+  outputs = function._build_call_outputs(
+      function._inference_function.call(context.context(), mapped_inputs))
+  return outputs
+
+
 def _generate_signatures(signature_functions, resource_map):
   """Validates and calls `signature_functions` in the default graph.
 
@@ -315,35 +312,81 @@ def _generate_signatures(signature_functions, resource_map):
     SignatureDefs as part of that MetaGraph.
   """
   signatures = {}
-  for signature_key, func in sorted(signature_functions.items()):
-    # Register the inference function for this signature in the exported
-    # graph. There is no direct use for the gradient of this function, so we
-    # don't generate/register a gradient function here (but may end up with one
-    # if another function relies on it). Users can still take symbolic gradients
-    # of the function on import, the gradient just won't be in the saved
-    # graph. When exporting a signature which already computes gradients, this
-    # stops us from taking needless second-order gradients.
-    func.add_to_graph(register_gradient_functions=False)
-    export_captures = _map_captured_resources_to_created_resources(
-        func.graph.captures, resource_map)
+  for signature_key, function in sorted(signature_functions.items()):
+    if function.graph.captures:
+      argument_inputs = function.graph.inputs[:-len(function.graph.captures)]
+    else:
+      argument_inputs = function.graph.inputs
     mapped_inputs, exterior_argument_placeholders = (
-        _map_function_inputs_to_created_inputs(
-            func.inputs, export_captures, signature_key, func.name))
-    # Calls the function quite directly, since we have new captured resource
-    # tensors we need to feed in which weren't part of the original function
-    # definition.
-    # pylint: disable=protected-access
+        _map_function_arguments_to_created_inputs(
+            argument_inputs, signature_key, function.name))
     outputs = _normalize_outputs(
-        func._build_call_outputs(
-            func._inference_function.call(context.context(), mapped_inputs)),
-        func.name, signature_key)
-    # pylint: enable=protected-access
+        _call_function_with_mapped_captures(
+            function, mapped_inputs, resource_map),
+        function.name, signature_key)
     signatures[signature_key] = signature_def_utils.build_signature_def(
         _tensor_dict_to_tensorinfo(exterior_argument_placeholders),
         _tensor_dict_to_tensorinfo(outputs))
   return signatures
 
 
+def _trace_resource_initializers(accessible_objects):
+  """Create concrete functions from `TrackableResource` objects."""
+  resource_initializers = []
+
+  def _wrap_initializer(obj):
+    obj.initialize()
+    return constant_op.constant(1.)  # Dummy control output
+
+  for obj in accessible_objects:
+    if isinstance(obj, tracking.TrackableResource):
+      resource_initializers.append(def_function.function(
+          functools.partial(_wrap_initializer, obj),
+          # All inputs are captures.
+          input_signature=[]).get_concrete_function())
+  return resource_initializers
+
+
+_AssetInfo = collections.namedtuple(
+    "_AssetInfo", [
+        # List of AssetFileDef protocol buffers
+        "asset_defs",
+        # Map from asset variable resource Tensors to their init ops
+        "asset_initializers_by_resource",
+        # Map from base asset filenames to full paths
+        "asset_filename_map",
+        # Map from TrackableAsset to index of corresponding AssetFileDef
+        "asset_index"])
+
+
+def _process_asset(trackable_asset, asset_info, resource_map):
+  """Add `trackable_asset` to `asset_info` and `resource_map`."""
+  original_variable = trackable_asset.asset_path
+  with context.eager_mode():
+    original_path = original_variable.numpy()
+  path = builder_impl.get_asset_filename_to_add(
+      asset_filepath=original_path,
+      asset_filename_map=asset_info.asset_filename_map)
+  # TODO(andresp): Instead of mapping 1-1 between trackable asset
+  # and asset in the graph def consider deduping the assets that
+  # point to the same file.
+  asset_path_initializer = array_ops.placeholder(
+      shape=original_variable.shape,
+      dtype=dtypes.string,
+      name="asset_path_initializer")
+  asset_variable = resource_variable_ops.ResourceVariable(
+      asset_path_initializer)
+  asset_info.asset_filename_map[path] = original_path
+  asset_def = meta_graph_pb2.AssetFileDef()
+  asset_def.filename = path
+  asset_def.tensor_info.name = asset_path_initializer.name
+  asset_info.asset_defs.append(asset_def)
+  asset_info.asset_initializers_by_resource[original_variable.handle] = (
+      asset_variable.initializer)
+  asset_info.asset_index[trackable_asset] = len(asset_info.asset_defs) - 1
+  resource_map[original_variable.handle] = asset_variable.handle
+
+
 def _map_resources(accessible_objects):
   """Makes new resource handle ops corresponding to existing resource tensors.
 
@@ -357,47 +400,153 @@ def _map_resources(accessible_objects):
       to create replacements for.
 
   Returns:
-    A tuple of (object_map, resource_map):
+    A tuple of (object_map, resource_map, asset_info):
       object_map: A dictionary mapping from object in `accessible_objects` to
         replacement objects created to hold the new resource tensors.
       resource_map: A dictionary mapping from resource tensors extracted from
         `accessible_objects` to newly created resource tensors.
+      asset_info: An _AssetInfo tuple describing external assets referenced from
+        accessible_objects.
   """
-  # TODO(allenl, rohanj): Map generic resources rather than just variables.
   # TODO(allenl): Handle MirroredVariables and other types of variables which
   # may need special casing.
   object_map = {}
   resource_map = {}
+  asset_info = _AssetInfo(
+      asset_defs=[],
+      asset_initializers_by_resource={},
+      asset_filename_map={},
+      asset_index={})
   for obj in accessible_objects:
-    if resource_variable_ops.is_resource_variable(obj):
+    if isinstance(obj, tracking.TrackableResource):
+      new_resource = obj.create_resource()
+      resource_map[obj.resource_handle] = new_resource
+    elif resource_variable_ops.is_resource_variable(obj):
       new_variable = resource_variable_ops.copy_to_graph_uninitialized(obj)
       object_map[obj] = new_variable
       resource_map[obj.handle] = new_variable.handle
-  return object_map, resource_map
+    elif isinstance(obj, tracking.TrackableAsset):
+      _process_asset(obj, asset_info, resource_map)
+  return object_map, resource_map, asset_info
 
 
-def _make_graph_def(root, signature_functions, object_saver):
-  """Generates and exports call ops for `signature_functions`."""
+def _fill_meta_graph_def(meta_graph_def, obj, signature_functions,
+                         object_saver):
+  """Generates a MetaGraph which calls `signature_functions`.
+
+  Args:
+    meta_graph_def: The MetaGraphDef proto to fill.
+    obj: The checkpointable object being exported.
+    signature_functions: A dictionary mapping signature keys to concrete
+      functions containing signatures to add to the MetaGraph.
+    object_saver: A CheckpointableSaver to add to the MetaGraph.
+
+  Returns:
+    An _AssetInfo, which contains information to help creating the SavedModel.
+  """
   signatures = {}
   # List objects from the eager context to make sure Optimizers give us the
   # right Graph-dependent variables.
-  accessible_objects = util.list_objects(root)
+  accessible_objects = util.list_objects(obj)
+  resource_initializer_functions = _trace_resource_initializers(
+      accessible_objects)
   exported_graph = ops.Graph()
+  resource_initializer_ops = []
   with exported_graph.as_default():
-    object_map, resource_map = _map_resources(accessible_objects)
+    object_map, resource_map, asset_info = _map_resources(accessible_objects)
+    for resource_initializer_function in resource_initializer_functions:
+      asset_dependencies = []
+      for capture in resource_initializer_function.graph.external_captures:
+        asset_initializer = asset_info.asset_initializers_by_resource.get(
+            capture, None)
+        if asset_initializer is not None:
+          asset_dependencies.append(asset_initializer)
+      with ops.control_dependencies(asset_dependencies):
+        resource_initializer_ops.append(
+            _call_function_with_mapped_captures(
+                resource_initializer_function, [], resource_map))
+    with ops.control_dependencies(resource_initializer_ops):
+      init_op = control_flow_ops.no_op()
+    # Add the same op to the main_op collection and to the init_op
+    # signature. The collection is for compatibility with older loader APIs;
+    # only one will be executed.
+    meta_graph_def.collection_def[constants.MAIN_OP_KEY].node_list.value.append(
+        init_op.name)
+    meta_graph_def.signature_def[constants.INIT_OP_SIGNATURE_KEY].CopyFrom(
+        signature_def_utils.op_signature_def(
+            init_op, constants.INIT_OP_SIGNATURE_KEY))
+
   # Saving an object-based checkpoint again gathers variables. We need to do the
   # gathering from the eager context so Optimizers save the right set of
   # variables, but want any operations associated with the save/restore to be in
   # the exported graph (thus the `to_graph` argument).
   saver = object_saver.freeze(object_map=object_map, to_graph=exported_graph)
+
+  # We must resolve the concrete function to add to MetaGraph while in eager
+  # mode.
+  concrete_functions = []
+  for accessible_object in accessible_objects:
+    for function in function_serialization.list_all_polymorphic_functions(
+        accessible_object).values():
+      concrete_functions.extend(
+          function_serialization.list_all_concrete_functions(function))
+
   with exported_graph.as_default():
     signatures = _generate_signatures(signature_functions, resource_map)
+    for concrete_function in concrete_functions:
+      concrete_function.add_to_graph()
     saver_def = saver.to_proto()
+    meta_graph_def.saver_def.CopyFrom(saver_def)
   graph_def = exported_graph.as_graph_def(add_shapes=True)
   # Clean reference cycles so repeated export()s don't make work for the garbage
   # collector.
   ops.dismantle_graph(exported_graph)
-  return graph_def, signatures, saver_def
+
+  meta_graph_def.graph_def.CopyFrom(graph_def)
+  meta_graph_def.meta_info_def.tags.append(tag_constants.SERVING)
+  meta_graph_def.asset_file_def.extend(asset_info.asset_defs)
+  for signature_key, signature in signatures.items():
+    meta_graph_def.signature_def[signature_key].CopyFrom(signature)
+  meta_graph.strip_graph_default_valued_attrs(meta_graph_def)
+  return asset_info
+
+
+def _write_object_graph(root, export_dir, asset_file_def_index):
+  """Save a SavedObjectGraph proto for `root`."""
+  # SavedObjectGraph is similar to the CheckpointableObjectGraph proto in the
+  # checkpoint. It will eventually go into the SavedModel.
+  proto = saved_object_graph_pb2.SavedObjectGraph()
+
+  checkpointable_objects, node_ids, slot_variables = util.find_objects(root)
+  util.fill_object_graph_proto(checkpointable_objects, node_ids, slot_variables,
+                               proto)
+
+  for obj, obj_proto in zip(checkpointable_objects, proto.nodes):
+    _write_object_proto(obj, obj_proto, asset_file_def_index)
+
+  function_serialization.add_polymorphic_functions_to_object_graph_proto(
+      checkpointable_objects, proto)
+
+  extra_asset_dir = os.path.join(
+      compat.as_bytes(export_dir),
+      compat.as_bytes(constants.EXTRA_ASSETS_DIRECTORY))
+  file_io.recursive_create_dir(extra_asset_dir)
+  object_graph_filename = os.path.join(
+      extra_asset_dir, compat.as_bytes("object_graph.pb"))
+  file_io.write_string_to_file(object_graph_filename, proto.SerializeToString())
+
+
+def _write_object_proto(obj, proto, asset_file_def_index):
+  """Saves an object into SavedObject proto."""
+  if isinstance(obj, tracking.TrackableAsset):
+    proto.asset.SetInParent()
+    proto.asset.asset_file_def_index = asset_file_def_index[obj]
+  elif resource_variable_ops.is_resource_variable(obj):
+    proto.variable.SetInParent()
+    proto.variable.dtype = obj.dtype.as_datatype_enum
+    proto.variable.shape.CopyFrom(obj.shape.as_proto())
+  else:
+    proto.user_object.SetInParent()
 
 
 @tf_export("saved_model.save", v1=["saved_model.experimental.save"])
@@ -547,7 +696,25 @@ def save(obj, export_dir, signatures=None):
 
   Raises:
     ValueError: If `obj` is not checkpointable.
+
+  @compatibility(eager)
+  Not supported when graph building. From TensorFlow 1.x,
+  `tf.enable_eager_execution()` must run first. May not be called from within a
+  function body.
+  @end_compatibility
   """
+  if not context.executing_eagerly():
+    with ops.init_scope():
+      if context.executing_eagerly():
+        raise AssertionError(
+            "tf.saved_model.save is not supported inside a traced "
+            "@tf.function. Move the call to the outer eagerly-executed "
+            "context.")
+      else:
+        raise AssertionError(
+            "tf.saved_model.save is not supported when graph building. "
+            "tf.enable_eager_execution() must run first when calling it from "
+            "TensorFlow 1.x.")
   # pylint: enable=line-too-long
   if not isinstance(obj, base.CheckpointableBase):
     raise ValueError(
@@ -556,27 +723,27 @@ def save(obj, export_dir, signatures=None):
     # Note that we run this before saving the checkpoint, since looping over
     # attributes may have the side effect of creating variables in some cases.
     signatures = _find_function_to_export(obj)
-  object_saver = util.CheckpointableSaver(obj)
-  utils_impl.get_or_create_variables_dir(export_dir)
-  object_saver.save(utils_impl.get_variables_path(export_dir))
 
   signatures = _canonicalize_signatures(signatures)
-  graph_def, signatures, saver_def = _make_graph_def(
-      obj, signatures, object_saver)
-  saved_model = saved_model_pb2.SavedModel()
-  saved_model.saved_model_schema_version = (
-      constants.SAVED_MODEL_SCHEMA_VERSION)
-  meta_graph_def = saved_model.meta_graphs.add()
-  meta_graph_def.meta_info_def.tags.append(tag_constants.SERVING)
-  meta_graph_def.saver_def.CopyFrom(saver_def)
   # TODO(allenl): Factor out some subset of SavedModelBuilder which is 2.x
   # compatible (no sessions) and share it with this export API rather than
   # making a SavedModel proto and writing it directly.
-  meta_graph_def.graph_def.MergeFrom(graph_def)
-  for signature_key, signature in signatures.items():
-    meta_graph_def.signature_def[signature_key].MergeFrom(signature)
-  meta_graph.strip_graph_default_valued_attrs(meta_graph_def)
+  saved_model = saved_model_pb2.SavedModel()
+  meta_graph_def = saved_model.meta_graphs.add()
+  object_saver = util.CheckpointableSaver(obj)
+  asset_info = _fill_meta_graph_def(
+      meta_graph_def, obj, signatures, object_saver)
+  saved_model.saved_model_schema_version = (
+      constants.SAVED_MODEL_SCHEMA_VERSION)
+  # So far we've just been generating protocol buffers with no I/O. Now we write
+  # the checkpoint, copy assets into the assets directory, and write out the
+  # SavedModel proto itself.
+  utils_impl.get_or_create_variables_dir(export_dir)
+  object_saver.save(utils_impl.get_variables_path(export_dir))
+  builder_impl.copy_assets_to_destination_dir(asset_info.asset_filename_map,
+                                              export_dir)
   path = os.path.join(
       compat.as_bytes(export_dir),
       compat.as_bytes(constants.SAVED_MODEL_FILENAME_PB))
   file_io.write_string_to_file(path, saved_model.SerializeToString())
+  _write_object_graph(obj, export_dir, asset_info.asset_index)
diff --git a/tensorflow/python/saved_model/save_test.py b/tensorflow/python/saved_model/save_test.py
index ef0a92fddd9800bb8ad71d507dec61d4204b1cd4..5381c2f031385043ff69f00fc673ea2d0d69b31a 100644
--- a/tensorflow/python/saved_model/save_test.py
+++ b/tensorflow/python/saved_model/save_test.py
@@ -21,8 +21,7 @@ from __future__ import print_function
 import os
 import sys
 
-import numpy
-
+from tensorflow.python.client import session as session_lib
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import def_function
 from tensorflow.python.eager import test
@@ -31,11 +30,9 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_spec
 from tensorflow.python.framework import test_util
-from tensorflow.python.keras.engine import input_layer
-from tensorflow.python.keras.engine import training
 from tensorflow.python.keras.layers import core
-from tensorflow.python.keras.layers import merge
-from tensorflow.python.ops import array_ops
+from tensorflow.python.lib.io import file_io
+from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.saved_model import loader
@@ -47,10 +44,9 @@ from tensorflow.python.training.checkpointable import tracking
 from tensorflow.python.training.checkpointable import util
 
 
-class _ModelWithOptimizer(training.Model):
+class _ModelWithOptimizer(util.Checkpoint):
 
   def __init__(self):
-    super(_ModelWithOptimizer, self).__init__()
     self.dense = core.Dense(1)
     self.optimizer = adam.AdamOptimizer(0.01)
 
@@ -60,32 +56,33 @@ class _ModelWithOptimizer(training.Model):
   def call(self, x, y):
     with backprop.GradientTape() as tape:
       loss = math_ops.reduce_mean((self.dense(x) - y) ** 2.)
-    trainable_variables = self.trainable_variables
+    trainable_variables = self.dense.trainable_variables
     gradients = tape.gradient(loss, trainable_variables)
     self.optimizer.apply_gradients(zip(gradients, trainable_variables))
     return {"loss": loss}
 
 
-class SaveTest(test.TestCase):
+def _import_and_infer(
+    save_dir, inputs,
+    signature_key=signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY):
+  """Import a SavedModel into a TF 1.x-style graph and run `signature_key`."""
+  graph = ops.Graph()
+  with graph.as_default(), session_lib.Session() as session:
+    model = loader.load(session, [tag_constants.SERVING], save_dir)
+    signature = model.signature_def[signature_key]
+    assert set(inputs.keys()) == set(signature.inputs.keys())
+    feed_dict = {}
+    for arg_name in inputs.keys():
+      feed_dict[graph.get_tensor_by_name(signature.inputs[arg_name].name)] = (
+          inputs[arg_name])
+    output_dict = {}
+    for output_name, output_tensor_info in signature.outputs.items():
+      output_dict[output_name] = graph.get_tensor_by_name(
+          output_tensor_info.name)
+    return session.run(output_dict, feed_dict=feed_dict)
 
-  def _import_and_infer(
-      self, save_dir, inputs,
-      signature_key=signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY):
-    """Import a SavedModel into a TF 1.x-style graph and run `signature_key`."""
-    graph = ops.Graph()
-    with graph.as_default(), self.session(graph) as session:
-      model = loader.load(session, [tag_constants.SERVING], save_dir)
-      signature = model.signature_def[signature_key]
-      self.assertEqual(set(inputs.keys()), set(signature.inputs.keys()))
-      feed_dict = {}
-      for arg_name in inputs.keys():
-        feed_dict[graph.get_tensor_by_name(signature.inputs[arg_name].name)] = (
-            inputs[arg_name])
-      output_dict = {}
-      for output_name, output_tensor_info in signature.outputs.items():
-        output_dict[output_name] = graph.get_tensor_by_name(
-            output_tensor_info.name)
-      return session.run(output_dict, feed_dict=feed_dict)
+
+class SaveTest(test.TestCase):
 
   def test_method_save_signature(self):
     root = tracking.Checkpointable()
@@ -97,7 +94,7 @@ class SaveTest(test.TestCase):
     save.save(root, save_dir, root.f)
     self.assertEqual(
         {"output_0": 2.},
-        self._import_and_infer(save_dir, {"x": 1.}))
+        _import_and_infer(save_dir, {"x": 1.}))
 
   def test_method_save_concrete(self):
     root = tracking.Checkpointable()
@@ -112,7 +109,7 @@ class SaveTest(test.TestCase):
             tensor_spec.TensorSpec(None, dtypes.float32))})
     self.assertEqual(
         {"out": 2.},
-        self._import_and_infer(
+        _import_and_infer(
             save_dir, {"z": 1.}, signature_key="non_default_key"))
 
   def test_non_concrete_error(self):
@@ -148,9 +145,9 @@ class SaveTest(test.TestCase):
       save.save(root, save_dir, to_save)
 
   def test_nested_dict_outputs(self):
-    root = tracking.Checkpointable()
-    root.f = def_function.function(
-        lambda x: {"a": 2. * x, "b": (3. * x, 4. * x)})
+    root = util.Checkpoint(
+        f=def_function.function(
+            lambda x: {"a": 2. * x, "b": (3. * x, 4. * x)}))
     root.f(constant_op.constant(1.))
     to_save = root.f.get_concrete_function(constant_op.constant(1.))
     save_dir = os.path.join(self.get_temp_dir(), "saved_model")
@@ -169,20 +166,20 @@ class SaveTest(test.TestCase):
     save_dir = os.path.join(self.get_temp_dir(), "saved_model")
     save.save(root, save_dir, to_save)
     self.assertAllEqual({"output_0": 12.},
-                        self._import_and_infer(save_dir, {"x": 2.}))
+                        _import_and_infer(save_dir, {"x": 2.}))
 
   def test_optimizer(self):
     x = constant_op.constant([[3., 4.]])
     y = constant_op.constant([2.])
     model = _ModelWithOptimizer()
-    first_loss = model(x, y)
+    first_loss = model.call(x, y)
     save_dir = os.path.join(self.get_temp_dir(), "saved_model")
     save.save(model, save_dir, model.call)
-    second_loss = model(x, y)
+    second_loss = model.call(x, y)
     self.assertNotEqual(first_loss, second_loss)
     self.assertAllClose(
         second_loss,
-        self._import_and_infer(save_dir, {"x": [[3., 4.]], "y": [2.]}))
+        _import_and_infer(save_dir, {"x": [[3., 4.]], "y": [2.]}))
 
   def test_trivial_save_exception(self):
     save_dir = os.path.join(self.get_temp_dir(), "saved_model")
@@ -193,12 +190,12 @@ class SaveTest(test.TestCase):
     model = _ModelWithOptimizer()
     x = constant_op.constant([[3., 4.]])
     y = constant_op.constant([2.])
-    model(x, y)
+    model.call(x, y)
     save_dir = os.path.join(self.get_temp_dir(), "saved_model")
     save.save(model, save_dir)
     self.assertIn("loss",
-                  self._import_and_infer(save_dir,
-                                         {"x": [[3., 4.]], "y": [2.]}))
+                  _import_and_infer(save_dir,
+                                    {"x": [[3., 4.]], "y": [2.]}))
 
   def test_single_function_default_signature(self):
     model = tracking.Checkpointable()
@@ -207,31 +204,46 @@ class SaveTest(test.TestCase):
     save_dir = os.path.join(self.get_temp_dir(), "saved_model")
     save.save(model, save_dir)
     self.assertAllClose({"output_0": 3.},
-                        self._import_and_infer(save_dir, {}))
+                        _import_and_infer(save_dir, {}))
 
   def test_ambiguous_signatures(self):
     model = _ModelWithOptimizer()
     x = constant_op.constant([[3., 4.]])
     y = constant_op.constant([2.])
-    model(x, y)
+    model.call(x, y)
     model.second_function = def_function.function(lambda: 1.)
     save_dir = os.path.join(self.get_temp_dir(), "saved_model")
     with self.assertRaisesRegexp(ValueError, "call.*second_function"):
       save.save(model, save_dir)
 
-  def test_subclassed_no_signature(self):
+  def test_no_signature(self):
 
-    class Subclassed(training.Model):
+    class Model(util.Checkpoint):
 
       def call(self, inputs):
         return inputs * 2.
 
     save_dir = os.path.join(self.get_temp_dir(), "saved_model")
-    model = Subclassed()
+    model = Model()
     with self.assertRaisesRegexp(
         ValueError, "no @tf.function-decorated methods"):
       save.save(model, save_dir)
 
+  def test_find_default_save_function(self):
+
+    class ObjWithDefaultSignature(util.Checkpoint):
+
+      @def_function.function(input_signature=[tensor_spec.TensorSpec(
+          shape=None, dtype=dtypes.float32)])
+      def _default_save_signature(self, x):
+        return x + x + 1
+
+    obj = ObjWithDefaultSignature()
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    save.save(obj, save_dir)
+    self.assertAllClose(
+        {"output_0": 7.}, _import_and_infer(save_dir, {"x": 3.}))
+
   def test_docstring(self):
 
     class Adder(util.Checkpoint):
@@ -246,7 +258,7 @@ class SaveTest(test.TestCase):
     save_dir = os.path.join(self.get_temp_dir(), "saved_model")
     save.save(to_save, save_dir)
     self.assertAllClose({"output_0": 7.},
-                        self._import_and_infer(save_dir, {"x": 3.}))
+                        _import_and_infer(save_dir, {"x": 3.}))
 
   def test_default_attr_stripping(self):
 
@@ -272,45 +284,70 @@ class SaveTest(test.TestCase):
       self.assertNotIn("T", complex_node.attr)
       self.assertNotIn("Tout", complex_node.attr)
 
-  def test_export_functional_keras_model(self):
-    x = input_layer.Input((4,), name="x")
-    y = core.Dense(4, name="out")(x)
-    model = training.Model(x, y)
+
+class AssetTests(test.TestCase):
+
+  def setUp(self):
+    super(AssetTests, self).setUp()
+    self._vocab_path = os.path.join(self.get_temp_dir(), "vocab.txt")
+    with open(self._vocab_path, "w") as f:
+      f.write("alpha\nbeta\ngamma\n")
+
+  def test_table(self):
+    initializer = lookup_ops.TextFileInitializer(
+        self._vocab_path,
+        key_dtype=dtypes.string,
+        key_index=lookup_ops.TextFileIndex.WHOLE_LINE,
+        value_dtype=dtypes.int64,
+        value_index=lookup_ops.TextFileIndex.LINE_NUMBER)
+    root = util.Checkpoint(table=lookup_ops.HashTable(
+        initializer, default_value=-1))
+    root.table_user = def_function.function(
+        root.table.lookup,
+        input_signature=[tensor_spec.TensorSpec(None, dtypes.string)])
+    self.assertEqual(
+        2,
+        self.evaluate(root.table_user(constant_op.constant("gamma"))))
     save_dir = os.path.join(self.get_temp_dir(), "saved_model")
-    save.save(model, save_dir)
+    save.save(root, save_dir)
+    file_io.delete_file(self._vocab_path)
     self.assertAllClose(
-        {"out": model(array_ops.ones([1, 4]))},
-        self._import_and_infer(save_dir, {"x": [[1., 1., 1., 1.]]}))
-
-  @test_util.run_deprecated_v1
-  def test_export_functional_keras_model_after_fit(self):
-    x = input_layer.Input((1,))
-    y = core.Dense(1, name="y")(x)
-    model = training.Model(x, y)
-    model.compile(optimizer="sgd", loss="mse")
-    model.fit(x=numpy.array([[1.]]),
-              y=numpy.array([2.]), epochs=2)
-    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
-    save.save(model, save_dir)
+        {"output_0": [2, 0]},
+        _import_and_infer(save_dir, {"keys": ["gamma", "alpha"]}))
+    second_dir = os.path.join(self.get_temp_dir(), "second_dir")
+    # Asset paths should track the location the SavedModel is loaded from.
+    file_io.rename(save_dir, second_dir)
     self.assertAllClose(
-        {"y": model(constant_op.constant([[1.], [2.]]))},
-        self._import_and_infer(save_dir, {"input_1": [[1.], [2.]]}))
-
-  def test_export_multi_input_functional_keras_model(self):
-    x1 = input_layer.Input((2,), name="x1")
-    x2 = input_layer.Input((2,), name="x2")
-    y1 = core.Dense(4)(merge.Add()([x1, x2]))
-    y2 = core.Dense(4)(merge.Multiply()([x1, x2]))
-    model = training.Model([x1, x2], [y1, y2])
-    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
-    save.save(model, save_dir)
-    outputs = model([array_ops.ones([1, 2]), 2. * array_ops.ones([1, 2])])
+        {"output_0": [2, 1]},
+        _import_and_infer(second_dir, {"keys": ["gamma", "beta"]}))
+
+  def test_unused_asset(self):
+    root = tracking.Checkpointable()
+    root.f = def_function.function(
+        lambda x: 2. * x,
+        input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
+    root.asset = tracking.TrackableAsset(self._vocab_path)
+
+    export_dir = os.path.join(self.get_temp_dir(), "save_dir")
+    save.save(root, export_dir)
     self.assertAllClose(
-        {"dense": outputs[0], "dense_1": outputs[1]},
-        self._import_and_infer(
-            save_dir,
-            {"x1": [[1., 1.]],
-             "x2": [[2., 2.]]}))
+        {"output_0": [0.2]},
+        _import_and_infer(export_dir, {"x": [0.1]}))
+
+  def test_sensible_graph_building_exception(self):
+    root = util.Checkpoint(v=variables.Variable(2.))
+    root.f = def_function.function(
+        lambda x: 2. * root.v,
+        input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
+    export_dir = os.path.join(self.get_temp_dir(), "save_dir")
+    @def_function.function
+    def _calls_save():
+      save.save(root, export_dir)
+    with self.assertRaisesRegexp(AssertionError, "tf.function"):
+      _calls_save()
+    with ops.Graph().as_default():
+      with self.assertRaisesRegexp(AssertionError, "enable_eager_execution"):
+        save.save(root, export_dir)
 
 
 class MemoryTests(test.TestCase):
@@ -322,7 +359,7 @@ class MemoryTests(test.TestCase):
   def test_no_reference_cycles(self):
     x = constant_op.constant([[3., 4.]])
     y = constant_op.constant([2.])
-    self._model(x, y)
+    self._model.call(x, y)
     if sys.version_info[0] < 3:
       # TODO(allenl): debug reference cycles in Python 2.x
       self.skipTest("This test only works in Python 3+. Reference cycles are "
diff --git a/tensorflow/python/saved_model/saved_model_test.py b/tensorflow/python/saved_model/saved_model_test.py
index 0f18fb1a01681e89a560b5d8d509e15529f499dd..e36b8b30bf25c0d6f9b78cfdc2afee31f106f632 100644
--- a/tensorflow/python/saved_model/saved_model_test.py
+++ b/tensorflow/python/saved_model/saved_model_test.py
@@ -328,7 +328,7 @@ class SavedModelTest(SavedModelTestBase):
       self.assertRaises(RuntimeError, loader.load, sess, ["foo", "baz"],
                         export_dir)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testVariables(self):
     export_dir = self._get_export_dir("test_variables")
     builder = saved_model_builder._SavedModelBuilder(export_dir)
@@ -474,7 +474,7 @@ class SavedModelTest(SavedModelTestBase):
       self.assertEqual(
           42, ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)[0].eval())
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testCollections(self):
     export_dir = self._get_export_dir("test_collections")
     builder = saved_model_builder._SavedModelBuilder(export_dir)
@@ -819,6 +819,7 @@ class SavedModelTest(SavedModelTestBase):
       self._validate_assets(export_dir, foo_graph.asset_file_def, "hello42.txt",
                             "foo bar baz 0", "asset_file_tensor_0:0")
 
+  @test_util.run_v1_only("b/120545219")
   def testCustomInitOp(self):
     export_dir = self._get_export_dir("test_main_op")
     builder = saved_model_builder._SavedModelBuilder(export_dir)
@@ -854,7 +855,7 @@ class SavedModelTest(SavedModelTestBase):
       # the main_op, following a restore.
       self.assertEqual(3, ops.get_collection("v")[2].eval())
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testTrainOp(self):
     export_dir = self._get_export_dir("test_train_op")
     builder = saved_model_builder._SavedModelBuilder(export_dir)
@@ -882,7 +883,7 @@ class SavedModelTest(SavedModelTestBase):
       self.assertIsInstance(
           loader_impl.get_train_op(meta_graph_def), ops.Tensor)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testTrainOpGroup(self):
     export_dir = self._get_export_dir("test_train_op_group")
     builder = saved_model_builder._SavedModelBuilder(export_dir)
@@ -910,7 +911,7 @@ class SavedModelTest(SavedModelTestBase):
       self.assertIsInstance(
           loader_impl.get_train_op(meta_graph_def), ops.Operation)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testTrainOpAfterVariables(self):
     export_dir = self._get_export_dir("test_train_op_after_variables")
     builder = saved_model_builder._SavedModelBuilder(export_dir)
@@ -1029,7 +1030,7 @@ class SavedModelTest(SavedModelTestBase):
       self._validate_assets(export_dir, bar_graph.asset_file_def, "foo.txt",
                             "content_foo", "asset_file_tensor:0")
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testOp(self):
     export_dir = self._get_export_dir("test_op")
     builder = saved_model_builder._SavedModelBuilder(export_dir)
@@ -1083,7 +1084,7 @@ class SavedModelTest(SavedModelTestBase):
       # CheckpointedOp is a key-value table that can be saved across sessions.
       # The table register itself in SAVEABLE_OBJECTS collection.
       v1 = saver_test_utils.CheckpointedOp(name="v1")
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       v1.insert("k1", 3.0).run()
       # Once the table is restored, we can access it through this reference.
       ops.add_to_collection("table_ref", v1.table_ref)
@@ -1460,10 +1461,8 @@ class SavedModelV1Test(SavedModelTestBase):
     self.assertIn("Tout", complex_node.attr)
 
     # Load graph "foo" from disk as-is to verify default attrs are stripped.
-    # pylint: disable=protected-access
-    saved_model_pb = loader_impl._parse_saved_model(export_dir)
+    saved_model_pb = loader_impl.parse_saved_model(export_dir)
     self.assertIsNotNone(saved_model_pb)
-    # pylint: enable=protected-access
 
     meta_graph_foo_def = None
     meta_graph_bar_def = None
@@ -1494,7 +1493,7 @@ class SavedModelV1Test(SavedModelTestBase):
     self.assertIn("T", node_def.attr)
     self.assertIn("Tout", node_def.attr)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testLegacyInitOp(self):
     export_dir = self._get_export_dir("test_legacy_init_op")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
diff --git a/tensorflow/python/saved_model/saved_object_graph.proto b/tensorflow/python/saved_model/saved_object_graph.proto
new file mode 100644
index 0000000000000000000000000000000000000000..b95990ad348ce2a513b9e7af8f541a3ff9ff1832
--- /dev/null
+++ b/tensorflow/python/saved_model/saved_object_graph.proto
@@ -0,0 +1,96 @@
+syntax = "proto3";
+
+import "tensorflow/core/protobuf/checkpointable_object_graph.proto";
+import "tensorflow/core/framework/tensor_shape.proto";
+import "tensorflow/core/framework/types.proto";
+
+option cc_enable_arenas = true;
+
+package tensorflow;
+
+// A SavedObjectGraph is part of object-based SavedModels in TF 2.0. It
+// describes the directed graph of Python objects (or equivalent in other
+// languages) that make up a model, with nodes[0] at the root.
+
+// SavedObjectGraph shares some structure with CheckpointableObjectGraph, but
+// ObjectGraph belongs to the SavedModel and contains pointers to functions and
+// type information, while CheckpointableObjectGraph lives in the checkpoint and
+// contains pointers only to variable values.
+
+// NOTE: This protocol buffer format is experimental and subject to change.
+
+message SavedObjectGraph {
+  // List of objects in the SavedModel.
+  //
+  // The position of the object in this list indicates its id.
+  // Nodes[0] is considered the root node.
+  repeated SavedObject nodes = 1;
+}
+
+message SavedObject {
+  // Objects which this object depends on: named edges in the dependency
+  // graph.
+  //
+  // Note: only valid if kind == "object".
+  repeated CheckpointableObjectGraph.CheckpointableObject.ObjectReference
+      children = 1;
+
+  // Removed when forking from CheckpointableObjectGraph.
+  reserved "attributes";
+  reserved 2;
+
+  // Slot variables owned by this object. This describes the three-way
+  // (optimizer, variable, slot variable) relationship; none of the three
+  // depend on the others directly.
+  //
+  // Note: only valid if kind == "object".
+  repeated CheckpointableObjectGraph.CheckpointableObject.SlotVariableReference
+      slot_variables = 3;
+
+  oneof kind {
+    SavedUserObject user_object = 4;
+    SavedAsset asset = 5;
+    SavedPolymorphicFunction function = 6;
+    SavedVariable variable = 7;
+  }
+}
+
+// A SavedUserObject is an object (in the object-oriented language of the
+// TensorFlow program) of some user- or framework-defined class other than
+// those handled specifically by the other kinds of SavedObjects.
+//
+// This object cannot be evaluated as a tensor, and therefore cannot be bound
+// to an input of a function.
+message SavedUserObject {}
+
+// A SavedAsset represents a file in a SavedModel.
+//
+// When bound to a function this object evaluates to a Variable from which the
+// absolute filename can be read. Users should not expect the filename to be
+// maintained.
+message SavedAsset {
+  // Index into `MetaGraphDef.asset_file_def[]` that describes the Asset.
+  //
+  // Only the field `AssetFileDef.filename` is used. Other fields, such as
+  // `AssetFileDef.tensor_info`, MUST be ignored.
+  uint32 asset_file_def_index = 1;
+}
+
+// A function with multiple signatures, possibly with non-Tensor arguments.
+message SavedPolymorphicFunction {
+  repeated SavedMonomorphicFunction monomorphic_function = 1;
+}
+
+message SavedMonomorphicFunction {
+  // A reference to a TensorFlow function in the MetaGraph's FunctionDefLibrary
+  string concrete_function = 1;
+}
+
+// Represents a Variable that is initialized by loading the contents from the
+// SavedModel checkpoint.
+message SavedVariable {
+  DataType dtype = 1;
+  TensorShapeProto shape = 2;
+
+  // TODO(andresp): Add "trainable" and save_slice_info_def.
+}
diff --git a/tensorflow/python/saved_model/signature_def_utils_test.py b/tensorflow/python/saved_model/signature_def_utils_test.py
index 53c452359f155263f97fc6db294534c40d6c6d39..d1347eb0178423f9293022e4f36eeb90caac833e 100644
--- a/tensorflow/python/saved_model/signature_def_utils_test.py
+++ b/tensorflow/python/saved_model/signature_def_utils_test.py
@@ -423,6 +423,7 @@ class SignatureDefUtilsTest(test.TestCase):
         {},
         signature_constants.PREDICT_METHOD_NAME)
 
+  @test_util.run_v1_only("b/120545219")
   def testOpSignatureDef(self):
     key = "adding_1_and_2_key"
     add_op = math_ops.add(1, 2, name="adding_1_and_2")
@@ -430,6 +431,7 @@ class SignatureDefUtilsTest(test.TestCase):
     self.assertIn(key, signature_def.outputs)
     self.assertEqual(add_op.name, signature_def.outputs[key].name)
 
+  @test_util.run_v1_only("b/120545219")
   def testLoadOpFromSignatureDef(self):
     key = "adding_1_and_2_key"
     add_op = math_ops.add(1, 2, name="adding_1_and_2")
diff --git a/tensorflow/python/saved_model/utils_test.py b/tensorflow/python/saved_model/utils_test.py
index 0888dcb411e34b030416362663fe4e2d11899cfd..2afe8abfd646f26f0562d7cc56b82c5781a586ef 100644
--- a/tensorflow/python/saved_model/utils_test.py
+++ b/tensorflow/python/saved_model/utils_test.py
@@ -24,6 +24,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.platform import test
@@ -32,6 +33,7 @@ from tensorflow.python.saved_model import utils
 
 class UtilsTest(test.TestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testBuildTensorInfoOp(self):
     x = constant_op.constant(1, name="x")
     y = constant_op.constant(2, name="y")
@@ -41,6 +43,7 @@ class UtilsTest(test.TestCase):
     self.assertEqual(types_pb2.DT_INVALID, z_op_info.dtype)
     self.assertEqual(0, len(z_op_info.tensor_shape.dim))
 
+  @test_util.run_v1_only("b/120545219")
   def testBuildTensorInfoDefunOp(self):
     @function.defun
     def my_init_fn(x, y):
@@ -54,6 +57,7 @@ class UtilsTest(test.TestCase):
     self.assertEqual(types_pb2.DT_INVALID, init_op_info.dtype)
     self.assertEqual(0, len(init_op_info.tensor_shape.dim))
 
+  @test_util.run_v1_only("b/120545219")
   def testBuildTensorInfoDense(self):
     x = array_ops.placeholder(dtypes.float32, 1, name="x")
     x_tensor_info = utils.build_tensor_info(x)
@@ -62,6 +66,7 @@ class UtilsTest(test.TestCase):
     self.assertEqual(1, len(x_tensor_info.tensor_shape.dim))
     self.assertEqual(1, x_tensor_info.tensor_shape.dim[0].size)
 
+  @test_util.run_v1_only("b/120545219")
   def testBuildTensorInfoSparse(self):
     x = array_ops.sparse_placeholder(dtypes.float32, [42, 69], name="x")
     x_tensor_info = utils.build_tensor_info(x)
@@ -76,6 +81,7 @@ class UtilsTest(test.TestCase):
     self.assertEqual(42, x_tensor_info.tensor_shape.dim[0].size)
     self.assertEqual(69, x_tensor_info.tensor_shape.dim[1].size)
 
+  @test_util.run_v1_only("b/120545219")
   def testGetTensorFromInfoDense(self):
     expected = array_ops.placeholder(dtypes.float32, 1, name="x")
     tensor_info = utils.build_tensor_info(expected)
@@ -83,6 +89,7 @@ class UtilsTest(test.TestCase):
     self.assertIsInstance(actual, ops.Tensor)
     self.assertEqual(expected.name, actual.name)
 
+  @test_util.run_v1_only("b/120545219")
   def testGetTensorFromInfoSparse(self):
     expected = array_ops.sparse_placeholder(dtypes.float32, name="x")
     tensor_info = utils.build_tensor_info(expected)
@@ -122,6 +129,7 @@ class UtilsTest(test.TestCase):
                                                  import_scope="foo")
       self.assertEqual(expected.name, actual.name)
 
+  @test_util.run_v1_only("b/120545219")
   def testGetTensorFromInfoRaisesErrors(self):
     expected = array_ops.placeholder(dtypes.float32, 1, name="x")
     tensor_info = utils.build_tensor_info(expected)
diff --git a/tensorflow/python/tf2.py b/tensorflow/python/tf2.py
index c9782a71199f73a1fc6207ea4e9568b6bac9c00a..75748f8f2c5ba2b78a2d220011e3e28e12276b62 100644
--- a/tensorflow/python/tf2.py
+++ b/tensorflow/python/tf2.py
@@ -25,6 +25,21 @@ from __future__ import print_function
 import os
 
 
+_force_enable = False
+
+
+def enable():
+  """Enables v2 behaviors."""
+  global _force_enable
+  _force_enable = True
+
+
+def disable():
+  """Disables v2 behaviors (TF2_BEHAVIOR env variable is still respected)."""
+  global _force_enable
+  _force_enable = False
+
+
 def enabled():
   """Returns True iff TensorFlow 2.0 behavior should be enabled."""
-  return os.getenv("TF2_BEHAVIOR") is not None
+  return _force_enable or os.getenv("TF2_BEHAVIOR", "0") != "0"
diff --git a/tensorflow/python/tools/api/generator/api_init_files.bzl b/tensorflow/python/tools/api/generator/api_init_files.bzl
index 3517c11cc93718f0fb5da457250fe7a3cece1798..5fee9c5eaf9a44aaf50a433a6395fc603e8417a0 100644
--- a/tensorflow/python/tools/api/generator/api_init_files.bzl
+++ b/tensorflow/python/tools/api/generator/api_init_files.bzl
@@ -4,6 +4,8 @@
 TENSORFLOW_API_INIT_FILES = [
     # BEGIN GENERATED FILES
     "__init__.py",
+    "autograph/__init__.py",
+    "autograph/experimental/__init__.py",
     "bitwise/__init__.py",
     "compat/__init__.py",
     "data/__init__.py",
@@ -18,6 +20,7 @@ TENSORFLOW_API_INIT_FILES = [
     "graph_util/__init__.py",
     "image/__init__.py",
     "io/__init__.py",
+    "queue/__init__.py",
     "initializers/__init__.py",
     "keras/__init__.py",
     "keras/activations/__init__.py",
@@ -64,10 +67,10 @@ TENSORFLOW_API_INIT_FILES = [
     "lite/constants/__init__.py",
     "losses/__init__.py",
     "math/__init__.py",
-    "metrics/__init__.py",
     "nn/__init__.py",
     "nn/rnn_cell/__init__.py",
     "quantization/__init__.py",
+    "ragged/__init__.py",
     "random/__init__.py",
     "saved_model/__init__.py",
     "sets/__init__.py",
diff --git a/tensorflow/python/tools/api/generator/api_init_files_v1.bzl b/tensorflow/python/tools/api/generator/api_init_files_v1.bzl
index e35b9c43740d4e59e9478cca978b15c7451ac96e..8d3b86bf265c3b8ea6d4c9d910f028f023e57a2b 100644
--- a/tensorflow/python/tools/api/generator/api_init_files_v1.bzl
+++ b/tensorflow/python/tools/api/generator/api_init_files_v1.bzl
@@ -5,6 +5,8 @@ TENSORFLOW_API_INIT_FILES_V1 = [
     # BEGIN GENERATED FILES
     "__init__.py",
     "app/__init__.py",
+    "autograph/__init__.py",
+    "autograph/experimental/__init__.py",
     "bitwise/__init__.py",
     "compat/__init__.py",
     "data/__init__.py",
@@ -21,6 +23,7 @@ TENSORFLOW_API_INIT_FILES_V1 = [
     "graph_util/__init__.py",
     "image/__init__.py",
     "io/__init__.py",
+    "queue/__init__.py",
     "initializers/__init__.py",
     "keras/__init__.py",
     "keras/activations/__init__.py",
@@ -77,6 +80,7 @@ TENSORFLOW_API_INIT_FILES_V1 = [
     "profiler/__init__.py",
     "python_io/__init__.py",
     "quantization/__init__.py",
+    "ragged/__init__.py",
     "random/__init__.py",
     "resource_loader/__init__.py",
     "strings/__init__.py",
diff --git a/tensorflow/python/tools/api/generator/doc_srcs.py b/tensorflow/python/tools/api/generator/doc_srcs.py
index abb5886deb3d9dd2e6981ee5822b0323a87eef1d..b567eead3d0c8c3023322f95402662408152ce45 100644
--- a/tensorflow/python/tools/api/generator/doc_srcs.py
+++ b/tensorflow/python/tools/api/generator/doc_srcs.py
@@ -54,6 +54,7 @@ _TENSORFLOW_DOC_SOURCES = {
     'nn': DocSource(docstring_module_name='ops.nn_ops'),
     'nn.rnn_cell': DocSource(docstring_module_name='ops.rnn_cell'),
     'python_io': DocSource(docstring_module_name='lib.io.python_io'),
+    'ragged': DocSource(docstring_module_name='ops.ragged'),
     'resource_loader': DocSource(
         docstring_module_name='platform.resource_loader'),
     'sets': DocSource(docstring_module_name='ops.sets'),
diff --git a/tensorflow/python/tools/freeze_graph_test.py b/tensorflow/python/tools/freeze_graph_test.py
index efdf7dd2cf1276cd7611e434a63afecc9fe25d2a..de2672db3c4c4e6b94d3803767a749a943910d2c 100644
--- a/tensorflow/python/tools/freeze_graph_test.py
+++ b/tensorflow/python/tools/freeze_graph_test.py
@@ -161,11 +161,11 @@ class FreezeGraphTest(test_util.TensorFlowTestCase):
             },)
         builder.save(as_text=True)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testFreezeGraphV1(self):
     self._testFreezeGraph(saver_pb2.SaverDef.V1)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testFreezeGraphV2(self):
     self._testFreezeGraph(saver_pb2.SaverDef.V2)
 
diff --git a/tensorflow/python/tools/saved_model_cli.py b/tensorflow/python/tools/saved_model_cli.py
index c4c3756c0407f2ed6a6a411b6778b2431428eea6..afc4e517cdd0a34171038cc0ae2d74ce30ecb6a9 100644
--- a/tensorflow/python/tools/saved_model_cli.py
+++ b/tensorflow/python/tools/saved_model_cli.py
@@ -659,6 +659,28 @@ def scan(args):
       scan_meta_graph_def(meta_graph_def)
 
 
+def convert_with_tensorrt(args):
+  """Function triggered by 'convert tensorrt' command.
+
+  Args:
+    args: A namespace parsed from command line.
+  """
+  # Import here instead of at top, because this will crash if TensorRT is
+  # not installed
+  from tensorflow.contrib import tensorrt  # pylint: disable=g-import-not-at-top
+  tensorrt.create_inference_graph(
+      None,
+      None,
+      max_batch_size=args.max_batch_size,
+      max_workspace_size_bytes=args.max_workspace_size_bytes,
+      precision_mode=args.precision_mode,
+      minimum_segment_size=args.minimum_segment_size,
+      is_dynamic_op=args.is_dynamic_op,
+      input_saved_model_dir=args.dir,
+      input_saved_model_tags=args.tag_set.split(','),
+      output_saved_model_dir=args.output_dir)
+
+
 def create_parser():
   """Creates a parser that parse the command line arguments.
 
@@ -812,6 +834,71 @@ def create_parser():
       help='tag-set of graph in SavedModel to scan, separated by \',\'')
   parser_scan.set_defaults(func=scan)
 
+  # convert command
+  convert_msg = ('Usage example:\n'
+                 'To convert the SavedModel to one that have TensorRT ops:\n'
+                 '$saved_model_cli convert \\\n'
+                 '   --dir /tmp/saved_model \\\n'
+                 '   --tag_set serve \\\n'
+                 '   --output_dir /tmp/saved_model_trt \\\n'
+                 '   tensorrt \n')
+  parser_convert = subparsers.add_parser(
+      'convert',
+      description=convert_msg,
+      formatter_class=argparse.RawTextHelpFormatter)
+  parser_convert.add_argument(
+      '--dir',
+      type=str,
+      required=True,
+      help='directory containing the SavedModel to convert')
+  parser_convert.add_argument(
+      '--output_dir',
+      type=str,
+      required=True,
+      help='output directory for the converted SavedModel')
+  parser_convert.add_argument(
+      '--tag_set',
+      type=str,
+      required=True,
+      help='tag-set of graph in SavedModel to convert, separated by \',\'')
+  convert_subparsers = parser_convert.add_subparsers(
+      title='conversion methods',
+      description='valid conversion methods',
+      help='the conversion to run with the SavedModel')
+  parser_convert_with_tensorrt = convert_subparsers.add_parser(
+      'tensorrt',
+      description='Convert the SavedModel with Tensorflow-TensorRT integration',
+      formatter_class=argparse.RawTextHelpFormatter)
+  parser_convert_with_tensorrt.add_argument(
+      '--max_batch_size',
+      type=int,
+      default=1,
+      help='max size for the input batch')
+  parser_convert_with_tensorrt.add_argument(
+      '--max_workspace_size_bytes',
+      type=int,
+      default=2 << 20,
+      help=('the maximum GPU temporary memory which the TRT engine can use at '
+            'execution time'))
+  parser_convert_with_tensorrt.add_argument(
+      '--precision_mode',
+      type=str,
+      default='FP32',
+      help='one of FP32, FP16 and INT8')
+  parser_convert_with_tensorrt.add_argument(
+      '--minimum_segment_size',
+      type=int,
+      default=3,
+      help=('the minimum number of nodes required for a subgraph to be replaced'
+            'in a TensorRT node'))
+  parser_convert_with_tensorrt.add_argument(
+      '--is_dynamic_op',
+      type=bool,
+      default=False,
+      help=('whether to generate dynamic TRT ops which will build the TRT '
+            'network and engine at run time'))
+  parser_convert_with_tensorrt.set_defaults(func=convert_with_tensorrt)
+
   return parser
 
 
diff --git a/tensorflow/python/training/adagrad_test.py b/tensorflow/python/training/adagrad_test.py
index da26fcdb7f6bb0277c35acabfd3b5c2362587c7c..3528fdaa8b09b588d594d1aef61812a41c1ce373 100644
--- a/tensorflow/python/training/adagrad_test.py
+++ b/tensorflow/python/training/adagrad_test.py
@@ -106,7 +106,7 @@ class AdagradOptimizerTest(test.TestCase):
         pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
         loss = pred * pred
         sgd_op = adagrad.AdagradOptimizer(1.0).minimize(loss)
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
         # Fetch params to validate initial values
         self.assertAllCloseAccordingToType([[1.0, 2.0], [3.0, 4.0]],
                                            self.evaluate(var0))
@@ -129,7 +129,7 @@ class AdagradOptimizerTest(test.TestCase):
             constant_op.constant(3.0), initial_accumulator_value=0.1)
         ada_update = ada_opt.apply_gradients(
             zip([grads0, grads1], [var0, var1]))
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
         # Fetch params to validate initial values
         self.assertAllClose([1.0, 2.0], self.evaluate(var0))
         self.assertAllClose([3.0, 4.0], self.evaluate(var1))
@@ -163,7 +163,7 @@ class AdagradOptimizerTest(test.TestCase):
         ada_opt = adagrad.AdagradOptimizer(3.0, initial_accumulator_value=0.1)
         ada_update = ada_opt.apply_gradients(
             zip([grads0, grads1], [var0, var1]))
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
         # Fetch params to validate initial values
         self.assertAllClose([[1.0], [2.0]], self.evaluate(var0))
         self.assertAllClose([[3.0], [4.0]], self.evaluate(var1))
@@ -198,7 +198,7 @@ class AdagradOptimizerTest(test.TestCase):
             [(grad_repeated_index, repeated_index_update_var)])
         aggregated_update = adagrad.AdagradOptimizer(3.0).apply_gradients(
             [(grad_aggregated, aggregated_update_var)])
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
         self.assertAllClose(aggregated_update_var.eval(),
                             self.evaluate(repeated_index_update_var))
         for _ in range(3):
@@ -223,7 +223,7 @@ class AdagradOptimizerTest(test.TestCase):
             2.0).minimize(loss_repeated)
         update_op_aggregated = adagrad.AdagradOptimizer(
             2.0).minimize(loss_aggregated)
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
         self.assertAllCloseAccordingToType(
             self.evaluate(var_repeated), self.evaluate(var_aggregated))
         for _ in range(3):
@@ -289,7 +289,7 @@ class AdagradOptimizerTest(test.TestCase):
         self.assertEquals(slot0.get_shape(), var0.get_shape())
         slot1 = ada_opt.get_slot(var1, "accumulator")
         self.assertEquals(slot1.get_shape(), var1.get_shape())
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
 
         # Fetch params to validate initial values.
         self.assertAllClose([1.0, 2.0], self.evaluate(var0))
@@ -306,7 +306,7 @@ class AdagradOptimizerTest(test.TestCase):
             np.array([2.715679168701172, 3.715679168701172]),
             self.evaluate(var1))
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testDynamicShapeVariable_Ok(self):
     with self.cached_session():
       v = variable_scope.get_variable("v", initializer=constant_op.constant(1.),
@@ -315,7 +315,7 @@ class AdagradOptimizerTest(test.TestCase):
       # Creating optimizer should cause no exception.
       adagrad.AdagradOptimizer(3.0, initial_accumulator_value=0.1)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testDynamicShapeVariableWithCallableInit(self):
     var0 = variable_scope.get_variable("var0",
                                        initializer=constant_op.constant(1.),
diff --git a/tensorflow/python/training/adam_test.py b/tensorflow/python/training/adam_test.py
index b0bae275773cf05b4e6233706b60f60ca13c9ac0..15958112bd8ca25a5dc434f0630da0c6685f130c 100644
--- a/tensorflow/python/training/adam_test.py
+++ b/tensorflow/python/training/adam_test.py
@@ -68,8 +68,8 @@ class AdamOptimizerTest(test.TestCase):
           var0 = resource_variable_ops.ResourceVariable(var0_np)
           var1 = resource_variable_ops.ResourceVariable(var1_np)
         else:
-          var0 = variables.Variable(var0_np)
-          var1 = variables.Variable(var1_np)
+          var0 = variables.RefVariable(var0_np)
+          var1 = variables.RefVariable(var1_np)
         grads0_np_indices = np.array([0, 1], dtype=np.int32)
         grads0 = ops.IndexedSlices(
             constant_op.constant(grads0_np),
@@ -156,6 +156,9 @@ class AdamOptimizerTest(test.TestCase):
                               self.evaluate(repeated_index_update_var))
 
   def doTestBasic(self, use_resource=False, use_callable_params=False):
+    if context.executing_eagerly() and not use_resource:
+      self.skipTest(
+          "Skipping test with use_resource=False and executing eagerly.")
     for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
       with self.session(graph=ops.Graph()):
         # Initialize variables for numpy implementation.
@@ -171,8 +174,8 @@ class AdamOptimizerTest(test.TestCase):
           var1 = resource_variable_ops.ResourceVariable(
               var1_np, name="var1_%d" % i)
         else:
-          var0 = variables.Variable(var0_np)
-          var1 = variables.Variable(var1_np)
+          var0 = variables.RefVariable(var0_np)
+          var1 = variables.RefVariable(var1_np)
         grads0 = constant_op.constant(grads0_np)
         grads1 = constant_op.constant(grads1_np)
 
@@ -194,6 +197,14 @@ class AdamOptimizerTest(test.TestCase):
         self.assertTrue(beta2_power is not None)
         self.assertIn(beta1_power, opt_variables)
         self.assertIn(beta2_power, opt_variables)
+        # Ensure that non-slot variables are the same type as the requested
+        # variables.
+        self.assertEqual(
+            use_resource,
+            resource_variable_ops.is_resource_variable(beta1_power))
+        self.assertEqual(
+            use_resource,
+            resource_variable_ops.is_resource_variable(beta2_power))
 
         if not context.executing_eagerly():
           with ops.Graph().as_default():
diff --git a/tensorflow/python/training/basic_session_run_hooks.py b/tensorflow/python/training/basic_session_run_hooks.py
index b64c7ada62abbebfb77992f47bcfc2e62daaef4a..86718ab45fc539d6c7d90878860ca510cda31e47 100644
--- a/tensorflow/python/training/basic_session_run_hooks.py
+++ b/tensorflow/python/training/basic_session_run_hooks.py
@@ -163,7 +163,7 @@ class NeverTriggerTimer(_HookTimer):
     return None
 
 
-@tf_export("train.LoggingTensorHook")
+@tf_export(v1=["train.LoggingTensorHook"])
 class LoggingTensorHook(session_run_hook.SessionRunHook):
   """Prints the given tensors every N local steps, every N seconds, or at end.
 
@@ -373,7 +373,7 @@ class _MultiStepStopAtStepHook(session_run_hook.SessionRunHook):
       self._update_steps_per_run_variable(global_step, run_context.session)
 
 
-@tf_export("train.StopAtStepHook")
+@tf_export(v1=["train.StopAtStepHook"])
 class StopAtStepHook(session_run_hook.SessionRunHook):
   """Hook that requests stop at a specified step."""
 
@@ -495,7 +495,7 @@ class CheckpointSaverListener(object):
     pass
 
 
-@tf_export("train.CheckpointSaverHook")
+@tf_export(v1=["train.CheckpointSaverHook"])
 class CheckpointSaverHook(session_run_hook.SessionRunHook):
   """Saves checkpoints every N steps or seconds."""
 
@@ -634,7 +634,7 @@ class CheckpointSaverHook(session_run_hook.SessionRunHook):
     return savers[0]
 
 
-@tf_export("train.StepCounterHook")
+@tf_export(v1=["train.StepCounterHook"])
 class StepCounterHook(session_run_hook.SessionRunHook):
   """Hook that counts steps per second."""
 
@@ -725,7 +725,7 @@ class NanLossDuringTrainingError(RuntimeError):
     return "NaN loss during training."
 
 
-@tf_export("train.NanTensorHook")
+@tf_export(v1=["train.NanTensorHook"])
 class NanTensorHook(session_run_hook.SessionRunHook):
   """Monitors the loss tensor and stops training if loss is NaN.
 
@@ -757,7 +757,7 @@ class NanTensorHook(session_run_hook.SessionRunHook):
         run_context.request_stop()
 
 
-@tf_export("train.SummarySaverHook")
+@tf_export(v1=["train.SummarySaverHook"])
 class SummarySaverHook(session_run_hook.SessionRunHook):
   """Saves summaries every N steps."""
 
@@ -866,7 +866,7 @@ class SummarySaverHook(session_run_hook.SessionRunHook):
     return summary_op
 
 
-@tf_export("train.GlobalStepWaiterHook")
+@tf_export(v1=["train.GlobalStepWaiterHook"])
 class GlobalStepWaiterHook(session_run_hook.SessionRunHook):
   """Delays execution until global step reaches `wait_until_step`.
 
@@ -914,7 +914,7 @@ class GlobalStepWaiterHook(session_run_hook.SessionRunHook):
       time.sleep(0.5)
 
 
-@tf_export("train.FinalOpsHook")
+@tf_export(v1=["train.FinalOpsHook"])
 class FinalOpsHook(session_run_hook.SessionRunHook):
   """A hook which evaluates `Tensors` at the end of a session."""
 
@@ -958,7 +958,7 @@ class FinalOpsHook(session_run_hook.SessionRunHook):
         raise e
 
 
-@tf_export("train.FeedFnHook")
+@tf_export(v1=["train.FeedFnHook"])
 class FeedFnHook(session_run_hook.SessionRunHook):
   """Runs `feed_fn` and sets the `feed_dict` accordingly."""
 
diff --git a/tensorflow/python/training/basic_session_run_hooks_test.py b/tensorflow/python/training/basic_session_run_hooks_test.py
index 8e54a14f47c3775df23b262af57b09760800e692..1af27626ba764b0bf4a2787e492983a72c1491e9 100644
--- a/tensorflow/python/training/basic_session_run_hooks_test.py
+++ b/tensorflow/python/training/basic_session_run_hooks_test.py
@@ -1122,6 +1122,7 @@ class StepCounterHookTest(test.TestCase):
         self.assertGreater(summary_value.simple_value, 0)
 
 
+@test_util.run_v1_only('b/120545219')
 class SummarySaverHookTest(test.TestCase):
 
   def setUp(self):
@@ -1148,13 +1149,11 @@ class SummarySaverHookTest(test.TestCase):
       basic_session_run_hooks.SummarySaverHook(
           scaffold=monitored_session.Scaffold(), summary_op=self.summary_op)
 
-  @test_util.run_deprecated_v1
   def test_raise_in_both_secs_and_steps(self):
     with self.assertRaises(ValueError):
       basic_session_run_hooks.SummarySaverHook(
           save_secs=10, save_steps=20, summary_writer=self.summary_writer)
 
-  @test_util.run_deprecated_v1
   def test_raise_in_none_secs_and_steps(self):
     with self.assertRaises(ValueError):
       basic_session_run_hooks.SummarySaverHook(
@@ -1372,7 +1371,7 @@ class FinalOpsHookTest(test.TestCase):
   def test_final_ops_triggers_out_of_range_error(self):
     with ops.Graph().as_default():
       dataset = dataset_ops.Dataset.range(1)
-      iterator = dataset.make_one_shot_iterator()
+      iterator = dataset_ops.make_one_shot_iterator(dataset)
       read_ops = iterator.get_next()
       final_ops = read_ops
 
@@ -1405,6 +1404,7 @@ class FinalOpsHookTest(test.TestCase):
                              hook.final_ops_values.tolist())
 
 
+@test_util.run_v1_only('b/120545219')
 class ResourceSummarySaverHookTest(test.TestCase):
 
   def setUp(self):
diff --git a/tensorflow/python/training/checkpoint_management.py b/tensorflow/python/training/checkpoint_management.py
index f745ab4824ac364b51758e6c3fb60a5679d210fb..a7ad1f70e5e86d2fcd86b76c54314238edd400e1 100644
--- a/tensorflow/python/training/checkpoint_management.py
+++ b/tensorflow/python/training/checkpoint_management.py
@@ -56,10 +56,6 @@ def _GetCheckpointFilename(save_dir, latest_filename):
   return os.path.join(save_dir, latest_filename)
 
 
-@deprecation.deprecated(
-    date=None,
-    instructions=("Use tf.train.CheckpointManager to manage checkpoints rather "
-                  "than editing the Checkpoint proto manually."))
 @tf_export(v1=["train.generate_checkpoint_state_proto"])
 def generate_checkpoint_state_proto(save_dir,
                                     model_checkpoint_path,
diff --git a/tensorflow/python/training/checkpoint_ops_test.py b/tensorflow/python/training/checkpoint_ops_test.py
index 21ad3df1c8f4c71ff43dddb6681f167b873efd76..a0fd2dc6bae9b4a3376dffc67355de289e59c00e 100644
--- a/tensorflow/python/training/checkpoint_ops_test.py
+++ b/tensorflow/python/training/checkpoint_ops_test.py
@@ -23,6 +23,7 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import partitioned_variables
@@ -33,6 +34,7 @@ from tensorflow.python.training import checkpoint_ops
 from tensorflow.python.training import saver as saver_lib
 
 
+@test_util.run_v1_only('b/120545219')
 class LoadAndRemapWrappersTest(test.TestCase):
   """Tests for the functionality of the Python wrappers."""
 
@@ -152,7 +154,7 @@ class LoadAndRemapWrappersTest(test.TestCase):
         partitioner=partitioned_variables.fixed_size_partitioner(2))
 
     with self.cached_session():
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertAllClose(expected_remapped_matrix,
                           remapped_matrix.as_tensor().eval())
 
@@ -186,7 +188,7 @@ class LoadAndRemapWrappersTest(test.TestCase):
         partitioner=partitioned_variables.fixed_size_partitioner(2))
 
     with self.cached_session():
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertAllClose(expected_remapped_matrix,
                           remapped_matrix.as_tensor().eval())
 
@@ -224,7 +226,7 @@ class LoadAndRemapWrappersTest(test.TestCase):
         partitioner=partitioned_variables.fixed_size_partitioner(2))
 
     with self.cached_session():
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertAllClose(expected_remapped_matrix,
                           remapped_matrix.as_tensor().eval())
 
@@ -260,7 +262,7 @@ class LoadAndRemapWrappersTest(test.TestCase):
         partitioner=partitioned_variables.fixed_size_partitioner(2))
 
     with self.cached_session():
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertAllClose(expected_remapped_matrix,
                           remapped_matrix.as_tensor().eval())
 
@@ -294,7 +296,7 @@ class LoadAndRemapWrappersTest(test.TestCase):
         partitioner=partitioned_variables.fixed_size_partitioner(2))
 
     with self.cached_session():
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertAllClose(expected_remapped_embeddings,
                           remapped_embeddings.as_tensor().eval())
 
@@ -340,7 +342,7 @@ class LoadAndRemapWrappersTest(test.TestCase):
         partitioner=partitioned_variables.fixed_size_partitioner(2))
 
     with self.cached_session():
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertAllClose(expected_remapped_embeddings,
                           remapped_embeddings.as_tensor().eval())
 
@@ -378,7 +380,7 @@ class LoadAndRemapWrappersTest(test.TestCase):
         partitioner=partitioned_variables.fixed_size_partitioner(2))
 
     with self.cached_session():
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertAllClose(expected_remapped_embeddings,
                           remapped_embeddings.as_tensor().eval())
 
diff --git a/tensorflow/python/training/checkpoint_utils.py b/tensorflow/python/training/checkpoint_utils.py
index 58166dbb6818e686bbb938f71ed36ec3786cc2a3..74b46179e75423b530191cce5a52034879712eaa 100644
--- a/tensorflow/python/training/checkpoint_utils.py
+++ b/tensorflow/python/training/checkpoint_utils.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import six
 
 from tensorflow.python import pywrap_tensorflow
+from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import io_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -29,8 +30,7 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpoint_management
-from tensorflow.python.training import distribution_strategy_context
-from tensorflow.python.training import saver
+from tensorflow.python.training.saving import saveable_object_util
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -311,10 +311,10 @@ def _set_checkpoint_initializer(variable,
     restore_op = io_ops.restore_v2(
         ckpt_file, [tensor_name], [slice_spec], [base_type], name=name)[0]
 
-    names_to_saveables = saver.BaseSaverBuilder.OpListToDict([variable])
+    names_to_saveables = saveable_object_util.op_list_to_dict([variable])
     saveable_objects = []
     for name, op in names_to_saveables.items():
-      for s in saver.BaseSaverBuilder.SaveableObjectsForOp(op, name):
+      for s in saveable_object_util.saveable_objects_for_op(op, name):
         saveable_objects.append(s)
 
     assert len(saveable_objects) == 1  # Should be only one variable.
diff --git a/tensorflow/python/training/checkpointable/BUILD b/tensorflow/python/training/checkpointable/BUILD
index f97f42a6593603482c125229383093f158a43f22..3201c755afd5f4927a28f8b5de65c564144423aa 100644
--- a/tensorflow/python/training/checkpointable/BUILD
+++ b/tensorflow/python/training/checkpointable/BUILD
@@ -25,9 +25,9 @@ py_library(
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:io_ops_gen",
         "//tensorflow/python:platform",
-        "//tensorflow/python:saveable_object",
         "//tensorflow/python:util",
         "//tensorflow/python/eager:context",
+        "//tensorflow/python/training/saving:saveable_object",
     ],
 )
 
@@ -114,7 +114,6 @@ py_library(
         "//tensorflow/python:init_ops",
         "//tensorflow/python:io_ops_gen",
         "//tensorflow/python:pywrap_tensorflow",
-        "//tensorflow/python:saveable_object",
         "//tensorflow/python:saver",
         "//tensorflow/python:session",
         "//tensorflow/python:tensor_shape",
@@ -122,6 +121,10 @@ py_library(
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
         "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:def_function",
+        "//tensorflow/python/training/saving:functional_saver",
+        "//tensorflow/python/training/saving:saveable_object",
+        "//tensorflow/python/training/saving:saveable_object_util",
     ],
 )
 
@@ -156,6 +159,43 @@ py_test(
         "//tensorflow/python/eager:test",
         "//tensorflow/python/keras:engine",
         "//tensorflow/python/keras:layers",
+        "@absl_py//absl/testing:parameterized",
+        "@six_archive//:six",
+    ],
+)
+
+py_test(
+    name = "util_with_v1_optimizers_test",
+    srcs = ["util_with_v1_optimizers_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["notsan"],  # b/74395663
+    deps = [
+        ":base",
+        ":tracking",
+        ":util",
+        "//tensorflow/python:checkpoint_management",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:pywrap_tensorflow",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:saver",
+        "//tensorflow/python:session",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:template",
+        "//tensorflow/python:training",
+        "//tensorflow/python:training_util",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/eager:backprop",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:def_function",
+        "//tensorflow/python/eager:test",
+        "//tensorflow/python/keras:engine",
+        "//tensorflow/python/keras:layers",
+        "@absl_py//absl/testing:parameterized",
         "@six_archive//:six",
     ],
 )
diff --git a/tensorflow/python/training/checkpointable/base.py b/tensorflow/python/training/checkpointable/base.py
index 095a90ddd4f831e5af63f8eb7e231eacb5a91975..3cd1c6f9c8b0b5b5acf517e5f5801db66d0045b2 100644
--- a/tensorflow/python/training/checkpointable/base.py
+++ b/tensorflow/python/training/checkpointable/base.py
@@ -25,7 +25,6 @@ import weakref
 
 import six
 
-from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -34,7 +33,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_io_ops as io_ops
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import saveable_object
+from tensorflow.python.training.saving import saveable_object
 from tensorflow.python.util import nest
 from tensorflow.python.util import serialization
 from tensorflow.python.util import tf_decorator
@@ -374,41 +373,10 @@ class _CheckpointPosition(object):
       eagerly.
     """
     (restore_ops,
-     named_saveables,
+     tensor_saveables,
      python_saveables) = self._gather_ops_or_named_saveables()
-
-    # Eagerly run restorations for Python state.
-    reader = pywrap_tensorflow.NewCheckpointReader(
-        self._checkpoint.save_path_string)
-    for saveable in python_saveables:
-      spec_names = [spec.name for spec in saveable.specs]
-      saveable.python_restore(
-          [reader.get_tensor(name) for name in spec_names])
-
-    # If we have new SaveableObjects, extract and cache restore ops.
-    if named_saveables:
-      validated_saveables = (
-          self._checkpoint.builder._ValidateAndSliceInputs(named_saveables))  # pylint: disable=protected-access
-      validated_names = set(saveable.name for saveable in validated_saveables)
-      if set(named_saveables.keys()) != validated_names:
-        raise AssertionError(
-            ("Saveable keys changed when validating. Got back %s, was "
-             "expecting %s") % (named_saveables.keys(), validated_names))
-      all_tensors = self._checkpoint.builder.bulk_restore(
-          filename_tensor=self._checkpoint.save_path_tensor,
-          saveables=validated_saveables, preferred_shard=-1,
-          restore_sequentially=False)
-      saveable_index = 0
-      for saveable in validated_saveables:
-        num_specs = len(saveable.specs)
-        saveable_tensors = all_tensors[
-            saveable_index:saveable_index + num_specs]
-        saveable_index += num_specs
-        restore_op = saveable.restore(saveable_tensors, restored_shapes=None)
-        if not context.executing_eagerly():
-          assert saveable.name not in self._checkpoint.restore_ops_by_name
-          self._checkpoint.restore_ops_by_name[saveable.name] = restore_op
-          restore_ops.append(restore_op)
+    restore_ops.extend(self._checkpoint.restore_saveables(
+        tensor_saveables, python_saveables))
     return restore_ops
 
   @property
diff --git a/tensorflow/python/training/checkpointable/data_structures_test.py b/tensorflow/python/training/checkpointable/data_structures_test.py
index 9cefd942ac9761d968bca2a41c643075ffb79c31..bcec6e01001eec6c164cf4bb17db3d4ed55b0935 100644
--- a/tensorflow/python/training/checkpointable/data_structures_test.py
+++ b/tensorflow/python/training/checkpointable/data_structures_test.py
@@ -73,6 +73,7 @@ class HasList(training.Model):
 class ListTests(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_v1_only("b/120545219")
   def testTracking(self):
     model = HasList()
     output = model(array_ops.ones([32, 2]))
@@ -105,6 +106,7 @@ class ListTests(test.TestCase):
     self.assertIn(v, model.trainable_variables)
     self.assertNotIn(v, model.non_trainable_variables)
 
+  @test_util.run_v1_only("b/120545219")
   def testUpdatesForwarded(self):
     with context.graph_mode():
       model = HasList()
@@ -121,6 +123,7 @@ class ListTests(test.TestCase):
       self.assertEqual(0, len(model.updates))
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_v1_only("b/120545219")
   def testLossesForwarded(self):
     model = HasList()
     model_input = array_ops.ones([32, 2])
@@ -295,6 +298,7 @@ class HasMapping(training.Model):
 class MappingTests(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_v1_only("b/120545219")
   def testTracking(self):
     model = HasMapping()
     output = model(array_ops.ones([32, 2]))
diff --git a/tensorflow/python/training/checkpointable/tracking.py b/tensorflow/python/training/checkpointable/tracking.py
index c85b208d47985553ced692ccf0ef1627f9428a89..4e96aee0c51d441c4a32ce68943e27dbf592349c 100644
--- a/tensorflow/python/training/checkpointable/tracking.py
+++ b/tensorflow/python/training/checkpointable/tracking.py
@@ -17,6 +17,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.training.checkpointable import base
 from tensorflow.python.training.checkpointable import data_structures
 from tensorflow.python.util import tf_contextlib
@@ -145,3 +149,36 @@ class TrackableResource(base.CheckpointableBase):
     if self._resource_handle is None:
       self._resource_handle = self.create_resource()
     return self._resource_handle
+
+
+class TrackableAsset(base.CheckpointableBase):
+  """Base class for asset files which need to be tracked."""
+
+  def __init__(self, path):
+    """Record the full path to the asset."""
+    # We use a variable here so that @tf.functions do not capture a literal
+    # value. The init_scope prevents functions from capturing `path` in an
+    # initialization graph, since it is transient and should not end up in a
+    # serialized function body. When serialized in a SavedModel, the variable
+    # will be set during the loading process to its location in the assets/
+    # directory.
+    with ops.init_scope():
+      if context.executing_eagerly():
+        self._path = self._no_dependency(
+            resource_variable_ops.ResourceVariable(
+                path, dtype=dtypes.string,
+                name="asset_path"))
+      else:
+        # Adding a variable is too disruptive when v1-style graph building,
+        # since things may get fed and local variable initializers would then
+        # need to be run.
+        self._path = path
+
+  @property
+  def asset_path(self):
+    """Fetch the current asset path."""
+    return self._path
+
+ops.register_tensor_conversion_function(
+    TrackableAsset,
+    lambda asset, **kw: ops.internal_convert_to_tensor(asset.asset_path, **kw))
diff --git a/tensorflow/python/training/checkpointable/util.py b/tensorflow/python/training/checkpointable/util.py
index 394cc33ad0f078745e03edc9f31e7f7902413b6d..fde91948e572bd03897d8763a679012a90bf7999 100644
--- a/tensorflow/python/training/checkpointable/util.py
+++ b/tensorflow/python/training/checkpointable/util.py
@@ -26,6 +26,7 @@ from tensorflow.core.protobuf import checkpointable_object_graph_pb2
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.client import session as session_lib
 from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
@@ -38,12 +39,15 @@ from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.training import checkpoint_management
-from tensorflow.python.training import optimizer as optimizer_lib
-from tensorflow.python.training import saveable_object as saveable_object_lib
-from tensorflow.python.training import saver as saver_lib
+from tensorflow.python.training import optimizer as optimizer_v1
+from tensorflow.python.training import saver as v1_saver_lib
 from tensorflow.python.training.checkpointable import base
 from tensorflow.python.training.checkpointable import data_structures
 from tensorflow.python.training.checkpointable import tracking
+from tensorflow.python.training.saving import functional_saver
+from tensorflow.python.training.saving import saveable_object as saveable_object_lib
+from tensorflow.python.training.saving import saveable_object_util
+from tensorflow.python.util import compat
 from tensorflow.python.util import deprecation
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util.tf_export import tf_export
@@ -88,7 +92,6 @@ class _CheckpointRestoreCoordinator(object):
         referenced every restore (e.g. for Python state); otherwise they would
         create their own ops every restore.
     """
-    self.builder = saver_lib.BulkSaverBuilder()
     self.object_graph_proto = object_graph_proto
     self.restore_uid = ops.uid()
     # Maps from objects to lists of attributes which were in the checkpoint but
@@ -143,6 +146,57 @@ class _CheckpointRestoreCoordinator(object):
     if self.new_restore_ops_callback:
       self.new_restore_ops_callback(new_ops)  # pylint: disable=not-callable
 
+  def restore_saveables(self, tensor_saveables, python_saveables):
+    """Run or build restore operations for SaveableObjects.
+
+    Args:
+      tensor_saveables: `SaveableObject`s which correspond to Tensors.
+      python_saveables: `PythonStateSaveable`s which correspond to Python
+        values.
+
+    Returns:
+      When graph building, a list of restore operations, either cached or newly
+      created, to restore `tensor_saveables`.
+    """
+    restore_ops = []
+    # Eagerly run restorations for Python state.
+    reader = pywrap_tensorflow.NewCheckpointReader(
+        self.save_path_string)
+    for saveable in python_saveables:
+      spec_names = [spec.name for spec in saveable.specs]
+      saveable.python_restore(
+          [reader.get_tensor(name) for name in spec_names])
+
+    # If we have new SaveableObjects, extract and cache restore ops.
+    if tensor_saveables:
+      validated_saveables = saveable_object_util.validate_and_slice_inputs(
+          tensor_saveables)
+      validated_names = set(saveable.name for saveable in validated_saveables)
+      if set(tensor_saveables.keys()) != validated_names:
+        raise AssertionError(
+            ("Saveable keys changed when validating. Got back %s, was "
+             "expecting %s") % (tensor_saveables.keys(), validated_names))
+      for saveable in validated_saveables:
+        if saveable.device:
+          device = saveable_object_util.set_cpu0(saveable.device)
+        else:
+          device = None
+        with ops.device(device):
+          tensors = []
+          for spec in saveable.specs:
+            tensors.append(
+                io_ops.restore_v2(
+                    self.save_path_tensor,
+                    [spec.name],
+                    [spec.slice_spec],
+                    [spec.dtype])[0])
+          restore_op = saveable.restore(tensors, restored_shapes=None)
+        if not context.executing_eagerly():
+          assert saveable.name not in self.restore_ops_by_name
+          self.restore_ops_by_name[saveable.name] = restore_op
+          restore_ops.append(restore_op)
+    return restore_ops
+
 
 class _NameBasedRestoreCoordinator(object):
   """Keeps the status of a name-based checkpoint restore."""
@@ -182,11 +236,11 @@ class _NameBasedRestoreCoordinator(object):
           continue
       else:
         saveable = saveable_factory
-      names_to_saveables = saver_lib.BaseSaverBuilder.OpListToDict(
+      names_to_saveables = saveable_object_util.op_list_to_dict(
           [saveable],
           convert_variable_to_tensor=False)
       for name, op in names_to_saveables.items():
-        for saveable_object in saver_lib.BaseSaverBuilder.SaveableObjectsForOp(
+        for saveable_object in saveable_object_util.saveable_objects_for_op(
             op=op, name=name):
           yield saveable_object
 
@@ -506,7 +560,9 @@ def _serialize_slot_variables(checkpointable_objects, node_ids, object_names):
   non_slot_objects = list(checkpointable_objects)
   slot_variables = _ObjectIdentityDictionary()
   for checkpointable in non_slot_objects:
-    if isinstance(checkpointable, optimizer_lib.Optimizer):
+    if (isinstance(checkpointable, optimizer_v1.Optimizer)
+        # TODO(b/110718070): Fix Keras imports.
+        or hasattr(checkpointable, "_create_or_restore_slot_variable")):
       naming_scheme = _slot_variable_naming_for_optimizer(
           optimizer_path=object_names[checkpointable])
       slot_names = checkpointable.get_slot_names()
@@ -516,7 +572,7 @@ def _serialize_slot_variables(checkpointable_objects, node_ids, object_names):
           try:
             slot_variable = checkpointable.get_slot(
                 original_variable, slot_name)
-          except AttributeError:
+          except (AttributeError, KeyError):
             slot_variable = None
           if slot_variable is None:
             continue
@@ -605,10 +661,10 @@ def _add_attributes_to_object_graph(
           # Figure out the name-based Saver's name for this variable. If it's
           # already a SaveableObject we'd just get the checkpoint key back, so
           # we leave full_name blank.
-          saver_dict = saver_lib.BaseSaverBuilder.OpListToDict(
+          saver_dict = saveable_object_util.op_list_to_dict(
               [maybe_saveable], convert_variable_to_tensor=False)
           full_name, = saver_dict.keys()
-          saveables = tuple(saver_lib.BaseSaverBuilder.SaveableObjectsForOp(
+          saveables = tuple(saveable_object_util.saveable_objects_for_op(
               op=maybe_saveable, name=attribute.checkpoint_key))
           for saveable in saveables:
             saveable.full_name = full_name
@@ -648,10 +704,14 @@ def _add_attributes_to_object_graph(
   return named_saveable_objects, feed_additions
 
 
-def _make_object_graph_proto(checkpointable_objects, node_ids, slot_variables):
+def fill_object_graph_proto(checkpointable_objects,
+                            node_ids,
+                            slot_variables,
+                            object_graph_proto=None):
   """Name non-slot `Checkpointable`s and add them to `object_graph_proto`."""
-  object_graph_proto = (
-      checkpointable_object_graph_pb2.CheckpointableObjectGraph())
+  if object_graph_proto is None:
+    object_graph_proto = (
+        checkpointable_object_graph_pb2.CheckpointableObjectGraph())
   for checkpoint_id, checkpointable in enumerate(checkpointable_objects):
     assert node_ids[checkpointable] == checkpoint_id
     object_proto = object_graph_proto.nodes.add()
@@ -676,7 +736,7 @@ def _serialize_gathered_objects(
       checkpointable_objects=checkpointable_objects,
       node_ids=node_ids,
       object_names=object_names)
-  object_graph_proto = _make_object_graph_proto(
+  object_graph_proto = fill_object_graph_proto(
       checkpointable_objects=checkpointable_objects,
       node_ids=node_ids,
       slot_variables=slot_variables)
@@ -729,7 +789,7 @@ def named_saveables(root_checkpointable):
   return _serialize_object_graph(root_checkpointable, None)[0]
 
 
-def _find_objects(root_checkpointable):
+def find_objects(root_checkpointable):
   """Find and number objects which are dependencies of `root_checkpointable`."""
   checkpointable_objects, path_to_root = (
       _breadth_first_checkpointable_traversal(root_checkpointable))
@@ -760,18 +820,10 @@ def list_objects(root_checkpointable):
   Returns:
     A flat list of objects.
   """
-  checkpointable_objects, _, _ = _find_objects(root_checkpointable)
+  checkpointable_objects, _, _ = find_objects(root_checkpointable)
   return checkpointable_objects
 
 
-def make_object_graph_without_attributes(root_checkpointable):
-  """Construct a CheckpointableObjectGraph proto with no variable values."""
-  checkpointable_objects, node_ids, slot_variables = _find_objects(
-      root_checkpointable)
-  return _make_object_graph_proto(
-      checkpointable_objects, node_ids, slot_variables)
-
-
 def gather_initializers(root_checkpointable):
   """Traverse the object graph and find initialization ops.
 
@@ -1229,7 +1281,7 @@ class NameBasedSaverStatus(_LoadStatus):
       session = ops.get_default_session()
     with ops.device("/cpu:0"):
       saveables = self._gather_saveable_objects()
-      saver_lib.Saver(saveables).restore(
+      v1_saver_lib.Saver(saveables).restore(
           sess=session, save_path=self._checkpoint.save_path)
 
   def initialize_or_restore(self, session=None):
@@ -1254,18 +1306,6 @@ class _SessionWithFeedDictAdditions(session_lib.SessionInterface):
         fetches=fetches, feed_dict=feed_dict, **kwargs)
 
 
-def _copy_saver_with_new_var_list(old_saver, new_var_list):
-  """Copy a `tf.train.Saver`'s state to a new Saver with different variables."""
-  new_saver = saver_lib.Saver(var_list=new_var_list, max_to_keep=None)
-  # TODO(allenl): Move to copying functionality to Saver?
-  # pylint: disable=protected-access
-  new_saver._last_checkpoints = old_saver._last_checkpoints
-  new_saver._checkpoints_to_be_deleted = old_saver._checkpoints_to_be_deleted
-  new_saver._next_checkpoint_time = old_saver._next_checkpoint_time
-  # pylint: enable=protected-access
-  return new_saver
-
-
 class CheckpointableSaver(object):
   """Saves and restores a `Checkpointable` object and its dependencies.
 
@@ -1304,7 +1344,8 @@ class CheckpointableSaver(object):
     # Op caching for save
     self._object_graph_feed_tensor = None
     self._last_save_object_graph = None
-    self._last_save_saver = None
+    self._file_prefix_feed_tensor = None
+    self._cached_save_operation = None
 
     # Op caching for restore, shared between _CheckpointRestoreCoordinators
     self._restore_op_cache = {}
@@ -1371,13 +1412,16 @@ class CheckpointableSaver(object):
           base.NoRestoreSaveable(
               tensor=object_graph_tensor,
               name=base.OBJECT_GRAPH_PROTO_KEY))
-      # TODO(allenl, haoliang): Swap in a function-based saver here.
-      return saver_lib.Saver(
+      # TODO(allenl): Swap in a function-based saver here once it can serialize
+      # to a SaverDef.
+      return v1_saver_lib.Saver(
           var_list=named_saveable_objects, max_to_keep=None)
 
-  def _prepare_save(self,
-                    object_graph_tensor=None,
-                    saveable_object_cache=None):
+  def _save_cached_when_graph_building(
+      self,
+      file_prefix,
+      object_graph_tensor=None,
+      saveable_object_cache=None):
     """Create or retrieve save ops.
 
     When graph building, `saveable_object_cache` will typically be non-`None`,
@@ -1386,15 +1430,17 @@ class CheckpointableSaver(object):
     unnecessarily re-creating save ops.
 
     Args:
+      file_prefix: The prefix for saved checkpoint files.
       object_graph_tensor: A `Tensor` to which the current object graph will be
         fed.
       saveable_object_cache: A dictionary; if specified, used to cache
         `SaveableObject`s.
 
     Returns:
-      A two-element tuple with a `tf.train.Saver` and a feed_dict of `Tensor`s
-      to feed when running save ops. The feed dict contains the current object
-      graph and any Python state to be saved in the checkpoint.
+      A two-element tuple with a filename tensor and a feed_dict of tensors to
+      feed when running it (if graph building). The feed dict contains the
+      current object graph and any Python state to be saved in the
+      checkpoint. When executing eagerly only the first argument is meaningful.
     """
     (named_saveable_objects, graph_proto,
      feed_additions) = self._gather_saveables(
@@ -1406,15 +1452,11 @@ class CheckpointableSaver(object):
         # constructors. That means the Saver needs to be copied with a new
         # var_list.
         or context.executing_eagerly()):
-      if self._last_save_object_graph is not None:
-        self._last_save_saver = _copy_saver_with_new_var_list(
-            old_saver=self._last_save_saver,
-            new_var_list=named_saveable_objects)
-      else:
-        self._last_save_saver = saver_lib.Saver(
-            var_list=named_saveable_objects, max_to_keep=None)
+      saver = functional_saver.Saver(named_saveable_objects)
+      with ops.device("/cpu:0"):
+        self._cached_save_operation = saver.save(file_prefix)
       self._last_save_object_graph = graph_proto
-    return self._last_save_saver, feed_additions
+    return self._cached_save_operation, feed_additions
 
   def save(self, file_prefix, checkpoint_number=None, session=None):
     """Save a training checkpoint.
@@ -1438,36 +1480,42 @@ class CheckpointableSaver(object):
     Returns:
       The full path to the checkpoint.
     """
-    feed_additions = {}
+    feed_dict = {}
     graph_building = not context.executing_eagerly()
+    if checkpoint_number:
+      file_prefix = "%s-%d" % (file_prefix, checkpoint_number)
     if graph_building:
       if self._object_graph_feed_tensor is None:
         with ops.device("/cpu:0"):
           self._object_graph_feed_tensor = constant_op.constant(
               "", dtype=dtypes.string)
+          self._file_prefix_feed_tensor = constant_op.constant(
+              "", dtype=dtypes.string)
       object_graph_tensor = self._object_graph_feed_tensor
+      file_prefix_tensor = self._file_prefix_feed_tensor
+      feed_dict[file_prefix_tensor] = file_prefix
     else:
+      with ops.device("/cpu:0"):
+        file_prefix_tensor = constant_op.constant(
+            file_prefix, dtype=dtypes.string)
       object_graph_tensor = None
 
-    saver, new_feed_additions = self._prepare_save(
+    file_io.recursive_create_dir(os.path.dirname(file_prefix))
+    save_path, new_feed_additions = self._save_cached_when_graph_building(
+        file_prefix=file_prefix_tensor,
         object_graph_tensor=object_graph_tensor,
         saveable_object_cache=self._saveable_object_cache)
     if new_feed_additions:
-      feed_additions.update(new_feed_additions)
+      feed_dict.update(new_feed_additions)
     if not graph_building:
       session = None
     elif session is None:
       session = ops.get_default_session()
 
-    file_io.recursive_create_dir(os.path.dirname(file_prefix))
-    with ops.device("/cpu:0"):
-      save_path = saver.save(
-          sess=_SessionWithFeedDictAdditions(
-              session=session, feed_additions=feed_additions),
-          save_path=file_prefix,
-          write_meta_graph=False,
-          write_state=False,
-          global_step=checkpoint_number)
+    if session:
+      save_path = session.run(save_path, feed_dict=feed_dict)
+    else:
+      save_path = save_path.numpy()
     return save_path
 
   def restore(self, save_path):
@@ -1710,7 +1758,8 @@ class Checkpoint(tracking.Checkpointable):
     """
     super(Checkpoint, self).__init__()
     for k, v in sorted(kwargs.items(), key=lambda item: item[0]):
-      if not isinstance(v, base.CheckpointableBase):
+      if not isinstance(v, (base.CheckpointableBase,
+                            def_function.PolymorphicFunction)):
         raise ValueError(
             ("`Checkpoint` was expecting a checkpointable object (an object "
              "derived from `CheckpointableBase`), got %s. If you believe this "
@@ -1755,9 +1804,9 @@ class Checkpoint(tracking.Checkpointable):
     Returns:
       The full path to the checkpoint (i.e. `file_prefix`).
     """
-    return self._saver.save(
+    return compat.as_str(self._saver.save(
         file_prefix=file_prefix,
-        session=session)
+        session=session))
 
   @property
   def save_counter(self):
@@ -1816,7 +1865,7 @@ class Checkpoint(tracking.Checkpointable):
       checkpoint_number = assign_op.numpy()
     file_path = self.write("%s-%d" % (file_prefix, checkpoint_number),
                            session=session)
-    checkpoint_management.update_checkpoint_state(
+    checkpoint_management.update_checkpoint_state_internal(
         save_dir=os.path.dirname(file_prefix),
         model_checkpoint_path=file_path,
         all_model_checkpoint_paths=[file_path])
diff --git a/tensorflow/python/training/checkpointable/util_test.py b/tensorflow/python/training/checkpointable/util_test.py
index de9cac0863213912d3f678c550deb95fa4779b8b..61de46898a69ba65a720d42ca8647bcced65a1db 100644
--- a/tensorflow/python/training/checkpointable/util_test.py
+++ b/tensorflow/python/training/checkpointable/util_test.py
@@ -20,10 +20,10 @@ import functools
 import json
 import os
 
+from absl.testing import parameterized
 import six
 
 from tensorflow.python import pywrap_tensorflow
-from tensorflow.python.client import session as session_lib
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.eager import def_function
@@ -35,16 +35,15 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.keras.engine import sequential
 from tensorflow.python.keras.engine import training
 from tensorflow.python.keras.layers import core
+from tensorflow.python.keras.optimizer_v2 import adam
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import template
 from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables
-from tensorflow.python.training import adam
+from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.training import checkpoint_management
-from tensorflow.python.training import momentum
 from tensorflow.python.training import saver as saver_lib
 from tensorflow.python.training import training_util
 from tensorflow.python.training.checkpointable import base
@@ -199,17 +198,6 @@ class InterfaceTests(test.TestCase):
     with self.assertRaises(NotImplementedError):
       checkpoint_reversed.save(prefix)
 
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
-  def test_object_graph_no_attributes(self):
-    root = tracking.Checkpointable()
-    root.v = resource_variable_ops.ResourceVariable(1.)
-    root.opt = momentum.MomentumOptimizer(0.01, 0.5)
-    root.opt.minimize(root.v.read_value)
-    object_graph = checkpointable_utils.make_object_graph_without_attributes(
-        root)
-    # Four objects: Root, v, opt, and a slot variable for v
-    self.assertEqual(4, len(object_graph.nodes))
-
 
 class _MirroringSaveable(saver_lib.BaseSaverBuilder.SaveableObject):
 
@@ -255,7 +243,7 @@ class _OwnsMirroredVariables(base.CheckpointableBase):
     return self.non_dep_variable.name
 
 
-class CheckpointingTests(test.TestCase):
+class CheckpointingTests(parameterized.TestCase, test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
   def testNamingWithOptimizer(self):
@@ -264,41 +252,32 @@ class CheckpointingTests(test.TestCase):
     # A nuisance Model using the same optimizer. Its slot variables should not
     # go in the checkpoint, since it is never depended on.
     other_model = MyModel()
-    optimizer = adam.AdamOptimizer(0.001)
-    optimizer_step = training_util.get_or_create_global_step()
+    optimizer = adam.Adam(0.001)
+    step = training_util.get_or_create_global_step()
     root_checkpointable = checkpointable_utils.Checkpoint(
-        optimizer=optimizer, model=model, optimizer_step=optimizer_step)
-    if context.executing_eagerly():
-      optimizer.minimize(
-          lambda: model(input_value),
-          global_step=optimizer_step)
-      optimizer.minimize(
-          lambda: other_model(input_value),
-          global_step=optimizer_step)
-    else:
-      train_op = optimizer.minimize(
-          model(input_value), global_step=optimizer_step)
-      optimizer.minimize(
-          other_model(input_value),
-          global_step=optimizer_step)
-      self.evaluate(checkpointable_utils.gather_initializers(
-          root_checkpointable))
-      self.evaluate(train_op)
+        optimizer=optimizer, model=model, step=step)
+
+    with backprop.GradientTape() as tape:
+      loss = model(input_value)
+    variables = model.trainable_variables
+    gradients = tape.gradient(loss, variables)
+    train_op = control_flow_ops.group(
+        optimizer.apply_gradients(zip(gradients, variables)),
+        step.assign_add(1))
+
+    with backprop.GradientTape() as tape:
+      loss = other_model(input_value)
+    variables = other_model.trainable_variables
+    gradients = tape.gradient(loss, variables)
+    optimizer.apply_gradients(zip(gradients, variables))
+
+    self.evaluate(checkpointable_utils.gather_initializers(
+        root_checkpointable))
+    self.evaluate(train_op)
     named_variables, serialized_graph, _ = (
         checkpointable_utils._serialize_object_graph(
             root_checkpointable, saveables_cache=None))
-    expected_checkpoint_names = (
-        # Created in the root node, so no prefix.
-        "optimizer_step",
-        "model/_second/kernel",
-        "model/_named_dense/kernel",
-        "model/_named_dense/bias",
-        # non-Layer dependency of the model
-        "model/_non_layer/a_variable",
-        # The optimizer creates two non-slot variables
-        "optimizer/beta1_power",
-        "optimizer/beta2_power",
-        # Slot variables
+    expected_slot_keys = (
         "model/_second/kernel/.OPTIMIZER_SLOT/optimizer/m",
         "model/_second/kernel/.OPTIMIZER_SLOT/optimizer/v",
         "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/m",
@@ -306,9 +285,26 @@ class CheckpointingTests(test.TestCase):
         "model/_named_dense/bias/.OPTIMIZER_SLOT/optimizer/m",
         "model/_named_dense/bias/.OPTIMIZER_SLOT/optimizer/v",
     )
+    expected_checkpoint_names = (
+        # Created in the root node, so no prefix.
+        "step",
+        "model/_second/kernel",
+        "model/_named_dense/kernel",
+        "model/_named_dense/bias",
+        # non-Layer dependency of the model
+        "model/_non_layer/a_variable",
+        "optimizer/learning_rate",
+        "optimizer/beta_1",
+        "optimizer/beta_2",
+        "optimizer/epsilon",
+        "optimizer/iter",
+        "optimizer/decay",
+    ) + expected_slot_keys
     suffix = "/.ATTRIBUTES/VARIABLE_VALUE"
     expected_checkpoint_names = [
         name + suffix for name in expected_checkpoint_names]
+    expected_checkpoint_names.append(
+        "optimizer/.ATTRIBUTES/OBJECT_CONFIG_JSON")
     # The Dense layers also save get_config() JSON
     expected_checkpoint_names.extend(
         ["model/_second/.ATTRIBUTES/OBJECT_CONFIG_JSON",
@@ -319,7 +315,7 @@ class CheckpointingTests(test.TestCase):
     # Check that we've mapped to the right variable objects (not exhaustive)
     self.assertEqual(
         "global_step",
-        named_variables["optimizer_step" + suffix].full_name)
+        named_variables["step" + suffix].full_name)
     self.assertEqual(
         "my_model/dense_1/kernel",
         named_variables["model/_second/kernel" + suffix].full_name)
@@ -327,48 +323,31 @@ class CheckpointingTests(test.TestCase):
         "my_model/dense/kernel",
         named_variables["model/_named_dense/kernel" + suffix].full_name)
     self.assertEqual(
-        "beta1_power",
-        named_variables["optimizer/beta1_power" + suffix].full_name)
+        "beta_1",
+        named_variables["optimizer/beta_1" + suffix].full_name)
     self.assertEqual(
-        "beta2_power",
-        named_variables["optimizer/beta2_power" + suffix].full_name)
+        "beta_2",
+        named_variables["optimizer/beta_2" + suffix].full_name)
     # Spot check the generated protocol buffers.
     self.assertEqual("optimizer",
                      serialized_graph.nodes[0].children[1].local_name)
     optimizer_node = serialized_graph.nodes[serialized_graph.nodes[0].children[
         1].node_id]
-    self.assertEqual("beta1_power",
-                     optimizer_node.children[0].local_name)
-    self.assertEqual("beta1_power",
-                     serialized_graph.nodes[optimizer_node.children[0].node_id]
-                     .attributes[0].full_name)
-    self.assertEqual(
-        "my_model/dense/kernel",
-        serialized_graph.nodes[optimizer_node.slot_variables[0]
-                               .original_variable_node_id]
-        .attributes[0].full_name)
-    # We strip off the :0 suffix, as variable.name-based saving does.
-    self.assertEqual(
-        "my_model/dense/kernel/Adam",
-        serialized_graph.nodes[optimizer_node.slot_variables[0]
-                               .slot_variable_node_id]
-        .attributes[0].full_name)
-    self.assertEqual(
-        "my_model/dense/kernel/Adam:0",
-        optimizer.get_slot(
-            var=model._named_dense.kernel,
-            name="m").name)
-    self.assertEqual(
-        "model/_named_dense/kernel" + suffix,
-        serialized_graph.nodes[
-            optimizer_node.slot_variables[0]
-            .original_variable_node_id].attributes[0].checkpoint_key)
-    self.assertEqual("m", optimizer_node.slot_variables[0].slot_name)
-    self.assertEqual(
-        "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/m" + suffix,
-        serialized_graph.nodes[
-            optimizer_node.slot_variables[0]
-            .slot_variable_node_id].attributes[0].checkpoint_key)
+    children = [node.local_name for node in optimizer_node.children]
+    six.assertCountEqual(
+        self,
+        # Non-slot dependencies
+        ["beta_1", "beta_2", "iter", "decay", "epsilon", "learning_rate"],
+        children)
+    serialized_slot_keys = []
+    for slot in optimizer_node.slot_variables:
+      for attribute in (
+          serialized_graph.nodes[slot.slot_variable_node_id].attributes):
+        serialized_slot_keys.append(attribute.checkpoint_key)
+    six.assertCountEqual(
+        self,
+        [key + suffix for key in expected_slot_keys],
+        serialized_slot_keys)
 
   @test_util.run_in_graph_and_eager_modes
   def testMoreComplexSaveableReturned(self):
@@ -409,20 +388,19 @@ class CheckpointingTests(test.TestCase):
   @test_util.run_in_graph_and_eager_modes
   def testSaveRestore(self):
     model = MyModel()
-    optimizer = adam.AdamOptimizer(0.001)
+    optimizer = adam.Adam(0.001)
     root_checkpointable = checkpointable_utils.Checkpoint(
         optimizer=optimizer, model=model)
     input_value = constant_op.constant([[3.]])
-    if context.executing_eagerly():
-      optimizer.minimize(
-          lambda: model(input_value))
-    else:
-      train_op = optimizer.minimize(model(input_value))
-      # TODO(allenl): Make initialization more pleasant when graph building.
-      root_checkpointable.save_counter  # pylint: disable=pointless-statement
-      self.evaluate(checkpointable_utils.gather_initializers(
-          root_checkpointable))
-      self.evaluate(train_op)
+    with backprop.GradientTape() as tape:
+      loss = model(input_value)
+    variables = model.trainable_variables
+    gradients = tape.gradient(loss, variables)
+    train_op = optimizer.apply_gradients(zip(gradients, variables))
+    root_checkpointable.save_counter  # pylint: disable=pointless-statement
+    self.evaluate(checkpointable_utils.gather_initializers(
+        root_checkpointable))
+    self.evaluate(train_op)
     prefix = os.path.join(self.get_temp_dir(), "ckpt")
     self.evaluate(state_ops.assign(model._named_dense.variables[1], [42.]))
     m_bias_slot = optimizer.get_slot(model._named_dense.variables[1], "m")
@@ -430,7 +408,8 @@ class CheckpointingTests(test.TestCase):
     save_path = root_checkpointable.save(file_prefix=prefix)
     self.evaluate(state_ops.assign(model._named_dense.variables[1], [43.]))
     self.evaluate(state_ops.assign(root_checkpointable.save_counter, 3))
-    optimizer_variables = self.evaluate(optimizer.variables())
+    optimizer_variables = self.evaluate(
+        sorted(optimizer.variables(), key=lambda v: v.name))
     self.evaluate(state_ops.assign(m_bias_slot, [-2.]))
     # Immediate restoration
     status = root_checkpointable.restore(save_path=save_path).assert_consumed()
@@ -441,11 +420,7 @@ class CheckpointingTests(test.TestCase):
     if not context.executing_eagerly():
       return  # Restore-on-create is only supported when executing eagerly
     on_create_model = MyModel()
-    on_create_optimizer = adam.AdamOptimizer(
-        0.001,
-        # Preserve beta1_power and beta2_power when appying gradients so we can
-        # test that they've been restored correctly.
-        beta1=1.0, beta2=1.0)
+    on_create_optimizer = adam.Adam(0.001)
     on_create_root = checkpointable_utils.Checkpoint(
         optimizer=on_create_optimizer, model=on_create_model)
     # Deferred restoration
@@ -467,15 +442,15 @@ class CheckpointingTests(test.TestCase):
     # Optimizer slot variables are created when the original variable is
     # restored.
     self.assertAllEqual([1.5], self.evaluate(on_create_m_bias_slot))
-    self.assertAllEqual(optimizer_variables[2:],
-                        self.evaluate(on_create_optimizer.variables()))
     dummy_var = resource_variable_ops.ResourceVariable([1.])
-    on_create_optimizer.minimize(loss=dummy_var.read_value)
+    on_create_optimizer.minimize(loss=dummy_var.read_value,
+                                 var_list=[dummy_var])
     status.assert_existing_objects_matched()
     status.assert_consumed()
-    beta1_power, beta2_power = on_create_optimizer._get_beta_accumulators()
-    self.assertAllEqual(optimizer_variables[0], self.evaluate(beta1_power))
-    self.assertAllEqual(optimizer_variables[1], self.evaluate(beta2_power))
+    self.assertAllEqual(
+        optimizer_variables,
+        # Creation order is different, so .variables() needs to be re-sorted.
+        self.evaluate(sorted(optimizer.variables(), key=lambda v: v.name)))
 
   # TODO(allenl): Debug garbage created by this test in python3.
   def testDeferredRestorationUsageEager(self):
@@ -485,21 +460,22 @@ class CheckpointingTests(test.TestCase):
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
     for training_continuation in range(3):
       model = MyModel()
-      optimizer = adam.AdamOptimizer(0.001)
+      optimizer = adam.Adam(0.001)
       root = checkpointable_utils.Checkpoint(
-          optimizer=optimizer, model=model,
-          optimizer_step=training_util.get_or_create_global_step())
+          optimizer=optimizer, model=model)
       root.restore(checkpoint_management.latest_checkpoint(
           checkpoint_directory))
       for _ in range(num_training_steps):
         # TODO(allenl): Use a Dataset and serialize/checkpoint it.
         input_value = constant_op.constant([[3.]])
-        optimizer.minimize(
-            lambda: model(input_value),  # pylint: disable=cell-var-from-loop
-            global_step=root.optimizer_step)
+        with backprop.GradientTape() as tape:
+          loss = model(input_value)
+        variables = model.trainable_variables
+        gradients = tape.gradient(loss, variables)
+        optimizer.apply_gradients(zip(gradients, variables))
       root.save(file_prefix=checkpoint_prefix)
       self.assertEqual((training_continuation + 1) * num_training_steps,
-                       root.optimizer_step.numpy())
+                       root.optimizer.iterations.numpy())
 
   def testUsageGraph(self):
     """Expected usage when graph building."""
@@ -510,14 +486,16 @@ class CheckpointingTests(test.TestCase):
       for training_continuation in range(3):
         with ops.Graph().as_default():
           model = MyModel()
-          optimizer = adam.AdamOptimizer(0.001)
+          optimizer = adam.Adam(0.001)
           root = checkpointable_utils.Checkpoint(
-              optimizer=optimizer, model=model,
-              global_step=training_util.get_or_create_global_step())
+              optimizer=optimizer, model=model)
           input_value = constant_op.constant([[3.]])
-          train_op = optimizer.minimize(
-              model(input_value),
-              global_step=root.global_step)
+          with backprop.GradientTape() as tape:
+            loss = model(input_value)
+          variables = model.trainable_variables
+          gradients = tape.gradient(loss, variables)
+          train_op = optimizer.apply_gradients(zip(gradients, variables))
+
           checkpoint_path = checkpoint_management.latest_checkpoint(
               checkpoint_directory)
           with self.session(graph=ops.get_default_graph()) as session:
@@ -536,7 +514,7 @@ class CheckpointingTests(test.TestCase):
               session.run(train_op)
             root.save(file_prefix=checkpoint_prefix, session=session)
             self.assertEqual((training_continuation + 1) * num_training_steps,
-                             session.run(root.global_step))
+                             session.run(root.optimizer.iterations))
             self.assertEqual(training_continuation + 1,
                              session.run(root.save_counter))
 
@@ -546,21 +524,23 @@ class CheckpointingTests(test.TestCase):
     # Does create garbage when executing eagerly due to ops.Graph() creation.
     num_training_steps = 10
     checkpoint_directory = self.get_temp_dir()
+    def _train_fn(model, input_value):
+      with backprop.GradientTape() as tape:
+        loss = model(input_value)
+      variables = model.trainable_variables
+      gradients = tape.gradient(loss, variables)
+      return optimizer.apply_gradients(zip(gradients, variables))
     for training_continuation in range(3):
       with test_util.device(use_gpu=True):
         model = MyModel()
-        optimizer = adam.AdamOptimizer(0.001)
+        optimizer = adam.Adam(0.001)
         root = checkpointable_utils.Checkpoint(
-            optimizer=optimizer, model=model,
-            global_step=training_util.get_or_create_global_step())
+            optimizer=optimizer, model=model)
         manager = checkpoint_management.CheckpointManager(
             root, checkpoint_directory, max_to_keep=1)
         status = root.restore(save_path=manager.latest_checkpoint)
         input_value = constant_op.constant([[3.]])
-        train_fn = functools.partial(
-            optimizer.minimize,
-            functools.partial(model, input_value),
-            global_step=root.global_step)
+        train_fn = functools.partial(_train_fn, model, input_value)
         if not context.executing_eagerly():
           train_fn = functools.partial(self.evaluate, train_fn())
         status.initialize_or_restore()
@@ -568,7 +548,7 @@ class CheckpointingTests(test.TestCase):
           train_fn()
         manager.save()
         self.assertEqual((training_continuation + 1) * num_training_steps,
-                         self.evaluate(root.global_step))
+                         self.evaluate(root.optimizer.iterations))
         self.assertEqual(training_continuation + 1,
                          self.evaluate(root.save_counter))
 
@@ -628,6 +608,7 @@ class CheckpointingTests(test.TestCase):
 
   # pylint: disable=cell-var-from-loop
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_v1_only("b/120545219")
   def testWithDefun(self):
     num_training_steps = 2
     checkpoint_directory = self.get_temp_dir()
@@ -636,10 +617,9 @@ class CheckpointingTests(test.TestCase):
       with test_util.device(use_gpu=True):
         model = MyModel()
         # Don't actually train so we can test variable values
-        optimizer = adam.AdamOptimizer(0.)
+        optimizer = adam.Adam(0.)
         root = checkpointable_utils.Checkpoint(
-            optimizer=optimizer, model=model,
-            global_step=training_util.get_or_create_global_step())
+            optimizer=optimizer, model=model)
         checkpoint_path = checkpoint_management.latest_checkpoint(
             checkpoint_directory)
         status = root.restore(save_path=checkpoint_path)
@@ -650,8 +630,7 @@ class CheckpointingTests(test.TestCase):
           with backprop.GradientTape() as tape:
             loss = _call_model(constant_op.constant([[3.]]))
           gradients = tape.gradient(loss, model.variables)
-          return optimizer.apply_gradients(zip(gradients, model.variables),
-                                           global_step=root.global_step)
+          return optimizer.apply_gradients(zip(gradients, model.variables))
         if not context.executing_eagerly():
           train_fn = functools.partial(
               self.evaluate, train_fn())
@@ -665,7 +644,7 @@ class CheckpointingTests(test.TestCase):
           self.evaluate(model.variables[0].assign([[42.]]))
         root.save(file_prefix=checkpoint_prefix)
         self.assertEqual((training_continuation + 1) * num_training_steps,
-                         self.evaluate(root.global_step))
+                         self.evaluate(optimizer.iterations))
         self.assertEqual(training_continuation + 1,
                          self.evaluate(root.save_counter))
   # pylint: enable=cell-var-from-loop
@@ -727,7 +706,7 @@ class CheckpointingTests(test.TestCase):
 
     with context.eager_mode():
       model = Model()
-      optimizer = adam.AdamOptimizer(learning_rate=0.05)
+      optimizer = adam.Adam(learning_rate=0.05)
       checkpoint_directory = self.get_temp_dir()
       checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
       checkpoint = checkpointable_utils.Checkpoint(
@@ -813,24 +792,24 @@ class CheckpointingTests(test.TestCase):
     root = tracking.Checkpointable()
     root.var = checkpointable_utils.add_variable(
         root, name="var", initializer=0.)
-    optimizer = adam.AdamOptimizer(0.1)
-    if context.executing_eagerly():
-      optimizer.minimize(root.var.read_value)
-    else:
-      train_op = optimizer.minimize(root.var)
-      # Note that `optimizer` has not been added as a dependency of
-      # `root`. Create a one-off grouping so that slot variables for `root.var`
-      # get initialized too.
-      self.evaluate(checkpointable_utils.gather_initializers(
-          checkpointable_utils.Checkpoint(root=root, optimizer=optimizer)))
-      self.evaluate(train_op)
+    optimizer = adam.Adam(0.1)
+    variables = [root.var]
+    gradients = [1.]
+    train_op = optimizer.apply_gradients(zip(gradients, variables))
+    # Note that `optimizer` has not been added as a dependency of
+    # `root`. Create a one-off grouping so that slot variables for `root.var`
+    # get initialized too.
+    self.evaluate(checkpointable_utils.gather_initializers(
+        checkpointable_utils.Checkpoint(root=root, optimizer=optimizer)))
+    self.evaluate(train_op)
     self.evaluate(state_ops.assign(root.var, 12.))
     no_slots_path = checkpointable_utils.CheckpointableSaver(root).save(
         os.path.join(checkpoint_directory, "no_slots"))
     root.optimizer = optimizer
     self.evaluate(state_ops.assign(root.var, 13.))
-    self.evaluate(state_ops.assign(optimizer.get_slot(name="m", var=root.var),
-                                   14.))
+    self.evaluate(state_ops.assign(
+        optimizer.get_slot(slot_name="m", var=root.var),
+        14.))
     slots_path = checkpointable_utils.CheckpointableSaver(root).save(
         os.path.join(checkpoint_directory, "with_slots"))
     new_root = tracking.Checkpointable()
@@ -847,29 +826,32 @@ class CheckpointingTests(test.TestCase):
     no_slot_status.assert_consumed()
     no_slot_status.run_restore_ops()
     self.assertEqual(12., self.evaluate(new_root.var))
-    new_root.optimizer = adam.AdamOptimizer(0.1)
+    new_root.optimizer = adam.Adam(0.1)
     slot_status.assert_existing_objects_matched()
-    with self.assertRaisesRegexp(AssertionError, "beta1_power"):
+    with self.assertRaisesRegexp(AssertionError, "Unresolved object"):
       slot_status.assert_consumed()
     self.assertEqual(12., self.evaluate(new_root.var))
     if context.executing_eagerly():
       # Slot variables are only created with restoring initializers when
       # executing eagerly.
       self.assertEqual(14., self.evaluate(
-          new_root.optimizer.get_slot(name="m", var=new_root.var)))
+          new_root.optimizer.get_slot(slot_name="m", var=new_root.var)))
     else:
-      self.assertIs(new_root.optimizer.get_slot(name="m", var=new_root.var),
-                    None)
-    if context.executing_eagerly():
-      new_root.optimizer.minimize(new_root.var.read_value)
-    else:
-      train_op = new_root.optimizer.minimize(new_root.var)
-      # The slot variable now exists; restore() didn't create it, but we should
-      # now have a restore op for it.
-      slot_status.run_restore_ops()
+      # Slot variables are not created eagerly when graph building.
+      with self.assertRaises(KeyError):
+        new_root.optimizer.get_slot(slot_name="m", var=new_root.var)
+    variables = [new_root.var]
+    gradients = [1.]
+    train_op = new_root.optimizer.apply_gradients(zip(gradients, variables))
+    # The slot variable now exists; restore() didn't create it, but we should
+    # now have a restore op for it.
+    slot_status.run_restore_ops()
+    if not context.executing_eagerly():
+      # The train op hasn't run when graph building, so the slot variable has
+      # its restored value. It has run in eager, so the value will be different.
       self.assertEqual(14., self.evaluate(
-          new_root.optimizer.get_slot(name="m", var=new_root.var)))
-      self.evaluate(train_op)
+          new_root.optimizer.get_slot(slot_name="m", var=new_root.var)))
+    self.evaluate(train_op)
     slot_status.assert_consumed()
 
   @test_util.run_in_graph_and_eager_modes
@@ -1029,18 +1011,18 @@ class CheckpointingTests(test.TestCase):
     checkpoint_directory = self.get_temp_dir()
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
     first = tracking.Checkpointable()
-    first.var1 = variables.Variable(0., name="outside_var")
-    first.var2 = variables.Variable(0., name="blah")
+    first.var1 = variables_lib.Variable(0., name="outside_var")
+    first.var2 = variables_lib.Variable(0., name="blah")
     self.evaluate(first.var1.assign(4.))
     self.evaluate(first.var2.assign(8.))
     save_path = checkpointable_utils.CheckpointableSaver(first).save(
         checkpoint_prefix)
 
     second = tracking.Checkpointable()
-    second.var2 = variables.Variable(0., name="blah")
+    second.var2 = variables_lib.Variable(0., name="blah")
     status = checkpointable_utils.CheckpointableSaver(
         second).restore(save_path)
-    recreated_var1 = variables.Variable(0., name="outside_var")
+    recreated_var1 = variables_lib.Variable(0., name="outside_var")
     status.run_restore_ops()
     self.assertEqual(8., self.evaluate(second.var2))
     self.evaluate(recreated_var1.assign(-2.))
@@ -1057,15 +1039,16 @@ class CheckpointingTests(test.TestCase):
         checkpoint_directory = self.get_temp_dir()
         checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
         obj = tracking.Checkpointable()
-        obj.var = variable_scope.get_variable(name="v", initializer=0.)
-        obj.opt = adam.AdamOptimizer(0.1)
-        obj.opt.minimize(obj.var.read_value())
+        obj.var = variables_lib.Variable(0., name="v")
+        obj.opt = adam.Adam(0.1)
+        variables = [obj.var]
+        gradients = [1.]
+        obj.opt.apply_gradients(zip(gradients, variables))
         self.evaluate(checkpointable_utils.gather_initializers(obj))
         saver = checkpointable_utils.CheckpointableSaver(obj)
         saver.save(checkpoint_prefix)
-        before_ops = graph.get_operations()
+        graph.finalize()
         saver.save(checkpoint_prefix)
-        self.assertEqual(before_ops, graph.get_operations())
 
   @test_util.run_in_graph_and_eager_modes
   def testCheckpointState(self):
@@ -1146,74 +1129,17 @@ class CheckpointingTests(test.TestCase):
         checkpoint_directory = self.get_temp_dir()
         checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
         obj = tracking.Checkpointable()
-        obj.var = variable_scope.get_variable(name="v", initializer=0.)
-        obj.opt = adam.AdamOptimizer(0.1)
-        obj.opt.minimize(obj.var.read_value())
+        obj.var = variables_lib.Variable(0., name="v")
+        obj.opt = adam.Adam(0.1)
+        variables = [obj.var]
+        gradients = [1.]
+        obj.opt.apply_gradients(zip(gradients, variables))
         self.evaluate(checkpointable_utils.gather_initializers(obj))
         saver = checkpointable_utils.CheckpointableSaver(obj)
         save_path = saver.save(checkpoint_prefix)
         saver.restore(save_path)
-        before_ops = graph.get_operations()
+        graph.finalize()
         saver.restore(save_path)
-        self.assertEqual(before_ops, graph.get_operations())
-
-  def testMultipleGraphsNonSlotVariables(self):
-    with context.graph_mode():
-      checkpoint_directory = self.get_temp_dir()
-      checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-      optimizer = adam.AdamOptimizer(0.001)
-      # Construct a model in one graph
-      first_graph = ops.Graph()
-      first_session = session_lib.Session(graph=first_graph)
-      with first_graph.as_default(), first_session.as_default():
-        first_variable = resource_variable_ops.ResourceVariable([1.])
-        first_root_checkpointable = checkpointable_utils.Checkpoint(
-            optimizer=optimizer, variable=first_variable)
-        train_op = optimizer.minimize(first_variable.read_value)
-        self.evaluate(checkpointable_utils.gather_initializers(
-            first_root_checkpointable))
-        self.evaluate(train_op)
-        self.evaluate(first_variable.assign([1.]))
-        self.evaluate(optimizer.get_slot(
-            var=first_variable, name="m").assign([2.]))
-        beta1_power, _ = optimizer._get_beta_accumulators()
-        self.evaluate(beta1_power.assign(3.))
-
-      # Save and load in a second graph
-      second_graph = ops.Graph()
-      with second_graph.as_default(), session_lib.Session(graph=second_graph):
-        second_variable = resource_variable_ops.ResourceVariable([1.])
-        second_root_checkpointable = checkpointable_utils.Checkpoint(
-            optimizer=optimizer, variable=second_variable)
-        train_op = optimizer.minimize(second_variable.read_value)
-        second_root_checkpointable.restore(None).initialize_or_restore()
-        self.evaluate(train_op)
-        self.evaluate(second_variable.assign([4.]))
-        self.evaluate(optimizer.get_slot(
-            var=second_variable, name="m").assign([5.]))
-        beta1_power, _ = optimizer._get_beta_accumulators()
-        self.evaluate(beta1_power.assign(6.))
-        save_path = second_root_checkpointable.save(checkpoint_prefix)
-        self.evaluate(second_variable.assign([7.]))
-        self.evaluate(optimizer.get_slot(
-            var=second_variable, name="m").assign([8.]))
-        beta1_power, _ = optimizer._get_beta_accumulators()
-        self.assertAllEqual(6., self.evaluate(beta1_power))
-        status = second_root_checkpointable.restore(save_path)
-        status.assert_consumed().run_restore_ops()
-        self.assertAllEqual([4.], self.evaluate(second_variable))
-        self.assertAllEqual([5.], self.evaluate(optimizer.get_slot(
-            var=second_variable, name="m")))
-        beta1_power, _ = optimizer._get_beta_accumulators()
-        self.assertAllEqual(6., self.evaluate(beta1_power))
-
-      # Check that the first graph is unmolested
-      with first_graph.as_default(), first_session.as_default():
-        self.assertAllEqual([1.], self.evaluate(first_variable))
-        self.assertAllEqual([2.], self.evaluate(optimizer.get_slot(
-            var=first_variable, name="m")))
-        beta1_power, _ = optimizer._get_beta_accumulators()
-        self.assertAllEqual(3., self.evaluate(beta1_power))
 
   @test_util.run_in_graph_and_eager_modes
   def test_sequential(self):
@@ -1254,10 +1180,9 @@ class CheckpointingTests(test.TestCase):
     optimizer_only_prefix = os.path.join(checkpoint_directory, "opt")
     with test_util.device(use_gpu=True):
       model = MyModel()
-      optimizer = adam.AdamOptimizer(0.001)
+      optimizer = adam.Adam(0.001)
       root = checkpointable_utils.Checkpoint(
-          model=model,  # Do not save the optimizer with the checkpoint.
-          global_step=training_util.get_or_create_global_step())
+          model=model)  # Do not save the optimizer with the checkpoint.
       optimizer_checkpoint = checkpointable_utils.Checkpoint(
           optimizer=optimizer)
 
@@ -1265,65 +1190,78 @@ class CheckpointingTests(test.TestCase):
           checkpoint_directory)
       status = root.restore(save_path=checkpoint_path)
       input_value = constant_op.constant([[3.]])
-      train_fn = functools.partial(
-          optimizer.minimize,
-          functools.partial(model, input_value),
-          global_step=root.global_step)
+      def train_fn():
+        with backprop.GradientTape() as tape:
+          loss = model(input_value)
+        variables = model.trainable_variables
+        gradients = tape.gradient(loss, variables)
+        return optimizer.apply_gradients(zip(gradients, variables))
       if not context.executing_eagerly():
         train_fn = functools.partial(self.evaluate, train_fn())
       status.initialize_or_restore()
-      self.evaluate([v.initializer for v in optimizer.variables()])
+      # TODO(tanzheny): Add hyper variables to .variables(), and set them with
+      # set_weights etc.
+      variables_not_in_the_variables_property = [
+          obj for obj in optimizer._hyper.values()
+          if isinstance(obj, variables_lib.Variable)]
+      self.evaluate([v.initializer for v
+                     in optimizer.variables()
+                     + variables_not_in_the_variables_property])
       train_fn()
       model_save_path = root.save(file_prefix=checkpoint_prefix)
-      self.evaluate(optimizer.variables()[0].assign(42.))
+      self.evaluate(optimizer.beta_1.assign(42.))
       optimizer_save_path = optimizer_checkpoint.save(optimizer_only_prefix)
+    del train_fn
 
     # Restore into a graph with the optimizer
     with test_util.device(use_gpu=True):
       model = MyModel()
-      optimizer = adam.AdamOptimizer(0.001)
+      optimizer = adam.Adam(0.001)
       root = checkpointable_utils.Checkpoint(
-          optimizer=optimizer, model=model,
-          global_step=training_util.get_or_create_global_step())
+          optimizer=optimizer, model=model)
       status = root.restore(save_path=model_save_path)
       input_value = constant_op.constant([[3.]])
-      train_fn = functools.partial(
-          optimizer.minimize,
-          functools.partial(model, input_value),
-          global_step=root.global_step)
+      def train_fn1():
+        with backprop.GradientTape() as tape:
+          loss = model(input_value)
+        variables = model.trainable_variables
+        gradients = tape.gradient(loss, variables)
+        return optimizer.apply_gradients(zip(gradients, variables))
       if not context.executing_eagerly():
-        train_fn = functools.partial(self.evaluate, train_fn())
+        train_fn1 = functools.partial(self.evaluate, train_fn1())
       status.initialize_or_restore()
-      train_fn()
+      train_fn1()
       with self.assertRaises(AssertionError):
         status.assert_existing_objects_matched()
       with self.assertRaises(AssertionError):
         status.assert_consumed()
+    del train_fn1
 
     # Make sure initialization doesn't clobber later restores
     with test_util.device(use_gpu=True):
       model = MyModel()
-      optimizer = adam.AdamOptimizer(0.001, beta1=1.0)
+      optimizer = adam.Adam(0.001, beta1=1.0)
       root = checkpointable_utils.Checkpoint(
-          optimizer=optimizer, model=model,
-          global_step=training_util.get_or_create_global_step())
+          optimizer=optimizer, model=model)
       opt_root = checkpointable_utils.Checkpoint(
           optimizer=optimizer)
       status = root.restore(save_path=model_save_path)
       init_only_optimizer_status = opt_root.restore(save_path=None)
       optimizer_status = opt_root.restore(save_path=optimizer_save_path)
       input_value = constant_op.constant([[3.]])
-      train_fn = functools.partial(
-          optimizer.minimize,
-          functools.partial(model, input_value),
-          global_step=root.global_step)
+      def train_fn2():
+        with backprop.GradientTape() as tape:
+          loss = model(input_value)
+        variables = model.trainable_variables
+        gradients = tape.gradient(loss, variables)
+        return optimizer.apply_gradients(zip(gradients, variables))
       if not context.executing_eagerly():
-        train_fn = functools.partial(self.evaluate, train_fn())
+        train_fn2 = functools.partial(self.evaluate, train_fn2())
       optimizer_status.run_restore_ops()
       status.initialize_or_restore()
       init_only_optimizer_status.initialize_or_restore()
-      train_fn()
-      self.assertEqual(42., self.evaluate(optimizer.variables()[0]))
+      train_fn2()
+      self.assertEqual(42., self.evaluate(optimizer.beta_1))
 
   @test_util.run_in_graph_and_eager_modes
   def test_restore_after_adding_empty_checkpointable_data_structure(self):
@@ -1356,7 +1294,7 @@ class _ManualScope(tracking.Checkpointable):
     return variable_scope.get_variable(name="in_manual_scope", shape=[])
 
 
-class TemplateTests(test.TestCase):
+class TemplateTests(parameterized.TestCase, test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
   def test_checkpointable_save_restore(self):
@@ -1380,10 +1318,11 @@ class TemplateTests(test.TestCase):
     manual_dep, = manual_scope._checkpoint_dependencies
     self.assertEqual("in_manual_scope", manual_dep.name)
     self.assertIs(manual_scope_v, manual_dep.ref)
-    optimizer = adam.AdamOptimizer(0.0)
+    optimizer = adam.Adam(0.0)
     save_root = checkpointable_utils.Checkpoint(
         my_template=save_template, optimizer=optimizer)
-    optimizer.minimize(v1_save.read_value)
+    optimizer.minimize(v1_save.read_value,
+                       var_list=[v1_save])
     self.evaluate([v.initializer for v in save_template.variables])
     self.evaluate([v.initializer for v in optimizer.variables()])
     self.evaluate(v1_save.assign([12.]))
@@ -1393,13 +1332,13 @@ class TemplateTests(test.TestCase):
     save_path = save_root.save(checkpoint_prefix)
 
     load_template = template.make_template("s2", _templated)
-    load_optimizer = adam.AdamOptimizer(0.0)
+    load_optimizer = adam.Adam(0.0)
     load_root = checkpointable_utils.Checkpoint(
         my_template=load_template, optimizer=load_optimizer)
     status = load_root.restore(save_path)
     var, var_plus_one, var2, _, _ = load_template()
-    load_optimizer.minimize(var.read_value)
-    self.assertEqual(3, len(load_template._checkpoint_dependencies))
+    load_optimizer.minimize(var.read_value, var_list=[var])
+    self.assertLen(load_template._checkpoint_dependencies, 3)
     self.assertEqual("v", load_template._checkpoint_dependencies[0].name)
     self.assertEqual("v2", load_template._checkpoint_dependencies[1].name)
     self.assertEqual("ManualScope",
@@ -1440,14 +1379,14 @@ class TemplateTests(test.TestCase):
     status = load_root.restore(save_path)
     (inner_template_one, inner_template_two), (v1, v2, v3) = load_template()
     outer_template_dependencies = load_root.my_template._checkpoint_dependencies
-    self.assertEqual(2, len(outer_template_dependencies))
+    self.assertLen(outer_template_dependencies, 2)
     self.assertEqual("i1", outer_template_dependencies[0].name)
     self.assertIs(inner_template_one, outer_template_dependencies[0].ref)
     self.assertEqual("i2", outer_template_dependencies[1].name)
     self.assertIs(inner_template_two, outer_template_dependencies[1].ref)
-    self.assertEqual(1, len(inner_template_one._checkpoint_dependencies))
+    self.assertLen(inner_template_one._checkpoint_dependencies, 1)
     self.assertEqual("v", inner_template_one._checkpoint_dependencies[0].name)
-    self.assertEqual(1, len(inner_template_two._checkpoint_dependencies))
+    self.assertLen(inner_template_two._checkpoint_dependencies, 1)
     self.assertEqual("v", inner_template_two._checkpoint_dependencies[0].name)
     status.assert_consumed().run_restore_ops()
     self.assertAllEqual([20.], self.evaluate(v1))
@@ -1460,13 +1399,14 @@ class CheckpointCompatibilityTests(test.TestCase):
   def _initialized_model(self):
     input_value = constant_op.constant([[3.]])
     model = MyModel()
-    optimizer = adam.AdamOptimizer(0.001)
-    optimizer_step = training_util.get_or_create_global_step()
+    optimizer = adam.Adam(0.001)
     root_checkpointable = checkpointable_utils.Checkpoint(
-        optimizer=optimizer, model=model, optimizer_step=optimizer_step)
-    train_op = optimizer.minimize(
-        functools.partial(model, input_value),
-        global_step=optimizer_step)
+        optimizer=optimizer, model=model)
+    with backprop.GradientTape() as tape:
+      loss = model(input_value)
+    variables = model.trainable_variables
+    gradients = tape.gradient(loss, variables)
+    train_op = optimizer.apply_gradients(zip(gradients, variables))
     self.evaluate(checkpointable_utils.gather_initializers(
         root_checkpointable))
     self.evaluate(train_op)
@@ -1474,28 +1414,26 @@ class CheckpointCompatibilityTests(test.TestCase):
     # with known values to check when loading.
     self.evaluate(model._named_dense.bias.assign([1.]))
     self.evaluate(optimizer.get_slot(
-        var=model._named_dense.bias, name="m").assign([2.]))
-    beta1_power, _ = optimizer._get_beta_accumulators()
-    self.evaluate(beta1_power.assign(3.))
+        var=model._named_dense.bias, slot_name="m").assign([2.]))
+    self.evaluate(optimizer.beta_1.assign(3.))
     return root_checkpointable
 
   def _set_sentinels(self, root_checkpointable):
     self.evaluate(root_checkpointable.model._named_dense.bias.assign([101.]))
     self.evaluate(
         root_checkpointable.optimizer.get_slot(
-            var=root_checkpointable.model._named_dense.bias, name="m")
+            var=root_checkpointable.model._named_dense.bias, slot_name="m")
         .assign([102.]))
-    beta1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
-    self.evaluate(beta1_power.assign(103.))
+    self.evaluate(root_checkpointable.optimizer.beta_1.assign(103.))
 
   def _check_sentinels(self, root_checkpointable):
     self.assertAllEqual(
         [1.], self.evaluate(root_checkpointable.model._named_dense.bias))
     self.assertAllEqual([2.], self.evaluate(
         root_checkpointable.optimizer.get_slot(
-            var=root_checkpointable.model._named_dense.bias, name="m")))
-    beta1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
-    self.assertAllEqual(3., self.evaluate(beta1_power))
+            var=root_checkpointable.model._named_dense.bias, slot_name="m")))
+    self.assertAllEqual(3.,
+                        self.evaluate(root_checkpointable.optimizer.beta_1))
 
   def _write_name_based_checkpoint(self):
     checkpoint_directory = self.get_temp_dir()
@@ -1508,7 +1446,7 @@ class CheckpointCompatibilityTests(test.TestCase):
         name_saver = saver_lib.Saver()
         return name_saver.save(
             sess=session, save_path=checkpoint_prefix,
-            global_step=root.optimizer_step)
+            global_step=root.optimizer.iterations)
 
   @test_util.run_in_graph_and_eager_modes
   def testLoadFromNameBasedSaver(self):
diff --git a/tensorflow/python/training/checkpointable/util_with_v1_optimizers_test.py b/tensorflow/python/training/checkpointable/util_with_v1_optimizers_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..00d5747f7838ae48d022675fd878b59d659db38a
--- /dev/null
+++ b/tensorflow/python/training/checkpointable/util_with_v1_optimizers_test.py
@@ -0,0 +1,873 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for object-based saving which use tf.train.* optimizers."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+import os
+
+import six
+
+from tensorflow.python.client import session as session_lib
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
+from tensorflow.python.eager import test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras.engine import training
+from tensorflow.python.keras.layers import core
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import template
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.training import adam
+from tensorflow.python.training import checkpoint_management
+from tensorflow.python.training import saver as saver_lib
+from tensorflow.python.training import training_util
+from tensorflow.python.training.checkpointable import tracking
+from tensorflow.python.training.checkpointable import util as checkpointable_utils
+
+
+class NonLayerCheckpointable(tracking.Checkpointable):
+
+  def __init__(self):
+    super(NonLayerCheckpointable, self).__init__()
+    self.a_variable = checkpointable_utils.add_variable(
+        self, name="a_variable", shape=[])
+
+
+# pylint: disable=not-callable
+class MyModel(training.Model):
+  """A concrete Model for testing."""
+
+  def __init__(self):
+    super(MyModel, self).__init__()
+    self._named_dense = core.Dense(1, use_bias=True)
+    self._second = core.Dense(1, use_bias=False)
+    # We can still track Checkpointables which aren't Layers.
+    self._non_layer = NonLayerCheckpointable()
+
+  def call(self, values):
+    ret = self._second(self._named_dense(values))
+    return ret
+
+
+class CheckpointingTests(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  def testNamingWithOptimizer(self):
+    input_value = constant_op.constant([[3.]])
+    model = MyModel()
+    # A nuisance Model using the same optimizer. Its slot variables should not
+    # go in the checkpoint, since it is never depended on.
+    other_model = MyModel()
+    optimizer = adam.AdamOptimizer(0.001)
+    optimizer_step = training_util.get_or_create_global_step()
+    root_checkpointable = checkpointable_utils.Checkpoint(
+        optimizer=optimizer, model=model, optimizer_step=optimizer_step)
+    if context.executing_eagerly():
+      optimizer.minimize(
+          lambda: model(input_value),
+          global_step=optimizer_step)
+      optimizer.minimize(
+          lambda: other_model(input_value),
+          global_step=optimizer_step)
+    else:
+      train_op = optimizer.minimize(
+          model(input_value), global_step=optimizer_step)
+      optimizer.minimize(
+          other_model(input_value),
+          global_step=optimizer_step)
+      self.evaluate(checkpointable_utils.gather_initializers(
+          root_checkpointable))
+      self.evaluate(train_op)
+    named_variables, serialized_graph, _ = (
+        checkpointable_utils._serialize_object_graph(
+            root_checkpointable, saveables_cache=None))
+    expected_checkpoint_names = (
+        # Created in the root node, so no prefix.
+        "optimizer_step",
+        "model/_second/kernel",
+        "model/_named_dense/kernel",
+        "model/_named_dense/bias",
+        # non-Layer dependency of the model
+        "model/_non_layer/a_variable",
+        # The optimizer creates two non-slot variables
+        "optimizer/beta1_power",
+        "optimizer/beta2_power",
+        # Slot variables
+        "model/_second/kernel/.OPTIMIZER_SLOT/optimizer/m",
+        "model/_second/kernel/.OPTIMIZER_SLOT/optimizer/v",
+        "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/m",
+        "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/v",
+        "model/_named_dense/bias/.OPTIMIZER_SLOT/optimizer/m",
+        "model/_named_dense/bias/.OPTIMIZER_SLOT/optimizer/v",
+    )
+    suffix = "/.ATTRIBUTES/VARIABLE_VALUE"
+    expected_checkpoint_names = [
+        name + suffix for name in expected_checkpoint_names]
+    # The Dense layers also save get_config() JSON
+    expected_checkpoint_names.extend(
+        ["model/_second/.ATTRIBUTES/OBJECT_CONFIG_JSON",
+         "model/_named_dense/.ATTRIBUTES/OBJECT_CONFIG_JSON"])
+    named_variables = {v.name: v for v in named_variables}
+    six.assertCountEqual(self, expected_checkpoint_names,
+                         named_variables.keys())
+    # Check that we've mapped to the right variable objects (not exhaustive)
+    self.assertEqual(
+        "global_step",
+        named_variables["optimizer_step" + suffix].full_name)
+    self.assertEqual(
+        "my_model/dense_1/kernel",
+        named_variables["model/_second/kernel" + suffix].full_name)
+    self.assertEqual(
+        "my_model/dense/kernel",
+        named_variables["model/_named_dense/kernel" + suffix].full_name)
+    self.assertEqual(
+        "beta1_power",
+        named_variables["optimizer/beta1_power" + suffix].full_name)
+    self.assertEqual(
+        "beta2_power",
+        named_variables["optimizer/beta2_power" + suffix].full_name)
+    # Spot check the generated protocol buffers.
+    self.assertEqual("optimizer",
+                     serialized_graph.nodes[0].children[1].local_name)
+    optimizer_node = serialized_graph.nodes[serialized_graph.nodes[0].children[
+        1].node_id]
+    self.assertEqual("beta1_power",
+                     optimizer_node.children[0].local_name)
+    self.assertEqual("beta1_power",
+                     serialized_graph.nodes[optimizer_node.children[0].node_id]
+                     .attributes[0].full_name)
+    self.assertEqual(
+        "my_model/dense/kernel",
+        serialized_graph.nodes[optimizer_node.slot_variables[0]
+                               .original_variable_node_id]
+        .attributes[0].full_name)
+    # We strip off the :0 suffix, as variable.name-based saving does.
+    self.assertEqual(
+        "my_model/dense/kernel/Adam",
+        serialized_graph.nodes[optimizer_node.slot_variables[0]
+                               .slot_variable_node_id]
+        .attributes[0].full_name)
+    self.assertEqual(
+        "my_model/dense/kernel/Adam:0",
+        optimizer.get_slot(
+            var=model._named_dense.kernel,
+            name="m").name)
+    self.assertEqual(
+        "model/_named_dense/kernel" + suffix,
+        serialized_graph.nodes[
+            optimizer_node.slot_variables[0]
+            .original_variable_node_id].attributes[0].checkpoint_key)
+    self.assertEqual("m", optimizer_node.slot_variables[0].slot_name)
+    self.assertEqual(
+        "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/m" + suffix,
+        serialized_graph.nodes[
+            optimizer_node.slot_variables[0]
+            .slot_variable_node_id].attributes[0].checkpoint_key)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testSaveRestore(self):
+    model = MyModel()
+    optimizer = adam.AdamOptimizer(0.001)
+    root_checkpointable = checkpointable_utils.Checkpoint(
+        optimizer=optimizer, model=model)
+    input_value = constant_op.constant([[3.]])
+    if context.executing_eagerly():
+      optimizer.minimize(
+          lambda: model(input_value))
+    else:
+      train_op = optimizer.minimize(model(input_value))
+      # TODO(allenl): Make initialization more pleasant when graph building.
+      root_checkpointable.save_counter  # pylint: disable=pointless-statement
+      self.evaluate(checkpointable_utils.gather_initializers(
+          root_checkpointable))
+      self.evaluate(train_op)
+    prefix = os.path.join(self.get_temp_dir(), "ckpt")
+    self.evaluate(state_ops.assign(model._named_dense.variables[1], [42.]))
+    m_bias_slot = optimizer.get_slot(model._named_dense.variables[1], "m")
+    self.evaluate(state_ops.assign(m_bias_slot, [1.5]))
+    save_path = root_checkpointable.save(file_prefix=prefix)
+    self.evaluate(state_ops.assign(model._named_dense.variables[1], [43.]))
+    self.evaluate(state_ops.assign(root_checkpointable.save_counter, 3))
+    optimizer_variables = self.evaluate(optimizer.variables())
+    self.evaluate(state_ops.assign(m_bias_slot, [-2.]))
+    # Immediate restoration
+    status = root_checkpointable.restore(save_path=save_path).assert_consumed()
+    status.run_restore_ops()
+    self.assertAllEqual([42.], self.evaluate(model._named_dense.variables[1]))
+    self.assertAllEqual(1, self.evaluate(root_checkpointable.save_counter))
+    self.assertAllEqual([1.5], self.evaluate(m_bias_slot))
+    if not context.executing_eagerly():
+      return  # Restore-on-create is only supported when executing eagerly
+    on_create_model = MyModel()
+    on_create_optimizer = adam.AdamOptimizer(
+        0.001,
+        # Preserve beta1_power and beta2_power when appying gradients so we can
+        # test that they've been restored correctly.
+        beta1=1.0, beta2=1.0)
+    on_create_root = checkpointable_utils.Checkpoint(
+        optimizer=on_create_optimizer, model=on_create_model)
+    # Deferred restoration
+    status = on_create_root.restore(save_path=save_path)
+    status.assert_nontrivial_match()
+    status.assert_existing_objects_matched()
+    with self.assertRaises(AssertionError):
+      status.assert_consumed()
+    on_create_model(constant_op.constant([[3.]]))  # create variables
+    self.assertAllEqual(1, self.evaluate(on_create_root.save_counter))
+    self.assertAllEqual([42.],
+                        self.evaluate(
+                            on_create_model._named_dense.variables[1]))
+    on_create_m_bias_slot = on_create_optimizer.get_slot(
+        on_create_model._named_dense.variables[1], "m")
+    status.assert_existing_objects_matched()
+    with self.assertRaises(AssertionError):
+      status.assert_consumed()
+    # Optimizer slot variables are created when the original variable is
+    # restored.
+    self.assertAllEqual([1.5], self.evaluate(on_create_m_bias_slot))
+    self.assertAllEqual(optimizer_variables[2:],
+                        self.evaluate(on_create_optimizer.variables()))
+    dummy_var = resource_variable_ops.ResourceVariable([1.])
+    on_create_optimizer.minimize(loss=dummy_var.read_value)
+    status.assert_existing_objects_matched()
+    status.assert_consumed()
+    beta1_power, beta2_power = on_create_optimizer._get_beta_accumulators()
+    self.assertAllEqual(optimizer_variables[0], self.evaluate(beta1_power))
+    self.assertAllEqual(optimizer_variables[1], self.evaluate(beta2_power))
+
+  # TODO(allenl): Debug garbage created by this test in python3.
+  def testDeferredRestorationUsageEager(self):
+    """An idiomatic eager execution example."""
+    num_training_steps = 10
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    for training_continuation in range(3):
+      model = MyModel()
+      optimizer = adam.AdamOptimizer(0.001)
+      root = checkpointable_utils.Checkpoint(
+          optimizer=optimizer, model=model,
+          optimizer_step=training_util.get_or_create_global_step())
+      root.restore(checkpoint_management.latest_checkpoint(
+          checkpoint_directory))
+      for _ in range(num_training_steps):
+        # TODO(allenl): Use a Dataset and serialize/checkpoint it.
+        input_value = constant_op.constant([[3.]])
+        optimizer.minimize(
+            lambda: model(input_value),  # pylint: disable=cell-var-from-loop
+            global_step=root.optimizer_step)
+      root.save(file_prefix=checkpoint_prefix)
+      self.assertEqual((training_continuation + 1) * num_training_steps,
+                       root.optimizer_step.numpy())
+
+  def testUsageGraph(self):
+    """Expected usage when graph building."""
+    with context.graph_mode():
+      num_training_steps = 10
+      checkpoint_directory = self.get_temp_dir()
+      checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+      for training_continuation in range(3):
+        with ops.Graph().as_default():
+          model = MyModel()
+          optimizer = adam.AdamOptimizer(0.001)
+          root = checkpointable_utils.Checkpoint(
+              optimizer=optimizer, model=model,
+              global_step=training_util.get_or_create_global_step())
+          input_value = constant_op.constant([[3.]])
+          train_op = optimizer.minimize(
+              model(input_value),
+              global_step=root.global_step)
+          checkpoint_path = checkpoint_management.latest_checkpoint(
+              checkpoint_directory)
+          with self.session(graph=ops.get_default_graph()) as session:
+            status = root.restore(save_path=checkpoint_path)
+            status.initialize_or_restore(session=session)
+            if checkpoint_path is None:
+              self.assertEqual(0, training_continuation)
+              with self.assertRaises(AssertionError):
+                status.assert_consumed()
+              with self.assertRaises(AssertionError):
+                status.assert_existing_objects_matched()
+            else:
+              status.assert_consumed()
+              status.assert_existing_objects_matched()
+            for _ in range(num_training_steps):
+              session.run(train_op)
+            root.save(file_prefix=checkpoint_prefix, session=session)
+            self.assertEqual((training_continuation + 1) * num_training_steps,
+                             session.run(root.global_step))
+            self.assertEqual(training_continuation + 1,
+                             session.run(root.save_counter))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testAgnosticUsage(self):
+    """Graph/eager agnostic usage."""
+    # Does create garbage when executing eagerly due to ops.Graph() creation.
+    num_training_steps = 10
+    checkpoint_directory = self.get_temp_dir()
+    for training_continuation in range(3):
+      with test_util.device(use_gpu=True):
+        model = MyModel()
+        optimizer = adam.AdamOptimizer(0.001)
+        root = checkpointable_utils.Checkpoint(
+            optimizer=optimizer, model=model,
+            global_step=training_util.get_or_create_global_step())
+        manager = checkpoint_management.CheckpointManager(
+            root, checkpoint_directory, max_to_keep=1)
+        status = root.restore(save_path=manager.latest_checkpoint)
+        input_value = constant_op.constant([[3.]])
+        train_fn = functools.partial(
+            optimizer.minimize,
+            functools.partial(model, input_value),
+            global_step=root.global_step)
+        if not context.executing_eagerly():
+          train_fn = functools.partial(self.evaluate, train_fn())
+        status.initialize_or_restore()
+        for _ in range(num_training_steps):
+          train_fn()
+        manager.save()
+        self.assertEqual((training_continuation + 1) * num_training_steps,
+                         self.evaluate(root.global_step))
+        self.assertEqual(training_continuation + 1,
+                         self.evaluate(root.save_counter))
+
+  # pylint: disable=cell-var-from-loop
+  @test_util.run_in_graph_and_eager_modes
+  def testWithDefun(self):
+    num_training_steps = 2
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    for training_continuation in range(3):
+      with test_util.device(use_gpu=True):
+        model = MyModel()
+        # Don't actually train so we can test variable values
+        optimizer = adam.AdamOptimizer(0.)
+        root = checkpointable_utils.Checkpoint(
+            optimizer=optimizer, model=model,
+            global_step=training_util.get_or_create_global_step())
+        checkpoint_path = checkpoint_management.latest_checkpoint(
+            checkpoint_directory)
+        status = root.restore(save_path=checkpoint_path)
+        def train_fn():
+          @def_function.function
+          def _call_model(x):
+            return model(x)
+          with backprop.GradientTape() as tape:
+            loss = _call_model(constant_op.constant([[3.]]))
+          gradients = tape.gradient(loss, model.variables)
+          return optimizer.apply_gradients(zip(gradients, model.variables),
+                                           global_step=root.global_step)
+        if not context.executing_eagerly():
+          train_fn = functools.partial(
+              self.evaluate, train_fn())
+        status.initialize_or_restore()
+        for _ in range(num_training_steps):
+          train_fn()
+        if training_continuation > 0:
+          status.assert_consumed()
+          self.assertAllClose([[42.]], self.evaluate(model.variables[0]))
+        else:
+          self.evaluate(model.variables[0].assign([[42.]]))
+        root.save(file_prefix=checkpoint_prefix)
+        self.assertEqual((training_continuation + 1) * num_training_steps,
+                         self.evaluate(root.global_step))
+        self.assertEqual(training_continuation + 1,
+                         self.evaluate(root.save_counter))
+  # pylint: enable=cell-var-from-loop
+
+  def _get_checkpoint_name(self, name):
+    root = tracking.Checkpointable()
+    checkpointable_utils.add_variable(
+        root, name=name, shape=[1, 2], dtype=dtypes.float64)
+    (named_variable,), _, _ = checkpointable_utils._serialize_object_graph(
+        root, saveables_cache=None)
+    with ops.name_scope("root/" + named_variable.name):
+      pass  # Make sure we can use this as an op name if we prefix it.
+    return named_variable.name
+
+  def testAnonymousVarsInInit(self):
+
+    class Model(training.Model):
+
+      def __init__(self):
+        super(Model, self).__init__()
+        self.w = resource_variable_ops.ResourceVariable(0.0)
+        self.b = resource_variable_ops.ResourceVariable(0.0)
+        self.vars = [self.w, self.b]
+
+      def call(self, x):
+        return x * self.w + self.b
+
+    with context.eager_mode():
+      model = Model()
+      optimizer = adam.AdamOptimizer(learning_rate=0.05)
+      checkpoint_directory = self.get_temp_dir()
+      checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+      checkpoint = checkpointable_utils.Checkpoint(
+          model=model, optimizer=optimizer)
+      for _ in range(2):
+        checkpoint.save(checkpoint_prefix)
+        with backprop.GradientTape() as tape:
+          loss = (constant_op.constant(1.)
+                  - model(constant_op.constant(1.))) ** 2
+        grad = tape.gradient(loss, model.vars)
+        optimizer.apply_gradients(
+            [(g, v) for g, v in zip(grad, model.vars)])
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDeferredSlotRestoration(self):
+    checkpoint_directory = self.get_temp_dir()
+
+    root = tracking.Checkpointable()
+    root.var = checkpointable_utils.add_variable(
+        root, name="var", initializer=0.)
+    optimizer = adam.AdamOptimizer(0.1)
+    if context.executing_eagerly():
+      optimizer.minimize(root.var.read_value)
+    else:
+      train_op = optimizer.minimize(root.var)
+      # Note that `optimizer` has not been added as a dependency of
+      # `root`. Create a one-off grouping so that slot variables for `root.var`
+      # get initialized too.
+      self.evaluate(checkpointable_utils.gather_initializers(
+          checkpointable_utils.Checkpoint(root=root, optimizer=optimizer)))
+      self.evaluate(train_op)
+    self.evaluate(state_ops.assign(root.var, 12.))
+    no_slots_path = checkpointable_utils.CheckpointableSaver(root).save(
+        os.path.join(checkpoint_directory, "no_slots"))
+    root.optimizer = optimizer
+    self.evaluate(state_ops.assign(root.var, 13.))
+    self.evaluate(state_ops.assign(optimizer.get_slot(name="m", var=root.var),
+                                   14.))
+    slots_path = checkpointable_utils.CheckpointableSaver(root).save(
+        os.path.join(checkpoint_directory, "with_slots"))
+    new_root = tracking.Checkpointable()
+    # Load the slot-containing checkpoint (deferred), then immediately overwrite
+    # the non-slot variable (also deferred).
+    slot_status = checkpointable_utils.CheckpointableSaver(
+        new_root).restore(slots_path)
+    no_slot_status = checkpointable_utils.CheckpointableSaver(
+        new_root).restore(no_slots_path)
+    with self.assertRaises(AssertionError):
+      no_slot_status.assert_consumed()
+    new_root.var = checkpointable_utils.add_variable(
+        new_root, name="var", shape=[])
+    no_slot_status.assert_consumed()
+    no_slot_status.run_restore_ops()
+    self.assertEqual(12., self.evaluate(new_root.var))
+    new_root.optimizer = adam.AdamOptimizer(0.1)
+    slot_status.assert_existing_objects_matched()
+    with self.assertRaisesRegexp(AssertionError, "beta1_power"):
+      slot_status.assert_consumed()
+    self.assertEqual(12., self.evaluate(new_root.var))
+    if context.executing_eagerly():
+      # Slot variables are only created with restoring initializers when
+      # executing eagerly.
+      self.assertEqual(14., self.evaluate(
+          new_root.optimizer.get_slot(name="m", var=new_root.var)))
+    else:
+      self.assertIs(new_root.optimizer.get_slot(name="m", var=new_root.var),
+                    None)
+    if context.executing_eagerly():
+      new_root.optimizer.minimize(new_root.var.read_value)
+    else:
+      train_op = new_root.optimizer.minimize(new_root.var)
+      # The slot variable now exists; restore() didn't create it, but we should
+      # now have a restore op for it.
+      slot_status.run_restore_ops()
+      self.assertEqual(14., self.evaluate(
+          new_root.optimizer.get_slot(name="m", var=new_root.var)))
+      self.evaluate(train_op)
+    slot_status.assert_consumed()
+
+  def testManySavesGraph(self):
+    """Saves after the first should not modify the graph."""
+    with context.graph_mode():
+      graph = ops.Graph()
+      with graph.as_default(), self.session(graph):
+        checkpoint_directory = self.get_temp_dir()
+        checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+        obj = tracking.Checkpointable()
+        obj.var = variable_scope.get_variable(name="v", initializer=0.)
+        obj.opt = adam.AdamOptimizer(0.1)
+        obj.opt.minimize(obj.var.read_value())
+        self.evaluate(checkpointable_utils.gather_initializers(obj))
+        saver = checkpointable_utils.CheckpointableSaver(obj)
+        saver.save(checkpoint_prefix)
+        before_ops = graph.get_operations()
+        saver.save(checkpoint_prefix)
+        self.assertEqual(before_ops, graph.get_operations())
+
+  def testManyRestoresGraph(self):
+    """Restores after the first should not modify the graph."""
+    with context.graph_mode():
+      graph = ops.Graph()
+      with graph.as_default(), self.session(graph):
+        checkpoint_directory = self.get_temp_dir()
+        checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+        obj = tracking.Checkpointable()
+        obj.var = variable_scope.get_variable(name="v", initializer=0.)
+        obj.opt = adam.AdamOptimizer(0.1)
+        obj.opt.minimize(obj.var.read_value())
+        self.evaluate(checkpointable_utils.gather_initializers(obj))
+        saver = checkpointable_utils.CheckpointableSaver(obj)
+        save_path = saver.save(checkpoint_prefix)
+        saver.restore(save_path)
+        before_ops = graph.get_operations()
+        saver.restore(save_path)
+        self.assertEqual(before_ops, graph.get_operations())
+
+  def testMultipleGraphsNonSlotVariables(self):
+    with context.graph_mode():
+      checkpoint_directory = self.get_temp_dir()
+      checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+      optimizer = adam.AdamOptimizer(0.001)
+      # Construct a model in one graph
+      first_graph = ops.Graph()
+      first_session = session_lib.Session(graph=first_graph)
+      with first_graph.as_default(), first_session.as_default():
+        first_variable = resource_variable_ops.ResourceVariable([1.])
+        first_root_checkpointable = checkpointable_utils.Checkpoint(
+            optimizer=optimizer, variable=first_variable)
+        train_op = optimizer.minimize(first_variable.read_value)
+        self.evaluate(checkpointable_utils.gather_initializers(
+            first_root_checkpointable))
+        self.evaluate(train_op)
+        self.evaluate(first_variable.assign([1.]))
+        self.evaluate(optimizer.get_slot(
+            var=first_variable, name="m").assign([2.]))
+        beta1_power, _ = optimizer._get_beta_accumulators()
+        self.evaluate(beta1_power.assign(3.))
+
+      # Save and load in a second graph
+      second_graph = ops.Graph()
+      with second_graph.as_default(), session_lib.Session(graph=second_graph):
+        second_variable = resource_variable_ops.ResourceVariable([1.])
+        second_root_checkpointable = checkpointable_utils.Checkpoint(
+            optimizer=optimizer, variable=second_variable)
+        train_op = optimizer.minimize(second_variable.read_value)
+        second_root_checkpointable.restore(None).initialize_or_restore()
+        self.evaluate(train_op)
+        self.evaluate(second_variable.assign([4.]))
+        self.evaluate(optimizer.get_slot(
+            var=second_variable, name="m").assign([5.]))
+        beta1_power, _ = optimizer._get_beta_accumulators()
+        self.evaluate(beta1_power.assign(6.))
+        save_path = second_root_checkpointable.save(checkpoint_prefix)
+        self.evaluate(second_variable.assign([7.]))
+        self.evaluate(optimizer.get_slot(
+            var=second_variable, name="m").assign([8.]))
+        beta1_power, _ = optimizer._get_beta_accumulators()
+        self.assertAllEqual(6., self.evaluate(beta1_power))
+        status = second_root_checkpointable.restore(save_path)
+        status.assert_consumed().run_restore_ops()
+        self.assertAllEqual([4.], self.evaluate(second_variable))
+        self.assertAllEqual([5.], self.evaluate(optimizer.get_slot(
+            var=second_variable, name="m")))
+        beta1_power, _ = optimizer._get_beta_accumulators()
+        self.assertAllEqual(6., self.evaluate(beta1_power))
+
+      # Check that the first graph is unmolested
+      with first_graph.as_default(), first_session.as_default():
+        self.assertAllEqual([1.], self.evaluate(first_variable))
+        self.assertAllEqual([2.], self.evaluate(optimizer.get_slot(
+            var=first_variable, name="m")))
+        beta1_power, _ = optimizer._get_beta_accumulators()
+        self.assertAllEqual(3., self.evaluate(beta1_power))
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_initialize_if_not_restoring(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    optimizer_only_prefix = os.path.join(checkpoint_directory, "opt")
+    with test_util.device(use_gpu=True):
+      model = MyModel()
+      optimizer = adam.AdamOptimizer(0.001)
+      root = checkpointable_utils.Checkpoint(
+          model=model,  # Do not save the optimizer with the checkpoint.
+          global_step=training_util.get_or_create_global_step())
+      optimizer_checkpoint = checkpointable_utils.Checkpoint(
+          optimizer=optimizer)
+
+      checkpoint_path = checkpoint_management.latest_checkpoint(
+          checkpoint_directory)
+      status = root.restore(save_path=checkpoint_path)
+      input_value = constant_op.constant([[3.]])
+      train_fn = functools.partial(
+          optimizer.minimize,
+          functools.partial(model, input_value),
+          global_step=root.global_step)
+      if not context.executing_eagerly():
+        train_fn = functools.partial(self.evaluate, train_fn())
+      status.initialize_or_restore()
+      self.evaluate([v.initializer for v in optimizer.variables()])
+      train_fn()
+      model_save_path = root.save(file_prefix=checkpoint_prefix)
+      self.evaluate(optimizer.variables()[0].assign(42.))
+      optimizer_save_path = optimizer_checkpoint.save(optimizer_only_prefix)
+
+    # Restore into a graph with the optimizer
+    with test_util.device(use_gpu=True):
+      model = MyModel()
+      optimizer = adam.AdamOptimizer(0.001)
+      root = checkpointable_utils.Checkpoint(
+          optimizer=optimizer, model=model,
+          global_step=training_util.get_or_create_global_step())
+      status = root.restore(save_path=model_save_path)
+      input_value = constant_op.constant([[3.]])
+      train_fn = functools.partial(
+          optimizer.minimize,
+          functools.partial(model, input_value),
+          global_step=root.global_step)
+      if not context.executing_eagerly():
+        train_fn = functools.partial(self.evaluate, train_fn())
+      status.initialize_or_restore()
+      train_fn()
+      with self.assertRaises(AssertionError):
+        status.assert_existing_objects_matched()
+      with self.assertRaises(AssertionError):
+        status.assert_consumed()
+
+    # Make sure initialization doesn't clobber later restores
+    with test_util.device(use_gpu=True):
+      model = MyModel()
+      optimizer = adam.AdamOptimizer(0.001, beta1=1.0)
+      root = checkpointable_utils.Checkpoint(
+          optimizer=optimizer, model=model,
+          global_step=training_util.get_or_create_global_step())
+      opt_root = checkpointable_utils.Checkpoint(
+          optimizer=optimizer)
+      status = root.restore(save_path=model_save_path)
+      init_only_optimizer_status = opt_root.restore(save_path=None)
+      optimizer_status = opt_root.restore(save_path=optimizer_save_path)
+      input_value = constant_op.constant([[3.]])
+      train_fn = functools.partial(
+          optimizer.minimize,
+          functools.partial(model, input_value),
+          global_step=root.global_step)
+      if not context.executing_eagerly():
+        train_fn = functools.partial(self.evaluate, train_fn())
+      optimizer_status.run_restore_ops()
+      status.initialize_or_restore()
+      init_only_optimizer_status.initialize_or_restore()
+      train_fn()
+      self.assertEqual(42., self.evaluate(optimizer.variables()[0]))
+
+
+class _ManualScope(tracking.Checkpointable):
+
+  def __call__(self):
+    with variable_scope.variable_scope("ManualScope") as vs:
+      self.variable_scope = vs
+      with checkpointable_utils.capture_dependencies(template=self):
+        return self._build()
+
+  def _build(self):
+    return variable_scope.get_variable(name="in_manual_scope", shape=[])
+
+
+class TemplateTests(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_checkpointable_save_restore(self):
+
+    def _templated():
+      v = variable_scope.get_variable(
+          "v", shape=[1], initializer=init_ops.zeros_initializer(),
+          use_resource=True)
+      v2 = variable_scope.get_variable(
+          "v2", shape=[1], initializer=init_ops.zeros_initializer(),
+          use_resource=True)
+      manual = _ManualScope()
+      return v, v + 1., v2, manual, manual()
+
+    save_template = template.make_template("s1", _templated)
+    v1_save, _, v2_save, manual_scope, manual_scope_v = save_template()
+    six.assertCountEqual(
+        self,
+        [v1_save, v2_save, manual_scope, manual_scope_v, save_template],
+        checkpointable_utils.list_objects(save_template))
+    manual_dep, = manual_scope._checkpoint_dependencies
+    self.assertEqual("in_manual_scope", manual_dep.name)
+    self.assertIs(manual_scope_v, manual_dep.ref)
+    optimizer = adam.AdamOptimizer(0.0)
+    save_root = checkpointable_utils.Checkpoint(
+        my_template=save_template, optimizer=optimizer)
+    optimizer.minimize(v1_save.read_value)
+    self.evaluate([v.initializer for v in save_template.variables])
+    self.evaluate([v.initializer for v in optimizer.variables()])
+    self.evaluate(v1_save.assign([12.]))
+    self.evaluate(v2_save.assign([14.]))
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    save_path = save_root.save(checkpoint_prefix)
+
+    load_template = template.make_template("s2", _templated)
+    load_optimizer = adam.AdamOptimizer(0.0)
+    load_root = checkpointable_utils.Checkpoint(
+        my_template=load_template, optimizer=load_optimizer)
+    status = load_root.restore(save_path)
+    var, var_plus_one, var2, _, _ = load_template()
+    load_optimizer.minimize(var.read_value)
+    self.assertEqual(3, len(load_template._checkpoint_dependencies))
+    self.assertEqual("v", load_template._checkpoint_dependencies[0].name)
+    self.assertEqual("v2", load_template._checkpoint_dependencies[1].name)
+    self.assertEqual("ManualScope",
+                     load_template._checkpoint_dependencies[2].name)
+    status.assert_consumed().run_restore_ops()
+    self.assertAllEqual([12.], self.evaluate(var))
+    self.assertAllEqual([13.], self.evaluate(var_plus_one))
+    self.assertAllEqual([14.], self.evaluate(var2))
+
+
+class CheckpointCompatibilityTests(test.TestCase):
+
+  def _initialized_model(self):
+    input_value = constant_op.constant([[3.]])
+    model = MyModel()
+    optimizer = adam.AdamOptimizer(0.001)
+    optimizer_step = training_util.get_or_create_global_step()
+    root_checkpointable = checkpointable_utils.Checkpoint(
+        optimizer=optimizer, model=model, optimizer_step=optimizer_step)
+    train_op = optimizer.minimize(
+        functools.partial(model, input_value),
+        global_step=optimizer_step)
+    self.evaluate(checkpointable_utils.gather_initializers(
+        root_checkpointable))
+    self.evaluate(train_op)
+    # A regular variable, a slot variable, and a non-slot Optimizer variable
+    # with known values to check when loading.
+    self.evaluate(model._named_dense.bias.assign([1.]))
+    self.evaluate(optimizer.get_slot(
+        var=model._named_dense.bias, name="m").assign([2.]))
+    beta1_power, _ = optimizer._get_beta_accumulators()
+    self.evaluate(beta1_power.assign(3.))
+    return root_checkpointable
+
+  def _set_sentinels(self, root_checkpointable):
+    self.evaluate(root_checkpointable.model._named_dense.bias.assign([101.]))
+    self.evaluate(
+        root_checkpointable.optimizer.get_slot(
+            var=root_checkpointable.model._named_dense.bias, name="m")
+        .assign([102.]))
+    beta1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
+    self.evaluate(beta1_power.assign(103.))
+
+  def _check_sentinels(self, root_checkpointable):
+    self.assertAllEqual(
+        [1.], self.evaluate(root_checkpointable.model._named_dense.bias))
+    self.assertAllEqual([2.], self.evaluate(
+        root_checkpointable.optimizer.get_slot(
+            var=root_checkpointable.model._named_dense.bias, name="m")))
+    beta1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
+    self.assertAllEqual(3., self.evaluate(beta1_power))
+
+  def _write_name_based_checkpoint(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    with context.graph_mode():
+      save_graph = ops.Graph()
+      with save_graph.as_default(), self.session(
+          graph=save_graph) as session:
+        root = self._initialized_model()
+        name_saver = saver_lib.Saver()
+        return name_saver.save(
+            sess=session, save_path=checkpoint_prefix,
+            global_step=root.optimizer_step)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testLoadFromNameBasedSaver(self):
+    """Save a name-based checkpoint, load it using the object-based API."""
+    with test_util.device(use_gpu=True):
+      save_path = self._write_name_based_checkpoint()
+      root = self._initialized_model()
+      self._set_sentinels(root)
+      with self.assertRaises(AssertionError):
+        self._check_sentinels(root)
+      object_saver = checkpointable_utils.CheckpointableSaver(root)
+      self._set_sentinels(root)
+      status = object_saver.restore(save_path)
+      if context.executing_eagerly():
+        self._check_sentinels(root)
+      if context.executing_eagerly():
+        with self.assertRaisesRegexp(AssertionError, "OBJECT_CONFIG_JSON"):
+          status.assert_consumed()
+        with self.assertRaisesRegexp(AssertionError, "OBJECT_CONFIG_JSON"):
+          status.assert_existing_objects_matched()
+        with self.assertRaisesRegexp(AssertionError, "OBJECT_CONFIG_JSON"):
+          status.assert_nontrivial_match()
+      else:
+        # When graph building, we haven't read any keys, so we don't know
+        # whether the restore will be complete.
+        with self.assertRaisesRegexp(AssertionError, "not restored"):
+          status.assert_consumed()
+        with self.assertRaisesRegexp(AssertionError, "not restored"):
+          status.assert_existing_objects_matched()
+        with self.assertRaisesRegexp(AssertionError, "not restored"):
+          status.assert_nontrivial_match()
+      status.run_restore_ops()
+      self._check_sentinels(root)
+      self._set_sentinels(root)
+      status = object_saver.restore(save_path)
+      status.initialize_or_restore()
+      self._check_sentinels(root)
+      # Check that there is no error when keys are missing from the name-based
+      # checkpoint.
+      root.not_in_name_checkpoint = resource_variable_ops.ResourceVariable([1.])
+      status = object_saver.restore(save_path)
+      with self.assertRaises(AssertionError):
+        status.assert_existing_objects_matched()
+
+  def testSaveGraphLoadEager(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    with context.graph_mode():
+      save_graph = ops.Graph()
+      with save_graph.as_default(), self.session(
+          graph=save_graph) as session:
+        root = self._initialized_model()
+        save_path = root.save(session=session, file_prefix=checkpoint_prefix)
+    with context.eager_mode():
+      root = self._initialized_model()
+      self._set_sentinels(root)
+      root.restore(save_path).assert_consumed()
+      self._check_sentinels(root)
+
+  def testSaveEagerLoadGraph(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    with context.eager_mode():
+      root = self._initialized_model()
+      save_path = root.save(file_prefix=checkpoint_prefix)
+    with context.graph_mode():
+      save_graph = ops.Graph()
+      with save_graph.as_default(), self.session(
+          graph=save_graph):
+        root = self._initialized_model()
+        self._set_sentinels(root)
+        root.restore(save_path).assert_consumed().run_restore_ops()
+        self._check_sentinels(root)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/training/evaluation.py b/tensorflow/python/training/evaluation.py
index a10178f8cfe3af1ac45a5084b8e16abe1beee267..37d46795b16cb4b4ed5ce2b4f5cf9b17cdcafab3 100644
--- a/tensorflow/python/training/evaluation.py
+++ b/tensorflow/python/training/evaluation.py
@@ -253,7 +253,7 @@ def _evaluate_once(checkpoint_path,
       if isinstance(h, (_StopAfterNEvalsHook, _MultiStepStopAfterNEvalsHook)):
         h._set_evals_completed_tensor(eval_step_value)  # pylint: disable=protected-access
 
-  logging.info('Starting evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S',
+  logging.info('Starting evaluation at ' + time.strftime('%Y-%m-%dT%H:%M:%SZ',
                                                          time.gmtime()))
 
   # Prepare the session creator.
diff --git a/tensorflow/python/training/input_test.py b/tensorflow/python/training/input_test.py
index a3d268a0174a3139923a6f676464130b0808cfc7..5efc15d56f9530569b98a9cde975d74de1f110ef 100644
--- a/tensorflow/python/training/input_test.py
+++ b/tensorflow/python/training/input_test.py
@@ -58,7 +58,7 @@ class MatchFilenamesOnceTest(test_lib.TestCase):
       question = inp.match_filenames_once(
           os.path.join(self.get_temp_dir(), "match_filenames.?"))
       one = inp.match_filenames_once(additional[1])
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       self.assertItemsEqual(
           map(compat.as_bytes, filenames), self.evaluate(star))
@@ -84,7 +84,7 @@ class LimitEpochsTest(test_lib.TestCase):
     with self.cached_session():
       love_me = constant_op.constant("Love Me")
       love_me_two_times = inp.limit_epochs(love_me, num_epochs=2)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       self.assertEqual(b"Love Me", self.evaluate(love_me_two_times))
       self.assertEqual(b"Love Me", self.evaluate(love_me_two_times))
@@ -105,7 +105,7 @@ class InputProducerTest(test_lib.TestCase):
           input_tensor, num_epochs=num_epochs, shuffle=False)
       dequeue_many = queue.dequeue_many(len(input_tensor) * num_epochs)
       dequeue = queue.dequeue()
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       threads = queue_runner_impl.start_queue_runners()
 
@@ -132,7 +132,7 @@ class InputProducerTest(test_lib.TestCase):
           input_tensor, element_shape=[4], num_epochs=num_epochs, shuffle=False)
       dequeue_many = queue.dequeue_many(len(input_value) * num_epochs)
       dequeue = queue.dequeue()
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       threads = queue_runner_impl.start_queue_runners()
 
@@ -163,7 +163,7 @@ class StringInputProducerTest(test_lib.TestCase):
           strings, num_epochs=num_epochs, shuffle=False)
       dequeue_many = queue.dequeue_many(len(strings) * num_epochs)
       dequeue = queue.dequeue()
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       threads = queue_runner_impl.start_queue_runners()
 
@@ -186,7 +186,7 @@ class StringInputProducerTest(test_lib.TestCase):
           strings, num_epochs=num_epochs, shuffle=True, seed=271828)
       dequeue_many = queue.dequeue_many(len(strings))
       dequeue = queue.dequeue()
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       threads = queue_runner_impl.start_queue_runners()
 
@@ -234,7 +234,7 @@ class StringInputProducerTest(test_lib.TestCase):
           constant_op.constant(
               [], dtype=dtypes.string))
       dequeue = queue.dequeue()
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       threads = queue_runner_impl.start_queue_runners(coord=coord)
       with self.assertRaises(errors_impl.OutOfRangeError):
@@ -284,7 +284,7 @@ class RangeInputProducerTest(test_lib.TestCase):
           range_size, num_epochs=num_epochs, shuffle=False)
       dequeue_many = queue.dequeue_many(range_size * num_epochs)
       dequeue = queue.dequeue()
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       threads = queue_runner_impl.start_queue_runners()
 
@@ -307,7 +307,7 @@ class RangeInputProducerTest(test_lib.TestCase):
           range_size, num_epochs=num_epochs, shuffle=True, seed=314159)
       dequeue_many = queue.dequeue_many(range_size)
       dequeue = queue.dequeue()
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       threads = queue_runner_impl.start_queue_runners()
 
@@ -358,7 +358,7 @@ class SliceInputProducerTest(test_lib.TestCase):
       source_ints = [2, 3, 5, 7]
       slices = inp.slice_input_producer(
           [source_strings, source_ints], num_epochs=num_epochs, shuffle=False)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       threads = queue_runner_impl.start_queue_runners()
 
@@ -386,7 +386,7 @@ class SliceInputProducerTest(test_lib.TestCase):
           num_epochs=num_epochs,
           shuffle=True,
           seed=161803)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       threads = queue_runner_impl.start_queue_runners()
 
@@ -487,7 +487,7 @@ class BatchTest(test_lib.TestCase):
         batched = inp.batch(
             [counter, sparse_counter, "string"], batch_size=batch_size)
         batched_fetch = batched
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       threads = queue_runner_impl.start_queue_runners()
 
@@ -555,7 +555,7 @@ class BatchTest(test_lib.TestCase):
       counter = examples.count_up_to(num_batches * batch_size)
       string = array_ops.tile(["string"],
                               math_ops.to_int32(array_ops.stack([counter])))
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       batched = inp.batch(
           [counter, string], batch_size=batch_size, dynamic_pad=True)
@@ -590,7 +590,7 @@ class BatchTest(test_lib.TestCase):
           dense_shape=[1])
       pre_batched = inp.batch([counter, sparse_counter, "string"], batch_size=2)
       batched = inp.batch(pre_batched, enqueue_many=True, batch_size=batch_size)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       threads = queue_runner_impl.start_queue_runners()
 
@@ -629,7 +629,7 @@ class BatchTest(test_lib.TestCase):
           [counter, sparse_counter, "string"],
           batch_size=batch_size,
           num_threads=4)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       threads = queue_runner_impl.start_queue_runners()
 
@@ -672,7 +672,7 @@ class BatchTest(test_lib.TestCase):
           [counter, sparse_counter, "string"],
           batch_size=batch_size,
           allow_smaller_final_batch=True)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       threads = queue_runner_impl.start_queue_runners()
 
@@ -730,7 +730,7 @@ class BatchTest(test_lib.TestCase):
           batch_size=batch_size,
           num_threads=4,
           allow_smaller_final_batch=True)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       threads = queue_runner_impl.start_queue_runners()
 
@@ -872,19 +872,19 @@ class BatchTest(test_lib.TestCase):
       for thread in threads:
         thread.join()
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testSingleThreadKeepInput(self):
     self._testKeepInputHelper(1, False)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testSingleThreadKeepInputEnqueueMany(self):
     self._testKeepInputHelper(1, True)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testMultipleThreadKeepInput(self):
     self._testKeepInputHelper(5, False)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testMultipleThreadKeepInputEnqueueMany(self):
     self._testKeepInputHelper(5, True)
 
@@ -1058,7 +1058,7 @@ class BatchJoinTest(test_lib.TestCase):
                           batched_fetch[1].dense_shape.get_shape().as_list())
       self.assertAllEqual((batch_size,), batched_fetch[2].get_shape().as_list())
 
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       threads = queue_runner_impl.start_queue_runners()
 
@@ -1157,7 +1157,7 @@ class BatchJoinTest(test_lib.TestCase):
       self.assertAllEqual((batch_size,), batched[0].get_shape().as_list())
       self.assertAllEqual((batch_size, None), batched[1].get_shape().as_list())
 
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       threads = queue_runner_impl.start_queue_runners()
 
@@ -1244,7 +1244,7 @@ class BatchJoinTest(test_lib.TestCase):
       self.assertAllEqual((2,), batched[1].dense_shape.get_shape().as_list())
       self.assertAllEqual((None,), batched[2].get_shape().as_list())
 
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       threads = queue_runner_impl.start_queue_runners()
 
@@ -1339,7 +1339,7 @@ class BatchJoinTest(test_lib.TestCase):
       self.assertAllEqual((None,), batched[0].get_shape().as_list())
       self.assertAllEqual((None, None), batched[1].get_shape().as_list())
 
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       threads = queue_runner_impl.start_queue_runners()
 
@@ -1482,19 +1482,19 @@ class BatchJoinTest(test_lib.TestCase):
       for thread in threads:
         thread.join()
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testSingleThreadKeepInput(self):
     self._testKeepInputHelper(1, False)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testSingleThreadKeepInputEnqueueMany(self):
     self._testKeepInputHelper(1, True)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testMultipleThreadKeepInput(self):
     self._testKeepInputHelper(5, False)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testMultipleThreadKeepInputEnqueueMany(self):
     self._testKeepInputHelper(5, True)
 
@@ -1644,7 +1644,7 @@ class ShuffleBatchTest(test_lib.TestCase):
             min_after_dequeue=16,
             seed=141421)
         batched_fetch = batched
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       threads = queue_runner_impl.start_queue_runners()
 
@@ -1702,7 +1702,7 @@ class ShuffleBatchTest(test_lib.TestCase):
           seed=141421,
           allow_smaller_final_batch=True)
       batched_fetch = batched
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       threads = queue_runner_impl.start_queue_runners()
 
@@ -1756,7 +1756,7 @@ class ShuffleBatchTest(test_lib.TestCase):
           min_after_dequeue=16,
           seed=173205,
           num_threads=4)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       threads = queue_runner_impl.start_queue_runners()
 
@@ -1807,7 +1807,7 @@ class ShuffleBatchTest(test_lib.TestCase):
           seed=173205,
           num_threads=4,
           allow_smaller_final_batch=True)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       threads = queue_runner_impl.start_queue_runners()
 
@@ -1905,19 +1905,19 @@ class ShuffleBatchTest(test_lib.TestCase):
       for thread in threads:
         thread.join()
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testSingleThreadKeepInput(self):
     self._testKeepInputHelper(1, False)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testSingleThreadKeepInputEnqueueMany(self):
     self._testKeepInputHelper(1, True)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testMultipleThreadKeepInput(self):
     self._testKeepInputHelper(5, False)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testMultipleThreadKeepInputEnqueueMany(self):
     self._testKeepInputHelper(5, True)
 
@@ -2070,7 +2070,7 @@ class ShuffleBatchJoinTest(test_lib.TestCase):
                           batched_fetch[1].dense_shape.get_shape().as_list())
       self.assertAllEqual((batch_size,), batched_fetch[2].get_shape().as_list())
 
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       threads = queue_runner_impl.start_queue_runners()
 
@@ -2165,7 +2165,7 @@ class ShuffleBatchJoinTest(test_lib.TestCase):
       self.assertAllEqual((2,), batched[1].dense_shape.get_shape().as_list())
       self.assertAllEqual((None,), batched[2].get_shape().as_list())
 
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       threads = queue_runner_impl.start_queue_runners()
 
@@ -2309,19 +2309,19 @@ class ShuffleBatchJoinTest(test_lib.TestCase):
       for thread in threads:
         thread.join()
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testSingleThreadKeepInput(self):
     self._testKeepInputHelper(1, False)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testSingleThreadKeepInputEnqueueMany(self):
     self._testKeepInputHelper(1, True)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testMultipleThreadKeepInput(self):
     self._testKeepInputHelper(5, False)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testMultipleThreadKeepInputEnqueueMany(self):
     self._testKeepInputHelper(5, True)
 
diff --git a/tensorflow/python/training/learning_rate_decay_test.py b/tensorflow/python/training/learning_rate_decay_test.py
index 9de5bc8168f5a7e37a51f6803833e6ce98cc427f..1029d4cea8f67d0e8614983ff106ccc57ccb9064 100644
--- a/tensorflow/python/training/learning_rate_decay_test.py
+++ b/tensorflow/python/training/learning_rate_decay_test.py
@@ -101,7 +101,7 @@ class LRDecayTest(test_util.TensorFlowTestCase):
     self.assertAllClose(self.evaluate(decayed_lr), 0.001, 1e-6)
 
   @test_util.run_in_graph_and_eager_modes
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testPiecewiseConstantEdgeCases(self):
     x_int = resource_variable_ops.ResourceVariable(
         0, dtype=variables.dtypes.int32)
diff --git a/tensorflow/python/training/localhost_cluster_performance_test.py b/tensorflow/python/training/localhost_cluster_performance_test.py
index 7c097b943d05cd1a049886af6ef1d018d7b2c9ab..c4cbc8a55dc5d40b9aeae2fed400b1d29d6c7499 100644
--- a/tensorflow/python/training/localhost_cluster_performance_test.py
+++ b/tensorflow/python/training/localhost_cluster_performance_test.py
@@ -25,6 +25,7 @@ import numpy as np
 from tensorflow.python.client import session as session_lib
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
@@ -34,6 +35,7 @@ from tensorflow.python.training import device_setter
 
 class CreateLocalClusterTest(test.TestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testCreateLocalCluster(self):
     workers, _ = test.create_local_cluster(num_workers=2, num_ps=2)
     worker_sessions = [session_lib.Session(w.target) for w in workers]
diff --git a/tensorflow/python/training/mode_keys.py b/tensorflow/python/training/mode_keys.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef64554bd5783e7e3ac802708099424ff8244fd8
--- /dev/null
+++ b/tensorflow/python/training/mode_keys.py
@@ -0,0 +1,33 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Model modeKeys for TensorFlow and Estimator."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+class ModeKeys(object):
+  """Standard names for model modes.
+
+  The following standard keys are defined:
+
+  * `TRAIN`: training/fitting mode.
+  * `TEST`: testing/evaluation mode.
+  * `PREDICT`: prediction/inference mode.
+  """
+
+  TRAIN = 'train'
+  TEST = 'test'
+  PREDICT = 'predict'
diff --git a/tensorflow/python/training/mode_keys_test.py b/tensorflow/python/training/mode_keys_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4435b7d4870ac1675a3f2f4d80def111dc85ae5
--- /dev/null
+++ b/tensorflow/python/training/mode_keys_test.py
@@ -0,0 +1,29 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.train.ModeKeys."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.platform import test
+from tensorflow.python.training import mode_keys
+
+
+class ModeKeysTest(test.TestCase):
+
+  def testKeyEquality(self):
+    self.assertEqual(mode_keys.ModeKeys.PREDICT, 'predict')
+    self.assertEqual(mode_keys.ModeKeys.TRAIN, 'train')
+    self.assertEqual(mode_keys.ModeKeys.TEST, 'test')
diff --git a/tensorflow/python/training/monitored_session_test.py b/tensorflow/python/training/monitored_session_test.py
index 9dbcfa52b7c3d79a0caef01402f5071f81ed84ac..99ee9ea7e2e4d32f9a24513d9c46f9de4fa2d797 100644
--- a/tensorflow/python/training/monitored_session_test.py
+++ b/tensorflow/python/training/monitored_session_test.py
@@ -541,6 +541,7 @@ class WrappedSessionTest(test.TestCase):
       self.assertFalse(wrapped_sess1.should_stop())
       self.assertTrue(wrapped_sess1.should_stop())
 
+  @test_util.run_deprecated_v1
   def test_close_twice(self):
     with self.cached_session() as sess:
       wrapped_sess = monitored_session._WrappedSession(sess)
diff --git a/tensorflow/python/training/moving_averages.py b/tensorflow/python/training/moving_averages.py
index 8785f9a8e71eeb4db27bc8a3ab826f063de7a456..72670f0ca39f67b151abcb1813ede7ee36c6544b 100644
--- a/tensorflow/python/training/moving_averages.py
+++ b/tensorflow/python/training/moving_averages.py
@@ -17,6 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.distribute import reduce_util as ds_reduce_util
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -26,7 +27,6 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
-from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.training import slot_creator
 from tensorflow.python.util.tf_export import tf_export
 
diff --git a/tensorflow/python/training/moving_averages_test.py b/tensorflow/python/training/moving_averages_test.py
index b15f7377f071f6f7b08e4b560547ab7cecfcdd2c..0a7cff4f56207dcfadf095da5e03371730417ad2 100644
--- a/tensorflow/python/training/moving_averages_test.py
+++ b/tensorflow/python/training/moving_averages_test.py
@@ -43,7 +43,7 @@ class MovingAveragesTest(test.TestCase):
       decay = 0.25
       assign = moving_averages.assign_moving_average(
           var, val, decay, zero_debias=False)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertAllClose([10.0, 11.0], self.evaluate(var))
       assign.op.run()
       self.assertAllClose(
@@ -57,7 +57,7 @@ class MovingAveragesTest(test.TestCase):
       val = constant_op.constant([1.0, 2.0], dtypes.float32)
       decay = 0.25
       assign = moving_averages.assign_moving_average(var, val, decay)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertAllClose([0.0, 0.0], self.evaluate(var))
       assign.op.run()
       self.assertAllClose(
@@ -98,7 +98,7 @@ class MovingAveragesTest(test.TestCase):
       val = array_ops.placeholder(dtypes.float32, [])
 
       wma = moving_averages.weighted_moving_average(val, decay, weight)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       # Get the first weighted moving average.
       val_1 = 3.0
@@ -125,7 +125,7 @@ class MovingAveragesTest(test.TestCase):
       val = array_ops.placeholder(dtypes.bfloat16, [])
 
       wma = moving_averages.weighted_moving_average(val, decay, weight)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       # Get the first weighted moving average.
       val_1 = 3.0
@@ -164,7 +164,7 @@ class ExponentialMovingAverageTest(test.TestCase):
     thirties = _Repeat(30.0, dim)
     var0 = variables.Variable(tens, name="v0")
     var1 = variables.Variable(thirties, name="v1")
-    variables.global_variables_initializer().run()
+    self.evaluate(variables.global_variables_initializer())
     # Note that tensor2 is not a Variable but just a plain Tensor resulting
     # from the sum operation.
     tensor2 = var0 + var1
@@ -178,7 +178,7 @@ class ExponentialMovingAverageTest(test.TestCase):
     self.assertFalse(avg0 in variables.trainable_variables())
     self.assertFalse(avg1 in variables.trainable_variables())
     self.assertFalse(avg2 in variables.trainable_variables())
-    variables.global_variables_initializer().run()
+    self.evaluate(variables.global_variables_initializer())
 
     self.assertEqual("v0/ExponentialMovingAverage:0", avg0.name)
     self.assertEqual("v1/ExponentialMovingAverage:0", avg1.name)
@@ -219,38 +219,38 @@ class ExponentialMovingAverageTest(test.TestCase):
                         (10.0 + 30.0) * (1 - dk)) / _Scale(dk, 2), dim)
     self.assertAllClose(expected, self.evaluate(avg2))
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testAverageVariablesNoNumUpdates_Scalar(self):
     with self.cached_session():
       ema = moving_averages.ExponentialMovingAverage(0.25)
       self._CheckDecay(ema, actual_decay=0.25, dim=1)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testAverageVariablesNoNumUpdates_Scalar_Debias(self):
     with self.cached_session():
       ema = moving_averages.ExponentialMovingAverage(0.25, zero_debias=True)
       self._CheckDecay(ema, actual_decay=0.25, dim=1)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testAverageVariablesNoNumUpdates_Vector(self):
     with self.cached_session():
       ema = moving_averages.ExponentialMovingAverage(0.25)
       self._CheckDecay(ema, actual_decay=0.25, dim=5)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testAverageVariablesNoNumUpdates_Vector_Debias(self):
     with self.cached_session():
       ema = moving_averages.ExponentialMovingAverage(0.25, zero_debias=True)
       self._CheckDecay(ema, actual_decay=0.25, dim=5)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testAverageVariablesNumUpdates_Scalar(self):
     with self.cached_session():
       # With num_updates 1, the decay applied is 0.1818
       ema = moving_averages.ExponentialMovingAverage(0.25, num_updates=1)
       self._CheckDecay(ema, actual_decay=0.181818, dim=1)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testAverageVariablesNumUpdates_Scalar_Debias(self):
     with self.cached_session():
       # With num_updates 1, the decay applied is 0.1818
@@ -258,14 +258,14 @@ class ExponentialMovingAverageTest(test.TestCase):
           0.25, num_updates=1, zero_debias=True)
       self._CheckDecay(ema, actual_decay=0.181818, dim=1)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testAverageVariablesNumUpdates_Vector(self):
     with self.cached_session():
       # With num_updates 1, the decay applied is 0.1818
       ema = moving_averages.ExponentialMovingAverage(0.25, num_updates=1)
       self._CheckDecay(ema, actual_decay=0.181818, dim=5)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testAverageVariablesNumUpdates_Vector_Debias(self):
     with self.cached_session():
       # With num_updates 1, the decay applied is 0.1818
@@ -273,7 +273,7 @@ class ExponentialMovingAverageTest(test.TestCase):
           0.25, num_updates=1, zero_debias=True)
       self._CheckDecay(ema, actual_decay=0.181818, dim=5)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testAverageVariablesWithControlDeps(self):
     with self.cached_session() as sess:
       v0 = variables.Variable(0, name="v0")
@@ -299,7 +299,7 @@ class ExponentialMovingAverageTest(test.TestCase):
       self.assertEqual([17.5], self.evaluate(v1_avg))
 
   @test_util.run_in_graph_and_eager_modes
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testBasicEager(self):
     v0 = variables.Variable(1.0)
     v1 = variables.Variable(2.0)
@@ -355,11 +355,11 @@ class ExponentialMovingAverageTest(test.TestCase):
       self.assertEqual(ema.average(v1).op.name, ema.average_name(v1))
       self.assertEqual(ema.average(tensor2).op.name, ema.average_name(tensor2))
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testAverageVariablesNames(self):
     self.averageVariablesNamesHelper(zero_debias=True)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testAverageVariablesNamesNoDebias(self):
     self.averageVariablesNamesHelper(zero_debias=False)
 
@@ -405,15 +405,15 @@ class ExponentialMovingAverageTest(test.TestCase):
         self.assertEqual(
             ema.average(tensor2).op.name, ema.average_name(tensor2))
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testAverageVariablesNamesRespectScope(self):
     self.averageVariablesNamesRespectScopeHelper(zero_debias=True)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testAverageVariablesNamesRespectScopeNoDebias(self):
     self.averageVariablesNamesRespectScopeHelper(zero_debias=False)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testSubsetAverageVariablesNames(self):
     with self.cached_session():
       v0 = variables.Variable(10.0, name="v0")
@@ -442,7 +442,7 @@ class ExponentialMovingAverageTest(test.TestCase):
       self.assertEqual(ema.average(v1).op.name, ema.average_name(v1))
       self.assertEqual(ema.average(tensor2).op.name, ema.average_name(tensor2))
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testAverageVariablesDeviceAssignment(self):
     with ops.device("/job:dev_v0"):
       v0 = variables.Variable(10.0, name="v0")
diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py
index bf9a79660bc57357ee718925d5d3ddabd61c0dc1..eaa563e84aa76f6c27ed497c4e7c5db51cdb3fda 100644
--- a/tensorflow/python/training/optimizer.py
+++ b/tensorflow/python/training/optimizer.py
@@ -25,6 +25,7 @@ import abc
 import six
 
 from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import distribution_strategy_context as distribute_ctx
 from tensorflow.python.distribute import reduce_util as ds_reduce_util
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
@@ -38,7 +39,6 @@ from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
-from tensorflow.python.training import distribution_strategy_context as distribute_ctx
 from tensorflow.python.training import slot_creator
 from tensorflow.python.training.checkpointable import base as checkpointable
 from tensorflow.python.util import nest
@@ -663,8 +663,10 @@ class Optimizer(
         ds_reduce_util.ReduceOp.SUM, grads_and_vars)
     var_list = [v for _, v in grads_and_vars]
     grads_and_vars = zip(reduced_grads, var_list)
+
     # Note that this is called in a cross-replica context.
-    self._create_slots(var_list)
+    with ops.init_scope():
+      self._create_slots(var_list)
 
     def update(v, g):
       """Apply gradients to a replica variable."""
@@ -754,7 +756,7 @@ class Optimizer(
       # `_resource_apply_dense`.
       distributed_container = var._distributed_container()
       assert distributed_container is not None
-      if context.executing_eagerly():
+      if ops.executing_eagerly_outside_functions():
         key = distributed_container._unique_id
       else:
         key = (distributed_container.graph, distributed_container._shared_name)
@@ -820,7 +822,10 @@ class Optimizer(
               name=name, shape=None)
           if restored_initial_value is not None:
             initial_value = restored_initial_value
-        v = variable_scope.variable(initial_value, name=name, trainable=False)
+        v = variable_scope.variable(
+            initial_value, name=name, trainable=False,
+            use_resource=resource_variable_ops.is_resource_variable(
+                colocate_with))
       # Restore this variable by name if necessary, but don't add a
       # Checkpointable dependency. Optimizers return the current graph's
       # non-slot variables from _checkpoint_dependencies explicitly rather
diff --git a/tensorflow/python/training/quantize_training_test.py b/tensorflow/python/training/quantize_training_test.py
index 62e783f200093a4a4d0004d1239bc019c7bdf64e..2352af7e99b5bab99826fb9a628a98846e25444c 100644
--- a/tensorflow/python/training/quantize_training_test.py
+++ b/tensorflow/python/training/quantize_training_test.py
@@ -53,7 +53,7 @@ class PywrapQuantizeTrainingTest(test.TestCase):
 
   # Test that save/restoring works for EMA variables generated in the
   # quantized training rewrite.
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only('b/120545219')
   def testQuantizedSaveRestore(self):
     save_path = os.path.join(self.get_temp_dir(), 'quantized_save_restore')
 
diff --git a/tensorflow/python/training/queue_runner_test.py b/tensorflow/python/training/queue_runner_test.py
index 4113cecf55d357c6d9835a671b5cdc7bc1a6f6d4..c5085079b77c78df80fbb1ee423e9a7519d8e53a 100644
--- a/tensorflow/python/training/queue_runner_test.py
+++ b/tensorflow/python/training/queue_runner_test.py
@@ -41,7 +41,7 @@ _MockOp = collections.namedtuple("MockOp", ["name"])
 
 class QueueRunnerTest(test.TestCase):
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testBasic(self):
     with self.cached_session() as sess:
       # CountUpTo will raise OUT_OF_RANGE when it reaches the count.
@@ -49,7 +49,7 @@ class QueueRunnerTest(test.TestCase):
       var = variables.VariableV1(zero64)
       count_up_to = var.count_up_to(3)
       queue = data_flow_ops.FIFOQueue(10, dtypes.float32)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       qr = queue_runner_impl.QueueRunner(queue, [count_up_to])
       threads = qr.create_threads(sess)
       self.assertEqual(sorted(t.name for t in threads),
@@ -62,7 +62,7 @@ class QueueRunnerTest(test.TestCase):
       # The variable should be 3.
       self.assertEqual(3, self.evaluate(var))
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testTwoOps(self):
     with self.cached_session() as sess:
       # CountUpTo will raise OUT_OF_RANGE when it reaches the count.
@@ -77,7 +77,7 @@ class QueueRunnerTest(test.TestCase):
       self.assertEqual(sorted(t.name for t in threads),
                        ["QueueRunnerThread-fifo_queue-CountUpTo:0",
                         "QueueRunnerThread-fifo_queue-CountUpTo_1:0"])
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       for t in threads:
         t.start()
       for t in threads:
@@ -93,7 +93,7 @@ class QueueRunnerTest(test.TestCase):
       qr = queue_runner_impl.QueueRunner(queue, [_MockOp("i fail"),
                                                  _MockOp("so fail")])
       threads = qr.create_threads(sess)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       for t in threads:
         t.start()
       for t in threads:
@@ -132,7 +132,7 @@ class QueueRunnerTest(test.TestCase):
       with self.assertRaisesRegexp(errors_impl.OutOfRangeError, "is closed"):
         self.evaluate(dequeue1)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testRespectCoordShouldStop(self):
     with self.cached_session() as sess:
       # CountUpTo will raise OUT_OF_RANGE when it reaches the count.
@@ -140,7 +140,7 @@ class QueueRunnerTest(test.TestCase):
       var = variables.VariableV1(zero64)
       count_up_to = var.count_up_to(3)
       queue = data_flow_ops.FIFOQueue(10, dtypes.float32)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       qr = queue_runner_impl.QueueRunner(queue, [count_up_to])
       # As the coordinator to stop.  The queue runner should
       # finish immediately.
@@ -196,7 +196,7 @@ class QueueRunnerTest(test.TestCase):
         var = variables.VariableV1(zero64)
         count_up_to = var.count_up_to(3)
         queue = data_flow_ops.FIFOQueue(10, dtypes.float32)
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
         coord = coordinator.Coordinator()
         qr = queue_runner_impl.QueueRunner(queue, [count_up_to])
         # NOTE that this test does not actually start the threads.
@@ -212,7 +212,7 @@ class QueueRunnerTest(test.TestCase):
       var = variables.VariableV1(zero64)
       count_up_to = var.count_up_to(3)
       queue = data_flow_ops.FIFOQueue(10, dtypes.float32)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       coord = coordinator.Coordinator()
       qr = queue_runner_impl.QueueRunner(queue, [count_up_to])
       threads = []
@@ -221,7 +221,7 @@ class QueueRunnerTest(test.TestCase):
       new_threads = qr.create_threads(sess, coord=coord)
       self.assertEqual([], new_threads)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testThreads(self):
     with self.cached_session() as sess:
       # CountUpTo will raise OUT_OF_RANGE when it reaches the count.
@@ -229,7 +229,7 @@ class QueueRunnerTest(test.TestCase):
       var = variables.VariableV1(zero64)
       count_up_to = var.count_up_to(3)
       queue = data_flow_ops.FIFOQueue(10, dtypes.float32)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       qr = queue_runner_impl.QueueRunner(queue, [count_up_to,
                                                  _MockOp("bad_op")])
       threads = qr.create_threads(sess, start=True)
diff --git a/tensorflow/python/training/saver.py b/tensorflow/python/training/saver.py
index a29926a57df847fd6553e0813a5e2dfeebb3885e..348b8bf1ef0a89a971eb26c9cb7e5f9d01c51a4b 100644
--- a/tensorflow/python/training/saver.py
+++ b/tensorflow/python/training/saver.py
@@ -14,7 +14,11 @@
 # ==============================================================================
 
 # pylint: disable=invalid-name
-"""Save and restore variables."""
+"""Save and restore variables.
+
+Symbols in this file are deprecated. See replacements in
+tensorflow/python/training/checkpointable and tensorflow/python/training/saving.
+"""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -25,7 +29,6 @@ import time
 import uuid
 
 import numpy as np
-import six
 
 from tensorflow.core.protobuf import checkpointable_object_graph_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
@@ -42,16 +45,15 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_io_ops
 from tensorflow.python.ops import io_ops
-from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpoint_management
-from tensorflow.python.training import saveable_object
 from tensorflow.python.training import training_util
 from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.saving import saveable_object
+from tensorflow.python.training.saving import saveable_object_util
 from tensorflow.python.util import compat
 from tensorflow.python.util.tf_export import tf_export
 
@@ -67,31 +69,6 @@ get_checkpoint_mtimes = checkpoint_management.get_checkpoint_mtimes
 remove_checkpoint = checkpoint_management.remove_checkpoint
 
 
-# Op names which identify variable reads which should be saved.
-_VARIABLE_OPS = set(["Variable",
-                     "VariableV2",
-                     "AutoReloadVariable",
-                     "VarHandleOp",
-                     "ReadVariableOp"])
-
-
-def _set_cpu0(device_string):
-  """Creates a new device string based on `device_string` but using /CPU:0.
-
-  If the device is already on /CPU:0, this is a no-op.
-
-  Args:
-    device_string: A device string.
-
-  Returns:
-    A device string.
-  """
-  parsed_device = pydev.DeviceSpec.from_string(device_string)
-  parsed_device.device_type = "CPU"
-  parsed_device.device_index = 0
-  return parsed_device.to_string()
-
-
 class BaseSaverBuilder(object):
   """Base class for Savers.
 
@@ -101,64 +78,9 @@ class BaseSaverBuilder(object):
   SaveSpec = saveable_object.SaveSpec
   SaveableObject = saveable_object.SaveableObject
 
-  class VariableSaveable(SaveableObject):
-    """SaveableObject implementation that handles Variables."""
-
-    def __init__(self, var, slice_spec, name):
-      spec = BaseSaverBuilder.SaveSpec(var, slice_spec, name, dtype=var.dtype)
-      super(BaseSaverBuilder.VariableSaveable, self).__init__(var, [spec], name)
-
-    def restore(self, restored_tensors, restored_shapes):
-      restored_tensor = restored_tensors[0]
-      if restored_shapes is not None:
-        restored_tensor = array_ops.reshape(restored_tensor, restored_shapes[0])
-      return state_ops.assign(
-          self.op,
-          restored_tensor,
-          validate_shape=restored_shapes is None and
-          self.op.get_shape().is_fully_defined())
-
-  class ResourceVariableSaveable(SaveableObject):
-    """SaveableObject implementation that handles ResourceVariables."""
-
-    def __init__(self, var, slice_spec, name):
-      self._var_device = var.device
-      self._var_shape = var.shape
-      if isinstance(var, ops.Tensor):
-        self.handle_op = var.op.inputs[0]
-        tensor = var
-      elif isinstance(var, resource_variable_ops.ResourceVariable):
-
-        def _read_variable_closure(v):
-          def f():
-            with ops.device(v.device):
-              x = v.read_value()
-              # To allow variables placed on non-CPU devices to be checkpointed,
-              # we copy them to CPU on the same machine first.
-              with ops.device("/device:CPU:0"):
-                return array_ops.identity(x)
-          return f
-
-        self.handle_op = var.handle
-        tensor = _read_variable_closure(var)
-      else:
-        raise ValueError(
-            "Saveable is neither a resource variable nor a read operation."
-            " Got: %s" % repr(var))
-      spec = BaseSaverBuilder.SaveSpec(tensor, slice_spec, name,
-                                       dtype=var.dtype)
-      super(BaseSaverBuilder.ResourceVariableSaveable, self).__init__(
-          var, [spec], name)
-
-    def restore(self, restored_tensors, restored_shapes):
-      restored_tensor = restored_tensors[0]
-      if restored_shapes is not None:
-        restored_tensor = array_ops.reshape(restored_tensor, restored_shapes[0])
-      # Copy the restored tensor to the variable's device.
-      with ops.device(self._var_device):
-        restored_tensor = array_ops.identity(restored_tensor)
-        return resource_variable_ops.shape_safe_assign_variable_handle(
-            self.handle_op, self._var_shape, restored_tensor)
+  # Aliases for code which was moved but still has lots of users.
+  VariableSaveable = saveable_object_util.ReferenceVariableSaveable
+  ResourceVariableSaveable = saveable_object_util.ResourceVariableSaveable
 
   def __init__(self, write_version=saver_pb2.SaverDef.V2):
     self._write_version = write_version
@@ -224,7 +146,11 @@ class BaseSaverBuilder(object):
     del restore_sequentially
     all_tensors = []
     for saveable in saveables:
-      with ops.device(_set_cpu0(saveable.device) if saveable.device else None):
+      if saveable.device:
+        device = saveable_object_util.set_cpu0(saveable.device)
+      else:
+        device = None
+      with ops.device(device):
         all_tensors.extend(
             self.restore_op(filename_tensor, saveable, preferred_shard))
     return all_tensors
@@ -336,7 +262,7 @@ class BaseSaverBuilder(object):
     last_device = None
     for shard, (device, saveables) in enumerate(per_device):
       last_device = device
-      with ops.device(_set_cpu0(device)):
+      with ops.device(saveable_object_util.set_cpu0(device)):
         sharded_filename = self.sharded_filename(tmp_checkpoint_prefix, shard,
                                                  num_shards_tensor)
         sharded_prefixes.append(sharded_filename)
@@ -344,7 +270,7 @@ class BaseSaverBuilder(object):
 
     with ops.control_dependencies([x.op for x in sharded_saves]):
       # Co-locates the merge step with the last device.
-      with ops.device(_set_cpu0(last_device)):
+      with ops.device(saveable_object_util.set_cpu0(last_device)):
         # V2 format write path consists of a metadata merge step.  Once merged,
         # attempts to delete the temporary directory, "<user-fed prefix>_temp".
         merge_step = gen_io_ops.merge_v2_checkpoints(
@@ -459,10 +385,6 @@ class BaseSaverBuilder(object):
                 name="restore_shard"))
     return control_flow_ops.group(*sharded_restores, name="restore_all")
 
-  @staticmethod
-  def _IsVariable(v):
-    return isinstance(v, ops.Tensor) and v.op.type in _VARIABLE_OPS
-
   def _GroupByDevices(self, saveables):
     """Group Variable tensor slices per device.
 
@@ -490,220 +412,6 @@ class BaseSaverBuilder(object):
       per_device[canonical_device.pop()].append(saveable)
     return sorted(per_device.items(), key=lambda t: t[0])
 
-  @staticmethod
-  def OpListToDict(op_list, convert_variable_to_tensor=True):
-    """Create a dictionary of names to operation lists.
-
-    Args:
-      op_list: A list, tuple, or set of Variables or SaveableObjects.
-      convert_variable_to_tensor: Whether or not to convert single Variables
-        with no slice info into Tensors.
-
-    Returns:
-      A dictionary of names to the operations that must be saved under
-      that name.  Variables with save_slice_info are grouped together under the
-      same key in no particular order.
-
-    Raises:
-      TypeError: If the type of op_list or its elements is not supported.
-      ValueError: If at least two saveables share the same name.
-    """
-    if not isinstance(op_list, (list, tuple, set)):
-      raise TypeError("Variables to save should be passed in a dict or a "
-                      "list: %s" % op_list)
-    # When ResourceVariables are converted to Tensors, read ops are added to the
-    # graph. Sorting the op_list ensures that the resulting graph is always
-    # constructed in a deterministic way:
-    op_list = sorted(op_list, key=lambda x: x.name)
-    names_to_saveables = {}
-    # pylint: disable=protected-access
-    for var in op_list:
-      if isinstance(var, BaseSaverBuilder.SaveableObject):
-        names_to_saveables[var.name] = var
-      elif isinstance(var, variables.PartitionedVariable):
-        if var.name in names_to_saveables:
-          raise ValueError("At least two variables have the same name: %s" %
-                           var.name)
-        names_to_saveables[var.name] = var
-      elif isinstance(var, variables.Variable) and var._save_slice_info:
-        name = var._save_slice_info.full_name
-        if name in names_to_saveables:
-          if not isinstance(names_to_saveables[name], list):
-            raise ValueError("Mixing slices and non-slices with the same name: "
-                             "%s" % name)
-          names_to_saveables[name].append(var)
-        else:
-          names_to_saveables[name] = [var]
-      elif (isinstance(var, checkpointable.CheckpointableBase)
-            and not isinstance(var, variables.Variable)):
-        checkpointable_saveables = [
-            (factory() if callable(factory) else factory)
-            for factory in var._gather_saveables_for_checkpoint().values()]
-        names_to_saveables.update(
-            BaseSaverBuilder.OpListToDict(checkpointable_saveables))
-      else:
-        if context.executing_eagerly():
-          if not isinstance(var, resource_variable_ops.ResourceVariable):
-            raise ValueError(
-                "Can only save/restore ResourceVariables when eager execution "
-                "is enabled, type: %s." % type(var))
-          set_var = names_to_saveables.setdefault(var._shared_name, var)
-          if set_var is not var:
-            raise ValueError(
-                ("Two different ResourceVariable objects with the same "
-                 "shared_name '%s' were passed to the Saver. This likely means "
-                 "that they were created in different Graphs or isolation "
-                 "contexts, and may not be checkpointed together.") %
-                (var._shared_name,))
-        else:
-          if convert_variable_to_tensor:
-            if isinstance(var, resource_variable_ops.ResourceVariable):
-              var = var._graph_element  # pylint: disable=protected-access
-            else:
-              var = ops.internal_convert_to_tensor(var, as_ref=True)
-            if not BaseSaverBuilder._IsVariable(var):
-              raise TypeError("Variable to save is not a Variable: %s" % var)
-          if var.op.type == "ReadVariableOp":
-            name = var.op.inputs[0].op.name
-          else:
-            name = var.op.name
-          if name in names_to_saveables:
-            raise ValueError("At least two variables have the same name: %s" %
-                             name)
-          names_to_saveables[name] = var
-
-      # pylint: enable=protected-access
-    return names_to_saveables
-
-  @staticmethod
-  def SaveableObjectsForOp(op, name):
-    """Create `SaveableObject`s from an operation.
-
-    Args:
-      op: A variable, operation, or SaveableObject to coerce into a
-        SaveableObject.
-      name: A string name for the SaveableObject.
-
-    Yields:
-      `SaveableObject`s which together save/restore `op`.
-
-    Raises:
-      TypeError: If `name` is not a string.
-      ValueError: For operations with no known conversion to SaveableObject.
-    """
-    if not isinstance(name, six.string_types):
-      raise TypeError(
-          "names_to_saveables must be a dict mapping string names to "
-          "checkpointable operations. Name is not a string: %s" % name)
-    if isinstance(op, BaseSaverBuilder.SaveableObject):
-      yield op
-    elif isinstance(op, (list, tuple, variables.PartitionedVariable)):
-      if isinstance(op, variables.PartitionedVariable):
-        op = list(op)
-      # A set of slices.
-      slice_name = None
-      # pylint: disable=protected-access
-      for variable in op:
-        if not isinstance(variable, variables.Variable):
-          raise ValueError("Slices must all be Variables: %s" % variable)
-        if not variable._save_slice_info:
-          raise ValueError("Slices must all be slices: %s" % variable)
-        if slice_name is None:
-          slice_name = variable._save_slice_info.full_name
-        elif slice_name != variable._save_slice_info.full_name:
-          raise ValueError(
-              "Slices must all be from the same tensor: %s != %s" %
-              (slice_name, variable._save_slice_info.full_name))
-        if variable.op.type in ["Variable", "VariableV2",
-                                "AutoReloadVariable"]:
-          yield BaseSaverBuilder.VariableSaveable(
-              variable, variable._save_slice_info.spec, name)
-        else:
-          yield BaseSaverBuilder.ResourceVariableSaveable(
-              variable, variable._save_slice_info.spec, name)
-      # pylint: enable=protected-access
-    elif isinstance(op, checkpointable.CheckpointableBase) and not isinstance(
-        op, variables.Variable):
-      # pylint: disable=protected-access
-      for attr, factory in op._gather_saveables_for_checkpoint().items():
-        if attr == checkpointable.VARIABLE_VALUE_KEY:
-          # Keep original name for classes masquerading as variables.
-          full_name = name
-        else:
-          full_name = name + "_" + attr
-        op = (factory(full_name) if callable(factory) else factory)
-        for op in BaseSaverBuilder.SaveableObjectsForOp(op, op.name):
-          yield op
-      # pylint: enable=protected-access
-    else:
-      # A variable or tensor.
-      if context.executing_eagerly():
-        if not isinstance(op, resource_variable_ops.ResourceVariable):
-          raise ValueError("Can only save/restore ResourceVariable eager "
-                           "mode is enabled, type: %s." % type(op))
-        yield BaseSaverBuilder.ResourceVariableSaveable(op, "", name)
-      else:
-        if isinstance(op, resource_variable_ops.ResourceVariable):
-          variable = op._graph_element  # pylint: disable=protected-access
-        else:
-          variable = ops.internal_convert_to_tensor(op, as_ref=True)
-        if not BaseSaverBuilder._IsVariable(variable):
-          raise TypeError("names_to_saveables must be a dict mapping string "
-                          "names to Tensors/Variables. Not a variable: %s" %
-                          variable)
-        if variable.op.type in ["Variable", "VariableV2",
-                                "AutoReloadVariable"]:
-          yield BaseSaverBuilder.VariableSaveable(variable, "", name)
-        else:
-          yield BaseSaverBuilder.ResourceVariableSaveable(
-              variable, "", name)
-
-  def _ValidateAndSliceInputs(self, names_to_saveables):
-    """Returns the variables and names that will be used for a Saver.
-
-    Args:
-      names_to_saveables: A dict (k, v) where k is the name of an operation and
-         v is an operation to save or a BaseSaverBuilder.Saver.
-
-    Returns:
-      A list of BaseSaverBuilder.SaveableObject objects.
-
-    Raises:
-      TypeError: If any of the keys are not strings or any of the
-        values are not one of Tensor or Variable or a checkpointable operation.
-      ValueError: If the same operation is given in more than one value
-        (this also applies to slices of SlicedVariables).
-    """
-    if not isinstance(names_to_saveables, dict):
-      names_to_saveables = BaseSaverBuilder.OpListToDict(names_to_saveables)
-
-    saveables = []
-    seen_ops = set()
-    for name, op in sorted(names_to_saveables.items(),
-                           # Avoid comparing ops, sort only by name.
-                           key=lambda x: x[0]):
-      for converted_saveable_object in self.SaveableObjectsForOp(op, name):
-        self._AddSaveable(saveables, seen_ops, converted_saveable_object)
-    return saveables
-
-  def _AddSaveable(self, saveables, seen_ops, saveable):
-    """Adds the saveable to the saveables list.
-
-    Args:
-      saveables: List to append the SaveableObject to.
-      seen_ops: Set of the ops of the saveables already processed.  Used to
-        check that each saveable is only saved once.
-      saveable: The saveable.
-
-    Raises:
-      ValueError: If the saveable has already been processed.
-    """
-    if saveable.op in seen_ops:
-      raise ValueError("The same saveable will be restored with two names: %s" %
-                       saveable.name)
-    saveables.append(saveable)
-    seen_ops.add(saveable.op)
-
   def build(self,
             names_to_saveables,
             reshape=False,
@@ -775,7 +483,8 @@ class BaseSaverBuilder(object):
       raise ValueError("save and restore operations need to be built together "
                        " when eager execution is not enabled.")
 
-    saveables = self._ValidateAndSliceInputs(names_to_saveables)
+    saveables = saveable_object_util.validate_and_slice_inputs(
+        names_to_saveables)
     if max_to_keep is None:
       max_to_keep = 0
 
@@ -1077,16 +786,28 @@ class Saver(object):
     @compatibility(eager)
     When eager execution is enabled, `var_list` must specify a `list` or `dict`
     of variables to save. Otherwise, a `RuntimeError` will be raised.
+
+    Although Saver works in some cases when executing eagerly, it is
+    fragile. Please switch to `tf.train.Checkpoint` or
+    `tf.keras.Model.save_weights`, which perform a more robust object-based
+    saving. These APIs will load checkpoints written by `Saver`.
     @end_compatibility
     """
     if defer_build and var_list:
       raise ValueError(
           "If `var_list` is provided then build cannot be deferred. "
           "Either set defer_build=False or var_list=None.")
-    if context.executing_eagerly() and var_list is None:
-      raise RuntimeError(
-          "When eager execution is enabled, `var_list` must specify a list or "
-          "dict of variables to save")
+    if context.executing_eagerly():
+      logging.warning(
+          "Saver is deprecated, please switch to tf.train.Checkpoint or "
+          "tf.keras.Model.save_weights for training checkpoints. When "
+          "executing eagerly variables do not necessarily have unique names, "
+          "and so the variable.name-based lookups Saver performs are "
+          "error-prone.")
+      if var_list is None:
+        raise RuntimeError(
+            "When eager execution is enabled, `var_list` must specify a list "
+            "or dict of variables to save")
     self._var_list = var_list
     self._reshape = reshape
     self._sharded = sharded
@@ -1656,6 +1377,37 @@ def import_meta_graph(meta_graph_or_file, clear_devices=False,
   NOTE: Restarting training from saved `meta_graph` only works if the
   device assignments have not changed.
 
+  Example 2:
+  Variables, placeholders, and independent operations can also be stored, as
+  shown in the following example.
+
+  ```Python
+  # Saving contents and operations.
+  v1 = tf.placeholder(tf.float32, name="v1")
+  v2 = tf.placeholder(tf.float32, name="v2")
+  v3 = tf.mul(v1, v2)
+  vx = tf.Variable(10.0, name="vx")
+  v4 = tf.add(v3, vx, name="v4")
+  saver = tf.train.Saver([vx])
+  sess = tf.Session()
+  sess.run(tf.initialize_all_variables())
+  sess.run(vx.assign(tf.add(vx, vx)))
+  result = sess.run(v4, feed_dict={v1:12.0, v2:3.3})
+  print(result)
+  saver.save(sess, "./model_ex1")
+  ```
+
+  Later this model can be restored and contents loaded.
+
+  ```Python
+  # Restoring variables and running operations.
+  saver = tf.train.import_meta_graph("./model_ex1.meta")
+  sess = tf.Session()
+  saver.restore(sess, "./model_ex1")
+  result = sess.run("v4:0", feed_dict={"v1:0": 12.0, "v2:0": 3.3})
+  print(result)
+  ```
+
   Args:
     meta_graph_or_file: `MetaGraphDef` protocol buffer or filename (including
       the path) containing a `MetaGraphDef`.
@@ -1898,17 +1650,41 @@ def saver_from_object_based_checkpoint(
   if builder is None:
     builder = BulkSaverBuilder()
 
-  saveables = builder._ValidateAndSliceInputs(var_list)  # pylint: disable=protected-access
+  saveables = saveable_object_util.validate_and_slice_inputs(var_list)
+  current_names = set()
+  for saveable in saveables:
+    for spec in saveable.specs:
+      current_names.add(spec.name)
+  previous_names = set(names_to_keys.keys())
+  missing_names = current_names - previous_names
+  if missing_names:
+    extra_names = previous_names - current_names
+    intersecting_names = previous_names.intersection(current_names)
+    raise errors.NotFoundError(
+        None, None,
+        message=(
+            "\n\nExisting variables not in the checkpoint: %s\n\n"
+            "Variables names when this checkpoint was written which don't "
+            "exist now: %s\n\n"
+            "(%d variable name(s) did match)\n\n"
+            "Could not find some variables in the checkpoint (see names "
+            "above). Saver was attempting to load an object-based checkpoint "
+            "(saved using tf.train.Checkpoint or tf.keras.Model.save_weights) "
+            "using variable names. If the checkpoint was written with eager "
+            "execution enabled, it's possible that variable names have "
+            "changed (for example missing a '_1' suffix). It's also "
+            "possible that there are new variables which did not exist "
+            "when the checkpoint was written. You can construct a "
+            "Saver(var_list=...) with only the variables which previously "
+            "existed, and if variable names have changed you may need to "
+            "make this a dictionary with the old names as keys. If you're "
+            "using an Estimator, you'll need to return a tf.train.Saver "
+            "inside a tf.train.Scaffold from your model_fn.")
+        % (", ".join(sorted(missing_names)), ", ".join(sorted(extra_names)),
+           len(intersecting_names)))
   for saveable in saveables:
     for spec in saveable.specs:
-      if spec.name not in names_to_keys:
-        raise errors.NotFoundError(
-            None, None,
-            message=("Attempting to load an object-based checkpoint using "
-                     "variable names, but could not find %s in the "
-                     "checkpoint.") % spec.name)
       spec.name = names_to_keys[spec.name]
-
   if cached_saver is None:
     return Saver(saveables)
   return cached_saver
diff --git a/tensorflow/python/training/saver_test.py b/tensorflow/python/training/saver_test.py
index 40b7c0d5a7fab6fc2dfb9ff239621b7eddb5c695..d1b51adaa4f89aaa0394bca3f6fd82ab9823258b 100644
--- a/tensorflow/python/training/saver_test.py
+++ b/tensorflow/python/training/saver_test.py
@@ -124,8 +124,8 @@ class SaverTest(test.TestCase):
       if not context.executing_eagerly():
         self.assertEqual(
             len(variables.report_uninitialized_variables().eval()), 2)
-        self.assertEqual(0, len(v2.keys().eval()))
-        self.assertEqual(0, len(v2.values().eval()))
+        self.assertEqual(0, len(self.evaluate(v2.keys())))
+        self.assertEqual(0, len(self.evaluate(v2.values())))
       # Restore the saved values in the parameter nodes.
       save = saver_module.Saver({"v0": v0, "v1": v1, "v2": v2.saveable})
       save.restore(sess, save_path)
@@ -331,10 +331,10 @@ class SaverTest(test.TestCase):
       self.evaluate(init_all_op)
 
       # Check that the parameter nodes have been initialized.
-      self.assertEqual(10.0, v0.eval())
-      self.assertEqual(20.0, v1.eval())
-      self.assertEqual(b"k1", v2.keys().eval())
-      self.assertEqual(30.0, v2.values().eval())
+      self.assertEqual(10.0, self.evaluate(v0))
+      self.assertEqual(20.0, self.evaluate(v1))
+      self.assertEqual(b"k1", self.evaluate(v2.keys()))
+      self.assertEqual(30.0, self.evaluate(v2.values()))
 
       # Save the initialized values in the file at "save_path"
       val = save.save(sess, save_path1)
@@ -360,16 +360,16 @@ class SaverTest(test.TestCase):
       # Assert that the variables are not initialized.
       self.assertEqual(
           len(variables.report_uninitialized_variables().eval()), 2)
-      self.assertEqual(0, len(v2.keys().eval()))
-      self.assertEqual(0, len(v2.values().eval()))
+      self.assertEqual(0, len(self.evaluate(v2.keys())))
+      self.assertEqual(0, len(self.evaluate(v2.values())))
 
       # Restore the saved values in the parameter nodes.
       save.restore(sess, save_path2)
       # Check that the parameter nodes have been restored.
-      self.assertEqual(10.0, v0.eval())
-      self.assertEqual(20.0, v1.eval())
-      self.assertEqual(b"k1", v2.keys().eval())
-      self.assertEqual(30.0, v2.values().eval())
+      self.assertEqual(10.0, self.evaluate(v0))
+      self.assertEqual(20.0, self.evaluate(v1))
+      self.assertEqual(b"k1", self.evaluate(v2.keys()))
+      self.assertEqual(30.0, self.evaluate(v2.values()))
 
   @test_util.run_deprecated_v1
   def testFilenameTensor(self):
@@ -390,7 +390,7 @@ class SaverTest(test.TestCase):
             ValueError, "The passed save_path is not a valid checkpoint:"):
           save.restore(sess, "invalid path")
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testInt64(self):
     save_path = os.path.join(self.get_temp_dir(), "int64")
 
@@ -398,7 +398,7 @@ class SaverTest(test.TestCase):
       # Build a graph with 1 node, and save and restore for them.
       v = variables.VariableV1(np.int64(15), name="v")
       save = saver_module.Saver({"v": v}, restore_sequentially=True)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       # Save the initialized values in the file at "save_path"
       val = save.save(sess, save_path)
@@ -416,7 +416,7 @@ class SaverTest(test.TestCase):
       # Restore the saved values in the parameter nodes.
       save.restore(sess, save_path)
       # Check that the parameter nodes have been restored.
-      self.assertEqual(np.int64(15), v.eval())
+      self.assertEqual(np.int64(15), self.evaluate(v))
 
   def testSomeErrors(self):
     with ops_lib.Graph().as_default():
@@ -466,7 +466,7 @@ class SaverTest(test.TestCase):
       # Verify non-duplicate names work.
       saver_module.Saver({"v0": v0, "v2": v2.saveable})
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testBasicsWithListOfVariables(self):
     save_path = os.path.join(self.get_temp_dir(), "basics_with_list")
 
@@ -478,14 +478,14 @@ class SaverTest(test.TestCase):
       v2 = saver_test_utils.CheckpointedOp(name="v2")
       v2_init = v2.insert("k1", 30.0)
       save = saver_module.Saver([v0, v1, v2.saveable])
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       v2_init.run()
 
       # Check that the parameter nodes have been initialized.
-      self.assertEqual(10.0, v0.eval())
-      self.assertEqual(20.0, v1.eval())
-      self.assertEqual(b"k1", v2.keys().eval())
-      self.assertEqual(30.0, v2.values().eval())
+      self.assertEqual(10.0, self.evaluate(v0))
+      self.assertEqual(20.0, self.evaluate(v1))
+      self.assertEqual(b"k1", self.evaluate(v2.keys()))
+      self.assertEqual(30.0, self.evaluate(v2.values()))
 
       # Save the initialized values in the file at "save_path"
       val = save.save(sess, save_path)
@@ -506,16 +506,16 @@ class SaverTest(test.TestCase):
       with self.assertRaisesWithPredicateMatch(
           errors_impl.OpError, lambda e: "uninitialized value v1" in e.message):
         self.evaluate(v1)
-      self.assertEqual(0, len(v2.keys().eval()))
-      self.assertEqual(0, len(v2.values().eval()))
+      self.assertEqual(0, len(self.evaluate(v2.keys())))
+      self.assertEqual(0, len(self.evaluate(v2.values())))
 
       # Restore the saved values in the parameter nodes.
       save.restore(sess, save_path)
       # Check that the parameter nodes have been restored.
-      self.assertEqual(10.0, v0.eval())
-      self.assertEqual(20.0, v1.eval())
-      self.assertEqual(b"k1", v2.keys().eval())
-      self.assertEqual(30.0, v2.values().eval())
+      self.assertEqual(10.0, self.evaluate(v0))
+      self.assertEqual(20.0, self.evaluate(v1))
+      self.assertEqual(b"k1", self.evaluate(v2.keys()))
+      self.assertEqual(30.0, self.evaluate(v2.values()))
 
     # Build another graph with 2 nodes, initialized
     # differently, and a Restore node for them.
@@ -525,20 +525,20 @@ class SaverTest(test.TestCase):
       v2_2 = saver_test_utils.CheckpointedOp(name="v2")
       save2 = saver_module.Saver([v0_2, v1_2, v2_2.saveable])
       v2_2.insert("k1000", 3000.0).run()
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       # Check that the parameter nodes have been initialized.
-      self.assertEqual(1000.0, v0_2.eval())
-      self.assertEqual(2000.0, v1_2.eval())
-      self.assertEqual(b"k1000", v2_2.keys().eval())
-      self.assertEqual(3000.0, v2_2.values().eval())
+      self.assertEqual(1000.0, self.evaluate(v0_2))
+      self.assertEqual(2000.0, self.evaluate(v1_2))
+      self.assertEqual(b"k1000", self.evaluate(v2_2.keys()))
+      self.assertEqual(3000.0, self.evaluate(v2_2.values()))
       # Restore the values saved earlier in the parameter nodes.
       save2.restore(sess, save_path)
       # Check that the parameter nodes have been restored.
-      self.assertEqual(10.0, v0_2.eval())
-      self.assertEqual(20.0, v1_2.eval())
-      self.assertEqual(b"k1", v2_2.keys().eval())
-      self.assertEqual(30.0, v2_2.values().eval())
+      self.assertEqual(10.0, self.evaluate(v0_2))
+      self.assertEqual(20.0, self.evaluate(v1_2))
+      self.assertEqual(b"k1", self.evaluate(v2_2.keys()))
+      self.assertEqual(30.0, self.evaluate(v2_2.values()))
 
   def _SaveAndLoad(self, var_name, var_value, other_value, save_path):
     with self.session(graph=ops_lib.Graph()) as sess:
@@ -582,14 +582,14 @@ class SaverTest(test.TestCase):
       with sess.graph.device(test.gpu_device_name()):
         v0_1 = variables.VariableV1(123.45)
       save = saver_module.Saver({"v0": v0_1})
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       save.save(sess, save_path)
 
     with session.Session("", graph=ops_lib.Graph()) as sess:
       with sess.graph.device(test.gpu_device_name()):
         v0_2 = variables.VariableV1(543.21)
       save = saver_module.Saver({"v0": v0_2})
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
   def testSharedServerOnGPU(self):
     if not test.is_gpu_available():
@@ -599,14 +599,14 @@ class SaverTest(test.TestCase):
       with sess.graph.device(test.gpu_device_name()):
         v0_1 = variables.VariableV1(123.45)
       save = saver_module.Saver({"v0": v0_1}, sharded=True, allow_empty=True)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       save.save(sess, save_path)
 
     with session.Session("", graph=ops_lib.Graph()) as sess:
       with sess.graph.device(test.gpu_device_name()):
         v0_2 = variables.VariableV1(543.21)
       save = saver_module.Saver({"v0": v0_2}, sharded=True, allow_empty=True)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
   def testVariables(self):
     save_path = os.path.join(self.get_temp_dir(), "variables")
@@ -627,10 +627,10 @@ class SaverTest(test.TestCase):
       # Saver with no arg, defaults to 'all variables'.
       save = saver_module.Saver()
       save.restore(sess, save_path)
-      self.assertAllClose(1.0, one.eval())
-      self.assertAllClose([2.0, 2.0, 2.0], twos.eval())
-      self.assertEqual(b"k1", v2.keys().eval())
-      self.assertEqual(3.0, v2.values().eval())
+      self.assertAllClose(1.0, self.evaluate(one))
+      self.assertAllClose([2.0, 2.0, 2.0], self.evaluate(twos))
+      self.assertEqual(b"k1", self.evaluate(v2.keys()))
+      self.assertEqual(3.0, self.evaluate(v2.values()))
 
   def testVarListShouldBeEmptyInDeferredBuild(self):
     with ops_lib.Graph().as_default():
@@ -664,10 +664,10 @@ class SaverTest(test.TestCase):
       # Saver with no arg, defaults to 'all variables'.
       save = saver_module.Saver()
       save.restore(sess, save_path)
-      self.assertAllClose(1.0, one.eval())
-      self.assertAllClose([2.0, 2.0, 2.0], twos.eval())
+      self.assertAllClose(1.0, self.evaluate(one))
+      self.assertAllClose([2.0, 2.0, 2.0], self.evaluate(twos))
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testReshape(self):
     save_path = os.path.join(self.get_temp_dir(), "variables_reshape")
     with session.Session("", graph=ops_lib.Graph()) as sess:
@@ -691,7 +691,8 @@ class SaverTest(test.TestCase):
       var = variables.VariableV1([[0.0, 0.0], [0.0, 0.0], [0.0, 0.0]])
       save = saver_module.Saver(reshape=True)
       save.restore(sess, save_path)
-      self.assertAllClose([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], var.eval())
+      self.assertAllClose([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]],
+                          self.evaluate(var))
 
   @test_util.run_in_graph_and_eager_modes
   def testSaveWithGlobalStep(self, pad_step_number=False):
@@ -726,7 +727,6 @@ class SaverTest(test.TestCase):
   def testSaveWithGlobalStepWithPadding(self):
     self.testSaveWithGlobalStep(pad_step_number=True)
 
-  @test_util.run_deprecated_v1
   def testSaveToNonexistingPath(self):
     file_io.write_string_to_file(
         os.path.join(self.get_temp_dir(), "actually_a_file"), "")
@@ -753,8 +753,8 @@ class SaverTest(test.TestCase):
           self.evaluate(init_all_op)
 
           # Check that the parameter nodes have been initialized.
-          self.assertEqual(10.0, v0.eval())
-          self.assertEqual(20.0, v1.eval())
+          self.assertEqual(10.0, self.evaluate(v0))
+          self.assertEqual(20.0, self.evaluate(v1))
 
           # Save the graph.
           save.save(sess, save_path)
@@ -763,13 +763,12 @@ class SaverTest(test.TestCase):
           # Restore the saved values in the parameter nodes.
           save.restore(sess, save_path)
           # Check that the parameter nodes have been restored.
-          self.assertEqual(10.0, v0.eval())
-          self.assertEqual(20.0, v1.eval())
+          self.assertEqual(10.0, self.evaluate(v0))
+          self.assertEqual(20.0, self.evaluate(v1))
       except ValueError as exc:
         error_msg_template = "Parent directory of {} doesn't exist, can't save."
         self.assertEqual(error_msg_template.format(save_path), str(exc))
 
-  @test_util.run_deprecated_v1
   def testSaveToURI(self):
     # ParseURI functions don't work on Windows yet.
     # TODO(jhseu): Remove this check when it works.
@@ -789,8 +788,8 @@ class SaverTest(test.TestCase):
       self.evaluate(init_all_op)
 
       # Check that the parameter nodes have been initialized.
-      self.assertEqual(10.0, v0.eval())
-      self.assertEqual(20.0, v1.eval())
+      self.assertEqual(10.0, self.evaluate(v0))
+      self.assertEqual(20.0, self.evaluate(v1))
       save.save(sess, save_path)
 
   def testSaveRestoreAndValidateVariableDtype(self):
@@ -835,7 +834,7 @@ class SaverTest(test.TestCase):
       orig_vars = _model()
       self.evaluate(variables.global_variables_initializer())
       save = saver_module.Saver(max_to_keep=1)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       save.save(sess, save_dir)
       orig_vals = self.evaluate(orig_vars)
 
@@ -882,7 +881,7 @@ class SaveRestoreShardedTest(test.TestCase):
           },
           write_version=self._WRITE_VERSION,
           sharded=True)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       t0.insert("k1", 30.0).run()
       t1.insert("k2", 40.0).run()
       val = save.save(sess, save_path)
@@ -908,15 +907,15 @@ class SaveRestoreShardedTest(test.TestCase):
             },
             write_version=self._WRITE_VERSION,
             sharded=True)
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
         t0.insert("k11", 33.0).run()
-        self.assertEqual(111, v0.eval())
-        self.assertEqual(b"k11", t0.keys().eval())
-        self.assertEqual(33.0, t0.values().eval())
+        self.assertEqual(111, self.evaluate(v0))
+        self.assertEqual(b"k11", self.evaluate(t0.keys()))
+        self.assertEqual(33.0, self.evaluate(t0.values()))
         save.restore(sess, save_path + "-00000-of-00002")
-        self.assertEqual(10, v0.eval())
-        self.assertEqual(b"k1", t0.keys().eval())
-        self.assertEqual(30.0, t0.values().eval())
+        self.assertEqual(10, self.evaluate(v0))
+        self.assertEqual(b"k1", self.evaluate(t0.keys()))
+        self.assertEqual(30.0, self.evaluate(t0.values()))
 
       # Restore different ops from shard 1 of the saved files.
       with session.Session(
@@ -932,15 +931,15 @@ class SaveRestoreShardedTest(test.TestCase):
             },
             write_version=self._WRITE_VERSION,
             sharded=True)
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
         t1.insert("k22", 44.0).run()
-        self.assertEqual(222, v1.eval())
-        self.assertEqual(b"k22", t1.keys().eval())
-        self.assertEqual(44.0, t1.values().eval())
+        self.assertEqual(222, self.evaluate(v1))
+        self.assertEqual(b"k22", self.evaluate(t1.keys()))
+        self.assertEqual(44.0, self.evaluate(t1.values()))
         save.restore(sess, save_path + "-00001-of-00002")
-        self.assertEqual(20, v1.eval())
-        self.assertEqual(b"k2", t1.keys().eval())
-        self.assertEqual(40.0, t1.values().eval())
+        self.assertEqual(20, self.evaluate(v1))
+        self.assertEqual(b"k2", self.evaluate(t1.keys()))
+        self.assertEqual(40.0, self.evaluate(t1.values()))
 
     # Now try a restore with the sharded filename.
     with session.Session(
@@ -961,26 +960,26 @@ class SaveRestoreShardedTest(test.TestCase):
           },
           write_version=self._WRITE_VERSION,
           sharded=True)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       t0.insert("k11", 33.0).run()
       t1.insert("k22", 44.0).run()
-      self.assertEqual(111, v0.eval())
-      self.assertEqual(222, v1.eval())
-      self.assertEqual(b"k11", t0.keys().eval())
-      self.assertEqual(33.0, t0.values().eval())
-      self.assertEqual(b"k22", t1.keys().eval())
-      self.assertEqual(44.0, t1.values().eval())
+      self.assertEqual(111, self.evaluate(v0))
+      self.assertEqual(222, self.evaluate(v1))
+      self.assertEqual(b"k11", self.evaluate(t0.keys()))
+      self.assertEqual(33.0, self.evaluate(t0.values()))
+      self.assertEqual(b"k22", self.evaluate(t1.keys()))
+      self.assertEqual(44.0, self.evaluate(t1.values()))
       save_path = os.path.join(self.get_temp_dir(), "sharded_basics")
       if save._write_version is saver_pb2.SaverDef.V1:
         save.restore(sess, save_path + "-?????-of-?????")
       else:
         save.restore(sess, save_path)
-      self.assertEqual(10, v0.eval())
-      self.assertEqual(20, v1.eval())
-      self.assertEqual(b"k1", t0.keys().eval())
-      self.assertEqual(30.0, t0.values().eval())
-      self.assertEqual(b"k2", t1.keys().eval())
-      self.assertEqual(40.0, t1.values().eval())
+      self.assertEqual(10, self.evaluate(v0))
+      self.assertEqual(20, self.evaluate(v1))
+      self.assertEqual(b"k1", self.evaluate(t0.keys()))
+      self.assertEqual(30.0, self.evaluate(t0.values()))
+      self.assertEqual(b"k2", self.evaluate(t1.keys()))
+      self.assertEqual(40.0, self.evaluate(t1.values()))
 
     if save._write_version is saver_pb2.SaverDef.V1:
       self.assertEqual(
@@ -1028,7 +1027,7 @@ class SaveRestoreShardedTest(test.TestCase):
           else:
             vs = [variables.VariableV1(rnd, name=var_name)]
 
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
         if call_saver_with_dict:
           saver = saver_module.Saver({var_name: vs[0]})
         else:
@@ -1056,7 +1055,7 @@ class SaveRestoreShardedTest(test.TestCase):
                   name=var_name)
           ]
 
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
         if call_saver_with_dict:
           saver = saver_module.Saver({
               var_name: new_vs[0]
@@ -1203,7 +1202,7 @@ class MaxToKeepTest(test.TestCase):
     with self.cached_session() as sess:
       v = variables.VariableV1(10.0, name="v")
       save = saver_module.Saver({"v": v}, max_to_keep=2)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertEqual([], save.last_checkpoints)
 
       s1 = save.save(sess, os.path.join(save_dir, "s1"))
@@ -1388,7 +1387,7 @@ class MaxToKeepTest(test.TestCase):
               "v0": v0,
               "v1": v1
           }, sharded=True, max_to_keep=2)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertEqual([], save.last_checkpoints)
 
       s1 = save.save(sess, os.path.join(save_dir, "s1"))
@@ -1434,14 +1433,13 @@ class MaxToKeepTest(test.TestCase):
       self.assertTrue(
           gfile.Exists(checkpoint_management.meta_graph_filename(s3)))
 
-  @test_util.run_deprecated_v1
   def testNoMaxToKeep(self):
     save_dir = self._get_test_dir("no_max_to_keep")
     save_dir2 = self._get_test_dir("max_to_keep_0")
 
     with self.cached_session() as sess:
       v = variables.VariableV1(10.0, name="v")
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       # Test max_to_keep being None.
       save = saver_module.Saver({"v": v}, max_to_keep=None)
@@ -1463,14 +1461,13 @@ class MaxToKeepTest(test.TestCase):
       self.assertEqual([], save2.last_checkpoints)
       self.assertTrue(checkpoint_management.checkpoint_exists(s2))
 
-  @test_util.run_deprecated_v1
   def testNoMetaGraph(self):
     save_dir = self._get_test_dir("no_meta_graph")
 
     with self.cached_session() as sess:
       v = variables.VariableV1(10.0, name="v")
       save = saver_module.Saver({"v": v})
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       s1 = save.save(sess, os.path.join(save_dir, "s1"), write_meta_graph=False)
       self.assertTrue(checkpoint_management.checkpoint_exists(s1))
@@ -1487,7 +1484,6 @@ class KeepCheckpointEveryNHoursTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
   @test.mock.patch.object(saver_module, "time")
-  @test_util.run_deprecated_v1
   def testNonSharded(self, mock_time):
     save_dir = self._get_test_dir("keep_checkpoint_every_n_hours")
 
@@ -1607,7 +1603,6 @@ class SaveRestoreWithVariableNameMap(test.TestCase):
       self.assertEqual(20.0, self.evaluate(v1))
 
   @test_util.run_in_graph_and_eager_modes
-  @test_util.run_deprecated_v1
   def testNonReshapeResourceVariable(self):
     self._testNonReshape(resource_variable_ops.ResourceVariable)
 
@@ -1622,7 +1617,7 @@ class MetaGraphTest(test.TestCase):
     gfile.MakeDirs(test_dir)
     return test_dir
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testAddCollectionDef(self):
     test_dir = self._get_test_dir("good_collection")
     filename = os.path.join(test_dir, "metafile")
@@ -1714,7 +1709,7 @@ class MetaGraphTest(test.TestCase):
       saver1 = saver_module.Saver({"v1": v1}, name="saver1")
       ops_lib.add_to_collection("savers", saver0)
       ops_lib.add_to_collection("savers", saver1)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       # Saves to different checkpoints.
       saver0.save(sess, saver0_ckpt)
       saver1.save(sess, saver1_ckpt)
@@ -1760,7 +1755,8 @@ class MetaGraphTest(test.TestCase):
       new_saver0.restore(sess, saver0_ckpt)
       v0 = sess.graph.get_tensor_by_name("v0:0")
       v1 = sess.graph.get_tensor_by_name("v1:0")
-      self.assertAllEqual([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], v0.eval())
+      self.assertAllEqual([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]],
+                          self.evaluate(v0))
       self.assertEqual([3, 2], v0.get_shape())
       self.assertEqual([], v1.get_shape())
       with self.assertRaisesWithPredicateMatch(
@@ -1770,15 +1766,15 @@ class MetaGraphTest(test.TestCase):
       new_saver1 = savers[1]
       new_saver1.restore(sess, saver1_ckpt)
       v1 = sess.graph.get_tensor_by_name("v1:0")
-      self.assertEqual(11.0, v1.eval())
+      self.assertEqual(11.0, self.evaluate(v1))
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testMultiSaverCollection(self):
     test_dir = self._get_test_dir("saver_collection")
     self._testMultiSaverCollectionSave(test_dir)
     self._testMultiSaverCollectionRestore(test_dir)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testClearExtraneousSavers(self):
     test_dir = self._get_test_dir("clear_extraneous_savers")
     filename = os.path.join(test_dir, "metafile")
@@ -1794,7 +1790,7 @@ class MetaGraphTest(test.TestCase):
       saver1 = saver_module.Saver({"v1": v1}, name="saver1")
       ops_lib.add_to_collection("savers", saver0)
       ops_lib.add_to_collection("savers", saver1)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       # Saves to different checkpoints.
       saver0.save(sess, saver0_ckpt)
@@ -1866,7 +1862,7 @@ class MetaGraphTest(test.TestCase):
                                                lambda e: "does not exist"):
         saver_module.import_meta_graph(filename)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testSliceVariable(self):
     test_dir = self._get_test_dir("slice_saver")
     filename = os.path.join(test_dir, "metafile")
@@ -1878,7 +1874,7 @@ class MetaGraphTest(test.TestCase):
 
       # The names are different and will work.
       slice_saver = saver_module.Saver({"first": v1, "second": v2})
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       # Exports to meta_graph
       meta_graph_def = slice_saver.export_meta_graph(filename)
 
@@ -2093,7 +2089,6 @@ class MetaGraphTest(test.TestCase):
       return i + 1, x + r
     self._testWhileLoopAndGradientSerDes(body)
 
-  @test_util.run_deprecated_v1
   def testNestedControlFlowSerDes(self):
     # Test while loop in a cond in a while loop.
     # pylint: disable=g-long-lambda
@@ -2122,7 +2117,7 @@ class MetaGraphTest(test.TestCase):
                                       lambda: math_ops.multiply(x, -1.0))))
     # pylint: enable=g-long-lambda
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testStrippedOpListDef(self):
     with self.cached_session():
       # Creates a graph.
@@ -2364,7 +2359,7 @@ class MetaGraphTest(test.TestCase):
   def testPreserveDatasetAndFunctions(self):
     with ops_lib.Graph().as_default() as g:
       dataset = dataset_ops.Dataset.range(10).map(lambda x: x * x)
-      iterator = dataset.make_one_shot_iterator()
+      iterator = dataset_ops.make_one_shot_iterator(dataset)
       next_element = iterator.get_next()
       _ = array_ops.identity(next_element, name="output")
 
@@ -2745,7 +2740,7 @@ class ScopedGraphTest(test.TestCase):
       graph.add_to_collection(ops_lib.GraphKeys.SAVERS, saver2)
 
     with self.session(graph=graph) as sess:
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       saver1.save(sess, saver1_ckpt, write_state=False)
       saver2.save(sess, saver2_ckpt, write_state=False)
 
@@ -2762,7 +2757,7 @@ class ScopedGraphTest(test.TestCase):
 
     with self.session(graph=graph1) as sess:
       saver_list1[0].restore(sess, saver1_ckpt)
-      self.assertEqual(1.0, var_dict1["variable1:0"].eval())
+      self.assertEqual(1.0, self.evaluate(var_dict1["variable1:0"]))
 
     graph2 = ops_lib.Graph()
     var_dict2 = meta_graph.copy_scoped_meta_graph(
@@ -2777,7 +2772,7 @@ class ScopedGraphTest(test.TestCase):
 
     with self.session(graph=graph2) as sess:
       saver_list2[0].restore(sess, saver2_ckpt)
-      self.assertEqual(2.0, var_dict2["variable2:0"].eval())
+      self.assertEqual(2.0, self.evaluate(var_dict2["variable2:0"]))
 
 
 class _OwnsAVariableSimple(checkpointable_base.CheckpointableBase):
@@ -2988,7 +2983,7 @@ class CheckpointableCompatibilityTests(test.TestCase):
       # exception" block in Python 3.
       self.assertNotIn("NewCheckpointReader", cs.exception.message)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testGraphChangedForRestoreErrorRaised(self):
     checkpoint_directory = self.get_temp_dir()
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
@@ -3010,7 +3005,6 @@ class CheckpointableCompatibilityTests(test.TestCase):
             "a mismatch between the current graph and the graph"):
           a_saver.restore(sess=sess, save_path=save_path)
 
-  @test_util.run_deprecated_v1
   def testLoadFromObjectBasedGraph(self):
     checkpoint_directory = self.get_temp_dir()
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
@@ -3042,7 +3036,7 @@ class CheckpointableCompatibilityTests(test.TestCase):
       self.assertEqual(before_second_restore_ops,
                        restore_graph.get_operations())
       with self.assertRaisesRegexp(errors.NotFoundError,
-                                   "could not find a_variable"):
+                                   "Could not find some variables"):
         saver.restore(sess=sess, save_path=second_path)
 
   def testLoadFromObjectBasedEager(self):
diff --git a/tensorflow/python/training/saving/BUILD b/tensorflow/python/training/saving/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..67ccd59b88c289a11791c9098a2014c48e6c33fb
--- /dev/null
+++ b/tensorflow/python/training/saving/BUILD
@@ -0,0 +1,55 @@
+# Description:
+#   Low-level utilities for reading and writing checkpoints.
+
+package(
+    default_visibility = [
+        "//tensorflow:internal",
+    ],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
+
+py_library(
+    name = "functional_saver",
+    srcs = ["functional_saver.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":saveable_object",
+        ":saveable_object_util",
+        "//tensorflow/python/eager:def_function",
+    ],
+)
+
+cuda_py_test(
+    name = "functional_saver_test",
+    size = "medium",
+    srcs = [
+        "functional_saver_test.py",
+    ],
+    additional_deps = [
+        ":functional_saver",
+        "//tensorflow/python/eager:test",
+    ],
+)
+
+py_library(
+    name = "saveable_object",
+    srcs = ["saveable_object.py"],
+    srcs_version = "PY2AND3",
+)
+
+py_library(
+    name = "saveable_object_util",
+    srcs = ["saveable_object_util.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/training/checkpointable:base",
+        "@six_archive//:six",
+    ],
+)
diff --git a/tensorflow/python/training/saving/functional_saver.py b/tensorflow/python/training/saving/functional_saver.py
new file mode 100644
index 0000000000000000000000000000000000000000..7eed3336626ef63942a40702f9787e6b5847b97b
--- /dev/null
+++ b/tensorflow/python/training/saving/functional_saver.py
@@ -0,0 +1,101 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Saves and restore variables inside traced @tf.functions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import io_ops
+from tensorflow.python.training.saving import saveable_object
+from tensorflow.python.training.saving import saveable_object_util
+
+
+class Saver(object):
+  """A minimal utility class for saving and restoring checkpoints.
+
+  Note that this is a low-level utility which stores Tensors in the keys
+  specified by `SaveableObject`s. Higher-level utilities for object-based
+  checkpointing are built on top of it.
+  """
+
+  def __init__(self, saveable_objects):
+    """Specify a list of `SaveableObject`s to save and restore.
+
+    Args:
+      saveable_objects: A list of `SaveableObject`s.
+    """
+    saveable_objects = list(saveable_objects)
+    for saveable in saveable_objects:
+      if not isinstance(saveable, saveable_object.SaveableObject):
+        raise ValueError(
+            "Saver expected a list of SaveableObjects, got %s." % (saveable,))
+    self._saveable_objects = saveable_objects
+
+  # TODO(b/120569892): Use tf.function here
+  def save(self, file_prefix):
+    """Save the saveable objects to a checkpoint with `file_prefix`.
+
+    Args:
+      file_prefix: A string or scalar string Tensor containing the prefix to
+        save under.
+    Returns:
+      A scalar string Tensor containing `file_prefix` with control dependencies
+      on the save ops.
+    """
+    tensor_names = []
+    tensors = []
+    tensor_slices = []
+    for saveable in self._saveable_objects:
+      for spec in saveable.specs:
+        tensor_names.append(spec.name)
+        tensors.append(spec.tensor)
+        tensor_slices.append(spec.slice_spec)
+    with ops.control_dependencies(
+        [io_ops.save_v2(file_prefix, tensor_names, tensor_slices, tensors)]):
+      return array_ops.identity(file_prefix)
+
+  # TODO(b/120569892): Use tf.function here
+  def restore(self, file_prefix):
+    """Restore the saveable objects from a checkpoint with `file_prefix`.
+
+    Args:
+      file_prefix: A string or scalar string Tensor containing the prefix for
+        files to read from.
+
+    Returns:
+      An operation which restores the `Saver`'s `SaveableObject`s when run, or
+      None if executing eagerly.
+    """
+    restore_ops = []
+    for saveable in self._saveable_objects:
+      if saveable.device:
+        device = saveable_object_util.set_cpu0(saveable.device)
+      else:
+        device = None
+      with ops.device(device):
+        tensors = []
+        for spec in saveable.specs:
+          tensors.append(
+              io_ops.restore_v2(
+                  file_prefix,
+                  [spec.name],
+                  [spec.slice_spec],
+                  [spec.dtype])[0])
+        restore_ops.append(saveable.restore(tensors, restored_shapes=None))
+    return control_flow_ops.group(restore_ops)
diff --git a/tensorflow/python/training/saving/functional_saver_test.py b/tensorflow/python/training/saving/functional_saver_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..40002255aacd4b3579bab6ea44bc9e5ee98f9177
--- /dev/null
+++ b/tensorflow/python/training/saving/functional_saver_test.py
@@ -0,0 +1,50 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Tests for the functional saver."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.python.eager import test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.training.saving import functional_saver
+from tensorflow.python.training.saving import saveable_object_util
+
+
+class SaverTest(test.TestCase):
+
+  def test_resource_variable(self):
+    v1 = resource_variable_ops.ResourceVariable(2.)
+    saver = functional_saver.Saver(
+        saveable_object_util.saveable_objects_for_op(v1, "x"))
+    prefix = os.path.join(self.get_temp_dir(), "ckpt")
+    save_path = saver.save(constant_op.constant(prefix))
+    v1.assign(1.)
+    saver.restore(save_path)
+    self.assertEqual(2., self.evaluate(v1))
+
+    v2 = resource_variable_ops.ResourceVariable(3.)
+    second_saver = functional_saver.Saver(
+        saveable_object_util.saveable_objects_for_op(v2, "x"))
+    second_saver.restore(save_path)
+    self.assertEqual(2., self.evaluate(v2))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/training/saveable_object.py b/tensorflow/python/training/saving/saveable_object.py
similarity index 100%
rename from tensorflow/python/training/saveable_object.py
rename to tensorflow/python/training/saving/saveable_object.py
diff --git a/tensorflow/python/training/saving/saveable_object_util.py b/tensorflow/python/training/saving/saveable_object_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa88d2c6ebd2f29c2d2de7583a918dcbc6b28b51
--- /dev/null
+++ b/tensorflow/python/training/saving/saveable_object_util.py
@@ -0,0 +1,340 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for working with and creating SaveableObjects."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import six
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import device as pydev
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.saving import saveable_object
+
+
+# Op names which identify variable reads which should be saved.
+_VARIABLE_OPS = set(["Variable",
+                     "VariableV2",
+                     "AutoReloadVariable",
+                     "VarHandleOp",
+                     "ReadVariableOp"])
+
+
+def set_cpu0(device_string):
+  """Creates a new device string based on `device_string` but using /CPU:0.
+
+  If the device is already on /CPU:0, this is a no-op.
+
+  Args:
+    device_string: A device string.
+
+  Returns:
+    A device string.
+  """
+  parsed_device = pydev.DeviceSpec.from_string(device_string)
+  parsed_device.device_type = "CPU"
+  parsed_device.device_index = 0
+  return parsed_device.to_string()
+
+
+class ReferenceVariableSaveable(saveable_object.SaveableObject):
+  """SaveableObject implementation that handles reference variables."""
+
+  def __init__(self, var, slice_spec, name):
+    spec = saveable_object.SaveSpec(var, slice_spec, name, dtype=var.dtype)
+    super(ReferenceVariableSaveable, self).__init__(var, [spec], name)
+
+  def restore(self, restored_tensors, restored_shapes):
+    restored_tensor = restored_tensors[0]
+    if restored_shapes is not None:
+      restored_tensor = array_ops.reshape(restored_tensor, restored_shapes[0])
+    return state_ops.assign(
+        self.op,
+        restored_tensor,
+        validate_shape=restored_shapes is None and
+        self.op.get_shape().is_fully_defined())
+
+
+class ResourceVariableSaveable(saveable_object.SaveableObject):
+  """SaveableObject implementation that handles ResourceVariables."""
+
+  def __init__(self, var, slice_spec, name):
+    self._var_device = var.device
+    self._var_shape = var.shape
+    if isinstance(var, ops.Tensor):
+      self.handle_op = var.op.inputs[0]
+      tensor = var
+    elif isinstance(var, resource_variable_ops.ResourceVariable):
+
+      def _read_variable_closure(v):
+        def f():
+          with ops.device(v.device):
+            x = v.read_value()
+            # To allow variables placed on non-CPU devices to be checkpointed,
+            # we copy them to CPU on the same machine first.
+            with ops.device("/device:CPU:0"):
+              return array_ops.identity(x)
+        return f
+
+      self.handle_op = var.handle
+      tensor = _read_variable_closure(var)
+    else:
+      raise ValueError(
+          "Saveable is neither a resource variable nor a read operation."
+          " Got: %s" % repr(var))
+    spec = saveable_object.SaveSpec(tensor, slice_spec, name,
+                                    dtype=var.dtype)
+    super(ResourceVariableSaveable, self).__init__(var, [spec], name)
+
+  def restore(self, restored_tensors, restored_shapes):
+    restored_tensor = restored_tensors[0]
+    if restored_shapes is not None:
+      restored_tensor = array_ops.reshape(restored_tensor, restored_shapes[0])
+    # Copy the restored tensor to the variable's device.
+    with ops.device(self._var_device):
+      restored_tensor = array_ops.identity(restored_tensor)
+      return resource_variable_ops.shape_safe_assign_variable_handle(
+          self.handle_op, self._var_shape, restored_tensor)
+
+
+def _tensor_comes_from_variable(v):
+  return isinstance(v, ops.Tensor) and v.op.type in _VARIABLE_OPS
+
+
+def saveable_objects_for_op(op, name):
+  """Create `SaveableObject`s from an operation.
+
+  Args:
+    op: A variable, operation, or SaveableObject to coerce into a
+      SaveableObject.
+    name: A string name for the SaveableObject.
+
+  Yields:
+    `SaveableObject`s which together save/restore `op`.
+
+  Raises:
+    TypeError: If `name` is not a string.
+    ValueError: For operations with no known conversion to SaveableObject.
+  """
+  if not isinstance(name, six.string_types):
+    raise TypeError(
+        "names_to_saveables must be a dict mapping string names to "
+        "checkpointable operations. Name is not a string: %s" % name)
+  if isinstance(op, saveable_object.SaveableObject):
+    yield op
+  elif isinstance(op, (list, tuple, variables.PartitionedVariable)):
+    if isinstance(op, variables.PartitionedVariable):
+      op = list(op)
+    # A set of slices.
+    slice_name = None
+    # pylint: disable=protected-access
+    for variable in op:
+      if not isinstance(variable, variables.Variable):
+        raise ValueError("Slices must all be Variables: %s" % variable)
+      if not variable._save_slice_info:
+        raise ValueError("Slices must all be slices: %s" % variable)
+      if slice_name is None:
+        slice_name = variable._save_slice_info.full_name
+      elif slice_name != variable._save_slice_info.full_name:
+        raise ValueError(
+            "Slices must all be from the same tensor: %s != %s" %
+            (slice_name, variable._save_slice_info.full_name))
+      if variable.op.type in ["Variable", "VariableV2",
+                              "AutoReloadVariable"]:
+        yield ReferenceVariableSaveable(
+            variable, variable._save_slice_info.spec, name)
+      else:
+        yield ResourceVariableSaveable(
+            variable, variable._save_slice_info.spec, name)
+    # pylint: enable=protected-access
+  elif isinstance(op, checkpointable.CheckpointableBase) and not isinstance(
+      op, variables.Variable):
+    # pylint: disable=protected-access
+    for attr, factory in op._gather_saveables_for_checkpoint().items():
+      if attr == checkpointable.VARIABLE_VALUE_KEY:
+        # Keep original name for classes masquerading as variables.
+        full_name = name
+      else:
+        full_name = name + "_" + attr
+      op = (factory(full_name) if callable(factory) else factory)
+      for op in saveable_objects_for_op(op, op.name):
+        yield op
+    # pylint: enable=protected-access
+  else:
+    # A variable or tensor.
+    if isinstance(op, resource_variable_ops.ResourceVariable):
+      # pylint: disable=protected-access
+      if op._in_graph_mode:
+        variable = op._graph_element
+      else:
+        variable = op
+      # pylint: enable=protected-access
+      yield ResourceVariableSaveable(variable, "", name)
+    else:
+      with ops.init_scope():
+        if context.executing_eagerly():
+          raise ValueError("Can only save/restore ResourceVariables when "
+                           "executing eagerly, got type: %s." % type(op))
+
+      variable = ops.internal_convert_to_tensor(op, as_ref=True)
+      if not _tensor_comes_from_variable(variable):
+        raise TypeError("names_to_saveables must be a dict mapping string "
+                        "names to Tensors/Variables. Not a variable: %s" %
+                        variable)
+      if variable.op.type in ["Variable", "VariableV2",
+                              "AutoReloadVariable"]:
+        yield ReferenceVariableSaveable(variable, "", name)
+      else:
+        yield ResourceVariableSaveable(
+            variable, "", name)
+
+
+def op_list_to_dict(op_list, convert_variable_to_tensor=True):
+  """Create a dictionary of names to operation lists.
+
+  Args:
+    op_list: A list, tuple, or set of Variables or SaveableObjects.
+    convert_variable_to_tensor: Whether or not to convert single Variables
+      with no slice info into Tensors.
+
+  Returns:
+    A dictionary of names to the operations that must be saved under
+    that name.  Variables with save_slice_info are grouped together under the
+    same key in no particular order.
+
+  Raises:
+    TypeError: If the type of op_list or its elements is not supported.
+    ValueError: If at least two saveables share the same name.
+  """
+  if not isinstance(op_list, (list, tuple, set)):
+    raise TypeError("Variables to save should be passed in a dict or a "
+                    "list: %s" % op_list)
+  # When ResourceVariables are converted to Tensors, read ops are added to the
+  # graph. Sorting the op_list ensures that the resulting graph is always
+  # constructed in a deterministic way:
+  op_list = sorted(op_list, key=lambda x: x.name)
+  names_to_saveables = {}
+  # pylint: disable=protected-access
+  for var in op_list:
+    if isinstance(var, saveable_object.SaveableObject):
+      names_to_saveables[var.name] = var
+    elif isinstance(var, variables.PartitionedVariable):
+      if var.name in names_to_saveables:
+        raise ValueError("At least two variables have the same name: %s" %
+                         var.name)
+      names_to_saveables[var.name] = var
+    elif isinstance(var, variables.Variable) and var._save_slice_info:
+      name = var._save_slice_info.full_name
+      if name in names_to_saveables:
+        if not isinstance(names_to_saveables[name], list):
+          raise ValueError("Mixing slices and non-slices with the same name: "
+                           "%s" % name)
+        names_to_saveables[name].append(var)
+      else:
+        names_to_saveables[name] = [var]
+    elif (isinstance(var, checkpointable.CheckpointableBase)
+          and not isinstance(var, variables.Variable)):
+      checkpointable_saveables = [
+          (factory() if callable(factory) else factory)
+          for factory in var._gather_saveables_for_checkpoint().values()]
+      names_to_saveables.update(
+          op_list_to_dict(checkpointable_saveables))
+    else:
+      if context.executing_eagerly():
+        if not isinstance(var, resource_variable_ops.ResourceVariable):
+          raise ValueError(
+              "Can only save/restore ResourceVariables when eager execution "
+              "is enabled, type: %s." % type(var))
+        set_var = names_to_saveables.setdefault(var._shared_name, var)
+        if set_var is not var:
+          raise ValueError(
+              ("Two different ResourceVariable objects with the same "
+               "shared_name '%s' were passed to the Saver. This likely means "
+               "that they were created in different Graphs or isolation "
+               "contexts, and may not be checkpointed together.") %
+              (var._shared_name,))
+      else:
+        if convert_variable_to_tensor:
+          if isinstance(var, resource_variable_ops.ResourceVariable):
+            var = var._graph_element  # pylint: disable=protected-access
+          else:
+            var = ops.internal_convert_to_tensor(var, as_ref=True)
+          if not _tensor_comes_from_variable(var):
+            raise TypeError("Variable to save is not a Variable: %s" % var)
+        if var.op.type == "ReadVariableOp":
+          name = var.op.inputs[0].op.name
+        else:
+          name = var.op.name
+        if name in names_to_saveables:
+          raise ValueError("At least two variables have the same name: %s" %
+                           name)
+        names_to_saveables[name] = var
+
+    # pylint: enable=protected-access
+  return names_to_saveables
+
+
+def _add_saveable(saveables, seen_ops, saveable):
+  """Adds the saveable to the saveables list.
+
+  Args:
+    saveables: List to append the SaveableObject to.
+    seen_ops: Set of the ops of the saveables already processed.  Used to
+      check that each saveable is only saved once.
+    saveable: The saveable.
+
+  Raises:
+    ValueError: If the saveable has already been processed.
+  """
+  if saveable.op in seen_ops:
+    raise ValueError("The same saveable will be restored with two names: %s" %
+                     saveable.name)
+  saveables.append(saveable)
+  seen_ops.add(saveable.op)
+
+
+def validate_and_slice_inputs(names_to_saveables):
+  """Returns the variables and names that will be used for a Saver.
+
+  Args:
+    names_to_saveables: A dict (k, v) where k is the name of an operation and
+       v is an operation to save or a BaseSaverBuilder.Saver.
+
+  Returns:
+    A list of SaveableObjects.
+
+  Raises:
+    TypeError: If any of the keys are not strings or any of the
+      values are not one of Tensor or Variable or a checkpointable operation.
+    ValueError: If the same operation is given in more than one value
+      (this also applies to slices of SlicedVariables).
+  """
+  if not isinstance(names_to_saveables, dict):
+    names_to_saveables = op_list_to_dict(names_to_saveables)
+
+  saveables = []
+  seen_ops = set()
+  for name, op in sorted(names_to_saveables.items(),
+                         # Avoid comparing ops, sort only by name.
+                         key=lambda x: x[0]):
+    for converted_saveable_object in saveable_objects_for_op(op, name):
+      _add_saveable(saveables, seen_ops, converted_saveable_object)
+  return saveables
diff --git a/tensorflow/python/training/server_lib.py b/tensorflow/python/training/server_lib.py
index 302ca2dd44b99d2a5cfeffa163d95634513f9eaa..c5ca2ac403567c237307b12662fd6277afa794fa 100644
--- a/tensorflow/python/training/server_lib.py
+++ b/tensorflow/python/training/server_lib.py
@@ -23,6 +23,7 @@ from tensorflow.core.protobuf import tensorflow_server_pb2
 from tensorflow.python import pywrap_tensorflow as c_api
 from tensorflow.python.framework import errors
 from tensorflow.python.util import compat
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -93,7 +94,8 @@ def _make_server_def(server_or_cluster_def, job_name, task_index, protocol,
   return server_def
 
 
-@tf_export("train.Server")
+@tf_export("distribute.Server", v1=["distribute.Server", "train.Server"])
+@deprecation.deprecated_endpoints("train.Server")
 class Server(object):
   """An in-process TensorFlow server, for use in distributed training.
 
@@ -342,6 +344,9 @@ class ClusterSpec(object):
     ret = {}
     for job in self.jobs:
       task_indices = self.task_indices(job)
+      if len(task_indices) == 0:
+        ret[job] = {}
+        continue
       if max(task_indices) + 1 == len(task_indices):
         # Return a list because the task indices are dense. This
         # matches the behavior of `as_dict()` before support for
diff --git a/tensorflow/python/training/server_lib_same_variables_no_clear_test.py b/tensorflow/python/training/server_lib_same_variables_no_clear_test.py
index 1b2d588f444a1b829526deb07870f6ed26381032..ff3fab9f372aecae28adf84a3d800759e3487665 100644
--- a/tensorflow/python/training/server_lib_same_variables_no_clear_test.py
+++ b/tensorflow/python/training/server_lib_same_variables_no_clear_test.py
@@ -34,7 +34,7 @@ class SameVariablesNoClearTest(test.TestCase):
   # TODO(b/34465411): Starting multiple servers with different configurations
   # in the same test is flaky. Move this test case back into
   # "server_lib_test.py" when this is no longer the case.
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testSameVariablesNoClear(self):
     server = server_lib.Server.create_local_server()
 
diff --git a/tensorflow/python/training/server_lib_test.py b/tensorflow/python/training/server_lib_test.py
index 323e94c257c4116a6120e28b2355a42657d1bea8..db45d80bd2b890d8a8fcc5aaff55b0a3a720a167 100644
--- a/tensorflow/python/training/server_lib_test.py
+++ b/tensorflow/python/training/server_lib_test.py
@@ -29,6 +29,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import math_ops
@@ -55,6 +56,7 @@ class GrpcServerTest(test.TestCase):
       self.assertAllEqual([[4]], sess.run(e))
     # TODO(mrry): Add `server.stop()` and `server.join()` when these work.
 
+  @test_util.run_v1_only("b/120545219")
   def testMultipleSessions(self):
     server = self._cached_server
 
@@ -73,6 +75,7 @@ class GrpcServerTest(test.TestCase):
     # TODO(mrry): Add `server.stop()` and `server.join()` when these work.
 
   # Verifies various reset failures.
+  @test_util.run_v1_only("b/120545219")
   def testResetFails(self):
     # Creates variable with container name.
     with ops.container("test0"):
@@ -146,6 +149,7 @@ class GrpcServerTest(test.TestCase):
       self.assertEqual(0.5, min_val)
       self.assertEqual(0.5, max_val)
 
+  @test_util.run_v1_only("b/120545219")
   def testCloseCancelsBlockingOperation(self):
     server = self._cached_server
     sess = session.Session(server.target, config=self._useRPCConfig())
@@ -207,6 +211,7 @@ class GrpcServerTest(test.TestCase):
               "local": ["localhost"]
           }, job_name="local", task_index=0)
 
+  @test_util.run_v1_only("b/120545219")
   def testTimeoutRaisesException(self):
     server = self._cached_server
     q = data_flow_ops.FIFOQueue(1, [dtypes.float32])
@@ -241,6 +246,7 @@ class GrpcServerTest(test.TestCase):
       queue_runner_impl.start_queue_runners(sess)
       sess.run(var.assign(3.0))
 
+  @test_util.run_v1_only("b/120545219")
   def testIsolateSessionState(self):
     server = self._cached_server
 
@@ -296,6 +302,7 @@ class GrpcServerTest(test.TestCase):
     self.assertAllEqual(37, isolate_sess_0.run(v))
     self.assertAllEqual([19, 86], isolate_sess_1.run(v))
 
+  @test_util.run_v1_only("b/120545219")
   def testShapeChangingIsolateState(self):
     server = self._cached_server
     sharing_config = config_pb2.ConfigProto(isolate_session_state=False)
@@ -446,6 +453,29 @@ class ClusterSpecTest(test.TestCase):
                          tasks { key: 2 value: 'worker2:2222' } }
     """
 
+    self.assertProtoEquals(expected_proto, cluster_spec.as_cluster_def())
+    self.assertProtoEquals(
+        expected_proto,
+        server_lib.ClusterSpec(cluster_spec).as_cluster_def())
+    self.assertProtoEquals(
+        expected_proto,
+        server_lib.ClusterSpec(cluster_spec.as_cluster_def()).as_cluster_def())
+    self.assertProtoEquals(
+        expected_proto,
+        server_lib.ClusterSpec(cluster_spec.as_dict()).as_cluster_def())
+
+  def testProtoDictDefEquivalencesWithZeroWorker(self):
+    cluster_spec = server_lib.ClusterSpec({
+        "ps": ["ps0:2222", "ps1:2222"],
+        "worker": []
+    })
+
+    expected_proto = """
+    job { name: 'ps' tasks { key: 0 value: 'ps0:2222' }
+                     tasks { key: 1 value: 'ps1:2222' } }
+    job { name: 'worker' }
+    """
+
     self.assertProtoEquals(expected_proto, cluster_spec.as_cluster_def())
     self.assertProtoEquals(
         expected_proto, server_lib.ClusterSpec(cluster_spec).as_cluster_def())
diff --git a/tensorflow/python/training/session_manager.py b/tensorflow/python/training/session_manager.py
index 14658630c559dedb10bece93bad9df0c61a88a37..0f68fcfe8bb4cb81e54ba27d35bfb0b2e3888a1b 100644
--- a/tensorflow/python/training/session_manager.py
+++ b/tensorflow/python/training/session_manager.py
@@ -25,7 +25,6 @@ from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpoint_management
-from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -183,12 +182,6 @@ class SessionManager(object):
     """
     self._target = master
     sess = session.Session(self._target, graph=self._graph, config=config)
-    # TODO(jhseu): Delete once tpu.initialize_system() goes away.
-    initialize_ops = (
-        distribution_strategy_context.get_distribution_strategy().initialize()
-    )
-    if initialize_ops:
-      sess.run(initialize_ops)
 
     if checkpoint_dir and checkpoint_filename_with_path:
       raise ValueError("Can not provide both checkpoint_dir and "
diff --git a/tensorflow/python/training/session_manager_test.py b/tensorflow/python/training/session_manager_test.py
index 4294ffa8512d3348968fcb2903918ce3315e8729..c9a0c56ffc1e78f1f654b4ec224bf8480d53ad9b 100644
--- a/tensorflow/python/training/session_manager_test.py
+++ b/tensorflow/python/training/session_manager_test.py
@@ -69,7 +69,7 @@ class SessionManagerTest(test.TestCase):
           "", init_fn=lambda sess: sess.run(v.initializer))
       self.assertAllClose([125], sess.run(v))
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testPrepareSessionFails(self):
     checkpoint_dir = os.path.join(self.get_temp_dir(), "prepare_session")
     checkpoint_dir2 = os.path.join(self.get_temp_dir(), "prepare_session2")
@@ -154,7 +154,7 @@ class SessionManagerTest(test.TestCase):
               sess.graph.get_tensor_by_name("v:0")).eval(session=sess))
       self.assertEquals(1, sess.run(v))
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testRecoverSession(self):
     # Create a checkpoint.
     checkpoint_dir = os.path.join(self.get_temp_dir(), "recover_session")
@@ -187,6 +187,7 @@ class SessionManagerTest(test.TestCase):
           checkpoint_filename_with_path=checkpoint_management.latest_checkpoint(
               checkpoint_dir))
 
+  @test_util.run_v1_only("b/120545219")
   def testWaitForSessionReturnsNoneAfterTimeout(self):
     with ops.Graph().as_default():
       variables.VariableV1(1, name="v")
@@ -209,7 +210,7 @@ class SessionManagerTest(test.TestCase):
               variables.global_variables()),
           local_init_op=None)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testRecoverSessionWithReadyForLocalInitOp(self):
     # Create a checkpoint.
     checkpoint_dir = os.path.join(self.get_temp_dir(),
@@ -263,7 +264,7 @@ class SessionManagerTest(test.TestCase):
       self.assertEquals(1, sess.run(v))
       self.assertEquals(1, sess.run(w))
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testRecoverSessionWithReadyForLocalInitOpFailsToReadyLocal(self):
     # We use ready_for_local_init_op=tf.report_uninitialized_variables(),
     # which causes recover_session to not run local_init_op, and to return
@@ -320,7 +321,7 @@ class SessionManagerTest(test.TestCase):
               sess.graph.get_tensor_by_name("w:0")).eval(session=sess))
       self.assertEquals(1, sess.run(v))
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testRecoverSessionNoChkptStillRunsLocalInitOp(self):
     # This test checks for backwards compatibility.
     # In particular, we continue to ensure that recover_session will execute
@@ -349,7 +350,7 @@ class SessionManagerTest(test.TestCase):
               sess.graph.get_tensor_by_name("w:0")).eval(session=sess))
       self.assertEquals(1, sess.run(w))
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testRecoverSessionFailsStillRunsLocalInitOp(self):
     # Create a checkpoint.
     checkpoint_dir = os.path.join(
@@ -393,7 +394,7 @@ class SessionManagerTest(test.TestCase):
               sess.graph.get_tensor_by_name("w:0")).eval(session=sess))
       self.assertEquals(1, sess.run(w))
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testWaitForSessionLocalInit(self):
     server = server_lib.Server.create_local_server()
     with ops.Graph().as_default() as graph:
@@ -445,7 +446,7 @@ class SessionManagerTest(test.TestCase):
         # because of overly restrictive ready_for_local_init_op
         sm.wait_for_session("", max_wait_secs=3)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testWaitForSessionInsufficientReadyForLocalInitCheck(self):
     with ops.Graph().as_default() as graph:
       v = variables.VariableV1(1, name="v")
@@ -463,7 +464,7 @@ class SessionManagerTest(test.TestCase):
                                  "Session was not ready after waiting.*"):
       sm.wait_for_session("", max_wait_secs=3)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testPrepareSessionWithReadyForLocalInitOp(self):
     with ops.Graph().as_default():
       v = variables.VariableV1(1, name="v")
@@ -503,7 +504,7 @@ class SessionManagerTest(test.TestCase):
       self.assertEquals(1, sess.run(w))
       self.assertEquals(3, sess.run(x))
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testPrepareSessionWithPartialInitOp(self):
     with ops.Graph().as_default():
       v = variables.VariableV1(1, name="v")
@@ -570,7 +571,7 @@ class SessionManagerTest(test.TestCase):
       self.assertEquals(1, sess.run(w_res))
       self.assertEquals(3, sess.run(x_res))
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testPrepareSessionWithCyclicInitializer(self):
     # Regression test. Previously Variable._build_initializer_expr would enter
     # into an infinite recursion when the variable's initial_value involved
@@ -644,7 +645,7 @@ class SessionManagerTest(test.TestCase):
           "Init operations did not make model ready for local_init"):
         sm2.prepare_session("", init_op=None)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testPrepareSessionWithInsufficientReadyForLocalInitCheck(self):
     with ops.Graph().as_default():
       v = variables.VariableV1(1, name="v")
@@ -697,7 +698,7 @@ class ObsoleteSessionManagerTest(test.TestCase):
           "", init_fn=lambda sess: sess.run(v.initializer))
       self.assertAllClose([125], sess.run(v))
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testPrepareSessionFails(self):
     checkpoint_dir = os.path.join(self.get_temp_dir(), "prepare_session")
     checkpoint_dir2 = os.path.join(self.get_temp_dir(), "prepare_session2")
@@ -759,7 +760,7 @@ class ObsoleteSessionManagerTest(test.TestCase):
           variables.is_variable_initialized(
               sess.graph.get_tensor_by_name("v:0")).eval(session=sess))
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testRecoverSession(self):
     # Create a checkpoint.
     checkpoint_dir = os.path.join(self.get_temp_dir(), "recover_session")
@@ -798,6 +799,7 @@ class ObsoleteSessionManagerTest(test.TestCase):
               sess.graph.get_tensor_by_name("v:0")).eval(session=sess))
       self.assertEquals(1, sess.run(v))
 
+  @test_util.run_v1_only("b/120545219")
   def testWaitForSessionReturnsNoneAfterTimeout(self):
     with ops.Graph().as_default():
       variables.VariableV1(1, name="v")
diff --git a/tensorflow/python/training/session_run_hook.py b/tensorflow/python/training/session_run_hook.py
index e9a61def7430fec0190c8f7b788fd7b72492e432..886ca46ed59d7626b970261c531e7087da4b411e 100644
--- a/tensorflow/python/training/session_run_hook.py
+++ b/tensorflow/python/training/session_run_hook.py
@@ -94,7 +94,7 @@ import collections
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("train.SessionRunHook")
+@tf_export(v1=["train.SessionRunHook"])
 class SessionRunHook(object):
   """Hook to extend calls to MonitoredSession.run()."""
 
diff --git a/tensorflow/python/training/slot_creator.py b/tensorflow/python/training/slot_creator.py
index d76b22acd86956e9b7bbd768299e3db7f630a4d5..ecf5a96ed49146fe4cafce6a809925aab5bdc6fb 100644
--- a/tensorflow/python/training/slot_creator.py
+++ b/tensorflow/python/training/slot_creator.py
@@ -39,13 +39,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.eager import context
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
-from tensorflow.python.training import distribution_strategy_context
 
 
 def _create_slot_var(primary, val, scope, validate_shape, shape, dtype):
diff --git a/tensorflow/python/training/slot_creator_test.py b/tensorflow/python/training/slot_creator_test.py
index 1f26aaa434e04667ca2900f2067f21c90c65b96b..ec2eec39324eaed08406d6301b8a329d4888d688 100644
--- a/tensorflow/python/training/slot_creator_test.py
+++ b/tensorflow/python/training/slot_creator_test.py
@@ -32,13 +32,13 @@ from tensorflow.python.training import slot_creator
 
 class SlotCreatorTest(test.TestCase):
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testCreateSlotFromVariable(self):
     with self.cached_session():
       v = variables.Variable([1.0, 2.5], name="var")
       slot = slot_creator.create_slot(v, v.initialized_value(), name="slot")
 
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       self.assertEqual("var/slot", slot.op.name)
       self.assertEqual([2], slot.get_shape().as_list())
@@ -51,7 +51,7 @@ class SlotCreatorTest(test.TestCase):
       v = constant_op.constant([1.0, 2.5], name="const")
       slot = slot_creator.create_slot(v, v * 2, name="slot")
 
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       self.assertEqual("const/slot", slot.op.name)
       self.assertEqual([2], slot.get_shape().as_list())
@@ -66,14 +66,14 @@ class SlotCreatorTest(test.TestCase):
         slot = slot_creator.create_zeros_slot(
             v, name="slot", dtype=dtypes.float64)
 
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       self.assertEqual("var/slot", slot.op.name)
       self.assertEqual([2], slot.get_shape().as_list())
       self.assertEqual(dtypes.float64, slot.dtype.base_dtype)
       self.assertAllEqual([0.0, 0.0], self.evaluate(slot))
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testCreateZerosSlotFromDynamicShapedVariable(self):
     with self.cached_session():
       dyn_shape = constant_op.constant([2], dtype=dtypes.int32)
@@ -88,7 +88,7 @@ class SlotCreatorTest(test.TestCase):
         slot = slot_creator.create_zeros_slot(
             v, name="slot", dtype=dtypes.float64)
 
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       self.assertEqual("var/slot", slot.op.name)
       self.assertEqual([2], array_ops.shape(slot).eval())
@@ -102,7 +102,7 @@ class SlotCreatorTest(test.TestCase):
       with ops.control_dependencies(None):
         slot = slot_creator.create_zeros_slot(v, name="slot")
 
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       self.assertEqual("const/slot", slot.op.name)
       self.assertEqual([2], slot.get_shape().as_list())
@@ -118,14 +118,14 @@ class SlotCreatorTest(test.TestCase):
         slot = slot_creator.create_zeros_slot(
             v, name="slot", dtype=dtypes.float64)
 
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       self.assertEqual("const/slot", slot.op.name)
       self.assertEqual([2], array_ops.shape(slot).eval())
       self.assertEqual(dtypes.float64, slot.dtype.base_dtype)
       self.assertAllEqual([0.0, 0.0], self.evaluate(slot))
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testCreateSlotFromVariableRespectsScope(self):
     # See discussion on #2740.
     with self.cached_session():
diff --git a/tensorflow/python/training/supervisor_test.py b/tensorflow/python/training/supervisor_test.py
index f6505acc9ac2a1a17ffca9b12c1a6838f3820148..180ddb52876635c584a12aad26c3703f0fae9d9a 100644
--- a/tensorflow/python/training/supervisor_test.py
+++ b/tensorflow/python/training/supervisor_test.py
@@ -421,7 +421,7 @@ class SupervisorTest(test.TestCase):
       with self.assertRaisesRegexp(RuntimeError, "requires a summary writer"):
         sv.summary_computed(sess, sess.run(summ))
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testLogdirButExplicitlyNoSummaryWriter(self):
     logdir = self._test_dir("explicit_no_summary_writer")
     with ops.Graph().as_default():
@@ -507,7 +507,7 @@ class SupervisorTest(test.TestCase):
       sv = supervisor.Supervisor(logdir="", session_manager=sm)
       sv.prepare_or_wait_for_session("")
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testInitOp(self):
     logdir = self._test_dir("default_init_op")
     with ops.Graph().as_default():
@@ -517,7 +517,7 @@ class SupervisorTest(test.TestCase):
       self.assertAllClose([1.0, 2.0, 3.0], sess.run(v))
       sv.stop()
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testInitFn(self):
     logdir = self._test_dir("default_init_op")
     with ops.Graph().as_default():
@@ -531,7 +531,7 @@ class SupervisorTest(test.TestCase):
       self.assertAllClose([1.0, 2.0, 3.0], sess.run(v))
       sv.stop()
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testInitOpWithFeedDict(self):
     logdir = self._test_dir("feed_dict_init_op")
     with ops.Graph().as_default():
@@ -545,7 +545,7 @@ class SupervisorTest(test.TestCase):
       self.assertAllClose([1.0, 2.0, 3.0], sess.run(v))
       sv.stop()
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testReadyForLocalInitOp(self):
     server = server_lib.Server.create_local_server()
     logdir = self._test_dir("default_ready_for_local_init_op")
@@ -588,7 +588,7 @@ class SupervisorTest(test.TestCase):
     sv0.stop()
     sv1.stop()
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testReadyForLocalInitOpRestoreFromCheckpoint(self):
     server = server_lib.Server.create_local_server()
     logdir = self._test_dir("ready_for_local_init_op_restore")
@@ -720,7 +720,7 @@ class SupervisorTest(test.TestCase):
                                    "Variables not initialized: w"):
         sv.prepare_or_wait_for_session(server.target)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testSetupFail(self):
     logdir = self._test_dir("setup_fail")
     with ops.Graph().as_default():
@@ -731,7 +731,7 @@ class SupervisorTest(test.TestCase):
       variables.VariableV1([1.0, 2.0, 3.0], name="v")
       supervisor.Supervisor(logdir=logdir, is_chief=False)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testDefaultGlobalStep(self):
     logdir = self._test_dir("default_global_step")
     with ops.Graph().as_default():
@@ -741,7 +741,7 @@ class SupervisorTest(test.TestCase):
       self.assertEquals(287, sess.run(sv.global_step))
       sv.stop()
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testRestoreFromMetaGraph(self):
     logdir = self._test_dir("restore_from_meta_graph")
     with ops.Graph().as_default():
@@ -763,7 +763,7 @@ class SupervisorTest(test.TestCase):
   # This test is based on the fact that the standard services start
   # right away and get to run once before sv.stop() returns.
   # We still sleep a bit to make the test robust.
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testStandardServicesWithoutGlobalStep(self):
     logdir = self._test_dir("standard_services_without_global_step")
     # Create a checkpoint.
@@ -814,7 +814,7 @@ class SupervisorTest(test.TestCase):
 
   # Same as testStandardServicesNoGlobalStep but with a global step.
   # We should get a summary about the step time.
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testStandardServicesWithGlobalStep(self):
     logdir = self._test_dir("standard_services_with_global_step")
     # Create a checkpoint.
diff --git a/tensorflow/python/training/sync_replicas_optimizer.py b/tensorflow/python/training/sync_replicas_optimizer.py
index fbde8fe3c2a5ee720df4eef9659a1b9ebae9922c..cd4590db7f6550f8790ad683c9aaecf145ad12da 100644
--- a/tensorflow/python/training/sync_replicas_optimizer.py
+++ b/tensorflow/python/training/sync_replicas_optimizer.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.core.framework import types_pb2
+from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -44,6 +45,9 @@ from tensorflow.python.util.tf_export import tf_export
 class SyncReplicasOptimizer(optimizer.Optimizer):
   """Class to synchronize, aggregate gradients and pass them to the optimizer.
 
+  This class is deprecated. For synchrononous training, please use [Distribution
+  Strategies](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/distribute).
+
   In a typical asynchronous training environment, it's common to have some
   stale gradients. For example, with a N-replica asynchronous training,
   gradients will be applied to the variables N times independently. Depending
@@ -142,9 +146,9 @@ class SyncReplicasOptimizer(optimizer.Optimizer):
 
   @deprecation.deprecated(
       None,
-      "The `SyncReplicaOptimizer` is deprecated. For synchrononous training, "
-      "please use [Distribution Strategies](https://github.com/tensorflow/"
-      "tensorflow/tree/master/tensorflow/contrib/distribute).",
+      "The `SyncReplicaOptimizer` class is deprecated. For synchrononous "
+      "training, please use [Distribution Strategies](https://github.com/"
+      "tensorflow/tensorflow/tree/master/tensorflow/contrib/distribute).",
       warn_once=True)
   def __init__(self,
                opt,
@@ -256,7 +260,9 @@ class SyncReplicasOptimizer(optimizer.Optimizer):
     # local_anchor op will be placed on this worker task by default.
     local_anchor = control_flow_ops.no_op()
     # Colocating local_step variable prevents it being placed on the PS.
-    with ops.colocate_with(local_anchor):
+    distribution_strategy = (
+        distribution_strategy_context.get_distribution_strategy())
+    with distribution_strategy.colocate_vars_with(local_anchor):
       self._local_step = variable_scope.variable(
           initial_value=0,
           trainable=False,
diff --git a/tensorflow/python/training/sync_replicas_optimizer_test.py b/tensorflow/python/training/sync_replicas_optimizer_test.py
index 1ef8756ef671b652e2fb1b7616d813db7089fec2..428583d048ab30c8ccad0a5e32b47455c5c9bc3c 100644
--- a/tensorflow/python/training/sync_replicas_optimizer_test.py
+++ b/tensorflow/python/training/sync_replicas_optimizer_test.py
@@ -22,6 +22,7 @@ import time
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.framework.test_util import create_local_cluster
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
@@ -88,6 +89,7 @@ class SyncReplicasOptimizerTest(test.TestCase):
   def _run(self, train_op, sess):
     sess.run(train_op)
 
+  @test_util.run_v1_only("b/120545219")
   def test2Workers(self):
     num_workers = 2
     replicas_to_aggregate = 2
@@ -178,6 +180,7 @@ class SyncReplicasOptimizerTest(test.TestCase):
                         sessions[1].run(var_1_g_1))
 
   # 3 workers and one of them is backup.
+  @test_util.run_v1_only("b/120545219")
   def test3Workers1Backup(self):
     num_workers = 3
     replicas_to_aggregate = 2
@@ -266,6 +269,7 @@ class SyncReplicasOptimizerHookTest(test.TestCase):
                                  "apply_gradient should be called"):
       hook.begin()
 
+  @test_util.run_v1_only("b/120545219")
   def testCanCreatedBeforeMinimizeCalled(self):
     """This behavior is required to be integrated with Estimators."""
     opt = training.SyncReplicasOptimizer(
@@ -278,6 +282,7 @@ class SyncReplicasOptimizerHookTest(test.TestCase):
     opt.minimize(v, global_step=global_step)
     hook.begin()
 
+  @test_util.run_v1_only("b/120545219")
   def testFetchVariableList(self):
     opt = training.SyncReplicasOptimizer(
         opt=adam.AdamOptimizer(0.01),
diff --git a/tensorflow/python/training/tensorboard_logging_test.py b/tensorflow/python/training/tensorboard_logging_test.py
index 5af6a0aa7b430cd6dc3d2e9f54392cf9ffafa63a..5088ab07e5e387c880aadc8de7385b53df911a29 100644
--- a/tensorflow/python/training/tensorboard_logging_test.py
+++ b/tensorflow/python/training/tensorboard_logging_test.py
@@ -25,6 +25,7 @@ import tempfile
 import time
 
 from tensorflow.core.util import event_pb2
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.summary import summary_iterator
@@ -32,6 +33,7 @@ from tensorflow.python.summary.writer import writer
 from tensorflow.python.training import tensorboard_logging
 
 
+@test_util.run_v1_only("b/120545219")
 class EventLoggingTest(test.TestCase):
 
   def setUp(self):
@@ -85,6 +87,7 @@ class EventLoggingTest(test.TestCase):
                                   (event_pb2.LogMessage.ERROR, "format")])
     self.assertEqual(2, self.logged_message_count)
 
+  @test_util.run_v1_only("b/120545219")
   def testVerbosity(self):
     tensorboard_logging.set_summary_writer(self._sw)
     tensorboard_logging.set_verbosity(tensorboard_logging.ERROR)
@@ -112,6 +115,7 @@ class EventLoggingTest(test.TestCase):
     tensorboard_logging.warn("this should work")
     self.assertEqual(1, self.logged_message_count)
 
+  @test_util.run_v1_only("b/120545219")
   def testSummaryWriterFailsAfterClear(self):
     tensorboard_logging._clear_summary_writer()
     with self.assertRaises(RuntimeError):
diff --git a/tensorflow/python/training/training_ops_test.py b/tensorflow/python/training/training_ops_test.py
index 51f49ca0818c08373267c490c266839b6dfeb194..8ba6abdcf956bdebc00145a53ca34322847c180f 100644
--- a/tensorflow/python/training/training_ops_test.py
+++ b/tensorflow/python/training/training_ops_test.py
@@ -53,14 +53,14 @@ class TrainingOpsTest(TensorFlowTestCase):
     self.setUp()
     with self.session(use_gpu=use_gpu):
       var = variables.VariableV1(x)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertAllCloseAccordingToType(x, self.evaluate(var))
       apply_sgd = training_ops.apply_gradient_descent(var, alpha, delta)
       out = self.evaluate(apply_sgd)
       self.assertShapeEqual(out, apply_sgd)
       self.assertAllCloseAccordingToType(x - alpha * delta, out)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testApplyGradientDescent(self):
     for (dtype, use_gpu) in itertools.product(
         [np.float16, np.float32, np.float64], [False, True]):
@@ -74,7 +74,7 @@ class TrainingOpsTest(TensorFlowTestCase):
     with self.session(use_gpu=use_gpu):
       var = variables.VariableV1(x)
       accum = variables.VariableV1(y)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       self.assertAllCloseAccordingToType(x, self.evaluate(var))
       apply_adagrad = training_ops.apply_adagrad(var, accum, lr, grad)
@@ -99,7 +99,7 @@ class TrainingOpsTest(TensorFlowTestCase):
       var = variables.VariableV1(x)
       accum = variables.VariableV1(y)
       linear = variables.VariableV1(z)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       self.assertAllCloseAccordingToType(x, self.evaluate(var))
       apply_ftrl = training_ops.apply_ftrl(var, accum, linear, grad, lr, l1, l2,
@@ -129,7 +129,7 @@ class TrainingOpsTest(TensorFlowTestCase):
         self.assertAllClose(linear_update, self.evaluate(linear))
         self.assertAllClose(expected_out, out)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testApplyAdagrad(self):
     for (dtype, use_gpu) in itertools.product(
         [np.float16, np.float32, np.float64], [False, True]):
@@ -139,7 +139,7 @@ class TrainingOpsTest(TensorFlowTestCase):
       grad = np.arange(100).astype(dtype)
       self._testTypesForAdagrad(x, y, lr, grad, use_gpu)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testApplyFtrl(self):
     for dtype in [np.float16, np.float32, np.float64]:
       x = np.arange(100).astype(dtype)
@@ -156,7 +156,7 @@ class TrainingOpsTest(TensorFlowTestCase):
     with self.session(use_gpu=False):
       var = variables.VariableV1(x)
       accum = variables.VariableV1(y)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       self.assertAllCloseAccordingToType(x, self.evaluate(var))
       sparse_apply_adagrad = training_ops.sparse_apply_adagrad(
@@ -187,7 +187,7 @@ class TrainingOpsTest(TensorFlowTestCase):
       var = variables.VariableV1(x)
       accum = variables.VariableV1(y)
       linear = variables.VariableV1(z)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       self.assertAllCloseAccordingToType(x, self.evaluate(var))
       sparse_apply_ftrl = training_ops.sparse_apply_ftrl(
@@ -211,7 +211,7 @@ class TrainingOpsTest(TensorFlowTestCase):
         self.assertAllCloseAccordingToType(y[index] + grad[i] * grad[i],
                                            self.evaluate(accum)[index])
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testSparseApplyAdagrad(self):
     for (dtype, index_type) in itertools.product(
         [np.float16, np.float32, np.float64], [np.int32, np.int64]):
@@ -225,7 +225,7 @@ class TrainingOpsTest(TensorFlowTestCase):
       indices = np.array([0, 2]).astype(index_type)
       self._testTypesForSparseAdagrad(x, y, lr, grad, indices)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testSparseApplyAdagradDim1(self):
     for (dtype, index_type) in itertools.product(
         [np.float16, np.float32, np.float64], [np.int32, np.int64]):
@@ -239,7 +239,7 @@ class TrainingOpsTest(TensorFlowTestCase):
       indices = np.array([0, 2]).astype(index_type)
       self._testTypesForSparseAdagrad(x, y, lr, grad, indices)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testSparseApplyFtrlDim1(self):
     for (dtype, index_type) in itertools.product(
         [np.float16, np.float32, np.float64], [np.int32, np.int64]):
@@ -255,7 +255,7 @@ class TrainingOpsTest(TensorFlowTestCase):
       indices = np.array([0, 2]).astype(index_type)
       self._testTypesForSparseFtrl(x, y, z, lr, grad, indices)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testApplyAdam(self):
     for dtype, use_gpu in itertools.product(
         [np.float16, np.float32, np.float64], [False, True]):
@@ -285,7 +285,7 @@ class TrainingOpsTest(TensorFlowTestCase):
       beta2_power_t = variables.VariableV1(beta2_power)
       lr_t = constant_op.constant(lr, self._toType(var.dtype), [])
       epsilon_t = constant_op.constant(epsilon, self._toType(var.dtype), [])
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       self.assertAllCloseAccordingToType(var, self.evaluate(var_t))
       new_var, _, _ = self._adamUpdateNumpy(var, grad, t, m, v, lr, beta1,
diff --git a/tensorflow/python/training/training_util_test.py b/tensorflow/python/training/training_util_test.py
index 3317008fce0b0cf882f00598d6f6a66042785602..3f9858a33bafc6ae0750695ec55e97ad5800119b 100644
--- a/tensorflow/python/training/training_util_test.py
+++ b/tensorflow/python/training/training_util_test.py
@@ -27,6 +27,7 @@ from tensorflow.python.training import monitored_session
 from tensorflow.python.training import training_util
 
 
+@test_util.run_v1_only('b/120545219')
 class GlobalStepTest(test.TestCase):
 
   def _assert_global_step(self, global_step, expected_dtype=dtypes.int64):
@@ -47,7 +48,6 @@ class GlobalStepTest(test.TestCase):
     self.assertRaisesRegexp(TypeError, 'does not have integer type',
                             training_util.get_global_step, g)
 
-  @test_util.run_deprecated_v1
   def test_invalid_shape(self):
     with ops.Graph().as_default() as g:
       self.assertIsNone(training_util.get_global_step())
@@ -72,7 +72,6 @@ class GlobalStepTest(test.TestCase):
                               training_util.create_global_step, g)
       self._assert_global_step(training_util.create_global_step(ops.Graph()))
 
-  @test_util.run_deprecated_v1
   def test_get_global_step(self):
     with ops.Graph().as_default() as g:
       self.assertIsNone(training_util.get_global_step())
@@ -93,6 +92,7 @@ class GlobalStepTest(test.TestCase):
       self._assert_global_step(training_util.get_or_create_global_step(g))
 
 
+@test_util.run_v1_only('b/120545219')
 class GlobalStepReadTest(test.TestCase):
 
   def test_global_step_read_is_none_if_there_is_no_global_step(self):
diff --git a/tensorflow/python/training/warm_starting_util.py b/tensorflow/python/training/warm_starting_util.py
index 19dc04e8fb2a391784738c2f3d45678f69295ab3..1382b8ce72e93b19a16e60ac597a2413941b638e 100644
--- a/tensorflow/python/training/warm_starting_util.py
+++ b/tensorflow/python/training/warm_starting_util.py
@@ -28,7 +28,7 @@ from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpoint_ops
 from tensorflow.python.training import checkpoint_utils
-from tensorflow.python.training import saver
+from tensorflow.python.training.saving import saveable_object_util
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -139,7 +139,7 @@ def _infer_var_name(var):
   Returns:
     Name of the `var`
   """
-  name_to_var_dict = saver.BaseSaverBuilder.OpListToDict(var)
+  name_to_var_dict = saveable_object_util.op_list_to_dict(var)
   if len(name_to_var_dict) > 1:
     raise TypeError("`var` = %s passed as arg violates the constraints.  "
                     "name_to_var_dict = %s" % (var, name_to_var_dict))
@@ -360,7 +360,7 @@ def _get_grouped_variables(vars_to_warm_start):
   return grouped_variables
 
 
-@tf_export("train.warm_start")
+@tf_export(v1=["train.warm_start"])
 def warm_start(ckpt_to_initialize_from,
                vars_to_warm_start=".*",
                var_name_to_vocab_info=None,
diff --git a/tensorflow/python/util/dispatch.py b/tensorflow/python/util/dispatch.py
new file mode 100644
index 0000000000000000000000000000000000000000..e94e3345348b119bc64dd487c3c2a14603a2ce09
--- /dev/null
+++ b/tensorflow/python/util/dispatch.py
@@ -0,0 +1,191 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Type-based dispatch for TensorFlow ops.
+
+"Operation dispatchers" can be used to override the behavior for TensorFlow ops
+when they are called with otherwise unsupported argument types.  In particular,
+when an operation is called with arguments that would cause it to raise a
+TypeError, it falls back on its registered operation dispatchers.  If any
+registered dispatchers can handle the arguments, then its result is returned.
+Otherwise, the original TypeError is raised.
+
+By default, dispatch support is added to the generated op wrappers for any
+visible ops by default.  Ops that are implemented in Python can opt in to
+dispatch support using the `add_dispatch_support` decorator.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import itertools
+
+from tensorflow.python.util import tf_decorator
+from tensorflow.python.util import tf_inspect
+
+# Private function attribute used to store a list of dispatchers.
+DISPATCH_ATTR = "_tf_dispatchers"
+
+
+class OpDispatcher(object):
+  """Abstract base class for TensorFlow operator dispatchers.
+
+  Each operation dispatcher acts as an override handler for a single
+  TensorFlow operation, and its results are used when the handler indicates
+  that it can handle the operation's arguments (by returning any value other
+  than `OpDispatcher.NOT_SUPPORTED`).
+  """
+
+  # Sentinel value that can be returned to indicate that an operation
+  # dispatcher does not support a given set of arguments.
+  NOT_SUPPORTED = object()
+
+  def handle(self, args, kwargs):  # pylint: disable=unused-argument
+    """Handle this dispatcher's operation with the specified arguments.
+
+    If this operation dispatcher can handle the given arguments, then
+    return an appropriate value (or raise an appropriate exception).
+
+    Args:
+      args: The arguments to the operation.
+      kwargs: They keyword arguments to the operation.
+
+    Returns:
+      The result of the operation, or `OpDispatcher.NOT_SUPPORTED` if this
+      dispatcher can not handle the given arguments.
+    """
+    return self.NOT_SUPPORTED
+
+  def register(self, op):
+    """Register this dispatcher as a handler for `op`.
+
+    Args:
+      op: Python function: the TensorFlow operation that should be handled. Must
+        have a dispatch list (which is added automatically for generated ops,
+        and can be added to Python ops using the `add_dispatch_support`
+        decorator).
+    """
+    if not hasattr(op, DISPATCH_ATTR):
+      raise AssertionError("Dispatching not enabled for %s" % op)
+    getattr(op, DISPATCH_ATTR).append(self)
+
+
+def dispatch(op, *args, **kwargs):
+  """Returns the result from the first successful dispatcher for a given op.
+
+  Calls the `handle` method of each `OpDispatcher` that has been registered
+  to handle `op`, and returns the value from the first successful handler.
+
+  Args:
+    op: Python function: the operation to dispatch for.
+    *args: The arguments to the operation.
+    **kwargs: They keyword arguments to the operation.
+
+  Returns:
+    The result of the operation, or `NOT_SUPPORTED` if no registered
+    dispatcher can handle the given arguments.
+  """
+  for dispatcher in getattr(op, DISPATCH_ATTR):
+    result = dispatcher.handle(args, kwargs)
+    if result is not OpDispatcher.NOT_SUPPORTED:
+      return result
+  return OpDispatcher.NOT_SUPPORTED
+
+
+class _TypeBasedDispatcher(OpDispatcher):
+  """Dispatcher that handles op if any arguments have a specified type.
+
+  Checks the types of the arguments and keyword arguments (including elements
+  of lists or tuples), and if any argument values have the indicated type(s),
+  then delegates to an override function.
+  """
+
+  def __init__(self, override_func, types):
+    self._types = types
+    self._override_func = override_func
+
+  def _handles(self, args, kwargs):
+    for arg in itertools.chain(args, kwargs.values()):
+      if (isinstance(arg, self._types) or
+          (isinstance(arg, (list, tuple)) and
+           any(isinstance(elt, self._types) for elt in arg))):
+        return True
+    return False
+
+  def handle(self, args, kwargs):
+    if self._handles(args, kwargs):
+      return self._override_func(*args, **kwargs)
+    else:
+      return self.NOT_SUPPORTED
+
+
+# pylint: disable=g-doc-return-or-yield
+def dispatch_for_types(op, *types):
+  """Decorator to declare that a Python function overrides an op for a type.
+
+  The decorated function is used to override `op` if any of the arguments or
+  keyword arguments (including elements of lists or tuples) have one of the
+  specified types.
+
+  Example:
+
+  ```python
+  @dispatch_for_types(math_ops.add, RaggedTensor, RaggedTensorValue)
+  def ragged_add(x, y, name=None): ...
+  ```
+
+  Args:
+    op: Python function: the operation that should be overridden.
+    *types: The argument types for which this function should be used.
+  """
+
+  def decorator(func):
+    if tf_inspect.getargspec(func) != tf_inspect.getargspec(op):
+      raise AssertionError("The decorated function's signature must exactly "
+                           "match the signature of the overridden op.")
+    _TypeBasedDispatcher(func, types).register(op)
+    return func
+
+  return decorator
+
+
+# pylint: enable=g-doc-return-or-yield
+
+
+def add_dispatch_list(target):
+  """Decorator that adds a dispatch_list attribute to an op."""
+  if hasattr(target, DISPATCH_ATTR):
+    raise AssertionError("%s already has a dispatch list" % target)
+  setattr(target, DISPATCH_ATTR, [])
+  return target
+
+
+def add_dispatch_support(target):
+  """Decorator that adds a dispatch handling wrapper to an op."""
+  def wrapper(*args, **kwargs):
+    """Call target, and fall back on dispatchers if there is a TypeError."""
+    try:
+      return target(*args, **kwargs)
+    except (TypeError, ValueError):
+      # Note: convert_to_eager_tensor currently raises a ValueError, not a
+      # TypeError, when given unexpected types.  So we need to catch both.
+      result = dispatch(wrapper, *args, **kwargs)
+      if result is not OpDispatcher.NOT_SUPPORTED:
+        return result
+      else:
+        raise
+
+  add_dispatch_list(wrapper)
+  return tf_decorator.make_decorator(target, wrapper)
diff --git a/tensorflow/python/util/dispatch_test.py b/tensorflow/python/util/dispatch_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7c5c8eca8dbb8c810351291d9445404897a9d5f
--- /dev/null
+++ b/tensorflow/python/util/dispatch_test.py
@@ -0,0 +1,120 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for operator dispatch."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import googletest
+from tensorflow.python.util import dispatch
+from tensorflow.python.util.tf_export import tf_export
+
+
+class CustomTensor(object):
+  """A fake composite tensor class, for testing type-based dispatching."""
+
+  def __init__(self, tensor, score):
+    self.tensor = ops.convert_to_tensor(tensor)
+    self.score = score
+
+
+@tf_export("test_op")
+@dispatch.add_dispatch_support
+def test_op(x, y, z):
+  """A fake op for testing dispatch of Python ops."""
+  return x + (2 * y) + (3 * z)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class DispatchTest(test_util.TensorFlowTestCase):
+
+  def testAddDispatchForTypes_With_CppOp(self):
+    original_handlers = gen_math_ops.add._tf_dispatchers[:]
+
+    # Override the behavior of gen_math_ops.add.
+    @dispatch.dispatch_for_types(gen_math_ops.add, CustomTensor)
+    def custom_add(x, y, name=None):  # pylint: disable=unused-variable
+      return CustomTensor(gen_math_ops.add(x.tensor, y.tensor, name),
+                          (x.score+y.score) / 2.0)
+    self.assertEqual(len(math_ops.add._tf_dispatchers),
+                     len(original_handlers) + 1)
+
+    # Test that we see the overridden behavior when using CustomTensors.
+    x = CustomTensor([1, 2, 3], 2.0)
+    y = CustomTensor([7, 8, 2], 0.0)
+    x_plus_y = gen_math_ops.add(x, y)
+    self.assertAllEqual(self.evaluate(x_plus_y.tensor), [8, 10, 5])
+    self.assertNear(x_plus_y.score, 1.0, 0.001)
+
+    # Test that we still get the right behavior when using normal Tensors.
+    a = [1, 2, 3]
+    b = [4, 5, 6]
+    a_plus_b = gen_math_ops.add(a, b)
+    self.assertAllEqual(a_plus_b, [5, 7, 9])
+
+    # Test that we still get a TypeError or ValueError if we pass some
+    # type that's not supported by any dispatcher.
+    with self.assertRaises((TypeError, ValueError)):
+      gen_math_ops.add(a, None)
+
+    # Clean up
+    gen_math_ops.add._tf_dispatchers = original_handlers
+
+  def testAddDispatchForTypes_With_PythonOp(self):
+    original_handlers = test_op._tf_dispatchers[:]
+
+    @dispatch.dispatch_for_types(test_op, CustomTensor)
+    def override_for_test_op(x, y, z):  # pylint: disable=unused-variable
+      return CustomTensor(test_op(x.tensor, y.tensor, z.tensor),
+                          (x.score + y.score + z.score) / 3.0)
+
+    x = CustomTensor([1, 2, 3], 0.2)
+    y = CustomTensor([7, 8, 2], 0.4)
+    z = CustomTensor([0, 1, 2], 0.6)
+
+    result = test_op(x, y, z)
+    self.assertAllEqual(self.evaluate(result.tensor), [15, 21, 13])
+    self.assertNear(result.score, 0.4, 0.001)
+
+    # Clean up
+    test_op._tf_dispatchers = original_handlers
+
+  def testDispatchForTypes_SignatureMismatch(self):
+    with self.assertRaisesRegexp(AssertionError, "The decorated function's "
+                                 "signature must exactly match.*"):
+      @dispatch.dispatch_for_types(test_op, CustomTensor)
+      def override_for_test_op(a, b, c):  # pylint: disable=unused-variable
+        return CustomTensor(test_op(a.tensor, b.tensor, c.tensor),
+                            (a.score + b.score + c.score) / 3.0)
+
+  def testDispatchForTypes_OpDoesNotSupportDispatch(self):
+    def some_op(x, y):
+      return x + y
+
+    with self.assertRaisesRegexp(AssertionError, "Dispatching not enabled for"):
+      @dispatch.dispatch_for_types(some_op, CustomTensor)
+      def override_for_some_op(x, y):  # pylint: disable=unused-variable
+        return x if x.score > 0 else y
+
+
+if __name__ == "__main__":
+  googletest.main()
+
+
diff --git a/tensorflow/python/util/nest.py b/tensorflow/python/util/nest.py
index be8b0f1949ff7655d14c81ce29d643a919176fe6..70e5ebb3b68b0973cf46d147bf2a11837a82b1b9 100644
--- a/tensorflow/python/util/nest.py
+++ b/tensorflow/python/util/nest.py
@@ -413,6 +413,51 @@ def map_structure_with_paths(func, *structure, **kwargs):
       the type of sequence in any of their substructures.
     ValueError: If no structures are provided.
   """
+  return _map_structure_with_tuple_or_string_paths(
+      use_string_paths=True, func=func, structure=structure, kwargs=kwargs)
+
+
+def map_structure_with_tuple_paths(func, *structure, **kwargs):
+  """Applies `func` to each entry in `structure` and returns a new structure.
+
+  Applies `func(tuple_path, x[0], x[1], ..., **kwargs)` where `x[i]` is an entry
+  in `structure[i]` and `tuple_path` is a tuple of indices and/or dictionary
+  keys (as returned by `nest.yield_flat_paths`), which uniquely specifies the
+  common path to x[i] in the structures. All structures in `structure` must have
+  the same arity, and the return value will contain the results in the same
+  structure. Special kwarg `check_types` determines whether the types of
+  iterables within the structure must be the same-- see **kwargs definition
+  below.
+
+  Args:
+    func: A callable with the signature `func(tuple_path, *values, **kwargs)`
+      that is evaluated on the leaves of the structure.
+    *structure: A variable number of compatible structures to process.
+    **kwargs: Optional kwargs to be passed through to func. Special kwarg
+      `check_types` is not passed to func, but instead determines whether the
+      types of iterables within the structures have to be same (e.g.
+      `map_structure(func, [1], (1,))` raises a `TypeError` exception). To allow
+      this set this argument to `False`.
+
+  Returns:
+    A structure of the same form as the input structures whose leaves are the
+    result of evaluating func on corresponding leaves of the input structures.
+
+  Raises:
+    TypeError: If `func` is not callable or if the structures do not match
+      each other by depth tree.
+    TypeError: If `check_types` is not `False` and the two structures differ in
+      the type of sequence in any of their substructures.
+    ValueError: If no structures are provided.
+  """
+  return _map_structure_with_tuple_or_string_paths(
+      use_string_paths=False, func=func, structure=structure, kwargs=kwargs)
+
+
+def _map_structure_with_tuple_or_string_paths(
+    use_string_paths, func, structure, kwargs):
+  """Implements `map_structure` with either tuple or string paths."""
+
   if not callable(func):
     raise TypeError("func must be callable, got: %s" % func)
   if not structure:
@@ -422,9 +467,14 @@ def map_structure_with_paths(func, *structure, **kwargs):
   for other in structure[1:]:
     assert_same_structure(structure[0], other, check_types=check_types)
 
+  if use_string_paths:
+    flatten_func = flatten_with_joined_string_paths
+  else:
+    flatten_func = flatten_with_tuple_paths
+
   # First set paths_and_values to:
   # [[(p11, v11), ... (p1n, v1n)], ... [(pm1, vm1), ... (pmn, vmn)]]
-  paths_and_values = [flatten_with_joined_string_paths(s) for s in structure]
+  paths_and_values = [flatten_func(s) for s in structure]
 
   # Now zip(*paths_and_values) would be:
   # [((p11, v11), ... (pm1, vm1)), ... ((p1n, v1n), ... (pmn, vmn))]
@@ -820,5 +870,24 @@ def flatten_with_joined_string_paths(structure, separator="/"):
   return list(zip(flat_string_paths, flatten(structure)))
 
 
+def flatten_with_tuple_paths(structure):
+  """Returns a list of `(tuple_path, leaf_element)` tuples.
+
+  The order of pairs produced matches that of `nest.flatten`. This allows you
+  to flatten a nested structure while keeping information about where in the
+  structure each data element was located. See `nest.yield_flat_paths`
+  for more information about tuple paths.
+
+  Args:
+    structure: the nested structure to flatten.
+
+  Returns:
+    A list of `(tuple_path, leaf_element)` tuples. Each `tuple_path` is a tuple
+    of indices and/or dictionary keys that uniquely specify the path to
+    `leaf_element` within `structure`.
+  """
+  return list(zip(yield_flat_paths(structure), flatten(structure)))
+
+
 _pywrap_tensorflow.RegisterType("Mapping", _collections.Mapping)
 _pywrap_tensorflow.RegisterType("Sequence", _collections.Sequence)
diff --git a/tensorflow/python/util/nest_test.py b/tensorflow/python/util/nest_test.py
index d0d0c5f7935ba0a4d2b867b3c6fb6bd52c7cd54a..83fa5dd66084e7d6710505bc638cdc7ae4f9bbe3 100644
--- a/tensorflow/python/util/nest_test.py
+++ b/tensorflow/python/util/nest_test.py
@@ -209,12 +209,12 @@ class NestTest(parameterized.TestCase, test.TestCase):
   def testFlatten_numpyIsNotFlattened(self):
     structure = np.array([1, 2, 3])
     flattened = nest.flatten(structure)
-    self.assertEqual(len(flattened), 1)
+    self.assertLen(flattened, 1)
 
   def testFlatten_stringIsNotFlattened(self):
     structure = "lots of letters"
     flattened = nest.flatten(structure)
-    self.assertEqual(len(flattened), 1)
+    self.assertLen(flattened, 1)
     unflattened = nest.pack_sequence_as("goodbye", flattened)
     self.assertEqual(structure, unflattened)
 
@@ -791,37 +791,46 @@ class NestTest(parameterized.TestCase, test.TestCase):
       expected = inputs_expected["expected"]
       self.assertEqual(list(nest.yield_flat_paths(inputs)), expected)
 
-  def testFlattenWithStringPaths(self):
-    for inputs_expected in (
-        {"inputs": [], "expected": []},
-        {"inputs": [23, "42"], "expected": [("0", 23), ("1", "42")]},
-        {"inputs": [[[[108]]]], "expected": [("0/0/0/0", 108)]}):
-      inputs = inputs_expected["inputs"]
-      expected = inputs_expected["expected"]
-      self.assertEqual(
-          nest.flatten_with_joined_string_paths(inputs, separator="/"),
-          expected)
-
-  # Need a separate test for namedtuple as we can't declare tuple definitions
-  # in the @parameterized arguments.
-  def testFlattenNamedTuple(self):
-    # pylint: disable=invalid-name
-    Foo = collections.namedtuple("Foo", ["a", "b"])
-    Bar = collections.namedtuple("Bar", ["c", "d"])
-    # pylint: enable=invalid-name
-    test_cases = [
-        (Foo(a=3, b=Bar(c=23, d=42)),
-         [("a", 3), ("b/c", 23), ("b/d", 42)]),
-        (Foo(a=Bar(c=23, d=42), b=Bar(c=0, d="something")),
-         [("a/c", 23), ("a/d", 42), ("b/c", 0), ("b/d", "something")]),
-        (Bar(c=42, d=43),
-         [("c", 42), ("d", 43)]),
-        (Bar(c=[42], d=43),
-         [("c/0", 42), ("d", 43)]),
-    ]
-    for inputs, expected in test_cases:
-      self.assertEqual(
-          list(nest.flatten_with_joined_string_paths(inputs)), expected)
+  # We cannot define namedtuples within @parameterized argument lists.
+  # pylint: disable=invalid-name
+  Foo = collections.namedtuple("Foo", ["a", "b"])
+  Bar = collections.namedtuple("Bar", ["c", "d"])
+  # pylint: enable=invalid-name
+
+  @parameterized.parameters([
+      dict(inputs=[], expected=[]),
+      dict(inputs=[23, "42"], expected=[("0", 23), ("1", "42")]),
+      dict(inputs=[[[[108]]]], expected=[("0/0/0/0", 108)]),
+      dict(inputs=Foo(a=3, b=Bar(c=23, d=42)),
+           expected=[("a", 3), ("b/c", 23), ("b/d", 42)]),
+      dict(inputs=Foo(a=Bar(c=23, d=42), b=Bar(c=0, d="thing")),
+           expected=[("a/c", 23), ("a/d", 42), ("b/c", 0), ("b/d", "thing")]),
+      dict(inputs=Bar(c=42, d=43),
+           expected=[("c", 42), ("d", 43)]),
+      dict(inputs=Bar(c=[42], d=43),
+           expected=[("c/0", 42), ("d", 43)]),
+  ])
+  def testFlattenWithStringPaths(self, inputs, expected):
+    self.assertEqual(
+        nest.flatten_with_joined_string_paths(inputs, separator="/"),
+        expected)
+
+  @parameterized.parameters([
+      dict(inputs=[], expected=[]),
+      dict(inputs=[23, "42"], expected=[((0,), 23), ((1,), "42")]),
+      dict(inputs=[[[[108]]]], expected=[((0, 0, 0, 0), 108)]),
+      dict(inputs=Foo(a=3, b=Bar(c=23, d=42)),
+           expected=[(("a",), 3), (("b", "c"), 23), (("b", "d"), 42)]),
+      dict(inputs=Foo(a=Bar(c=23, d=42), b=Bar(c=0, d="thing")),
+           expected=[(("a", "c"), 23), (("a", "d"), 42), (("b", "c"), 0),
+                     (("b", "d"), "thing")]),
+      dict(inputs=Bar(c=42, d=43),
+           expected=[(("c",), 42), (("d",), 43)]),
+      dict(inputs=Bar(c=[42], d=43),
+           expected=[(("c", 0), 42), (("d",), 43)]),
+  ])
+  def testFlattenWithTuplePaths(self, inputs, expected):
+    self.assertEqual(nest.flatten_with_tuple_paths(inputs), expected)
 
   @parameterized.named_parameters(
       ("tuples", (1, 2), (3, 4), True, (("0", 4), ("1", 6))),
@@ -852,6 +861,42 @@ class NestTest(parameterized.TestCase, test.TestCase):
     with self.assertRaises(error_type):
       nest.map_structure_with_paths(lambda path, *s: 0, s1, s2)
 
+  @parameterized.named_parameters([
+      dict(testcase_name="Tuples", s1=(1, 2), s2=(3, 4),
+           check_types=True, expected=(((0,), 4), ((1,), 6))),
+      dict(testcase_name="Dicts", s1={"a": 1, "b": 2}, s2={"b": 4, "a": 3},
+           check_types=True, expected={"a": (("a",), 4), "b": (("b",), 6)}),
+      dict(testcase_name="Mixed", s1=(1, 2), s2=[3, 4],
+           check_types=False, expected=(((0,), 4), ((1,), 6))),
+      dict(testcase_name="Nested",
+           s1={"a": [2, 3], "b": [1, 2, 3]},
+           s2={"b": [5, 6, 7], "a": [8, 9]},
+           check_types=True,
+           expected={"a": [(("a", 0), 10), (("a", 1), 12)],
+                     "b": [(("b", 0), 6), (("b", 1), 8), (("b", 2), 10)]}),
+  ])
+  def testMapWithTuplePathsCompatibleStructures(
+      self, s1, s2, check_types, expected):
+    def path_and_sum(path, *values):
+      return path, sum(values)
+    result = nest.map_structure_with_tuple_paths(
+        path_and_sum, s1, s2, check_types=check_types)
+    self.assertEqual(expected, result)
+
+  @parameterized.named_parameters([
+      dict(testcase_name="Tuples", s1=(1, 2), s2=(3, 4, 5),
+           error_type=ValueError),
+      dict(testcase_name="Dicts", s1={"a": 1}, s2={"b": 2},
+           error_type=ValueError),
+      dict(testcase_name="Mixed", s1=(1, 2), s2=[3, 4], error_type=TypeError),
+      dict(testcase_name="Nested",
+           s1={"a": [2, 3], "b": [1, 3]}, s2={"b": [5, 6, 7], "a": [8, 9]},
+           error_type=ValueError)
+  ])
+  def testMapWithTuplePathsIncompatibleStructures(self, s1, s2, error_type):
+    with self.assertRaises(error_type):
+      nest.map_structure_with_tuple_paths(lambda path, *s: 0, s1, s2)
+
 
 class NestBenchmark(test.Benchmark):
 
diff --git a/tensorflow/python/util/tf_decorator.py b/tensorflow/python/util/tf_decorator.py
index 0cfc836246d2d885c28d168fe90b08a325cf6ded..f018e1a1bd35f0111cacc20e678c0466bfd5f2e3 100644
--- a/tensorflow/python/util/tf_decorator.py
+++ b/tensorflow/python/util/tf_decorator.py
@@ -98,6 +98,9 @@ def make_decorator(target,
   if hasattr(target, '__doc__'):
     decorator_func.__doc__ = decorator.__doc__
   decorator_func.__wrapped__ = target
+  # Keeping a second handle to `target` allows callers to detect whether the
+  # decorator was modified using `rewrap`.
+  decorator_func.__original_wrapped__ = target
   return decorator_func
 
 
@@ -173,6 +176,8 @@ def unwrap(maybe_tf_decorator):
       decorators.append(getattr(cur, '_tf_decorator'))
     else:
       break
+    if not hasattr(decorators[-1], 'decorated_target'):
+      break
     cur = decorators[-1].decorated_target
   return decorators, cur
 
diff --git a/tensorflow/python/util/tf_export.py b/tensorflow/python/util/tf_export.py
index ec70cae7d2fc00f793e8ffa0aec331e32e11115f..74afc3746fb112784c672e4aa9fde5d34c1e354f 100644
--- a/tensorflow/python/util/tf_export.py
+++ b/tensorflow/python/util/tf_export.py
@@ -147,6 +147,94 @@ def get_canonical_name(api_names, deprecated_api_names):
   return None
 
 
+def get_v1_names(symbol):
+  """Get a list of TF 1.* names for this symbol.
+
+  Args:
+    symbol: symbol to get API names for.
+
+  Returns:
+    List of all API names for this symbol including TensorFlow and
+    Estimator names.
+  """
+  names_v1 = []
+  tensorflow_api_attr_v1 = API_ATTRS_V1[TENSORFLOW_API_NAME].names
+  estimator_api_attr_v1 = API_ATTRS_V1[ESTIMATOR_API_NAME].names
+
+  if not hasattr(symbol, tensorflow_api_attr_v1):
+    return names_v1
+  if tensorflow_api_attr_v1 in symbol.__dict__:
+    names_v1.extend(getattr(symbol, tensorflow_api_attr_v1))
+  if estimator_api_attr_v1 in symbol.__dict__:
+    names_v1.extend(getattr(symbol, estimator_api_attr_v1))
+  return names_v1
+
+
+def get_v2_names(symbol):
+  """Get a list of TF 2.0 names for this symbol.
+
+  Args:
+    symbol: symbol to get API names for.
+
+  Returns:
+    List of all API names for this symbol including TensorFlow and
+    Estimator names.
+  """
+  names_v2 = []
+  tensorflow_api_attr = API_ATTRS[TENSORFLOW_API_NAME].names
+  estimator_api_attr = API_ATTRS[ESTIMATOR_API_NAME].names
+
+  if not hasattr(symbol, tensorflow_api_attr):
+    return names_v2
+  if tensorflow_api_attr in symbol.__dict__:
+    names_v2.extend(getattr(symbol, tensorflow_api_attr))
+  if estimator_api_attr in symbol.__dict__:
+    names_v2.extend(getattr(symbol, estimator_api_attr))
+  return names_v2
+
+
+def get_v1_constants(module):
+  """Get a list of TF 1.* constants in this module.
+
+  Args:
+    module: TensorFlow module.
+
+  Returns:
+    List of all API constants under the given module including TensorFlow and
+    Estimator constants.
+  """
+  constants_v1 = []
+  tensorflow_constants_attr_v1 = API_ATTRS_V1[TENSORFLOW_API_NAME].constants
+  estimator_constants_attr_v1 = API_ATTRS_V1[ESTIMATOR_API_NAME].constants
+
+  if hasattr(module, tensorflow_constants_attr_v1):
+    constants_v1.extend(getattr(module, tensorflow_constants_attr_v1))
+  if hasattr(module, estimator_constants_attr_v1):
+    constants_v1.extend(getattr(module, estimator_constants_attr_v1))
+  return constants_v1
+
+
+def get_v2_constants(module):
+  """Get a list of TF 2.0 constants in this module.
+
+  Args:
+    module: TensorFlow module.
+
+  Returns:
+    List of all API constants under the given module including TensorFlow and
+    Estimator constants.
+  """
+  constants_v2 = []
+  tensorflow_constants_attr = API_ATTRS[TENSORFLOW_API_NAME].constants
+  estimator_constants_attr = API_ATTRS[ESTIMATOR_API_NAME].constants
+
+  if hasattr(module, tensorflow_constants_attr):
+    constants_v2.extend(getattr(module, tensorflow_constants_attr))
+  if hasattr(module, estimator_constants_attr):
+    constants_v2.extend(getattr(module, estimator_constants_attr))
+  return constants_v2
+
+
 class api_export(object):  # pylint: disable=invalid-name
   """Provides ways to export symbols to the TensorFlow API."""
 
diff --git a/tensorflow/python/util/tf_export_test.py b/tensorflow/python/util/tf_export_test.py
index a0fac8bf362627e6802821e3b33c0f107c5c97ce..20625792e9bf88ebca34ba00a885742c6d6f745f 100644
--- a/tensorflow/python/util/tf_export_test.py
+++ b/tensorflow/python/util/tf_export_test.py
@@ -62,6 +62,10 @@ class ValidateExportTest(test.TestCase):
         del symbol._tf_api_names
       if hasattr(symbol, '_tf_api_names_v1'):
         del symbol._tf_api_names_v1
+      if hasattr(symbol, '_estimator_api_names'):
+        del symbol._estimator_api_names
+      if hasattr(symbol, '_estimator_api_names_v1'):
+        del symbol._estimator_api_names_v1
 
   def _CreateMockModule(self, name):
     mock_module = self.MockModule(name)
@@ -74,6 +78,10 @@ class ValidateExportTest(test.TestCase):
     decorated_function = export_decorator(_test_function)
     self.assertEquals(decorated_function, _test_function)
     self.assertEquals(('nameA', 'nameB'), decorated_function._tf_api_names)
+    self.assertEquals(['nameA', 'nameB'],
+                      tf_export.get_v1_names(decorated_function))
+    self.assertEquals(['nameA', 'nameB'],
+                      tf_export.get_v2_names(decorated_function))
 
   def testExportMultipleFunctions(self):
     export_decorator1 = tf_export.tf_export('nameA', 'nameB')
@@ -95,6 +103,22 @@ class ValidateExportTest(test.TestCase):
     export_decorator_b(TestClassB)
     self.assertEquals(('TestClassA1',), TestClassA._tf_api_names)
     self.assertEquals(('TestClassB1',), TestClassB._tf_api_names)
+    self.assertEquals(['TestClassA1'], tf_export.get_v1_names(TestClassA))
+    self.assertEquals(['TestClassB1'], tf_export.get_v1_names(TestClassB))
+
+  def testExportClassInEstimator(self):
+    export_decorator_a = tf_export.tf_export('TestClassA1')
+    export_decorator_a(TestClassA)
+    self.assertEquals(('TestClassA1',), TestClassA._tf_api_names)
+
+    export_decorator_b = tf_export.estimator_export(
+        'estimator.TestClassB1')
+    export_decorator_b(TestClassB)
+    self.assertTrue('_tf_api_names' not in TestClassB.__dict__)
+    self.assertEquals(('TestClassA1',), TestClassA._tf_api_names)
+    self.assertEquals(['TestClassA1'], tf_export.get_v1_names(TestClassA))
+    self.assertEquals(['estimator.TestClassB1'],
+                      tf_export.get_v1_names(TestClassB))
 
   def testExportSingleConstant(self):
     module1 = self._CreateMockModule('module1')
@@ -103,6 +127,10 @@ class ValidateExportTest(test.TestCase):
     export_decorator.export_constant('module1', 'test_constant')
     self.assertEquals([(('NAME_A', 'NAME_B'), 'test_constant')],
                       module1._tf_api_constants)
+    self.assertEquals([(('NAME_A', 'NAME_B'), 'test_constant')],
+                      tf_export.get_v1_constants(module1))
+    self.assertEquals([(('NAME_A', 'NAME_B'), 'test_constant')],
+                      tf_export.get_v2_constants(module1))
 
   def testExportMultipleConstants(self):
     module1 = self._CreateMockModule('module1')
diff --git a/tensorflow/python/util/tf_should_use.py b/tensorflow/python/util/tf_should_use.py
index ca6710bcf2178db0fcf63c9bdfdf27531651f7ed..63de4a7a96c162f38aa3cba1512cc639df09adcf 100644
--- a/tensorflow/python/util/tf_should_use.py
+++ b/tensorflow/python/util/tf_should_use.py
@@ -23,6 +23,7 @@ import traceback
 
 import six  # pylint: disable=unused-import
 
+from tensorflow.python.framework import ops
 from tensorflow.python.platform import tf_logging
 from tensorflow.python.util import tf_decorator
 # pylint: enable=g-bad-import-order,g-import-not-at-top
@@ -32,7 +33,8 @@ class _TFShouldUseHelper(object):
   """Object stored in TFShouldUse-wrapped objects.
 
   When it is deleted it will emit a warning or error if its `sate` method
-  has not been called by time of deletion.
+  has not been called by time of deletion, and Tensorflow is not executing
+  eagerly outside of functions.
   """
 
   def __init__(self, type_, repr_, stack_frame, fatal_error_if_unsated):
@@ -50,6 +52,8 @@ class _TFShouldUseHelper(object):
     self._logging_module = None
 
   def __del__(self):
+    if ops.executing_eagerly_outside_functions():
+      return
     if self._sated:
       return
     if self._fatal_error_if_unsated:
diff --git a/tensorflow/python/util/util.cc b/tensorflow/python/util/util.cc
index 11eb9ce94768f47e5afe48355fadab30744224b1..e69eec73a0ef8b37f042d9a0f5bf63569b6f5b39 100644
--- a/tensorflow/python/util/util.cc
+++ b/tensorflow/python/util/util.cc
@@ -172,7 +172,7 @@ class CachedTypeCheck {
     auto* type = Py_TYPE(o);
 
     {
-      mutex_lock l(type_to_sequence_map_mu_);
+      tf_shared_lock l(type_to_sequence_map_mu_);
       auto it = type_to_sequence_map_.find(type);
       if (it != type_to_sequence_map_.end()) {
         return it->second;
@@ -195,7 +195,12 @@ class CachedTypeCheck {
       mutex_lock l(type_to_sequence_map_mu_);
       if (type_to_sequence_map_.size() < kMaxItemsInCache) {
         Py_INCREF(type);
-        type_to_sequence_map_.insert({type, check_result});
+        auto insert_result = type_to_sequence_map_.insert({type, check_result});
+        if (!insert_result.second) {
+          // The type was added to the cache by a concurrent thread after we
+          // looked it up above.
+          Py_DECREF(type);
+        }
       }
     }
 
diff --git a/tensorflow/stream_executor/BUILD b/tensorflow/stream_executor/BUILD
index 2526e1adaa107565042d0dff9e12183bd022f9f1..00c23b8d1788d56cee0e549ccd835fa174037760 100644
--- a/tensorflow/stream_executor/BUILD
+++ b/tensorflow/stream_executor/BUILD
@@ -23,6 +23,14 @@ tf_proto_library(
     protodeps = tf_additional_all_protos(),
 )
 
+tf_proto_library(
+    name = "logging_proto",
+    srcs = ["logging.proto"],
+    cc_api_version = 2,
+    default_header = True,
+    protodeps = tf_additional_all_protos(),
+)
+
 cc_library(
     name = "stream_executor_impl",
     srcs = glob(
@@ -108,11 +116,8 @@ cc_library(
         "@local_config_cuda//cuda:cuda_headers",
     ] + if_cuda_is_configured([
         "//tensorflow/core:cuda",
-        "@local_config_cuda//cuda:cublas",
         "@local_config_cuda//cuda:cuda_driver",
         "@local_config_cuda//cuda:cudnn",
-        "@local_config_cuda//cuda:cufft",
-        "@local_config_cuda//cuda:curand",
     ]),
     alwayslink = 1,
 )
diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc
index 7fabb35e28ce5a4107882a8739c1c0d641e05828..957f6c98da564500f81d7185ce6a151003549ee5 100644
--- a/tensorflow/stream_executor/cuda/cuda_blas.cc
+++ b/tensorflow/stream_executor/cuda/cuda_blas.cc
@@ -58,6 +58,11 @@ limitations under the License.
 #include "tensorflow/stream_executor/cuda/cuda_stream.h"
 #include "tensorflow/stream_executor/cuda/cuda_timer.h"
 #include "tensorflow/stream_executor/device_memory.h"
+
+#ifndef PLATFORM_GOOGLE
+#include "tensorflow/stream_executor/dso_loader.h"
+#endif
+
 #include "tensorflow/stream_executor/lib/env.h"
 #include "tensorflow/stream_executor/lib/initialize.h"
 #include "tensorflow/stream_executor/lib/status.h"
@@ -76,21 +81,8 @@ PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuBlasPlugin);
 
 namespace wrap {
 
-#define STREAM_EXECUTOR_CUBLAS_WRAP(__name)                         \
-  struct WrapperShim__##__name {                                    \
-    static const char *kName;                                       \
-    template <typename... Args>                                     \
-    cublasStatus_t operator()(CUDAExecutor *parent, Args... args) { \
-      cuda::ScopedActivateExecutorContext sac{parent};              \
-      return ::__name(args...);                                     \
-    }                                                               \
-  } __name;                                                         \
-  const char *WrapperShim__##__name::kName = #__name;
-
-#define STREAM_EXECUTOR_CUBLAS_V2_WRAP(__name) \
-  STREAM_EXECUTOR_CUBLAS_WRAP(__name)
-
-#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \
+// clang-format off
+#define CUBLAS_ROUTINE_EACH(__macro)      \
   __macro(cublasSnrm2)                    \
   __macro(cublasDnrm2)                    \
   __macro(cublasScnrm2)                   \
@@ -262,6 +254,58 @@ namespace wrap {
   __macro(cublasCdgmm)                    \
   __macro(cublasZdgmm)
 
+// clang-format off
+
+#ifdef PLATFORM_GOOGLE
+#define STREAM_EXECUTOR_CUBLAS_WRAP(__name)                         \
+  struct WrapperShim__##__name {                                    \
+    static const char *kName;                                       \
+    template <typename... Args>                                     \
+    cublasStatus_t operator()(CUDAExecutor *parent, Args... args) { \
+      cuda::ScopedActivateExecutorContext sac{parent};              \
+      return ::__name(args...);                                     \
+    }                                                               \
+  } __name;                                                         \
+  const char *WrapperShim__##__name::kName = #__name;
+
+#define STREAM_EXECUTOR_CUBLAS_V2_WRAP(__name) \
+  STREAM_EXECUTOR_CUBLAS_WRAP(__name)
+
+#else
+
+#define STREAM_EXECUTOR_CUBLAS_WRAP(__name)                               \
+  struct DynLoadShim__##__name {                                          \
+    static const char* kName;                                             \
+    using FuncPtrT = std::add_pointer<decltype(::__name)>::type;          \
+    static void* GetDsoHandle() {                                         \
+      auto s = internal::CachedDsoLoader::GetCublasDsoHandle();           \
+      return s.ValueOrDie();                                              \
+    }                                                                     \
+    static FuncPtrT LoadOrDie() {                                         \
+      void* f;                                                            \
+      auto s = port::Env::Default()->GetSymbolFromLibrary(GetDsoHandle(), \
+                                                          kName, &f);     \
+      CHECK(s.ok()) << "could not find " << kName                         \
+                    << " in cublas DSO; dlerror: " << s.error_message();  \
+      return reinterpret_cast<FuncPtrT>(f);                               \
+    }                                                                     \
+    static FuncPtrT DynLoad() {                                           \
+      static FuncPtrT f = LoadOrDie();                                    \
+      return f;                                                           \
+    }                                                                     \
+    template <typename... Args>                                           \
+    cublasStatus_t operator()(CUDAExecutor* parent, Args... args) {       \
+      cuda::ScopedActivateExecutorContext sac{parent};                    \
+      return DynLoad()(args...);                                          \
+    }                                                                     \
+  } __name;                                                               \
+  const char* DynLoadShim__##__name::kName = #__name;
+
+#define STREAM_EXECUTOR_CUBLAS_V2_WRAP(__name) \
+  STREAM_EXECUTOR_CUBLAS_WRAP(__name)
+
+#endif
+
 STREAM_EXECUTOR_CUBLAS_V2_WRAP(cublasCreate)
 STREAM_EXECUTOR_CUBLAS_V2_WRAP(cublasDestroy)
 STREAM_EXECUTOR_CUBLAS_V2_WRAP(cublasSetStream)
@@ -271,7 +315,7 @@ STREAM_EXECUTOR_CUBLAS_WRAP(cublasSgemmBatched)
 STREAM_EXECUTOR_CUBLAS_WRAP(cublasDgemmBatched)
 STREAM_EXECUTOR_CUBLAS_WRAP(cublasCgemmBatched)
 STREAM_EXECUTOR_CUBLAS_WRAP(cublasZgemmBatched)
-CUBLAS_BLAS_ROUTINE_EACH(STREAM_EXECUTOR_CUBLAS_V2_WRAP)
+CUBLAS_ROUTINE_EACH(STREAM_EXECUTOR_CUBLAS_V2_WRAP)
 
 #if CUDA_VERSION >= 7050
 STREAM_EXECUTOR_CUBLAS_WRAP(cublasSgemmEx)
@@ -424,7 +468,8 @@ class ScopedCublasMathMode {
   // Note that when false is returned, an appropriate error has already been
   // logged.
   bool Init(cublasMath_t new_mode) {
-    cublasStatus_t ret = wrap::cublasGetMathMode(parent_, handle_, &old_mode_);
+    cublasStatus_t ret =
+        wrap::cublasGetMathMode(parent_, handle_, &old_mode_);
     if (ret != CUBLAS_STATUS_SUCCESS) {
       LOG(ERROR) << "failed to get old cublas math mode: " << ToString(ret);
       return ok_ = false;
@@ -442,7 +487,8 @@ class ScopedCublasMathMode {
   // successful in the first place.
   ~ScopedCublasMathMode() {
     if (ok_) {
-      cublasStatus_t ret = wrap::cublasSetMathMode(parent_, handle_, old_mode_);
+      cublasStatus_t ret =
+          wrap::cublasSetMathMode(parent_, handle_, old_mode_);
       if (ret != CUBLAS_STATUS_SUCCESS) {
         LOG(ERROR) << "failed to set former cublas math mode: "
                    << ToString(ret);
@@ -675,16 +721,16 @@ bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count,
                           const DeviceMemory<std::complex<float>> &x, int incx,
                           DeviceMemory<float> *result) {
   return DoBlasInternal(
-      wrap::cublasScasum, stream, false /* = pointer_mode_host */, elem_count,
-      CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+      wrap::cublasScasum, stream, false /* = pointer_mode_host */,
+      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count,
                           const DeviceMemory<std::complex<double>> &x, int incx,
                           DeviceMemory<double> *result) {
   return DoBlasInternal(
-      wrap::cublasDzasum, stream, false /* = pointer_mode_host */, elem_count,
-      CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+      wrap::cublasDzasum, stream, false /* = pointer_mode_host */,
+      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count, float alpha,
@@ -835,16 +881,16 @@ bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
                           const DeviceMemory<std::complex<float>> &x, int incx,
                           DeviceMemory<float> *result) {
   return DoBlasInternal(
-      wrap::cublasScnrm2, stream, false /* = pointer_mode_host */, elem_count,
-      CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+      wrap::cublasScnrm2, stream, false /* = pointer_mode_host */,
+      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
                           const DeviceMemory<std::complex<double>> &x, int incx,
                           DeviceMemory<double> *result) {
   return DoBlasInternal(
-      wrap::cublasDznrm2, stream, false /* = pointer_mode_host */, elem_count,
-      CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+      wrap::cublasDznrm2, stream, false /* = pointer_mode_host */,
+      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count,
@@ -1060,48 +1106,48 @@ bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count,
                            const DeviceMemory<std::complex<float>> &x, int incx,
                            DeviceMemory<int> *result) {
   return DoBlasInternal(
-      wrap::cublasIcamax, stream, false /* = pointer_mode_host */, elem_count,
-      CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+      wrap::cublasIcamax, stream, false /* = pointer_mode_host */,
+      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count,
                            const DeviceMemory<std::complex<double>> &x,
                            int incx, DeviceMemory<int> *result) {
   return DoBlasInternal(
-      wrap::cublasIzamax, stream, false /* = pointer_mode_host */, elem_count,
-      CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+      wrap::cublasIzamax, stream, false /* = pointer_mode_host */,
+      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
                            const DeviceMemory<float> &x, int incx,
                            DeviceMemory<int> *result) {
   return DoBlasInternal(
-      wrap::cublasIsamin, stream, false /* = pointer_mode_host */, elem_count,
-      CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+      wrap::cublasIsamin, stream, false /* = pointer_mode_host */,
+      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
                            const DeviceMemory<double> &x, int incx,
                            DeviceMemory<int> *result) {
   return DoBlasInternal(
-      wrap::cublasIdamin, stream, false /* = pointer_mode_host */, elem_count,
-      CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+      wrap::cublasIdamin, stream, false /* = pointer_mode_host */,
+      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
                            const DeviceMemory<std::complex<float>> &x, int incx,
                            DeviceMemory<int> *result) {
   return DoBlasInternal(
-      wrap::cublasIcamin, stream, false /* = pointer_mode_host */, elem_count,
-      CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+      wrap::cublasIcamin, stream, false /* = pointer_mode_host */,
+      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
                            const DeviceMemory<std::complex<double>> &x,
                            int incx, DeviceMemory<int> *result) {
   return DoBlasInternal(
-      wrap::cublasIzamin, stream, false /* = pointer_mode_host */, elem_count,
-      CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+      wrap::cublasIzamin, stream, false /* = pointer_mode_host */,
+      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
diff --git a/tensorflow/stream_executor/cuda/cuda_fft.cc b/tensorflow/stream_executor/cuda/cuda_fft.cc
index cbf388a0f892b0feee0e4f45f67fcb0be5c32537..acac7d6368885537b1f5727779388d550680e90d 100644
--- a/tensorflow/stream_executor/cuda/cuda_fft.cc
+++ b/tensorflow/stream_executor/cuda/cuda_fft.cc
@@ -23,6 +23,11 @@ limitations under the License.
 #include "tensorflow/stream_executor/cuda/cuda_platform_id.h"
 #include "tensorflow/stream_executor/cuda/cuda_stream.h"
 #include "tensorflow/stream_executor/device_memory.h"
+
+#ifndef PLATFORM_GOOGLE
+#include "tensorflow/stream_executor/dso_loader.h"
+#endif
+
 #include "tensorflow/stream_executor/lib/env.h"
 #include "tensorflow/stream_executor/lib/initialize.h"
 #include "tensorflow/stream_executor/lib/status.h"
@@ -38,6 +43,7 @@ PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuFftPlugin);
 
 namespace wrap {
 
+#ifdef PLATFORM_GOOGLE
 // This macro wraps a global identifier, given by __name, in a callable
 // structure that loads the DLL symbol out of the DSO handle in a thread-safe
 // manner on first use. This dynamic loading technique is used to avoid DSO
@@ -52,22 +58,69 @@ namespace wrap {
     }                                                            \
   } __name;
 
-#define CUFFT_ROUTINE_EACH(__macro)                                            \
-  __macro(cufftDestroy) __macro(cufftSetStream) __macro(cufftPlan1d)           \
-      __macro(cufftPlan2d) __macro(cufftPlan3d) __macro(cufftPlanMany)         \
-          __macro(cufftExecD2Z) __macro(cufftExecZ2D) __macro(cufftExecC2C)    \
-              __macro(cufftExecC2R) __macro(cufftExecZ2Z)                      \
-                  __macro(cufftExecR2C) __macro(cufftCreate)                   \
-                      __macro(cufftSetAutoAllocation)                          \
-                          __macro(cufftSetWorkArea) __macro(cufftGetSize1d)    \
-                              __macro(cufftMakePlan1d) __macro(cufftGetSize2d) \
-                                  __macro(cufftMakePlan2d)                     \
-                                      __macro(cufftGetSize3d)                  \
-                                          __macro(cufftMakePlan3d)             \
-                                              __macro(cufftGetSizeMany)        \
-                                                  __macro(cufftMakePlanMany)
+#else
+
+#define STREAM_EXECUTOR_CUFFT_WRAP(__name)                                \
+  struct DynLoadShim__##__name {                                          \
+    static const char *kName;                                             \
+    using FuncPtrT = std::add_pointer<decltype(::__name)>::type;          \
+    static void *GetDsoHandle() {                                         \
+      auto s = internal::CachedDsoLoader::GetCufftDsoHandle();            \
+      return s.ValueOrDie();                                              \
+    }                                                                     \
+    static FuncPtrT LoadOrDie() {                                         \
+      void *f;                                                            \
+      auto s = port::Env::Default()->GetSymbolFromLibrary(GetDsoHandle(), \
+                                                          kName, &f);     \
+      CHECK(s.ok()) << "could not find " << kName                         \
+                    << " in cufft DSO; dlerror: " << s.error_message();   \
+      return reinterpret_cast<FuncPtrT>(f);                               \
+    }                                                                     \
+    static FuncPtrT DynLoad() {                                           \
+      static FuncPtrT f = LoadOrDie();                                    \
+      return f;                                                           \
+    }                                                                     \
+    template <typename... Args>                                           \
+    cufftResult operator()(CUDAExecutor *parent, Args... args) {          \
+      cuda::ScopedActivateExecutorContext sac{parent};                    \
+      return DynLoad()(args...);                                          \
+    }                                                                     \
+  } __name;                                                               \
+  const char *DynLoadShim__##__name::kName = #__name;
+
+#endif
+
+// clang-format off
+
+#define CUFFT_ROUTINE_EACH(__macro)                                     \
+  __macro(cufftDestroy)                                                 \
+  __macro(cufftSetStream)                                               \
+  __macro(cufftPlan1d)                                                  \
+  __macro(cufftPlan2d)                                                  \
+  __macro(cufftPlan3d)                                                  \
+  __macro(cufftPlanMany)                                                \
+  __macro(cufftExecD2Z)                                                 \
+  __macro(cufftExecZ2D)                                                 \
+  __macro(cufftExecC2C)                                                 \
+  __macro(cufftExecC2R)                                                 \
+  __macro(cufftExecZ2Z)                                                 \
+  __macro(cufftExecR2C)                                                 \
+  __macro(cufftCreate)                                                  \
+  __macro(cufftSetAutoAllocation)                                       \
+  __macro(cufftSetWorkArea)                                             \
+  __macro(cufftGetSize1d)                                               \
+  __macro(cufftMakePlan1d)                                              \
+  __macro(cufftGetSize2d)                                               \
+  __macro(cufftMakePlan2d)                                              \
+  __macro(cufftGetSize3d)                                               \
+  __macro(cufftMakePlan3d)                                              \
+  __macro(cufftGetSizeMany)                                             \
+  __macro(cufftMakePlanMany)
+
+// clang-format on
 
 CUFFT_ROUTINE_EACH(STREAM_EXECUTOR_CUFFT_WRAP)
+#undef CUFFT_ROUTINE_EACH
 
 }  // namespace wrap
 
diff --git a/tensorflow/stream_executor/cuda/cuda_helpers.h b/tensorflow/stream_executor/cuda/cuda_helpers.h
index d55706c66a9b47abfe125eaaa09e4b0cc543622a..dc0dc694cdc6001341514c02cef38178b25338aa 100644
--- a/tensorflow/stream_executor/cuda/cuda_helpers.h
+++ b/tensorflow/stream_executor/cuda/cuda_helpers.h
@@ -25,7 +25,6 @@ limitations under the License.
 #include <complex>
 
 #include "cuda/include/cuComplex.h"
-#include "cuda/include/cuda.h"
 
 namespace stream_executor {
 
diff --git a/tensorflow/stream_executor/cuda/cuda_rng.cc b/tensorflow/stream_executor/cuda/cuda_rng.cc
index 88c4f15792737aac8dfafefba4c7fce74c434320..7f920719321637360fdf5c098e83dfaa49164e6c 100644
--- a/tensorflow/stream_executor/cuda/cuda_rng.cc
+++ b/tensorflow/stream_executor/cuda/cuda_rng.cc
@@ -21,6 +21,11 @@ limitations under the License.
 #include "tensorflow/stream_executor/cuda/cuda_platform_id.h"
 #include "tensorflow/stream_executor/cuda/cuda_stream.h"
 #include "tensorflow/stream_executor/device_memory.h"
+
+#ifndef PLATFORM_GOOGLE
+#include "tensorflow/stream_executor/dso_loader.h"
+#endif
+
 #include "tensorflow/stream_executor/lib/env.h"
 #include "tensorflow/stream_executor/lib/initialize.h"
 #include "tensorflow/stream_executor/lib/status.h"
@@ -61,6 +66,7 @@ PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuRandPlugin);
 
 namespace wrap {
 
+#ifdef PLATFORM_GOOGLE
 #define STREAM_EXECUTOR_CURAND_WRAP(__name)                         \
   struct WrapperShim__##__name {                                    \
     template <typename... Args>                                     \
@@ -70,6 +76,36 @@ namespace wrap {
     }                                                               \
   } __name;
 
+#else
+#define STREAM_EXECUTOR_CURAND_WRAP(__name)                               \
+  struct DynLoadShim__##__name {                                          \
+    static const char *kName;                                             \
+    using FuncPtrT = std::add_pointer<decltype(::__name)>::type;          \
+    static void *GetDsoHandle() {                                         \
+      auto s = internal::CachedDsoLoader::GetCurandDsoHandle();           \
+      return s.ValueOrDie();                                              \
+    }                                                                     \
+    static FuncPtrT LoadOrDie() {                                         \
+      void *f;                                                            \
+      auto s = port::Env::Default()->GetSymbolFromLibrary(GetDsoHandle(), \
+                                                          kName, &f);     \
+      CHECK(s.ok()) << "could not find " << kName                         \
+                    << " in curand DSO; dlerror: " << s.error_message();  \
+      return reinterpret_cast<FuncPtrT>(f);                               \
+    }                                                                     \
+    static FuncPtrT DynLoad() {                                           \
+      static FuncPtrT f = LoadOrDie();                                    \
+      return f;                                                           \
+    }                                                                     \
+    template <typename... Args>                                           \
+    curandStatus_t operator()(CUDAExecutor *parent, Args... args) {       \
+      cuda::ScopedActivateExecutorContext sac{parent};                    \
+      return DynLoad()(args...);                                          \
+    }                                                                     \
+  } __name;                                                               \
+  const char *DynLoadShim__##__name::kName = #__name;
+#endif
+
 STREAM_EXECUTOR_CURAND_WRAP(curandCreateGenerator);
 STREAM_EXECUTOR_CURAND_WRAP(curandDestroyGenerator);
 STREAM_EXECUTOR_CURAND_WRAP(curandSetStream);
diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h
index c044a356efb38c333c3153f024092a22fbdf56db..43738d2d1d9a976d72d952969b18b91c51ecad48 100644
--- a/tensorflow/stream_executor/dnn.h
+++ b/tensorflow/stream_executor/dnn.h
@@ -906,9 +906,10 @@ class VersionInfo {
  public:
   VersionInfo(int major = 0, int minor = 0, int patch = 0)
       : major_(major), minor_(minor), patch_(patch) {}
-  int major_version() { return major_; }
-  int minor_version() { return minor_; }
-  int patch() { return patch_; }
+  int major_version() const { return major_; }
+  int minor_version() const { return minor_; }
+  int patch() const { return patch_; }
+
  private:
   int major_;
   int minor_;
diff --git a/tensorflow/stream_executor/logging.proto b/tensorflow/stream_executor/logging.proto
new file mode 100644
index 0000000000000000000000000000000000000000..2c75500cda452f787cb174238058f026a31e4242
--- /dev/null
+++ b/tensorflow/stream_executor/logging.proto
@@ -0,0 +1,19 @@
+syntax = "proto3";
+
+package stream_executor;
+
+message CudnnVersion {
+  int32 major = 1;
+  int32 minor = 2;
+  int32 patch = 3;
+};
+
+message ComputeCapability {
+  int32 major = 1;
+  int32 minor = 2;
+}
+
+message CudaInfo {
+  CudnnVersion cudnn_version = 1;
+  ComputeCapability compute_capability = 2;
+}
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index ed1de5a31cae98bf5855fde0676162f0264d998e..d93e0df5e44eb32145a7f966cc631ceefab7117c 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -1666,8 +1666,7 @@ def tf_py_wrap_cc(
     )
     extra_linkopts = select({
         "@local_config_cuda//cuda:darwin": [
-            "-Wl,-exported_symbols_list",
-            "$(location %s.lds)" % vscriptname,
+            "-Wl,-exported_symbols_list,$(location %s.lds)" % vscriptname,
         ],
         clean_dep("//tensorflow:windows"): [],
         "//conditions:default": [
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-node-def.-experimental-debug-info.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-node-def.-experimental-debug-info.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..73483e2b6e2239dc35b25e2057b75a56ef010c3d
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-node-def.-experimental-debug-info.pbtxt
@@ -0,0 +1,12 @@
+path: "tensorflow.NodeDef.ExperimentalDebugInfo"
+tf_proto {
+  descriptor {
+    name: "ExperimentalDebugInfo"
+    field {
+      name: "original_node_names"
+      number: 1
+      label: LABEL_REPEATED
+      type: TYPE_STRING
+    }
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-node-def.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-node-def.pbtxt
index 646fa8abb9b22dbd908ff821cbe66a33ad02ba64..18548632c9cb1cc227aec6f893bfc487ef2cd864 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-node-def.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-node-def.pbtxt
@@ -33,6 +33,13 @@ tf_proto {
       type: TYPE_MESSAGE
       type_name: ".tensorflow.NodeDef.AttrEntry"
     }
+    field {
+      name: "experimental_debug_info"
+      number: 6
+      label: LABEL_OPTIONAL
+      type: TYPE_MESSAGE
+      type_name: ".tensorflow.NodeDef.ExperimentalDebugInfo"
+    }
     nested_type {
       name: "AttrEntry"
       field {
@@ -52,5 +59,14 @@ tf_proto {
         map_entry: true
       }
     }
+    nested_type {
+      name: "ExperimentalDebugInfo"
+      field {
+        name: "original_node_names"
+        number: 1
+        label: LABEL_REPEATED
+        type: TYPE_STRING
+      }
+    }
   }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-ragged-tensor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-ragged-tensor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c0ed95653552f904acea1cc82bca00773ecb792c
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-ragged-tensor.pbtxt
@@ -0,0 +1,125 @@
+path: "tensorflow.RaggedTensor"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.ragged.ragged_tensor.RaggedTensor\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "flat_values"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "nested_row_splits"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "ragged_rank"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "row_splits"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "values"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'values\', \'row_splits\', \'cached_row_lengths\', \'cached_value_rowids\', \'cached_nrows\', \'internal\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\'], "
+  }
+  member_method {
+    name: "bounding_shape"
+    argspec: "args=[\'self\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "from_nested_row_lengths"
+    argspec: "args=[\'cls\', \'flat_values\', \'nested_row_lengths\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_nested_row_splits"
+    argspec: "args=[\'cls\', \'flat_values\', \'nested_row_splits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_nested_value_rowids"
+    argspec: "args=[\'cls\', \'flat_values\', \'nested_value_rowids\', \'nested_nrows\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "from_row_lengths"
+    argspec: "args=[\'cls\', \'values\', \'row_lengths\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_row_limits"
+    argspec: "args=[\'cls\', \'values\', \'row_limits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_row_splits"
+    argspec: "args=[\'cls\', \'values\', \'row_splits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_row_starts"
+    argspec: "args=[\'cls\', \'values\', \'row_starts\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_sparse"
+    argspec: "args=[\'cls\', \'st_input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_tensor"
+    argspec: "args=[\'cls\', \'tensor\', \'lengths\', \'padding\', \'ragged_rank\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\'], "
+  }
+  member_method {
+    name: "from_value_rowids"
+    argspec: "args=[\'cls\', \'values\', \'value_rowids\', \'nrows\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "nested_row_lengths"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "nrows"
+    argspec: "args=[\'self\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int64\'>\", \'None\'], "
+  }
+  member_method {
+    name: "row_lengths"
+    argspec: "args=[\'self\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
+  }
+  member_method {
+    name: "row_limits"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "row_starts"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "to_list"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "to_sparse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "to_tensor"
+    argspec: "args=[\'self\', \'default_value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "value_rowids"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "with_flat_values"
+    argspec: "args=[\'self\', \'new_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "with_values"
+    argspec: "args=[\'self\', \'new_values\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-variable.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-variable.pbtxt
index af7fc9d4efebc62c282bb82f8a71cd0f5cdfb827..62d8ea9208f7f5f031b80be168cedfd538f18a22 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-variable.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-variable.pbtxt
@@ -64,6 +64,10 @@ tf_class {
     name: "assign_sub"
     argspec: "args=[\'self\', \'delta\', \'use_locking\', \'name\', \'read_value\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'True\'], "
   }
+  member_method {
+    name: "batch_scatter_update"
+    argspec: "args=[\'self\', \'sparse_delta\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
   member_method {
     name: "count_up_to"
     argspec: "args=[\'self\', \'limit\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.autograph.experimental.-feature.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.autograph.experimental.-feature.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a71da113b4ffcaa9ff71e18df4a9263b141b42e6
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.autograph.experimental.-feature.pbtxt
@@ -0,0 +1,28 @@
+path: "tensorflow.autograph.experimental.Feature"
+tf_class {
+  is_instance: "<enum \'Feature\'>"
+  member {
+    name: "ALL"
+    mtype: "<enum \'Feature\'>"
+  }
+  member {
+    name: "AUTO_CONTROL_DEPS"
+    mtype: "<enum \'Feature\'>"
+  }
+  member {
+    name: "DECORATORS"
+    mtype: "<enum \'Feature\'>"
+  }
+  member {
+    name: "ERROR_REWRITING"
+    mtype: "<enum \'Feature\'>"
+  }
+  member {
+    name: "LISTS"
+    mtype: "<enum \'Feature\'>"
+  }
+  member {
+    name: "NAME_SCOPES"
+    mtype: "<enum \'Feature\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.autograph.experimental.-verbosity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.autograph.experimental.-verbosity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c4d5b77c0738feb1fa6ea69672ee3fafa51de5be
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.autograph.experimental.-verbosity.pbtxt
@@ -0,0 +1,12 @@
+path: "tensorflow.autograph.experimental.Verbosity"
+tf_class {
+  is_instance: "<enum \'Verbosity\'>"
+  member {
+    name: "BRIEF"
+    mtype: "<enum \'Verbosity\'>"
+  }
+  member {
+    name: "VERBOSE"
+    mtype: "<enum \'Verbosity\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.autograph.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.autograph.experimental.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5747dac7ab201443d1f237415cd280aee672a8ff
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.autograph.experimental.pbtxt
@@ -0,0 +1,11 @@
+path: "tensorflow.autograph.experimental"
+tf_module {
+  member {
+    name: "Feature"
+    mtype: "<class \'enum.EnumMeta\'>"
+  }
+  member {
+    name: "Verbosity"
+    mtype: "<class \'enum.EnumMeta\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.autograph.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.autograph.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..12e23bc0c8fd0831471abcf56bcd8f07d3e6fe57
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.autograph.pbtxt
@@ -0,0 +1,15 @@
+path: "tensorflow.autograph"
+tf_module {
+  member {
+    name: "experimental"
+    mtype: "<type \'module\'>"
+  }
+  member_method {
+    name: "to_code"
+    argspec: "args=[\'entity\', \'recursive\', \'arg_values\', \'arg_types\', \'indentation\', \'experimental_optional_features\', \'experimental_partial_types\'], varargs=None, keywords=None, defaults=[\'True\', \'None\', \'None\', \'  \', \'Feature.ALL\', \'None\'], "
+  }
+  member_method {
+    name: "to_graph"
+    argspec: "args=[\'entity\', \'recursive\', \'arg_values\', \'arg_types\', \'experimental_optional_features\', \'experimental_strip_decorators\', \'experimental_verbose\', \'experimental_partial_types\'], varargs=None, keywords=None, defaults=[\'True\', \'None\', \'None\', \'Feature.ALL\', \'None\', \'Verbosity.BRIEF\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.pbtxt
index f59082baeb21f092783290657d083ca7ef0bbc7b..f7d388d33d050eac2c9f14682bc7068c745a46bc 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.pbtxt
@@ -5,15 +5,15 @@ tf_class {
   is_instance: "<type \'object\'>"
   member {
     name: "output_classes"
-    mtype: "<class \'abc.abstractproperty\'>"
+    mtype: "<type \'property\'>"
   }
   member {
     name: "output_shapes"
-    mtype: "<class \'abc.abstractproperty\'>"
+    mtype: "<type \'property\'>"
   }
   member {
     name: "output_types"
-    mtype: "<class \'abc.abstractproperty\'>"
+    mtype: "<type \'property\'>"
   }
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-dataset-structure.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-dataset-structure.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dcb304f763ea44d0d7314248170e615115b0794c
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-dataset-structure.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.data.experimental.DatasetStructure"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetStructure\'>"
+  is_instance: "<class \'tensorflow.python.data.util.structure.Structure\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'element_structure\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_value"
+    argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_compatible_with"
+    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-nested-structure.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-nested-structure.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b4b066e563cc6196650b1ba561da7c16a80a8656
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-nested-structure.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.data.experimental.NestedStructure"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.util.structure.NestedStructure\'>"
+  is_instance: "<class \'tensorflow.python.data.util.structure.Structure\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'nested_structure\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_value"
+    argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_compatible_with"
+    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-optimization-options.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-optimization-options.pbtxt
index 9ca75828e55cdaeac5a493f49fe4bd963265e9d4..3b7ad64f51f88ae9c860e061db5c1ad6b5f2bcf8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-optimization-options.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-optimization-options.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.data.experimental.ops.optimization_options.OptimizationOptions\'>"
   is_instance: "<class \'tensorflow.python.data.util.options.OptionsBase\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "apply_default_optimizations"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "filter_fusion"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-optional-structure.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-optional-structure.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bf41c1d1d696d94ef9da5fc64272349d1533816e
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-optional-structure.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.data.experimental.OptionalStructure"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.ops.optional_ops.OptionalStructure\'>"
+  is_instance: "<class \'tensorflow.python.data.util.structure.Structure\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'value_structure\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_value"
+    argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_compatible_with"
+    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sparse-tensor-structure.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sparse-tensor-structure.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f97376b328cf34eb04918bec7bacf08d254d8db5
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sparse-tensor-structure.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.data.experimental.SparseTensorStructure"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.util.structure.SparseTensorStructure\'>"
+  is_instance: "<class \'tensorflow.python.data.util.structure.Structure\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'dtype\', \'dense_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_value"
+    argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_compatible_with"
+    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-structure.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-structure.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a99db4542e0deb506d00c00f889299dd22d67e1e
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-structure.pbtxt
@@ -0,0 +1,16 @@
+path: "tensorflow.data.experimental.Structure"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.util.structure.Structure\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "from_value"
+    argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_compatible_with"
+    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-tensor-structure.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-tensor-structure.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f5c8864a9dd98058c659e72ba8059182a666ea39
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-tensor-structure.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.data.experimental.TensorStructure"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.util.structure.TensorStructure\'>"
+  is_instance: "<class \'tensorflow.python.data.util.structure.Structure\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'dtype\', \'shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_value"
+    argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_compatible_with"
+    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt
index b8ba3e341f1faeb353d9e0bfce8b43976d9529e4..a262c0f799634470090eeba90f480f94ac671f87 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt
@@ -12,6 +12,18 @@ tf_module {
     name: "CsvDataset"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "DatasetStructure"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "INFINITE_CARDINALITY"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "NestedStructure"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "OptimizationOptions"
     mtype: "<type \'type\'>"
@@ -20,6 +32,10 @@ tf_module {
     name: "Optional"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "OptionalStructure"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "RandomDataset"
     mtype: "<type \'type\'>"
@@ -28,6 +44,10 @@ tf_module {
     name: "Reducer"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SparseTensorStructure"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "SqlDataset"
     mtype: "<type \'type\'>"
@@ -40,14 +60,26 @@ tf_module {
     name: "StatsOptions"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "Structure"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "TFRecordWriter"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "TensorStructure"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "ThreadingOptions"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "UNKNOWN_CARDINALITY"
+    mtype: "<type \'int\'>"
+  }
   member_method {
     name: "Counter"
     argspec: "args=[\'start\', \'step\', \'dtype\'], varargs=None, keywords=None, defaults=[\'0\', \'1\', \"<dtype: \'int64\'>\"], "
@@ -56,6 +88,10 @@ tf_module {
     name: "bucket_by_sequence_length"
     argspec: "args=[\'element_length_func\', \'bucket_boundaries\', \'bucket_batch_sizes\', \'padded_shapes\', \'padding_values\', \'pad_to_bucket_boundary\', \'no_padding\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'False\', \'False\'], "
   }
+  member_method {
+    name: "cardinality"
+    argspec: "args=[\'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "choose_from_datasets"
     argspec: "args=[\'datasets\', \'choice_dataset\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.pbtxt
index 509bbae8332fe767a34c14a33d5af1855b3ffdac..aa474680592a1a3996ca3db970b814ba167cd801 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.pbtxt
@@ -28,4 +28,12 @@ tf_module {
     name: "experimental"
     mtype: "<type \'module\'>"
   }
+  member_method {
+    name: "make_initializable_iterator"
+    argspec: "args=[\'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "make_one_shot_iterator"
+    argspec: "args=[\'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-mirrored-strategy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-mirrored-strategy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..81224f00a4afdceba88b62192ad157573a7665ff
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-mirrored-strategy.pbtxt
@@ -0,0 +1,142 @@
+path: "tensorflow.distribute.MirroredStrategy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.mirrored_strategy.MirroredStrategy\'>"
+  is_instance: "<class \'tensorflow.python.distribute.distribute_lib.DistributionStrategy\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "between_graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "extended"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_replicas_in_sync"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "parameter_devices"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "require_static_shapes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "should_checkpoint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "should_init"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "should_save_summary"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "worker_devices"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'devices\', \'cross_device_ops\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "batch_reduce"
+    argspec: "args=[\'self\', \'aggregation\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "call_for_each_replica"
+    argspec: "args=[\'self\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "colocate_vars_with"
+    argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "configure"
+    argspec: "args=[\'self\', \'session_config\', \'cluster_spec\', \'task_type\', \'task_id\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "distribute_dataset"
+    argspec: "args=[\'self\', \'dataset_fn\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "experimental_finalize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "experimental_initialize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "experimental_run"
+    argspec: "args=[\'self\', \'fn\', \'input_iterator\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "finalize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "group"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "initialize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "make_dataset_iterator"
+    argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "make_input_fn_iterator"
+    argspec: "args=[\'self\', \'input_fn\', \'replication_mode\'], varargs=None, keywords=None, defaults=[\'InputReplicationMode.PER_WORKER\'], "
+  }
+  member_method {
+    name: "non_slot_devices"
+    argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "read_var"
+    argspec: "args=[\'self\', \'v\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "run_steps_on_dataset"
+    argspec: "args=[\'self\', \'fn\', \'iterator\', \'iterations\', \'initial_loop_values\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
+  }
+  member_method {
+    name: "scope"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "unwrap"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update"
+    argspec: "args=[\'self\', \'var\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "update_config_proto"
+    argspec: "args=[\'self\', \'config_proto\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_non_slot"
+    argspec: "args=[\'self\', \'colocate_with\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "value_container"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-server.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-server.pbtxt
similarity index 96%
rename from tensorflow/tools/api/golden/v2/tensorflow.train.-server.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.distribute.-server.pbtxt
index 9b8f185f5b699e860c6fbb50b8d2912984908982..6c39bf4fc4099a753ceee4de0df990a887d2ab4e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-server.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-server.pbtxt
@@ -1,4 +1,4 @@
-path: "tensorflow.train.Server"
+path: "tensorflow.distribute.Server"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.server_lib.Server\'>"
   is_instance: "<type \'object\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy.pbtxt
index 9eb73d2c0d9069ec4b818abe1825503f0ea36fc9..63b6584caf02adce52b90dd74ff63f88003de7c8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy.pbtxt
@@ -74,6 +74,10 @@ tf_class {
     name: "experimental_initialize"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "experimental_run"
+    argspec: "args=[\'self\', \'fn\', \'input_iterator\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "finalize"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.pbtxt
index 4d833b54ba0950b6b2cf40c958829dc2eeb24795..31dc6e071613bfe3d2ea24c65835f09bab90c400 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.distribute.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.pbtxt
@@ -8,6 +8,10 @@ tf_module {
     name: "InputReplicationMode"
     mtype: "<class \'enum.EnumMeta\'>"
   }
+  member {
+    name: "MirroredStrategy"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "ReduceOp"
     mtype: "<class \'enum.EnumMeta\'>"
@@ -16,6 +20,10 @@ tf_module {
     name: "ReplicaContext"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "Server"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Strategy"
     mtype: "<type \'type\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.dtypes.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.dtypes.pbtxt
index 848fc303aa5748348b2aee69ec1e869807327d3d..01b870a81639807489ec2a09dcc185137aae1665 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.dtypes.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.dtypes.pbtxt
@@ -44,6 +44,10 @@ tf_module {
     name: "half"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
   }
+  member {
+    name: "int16"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
   member {
     name: "int32"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint-saver-hook.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-checkpoint-saver-hook.pbtxt
similarity index 96%
rename from tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint-saver-hook.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.-checkpoint-saver-hook.pbtxt
index c3037baa8c951ecd9b60267ee7cc8674ead88dbe..f9e1504b494e3863f770df23f9f9a92e004b8713 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint-saver-hook.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-checkpoint-saver-hook.pbtxt
@@ -1,4 +1,4 @@
-path: "tensorflow.train.CheckpointSaverHook"
+path: "tensorflow.estimator.CheckpointSaverHook"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.CheckpointSaverHook\'>"
   is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-checkpoint-saver-listener.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-checkpoint-saver-listener.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..111b7583f2cd005912c7f06d977565cd17f265b8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-checkpoint-saver-listener.pbtxt
@@ -0,0 +1,24 @@
+path: "tensorflow.estimator.CheckpointSaverListener"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.CheckpointSaverListener\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "after_save"
+    argspec: "args=[\'self\', \'session\', \'global_step_value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_save"
+    argspec: "args=[\'self\', \'session\', \'global_step_value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\', \'global_step_value\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-feed-fn-hook.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-feed-fn-hook.pbtxt
similarity index 96%
rename from tensorflow/tools/api/golden/v2/tensorflow.train.-feed-fn-hook.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.-feed-fn-hook.pbtxt
index 7bec4d032cedc0711ca07049d5d04490e8bc3f30..f24de493f24a363190cd1d323adaa75b32b0d8e3 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-feed-fn-hook.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-feed-fn-hook.pbtxt
@@ -1,4 +1,4 @@
-path: "tensorflow.train.FeedFnHook"
+path: "tensorflow.estimator.FeedFnHook"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.FeedFnHook\'>"
   is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-final-ops-hook.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-final-ops-hook.pbtxt
similarity index 96%
rename from tensorflow/tools/api/golden/v2/tensorflow.train.-final-ops-hook.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.-final-ops-hook.pbtxt
index 31cf9aaeb2c640f8db205c0753f20acc75338fe0..6651170ba33f491d5a5342bcd6e6814e1b973832 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-final-ops-hook.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-final-ops-hook.pbtxt
@@ -1,4 +1,4 @@
-path: "tensorflow.train.FinalOpsHook"
+path: "tensorflow.estimator.FinalOpsHook"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.FinalOpsHook\'>"
   is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-global-step-waiter-hook.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-global-step-waiter-hook.pbtxt
similarity index 95%
rename from tensorflow/tools/api/golden/v2/tensorflow.train.-global-step-waiter-hook.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.-global-step-waiter-hook.pbtxt
index 147448618e2df9f71ac794e369b108629e10ce0a..37db48bc64e2f0e955105e8094d51c851c25558b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-global-step-waiter-hook.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-global-step-waiter-hook.pbtxt
@@ -1,4 +1,4 @@
-path: "tensorflow.train.GlobalStepWaiterHook"
+path: "tensorflow.estimator.GlobalStepWaiterHook"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.GlobalStepWaiterHook\'>"
   is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-logging-tensor-hook.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-logging-tensor-hook.pbtxt
similarity index 96%
rename from tensorflow/tools/api/golden/v2/tensorflow.train.-logging-tensor-hook.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.-logging-tensor-hook.pbtxt
index 9801c05df181ee65cc8ce0ad2e886566c0145fd5..425f0167a161104891c3bb76816fe8c5094de28a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-logging-tensor-hook.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-logging-tensor-hook.pbtxt
@@ -1,4 +1,4 @@
-path: "tensorflow.train.LoggingTensorHook"
+path: "tensorflow.estimator.LoggingTensorHook"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.LoggingTensorHook\'>"
   is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-nan-loss-during-training-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-nan-loss-during-training-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6cf6e17e4352b0f909b31327a57bbdca3bc0e02a
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-nan-loss-during-training-error.pbtxt
@@ -0,0 +1,12 @@
+path: "tensorflow.estimator.NanLossDuringTrainingError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.NanLossDuringTrainingError\'>"
+  is_instance: "<type \'exceptions.RuntimeError\'>"
+  member {
+    name: "args"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-nan-tensor-hook.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-nan-tensor-hook.pbtxt
similarity index 96%
rename from tensorflow/tools/api/golden/v2/tensorflow.train.-nan-tensor-hook.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.-nan-tensor-hook.pbtxt
index 7d1c89f9b37b5e63ecf2cf766986cb8faa5872c4..82293c2c0c4e7204d9aba83f43ed2fac6bc46b19 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-nan-tensor-hook.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-nan-tensor-hook.pbtxt
@@ -1,4 +1,4 @@
-path: "tensorflow.train.NanTensorHook"
+path: "tensorflow.estimator.NanTensorHook"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.NanTensorHook\'>"
   is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-profiler-hook.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-profiler-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..65b5fb16b0874e7c6469ef11420db146be1f0b5f
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-profiler-hook.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.estimator.ProfilerHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.ProfilerHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'save_steps\', \'save_secs\', \'output_dir\', \'show_dataflow\', \'show_memory\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'\', \'True\', \'False\'], "
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-second-or-step-timer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-second-or-step-timer.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..64051d2bd6b69614cd210d902552ddeb8b6c8e5e
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-second-or-step-timer.pbtxt
@@ -0,0 +1,26 @@
+path: "tensorflow.estimator.SecondOrStepTimer"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.SecondOrStepTimer\'>"
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks._HookTimer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'every_secs\', \'every_steps\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "last_triggered_step"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "should_trigger_for_step"
+    argspec: "args=[\'self\', \'step\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_last_triggered_step"
+    argspec: "args=[\'self\', \'step\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-sparse-tensor-value.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-session-run-args.pbtxt
similarity index 52%
rename from tensorflow/tools/api/golden/v2/tensorflow.-sparse-tensor-value.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.-session-run-args.pbtxt
index d33fd4d5d7b6b3e2eb7454b5326d993c139f0490..b375c7429469d2a8b89d1bcd048599d6478624ae 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.-sparse-tensor-value.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-session-run-args.pbtxt
@@ -1,17 +1,18 @@
-path: "tensorflow.SparseTensorValue"
+path: "tensorflow.estimator.SessionRunArgs"
 tf_class {
-  is_instance: "<class \'tensorflow.python.framework.sparse_tensor.SparseTensorValue\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunArgs\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunArgs\'>"
   is_instance: "<type \'tuple\'>"
   member {
-    name: "dense_shape"
+    name: "feed_dict"
     mtype: "<type \'property\'>"
   }
   member {
-    name: "indices"
+    name: "fetches"
     mtype: "<type \'property\'>"
   }
   member {
-    name: "values"
+    name: "options"
     mtype: "<type \'property\'>"
   }
   member_method {
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-session-run-context.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-session-run-context.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cb4ac9f50ec9aa9d6531a16ebb48a9223cbc5188
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-session-run-context.pbtxt
@@ -0,0 +1,25 @@
+path: "tensorflow.estimator.SessionRunContext"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunContext\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "original_args"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "session"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "stop_requested"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'original_args\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "request_stop"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-session-run-hook.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-session-run-hook.pbtxt
similarity index 95%
rename from tensorflow/tools/api/golden/v2/tensorflow.train.-session-run-hook.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.-session-run-hook.pbtxt
index db1aa24acf0e295b4b787eef68250401dd6a6e27..54e9ad9ed44b64e2c1c49b5ade4c7d3bb35563de 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-session-run-hook.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-session-run-hook.pbtxt
@@ -1,4 +1,4 @@
-path: "tensorflow.train.SessionRunHook"
+path: "tensorflow.estimator.SessionRunHook"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
   is_instance: "<type \'object\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-session-run-values.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-session-run-values.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..678814169635bfa9997db26df23acc79c2d84881
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-session-run-values.pbtxt
@@ -0,0 +1,27 @@
+path: "tensorflow.estimator.SessionRunValues"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunValues\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunValues\'>"
+  is_instance: "<type \'tuple\'>"
+  member {
+    name: "options"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "results"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "run_metadata"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "count"
+  }
+  member_method {
+    name: "index"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-step-counter-hook.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-step-counter-hook.pbtxt
similarity index 96%
rename from tensorflow/tools/api/golden/v2/tensorflow.train.-step-counter-hook.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.-step-counter-hook.pbtxt
index 13261f6dde1cf8e6fd228950600303370947b7ea..4368e04df3f86834b540bb5306bf66dd82ac440c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-step-counter-hook.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-step-counter-hook.pbtxt
@@ -1,4 +1,4 @@
-path: "tensorflow.train.StepCounterHook"
+path: "tensorflow.estimator.StepCounterHook"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.StepCounterHook\'>"
   is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-stop-at-step-hook.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-stop-at-step-hook.pbtxt
similarity index 96%
rename from tensorflow/tools/api/golden/v2/tensorflow.train.-stop-at-step-hook.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.-stop-at-step-hook.pbtxt
index e388599b0bf63379fa95a3276e3f4859eab86d6d..938b189a8c30237bb15bf73083a348e6366fbfc4 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-stop-at-step-hook.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-stop-at-step-hook.pbtxt
@@ -1,4 +1,4 @@
-path: "tensorflow.train.StopAtStepHook"
+path: "tensorflow.estimator.StopAtStepHook"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.StopAtStepHook\'>"
   is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-summary-saver-hook.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-summary-saver-hook.pbtxt
similarity index 96%
rename from tensorflow/tools/api/golden/v2/tensorflow.train.-summary-saver-hook.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.-summary-saver-hook.pbtxt
index 697c3667b09f42f208dec38938f5a1ce0cc09029..104157315f5982efb4f6b9f39e0ece905a225e10 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-summary-saver-hook.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-summary-saver-hook.pbtxt
@@ -1,4 +1,4 @@
-path: "tensorflow.train.SummarySaverHook"
+path: "tensorflow.estimator.SummarySaverHook"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.SummarySaverHook\'>"
   is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.experimental.-in-memory-evaluator-hook.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.experimental.-in-memory-evaluator-hook.pbtxt
index aba120218cc599039a501d6b2a6e754ae3ea5b5e..5a2a01cd5325ba7e02d9b549293dd09a4a57e167 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.experimental.-in-memory-evaluator-hook.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.experimental.-in-memory-evaluator-hook.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.estimator.experimental.InMemoryEvaluatorHook"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.hooks.InMemoryEvaluatorHook\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.hooks.hooks.InMemoryEvaluatorHook\'>"
   is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
   is_instance: "<type \'object\'>"
   member_method {
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.experimental.pbtxt
index 2a9a0346d72d29524d697e53ae2872608c837caf..f0fd7ce782db71ff5e790fe50e93556bf5d19e1e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.experimental.pbtxt
@@ -28,6 +28,10 @@ tf_module {
     name: "make_early_stopping_hook"
     argspec: "args=[\'estimator\', \'should_stop_fn\', \'run_every_secs\', \'run_every_steps\'], varargs=None, keywords=None, defaults=[\'60\', \'None\'], "
   }
+  member_method {
+    name: "make_stop_at_checkpoint_step_hook"
+    argspec: "args=[\'estimator\', \'last_step\', \'wait_after_file_check_secs\'], varargs=None, keywords=None, defaults=[\'30\'], "
+  }
   member_method {
     name: "stop_if_higher_hook"
     argspec: "args=[\'estimator\', \'metric_name\', \'threshold\', \'eval_dir\', \'min_steps\', \'run_every_secs\', \'run_every_steps\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'60\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.pbtxt
index c5b0085b8d3ec58b4215d4a756957e1509501841..6f57505afe84f3982a8beb402783f35b3e699241 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.pbtxt
@@ -24,6 +24,14 @@ tf_module {
     name: "BoostedTreesRegressor"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "CheckpointSaverHook"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CheckpointSaverListener"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "DNNClassifier"
     mtype: "<type \'type\'>"
@@ -64,10 +72,22 @@ tf_module {
     name: "Exporter"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "FeedFnHook"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "FinalExporter"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "FinalOpsHook"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "GlobalStepWaiterHook"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "LatestExporter"
     mtype: "<type \'type\'>"
@@ -84,14 +104,62 @@ tf_module {
     name: "LinearRegressor"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "LoggingTensorHook"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "ModeKeys"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "NanLossDuringTrainingError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "NanTensorHook"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "ProfilerHook"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "RunConfig"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SecondOrStepTimer"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SessionRunArgs"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SessionRunContext"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SessionRunHook"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SessionRunValues"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "StepCounterHook"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "StopAtStepHook"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SummarySaverHook"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "TrainSpec"
     mtype: "<type \'type\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.io.gfile.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.io.gfile.pbtxt
index 93d9b0fd75b53e6b15e34506e698855903b5be5a..cfa3372b12bfe32eed4311c89b6448c0359c0913 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.io.gfile.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.io.gfile.pbtxt
@@ -46,6 +46,6 @@ tf_module {
   }
   member_method {
     name: "walk"
-    argspec: "args=[\'top\', \'topdown\', \'onerror\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'top\', \'topdown\', \'onerror\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt
index 9dc8daea5c4e8e6293b2427add50ad4ebfbc264e..eced2e1cb0706153a9bfc2749847395d194fcb56 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -163,11 +167,11 @@ tf_class {
   }
   member_method {
     name: "evaluate"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "evaluate_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "fit"
@@ -231,16 +235,20 @@ tf_class {
   }
   member_method {
     name: "predict"
-    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "predict_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "predict_on_batch"
     argspec: "args=[\'self\', \'x\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "reset_metrics"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "reset_states"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -263,7 +271,7 @@ tf_class {
   }
   member_method {
     name: "test_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], "
   }
   member_method {
     name: "to_json"
@@ -275,6 +283,6 @@ tf_class {
   }
   member_method {
     name: "train_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt
index a357a825153528bdff75e9f73ec8d99545d71120..2acb90173f3242e8a92913728eec84ef5d455d1f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -168,11 +172,11 @@ tf_class {
   }
   member_method {
     name: "evaluate"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "evaluate_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "fit"
@@ -240,7 +244,7 @@ tf_class {
   }
   member_method {
     name: "predict"
-    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "predict_classes"
@@ -248,7 +252,7 @@ tf_class {
   }
   member_method {
     name: "predict_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "predict_on_batch"
@@ -258,6 +262,10 @@ tf_class {
     name: "predict_proba"
     argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\'], varargs=None, keywords=None, defaults=[\'32\', \'0\'], "
   }
+  member_method {
+    name: "reset_metrics"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "reset_states"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -280,7 +288,7 @@ tf_class {
   }
   member_method {
     name: "test_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], "
   }
   member_method {
     name: "to_json"
@@ -292,6 +300,6 @@ tf_class {
   }
   member_method {
     name: "train_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-base-logger.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-base-logger.pbtxt
index 7d298e95135ebf41230d72ff488fef30be682edb..9dbdaf0f5f3db292feb98fe06092b6f7a6b8f034 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-base-logger.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-base-logger.pbtxt
@@ -23,6 +23,38 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_batch_begin"
     argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt
index 133205ab88b47afad32fc70ceca93513768a3b19..a5804d3bbcff401920ddd2b59bd5f094b3e1c628 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt
@@ -17,12 +17,44 @@ tf_class {
   }
   member_method {
     name: "on_epoch_begin"
-    argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'epoch\', \'logs\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'train\'], "
   }
   member_method {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_batch_begin"
     argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-callback.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-callback.pbtxt
index d766c09ac5efaa9d0e4ffba4e495385130c7e770..bbc02c4d71f835497be74e771c5ae57682f5a5b3 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-callback.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-callback.pbtxt
@@ -16,11 +16,43 @@ tf_class {
   }
   member_method {
     name: "on_epoch_begin"
-    argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'epoch\', \'logs\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'train\'], "
   }
   member_method {
     name: "on_epoch_end"
-    argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'epoch\', \'logs\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'train\'], "
+  }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "on_train_batch_begin"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-early-stopping.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-early-stopping.pbtxt
index 605f74e5602a63f5a18c31cb26113d300ec76e7a..6182baf0a31e7027b685561fed5eeedc54a766a3 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-early-stopping.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-early-stopping.pbtxt
@@ -21,12 +21,44 @@ tf_class {
   }
   member_method {
     name: "on_epoch_begin"
-    argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'epoch\', \'logs\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'train\'], "
   }
   member_method {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_batch_begin"
     argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-history.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-history.pbtxt
index cd893e67269164781d6a6b6294a199014d40fed8..9b1b068e225a5dae69672ecba70bdea48c6e6ae6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-history.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-history.pbtxt
@@ -17,12 +17,44 @@ tf_class {
   }
   member_method {
     name: "on_epoch_begin"
-    argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'epoch\', \'logs\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'train\'], "
   }
   member_method {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_batch_begin"
     argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-lambda-callback.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-lambda-callback.pbtxt
index 50f2054cabb1b8f6c46a9537ea923a18f87e5c80..92440188c81ee192df332cd89256233591b2d281 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-lambda-callback.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-lambda-callback.pbtxt
@@ -17,11 +17,43 @@ tf_class {
   }
   member_method {
     name: "on_epoch_begin"
-    argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'epoch\', \'logs\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'train\'], "
   }
   member_method {
     name: "on_epoch_end"
-    argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'epoch\', \'logs\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'train\'], "
+  }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "on_train_batch_begin"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt
index 9ed9db0a89b49b88098e15baca414ff78b6f10e6..a04ffb92eb9e32b2473355f140d68537b80074df 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt
@@ -23,6 +23,38 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_batch_begin"
     argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-model-checkpoint.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-model-checkpoint.pbtxt
index 3d8d1363bb4e4de818788efbf3c997594350006a..c10c236ad1990160be53ba5df7afeb64619bf260 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-model-checkpoint.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-model-checkpoint.pbtxt
@@ -17,12 +17,44 @@ tf_class {
   }
   member_method {
     name: "on_epoch_begin"
-    argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'epoch\', \'logs\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'train\'], "
   }
   member_method {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_batch_begin"
     argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-progbar-logger.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-progbar-logger.pbtxt
index 5012f1517d57dd646d82ab669cb279b6363dd6ec..624f856d2752e1f375154664a892d6c1d600ecbd 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-progbar-logger.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-progbar-logger.pbtxt
@@ -23,6 +23,38 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_batch_begin"
     argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt
index 73652c2b61259f768eca76b995ae4592df868392..0db6b8d371b61db6fa565a93416dfc14eeae1d47 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt
@@ -21,12 +21,44 @@ tf_class {
   }
   member_method {
     name: "on_epoch_begin"
-    argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'epoch\', \'logs\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'train\'], "
   }
   member_method {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_batch_begin"
     argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-remote-monitor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-remote-monitor.pbtxt
index 24db71de1182d58b78fec0419aa9cb48a2e315d2..dac2049fe19426738368009822ce2dac8bc64467 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-remote-monitor.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-remote-monitor.pbtxt
@@ -17,12 +17,44 @@ tf_class {
   }
   member_method {
     name: "on_epoch_begin"
-    argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'epoch\', \'logs\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'train\'], "
   }
   member_method {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_batch_begin"
     argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-tensor-board.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-tensor-board.pbtxt
index c5503c69a5f3cb6765c984778c0e3626369ee815..2e0f77eda85780cec26b103ba11276ccdfd90189 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-tensor-board.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-tensor-board.pbtxt
@@ -23,6 +23,38 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_batch_begin"
     argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt
index de6e8ef072558e6d926ea125aa5056e3c229d37f..2834b74e8afbd5ee01eb77b8b14e75fc4e50f230 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt
@@ -17,11 +17,43 @@ tf_class {
   }
   member_method {
     name: "on_epoch_begin"
-    argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'epoch\', \'logs\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'train\'], "
   }
   member_method {
     name: "on_epoch_end"
-    argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'epoch\', \'logs\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'train\'], "
+  }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "on_train_batch_begin"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
index 1d814b2c8b553f1b2a07f9d9b97dc70ec0674969..b2ab5006dc4ac3571b4f9d01607adb6aa2c0be26 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.pbtxt
index 164edbd66ab2487a980155eabcf18ed8446e2c14..5cd6851278dce8ef45c90112176be94b9c45dc91 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.pbtxt
@@ -4,4 +4,12 @@ tf_module {
     name: "PeepholeLSTMCell"
     mtype: "<type \'type\'>"
   }
+  member_method {
+    name: "export"
+    argspec: "args=[\'model\', \'saved_model_path\', \'custom_objects\', \'as_text\', \'input_signature\', \'serving_only\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\'], "
+  }
+  member_method {
+    name: "load_from_saved_model"
+    argspec: "args=[\'saved_model_path\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activation.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activation.pbtxt
index b84629540e700f242f885064c92309c294693a11..da212382c1a6a3c5d37afbd1ac895249b566a913 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activation.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activation.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activity-regularization.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activity-regularization.pbtxt
index 5918a13ad8629582829049485e896688ecad9579..c910db027e69f3ca21495c968ebeae691711c316 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activity-regularization.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activity-regularization.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-add.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-add.pbtxt
index 599da06427dfe4f28e757a7aac8d8a14856a4556..8b7b33e98ce2673ffb5dcf951a8cd6a684d847af 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-add.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-add.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-alpha-dropout.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-alpha-dropout.pbtxt
index f9ff1538c8134d96051ad81d35c73e59c6a8cc57..5e3e41ba205c70413b7d015141b92c206ea26f32 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-alpha-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-alpha-dropout.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt
index 723fc9cdb0d0ad93470e22fd8c147d3ecc92af91..e160b1015380fcf9f3a7a8f4a41df6877cbf9246 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling2-d.pbtxt
index 957ce2f0ce86f8df3eb8b57606229fb661eb52f7..b6b71358c869ff6210e9a704f79cbd63970b5dcd 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling2-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling3-d.pbtxt
index a52c0af68175420dc2a1993d1f025d36705538e1..5c5ab1580eb3d6ce02498b1bc42aefc39784abf4 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling3-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average.pbtxt
index a004db62ddcaaae02a411d8db51f4026ece1384d..489de2e4d31d8c631ea11f8a50c91498a70fa308 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt
index 44f83d1387cb2ec681f50f7b1f0297f3f74594ed..30fec249b838350ac4ef542dd0f1969b0ddd7588 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool2-d.pbtxt
index 8378faf7188ec594865d4b68c8ea8cae284183ca..0e983c9234597a17c6c9342eaa3b3a26158736fa 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool2-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool3-d.pbtxt
index 9d5655c9644e3a2394a346bed78fc478cf60ba8d..ec50db71279b5e688ef36558941071fbba3c02f4 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool3-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-batch-normalization.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-batch-normalization.pbtxt
index b3d3c84f92e6491601f670739b2b45f79313e8f5..cbbb000e25669a6a77c90c371d999983274e48bf 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-batch-normalization.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-batch-normalization.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-bidirectional.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-bidirectional.pbtxt
index d37a6b47105225d7b83b6a264b944ceeb583a6c4..23153d42847ad6015ccc347b70d35b7f3b83dc03 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-bidirectional.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-bidirectional.pbtxt
@@ -17,6 +17,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-concatenate.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-concatenate.pbtxt
index 1ad7a91be0ba48d0dbab19da8c7cd9ca89095918..766c3f267f97f19d87cc39a24ae90dca796b4988 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-concatenate.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-concatenate.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
index cb9abc25396bb63a3c40de5cc52f9df7ed20071e..898098227190498a5a752a493e3d9bccb431bf15 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
@@ -42,6 +42,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "filters"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt
index 47dba1d81f8f97a60fe72ec521f82a78ee5f3505..a74b8d29502f0493a99b16d8fdeccf77e205be0e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
index fd649418961301f150aac3dabc1bdf0ade4a9c28..b093f8ead94199ba2a4861d0453ff5248b2d7fb0 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt
index 1b1425d53197db8b59abf51fe93c0b0c45299956..0ce9f6fd591f127eb2874397abce21e8451ba3b4 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
index 1741063fe8b09acf3865e0a135e96bb715dcdcfa..c1f5bfae0d35683e4e718a73add8f57be9473c72 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt
index 50feb4f458ad1a9cb2b2bfe5d67997b7551eed74..4aa872c4f04c2f0a3cddc83bc7c64700cb97ca3c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt
index faaa535df9fe03ad07862f0793f8ebea67b405ca..6e01f7c70c9987cfa651078175927edfaf1fd6ad 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
index 4079329d1ee2a61270fee38426bb8a0859c38ce3..c002042d7703bccb0af37cebe453803c9e9009e3 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt
index 32e56696e1617f7810792e3416a2ebb2037d23c2..f5e5446d2b995c9ba2707cc16376e8c639576c76 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
index 381abe73401fa3a588873d643324fc020c159e30..d5f36f4bc3d1b517a7f2dfaf3fed490df66a5fba 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt
index b3e4bf9689dc7e9db63de7f43e9dfa9ac4d42b02..346fec6056380842f4d5f40833cea82a540c088d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping1-d.pbtxt
index 7aeff8003c322e8a8168dd70481a8b30b08762a8..0f8fe9f05e0f577dca9e1f3225f3e14074fefa12 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping1-d.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping2-d.pbtxt
index a1728d9d4f9a1e677646db04c4d0df9572e21208..68fb7382a71cc0e3215daa43e2f1ea0f6de26e16 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping2-d.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping3-d.pbtxt
index 8d8fd142cc64ee113c4b6a7e4e2462ecc69b6028..deda82f9b3f020589c9673e9070ec40713846b7f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping3-d.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
index 7758209adf8fe7a1306fa5ef125935dafd925c3e..2eba3fb95450568a8e1611dda2564b764565cb3c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
@@ -18,6 +18,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
index 7c463ff1257599366be049edce6cc06140906286..6ed13d37f2b511e09d3dbd4ade0ca29088e565ae 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
@@ -18,6 +18,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense.pbtxt
index 4960d0264e96e872ea5c49a8841cef20bd5eb37c..919aed5723c0464b8540ba1cfb971bb23bfef73a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
index 8fad7535f882718462a11e27e75732e3097cb87d..f590ce1ef71200854f62baf3c8746deefbaf8e46 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dot.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dot.pbtxt
index 5b425f2d4d7a8a897280490e26922766d8bf7065..db4261fadc76e2d953d477c472adcb422d48105e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dot.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dot.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dropout.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dropout.pbtxt
index f6c4d0a438ed027635b40ec992eb1bbcb5c9a3a1..7369552b3b8733b6c586888c643c9596bebcdded 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dropout.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-e-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-e-l-u.pbtxt
index 82b761fc1761bb3e7638f7a80bc80c6433162d04..f643ef9de28eed6756073d84553a4986fb0d338f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-e-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-e-l-u.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-embedding.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-embedding.pbtxt
index c9ff323877e06b6dff274644744d425e3a9b7932..ce053ae8c44353815d9f6872d1f8ab72ec93c4f0 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-embedding.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-embedding.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-flatten.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-flatten.pbtxt
index 9b4165d4cbf88fefd2bb684dae70ea8afc01357b..db9504307798cf5e51a28469a3df669dd77dc0b8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-flatten.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-flatten.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u-cell.pbtxt
index f225f7c4309615919fb05df05f2ae664bde80097..a6edba6b7efc631cc1057a8ddb7d4af19142ac6d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u-cell.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u.pbtxt
index 855d001700179fb634d1dff78585d340420abe7f..f8c0dbb27364db34f8357460376cde555a5a0063 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u.pbtxt
@@ -33,6 +33,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "implementation"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-dropout.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-dropout.pbtxt
index 2c404c99cd2175cdc8b60b229e4410bf280ebcb7..ac4bbe7d19625bdf1b11f8c3dfc9bdf1ad5eaaf4 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-dropout.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-noise.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-noise.pbtxt
index 6f109d59d0f6fcd2b4650719e3b4f653baec7d23..947e3170aea6cfb26e6604f1ef950293fa4cf4ad 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-noise.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-noise.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
index 69f8a9031d32eb73bb44291cdf330d738d745cf9..17e202c5812f633e430a821dac5f424ae587ad47 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
index 4299f765e525b136e289bba169becec06e19ffb1..9772c5df9b525576a2b9702f238fc7d309b7561e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
index 9153a1a2406b6fc4ab60c80fee2f8d6d69b00b72..cd65075591d151c0e6538588af932a6cdab5c90c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
index 625e81fd2322ceba153fa65c138948ce43843089..0423de7a248c17b1232ea5b9689578f2d824cbdc 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
index 2fc769742c70c5665c9cb77ad246fcdb49366d5a..4471cba245469c636419209084d624d2138fd4d4 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
index e307a65c7c565660e1f2b6b6b74dc5970425eaa4..c0e7fae4564f2f253df4377076b0ec64cf2b5cab 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
index 4394ad0364e89fd3531d6625e52540991cadf973..6975a6e88d8822f5a817d4a178ab15104799b91a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
index 050ed39fe98dc7cfdf6febe45e235d3ae7cbf486..56bd70db7e18f61f8af8cd9f9d4439b544d5b380 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
index 436191821ef4689351b6124cf2a20afad917e4ab..656319920ec34891e22b7145da1f80f787681572 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
index 4ba540aa6adc72b572aa9340f89967d69ab78a3c..f815e669115eb21ca2f23909d6a36ede278ccbd2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
index a2e9322cb3fd4e56af708d5c4e17b660f7bc2247..f61f0e521b7962bbef1c916a5aa79c43e8ce4019 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
index 5d16a57fc1aeff9939220de8043fcae39e3d953e..c58c8ce63f50b6d5f2dc3428fd50726ddee720c6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-layer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-layer.pbtxt
index 9dd29c1251ef2eacaf535a3f10f3d42dc36624a2..0efe9a4297960644c20d16f097e816046bb2672c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-layer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-layer.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
index 0045d5775e2c19df21428bd4420b6e5612c8002b..5caa02e71a10d92a3c0d68f20628b5391f80e260 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m.pbtxt
index 529c750f98715ec30313ed34c9023a845061a3df..f21c7e5b217cb5e3e9a8c30c31b6a0615d7d73b8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m.pbtxt
@@ -33,6 +33,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "implementation"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt
index d4d1bc6b6bbf0ce39742b740aff6dc0c1cd464a1..381d5660b9846b9f2b90f630d724fb0561d6ca94 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer.pbtxt
index e1f5491180903f7d6931cc09755cabb715bbf233..36b0a86628b92c84c227eb59d55c9e9a12be053c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer.pbtxt
@@ -11,6 +11,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -81,7 +85,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'trainable\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'trainable\', \'name\', \'dtype\', \'dynamic\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\', \'None\', \'False\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-leaky-re-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
index 9b69d9a9447f42907236b5cc8c7672012f96c38a..b41662e63a8f9273062256ef7ee100d70900e22a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected1-d.pbtxt
index fd52259432577ac94dc702d4411ad5c0eed1ff10..e4abfca91363887b9574b76894da24c9700102cf 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected1-d.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected2-d.pbtxt
index 5fc8af0d03564c649dff6e9df70d10731319de40..cfcb92e293c59493c7e57ebdb30ac2f2ab35715b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected2-d.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-masking.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-masking.pbtxt
index 7f8932270e63bc02852c5b64e53694e7e26be08b..e0721353d14d87d2d1e9e204eb9d5b4fe5902b3f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-masking.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-masking.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt
index 4723b99cb0792e1ce0bdc45e46908da8c2b5359c..0618fbeead02e74e645a2b6be1310f8fd0c00470 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool2-d.pbtxt
index 173c5d4a8b149c4e23683cf375e8d793db7faa5a..4af52ffec80937d32e8cc0e0b128a8db606fd94d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool2-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool3-d.pbtxt
index 14e1899e145224e411d65cbf481060a3b2cec0f1..db9311ee58d441908fb5c4ce3d952bafdab9dfcd 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool3-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt
index a708e652bf0e82dea0f58034a81a040a39550dc9..bfb15cb44789d9d8d134a5090bf27abf2f81eda2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling2-d.pbtxt
index e6706b5cf9f32bda78adc4e2db5916a5750cc82e..1db962dbb8c0ac2b0562ecce10354a76d3e74be4 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling2-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling3-d.pbtxt
index a73c082d1bba0453b742f76bacf0ad6116ba79a7..f80d5267e79c4b74831b2b926beb84d479008e10 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling3-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-maximum.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-maximum.pbtxt
index f3f195554bbf4a43efaf2af0fd278a23bf270994..cd772d4ac75e3ea4820a788543e15e3af3566b21 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-maximum.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-maximum.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-minimum.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-minimum.pbtxt
index f345d1d67b2ce0200c64b1aeea5f39821d070bac..2bb6b3073ac79cf475c942b68ac351a18073c689 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-minimum.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-minimum.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-multiply.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-multiply.pbtxt
index 31cb8bc177c7a9e365101e75108a29900fbda124..e1a1f0735524af6d3597dfff9ca64b3e7dbd5e2c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-multiply.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-multiply.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-p-re-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-p-re-l-u.pbtxt
index 44cccc92bd2f1ff0335c22f2967865dc88a96ff7..66c4446572c2ac5930a8a0bc0d5de96e584aa94e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-p-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-p-re-l-u.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-permute.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-permute.pbtxt
index b55e191ff1ad6997550966bbb6154a81a489575d..0839554f434f64cf957b17c8f5863655fb427ee4 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-permute.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-permute.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt
index e9575436e5b14ac8c52a0b59c86937886eab5f40..b10695f6f7965ab7d5dbca7128530348c8758179 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-re-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-re-l-u.pbtxt
index 98223b207f2ecfd5b7af8a53390166e53a7d4f73..b96500f710398514b37b5b6f32fe31c61aa99e44 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-re-l-u.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-repeat-vector.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-repeat-vector.pbtxt
index 2df918b16b2552323d75083bfa80e328c0639cfe..a27d93ec62002a9ade1a012c1bc9f8bc4f05e80f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-repeat-vector.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-repeat-vector.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-reshape.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-reshape.pbtxt
index ce5f9e21290eeddc0052257191ac4a6d068c1366..6dda24d3d27dcdbc88189e377ce20ec64d908dc9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-reshape.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-reshape.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv1-d.pbtxt
index a0bb917775fd9edb5d909bf850310e0596a88209..8a4ae8aaa7b91587c7f4e0a71eae6e0ac8598482 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv1-d.pbtxt
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv2-d.pbtxt
index d7942f201bdbfa8d1577813be461a5905b5c6c90..a083c1da2e3a0a450bb1f39dd12f3270bc49e1f6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv2-d.pbtxt
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
index f7ac9042d46f46ab35d18c62e5d8841679a18ca9..5d5b361f8272d273941e8beb1978d0ec8b406027 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
index e5a92688220f6e227b317d71a70fde01df4c432b..392c338d73d75e2af9b06be86d449d0ac3415c50 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
index 0fe2c974a762784a82a6b97e116357be2a61d84f..1143604903523b286f24cc6ca20b97b68e473593 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n.pbtxt
index 2ee5873f0f11688019dec3a6cd69db06d99b9caa..5a15f1a55fa0ce6db3357ab9a3e69d13846caaf9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n.pbtxt
@@ -33,6 +33,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-softmax.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-softmax.pbtxt
index 5b8f64aa35725d0ea44fc5c5b81952fd839503e7..c470d9c8e8d9281087e347881592c488f46212f7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-softmax.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-softmax.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
index 240cb6e562f77467d94ef95db2374150e318bc04..d17d6495c09b0e43041e85b8eb99d9d47212606d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
index 6226c469f8a534f96f6ea991fa5e7d2cf0019e3f..2d538b4734892b85034974887b7fa7dd024551b8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
index 34dabce6d8dd0b1b6fe50a008a981e1f06a77edf..b70923601aeb843fe663734d45493fa97757915f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
index 0ddf628ace582db259ebe0b211aba6e6362b5d5b..f453ddd50efb193accb2d9105fcaf8a130ca3b3f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-subtract.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-subtract.pbtxt
index 12eb35ad154a514afd9c900cb2dbece8af28c49f..5759169e07d26600a12b086edb8f945735782fed 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-subtract.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-subtract.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
index c41020c2b45cc88c9b63f3b7a45c35066794dfe2..bfde1c35f65a603bd38e1cbab6c2d5eb49ac40f3 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-time-distributed.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-time-distributed.pbtxt
index 479f89cf6ae93e8d6ae02e304a51a145164df7de..e7f59a9cc5143b337e539d28cd6d1ffd691b5e97 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-time-distributed.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-time-distributed.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling1-d.pbtxt
index 233363ce02614f184b43a059889c7475b6a8c50b..0354149d4fea06d489be61391c46e84d8b6c369a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling1-d.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling2-d.pbtxt
index cb6228ac446bd236df88f94eb6e9e717ea38463d..fff0e26bc16b863a1d86d3f735da009cedcaffd9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling2-d.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling3-d.pbtxt
index 03bad3ccb613a225ad56e128ea680fc9312151e1..c49fa5663d91c4601062d7b207ce2257cec6dd2c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling3-d.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-wrapper.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-wrapper.pbtxt
index 158996792a47fab0e7aa26d21d4bb7f281ca76d2..c961699053a4fdd71f8a2782ae463970f243c88e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-wrapper.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding1-d.pbtxt
index 63a56cd3eebe271f66258c9a0acb974764555b34..1911e128eb2d7b0ffe6c4ff7eeb0b4927a731bca 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding1-d.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding2-d.pbtxt
index 965a4cca04651e123c5bd93484200a58b39918ba..88be9143472ad00b12688059600890f67c6f4e92 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding2-d.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding3-d.pbtxt
index 1a624308878a68f1b48cb0f8b5e08dafbbfa0333..2bbb71ece2583d283cedec37b10eda7b693baa0e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding3-d.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-binary-crossentropy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-binary-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2f7da93f6f412ca559aec2f6acde2b80a5c93c86
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-binary-crossentropy.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.BinaryCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.BinaryCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'from_logits\', \'label_smoothing\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-categorical-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b3a7cd80973259bd5cdfe382c656a9478f8933d8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-categorical-crossentropy.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.CategoricalCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.CategoricalCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'from_logits\', \'label_smoothing\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.pbtxt
index c6638bd5e06b154656dc4ea4cb429e7860e7a29d..9e26ddbdca0c45df195dd566952379887dcfcff3 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.pbtxt
@@ -1,5 +1,13 @@
 path: "tensorflow.keras.losses"
 tf_module {
+  member {
+    name: "BinaryCrossentropy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CategoricalCrossentropy"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "MeanAbsoluteError"
     mtype: "<type \'type\'>"
@@ -38,11 +46,11 @@ tf_module {
   }
   member_method {
     name: "binary_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
   member_method {
     name: "categorical_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
   member_method {
     name: "categorical_hinge"
@@ -122,7 +130,7 @@ tf_module {
   }
   member_method {
     name: "sparse_categorical_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
   member_method {
     name: "squared_hinge"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-accuracy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-accuracy.pbtxt
index 2db07df5235e150f691a12d6b332c6d0d241ac19..bad488f59b99ccbe7c6424244c86288afba51f46 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-accuracy.pbtxt
@@ -15,6 +15,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-accuracy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-accuracy.pbtxt
index 904ad3a21a05895b23e30dab82a89a31c74dcfca..a1e7601a5141152c6709c46bb50b331fda69afca 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-accuracy.pbtxt
@@ -15,6 +15,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-accuracy.pbtxt
index 17b74924fab4f596a010d6b9731b474433a8153e..5f2c2f980777a34ed5128d8090ea7e945d9004e7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-accuracy.pbtxt
@@ -15,6 +15,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-negatives.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-negatives.pbtxt
index 49f577e1367aece126449923f77f4f6c89493e99..c153e9cf4d7932b1e4bf65bd02b8de2706d4b8be 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-negatives.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-negatives.pbtxt
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-positives.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-positives.pbtxt
index e8baf858669a446a11b44e044f36bfde61e440bb..aae2bd99886fbe93086186864eb6040437b872d5 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-positives.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-positives.pbtxt
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean.pbtxt
index 40fe64bbd2cec45b9a8c4e9b041d3fa858af1327..904a2fa9caee882775701c53a97c9aac0fd8120e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-precision.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-precision.pbtxt
index ae6a85026da80cd071984aede8d0ec4e9cd571c5..e81ecfe3f627f9d43ad1c673d41b70e81c783f13 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-precision.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-precision.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-recall.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-recall.pbtxt
index 31068a51d510a7b95f62f61f03d37176c0fca55d..f8470b94d7f52216d1c1e4342acabb404bbd8f74 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-recall.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-recall.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.metrics.-false-negatives.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt
similarity index 92%
rename from tensorflow/tools/api/golden/v1/tensorflow.metrics.-false-negatives.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt
index 33226a2df62bd69017c3f54020629d5429e39c06..b70ef32bcaf3cb243d5d22d93cdbd8188f56d4df 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.metrics.-false-negatives.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt
@@ -1,7 +1,7 @@
-path: "tensorflow.metrics.FalseNegatives"
+path: "tensorflow.keras.metrics.SensitivityAtSpecificity"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.metrics.FalseNegatives\'>"
-  is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.SensitivityAtSpecificity\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.SensitivitySpecificityBase\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -84,7 +88,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'specificity\', \'num_thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'200\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt
index 0c17452292a031d42f3da0d5844e99d1272dad25..2e693269bf749260e143cf19c6e1f51a5242412f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt
@@ -15,6 +15,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.metrics.-binary-accuracy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt
similarity index 92%
rename from tensorflow/tools/api/golden/v1/tensorflow.metrics.-binary-accuracy.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt
index b9bc6a716a1d114330fce2521e238897bdae56d0..e62a2df0564a0eb4dba528dab575b7c08e41b913 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.metrics.-binary-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt
@@ -1,8 +1,7 @@
-path: "tensorflow.metrics.BinaryAccuracy"
+path: "tensorflow.keras.metrics.SpecificityAtSensitivity"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.metrics.BinaryAccuracy\'>"
-  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
-  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.SpecificityAtSensitivity\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.SensitivitySpecificityBase\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
@@ -15,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -85,7 +88,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'name\', \'dtype\', \'threshold\'], varargs=None, keywords=None, defaults=[\'binary_accuracy\', \'None\', \'0.5\'], "
+    argspec: "args=[\'self\', \'sensitivity\', \'num_thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'200\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-negatives.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-negatives.pbtxt
index 1b5eb8d0de53960c3a98409119709c1307aa6379..1a524d73c0d387fe603846b5f180916829d65435 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-negatives.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-negatives.pbtxt
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-positives.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-positives.pbtxt
index 5b9c470e32d7e038f9ba11e4f96ab6eaa6b60a87..b9b4f565c5eff9ece856255ffbe15af3fb97c2df 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-positives.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-positives.pbtxt
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.pbtxt
index cc90d0e7092e37993bff8c4883aa677b45e05499..905021dd790205e64a6f9839218200db98941927 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.pbtxt
@@ -32,10 +32,18 @@ tf_module {
     name: "Recall"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SensitivityAtSpecificity"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "SparseCategoricalAccuracy"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SpecificityAtSensitivity"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "TrueNegatives"
     mtype: "<type \'type\'>"
@@ -70,7 +78,7 @@ tf_module {
   }
   member_method {
     name: "binary_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
   member_method {
     name: "categorical_accuracy"
@@ -78,7 +86,7 @@ tf_module {
   }
   member_method {
     name: "categorical_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
   member_method {
     name: "cosine"
@@ -154,7 +162,7 @@ tf_module {
   }
   member_method {
     name: "sparse_categorical_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
   member_method {
     name: "sparse_top_k_categorical_accuracy"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt
index 41d8b2fc950d02ace5f0efc7f790aa0a9c022f5f..5885cd21c1976bd7b95f7ca5bbea59eeb40b2ce8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -163,11 +167,11 @@ tf_class {
   }
   member_method {
     name: "evaluate"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "evaluate_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "fit"
@@ -231,16 +235,20 @@ tf_class {
   }
   member_method {
     name: "predict"
-    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "predict_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "predict_on_batch"
     argspec: "args=[\'self\', \'x\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "reset_metrics"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "reset_states"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -263,7 +271,7 @@ tf_class {
   }
   member_method {
     name: "test_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], "
   }
   member_method {
     name: "to_json"
@@ -275,6 +283,6 @@ tf_class {
   }
   member_method {
     name: "train_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt
index 2cf107a5cd89805d590e0fc5a372ed3d18c914c0..935fa32f8c7f2d3b9c6b220a6b77a957d2c73f30 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -168,11 +172,11 @@ tf_class {
   }
   member_method {
     name: "evaluate"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "evaluate_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "fit"
@@ -240,7 +244,7 @@ tf_class {
   }
   member_method {
     name: "predict"
-    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "predict_classes"
@@ -248,7 +252,7 @@ tf_class {
   }
   member_method {
     name: "predict_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "predict_on_batch"
@@ -258,6 +262,10 @@ tf_class {
     name: "predict_proba"
     argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\'], varargs=None, keywords=None, defaults=[\'32\', \'0\'], "
   }
+  member_method {
+    name: "reset_metrics"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "reset_states"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -280,7 +288,7 @@ tf_class {
   }
   member_method {
     name: "test_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], "
   }
   member_method {
     name: "to_json"
@@ -292,6 +300,6 @@ tf_class {
   }
   member_method {
     name: "train_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling1-d.pbtxt
index 059c91f724aae187055f8323c7748dc99f153302..85764cc8dcb46f5aa8f0d0050dae07cdbaae35f3 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling1-d.pbtxt
@@ -15,6 +15,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling2-d.pbtxt
index d06c8e81ee5d2a8b487d7c3c3714a1f4ed2c8e80..259da2ad3e8938bc65f3cb740f8599a29a7a9a17 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling2-d.pbtxt
@@ -15,6 +15,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling3-d.pbtxt
index 6be8e7c210f3f0a28ed8ad8a6672bc4323eb7f9d..ffda9334cf31ebd5329eab57fc0b0111b4bb6ab3 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling3-d.pbtxt
@@ -15,6 +15,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-batch-normalization.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-batch-normalization.pbtxt
index 16d9ecce10cfb3c28cd1cf47fd65c987680bda41..56a3fc3de751b6b52cbb165f3f07cc935c33c054 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-batch-normalization.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-batch-normalization.pbtxt
@@ -15,6 +15,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv1-d.pbtxt
index 21c695935ce7751df67e09091c961e9e0cfbbf7c..d72f24b3d574c2a5a59df57e00241804c6bd6cfc 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv1-d.pbtxt
@@ -15,6 +15,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d-transpose.pbtxt
index f24d0307207588610c1f764bf43912b64c3ea2c6..72a7339368a64474bcb3ae70ac655c89f446abf4 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d-transpose.pbtxt
@@ -16,6 +16,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d.pbtxt
index 0a510ece355435d8e75e39d5f7cdc6cebefe32cf..38a63df42d296d9c87c70be0f87a6894b7d1dcab 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d.pbtxt
@@ -15,6 +15,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d-transpose.pbtxt
index d0ee44bed3c739da27cc83f0e643e1ea9dd98078..29620561f7f3d244ac900c92565812cb20834853 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d-transpose.pbtxt
@@ -16,6 +16,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d.pbtxt
index 546de3cdab3aa0519450f74c6c6d0fe74ddc000c..f1a2bcbb7268e32c213124887d5f635c91493339 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d.pbtxt
@@ -15,6 +15,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-dense.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-dense.pbtxt
index 3ad311581eba815c2d1b0155a1380db80dd61c5d..d1e2d5757068f15e893c42631fcef12558d1f16f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-dense.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-dense.pbtxt
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-dropout.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-dropout.pbtxt
index 9b83271350cf90a2d430303dfecfd28facad272b..92e40f6d96063d06d8b2e4dc63d69481171bfaa7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-dropout.pbtxt
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-flatten.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-flatten.pbtxt
index 87a7fb3d843e3e8e3e2fe5a56ec0b181355a6d7b..087601a3c13a921fbcaee22cf92ec17dcb841d93 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-flatten.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-flatten.pbtxt
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-layer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-layer.pbtxt
index 32b17e90ade7aa0054a390256e3abadfc7011cbe..b052c6bb0a7c72c05102006fe3f413b53c0651ac 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-layer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-layer.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling1-d.pbtxt
index 643c469717c258207046ddd93a318f47753de46b..9444a1bc765814e49834a2c76cb5f8938861728b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling1-d.pbtxt
@@ -15,6 +15,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling2-d.pbtxt
index 434e25adc12c2f2f704b07087b8552781ac2d024..83dcb5e4e7d379c129483d507f07c4875d467092 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling2-d.pbtxt
@@ -15,6 +15,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling3-d.pbtxt
index 089fc6f9243c85937500b6275da034eb0748ecd4..eb26e2220bb2b96403fb50304e07e5ddc3a8579e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling3-d.pbtxt
@@ -15,6 +15,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv1-d.pbtxt
index bc3d58b9ca9789b43bc91f9283a81811f2b6a4e9..38d75e8bd54995c85ce0f403a0c5cc2fc167eebb 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv1-d.pbtxt
@@ -16,6 +16,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv2-d.pbtxt
index fe7d71af3a4a46bed4ea9e62cbd7ad17987517c7..90fc61cdfaebe4d03cc5422337899fbe853fecc7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv2-d.pbtxt
@@ -16,6 +16,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt
index 1a4098d121b71d25fc0aaa9c7e6e4f096b01e033..9f7b422fabcd55aed98bc93f01143d35698c0399 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt
@@ -132,6 +132,10 @@ tf_module {
     name: "lstsq"
     argspec: "args=[\'matrix\', \'rhs\', \'l2_regularizer\', \'fast\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'True\', \'None\'], "
   }
+  member_method {
+    name: "lu"
+    argspec: "args=[\'input\', \'output_idx_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
+  }
   member_method {
     name: "matmul"
     argspec: "args=[\'a\', \'b\', \'transpose_a\', \'transpose_b\', \'adjoint_a\', \'adjoint_b\', \'a_is_sparse\', \'b_is_sparse\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'False\', \'False\', \'False\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lite.-ops-set.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.-ops-set.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..68c651a3c9969f2f16fca39f4466cebbb44eea28
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.-ops-set.pbtxt
@@ -0,0 +1,12 @@
+path: "tensorflow.lite.OpsSet"
+tf_class {
+  is_instance: "<enum \'OpsSet\'>"
+  member {
+    name: "SELECT_TF_OPS"
+    mtype: "<enum \'OpsSet\'>"
+  }
+  member {
+    name: "TFLITE_BUILTINS"
+    mtype: "<enum \'OpsSet\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lite.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.pbtxt
index f5013c250be8477bb630d3d57ae88a501bb60b9b..154dd00821794ef4a5118e98d67e32beca38bebf 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.lite.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.pbtxt
@@ -8,6 +8,10 @@ tf_module {
     name: "OpHint"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "OpsSet"
+    mtype: "<class \'enum.EnumMeta\'>"
+  }
   member {
     name: "TFLiteConverter"
     mtype: "<type \'type\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.metrics.-false-positives.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.metrics.-false-positives.pbtxt
deleted file mode 100644
index 9953162ea3ec8ee7259bc8304052ab0754cfa630..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v1/tensorflow.metrics.-false-positives.pbtxt
+++ /dev/null
@@ -1,193 +0,0 @@
-path: "tensorflow.metrics.FalsePositives"
-tf_class {
-  is_instance: "<class \'tensorflow.python.keras.metrics.FalsePositives\'>"
-  is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
-  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "activity_regularizer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "inbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "losses"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "outbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "updates"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "weights"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "add_loss"
-    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_metric"
-    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "add_update"
-    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_variable"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
-  }
-  member_method {
-    name: "apply"
-    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "call"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "compute_mask"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "count_params"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_weights"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "reset_states"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "result"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_weights"
-    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "update_state"
-    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.metrics.-mean.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.metrics.-mean.pbtxt
deleted file mode 100644
index 7fe6d6fda9685e3f9f0ce29b81f260f3e41a7ef3..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v1/tensorflow.metrics.-mean.pbtxt
+++ /dev/null
@@ -1,192 +0,0 @@
-path: "tensorflow.metrics.Mean"
-tf_class {
-  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
-  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "activity_regularizer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "inbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "losses"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "outbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "updates"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "weights"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean\', \'None\'], "
-  }
-  member_method {
-    name: "add_loss"
-    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_metric"
-    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "add_update"
-    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_variable"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
-  }
-  member_method {
-    name: "apply"
-    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "call"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "compute_mask"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "count_params"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_weights"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "reset_states"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "result"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_weights"
-    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "update_state"
-    argspec: "args=[\'self\', \'values\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.metrics.-precision.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.metrics.-precision.pbtxt
deleted file mode 100644
index 8c3271a109cb408492369f59c889fffa522e6d44..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v1/tensorflow.metrics.-precision.pbtxt
+++ /dev/null
@@ -1,192 +0,0 @@
-path: "tensorflow.metrics.Precision"
-tf_class {
-  is_instance: "<class \'tensorflow.python.keras.metrics.Precision\'>"
-  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "activity_regularizer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "inbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "losses"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "outbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "updates"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "weights"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "add_loss"
-    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_metric"
-    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "add_update"
-    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_variable"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
-  }
-  member_method {
-    name: "apply"
-    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "call"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "compute_mask"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "count_params"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_weights"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "reset_states"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "result"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_weights"
-    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "update_state"
-    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.metrics.-recall.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.metrics.-recall.pbtxt
deleted file mode 100644
index 840a68bbc784b8570eea7a40d0e6174de60a7e9d..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v1/tensorflow.metrics.-recall.pbtxt
+++ /dev/null
@@ -1,192 +0,0 @@
-path: "tensorflow.metrics.Recall"
-tf_class {
-  is_instance: "<class \'tensorflow.python.keras.metrics.Recall\'>"
-  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "activity_regularizer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "inbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "losses"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "outbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "updates"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "weights"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "add_loss"
-    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_metric"
-    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "add_update"
-    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_variable"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
-  }
-  member_method {
-    name: "apply"
-    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "call"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "compute_mask"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "count_params"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_weights"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "reset_states"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "result"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_weights"
-    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "update_state"
-    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.metrics.-sparse-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.metrics.-sparse-categorical-accuracy.pbtxt
deleted file mode 100644
index 7bce43fbdeb13591ab5a25b50a0d880702173d98..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v1/tensorflow.metrics.-sparse-categorical-accuracy.pbtxt
+++ /dev/null
@@ -1,194 +0,0 @@
-path: "tensorflow.metrics.SparseCategoricalAccuracy"
-tf_class {
-  is_instance: "<class \'tensorflow.python.keras.metrics.SparseCategoricalAccuracy\'>"
-  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
-  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
-  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "activity_regularizer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "inbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "losses"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "outbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "updates"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "weights"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'sparse_categorical_accuracy\', \'None\'], "
-  }
-  member_method {
-    name: "add_loss"
-    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_metric"
-    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "add_update"
-    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_variable"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
-  }
-  member_method {
-    name: "apply"
-    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "call"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "compute_mask"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "count_params"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_weights"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "reset_states"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "result"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_weights"
-    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "update_state"
-    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.metrics.-true-negatives.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.metrics.-true-negatives.pbtxt
deleted file mode 100644
index 83cd5b736bc9d0b55720e9bdac7047f940b259f1..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v1/tensorflow.metrics.-true-negatives.pbtxt
+++ /dev/null
@@ -1,193 +0,0 @@
-path: "tensorflow.metrics.TrueNegatives"
-tf_class {
-  is_instance: "<class \'tensorflow.python.keras.metrics.TrueNegatives\'>"
-  is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
-  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "activity_regularizer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "inbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "losses"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "outbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "updates"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "weights"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "add_loss"
-    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_metric"
-    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "add_update"
-    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_variable"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
-  }
-  member_method {
-    name: "apply"
-    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "call"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "compute_mask"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "count_params"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_weights"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "reset_states"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "result"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_weights"
-    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "update_state"
-    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.metrics.-true-positives.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.metrics.-true-positives.pbtxt
deleted file mode 100644
index 5b2502eafee7126993d1f40dca74e5cb16856b71..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v1/tensorflow.metrics.-true-positives.pbtxt
+++ /dev/null
@@ -1,193 +0,0 @@
-path: "tensorflow.metrics.TruePositives"
-tf_class {
-  is_instance: "<class \'tensorflow.python.keras.metrics.TruePositives\'>"
-  is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
-  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "activity_regularizer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "inbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "losses"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "outbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "updates"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "weights"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "add_loss"
-    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_metric"
-    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "add_update"
-    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_variable"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
-  }
-  member_method {
-    name: "apply"
-    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "call"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "compute_mask"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "count_params"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_weights"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "reset_states"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "result"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_weights"
-    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "update_state"
-    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.metrics.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.metrics.pbtxt
index f5c267a1664a1c17cc5ffffa4992039050addc69..e9b996c9f53e9062dcdd39ef22f99eef5175eb35 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.metrics.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.metrics.pbtxt
@@ -1,49 +1,5 @@
 path: "tensorflow.metrics"
 tf_module {
-  member {
-    name: "Accuracy"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "BinaryAccuracy"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "CategoricalAccuracy"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "FalseNegatives"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "FalsePositives"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "Mean"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "Precision"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "Recall"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "SparseCategoricalAccuracy"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "TrueNegatives"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "TruePositives"
-    mtype: "<type \'type\'>"
-  }
   member_method {
     name: "accuracy"
     argspec: "args=[\'labels\', \'predictions\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.pbtxt
index 48501e1b581336558c7e62d4b27d21d8c701878e..40e20f8c919e64362e5697bd00ded70d0c2292a0 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.pbtxt
@@ -154,7 +154,7 @@ tf_module {
   }
   member_method {
     name: "dropout"
-    argspec: "args=[\'x\', \'keep_prob\', \'noise_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'keep_prob\', \'noise_shape\', \'seed\', \'name\', \'rate\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "dynamic_rnn"
@@ -326,7 +326,7 @@ tf_module {
   }
   member_method {
     name: "softmax_cross_entropy_with_logits_v2"
-    argspec: "args=[\'labels\', \'logits\', \'dim\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+    argspec: "args=[\'labels\', \'logits\', \'axis\', \'name\', \'dim\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "softplus"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
index f7f9978c063ceae89c7228b476f54694e25bc249..adffc552275554f888c398ac8beea730b851e293 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
@@ -15,6 +15,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
index f9e898484b9813373a49e6f117578f822cdeb156..95746cc49c3c4e762e8559cf704572ef122a96ef 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
@@ -15,6 +15,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
index 9e52a4252619ffc19b287fc1818fa6f772847335..3547b66d19ac6b64449860160774647df855a6de 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
index 9836433d08cba809107f9bb5dbccf2e971865b8a..7582fd52b63afdb8c6f2a5e7f0e6b26071232832 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
index 5fd9b329bdeb40b5a57fe68564977f61b5349ae5..7ec61661fde68ff102aeed8992891854a4028bb2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
@@ -15,6 +15,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
index 76c8cff22b1e65e65d0ac3d6705541dc3f16f80c..9617d07568ee70a7e6158fdbd33c956f8ae5e604 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
@@ -15,6 +15,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
index f53567af52f7ed6baa78bcc75bfc0e38de02e548..b31886f73665d6e895ebbf25a33d61b4b95eba74 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
index d3b68e4f2976912ed65ba7916284c951fda03b05..c36ecaa4b2b2ce14292cd2c46a986bb1387294bd 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
index 1f7840ab919baeeb0077904592ba8dcc1d4c91fb..42128ebd17234fcee3b016bbd7f1964824d1a0b6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
index 490b4a847f1ee1d133723cd866ae3cafdd5cd777..4ed4deea138f549636d432691550d6e7e44eca92 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
@@ -172,6 +172,10 @@ tf_module {
     name: "QueueBase"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "RaggedTensor"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "RandomShuffleQueue"
     mtype: "<type \'type\'>"
@@ -288,6 +292,10 @@ tf_module {
     name: "app"
     mtype: "<type \'module\'>"
   }
+  member {
+    name: "autograph"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "bfloat16"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
@@ -504,6 +512,10 @@ tf_module {
     name: "quantization"
     mtype: "<type \'module\'>"
   }
+  member {
+    name: "queue"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "quint16"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
@@ -512,6 +524,10 @@ tf_module {
     name: "quint8"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
   }
+  member {
+    name: "ragged"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "random"
     mtype: "<type \'module\'>"
@@ -1052,10 +1068,18 @@ tf_module {
     name: "dimension_value"
     argspec: "args=[\'dimension\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "disable_eager_execution"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "disable_resource_variables"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "disable_v2_behavior"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "disable_v2_tensorshape"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
@@ -1096,6 +1120,10 @@ tf_module {
     name: "enable_resource_variables"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "enable_v2_behavior"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "enable_v2_tensorshape"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-f-i-f-o-queue.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.queue.-f-i-f-o-queue.pbtxt
similarity index 98%
rename from tensorflow/tools/api/golden/v2/tensorflow.-f-i-f-o-queue.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.queue.-f-i-f-o-queue.pbtxt
index a095616c00cfe8fb64413e2078ae1589a423d2f4..724ab5fe8283de44b20b059042f8d6744b11da19 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.-f-i-f-o-queue.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.queue.-f-i-f-o-queue.pbtxt
@@ -1,4 +1,4 @@
-path: "tensorflow.FIFOQueue"
+path: "tensorflow.queue.FIFOQueue"
 tf_class {
   is_instance: "<class \'tensorflow.python.ops.data_flow_ops.FIFOQueue\'>"
   is_instance: "<class \'tensorflow.python.ops.data_flow_ops.QueueBase\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.io.-padding-f-i-f-o-queue.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.queue.-padding-f-i-f-o-queue.pbtxt
similarity index 98%
rename from tensorflow/tools/api/golden/v2/tensorflow.io.-padding-f-i-f-o-queue.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.queue.-padding-f-i-f-o-queue.pbtxt
index 85306fdcac519820fce8d254d9aaaf504b830b7a..9ef0a4d9eb6bbfb69fddf3fe696e3f60ac3ef67b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.io.-padding-f-i-f-o-queue.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.queue.-padding-f-i-f-o-queue.pbtxt
@@ -1,4 +1,4 @@
-path: "tensorflow.io.PaddingFIFOQueue"
+path: "tensorflow.queue.PaddingFIFOQueue"
 tf_class {
   is_instance: "<class \'tensorflow.python.ops.data_flow_ops.PaddingFIFOQueue\'>"
   is_instance: "<class \'tensorflow.python.ops.data_flow_ops.QueueBase\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.io.-priority-queue.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.queue.-priority-queue.pbtxt
similarity index 98%
rename from tensorflow/tools/api/golden/v2/tensorflow.io.-priority-queue.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.queue.-priority-queue.pbtxt
index 02d8037b34a57b5d1c1309b7cbcfd290a6091e04..bb66beb13af18501912fda85b9c3dc67cdf21683 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.io.-priority-queue.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.queue.-priority-queue.pbtxt
@@ -1,4 +1,4 @@
-path: "tensorflow.io.PriorityQueue"
+path: "tensorflow.queue.PriorityQueue"
 tf_class {
   is_instance: "<class \'tensorflow.python.ops.data_flow_ops.PriorityQueue\'>"
   is_instance: "<class \'tensorflow.python.ops.data_flow_ops.QueueBase\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.io.-queue-base.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.queue.-queue-base.pbtxt
similarity index 98%
rename from tensorflow/tools/api/golden/v2/tensorflow.io.-queue-base.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.queue.-queue-base.pbtxt
index a30481a0ea8f1cb71f5695be2099f5e5ae3f644c..8faaad22af6e0f920e26a44e1ebf294fc4b109c4 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.io.-queue-base.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.queue.-queue-base.pbtxt
@@ -1,4 +1,4 @@
-path: "tensorflow.io.QueueBase"
+path: "tensorflow.queue.QueueBase"
 tf_class {
   is_instance: "<class \'tensorflow.python.ops.data_flow_ops.QueueBase\'>"
   is_instance: "<type \'object\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.io.-random-shuffle-queue.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.queue.-random-shuffle-queue.pbtxt
similarity index 97%
rename from tensorflow/tools/api/golden/v2/tensorflow.io.-random-shuffle-queue.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.queue.-random-shuffle-queue.pbtxt
index 82cbf9884f77ed70d9f3191875daeb8b6f9f72ec..31cd503b13040b119d4028f813c94689f8e2ebb3 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.io.-random-shuffle-queue.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.queue.-random-shuffle-queue.pbtxt
@@ -1,4 +1,4 @@
-path: "tensorflow.io.RandomShuffleQueue"
+path: "tensorflow.queue.RandomShuffleQueue"
 tf_class {
   is_instance: "<class \'tensorflow.python.ops.data_flow_ops.RandomShuffleQueue\'>"
   is_instance: "<class \'tensorflow.python.ops.data_flow_ops.QueueBase\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.queue.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.queue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c16e95e2116b703434ed91106eb29e4beb5668f2
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.queue.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.queue"
+tf_module {
+  member {
+    name: "FIFOQueue"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "PaddingFIFOQueue"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "PriorityQueue"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "QueueBase"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "RandomShuffleQueue"
+    mtype: "<type \'type\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.ragged.-ragged-tensor-value.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.ragged.-ragged-tensor-value.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..96c895e0a49364b37d1578ff1a1e9214a10189df
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.ragged.-ragged-tensor-value.pbtxt
@@ -0,0 +1,41 @@
+path: "tensorflow.ragged.RaggedTensorValue"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.ragged.ragged_tensor_value.RaggedTensorValue\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "flat_values"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "nested_row_splits"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "ragged_rank"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "row_splits"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "values"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'values\', \'row_splits\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "to_list"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.ragged.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.ragged.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..22ca7e931f3589f11b7fc5c655d633c86716b4d8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.ragged.pbtxt
@@ -0,0 +1,31 @@
+path: "tensorflow.ragged"
+tf_module {
+  member {
+    name: "RaggedTensorValue"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "constant"
+    argspec: "args=[\'pylist\', \'dtype\', \'ragged_rank\', \'inner_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "constant_value"
+    argspec: "args=[\'pylist\', \'dtype\', \'ragged_rank\', \'inner_shape\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "map_flat_values"
+    argspec: "args=[\'op\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "range"
+    argspec: "args=[\'starts\', \'limits\', \'deltas\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "row_splits_to_segment_ids"
+    argspec: "args=[\'splits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "segment_ids_to_row_splits"
+    argspec: "args=[\'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.random.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.random.pbtxt
index d788f6dfca277ee9f76db66ef7bf214289fa1527..1eefb1c70ce4d825402155a5e068c736defff02f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.random.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.random.pbtxt
@@ -1,5 +1,17 @@
 path: "tensorflow.random"
 tf_module {
+  member_method {
+    name: "all_candidate_sampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "categorical"
+    argspec: "args=[\'logits\', \'num_samples\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "fixed_unigram_candidate_sampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'vocab_file\', \'distortion\', \'num_reserved_ids\', \'num_shards\', \'shard\', \'unigrams\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'1.0\', \'0\', \'1\', \'0\', \'()\', \'None\', \'None\'], "
+  }
   member_method {
     name: "gamma"
     argspec: "args=[\'shape\', \'alpha\', \'beta\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'float32\'>\", \'None\', \'None\'], "
@@ -8,6 +20,10 @@ tf_module {
     name: "get_seed"
     argspec: "args=[\'op_seed\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "learned_unigram_candidate_sampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "log_uniform_candidate_sampler"
     argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt
index a1cd581a86bc2132bfa04ac3f3433e84b6365b19..ad26ded10b4dc652574ce4b544cbadd98e57a013 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt
@@ -52,6 +52,14 @@ tf_module {
     name: "to_number"
     argspec: "args=[\'string_tensor\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\'], "
   }
+  member_method {
+    name: "unicode_decode"
+    argspec: "args=[\'input\', \'input_encoding\', \'errors\', \'replacement_char\', \'replace_control_characters\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "unicode_decode_with_offsets"
+    argspec: "args=[\'input\', \'input_encoding\', \'errors\', \'replacement_char\', \'replace_control_characters\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'False\', \'None\'], "
+  }
   member_method {
     name: "unicode_encode"
     argspec: "args=[\'input\', \'output_encoding\', \'errors\', \'replacement_char\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'None\'], "
@@ -60,6 +68,14 @@ tf_module {
     name: "unicode_script"
     argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "unicode_split"
+    argspec: "args=[\'input\', \'input_encoding\', \'errors\', \'replacement_char\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'None\'], "
+  }
+  member_method {
+    name: "unicode_split_with_offsets"
+    argspec: "args=[\'input\', \'input_encoding\', \'errors\', \'replacement_char\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'None\'], "
+  }
   member_method {
     name: "unicode_transcode"
     argspec: "args=[\'input\', \'input_encoding\', \'output_encoding\', \'errors\', \'replacement_char\', \'replace_control_characters\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'False\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-attr-value.-list-value.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-attr-value.-list-value.pbtxt
deleted file mode 100644
index f1dffd595285098afaeb0ff04e5db35d594f7fac..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-attr-value.-list-value.pbtxt
+++ /dev/null
@@ -1,70 +0,0 @@
-path: "tensorflow.AttrValue.ListValue"
-tf_proto {
-  descriptor {
-    name: "ListValue"
-    field {
-      name: "s"
-      number: 2
-      label: LABEL_REPEATED
-      type: TYPE_BYTES
-    }
-    field {
-      name: "i"
-      number: 3
-      label: LABEL_REPEATED
-      type: TYPE_INT64
-      options {
-        packed: true
-      }
-    }
-    field {
-      name: "f"
-      number: 4
-      label: LABEL_REPEATED
-      type: TYPE_FLOAT
-      options {
-        packed: true
-      }
-    }
-    field {
-      name: "b"
-      number: 5
-      label: LABEL_REPEATED
-      type: TYPE_BOOL
-      options {
-        packed: true
-      }
-    }
-    field {
-      name: "type"
-      number: 6
-      label: LABEL_REPEATED
-      type: TYPE_ENUM
-      type_name: ".tensorflow.DataType"
-      options {
-        packed: true
-      }
-    }
-    field {
-      name: "shape"
-      number: 7
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.TensorShapeProto"
-    }
-    field {
-      name: "tensor"
-      number: 8
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.TensorProto"
-    }
-    field {
-      name: "func"
-      number: 9
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.NameAttrList"
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-attr-value.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-attr-value.pbtxt
deleted file mode 100644
index 6ccd64f428c3b87c807d0af82f67a884187f738c..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-attr-value.pbtxt
+++ /dev/null
@@ -1,151 +0,0 @@
-path: "tensorflow.AttrValue"
-tf_proto {
-  descriptor {
-    name: "AttrValue"
-    field {
-      name: "s"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_BYTES
-      oneof_index: 0
-    }
-    field {
-      name: "i"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-      oneof_index: 0
-    }
-    field {
-      name: "f"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_FLOAT
-      oneof_index: 0
-    }
-    field {
-      name: "b"
-      number: 5
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-      oneof_index: 0
-    }
-    field {
-      name: "type"
-      number: 6
-      label: LABEL_OPTIONAL
-      type: TYPE_ENUM
-      type_name: ".tensorflow.DataType"
-      oneof_index: 0
-    }
-    field {
-      name: "shape"
-      number: 7
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.TensorShapeProto"
-      oneof_index: 0
-    }
-    field {
-      name: "tensor"
-      number: 8
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.TensorProto"
-      oneof_index: 0
-    }
-    field {
-      name: "list"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.AttrValue.ListValue"
-      oneof_index: 0
-    }
-    field {
-      name: "func"
-      number: 10
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.NameAttrList"
-      oneof_index: 0
-    }
-    field {
-      name: "placeholder"
-      number: 9
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-      oneof_index: 0
-    }
-    nested_type {
-      name: "ListValue"
-      field {
-        name: "s"
-        number: 2
-        label: LABEL_REPEATED
-        type: TYPE_BYTES
-      }
-      field {
-        name: "i"
-        number: 3
-        label: LABEL_REPEATED
-        type: TYPE_INT64
-        options {
-          packed: true
-        }
-      }
-      field {
-        name: "f"
-        number: 4
-        label: LABEL_REPEATED
-        type: TYPE_FLOAT
-        options {
-          packed: true
-        }
-      }
-      field {
-        name: "b"
-        number: 5
-        label: LABEL_REPEATED
-        type: TYPE_BOOL
-        options {
-          packed: true
-        }
-      }
-      field {
-        name: "type"
-        number: 6
-        label: LABEL_REPEATED
-        type: TYPE_ENUM
-        type_name: ".tensorflow.DataType"
-        options {
-          packed: true
-        }
-      }
-      field {
-        name: "shape"
-        number: 7
-        label: LABEL_REPEATED
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.TensorShapeProto"
-      }
-      field {
-        name: "tensor"
-        number: 8
-        label: LABEL_REPEATED
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.TensorProto"
-      }
-      field {
-        name: "func"
-        number: 9
-        label: LABEL_REPEATED
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.NameAttrList"
-      }
-    }
-    oneof_decl {
-      name: "value"
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-config-proto.-device-count-entry.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-config-proto.-device-count-entry.pbtxt
deleted file mode 100644
index d9b142682899bf5d9fd5d942437359adf8962466..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-config-proto.-device-count-entry.pbtxt
+++ /dev/null
@@ -1,21 +0,0 @@
-path: "tensorflow.ConfigProto.DeviceCountEntry"
-tf_proto {
-  descriptor {
-    name: "DeviceCountEntry"
-    field {
-      name: "key"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "value"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-    }
-    options {
-      map_entry: true
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-config-proto.-experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-config-proto.-experimental.pbtxt
deleted file mode 100644
index caa72fe5a61aa9a13bc51ae5ab70048d309f6b62..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-config-proto.-experimental.pbtxt
+++ /dev/null
@@ -1,34 +0,0 @@
-path: "tensorflow.ConfigProto.Experimental"
-tf_proto {
-  descriptor {
-    name: "Experimental"
-    field {
-      name: "collective_group_leader"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "executor_type"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "recv_buf_max_chunk"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-    }
-    field {
-      name: "use_numa_affinity"
-      number: 5
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    reserved_range {
-      start: 2
-      end: 3
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-config-proto.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-config-proto.pbtxt
deleted file mode 100644
index b505d813509c2049fa6e3f60df553492d6f66613..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-config-proto.pbtxt
+++ /dev/null
@@ -1,158 +0,0 @@
-path: "tensorflow.ConfigProto"
-tf_proto {
-  descriptor {
-    name: "ConfigProto"
-    field {
-      name: "device_count"
-      number: 1
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.ConfigProto.DeviceCountEntry"
-    }
-    field {
-      name: "intra_op_parallelism_threads"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-    }
-    field {
-      name: "inter_op_parallelism_threads"
-      number: 5
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-    }
-    field {
-      name: "use_per_session_threads"
-      number: 9
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    field {
-      name: "session_inter_op_thread_pool"
-      number: 12
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.ThreadPoolOptionProto"
-    }
-    field {
-      name: "placement_period"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-    }
-    field {
-      name: "device_filters"
-      number: 4
-      label: LABEL_REPEATED
-      type: TYPE_STRING
-    }
-    field {
-      name: "gpu_options"
-      number: 6
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.GPUOptions"
-    }
-    field {
-      name: "allow_soft_placement"
-      number: 7
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    field {
-      name: "log_device_placement"
-      number: 8
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    field {
-      name: "graph_options"
-      number: 10
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.GraphOptions"
-    }
-    field {
-      name: "operation_timeout_in_ms"
-      number: 11
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "rpc_options"
-      number: 13
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.RPCOptions"
-    }
-    field {
-      name: "cluster_def"
-      number: 14
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.ClusterDef"
-    }
-    field {
-      name: "isolate_session_state"
-      number: 15
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    field {
-      name: "experimental"
-      number: 16
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.ConfigProto.Experimental"
-    }
-    nested_type {
-      name: "DeviceCountEntry"
-      field {
-        name: "key"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "value"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_INT32
-      }
-      options {
-        map_entry: true
-      }
-    }
-    nested_type {
-      name: "Experimental"
-      field {
-        name: "collective_group_leader"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "executor_type"
-        number: 3
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "recv_buf_max_chunk"
-        number: 4
-        label: LABEL_OPTIONAL
-        type: TYPE_INT32
-      }
-      field {
-        name: "use_numa_affinity"
-        number: 5
-        label: LABEL_OPTIONAL
-        type: TYPE_BOOL
-      }
-      reserved_range {
-        start: 2
-        end: 3
-      }
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-g-p-u-options.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-g-p-u-options.pbtxt
deleted file mode 100644
index a2cc07483a4e10918891f555ca9459fb7503bb32..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-g-p-u-options.pbtxt
+++ /dev/null
@@ -1,98 +0,0 @@
-path: "tensorflow.GPUOptions"
-tf_proto {
-  descriptor {
-    name: "GPUOptions"
-    field {
-      name: "per_process_gpu_memory_fraction"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_DOUBLE
-    }
-    field {
-      name: "allow_growth"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    field {
-      name: "allocator_type"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "deferred_deletion_bytes"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "visible_device_list"
-      number: 5
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "polling_active_delay_usecs"
-      number: 6
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-    }
-    field {
-      name: "polling_inactive_delay_msecs"
-      number: 7
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-    }
-    field {
-      name: "force_gpu_compatible"
-      number: 8
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    field {
-      name: "experimental"
-      number: 9
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.GPUOptions.Experimental"
-    }
-    nested_type {
-      name: "Experimental"
-      field {
-        name: "virtual_devices"
-        number: 1
-        label: LABEL_REPEATED
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.GPUOptions.Experimental.VirtualDevices"
-      }
-      field {
-        name: "use_unified_memory"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_BOOL
-      }
-      field {
-        name: "num_dev_to_dev_copy_streams"
-        number: 3
-        label: LABEL_OPTIONAL
-        type: TYPE_INT32
-      }
-      field {
-        name: "collective_ring_order"
-        number: 4
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      nested_type {
-        name: "VirtualDevices"
-        field {
-          name: "memory_limit_mb"
-          number: 1
-          label: LABEL_REPEATED
-          type: TYPE_FLOAT
-        }
-      }
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-graph-def.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-graph-def.pbtxt
deleted file mode 100644
index 19eccff03d24719d95ea84ccdad4014aa777ccd5..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-graph-def.pbtxt
+++ /dev/null
@@ -1,36 +0,0 @@
-path: "tensorflow.GraphDef"
-tf_proto {
-  descriptor {
-    name: "GraphDef"
-    field {
-      name: "node"
-      number: 1
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.NodeDef"
-    }
-    field {
-      name: "versions"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.VersionDef"
-    }
-    field {
-      name: "version"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-      options {
-        deprecated: true
-      }
-    }
-    field {
-      name: "library"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.FunctionDefLibrary"
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-graph-options.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-graph-options.pbtxt
deleted file mode 100644
index a9f99bc171cc3661031981f467f583b122e43476..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-graph-options.pbtxt
+++ /dev/null
@@ -1,67 +0,0 @@
-path: "tensorflow.GraphOptions"
-tf_proto {
-  descriptor {
-    name: "GraphOptions"
-    field {
-      name: "enable_recv_scheduling"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    field {
-      name: "optimizer_options"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.OptimizerOptions"
-    }
-    field {
-      name: "build_cost_model"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "build_cost_model_after"
-      number: 9
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "infer_shapes"
-      number: 5
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    field {
-      name: "place_pruned_graph"
-      number: 6
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    field {
-      name: "enable_bfloat16_sendrecv"
-      number: 7
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    field {
-      name: "timeline_step"
-      number: 8
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-    }
-    field {
-      name: "rewrite_options"
-      number: 10
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.RewriterConfig"
-    }
-    reserved_range {
-      start: 1
-      end: 2
-    }
-    reserved_name: "skip_common_subexpression_elimination"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-histogram-proto.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-histogram-proto.pbtxt
deleted file mode 100644
index d4402f330b8a28eaa61eb2b74c9ca412dce06b62..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-histogram-proto.pbtxt
+++ /dev/null
@@ -1,54 +0,0 @@
-path: "tensorflow.HistogramProto"
-tf_proto {
-  descriptor {
-    name: "HistogramProto"
-    field {
-      name: "min"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_DOUBLE
-    }
-    field {
-      name: "max"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_DOUBLE
-    }
-    field {
-      name: "num"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_DOUBLE
-    }
-    field {
-      name: "sum"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_DOUBLE
-    }
-    field {
-      name: "sum_squares"
-      number: 5
-      label: LABEL_OPTIONAL
-      type: TYPE_DOUBLE
-    }
-    field {
-      name: "bucket_limit"
-      number: 6
-      label: LABEL_REPEATED
-      type: TYPE_DOUBLE
-      options {
-        packed: true
-      }
-    }
-    field {
-      name: "bucket"
-      number: 7
-      label: LABEL_REPEATED
-      type: TYPE_DOUBLE
-      options {
-        packed: true
-      }
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-log-message.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-log-message.pbtxt
deleted file mode 100644
index 5023aa96bf3b4f3f550421db5f41872d9f62b70d..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-log-message.pbtxt
+++ /dev/null
@@ -1,46 +0,0 @@
-path: "tensorflow.LogMessage"
-tf_proto {
-  descriptor {
-    name: "LogMessage"
-    field {
-      name: "level"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_ENUM
-      type_name: ".tensorflow.LogMessage.Level"
-    }
-    field {
-      name: "message"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    enum_type {
-      name: "Level"
-      value {
-        name: "UNKNOWN"
-        number: 0
-      }
-      value {
-        name: "DEBUGGING"
-        number: 10
-      }
-      value {
-        name: "INFO"
-        number: 20
-      }
-      value {
-        name: "WARN"
-        number: 30
-      }
-      value {
-        name: "ERROR"
-        number: 40
-      }
-      value {
-        name: "FATAL"
-        number: 50
-      }
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-meta-graph-def.-collection-def-entry.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-meta-graph-def.-collection-def-entry.pbtxt
deleted file mode 100644
index 0ba09bec4b3fa6e9eaf59978beaa958ebc038b4c..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-meta-graph-def.-collection-def-entry.pbtxt
+++ /dev/null
@@ -1,22 +0,0 @@
-path: "tensorflow.MetaGraphDef.CollectionDefEntry"
-tf_proto {
-  descriptor {
-    name: "CollectionDefEntry"
-    field {
-      name: "key"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "value"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.CollectionDef"
-    }
-    options {
-      map_entry: true
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-meta-graph-def.-meta-info-def.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-meta-graph-def.-meta-info-def.pbtxt
deleted file mode 100644
index 41c62a407b8577288016f2376c35ba6ec1c3c1ca..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-meta-graph-def.-meta-info-def.pbtxt
+++ /dev/null
@@ -1,50 +0,0 @@
-path: "tensorflow.MetaGraphDef.MetaInfoDef"
-tf_proto {
-  descriptor {
-    name: "MetaInfoDef"
-    field {
-      name: "meta_graph_version"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "stripped_op_list"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.OpList"
-    }
-    field {
-      name: "any_info"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".google.protobuf.Any"
-    }
-    field {
-      name: "tags"
-      number: 4
-      label: LABEL_REPEATED
-      type: TYPE_STRING
-    }
-    field {
-      name: "tensorflow_version"
-      number: 5
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "tensorflow_git_version"
-      number: 6
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "stripped_default_attrs"
-      number: 7
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-meta-graph-def.-signature-def-entry.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-meta-graph-def.-signature-def-entry.pbtxt
deleted file mode 100644
index 73dc414a779ded3d1f896e743b7f1f1a443352f0..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-meta-graph-def.-signature-def-entry.pbtxt
+++ /dev/null
@@ -1,22 +0,0 @@
-path: "tensorflow.MetaGraphDef.SignatureDefEntry"
-tf_proto {
-  descriptor {
-    name: "SignatureDefEntry"
-    field {
-      name: "key"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "value"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.SignatureDef"
-    }
-    options {
-      map_entry: true
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-meta-graph-def.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-meta-graph-def.pbtxt
deleted file mode 100644
index d71c2358c93e9597726665fdf8f92e648b2ea772..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-meta-graph-def.pbtxt
+++ /dev/null
@@ -1,133 +0,0 @@
-path: "tensorflow.MetaGraphDef"
-tf_proto {
-  descriptor {
-    name: "MetaGraphDef"
-    field {
-      name: "meta_info_def"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.MetaGraphDef.MetaInfoDef"
-    }
-    field {
-      name: "graph_def"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.GraphDef"
-    }
-    field {
-      name: "saver_def"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.SaverDef"
-    }
-    field {
-      name: "collection_def"
-      number: 4
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.MetaGraphDef.CollectionDefEntry"
-    }
-    field {
-      name: "signature_def"
-      number: 5
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.MetaGraphDef.SignatureDefEntry"
-    }
-    field {
-      name: "asset_file_def"
-      number: 6
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.AssetFileDef"
-    }
-    nested_type {
-      name: "MetaInfoDef"
-      field {
-        name: "meta_graph_version"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "stripped_op_list"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.OpList"
-      }
-      field {
-        name: "any_info"
-        number: 3
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".google.protobuf.Any"
-      }
-      field {
-        name: "tags"
-        number: 4
-        label: LABEL_REPEATED
-        type: TYPE_STRING
-      }
-      field {
-        name: "tensorflow_version"
-        number: 5
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "tensorflow_git_version"
-        number: 6
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "stripped_default_attrs"
-        number: 7
-        label: LABEL_OPTIONAL
-        type: TYPE_BOOL
-      }
-    }
-    nested_type {
-      name: "CollectionDefEntry"
-      field {
-        name: "key"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "value"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.CollectionDef"
-      }
-      options {
-        map_entry: true
-      }
-    }
-    nested_type {
-      name: "SignatureDefEntry"
-      field {
-        name: "key"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "value"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.SignatureDef"
-      }
-      options {
-        map_entry: true
-      }
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-name-attr-list.-attr-entry.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-name-attr-list.-attr-entry.pbtxt
deleted file mode 100644
index b119b208772199e5c3596be142f3e0f62d3ed50e..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-name-attr-list.-attr-entry.pbtxt
+++ /dev/null
@@ -1,22 +0,0 @@
-path: "tensorflow.NameAttrList.AttrEntry"
-tf_proto {
-  descriptor {
-    name: "AttrEntry"
-    field {
-      name: "key"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "value"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.AttrValue"
-    }
-    options {
-      map_entry: true
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-name-attr-list.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-name-attr-list.pbtxt
deleted file mode 100644
index fcdb411ffce9b68ac28696f86ca11a47f9e64e8f..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-name-attr-list.pbtxt
+++ /dev/null
@@ -1,38 +0,0 @@
-path: "tensorflow.NameAttrList"
-tf_proto {
-  descriptor {
-    name: "NameAttrList"
-    field {
-      name: "name"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "attr"
-      number: 2
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.NameAttrList.AttrEntry"
-    }
-    nested_type {
-      name: "AttrEntry"
-      field {
-        name: "key"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "value"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.AttrValue"
-      }
-      options {
-        map_entry: true
-      }
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-node-def.-attr-entry.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-node-def.-attr-entry.pbtxt
deleted file mode 100644
index 622e4c3d0f60ce4842a6fd4cc421551aa795fcbf..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-node-def.-attr-entry.pbtxt
+++ /dev/null
@@ -1,22 +0,0 @@
-path: "tensorflow.NodeDef.AttrEntry"
-tf_proto {
-  descriptor {
-    name: "AttrEntry"
-    field {
-      name: "key"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "value"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.AttrValue"
-    }
-    options {
-      map_entry: true
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-node-def.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-node-def.pbtxt
deleted file mode 100644
index 646fa8abb9b22dbd908ff821cbe66a33ad02ba64..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-node-def.pbtxt
+++ /dev/null
@@ -1,56 +0,0 @@
-path: "tensorflow.NodeDef"
-tf_proto {
-  descriptor {
-    name: "NodeDef"
-    field {
-      name: "name"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "op"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "input"
-      number: 3
-      label: LABEL_REPEATED
-      type: TYPE_STRING
-    }
-    field {
-      name: "device"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "attr"
-      number: 5
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.NodeDef.AttrEntry"
-    }
-    nested_type {
-      name: "AttrEntry"
-      field {
-        name: "key"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "value"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.AttrValue"
-      }
-      options {
-        map_entry: true
-      }
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-optimizer-options.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-optimizer-options.pbtxt
deleted file mode 100644
index 3ccf9d459b133b48e5456f02e4780ade8d3042c8..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-optimizer-options.pbtxt
+++ /dev/null
@@ -1,74 +0,0 @@
-path: "tensorflow.OptimizerOptions"
-tf_proto {
-  descriptor {
-    name: "OptimizerOptions"
-    field {
-      name: "do_common_subexpression_elimination"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    field {
-      name: "do_constant_folding"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    field {
-      name: "max_folded_constant_in_bytes"
-      number: 6
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "do_function_inlining"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    field {
-      name: "opt_level"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_ENUM
-      type_name: ".tensorflow.OptimizerOptions.Level"
-    }
-    field {
-      name: "global_jit_level"
-      number: 5
-      label: LABEL_OPTIONAL
-      type: TYPE_ENUM
-      type_name: ".tensorflow.OptimizerOptions.GlobalJitLevel"
-    }
-    enum_type {
-      name: "Level"
-      value {
-        name: "L1"
-        number: 0
-      }
-      value {
-        name: "L0"
-        number: -1
-      }
-    }
-    enum_type {
-      name: "GlobalJitLevel"
-      value {
-        name: "DEFAULT"
-        number: 0
-      }
-      value {
-        name: "OFF"
-        number: -1
-      }
-      value {
-        name: "ON_1"
-        number: 1
-      }
-      value {
-        name: "ON_2"
-        number: 2
-      }
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-ragged-tensor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-ragged-tensor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c0ed95653552f904acea1cc82bca00773ecb792c
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-ragged-tensor.pbtxt
@@ -0,0 +1,125 @@
+path: "tensorflow.RaggedTensor"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.ragged.ragged_tensor.RaggedTensor\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "flat_values"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "nested_row_splits"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "ragged_rank"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "row_splits"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "values"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'values\', \'row_splits\', \'cached_row_lengths\', \'cached_value_rowids\', \'cached_nrows\', \'internal\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\'], "
+  }
+  member_method {
+    name: "bounding_shape"
+    argspec: "args=[\'self\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "from_nested_row_lengths"
+    argspec: "args=[\'cls\', \'flat_values\', \'nested_row_lengths\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_nested_row_splits"
+    argspec: "args=[\'cls\', \'flat_values\', \'nested_row_splits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_nested_value_rowids"
+    argspec: "args=[\'cls\', \'flat_values\', \'nested_value_rowids\', \'nested_nrows\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "from_row_lengths"
+    argspec: "args=[\'cls\', \'values\', \'row_lengths\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_row_limits"
+    argspec: "args=[\'cls\', \'values\', \'row_limits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_row_splits"
+    argspec: "args=[\'cls\', \'values\', \'row_splits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_row_starts"
+    argspec: "args=[\'cls\', \'values\', \'row_starts\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_sparse"
+    argspec: "args=[\'cls\', \'st_input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_tensor"
+    argspec: "args=[\'cls\', \'tensor\', \'lengths\', \'padding\', \'ragged_rank\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\'], "
+  }
+  member_method {
+    name: "from_value_rowids"
+    argspec: "args=[\'cls\', \'values\', \'value_rowids\', \'nrows\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "nested_row_lengths"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "nrows"
+    argspec: "args=[\'self\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int64\'>\", \'None\'], "
+  }
+  member_method {
+    name: "row_lengths"
+    argspec: "args=[\'self\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
+  }
+  member_method {
+    name: "row_limits"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "row_starts"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "to_list"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "to_sparse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "to_tensor"
+    argspec: "args=[\'self\', \'default_value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "value_rowids"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "with_flat_values"
+    argspec: "args=[\'self\', \'new_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "with_values"
+    argspec: "args=[\'self\', \'new_values\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-run-metadata.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-run-metadata.pbtxt
deleted file mode 100644
index 1287940326c0196e76fff2cf6363622226092504..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-run-metadata.pbtxt
+++ /dev/null
@@ -1,27 +0,0 @@
-path: "tensorflow.RunMetadata"
-tf_proto {
-  descriptor {
-    name: "RunMetadata"
-    field {
-      name: "step_stats"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.StepStats"
-    }
-    field {
-      name: "cost_graph"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.CostGraphDef"
-    }
-    field {
-      name: "partition_graphs"
-      number: 3
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.GraphDef"
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-run-options.-experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-run-options.-experimental.pbtxt
deleted file mode 100644
index 47b5b56faf63edba9ce4f08bf744f3acf4f67f5f..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-run-options.-experimental.pbtxt
+++ /dev/null
@@ -1,18 +0,0 @@
-path: "tensorflow.RunOptions.Experimental"
-tf_proto {
-  descriptor {
-    name: "Experimental"
-    field {
-      name: "collective_graph_key"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "use_run_handler_pool"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-run-options.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-run-options.pbtxt
deleted file mode 100644
index c0c2e7b9f8d71be9b96e7195b561d0a934d24057..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-run-options.pbtxt
+++ /dev/null
@@ -1,89 +0,0 @@
-path: "tensorflow.RunOptions"
-tf_proto {
-  descriptor {
-    name: "RunOptions"
-    field {
-      name: "trace_level"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_ENUM
-      type_name: ".tensorflow.RunOptions.TraceLevel"
-    }
-    field {
-      name: "timeout_in_ms"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "inter_op_thread_pool"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-    }
-    field {
-      name: "output_partition_graphs"
-      number: 5
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    field {
-      name: "debug_options"
-      number: 6
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.DebugOptions"
-    }
-    field {
-      name: "report_tensor_allocations_upon_oom"
-      number: 7
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    field {
-      name: "experimental"
-      number: 8
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.RunOptions.Experimental"
-    }
-    nested_type {
-      name: "Experimental"
-      field {
-        name: "collective_graph_key"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_INT64
-      }
-      field {
-        name: "use_run_handler_pool"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_BOOL
-      }
-    }
-    enum_type {
-      name: "TraceLevel"
-      value {
-        name: "NO_TRACE"
-        number: 0
-      }
-      value {
-        name: "SOFTWARE_TRACE"
-        number: 1
-      }
-      value {
-        name: "HARDWARE_TRACE"
-        number: 2
-      }
-      value {
-        name: "FULL_TRACE"
-        number: 3
-      }
-    }
-    reserved_range {
-      start: 4
-      end: 5
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-session-log.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-session-log.pbtxt
deleted file mode 100644
index 259f2418740cbfe47cdb4bd871d4f5c6306d25f5..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-session-log.pbtxt
+++ /dev/null
@@ -1,44 +0,0 @@
-path: "tensorflow.SessionLog"
-tf_proto {
-  descriptor {
-    name: "SessionLog"
-    field {
-      name: "status"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_ENUM
-      type_name: ".tensorflow.SessionLog.SessionStatus"
-    }
-    field {
-      name: "checkpoint_path"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "msg"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    enum_type {
-      name: "SessionStatus"
-      value {
-        name: "STATUS_UNSPECIFIED"
-        number: 0
-      }
-      value {
-        name: "START"
-        number: 1
-      }
-      value {
-        name: "STOP"
-        number: 2
-      }
-      value {
-        name: "CHECKPOINT"
-        number: 3
-      }
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-variable.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-variable.pbtxt
index e85949f23c968046f8a9cfa50ffd206a18e767e7..6136c8fbe79ef8d3851c39b8f11ac3c33f6050f2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.-variable.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-variable.pbtxt
@@ -63,6 +63,10 @@ tf_class {
     name: "assign_sub"
     argspec: "args=[\'self\', \'delta\', \'use_locking\', \'name\', \'read_value\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'True\'], "
   }
+  member_method {
+    name: "batch_scatter_update"
+    argspec: "args=[\'self\', \'sparse_delta\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
   member_method {
     name: "count_up_to"
     argspec: "args=[\'self\', \'limit\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.autograph.experimental.-feature.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.autograph.experimental.-feature.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a71da113b4ffcaa9ff71e18df4a9263b141b42e6
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.autograph.experimental.-feature.pbtxt
@@ -0,0 +1,28 @@
+path: "tensorflow.autograph.experimental.Feature"
+tf_class {
+  is_instance: "<enum \'Feature\'>"
+  member {
+    name: "ALL"
+    mtype: "<enum \'Feature\'>"
+  }
+  member {
+    name: "AUTO_CONTROL_DEPS"
+    mtype: "<enum \'Feature\'>"
+  }
+  member {
+    name: "DECORATORS"
+    mtype: "<enum \'Feature\'>"
+  }
+  member {
+    name: "ERROR_REWRITING"
+    mtype: "<enum \'Feature\'>"
+  }
+  member {
+    name: "LISTS"
+    mtype: "<enum \'Feature\'>"
+  }
+  member {
+    name: "NAME_SCOPES"
+    mtype: "<enum \'Feature\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.autograph.experimental.-verbosity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.autograph.experimental.-verbosity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c4d5b77c0738feb1fa6ea69672ee3fafa51de5be
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.autograph.experimental.-verbosity.pbtxt
@@ -0,0 +1,12 @@
+path: "tensorflow.autograph.experimental.Verbosity"
+tf_class {
+  is_instance: "<enum \'Verbosity\'>"
+  member {
+    name: "BRIEF"
+    mtype: "<enum \'Verbosity\'>"
+  }
+  member {
+    name: "VERBOSE"
+    mtype: "<enum \'Verbosity\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.autograph.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.autograph.experimental.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5747dac7ab201443d1f237415cd280aee672a8ff
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.autograph.experimental.pbtxt
@@ -0,0 +1,11 @@
+path: "tensorflow.autograph.experimental"
+tf_module {
+  member {
+    name: "Feature"
+    mtype: "<class \'enum.EnumMeta\'>"
+  }
+  member {
+    name: "Verbosity"
+    mtype: "<class \'enum.EnumMeta\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.autograph.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.autograph.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..12e23bc0c8fd0831471abcf56bcd8f07d3e6fe57
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.autograph.pbtxt
@@ -0,0 +1,15 @@
+path: "tensorflow.autograph"
+tf_module {
+  member {
+    name: "experimental"
+    mtype: "<type \'module\'>"
+  }
+  member_method {
+    name: "to_code"
+    argspec: "args=[\'entity\', \'recursive\', \'arg_values\', \'arg_types\', \'indentation\', \'experimental_optional_features\', \'experimental_partial_types\'], varargs=None, keywords=None, defaults=[\'True\', \'None\', \'None\', \'  \', \'Feature.ALL\', \'None\'], "
+  }
+  member_method {
+    name: "to_graph"
+    argspec: "args=[\'entity\', \'recursive\', \'arg_values\', \'arg_types\', \'experimental_optional_features\', \'experimental_strip_decorators\', \'experimental_verbose\', \'experimental_partial_types\'], varargs=None, keywords=None, defaults=[\'True\', \'None\', \'None\', \'Feature.ALL\', \'None\', \'Verbosity.BRIEF\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt
index 39a6e1ee7175b6ed10d7de00edda34d3829ea816..d877339409d781f95f7ff75a553d21d82c27fc40 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt
@@ -4,15 +4,15 @@ tf_class {
   is_instance: "<type \'object\'>"
   member {
     name: "output_classes"
-    mtype: "<class \'abc.abstractproperty\'>"
+    mtype: "<type \'property\'>"
   }
   member {
     name: "output_shapes"
-    mtype: "<class \'abc.abstractproperty\'>"
+    mtype: "<type \'property\'>"
   }
   member {
     name: "output_types"
-    mtype: "<class \'abc.abstractproperty\'>"
+    mtype: "<type \'property\'>"
   }
   member_method {
     name: "__init__"
@@ -61,14 +61,6 @@ tf_class {
     name: "list_files"
     argspec: "args=[\'file_pattern\', \'shuffle\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "make_initializable_iterator"
-    argspec: "args=[\'self\', \'shared_name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "make_one_shot_iterator"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt
index ef367238d0498dec5bf3960ebea300b02e43aab3..f1573512438b3f40db7653bf94fd4ad282a40acd 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt
@@ -64,14 +64,6 @@ tf_class {
     name: "list_files"
     argspec: "args=[\'file_pattern\', \'shuffle\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "make_initializable_iterator"
-    argspec: "args=[\'self\', \'shared_name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "make_one_shot_iterator"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt
index a8fc6fbec1d27034a9a8ebcd026244f7daa76eb6..690da98b1ac2097c4241ba3218caa3b476dbf397 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt
@@ -63,14 +63,6 @@ tf_class {
     name: "list_files"
     argspec: "args=[\'file_pattern\', \'shuffle\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "make_initializable_iterator"
-    argspec: "args=[\'self\', \'shared_name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "make_one_shot_iterator"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt
index 697f37134440d1b264c2f94463c77008ada3a280..fe0bc1a4db5d4a5e78ec7479e414545b522ec2df 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt
@@ -64,14 +64,6 @@ tf_class {
     name: "list_files"
     argspec: "args=[\'file_pattern\', \'shuffle\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "make_initializable_iterator"
-    argspec: "args=[\'self\', \'shared_name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "make_one_shot_iterator"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt
index 17ac098910d25d5501aff304289529be499d6bdb..261129b132189ef504678058f11651dd22bdce8c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt
@@ -64,14 +64,6 @@ tf_class {
     name: "list_files"
     argspec: "args=[\'file_pattern\', \'shuffle\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "make_initializable_iterator"
-    argspec: "args=[\'self\', \'shared_name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "make_one_shot_iterator"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-dataset-structure.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-dataset-structure.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dcb304f763ea44d0d7314248170e615115b0794c
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-dataset-structure.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.data.experimental.DatasetStructure"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetStructure\'>"
+  is_instance: "<class \'tensorflow.python.data.util.structure.Structure\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'element_structure\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_value"
+    argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_compatible_with"
+    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-nested-structure.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-nested-structure.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b4b066e563cc6196650b1ba561da7c16a80a8656
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-nested-structure.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.data.experimental.NestedStructure"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.util.structure.NestedStructure\'>"
+  is_instance: "<class \'tensorflow.python.data.util.structure.Structure\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'nested_structure\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_value"
+    argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_compatible_with"
+    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-optimization-options.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-optimization-options.pbtxt
index 9ca75828e55cdaeac5a493f49fe4bd963265e9d4..3b7ad64f51f88ae9c860e061db5c1ad6b5f2bcf8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-optimization-options.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-optimization-options.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.data.experimental.ops.optimization_options.OptimizationOptions\'>"
   is_instance: "<class \'tensorflow.python.data.util.options.OptionsBase\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "apply_default_optimizations"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "filter_fusion"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-optional-structure.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-optional-structure.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bf41c1d1d696d94ef9da5fc64272349d1533816e
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-optional-structure.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.data.experimental.OptionalStructure"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.ops.optional_ops.OptionalStructure\'>"
+  is_instance: "<class \'tensorflow.python.data.util.structure.Structure\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'value_structure\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_value"
+    argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_compatible_with"
+    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt
index f005d36e1a072fa390d9c3ba09be8a8045ab813a..0b34bbc94269280d6cca77bca789fb74f76629be 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt
@@ -64,14 +64,6 @@ tf_class {
     name: "list_files"
     argspec: "args=[\'file_pattern\', \'shuffle\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "make_initializable_iterator"
-    argspec: "args=[\'self\', \'shared_name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "make_one_shot_iterator"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sparse-tensor-structure.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sparse-tensor-structure.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f97376b328cf34eb04918bec7bacf08d254d8db5
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sparse-tensor-structure.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.data.experimental.SparseTensorStructure"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.util.structure.SparseTensorStructure\'>"
+  is_instance: "<class \'tensorflow.python.data.util.structure.Structure\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'dtype\', \'dense_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_value"
+    argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_compatible_with"
+    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt
index b0c0b73ad685dec8fa56580980507d609040a512..0e61890eee42a8b5b0df7bda0f99d189c4911eb9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt
@@ -64,14 +64,6 @@ tf_class {
     name: "list_files"
     argspec: "args=[\'file_pattern\', \'shuffle\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "make_initializable_iterator"
-    argspec: "args=[\'self\', \'shared_name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "make_one_shot_iterator"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-structure.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-structure.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a99db4542e0deb506d00c00f889299dd22d67e1e
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-structure.pbtxt
@@ -0,0 +1,16 @@
+path: "tensorflow.data.experimental.Structure"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.util.structure.Structure\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "from_value"
+    argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_compatible_with"
+    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-tensor-structure.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-tensor-structure.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f5c8864a9dd98058c659e72ba8059182a666ea39
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-tensor-structure.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.data.experimental.TensorStructure"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.util.structure.TensorStructure\'>"
+  is_instance: "<class \'tensorflow.python.data.util.structure.Structure\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'dtype\', \'shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_value"
+    argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_compatible_with"
+    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt
index b8ba3e341f1faeb353d9e0bfce8b43976d9529e4..a262c0f799634470090eeba90f480f94ac671f87 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt
@@ -12,6 +12,18 @@ tf_module {
     name: "CsvDataset"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "DatasetStructure"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "INFINITE_CARDINALITY"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "NestedStructure"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "OptimizationOptions"
     mtype: "<type \'type\'>"
@@ -20,6 +32,10 @@ tf_module {
     name: "Optional"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "OptionalStructure"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "RandomDataset"
     mtype: "<type \'type\'>"
@@ -28,6 +44,10 @@ tf_module {
     name: "Reducer"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SparseTensorStructure"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "SqlDataset"
     mtype: "<type \'type\'>"
@@ -40,14 +60,26 @@ tf_module {
     name: "StatsOptions"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "Structure"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "TFRecordWriter"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "TensorStructure"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "ThreadingOptions"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "UNKNOWN_CARDINALITY"
+    mtype: "<type \'int\'>"
+  }
   member_method {
     name: "Counter"
     argspec: "args=[\'start\', \'step\', \'dtype\'], varargs=None, keywords=None, defaults=[\'0\', \'1\', \"<dtype: \'int64\'>\"], "
@@ -56,6 +88,10 @@ tf_module {
     name: "bucket_by_sequence_length"
     argspec: "args=[\'element_length_func\', \'bucket_boundaries\', \'bucket_batch_sizes\', \'padded_shapes\', \'padding_values\', \'pad_to_bucket_boundary\', \'no_padding\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'False\', \'False\'], "
   }
+  member_method {
+    name: "cardinality"
+    argspec: "args=[\'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "choose_from_datasets"
     argspec: "args=[\'datasets\', \'choice_dataset\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-mirrored-strategy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-mirrored-strategy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..81224f00a4afdceba88b62192ad157573a7665ff
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-mirrored-strategy.pbtxt
@@ -0,0 +1,142 @@
+path: "tensorflow.distribute.MirroredStrategy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.mirrored_strategy.MirroredStrategy\'>"
+  is_instance: "<class \'tensorflow.python.distribute.distribute_lib.DistributionStrategy\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "between_graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "extended"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_replicas_in_sync"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "parameter_devices"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "require_static_shapes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "should_checkpoint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "should_init"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "should_save_summary"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "worker_devices"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'devices\', \'cross_device_ops\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "batch_reduce"
+    argspec: "args=[\'self\', \'aggregation\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "call_for_each_replica"
+    argspec: "args=[\'self\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "colocate_vars_with"
+    argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "configure"
+    argspec: "args=[\'self\', \'session_config\', \'cluster_spec\', \'task_type\', \'task_id\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "distribute_dataset"
+    argspec: "args=[\'self\', \'dataset_fn\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "experimental_finalize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "experimental_initialize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "experimental_run"
+    argspec: "args=[\'self\', \'fn\', \'input_iterator\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "finalize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "group"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "initialize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "make_dataset_iterator"
+    argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "make_input_fn_iterator"
+    argspec: "args=[\'self\', \'input_fn\', \'replication_mode\'], varargs=None, keywords=None, defaults=[\'InputReplicationMode.PER_WORKER\'], "
+  }
+  member_method {
+    name: "non_slot_devices"
+    argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "read_var"
+    argspec: "args=[\'self\', \'v\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "run_steps_on_dataset"
+    argspec: "args=[\'self\', \'fn\', \'iterator\', \'iterations\', \'initial_loop_values\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
+  }
+  member_method {
+    name: "scope"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "unwrap"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update"
+    argspec: "args=[\'self\', \'var\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "update_config_proto"
+    argspec: "args=[\'self\', \'config_proto\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_non_slot"
+    argspec: "args=[\'self\', \'colocate_with\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "value_container"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-server.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-server.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6c39bf4fc4099a753ceee4de0df990a887d2ab4e
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-server.pbtxt
@@ -0,0 +1,29 @@
+path: "tensorflow.distribute.Server"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.server_lib.Server\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "server_def"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "target"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'server_or_cluster_def\', \'job_name\', \'task_index\', \'protocol\', \'config\', \'start\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "create_local_server"
+    argspec: "args=[\'config\', \'start\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], "
+  }
+  member_method {
+    name: "join"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "start"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy.pbtxt
index 9eb73d2c0d9069ec4b818abe1825503f0ea36fc9..63b6584caf02adce52b90dd74ff63f88003de7c8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy.pbtxt
@@ -74,6 +74,10 @@ tf_class {
     name: "experimental_initialize"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "experimental_run"
+    argspec: "args=[\'self\', \'fn\', \'input_iterator\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "finalize"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.pbtxt
index 4d833b54ba0950b6b2cf40c958829dc2eeb24795..31dc6e071613bfe3d2ea24c65835f09bab90c400 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.distribute.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.pbtxt
@@ -8,6 +8,10 @@ tf_module {
     name: "InputReplicationMode"
     mtype: "<class \'enum.EnumMeta\'>"
   }
+  member {
+    name: "MirroredStrategy"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "ReduceOp"
     mtype: "<class \'enum.EnumMeta\'>"
@@ -16,6 +20,10 @@ tf_module {
     name: "ReplicaContext"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "Server"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Strategy"
     mtype: "<type \'type\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.dtypes.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.dtypes.pbtxt
index 848fc303aa5748348b2aee69ec1e869807327d3d..01b870a81639807489ec2a09dcc185137aae1665 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.dtypes.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.dtypes.pbtxt
@@ -44,6 +44,10 @@ tf_module {
     name: "half"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
   }
+  member {
+    name: "int16"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
   member {
     name: "int32"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-checkpoint-saver-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-checkpoint-saver-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f9e1504b494e3863f770df23f9f9a92e004b8713
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-checkpoint-saver-hook.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.estimator.CheckpointSaverHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.CheckpointSaverHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'checkpoint_dir\', \'save_secs\', \'save_steps\', \'saver\', \'checkpoint_basename\', \'scaffold\', \'listeners\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'model.ckpt\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-checkpoint-saver-listener.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-checkpoint-saver-listener.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..111b7583f2cd005912c7f06d977565cd17f265b8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-checkpoint-saver-listener.pbtxt
@@ -0,0 +1,24 @@
+path: "tensorflow.estimator.CheckpointSaverListener"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.CheckpointSaverListener\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "after_save"
+    argspec: "args=[\'self\', \'session\', \'global_step_value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_save"
+    argspec: "args=[\'self\', \'session\', \'global_step_value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\', \'global_step_value\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-feed-fn-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-feed-fn-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f24de493f24a363190cd1d323adaa75b32b0d8e3
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-feed-fn-hook.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.estimator.FeedFnHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.FeedFnHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'feed_fn\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-final-ops-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-final-ops-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6651170ba33f491d5a5342bcd6e6814e1b973832
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-final-ops-hook.pbtxt
@@ -0,0 +1,34 @@
+path: "tensorflow.estimator.FinalOpsHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.FinalOpsHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "final_ops_values"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'final_ops\', \'final_ops_feed_dict\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-global-step-waiter-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-global-step-waiter-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..37db48bc64e2f0e955105e8094d51c851c25558b
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-global-step-waiter-hook.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.estimator.GlobalStepWaiterHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.GlobalStepWaiterHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'wait_until_step\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-logging-tensor-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-logging-tensor-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..425f0167a161104891c3bb76816fe8c5094de28a
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-logging-tensor-hook.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.estimator.LoggingTensorHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.LoggingTensorHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'tensors\', \'every_n_iter\', \'every_n_secs\', \'at_end\', \'formatter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-nan-loss-during-training-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-nan-loss-during-training-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6cf6e17e4352b0f909b31327a57bbdca3bc0e02a
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-nan-loss-during-training-error.pbtxt
@@ -0,0 +1,12 @@
+path: "tensorflow.estimator.NanLossDuringTrainingError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.NanLossDuringTrainingError\'>"
+  is_instance: "<type \'exceptions.RuntimeError\'>"
+  member {
+    name: "args"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-nan-tensor-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-nan-tensor-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..82293c2c0c4e7204d9aba83f43ed2fac6bc46b19
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-nan-tensor-hook.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.estimator.NanTensorHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.NanTensorHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'loss_tensor\', \'fail_on_nan_loss\'], varargs=None, keywords=None, defaults=[\'True\'], "
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-profiler-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-profiler-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..65b5fb16b0874e7c6469ef11420db146be1f0b5f
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-profiler-hook.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.estimator.ProfilerHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.ProfilerHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'save_steps\', \'save_secs\', \'output_dir\', \'show_dataflow\', \'show_memory\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'\', \'True\', \'False\'], "
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-second-or-step-timer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-second-or-step-timer.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..64051d2bd6b69614cd210d902552ddeb8b6c8e5e
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-second-or-step-timer.pbtxt
@@ -0,0 +1,26 @@
+path: "tensorflow.estimator.SecondOrStepTimer"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.SecondOrStepTimer\'>"
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks._HookTimer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'every_secs\', \'every_steps\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "last_triggered_step"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "should_trigger_for_step"
+    argspec: "args=[\'self\', \'step\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_last_triggered_step"
+    argspec: "args=[\'self\', \'step\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-session-run-args.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-session-run-args.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b375c7429469d2a8b89d1bcd048599d6478624ae
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-session-run-args.pbtxt
@@ -0,0 +1,27 @@
+path: "tensorflow.estimator.SessionRunArgs"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunArgs\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunArgs\'>"
+  is_instance: "<type \'tuple\'>"
+  member {
+    name: "feed_dict"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "fetches"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "options"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "count"
+  }
+  member_method {
+    name: "index"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-session-run-context.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-session-run-context.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cb4ac9f50ec9aa9d6531a16ebb48a9223cbc5188
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-session-run-context.pbtxt
@@ -0,0 +1,25 @@
+path: "tensorflow.estimator.SessionRunContext"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunContext\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "original_args"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "session"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "stop_requested"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'original_args\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "request_stop"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-session-run-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-session-run-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..54e9ad9ed44b64e2c1c49b5ade4c7d3bb35563de
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-session-run-hook.pbtxt
@@ -0,0 +1,28 @@
+path: "tensorflow.estimator.SessionRunHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-session-run-values.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-session-run-values.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..678814169635bfa9997db26df23acc79c2d84881
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-session-run-values.pbtxt
@@ -0,0 +1,27 @@
+path: "tensorflow.estimator.SessionRunValues"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunValues\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunValues\'>"
+  is_instance: "<type \'tuple\'>"
+  member {
+    name: "options"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "results"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "run_metadata"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "count"
+  }
+  member_method {
+    name: "index"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-step-counter-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-step-counter-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4368e04df3f86834b540bb5306bf66dd82ac440c
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-step-counter-hook.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.estimator.StepCounterHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.StepCounterHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'every_n_steps\', \'every_n_secs\', \'output_dir\', \'summary_writer\'], varargs=None, keywords=None, defaults=[\'100\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-stop-at-step-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-stop-at-step-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..938b189a8c30237bb15bf73083a348e6366fbfc4
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-stop-at-step-hook.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.estimator.StopAtStepHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.StopAtStepHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_steps\', \'last_step\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-summary-saver-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-summary-saver-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..104157315f5982efb4f6b9f39e0ece905a225e10
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-summary-saver-hook.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.estimator.SummarySaverHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.SummarySaverHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'save_steps\', \'save_secs\', \'output_dir\', \'summary_writer\', \'scaffold\', \'summary_op\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.experimental.-in-memory-evaluator-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.experimental.-in-memory-evaluator-hook.pbtxt
index aba120218cc599039a501d6b2a6e754ae3ea5b5e..5a2a01cd5325ba7e02d9b549293dd09a4a57e167 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.experimental.-in-memory-evaluator-hook.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.experimental.-in-memory-evaluator-hook.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.estimator.experimental.InMemoryEvaluatorHook"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.hooks.InMemoryEvaluatorHook\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.hooks.hooks.InMemoryEvaluatorHook\'>"
   is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
   is_instance: "<type \'object\'>"
   member_method {
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.experimental.pbtxt
index 2a9a0346d72d29524d697e53ae2872608c837caf..f0fd7ce782db71ff5e790fe50e93556bf5d19e1e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.experimental.pbtxt
@@ -28,6 +28,10 @@ tf_module {
     name: "make_early_stopping_hook"
     argspec: "args=[\'estimator\', \'should_stop_fn\', \'run_every_secs\', \'run_every_steps\'], varargs=None, keywords=None, defaults=[\'60\', \'None\'], "
   }
+  member_method {
+    name: "make_stop_at_checkpoint_step_hook"
+    argspec: "args=[\'estimator\', \'last_step\', \'wait_after_file_check_secs\'], varargs=None, keywords=None, defaults=[\'30\'], "
+  }
   member_method {
     name: "stop_if_higher_hook"
     argspec: "args=[\'estimator\', \'metric_name\', \'threshold\', \'eval_dir\', \'min_steps\', \'run_every_secs\', \'run_every_steps\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'60\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.pbtxt
index c5b0085b8d3ec58b4215d4a756957e1509501841..6f57505afe84f3982a8beb402783f35b3e699241 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.pbtxt
@@ -24,6 +24,14 @@ tf_module {
     name: "BoostedTreesRegressor"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "CheckpointSaverHook"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CheckpointSaverListener"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "DNNClassifier"
     mtype: "<type \'type\'>"
@@ -64,10 +72,22 @@ tf_module {
     name: "Exporter"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "FeedFnHook"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "FinalExporter"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "FinalOpsHook"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "GlobalStepWaiterHook"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "LatestExporter"
     mtype: "<type \'type\'>"
@@ -84,14 +104,62 @@ tf_module {
     name: "LinearRegressor"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "LoggingTensorHook"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "ModeKeys"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "NanLossDuringTrainingError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "NanTensorHook"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "ProfilerHook"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "RunConfig"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SecondOrStepTimer"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SessionRunArgs"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SessionRunContext"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SessionRunHook"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SessionRunValues"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "StepCounterHook"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "StopAtStepHook"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SummarySaverHook"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "TrainSpec"
     mtype: "<type \'type\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.io.gfile.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.io.gfile.pbtxt
index 93d9b0fd75b53e6b15e34506e698855903b5be5a..cfa3372b12bfe32eed4311c89b6448c0359c0913 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.io.gfile.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.io.gfile.pbtxt
@@ -46,6 +46,6 @@ tf_module {
   }
   member_method {
     name: "walk"
-    argspec: "args=[\'top\', \'topdown\', \'onerror\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'top\', \'topdown\', \'onerror\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.io.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.io.pbtxt
index 8906329742c61ed08a25bcc252ec0d1dfa9e374e..2d9c759e3cf92a2368fd904fa57eec2413dbba8f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.io.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.io.pbtxt
@@ -8,22 +8,6 @@ tf_module {
     name: "FixedLenSequenceFeature"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "PaddingFIFOQueue"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "PriorityQueue"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "QueueBase"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "RandomShuffleQueue"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "SparseFeature"
     mtype: "<type \'type\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt
index 9dc8daea5c4e8e6293b2427add50ad4ebfbc264e..eced2e1cb0706153a9bfc2749847395d194fcb56 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -163,11 +167,11 @@ tf_class {
   }
   member_method {
     name: "evaluate"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "evaluate_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "fit"
@@ -231,16 +235,20 @@ tf_class {
   }
   member_method {
     name: "predict"
-    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "predict_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "predict_on_batch"
     argspec: "args=[\'self\', \'x\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "reset_metrics"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "reset_states"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -263,7 +271,7 @@ tf_class {
   }
   member_method {
     name: "test_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], "
   }
   member_method {
     name: "to_json"
@@ -275,6 +283,6 @@ tf_class {
   }
   member_method {
     name: "train_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt
index a357a825153528bdff75e9f73ec8d99545d71120..2acb90173f3242e8a92913728eec84ef5d455d1f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -168,11 +172,11 @@ tf_class {
   }
   member_method {
     name: "evaluate"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "evaluate_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "fit"
@@ -240,7 +244,7 @@ tf_class {
   }
   member_method {
     name: "predict"
-    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "predict_classes"
@@ -248,7 +252,7 @@ tf_class {
   }
   member_method {
     name: "predict_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "predict_on_batch"
@@ -258,6 +262,10 @@ tf_class {
     name: "predict_proba"
     argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\'], varargs=None, keywords=None, defaults=[\'32\', \'0\'], "
   }
+  member_method {
+    name: "reset_metrics"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "reset_states"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -280,7 +288,7 @@ tf_class {
   }
   member_method {
     name: "test_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], "
   }
   member_method {
     name: "to_json"
@@ -292,6 +300,6 @@ tf_class {
   }
   member_method {
     name: "train_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-base-logger.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-base-logger.pbtxt
index 7d298e95135ebf41230d72ff488fef30be682edb..9dbdaf0f5f3db292feb98fe06092b6f7a6b8f034 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-base-logger.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-base-logger.pbtxt
@@ -23,6 +23,38 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_batch_begin"
     argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt
index 133205ab88b47afad32fc70ceca93513768a3b19..a5804d3bbcff401920ddd2b59bd5f094b3e1c628 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt
@@ -17,12 +17,44 @@ tf_class {
   }
   member_method {
     name: "on_epoch_begin"
-    argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'epoch\', \'logs\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'train\'], "
   }
   member_method {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_batch_begin"
     argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-callback.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-callback.pbtxt
index d766c09ac5efaa9d0e4ffba4e495385130c7e770..bbc02c4d71f835497be74e771c5ae57682f5a5b3 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-callback.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-callback.pbtxt
@@ -16,11 +16,43 @@ tf_class {
   }
   member_method {
     name: "on_epoch_begin"
-    argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'epoch\', \'logs\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'train\'], "
   }
   member_method {
     name: "on_epoch_end"
-    argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'epoch\', \'logs\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'train\'], "
+  }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "on_train_batch_begin"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-early-stopping.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-early-stopping.pbtxt
index 605f74e5602a63f5a18c31cb26113d300ec76e7a..6182baf0a31e7027b685561fed5eeedc54a766a3 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-early-stopping.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-early-stopping.pbtxt
@@ -21,12 +21,44 @@ tf_class {
   }
   member_method {
     name: "on_epoch_begin"
-    argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'epoch\', \'logs\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'train\'], "
   }
   member_method {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_batch_begin"
     argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-history.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-history.pbtxt
index cd893e67269164781d6a6b6294a199014d40fed8..9b1b068e225a5dae69672ecba70bdea48c6e6ae6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-history.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-history.pbtxt
@@ -17,12 +17,44 @@ tf_class {
   }
   member_method {
     name: "on_epoch_begin"
-    argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'epoch\', \'logs\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'train\'], "
   }
   member_method {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_batch_begin"
     argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-lambda-callback.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-lambda-callback.pbtxt
index 50f2054cabb1b8f6c46a9537ea923a18f87e5c80..92440188c81ee192df332cd89256233591b2d281 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-lambda-callback.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-lambda-callback.pbtxt
@@ -17,11 +17,43 @@ tf_class {
   }
   member_method {
     name: "on_epoch_begin"
-    argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'epoch\', \'logs\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'train\'], "
   }
   member_method {
     name: "on_epoch_end"
-    argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'epoch\', \'logs\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'train\'], "
+  }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "on_train_batch_begin"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt
index 9ed9db0a89b49b88098e15baca414ff78b6f10e6..a04ffb92eb9e32b2473355f140d68537b80074df 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt
@@ -23,6 +23,38 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_batch_begin"
     argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-model-checkpoint.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-model-checkpoint.pbtxt
index 3d8d1363bb4e4de818788efbf3c997594350006a..c10c236ad1990160be53ba5df7afeb64619bf260 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-model-checkpoint.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-model-checkpoint.pbtxt
@@ -17,12 +17,44 @@ tf_class {
   }
   member_method {
     name: "on_epoch_begin"
-    argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'epoch\', \'logs\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'train\'], "
   }
   member_method {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_batch_begin"
     argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-progbar-logger.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-progbar-logger.pbtxt
index 5012f1517d57dd646d82ab669cb279b6363dd6ec..624f856d2752e1f375154664a892d6c1d600ecbd 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-progbar-logger.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-progbar-logger.pbtxt
@@ -23,6 +23,38 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_batch_begin"
     argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt
index 73652c2b61259f768eca76b995ae4592df868392..0db6b8d371b61db6fa565a93416dfc14eeae1d47 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt
@@ -21,12 +21,44 @@ tf_class {
   }
   member_method {
     name: "on_epoch_begin"
-    argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'epoch\', \'logs\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'train\'], "
   }
   member_method {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_batch_begin"
     argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-remote-monitor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-remote-monitor.pbtxt
index 24db71de1182d58b78fec0419aa9cb48a2e315d2..dac2049fe19426738368009822ce2dac8bc64467 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-remote-monitor.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-remote-monitor.pbtxt
@@ -17,12 +17,44 @@ tf_class {
   }
   member_method {
     name: "on_epoch_begin"
-    argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'epoch\', \'logs\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'train\'], "
   }
   member_method {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_batch_begin"
     argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-tensor-board.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-tensor-board.pbtxt
index c5503c69a5f3cb6765c984778c0e3626369ee815..2e0f77eda85780cec26b103ba11276ccdfd90189 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-tensor-board.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-tensor-board.pbtxt
@@ -23,6 +23,38 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_batch_begin"
     argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt
index de6e8ef072558e6d926ea125aa5056e3c229d37f..2834b74e8afbd5ee01eb77b8b14e75fc4e50f230 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt
@@ -17,11 +17,43 @@ tf_class {
   }
   member_method {
     name: "on_epoch_begin"
-    argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'epoch\', \'logs\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'train\'], "
   }
   member_method {
     name: "on_epoch_end"
-    argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'epoch\', \'logs\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'train\'], "
+  }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "on_train_batch_begin"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
index 1d814b2c8b553f1b2a07f9d9b97dc70ec0674969..b2ab5006dc4ac3571b4f9d01607adb6aa2c0be26 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.pbtxt
index 164edbd66ab2487a980155eabcf18ed8446e2c14..5cd6851278dce8ef45c90112176be94b9c45dc91 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.pbtxt
@@ -4,4 +4,12 @@ tf_module {
     name: "PeepholeLSTMCell"
     mtype: "<type \'type\'>"
   }
+  member_method {
+    name: "export"
+    argspec: "args=[\'model\', \'saved_model_path\', \'custom_objects\', \'as_text\', \'input_signature\', \'serving_only\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\'], "
+  }
+  member_method {
+    name: "load_from_saved_model"
+    argspec: "args=[\'saved_model_path\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activation.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activation.pbtxt
index b84629540e700f242f885064c92309c294693a11..da212382c1a6a3c5d37afbd1ac895249b566a913 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activation.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activation.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activity-regularization.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activity-regularization.pbtxt
index 5918a13ad8629582829049485e896688ecad9579..c910db027e69f3ca21495c968ebeae691711c316 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activity-regularization.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activity-regularization.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-add.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-add.pbtxt
index 599da06427dfe4f28e757a7aac8d8a14856a4556..8b7b33e98ce2673ffb5dcf951a8cd6a684d847af 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-add.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-add.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-alpha-dropout.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-alpha-dropout.pbtxt
index f9ff1538c8134d96051ad81d35c73e59c6a8cc57..5e3e41ba205c70413b7d015141b92c206ea26f32 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-alpha-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-alpha-dropout.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt
index 723fc9cdb0d0ad93470e22fd8c147d3ecc92af91..e160b1015380fcf9f3a7a8f4a41df6877cbf9246 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling2-d.pbtxt
index 957ce2f0ce86f8df3eb8b57606229fb661eb52f7..b6b71358c869ff6210e9a704f79cbd63970b5dcd 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling2-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling3-d.pbtxt
index a52c0af68175420dc2a1993d1f025d36705538e1..5c5ab1580eb3d6ce02498b1bc42aefc39784abf4 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling3-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average.pbtxt
index a004db62ddcaaae02a411d8db51f4026ece1384d..489de2e4d31d8c631ea11f8a50c91498a70fa308 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt
index 44f83d1387cb2ec681f50f7b1f0297f3f74594ed..30fec249b838350ac4ef542dd0f1969b0ddd7588 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool2-d.pbtxt
index 8378faf7188ec594865d4b68c8ea8cae284183ca..0e983c9234597a17c6c9342eaa3b3a26158736fa 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool2-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool3-d.pbtxt
index 9d5655c9644e3a2394a346bed78fc478cf60ba8d..ec50db71279b5e688ef36558941071fbba3c02f4 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool3-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-batch-normalization.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-batch-normalization.pbtxt
index 5da79268129fc5c08cbd37686333847cbb32730d..36ea9d58519d1638ca25f31bc1ce3dcbcd51aeb3 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-batch-normalization.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-batch-normalization.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-bidirectional.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-bidirectional.pbtxt
index d37a6b47105225d7b83b6a264b944ceeb583a6c4..23153d42847ad6015ccc347b70d35b7f3b83dc03 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-bidirectional.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-bidirectional.pbtxt
@@ -17,6 +17,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-concatenate.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-concatenate.pbtxt
index 1ad7a91be0ba48d0dbab19da8c7cd9ca89095918..766c3f267f97f19d87cc39a24ae90dca796b4988 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-concatenate.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-concatenate.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
index cb9abc25396bb63a3c40de5cc52f9df7ed20071e..898098227190498a5a752a493e3d9bccb431bf15 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
@@ -42,6 +42,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "filters"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt
index 47dba1d81f8f97a60fe72ec521f82a78ee5f3505..a74b8d29502f0493a99b16d8fdeccf77e205be0e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
index fd649418961301f150aac3dabc1bdf0ade4a9c28..b093f8ead94199ba2a4861d0453ff5248b2d7fb0 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt
index 1b1425d53197db8b59abf51fe93c0b0c45299956..0ce9f6fd591f127eb2874397abce21e8451ba3b4 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
index 1741063fe8b09acf3865e0a135e96bb715dcdcfa..c1f5bfae0d35683e4e718a73add8f57be9473c72 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt
index 50feb4f458ad1a9cb2b2bfe5d67997b7551eed74..4aa872c4f04c2f0a3cddc83bc7c64700cb97ca3c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt
index faaa535df9fe03ad07862f0793f8ebea67b405ca..6e01f7c70c9987cfa651078175927edfaf1fd6ad 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
index 4079329d1ee2a61270fee38426bb8a0859c38ce3..c002042d7703bccb0af37cebe453803c9e9009e3 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt
index 32e56696e1617f7810792e3416a2ebb2037d23c2..f5e5446d2b995c9ba2707cc16376e8c639576c76 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
index 381abe73401fa3a588873d643324fc020c159e30..d5f36f4bc3d1b517a7f2dfaf3fed490df66a5fba 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt
index b3e4bf9689dc7e9db63de7f43e9dfa9ac4d42b02..346fec6056380842f4d5f40833cea82a540c088d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping1-d.pbtxt
index 7aeff8003c322e8a8168dd70481a8b30b08762a8..0f8fe9f05e0f577dca9e1f3225f3e14074fefa12 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping1-d.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping2-d.pbtxt
index a1728d9d4f9a1e677646db04c4d0df9572e21208..68fb7382a71cc0e3215daa43e2f1ea0f6de26e16 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping2-d.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping3-d.pbtxt
index 8d8fd142cc64ee113c4b6a7e4e2462ecc69b6028..deda82f9b3f020589c9673e9070ec40713846b7f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping3-d.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
deleted file mode 100644
index 7758209adf8fe7a1306fa5ef125935dafd925c3e..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
+++ /dev/null
@@ -1,197 +0,0 @@
-path: "tensorflow.keras.layers.CuDNNGRU"
-tf_class {
-  is_instance: "<class \'tensorflow.python.keras.layers.cudnn_recurrent.CuDNNGRU\'>"
-  is_instance: "<class \'tensorflow.python.keras.layers.cudnn_recurrent._CuDNNRNN\'>"
-  is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "activity_regularizer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "cell"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "inbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "losses"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "outbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "states"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "updates"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "weights"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'units\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'return_sequences\', \'return_state\', \'go_backwards\', \'stateful\'], varargs=None, keywords=kwargs, defaults=[\'glorot_uniform\', \'orthogonal\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'False\', \'False\', \'False\', \'False\'], "
-  }
-  member_method {
-    name: "add_loss"
-    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_metric"
-    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "add_update"
-    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_variable"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
-  }
-  member_method {
-    name: "apply"
-    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "call"
-    argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\', \'initial_state\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "compute_mask"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "count_params"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_initial_state"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "get_output_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_weights"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "reset_states"
-    argspec: "args=[\'self\', \'states\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "set_weights"
-    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
deleted file mode 100644
index 7c463ff1257599366be049edce6cc06140906286..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
+++ /dev/null
@@ -1,197 +0,0 @@
-path: "tensorflow.keras.layers.CuDNNLSTM"
-tf_class {
-  is_instance: "<class \'tensorflow.python.keras.layers.cudnn_recurrent.CuDNNLSTM\'>"
-  is_instance: "<class \'tensorflow.python.keras.layers.cudnn_recurrent._CuDNNRNN\'>"
-  is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "activity_regularizer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "cell"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "inbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "losses"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "outbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "states"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "updates"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "weights"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'units\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'unit_forget_bias\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'return_sequences\', \'return_state\', \'go_backwards\', \'stateful\'], varargs=None, keywords=kwargs, defaults=[\'glorot_uniform\', \'orthogonal\', \'zeros\', \'True\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'False\', \'False\', \'False\', \'False\'], "
-  }
-  member_method {
-    name: "add_loss"
-    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_metric"
-    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "add_update"
-    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_variable"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
-  }
-  member_method {
-    name: "apply"
-    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "call"
-    argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\', \'initial_state\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "compute_mask"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "count_params"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_initial_state"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "get_output_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_weights"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "reset_states"
-    argspec: "args=[\'self\', \'states\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "set_weights"
-    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt
index 0781a93bd56c5ebc77e1fb650497621e49d7ee1f..ff00ca1bb24eab0e35f04c232b3eb5252d645edb 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense.pbtxt
index 4960d0264e96e872ea5c49a8841cef20bd5eb37c..919aed5723c0464b8540ba1cfb971bb23bfef73a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
index 8fad7535f882718462a11e27e75732e3097cb87d..f590ce1ef71200854f62baf3c8746deefbaf8e46 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dot.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dot.pbtxt
index 5b425f2d4d7a8a897280490e26922766d8bf7065..db4261fadc76e2d953d477c472adcb422d48105e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dot.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dot.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dropout.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dropout.pbtxt
index f6c4d0a438ed027635b40ec992eb1bbcb5c9a3a1..7369552b3b8733b6c586888c643c9596bebcdded 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dropout.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-e-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-e-l-u.pbtxt
index 82b761fc1761bb3e7638f7a80bc80c6433162d04..f643ef9de28eed6756073d84553a4986fb0d338f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-e-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-e-l-u.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-embedding.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-embedding.pbtxt
index c9ff323877e06b6dff274644744d425e3a9b7932..ce053ae8c44353815d9f6872d1f8ab72ec93c4f0 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-embedding.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-embedding.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt
index 9b4165d4cbf88fefd2bb684dae70ea8afc01357b..db9504307798cf5e51a28469a3df669dd77dc0b8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt
index f225f7c4309615919fb05df05f2ae664bde80097..a6edba6b7efc631cc1057a8ddb7d4af19142ac6d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt
index 855d001700179fb634d1dff78585d340420abe7f..df2ea3fbe9a20987892a971499a671f7268c23e5 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt
@@ -1,5 +1,6 @@
 path: "tensorflow.keras.layers.GRU"
 tf_class {
+  is_instance: "<class \'tensorflow.python.keras.layers.recurrent.UnifiedGRU\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.GRU\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
@@ -33,6 +34,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "implementation"
     mtype: "<type \'property\'>"
@@ -155,7 +160,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'units\', \'activation\', \'recurrent_activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'dropout\', \'recurrent_dropout\', \'implementation\', \'return_sequences\', \'return_state\', \'go_backwards\', \'stateful\', \'unroll\', \'reset_after\'], varargs=None, keywords=kwargs, defaults=[\'tanh\', \'hard_sigmoid\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0.0\', \'0.0\', \'1\', \'False\', \'False\', \'False\', \'False\', \'False\', \'False\'], "
+    argspec: "args=[\'self\', \'units\', \'activation\', \'recurrent_activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'dropout\', \'recurrent_dropout\', \'implementation\', \'return_sequences\', \'return_state\', \'go_backwards\', \'stateful\', \'unroll\', \'time_major\', \'reset_after\'], varargs=None, keywords=kwargs, defaults=[\'tanh\', \'sigmoid\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0.0\', \'0.0\', \'1\', \'False\', \'False\', \'False\', \'False\', \'False\', \'False\', \'True\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-dropout.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-dropout.pbtxt
index 2c404c99cd2175cdc8b60b229e4410bf280ebcb7..ac4bbe7d19625bdf1b11f8c3dfc9bdf1ad5eaaf4 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-dropout.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-noise.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-noise.pbtxt
index 6f109d59d0f6fcd2b4650719e3b4f653baec7d23..947e3170aea6cfb26e6604f1ef950293fa4cf4ad 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-noise.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-noise.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
index 69f8a9031d32eb73bb44291cdf330d738d745cf9..17e202c5812f633e430a821dac5f424ae587ad47 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
index 4299f765e525b136e289bba169becec06e19ffb1..9772c5df9b525576a2b9702f238fc7d309b7561e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
index 9153a1a2406b6fc4ab60c80fee2f8d6d69b00b72..cd65075591d151c0e6538588af932a6cdab5c90c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
index 625e81fd2322ceba153fa65c138948ce43843089..0423de7a248c17b1232ea5b9689578f2d824cbdc 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
index 2fc769742c70c5665c9cb77ad246fcdb49366d5a..4471cba245469c636419209084d624d2138fd4d4 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
index e307a65c7c565660e1f2b6b6b74dc5970425eaa4..c0e7fae4564f2f253df4377076b0ec64cf2b5cab 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
index 4394ad0364e89fd3531d6625e52540991cadf973..6975a6e88d8822f5a817d4a178ab15104799b91a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
index 050ed39fe98dc7cfdf6febe45e235d3ae7cbf486..56bd70db7e18f61f8af8cd9f9d4439b544d5b380 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
index 436191821ef4689351b6124cf2a20afad917e4ab..656319920ec34891e22b7145da1f80f787681572 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
index 4ba540aa6adc72b572aa9340f89967d69ab78a3c..f815e669115eb21ca2f23909d6a36ede278ccbd2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
index a2e9322cb3fd4e56af708d5c4e17b660f7bc2247..f61f0e521b7962bbef1c916a5aa79c43e8ce4019 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
index 5d16a57fc1aeff9939220de8043fcae39e3d953e..c58c8ce63f50b6d5f2dc3428fd50726ddee720c6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-layer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-layer.pbtxt
index 9dd29c1251ef2eacaf535a3f10f3d42dc36624a2..0efe9a4297960644c20d16f097e816046bb2672c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-layer.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-layer.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
index 0045d5775e2c19df21428bd4420b6e5612c8002b..5caa02e71a10d92a3c0d68f20628b5391f80e260 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt
index 529c750f98715ec30313ed34c9023a845061a3df..33082a6f06c17232a136e612adc00f284f2787ec 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt
@@ -1,5 +1,6 @@
 path: "tensorflow.keras.layers.LSTM"
 tf_class {
+  is_instance: "<class \'tensorflow.python.keras.layers.recurrent.UnifiedLSTM\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.LSTM\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
@@ -33,6 +34,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "implementation"
     mtype: "<type \'property\'>"
@@ -155,7 +160,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'units\', \'activation\', \'recurrent_activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'unit_forget_bias\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'dropout\', \'recurrent_dropout\', \'implementation\', \'return_sequences\', \'return_state\', \'go_backwards\', \'stateful\', \'unroll\'], varargs=None, keywords=kwargs, defaults=[\'tanh\', \'hard_sigmoid\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'True\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0.0\', \'0.0\', \'1\', \'False\', \'False\', \'False\', \'False\', \'False\'], "
+    argspec: "args=[\'self\', \'units\', \'activation\', \'recurrent_activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'unit_forget_bias\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'dropout\', \'recurrent_dropout\', \'implementation\', \'return_sequences\', \'return_state\', \'go_backwards\', \'stateful\', \'time_major\', \'unroll\'], varargs=None, keywords=kwargs, defaults=[\'tanh\', \'sigmoid\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'True\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0.0\', \'0.0\', \'1\', \'False\', \'False\', \'False\', \'False\', \'False\', \'False\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt
index d4d1bc6b6bbf0ce39742b740aff6dc0c1cd464a1..381d5660b9846b9f2b90f630d724fb0561d6ca94 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer.pbtxt
index e1f5491180903f7d6931cc09755cabb715bbf233..36b0a86628b92c84c227eb59d55c9e9a12be053c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer.pbtxt
@@ -11,6 +11,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -81,7 +85,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'trainable\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'trainable\', \'name\', \'dtype\', \'dynamic\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\', \'None\', \'False\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-leaky-re-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
index 9b69d9a9447f42907236b5cc8c7672012f96c38a..b41662e63a8f9273062256ef7ee100d70900e22a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-linear-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-linear-model.pbtxt
index a94bd5930f4d9af9aaf9ec9b5cda9d678e698a19..5766528b31adfb27ddc5540232425610e737577d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-linear-model.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-linear-model.pbtxt
@@ -18,6 +18,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -168,11 +172,11 @@ tf_class {
   }
   member_method {
     name: "evaluate"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "evaluate_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "fit"
@@ -236,16 +240,20 @@ tf_class {
   }
   member_method {
     name: "predict"
-    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "predict_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "predict_on_batch"
     argspec: "args=[\'self\', \'x\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "reset_metrics"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "reset_states"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -268,7 +276,7 @@ tf_class {
   }
   member_method {
     name: "test_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], "
   }
   member_method {
     name: "to_json"
@@ -280,6 +288,6 @@ tf_class {
   }
   member_method {
     name: "train_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected1-d.pbtxt
index fd52259432577ac94dc702d4411ad5c0eed1ff10..e4abfca91363887b9574b76894da24c9700102cf 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected1-d.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected2-d.pbtxt
index 5fc8af0d03564c649dff6e9df70d10731319de40..cfcb92e293c59493c7e57ebdb30ac2f2ab35715b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected2-d.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-masking.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-masking.pbtxt
index 7f8932270e63bc02852c5b64e53694e7e26be08b..e0721353d14d87d2d1e9e204eb9d5b4fe5902b3f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-masking.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-masking.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt
index 4723b99cb0792e1ce0bdc45e46908da8c2b5359c..0618fbeead02e74e645a2b6be1310f8fd0c00470 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool2-d.pbtxt
index 173c5d4a8b149c4e23683cf375e8d793db7faa5a..4af52ffec80937d32e8cc0e0b128a8db606fd94d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool2-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool3-d.pbtxt
index 14e1899e145224e411d65cbf481060a3b2cec0f1..db9311ee58d441908fb5c4ce3d952bafdab9dfcd 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool3-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt
index a708e652bf0e82dea0f58034a81a040a39550dc9..bfb15cb44789d9d8d134a5090bf27abf2f81eda2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling2-d.pbtxt
index e6706b5cf9f32bda78adc4e2db5916a5750cc82e..1db962dbb8c0ac2b0562ecce10354a76d3e74be4 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling2-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling3-d.pbtxt
index a73c082d1bba0453b742f76bacf0ad6116ba79a7..f80d5267e79c4b74831b2b926beb84d479008e10 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling3-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-maximum.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-maximum.pbtxt
index f3f195554bbf4a43efaf2af0fd278a23bf270994..cd772d4ac75e3ea4820a788543e15e3af3566b21 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-maximum.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-maximum.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-minimum.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-minimum.pbtxt
index f345d1d67b2ce0200c64b1aeea5f39821d070bac..2bb6b3073ac79cf475c942b68ac351a18073c689 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-minimum.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-minimum.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-multiply.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-multiply.pbtxt
index 31cb8bc177c7a9e365101e75108a29900fbda124..e1a1f0735524af6d3597dfff9ca64b3e7dbd5e2c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-multiply.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-multiply.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-p-re-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-p-re-l-u.pbtxt
index 44cccc92bd2f1ff0335c22f2967865dc88a96ff7..66c4446572c2ac5930a8a0bc0d5de96e584aa94e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-p-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-p-re-l-u.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-permute.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-permute.pbtxt
index b55e191ff1ad6997550966bbb6154a81a489575d..0839554f434f64cf957b17c8f5863655fb427ee4 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-permute.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-permute.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt
index e9575436e5b14ac8c52a0b59c86937886eab5f40..b10695f6f7965ab7d5dbca7128530348c8758179 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-re-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-re-l-u.pbtxt
index 98223b207f2ecfd5b7af8a53390166e53a7d4f73..b96500f710398514b37b5b6f32fe31c61aa99e44 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-re-l-u.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-repeat-vector.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-repeat-vector.pbtxt
index 2df918b16b2552323d75083bfa80e328c0639cfe..a27d93ec62002a9ade1a012c1bc9f8bc4f05e80f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-repeat-vector.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-repeat-vector.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-reshape.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-reshape.pbtxt
index ce5f9e21290eeddc0052257191ac4a6d068c1366..6dda24d3d27dcdbc88189e377ce20ec64d908dc9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-reshape.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-reshape.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv1-d.pbtxt
index a0bb917775fd9edb5d909bf850310e0596a88209..8a4ae8aaa7b91587c7f4e0a71eae6e0ac8598482 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv1-d.pbtxt
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv2-d.pbtxt
index d7942f201bdbfa8d1577813be461a5905b5c6c90..a083c1da2e3a0a450bb1f39dd12f3270bc49e1f6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv2-d.pbtxt
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
index f7ac9042d46f46ab35d18c62e5d8841679a18ca9..5d5b361f8272d273941e8beb1978d0ec8b406027 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
index e5a92688220f6e227b317d71a70fde01df4c432b..392c338d73d75e2af9b06be86d449d0ac3415c50 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
index 0fe2c974a762784a82a6b97e116357be2a61d84f..1143604903523b286f24cc6ca20b97b68e473593 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n.pbtxt
index 2ee5873f0f11688019dec3a6cd69db06d99b9caa..5a15f1a55fa0ce6db3357ab9a3e69d13846caaf9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n.pbtxt
@@ -33,6 +33,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-softmax.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-softmax.pbtxt
index 5b8f64aa35725d0ea44fc5c5b81952fd839503e7..c470d9c8e8d9281087e347881592c488f46212f7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-softmax.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-softmax.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
index 240cb6e562f77467d94ef95db2374150e318bc04..d17d6495c09b0e43041e85b8eb99d9d47212606d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
index 6226c469f8a534f96f6ea991fa5e7d2cf0019e3f..2d538b4734892b85034974887b7fa7dd024551b8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
index 34dabce6d8dd0b1b6fe50a008a981e1f06a77edf..b70923601aeb843fe663734d45493fa97757915f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
index 0ddf628ace582db259ebe0b211aba6e6362b5d5b..f453ddd50efb193accb2d9105fcaf8a130ca3b3f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-subtract.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-subtract.pbtxt
index 12eb35ad154a514afd9c900cb2dbece8af28c49f..5759169e07d26600a12b086edb8f945735782fed 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-subtract.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-subtract.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
index c41020c2b45cc88c9b63f3b7a45c35066794dfe2..bfde1c35f65a603bd38e1cbab6c2d5eb49ac40f3 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-time-distributed.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-time-distributed.pbtxt
index 479f89cf6ae93e8d6ae02e304a51a145164df7de..e7f59a9cc5143b337e539d28cd6d1ffd691b5e97 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-time-distributed.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-time-distributed.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling1-d.pbtxt
index 233363ce02614f184b43a059889c7475b6a8c50b..0354149d4fea06d489be61391c46e84d8b6c369a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling1-d.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling2-d.pbtxt
index cb6228ac446bd236df88f94eb6e9e717ea38463d..fff0e26bc16b863a1d86d3f735da009cedcaffd9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling2-d.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling3-d.pbtxt
index 03bad3ccb613a225ad56e128ea680fc9312151e1..c49fa5663d91c4601062d7b207ce2257cec6dd2c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling3-d.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-wrapper.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-wrapper.pbtxt
index 158996792a47fab0e7aa26d21d4bb7f281ca76d2..c961699053a4fdd71f8a2782ae463970f243c88e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-wrapper.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding1-d.pbtxt
index 63a56cd3eebe271f66258c9a0acb974764555b34..1911e128eb2d7b0ffe6c4ff7eeb0b4927a731bca 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding1-d.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding2-d.pbtxt
index 965a4cca04651e123c5bd93484200a58b39918ba..88be9143472ad00b12688059600890f67c6f4e92 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding2-d.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding3-d.pbtxt
index 1a624308878a68f1b48cb0f8b5e08dafbbfa0333..2bbb71ece2583d283cedec37b10eda7b693baa0e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding3-d.pbtxt
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.pbtxt
index 3b4724ef104878df0caada75b0ba68740dc93f8a..e84c9a2a8f178f0acf8305a77f6ea06c406b9888 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.pbtxt
@@ -112,14 +112,6 @@ tf_module {
     name: "Cropping3D"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "CuDNNGRU"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "CuDNNLSTM"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "Dense"
     mtype: "<type \'type\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-binary-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-binary-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2f7da93f6f412ca559aec2f6acde2b80a5c93c86
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-binary-crossentropy.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.BinaryCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.BinaryCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'from_logits\', \'label_smoothing\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-categorical-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b3a7cd80973259bd5cdfe382c656a9478f8933d8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-categorical-crossentropy.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.CategoricalCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.CategoricalCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'from_logits\', \'label_smoothing\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.pbtxt
index 5204a29a82d8b630a246677b32f4dae0180f612a..c198096d252cd9a3706bcbf6f1e4a1199ec7a1f7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.pbtxt
@@ -1,5 +1,13 @@
 path: "tensorflow.keras.losses"
 tf_module {
+  member {
+    name: "BinaryCrossentropy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CategoricalCrossentropy"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "MeanAbsoluteError"
     mtype: "<type \'type\'>"
@@ -42,11 +50,11 @@ tf_module {
   }
   member_method {
     name: "binary_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
   member_method {
     name: "categorical_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
   member_method {
     name: "categorical_hinge"
@@ -126,7 +134,7 @@ tf_module {
   }
   member_method {
     name: "sparse_categorical_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
   member_method {
     name: "squared_hinge"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-accuracy.pbtxt
index 2db07df5235e150f691a12d6b332c6d0d241ac19..bad488f59b99ccbe7c6424244c86288afba51f46 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-accuracy.pbtxt
@@ -15,6 +15,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-accuracy.pbtxt
index 904ad3a21a05895b23e30dab82a89a31c74dcfca..a1e7601a5141152c6709c46bb50b331fda69afca 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-accuracy.pbtxt
@@ -15,6 +15,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-accuracy.pbtxt
index 17b74924fab4f596a010d6b9731b474433a8153e..5f2c2f980777a34ed5128d8090ea7e945d9004e7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-accuracy.pbtxt
@@ -15,6 +15,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-negatives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-negatives.pbtxt
index 49f577e1367aece126449923f77f4f6c89493e99..c153e9cf4d7932b1e4bf65bd02b8de2706d4b8be 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-negatives.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-negatives.pbtxt
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-positives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-positives.pbtxt
index e8baf858669a446a11b44e044f36bfde61e440bb..aae2bd99886fbe93086186864eb6040437b872d5 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-positives.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-positives.pbtxt
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean.pbtxt
index 40fe64bbd2cec45b9a8c4e9b041d3fa858af1327..904a2fa9caee882775701c53a97c9aac0fd8120e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-precision.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-precision.pbtxt
index ae6a85026da80cd071984aede8d0ec4e9cd571c5..e81ecfe3f627f9d43ad1c673d41b70e81c783f13 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-precision.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-precision.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-recall.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-recall.pbtxt
index 31068a51d510a7b95f62f61f03d37176c0fca55d..f8470b94d7f52216d1c1e4342acabb404bbd8f74 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-recall.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-recall.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.metrics.-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt
similarity index 92%
rename from tensorflow/tools/api/golden/v1/tensorflow.metrics.-categorical-accuracy.pbtxt
rename to tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt
index 0ef75d8756f8b8f50c281f12e664f9989df951d6..b70ef32bcaf3cb243d5d22d93cdbd8188f56d4df 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.metrics.-categorical-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt
@@ -1,8 +1,7 @@
-path: "tensorflow.metrics.CategoricalAccuracy"
+path: "tensorflow.keras.metrics.SensitivityAtSpecificity"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.metrics.CategoricalAccuracy\'>"
-  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
-  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.SensitivityAtSpecificity\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.SensitivitySpecificityBase\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
@@ -15,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -85,7 +88,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'categorical_accuracy\', \'None\'], "
+    argspec: "args=[\'self\', \'specificity\', \'num_thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'200\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt
index 0c17452292a031d42f3da0d5844e99d1272dad25..2e693269bf749260e143cf19c6e1f51a5242412f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt
@@ -15,6 +15,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.metrics.-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt
similarity index 92%
rename from tensorflow/tools/api/golden/v1/tensorflow.metrics.-accuracy.pbtxt
rename to tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt
index f8e12f8817356477fe09b9efb4e1aef8b0469ec6..e62a2df0564a0eb4dba528dab575b7c08e41b913 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.metrics.-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt
@@ -1,8 +1,7 @@
-path: "tensorflow.metrics.Accuracy"
+path: "tensorflow.keras.metrics.SpecificityAtSensitivity"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.metrics.Accuracy\'>"
-  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
-  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.SpecificityAtSensitivity\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.SensitivitySpecificityBase\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
@@ -15,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -85,7 +88,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'accuracy\', \'None\'], "
+    argspec: "args=[\'self\', \'sensitivity\', \'num_thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'200\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-negatives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-negatives.pbtxt
index 1b5eb8d0de53960c3a98409119709c1307aa6379..1a524d73c0d387fe603846b5f180916829d65435 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-negatives.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-negatives.pbtxt
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-positives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-positives.pbtxt
index 5b9c470e32d7e038f9ba11e4f96ab6eaa6b60a87..b9b4f565c5eff9ece856255ffbe15af3fb97c2df 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-positives.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-positives.pbtxt
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.pbtxt
index cc90d0e7092e37993bff8c4883aa677b45e05499..905021dd790205e64a6f9839218200db98941927 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.pbtxt
@@ -32,10 +32,18 @@ tf_module {
     name: "Recall"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SensitivityAtSpecificity"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "SparseCategoricalAccuracy"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SpecificityAtSensitivity"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "TrueNegatives"
     mtype: "<type \'type\'>"
@@ -70,7 +78,7 @@ tf_module {
   }
   member_method {
     name: "binary_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
   member_method {
     name: "categorical_accuracy"
@@ -78,7 +86,7 @@ tf_module {
   }
   member_method {
     name: "categorical_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
   member_method {
     name: "cosine"
@@ -154,7 +162,7 @@ tf_module {
   }
   member_method {
     name: "sparse_categorical_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
   member_method {
     name: "sparse_top_k_categorical_accuracy"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt
index 41d8b2fc950d02ace5f0efc7f790aa0a9c022f5f..5885cd21c1976bd7b95f7ca5bbea59eeb40b2ce8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -163,11 +167,11 @@ tf_class {
   }
   member_method {
     name: "evaluate"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "evaluate_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "fit"
@@ -231,16 +235,20 @@ tf_class {
   }
   member_method {
     name: "predict"
-    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "predict_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "predict_on_batch"
     argspec: "args=[\'self\', \'x\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "reset_metrics"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "reset_states"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -263,7 +271,7 @@ tf_class {
   }
   member_method {
     name: "test_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], "
   }
   member_method {
     name: "to_json"
@@ -275,6 +283,6 @@ tf_class {
   }
   member_method {
     name: "train_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt
index 2cf107a5cd89805d590e0fc5a372ed3d18c914c0..935fa32f8c7f2d3b9c6b220a6b77a957d2c73f30 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -168,11 +172,11 @@ tf_class {
   }
   member_method {
     name: "evaluate"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "evaluate_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "fit"
@@ -240,7 +244,7 @@ tf_class {
   }
   member_method {
     name: "predict"
-    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "predict_classes"
@@ -248,7 +252,7 @@ tf_class {
   }
   member_method {
     name: "predict_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "predict_on_batch"
@@ -258,6 +262,10 @@ tf_class {
     name: "predict_proba"
     argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\'], varargs=None, keywords=None, defaults=[\'32\', \'0\'], "
   }
+  member_method {
+    name: "reset_metrics"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "reset_states"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -280,7 +288,7 @@ tf_class {
   }
   member_method {
     name: "test_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], "
   }
   member_method {
     name: "to_json"
@@ -292,6 +300,6 @@ tf_class {
   }
   member_method {
     name: "train_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adadelta.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adadelta.pbtxt
index b9ce154bddef609e0aaf6627d6f59de551e51e3b..54262697932738810406380504fba217e736b1b7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adadelta.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adadelta.pbtxt
@@ -1,15 +1,36 @@
 path: "tensorflow.keras.optimizers.Adadelta"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Adadelta\'>"
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Optimizer\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adadelta.Adadelta\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'lr\', \'rho\', \'epsilon\', \'decay\'], varargs=None, keywords=kwargs, defaults=[\'1.0\', \'0.95\', \'None\', \'0.0\'], "
+    argspec: "args=[\'self\', \'learning_rate\', \'rho\', \'epsilon\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.95\', \'1e-07\', \'Adadelta\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "get_config"
@@ -19,6 +40,14 @@ tf_class {
     name: "get_gradients"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_updates"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
@@ -27,8 +56,16 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adagrad.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adagrad.pbtxt
index d0dc9e37a386a26143365eb443d5ba5fce8a87d9..c39fe6ba4f7355e24bdaa5d7592f1ee7bd6de67f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adagrad.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adagrad.pbtxt
@@ -1,15 +1,36 @@
 path: "tensorflow.keras.optimizers.Adagrad"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Adagrad\'>"
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Optimizer\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adagrad.Adagrad\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'lr\', \'epsilon\', \'decay\'], varargs=None, keywords=kwargs, defaults=[\'0.01\', \'None\', \'0.0\'], "
+    argspec: "args=[\'self\', \'learning_rate\', \'initial_accumulator_value\', \'epsilon\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.1\', \'1e-07\', \'Adagrad\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "get_config"
@@ -19,6 +40,14 @@ tf_class {
     name: "get_gradients"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_updates"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
@@ -27,8 +56,16 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adam.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adam.pbtxt
index 06815fa99a4a474ec131c29d0cbc78bb2b9cb72d..05d46d380bf93631ea598efb0fce256f2e33a848 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adam.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adam.pbtxt
@@ -1,15 +1,36 @@
 path: "tensorflow.keras.optimizers.Adam"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Adam\'>"
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Optimizer\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adam.Adam\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'lr\', \'beta_1\', \'beta_2\', \'epsilon\', \'decay\', \'amsgrad\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.999\', \'None\', \'0.0\', \'False\'], "
+    argspec: "args=[\'self\', \'learning_rate\', \'beta_1\', \'beta_2\', \'epsilon\', \'amsgrad\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.999\', \'1e-07\', \'False\', \'Adam\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "get_config"
@@ -19,6 +40,14 @@ tf_class {
     name: "get_gradients"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_updates"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
@@ -27,8 +56,16 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adamax.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adamax.pbtxt
index 47b55fdb44e79e976b6de13d760a7cf175323c6c..78829def67d11e422aa33e06434e78d3048382d9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adamax.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adamax.pbtxt
@@ -1,15 +1,37 @@
 path: "tensorflow.keras.optimizers.Adamax"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Adamax\'>"
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Optimizer\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adamax.Adamax\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adam.Adam\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'lr\', \'beta_1\', \'beta_2\', \'epsilon\', \'decay\'], varargs=None, keywords=kwargs, defaults=[\'0.002\', \'0.9\', \'0.999\', \'None\', \'0.0\'], "
+    argspec: "args=[\'self\', \'learning_rate\', \'beta_1\', \'beta_2\', \'epsilon\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.999\', \'1e-07\', \'Adamax\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "get_config"
@@ -19,6 +41,14 @@ tf_class {
     name: "get_gradients"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_updates"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
@@ -27,8 +57,16 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-optimizer.pbtxt
index 53d64dae932e250b9d81b2767a833de3bac8c403..58b7f274916f378a0893b2addc99c3f4b68d108f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-optimizer.pbtxt
@@ -1,14 +1,35 @@
 path: "tensorflow.keras.optimizers.Optimizer"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Optimizer\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "get_config"
@@ -18,6 +39,14 @@ tf_class {
     name: "get_gradients"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_updates"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
@@ -26,8 +55,16 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-r-m-sprop.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-r-m-sprop.pbtxt
index a1e9b8cceb95e8f25ac5f414fadacf237be33cd9..8de796edde56b4639f1b59963383ed9f35a39f58 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-r-m-sprop.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-r-m-sprop.pbtxt
@@ -1,15 +1,36 @@
 path: "tensorflow.keras.optimizers.RMSprop"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.optimizers.RMSprop\'>"
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Optimizer\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.rmsprop.RMSprop\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'lr\', \'rho\', \'epsilon\', \'decay\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'None\', \'0.0\'], "
+    argspec: "args=[\'self\', \'learning_rate\', \'rho\', \'momentum\', \'epsilon\', \'centered\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.0\', \'1e-07\', \'False\', \'RMSprop\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "get_config"
@@ -19,6 +40,14 @@ tf_class {
     name: "get_gradients"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_updates"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
@@ -27,8 +56,16 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-s-g-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-s-g-d.pbtxt
index a67fefb1bafebd62db9f6108f0fe1847b5d2e0cb..393eeb3d6cab4ea4c9acf3f909edc0a929d51414 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-s-g-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-s-g-d.pbtxt
@@ -1,15 +1,36 @@
 path: "tensorflow.keras.optimizers.SGD"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.optimizers.SGD\'>"
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Optimizer\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.gradient_descent.SGD\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'lr\', \'momentum\', \'decay\', \'nesterov\'], varargs=None, keywords=kwargs, defaults=[\'0.01\', \'0.0\', \'0.0\', \'False\'], "
+    argspec: "args=[\'self\', \'learning_rate\', \'momentum\', \'nesterov\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.0\', \'False\', \'SGD\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "get_config"
@@ -19,6 +40,14 @@ tf_class {
     name: "get_gradients"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_updates"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
@@ -27,8 +56,16 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt
index a3599bfa801160da15aa2c51b52525c912b3ba3b..3e1e2e3d54de3e2442299a783f933a60dfd2db6d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt
@@ -132,6 +132,10 @@ tf_module {
     name: "lstsq"
     argspec: "args=[\'matrix\', \'rhs\', \'l2_regularizer\', \'fast\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'True\', \'None\'], "
   }
+  member_method {
+    name: "lu"
+    argspec: "args=[\'input\', \'output_idx_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
+  }
   member_method {
     name: "matmul"
     argspec: "args=[\'a\', \'b\', \'transpose_a\', \'transpose_b\', \'adjoint_a\', \'adjoint_b\', \'a_is_sparse\', \'b_is_sparse\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'False\', \'False\', \'False\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lite.-ops-set.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lite.-ops-set.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..68c651a3c9969f2f16fca39f4466cebbb44eea28
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lite.-ops-set.pbtxt
@@ -0,0 +1,12 @@
+path: "tensorflow.lite.OpsSet"
+tf_class {
+  is_instance: "<enum \'OpsSet\'>"
+  member {
+    name: "SELECT_TF_OPS"
+    mtype: "<enum \'OpsSet\'>"
+  }
+  member {
+    name: "TFLITE_BUILTINS"
+    mtype: "<enum \'OpsSet\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lite.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lite.pbtxt
index f5013c250be8477bb630d3d57ae88a501bb60b9b..154dd00821794ef4a5118e98d67e32beca38bebf 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.lite.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lite.pbtxt
@@ -8,6 +8,10 @@ tf_module {
     name: "OpHint"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "OpsSet"
+    mtype: "<class \'enum.EnumMeta\'>"
+  }
   member {
     name: "TFLiteConverter"
     mtype: "<type \'type\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.math.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.math.pbtxt
index 979d77ea6b3a2021c55b05b77a7ec9d27e43f297..4ac0484050054abee9496bcf09d90ff58bbfb9d7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.math.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.math.pbtxt
@@ -170,7 +170,7 @@ tf_module {
   }
   member_method {
     name: "in_top_k"
-    argspec: "args=[\'predictions\', \'targets\', \'k\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'targets\', \'predictions\', \'k\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "invert_permutation"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-accuracy.pbtxt
deleted file mode 100644
index f8e12f8817356477fe09b9efb4e1aef8b0469ec6..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-accuracy.pbtxt
+++ /dev/null
@@ -1,194 +0,0 @@
-path: "tensorflow.metrics.Accuracy"
-tf_class {
-  is_instance: "<class \'tensorflow.python.keras.metrics.Accuracy\'>"
-  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
-  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
-  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "activity_regularizer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "inbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "losses"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "outbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "updates"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "weights"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'accuracy\', \'None\'], "
-  }
-  member_method {
-    name: "add_loss"
-    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_metric"
-    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "add_update"
-    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_variable"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
-  }
-  member_method {
-    name: "apply"
-    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "call"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "compute_mask"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "count_params"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_weights"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "reset_states"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "result"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_weights"
-    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "update_state"
-    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-binary-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-binary-accuracy.pbtxt
deleted file mode 100644
index b9bc6a716a1d114330fce2521e238897bdae56d0..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-binary-accuracy.pbtxt
+++ /dev/null
@@ -1,194 +0,0 @@
-path: "tensorflow.metrics.BinaryAccuracy"
-tf_class {
-  is_instance: "<class \'tensorflow.python.keras.metrics.BinaryAccuracy\'>"
-  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
-  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
-  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "activity_regularizer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "inbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "losses"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "outbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "updates"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "weights"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'name\', \'dtype\', \'threshold\'], varargs=None, keywords=None, defaults=[\'binary_accuracy\', \'None\', \'0.5\'], "
-  }
-  member_method {
-    name: "add_loss"
-    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_metric"
-    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "add_update"
-    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_variable"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
-  }
-  member_method {
-    name: "apply"
-    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "call"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "compute_mask"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "count_params"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_weights"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "reset_states"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "result"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_weights"
-    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "update_state"
-    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-accuracy.pbtxt
deleted file mode 100644
index 0ef75d8756f8b8f50c281f12e664f9989df951d6..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-accuracy.pbtxt
+++ /dev/null
@@ -1,194 +0,0 @@
-path: "tensorflow.metrics.CategoricalAccuracy"
-tf_class {
-  is_instance: "<class \'tensorflow.python.keras.metrics.CategoricalAccuracy\'>"
-  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
-  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
-  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "activity_regularizer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "inbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "losses"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "outbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "updates"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "weights"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'categorical_accuracy\', \'None\'], "
-  }
-  member_method {
-    name: "add_loss"
-    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_metric"
-    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "add_update"
-    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_variable"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
-  }
-  member_method {
-    name: "apply"
-    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "call"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "compute_mask"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "count_params"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_weights"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "reset_states"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "result"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_weights"
-    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "update_state"
-    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-false-negatives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-false-negatives.pbtxt
deleted file mode 100644
index 33226a2df62bd69017c3f54020629d5429e39c06..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-false-negatives.pbtxt
+++ /dev/null
@@ -1,193 +0,0 @@
-path: "tensorflow.metrics.FalseNegatives"
-tf_class {
-  is_instance: "<class \'tensorflow.python.keras.metrics.FalseNegatives\'>"
-  is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
-  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "activity_regularizer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "inbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "losses"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "outbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "updates"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "weights"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "add_loss"
-    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_metric"
-    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "add_update"
-    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_variable"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
-  }
-  member_method {
-    name: "apply"
-    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "call"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "compute_mask"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "count_params"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_weights"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "reset_states"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "result"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_weights"
-    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "update_state"
-    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-false-positives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-false-positives.pbtxt
deleted file mode 100644
index 9953162ea3ec8ee7259bc8304052ab0754cfa630..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-false-positives.pbtxt
+++ /dev/null
@@ -1,193 +0,0 @@
-path: "tensorflow.metrics.FalsePositives"
-tf_class {
-  is_instance: "<class \'tensorflow.python.keras.metrics.FalsePositives\'>"
-  is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
-  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "activity_regularizer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "inbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "losses"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "outbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "updates"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "weights"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "add_loss"
-    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_metric"
-    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "add_update"
-    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_variable"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
-  }
-  member_method {
-    name: "apply"
-    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "call"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "compute_mask"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "count_params"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_weights"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "reset_states"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "result"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_weights"
-    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "update_state"
-    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean.pbtxt
deleted file mode 100644
index 7fe6d6fda9685e3f9f0ce29b81f260f3e41a7ef3..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean.pbtxt
+++ /dev/null
@@ -1,192 +0,0 @@
-path: "tensorflow.metrics.Mean"
-tf_class {
-  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
-  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "activity_regularizer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "inbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "losses"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "outbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "updates"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "weights"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean\', \'None\'], "
-  }
-  member_method {
-    name: "add_loss"
-    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_metric"
-    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "add_update"
-    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_variable"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
-  }
-  member_method {
-    name: "apply"
-    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "call"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "compute_mask"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "count_params"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_weights"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "reset_states"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "result"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_weights"
-    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "update_state"
-    argspec: "args=[\'self\', \'values\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-precision.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-precision.pbtxt
deleted file mode 100644
index 8c3271a109cb408492369f59c889fffa522e6d44..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-precision.pbtxt
+++ /dev/null
@@ -1,192 +0,0 @@
-path: "tensorflow.metrics.Precision"
-tf_class {
-  is_instance: "<class \'tensorflow.python.keras.metrics.Precision\'>"
-  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "activity_regularizer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "inbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "losses"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "outbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "updates"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "weights"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "add_loss"
-    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_metric"
-    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "add_update"
-    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_variable"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
-  }
-  member_method {
-    name: "apply"
-    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "call"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "compute_mask"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "count_params"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_weights"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "reset_states"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "result"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_weights"
-    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "update_state"
-    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-recall.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-recall.pbtxt
deleted file mode 100644
index 840a68bbc784b8570eea7a40d0e6174de60a7e9d..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-recall.pbtxt
+++ /dev/null
@@ -1,192 +0,0 @@
-path: "tensorflow.metrics.Recall"
-tf_class {
-  is_instance: "<class \'tensorflow.python.keras.metrics.Recall\'>"
-  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "activity_regularizer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "inbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "losses"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "outbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "updates"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "weights"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "add_loss"
-    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_metric"
-    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "add_update"
-    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_variable"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
-  }
-  member_method {
-    name: "apply"
-    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "call"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "compute_mask"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "count_params"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_weights"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "reset_states"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "result"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_weights"
-    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "update_state"
-    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-categorical-accuracy.pbtxt
deleted file mode 100644
index 7bce43fbdeb13591ab5a25b50a0d880702173d98..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-categorical-accuracy.pbtxt
+++ /dev/null
@@ -1,194 +0,0 @@
-path: "tensorflow.metrics.SparseCategoricalAccuracy"
-tf_class {
-  is_instance: "<class \'tensorflow.python.keras.metrics.SparseCategoricalAccuracy\'>"
-  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
-  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
-  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "activity_regularizer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "inbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "losses"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "outbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "updates"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "weights"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'sparse_categorical_accuracy\', \'None\'], "
-  }
-  member_method {
-    name: "add_loss"
-    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_metric"
-    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "add_update"
-    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_variable"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
-  }
-  member_method {
-    name: "apply"
-    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "call"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "compute_mask"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "count_params"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_weights"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "reset_states"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "result"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_weights"
-    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "update_state"
-    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-true-negatives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-true-negatives.pbtxt
deleted file mode 100644
index 83cd5b736bc9d0b55720e9bdac7047f940b259f1..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-true-negatives.pbtxt
+++ /dev/null
@@ -1,193 +0,0 @@
-path: "tensorflow.metrics.TrueNegatives"
-tf_class {
-  is_instance: "<class \'tensorflow.python.keras.metrics.TrueNegatives\'>"
-  is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
-  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "activity_regularizer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "inbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "losses"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "outbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "updates"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "weights"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "add_loss"
-    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_metric"
-    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "add_update"
-    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_variable"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
-  }
-  member_method {
-    name: "apply"
-    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "call"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "compute_mask"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "count_params"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_weights"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "reset_states"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "result"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_weights"
-    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "update_state"
-    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-true-positives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-true-positives.pbtxt
deleted file mode 100644
index 5b2502eafee7126993d1f40dca74e5cb16856b71..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-true-positives.pbtxt
+++ /dev/null
@@ -1,193 +0,0 @@
-path: "tensorflow.metrics.TruePositives"
-tf_class {
-  is_instance: "<class \'tensorflow.python.keras.metrics.TruePositives\'>"
-  is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
-  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "activity_regularizer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "inbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "losses"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "outbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "updates"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "weights"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "add_loss"
-    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_metric"
-    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "add_update"
-    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_variable"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
-  }
-  member_method {
-    name: "apply"
-    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "call"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "compute_mask"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "count_params"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_weights"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "reset_states"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "result"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_weights"
-    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "update_state"
-    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.pbtxt
deleted file mode 100644
index 773efd03fc8d1c422fc2e4b2400c4c536289d767..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.pbtxt
+++ /dev/null
@@ -1,47 +0,0 @@
-path: "tensorflow.metrics"
-tf_module {
-  member {
-    name: "Accuracy"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "BinaryAccuracy"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "CategoricalAccuracy"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "FalseNegatives"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "FalsePositives"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "Mean"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "Precision"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "Recall"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "SparseCategoricalAccuracy"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "TrueNegatives"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "TruePositives"
-    mtype: "<type \'type\'>"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.pbtxt
index 47794487600ef066b1309879614573e57e4c1c6e..c75c75f2ef7ca50cce15fe1dffb4d0de3f6815de 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.pbtxt
@@ -106,7 +106,7 @@ tf_module {
   }
   member_method {
     name: "depth_to_space"
-    argspec: "args=[\'input\', \'block_size\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'NHWC\'], "
+    argspec: "args=[\'input\', \'block_size\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
   }
   member_method {
     name: "depthwise_conv2d"
@@ -134,11 +134,11 @@ tf_module {
   }
   member_method {
     name: "embedding_lookup"
-    argspec: "args=[\'params\', \'ids\', \'partition_strategy\', \'name\', \'validate_indices\', \'max_norm\'], varargs=None, keywords=None, defaults=[\'mod\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'params\', \'ids\', \'partition_strategy\', \'max_norm\', \'name\'], varargs=None, keywords=None, defaults=[\'mod\', \'None\', \'None\'], "
   }
   member_method {
     name: "embedding_lookup_sparse"
-    argspec: "args=[\'params\', \'sp_ids\', \'sp_weights\', \'partition_strategy\', \'name\', \'combiner\', \'max_norm\'], varargs=None, keywords=None, defaults=[\'mod\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'params\', \'sp_ids\', \'sp_weights\', \'partition_strategy\', \'combiner\', \'max_norm\', \'name\'], varargs=None, keywords=None, defaults=[\'mod\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "erosion2d"
@@ -158,7 +158,7 @@ tf_module {
   }
   member_method {
     name: "in_top_k"
-    argspec: "args=[\'predictions\', \'targets\', \'k\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'targets\', \'predictions\', \'k\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "l2_loss"
@@ -210,7 +210,7 @@ tf_module {
   }
   member_method {
     name: "nce_loss"
-    argspec: "args=[\'weights\', \'biases\', \'labels\', \'inputs\', \'num_sampled\', \'num_classes\', \'num_true\', \'sampled_values\', \'remove_accidental_hits\', \'partition_strategy\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'None\', \'False\', \'mod\', \'nce_loss\'], "
+    argspec: "args=[\'weights\', \'biases\', \'labels\', \'inputs\', \'num_sampled\', \'num_classes\', \'num_true\', \'sampled_values\', \'remove_accidental_hits\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'None\', \'False\', \'nce_loss\'], "
   }
   member_method {
     name: "normalize_moments"
@@ -228,17 +228,13 @@ tf_module {
     name: "relu6"
     argspec: "args=[\'features\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "relu_layer"
-    argspec: "args=[\'x\', \'weights\', \'biases\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "safe_embedding_lookup_sparse"
-    argspec: "args=[\'embedding_weights\', \'sparse_ids\', \'sparse_weights\', \'combiner\', \'default_id\', \'name\', \'partition_strategy\', \'max_norm\'], varargs=None, keywords=None, defaults=[\'None\', \'mean\', \'None\', \'None\', \'div\', \'None\'], "
+    argspec: "args=[\'embedding_weights\', \'sparse_ids\', \'sparse_weights\', \'combiner\', \'default_id\', \'max_norm\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'mean\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "sampled_softmax_loss"
-    argspec: "args=[\'weights\', \'biases\', \'labels\', \'inputs\', \'num_sampled\', \'num_classes\', \'num_true\', \'sampled_values\', \'remove_accidental_hits\', \'partition_strategy\', \'name\', \'seed\'], varargs=None, keywords=None, defaults=[\'1\', \'None\', \'True\', \'mod\', \'sampled_softmax_loss\', \'None\'], "
+    argspec: "args=[\'weights\', \'biases\', \'labels\', \'inputs\', \'num_sampled\', \'num_classes\', \'num_true\', \'sampled_values\', \'remove_accidental_hits\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'None\', \'True\', \'None\', \'sampled_softmax_loss\'], "
   }
   member_method {
     name: "selu"
@@ -262,7 +258,7 @@ tf_module {
   }
   member_method {
     name: "softmax_cross_entropy_with_logits"
-    argspec: "args=[\'labels\', \'logits\', \'dim\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+    argspec: "args=[\'labels\', \'logits\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
   }
   member_method {
     name: "softplus"
@@ -274,11 +270,11 @@ tf_module {
   }
   member_method {
     name: "space_to_batch"
-    argspec: "args=[\'input\', \'paddings\', \'block_size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'input\', \'block_shape\', \'paddings\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "space_to_depth"
-    argspec: "args=[\'input\', \'block_size\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'NHWC\'], "
+    argspec: "args=[\'input\', \'block_size\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
   }
   member_method {
     name: "sparse_softmax_cross_entropy_with_logits"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
index 9e52a4252619ffc19b287fc1818fa6f772847335..3547b66d19ac6b64449860160774647df855a6de 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
index 9836433d08cba809107f9bb5dbccf2e971865b8a..7582fd52b63afdb8c6f2a5e7f0e6b26071232832 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
index d3b68e4f2976912ed65ba7916284c951fda03b05..c36ecaa4b2b2ce14292cd2c46a986bb1387294bd 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
index 1f7840ab919baeeb0077904592ba8dcc1d4c91fb..42128ebd17234fcee3b016bbd7f1964824d1a0b6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt
index f5e1545657e2095de6eeccbe58eed93d8f7ec75a..574b6778fad167e5f05900ca4b934b9b06d68bd5 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt
@@ -4,14 +4,6 @@ tf_module {
     name: "AggregationMethod"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "AttrValue"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
-  member {
-    name: "ConfigProto"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
   member {
     name: "DType"
     mtype: "<type \'type\'>"
@@ -20,14 +12,6 @@ tf_module {
     name: "Event"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
   }
-  member {
-    name: "FIFOQueue"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "GPUOptions"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
   member {
     name: "GradientTape"
     mtype: "<type \'type\'>"
@@ -36,70 +20,26 @@ tf_module {
     name: "Graph"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "GraphDef"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
-  member {
-    name: "GraphOptions"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
-  member {
-    name: "HistogramProto"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
   member {
     name: "IndexedSlices"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "LogMessage"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
-  member {
-    name: "MetaGraphDef"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
-  member {
-    name: "NameAttrList"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
-  member {
-    name: "NodeDef"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
   member {
     name: "Operation"
     mtype: "<type \'type\'>"
   }
   member {
-    name: "OptimizerOptions"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+    name: "RaggedTensor"
+    mtype: "<type \'type\'>"
   }
   member {
     name: "RegisterGradient"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "RunMetadata"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
-  member {
-    name: "RunOptions"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
-  member {
-    name: "SessionLog"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
   member {
     name: "SparseTensor"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "SparseTensorValue"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "Summary"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
@@ -140,6 +80,10 @@ tf_module {
     name: "VariableSynchronization"
     mtype: "<class \'enum.EnumMeta\'>"
   }
+  member {
+    name: "autograph"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "bfloat16"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
@@ -276,10 +220,6 @@ tf_module {
     name: "math"
     mtype: "<type \'module\'>"
   }
-  member {
-    name: "metrics"
-    mtype: "<type \'module\'>"
-  }
   member {
     name: "name_scope"
     mtype: "<type \'type\'>"
@@ -312,6 +252,10 @@ tf_module {
     name: "quantization"
     mtype: "<type \'module\'>"
   }
+  member {
+    name: "queue"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "quint16"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
@@ -320,6 +264,10 @@ tf_module {
     name: "quint8"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
   }
+  member {
+    name: "ragged"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "random"
     mtype: "<type \'module\'>"
@@ -492,10 +440,6 @@ tf_module {
     name: "batch_gather"
     argspec: "args=[\'params\', \'indices\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "batch_scatter_update"
-    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
-  }
   member_method {
     name: "batch_to_space"
     argspec: "args=[\'input\', \'block_shape\', \'crops\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -608,10 +552,6 @@ tf_module {
     name: "einsum"
     argspec: "args=[\'equation\'], varargs=inputs, keywords=kwargs, defaults=None"
   }
-  member_method {
-    name: "enable_eager_execution"
-    argspec: "args=[\'config\', \'device_policy\', \'execution_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
   member_method {
     name: "ensure_shape"
     argspec: "args=[\'x\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -1000,6 +940,10 @@ tf_module {
     name: "sort"
     argspec: "args=[\'values\', \'axis\', \'direction\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'ASCENDING\', \'None\'], "
   }
+  member_method {
+    name: "space_to_batch"
+    argspec: "args=[\'input\', \'block_shape\', \'paddings\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "space_to_batch_nd"
     argspec: "args=[\'input\', \'block_shape\', \'paddings\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -1036,10 +980,6 @@ tf_module {
     name: "string_split"
     argspec: "args=[\'source\', \'delimiter\', \'skip_empty\'], varargs=None, keywords=None, defaults=[\' \', \'True\'], "
   }
-  member_method {
-    name: "substr"
-    argspec: "args=[\'input\', \'pos\', \'len\', \'name\', \'unit\'], varargs=None, keywords=None, defaults=[\'None\', \'BYTE\'], "
-  }
   member_method {
     name: "subtract"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.queue.-f-i-f-o-queue.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.queue.-f-i-f-o-queue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..724ab5fe8283de44b20b059042f8d6744b11da19
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.queue.-f-i-f-o-queue.pbtxt
@@ -0,0 +1,66 @@
+path: "tensorflow.queue.FIFOQueue"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.FIFOQueue\'>"
+  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.QueueBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "dtypes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "names"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "queue_ref"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shapes"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'capacity\', \'dtypes\', \'shapes\', \'names\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'fifo_queue\'], "
+  }
+  member_method {
+    name: "close"
+    argspec: "args=[\'self\', \'cancel_pending_enqueues\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "dequeue"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "dequeue_many"
+    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "dequeue_up_to"
+    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "enqueue"
+    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "enqueue_many"
+    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_list"
+    argspec: "args=[\'index\', \'queues\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_closed"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "size"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.queue.-padding-f-i-f-o-queue.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.queue.-padding-f-i-f-o-queue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9ef0a4d9eb6bbfb69fddf3fe696e3f60ac3ef67b
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.queue.-padding-f-i-f-o-queue.pbtxt
@@ -0,0 +1,66 @@
+path: "tensorflow.queue.PaddingFIFOQueue"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.PaddingFIFOQueue\'>"
+  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.QueueBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "dtypes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "names"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "queue_ref"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shapes"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'capacity\', \'dtypes\', \'shapes\', \'names\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'padding_fifo_queue\'], "
+  }
+  member_method {
+    name: "close"
+    argspec: "args=[\'self\', \'cancel_pending_enqueues\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "dequeue"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "dequeue_many"
+    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "dequeue_up_to"
+    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "enqueue"
+    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "enqueue_many"
+    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_list"
+    argspec: "args=[\'index\', \'queues\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_closed"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "size"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.queue.-priority-queue.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.queue.-priority-queue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bb66beb13af18501912fda85b9c3dc67cdf21683
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.queue.-priority-queue.pbtxt
@@ -0,0 +1,66 @@
+path: "tensorflow.queue.PriorityQueue"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.PriorityQueue\'>"
+  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.QueueBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "dtypes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "names"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "queue_ref"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shapes"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'capacity\', \'types\', \'shapes\', \'names\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'priority_queue\'], "
+  }
+  member_method {
+    name: "close"
+    argspec: "args=[\'self\', \'cancel_pending_enqueues\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "dequeue"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "dequeue_many"
+    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "dequeue_up_to"
+    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "enqueue"
+    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "enqueue_many"
+    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_list"
+    argspec: "args=[\'index\', \'queues\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_closed"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "size"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.queue.-queue-base.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.queue.-queue-base.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8faaad22af6e0f920e26a44e1ebf294fc4b109c4
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.queue.-queue-base.pbtxt
@@ -0,0 +1,65 @@
+path: "tensorflow.queue.QueueBase"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.QueueBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "dtypes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "names"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "queue_ref"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shapes"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'dtypes\', \'shapes\', \'names\', \'queue_ref\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "close"
+    argspec: "args=[\'self\', \'cancel_pending_enqueues\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "dequeue"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "dequeue_many"
+    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "dequeue_up_to"
+    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "enqueue"
+    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "enqueue_many"
+    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_list"
+    argspec: "args=[\'index\', \'queues\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_closed"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "size"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.queue.-random-shuffle-queue.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.queue.-random-shuffle-queue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..31cd503b13040b119d4028f813c94689f8e2ebb3
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.queue.-random-shuffle-queue.pbtxt
@@ -0,0 +1,66 @@
+path: "tensorflow.queue.RandomShuffleQueue"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.RandomShuffleQueue\'>"
+  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.QueueBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "dtypes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "names"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "queue_ref"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shapes"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'capacity\', \'min_after_dequeue\', \'dtypes\', \'shapes\', \'names\', \'seed\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'random_shuffle_queue\'], "
+  }
+  member_method {
+    name: "close"
+    argspec: "args=[\'self\', \'cancel_pending_enqueues\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "dequeue"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "dequeue_many"
+    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "dequeue_up_to"
+    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "enqueue"
+    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "enqueue_many"
+    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_list"
+    argspec: "args=[\'index\', \'queues\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_closed"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "size"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.queue.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.queue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c16e95e2116b703434ed91106eb29e4beb5668f2
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.queue.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.queue"
+tf_module {
+  member {
+    name: "FIFOQueue"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "PaddingFIFOQueue"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "PriorityQueue"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "QueueBase"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "RandomShuffleQueue"
+    mtype: "<type \'type\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.ragged.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.ragged.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5fde488ffdd4dc30695407b5eba097585c885f65
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.ragged.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.ragged"
+tf_module {
+  member_method {
+    name: "constant"
+    argspec: "args=[\'pylist\', \'dtype\', \'ragged_rank\', \'inner_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "map_flat_values"
+    argspec: "args=[\'op\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "range"
+    argspec: "args=[\'starts\', \'limits\', \'deltas\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "row_splits_to_segment_ids"
+    argspec: "args=[\'splits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "segment_ids_to_row_splits"
+    argspec: "args=[\'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.random.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.random.pbtxt
index de5cb6b7172af32e3e246798c8d748c272dae097..d49c23e59cf036f05758f5c50208febf4b7381d5 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.random.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.random.pbtxt
@@ -16,6 +16,10 @@ tf_module {
     name: "gamma"
     argspec: "args=[\'shape\', \'alpha\', \'beta\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'float32\'>\", \'None\', \'None\'], "
   }
+  member_method {
+    name: "learned_unigram_candidate_sampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "log_uniform_candidate_sampler"
     argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt
index 9808200d72c66d940f694aecb4c7a958658a745e..b8bd2c0b72c1a78fb2abbfb319073fec267f56fb 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt
@@ -14,7 +14,7 @@ tf_module {
   }
   member_method {
     name: "concat"
-    argspec: "args=[\'axis\', \'sp_inputs\', \'expand_nonconcat_dim\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+    argspec: "args=[\'axis\', \'sp_inputs\', \'expand_nonconcat_dims\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
   }
   member_method {
     name: "cross"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt
index f6e32ed08c8339413374c11c6fc75aec92bffec2..962cf9a7239343e3b570d3a6d20edeeeb871b120 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt
@@ -52,6 +52,14 @@ tf_module {
     name: "to_number"
     argspec: "args=[\'input\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\'], "
   }
+  member_method {
+    name: "unicode_decode"
+    argspec: "args=[\'input\', \'input_encoding\', \'errors\', \'replacement_char\', \'replace_control_characters\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "unicode_decode_with_offsets"
+    argspec: "args=[\'input\', \'input_encoding\', \'errors\', \'replacement_char\', \'replace_control_characters\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'False\', \'None\'], "
+  }
   member_method {
     name: "unicode_encode"
     argspec: "args=[\'input\', \'output_encoding\', \'errors\', \'replacement_char\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'None\'], "
@@ -60,6 +68,14 @@ tf_module {
     name: "unicode_script"
     argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "unicode_split"
+    argspec: "args=[\'input\', \'input_encoding\', \'errors\', \'replacement_char\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'None\'], "
+  }
+  member_method {
+    name: "unicode_split_with_offsets"
+    argspec: "args=[\'input\', \'input_encoding\', \'errors\', \'replacement_char\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'None\'], "
+  }
   member_method {
     name: "unicode_transcode"
     argspec: "args=[\'input\', \'input_encoding\', \'output_encoding\', \'errors\', \'replacement_char\', \'replace_control_characters\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'False\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.summary.-session-log.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.-session-log.pbtxt
deleted file mode 100644
index 73de73869c8d1a6808b16fe8853fd21cc8891879..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.summary.-session-log.pbtxt
+++ /dev/null
@@ -1,44 +0,0 @@
-path: "tensorflow.summary.SessionLog"
-tf_proto {
-  descriptor {
-    name: "SessionLog"
-    field {
-      name: "status"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_ENUM
-      type_name: ".tensorflow.SessionLog.SessionStatus"
-    }
-    field {
-      name: "checkpoint_path"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "msg"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    enum_type {
-      name: "SessionStatus"
-      value {
-        name: "STATUS_UNSPECIFIED"
-        number: 0
-      }
-      value {
-        name: "START"
-        number: 1
-      }
-      value {
-        name: "STOP"
-        number: 2
-      }
-      value {
-        name: "CHECKPOINT"
-        number: 3
-      }
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.summary.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.pbtxt
index 42a74a65fbb02a85c73cd740cdfc5155bdefb0d7..61670bd15122f65ef05d20ee5d023a3c326f7757 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.summary.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.summary.pbtxt
@@ -12,10 +12,6 @@ tf_module {
     name: "FileWriterCache"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "SessionLog"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
   member {
     name: "Summary"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
@@ -44,4 +40,12 @@ tf_module {
     name: "import_event"
     argspec: "args=[\'tensor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "summary_scope"
+    argspec: "args=[\'name\', \'default_name\', \'values\'], varargs=None, keywords=None, defaults=[\'summary\', \'None\'], "
+  }
+  member_method {
+    name: "write"
+    argspec: "args=[\'tag\', \'tensor\', \'step\', \'metadata\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.test.-stub-out-for-testing.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.test.-stub-out-for-testing.pbtxt
deleted file mode 100644
index e02a0c6097c5ea4dae905b25cd0e381f5e257105..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.test.-stub-out-for-testing.pbtxt
+++ /dev/null
@@ -1,28 +0,0 @@
-path: "tensorflow.test.StubOutForTesting"
-tf_class {
-  is_instance: "<class \'tensorflow.python.platform.googletest.StubOutForTesting\'>"
-  member_method {
-    name: "CleanUp"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "Set"
-    argspec: "args=[\'self\', \'parent\', \'child_name\', \'new_child\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "SmartSet"
-    argspec: "args=[\'self\', \'obj\', \'attr_name\', \'new_attr\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "SmartUnsetAll"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "UnsetAll"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.test.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.test.pbtxt
index 72ce7330445a9e9b94402cf06438c4284676d9dd..980e96ac254aebf229ae52d98f607ed87d334e7a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.test.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.test.pbtxt
@@ -4,10 +4,6 @@ tf_module {
     name: "Benchmark"
     mtype: "<class \'tensorflow.python.platform.benchmark._BenchmarkRegistrar\'>"
   }
-  member {
-    name: "StubOutForTesting"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "TestCase"
     mtype: "<type \'type\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.pbtxt
index 2cc3b7ee1d0aec5eae228b179db85bb943ad1ec3..c72564e5987de36a95f7f44bae2b8122dcf256c4 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.train.pbtxt
@@ -12,10 +12,6 @@ tf_module {
     name: "CheckpointManager"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "CheckpointSaverHook"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "ClusterDef"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
@@ -52,22 +48,10 @@ tf_module {
     name: "Features"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
   }
-  member {
-    name: "FeedFnHook"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "FinalOpsHook"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "FloatList"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
   }
-  member {
-    name: "GlobalStepWaiterHook"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "Int64List"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
@@ -76,14 +60,6 @@ tf_module {
     name: "JobDef"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
   }
-  member {
-    name: "LoggingTensorHook"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "NanTensorHook"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "ProximalGradientDescentOptimizer"
     mtype: "<type \'type\'>"
@@ -92,30 +68,10 @@ tf_module {
     name: "SequenceExample"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
   }
-  member {
-    name: "Server"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "ServerDef"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
   }
-  member {
-    name: "SessionRunHook"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "StepCounterHook"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "StopAtStepHook"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "SummarySaverHook"
-    mtype: "<type \'type\'>"
-  }
   member_method {
     name: "cosine_decay"
     argspec: "args=[\'learning_rate\', \'global_step\', \'decay_steps\', \'alpha\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'None\'], "
@@ -188,8 +144,4 @@ tf_module {
     name: "summary_iterator"
     argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "warm_start"
-    argspec: "args=[\'ckpt_to_initialize_from\', \'vars_to_warm_start\', \'var_name_to_vocab_info\', \'var_name_to_prev_var_name\'], varargs=None, keywords=None, defaults=[\'.*\', \'None\', \'None\'], "
-  }
 }
diff --git a/tensorflow/tools/api/lib/python_object_to_proto_visitor.py b/tensorflow/tools/api/lib/python_object_to_proto_visitor.py
index 70df38ba8b8c46a51640b14591b6437dea639450..5102066730533c717a029c6fd52ef0e2d10a520d 100644
--- a/tensorflow/tools/api/lib/python_object_to_proto_visitor.py
+++ b/tensorflow/tools/api/lib/python_object_to_proto_visitor.py
@@ -37,6 +37,9 @@ _CORNER_CASES = {
     'train.NanLossDuringTrainingError': {
         'message': {}
     },
+    'estimator.NanLossDuringTrainingError': {
+        'message': {}
+    },
 }
 
 # Python 2 vs. 3 differences
diff --git a/tensorflow/tools/api/tests/api_compatibility_test.py b/tensorflow/tools/api/tests/api_compatibility_test.py
index e7f23a11740100ea9f4386bd39e3e17c9b86ffdf..723fceef413d86675e885debd37e73e5facd7f7c 100644
--- a/tensorflow/tools/api/tests/api_compatibility_test.py
+++ b/tensorflow/tools/api/tests/api_compatibility_test.py
@@ -277,6 +277,9 @@ class ApiCompatibilityTest(test.TestCase):
 
     public_api_visitor = public_api.PublicAPIVisitor(visitor)
     public_api_visitor.private_map['tf'] = ['contrib']
+    if api_version == 2:
+      public_api_visitor.private_map['tf'].append('enable_v2_behavior')
+
     public_api_visitor.do_not_descend_map['tf.GPUOptions'] = ['Experimental']
     if FLAGS.only_test_core_api:
       public_api_visitor.do_not_descend_map['tf'].extend(_NON_CORE_PACKAGES)
@@ -311,7 +314,7 @@ class ApiCompatibilityTest(test.TestCase):
         update_goldens=FLAGS.update_goldens,
         api_version=api_version)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only('b/120545219')
   def testAPIBackwardsCompatibility(self):
     api_version = 1
     golden_file_pattern = os.path.join(
@@ -330,7 +333,7 @@ class ApiCompatibilityTest(test.TestCase):
         'tensorflow.python.util.lazy_loader.LazyLoader'
         in str(type(tf.contrib)))
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only('b/120545219')
   def testAPIBackwardsCompatibilityV1(self):
     api_version = 1
     golden_file_pattern = os.path.join(
diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.0-cudnn7-ubuntu14.04 b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.0-cudnn7-ubuntu14.04
index 85b9d943131749b446db8e4cba50c7557abd8933..4fe86066c91b2baa665070a6fd9d34ebc74bdab7 100644
--- a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.0-cudnn7-ubuntu14.04
+++ b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.0-cudnn7-ubuntu14.04
@@ -20,6 +20,7 @@ ENV CUDA_VERSION 10.0.130
 ENV CUDA_PKG_VERSION 10-0=$CUDA_VERSION-1
 ENV CUDNN_VERSION 7.3.1.20
 ENV NCCL_VERSION 2.3.5
+ENV TENSORRT_VERSION 5.0.2
 ENV NVIDIA_DRIVER_CAPABILITIES compute,utility
 ENV NVIDIA_REQUIRE_CUDA "cuda>=10.0,driver>=410"
 ENV NVIDIA_VISIBLE_DEVICES all
@@ -31,7 +32,7 @@ ENV PATH /usr/local/cuda/bin:${PATH}
 # -Wl,-rpath-link=/usr/local/cuda/lib64/stubs to all binaries transitively
 # depending on libcuda. Optimally, builds targeting cuda would do that
 # internally.
-ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs
+ENV LD_LIBRARY_PATH /usr/local/cuda/lib64/stubs
 
 LABEL com.nvidia.cudnn.version="${CUDNN_VERSION}"
 
@@ -48,7 +49,11 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         libcudnn7=$CUDNN_VERSION-1+cuda10.0 \
         libcudnn7-dev=$CUDNN_VERSION-1+cuda10.0 \
         libnccl2=$NCCL_VERSION-2+cuda10.0 \
-        libnccl-dev=$NCCL_VERSION-2+cuda10.0 && \
+        libnccl-dev=$NCCL_VERSION-2+cuda10.0 \
+        nvinfer-runtime-trt-repo-ubuntu1604-$TENSORRT_VERSION-ga-cuda10.0 && \
+    apt-get update && apt-get install -y --no-install-recommends \
+        libnvinfer5=$TENSORRT_VERSION-1+cuda10.0 \
+        libnvinfer-dev=$TENSORRT_VERSION-1+cuda10.0 && \
     ln -s cuda-10.0 /usr/local/cuda && \
     apt-mark hold libcudnn7 && \
     apt-mark hold libnccl2 && \
diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda9.0-cudnn7-ubuntu14.04 b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda9.0-cudnn7-ubuntu14.04
index eb6ca7c8f0fe27bd8bb9e5b11cf14e98ad67e530..60a23e1edbced8dbef738e290353cdfb60ea86a6 100644
--- a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda9.0-cudnn7-ubuntu14.04
+++ b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda9.0-cudnn7-ubuntu14.04
@@ -26,6 +26,7 @@ ENV NVIDIA_VISIBLE_DEVICES all
 ENV NVIDIA_DRIVER_CAPABILITIES compute,utility
 ENV NVIDIA_REQUIRE_CUDA "cuda>=9.0"
 ENV NCCL_VERSION 2.2.13
+ENV TENSORRT_VERSION 5.0.2
 ENV CUDNN_VERSION 7.1.4.18
 
 # TODO(b/110903506): /usr/loca/cuda/lib64/stubs should not be needed in
@@ -53,7 +54,11 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         cuda-cublas-dev-9-0=9.0.176.4-1 \
         libnccl-dev=$NCCL_VERSION-1+cuda9.0 \
         libcudnn7-dev=$CUDNN_VERSION-1+cuda9.0 \
-        libcudnn7=$CUDNN_VERSION-1+cuda9.0 && \
+        libcudnn7=$CUDNN_VERSION-1+cuda9.0 \
+        nvinfer-runtime-trt-repo-ubuntu1604-$TENSORRT_VERSION-ga-cuda9.0 && \
+    apt-get update && apt-get install -y --no-install-recommends \
+        libnvinfer5=$TENSORRT_VERSION-1+cuda9.0 \
+        libnvinfer-dev=$TENSORRT_VERSION-1+cuda9.0 && \
     ln -s cuda-9.0 /usr/local/cuda && \
     apt-mark hold libnccl2 && \
     apt-mark hold libcudnn7 libcudnn7-dev && \
diff --git a/tensorflow/tools/ci_build/install/install_bazel.sh b/tensorflow/tools/ci_build/install/install_bazel.sh
index e284401b8aa469ebcbed856cd09dd597be242d7a..f45ac3eab37bdb2a51c44f68d51fbdb42b5f82d1 100755
--- a/tensorflow/tools/ci_build/install/install_bazel.sh
+++ b/tensorflow/tools/ci_build/install/install_bazel.sh
@@ -15,7 +15,7 @@
 # ==============================================================================
 
 # Select bazel version.
-BAZEL_VERSION="0.15.0"
+BAZEL_VERSION="0.20.0"
 
 set +e
 local_bazel_ver=$(bazel version 2>&1 | grep -i label | awk '{print $3}')
diff --git a/tensorflow/tools/ci_build/install/install_bazel_from_source.sh b/tensorflow/tools/ci_build/install/install_bazel_from_source.sh
index 87be81577d0efb395a12afc85109f10ad4178c27..9501a6d94b026774753bbd162fddec3c20753740 100755
--- a/tensorflow/tools/ci_build/install/install_bazel_from_source.sh
+++ b/tensorflow/tools/ci_build/install/install_bazel_from_source.sh
@@ -18,7 +18,7 @@
 # It will compile bazel from source and install it in /usr/local/bin
 
 # Select bazel version.
-BAZEL_VERSION="0.15.0"
+BAZEL_VERSION="0.20.0"
 
 set +e
 local_bazel_ver=$(bazel version 2>&1 | grep -i label | awk '{print $3}')
diff --git a/tensorflow/tools/ci_build/windows/bazel/common_env.sh b/tensorflow/tools/ci_build/windows/bazel/common_env.sh
index c18f0d6e69d98ac50f0aa850f1c78ceaab4c36e2..9c6825f27164bdc694fbd694a90792f6a58f852c 100644
--- a/tensorflow/tools/ci_build/windows/bazel/common_env.sh
+++ b/tensorflow/tools/ci_build/windows/bazel/common_env.sh
@@ -30,6 +30,9 @@ export TMPDIR=${TMPDIR:-"C:/tmp"}
 export TMPDIR=$(cygpath -m "$TMPDIR")
 mkdir -p "$TMPDIR"
 
+# Add timestamps before each command.
+export PS4='+ $(date) + '
+
 # Set bash path
 export BAZEL_SH=${BAZEL_SH:-"C:/tools/msys64/usr/bin/bash"}
 
@@ -52,9 +55,9 @@ export PATH="/c/Program Files/Git/cmd:$PATH"
 export PATH="/c/${PYTHON_BASE_PATH}/Scripts:$PATH"
 
 # Setting default values to CUDA related environment variables
-export TF_CUDA_VERSION=${TF_CUDA_VERSION:-9.0}
+export TF_CUDA_VERSION=${TF_CUDA_VERSION:-10.0}
 export TF_CUDNN_VERSION=${TF_CUDNN_VERSION:-7}
-export TF_CUDA_COMPUTE_CAPABILITIES=${TF_CUDA_COMPUTE_CAPABILITIES:-3.7}
+export TF_CUDA_COMPUTE_CAPABILITIES=${TF_CUDA_COMPUTE_CAPABILITIES:-6.0}
 export CUDA_TOOLKIT_PATH=${CUDA_TOOLKIT_PATH:-"C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${TF_CUDA_VERSION}"}
 export CUDNN_INSTALL_PATH=${CUDNN_INSTALL_PATH:-"C:/tools/cuda"}
 
diff --git a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
index 5990caa50c7dc18aaf8d560319d80c753276110c..4c4e8ba1ca168f3925d7f5f7ad5282500214af4f 100644
--- a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
+++ b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
@@ -58,7 +58,8 @@ PY_TEST_DIR="py_test_dir"
 SKIP_TEST=0
 RELEASE_BUILD=0
 TEST_TARGET="//${PY_TEST_DIR}/tensorflow/python/..."
-EXTRA_BUILD_FLAGS=${EXTRA_BUILD_FLAGS:-}
+PROJECT_NAME=""
+EXTRA_BUILD_FLAGS=""
 
 # --skip_test            Skip running tests
 # --enable_remote_cache  Add options to enable remote cache for build and test
@@ -66,16 +67,32 @@ EXTRA_BUILD_FLAGS=${EXTRA_BUILD_FLAGS:-}
 #                        ensure performance
 # --test_core_only       Use tensorflow/python/... as test target
 # --test_contrib_only    Use tensorflow/contrib/... as test target
-for ARG in "$@"; do
-  case "$ARG" in
+#for ARG in "$@"; do
+while [[ $# -gt 0 ]]; do
+  case "$1" in
     --tf_nightly) TF_NIGHTLY=1 ;;
     --skip_test) SKIP_TEST=1 ;;
     --enable_remote_cache) set_remote_cache_options ;;
     --release_build) RELEASE_BUILD=1 ;;
     --test_core_only) TEST_TARGET="//${PY_TEST_DIR}/tensorflow/python/..." ;;
     --test_contrib_only) TEST_TARGET="//${PY_TEST_DIR}/tensorflow/contrib/..." ;;
+    --extra_build_flags)
+      shift
+      if [[ -z "$1" ]]; then
+        break
+      fi
+      EXTRA_BUILD_FLAGS="$1"
+      ;;
+    --project_name)
+      shift
+      if [[ -z "$1" ]]; then
+        break
+      fi
+      PROJECT_NAME="$1"
+      ;;
     *)
   esac
+  shift
 done
 
 if [[ "$RELEASE_BUILD" == 1 ]]; then
@@ -89,10 +106,10 @@ fi
 
 if [[ "$TF_NIGHTLY" == 1 ]]; then
   python tensorflow/tools/ci_build/update_version.py --nightly
-  if [ -z ${EXTRA_PIP_FLAGS} ]; then
+  if [ -z ${PROJECT_NAME} ]; then
     EXTRA_PIP_FLAGS="--nightly_flag"
   else
-    EXTRA_PIP_FLAGS="${EXTRA_PIP_FLAGS} --nightly_flag"
+    EXTRA_PIP_FLAGS="--project_name ${PROJECT_NAME} --nightly_flag"
   fi
 fi
 
@@ -106,7 +123,8 @@ fi
 run_configure_for_cpu_build
 
 bazel build --announce_rc --config=opt ${EXTRA_BUILD_FLAGS} \
-  tensorflow/tools/pip_package:build_pip_package || exit $?
+  tensorflow/tools/pip_package:build_pip_package \
+  --incompatible_remove_native_http_archive=false || exit $?
 
 if [[ "$SKIP_TEST" == 1 ]]; then
   exit 0
@@ -115,7 +133,7 @@ fi
 # Create a python test directory to avoid package name conflict
 create_python_test_dir "${PY_TEST_DIR}"
 
-./bazel-bin/tensorflow/tools/pip_package/build_pip_package "$PWD/${PY_TEST_DIR}" "${EXTRA_PIP_FLAGS}"
+./bazel-bin/tensorflow/tools/pip_package/build_pip_package "$PWD/${PY_TEST_DIR}" ${EXTRA_PIP_FLAGS}
 
 if [[ "$TF_NIGHTLY" == 1 ]]; then
   exit 0
diff --git a/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh
index 1bca2a6f88a0cb738ad76bf53abf3cb306229350..070235fcb27aa1d51c7feaaebec4f72088966d2e 100644
--- a/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh
+++ b/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh
@@ -58,7 +58,8 @@ PY_TEST_DIR="py_test_dir"
 SKIP_TEST=0
 RELEASE_BUILD=0
 TEST_TARGET="//${PY_TEST_DIR}/tensorflow/python/..."
-EXTRA_BUILD_FLAGS=${EXTRA_BUILD_FLAGS:-}
+PROJECT_NAME=""
+EXTRA_BUILD_FLAGS=""
 
 # --skip_test            Skip running tests
 # --enable_remote_cache  Add options to enable remote cache for build and test
@@ -66,7 +67,7 @@ EXTRA_BUILD_FLAGS=${EXTRA_BUILD_FLAGS:-}
 #                        ensure performance
 # --test_core_only       Use tensorflow/python/... as test target
 # --test_contrib_only    Use tensorflow/contrib/... as test target
-for ARG in "$@"; do
+while [[ $# -gt 0 ]]; do
   case "$ARG" in
     --tf_nightly) TF_NIGHTLY=1 ;;
     --skip_test) SKIP_TEST=1 ;;
@@ -74,8 +75,23 @@ for ARG in "$@"; do
     --release_build) RELEASE_BUILD=1 ;;
     --test_core_only) TEST_TARGET="//${PY_TEST_DIR}/tensorflow/python/..." ;;
     --test_contrib_only) TEST_TARGET="//${PY_TEST_DIR}/tensorflow/contrib/..." ;;
+    --extra_build_flags)
+      shift
+      if [[ -z "$1" ]]; then
+        break
+      fi
+      EXTRA_BUILD_FLAGS="$1"
+      ;;
+    --project_name)
+      shift
+      if [[ -z "$1" ]]; then
+        break
+      fi
+      PROJECT_NAME="$1"
+      ;;
     *)
   esac
+  shift
 done
 
 if [[ "$RELEASE_BUILD" == 1 ]]; then
@@ -89,10 +105,10 @@ fi
 
 if [[ "$TF_NIGHTLY" == 1 ]]; then
   python tensorflow/tools/ci_build/update_version.py --nightly
-  if [ -z ${EXTRA_PIP_FLAGS} ]; then
+  if [ -z ${PROJECT_NAME} ]; then
     EXTRA_PIP_FLAGS="--nightly_flag"
   else
-    EXTRA_PIP_FLAGS="${EXTRA_PIP_FLAGS} --nightly_flag"
+    EXTRA_PIP_FLAGS="--project_name=${PROJECT_NAME} --nightly_flag"
   fi
 fi
 
@@ -120,7 +136,7 @@ fi
 create_python_test_dir "${PY_TEST_DIR}"
 
 ./bazel-bin/tensorflow/tools/pip_package/build_pip_package "$PWD/${PY_TEST_DIR}" \
-  --gpu "${EXTRA_PIP_FLAGS}"
+  --gpu ${EXTRA_PIP_FLAGS}
 
 if [[ "$TF_NIGHTLY" == 1 ]]; then
   exit 0
diff --git a/tensorflow/tools/compatibility/BUILD b/tensorflow/tools/compatibility/BUILD
index 152a79f54297cef8c0bb3103ea28326baa1c1117..a9902d77f5ec103fe2000a4a470d425e3998f45e 100644
--- a/tensorflow/tools/compatibility/BUILD
+++ b/tensorflow/tools/compatibility/BUILD
@@ -51,14 +51,21 @@ py_library(
     srcs_version = "PY2AND3",
 )
 
+py_library(
+    name = "reorders_v2",
+    srcs = ["reorders_v2.py"],
+    srcs_version = "PY2AND3",
+)
+
 py_library(
     name = "tf_upgrade_v2_lib",
-    srcs = [
-        "renames_v2.py",
-        "tf_upgrade_v2.py",
-    ],
+    srcs = ["tf_upgrade_v2.py"],
     srcs_version = "PY2AND3",
-    deps = [":ast_edits"],
+    deps = [
+        ":ast_edits",
+        ":renames_v2",
+        ":reorders_v2",
+    ],
 )
 
 py_binary(
@@ -126,22 +133,22 @@ py_test(
 genrule(
     name = "generate_upgraded_file_v2",
     testonly = 1,
-    srcs = ["testdata/test_file_v1_10.py"],
+    srcs = ["testdata/test_file_v1_12.py"],
     outs = [
         "test_file_v2_0.py",
         "report_v2.txt",
     ],
     cmd = ("$(location :tf_upgrade_v2)" +
-           " --infile $(location testdata/test_file_v1_10.py)" +
+           " --infile $(location testdata/test_file_v1_12.py)" +
            " --outfile $(location test_file_v2_0.py)" +
            " --reportfile $(location report_v2.txt)"),
     tools = [":tf_upgrade_v2"],
 )
 
 py_test(
-    name = "test_file_v1_10",
+    name = "test_file_v1_12",
     size = "small",
-    srcs = ["testdata/test_file_v1_10.py"],
+    srcs = ["testdata/test_file_v1_12.py"],
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow:tensorflow_py",
@@ -164,6 +171,6 @@ exports_files(
         "tf_upgrade.py",
         "renames_v2.py",
         "testdata/test_file_v0_11.py",
-        "testdata/test_file_v1_10.py",
+        "testdata/test_file_v1_12.py",
     ],
 )
diff --git a/tensorflow/tools/compatibility/README.md b/tensorflow/tools/compatibility/README.md
index aabc7b253d68eb43d3e6c1d5cecd55697a0cab59..5e2de35338a9460205272112195ff5560d6e503c 100644
--- a/tensorflow/tools/compatibility/README.md
+++ b/tensorflow/tools/compatibility/README.md
@@ -1,60 +1,79 @@
 # TensorFlow Python API Upgrade Utility
 
 This tool allows you to upgrade your existing TensorFlow Python scripts.
-This script can be run on a single Python file:
+Specifically: \
+`tf_upgrade_v2.py`: upgrades code from TensorFlow 1.12 to TensorFlow 2.0 preview. \
+`tf_upgrade.py`: upgrades code to TensorFlow 1.0 from TensorFlow 0.11.
+
+## Running the script from pip package
+
+First, install TensorFlow pip package*. See
+https://www.tensorflow.org/install/pip.
+
+Upgrade script can be run on a single Python file:
 
 ```
-tf_upgrade.py --infile foo.py --outfile foo-upgraded.py
+tf_upgrade_v2 --infile foo.py --outfile foo-upgraded.py
 ```
 
 It will print a list of errors it finds that it can't fix. You can also run
 it on a directory tree:
 
 ```
+# upgrade the .py files and copy all the other files to the outtree
+tf_upgrade_v2 --intree coolcode --outtree coolcode-upgraded
+
 # just upgrade the .py files
-tf_upgrade.py --intree coolcode --outtree coolcode-upgraded
-# after upgrade the .py files, then copy all the other files to the outtree
-tf_upgrade.py --intree coolcode --outtree coolcode-upgraded --copyotherfiles True
+tf_upgrade_v2 --intree coolcode --outtree coolcode-upgraded --copyotherfiles False
 ```
 
-In either case, it will also dump out a report e.g. which will detail changes
+*Note: `tf_upgrade_v2` is installed automatically as a script by the pip install 
+after TensorFlow 1.12.
+
+## Report
+
+The script will also dump out a report e.g. which will detail changes
 e.g.:
 
 ```
-third_party/tensorflow/tools/compatibility/test_file_v0.11.py Line 125
+'tensorflow/tools/compatibility/testdata/test_file_v1_12.py' Line 65
+--------------------------------------------------------------------------------
+
+Added keyword 'input' to reordered function 'tf.argmax'
+Renamed keyword argument from 'dimension' to 'axis'
 
-Renamed keyword argument from `dim` to `axis`
-Renamed keyword argument from `squeeze_dims` to `axis`
+    Old:         tf.argmax([[1, 3, 2]], dimension=0))
+                                        ~~~~~~~~~~
+    New:         tf.argmax(input=[[1, 3, 2]], axis=0))
 
-    Old:                   [[1, 2, 3]], dim=1), squeeze_dims=[1]).eval(),
-                                        ~~~~    ~~~~~~~~~~~~~
-    New:                   [[1, 2, 3]], axis=1), axis=[1]).eval(),
-                                        ~~~~~    ~~~~~
 ```
 
 ## Caveats
 
 - Don't update parts of your code manually before running this script. In
-particular, functions that have had reordered arguments like `tf.concat`
-or `tf.split` will cause the script to incorrectly add keyword arguments that
-mismap arguments.
+particular, functions that have had reordered arguments like `tf.argmax`
+or `tf.batch_to_space` will cause the script to incorrectly add keyword
+arguments that mismap arguments.
 
 - This script wouldn't actually reorder arguments. Instead, the script will add
 keyword arguments to functions that had their arguments reordered.
 
 - This script is not able to upgrade all functions. One notable example is
-`tf.reverse()` which has been changed to take a list of indices rather than
-a tensor of bools. If the script detects this, it will report this to stdout
+`tf.nn.conv2d` that no longer takes `use_cudnn_on_gpu` argument.
+If the script detects this, it will report this to stdout
 (and in the report), and you can fix it manually. For example if you have
-`tf.reverse(a, [False, True, True])` you will need to manually change it to
-`tf.reverse(a, [1, 2])`.
+`tf.nn.conv2d(inputs, filters, strides, padding, use_cudnn_on_gpu=True)`
+you will need to manually change it to
+`tf.nn.conv2d(input, filters, strides, padding)`.
 
 - There are some syntaxes that are not handleable with this script as this
-script was designed to use only standard python packages. If the script fails
-with "A necessary keyword argument failed to be inserted." or
+script was designed to use only standard python packages.
+There is an alternative available for TensorFlow 0.* to 1.0 upgrade script.
+If the script fails with "A necessary keyword argument failed to be inserted." or
 "Failed to find keyword lexicographically. Fix manually.", you can try
 [@machrisaa's fork of this script](https://github.com/machrisaa/tf0to1).
 [@machrisaa](https://github.com/machrisaa) has used the
 [RedBaron Python refactoring engine](https://redbaron.readthedocs.io/en/latest/)
 which is able to localize syntactic elements more reliably than the built-in
-`ast` module this script is based upon.
+`ast` module this script is based upon. Note that the alternative script is not
+available for TensorFlow 2.0 upgrade.
diff --git a/tensorflow/tools/compatibility/renames_v2.py b/tensorflow/tools/compatibility/renames_v2.py
index 5a27eb241ca80f1527323ce5b5b72792640fab4a..ba72d1d202d9d366b726f9e5d8f6dd0b9a093e94 100644
--- a/tensorflow/tools/compatibility/renames_v2.py
+++ b/tensorflow/tools/compatibility/renames_v2.py
@@ -26,37 +26,53 @@ from __future__ import print_function
 
 renames = {
     'tf.AUTO_REUSE': 'tf.compat.v1.AUTO_REUSE',
+    'tf.AttrValue': 'tf.compat.v1.AttrValue',
     'tf.COMPILER_VERSION': 'tf.version.COMPILER_VERSION',
     'tf.CXX11_ABI_FLAG': 'tf.sysconfig.CXX11_ABI_FLAG',
     'tf.ConditionalAccumulator': 'tf.compat.v1.ConditionalAccumulator',
     'tf.ConditionalAccumulatorBase': 'tf.compat.v1.ConditionalAccumulatorBase',
+    'tf.ConfigProto': 'tf.compat.v1.ConfigProto',
     'tf.DeviceSpec': 'tf.compat.v1.DeviceSpec',
     'tf.Dimension': 'tf.compat.v1.Dimension',
+    'tf.FIFOQueue': 'tf.queue.FIFOQueue',
     'tf.FixedLenFeature': 'tf.io.FixedLenFeature',
     'tf.FixedLenSequenceFeature': 'tf.io.FixedLenSequenceFeature',
     'tf.FixedLengthRecordReader': 'tf.compat.v1.FixedLengthRecordReader',
     'tf.GIT_VERSION': 'tf.version.GIT_VERSION',
+    'tf.GPUOptions': 'tf.compat.v1.GPUOptions',
     'tf.GRAPH_DEF_VERSION': 'tf.version.GRAPH_DEF_VERSION',
     'tf.GRAPH_DEF_VERSION_MIN_CONSUMER': 'tf.version.GRAPH_DEF_VERSION_MIN_CONSUMER',
     'tf.GRAPH_DEF_VERSION_MIN_PRODUCER': 'tf.version.GRAPH_DEF_VERSION_MIN_PRODUCER',
+    'tf.GraphDef': 'tf.compat.v1.GraphDef',
     'tf.GraphKeys': 'tf.compat.v1.GraphKeys',
+    'tf.GraphOptions': 'tf.compat.v1.GraphOptions',
+    'tf.HistogramProto': 'tf.compat.v1.HistogramProto',
     'tf.IdentityReader': 'tf.compat.v1.IdentityReader',
     'tf.InteractiveSession': 'tf.compat.v1.InteractiveSession',
     'tf.LMDBReader': 'tf.compat.v1.LMDBReader',
+    'tf.LogMessage': 'tf.compat.v1.LogMessage',
     'tf.MONOLITHIC_BUILD': 'tf.sysconfig.MONOLITHIC_BUILD',
+    'tf.MetaGraphDef': 'tf.compat.v1.MetaGraphDef',
+    'tf.NameAttrList': 'tf.compat.v1.NameAttrList',
     'tf.NoGradient': 'tf.no_gradient',
+    'tf.NodeDef': 'tf.compat.v1.NodeDef',
     'tf.NotDifferentiable': 'tf.no_gradient',
     'tf.OpError': 'tf.errors.OpError',
-    'tf.PaddingFIFOQueue': 'tf.io.PaddingFIFOQueue',
+    'tf.OptimizerOptions': 'tf.compat.v1.OptimizerOptions',
+    'tf.PaddingFIFOQueue': 'tf.queue.PaddingFIFOQueue',
     'tf.Print': 'tf.compat.v1.Print',
-    'tf.PriorityQueue': 'tf.io.PriorityQueue',
+    'tf.PriorityQueue': 'tf.queue.PriorityQueue',
     'tf.QUANTIZED_DTYPES': 'tf.dtypes.QUANTIZED_DTYPES',
-    'tf.QueueBase': 'tf.io.QueueBase',
-    'tf.RandomShuffleQueue': 'tf.io.RandomShuffleQueue',
+    'tf.QueueBase': 'tf.queue.QueueBase',
+    'tf.RandomShuffleQueue': 'tf.queue.RandomShuffleQueue',
     'tf.ReaderBase': 'tf.compat.v1.ReaderBase',
+    'tf.RunMetadata': 'tf.compat.v1.RunMetadata',
+    'tf.RunOptions': 'tf.compat.v1.RunOptions',
     'tf.Session': 'tf.compat.v1.Session',
+    'tf.SessionLog': 'tf.compat.v1.SessionLog',
     'tf.SparseConditionalAccumulator': 'tf.sparse.SparseConditionalAccumulator',
     'tf.SparseFeature': 'tf.io.SparseFeature',
+    'tf.SparseTensorValue': 'tf.compat.v1.SparseTensorValue',
     'tf.TFRecordReader': 'tf.compat.v1.TFRecordReader',
     'tf.TensorInfo': 'tf.compat.v1.TensorInfo',
     'tf.TextLineReader': 'tf.compat.v1.TextLineReader',
@@ -71,8 +87,6 @@ renames = {
     'tf.all_variables': 'tf.compat.v1.all_variables',
     'tf.angle': 'tf.math.angle',
     'tf.app.run': 'tf.compat.v1.app.run',
-    'tf.arg_max': 'tf.compat.v1.arg_max',
-    'tf.arg_min': 'tf.compat.v1.arg_min',
     'tf.assert_greater_equal': 'tf.compat.v1.assert_greater_equal',
     'tf.assert_integer': 'tf.compat.v1.assert_integer',
     'tf.assert_less_equal': 'tf.compat.v1.assert_less_equal',
@@ -92,6 +106,7 @@ renames = {
     'tf.assign': 'tf.compat.v1.assign',
     'tf.assign_add': 'tf.compat.v1.assign_add',
     'tf.assign_sub': 'tf.compat.v1.assign_sub',
+    'tf.batch_scatter_update': 'tf.compat.v1.batch_scatter_update',
     'tf.betainc': 'tf.math.betainc',
     'tf.ceil': 'tf.math.ceil',
     'tf.check_numerics': 'tf.debugging.check_numerics',
@@ -108,6 +123,8 @@ renames = {
     'tf.create_partitioned_variables': 'tf.compat.v1.create_partitioned_variables',
     'tf.cross': 'tf.linalg.cross',
     'tf.cumprod': 'tf.math.cumprod',
+    'tf.data.make_initializable_iterator': 'tf.compat.v1.data.make_initializable_iterator',
+    'tf.data.make_one_shot_iterator': 'tf.compat.v1.data.make_one_shot_iterator',
     'tf.debugging.is_finite': 'tf.math.is_finite',
     'tf.debugging.is_inf': 'tf.math.is_inf',
     'tf.debugging.is_nan': 'tf.math.is_nan',
@@ -118,7 +135,7 @@ renames = {
     'tf.decode_json_example': 'tf.io.decode_json_example',
     'tf.decode_raw': 'tf.io.decode_raw',
     'tf.delete_session_tensor': 'tf.compat.v1.delete_session_tensor',
-    'tf.depth_to_space': 'tf.nn.depth_to_space',
+    'tf.depth_to_space': 'tf.compat.v1.depth_to_space',
     'tf.dequantize': 'tf.quantization.dequantize',
     'tf.deserialize_many_sparse': 'tf.io.deserialize_many_sparse',
     'tf.diag': 'tf.linalg.tensor_diag',
@@ -126,7 +143,9 @@ renames = {
     'tf.digamma': 'tf.math.digamma',
     'tf.dimension_at_index': 'tf.compat.v1.dimension_at_index',
     'tf.dimension_value': 'tf.compat.v1.dimension_value',
+    'tf.disable_eager_execution': 'tf.compat.v1.disable_eager_execution',
     'tf.disable_resource_variables': 'tf.compat.v1.disable_resource_variables',
+    'tf.disable_v2_behavior': 'tf.compat.v1.disable_v2_behavior',
     'tf.disable_v2_tensorshape': 'tf.compat.v1.disable_v2_tensorshape',
     'tf.distributions.Bernoulli': 'tf.compat.v1.distributions.Bernoulli',
     'tf.distributions.Beta': 'tf.compat.v1.distributions.Beta',
@@ -147,13 +166,14 @@ renames = {
     'tf.distributions.Uniform': 'tf.compat.v1.distributions.Uniform',
     'tf.distributions.kl_divergence': 'tf.compat.v1.distributions.kl_divergence',
     'tf.div': 'tf.compat.v1.div',
+    'tf.enable_eager_execution': 'tf.compat.v1.enable_eager_execution',
     'tf.enable_resource_variables': 'tf.compat.v1.enable_resource_variables',
+    'tf.enable_v2_behavior': 'tf.compat.v1.enable_v2_behavior',
     'tf.enable_v2_tensorshape': 'tf.compat.v1.enable_v2_tensorshape',
     'tf.encode_base64': 'tf.io.encode_base64',
     'tf.erf': 'tf.math.erf',
     'tf.erfc': 'tf.math.erfc',
     'tf.expm1': 'tf.math.expm1',
-    'tf.extract_image_patches': 'tf.compat.v1.extract_image_patches',
     'tf.fake_quant_with_min_max_args': 'tf.quantization.fake_quant_with_min_max_args',
     'tf.fake_quant_with_min_max_args_gradient': 'tf.quantization.fake_quant_with_min_max_args_gradient',
     'tf.fake_quant_with_min_max_vars': 'tf.quantization.fake_quant_with_min_max_vars',
@@ -210,6 +230,10 @@ renames = {
     'tf.initializers.tables_initializer': 'tf.compat.v1.initializers.tables_initializer',
     'tf.initializers.variables': 'tf.compat.v1.initializers.variables',
     'tf.invert_permutation': 'tf.math.invert_permutation',
+    'tf.io.PaddingFIFOQueue': 'tf.queue.PaddingFIFOQueue',
+    'tf.io.PriorityQueue': 'tf.queue.PriorityQueue',
+    'tf.io.QueueBase': 'tf.queue.QueueBase',
+    'tf.io.RandomShuffleQueue': 'tf.queue.RandomShuffleQueue',
     'tf.io.tf_record_iterator': 'tf.compat.v1.io.tf_record_iterator',
     'tf.is_finite': 'tf.math.is_finite',
     'tf.is_inf': 'tf.math.is_inf',
@@ -219,6 +243,8 @@ renames = {
     'tf.is_strictly_increasing': 'tf.math.is_strictly_increasing',
     'tf.is_variable_initialized': 'tf.compat.v1.is_variable_initialized',
     'tf.keras.backend.get_session': 'tf.compat.v1.keras.backend.get_session',
+    'tf.keras.layers.CuDNNGRU': 'tf.compat.v1.keras.layers.CuDNNGRU',
+    'tf.keras.layers.CuDNNLSTM': 'tf.compat.v1.keras.layers.CuDNNLSTM',
     'tf.layers.AveragePooling1D': 'tf.compat.v1.layers.AveragePooling1D',
     'tf.layers.AveragePooling2D': 'tf.compat.v1.layers.AveragePooling2D',
     'tf.layers.AveragePooling3D': 'tf.compat.v1.layers.AveragePooling3D',
@@ -367,20 +393,18 @@ renames = {
     'tf.nn.quantized_max_pool': 'tf.compat.v1.nn.quantized_max_pool',
     'tf.nn.quantized_relu_x': 'tf.compat.v1.nn.quantized_relu_x',
     'tf.nn.raw_rnn': 'tf.compat.v1.nn.raw_rnn',
+    'tf.nn.relu_layer': 'tf.compat.v1.nn.relu_layer',
     'tf.nn.rnn_cell.BasicLSTMCell': 'tf.compat.v1.nn.rnn_cell.BasicLSTMCell',
     'tf.nn.rnn_cell.BasicRNNCell': 'tf.compat.v1.nn.rnn_cell.BasicRNNCell',
     'tf.nn.rnn_cell.GRUCell': 'tf.compat.v1.nn.rnn_cell.GRUCell',
     'tf.nn.rnn_cell.LSTMCell': 'tf.compat.v1.nn.rnn_cell.LSTMCell',
     'tf.nn.rnn_cell.MultiRNNCell': 'tf.compat.v1.nn.rnn_cell.MultiRNNCell',
-    'tf.nn.softmax_cross_entropy_with_logits_v2': 'tf.nn.softmax_cross_entropy_with_logits',
     'tf.nn.static_bidirectional_rnn': 'tf.compat.v1.nn.static_bidirectional_rnn',
     'tf.nn.static_rnn': 'tf.compat.v1.nn.static_rnn',
     'tf.nn.uniform_candidate_sampler': 'tf.random.uniform_candidate_sampler',
     'tf.nn.xw_plus_b': 'tf.compat.v1.nn.xw_plus_b',
     'tf.op_scope': 'tf.compat.v1.op_scope',
     'tf.orthogonal_initializer': 'tf.keras.initializers.Orthogonal',
-    'tf.parse_example': 'tf.compat.v1.parse_example',
-    'tf.parse_single_example': 'tf.compat.v1.parse_single_example',
     'tf.parse_single_sequence_example': 'tf.io.parse_single_sequence_example',
     'tf.parse_tensor': 'tf.io.parse_tensor',
     'tf.placeholder': 'tf.compat.v1.placeholder',
@@ -403,9 +427,11 @@ renames = {
     'tf.qr': 'tf.linalg.qr',
     'tf.quantize': 'tf.quantization.quantize',
     'tf.quantized_concat': 'tf.quantization.quantized_concat',
+    'tf.ragged.constant_value': 'tf.compat.v1.ragged.constant_value',
+    'tf.ragged.convert_to_tensor_or_ragged_tensor': 'tf.compat.v1.ragged.convert_to_tensor_or_ragged_tensor',
+    'tf.ragged.RaggedTensorValue': 'tf.compat.v1.ragged.RaggedTensorValue',
     'tf.random.get_seed': 'tf.compat.v1.random.get_seed',
     'tf.random.set_random_seed': 'tf.compat.v1.random.set_random_seed',
-    'tf.random.stateless_multinomial': 'tf.compat.v1.random.stateless_multinomial',
     'tf.random_crop': 'tf.image.random_crop',
     'tf.random_gamma': 'tf.random.gamma',
     'tf.random_normal': 'tf.random.normal',
@@ -415,7 +441,6 @@ renames = {
     'tf.read_file': 'tf.io.read_file',
     'tf.real': 'tf.math.real',
     'tf.reciprocal': 'tf.math.reciprocal',
-    'tf.reduce_join': 'tf.compat.v1.reduce_join',
     'tf.regex_replace': 'tf.strings.regex_replace',
     'tf.report_uninitialized_variables': 'tf.compat.v1.report_uninitialized_variables',
     'tf.reset_default_graph': 'tf.compat.v1.reset_default_graph',
@@ -496,24 +521,19 @@ renames = {
     'tf.sets.set_intersection': 'tf.sets.intersection',
     'tf.sets.set_size': 'tf.sets.size',
     'tf.sets.set_union': 'tf.sets.union',
-    'tf.space_to_batch': 'tf.nn.space_to_batch',
-    'tf.space_to_depth': 'tf.nn.space_to_depth',
+    'tf.space_to_depth': 'tf.compat.v1.space_to_depth',
     'tf.sparse.matmul': 'tf.sparse.sparse_dense_matmul',
     'tf.sparse.merge': 'tf.compat.v1.sparse.merge',
     'tf.sparse.placeholder': 'tf.compat.v1.sparse.placeholder',
     'tf.sparse.reduce_max_sparse': 'tf.compat.v1.sparse.reduce_max_sparse',
     'tf.sparse.reduce_sum_sparse': 'tf.compat.v1.sparse.reduce_sum_sparse',
-    'tf.sparse_add': 'tf.compat.v1.sparse_add',
     'tf.sparse_fill_empty_rows': 'tf.sparse.fill_empty_rows',
     'tf.sparse_mask': 'tf.sparse.mask',
-    'tf.sparse_matmul': 'tf.compat.v1.sparse_matmul',
     'tf.sparse_maximum': 'tf.sparse.maximum',
     'tf.sparse_merge': 'tf.compat.v1.sparse_merge',
     'tf.sparse_minimum': 'tf.sparse.minimum',
     'tf.sparse_placeholder': 'tf.compat.v1.sparse_placeholder',
-    'tf.sparse_reduce_max': 'tf.compat.v1.sparse_reduce_max',
     'tf.sparse_reduce_max_sparse': 'tf.compat.v1.sparse_reduce_max_sparse',
-    'tf.sparse_reduce_sum': 'tf.compat.v1.sparse_reduce_sum',
     'tf.sparse_reduce_sum_sparse': 'tf.compat.v1.sparse_reduce_sum_sparse',
     'tf.sparse_reorder': 'tf.sparse.reorder',
     'tf.sparse_reset_shape': 'tf.sparse.reset_shape',
@@ -548,6 +568,7 @@ renames = {
     'tf.string_strip': 'tf.strings.strip',
     'tf.string_to_hash_bucket_fast': 'tf.strings.to_hash_bucket_fast',
     'tf.string_to_hash_bucket_strong': 'tf.strings.to_hash_bucket_strong',
+    'tf.summary.SessionLog': 'tf.compat.v1.summary.SessionLog',
     'tf.summary.audio': 'tf.compat.v1.summary.audio',
     'tf.summary.get_summary_description': 'tf.compat.v1.summary.get_summary_description',
     'tf.summary.histogram': 'tf.compat.v1.summary.histogram',
@@ -559,6 +580,7 @@ renames = {
     'tf.summary.text': 'tf.compat.v1.summary.text',
     'tf.svd': 'tf.linalg.svd',
     'tf.tables_initializer': 'tf.compat.v1.tables_initializer',
+    'tf.test.StubOutForTesting': 'tf.compat.v1.test.StubOutForTesting',
     'tf.test.compute_gradient': 'tf.compat.v1.test.compute_gradient',
     'tf.test.compute_gradient_error': 'tf.compat.v1.test.compute_gradient_error',
     'tf.test.get_temp_dir': 'tf.compat.v1.test.get_temp_dir',
@@ -576,31 +598,42 @@ renames = {
     'tf.train.AdagradDAOptimizer': 'tf.compat.v1.train.AdagradDAOptimizer',
     'tf.train.AdagradOptimizer': 'tf.compat.v1.train.AdagradOptimizer',
     'tf.train.AdamOptimizer': 'tf.compat.v1.train.AdamOptimizer',
-    'tf.train.CheckpointSaverListener': 'tf.compat.v1.train.CheckpointSaverListener',
+    'tf.train.CheckpointSaverHook': 'tf.estimator.CheckpointSaverHook',
+    'tf.train.CheckpointSaverListener': 'tf.estimator.CheckpointSaverListener',
     'tf.train.ChiefSessionCreator': 'tf.compat.v1.train.ChiefSessionCreator',
+    'tf.train.FeedFnHook': 'tf.estimator.FeedFnHook',
+    'tf.train.FinalOpsHook': 'tf.estimator.FinalOpsHook',
     'tf.train.FtrlOptimizer': 'tf.compat.v1.train.FtrlOptimizer',
+    'tf.train.GlobalStepWaiterHook': 'tf.estimator.GlobalStepWaiterHook',
     'tf.train.GradientDescentOptimizer': 'tf.compat.v1.train.GradientDescentOptimizer',
+    'tf.train.LoggingTensorHook': 'tf.estimator.LoggingTensorHook',
     'tf.train.LooperThread': 'tf.compat.v1.train.LooperThread',
     'tf.train.MomentumOptimizer': 'tf.compat.v1.train.MomentumOptimizer',
     'tf.train.MonitoredSession': 'tf.compat.v1.train.MonitoredSession',
     'tf.train.MonitoredTrainingSession': 'tf.compat.v1.train.MonitoredTrainingSession',
-    'tf.train.NanLossDuringTrainingError': 'tf.compat.v1.train.NanLossDuringTrainingError',
+    'tf.train.NanLossDuringTrainingError': 'tf.estimator.NanLossDuringTrainingError',
+    'tf.train.NanTensorHook': 'tf.estimator.NanTensorHook',
     'tf.train.NewCheckpointReader': 'tf.compat.v1.train.NewCheckpointReader',
     'tf.train.Optimizer': 'tf.compat.v1.train.Optimizer',
-    'tf.train.ProfilerHook': 'tf.compat.v1.train.ProfilerHook',
+    'tf.train.ProfilerHook': 'tf.estimator.ProfilerHook',
     'tf.train.ProximalAdagradOptimizer': 'tf.compat.v1.train.ProximalAdagradOptimizer',
     'tf.train.QueueRunner': 'tf.compat.v1.train.QueueRunner',
     'tf.train.RMSPropOptimizer': 'tf.compat.v1.train.RMSPropOptimizer',
     'tf.train.Saver': 'tf.compat.v1.train.Saver',
     'tf.train.SaverDef': 'tf.compat.v1.train.SaverDef',
     'tf.train.Scaffold': 'tf.compat.v1.train.Scaffold',
-    'tf.train.SecondOrStepTimer': 'tf.compat.v1.train.SecondOrStepTimer',
+    'tf.train.SecondOrStepTimer': 'tf.estimator.SecondOrStepTimer',
+    'tf.train.Server': 'tf.distribute.Server',
     'tf.train.SessionCreator': 'tf.compat.v1.train.SessionCreator',
     'tf.train.SessionManager': 'tf.compat.v1.train.SessionManager',
-    'tf.train.SessionRunArgs': 'tf.compat.v1.train.SessionRunArgs',
-    'tf.train.SessionRunContext': 'tf.compat.v1.train.SessionRunContext',
-    'tf.train.SessionRunValues': 'tf.compat.v1.train.SessionRunValues',
+    'tf.train.SessionRunArgs': 'tf.estimator.SessionRunArgs',
+    'tf.train.SessionRunContext': 'tf.estimator.SessionRunContext',
+    'tf.train.SessionRunHook': 'tf.estimator.SessionRunHook',
+    'tf.train.SessionRunValues': 'tf.estimator.SessionRunValues',
     'tf.train.SingularMonitoredSession': 'tf.compat.v1.train.SingularMonitoredSession',
+    'tf.train.StepCounterHook': 'tf.estimator.StepCounterHook',
+    'tf.train.StopAtStepHook': 'tf.estimator.StopAtStepHook',
+    'tf.train.SummarySaverHook': 'tf.estimator.SummarySaverHook',
     'tf.train.Supervisor': 'tf.compat.v1.train.Supervisor',
     'tf.train.SyncReplicasOptimizer': 'tf.compat.v1.train.SyncReplicasOptimizer',
     'tf.train.VocabInfo': 'tf.estimator.VocabInfo',
@@ -641,6 +674,7 @@ renames = {
     'tf.train.start_queue_runners': 'tf.compat.v1.train.start_queue_runners',
     'tf.train.string_input_producer': 'tf.compat.v1.train.string_input_producer',
     'tf.train.update_checkpoint_state': 'tf.compat.v1.train.update_checkpoint_state',
+    'tf.train.warm_start': 'tf.compat.v1.train.warm_start',
     'tf.train.write_graph': 'tf.io.write_graph',
     'tf.trainable_variables': 'tf.compat.v1.trainable_variables',
     'tf.truncated_normal': 'tf.random.truncated_normal',
diff --git a/tensorflow/tools/compatibility/reorders_v2.py b/tensorflow/tools/compatibility/reorders_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c9fb92db0efdec6996dc75ce45aba150776f813
--- /dev/null
+++ b/tensorflow/tools/compatibility/reorders_v2.py
@@ -0,0 +1,116 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=line-too-long
+"""List of renames to apply when converting from TF 1.0 to TF 2.0.
+
+THIS FILE IS AUTOGENERATED: To update, please run:
+  bazel build tensorflow/tools/compatibility/update:generate_v2_reorders_map
+  bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
+This file should be updated whenever a function is added to
+self.reordered_function_names in tf_upgrade_v2.py.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+reorders = {
+    'tf.argmax': ['input', 'axis', 'name', 'dimension', 'output_type'],
+    'tf.argmin': ['input', 'axis', 'name', 'dimension', 'output_type'],
+    'tf.batch_to_space': ['input', 'crops', 'block_size', 'name'],
+    'tf.boolean_mask': ['tensor', 'mask', 'name', 'axis'],
+    'tf.confusion_matrix': ['labels', 'predictions', 'num_classes', 'dtype', 'name', 'weights'],
+    'tf.convert_to_tensor': ['value', 'dtype', 'name', 'preferred_dtype'],
+    'tf.decode_csv': ['records', 'record_defaults', 'field_delim', 'use_quote_delim', 'name', 'na_value', 'select_cols'],
+    'tf.depth_to_space': ['input', 'block_size', 'name', 'data_format'],
+    'tf.feature_column.categorical_column_with_vocabulary_file': ['key', 'vocabulary_file', 'vocabulary_size', 'num_oov_buckets', 'default_value', 'dtype'],
+    'tf.io.decode_csv': ['records', 'record_defaults', 'field_delim', 'use_quote_delim', 'name', 'na_value', 'select_cols'],
+    'tf.io.parse_example': ['serialized', 'features', 'name', 'example_names'],
+    'tf.io.parse_single_example': ['serialized', 'features', 'name', 'example_names'],
+    'tf.io.serialize_many_sparse': ['sp_input', 'name', 'out_type'],
+    'tf.io.serialize_sparse': ['sp_input', 'name', 'out_type'],
+    'tf.linalg.norm': ['tensor', 'ord', 'axis', 'keepdims', 'name', 'keep_dims'],
+    'tf.math.argmax': ['input', 'axis', 'name', 'dimension', 'output_type'],
+    'tf.math.argmin': ['input', 'axis', 'name', 'dimension', 'output_type'],
+    'tf.math.confusion_matrix': ['labels', 'predictions', 'num_classes', 'dtype', 'name', 'weights'],
+    'tf.math.in_top_k': ['predictions', 'targets', 'k', 'name'],
+    'tf.math.reduce_all': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
+    'tf.math.reduce_any': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
+    'tf.math.reduce_logsumexp': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
+    'tf.math.reduce_max': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
+    'tf.math.reduce_mean': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
+    'tf.math.reduce_min': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
+    'tf.math.reduce_prod': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
+    'tf.math.reduce_sum': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
+    'tf.multinomial': ['logits', 'num_samples', 'seed', 'name', 'output_dtype'],
+    'tf.nn.convolution': ['input', 'filter', 'padding', 'strides', 'dilation_rate', 'name', 'data_format'],
+    'tf.nn.crelu': ['features', 'name', 'axis'],
+    'tf.nn.depth_to_space': ['input', 'block_size', 'name', 'data_format'],
+    'tf.nn.depthwise_conv2d': ['input', 'filter', 'strides', 'padding', 'rate', 'name', 'data_format'],
+    'tf.nn.embedding_lookup': ['params', 'ids', 'partition_strategy', 'name', 'validate_indices', 'max_norm'],
+    'tf.nn.embedding_lookup_sparse': ['params', 'sp_ids', 'sp_weights', 'partition_strategy', 'name', 'combiner', 'max_norm'],
+    'tf.nn.in_top_k': ['predictions', 'targets', 'k', 'name'],
+    'tf.nn.moments': ['x', 'axes', 'shift', 'name', 'keep_dims'],
+    'tf.nn.pool': ['input', 'window_shape', 'pooling_type', 'padding', 'dilation_rate', 'strides', 'name', 'data_format'],
+    'tf.nn.separable_conv2d': ['input', 'depthwise_filter', 'pointwise_filter', 'strides', 'padding', 'rate', 'name', 'data_format'],
+    'tf.nn.space_to_batch': ['input', 'paddings', 'block_size', 'name'],
+    'tf.nn.space_to_depth': ['input', 'block_size', 'name', 'data_format'],
+    'tf.nn.weighted_moments': ['x', 'axes', 'frequency_weights', 'name', 'keep_dims'],
+    'tf.norm': ['tensor', 'ord', 'axis', 'keepdims', 'name', 'keep_dims'],
+    'tf.pad': ['tensor', 'paddings', 'mode', 'name', 'constant_values'],
+    'tf.parse_example': ['serialized', 'features', 'name', 'example_names'],
+    'tf.parse_single_example': ['serialized', 'features', 'name', 'example_names'],
+    'tf.quantize_v2': ['input', 'min_range', 'max_range', 'T', 'mode', 'name', 'round_mode'],
+    'tf.random.multinomial': ['logits', 'num_samples', 'seed', 'name', 'output_dtype'],
+    'tf.random.poisson': ['lam', 'shape', 'dtype', 'seed', 'name'],
+    'tf.random_poisson': ['lam', 'shape', 'dtype', 'seed', 'name'],
+    'tf.reduce_all': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
+    'tf.reduce_any': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
+    'tf.reduce_join': ['inputs', 'axis', 'keep_dims', 'separator', 'name', 'reduction_indices'],
+    'tf.reduce_logsumexp': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
+    'tf.reduce_max': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
+    'tf.reduce_mean': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
+    'tf.reduce_min': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
+    'tf.reduce_prod': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
+    'tf.reduce_sum': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
+    'tf.reverse_sequence': ['input', 'seq_lengths', 'seq_axis', 'batch_axis', 'name', 'seq_dim', 'batch_dim'],
+    'tf.serialize_many_sparse': ['sp_input', 'name', 'out_type'],
+    'tf.serialize_sparse': ['sp_input', 'name', 'out_type'],
+    'tf.shape': ['input', 'name', 'out_type'],
+    'tf.size': ['input', 'name', 'out_type'],
+    'tf.space_to_batch': ['input', 'paddings', 'block_size', 'name'],
+    'tf.space_to_depth': ['input', 'block_size', 'name', 'data_format'],
+    'tf.sparse.add': ['a', 'b', 'threshold', 'thresh'],
+    'tf.sparse.concat': ['axis', 'sp_inputs', 'name', 'expand_nonconcat_dim', 'concat_dim'],
+    'tf.sparse.reduce_max': ['sp_input', 'axis', 'keepdims', 'reduction_axes', 'keep_dims'],
+    'tf.sparse.segment_mean': ['data', 'indices', 'segment_ids', 'name', 'num_segments'],
+    'tf.sparse.segment_sqrt_n': ['data', 'indices', 'segment_ids', 'name', 'num_segments'],
+    'tf.sparse.segment_sum': ['data', 'indices', 'segment_ids', 'name', 'num_segments'],
+    'tf.sparse.split': ['keyword_required', 'sp_input', 'num_split', 'axis', 'name', 'split_dim'],
+    'tf.sparse_add': ['a', 'b', 'threshold', 'thresh'],
+    'tf.sparse_concat': ['axis', 'sp_inputs', 'name', 'expand_nonconcat_dim', 'concat_dim'],
+    'tf.sparse_matmul': ['a', 'b', 'transpose_a', 'transpose_b', 'a_is_sparse', 'b_is_sparse', 'name'],
+    'tf.sparse_reduce_max': ['sp_input', 'axis', 'keepdims', 'reduction_axes', 'keep_dims'],
+    'tf.sparse_segment_mean': ['data', 'indices', 'segment_ids', 'name', 'num_segments'],
+    'tf.sparse_segment_sqrt_n': ['data', 'indices', 'segment_ids', 'name', 'num_segments'],
+    'tf.sparse_segment_sum': ['data', 'indices', 'segment_ids', 'name', 'num_segments'],
+    'tf.sparse_split': ['keyword_required', 'sp_input', 'num_split', 'axis', 'name', 'split_dim'],
+    'tf.strings.length': ['input', 'name', 'unit'],
+    'tf.strings.reduce_join': ['inputs', 'axis', 'keep_dims', 'separator', 'name', 'reduction_indices'],
+    'tf.strings.substr': ['input', 'pos', 'len', 'name', 'unit'],
+    'tf.substr': ['input', 'pos', 'len', 'name', 'unit'],
+    'tf.transpose': ['a', 'perm', 'name', 'conjugate'],
+    'tf.tuple': ['tensors', 'name', 'control_inputs'],
+    'tf.while_loop': ['cond', 'body', 'loop_vars', 'shape_invariants', 'parallel_iterations', 'back_prop', 'swap_memory', 'name', 'maximum_iterations', 'return_same_structure']
+}
diff --git a/tensorflow/tools/compatibility/testdata/test_file_v0_11.py b/tensorflow/tools/compatibility/testdata/test_file_v0_11.py
index 68ba7a2630cec9cf23e9fbe3d1e9822c31ae3c0c..917236da4b4b75a1a1ca65e11d49d722cc178571 100644
--- a/tensorflow/tools/compatibility/testdata/test_file_v0_11.py
+++ b/tensorflow/tools/compatibility/testdata/test_file_v0_11.py
@@ -34,6 +34,7 @@ class TestUpgrade(test_util.TensorFlowTestCase):
   a unit test if the converter is successful.
   """
 
+  @test_util.run_v1_only("b/120545219")
   def testArgRenames(self):
     with self.cached_session():
 
@@ -97,6 +98,7 @@ class TestUpgrade(test_util.TensorFlowTestCase):
           tf.expand_dims([[1, 2], [3, 4]], axis=1).eval(),
           [[[1, 2]], [[3, 4]]])
 
+  @test_util.run_v1_only("b/120545219")
   def testArgMinMax(self):
     with self.cached_session():
       self.assertAllEqual(
@@ -112,6 +114,7 @@ class TestUpgrade(test_util.TensorFlowTestCase):
           tf.argmax([[1, 2, 3], [4, 1, 0]], dimension=0).eval(),
           [1, 0, 0])
 
+  @test_util.run_v1_only("b/120545219")
   def testExpandAndSqueeze(self):
     with self.cached_session():
 
@@ -139,6 +142,7 @@ class TestUpgrade(test_util.TensorFlowTestCase):
                   [[1, 2, 3]], dim=1), squeeze_dims=[1]).eval(),
           a)
 
+  @test_util.run_v1_only("b/120545219")
   def testArithmeticRenames(self):
     with self.cached_session() as s:
       stuff = tf.split(1, 2, [[1, 2, 3, 4], [4, 5, 6, 7]])
@@ -163,6 +167,7 @@ class TestUpgrade(test_util.TensorFlowTestCase):
       #     # TODO(aselle): (tf.batch_*)
       # ]
 
+  @test_util.run_v1_only("b/120545219")
   def testBatchAndSvd(self):
     with self.cached_session():
       mat = [[1., 2.], [2., 3.]]
@@ -174,6 +179,7 @@ class TestUpgrade(test_util.TensorFlowTestCase):
           tf.svd(mat, False, True).eval(),
           tf.svd(mat, compute_uv=False, full_matrices=True).eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testCrossEntropy(self):
     # TODO(aselle): Test sparse_softmax_...
     with self.cached_session():
@@ -190,6 +196,7 @@ class TestUpgrade(test_util.TensorFlowTestCase):
           tf.nn.sigmoid_cross_entropy_with_logits(
               labels=labels, logits=logits).eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testVariables(self):
     with self.cached_session() as s:
 
@@ -200,6 +207,7 @@ class TestUpgrade(test_util.TensorFlowTestCase):
       _ = [v.name for v in tf.all_variables()]
       _ = [v.name for v in tf.local_variables()]
 
+  @test_util.run_v1_only("b/120545219")
   def testSummaries(self):
     with self.cached_session() as s:
       var = tf.Variable([1, 2, 3], dtype=tf.float32)
diff --git a/tensorflow/tools/compatibility/testdata/test_file_v1_10.py b/tensorflow/tools/compatibility/testdata/test_file_v1_12.py
similarity index 93%
rename from tensorflow/tools/compatibility/testdata/test_file_v1_10.py
rename to tensorflow/tools/compatibility/testdata/test_file_v1_12.py
index fd688781b0dafe5d5162c63115d9fa0e5680ab3b..5ce4dd49adc940dbc56e19915a188cdb6b8de1d1 100644
--- a/tensorflow/tools/compatibility/testdata/test_file_v1_10.py
+++ b/tensorflow/tools/compatibility/testdata/test_file_v1_12.py
@@ -28,11 +28,13 @@ class TestUpgrade(test_util.TensorFlowTestCase):
   def setUp(self):
     tf.enable_eager_execution()
 
+  @test_util.run_v1_only("b/120545219")
   def testRenames(self):
     with self.cached_session():
       self.assertAllClose(1.04719755, tf.acos(0.5))
       self.assertAllClose(0.5, tf.rsqrt(4.0))
 
+  @test_util.run_v1_only("b/120545219")
   def testSerializeSparseTensor(self):
     sp_input = tf.SparseTensor(
         indices=tf.constant([[1]], dtype=tf.int64),
@@ -44,6 +46,7 @@ class TestUpgrade(test_util.TensorFlowTestCase):
       self.assertEqual((3,), serialized_sp.shape)
       self.assertTrue(serialized_sp[0].numpy())  # check non-empty
 
+  @test_util.run_v1_only("b/120545219")
   def testSerializeManySparse(self):
     sp_input = tf.SparseTensor(
         indices=tf.constant([[0, 1]], dtype=tf.int64),
@@ -55,6 +58,7 @@ class TestUpgrade(test_util.TensorFlowTestCase):
           sp_input, 'serialize_name', tf.string)
       self.assertEqual((1, 3), serialized_sp.shape)
 
+  @test_util.run_v1_only("b/120545219")
   def testArgMaxMin(self):
     self.assertAllClose(
         [1],
diff --git a/tensorflow/tools/compatibility/tf_upgrade_v2.py b/tensorflow/tools/compatibility/tf_upgrade_v2.py
index 83ef73b8faa2df6d9890c2a1fe042b916dbb4553..06a7bb781d71f003eea98812916d91d21f133ec2 100644
--- a/tensorflow/tools/compatibility/tf_upgrade_v2.py
+++ b/tensorflow/tools/compatibility/tf_upgrade_v2.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 from tensorflow.tools.compatibility import ast_edits
 from tensorflow.tools.compatibility import renames_v2
+from tensorflow.tools.compatibility import reorders_v2
 
 
 class TFAPIChangeSpec(ast_edits.APIChangeSpec):
@@ -35,6 +36,18 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
         "tf.argmax": {
             "dimension": "axis",
         },
+        "tf.arg_min": {
+            "dimension": "axis",
+        },
+        "tf.arg_max": {
+            "dimension": "axis",
+        },
+        "tf.math.argmin": {
+            "dimension": "axis",
+        },
+        "tf.math.argmax": {
+            "dimension": "axis",
+        },
         "tf.image.crop_and_resize": {
             "box_ind": "box_indices",
         },
@@ -47,18 +60,36 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
         "tf.expand_dims": {
             "dim": "axis",
         },
-        "tf.batch_to_space_nd": {
+        "tf.batch_to_space": {
+            "block_size": "block_shape",
+        },
+        "tf.space_to_batch": {
+            "block_size": "block_shape",
+        },
+        "tf.nn.space_to_batch": {
             "block_size": "block_shape",
         },
         "tf.constant": {
-            "verify_shapes": "verify_shapes_is_now_always_true",
+            "verify_shape": "verify_shape_is_now_always_true",
         },
         "tf.convert_to_tensor": {
             "preferred_dtype": "dtype_hint"
         },
+        "tf.nn.softmax_cross_entropy_with_logits_v2": {
+            "dim": "axis"
+        },
         "tf.linalg.l2_normalize": {
             "dim": "axis",
         },
+        "tf.linalg.norm": {
+            "keep_dims": "keepdims",
+        },
+        "tf.norm": {
+            "keep_dims": "keepdims",
+        },
+        "tf.load_file_system_library": {
+            "library_filename": "library_location",
+        },
         "tf.math.count_nonzero": {
             "input_tensor": "input",
             "keep_dims": "keepdims",
@@ -92,6 +123,9 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
         "tf.nn.separable_conv2d": {
             "rate": "dilations"
         },
+        "tf.nn.depthwise_conv2d": {
+            "rate": "dilations"
+        },
         "tf.nn.softmax": {
             "dim": "axis"
         },
@@ -110,14 +144,35 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
         },
         "tf.sparse.concat": {
             "concat_dim": "axis",
+            "expand_nonconcat_dim": "expand_nonconcat_dims",
         },
         "tf.sparse_concat": {
             "concat_dim": "axis",
+            "expand_nonconcat_dim": "expand_nonconcat_dims",
         },
         "tf.sparse.split": {
             "split_dim": "axis",
         },
-        "tf.max_pool_with_argmax": {
+        "tf.sparse_split": {
+            "split_dim": "axis",
+        },
+        "tf.sparse.reduce_max": {
+            "reduction_axes": "axis",
+            "keep_dims": "keepdims",
+        },
+        "tf.sparse_reduce_max": {
+            "reduction_axes": "axis",
+            "keep_dims": "keepdims",
+        },
+        "tf.sparse.reduce_sum": {
+            "reduction_axes": "axis",
+            "keep_dims": "keepdims",
+        },
+        "tf.sparse_reduce_sum": {
+            "reduction_axes": "axis",
+            "keep_dims": "keepdims",
+        },
+        "tf.nn.max_pool_with_argmax": {
             "Targmax": "output_dtype",
         },
         "tf.multinomial": {
@@ -126,14 +181,15 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
         "tf.random.multinomial": {
             "output_dtype": "dtype",
         },
+        "tf.reverse_sequence": {
+            "seq_dim": "seq_axis",
+            "batch_dim": "batch_axis",
+        },
         "tf.nn.batch_norm_with_global_normalization": {
             "t": "input",
             "m": "mean",
             "v": "variance",
         },
-        "tf.manip.batch_to_space_nd": {
-            "block_size": "block_shape",
-        },
         "tf.nn.dilation2d": {
             "filter": "filters",
             "rates": "dilations",
@@ -147,6 +203,10 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
         "tf.ones_like": {
             "tensor": "input",
         },
+        "tf.nn.conv2d_transpose": {
+            "value": "input",
+            "filter": "filters",
+        },
         "tf.nn.conv3d_transpose": {
             "value": "input",
             "filter": "filters",
@@ -187,8 +247,8 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
             "newpath": "dst",
         },
         "tf.gfile.Rename": {
-            "oldpath": "src",
-            "newpath": "dst",
+            "oldname": "src",
+            "newname": "dst",
         },
         "tf.gfile.Walk": {
             "in_order": "topdown",
@@ -283,6 +343,9 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
         "tf.squeeze": {
             "squeeze_dims": "axis",
         },
+        "tf.nn.weighted_moments": {
+            "keep_dims": "keepdims"
+        },
     }
 
     # pylint: disable=line-too-long
@@ -293,6 +356,12 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
     self.manual_symbol_renames = {
         "tf.batch_to_space_nd":
             "tf.batch_to_space",
+        "tf.space_to_batch_nd":
+            "tf.space_to_batch",
+        "tf.nn.space_to_batch":
+            "tf.space_to_batch",
+        "tf.extract_image_patches":
+            "tf.image.extract_image_patches",
         "tf.gfile.Copy":
             "tf.io.gfile.copy",
         "tf.gfile.DeleteRecursively":
@@ -401,6 +470,10 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
             "tf.data.experimental.unbatch",
         "tf.contrib.data.unique":
             "tf.data.experimental.unique",
+        "tf.contrib.rnn.RNNCell":
+            "tf.nn.rnn_cell.RNNCell",
+        "tf.contrib.rnn.LSTMStateTuple":
+            "tf.nn.rnn_cell.LSTMStateTuple",
         "tf.contrib.framework.sort":
             "tf.sort",
         "tf.contrib.framework.argsort":
@@ -409,10 +482,22 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
             "tf.batch_to_space",
         "tf.quantize_v2":
             "tf.quantization.quantize",
+        "tf.sparse_add":
+            "tf.sparse.add",
         "tf.sparse_concat":
             "tf.sparse.concat",
         "tf.sparse_split":
             "tf.sparse.split",
+        "tf.sparse_matmul":
+            "tf.linalg.matmul",
+        "tf.sparse_reduce_sum":
+            "tf.sparse.reduce_sum",
+        "tf.sparse_reduce_max":
+            "tf.sparse.reduce_max",
+        "tf.random.stateless_multinomial":
+            "tf.random.stateless_categorical",
+        "tf.substr":
+            "tf.strings.substr",
         "tf.string_to_hash_bucket":
             "tf.strings.to_hash_bucket",
         "tf.string_to_number":
@@ -421,6 +506,8 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
             "tf.random.categorical",
         "tf.random.multinomial":
             "tf.random.categorical",
+        "tf.reduce_join":
+            "tf.strings.reduce_join",
         "tf.load_file_system_library":
             "tf.load_library",
         "tf.pywrap_tensorflow":
@@ -435,14 +522,38 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
             "tf.io.decode_csv",
         "tf.data.Iterator":
             "tf.compat.v1.data.Iterator",
+        "tf.parse_example":
+            "tf.io.parse_example",
+        "tf.parse_single_example":
+            "tf.io.parse_single_example",
         "tf.nn.fused_batch_norm":
             "tf.compat.v1.nn.fused_batch_norm",
+        "tf.nn.softmax_cross_entropy_with_logits_v2":
+            "tf.nn.softmax_cross_entropy_with_logits",
         "tf.losses.Reduction.MEAN":
             "tf.compat.v1.losses.Reduction.MEAN",
         "tf.losses.Reduction.SUM_BY_NONZERO_WEIGHTS":
             "tf.compat.v1.losses.Reduction.SUM_BY_NONZERO_WEIGHTS",
         "tf.losses.Reduction.SUM_OVER_NONZERO_WEIGHTS":
             "tf.compat.v1.losses.Reduction.SUM_OVER_NONZERO_WEIGHTS",
+        "tf.lite.constants.FLOAT":
+            "tf.float32",
+        "tf.lite.constants.INT32":
+            "tf.int32",
+        "tf.lite.constants.INT64":
+            "tf.int64",
+        "tf.lite.constants.STRING":
+            "tf.string",
+        "tf.lite.constants.QUANTIZED_UINT8":
+            "tf.uint8",
+        "tf.arg_max":
+            "tf.argmax",
+        "tf.arg_min":
+            "tf.argmin",
+        # tf.nn.ctc_loss is still available in 2.0 but behavior
+        # changed significantly.
+        "tf.nn.ctc_loss":
+            "tf.compat.v1.nn.ctc_loss",
     }
     # pylint: enable=line-too-long
 
@@ -453,171 +564,94 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
     # Variables that should be changed to functions.
     self.change_to_function = {}
 
+    # pylint: disable=line-too-long
+    # This list should just contain names of functions that had
+    # their arguments reordered. After adding a function name to the list
+    # run the following to update reorders_v2.py:
+    # bazel build tensorflow/tools/compatibility/update:generate_v2_reorders_map
+    # bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
+    # pylint: enable=line-too-long
+    self.reordered_function_names = {
+        "tf.io.serialize_sparse",
+        "tf.io.serialize_many_sparse",
+        "tf.argmax",
+        "tf.argmin",
+        "tf.batch_to_space",
+        "tf.nn.space_to_batch",
+        "tf.boolean_mask",
+        "tf.convert_to_tensor",
+        "tf.nn.moments",
+        "tf.nn.convolution",
+        "tf.nn.crelu",
+        "tf.nn.weighted_moments",
+        "tf.nn.pool",
+        "tf.nn.separable_conv2d",
+        "tf.nn.depthwise_conv2d",
+        "tf.multinomial",
+        "tf.random.multinomial",
+        "tf.pad",
+        "tf.quantize_v2",
+        "tf.feature_column.categorical_column_with_vocabulary_file",
+        "tf.shape",
+        "tf.size",
+        "tf.random.poisson",
+        "tf.sparse.add",
+        "tf.sparse_add",
+        "tf.sparse.concat",
+        "tf.sparse_concat",
+        "tf.sparse.segment_mean",
+        "tf.sparse.segment_sqrt_n",
+        "tf.sparse.segment_sum",
+        "tf.sparse_matmul",
+        "tf.sparse.reduce_max",
+        "tf.sparse_reduce_max",
+        "tf.io.decode_csv",
+        "tf.strings.length",
+        "tf.strings.reduce_join",
+        "tf.strings.substr",
+        "tf.substr",
+        "tf.transpose",
+        "tf.tuple",
+        "tf.parse_example",
+        "tf.parse_single_example",
+        "tf.io.parse_example",
+        "tf.io.parse_single_example",
+        "tf.while_loop",
+        "tf.reduce_all",
+        "tf.math.reduce_all",
+        "tf.reduce_any",
+        "tf.math.reduce_any",
+        "tf.reduce_min",
+        "tf.math.reduce_min",
+        "tf.reduce_max",
+        "tf.math.reduce_max",
+        "tf.reduce_sum",
+        "tf.math.reduce_sum",
+        "tf.reduce_mean",
+        "tf.math.reduce_mean",
+        "tf.reduce_prod",
+        "tf.math.reduce_prod",
+        "tf.reduce_logsumexp",
+        "tf.math.reduce_logsumexp",
+        "tf.reduce_join",
+        "tf.confusion_matrix",
+        "tf.math.confusion_matrix",
+        "tf.math.in_top_k",
+        "tf.nn.depth_to_space",
+        "tf.nn.embedding_lookup",
+        "tf.nn.embedding_lookup_sparse",
+        "tf.nn.in_top_k",
+        "tf.nn.space_to_depth",
+        "tf.linalg.norm",
+        "tf.norm",
+        "tf.reverse_sequence",
+        "tf.sparse_split",
+    }
+
     # Functions that were reordered should be changed to the new keyword args
     # for safety, if positional arguments are used. If you have reversed the
     # positional arguments yourself, this could do the wrong thing.
-    # IMPORTANT: order here should correspond to OLD argument order.
-    # We just prepend "arg_name=" to all arguments in function calls.
-    self.function_reorders = {
-        "tf.io.serialize_sparse": ["sp_input", "name", "out_type"],
-        "tf.io.serialize_many_sparse": ["sp_input", "name", "out_type"],
-        "tf.argmax": ["input", "axis", "name", "axis", "output_type"],
-        "tf.argmin": ["input", "axis", "name", "axis", "output_type"],
-        "tf.batch_to_space": ["input", "crops", "block_size", "name"],
-        "tf.boolean_mask": ["tensor", "mask", "name", "axis"],
-        "tf.convert_to_tensor": ["value", "dtype", "name", "preferred_dtype"],
-        "tf.nn.moments": ["x", "axes", "shift", "keepdims", "name"],
-        "tf.nn.convolution": [
-            "input", "filter", "padding", "strides", "dilation_rate", "name",
-            "data_format"
-        ],
-        "tf.nn.crelu": ["features", "name", "axis"],
-        "tf.nn.pool": [
-            "input", "window_shape", "pooling_type", "padding", "dilation_rate",
-            "strides", "name", "data_format"
-        ],
-        "tf.nn.depthwise_conv2d": [
-            "input", "filter", "strides", "padding", "rate", "name",
-            "data_format"
-        ],
-        "tf.manip.batch_to_space_nd": ["input", "crops", "block_size", "name"],
-        "tf.multinomial": [
-            "logits", "num_samples", "seed", "name", "output_dtype"
-        ],
-        "tf.random.multinomial": [
-            "logits", "num_samples", "seed", "name", "output_dtype"
-        ],
-        "tf.pad": ["tensor", "paddings", "mode", "name", "constant_values"],
-        "tf.quantize_v2": [
-            "input", "min_range", "max_range", "T", "mode", "name", "round_mode"
-        ],
-        "tf.feature_column.categorical_column_with_vocabulary_file": [
-            "key", "vocabulary_file", "vocabulary_size", "num_oov_buckets",
-            "default_value", "dtype"
-        ],
-        "tf.shape": ["input", "name", "out_type"],
-        "tf.size": ["input", "name", "out_type"],
-        "tf.random.poisson": ["lam", "shape", "dtype", "seed", "name"],
-        "tf.sparse.add": ["a", "b", "thresh"],
-        "tf.sparse_add": ["a", "b", "thresh"],
-        "tf.sparse.concat": [
-            "axis", "sp_inputs", "name", "expand_nonconcat_dim", "concat_dim"
-        ],
-        "tf.sparse_concat": [
-            "axis", "sp_inputs", "name", "expand_nonconcat_dim", "concat_dim"
-        ],
-        "tf.sparse.segment_mean": [
-            "data", "indices", "segment_ids", "name", "num_segments"
-        ],
-        "tf.sparse.segment_sqrt_n": [
-            "data", "indices", "segment_ids", "name", "num_segments"
-        ],
-        "tf.sparse.segment_sum": [
-            "data", "indices", "segment_ids", "name", "num_segments"
-        ],
-        "tf.io.decode_csv": [
-            "records",
-            "record_defaults",
-            "field_delim",
-            "use_quote_delim",
-            "name",
-            "na_value",
-            "select_cols",
-        ],
-        "tf.strings.substr": ["input", "pos", "len", "name", "unit"],
-        "tf.strings.reduce_join": [
-            "input", "axis", "keep_dims", "separator", "name",
-            "reduction_indices"
-        ],
-        "tf.strings.length": ["input", "name", "unit"],
-        "tf.transpose": ["a", "perm", "name", "conjugate"],
-        "tf.tuple": ["tensors", "name", "control_inputs"],
-        "tf.io.parse_example": [
-            "serialized", "features", "name", "example_names"
-        ],
-        "tf.io.parse_single_example": [
-            "serialized", "features", "name", "example_names"
-        ],
-        "tf.while_loop": [
-            "cond", "body", "loop_vars", "shape_invariants",
-            "parallel_iterations", "back_prop", "swap_memory", "name",
-            "maximum_iterations", "return_same_structure"
-        ],
-        "tf.reduce_all": [
-            "input_tensor", "axis", "keepdims", "name", "reduction_indices",
-            "keep_dims"
-        ],
-        "tf.math.reduce_all": [
-            "input_tensor", "axis", "keepdims", "name", "reduction_indices",
-            "keep_dims"
-        ],
-        "tf.reduce_any": [
-            "input_tensor", "axis", "keepdims", "name", "reduction_indices",
-            "keep_dims"
-        ],
-        "tf.math.reduce_any": [
-            "input_tensor", "axis", "keepdims", "name", "reduction_indices",
-            "keep_dims"
-        ],
-        "tf.reduce_min": [
-            "input_tensor", "axis", "keepdims", "name", "reduction_indices",
-            "keep_dims"
-        ],
-        "tf.math.reduce_min": [
-            "input_tensor", "axis", "keepdims", "name", "reduction_indices",
-            "keep_dims"
-        ],
-        "tf.reduce_max": [
-            "input_tensor", "axis", "keepdims", "name", "reduction_indices",
-            "keep_dims"
-        ],
-        "tf.math.reduce_max": [
-            "input_tensor", "axis", "keepdims", "name", "reduction_indices",
-            "keep_dims"
-        ],
-        "tf.reduce_sum": [
-            "input_tensor", "axis", "keepdims", "name", "reduction_indices",
-            "keep_dims"
-        ],
-        "tf.math.reduce_sum": [
-            "input_tensor", "axis", "keepdims", "name", "reduction_indices",
-            "keep_dims"
-        ],
-        "tf.reduce_mean": [
-            "input_tensor", "axis", "keepdims", "name", "reduction_indices",
-            "keep_dims"
-        ],
-        "tf.math.reduce_mean": [
-            "input_tensor", "axis", "keepdims", "name", "reduction_indices",
-            "keep_dims"
-        ],
-        "tf.reduce_prod": [
-            "input_tensor", "axis", "keepdims", "name", "reduction_indices",
-            "keep_dims"
-        ],
-        "tf.math.reduce_prod": [
-            "input_tensor", "axis", "keepdims", "name", "reduction_indices",
-            "keep_dims"
-        ],
-        "tf.reduce_logsumexp": [
-            "input_tensor", "axis", "keepdims", "name", "reduction_indices",
-            "keep_dims"
-        ],
-        "tf.math.reduce_logsumexp": [
-            "input_tensor", "axis", "keepdims", "name", "reduction_indices",
-            "keep_dims"
-        ],
-        "tf.reduce_join": [
-            "input", "axis", "keep_dims", "separator", "name",
-            "reduction_indices"
-        ],
-        "tf.confusion_matrix": [
-            "labels", "predictions", "num_classes", "dtype", "name", "weights"
-        ],
-        "tf.math.confusion_matrix": [
-            "labels", "predictions", "num_classes", "dtype", "name", "weights"
-        ]
-    }
+    self.function_reorders = reorders_v2.reorders
 
     # Specially handled functions.
     self.function_handle = {
@@ -665,30 +699,60 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
         " they may already have been correct)."
     )
 
+    deprecate_partition_strategy_comment = (
+        "WARNING: `partition_strategy` has been removed from `%s` "
+        " The 'div' strategy is used by default.")
+
     # Function warnings. <function name> placeholder inside warnings will be
     # replaced by function name.
     self.function_warnings = {
-        "tf.assert_greater": assert_return_type_comment,
-        "tf.assert_equal": assert_return_type_comment,
-        "tf.assert_less": assert_return_type_comment,
-        "tf.assert_rank": assert_rank_comment,
-        "tf.debugging.assert_equal": assert_return_type_comment,
-        "tf.debugging.assert_greater": assert_return_type_comment,
-        "tf.debugging.assert_greater_equal": assert_return_type_comment,
-        "tf.debugging.assert_integer": assert_return_type_comment,
-        "tf.debugging.assert_less": assert_return_type_comment,
-        "tf.debugging.assert_less_equal": assert_return_type_comment,
-        "tf.debugging.assert_near": assert_return_type_comment,
-        "tf.debugging.assert_negative": assert_return_type_comment,
-        "tf.debugging.assert_non_negative": assert_return_type_comment,
-        "tf.debugging.assert_non_positive": assert_return_type_comment,
-        "tf.debugging.assert_none_equal": assert_return_type_comment,
-        "tf.debugging.assert_positive": assert_return_type_comment,
-        "tf.debugging.assert_rank": assert_rank_comment,
-        "tf.debugging.assert_rank_at_least": assert_rank_comment,
-        "tf.debugging.assert_rank_in": assert_rank_comment,
-        "tf.flags": "tf.flags has been removed, please use the argparse or absl"
-                    " module if you need command line parsing.",
+        "tf.assert_greater":
+            assert_return_type_comment,
+        "tf.assert_equal":
+            assert_return_type_comment,
+        "tf.assert_less":
+            assert_return_type_comment,
+        "tf.assert_rank":
+            assert_rank_comment,
+        "tf.cond": "tf.cond no longer takes 'strict'. "
+                   "Now 'strict' defaults to True."
+                   "fn1/fn2 arguments are replaced by true_fn/false_fn.",
+        "tf.debugging.assert_equal":
+            assert_return_type_comment,
+        "tf.debugging.assert_greater":
+            assert_return_type_comment,
+        "tf.debugging.assert_greater_equal":
+            assert_return_type_comment,
+        "tf.debugging.assert_integer":
+            assert_return_type_comment,
+        "tf.debugging.assert_less":
+            assert_return_type_comment,
+        "tf.debugging.assert_less_equal":
+            assert_return_type_comment,
+        "tf.debugging.assert_near":
+            assert_return_type_comment,
+        "tf.debugging.assert_negative":
+            assert_return_type_comment,
+        "tf.debugging.assert_non_negative":
+            assert_return_type_comment,
+        "tf.debugging.assert_non_positive":
+            assert_return_type_comment,
+        "tf.debugging.assert_none_equal":
+            assert_return_type_comment,
+        "tf.debugging.assert_positive":
+            assert_return_type_comment,
+        "tf.debugging.assert_rank":
+            assert_rank_comment,
+        "tf.debugging.assert_rank_at_least":
+            assert_rank_comment,
+        "tf.debugging.assert_rank_in":
+            assert_rank_comment,
+        "tf.device": "tf.device no longer takes function as an argument. "
+                     "'devide_name_or_function' argument has been renamed to "
+                     "'device_name'.",
+        "tf.flags":
+            "tf.flags has been removed, please use the argparse or absl"
+            " module if you need command line parsing.",
         "tf.train.exponential_decay":
             decay_function_comment,
         "tf.train.piecewise_constant_decay":
@@ -723,21 +787,63 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
             default_loss_reduction_changed,
         "tf.estimator.BaselineRegressor":
             default_loss_reduction_changed,
+        "tf.hessians": "tf.hessians no longer takes "
+                       "'colocate_gradients_with_ops' argument. Also, "
+                       "arguments have been reordered so that 'name' is the "
+                       "last argument.",
         "tf.nn.conv1d":
-        "WARNING: use_cudnn_on_gpu argument has been removed and \"value\" was "
-        "renamed to \"input\"",
+            "WARNING: use_cudnn_on_gpu argument has been removed and \"value\""
+            " was renamed to \"input\"",
         "tf.nn.conv2d":
-        "WARNING: use_cudnn_on_gpu argument has been removed and \"filter\" "
-        "was renamed to \"filters\"",
+            "WARNING: use_cudnn_on_gpu argument has been removed and "
+            "\"filter\" was renamed to \"filters\"",
         "tf.nn.conv2d_backprop_filter":
-        "WARNING: use_cudnn_on_gpu argument has been removed",
+            "WARNING: use_cudnn_on_gpu argument has been removed",
         "tf.nn.conv2d_backprop_input":
-        "WARNING: use_cudnn_on_gpu argument has been removed and \"filter\" "
-        "was renamed to \"filters\"",
+            "WARNING: use_cudnn_on_gpu argument has been removed and "
+            "\"filter\" was renamed to \"filters\"",
         "tf.nn.erosion2d":
-        "WARNING: <function name> now requires a data_format argument",
-        "tf.zeros_like": tf_01s_like_no_optimize_comment,
-        "tf.ones_like": tf_01s_like_no_optimize_comment,
+            "WARNING: <function name> now requires a data_format argument",
+        "tf.nn.nce_loss":
+            deprecate_partition_strategy_comment % "tf.nn.nce_loss",
+        "tf.nn.safe_embedding_lookup_sparse":
+            deprecate_partition_strategy_comment %
+            "tf.nn.safe_embedding_lookup_sparse",
+        "tf.nn.sampled_softmax_loss":
+            deprecate_partition_strategy_comment % "tf.nn.sampled_softmax_loss",
+        "tf.zeros_like":
+            tf_01s_like_no_optimize_comment,
+        "tf.ones_like":
+            tf_01s_like_no_optimize_comment,
+        "tf.nn.embedding_lookup":
+            "WARNING: validate_indices argument has been removed.",
+        "tf.while_loop":
+            "tf.while_loop no longer takes 'return_same_structure' argument. "
+            "'return_same_structure' now defaults to True. Also, 'name'"
+            "argument is now the last argument.",
+        "tf.image.sample_distorted_bounding_box":
+            "tf.image.sample_distorted_bounding_box no longer takes 'seed2' "
+            "argument.",
+        "tf.nn.ctc_beam_search_decoder":
+            "tf.nn.ctc_beam_search_decoder no longer takes 'merge_repeated' "
+            "argument. 'merge_repeated' now defaults to False.",
+        "tf.nn.fractional_avg_pool":
+            "tf.nn.fractional_avg_pool no longer takes 'seed2' and "
+            "'deterministic' arguments. Now it takes a single 'seed' arg. If "
+            "'seed' is zero, the execution is random and deterministic "
+            "otherwise",
+        "tf.nn.fractional_max_pool":
+            "tf.nn.fractional_max_pool no longer takes 'seed2' and "
+            "'deterministic' arguments. Now it takes a single 'seed' arg. If "
+            "'seed' is zero, the execution is random and deterministic "
+            "otherwise",
+        "tf.nn.softmax_cross_entropy_with_logits":
+            "tf.nn.softmax_cross_entropy_with_logits behavior has changed. "
+            "'labels' needs to be wrapped with tf.stop_gradient to keep the "
+            "old behavior. Also, 'dim' argument has been renamed to 'axis'.",
+        "tf.test.assert_equal_graph_def":
+            "tf.assert_equal_graph_def no longer takes 'checkpoint_v2' "
+            "argument. 'checkpoint_v2' now defaults to True.",
     }
 
     self.symbol_renames = {
@@ -754,11 +860,32 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
         "only effects core estimator. If you are using "
         "tf.contrib.learn.Estimator, please switch to using core estimator.")
 
+    make_initializable_iterator_deprecation = (
+        "(Manual edit required) The "
+        "`tf.data.Dataset.make_initializable_iterator()` method has been "
+        "removed. If you are using the Estimator API, you can return a dataset "
+        "directly from your input functions without creating an iterator. "
+        "As a last resort, please replace calls to that method on `dataset` "
+        "with a call to "
+        "`tf.compat.v1.data.make_initializable_iterator(dataset)`.")
+
+    make_one_shot_iterator_deprecation = (
+        "(Manual edit required) The "
+        "`tf.data.Dataset.make_one_shot_iterator()` method has been "
+        "removed. If you are using eager execution, you can iterate over "
+        "`dataset` using a Python `for` loop. If you are using the Estimator "
+        "API, you can return a dataset directly from your input functions "
+        "without creating an iterator. As a last resort, please replace calls "
+        "to that method on `dataset` with a call to "
+        "`tf.compat.v1.data.make_one_shot_iterator(dataset)`.")
+
     # Specify warnings for functions that aren't restricted to the tf.x.y.z
     # format. This should only be used for methods with unique names, e.g.
     # export_savedmodel, which is only defined in Estimator objects.
     self.unrestricted_function_warnings = {
         "export_savedmodel": export_saved_model_renamed,
+        "make_initializable_iterator": make_initializable_iterator_deprecation,
+        "make_one_shot_iterator": make_one_shot_iterator_deprecation,
     }
 
   @staticmethod
diff --git a/tensorflow/tools/compatibility/tf_upgrade_v2_test.py b/tensorflow/tools/compatibility/tf_upgrade_v2_test.py
index dbddf170d6149829149d92bbfe2d0a0ac50fc7f2..d5428e7536989df407e40d23be3efbb037d94528 100644
--- a/tensorflow/tools/compatibility/tf_upgrade_v2_test.py
+++ b/tensorflow/tools/compatibility/tf_upgrade_v2_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import inspect
 import os
 import tempfile
 
@@ -25,41 +26,53 @@ import six
 import tensorflow as tf
 # OSS TF V2 import placeholder.
 
-
 from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test as test_lib
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_export
+from tensorflow.python.util import tf_inspect
 from tensorflow.tools.common import public_api
 from tensorflow.tools.common import traverse
 from tensorflow.tools.compatibility import ast_edits
 from tensorflow.tools.compatibility import tf_upgrade_v2
 
 
-_TENSORFLOW_API_ATTR_V1 = (
-    tf_export.API_ATTRS_V1[tf_export.TENSORFLOW_API_NAME].names)
-_TENSORFLOW_API_ATTR = tf_export.API_ATTRS[tf_export.TENSORFLOW_API_NAME].names
-_ESTIMATOR_API_ATTR_V1 = (
-    tf_export.API_ATTRS_V1[tf_export.ESTIMATOR_API_NAME].names)
-_ESTIMATOR_API_ATTR = tf_export.API_ATTRS[tf_export.ESTIMATOR_API_NAME].names
+def get_symbol_for_name(root, name):
+  name_parts = name.split(".")
+  symbol = root
+  # Iterate starting with second item since 1st item is "tf.".
+  for part in name_parts[1:]:
+    symbol = getattr(symbol, part)
+  return symbol
+
+
+def get_args(symbol):
+  if hasattr(inspect, "signature"):
+    signature = inspect.signature(symbol)
+    # Ignore *args and **kwargs for now.
+    return [param.name for param in signature.parameters.values()
+            if param.kind == param.POSITIONAL_OR_KEYWORD]
+  return tf_inspect.getargspec(symbol)[0]
 
 
-def get_v1_names(symbol):
-  names_v1 = []
-  if hasattr(symbol, _TENSORFLOW_API_ATTR_V1):
-    names_v1.extend(getattr(symbol, _TENSORFLOW_API_ATTR_V1))
-  if hasattr(symbol, _ESTIMATOR_API_ATTR_V1):
-    names_v1.extend(getattr(symbol, _ESTIMATOR_API_ATTR_V1))
-  return names_v1
+def get_func_and_args_from_str(call_str):
+  """Parse call string to get function and argument names.
 
+  Args:
+    call_str: Call string must be in the form:
+              `tf.foo(arg1=val1, arg2=val2, ...)`.
 
-def get_v2_names(symbol):
-  names_v2 = set()
-  if hasattr(symbol, _TENSORFLOW_API_ATTR):
-    names_v2.update(getattr(symbol, _TENSORFLOW_API_ATTR))
-  if hasattr(symbol, _ESTIMATOR_API_ATTR):
-    names_v2.update(getattr(symbol, _ESTIMATOR_API_ATTR))
-  return list(names_v2)
+  Returns:
+    (function_name, list of arg names) tuple.
+  """
+  open_paren_index = call_str.find("(")
+  close_paren_index = call_str.rfind(")")
+
+  function_name = call_str[:call_str.find("(")]
+  args = call_str[open_paren_index+1:close_paren_index].split(",")
+  args = [arg.split("=")[0].strip() for arg in args]
+  args = [arg for arg in args if arg]  # filter out empty strings
+  return function_name, args
 
 
 class TestUpgrade(test_util.TensorFlowTestCase):
@@ -70,6 +83,22 @@ class TestUpgrade(test_util.TensorFlowTestCase):
   work when run with current TensorFlow.
   """
 
+  @classmethod
+  def setUpClass(cls):
+    cls.v2_symbols = {}
+    if not hasattr(tf.compat, "v2"):
+      return
+
+    def symbol_collector(unused_path, unused_parent, children):
+      for child in children:
+        _, attr = tf_decorator.unwrap(child[1])
+        api_names_v2 = tf_export.get_v2_names(attr)
+        for name in api_names_v2:
+          cls.v2_symbols["tf." + name] = attr
+
+    visitor = public_api.PublicAPIVisitor(symbol_collector)
+    traverse.traverse(tf.compat.v2, visitor)
+
   def _upgrade(self, old_file_text):
     in_file = six.StringIO(old_file_text)
     out_file = six.StringIO()
@@ -104,29 +133,19 @@ class TestUpgrade(test_util.TensorFlowTestCase):
     if not hasattr(tf.compat, "v2"):
       return
 
-    v2_symbols = set([])
-
-    def symbol_collector(unused_path, unused_parent, children):
-      for child in children:
-        _, attr = tf_decorator.unwrap(child[1])
-        api_names_v2 = get_v2_names(attr)
-        for name in api_names_v2:
-          v2_symbols.add("tf." + name)
-
-    visitor = public_api.PublicAPIVisitor(symbol_collector)
-    traverse.traverse(tf.compat.v2, visitor)
-
     # Converts all symbols in the v1 namespace to the v2 namespace, raising
     # an error if the target of the conversion is not in the v2 namespace.
+    # Please regenerate the renames file or edit any manual renames if this
+    # test fails.
     def conversion_visitor(unused_path, unused_parent, children):
       for child in children:
         _, attr = tf_decorator.unwrap(child[1])
-        api_names = get_v1_names(attr)
+        api_names = tf_export.get_v1_names(attr)
         for name in api_names:
           _, _, _, text = self._upgrade("tf." + name)
           if (text and
               not text.startswith("tf.compat.v1") and
-              text not in v2_symbols):
+              text not in self.v2_symbols):
             self.assertFalse(
                 True, "Symbol %s generated from %s not in v2 API" % (
                     text, name))
@@ -136,6 +155,179 @@ class TestUpgrade(test_util.TensorFlowTestCase):
     visitor.private_map["tf.compat"] = ["v1", "v2"]
     traverse.traverse(tf.compat.v1, visitor)
 
+  def testAllAPIV1(self):
+    collect = True
+    v1_symbols = set([])
+
+    # Symbols which may be generated by the conversion script which do not exist
+    # in TF 1.x. This should be a very short list of symbols which are
+    # experimental in 1.x but stable for 2.x.
+    whitelisted_v2_only_symbols = set(["tf.saved_model.save"])
+
+    # Converts all symbols in the v1 namespace to the v2 namespace, raising
+    # an error if the target of the conversion is not in the v1 namespace.
+    def conversion_visitor(unused_path, unused_parent, children):
+      for child in children:
+        _, attr = tf_decorator.unwrap(child[1])
+        api_names = tf_export.get_v1_names(attr)
+        for name in api_names:
+          if collect:
+            v1_symbols.add("tf." + name)
+          else:
+            _, _, _, text = self._upgrade("tf." + name)
+            if (text and
+                not text.startswith("tf.compat.v1") and
+                not text.startswith("tf.estimator") and
+                text not in v1_symbols and
+                text not in whitelisted_v2_only_symbols):
+              self.assertFalse(
+                  True, "Symbol %s generated from %s not in v1 API" % (
+                      text, name))
+
+    visitor = public_api.PublicAPIVisitor(conversion_visitor)
+    visitor.do_not_descend_map["tf"].append("contrib")
+    visitor.private_map["tf.compat"] = ["v1", "v2"]
+    traverse.traverse(tf.compat.v1, visitor)
+    collect = False
+    traverse.traverse(tf.compat.v1, visitor)
+
+  def testV1KeywordArgNames(self):
+    all_keyword_renames = (
+        tf_upgrade_v2.TFAPIChangeSpec().function_keyword_renames)
+
+    # Visitor that verifies V1 argument names.
+    def arg_test_visitor(unused_path, unused_parent, children):
+      for child in children:
+        _, attr = tf_decorator.unwrap(child[1])
+        names_v1 = tf_export.get_v1_names(attr)
+
+        for name in names_v1:
+          name = "tf.%s" % name
+          if name not in all_keyword_renames:
+            continue
+          arg_names_v1 = tf_inspect.getargspec(attr)[0]
+          keyword_renames = all_keyword_renames[name]
+          self.assertEqual(type(keyword_renames), dict)
+
+          # Assert that v1 function has valid v1 argument names.
+          for from_name, _ in keyword_renames.items():
+            self.assertIn(
+                from_name, arg_names_v1,
+                "%s not found in %s arguments: %s" %
+                (from_name, name, str(arg_names_v1)))
+
+    visitor = public_api.PublicAPIVisitor(arg_test_visitor)
+    visitor.do_not_descend_map["tf"].append("contrib")
+    visitor.private_map["tf.compat"] = ["v1", "v2"]
+    traverse.traverse(tf.compat.v1, visitor)
+
+  def testV2KeywordArgNames(self):
+    # This test converts a call of the form:
+    # tf.foo(arg1=0, arg2=1, ...)
+    # to 2.0. Then, checks that converted function has valid argument names.
+    if not hasattr(tf.compat, "v2"):
+      return
+    v2_arg_exceptions = {
+        "verify_shape_is_now_always_true",
+        # These arguments should not be used, they just specify
+        # that a function takes named arguments.
+        "keyword_required",
+        "_sentinel",
+    }
+    v1_name_exceptions = {
+        "tf.print",  # requires print_function import
+    }
+    function_warnings = (
+        tf_upgrade_v2.TFAPIChangeSpec().function_warnings)
+    function_handles = (
+        tf_upgrade_v2.TFAPIChangeSpec().function_handle)
+    keyword_renames = (
+        tf_upgrade_v2.TFAPIChangeSpec().function_keyword_renames)
+
+    # Visitor that converts to V2 and checks V2 argument names.
+    def conversion_visitor(unused_path, unused_parent, children):
+      for child in children:
+        _, attr = tf_decorator.unwrap(child[1])
+        if not tf_inspect.isfunction(attr):
+          continue
+        names_v1 = tf_export.get_v1_names(attr)
+        arg_names_v1 = get_args(attr)
+
+        for name in names_v1:
+          tf_name = "tf.%s" % name
+          if tf_name in function_warnings or tf_name in function_handles:
+            continue  # These require manual change
+          if tf_name in v1_name_exceptions:
+            continue
+          # Assert that arg names after converting to v2 are present in
+          # v2 function.
+          # 1. First, create an input of the form:
+          #    tf.foo(arg1=val1, arg2=val2, ...)
+          args = ",".join(
+              ["%s=%d" % (from_name, from_index)
+               for from_index, from_name in enumerate(arg_names_v1)])
+          text_input = "%s(%s)" % (tf_name, args)
+          # 2. Convert the input to V2.
+          _, _, _, text = self._upgrade(text_input)
+          new_function_name, new_args = get_func_and_args_from_str(text)
+          if new_function_name == "tf.compat.v1.%s" % name:
+            if tf_name in keyword_renames:
+              # If we rename arguments, new function must be available in 2.0.
+              # We should not be using compat.v1 in this case.
+              self.assertFalse(
+                  "Function '%s' is not in 2.0 when converting\n%s\nto\n%s" %
+                  (new_function_name, text_input, text))
+            continue
+          # 3. Verify V2 function and arguments.
+          args_v2 = get_args(self.v2_symbols[new_function_name])
+          args_v2.extend(v2_arg_exceptions)
+          for new_arg in new_args:
+            self.assertIn(
+                new_arg, args_v2,
+                "Invalid argument '%s' in 2.0 when converting\n%s\nto\n%s.\n"
+                "Supported arguments: %s" % (
+                    new_arg, text_input, text, str(args_v2)))
+
+    visitor = public_api.PublicAPIVisitor(conversion_visitor)
+    visitor.do_not_descend_map["tf"].append("contrib")
+    visitor.private_map["tf.compat"] = ["v1", "v2"]
+    traverse.traverse(tf.compat.v1, visitor)
+
+  def testReorderFileNeedsUpdate(self):
+    reordered_function_names = (
+        tf_upgrade_v2.TFAPIChangeSpec().reordered_function_names)
+    function_reorders = (
+        tf_upgrade_v2.TFAPIChangeSpec().function_reorders)
+
+    added_names_message = """Some function names in
+self.reordered_function_names are not in reorders_v2.py.
+Please run the following commands to update reorders_v2.py:
+bazel build tensorflow/tools/compatibility/update:generate_v2_reorders_map
+bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
+"""
+    removed_names_message = """%s in self.reorders_v2 does not match
+any name in self.reordered_function_names.
+Please run the following commands to update reorders_v2.py:
+bazel build tensorflow/tools/compatibility/update:generate_v2_reorders_map
+bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
+"""
+    self.assertTrue(
+        reordered_function_names.issubset(function_reorders),
+        added_names_message)
+    # function_reorders should contain reordered_function_names
+    # and their TensorFlow V1 aliases.
+    for name in function_reorders:
+      # get other names for this function
+      attr = get_symbol_for_name(tf.compat.v1, name)
+      _, attr = tf_decorator.unwrap(attr)
+      v1_names = tf_export.get_v1_names(attr)
+      self.assertTrue(v1_names)
+      v1_names = ["tf.%s" % n for n in v1_names]
+      # check if any other name is in
+      self.assertTrue(
+          any(n in reordered_function_names for n in v1_names),
+          removed_names_message % name)
+
   def testRenameConstant(self):
     text = "tf.MONOLITHIC_BUILD\n"
     _, unused_report, unused_errors, new_text = self._upgrade(text)
@@ -257,6 +449,13 @@ class TestUpgrade(test_util.TensorFlowTestCase):
     )
     self.assertEqual(new_text, expected_text)
 
+  def test_substr(self):
+    text = "tf.substr(input, pos, len, name, unit)\n"
+    _, unused_report, errors, new_text = self._upgrade(text)
+    self.assertEqual("tf.strings.substr(input=input, pos=pos, len=len, "
+                     "name=name, unit=unit)\n", new_text)
+    self.assertEqual(errors, [])
+
   def testColocateGradientsWithOps(self):
     text = "tf.gradients(a, foo=False)\n"
     _, unused_report, errors, new_text = self._upgrade(text)
@@ -309,6 +508,11 @@ class TestUpgrade(test_util.TensorFlowTestCase):
     _, unused_report, unused_errors, new_text = self._upgrade(text)
     self.assertEqual(new_text, expected_text)
 
+    text = "tf.arg_min(input, 0)"
+    expected_text = "tf.argmin(input, 0)"
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
   def testArgmax(self):
     text = "tf.argmax(input, name=n, dimension=1, output_type=type)"
     expected_text = "tf.argmax(input=input, name=n, axis=1, output_type=type)"
@@ -320,6 +524,207 @@ class TestUpgrade(test_util.TensorFlowTestCase):
     _, unused_report, unused_errors, new_text = self._upgrade(text)
     self.assertEqual(new_text, expected_text)
 
+    text = "tf.arg_max(input, 0)"
+    expected_text = "tf.argmax(input, 0)"
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testBatchToSpace(self):
+    text = "tf.batch_to_space_nd(input, block_shape, crops, name)"
+    expected_text = "tf.batch_to_space(input, block_shape, crops, name)"
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+    text = "tf.batch_to_space(input, crops, block_size, name)"
+    expected_text = (
+        "tf.batch_to_space(input=input, crops=crops, block_shape=block_size, "
+        "name=name)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+    text = "tf.manip.batch_to_space_nd(input, block_shape, crops, name)"
+    expected_text = "tf.batch_to_space(input, block_shape, crops, name)"
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testExtractImagePatches(self):
+    text = (
+        "tf.extract_image_patches(images, ksizes=ksizes, strides=strides,"
+        "rates=rates, padding=padding, name=name)")
+    expected_text = (
+        "tf.image.extract_image_patches(images, sizes=ksizes, strides=strides,"
+        "rates=rates, padding=padding, name=name)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testStatelessMultinomial(self):
+    text = (
+        "tf.random.stateless_multinomial(logits, num_samples, seed, "
+        "output_dtype=dtype, name=name)")
+    expected_text = (
+        "tf.random.stateless_categorical(logits, num_samples, seed, "
+        "dtype=dtype, name=name)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testSoftMaxCrossEntropyWithLogitsV2(self):
+    text = "tf.nn.softmax_cross_entropy_with_logits_v2(labels, logits, dim=2)"
+    expected_text = (
+        "tf.nn.softmax_cross_entropy_with_logits(labels, logits, axis=2)")
+    _, unused_report, errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+    self.assertFalse(errors)
+
+  def testSoftMaxCrossEntropyWithLogits(self):
+    text = "tf.nn.softmax_cross_entropy_with_logits(labels, logits, dim=2)"
+    expected_text = (
+        "tf.nn.softmax_cross_entropy_with_logits(labels, logits, dim=2)")
+    _, report, errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+    self.assertIn(
+        "tf.nn.softmax_cross_entropy_with_logits requires manual check.",
+        errors[0])
+    self.assertIn(
+        "tf.nn.softmax_cross_entropy_with_logits behavior has changed. ",
+        report)
+
+  def testSparseMatmul(self):
+    text = ("tf.sparse_matmul(a, b, c, d, e, f, g)\n")
+    expected_text = ("tf.linalg.matmul(a=a, b=b, transpose_a=c, transpose_b=d, "
+                     "a_is_sparse=e, b_is_sparse=f, name=g)\n")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testWeightedMoments(self):
+    text = "tf.nn.weighted_moments(x, axes, freq, name, kd)"
+    expected_text = (
+        "tf.nn.weighted_moments(x=x, axes=axes, frequency_weights=freq, "
+        "name=name, keepdims=kd)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testSparseAdd(self):
+    text = "tf.sparse.add(a, b, t)"
+    expected_text = "tf.sparse.add(a=a, b=b, threshold=t)"
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testSparseConcat(self):
+    text = "tf.sparse.concat(ax, inp, name, exp, concat)"
+    expected_text = (
+        "tf.sparse.concat(axis=ax, sp_inputs=inp, name=name, "
+        "expand_nonconcat_dims=exp, axis=concat)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testSeparableConv2D(self):
+    text = "tf.nn.separable_conv2d(inp, d, pt, strides, pad, rate, name, fmt)"
+    expected_text = (
+        "tf.nn.separable_conv2d(input=inp, depthwise_filter=d, "
+        "pointwise_filter=pt, strides=strides, padding=pad, "
+        "dilations=rate, name=name, data_format=fmt)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testSpacetoBatch(self):
+    text = "tf.space_to_batch_nd(input, shape, paddings, name)"
+    expected_text = "tf.space_to_batch(input, shape, paddings, name)"
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+    text = "tf.nn.space_to_batch(input, paddings, block_size, name)"
+    expected_text = (
+        "tf.space_to_batch(input=input, paddings=paddings, "
+        "block_shape=block_size, name=name)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testInTopK(self):
+    text = "tf.math.in_top_k(a, b, c, n)"
+    expected_text = (
+        "tf.math.in_top_k(predictions=a, targets=b, k=c, name=n)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testDepthToSpace(self):
+    text = "tf.nn.depth_to_space(input, block_size, name, data_format)"
+    expected_text = (
+        "tf.nn.depth_to_space(input=input, block_size=block_size, "
+        "name=name, data_format=data_format)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testEmbeddingLookup(self):
+    text = ("tf.nn.embedding_lookup(params, ids, partition_strategy, name, "
+            "validate_indices, max_norm)")
+    expected_text = ("tf.nn.embedding_lookup(params=params, ids=ids, "
+                     "partition_strategy=partition_strategy, name=name, "
+                     "validate_indices=validate_indices, max_norm=max_norm)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testEmbeddingLookupSparse(self):
+    text = ("tf.nn.embedding_lookup_sparse(params, sp_ids, sp_weights, "
+            "partition_strategy, name, combiner, max_norm)")
+    expected_text = ("tf.nn.embedding_lookup_sparse(params=params, "
+                     "sp_ids=sp_ids, sp_weights=sp_weights, "
+                     "partition_strategy=partition_strategy, name=name, "
+                     "combiner=combiner, max_norm=max_norm)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testNnInTopK(self):
+    text = "tf.nn.in_top_k(predictions, targets, k, name)"
+    expected_text = ("tf.nn.in_top_k(predictions=predictions, "
+                     "targets=targets, k=k, name=name)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testSpaceToDepth(self):
+    text = "tf.nn.space_to_depth(input, block_size, name, data_format)"
+    expected_text = ("tf.nn.space_to_depth(input=input, block_size=block_size, "
+                     "name=name, data_format=data_format)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testPrint(self):
+    # tf.print() cannot be parsed unless we import print_function
+    text = """from __future__ import print_function
+tf.print()
+tf.print('abc')
+"""
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, text)  # Text should stay the same
+
+  def testSparseSplit(self):
+    text = (
+        "tf.sparse_split(sp_input=sp_input, num_split=num_split, axis=axis, "
+        "name=name)")
+    expected_text = (
+        "tf.sparse.split(sp_input=sp_input, num_split=num_split, axis=axis, "
+        "name=name)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+    text = (
+        "tf.sparse_split(sp_input=sp_input, num_split=num_split, "
+        "name=name, split_dim=axis)")
+    expected_text = (
+        "tf.sparse.split(sp_input=sp_input, num_split=num_split, "
+        "name=name, axis=axis)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+    text = (
+        "tf.sparse.split(sp_input=sp_input, num_split=num_split, "
+        "name=name, split_dim=axis)")
+    expected_text = (
+        "tf.sparse.split(sp_input=sp_input, num_split=num_split, "
+        "name=name, axis=axis)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
 
 class TestUpgradeFiles(test_util.TensorFlowTestCase):
 
@@ -338,3 +743,4 @@ class TestUpgradeFiles(test_util.TensorFlowTestCase):
 
 if __name__ == "__main__":
   test_lib.main()
+
diff --git a/tensorflow/tools/compatibility/update/BUILD b/tensorflow/tools/compatibility/update/BUILD
index b9725a74ee583f5b9cbb9e40aba3086c43661528..75bb0cfd2b7569c899fb72aa5ac9f4e608c3decc 100644
--- a/tensorflow/tools/compatibility/update/BUILD
+++ b/tensorflow/tools/compatibility/update/BUILD
@@ -15,3 +15,17 @@ py_binary(
         "//tensorflow/tools/compatibility:tf_upgrade_v2_lib",
     ],
 )
+
+py_binary(
+    name = "generate_v2_reorders_map",
+    srcs = ["generate_v2_reorders_map.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow:tensorflow_py",
+        "//tensorflow/python:lib",
+        "//tensorflow/python:no_contrib",
+        "//tensorflow/tools/common:public_api",
+        "//tensorflow/tools/common:traverse",
+        "//tensorflow/tools/compatibility:tf_upgrade_v2_lib",
+    ],
+)
diff --git a/tensorflow/tools/compatibility/update/generate_v2_renames_map.py b/tensorflow/tools/compatibility/update/generate_v2_renames_map.py
index 19ad6c3a2a5c723cbbff2c76c8bfe6517ca4a4f0..a2c5e7cf82dd8dfb5cb150a7e4e4a58a7a2e1631 100644
--- a/tensorflow/tools/compatibility/update/generate_v2_renames_map.py
+++ b/tensorflow/tools/compatibility/update/generate_v2_renames_map.py
@@ -64,58 +64,6 @@ from __future__ import print_function
 
 """
 
-_TENSORFLOW_API_ATTR_V1 = (
-    tf_export.API_ATTRS_V1[tf_export.TENSORFLOW_API_NAME].names)
-_TENSORFLOW_API_ATTR = tf_export.API_ATTRS[tf_export.TENSORFLOW_API_NAME].names
-_TENSORFLOW_CONSTANTS_ATTR_V1 = (
-    tf_export.API_ATTRS_V1[tf_export.TENSORFLOW_API_NAME].constants)
-_TENSORFLOW_CONSTANTS_ATTR = (
-    tf_export.API_ATTRS[tf_export.TENSORFLOW_API_NAME].constants)
-
-_ESTIMATOR_API_ATTR_V1 = (
-    tf_export.API_ATTRS_V1[tf_export.ESTIMATOR_API_NAME].names)
-_ESTIMATOR_API_ATTR = tf_export.API_ATTRS[tf_export.ESTIMATOR_API_NAME].names
-_ESTIMATOR_CONSTANTS_ATTR_V1 = (
-    tf_export.API_ATTRS_V1[tf_export.ESTIMATOR_API_NAME].constants)
-_ESTIMATOR_CONSTANTS_ATTR = (
-    tf_export.API_ATTRS[tf_export.ESTIMATOR_API_NAME].constants)
-
-
-def get_v1_names(symbol):
-  names_v1 = []
-  if hasattr(symbol, _TENSORFLOW_API_ATTR_V1):
-    names_v1.extend(getattr(symbol, _TENSORFLOW_API_ATTR_V1))
-  if hasattr(symbol, _ESTIMATOR_API_ATTR_V1):
-    names_v1.extend(getattr(symbol, _ESTIMATOR_API_ATTR_V1))
-  return names_v1
-
-
-def get_v2_names(symbol):
-  names_v2 = []
-  if hasattr(symbol, _TENSORFLOW_API_ATTR):
-    names_v2.extend(getattr(symbol, _TENSORFLOW_API_ATTR))
-  if hasattr(symbol, _ESTIMATOR_API_ATTR):
-    names_v2.extend(getattr(symbol, _ESTIMATOR_API_ATTR))
-  return list(names_v2)
-
-
-def get_v1_constants(module):
-  constants_v1 = []
-  if hasattr(module, _TENSORFLOW_CONSTANTS_ATTR_V1):
-    constants_v1.extend(getattr(module, _TENSORFLOW_CONSTANTS_ATTR_V1))
-  if hasattr(module, _ESTIMATOR_CONSTANTS_ATTR_V1):
-    constants_v1.extend(getattr(module, _ESTIMATOR_CONSTANTS_ATTR_V1))
-  return constants_v1
-
-
-def get_v2_constants(module):
-  constants_v2 = []
-  if hasattr(module, _TENSORFLOW_CONSTANTS_ATTR):
-    constants_v2.extend(getattr(module, _TENSORFLOW_CONSTANTS_ATTR))
-  if hasattr(module, _ESTIMATOR_CONSTANTS_ATTR):
-    constants_v2.extend(getattr(module, _ESTIMATOR_CONSTANTS_ATTR))
-  return constants_v2
-
 
 def get_canonical_name(v2_names, v1_name):
   if v2_names:
@@ -131,7 +79,7 @@ def get_all_v2_names():
     """Visitor that collects TF 2.0 names."""
     for child in children:
       _, attr = tf_decorator.unwrap(child[1])
-      api_names_v2 = get_v2_names(attr)
+      api_names_v2 = tf_export.get_v2_names(attr)
       for name in api_names_v2:
         v2_names.add(name)
 
@@ -149,8 +97,8 @@ def collect_constant_renames():
   """
   renames = set()
   for module in sys.modules.values():
-    constants_v1_list = get_v1_constants(module)
-    constants_v2_list = get_v2_constants(module)
+    constants_v1_list = tf_export.get_v1_constants(module)
+    constants_v2_list = tf_export.get_v2_constants(module)
 
     # _tf_api_constants attribute contains a list of tuples:
     # (api_names_list, constant_name)
@@ -186,8 +134,8 @@ def collect_function_renames():
     """Visitor that collects rename strings to add to rename_line_set."""
     for child in children:
       _, attr = tf_decorator.unwrap(child[1])
-      api_names_v1 = get_v1_names(attr)
-      api_names_v2 = get_v2_names(attr)
+      api_names_v1 = tf_export.get_v1_names(attr)
+      api_names_v2 = tf_export.get_v2_names(attr)
       deprecated_api_names = set(api_names_v1) - set(api_names_v2)
       for name in deprecated_api_names:
         renames.add((name, get_canonical_name(api_names_v2, name)))
diff --git a/tensorflow/tools/compatibility/update/generate_v2_reorders_map.py b/tensorflow/tools/compatibility/update/generate_v2_reorders_map.py
new file mode 100644
index 0000000000000000000000000000000000000000..0eb942d39617c7fe17bc62ff19c98047900d33cf
--- /dev/null
+++ b/tensorflow/tools/compatibility/update/generate_v2_reorders_map.py
@@ -0,0 +1,132 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=line-too-long
+"""Script for updating tensorflow/tools/compatibility/reorders_v2.py.
+
+To update reorders_v2.py, run:
+  bazel build tensorflow/tools/compatibility/update:generate_v2_reorders_map
+  bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
+"""
+# pylint: enable=line-too-long
+import tensorflow as tf
+
+# This import is needed so that TensorFlow python modules are in sys.modules.
+from tensorflow import python as tf_python  # pylint: disable=unused-import
+from tensorflow.python.lib.io import file_io
+from tensorflow.python.platform import app
+from tensorflow.python.util import tf_decorator
+from tensorflow.python.util import tf_export
+from tensorflow.python.util import tf_inspect
+from tensorflow.tools.common import public_api
+from tensorflow.tools.common import traverse
+from tensorflow.tools.compatibility import tf_upgrade_v2
+
+
+_OUTPUT_FILE_PATH = 'third_party/tensorflow/tools/compatibility/reorders_v2.py'
+_FILE_HEADER = """# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=line-too-long
+\"\"\"List of renames to apply when converting from TF 1.0 to TF 2.0.
+
+THIS FILE IS AUTOGENERATED: To update, please run:
+  bazel build tensorflow/tools/compatibility/update:generate_v2_reorders_map
+  bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
+This file should be updated whenever a function is added to
+self.reordered_function_names in tf_upgrade_v2.py.
+\"\"\"
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+"""
+
+
+def collect_function_arg_names(function_names):
+  """Determines argument names for reordered function signatures.
+
+  Args:
+    function_names: Functions to collect arguments for.
+
+  Returns:
+    Dictionary mapping function name to its arguments.
+  """
+  # Map from reordered function name to its arguments.
+  function_to_args = {}
+
+  def visit(unused_path, unused_parent, children):
+    """Visitor that collects arguments for reordered functions."""
+    for child in children:
+      _, attr = tf_decorator.unwrap(child[1])
+      api_names_v1 = tf_export.get_v1_names(attr)
+      api_names_v1 = ['tf.%s' % name for name in api_names_v1]
+      matches_function_names = any(
+          name in function_names for name in api_names_v1)
+      if matches_function_names:
+        arg_list = tf_inspect.getargspec(attr)[0]
+        for name in api_names_v1:
+          function_to_args[name] = arg_list
+
+  visitor = public_api.PublicAPIVisitor(visit)
+  visitor.do_not_descend_map['tf'].append('contrib')
+  visitor.do_not_descend_map['tf.compat'] = ['v1', 'v2']
+  traverse.traverse(tf, visitor)
+
+  return function_to_args
+
+
+def get_reorder_line(name, arg_list):
+  return '    \'%s\': %s' % (name, str(arg_list))
+
+
+def update_reorders_v2(output_file_path):
+  """Writes a Python dictionary mapping function name to argument order.
+
+  Args:
+    output_file_path: File path to write output to. Any existing contents
+      would be replaced.
+  """
+  reordered_function_names = (
+      tf_upgrade_v2.TFAPIChangeSpec().reordered_function_names)
+
+  all_reorders = collect_function_arg_names(reordered_function_names)
+
+  # List of reorder lines to write to output file in the form:
+  #   'tf.function_name': ['arg1', 'arg2', ...]
+  rename_lines = [
+      get_reorder_line(name, arg_names)
+      for name, arg_names in all_reorders.items()]
+  renames_file_text = '%sreorders = {\n%s\n}\n' % (
+      _FILE_HEADER, ',\n'.join(sorted(rename_lines)))
+  file_io.write_string_to_file(output_file_path, renames_file_text)
+
+
+def main(unused_argv):
+  update_reorders_v2(_OUTPUT_FILE_PATH)
+
+
+if __name__ == '__main__':
+  app.run(main=main)
diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel
index c256dd364ef5a29ba7f8a2afa6e772ee9c566cb8..9ea29c0e201e9cb1630e7bb682d1d7694665decd 100644
--- a/tensorflow/tools/docker/Dockerfile.devel
+++ b/tensorflow/tools/docker/Dockerfile.devel
@@ -65,7 +65,7 @@ RUN echo "startup --batch" >>/etc/bazel.bazelrc
 RUN echo "build --spawn_strategy=standalone --genrule_strategy=standalone" \
     >>/etc/bazel.bazelrc
 # Install the most recent bazel release.
-ENV BAZEL_VERSION 0.15.0
+ENV BAZEL_VERSION 0.20.0
 WORKDIR /
 RUN mkdir /bazel && \
     cd /bazel && \
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index 7f9b55b45595bc74b51e14883d1fd1dc19b9099c..1ad359ddccc71201553803140fa4efca06fbb5e1 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -87,7 +87,7 @@ RUN echo "startup --batch" >>/etc/bazel.bazelrc
 RUN echo "build --spawn_strategy=standalone --genrule_strategy=standalone" \
     >>/etc/bazel.bazelrc
 # Install the most recent bazel release.
-ENV BAZEL_VERSION 0.15.0
+ENV BAZEL_VERSION 0.20.0
 WORKDIR /
 RUN mkdir /bazel && \
     cd /bazel && \
diff --git a/tensorflow/tools/docker/Dockerfile.devel-mkl b/tensorflow/tools/docker/Dockerfile.devel-mkl
index 2341c0e8ccfc5f88356ed38f33cca356c207214f..4eefd31d0097913e9ff5cb9d0415c0427dcf1de7 100755
--- a/tensorflow/tools/docker/Dockerfile.devel-mkl
+++ b/tensorflow/tools/docker/Dockerfile.devel-mkl
@@ -88,7 +88,7 @@ RUN echo "startup --batch" >>/etc/bazel.bazelrc
 RUN echo "build --spawn_strategy=standalone --genrule_strategy=standalone" \
     >>/etc/bazel.bazelrc
 # Install the most recent bazel release.
-ENV BAZEL_VERSION 0.15.0
+ENV BAZEL_VERSION 0.20.0
 WORKDIR /
 RUN mkdir /bazel && \
     cd /bazel && \
diff --git a/tensorflow/tools/docker/Dockerfile.devel-mkl-horovod b/tensorflow/tools/docker/Dockerfile.devel-mkl-horovod
index 5e24617b2190f1d564d63f4c9be6321aa03cd8fb..3810daefa570210cfba3f044ccb95816d4393e09 100755
--- a/tensorflow/tools/docker/Dockerfile.devel-mkl-horovod
+++ b/tensorflow/tools/docker/Dockerfile.devel-mkl-horovod
@@ -79,7 +79,7 @@ RUN echo "startup --batch" >>/etc/bazel.bazelrc
 RUN echo "build --spawn_strategy=standalone --genrule_strategy=standalone" \
     >>/etc/bazel.bazelrc
 # Install the most recent bazel release.
-ENV BAZEL_VERSION 0.15.0
+ENV BAZEL_VERSION 0.20.0
 WORKDIR /
 RUN mkdir /bazel && \
     cd /bazel && \
diff --git a/tensorflow/tools/dockerfiles/.gitignore b/tensorflow/tools/dockerfiles/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..d7efa472a92b23dfde1277acfe4b543f14842678
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/.gitignore
@@ -0,0 +1 @@
+dockerfiles/*.temp.Dockerfile
diff --git a/tensorflow/tools/dockerfiles/README.md b/tensorflow/tools/dockerfiles/README.md
index 7c8ca1d1c7a21989c616c7ed93ed737f7664b585..b42dd9fc0cda842def86af5be953002e11a1959a 100644
--- a/tensorflow/tools/dockerfiles/README.md
+++ b/tensorflow/tools/dockerfiles/README.md
@@ -1,9 +1,13 @@
 # TensorFlow Dockerfiles
 
-This directory houses TensorFlow's Dockerfiles. **DO NOT EDIT THE DOCKERFILES
-MANUALLY!** They are maintained by `assembler.py`, which builds Dockerfiles from
-the files in `partials/` and the rules in `spec.yml`. See [the Contributing
-section](#contributing) for more information.
+This directory houses TensorFlow's Dockerfiles and the infrastructure used to
+create and deploy them to
+[Docker Hub](https://hub.docker.com/r/tensorflow/tensorflow).
+
+**DO NOT EDIT THE DOCKERFILES/ DIRECTORY MANUALLY!** The files within are
+maintained by `assembler.py`, which builds Dockerfiles from the files in
+`partials/` and the rules in `spec.yml`. See
+[the Contributing section](#contributing) for more information.
 
 These Dockerfiles are planned to replace the Dockerfiles used to generate
 [TensorFlow's official Docker images](https://hub.docker.com/r/tensorflow/tensorflow).
@@ -20,10 +24,10 @@ $ docker build -f ./dockerfiles/cpu.Dockerfile -t tf .
 Each Dockerfile has its own set of available `--build-arg`s which are documented
 in the Dockerfile itself.
 
-## Running
+## Running Locally Built Images
 
 After building the image with the tag `tf` (for example), use `docker run` to
-run the images. Examples are below.
+run the images.
 
 Note for new Docker users: the `-v` and `-u` flags share directories between
 the Docker container and your machine, and very important. Without
@@ -42,8 +46,10 @@ $ docker run -u $(id -u):$(id -g) -v $(pwd):/my-devel -it tf
 # GPU-based images (set up nvidia-docker2 first)
 $ docker run --runtime=nvidia -u $(id -u):$(id -g) -v $(pwd):/my-devel -it tf
 
-# Images with Jupyter run on port 8888, and needs a volume for notebooks
-$ docker run --user $(id -u):$(id -g) -p 8888:8888 -v $(pwd):/notebooks -it tf
+# Images with Jupyter run on port 8888 and need a volume for your notebooks
+# You can change $(PWD) to the full path to a directory if your notebooks
+# live outside the current directory.
+$ docker run --user $(id -u):$(id -g) -p 8888:8888 -v $(PWD):/tf/notebooks -it tf
 ```
 
 These images do not come with the TensorFlow source code -- but the development
@@ -60,11 +66,32 @@ You can use the `Dockerfile` in this directory to build an editing environment
 that has all of the Python dependencies you'll need:
 
 ```bash
-$ docker build -t tf-assembler -f assembler.Dockerfile .
+# Build the tools-helper image so you can run the assembler
+$ docker build -t tf-tools -f tools.Dockerfile .
 
 # Set --user to set correct permissions on generated files
-$ docker run --user $(id -u):$(id -g) -it -v $(pwd):/tf tf-assembler bash 
+$ docker run --user $(id -u):$(id -g) -it -v $(pwd):/tf tf-tools bash
+
+# Next you can make a handy alias depending on what you're doing. When building
+# Docker images, you need to run as root with docker.sock mounted so that the
+# container can run Docker commands. When assembling Dockerfiles, though, you'll
+# want to run as your user so that new files have the right permissions.
+
+# If you're BUILDING OR DEPLOYING DOCKER IMAGES, run as root with docker.sock:
+$ alias asm_images="docker run --rm -v $(pwd):/tf -v /var/run/docker.sock:/var/run/docker.sock tf-tools python3 assembler.py "
+
+# If you're REBUILDING OR ADDING DOCKERFILES, remove docker.sock and add -u:
+$ alias asm_dockerfiles="docker run --rm -u $(id -u):$(id -g) -v $(pwd):/tf tf-tools python3 assembler.py "
+
+# Check flags
+$ asm_dockerfiles --help
+
+# Assemble all of the Dockerfiles
+$ asm_dockerfiles --release dockerfiles --construct_dockerfiles
+
+# Build all of the "nightly" images on your local machine:
+$ asm_images --release nightly --build_images
 
-# In the container...
-/tf $ python3 ./assembler.py -o dockerfiles -s spec.yml
+# Build version release for version 99.0, except "gpu" tags:
+$ asm_images --release versioned --arg _TAG_PREFIX=99.0 --build_images --exclude_tags_matching '*.gpu.*'
 ```
diff --git a/tensorflow/tools/dockerfiles/assembler.py b/tensorflow/tools/dockerfiles/assembler.py
index 9cdd9bb0cb0841e95d8d334293026207f093ab90..67a0320241d273bbb7a2439b2e09723905db0765 100644
--- a/tensorflow/tools/dockerfiles/assembler.py
+++ b/tensorflow/tools/dockerfiles/assembler.py
@@ -11,63 +11,144 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ==============================================================================
-"""Assemble common TF Dockerfiles from many parts.
+# ============================================================================
+"""Multipurpose TensorFlow Docker Helper.
 
-This script constructs TF's Dockerfiles by aggregating partial
-Dockerfiles. See README.md for usage examples.
+- Assembles Dockerfiles
+- Builds images (and optionally runs image tests)
+- Pushes images to Docker Hub (provided with credentials)
+
+Read README.md (in this directory) for instructions!
 """
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import copy
 import errno
+import itertools
+import multiprocessing
 import os
-import os.path
 import re
 import shutil
-import textwrap
+import sys
 
 from absl import app
 from absl import flags
 import cerberus
+import docker
 import yaml
 
 FLAGS = flags.FLAGS
 
+flags.DEFINE_string('hub_username', None,
+                    'Dockerhub username, only used with --upload_to_hub')
+
+flags.DEFINE_string(
+    'hub_password', None,
+    ('Dockerhub password, only used with --upload_to_hub. Use from an env param'
+     'so your password isn\'t in your history.'))
+
+flags.DEFINE_integer('hub_timeout', 3600,
+                     'Abort Hub upload if it takes longer than this.')
+
+flags.DEFINE_string(
+    'repository', 'tensorflow',
+    'Tag local images as {repository}:tag (in addition to the '
+    'hub_repository, if uploading to hub)')
+
+flags.DEFINE_string(
+    'hub_repository', None,
+    'Push tags to this Docker Hub repository, e.g. tensorflow/tensorflow')
+
+flags.DEFINE_boolean(
+    'upload_to_hub',
+    False,
+    ('Push built images to Docker Hub (you must also provide --hub_username, '
+     '--hub_password, and --hub_repository)'),
+    short_name='u',
+)
+
+flags.DEFINE_boolean(
+    'construct_dockerfiles', False, 'Do not build images', short_name='d')
+
+flags.DEFINE_boolean(
+    'keep_temp_dockerfiles',
+    False,
+    'Retain .temp.Dockerfiles created while building images.',
+    short_name='k')
+
 flags.DEFINE_boolean(
-    'dry_run', False, 'Do not actually generate Dockerfiles', short_name='n')
+    'build_images', False, 'Do not build images', short_name='b')
 
 flags.DEFINE_string(
-    'spec_file',
-    './spec.yml',
-    'Path to a YAML specification file',
-    short_name='s')
+    'run_tests_path', None,
+    ('Execute test scripts on generated Dockerfiles before pushing them. '
+     'Flag value must be a full path to the "tests" directory, which is usually'
+     ' $(realpath ./tests). A failed tests counts the same as a failed build.'))
+
+flags.DEFINE_boolean(
+    'stop_on_failure', False,
+    ('Stop processing tags if any one build fails. If False or not specified, '
+     'failures are reported but do not affect the other images.'))
+
+flags.DEFINE_boolean(
+    'dry_run',
+    False,
+    'Do not build or deploy anything at all.',
+    short_name='n',
+)
 
 flags.DEFINE_string(
-    'output_dir',
-    './dockerfiles', ('Path to an output directory for Dockerfiles. '
-                      'Will be created if it doesn\'t exist.'),
+    'exclude_tags_matching',
+    None,
+    ('Regular expression that skips processing on any tag it matches. Must '
+     'match entire string, e.g. ".*gpu.*" ignores all GPU tags.'),
+    short_name='x')
+
+flags.DEFINE_string(
+    'only_tags_matching',
+    None,
+    ('Regular expression that skips processing on any tag it does not match. '
+     'Must match entire string, e.g. ".*gpu.*" includes only GPU tags.'),
+    short_name='i')
+
+flags.DEFINE_string(
+    'dockerfile_dir',
+    './dockerfiles', 'Path to an output directory for Dockerfiles.'
+    ' Will be created if it doesn\'t exist.'
+    ' Existing files in this directory will be deleted when new Dockerfiles'
+    ' are made.',
     short_name='o')
 
 flags.DEFINE_string(
     'partial_dir',
     './partials',
-    'Path to a directory containing foo.partial.Dockerfile partial files.',
+    'Path to a directory containing foo.partial.Dockerfile partial files.'
+    ' can have subdirectories, e.g. "bar/baz.partial.Dockerfile".',
     short_name='p')
 
-flags.DEFINE_boolean(
-    'quiet_dry_run',
-    True,
-    'Do not print contents of dry run Dockerfiles.',
-    short_name='q')
+flags.DEFINE_multi_string(
+    'release', [],
+    'Set of releases to build and tag. Defaults to every release type.',
+    short_name='r')
 
-flags.DEFINE_boolean(
-    'validate', True, 'Validate generated Dockerfiles', short_name='c')
+flags.DEFINE_multi_string(
+    'arg', [],
+    ('Extra build arguments. These are used for expanding tag names if needed '
+     '(e.g. --arg _TAG_PREFIX=foo) and for using as build arguments (unused '
+     'args will print a warning).'),
+    short_name='a')
 
-# Schema to verify the contents of spec.yml with Cerberus.
+flags.DEFINE_string(
+    'spec_file',
+    './spec.yml',
+    'Path to the YAML specification file',
+    short_name='s')
+
+# Schema to verify the contents of tag-spec.yml with Cerberus.
 # Must be converted to a dict from yaml to work.
 # Note: can add python references with e.g.
 # !!python/name:builtins.str
@@ -76,79 +157,78 @@ SCHEMA_TEXT = """
 header:
   type: string
 
-partials:
+slice_sets:
   type: dict
   keyschema:
     type: string
   valueschema:
-    type: dict
-    schema:
-      desc:
-        type: string
-      args:
+     type: list
+     schema:
         type: dict
-        keyschema:
-          type: string
-        valueschema:
-          anyof:
-            - type: [ boolean, number, string ]
-            - type: dict
-              schema:
-                 default:
-                    type: [ boolean, number, string ]
-                 desc:
-                    type: string
-                 options:
-                    type: list
-                    schema:
-                       type: string
-
-images:
+        schema:
+           add_to_name:
+             type: string
+           dockerfile_exclusive_name:
+             type: string
+           dockerfile_subdirectory:
+             type: string
+           partials:
+             type: list
+             schema:
+               type: string
+               ispartial: true
+           test_runtime:
+             type: string
+             required: false
+           tests:
+             type: list
+             default: []
+             schema:
+               type: string
+           args:
+             type: list
+             default: []
+             schema:
+               type: string
+               isfullarg: true
+
+releases:
+  type: dict
   keyschema:
     type: string
   valueschema:
     type: dict
     schema:
-      desc:
-        type: string
-      arg-defaults:
-        type: list
-        schema:
-          anyof:
-            - type: dict
-              keyschema:
-                type: string
-                arg_in_use: true
-              valueschema:
-                type: string
-            - type: string
-              isimage: true
-      create-dockerfile:
+      is_dockerfiles:
         type: boolean
-      partials:
+        required: false
+        default: false
+      upload_images:
+        type: boolean
+        required: false
+        default: true
+      tag_specs:
         type: list
+        required: true
         schema:
-          anyof:
-            - type: dict
-              keyschema:
-                type: string
-                regex: image
-              valueschema:
-                type: string
-                isimage: true
-            - type: string
-              ispartial: true
+          type: string
 """
 
 
-class TfDockerValidator(cerberus.Validator):
-  """Custom Cerberus validator for TF dockerfile spec.
+class TfDockerTagValidator(cerberus.Validator):
+  """Custom Cerberus validator for TF tag spec.
 
   Note: Each _validate_foo function's docstring must end with a segment
   describing its own validation schema, e.g. "The rule's arguments are...". If
   you add a new validator, you can copy/paste that section.
   """
 
+  def __init__(self, *args, **kwargs):
+    # See http://docs.python-cerberus.org/en/stable/customize.html
+    if 'partials' in kwargs:
+      self.partials = kwargs['partials']
+    super(cerberus.Validator, self).__init__(*args, **kwargs)
+
   def _validate_ispartial(self, ispartial, field, value):
     """Validate that a partial references an existing partial spec.
 
@@ -156,398 +236,431 @@ class TfDockerValidator(cerberus.Validator):
       ispartial: Value of the rule, a bool
       field: The field being validated
       value: The field's value
-
     The rule's arguments are validated against this schema:
     {'type': 'boolean'}
     """
-    if ispartial and value not in self.root_document.get('partials', dict()):
-      self._error(field, '{} is not an existing partial.'.format(value))
+    if ispartial and value not in self.partials:
+      self._error(field,
+                  '{} is not present in the partials directory.'.format(value))
 
-  def _validate_isimage(self, isimage, field, value):
-    """Validate that an image references an existing partial spec.
+  def _validate_isfullarg(self, isfullarg, field, value):
+    """Validate that a string is either a FULL=arg or NOT.
 
     Args:
-      isimage: Value of the rule, a bool
+      isfullarg: Value of the rule, a bool
       field: The field being validated
       value: The field's value
-
     The rule's arguments are validated against this schema:
     {'type': 'boolean'}
     """
-    if isimage and value not in self.root_document.get('images', dict()):
-      self._error(field, '{} is not an existing image.'.format(value))
-
-  def _validate_arg_in_use(self, arg_in_use, field, value):
-    """Validate that an arg references an existing partial spec's args.
-
-    Args:
-      arg_in_use: Value of the rule, a bool
-      field: The field being validated
-      value: The field's value
-
-    The rule's arguments are validated against this schema:
-    {'type': 'boolean'}
-    """
-    if arg_in_use:
-      for partial in self.root_document.get('partials', dict()).values():
-        if value in partial.get('args', tuple()):
-          return
-
-      self._error(field, '{} is not an arg used in any partial.'.format(value))
-
-
-def build_partial_description(partial_spec):
-  """Create the documentation lines for a specific partial.
-
-  Generates something like this:
-
-    # This is the partial's description, from spec.yml.
-    # --build-arg ARG_NAME=argdefault
-    #    this is one of the args.
-    # --build-arg ANOTHER_ARG=(some|choices)
-    #    another arg.
+    if isfullarg and '=' not in value:
+      self._error(field, '{} should be of the form ARG=VALUE.'.format(value))
+    if not isfullarg and '=' in value:
+      self._error(field, '{} should be of the form ARG (no =).'.format(value))
 
-  Args:
-    partial_spec: A dict representing one of the partials from spec.yml. Doesn't
-      include the name of the partial; is a dict like { desc: ..., args: ... }.
-
-  Returns:
-    A commented string describing this partial.
-  """
 
-  # Start from linewrapped desc field
-  lines = []
-  wrapper = textwrap.TextWrapper(
-      initial_indent='# ', subsequent_indent='# ', width=80)
-  description = wrapper.fill(partial_spec.get('desc', '( no comments )'))
-  lines.extend(['#', description])
-
-  # Document each arg
-  for arg, arg_data in partial_spec.get('args', dict()).items():
-    # Wrap arg description with comment lines
-    desc = arg_data.get('desc', '( no description )')
-    desc = textwrap.fill(
-        desc,
-        initial_indent='#    ',
-        subsequent_indent='#    ',
-        width=80,
-        drop_whitespace=False)
-
-    # Document (each|option|like|this)
-    if 'options' in arg_data:
-      arg_options = ' ({})'.format('|'.join(arg_data['options']))
-    else:
-      arg_options = ''
+def eprint(*args, **kwargs):
+  print(*args, file=sys.stderr, flush=True, **kwargs)
 
-    # Add usage sample
-    arg_use = '# --build-arg {}={}{}'.format(arg,
-                                             arg_data.get('default', '(unset)'),
-                                             arg_options)
-    lines.extend([arg_use, desc])
 
-  return '\n'.join(lines)
+def aggregate_all_slice_combinations(spec, slice_set_names):
+  """Figure out all of the possible slice groupings for a tag spec."""
+  slice_sets = copy.deepcopy(spec['slice_sets'])
 
+  for name in slice_set_names:
+    for slice_set in slice_sets[name]:
+      slice_set['set_name'] = name
 
-def construct_contents(partial_specs, image_spec):
-  """Assemble the dockerfile contents for an image spec.
+  slices_grouped_but_not_keyed = [slice_sets[name] for name in slice_set_names]
+  all_slice_combos = list(itertools.product(*slices_grouped_but_not_keyed))
+  return all_slice_combos
 
-  It assembles a concrete list of partial references into a single, large
-  string.
-  Also expands argument defaults, so that the resulting Dockerfile doesn't have
-  to be configured with --build-arg=... every time. That is, any ARG directive
-  will be updated with a new default value.
 
-  Args:
-    partial_specs: The dict from spec.yml["partials"].
-    image_spec: One of the dict values from spec.yml["images"].
+def build_name_from_slices(format_string, slices, args, is_dockerfile=False):
+  """Build the tag name (cpu-devel...) from a list of slices."""
+  name_formatter = copy.deepcopy(args)
+  name_formatter.update({s['set_name']: s['add_to_name'] for s in slices})
+  name_formatter.update({
+      s['set_name']: s['dockerfile_exclusive_name']
+      for s in slices
+      if is_dockerfile and 'dockerfile_exclusive_name' in s
+  })
+  name = format_string.format(**name_formatter)
+  return name
 
-  Returns:
-    A string containing a valid Dockerfile based on the partials listed in
-    image_spec.
-  """
-  processed_partial_strings = []
-  for partial_name in image_spec['partials']:
-    # Apply image arg-defaults to existing arg defaults
-    partial_spec = copy.deepcopy(partial_specs[partial_name])
-    args = partial_spec.get('args', dict())
-    for k_v in image_spec.get('arg-defaults', []):
-      arg, value = list(k_v.items())[0]
-      if arg in args:
-        args[arg]['default'] = value
-
-    # Read partial file contents
-    filename = partial_spec.get('file', partial_name)
-    partial_path = os.path.join(FLAGS.partial_dir,
-                                '{}.partial.Dockerfile'.format(filename))
-    with open(partial_path, 'r') as f_partial:
-      partial_contents = f_partial.read()
-
-    # Replace ARG FOO=BAR with ARG FOO=[new-default]
-    for arg, arg_data in args.items():
-      if 'default' in arg_data and arg_data['default']:
-        default = '={}'.format(arg_data['default'])
-      else:
-        default = ''
-      partial_contents = re.sub(r'ARG {}.*'.format(arg), 'ARG {}{}'.format(
-          arg, default), partial_contents)
-
-    # Store updated partial contents
-    processed_partial_strings.append(partial_contents)
-
-  # Join everything together
-  return '\n'.join(processed_partial_strings)
 
-
-def mkdir_p(path):
-  """Create a directory and its parents, even if it already exists."""
-  try:
-    os.makedirs(path)
-  except OSError as e:
-    if e.errno != errno.EEXIST:
-      raise
+def update_args_dict(args_dict, updater):
+  """Update a dict of arg values with more values from a list or dict."""
+  if isinstance(updater, list):
+    for arg in updater:
+      key, sep, value = arg.partition('=')
+      if sep == '=':
+        args_dict[key] = value
+  if isinstance(updater, dict):
+    for key, value in updater.items():
+      args_dict[key] = value
+  return args_dict
 
 
-def construct_documentation(header, partial_specs, image_spec):
-  """Assemble all of the documentation for a single dockerfile.
+def get_slice_sets_and_required_args(slice_sets, tag_spec):
+  """Extract used-slice-sets and required CLI arguments from a spec string.
 
-  Builds explanations of included partials and available build args.
+  For example, {FOO}{bar}{bat} finds FOO, bar, and bat. Assuming bar and bat
+  are both named slice sets, FOO must be specified on the command line.
 
   Args:
-    header: The string from spec.yml["header"]; will be commented and wrapped.
-    partial_specs: The dict from spec.yml["partials"].
-    image_spec: The spec for the dockerfile being built.
+     slice_sets: Dict of named slice sets
+     tag_spec: The tag spec string, e.g. {_FOO}{blep}
 
   Returns:
-    A string containing a commented header that documents the contents of the
-    dockerfile.
-
+     (used_slice_sets, required_args), a tuple of lists
   """
-  # Comment and wrap header and image description
-  commented_header = '\n'.join(
-      [('# ' + l).rstrip() for l in header.splitlines()])
-  commented_desc = '\n'.join(
-      ['# ' + l for l in image_spec.get('desc', '').splitlines()])
-  partial_descriptions = []
-
-  # Build documentation for each partial in the image
-  for partial in image_spec['partials']:
-    # Copy partial data for default args unique to this image
-    partial_spec = copy.deepcopy(partial_specs[partial])
-    args = partial_spec.get('args', dict())
-
-    # Overwrite any existing arg defaults
-    for k_v in image_spec.get('arg-defaults', []):
-      arg, value = list(k_v.items())[0]
-      if arg in args:
-        args[arg]['default'] = value
-
-    # Build the description from new args
-    partial_description = build_partial_description(partial_spec)
-    partial_descriptions.append(partial_description)
-
-  contents = [commented_header, '#', commented_desc] + partial_descriptions
-  return '\n'.join(contents) + '\n'
-
-
-def normalize_partial_args(partial_specs):
-  """Normalize the shorthand form of a partial's args specification.
-
-  Turns this:
-
-    partial:
-      args:
-        SOME_ARG: arg_value
-
-  Into this:
-
-    partial:
-       args:
-         SOME_ARG:
-            default: arg_value
-
-  Args:
-    partial_specs: The dict from spec.yml["partials"]. This dict is modified in
-      place.
-
-  Returns:
-    The modified contents of partial_specs.
-
-  """
-  for _, partial in partial_specs.items():
-    args = partial.get('args', dict())
-    for arg, value in args.items():
-      if not isinstance(value, dict):
-        new_value = {'default': value}
-        args[arg] = new_value
-
-  return partial_specs
-
-
-def flatten_args_references(image_specs):
-  """Resolve all default-args in each image spec to a concrete dict.
-
-  Turns this:
-
-    example-image:
-      arg-defaults:
-        - MY_ARG: ARG_VALUE
-
-    another-example:
-      arg-defaults:
-        - ANOTHER_ARG: ANOTHER_VALUE
-        - example_image
-
-  Into this:
+  required_args = []
+  used_slice_sets = []
+
+  extract_bracketed_words = re.compile(r'\{([^}]+)\}')
+  possible_args_or_slice_set_names = extract_bracketed_words.findall(tag_spec)
+  for name in possible_args_or_slice_set_names:
+    if name in slice_sets:
+      used_slice_sets.append(name)
+    else:
+      required_args.append(name)
 
-    example-image:
-      arg-defaults:
-        - MY_ARG: ARG_VALUE
+  return (used_slice_sets, required_args)
 
-    another-example:
-      arg-defaults:
-        - ANOTHER_ARG: ANOTHER_VALUE
-        - MY_ARG: ARG_VALUE
 
-  Args:
-    image_specs: A dict of image_spec dicts; should be the contents of the
-      "images" key in the global spec.yaml. This dict is modified in place and
-      then returned.
-
-  Returns:
-    The modified contents of image_specs.
-  """
-  for _, image_spec in image_specs.items():
-    too_deep = 0
-    while str in map(type, image_spec.get('arg-defaults', [])) and too_deep < 5:
-      new_args = []
-      for arg in image_spec['arg-defaults']:
-        if isinstance(arg, str):
-          new_args.extend(image_specs[arg]['arg-defaults'])
-        else:
-          new_args.append(arg)
+def gather_tag_args(slices, cli_input_args, required_args):
+  """Build a dictionary of all the CLI and slice-specified args for a tag."""
+  args = dict()
 
-      image_spec['arg-defaults'] = new_args
-      too_deep += 1
+  for s in slices:
+    args = update_args_dict(args, s['args'])
 
-  return image_specs
+  args = update_args_dict(args, cli_input_args)
+  for arg in required_args:
+    if arg not in args:
+      eprint(('> Error: {} is not a valid slice_set, and also isn\'t an arg '
+              'provided on the command line. If it is an arg, please specify '
+              'it with --arg. If not, check the slice_sets list.'.format(arg)))
+      exit(1)
 
+  return args
 
-def flatten_partial_references(image_specs):
-  """Resolve all partial references in each image spec to a concrete list.
 
-  Turns this:
+def gather_slice_list_items(slices, key):
+  """For a list of slices, get the flattened list of all of a certain key."""
+  return list(itertools.chain(*[s[key] for s in slices if key in s]))
 
-    example-image:
-      partials:
-        - foo
 
-    another-example:
-      partials:
-        - bar
-        - image: example-image
-        - bat
+def find_first_slice_value(slices, key):
+  """For a list of slices, get the first value for a certain key."""
+  for s in slices:
+    if key in s and s[key] is not None:
+      return s[key]
+  return None
 
-  Into this:
 
-    example-image:
-      partials:
-        - foo
+def assemble_tags(spec, cli_args, enabled_releases, all_partials):
+  """Gather all the tags based on our spec.
 
-    another-example:
-      partials:
-        - bar
-        - foo
-        - bat
   Args:
-    image_specs: A dict of image_spec dicts; should be the contents of the
-      "images" key in the global spec.yaml. This dict is modified in place and
-      then returned.
+    spec: Nested dict containing full Tag spec
+    cli_args: List of ARG=foo arguments to pass along to Docker build
+    enabled_releases: List of releases to parse. Empty list = all
+    all_partials: Dict of every partial, for reference
 
   Returns:
-    The modified contents of image_specs.
+    Dict of tags and how to build them
   """
-  for _, image_spec in image_specs.items():
-    too_deep = 0
-    while dict in map(type, image_spec['partials']) and too_deep < 5:
-      new_partials = []
-      for partial in image_spec['partials']:
-        if isinstance(partial, str):
-          new_partials.append(partial)
-        else:
-          new_partials.extend(image_specs[partial['image']]['partials'])
+  tag_data = collections.defaultdict(list)
+
+  for name, release in spec['releases'].items():
+    for tag_spec in release['tag_specs']:
+      if enabled_releases and name not in enabled_releases:
+        eprint('> Skipping release {}'.format(name))
+        continue
+
+      used_slice_sets, required_cli_args = get_slice_sets_and_required_args(
+          spec['slice_sets'], tag_spec)
+
+      slice_combos = aggregate_all_slice_combinations(spec, used_slice_sets)
+      for slices in slice_combos:
+
+        tag_args = gather_tag_args(slices, cli_args, required_cli_args)
+        tag_name = build_name_from_slices(tag_spec, slices, tag_args,
+                                          release['is_dockerfiles'])
+        used_partials = gather_slice_list_items(slices, 'partials')
+        used_tests = gather_slice_list_items(slices, 'tests')
+        test_runtime = find_first_slice_value(slices, 'test_runtime')
+        dockerfile_subdirectory = find_first_slice_value(
+            slices, 'dockerfile_subdirectory')
+        dockerfile_contents = merge_partials(spec['header'], used_partials,
+                                             all_partials)
+
+        tag_data[tag_name].append({
+            'release': name,
+            'tag_spec': tag_spec,
+            'is_dockerfiles': release['is_dockerfiles'],
+            'upload_images': release['upload_images'],
+            'cli_args': tag_args,
+            'dockerfile_subdirectory': dockerfile_subdirectory or '',
+            'partials': used_partials,
+            'tests': used_tests,
+            'test_runtime': test_runtime,
+            'dockerfile_contents': dockerfile_contents,
+        })
+
+  return tag_data
+
+
+def merge_partials(header, used_partials, all_partials):
+  """Merge all partial contents with their header."""
+  used_partials = list(used_partials)
+  return '\n'.join([header] + [all_partials[u] for u in used_partials])
+
+
+def upload_in_background(hub_repository, dock, image, tag):
+  """Upload a docker image (to be used by multiprocessing)."""
+  image.tag(hub_repository, tag=tag)
+  print(dock.images.push(hub_repository, tag=tag))
 
-      image_spec['partials'] = new_partials
-      too_deep += 1
 
-  return image_specs
+def mkdir_p(path):
+  """Create a directory and its parents, even if it already exists."""
+  try:
+    os.makedirs(path)
+  except OSError as e:
+    if e.errno != errno.EEXIST:
+      raise
 
 
-def construct_dockerfiles(tf_spec):
-  """Generate a mapping of {"cpu": <cpu dockerfile contents>, ...}.
+def gather_existing_partials(partial_path):
+  """Find and read all available partials.
 
   Args:
-    tf_spec: The full spec.yml loaded as a python object.
+    partial_path (string): read partials from this directory.
 
   Returns:
-    A string:string dict of short names ("cpu-devel") to Dockerfile contents.
+    Dict[string, string] of partial short names (like "ubuntu/python" or
+      "bazel") to the full contents of that partial.
   """
-  names_to_contents = dict()
-  image_specs = tf_spec['images']
-  image_specs = flatten_partial_references(image_specs)
-  image_specs = flatten_args_references(image_specs)
-  partial_specs = tf_spec['partials']
-  partial_specs = normalize_partial_args(partial_specs)
-
-  for name, image_spec in image_specs.items():
-    if not image_spec.get('create-dockerfile', True):
-      continue
-    documentation = construct_documentation(tf_spec['header'], partial_specs,
-                                            image_spec)
-    contents = construct_contents(partial_specs, image_spec)
-    names_to_contents[name] = '\n'.join([documentation, contents])
-
-  return names_to_contents
+  partials = dict()
+  for path, _, files in os.walk(partial_path):
+    for name in files:
+      fullpath = os.path.join(path, name)
+      if '.partial.Dockerfile' not in fullpath:
+        eprint(('> Probably not a problem: skipping {}, which is not a '
+                'partial.').format(fullpath))
+        continue
+      # partial_dir/foo/bar.partial.Dockerfile -> foo/bar
+      simple_name = fullpath[len(partial_path) + 1:-len('.partial.dockerfile')]
+      with open(fullpath, 'r') as f:
+        partial_contents = f.read()
+      partials[simple_name] = partial_contents
+  return partials
 
 
 def main(argv):
   if len(argv) > 1:
-    raise app.UsageError('Unexpected command line args found: {}'.format(argv))
+    raise app.UsageError('Too many command-line arguments.')
 
+  # Read the full spec file, used for everything
   with open(FLAGS.spec_file, 'r') as spec_file:
-    tf_spec = yaml.load(spec_file)
+    tag_spec = yaml.load(spec_file)
+
+  # Get existing partial contents
+  partials = gather_existing_partials(FLAGS.partial_dir)
 
   # Abort if spec.yaml is invalid
-  if FLAGS.validate:
-    schema = yaml.load(SCHEMA_TEXT)
-    v = TfDockerValidator(schema)
-    if not v.validate(tf_spec):
-      print('>> ERROR: {} is an invalid spec! The errors are:'.format(
-          FLAGS.spec_file))
-      print(yaml.dump(v.errors, indent=2))
+  schema = yaml.load(SCHEMA_TEXT)
+  v = TfDockerTagValidator(schema, partials=partials)
+  if not v.validate(tag_spec):
+    eprint('> Error: {} is an invalid spec! The errors are:'.format(
+        FLAGS.spec_file))
+    eprint(yaml.dump(v.errors, indent=2))
+    exit(1)
+  tag_spec = v.normalized(tag_spec)
+
+  # Assemble tags and images used to build them
+  all_tags = assemble_tags(tag_spec, FLAGS.arg, FLAGS.release, partials)
+
+  # Empty Dockerfile directory if building new Dockerfiles
+  if FLAGS.construct_dockerfiles:
+    eprint('> Emptying Dockerfile dir "{}"'.format(FLAGS.dockerfile_dir))
+    shutil.rmtree(FLAGS.dockerfile_dir, ignore_errors=True)
+    mkdir_p(FLAGS.dockerfile_dir)
+
+  # Set up Docker helper
+  dock = docker.from_env()
+
+  # Login to Docker if uploading images
+  if FLAGS.upload_to_hub:
+    if not FLAGS.hub_username:
+      eprint('> Error: please set --hub_username when uploading to Dockerhub.')
       exit(1)
-  else:
-    print('>> WARNING: Not validating {}'.format(FLAGS.spec_file))
-
-  # Generate mapping of { "cpu-devel": "<cpu-devel dockerfile contents>", ... }
-  names_to_contents = construct_dockerfiles(tf_spec)
-
-  # Write each completed Dockerfile
-  if not FLAGS.dry_run:
-    print('>> Emptying destination dir "{}"'.format(FLAGS.output_dir))
-    shutil.rmtree(FLAGS.output_dir, ignore_errors=True)
-    mkdir_p(FLAGS.output_dir)
-  else:
-    print('>> Skipping creation of {} (dry run)'.format(FLAGS.output_dir))
-  for name, contents in names_to_contents.items():
-    path = os.path.join(FLAGS.output_dir, name + '.Dockerfile')
-    if FLAGS.dry_run:
-      print('>> Skipping writing contents of {} (dry run)'.format(path))
-      print(contents)
-    else:
-      mkdir_p(FLAGS.output_dir)
-      print('>> Writing {}'.format(path))
-      with open(path, 'w') as f:
-        f.write(contents)
+    if not FLAGS.hub_repository:
+      eprint(
+          '> Error: please set --hub_repository when uploading to Dockerhub.')
+      exit(1)
+    if not FLAGS.hub_password:
+      eprint('> Error: please set --hub_password when uploading to Dockerhub.')
+      exit(1)
+    dock.login(
+        username=FLAGS.hub_username,
+        password=FLAGS.hub_password,
+    )
+
+  # Each tag has a name ('tag') and a definition consisting of the contents
+  # of its Dockerfile, its build arg list, etc.
+  failed_tags = []
+  for tag, tag_defs in all_tags.items():
+    for tag_def in tag_defs:
+      eprint('> Working on {}'.format(tag))
+
+      if FLAGS.exclude_tags_matching and re.match(FLAGS.exclude_tags_matching,
+                                                  tag):
+        eprint('>> Excluded due to match against "{}".'.format(
+            FLAGS.exclude_tags_matching))
+        continue
+
+      if FLAGS.only_tags_matching and not re.match(FLAGS.only_tags_matching,
+                                                   tag):
+        eprint('>> Excluded due to failure to match against "{}".'.format(
+            FLAGS.only_tags_matching))
+        continue
+
+      # Write releases marked "is_dockerfiles" into the Dockerfile directory
+      if FLAGS.construct_dockerfiles and tag_def['is_dockerfiles']:
+        path = os.path.join(FLAGS.dockerfile_dir,
+                            tag_def['dockerfile_subdirectory'],
+                            tag + '.Dockerfile')
+        eprint('>> Writing {}...'.format(path))
+        if not FLAGS.dry_run:
+          mkdir_p(os.path.dirname(path))
+          with open(path, 'w') as f:
+            f.write(tag_def['dockerfile_contents'])
+
+      # Don't build any images for dockerfile-only releases
+      if not FLAGS.build_images:
+        continue
+
+      # Generate a temporary Dockerfile to use to build, since docker-py
+      # needs a filepath relative to the build context (i.e. the current
+      # directory)
+      dockerfile = os.path.join(FLAGS.dockerfile_dir, tag + '.temp.Dockerfile')
+      if not FLAGS.dry_run:
+        with open(dockerfile, 'w') as f:
+          f.write(tag_def['dockerfile_contents'])
+      eprint('>> (Temporary) writing {}...'.format(dockerfile))
+
+      repo_tag = '{}:{}'.format(FLAGS.repository, tag)
+      eprint('>> Building {} using build args:'.format(repo_tag))
+      for arg, value in tag_def['cli_args'].items():
+        eprint('>>> {}={}'.format(arg, value))
+
+      # Note that we are NOT using cache_from, which appears to limit
+      # available cache layers to those from explicitly specified layers. Many
+      # of our layers are similar between local builds, so we want to use the
+      # implied local build cache.
+      tag_failed = False
+      image, logs = None, []
+      if not FLAGS.dry_run:
+        try:
+          image, logs = dock.images.build(
+              timeout=FLAGS.hub_timeout,
+              path='.',
+              dockerfile=dockerfile,
+              buildargs=tag_def['cli_args'],
+              tag=repo_tag)
+
+          # Print logs after finishing
+          log_lines = [l.get('stream', '') for l in logs]
+          eprint(''.join(log_lines))
+
+          # Run tests if requested, and dump output
+          # Could be improved by backgrounding, but would need better
+          # multiprocessing support to track failures properly.
+          if FLAGS.run_tests_path:
+            if not tag_def['tests']:
+              eprint('>>> No tests to run.')
+            for test in tag_def['tests']:
+              eprint('>> Testing {}...'.format(test))
+              container, = dock.containers.run(
+                  image,
+                  '/tests/' + test,
+                  working_dir='/',
+                  log_config={'type': 'journald'},
+                  detach=True,
+                  stderr=True,
+                  stdout=True,
+                  volumes={
+                      FLAGS.run_tests_path: {
+                          'bind': '/tests',
+                          'mode': 'ro'
+                      }
+                  },
+                  runtime=tag_def['test_runtime']),
+              ret = container.wait()
+              code = ret['StatusCode']
+              out = container.logs(stdout=True, stderr=False)
+              err = container.logs(stdout=False, stderr=True)
+              container.remove()
+              if out:
+                eprint('>>> Output stdout:')
+                eprint(out.decode('utf-8'))
+              else:
+                eprint('>>> No test standard out.')
+              if err:
+                eprint('>>> Output stderr:')
+                eprint(out.decode('utf-8'))
+              else:
+                eprint('>>> No test standard err.')
+              if code != 0:
+                eprint('>> {} failed tests with status: "{}"'.format(
+                    repo_tag, code))
+                failed_tags.append(tag)
+                tag_failed = True
+                if FLAGS.stop_on_failure:
+                  eprint('>> ABORTING due to --stop_on_failure!')
+                  exit(1)
+              else:
+                eprint('>> Tests look good!')
+
+        except docker.errors.BuildError as e:
+          eprint('>> {} failed to build with message: "{}"'.format(
+              repo_tag, e.msg))
+          eprint('>> Build logs follow:')
+          log_lines = [l.get('stream', '') for l in e.build_log]
+          eprint(''.join(log_lines))
+          failed_tags.append(tag)
+          tag_failed = True
+          if FLAGS.stop_on_failure:
+            eprint('>> ABORTING due to --stop_on_failure!')
+            exit(1)
+
+        # Clean temporary dockerfiles if they were created earlier
+        if not FLAGS.keep_temp_dockerfiles:
+          os.remove(dockerfile)
+
+      # Upload new images to DockerHub as long as they built + passed tests
+      if FLAGS.upload_to_hub:
+        if not tag_def['upload_images']:
+          continue
+        if tag_failed:
+          continue
+
+        eprint('>> Uploading to {}:{}'.format(FLAGS.hub_repository, tag))
+        if not FLAGS.dry_run:
+          p = multiprocessing.Process(
+              target=upload_in_background,
+              args=(FLAGS.hub_repository, dock, image, tag))
+          p.start()
+
+  if failed_tags:
+    eprint(
+        '> Some tags failed to build or failed testing, check scrollback for '
+        'errors: {}'.format(','.join(failed_tags)))
+    exit(1)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/cpu-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/cpu-jupyter.Dockerfile
index f889ed6f91d077fb5fb6044e55a9504c2a5b56c9..d8fabadec280cc136bd6cc9a30e79390a9a167bd 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/cpu-jupyter.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/cpu-jupyter.Dockerfile
@@ -16,31 +16,14 @@
 # THIS IS A GENERATED DOCKERFILE.
 #
 # This file was assembled from multiple pieces, whose use is documented
-# below. Please refer to the the TensorFlow dockerfiles documentation for
-# more information. Build args are documented as their default value.
-#
-# Ubuntu-based, CPU-only environment for using TensorFlow, with Jupyter included.
-#
-# Start from Ubuntu (no GPU support)
-# --build-arg UBUNTU_VERSION=16.04
-#    ( no description )
-#
-# Python is required for TensorFlow and other libraries.
-# --build-arg USE_PYTHON_3_NOT_2=True
-#    Install python 3 over Python 2
-#
-# Install the TensorFlow Python package.
-# --build-arg TF_PACKAGE=tensorflow (tensorflow|tensorflow-gpu|tf-nightly|tf-nightly-gpu)
-#    The specific TensorFlow Python package to install
-#
-# Configure TensorFlow's shell prompt and login tools.
-#
-# Launch Jupyter on execution instead of a bash prompt.
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
 
 ARG UBUNTU_VERSION=16.04
-FROM ubuntu:${UBUNTU_VERSION}
 
-ARG USE_PYTHON_3_NOT_2=True
+FROM ubuntu:${UBUNTU_VERSION} as base
+
+ARG USE_PYTHON_3_NOT_2
 ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
 ARG PYTHON=python${_PY_SUFFIX}
 ARG PIP=pip${_PY_SUFFIX}
@@ -52,21 +35,37 @@ RUN apt-get update && apt-get install -y \
     ${PYTHON} \
     ${PYTHON}-pip
 
-RUN ${PIP} install --upgrade \
+RUN ${PIP} --no-cache-dir install --upgrade \
     pip \
     setuptools
 
+# Some TF tools expect a "python" binary
+RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
+# Options:
+#   tensorflow
+#   tensorflow-gpu
+#   tf-nightly
+#   tf-nightly-gpu
 ARG TF_PACKAGE=tensorflow
 RUN ${PIP} install ${TF_PACKAGE}
 
 COPY bashrc /etc/bash.bashrc
 RUN chmod a+rwx /etc/bash.bashrc
 
-RUN ${PIP} install jupyter
+RUN ${PIP} install jupyter matplotlib
 
-RUN mkdir /notebooks && chmod a+rwx /notebooks
+RUN mkdir -p /tf/tensorflow-tutorials && chmod -R a+rwx /tf/
 RUN mkdir /.local && chmod a+rwx /.local
-WORKDIR /notebooks
+RUN apt-get install -y --no-install-recommends wget
+WORKDIR /tf/tensorflow-tutorials
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_classification.ipynb
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_text_classification.ipynb
+COPY readme-for-jupyter.md README.md
+RUN apt-get autoremove -y && apt-get remove -y wget
+WORKDIR /tf
 EXPOSE 8888
 
-CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/notebooks --ip 0.0.0.0 --no-browser --allow-root"]
+RUN ${PYTHON} -m ipykernel.kernelspec
+
+CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/tf --ip 0.0.0.0 --no-browser --allow-root"]
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/cpu.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/cpu.Dockerfile
index 182a534bed9855bf9e57c4f495822fe78523dcc3..857b5e20471a82bd162e55b146854d0a5c165db8 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/cpu.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/cpu.Dockerfile
@@ -16,29 +16,14 @@
 # THIS IS A GENERATED DOCKERFILE.
 #
 # This file was assembled from multiple pieces, whose use is documented
-# below. Please refer to the the TensorFlow dockerfiles documentation for
-# more information. Build args are documented as their default value.
-#
-# Ubuntu-based, CPU-only environment for using TensorFlow
-#
-# Start from Ubuntu (no GPU support)
-# --build-arg UBUNTU_VERSION=16.04
-#    ( no description )
-#
-# Python is required for TensorFlow and other libraries.
-# --build-arg USE_PYTHON_3_NOT_2=True
-#    Install python 3 over Python 2
-#
-# Install the TensorFlow Python package.
-# --build-arg TF_PACKAGE=tensorflow (tensorflow|tensorflow-gpu|tf-nightly|tf-nightly-gpu)
-#    The specific TensorFlow Python package to install
-#
-# Configure TensorFlow's shell prompt and login tools.
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
 
 ARG UBUNTU_VERSION=16.04
-FROM ubuntu:${UBUNTU_VERSION}
 
-ARG USE_PYTHON_3_NOT_2=True
+FROM ubuntu:${UBUNTU_VERSION} as base
+
+ARG USE_PYTHON_3_NOT_2
 ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
 ARG PYTHON=python${_PY_SUFFIX}
 ARG PIP=pip${_PY_SUFFIX}
@@ -50,10 +35,18 @@ RUN apt-get update && apt-get install -y \
     ${PYTHON} \
     ${PYTHON}-pip
 
-RUN ${PIP} install --upgrade \
+RUN ${PIP} --no-cache-dir install --upgrade \
     pip \
     setuptools
 
+# Some TF tools expect a "python" binary
+RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
+# Options:
+#   tensorflow
+#   tensorflow-gpu
+#   tf-nightly
+#   tf-nightly-gpu
 ARG TF_PACKAGE=tensorflow
 RUN ${PIP} install ${TF_PACKAGE}
 
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/cpu-devel-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu-jupyter.Dockerfile
similarity index 50%
rename from tensorflow/tools/dockerfiles/dockerfiles/cpu-devel-jupyter.Dockerfile
rename to tensorflow/tools/dockerfiles/dockerfiles/devel-cpu-jupyter.Dockerfile
index dab7178db3adde78be55fc5b9d4b4254a131924e..c1f6dafbe0b023e5f4885cfd14ac34c96fcd9843 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/cpu-devel-jupyter.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu-jupyter.Dockerfile
@@ -16,27 +16,12 @@
 # THIS IS A GENERATED DOCKERFILE.
 #
 # This file was assembled from multiple pieces, whose use is documented
-# below. Please refer to the the TensorFlow dockerfiles documentation for
-# more information. Build args are documented as their default value.
-#
-# Ubuntu-based, CPU-only environment for developing changes for TensorFlow, with Jupyter included.
-#
-# Start from Ubuntu, with TF development packages (no GPU support)
-# --build-arg UBUNTU_VERSION=16.04
-#    ( no description )
-#
-# Python is required for TensorFlow and other libraries.
-# --build-arg USE_PYTHON_3_NOT_2=True
-#    Install python 3 over Python 2
-#
-# Install the latest version of Bazel and Python development tools.
-#
-# Configure TensorFlow's shell prompt and login tools.
-#
-# Launch Jupyter on execution instead of a bash prompt.
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
 
 ARG UBUNTU_VERSION=16.04
-FROM ubuntu:${UBUNTU_VERSION}
+
+FROM ubuntu:${UBUNTU_VERSION} AS base
 
 RUN apt-get update && apt-get install -y --no-install-recommends \
         build-essential \
@@ -48,7 +33,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         libpng12-dev \
         libzmq3-dev \
         pkg-config \
-        python-dev \
         rsync \
         software-properties-common \
         unzip \
@@ -59,8 +43,14 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
+ 
+ENV CI_BUILD_PYTHON python
+
+# Check out TensorFlow source code if --build_arg CHECKOUT_TENSORFLOW=1
+ARG CHECKOUT_TF_SRC=0
+RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src
 
-ARG USE_PYTHON_3_NOT_2=True
+ARG USE_PYTHON_3_NOT_2
 ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
 ARG PYTHON=python${_PY_SUFFIX}
 ARG PIP=pip${_PY_SUFFIX}
@@ -72,32 +62,61 @@ RUN apt-get update && apt-get install -y \
     ${PYTHON} \
     ${PYTHON}-pip
 
-RUN ${PIP} install --upgrade \
+RUN ${PIP} --no-cache-dir install --upgrade \
     pip \
     setuptools
 
+# Some TF tools expect a "python" binary
+RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
 RUN apt-get update && apt-get install -y \
     build-essential \
     curl \
     git \
+    wget \
     openjdk-8-jdk \
     ${PYTHON}-dev \
     swig
 
+RUN ${PIP} --no-cache-dir install \
+    Pillow \
+    h5py \
+    keras_applications \
+    keras_preprocessing \
+    matplotlib \
+    mock \
+    numpy \
+    scipy \
+    sklearn \
+    pandas \
+    && test "${USE_PYTHON_3_NOT_2}" -eq 1 && true || ${PIP} --no-cache-dir install \
+    enum34
+
 # Install bazel
-RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list && \
-    curl https://bazel.build/bazel-release.pub.gpg | apt-key add - && \
-    apt-get update && \
-    apt-get install -y bazel
+ARG BAZEL_VERSION=0.19.2
+RUN mkdir /bazel && \
+    wget -O /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
+    wget -O /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
+    chmod +x /bazel/installer.sh && \
+    /bazel/installer.sh && \
+    rm -f /bazel/installer.sh
 
 COPY bashrc /etc/bash.bashrc
 RUN chmod a+rwx /etc/bash.bashrc
 
-RUN ${PIP} install jupyter
+RUN ${PIP} install jupyter matplotlib
 
-RUN mkdir /notebooks && chmod a+rwx /notebooks
+RUN mkdir -p /tf/tensorflow-tutorials && chmod -R a+rwx /tf/
 RUN mkdir /.local && chmod a+rwx /.local
-WORKDIR /notebooks
+RUN apt-get install -y --no-install-recommends wget
+WORKDIR /tf/tensorflow-tutorials
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_classification.ipynb
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_text_classification.ipynb
+COPY readme-for-jupyter.md README.md
+RUN apt-get autoremove -y && apt-get remove -y wget
+WORKDIR /tf
 EXPOSE 8888
 
-CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/notebooks --ip 0.0.0.0 --no-browser --allow-root"]
+RUN ${PYTHON} -m ipykernel.kernelspec
+
+CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/tf --ip 0.0.0.0 --no-browser --allow-root"]
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/cpu-devel.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu.Dockerfile
similarity index 59%
rename from tensorflow/tools/dockerfiles/dockerfiles/cpu-devel.Dockerfile
rename to tensorflow/tools/dockerfiles/dockerfiles/devel-cpu.Dockerfile
index 68566ccc8aa954c64f7504d380cfcf5968f3c449..b4dfc8b09975c49f16686353cf7ec2fe1b02585b 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/cpu-devel.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu.Dockerfile
@@ -16,25 +16,12 @@
 # THIS IS A GENERATED DOCKERFILE.
 #
 # This file was assembled from multiple pieces, whose use is documented
-# below. Please refer to the the TensorFlow dockerfiles documentation for
-# more information. Build args are documented as their default value.
-#
-# Ubuntu-based, CPU-only environment for developing changes for TensorFlow.
-#
-# Start from Ubuntu, with TF development packages (no GPU support)
-# --build-arg UBUNTU_VERSION=16.04
-#    ( no description )
-#
-# Python is required for TensorFlow and other libraries.
-# --build-arg USE_PYTHON_3_NOT_2=True
-#    Install python 3 over Python 2
-#
-# Install the latest version of Bazel and Python development tools.
-#
-# Configure TensorFlow's shell prompt and login tools.
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
 
 ARG UBUNTU_VERSION=16.04
-FROM ubuntu:${UBUNTU_VERSION}
+
+FROM ubuntu:${UBUNTU_VERSION} AS base
 
 RUN apt-get update && apt-get install -y --no-install-recommends \
         build-essential \
@@ -46,7 +33,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         libpng12-dev \
         libzmq3-dev \
         pkg-config \
-        python-dev \
         rsync \
         software-properties-common \
         unzip \
@@ -57,8 +43,14 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
+ 
+ENV CI_BUILD_PYTHON python
+
+# Check out TensorFlow source code if --build_arg CHECKOUT_TENSORFLOW=1
+ARG CHECKOUT_TF_SRC=0
+RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src
 
-ARG USE_PYTHON_3_NOT_2=True
+ARG USE_PYTHON_3_NOT_2
 ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
 ARG PYTHON=python${_PY_SUFFIX}
 ARG PIP=pip${_PY_SUFFIX}
@@ -70,23 +62,44 @@ RUN apt-get update && apt-get install -y \
     ${PYTHON} \
     ${PYTHON}-pip
 
-RUN ${PIP} install --upgrade \
+RUN ${PIP} --no-cache-dir install --upgrade \
     pip \
     setuptools
 
+# Some TF tools expect a "python" binary
+RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
 RUN apt-get update && apt-get install -y \
     build-essential \
     curl \
     git \
+    wget \
     openjdk-8-jdk \
     ${PYTHON}-dev \
     swig
 
+RUN ${PIP} --no-cache-dir install \
+    Pillow \
+    h5py \
+    keras_applications \
+    keras_preprocessing \
+    matplotlib \
+    mock \
+    numpy \
+    scipy \
+    sklearn \
+    pandas \
+    && test "${USE_PYTHON_3_NOT_2}" -eq 1 && true || ${PIP} --no-cache-dir install \
+    enum34
+
 # Install bazel
-RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list && \
-    curl https://bazel.build/bazel-release.pub.gpg | apt-key add - && \
-    apt-get update && \
-    apt-get install -y bazel
+ARG BAZEL_VERSION=0.19.2
+RUN mkdir /bazel && \
+    wget -O /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
+    wget -O /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
+    chmod +x /bazel/installer.sh && \
+    /bazel/installer.sh && \
+    rm -f /bazel/installer.sh
 
 COPY bashrc /etc/bash.bashrc
 RUN chmod a+rwx /etc/bash.bashrc
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu-jupyter.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..6d76c06332bef15e5bbf33492a37971d9e5498f6
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu-jupyter.Dockerfile
@@ -0,0 +1,142 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+#
+# THIS IS A GENERATED DOCKERFILE.
+#
+# This file was assembled from multiple pieces, whose use is documented
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
+
+ARG UBUNTU_VERSION=16.04
+
+FROM nvidia/cuda:10.0-base-ubuntu${UBUNTU_VERSION} as base
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        cuda-command-line-tools-10-0 \
+        cuda-cublas-dev-10-0 \
+        cuda-cudart-dev-10-0 \
+        cuda-cufft-dev-10-0 \
+        cuda-curand-dev-10-0 \
+        cuda-cusolver-dev-10-0 \
+        cuda-cusparse-dev-10-0 \
+        libcudnn7=7.4.1.5-1+cuda10.0 \
+        libcudnn7-dev=7.4.1.5-1+cuda10.0 \
+        libcurl3-dev \
+        libfreetype6-dev \
+        libhdf5-serial-dev \
+        libpng12-dev \
+        libzmq3-dev \
+        pkg-config \
+        rsync \
+        software-properties-common \
+        unzip \
+        zip \
+        zlib1g-dev \
+        wget \
+        git \
+        && \
+    find /usr/local/cuda-10.0/lib64/ -type f -name 'lib*_static.a' -not -name 'libcudart_static.a' -delete && \
+    rm /usr/lib/x86_64-linux-gnu/libcudnn_static_v7.a
+
+RUN apt-get update && \
+        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda10.0 \
+        && apt-get update \
+        && apt-get install -y --no-install-recommends libnvinfer-dev=5.0.2-1+cuda10.0 \
+        && rm -rf /var/lib/apt/lists/*
+
+# Configure the build for our CUDA configuration.
+ENV CI_BUILD_PYTHON python
+ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
+ENV TF_NEED_CUDA 1
+ENV TF_NEED_TENSORRT 1
+ENV TF_CUDA_COMPUTE_CAPABILITIES=3.5,5.2,6.0,6.1,7.0
+ENV TF_CUDA_VERSION=10.0
+ENV TF_CUDNN_VERSION=7
+
+# Check out TensorFlow source code if --build_arg CHECKOUT_TENSORFLOW=1
+ARG CHECKOUT_TF_SRC=0
+RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src
+
+ARG USE_PYTHON_3_NOT_2
+ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
+ARG PYTHON=python${_PY_SUFFIX}
+ARG PIP=pip${_PY_SUFFIX}
+
+# See http://bugs.python.org/issue19846
+ENV LANG C.UTF-8
+
+RUN apt-get update && apt-get install -y \
+    ${PYTHON} \
+    ${PYTHON}-pip
+
+RUN ${PIP} --no-cache-dir install --upgrade \
+    pip \
+    setuptools
+
+# Some TF tools expect a "python" binary
+RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    git \
+    wget \
+    openjdk-8-jdk \
+    ${PYTHON}-dev \
+    swig
+
+RUN ${PIP} --no-cache-dir install \
+    Pillow \
+    h5py \
+    keras_applications \
+    keras_preprocessing \
+    matplotlib \
+    mock \
+    numpy \
+    scipy \
+    sklearn \
+    pandas \
+    && test "${USE_PYTHON_3_NOT_2}" -eq 1 && true || ${PIP} --no-cache-dir install \
+    enum34
+
+# Install bazel
+ARG BAZEL_VERSION=0.19.2
+RUN mkdir /bazel && \
+    wget -O /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
+    wget -O /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
+    chmod +x /bazel/installer.sh && \
+    /bazel/installer.sh && \
+    rm -f /bazel/installer.sh
+
+COPY bashrc /etc/bash.bashrc
+RUN chmod a+rwx /etc/bash.bashrc
+
+RUN ${PIP} install jupyter matplotlib
+
+RUN mkdir -p /tf/tensorflow-tutorials && chmod -R a+rwx /tf/
+RUN mkdir /.local && chmod a+rwx /.local
+RUN apt-get install -y --no-install-recommends wget
+WORKDIR /tf/tensorflow-tutorials
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_classification.ipynb
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_text_classification.ipynb
+COPY readme-for-jupyter.md README.md
+RUN apt-get autoremove -y && apt-get remove -y wget
+WORKDIR /tf
+EXPOSE 8888
+
+RUN ${PYTHON} -m ipykernel.kernelspec
+
+CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/tf --ip 0.0.0.0 --no-browser --allow-root"]
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..160abc876395cf048aa850301de701c950cba149
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu.Dockerfile
@@ -0,0 +1,125 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+#
+# THIS IS A GENERATED DOCKERFILE.
+#
+# This file was assembled from multiple pieces, whose use is documented
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
+
+ARG UBUNTU_VERSION=16.04
+
+FROM nvidia/cuda:10.0-base-ubuntu${UBUNTU_VERSION} as base
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        cuda-command-line-tools-10-0 \
+        cuda-cublas-dev-10-0 \
+        cuda-cudart-dev-10-0 \
+        cuda-cufft-dev-10-0 \
+        cuda-curand-dev-10-0 \
+        cuda-cusolver-dev-10-0 \
+        cuda-cusparse-dev-10-0 \
+        libcudnn7=7.4.1.5-1+cuda10.0 \
+        libcudnn7-dev=7.4.1.5-1+cuda10.0 \
+        libcurl3-dev \
+        libfreetype6-dev \
+        libhdf5-serial-dev \
+        libpng12-dev \
+        libzmq3-dev \
+        pkg-config \
+        rsync \
+        software-properties-common \
+        unzip \
+        zip \
+        zlib1g-dev \
+        wget \
+        git \
+        && \
+    find /usr/local/cuda-10.0/lib64/ -type f -name 'lib*_static.a' -not -name 'libcudart_static.a' -delete && \
+    rm /usr/lib/x86_64-linux-gnu/libcudnn_static_v7.a
+
+RUN apt-get update && \
+        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda10.0 \
+        && apt-get update \
+        && apt-get install -y --no-install-recommends libnvinfer-dev=5.0.2-1+cuda10.0 \
+        && rm -rf /var/lib/apt/lists/*
+
+# Configure the build for our CUDA configuration.
+ENV CI_BUILD_PYTHON python
+ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
+ENV TF_NEED_CUDA 1
+ENV TF_NEED_TENSORRT 1
+ENV TF_CUDA_COMPUTE_CAPABILITIES=3.5,5.2,6.0,6.1,7.0
+ENV TF_CUDA_VERSION=10.0
+ENV TF_CUDNN_VERSION=7
+
+# Check out TensorFlow source code if --build_arg CHECKOUT_TENSORFLOW=1
+ARG CHECKOUT_TF_SRC=0
+RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src
+
+ARG USE_PYTHON_3_NOT_2
+ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
+ARG PYTHON=python${_PY_SUFFIX}
+ARG PIP=pip${_PY_SUFFIX}
+
+# See http://bugs.python.org/issue19846
+ENV LANG C.UTF-8
+
+RUN apt-get update && apt-get install -y \
+    ${PYTHON} \
+    ${PYTHON}-pip
+
+RUN ${PIP} --no-cache-dir install --upgrade \
+    pip \
+    setuptools
+
+# Some TF tools expect a "python" binary
+RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    git \
+    wget \
+    openjdk-8-jdk \
+    ${PYTHON}-dev \
+    swig
+
+RUN ${PIP} --no-cache-dir install \
+    Pillow \
+    h5py \
+    keras_applications \
+    keras_preprocessing \
+    matplotlib \
+    mock \
+    numpy \
+    scipy \
+    sklearn \
+    pandas \
+    && test "${USE_PYTHON_3_NOT_2}" -eq 1 && true || ${PIP} --no-cache-dir install \
+    enum34
+
+# Install bazel
+ARG BAZEL_VERSION=0.19.2
+RUN mkdir /bazel && \
+    wget -O /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
+    wget -O /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
+    chmod +x /bazel/installer.sh && \
+    /bazel/installer.sh && \
+    rm -f /bazel/installer.sh
+
+COPY bashrc /etc/bash.bashrc
+RUN chmod a+rwx /etc/bash.bashrc
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/gpu-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/gpu-jupyter.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..46252c541384c91f63cec54af299a945f28a8ccb
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/dockerfiles/gpu-jupyter.Dockerfile
@@ -0,0 +1,99 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+#
+# THIS IS A GENERATED DOCKERFILE.
+#
+# This file was assembled from multiple pieces, whose use is documented
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
+
+ARG UBUNTU_VERSION=16.04
+
+FROM nvidia/cuda:10.0-base-ubuntu${UBUNTU_VERSION} as base
+
+# Pick up some TF dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        cuda-command-line-tools-10-0 \
+        cuda-cublas-10-0 \
+        cuda-cufft-10-0 \
+        cuda-curand-10-0 \
+        cuda-cusolver-10-0 \
+        cuda-cusparse-10-0 \
+        libcudnn7=7.4.1.5-1+cuda10.0 \
+        libfreetype6-dev \
+        libhdf5-serial-dev \
+        libpng12-dev \
+        libzmq3-dev \
+        pkg-config \
+        software-properties-common \
+        unzip
+
+RUN apt-get update && \
+        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda10.0 \
+        && apt-get update \
+        && apt-get install -y --no-install-recommends libnvinfer5=5.0.2-1+cuda10.0 \
+        && apt-get clean \
+        && rm -rf /var/lib/apt/lists/*
+
+# For CUDA profiling, TensorFlow requires CUPTI.
+ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
+
+ARG USE_PYTHON_3_NOT_2
+ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
+ARG PYTHON=python${_PY_SUFFIX}
+ARG PIP=pip${_PY_SUFFIX}
+
+# See http://bugs.python.org/issue19846
+ENV LANG C.UTF-8
+
+RUN apt-get update && apt-get install -y \
+    ${PYTHON} \
+    ${PYTHON}-pip
+
+RUN ${PIP} --no-cache-dir install --upgrade \
+    pip \
+    setuptools
+
+# Some TF tools expect a "python" binary
+RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
+# Options:
+#   tensorflow
+#   tensorflow-gpu
+#   tf-nightly
+#   tf-nightly-gpu
+ARG TF_PACKAGE=tensorflow
+RUN ${PIP} install ${TF_PACKAGE}
+
+COPY bashrc /etc/bash.bashrc
+RUN chmod a+rwx /etc/bash.bashrc
+
+RUN ${PIP} install jupyter matplotlib
+
+RUN mkdir -p /tf/tensorflow-tutorials && chmod -R a+rwx /tf/
+RUN mkdir /.local && chmod a+rwx /.local
+RUN apt-get install -y --no-install-recommends wget
+WORKDIR /tf/tensorflow-tutorials
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_classification.ipynb
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_text_classification.ipynb
+COPY readme-for-jupyter.md README.md
+RUN apt-get autoremove -y && apt-get remove -y wget
+WORKDIR /tf
+EXPOSE 8888
+
+RUN ${PYTHON} -m ipykernel.kernelspec
+
+CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/tf --ip 0.0.0.0 --no-browser --allow-root"]
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/nvidia.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/gpu.Dockerfile
similarity index 52%
rename from tensorflow/tools/dockerfiles/dockerfiles/nvidia.Dockerfile
rename to tensorflow/tools/dockerfiles/dockerfiles/gpu.Dockerfile
index e0312dbc2949797f3b6af35224bcfe66664c3cbd..80e427f824a186b64031b5325042ba374c9b0021 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/nvidia.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/gpu.Dockerfile
@@ -16,55 +16,42 @@
 # THIS IS A GENERATED DOCKERFILE.
 #
 # This file was assembled from multiple pieces, whose use is documented
-# below. Please refer to the the TensorFlow dockerfiles documentation for
-# more information. Build args are documented as their default value.
-#
-# Ubuntu-based, Nvidia-GPU-enabled environment for using TensorFlow.
-#
-# NVIDIA with CUDA and CuDNN, no dev stuff
-# --build-arg UBUNTU_VERSION=16.04
-#    ( no description )
-#
-# Python is required for TensorFlow and other libraries.
-# --build-arg USE_PYTHON_3_NOT_2=True
-#    Install python 3 over Python 2
-#
-# Install the TensorFlow Python package.
-# --build-arg TF_PACKAGE=tensorflow-gpu (tensorflow|tensorflow-gpu|tf-nightly|tf-nightly-gpu)
-#    The specific TensorFlow Python package to install
-#
-# Configure TensorFlow's shell prompt and login tools.
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
+
+ARG UBUNTU_VERSION=16.04
 
-FROM nvidia/cuda:9.0-base-ubuntu16.04
+FROM nvidia/cuda:10.0-base-ubuntu${UBUNTU_VERSION} as base
 
 # Pick up some TF dependencies
 RUN apt-get update && apt-get install -y --no-install-recommends \
         build-essential \
-        cuda-command-line-tools-9-0 \
-        cuda-cublas-9-0 \
-        cuda-cufft-9-0 \
-        cuda-curand-9-0 \
-        cuda-cusolver-9-0 \
-        cuda-cusparse-9-0 \
-        libcudnn7=7.2.1.38-1+cuda9.0 \
-        libnccl2=2.2.13-1+cuda9.0 \
+        cuda-command-line-tools-10-0 \
+        cuda-cublas-10-0 \
+        cuda-cufft-10-0 \
+        cuda-curand-10-0 \
+        cuda-cusolver-10-0 \
+        cuda-cusparse-10-0 \
+        libcudnn7=7.4.1.5-1+cuda10.0 \
         libfreetype6-dev \
         libhdf5-serial-dev \
         libpng12-dev \
         libzmq3-dev \
         pkg-config \
         software-properties-common \
-        unzip \
-        && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
+        unzip
 
 RUN apt-get update && \
-        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0 && \
-        apt-get update && \
-        apt-get install libnvinfer4=4.1.2-1+cuda9.0
+        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda10.0 \
+        && apt-get update \
+        && apt-get install -y --no-install-recommends libnvinfer5=5.0.2-1+cuda10.0 \
+        && apt-get clean \
+        && rm -rf /var/lib/apt/lists/*
 
-ARG USE_PYTHON_3_NOT_2=True
+# For CUDA profiling, TensorFlow requires CUPTI.
+ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
+
+ARG USE_PYTHON_3_NOT_2
 ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
 ARG PYTHON=python${_PY_SUFFIX}
 ARG PIP=pip${_PY_SUFFIX}
@@ -76,11 +63,19 @@ RUN apt-get update && apt-get install -y \
     ${PYTHON} \
     ${PYTHON}-pip
 
-RUN ${PIP} install --upgrade \
+RUN ${PIP} --no-cache-dir install --upgrade \
     pip \
     setuptools
 
-ARG TF_PACKAGE=tensorflow-gpu
+# Some TF tools expect a "python" binary
+RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
+# Options:
+#   tensorflow
+#   tensorflow-gpu
+#   tf-nightly
+#   tf-nightly-gpu
+ARG TF_PACKAGE=tensorflow
 RUN ${PIP} install ${TF_PACKAGE}
 
 COPY bashrc /etc/bash.bashrc
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/nvidia-devel-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/nvidia-devel-jupyter.Dockerfile
deleted file mode 100644
index 17faa84a682d90000538c913ecac545b7c4b4445..0000000000000000000000000000000000000000
--- a/tensorflow/tools/dockerfiles/dockerfiles/nvidia-devel-jupyter.Dockerfile
+++ /dev/null
@@ -1,129 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-#
-# THIS IS A GENERATED DOCKERFILE.
-#
-# This file was assembled from multiple pieces, whose use is documented
-# below. Please refer to the the TensorFlow dockerfiles documentation for
-# more information. Build args are documented as their default value.
-#
-# Ubuntu-based, Nvidia-GPU-enabled environment for developing changes for TensorFlow, with Jupyter included.
-#
-# Start from Nvidia's Ubuntu base image with CUDA and CuDNN, with TF development
-# packages.
-# --build-arg UBUNTU_VERSION=16.04
-#    ( no description )
-#
-# Python is required for TensorFlow and other libraries.
-# --build-arg USE_PYTHON_3_NOT_2=True
-#    Install python 3 over Python 2
-#
-# Install the latest version of Bazel and Python development tools.
-#
-# Configure TensorFlow's shell prompt and login tools.
-#
-# Launch Jupyter on execution instead of a bash prompt.
-
-ARG UBUNTU_VERSION=16.04
-FROM nvidia/cuda:9.0-base-ubuntu${UBUNTU_VERSION}
-
-RUN apt-get update && apt-get install -y --no-install-recommends \
-        build-essential \
-        cuda-command-line-tools-9-0 \
-        cuda-cublas-dev-9-0 \
-        cuda-cudart-dev-9-0 \
-        cuda-cufft-dev-9-0 \
-        cuda-curand-dev-9-0 \
-        cuda-cusolver-dev-9-0 \
-        cuda-cusparse-dev-9-0 \
-        curl \
-        git \
-        libcudnn7=7.2.1.38-1+cuda9.0 \
-        libcudnn7-dev=7.2.1.38-1+cuda9.0 \
-        libnccl2=2.2.13-1+cuda9.0 \
-        libnccl-dev=2.2.13-1+cuda9.0 \
-        libcurl3-dev \
-        libfreetype6-dev \
-        libhdf5-serial-dev \
-        libpng12-dev \
-        libzmq3-dev \
-        pkg-config \
-        rsync \
-        software-properties-common \
-        unzip \
-        zip \
-        zlib1g-dev \
-        wget \
-        && \
-    rm -rf /var/lib/apt/lists/* && \
-    find /usr/local/cuda-9.0/lib64/ -type f -name 'lib*_static.a' -not -name 'libcudart_static.a' -delete && \
-    rm /usr/lib/x86_64-linux-gnu/libcudnn_static_v7.a
-
-RUN apt-get update && \
-        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0 && \
-        apt-get update && \
-        apt-get install libnvinfer4=4.1.2-1+cuda9.0 && \
-        apt-get install libnvinfer-dev=4.1.2-1+cuda9.0
-
-# Link NCCL libray and header where the build script expects them.
-RUN mkdir /usr/local/cuda-9.0/lib &&  \
-    ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/local/cuda/lib/libnccl.so.2 && \
-    ln -s /usr/include/nccl.h /usr/local/cuda/include/nccl.h
-
-# TODO(tobyboyd): Remove after license is excluded from BUILD file.
-RUN gunzip /usr/share/doc/libnccl2/NCCL-SLA.txt.gz && \
-    cp /usr/share/doc/libnccl2/NCCL-SLA.txt /usr/local/cuda/
-
-ARG USE_PYTHON_3_NOT_2=True
-ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
-ARG PYTHON=python${_PY_SUFFIX}
-ARG PIP=pip${_PY_SUFFIX}
-
-# See http://bugs.python.org/issue19846
-ENV LANG C.UTF-8
-
-RUN apt-get update && apt-get install -y \
-    ${PYTHON} \
-    ${PYTHON}-pip
-
-RUN ${PIP} install --upgrade \
-    pip \
-    setuptools
-
-RUN apt-get update && apt-get install -y \
-    build-essential \
-    curl \
-    git \
-    openjdk-8-jdk \
-    ${PYTHON}-dev \
-    swig
-
-# Install bazel
-RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list && \
-    curl https://bazel.build/bazel-release.pub.gpg | apt-key add - && \
-    apt-get update && \
-    apt-get install -y bazel
-
-COPY bashrc /etc/bash.bashrc
-RUN chmod a+rwx /etc/bash.bashrc
-
-RUN ${PIP} install jupyter
-
-RUN mkdir /notebooks && chmod a+rwx /notebooks
-RUN mkdir /.local && chmod a+rwx /.local
-WORKDIR /notebooks
-EXPOSE 8888
-
-CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/notebooks --ip 0.0.0.0 --no-browser --allow-root"]
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/nvidia-devel.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/nvidia-devel.Dockerfile
deleted file mode 100644
index a3ba02a684ce6be99cddb917b3b069b3631e9436..0000000000000000000000000000000000000000
--- a/tensorflow/tools/dockerfiles/dockerfiles/nvidia-devel.Dockerfile
+++ /dev/null
@@ -1,118 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-#
-# THIS IS A GENERATED DOCKERFILE.
-#
-# This file was assembled from multiple pieces, whose use is documented
-# below. Please refer to the the TensorFlow dockerfiles documentation for
-# more information. Build args are documented as their default value.
-#
-# Ubuntu-based, Nvidia-GPU-enabled environment for developing changes for TensorFlow.
-#
-# Start from Nvidia's Ubuntu base image with CUDA and CuDNN, with TF development
-# packages.
-# --build-arg UBUNTU_VERSION=16.04
-#    ( no description )
-#
-# Python is required for TensorFlow and other libraries.
-# --build-arg USE_PYTHON_3_NOT_2=True
-#    Install python 3 over Python 2
-#
-# Install the latest version of Bazel and Python development tools.
-#
-# Configure TensorFlow's shell prompt and login tools.
-
-ARG UBUNTU_VERSION=16.04
-FROM nvidia/cuda:9.0-base-ubuntu${UBUNTU_VERSION}
-
-RUN apt-get update && apt-get install -y --no-install-recommends \
-        build-essential \
-        cuda-command-line-tools-9-0 \
-        cuda-cublas-dev-9-0 \
-        cuda-cudart-dev-9-0 \
-        cuda-cufft-dev-9-0 \
-        cuda-curand-dev-9-0 \
-        cuda-cusolver-dev-9-0 \
-        cuda-cusparse-dev-9-0 \
-        curl \
-        git \
-        libcudnn7=7.2.1.38-1+cuda9.0 \
-        libcudnn7-dev=7.2.1.38-1+cuda9.0 \
-        libnccl2=2.2.13-1+cuda9.0 \
-        libnccl-dev=2.2.13-1+cuda9.0 \
-        libcurl3-dev \
-        libfreetype6-dev \
-        libhdf5-serial-dev \
-        libpng12-dev \
-        libzmq3-dev \
-        pkg-config \
-        rsync \
-        software-properties-common \
-        unzip \
-        zip \
-        zlib1g-dev \
-        wget \
-        && \
-    rm -rf /var/lib/apt/lists/* && \
-    find /usr/local/cuda-9.0/lib64/ -type f -name 'lib*_static.a' -not -name 'libcudart_static.a' -delete && \
-    rm /usr/lib/x86_64-linux-gnu/libcudnn_static_v7.a
-
-RUN apt-get update && \
-        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0 && \
-        apt-get update && \
-        apt-get install libnvinfer4=4.1.2-1+cuda9.0 && \
-        apt-get install libnvinfer-dev=4.1.2-1+cuda9.0
-
-# Link NCCL libray and header where the build script expects them.
-RUN mkdir /usr/local/cuda-9.0/lib &&  \
-    ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/local/cuda/lib/libnccl.so.2 && \
-    ln -s /usr/include/nccl.h /usr/local/cuda/include/nccl.h
-
-# TODO(tobyboyd): Remove after license is excluded from BUILD file.
-RUN gunzip /usr/share/doc/libnccl2/NCCL-SLA.txt.gz && \
-    cp /usr/share/doc/libnccl2/NCCL-SLA.txt /usr/local/cuda/
-
-ARG USE_PYTHON_3_NOT_2=True
-ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
-ARG PYTHON=python${_PY_SUFFIX}
-ARG PIP=pip${_PY_SUFFIX}
-
-# See http://bugs.python.org/issue19846
-ENV LANG C.UTF-8
-
-RUN apt-get update && apt-get install -y \
-    ${PYTHON} \
-    ${PYTHON}-pip
-
-RUN ${PIP} install --upgrade \
-    pip \
-    setuptools
-
-RUN apt-get update && apt-get install -y \
-    build-essential \
-    curl \
-    git \
-    openjdk-8-jdk \
-    ${PYTHON}-dev \
-    swig
-
-# Install bazel
-RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list && \
-    curl https://bazel.build/bazel-release.pub.gpg | apt-key add - && \
-    apt-get update && \
-    apt-get install -y bazel
-
-COPY bashrc /etc/bash.bashrc
-RUN chmod a+rwx /etc/bash.bashrc
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/nvidia-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/nvidia-jupyter.Dockerfile
deleted file mode 100644
index fbdea4628ad5008de9c4eee5009bca2884c47a2a..0000000000000000000000000000000000000000
--- a/tensorflow/tools/dockerfiles/dockerfiles/nvidia-jupyter.Dockerfile
+++ /dev/null
@@ -1,98 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-#
-# THIS IS A GENERATED DOCKERFILE.
-#
-# This file was assembled from multiple pieces, whose use is documented
-# below. Please refer to the the TensorFlow dockerfiles documentation for
-# more information. Build args are documented as their default value.
-#
-# Ubuntu-based, Nvidia-GPU-enabled environment for using TensorFlow, with Jupyter included.
-#
-# NVIDIA with CUDA and CuDNN, no dev stuff
-# --build-arg UBUNTU_VERSION=16.04
-#    ( no description )
-#
-# Python is required for TensorFlow and other libraries.
-# --build-arg USE_PYTHON_3_NOT_2=True
-#    Install python 3 over Python 2
-#
-# Install the TensorFlow Python package.
-# --build-arg TF_PACKAGE=tensorflow-gpu (tensorflow|tensorflow-gpu|tf-nightly|tf-nightly-gpu)
-#    The specific TensorFlow Python package to install
-#
-# Configure TensorFlow's shell prompt and login tools.
-#
-# Launch Jupyter on execution instead of a bash prompt.
-
-FROM nvidia/cuda:9.0-base-ubuntu16.04
-
-# Pick up some TF dependencies
-RUN apt-get update && apt-get install -y --no-install-recommends \
-        build-essential \
-        cuda-command-line-tools-9-0 \
-        cuda-cublas-9-0 \
-        cuda-cufft-9-0 \
-        cuda-curand-9-0 \
-        cuda-cusolver-9-0 \
-        cuda-cusparse-9-0 \
-        libcudnn7=7.2.1.38-1+cuda9.0 \
-        libnccl2=2.2.13-1+cuda9.0 \
-        libfreetype6-dev \
-        libhdf5-serial-dev \
-        libpng12-dev \
-        libzmq3-dev \
-        pkg-config \
-        software-properties-common \
-        unzip \
-        && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
-RUN apt-get update && \
-        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0 && \
-        apt-get update && \
-        apt-get install libnvinfer4=4.1.2-1+cuda9.0
-
-ARG USE_PYTHON_3_NOT_2=True
-ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
-ARG PYTHON=python${_PY_SUFFIX}
-ARG PIP=pip${_PY_SUFFIX}
-
-# See http://bugs.python.org/issue19846
-ENV LANG C.UTF-8
-
-RUN apt-get update && apt-get install -y \
-    ${PYTHON} \
-    ${PYTHON}-pip
-
-RUN ${PIP} install --upgrade \
-    pip \
-    setuptools
-
-ARG TF_PACKAGE=tensorflow-gpu
-RUN ${PIP} install ${TF_PACKAGE}
-
-COPY bashrc /etc/bash.bashrc
-RUN chmod a+rwx /etc/bash.bashrc
-
-RUN ${PIP} install jupyter
-
-RUN mkdir /notebooks && chmod a+rwx /notebooks
-RUN mkdir /.local && chmod a+rwx /.local
-WORKDIR /notebooks
-EXPOSE 8888
-
-CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/notebooks --ip 0.0.0.0 --no-browser --allow-root"]
diff --git a/tensorflow/tools/dockerfiles/partials/bazel.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/bazel.partial.Dockerfile
deleted file mode 100644
index b08d8bdd14b638b87ac8fbd57cf2b3e8c4564582..0000000000000000000000000000000000000000
--- a/tensorflow/tools/dockerfiles/partials/bazel.partial.Dockerfile
+++ /dev/null
@@ -1,13 +0,0 @@
-RUN apt-get update && apt-get install -y \
-    build-essential \
-    curl \
-    git \
-    openjdk-8-jdk \
-    ${PYTHON}-dev \
-    swig
-
-# Install bazel
-RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list && \
-    curl https://bazel.build/bazel-release.pub.gpg | apt-key add - && \
-    apt-get update && \
-    apt-get install -y bazel
diff --git a/tensorflow/tools/dockerfiles/partials/jupyter.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/jupyter.partial.Dockerfile
index 2c9b9f3f9a081e97c96cedf1bbdf0936a9961d46..c4ec6095c0cae43b9d5756cd4391ca3ddd329fbe 100644
--- a/tensorflow/tools/dockerfiles/partials/jupyter.partial.Dockerfile
+++ b/tensorflow/tools/dockerfiles/partials/jupyter.partial.Dockerfile
@@ -1,8 +1,16 @@
-RUN ${PIP} install jupyter
+RUN ${PIP} install jupyter matplotlib
 
-RUN mkdir /notebooks && chmod a+rwx /notebooks
+RUN mkdir -p /tf/tensorflow-tutorials && chmod -R a+rwx /tf/
 RUN mkdir /.local && chmod a+rwx /.local
-WORKDIR /notebooks
+RUN apt-get install -y --no-install-recommends wget
+WORKDIR /tf/tensorflow-tutorials
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_classification.ipynb
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_text_classification.ipynb
+COPY readme-for-jupyter.md README.md
+RUN apt-get autoremove -y && apt-get remove -y wget
+WORKDIR /tf
 EXPOSE 8888
 
-CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/notebooks --ip 0.0.0.0 --no-browser --allow-root"]
+RUN ${PYTHON} -m ipykernel.kernelspec
+
+CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/tf --ip 0.0.0.0 --no-browser --allow-root"]
diff --git a/tensorflow/tools/dockerfiles/partials/nvidia-devel.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/nvidia-devel.partial.Dockerfile
deleted file mode 100644
index 45159f711fcbdd0e6bb7083169d2abb39ab8dea5..0000000000000000000000000000000000000000
--- a/tensorflow/tools/dockerfiles/partials/nvidia-devel.partial.Dockerfile
+++ /dev/null
@@ -1,49 +0,0 @@
-ARG UBUNTU_VERSION=16.04
-FROM nvidia/cuda:9.0-base-ubuntu${UBUNTU_VERSION}
-
-RUN apt-get update && apt-get install -y --no-install-recommends \
-        build-essential \
-        cuda-command-line-tools-9-0 \
-        cuda-cublas-dev-9-0 \
-        cuda-cudart-dev-9-0 \
-        cuda-cufft-dev-9-0 \
-        cuda-curand-dev-9-0 \
-        cuda-cusolver-dev-9-0 \
-        cuda-cusparse-dev-9-0 \
-        curl \
-        git \
-        libcudnn7=7.2.1.38-1+cuda9.0 \
-        libcudnn7-dev=7.2.1.38-1+cuda9.0 \
-        libnccl2=2.2.13-1+cuda9.0 \
-        libnccl-dev=2.2.13-1+cuda9.0 \
-        libcurl3-dev \
-        libfreetype6-dev \
-        libhdf5-serial-dev \
-        libpng12-dev \
-        libzmq3-dev \
-        pkg-config \
-        rsync \
-        software-properties-common \
-        unzip \
-        zip \
-        zlib1g-dev \
-        wget \
-        && \
-    rm -rf /var/lib/apt/lists/* && \
-    find /usr/local/cuda-9.0/lib64/ -type f -name 'lib*_static.a' -not -name 'libcudart_static.a' -delete && \
-    rm /usr/lib/x86_64-linux-gnu/libcudnn_static_v7.a
-
-RUN apt-get update && \
-        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0 && \
-        apt-get update && \
-        apt-get install libnvinfer4=4.1.2-1+cuda9.0 && \
-        apt-get install libnvinfer-dev=4.1.2-1+cuda9.0
-
-# Link NCCL libray and header where the build script expects them.
-RUN mkdir /usr/local/cuda-9.0/lib &&  \
-    ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/local/cuda/lib/libnccl.so.2 && \
-    ln -s /usr/include/nccl.h /usr/local/cuda/include/nccl.h
-
-# TODO(tobyboyd): Remove after license is excluded from BUILD file.
-RUN gunzip /usr/share/doc/libnccl2/NCCL-SLA.txt.gz && \
-    cp /usr/share/doc/libnccl2/NCCL-SLA.txt /usr/local/cuda/
diff --git a/tensorflow/tools/dockerfiles/partials/nvidia.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/nvidia.partial.Dockerfile
deleted file mode 100644
index 1064390af3b5006a8e539ad2b006d692e51692ae..0000000000000000000000000000000000000000
--- a/tensorflow/tools/dockerfiles/partials/nvidia.partial.Dockerfile
+++ /dev/null
@@ -1,28 +0,0 @@
-FROM nvidia/cuda:9.0-base-ubuntu16.04
-
-# Pick up some TF dependencies
-RUN apt-get update && apt-get install -y --no-install-recommends \
-        build-essential \
-        cuda-command-line-tools-9-0 \
-        cuda-cublas-9-0 \
-        cuda-cufft-9-0 \
-        cuda-curand-9-0 \
-        cuda-cusolver-9-0 \
-        cuda-cusparse-9-0 \
-        libcudnn7=7.2.1.38-1+cuda9.0 \
-        libnccl2=2.2.13-1+cuda9.0 \
-        libfreetype6-dev \
-        libhdf5-serial-dev \
-        libpng12-dev \
-        libzmq3-dev \
-        pkg-config \
-        software-properties-common \
-        unzip \
-        && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
-RUN apt-get update && \
-        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0 && \
-        apt-get update && \
-        apt-get install libnvinfer4=4.1.2-1+cuda9.0
diff --git a/tensorflow/tools/dockerfiles/partials/tensorflow.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/tensorflow.partial.Dockerfile
index 96e79547f0c67c232565019e0ae64d24d55d1516..76758bd147ef9d52b3db072bd0091190e132667c 100644
--- a/tensorflow/tools/dockerfiles/partials/tensorflow.partial.Dockerfile
+++ b/tensorflow/tools/dockerfiles/partials/tensorflow.partial.Dockerfile
@@ -1,2 +1,7 @@
-ARG TF_PACKAGE
+# Options:
+#   tensorflow
+#   tensorflow-gpu
+#   tf-nightly
+#   tf-nightly-gpu
+ARG TF_PACKAGE=tensorflow
 RUN ${PIP} install ${TF_PACKAGE}
diff --git a/tensorflow/tools/dockerfiles/partials/test-import.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/test-import.partial.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu.partial.Dockerfile
deleted file mode 100644
index 0a50735bf83364446919254010f0acab0e26404c..0000000000000000000000000000000000000000
--- a/tensorflow/tools/dockerfiles/partials/ubuntu.partial.Dockerfile
+++ /dev/null
@@ -1,2 +0,0 @@
-ARG UBUNTU_VERSION=16.04
-FROM ubuntu:${UBUNTU_VERSION}
diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu/bazel.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/bazel.partial.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..855a01c379b3c3b26f0bd50b3b3513cdf363f135
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/partials/ubuntu/bazel.partial.Dockerfile
@@ -0,0 +1,31 @@
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    git \
+    wget \
+    openjdk-8-jdk \
+    ${PYTHON}-dev \
+    swig
+
+RUN ${PIP} --no-cache-dir install \
+    Pillow \
+    h5py \
+    keras_applications \
+    keras_preprocessing \
+    matplotlib \
+    mock \
+    numpy \
+    scipy \
+    sklearn \
+    pandas \
+    && test "${USE_PYTHON_3_NOT_2}" -eq 1 && true || ${PIP} --no-cache-dir install \
+    enum34
+
+# Install bazel
+ARG BAZEL_VERSION=0.19.2
+RUN mkdir /bazel && \
+    wget -O /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
+    wget -O /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
+    chmod +x /bazel/installer.sh && \
+    /bazel/installer.sh && \
+    rm -f /bazel/installer.sh
diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu/cpu.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/cpu.partial.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..d01b26e27f6ffb35affb95f8e40b7ce3b8e52d0a
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/partials/ubuntu/cpu.partial.Dockerfile
@@ -0,0 +1 @@
+FROM ubuntu:${UBUNTU_VERSION} as base
diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu-devel.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/devel-cpu.partial.Dockerfile
similarity index 64%
rename from tensorflow/tools/dockerfiles/partials/ubuntu-devel.partial.Dockerfile
rename to tensorflow/tools/dockerfiles/partials/ubuntu/devel-cpu.partial.Dockerfile
index bc792722766e07d1af3d6944f14a8eb26f43dc1a..a61dfbbe54eb163b25160490f3ee245c36d21ffe 100644
--- a/tensorflow/tools/dockerfiles/partials/ubuntu-devel.partial.Dockerfile
+++ b/tensorflow/tools/dockerfiles/partials/ubuntu/devel-cpu.partial.Dockerfile
@@ -1,5 +1,4 @@
-ARG UBUNTU_VERSION=16.04
-FROM ubuntu:${UBUNTU_VERSION}
+FROM ubuntu:${UBUNTU_VERSION} AS base
 
 RUN apt-get update && apt-get install -y --no-install-recommends \
         build-essential \
@@ -11,7 +10,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         libpng12-dev \
         libzmq3-dev \
         pkg-config \
-        python-dev \
         rsync \
         software-properties-common \
         unzip \
@@ -22,3 +20,9 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
+ 
+ENV CI_BUILD_PYTHON python
+
+# Check out TensorFlow source code if --build_arg CHECKOUT_TENSORFLOW=1
+ARG CHECKOUT_TF_SRC=0
+RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src
diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu/devel-nvidia.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/devel-nvidia.partial.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..2b4494ac5955f828b519bb2a61db72f91dace6ef
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/partials/ubuntu/devel-nvidia.partial.Dockerfile
@@ -0,0 +1,48 @@
+FROM nvidia/cuda:10.0-base-ubuntu${UBUNTU_VERSION} as base
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        cuda-command-line-tools-10-0 \
+        cuda-cublas-dev-10-0 \
+        cuda-cudart-dev-10-0 \
+        cuda-cufft-dev-10-0 \
+        cuda-curand-dev-10-0 \
+        cuda-cusolver-dev-10-0 \
+        cuda-cusparse-dev-10-0 \
+        libcudnn7=7.4.1.5-1+cuda10.0 \
+        libcudnn7-dev=7.4.1.5-1+cuda10.0 \
+        libcurl3-dev \
+        libfreetype6-dev \
+        libhdf5-serial-dev \
+        libpng12-dev \
+        libzmq3-dev \
+        pkg-config \
+        rsync \
+        software-properties-common \
+        unzip \
+        zip \
+        zlib1g-dev \
+        wget \
+        git \
+        && \
+    find /usr/local/cuda-10.0/lib64/ -type f -name 'lib*_static.a' -not -name 'libcudart_static.a' -delete && \
+    rm /usr/lib/x86_64-linux-gnu/libcudnn_static_v7.a
+
+RUN apt-get update && \
+        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda10.0 \
+        && apt-get update \
+        && apt-get install -y --no-install-recommends libnvinfer-dev=5.0.2-1+cuda10.0 \
+        && rm -rf /var/lib/apt/lists/*
+
+# Configure the build for our CUDA configuration.
+ENV CI_BUILD_PYTHON python
+ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
+ENV TF_NEED_CUDA 1
+ENV TF_NEED_TENSORRT 1
+ENV TF_CUDA_COMPUTE_CAPABILITIES=3.5,5.2,6.0,6.1,7.0
+ENV TF_CUDA_VERSION=10.0
+ENV TF_CUDNN_VERSION=7
+
+# Check out TensorFlow source code if --build_arg CHECKOUT_TENSORFLOW=1
+ARG CHECKOUT_TF_SRC=0
+RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src
diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu/nvidia.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/nvidia.partial.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..a6393a3280c6eb7cf2d356b02734865be8eb5a04
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/partials/ubuntu/nvidia.partial.Dockerfile
@@ -0,0 +1,29 @@
+FROM nvidia/cuda:10.0-base-ubuntu${UBUNTU_VERSION} as base
+
+# Pick up some TF dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        cuda-command-line-tools-10-0 \
+        cuda-cublas-10-0 \
+        cuda-cufft-10-0 \
+        cuda-curand-10-0 \
+        cuda-cusolver-10-0 \
+        cuda-cusparse-10-0 \
+        libcudnn7=7.4.1.5-1+cuda10.0 \
+        libfreetype6-dev \
+        libhdf5-serial-dev \
+        libpng12-dev \
+        libzmq3-dev \
+        pkg-config \
+        software-properties-common \
+        unzip
+
+RUN apt-get update && \
+        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda10.0 \
+        && apt-get update \
+        && apt-get install -y --no-install-recommends libnvinfer5=5.0.2-1+cuda10.0 \
+        && apt-get clean \
+        && rm -rf /var/lib/apt/lists/*
+
+# For CUDA profiling, TensorFlow requires CUPTI.
+ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
diff --git a/tensorflow/tools/dockerfiles/partials/python.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/python.partial.Dockerfile
similarity index 66%
rename from tensorflow/tools/dockerfiles/partials/python.partial.Dockerfile
rename to tensorflow/tools/dockerfiles/partials/ubuntu/python.partial.Dockerfile
index ee08af73a8e3bdd50537209c6624f4c143da9ad7..6af473195380801bded2e6849e97432caf07816b 100644
--- a/tensorflow/tools/dockerfiles/partials/python.partial.Dockerfile
+++ b/tensorflow/tools/dockerfiles/partials/ubuntu/python.partial.Dockerfile
@@ -10,6 +10,9 @@ RUN apt-get update && apt-get install -y \
     ${PYTHON} \
     ${PYTHON}-pip
 
-RUN ${PIP} install --upgrade \
+RUN ${PIP} --no-cache-dir install --upgrade \
     pip \
     setuptools
+
+# Some TF tools expect a "python" binary
+RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 
diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu/test-devel.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/test-devel.partial.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu/version.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/version.partial.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..6ecd2b8b1acd59e50c172c3fc9c5574626ed5608
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/partials/ubuntu/version.partial.Dockerfile
@@ -0,0 +1 @@
+ARG UBUNTU_VERSION=16.04
diff --git a/tensorflow/tools/dockerfiles/readme-for-jupyter.md b/tensorflow/tools/dockerfiles/readme-for-jupyter.md
new file mode 100644
index 0000000000000000000000000000000000000000..f104a7533b884bea06c46e9670d07d92bca87ea1
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/readme-for-jupyter.md
@@ -0,0 +1,3 @@
+Want more tutorials like these?
+
+Check out tensorflow.org/tutorials!
diff --git a/tensorflow/tools/dockerfiles/spec.yml b/tensorflow/tools/dockerfiles/spec.yml
index 28bf9a55da123a0a45cd4b0e54971f14c355d794..19d96e7a3df4468ff82f2029a1945a02b1e58932 100644
--- a/tensorflow/tools/dockerfiles/spec.yml
+++ b/tensorflow/tools/dockerfiles/spec.yml
@@ -1,195 +1,148 @@
-# ======
-# HEADER
-# ======
-#
-# This is commented-out and prepended to each generated Dockerfile.
 header: |
-    Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License");
-    you may not use this file except in compliance with the License.
-    You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software
-    distributed under the License is distributed on an "AS IS" BASIS,
-    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-    See the License for the specific language governing permissions and
-    limitations under the License.
-    ============================================================================
-
-    THIS IS A GENERATED DOCKERFILE.
-
-    This file was assembled from multiple pieces, whose use is documented
-    below. Please refer to the the TensorFlow dockerfiles documentation for
-    more information. Build args are documented as their default value.
-
-# ========
-# PARTIALS
-# ========
+    # Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+    #
+    # Licensed under the Apache License, Version 2.0 (the "License");
+    # you may not use this file except in compliance with the License.
+    # You may obtain a copy of the License at
+    #
+    #     http://www.apache.org/licenses/LICENSE-2.0
+    #
+    # Unless required by applicable law or agreed to in writing, software
+    # distributed under the License is distributed on an "AS IS" BASIS,
+    # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    # See the License for the specific language governing permissions and
+    # limitations under the License.
+    # ============================================================================
+    #
+    # THIS IS A GENERATED DOCKERFILE.
+    #
+    # This file was assembled from multiple pieces, whose use is documented
+    # throughout. Please refer to the TensorFlow dockerfiles documentation
+    # for more information.
+
+# A combinatorial explosion of Docker images and Dockerfiles.
+# Each "release" defines all of the ways to combine related but separate chunks
+# of functionality ("slices") by listing all of the "slice sets" to use when
+# building.
 #
-# Represent and document pieces of a Dockerfile. Spec:
-# 
-# name: the name of the partial, is referenced from the images section
-#   desc: A description, inserted later into the Dockerfile
-#   file: Alternative file prefix, e.g. file.partial.Dockerfile. The default is
-#         the name of the partial.
-#   args: A dict of ARGs in the Dockerfile; each entry has the format
-#      ARG_NAME: VALUE where VALUE is one of:
-#         - a dict:
-#             desc: Documentation for the arg
-#             default: Default value for the arg; is written to the Dockerfile
-#             options: List of strings, part of documentation
-#         - a concrete value: the same as a dictionary with default: [value].
-
-partials:
-    ubuntu:
-        desc: Start from Ubuntu (no GPU support)
-        args:
-            UBUNTU_VERSION: 16.04
-
-    ubuntu-devel:
-        desc: Start from Ubuntu, with TF development packages (no GPU support)
-        args:
-            UBUNTU_VERSION: 16.04
-
-    bazel:
-        desc: Install the latest version of Bazel and Python development tools.
-
-    nvidia:
-        desc: NVIDIA with CUDA and CuDNN, no dev stuff
-        args:
-            UBUNTU_VERSION: 16.04
-
-    nvidia-devel:
-        desc: >
-            Start from Nvidia's Ubuntu base image with CUDA and CuDNN, with TF
-            development packages.
-        args:
-            UBUNTU_VERSION: 16.04
+# For example, a release that uses {nightly}{py} would create 4 Dockerfiles
+# (which could become images or concrete Dockerfiles), because the "nightly"
+# and "py" slice sets both have two entries:
+#
+#   - nightly (no -py2 because the Python 2 slice set has add_to_name: ""
+#   - nightly-py3
+#   - nightly-gpu (similar)
+#   - nightly-gpu-py3
+#
+# Releases are all treated differently by TensorFlow's CI systems.
+releases:
+    # Built Nightly and pushed to tensorflow/tensorflow
+    nightly:
+        tag_specs:
+            - "{nightly}{py}{jupyter}"
+            - "{ubuntu-devel}{py}"
+
+    # Built per-release and pushed to tensorflow/tensorflow
+    # --arg _TAG_PREFIX=<val> should be set to "1.11" (for example) or "latest".
+    versioned:
+        tag_specs:
+            - "{_TAG_PREFIX}{ubuntu}{py}{jupyter}"
+
+    # Dockerfiles stored in the TF repo; not pushed anywhere
+    dockerfiles:
+        is_dockerfiles: true
+        upload_images: false
+        tag_specs:
+            - "{ubuntu}{jupyter}"
+            - "{ubuntu-devel}{jupyter}"
+
+slice_sets:
+
+    py:
+        - add_to_name: ""
+          args:
+              - USE_PYTHON_3_NOT_2=
+        - add_to_name: "-py3"
+          args:
+              - USE_PYTHON_3_NOT_2=1
 
-    python:
-        desc: Python is required for TensorFlow and other libraries.
-        args:
-            USE_PYTHON_3_NOT_2:
-                default: true
-                desc: Install python 3 over Python 2
-                
-    tensorflow:
-        desc: Install the TensorFlow Python package.
-        args:
-            TF_PACKAGE:
-                default: tensorflow
-                options:
-                    - tensorflow
-                    - tensorflow-gpu
-                    - tf-nightly
-                    - tf-nightly-gpu
-                desc: The specific TensorFlow Python package to install
-    shell:
-        desc: Configure TensorFlow's shell prompt and login tools.
     jupyter:
-        desc: Launch Jupyter on execution instead of a bash prompt.
-
-# ======
-# IMAGES
-# ======
-# 
-# Represent Dockerfiles. Spec:
-# 
-# name: the name of the image, possibly referenced by other images
-#   desc: A description, inserted later into the Dockerfile
-#   create-dockerfile: Create a dockerfile based on this. Useful for creating
-#      extensible base images that don't need a file. Default is true.
-#   partials: List of VALUEs, where a VALUE is either:
-#      - the name of a partial, which inserts that partial into this image
-#      - image: [name of another image], which inserts the partials from that
-#        image into this image
-#   arg-defaults: List of VALUEs, where a VALUE is either:
-#      - ARG_NAME: VALUE, which sets the ARG_NAME to VALUE wherever it appears
-#        in this image's partials
-#      - [name of another image], which loads the default args from that image
-images:
-
-    nodev:
-        create-dockerfile: false
-        partials:
-            - python
-            - tensorflow
-            - shell
-
-    dev:
-        create-dockerfile: false
-        partials:
-            - python
-            - bazel
-            - shell
-
-    cpu:
-      desc: Ubuntu-based, CPU-only environment for using TensorFlow
-      partials:
-        - ubuntu
-        - image: nodev
-
-    cpu-devel:
-      desc: >
-          Ubuntu-based, CPU-only environment for developing changes for
-          TensorFlow.
-      partials:
-        - ubuntu-devel
-        - image: dev
+        - add_to_name: ""
+        - add_to_name: "-jupyter"
+          partials:
+              - jupyter
 
-    nvidia:
-      desc: Ubuntu-based, Nvidia-GPU-enabled environment for using TensorFlow.
-      arg-defaults: 
-        - TF_PACKAGE: tensorflow-gpu
-      partials:
-        - nvidia
-        - image: nodev
-
-    nvidia-devel:
-      desc: >
-          Ubuntu-based, Nvidia-GPU-enabled environment for developing changes
-          for TensorFlow.
-      arg-defaults: 
-        - TF_PACKAGE: tensorflow-gpu
-      partials:
-        - nvidia-devel
-        - image: dev
-
-    cpu-jupyter:
-      desc: >
-          Ubuntu-based, CPU-only environment for using TensorFlow, with Jupyter
-          included.
-      partials:
-        - image: cpu
-        - jupyter
-
-    cpu-devel-jupyter:
-      desc: >
-         Ubuntu-based, CPU-only environment for developing changes for
-         TensorFlow, with Jupyter included.
-      partials:
-        - image: cpu-devel
-        - jupyter
-
-    nvidia-jupyter:
-      desc: >
-        Ubuntu-based, Nvidia-GPU-enabled environment for using TensorFlow, with
-        Jupyter included.
-      arg-defaults: 
-        - nvidia
-      partials:
-        - image: nvidia
-        - jupyter
+    ubuntu:
+        - add_to_name: ""
+          dockerfile_exclusive_name: "cpu"
+          partials:
+              - ubuntu/version
+              - ubuntu/cpu
+              - ubuntu/python
+              - tensorflow
+              - shell
+        - add_to_name: "-gpu"
+          dockerfile_exclusive_name: "gpu"
+          args:
+              - TF_PACKAGE=tensorflow-gpu
+          partials:
+              - ubuntu/version
+              - ubuntu/nvidia
+              - ubuntu/python
+              - tensorflow
+              - shell
+          tests:
+              - import-gpu.sh
+          test_runtime: nvidia
 
-    nvidia-devel-jupyter:
-      desc: >
-        Ubuntu-based, Nvidia-GPU-enabled environment for developing changes for
-        TensorFlow, with Jupyter included.
-      arg-defaults: 
-        - nvidia-devel
-      partials:
-        - image: nvidia-devel
-        - jupyter
+    ubuntu-devel:
+        - add_to_name: "devel"
+          dockerfile_exclusive_name: "devel-cpu"
+          partials:
+              - ubuntu/version
+              - ubuntu/devel-cpu
+              - ubuntu/python
+              - ubuntu/bazel
+              - shell
+          tests:
+              - build-cpu.sh
+          args:
+              - CHECKOUT_TF_SRC=1
+        - add_to_name: "devel-gpu"
+          dockerfile_exclusive_name: "devel-gpu"
+          partials:
+              - ubuntu/version
+              - ubuntu/devel-nvidia
+              - ubuntu/python
+              - ubuntu/bazel
+              - shell
+          tests:
+              - build-gpu.sh
+          test_runtime: nvidia
+          args:
+              - CHECKOUT_TF_SRC=1
+
+    nightly:
+        - add_to_name: "nightly"
+          partials:
+              - ubuntu/version
+              - ubuntu/cpu
+              - ubuntu/python
+              - tensorflow
+              - shell
+          args:
+              - TF_PACKAGE=tf-nightly
+          tests:
+              - import.sh
+        - add_to_name: "nightly-gpu"
+          partials:
+              - ubuntu/version
+              - ubuntu/nvidia
+              - ubuntu/python
+              - tensorflow
+              - shell
+          test_runtime: nvidia
+          tests:
+              - import-gpu.sh
+          args:
+              - TF_PACKAGE=tf-nightly-gpu
diff --git a/tensorflow/tools/dockerfiles/tests/build-cpu.sh b/tensorflow/tools/dockerfiles/tests/build-cpu.sh
new file mode 100755
index 0000000000000000000000000000000000000000..bcdc4c2139c83e65c15998d3dd6be2f29e27bff3
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/tests/build-cpu.sh
@@ -0,0 +1,37 @@
+#!/usr/bin/env bash
+
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+# Download and build TensorFlow.
+set -euxo pipefail
+git clone --branch=master --depth=1 https://github.com/tensorflow/tensorflow.git /tensorflow
+cd /tensorflow
+
+ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
+# For optimized builds appropriate for the hardware platform of your choosing, uncomment below...
+# For ivy-bridge or sandy-bridge
+# --copt=-march="ivybridge" \
+# for haswell, broadwell, or skylake
+# --copt=-march="haswell" \
+tensorflow/tools/ci_build/builds/configured CPU \
+  bazel build -c opt --copt=-mavx --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0" \
+      tensorflow/tools/pip_package:build_pip_package && \
+  bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/pip && \
+  pip --no-cache-dir install --upgrade /tmp/pip/tensorflow-*.whl && \
+  rm -rf /tmp/pip && \
+  rm -rf /root/.cache
+
diff --git a/tensorflow/tools/dockerfiles/tests/build-gpu.sh b/tensorflow/tools/dockerfiles/tests/build-gpu.sh
new file mode 100755
index 0000000000000000000000000000000000000000..76b25d5a7419b9a07a6799f14fa5175fb6fa36d5
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/tests/build-gpu.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+# Download and build TensorFlow.
+set -euxo pipefail
+git clone --branch=master --depth=1 https://github.com/tensorflow/tensorflow.git /tensorflow
+cd /tensorflow
+
+ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
+ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1
+
+LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:${LD_LIBRARY_PATH} \
+tensorflow/tools/ci_build/builds/configured GPU \
+bazel build -c opt --copt=-mavx --config=cuda \
+    --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0" \
+    tensorflow/tools/pip_package:build_pip_package && \
+rm /usr/local/cuda/lib64/stubs/libcuda.so.1 && \
+bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/pip && \
+pip --no-cache-dir install --upgrade /tmp/pip/tensorflow-*.whl && \
+rm -rf /tmp/pip && \
+rm -rf /root/.cache
diff --git a/tensorflow/tools/dockerfiles/tests/import-gpu.sh b/tensorflow/tools/dockerfiles/tests/import-gpu.sh
new file mode 100755
index 0000000000000000000000000000000000000000..6559210dcbfbb5fe3c76c369c5ae211920f46d15
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/tests/import-gpu.sh
@@ -0,0 +1,18 @@
+#!/usr/bin/env bash
+
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+python -c 'import tensorflow as tf; tf.test.is_gpu_available() or exit(1)'
diff --git a/tensorflow/tools/dockerfiles/tests/import.sh b/tensorflow/tools/dockerfiles/tests/import.sh
new file mode 100755
index 0000000000000000000000000000000000000000..b73bd86a8529e2b7634f0b027196b978f8245da0
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/tests/import.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+set -euxo pipefail
+python -c 'import tensorflow as tf'
diff --git a/tensorflow/tools/dockerfiles/assembler.Dockerfile b/tensorflow/tools/dockerfiles/tools.Dockerfile
similarity index 95%
rename from tensorflow/tools/dockerfiles/assembler.Dockerfile
rename to tensorflow/tools/dockerfiles/tools.Dockerfile
index 7a8e07fced3465e188f47727013fa92d14424c7c..e8929295a5ee397acbe46ebf96894174ca01fca2 100644
--- a/tensorflow/tools/dockerfiles/assembler.Dockerfile
+++ b/tensorflow/tools/dockerfiles/tools.Dockerfile
@@ -20,8 +20,9 @@
 FROM debian:stretch
 LABEL maintainer="Austin Anderson <angerson@google.com>"
 
-RUN apt-get update && apt-get install -y python3 python3-pip bash
-RUN pip3 install --upgrade pip setuptools pyyaml absl-py cerberus
+RUN apt-get update && apt-get install -y python3 python3-pip bash curl
+RUN curl -sSL https://get.docker.com/ | sh
+RUN pip3 install --upgrade pip setuptools pyyaml absl-py cerberus docker
 
 WORKDIR /tf
 VOLUME ["/tf"]
diff --git a/tensorflow/tools/docs/generate2.py b/tensorflow/tools/docs/generate2.py
index 3dfd0dca33d34fae678be60665556e9ad241611d..fba909d26defffad2d7dbaffa4463695685ae50c 100644
--- a/tensorflow/tools/docs/generate2.py
+++ b/tensorflow/tools/docs/generate2.py
@@ -47,20 +47,24 @@ flags.DEFINE_string(
     "output_dir", "/tmp/out",
     "A directory, where the docs will be output to.")
 
+flags.DEFINE_bool("search_hints", True,
+                  "Include meta-data search hints at the top of each file.")
 
-def build_docs(output_dir, code_url_prefix):
+
+def build_docs(output_dir, code_url_prefix, search_hints=True):
   """Build api docs for tensorflow v2.
 
   Args:
     output_dir: A string path, where to put the files.
     code_url_prefix: prefix for "Defined in" links.
+    search_hints: Bool. Include meta-data search hints at the top of each file.
   """
   base_dir = path.dirname(tf.__file__)
   doc_generator = generate_lib.DocGenerator(
       root_title="TensorFlow 2.0 Preview",
       py_modules=[("tf", tf)],
       base_dir=base_dir,
-      search_hints=True,
+      search_hints=search_hints,
       code_url_prefix=code_url_prefix,
       site_path="api_docs/")
 
@@ -70,7 +74,8 @@ def build_docs(output_dir, code_url_prefix):
 def main(argv):
   del argv
   build_docs(output_dir=FLAGS.output_dir,
-             code_url_prefix=FLAGS.code_url_prefix)
+             code_url_prefix=FLAGS.code_url_prefix,
+             search_hints=FLAGS.search_hints)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/tools/graph_transforms/backports.cc b/tensorflow/tools/graph_transforms/backports.cc
index 5c153e8cefc900728c78340dd43a56737d887b21..041e7eedfb7a38f0eeb7ec17b44c92010041dc29 100644
--- a/tensorflow/tools/graph_transforms/backports.cc
+++ b/tensorflow/tools/graph_transforms/backports.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/core/graph/subgraph.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/flatten_atrous.cc b/tensorflow/tools/graph_transforms/flatten_atrous.cc
index a6f7cb0ed8b45dc537b6fe8c7b9d7e09685feef9..c80b28fbbca7e3d29f5abdef30a130934f17c9c0 100644
--- a/tensorflow/tools/graph_transforms/flatten_atrous.cc
+++ b/tensorflow/tools/graph_transforms/flatten_atrous.cc
@@ -18,7 +18,6 @@ limitations under the License.
 #include "tensorflow/core/graph/subgraph.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/fold_batch_norms.cc b/tensorflow/tools/graph_transforms/fold_batch_norms.cc
index 975b17380f6ca7fbd94783c6226f54c89e730cde..16a0f7d58df66be06224d58de623ee7e2dc41880 100644
--- a/tensorflow/tools/graph_transforms/fold_batch_norms.cc
+++ b/tensorflow/tools/graph_transforms/fold_batch_norms.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/core/graph/subgraph.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/fold_constants_lib.cc b/tensorflow/tools/graph_transforms/fold_constants_lib.cc
index 6df2718e61074daab7bdfd75ca923035ffe5fba4..dcc36b1a8557cf30ac030302fcb7545da55c7886 100644
--- a/tensorflow/tools/graph_transforms/fold_constants_lib.cc
+++ b/tensorflow/tools/graph_transforms/fold_constants_lib.cc
@@ -33,7 +33,6 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc b/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
index 156636ab8215d9abdc9e0ed461df550f1c7ed09c..fd546f812c0dafc5d2e71c94710c3c3f5b75250e 100644
--- a/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
+++ b/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/core/graph/subgraph.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/fuse_convolutions.cc b/tensorflow/tools/graph_transforms/fuse_convolutions.cc
index df6e9e6dc2864872fa8f30741735a7d5985a3104..7754dde9c68753ea648ce31e0f87329826e10828 100644
--- a/tensorflow/tools/graph_transforms/fuse_convolutions.cc
+++ b/tensorflow/tools/graph_transforms/fuse_convolutions.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/core/graph/subgraph.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/fuse_quantized_convolution.cc b/tensorflow/tools/graph_transforms/fuse_quantized_convolution.cc
index bd021d094efcea5ca5f512929d1b84e933a17d84..5aa2dd4f99b89f0ea03fe69db854c55f3f2f3c38 100644
--- a/tensorflow/tools/graph_transforms/fuse_quantized_convolution.cc
+++ b/tensorflow/tools/graph_transforms/fuse_quantized_convolution.cc
@@ -25,7 +25,6 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/fold_constants_lib.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
diff --git a/tensorflow/tools/graph_transforms/insert_logging.cc b/tensorflow/tools/graph_transforms/insert_logging.cc
index 377665448c244aeace78f231ba0c263613afd9a0..ccc48540eb9731514ecbff41de86df956ff91a3b 100644
--- a/tensorflow/tools/graph_transforms/insert_logging.cc
+++ b/tensorflow/tools/graph_transforms/insert_logging.cc
@@ -22,7 +22,6 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/obfuscate_names.cc b/tensorflow/tools/graph_transforms/obfuscate_names.cc
index c470b51b96096a36eacdc67a74431ec02e0515d0..ee8ca3d097d71fef91d0ee50057ff6d215891596 100644
--- a/tensorflow/tools/graph_transforms/obfuscate_names.cc
+++ b/tensorflow/tools/graph_transforms/obfuscate_names.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/core/graph/subgraph.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/quantize_nodes.cc b/tensorflow/tools/graph_transforms/quantize_nodes.cc
index a022f5792676c62c52fd1197b0d8c436f7161a47..b139dad2ddd13ade70a4563a50b0db2db298ef36 100644
--- a/tensorflow/tools/graph_transforms/quantize_nodes.cc
+++ b/tensorflow/tools/graph_transforms/quantize_nodes.cc
@@ -23,7 +23,6 @@ limitations under the License.
 #include "tensorflow/core/kernels/quantization_utils.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/quantize_weights.cc b/tensorflow/tools/graph_transforms/quantize_weights.cc
index cccae8a992a64b0f49798eda71513a2fe62ad656..a1a6e27171ee5a48dec91d64a3b15f6caa88dbf8 100644
--- a/tensorflow/tools/graph_transforms/quantize_weights.cc
+++ b/tensorflow/tools/graph_transforms/quantize_weights.cc
@@ -23,7 +23,6 @@ limitations under the License.
 #include "tensorflow/core/kernels/quantization_utils.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/remove_attribute.cc b/tensorflow/tools/graph_transforms/remove_attribute.cc
index b1a04c0f283bf6bc03da702447694558c5b98538..0a76c2b2052a2c26ee66691b361fff2be70bbf30 100644
--- a/tensorflow/tools/graph_transforms/remove_attribute.cc
+++ b/tensorflow/tools/graph_transforms/remove_attribute.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/core/graph/subgraph.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/remove_device.cc b/tensorflow/tools/graph_transforms/remove_device.cc
index 975fa3706335dd38e4f0992ff4c155addfc5e6a9..fdd43168a117b89884187e6b7a29e5f44f14fd33 100644
--- a/tensorflow/tools/graph_transforms/remove_device.cc
+++ b/tensorflow/tools/graph_transforms/remove_device.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/core/graph/subgraph.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/remove_nodes.cc b/tensorflow/tools/graph_transforms/remove_nodes.cc
index 05f036a86a09b2a6a94e9c1a1220803eabc64da5..aa0288689d9e093a39e8aa6b9156bac19ef40491 100644
--- a/tensorflow/tools/graph_transforms/remove_nodes.cc
+++ b/tensorflow/tools/graph_transforms/remove_nodes.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/core/graph/subgraph.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/rename_attribute.cc b/tensorflow/tools/graph_transforms/rename_attribute.cc
index bd066aab5b9ab69a38e313c0b0437457b3a2bb52..62897d43a8ca774418c7b45c1f886cd8cd7fd850 100644
--- a/tensorflow/tools/graph_transforms/rename_attribute.cc
+++ b/tensorflow/tools/graph_transforms/rename_attribute.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/core/graph/subgraph.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/rename_op.cc b/tensorflow/tools/graph_transforms/rename_op.cc
index e1e13c1be43a531355e5df4530183bd55836fe4c..9deee8bbffbbda41c1e59480c5e642d4c6ce1de9 100644
--- a/tensorflow/tools/graph_transforms/rename_op.cc
+++ b/tensorflow/tools/graph_transforms/rename_op.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/core/graph/subgraph.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/round_weights.cc b/tensorflow/tools/graph_transforms/round_weights.cc
index 72927e439b7f4177a8db035d022ba450a924ad98..3a145ac1f6b0ef238383f4eb75dd5de023503c47 100644
--- a/tensorflow/tools/graph_transforms/round_weights.cc
+++ b/tensorflow/tools/graph_transforms/round_weights.cc
@@ -23,7 +23,6 @@ limitations under the License.
 #include "tensorflow/core/kernels/quantization_utils.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/sort_by_execution_order.cc b/tensorflow/tools/graph_transforms/sort_by_execution_order.cc
index 43152d20fcc1aa477983c8d792dcab2e74664e73..548f5ba4820a82718676d995cbd7a09332051bf4 100644
--- a/tensorflow/tools/graph_transforms/sort_by_execution_order.cc
+++ b/tensorflow/tools/graph_transforms/sort_by_execution_order.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/core/graph/subgraph.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/sparsify_gather.cc b/tensorflow/tools/graph_transforms/sparsify_gather.cc
index cc82100148117c7846ba5781e1a97e172ad7f03c..bed51f89821032862ec3d24077cb51d9c676be94 100644
--- a/tensorflow/tools/graph_transforms/sparsify_gather.cc
+++ b/tensorflow/tools/graph_transforms/sparsify_gather.cc
@@ -25,7 +25,6 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/core/util/tensor_bundle/tensor_bundle.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
diff --git a/tensorflow/tools/graph_transforms/strip_unused_nodes.cc b/tensorflow/tools/graph_transforms/strip_unused_nodes.cc
index ae9d0aa20999c86fe2ea8902204604807f0f298c..d466f21c17ddfec9c0b0181f844b1b608f95246a 100644
--- a/tensorflow/tools/graph_transforms/strip_unused_nodes.cc
+++ b/tensorflow/tools/graph_transforms/strip_unused_nodes.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/core/graph/subgraph.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/lib_package/BUILD b/tensorflow/tools/lib_package/BUILD
index 1cac5ee138316cd2f9839d2c67648d7d0703a398..1186189844aa887ba011b532df3a73d89ffe52b8 100644
--- a/tensorflow/tools/lib_package/BUILD
+++ b/tensorflow/tools/lib_package/BUILD
@@ -169,7 +169,6 @@ genrule(
         "grpc",
         [
             "@grpc//:LICENSE",
-            "@grpc//third_party/nanopb:LICENSE.txt",
             "@grpc//third_party/address_sorting:LICENSE",
         ],
     ),
@@ -208,7 +207,6 @@ genrule(
         "@zlib_archive//:zlib.h",
         "@grpc//:LICENSE",
         "@grpc//third_party/address_sorting:LICENSE",
-        "@grpc//third_party/nanopb:LICENSE.txt",
     ] + select({
         "//tensorflow:android": [],
         "//tensorflow:ios": [],
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index fa372dcd74b0557d6410feb111c60ef7e94007f5..4ed2f6ce3418e9d5ccb32618402af9a8f29ce1b5 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -18,6 +18,13 @@ load(
     "if_ngraph",
 )
 
+# This flag specifies whether TensorFlow 2.0 API should be built instead
+# of 1.* API. Note that TensorFlow 2.0 API is currently under development.
+config_setting(
+    name = "api_version_2",
+    define_values = {"tf_api_version": "2"},
+)
+
 # This returns a list of headers of all public header libraries (e.g.,
 # framework, lib), and all of the transitive dependencies of those
 # public headers.  Not all of the headers returned by the filegroup
@@ -59,35 +66,10 @@ COMMON_PIP_DEPS = [
     "setup.py",
     ":included_headers",
     "//tensorflow:tensorflow_py",
-    "//tensorflow/contrib/autograph:autograph",
-    "//tensorflow/contrib/boosted_trees:boosted_trees_pip",
-    "//tensorflow/contrib/cluster_resolver:cluster_resolver_pip",
-    "//tensorflow/contrib/compiler:xla",
-    "//tensorflow/contrib/constrained_optimization:constrained_optimization_pip",
-    "//tensorflow/contrib/eager/python/examples:examples_pip",
-    "//tensorflow/contrib/eager/python:evaluator",
-    "//tensorflow/contrib/gan:gan",
-    "//tensorflow/contrib/graph_editor:graph_editor_pip",
-    "//tensorflow/contrib/keras:keras",
-    "//tensorflow/contrib/labeled_tensor:labeled_tensor_pip",
-    "//tensorflow/contrib/nn:nn_py",
-    "//tensorflow/contrib/predictor:predictor_pip",
-    "//tensorflow/contrib/proto:proto",
-    "//tensorflow/contrib/receptive_field:receptive_field_pip",
-    "//tensorflow/contrib/rate:rate",
-    "//tensorflow/contrib/rpc:rpc_pip",
-    "//tensorflow/contrib/session_bundle:session_bundle_pip",
-    "//tensorflow/contrib/signal:signal_py",
-    "//tensorflow/contrib/slim:slim",
-    "//tensorflow/contrib/slim/python/slim/data:data_pip",
-    "//tensorflow/contrib/slim/python/slim/nets:nets_pip",
-    "//tensorflow/contrib/specs:specs",
-    "//tensorflow/contrib/summary:summary_test_util",
-    "//tensorflow/contrib/tensor_forest:init_py",
-    "//tensorflow/contrib/tensor_forest/hybrid:hybrid_pip",
-    "//tensorflow/contrib/timeseries:timeseries_pip",
-    "//tensorflow/contrib/tpu",
     "//tensorflow/examples/tutorials/mnist:package",
+    "//tensorflow/lite/python:interpreter_test_data",
+    "//tensorflow/lite/python:tflite_convert",
+    "//tensorflow/lite/toco/python:toco_from_protos",
     # "//tensorflow/python/autograph/converters:converters",
     # "//tensorflow/python/autograph/core:core",
     "//tensorflow/python/autograph/core:test_lib",
@@ -110,6 +92,7 @@ COMMON_PIP_DEPS = [
     "//tensorflow/python/eager:eager_pip",
     "//tensorflow/python/kernel_tests/signal:test_util",
     "//tensorflow/python/kernel_tests/testdata:self_adjoint_eig_op_test_files",
+    "//tensorflow/python/ops/ragged:ragged_test_util",
     "//tensorflow/python/saved_model:saved_model",
     "//tensorflow/python/tools:tools_pip",
     "//tensorflow/python/tools/api/generator:create_python_api",
@@ -118,13 +101,47 @@ COMMON_PIP_DEPS = [
     "//tensorflow/tools/dist_test/server:grpc_tensorflow_server",
 ]
 
+COMMON_PIP_DEPS_V1 = COMMON_PIP_DEPS + [
+    "//tensorflow/contrib/autograph:autograph",
+    "//tensorflow/contrib/boosted_trees:boosted_trees_pip",
+    "//tensorflow/contrib/cluster_resolver:cluster_resolver_pip",
+    "//tensorflow/contrib/compiler:xla",
+    "//tensorflow/contrib/constrained_optimization:constrained_optimization_pip",
+    "//tensorflow/contrib/eager/python/examples:examples_pip",
+    "//tensorflow/contrib/eager/python:evaluator",
+    "//tensorflow/contrib/gan:gan",
+    "//tensorflow/contrib/graph_editor:graph_editor_pip",
+    "//tensorflow/contrib/keras:keras",
+    "//tensorflow/contrib/labeled_tensor:labeled_tensor_pip",
+    "//tensorflow/contrib/nn:nn_py",
+    "//tensorflow/contrib/predictor:predictor_pip",
+    "//tensorflow/contrib/proto:proto",
+    "//tensorflow/contrib/receptive_field:receptive_field_pip",
+    "//tensorflow/contrib/rate:rate",
+    "//tensorflow/contrib/rpc:rpc_pip",
+    "//tensorflow/contrib/session_bundle:session_bundle_pip",
+    "//tensorflow/contrib/signal:signal_py",
+    "//tensorflow/contrib/slim:slim",
+    "//tensorflow/contrib/slim/python/slim/data:data_pip",
+    "//tensorflow/contrib/slim/python/slim/nets:nets_pip",
+    "//tensorflow/contrib/specs:specs",
+    "//tensorflow/contrib/summary:summary_test_util",
+    "//tensorflow/contrib/tensor_forest:init_py",
+    "//tensorflow/contrib/tensor_forest/hybrid:hybrid_pip",
+    "//tensorflow/contrib/timeseries:timeseries_pip",
+    "//tensorflow/contrib/tpu",
+]
+
 # On Windows, python binary is a zip file of runfiles tree.
 # Add everything to its data dependency for generating a runfiles tree
 # for building the pip package on Windows.
 py_binary(
     name = "simple_console_for_windows",
     srcs = ["simple_console_for_windows.py"],
-    data = COMMON_PIP_DEPS,
+    data = select({
+        "api_version_2": COMMON_PIP_DEPS,
+        "//conditions:default": COMMON_PIP_DEPS_V1,
+    }) + ["//tensorflow/python:pywrap_tensorflow_import_lib_file"],
     srcs_version = "PY2AND3",
     deps = ["//tensorflow:tensorflow_py"],
 )
@@ -211,7 +228,6 @@ filegroup(
         "grpc",
         [
             "@grpc//:LICENSE",
-            "@grpc//third_party/nanopb:LICENSE.txt",
             "@grpc//third_party/address_sorting:LICENSE",
         ],
     ) + if_ngraph([
@@ -228,15 +244,12 @@ sh_binary(
     data = select({
         "//tensorflow:windows": [
             ":simple_console_for_windows",
-            "//tensorflow/lite/python:interpreter_test_data",
-            "//tensorflow/lite/python:tflite_convert",
-            "//tensorflow/lite/toco/python:toco_from_protos",
         ],
-        "//conditions:default": COMMON_PIP_DEPS + [
+        "api_version_2": COMMON_PIP_DEPS + [
+            ":simple_console",
+        ],
+        "//conditions:default": COMMON_PIP_DEPS_V1 + [
             ":simple_console",
-            "//tensorflow/lite/python:interpreter_test_data",
-            "//tensorflow/lite/python:tflite_convert",
-            "//tensorflow/lite/toco/python:toco_from_protos",
         ],
     }) + if_mkl_ml(["//third_party/mkl:intel_binary_blob"]),
 )
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 85c913f158863c5ff3718ae3f305829e15237b22..3927540cc79ef8b827ce4d7e60e884c2237f8e9d 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -45,7 +45,7 @@ DOCLINES = __doc__.split('\n')
 # This version string is semver compatible, but incompatible with pip.
 # For pip, we will remove all '-' characters from this string, and use the
 # result for pip.
-_VERSION = '1.12.0-rc0'
+_VERSION = '1.12.0'
 
 REQUIRED_PACKAGES = [
     'absl-py >= 0.1.6',
@@ -87,7 +87,9 @@ if 'tf_nightly' in project_name:
   for i, pkg in enumerate(REQUIRED_PACKAGES):
     if 'tensorboard' in pkg:
       REQUIRED_PACKAGES[i] = 'tb-nightly >= 1.13.0a0, < 1.14.0a0'
-    if 'tensorflow_estimator' in pkg:
+    elif 'tensorflow_estimator' in pkg and '2.0' in project_name:
+      REQUIRED_PACKAGES[i] = 'tensorflow-estimator-2.0-preview'
+    elif 'tensorflow_estimator' in pkg:
       REQUIRED_PACKAGES[i] = 'tf-estimator-nightly'
 
 # weakref.finalize and enum were introduced in Python 3.4
diff --git a/tensorflow/version_check.bzl b/tensorflow/version_check.bzl
index 79e721dab422c1449214acbe5fc1643edc3a9db0..74feaa19ff1523375249adbb7397c3d082d9f96c 100644
--- a/tensorflow/version_check.bzl
+++ b/tensorflow/version_check.bzl
@@ -1,48 +1,52 @@
 """ Helpers to check minimum version of bazel."""
 
 def _extract_version_number(bazel_version):
-  """Extracts the semantic version number from a version string
+    """Extracts the semantic version number from a version string
 
-  Args:
-    bazel_version: the version string that begins with the semantic version
-      e.g. "1.2.3rc1 abc1234" where "abc1234" is a commit hash.
+    Args:
+      bazel_version: the version string that begins with the semantic version
+        e.g. "1.2.3rc1 abc1234" where "abc1234" is a commit hash.
 
-  Returns:
-    The semantic version string, like "1.2.3".
-  """
-  for i in range(len(bazel_version)):
-    c = bazel_version[i]
-    if not (c.isdigit() or c == "."):
-      return bazel_version[:i]
-  return bazel_version
+    Returns:
+      The semantic version string, like "1.2.3".
+    """
+    for i in range(len(bazel_version)):
+        c = bazel_version[i]
+        if not (c.isdigit() or c == "."):
+            return bazel_version[:i]
+    return bazel_version
 
 # Parse the bazel version string from `native.bazel_version`.
 # e.g.
 # "0.10.0rc1 abc123d" => (0, 10, 0)
 # "0.3.0" => (0, 3, 0)
 def _parse_bazel_version(bazel_version):
-  """Parses a version string into a 3-tuple of ints
+    """Parses a version string into a 3-tuple of ints
 
-  int tuples can be compared directly using binary operators (<, >).
+    int tuples can be compared directly using binary operators (<, >).
 
-  Args:
-    bazel_version: the Bazel version string
+    Args:
+      bazel_version: the Bazel version string
 
-  Returns:
-    An int 3-tuple of a (major, minor, patch) version.
-  """
+    Returns:
+      An int 3-tuple of a (major, minor, patch) version.
+    """
 
-  version = _extract_version_number(bazel_version)
-  return tuple([int(n) for n in version.split(".")])
+    version = _extract_version_number(bazel_version)
+    return tuple([int(n) for n in version.split(".")])
 
 def check_bazel_version_at_least(minimum_bazel_version):
-  if "bazel_version" not in dir(native):
-    fail("\nCurrent Bazel version is lower than 0.2.1, expected at least %s\n" % minimum_bazel_version)
-  elif not native.bazel_version:
-    print("\nCurrent Bazel is not a release version, cannot check for compatibility.")
-    print("Make sure that you are running at least Bazel %s.\n" % minimum_bazel_version)
-    return
-
-  if _parse_bazel_version(native.bazel_version) < _parse_bazel_version(minimum_bazel_version):
-    fail("\nCurrent Bazel version is {}, expected at least {}\n".format(
-        native.bazel_version, minimum_bazel_version))
+    if "bazel_version" not in dir(native):
+        fail("\nCurrent Bazel version is lower than 0.2.1, expected at least %s\n" % minimum_bazel_version)
+    elif not native.bazel_version:
+        print("\nCurrent Bazel is not a release version, cannot check for compatibility.")
+        print("Make sure that you are running at least Bazel %s.\n" % minimum_bazel_version)
+        return
+
+    if _parse_bazel_version(native.bazel_version) < _parse_bazel_version(minimum_bazel_version):
+        fail("\nCurrent Bazel version is {}, expected at least {}\n".format(
+            native.bazel_version,
+            minimum_bazel_version,
+        ))
+
+parse_bazel_version = _parse_bazel_version
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index dde10ef73da7553e20b01c22254552293eede6e1..aefab03b6d79f8c0f4ead003034d9d22fe2f1e07 100755
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -23,6 +23,7 @@ load(
 load("//third_party/aws:workspace.bzl", aws = "repo")
 load("//third_party/flatbuffers:workspace.bzl", flatbuffers = "repo")
 load("//third_party/highwayhash:workspace.bzl", highwayhash = "repo")
+load("//third_party/hwloc:workspace.bzl", hwloc = "repo")
 load("//third_party/icu:workspace.bzl", icu = "repo")
 load("//third_party/jpeg:workspace.bzl", jpeg = "repo")
 load("//third_party/nasm:workspace.bzl", nasm = "repo")
@@ -34,6 +35,7 @@ def initialize_third_party():
     aws()
     flatbuffers()
     highwayhash()
+    hwloc()
     icu()
     keras_applications()
     kissfft()
@@ -123,22 +125,22 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     tf_http_archive(
         name = "com_google_absl",
         build_file = clean_dep("//third_party:com_google_absl.BUILD"),
-        sha256 = "3cf6132129ba87f0781c383bfaf381b7174b5818e81fffcc5d04bb451154f0f2",
-        strip_prefix = "abseil-cpp-f95179062eb65ce40895cc76f1398cce25394369",
+        sha256 = "ab499df1dc1ee5f9bf95f327adc22a7bd327ae5e7c023309cddccd0763ba1043",
+        strip_prefix = "abseil-cpp-389ec3f906f018661a5308458d623d01f96d7b23",
         urls = [
-            "https://mirror.bazel.build/github.com/abseil/abseil-cpp/archive/f95179062eb65ce40895cc76f1398cce25394369.tar.gz",
-            "https://github.com/abseil/abseil-cpp/archive/f95179062eb65ce40895cc76f1398cce25394369.tar.gz",
+            "https://mirror.bazel.build/github.com/abseil/abseil-cpp/archive/389ec3f906f018661a5308458d623d01f96d7b23.tar.gz",
+            "https://github.com/abseil/abseil-cpp/archive/389ec3f906f018661a5308458d623d01f96d7b23.tar.gz",
         ],
     )
 
     tf_http_archive(
         name = "eigen_archive",
         build_file = clean_dep("//third_party:eigen.BUILD"),
-        sha256 = "6e505fa8bf8d234d0338679b390cb89f850d870214c751b01a5db7f647e4d438",
-        strip_prefix = "eigen-eigen-135398e50bed",
+        sha256 = "753fbb58d0a49b6bcbcfb126ebfa2e21fc97f7471529ba835a096008ce588d8a",
+        strip_prefix = "eigen-eigen-9f48e814419e",
         urls = [
-            "https://mirror.bazel.build/bitbucket.org/eigen/eigen/get/135398e50bed.tar.gz",
-            "https://bitbucket.org/eigen/eigen/get/135398e50bed.tar.gz",
+            "https://mirror.bazel.build/bitbucket.org/eigen/eigen/get/9f48e814419e.tar.gz",
+            "https://bitbucket.org/eigen/eigen/get/9f48e814419e.tar.gz",
         ],
     )
 
@@ -168,26 +170,26 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
 
     tf_http_archive(
         name = "com_googlesource_code_re2",
-        sha256 = "803c7811146edeef8f91064de37c6f19136ff01a2a8cdb3230e940b2fd9f07fe",
-        strip_prefix = "re2-2018-07-01",
+        sha256 = "a31397714a353587413d307337d0b58f8a2e20e2b9d02f2e24e3463fa4eeda81",
+        strip_prefix = "re2-2018-10-01",
         system_build_file = clean_dep("//third_party/systemlibs:re2.BUILD"),
         urls = [
-            "https://mirror.bazel.build/github.com/google/re2/archive/2018-07-01.tar.gz",
-            "https://github.com/google/re2/archive/2018-07-01.tar.gz",
+            "https://mirror.bazel.build/github.com/google/re2/archive/2018-10-01.tar.gz",
+            "https://github.com/google/re2/archive/2018-10-01.tar.gz",
         ],
     )
 
     tf_http_archive(
         name = "com_github_googlecloudplatform_google_cloud_cpp",
-        sha256 = "3ade2072e6588ff56c0434abe6c63aa5f3f2d56be15a299bafc7e9cdf0a12c17",
-        strip_prefix = "google-cloud-cpp-0.3.0",
+        sha256 = "44eee8bd47cbd5ff192e895b45f9f913e2e117f10fdb9af0fd3b1a87a7b53bc3",
+        strip_prefix = "google-cloud-cpp-0.4.0",
         system_build_file = clean_dep("//third_party/systemlibs:google_cloud_cpp.BUILD"),
         system_link_files = {
             "//third_party/systemlibs:google_cloud_cpp.google.cloud.bigtable.BUILD": "google/cloud/bigtable/BUILD",
         },
         urls = [
-            "https://mirror.bazel.build/github.com/GoogleCloudPlatform/google-cloud-cpp/archive/v0.3.0.tar.gz",
-            "https://github.com/GoogleCloudPlatform/google-cloud-cpp/archive/v0.3.0.tar.gz",
+            "https://mirror.bazel.build/github.com/GoogleCloudPlatform/google-cloud-cpp/archive/v0.4.0.tar.gz",
+            "https://github.com/GoogleCloudPlatform/google-cloud-cpp/archive/v0.4.0.tar.gz",
         ],
     )
 
@@ -347,16 +349,20 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     )
 
     PROTOBUF_URLS = [
-        "https://mirror.bazel.build/github.com/google/protobuf/archive/v3.6.1.1.tar.gz",
-        "https://github.com/google/protobuf/archive/v3.6.1.1.tar.gz",
+        "https://mirror.bazel.build/github.com/protocolbuffers/protobuf/archive/v3.6.1.2.tar.gz",
+        "https://github.com/protocolbuffers/protobuf/archive/v3.6.1.2.tar.gz",
     ]
-    PROTOBUF_SHA256 = "1ade182f91f0fa6c6116195def5d22270e01b9d03fe91319e4c6215022d0d24b"
-    PROTOBUF_STRIP_PREFIX = "protobuf-3.6.1.1"
+    PROTOBUF_SHA256 = "2244b0308846bb22b4ff0bcc675e99290ff9f1115553ae9671eba1030af31bc0"
+    PROTOBUF_STRIP_PREFIX = "protobuf-3.6.1.2"
 
     tf_http_archive(
         name = "protobuf_archive",
         sha256 = PROTOBUF_SHA256,
         strip_prefix = PROTOBUF_STRIP_PREFIX,
+        system_build_file = clean_dep("//third_party/systemlibs:protobuf.BUILD"),
+        system_link_files = {
+            "//third_party/systemlibs:protobuf.bzl": "protobuf.bzl",
+        },
         urls = PROTOBUF_URLS,
     )
 
@@ -367,6 +373,10 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
         name = "com_google_protobuf",
         sha256 = PROTOBUF_SHA256,
         strip_prefix = PROTOBUF_STRIP_PREFIX,
+        system_build_file = clean_dep("//third_party/systemlibs:protobuf.BUILD"),
+        system_link_files = {
+            "//third_party/systemlibs:protobuf.bzl": "protobuf.bzl",
+        },
         urls = PROTOBUF_URLS,
     )
 
@@ -374,6 +384,10 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
         name = "com_google_protobuf_cc",
         sha256 = PROTOBUF_SHA256,
         strip_prefix = PROTOBUF_STRIP_PREFIX,
+        system_build_file = clean_dep("//third_party/systemlibs:protobuf.BUILD"),
+        system_link_files = {
+            "//third_party/systemlibs:protobuf.bzl": "protobuf.bzl",
+        },
         urls = PROTOBUF_URLS,
     )
 
@@ -445,14 +459,26 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
         ],
     )
 
+    # WARNING: make sure ncteisen@ and vpai@ are cc-ed on any CL to change the below rule
     tf_http_archive(
         name = "grpc",
-        sha256 = "50db9cf2221354485eb7c3bd55a4c27190caef7048a2a1a15fbe60a498f98b44",
-        strip_prefix = "grpc-1.13.0",
+        sha256 = "1aa84387232dda273ea8fdfe722622084f72c16f7b84bfc519ac7759b71cdc91",
+        strip_prefix = "grpc-69b6c047bc767b4d80e7af4d00ccb7c45b683dae",
         system_build_file = clean_dep("//third_party/systemlibs:grpc.BUILD"),
         urls = [
-            "https://mirror.bazel.build/github.com/grpc/grpc/archive/v1.13.0.tar.gz",
-            "https://github.com/grpc/grpc/archive/v1.13.0.tar.gz",
+            "https://mirror.bazel.build/github.com/grpc/grpc/archive/69b6c047bc767b4d80e7af4d00ccb7c45b683dae.tar.gz",
+            "https://github.com/grpc/grpc/archive/69b6c047bc767b4d80e7af4d00ccb7c45b683dae.tar.gz",
+        ],
+    )
+
+    tf_http_archive(
+        name = "com_github_nanopb_nanopb",
+        sha256 = "8bbbb1e78d4ddb0a1919276924ab10d11b631df48b657d960e0c795a25515735",
+        build_file = "@grpc//third_party:nanopb.BUILD",
+        strip_prefix = "nanopb-f8ac463766281625ad710900479130c7fcb4d63b",
+        urls = [
+            "https://mirror.bazel.build/github.com/nanopb/nanopb/archive/f8ac463766281625ad710900479130c7fcb4d63b.tar.gz",
+            "https://github.com/nanopb/nanopb/archive/f8ac463766281625ad710900479130c7fcb4d63b.tar.gz",
         ],
     )
 
@@ -472,11 +498,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     tf_http_archive(
         name = "llvm",
         build_file = clean_dep("//third_party/llvm:llvm.autogenerated.BUILD"),
-        sha256 = "f4791ba3e166918bca82df34e2f854e8e188d6055888c64cb28743fd43f2d0d7",
-        strip_prefix = "llvm-b2a42b2112a511a5077fd747fb21e45349cff08d",
+        sha256 = "55769c91b9f5b5255d58a1ecd88e690a4e192dc8cbdf8f984596649abe3b5433",
+        strip_prefix = "llvm-2ba3294845dedcbb27dc49287bfbcdb49aa1e6b7",
         urls = [
-            "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/b2a42b2112a511a5077fd747fb21e45349cff08d.tar.gz",
-            "https://github.com/llvm-mirror/llvm/archive/b2a42b2112a511a5077fd747fb21e45349cff08d.tar.gz",
+            "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/2ba3294845dedcbb27dc49287bfbcdb49aa1e6b7.tar.gz",
+            "https://github.com/llvm-mirror/llvm/archive/2ba3294845dedcbb27dc49287bfbcdb49aa1e6b7.tar.gz",
         ],
     )
 
@@ -710,12 +736,22 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     )
 
     tf_http_archive(
-        name = "tflite_mobilenet",
-        build_file = clean_dep("//third_party:tflite_mobilenet.BUILD"),
-        sha256 = "23f814d1c076bdf03715dfb6cab3713aa4fbdf040fd5448c43196bd2e97a4c1b",
+        name = "tflite_mobilenet_float",
+        build_file = clean_dep("//third_party:tflite_mobilenet_float.BUILD"),
+        sha256 = "2fadeabb9968ec6833bee903900dda6e61b3947200535874ce2fe42a8493abc0",
+        urls = [
+            "http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224.tgz",
+            "http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224.tgz",
+        ],
+    )
+
+    tf_http_archive(
+        name = "tflite_mobilenet_quant",
+        build_file = clean_dep("//third_party:tflite_mobilenet_quant.BUILD"),
+        sha256 = "d32432d28673a936b2d6281ab0600c71cf7226dfe4cdcef3012555f691744166",
         urls = [
-            "https://mirror.bazel.build/storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip",
-            "https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip",
+            "http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz",
+            "http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz",
         ],
     )
 
@@ -794,44 +830,44 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     tf_http_archive(
         name = "tbb",
         build_file = clean_dep("//third_party/ngraph:tbb.BUILD"),
-        sha256 = "724686f90bcda78f13b76f297d964008737ccd6399328143c1c0093e73ae6a13",
-        strip_prefix = "tbb-tbb_2018",
+        sha256 = "c3245012296f09f1418b78a8c2f17df5188b3bd0db620f7fd5fabe363320805a",
+        strip_prefix = "tbb-2019_U1",
         urls = [
-            "https://mirror.bazel.build/github.com/01org/tbb/archive/tbb_2018.zip",
-            "https://github.com/01org/tbb/archive/tbb_2018.zip",
+            "https://mirror.bazel.build/github.com/01org/tbb/archive/2019_U1.zip",
+            "https://github.com/01org/tbb/archive/2019_U1.zip",
         ],
     )
 
     tf_http_archive(
         name = "ngraph",
         build_file = clean_dep("//third_party/ngraph:ngraph.BUILD"),
-        sha256 = "2b28f9c9f063b96825a96d56d7f7978c9a1c55c9b25175c20dd49a8a77cb0305",
-        strip_prefix = "ngraph-0.9.1",
+        sha256 = "a1780f24a1381fc25e323b4b2d08b6ef5129f42e011305b2a34dcf43a48030d5",
+        strip_prefix = "ngraph-0.11.0",
         urls = [
-            "https://mirror.bazel.build/github.com/NervanaSystems/ngraph/archive/v0.9.1.tar.gz",
-            "https://github.com/NervanaSystems/ngraph/archive/v0.9.1.tar.gz",
+            "https://mirror.bazel.build/github.com/NervanaSystems/ngraph/archive/v0.11.0.tar.gz",
+            "https://github.com/NervanaSystems/ngraph/archive/v0.11.0.tar.gz",
         ],
     )
 
     tf_http_archive(
         name = "nlohmann_json_lib",
         build_file = clean_dep("//third_party/ngraph:nlohmann_json.BUILD"),
-        sha256 = "9f3549824af3ca7e9707a2503959886362801fb4926b869789d6929098a79e47",
-        strip_prefix = "json-3.1.1",
+        sha256 = "c377963a95989270c943d522bfefe7b889ef5ed0e1e15d535fd6f6f16ed70732",
+        strip_prefix = "json-3.4.0",
         urls = [
-            "https://mirror.bazel.build/github.com/nlohmann/json/archive/v3.1.1.tar.gz",
-            "https://github.com/nlohmann/json/archive/v3.1.1.tar.gz",
+            "https://mirror.bazel.build/github.com/nlohmann/json/archive/v3.4.0.tar.gz",
+            "https://github.com/nlohmann/json/archive/v3.4.0.tar.gz",
         ],
     )
 
     tf_http_archive(
         name = "ngraph_tf",
         build_file = clean_dep("//third_party/ngraph:ngraph_tf.BUILD"),
-        sha256 = "89accbc702e68a09775f1011a99dd16561038fd1ce59d566d64450176abaae5c",
-        strip_prefix = "ngraph-tf-0.7.0",
+        sha256 = "742a642d2c6622277df4c902b6830d616d0539cc8cd843d6cdb899bb99e66e36",
+        strip_prefix = "ngraph-tf-0.9.0",
         urls = [
-            "https://mirror.bazel.build/github.com/NervanaSystems/ngraph-tf/archive/v0.7.0.tar.gz",
-            "https://github.com/NervanaSystems/ngraph-tf/archive/v0.7.0.tar.gz",
+            "https://mirror.bazel.build/github.com/NervanaSystems/ngraph-tf/archive/v0.9.0.zip",
+            "https://github.com/NervanaSystems/ngraph-tf/archive/v0.9.0.zip",
         ],
     )
 
@@ -846,7 +882,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     # important since we have set GRPC_ARES=0 in .bazelrc
     native.bind(
         name = "cares",
-        actual = "@grpc//third_party/nanopb:nanopb",
+        actual = "@com_github_nanopb_nanopb//:nanopb",
     )
 
     # Needed by Protobuf
@@ -878,7 +914,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     # Needed by gRPC
     native.bind(
         name = "nanopb",
-        actual = "@grpc//third_party/nanopb:nanopb",
+        actual = "@com_github_nanopb_nanopb//:nanopb",
     )
 
     # Needed by gRPC
diff --git a/third_party/clang_toolchain/download_clang.bzl b/third_party/clang_toolchain/download_clang.bzl
index a941ee1c998dae14febe2453184fb75d1afe8016..7ced9027473e39ad9870ce138b64c7f7ec64ad01 100644
--- a/third_party/clang_toolchain/download_clang.bzl
+++ b/third_party/clang_toolchain/download_clang.bzl
@@ -39,15 +39,15 @@ def download_clang(repo_ctx, out_folder):
 
     # Latest CLANG_REVISION and CLANG_SUB_REVISION of the Chromiums's release
     # can be found in https://chromium.googlesource.com/chromium/src/tools/clang/+/master/scripts/update.py
-    CLANG_REVISION = "346388"
-    CLANG_SUB_REVISION = 3
+    CLANG_REVISION = "347933"
+    CLANG_SUB_REVISION = 1
 
     package_version = "%s-%s" % (CLANG_REVISION, CLANG_SUB_REVISION)
 
     checksums = {
-        "Linux_x64": "d47b7ac4756c3f8e3bbfa0e81bf199ec8e9faa3a6b11573f0705e9c04af7ad51",
-        "Mac": "de2b0c701e19cda633ea02804866dd24d8506afb8cae51fbcce3415b76f4ded3",
-        "Win": "c7d27f13b41aa9eaaf9760903962e9b2b0f8261058df0d35170711dc60545a7d",
+        "Linux_x64": "cae3643fdf5d46fc9bc8731212bb37573547148d90b64b083165e090133d11b0",
+        "Mac": "083a0e91a38c06e568652313ac7372b17a101268f7d65533d721ca30413442b4",
+        "Win": "43160487cfc7e88076a369a2b6e8e4a0f42e104c28d8903f3aaa62d630aba949",
     }
 
     platform_folder = _get_platform_folder(repo_ctx.os.name)
diff --git a/third_party/gpus/crosstool/BUILD.tpl b/third_party/gpus/crosstool/BUILD.tpl
index c8812fab3378328e44504598257a8860b45d1671..1260b265abdbce9f9db6b411555236292fe303d5 100644
--- a/third_party/gpus/crosstool/BUILD.tpl
+++ b/third_party/gpus/crosstool/BUILD.tpl
@@ -22,6 +22,7 @@ cc_toolchain_suite(
         "local|compiler": ":cc-compiler-local",
         "darwin|compiler": ":cc-compiler-darwin",
         "x64_windows|msvc-cl": ":cc-compiler-windows",
+        "x64_windows": ":cc-compiler-windows",
     },
 )
 
@@ -41,6 +42,7 @@ cc_toolchain(
     # last on the command line and contain all shared libraries to link, so all
     # regular options will be left of them.
     supports_param_files = 1,
+    toolchain_identifier = "local_linux",
 )
 
 cc_toolchain(
@@ -55,6 +57,7 @@ cc_toolchain(
     static_runtime_libs = [":empty"],
     strip_files = ":empty",
     supports_param_files = 0,
+    toolchain_identifier = "local_darwin",
 )
 
 cc_toolchain(
@@ -69,6 +72,7 @@ cc_toolchain(
     static_runtime_libs = [":empty"],
     strip_files = ":empty",
     supports_param_files = 1,
+    toolchain_identifier = "local_windows",
 )
 
 filegroup(
diff --git a/third_party/gpus/crosstool/CROSSTOOL.tpl b/third_party/gpus/crosstool/CROSSTOOL.tpl
index 921188cbb431d925df69fbd0cc06aac07fe1a1a9..5ca9b2deb4f3e39ab1b78bf695d7b75100d1fac6 100644
--- a/third_party/gpus/crosstool/CROSSTOOL.tpl
+++ b/third_party/gpus/crosstool/CROSSTOOL.tpl
@@ -642,6 +642,31 @@ toolchain {
     name: "no_legacy_features"
   }
 
+  # TODO(klimek): Previously we were using a .bat file to start python to run
+  # the python script that can redirect to nvcc - unfortunately .bat files
+  # have a rather short maximum length for command lines (8k). Instead, we
+  # now use the python binary as the compiler and pass the python script to
+  # it at the start of the command line. Investigate different possibilities
+  # to run the nvcc wrapper, either using pyinstaller --onefile, or writing
+  # a small C++ wrapper to redirect.
+  feature {
+    name: "redirector"
+    enabled: true
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-module-compile"
+      action: "c++-module-codegen"
+      action: "c++-header-parsing"
+      action: "assemble"
+      action: "preprocess-assemble"
+      flag_group {
+        flag: "-B"
+        flag: "external/local_config_cuda/crosstool/windows/msvc_wrapper_for_nvcc.py"
+      }
+    }
+  }
+
   # Suppress startup banner.
   feature {
     name: "nologo"
diff --git a/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.bat.tpl b/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.bat.tpl
deleted file mode 100644
index 8f8fb3e4231bf1b689cf9b21c53e990d5b9ee354..0000000000000000000000000000000000000000
--- a/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.bat.tpl
+++ /dev/null
@@ -1,20 +0,0 @@
-:: Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-::
-:: Licensed under the Apache License, Version 2.0 (the "License");
-:: you may not use this file except in compliance with the License.
-:: You may obtain a copy of the License at
-::
-::     http://www.apache.org/licenses/LICENSE-2.0
-::
-:: Unless required by applicable law or agreed to in writing, software
-:: distributed under the License is distributed on an "AS IS" BASIS,
-:: WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-:: See the License for the specific language governing permissions and
-:: limitations under the License.
-:: =============================================================================
-
-:: Invoke msvc_wrapper_for_nvcc.py, which is located in the same directory.
-@echo OFF
-set arg0=%~0
-for %%F in ("%arg0%") do set DRIVER_BIN=%%~dpF
-"%{python_binary}" -B "%DRIVER_BIN%\msvc_wrapper_for_nvcc.py" %*
diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl
index 03c67bcb3d75aca19bcad8b824d79283193dc115..8aa5b89cddb336380d35f85a6ecd3ebdf6589e88 100644
--- a/third_party/gpus/cuda_configure.bzl
+++ b/third_party/gpus/cuda_configure.bzl
@@ -190,7 +190,7 @@ def _get_win_cuda_defines(repository_ctx):
       get_env_var(repository_ctx, "TMP", "C:\\Windows\\Temp").replace(
           "\\", "\\\\"),)
 
-  msvc_cl_path = "windows/msvc_wrapper_for_nvcc.bat"
+  msvc_cl_path = _get_python_bin(repository_ctx)
   msvc_ml_path = find_msvc_tool(repository_ctx, vc_path, "ml64.exe").replace(
       "\\", "/")
   msvc_link_path = find_msvc_tool(repository_ctx, vc_path, "link.exe").replace(
@@ -1426,7 +1426,6 @@ def _create_local_cuda_repository(repository_ctx):
     repository_ctx.file(
         "crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc", "")
     repository_ctx.file("crosstool/windows/msvc_wrapper_for_nvcc.py", "")
-    repository_ctx.file("crosstool/windows/msvc_wrapper_for_nvcc.bat", "")
   else:
     cuda_defines[
         "%{host_compiler_path}"] = "clang/bin/crosstool_wrapper_driver_is_not_gcc"
@@ -1486,13 +1485,6 @@ def _create_local_cuda_repository(repository_ctx):
         "crosstool:windows/msvc_wrapper_for_nvcc.py",
         wrapper_defines,
     )
-    _tpl(
-        repository_ctx,
-        "crosstool:windows/msvc_wrapper_for_nvcc.bat",
-        {
-            "%{python_binary}": _get_python_bin(repository_ctx),
-        },
-    )
 
   _tpl(
       repository_ctx,
diff --git a/third_party/hwloc/BUILD b/third_party/hwloc/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..2f5d02becb930602574c4df02c51cec7662bc25d
--- /dev/null
+++ b/third_party/hwloc/BUILD
@@ -0,0 +1 @@
+# Dummy BUILD file to make this directory a package.
diff --git a/third_party/hwloc/BUILD.bazel b/third_party/hwloc/BUILD.bazel
new file mode 100644
index 0000000000000000000000000000000000000000..b73267d6680077aa855cab5d3af727e0763e0788
--- /dev/null
+++ b/third_party/hwloc/BUILD.bazel
@@ -0,0 +1,87 @@
+# hwloc: Portable Hardware Locality Library
+
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])
+
+exports_files(["LICENSE"])
+
+COMMON_INCLUDE_COPTS = [
+    "-I.",
+    "-Ihwloc",
+    "-Iinclude",
+]
+
+DISABLE_WARNINGS_COPTS = [
+    "-Wno-vla",
+]
+
+VAR_SETTINGS_COPTS = [
+    "-DHWLOC_DUMPED_HWDATA_DIR=",
+    "-DRUNSTATEDIR=",
+]
+
+cc_library(
+    name = "hwloc",
+    srcs = [
+        "hwloc/base64.c",
+        "hwloc/bind.c",
+        "hwloc/bitmap.c",
+        "hwloc/components.c",
+        "hwloc/diff.c",
+        "hwloc/distances.c",
+        "hwloc/misc.c",
+        "hwloc/pci-common.c",
+        "hwloc/shmem.c",
+        "hwloc/static-components.h",
+        "hwloc/topology.c",
+        "hwloc/topology-hardwired.c",
+        "hwloc/topology-linux.c",
+        "hwloc/topology-noos.c",
+        "hwloc/topology-synthetic.c",
+        "hwloc/topology-x86.c",
+        "hwloc/topology-xml.c",
+        "hwloc/topology-xml-nolibxml.c",
+        "hwloc/traversal.c",
+        "include/hwloc/linux.h",
+        "include/hwloc/plugins.h",
+        "include/hwloc/shmem.h",
+        "include/private/autogen/config.h",
+        "include/private/components.h",
+        "include/private/cpuid-x86.h",
+        "include/private/debug.h",
+        "include/private/internal-components.h",
+        "include/private/misc.h",
+        "include/private/private.h",
+        "include/private/xml.h",
+    ],
+    hdrs = [
+        "include/hwloc.h",
+        "include/hwloc/autogen/config.h",
+        "include/hwloc/bitmap.h",
+        "include/hwloc/deprecated.h",
+        "include/hwloc/diff.h",
+        "include/hwloc/distances.h",
+        "include/hwloc/export.h",
+        "include/hwloc/helper.h",
+        "include/hwloc/inlines.h",
+        "include/hwloc/rename.h",
+    ],
+    copts = COMMON_INCLUDE_COPTS + DISABLE_WARNINGS_COPTS + VAR_SETTINGS_COPTS,
+    features = [
+        "-parse_headers",
+        "-layering_check",
+    ],
+    deps = [],
+)
+
+cc_binary(
+    name = "hwloc_print",
+    srcs = ["hwloc_print.cc"],
+    copts = COMMON_INCLUDE_COPTS,
+    deps = [
+        ":hwloc",
+    ],
+)
diff --git a/third_party/hwloc/workspace.bzl b/third_party/hwloc/workspace.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..47a143c8a0e0cee70b1c9087f78170adabe40ed9
--- /dev/null
+++ b/third_party/hwloc/workspace.bzl
@@ -0,0 +1,15 @@
+"""loads the hwloc library, used by TF."""
+
+load("//third_party:repo.bzl", "third_party_http_archive")
+
+def repo():
+    third_party_http_archive(
+        name = "hwloc",
+        urls = [
+            "http://mirror.bazel.build/download.open-mpi.org/release/hwloc/v2.0/hwloc-2.0.3.tar.gz",
+            "https://download.open-mpi.org/release/hwloc/v2.0/hwloc-2.0.3.tar.gz",
+        ],
+        sha256 = "64def246aaa5b3a6e411ce10932a22e2146c3031b735c8f94739534f06ad071c",
+        strip_prefix = "hwloc-2.0.3",
+        build_file = "//third_party/hwloc:BUILD.bazel",
+    )
diff --git a/third_party/icu/BUILD.system b/third_party/icu/BUILD.system
index 328e412a8c29f6f7c2f5ecc5b6e8bbec7613972c..8a88a6ef7e0a51448e5c6157be2c277a60c53198 100644
--- a/third_party/icu/BUILD.system
+++ b/third_party/icu/BUILD.system
@@ -1,13 +1,19 @@
+package(
+    default_visibility = ["//visibility:public"],
+)
+
 licenses(["notice"])  # Apache 2.0
 
 filegroup(
     name = "icu4c/LICENSE",
-    visibility = ["//visibility:public"],
 )
 
 filegroup(
     name = "icu4j/main/shared/licenses/LICENSE",
-    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "headers",
 )
 
 cc_library(
@@ -15,7 +21,6 @@ cc_library(
     deps = [
         ":icuuc",
     ],
-    visibility = ["//visibility:public"],
 )
 
 cc_library(
diff --git a/third_party/keras_applications_archive/BUILD.system b/third_party/keras_applications_archive/BUILD.system
new file mode 100644
index 0000000000000000000000000000000000000000..a3b58f15030bb0648f73064c214b939856961d90
--- /dev/null
+++ b/third_party/keras_applications_archive/BUILD.system
@@ -0,0 +1,13 @@
+# Description: Keras Applications: set of pre-trained deep learning models.
+
+licenses(["notice"])  # MIT
+
+filegroup(
+    name = "LICENSE",
+    visibility = ["//visibility:public"],
+)
+
+py_library(
+    name = "keras_applications",
+    visibility = ["//visibility:public"],
+)
diff --git a/third_party/keras_applications_archive/workspace.bzl b/third_party/keras_applications_archive/workspace.bzl
index e90630fa974fb97f4c7d5a72c045a44c237a6ace..cf9d15ca28874439d5d8e78f87d8b502908d07fe 100644
--- a/third_party/keras_applications_archive/workspace.bzl
+++ b/third_party/keras_applications_archive/workspace.bzl
@@ -12,4 +12,5 @@ def repo():
             "https://github.com/keras-team/keras-applications/archive/1.0.6.tar.gz",
         ],
         build_file = "//third_party/keras_applications_archive:BUILD.bazel",
+        system_build_file = "//third_party/keras_applications_archive:BUILD.system",
     )
diff --git a/third_party/llvm/llvm.autogenerated.BUILD b/third_party/llvm/llvm.autogenerated.BUILD
index eb468aa65fce9c014bc7b53f1fb69729eb2a3718..6599b9e91b8bc29306a787d671174110526aa19b 100644
--- a/third_party/llvm/llvm.autogenerated.BUILD
+++ b/third_party/llvm/llvm.autogenerated.BUILD
@@ -2241,7 +2241,6 @@ cc_library(
     deps = [
         ":code_gen",
         ":config",
-        ":core",
         ":support",
     ],
 )
diff --git a/third_party/mkl_dnn/mkldnn.BUILD b/third_party/mkl_dnn/mkldnn.BUILD
index 7a8ed3bf43955dfa3a77c7cafa30817b9d176d2d..bd842b87f8d28941072b1d11fb4ab6d3c54c28e0 100644
--- a/third_party/mkl_dnn/mkldnn.BUILD
+++ b/third_party/mkl_dnn/mkldnn.BUILD
@@ -17,8 +17,12 @@ cc_library(
     name = "mkl_dnn",
     srcs = glob([
         "src/common/*.cpp",
+        "src/common/*.hpp",
         "src/cpu/*.cpp",
+        "src/cpu/*.hpp",
         "src/cpu/gemm/*.cpp",
+        "src/cpu/gemm/*.hpp",
+        "src/cpu/xbyak/*.h",
     ]),
     hdrs = glob(["include/*"]),
     copts = [
@@ -63,3 +67,31 @@ cc_library(
         "//conditions:default": [],
     }),
 )
+
+cc_library(
+    name = "mkldnn_single_threaded",
+    srcs = glob([
+        "src/common/*.cpp",
+        "src/common/*.hpp",
+        "src/cpu/*.cpp",
+        "src/cpu/*.hpp",
+        "src/cpu/gemm/*.cpp",
+        "src/cpu/gemm/*.hpp",
+        "src/cpu/xbyak/*.h",
+    ]),
+    hdrs = glob(["include/*"]),
+    copts = [
+        "-fexceptions",
+        "-DMKLDNN_THR=MKLDNN_THR_SEQ",  # Disables threading.
+    ],
+    includes = [
+        "include",
+        "src",
+        "src/common",
+        "src/cpu",
+        "src/cpu/gemm",
+        "src/cpu/xbyak",
+    ],
+    nocopts = "-fno-exceptions",
+    visibility = ["//visibility:public"],
+)
diff --git a/third_party/mpi/mpi.bzl b/third_party/mpi/mpi.bzl
index 38ce91c4d069fc311d5e7f17a49ff7904c9c67eb..3a483351d1f982eba09d6522db9842dd4f7eca84 100644
--- a/third_party/mpi/mpi.bzl
+++ b/third_party/mpi/mpi.bzl
@@ -2,16 +2,16 @@
 #based on the configuration options return one or the other
 
 def mpi_hdr():
-    MPI_LIB_IS_OPENMPI=True
-    hdrs = []    
+    MPI_LIB_IS_OPENMPI = True
+    hdrs = []
     if MPI_LIB_IS_OPENMPI:
-        hdrs = ["mpi.h", "mpi_portable_platform.h"]   #When using OpenMPI
+        hdrs = ["mpi.h", "mpi_portable_platform.h"]  #When using OpenMPI
     else:
-        hdrs = ["mpi.h",  "mpio.h", "mpicxx.h"]        #When using MVAPICH
+        hdrs = ["mpi.h", "mpio.h", "mpicxx.h"]  #When using MVAPICH
     return hdrs
 
 def if_mpi(if_true, if_false = []):
     return select({
         "//tensorflow:with_mpi_support": if_true,
-        "//conditions:default": if_false
+        "//conditions:default": if_false,
     })
diff --git a/third_party/ngraph/ngraph.BUILD b/third_party/ngraph/ngraph.BUILD
index 63e9548c53262461cfc9c3fd160f4f17430319c7..a7da325766cecc049065f9fe91d41d27f26ba1be 100644
--- a/third_party/ngraph/ngraph.BUILD
+++ b/third_party/ngraph/ngraph.BUILD
@@ -56,14 +56,16 @@ cc_library(
         "src/ngraph/runtime/cpu/cpu_backend.cpp",
         "src/ngraph/runtime/cpu/cpu_builder.cpp",
         "src/ngraph/runtime/cpu/cpu_call_frame.cpp",
+        "src/ngraph/runtime/cpu/cpu_cse.cpp",
+        "src/ngraph/runtime/cpu/cpu_executor.cpp",
         "src/ngraph/runtime/cpu/cpu_external_function.cpp",
         "src/ngraph/runtime/cpu/cpu_kernels.cpp",
         "src/ngraph/runtime/cpu/cpu_layout_descriptor.cpp",
+        "src/ngraph/runtime/cpu/cpu_op_annotations.cpp",
         "src/ngraph/runtime/cpu/cpu_tensor_view.cpp",
         "src/ngraph/runtime/cpu/cpu_tensor_view_wrapper.cpp",
         "src/ngraph/runtime/cpu/cpu_tracing.cpp",
         "src/ngraph/runtime/cpu/cpu_visualize_tree.cpp",
-        "src/ngraph/runtime/cpu/kernel/eigen_thread_pool.cpp",
         "src/ngraph/runtime/cpu/kernel/pad.cpp",
         "src/ngraph/runtime/cpu/kernel/reduce_max.cpp",
         "src/ngraph/runtime/cpu/kernel/reduce_sum.cpp",
@@ -79,20 +81,26 @@ cc_library(
         "src/ngraph/runtime/cpu/op/conv_relu.cpp",
         "src/ngraph/runtime/cpu/op/convert_layout.cpp",
         "src/ngraph/runtime/cpu/op/group_conv.cpp",
+        "src/ngraph/runtime/cpu/op/group_conv_bias.cpp",
+        "src/ngraph/runtime/cpu/op/halide_op.cpp",
+        "src/ngraph/runtime/cpu/op/leaky_relu.cpp",
         "src/ngraph/runtime/cpu/op/loop_kernel.cpp",
         "src/ngraph/runtime/cpu/op/lstm.cpp",
         "src/ngraph/runtime/cpu/op/matmul_bias.cpp",
         "src/ngraph/runtime/cpu/op/max_pool_with_indices.cpp",
         "src/ngraph/runtime/cpu/op/rnn.cpp",
         "src/ngraph/runtime/cpu/op/sigmoid_mul.cpp",
+        "src/ngraph/runtime/cpu/op/update_slice.cpp",
         "src/ngraph/runtime/cpu/pass/cpu_assignment.cpp",
         "src/ngraph/runtime/cpu/pass/cpu_collapse_dims.cpp",
-        "src/ngraph/runtime/cpu/pass/cpu_concat_inputs.cpp",
         "src/ngraph/runtime/cpu/pass/cpu_fusion.cpp",
+        "src/ngraph/runtime/cpu/pass/cpu_horizontal_fusion.cpp",
         "src/ngraph/runtime/cpu/pass/cpu_layout.cpp",
         "src/ngraph/runtime/cpu/pass/cpu_loop_kernel_fusion.cpp",
         "src/ngraph/runtime/cpu/pass/cpu_mat_fusion.cpp",
+        "src/ngraph/runtime/cpu/pass/cpu_memory_optimization.cpp",
         "src/ngraph/runtime/cpu/pass/cpu_post_layout_optimizations.cpp",
+        "src/ngraph/runtime/cpu/pass/cpu_reshape_sinking.cpp",
         "src/ngraph/runtime/cpu/pass/cpu_rnn_fusion.cpp",
         "src/ngraph/runtime/cpu/pass/cpu_workspace_insertion.cpp",
     ],
@@ -101,7 +109,7 @@ cc_library(
         "-I external/ngraph/src",
         "-I external/nlohmann_json_lib/include/",
         '-D SHARED_LIB_EXT=\\".so\\"',
-        '-D NGRAPH_VERSION=\\"0.9.1\\"',
+        '-D NGRAPH_VERSION=\\"0.11.0\\"',
         "-D NGRAPH_DEX_ONLY",
         '-D PROJECT_ROOT_DIR=\\"\\"',
     ],
@@ -124,11 +132,13 @@ cc_library(
         "src/ngraph/builder/*.cpp",
         "src/ngraph/descriptor/*.cpp",
         "src/ngraph/descriptor/layout/*.cpp",
+        "src/ngraph/op/experimental/generate_mask.cpp",
         "src/ngraph/op/experimental/quantized_avg_pool.cpp",
         "src/ngraph/op/experimental/quantized_conv_bias.cpp",
         "src/ngraph/op/experimental/quantized_conv_relu.cpp",
         "src/ngraph/op/experimental/quantized_conv.cpp",
         "src/ngraph/op/experimental/quantized_max_pool.cpp",
+        "src/ngraph/op/experimental/shape_of.cpp",
         "src/ngraph/op/*.cpp",
         "src/ngraph/op/util/*.cpp",
         "src/ngraph/pattern/*.cpp",
@@ -142,7 +152,7 @@ cc_library(
         "-I external/ngraph/src",
         "-I external/nlohmann_json_lib/include/",
         '-D SHARED_LIB_EXT=\\".so\\"',
-        '-D NGRAPH_VERSION=\\"0.9.1\\"',
+        '-D NGRAPH_VERSION=\\"0.11.0\\"',
         '-D PROJECT_ROOT_DIR=\\"\\"',
     ],
     visibility = ["//visibility:public"],
diff --git a/third_party/ngraph/ngraph_tf.BUILD b/third_party/ngraph/ngraph_tf.BUILD
index db9a66f9b5bcdaa29ec55175f1a8c76ac5f6f22a..6397e19e36aca5ea264a44ce5e92a1ca24ba46fc 100644
--- a/third_party/ngraph/ngraph_tf.BUILD
+++ b/third_party/ngraph/ngraph_tf.BUILD
@@ -18,6 +18,8 @@ cc_library(
         "src/ngraph_api.h",
         "src/ngraph_assign_clusters.cc",
         "src/ngraph_assign_clusters.h",
+        "src/ngraph_backend_manager.cc",
+        "src/ngraph_backend_manager.h",
         "src/ngraph_builder.cc",
         "src/ngraph_builder.h",
         "src/ngraph_capture_variables.cc",
diff --git a/third_party/systemlibs/protobuf.BUILD b/third_party/systemlibs/protobuf.BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..4b1cf396b9b7abef8feaa653c7c71e9e8a9e304e
--- /dev/null
+++ b/third_party/systemlibs/protobuf.BUILD
@@ -0,0 +1,104 @@
+load(
+    "@protobuf_archive//:protobuf.bzl",
+    "proto_gen",
+    "py_proto_library",
+    "cc_proto_library",
+)
+
+licenses(["notice"])
+
+filegroup(
+    name = "LICENSE",
+    visibility = ["//visibility:public"],
+)
+
+HEADERS = [
+    "google/protobuf/any.pb.h",
+    "google/protobuf/any.proto",
+    "google/protobuf/arena.h",
+    "google/protobuf/compiler/importer.h",
+    "google/protobuf/descriptor.h",
+    "google/protobuf/descriptor.pb.h",
+    "google/protobuf/descriptor.proto",
+    "google/protobuf/duration.pb.h",
+    "google/protobuf/duration.proto",
+    "google/protobuf/dynamic_message.h",
+    "google/protobuf/empty.pb.h",
+    "google/protobuf/empty.proto",
+    "google/protobuf/field_mask.pb.h",
+    "google/protobuf/field_mask.proto",
+    "google/protobuf/io/coded_stream.h",
+    "google/protobuf/io/zero_copy_stream.h",
+    "google/protobuf/io/zero_copy_stream_impl_lite.h",
+    "google/protobuf/map.h",
+    "google/protobuf/repeated_field.h",
+    "google/protobuf/text_format.h",
+    "google/protobuf/timestamp.pb.h",
+    "google/protobuf/timestamp.proto",
+    "google/protobuf/util/json_util.h",
+    "google/protobuf/util/type_resolver_util.h",
+    "google/protobuf/wrappers.pb.h",
+    "google/protobuf/wrappers.proto",
+]
+
+genrule(
+    name = "link_headers",
+    outs = HEADERS,
+    cmd = """
+      for i in $(OUTS); do
+        f=$${i#$(@D)/}
+        mkdir -p $(@D)/$${f%/*}
+        ln -sf $(INCLUDEDIR)/$$f $(@D)/$$f
+      done
+    """,
+)
+
+cc_library(
+    name = "protobuf",
+    hdrs = HEADERS,
+    linkopts = ["-lprotobuf"],
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "protobuf_headers",
+    hdrs = HEADERS,
+    linkopts = ["-lprotobuf"],
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "protoc_lib",
+    linkopts = ["-lprotoc"],
+    visibility = ["//visibility:public"],
+)
+
+genrule(
+    name = "protoc",
+    outs = ["protoc.bin"],
+    cmd = "ln -s $$(which protoc) $@",
+    executable = 1,
+    visibility = ["//visibility:public"],
+)
+
+cc_proto_library(
+    name = "cc_wkt_protos",
+    hdrs = HEADERS,
+    internal_bootstrap_hack = 1,
+    protoc = ":protoc",
+    visibility = ["//visibility:public"],
+)
+
+proto_gen(
+    name = "protobuf_python_genproto",
+    includes = ["."],
+    protoc = "@protobuf_archive//:protoc",
+    visibility = ["//visibility:public"],
+)
+
+py_library(
+    name = "protobuf_python",
+    data = [":link_headers"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+)
diff --git a/third_party/systemlibs/protobuf.bzl b/third_party/systemlibs/protobuf.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..2aa75610a9313d12daeb7406ea0107e53231e814
--- /dev/null
+++ b/third_party/systemlibs/protobuf.bzl
@@ -0,0 +1,425 @@
+def _GetPath(ctx, path):
+    if ctx.label.workspace_root:
+        return ctx.label.workspace_root + "/" + path
+    else:
+        return path
+
+def _IsNewExternal(ctx):
+    # Bazel 0.4.4 and older have genfiles paths that look like:
+    #   bazel-out/local-fastbuild/genfiles/external/repo/foo
+    # After the exec root rearrangement, they look like:
+    #   ../repo/bazel-out/local-fastbuild/genfiles/foo
+    return ctx.label.workspace_root.startswith("../")
+
+def _GenDir(ctx):
+    if _IsNewExternal(ctx):
+        # We are using the fact that Bazel 0.4.4+ provides repository-relative paths
+        # for ctx.genfiles_dir.
+        return ctx.genfiles_dir.path + (
+            "/" + ctx.attr.includes[0] if ctx.attr.includes and ctx.attr.includes[0] else ""
+        )
+
+    # This means that we're either in the old version OR the new version in the local repo.
+    # Either way, appending the source path to the genfiles dir works.
+    return ctx.var["GENDIR"] + "/" + _SourceDir(ctx)
+
+def _SourceDir(ctx):
+    if not ctx.attr.includes:
+        return ctx.label.workspace_root
+    if not ctx.attr.includes[0]:
+        return _GetPath(ctx, ctx.label.package)
+    if not ctx.label.package:
+        return _GetPath(ctx, ctx.attr.includes[0])
+    return _GetPath(ctx, ctx.label.package + "/" + ctx.attr.includes[0])
+
+def _CcHdrs(srcs, use_grpc_plugin = False):
+    ret = [s[:-len(".proto")] + ".pb.h" for s in srcs]
+    if use_grpc_plugin:
+        ret += [s[:-len(".proto")] + ".grpc.pb.h" for s in srcs]
+    return ret
+
+def _CcSrcs(srcs, use_grpc_plugin = False):
+    ret = [s[:-len(".proto")] + ".pb.cc" for s in srcs]
+    if use_grpc_plugin:
+        ret += [s[:-len(".proto")] + ".grpc.pb.cc" for s in srcs]
+    return ret
+
+def _CcOuts(srcs, use_grpc_plugin = False):
+    return _CcHdrs(srcs, use_grpc_plugin) + _CcSrcs(srcs, use_grpc_plugin)
+
+def _PyOuts(srcs, use_grpc_plugin = False):
+    ret = [s[:-len(".proto")] + "_pb2.py" for s in srcs]
+    if use_grpc_plugin:
+        ret += [s[:-len(".proto")] + "_pb2_grpc.py" for s in srcs]
+    return ret
+
+def _RelativeOutputPath(path, include, dest = ""):
+    if include == None:
+        return path
+
+    if not path.startswith(include):
+        fail("Include path %s isn't part of the path %s." % (include, path))
+
+    if include and include[-1] != "/":
+        include = include + "/"
+    if dest and dest[-1] != "/":
+        dest = dest + "/"
+
+    path = path[len(include):]
+    return dest + path
+
+def _proto_gen_impl(ctx):
+    """General implementation for generating protos"""
+    srcs = ctx.files.srcs
+    deps = []
+    deps += ctx.files.srcs
+    source_dir = _SourceDir(ctx)
+    gen_dir = _GenDir(ctx)
+    if source_dir:
+        import_flags = ["-I" + source_dir, "-I" + gen_dir]
+    else:
+        import_flags = ["-I."]
+
+    for dep in ctx.attr.deps:
+        import_flags += dep.proto.import_flags
+        deps += dep.proto.deps
+
+    args = []
+    if ctx.attr.gen_cc:
+        args += ["--cpp_out=" + gen_dir]
+    if ctx.attr.gen_py:
+        args += ["--python_out=" + gen_dir]
+
+    inputs = srcs + deps
+    if ctx.executable.plugin:
+        plugin = ctx.executable.plugin
+        lang = ctx.attr.plugin_language
+        if not lang and plugin.basename.startswith("protoc-gen-"):
+            lang = plugin.basename[len("protoc-gen-"):]
+        if not lang:
+            fail("cannot infer the target language of plugin", "plugin_language")
+
+        outdir = gen_dir
+        if ctx.attr.plugin_options:
+            outdir = ",".join(ctx.attr.plugin_options) + ":" + outdir
+        args += ["--plugin=protoc-gen-%s=%s" % (lang, plugin.path)]
+        args += ["--%s_out=%s" % (lang, outdir)]
+        inputs += [plugin]
+
+    if args:
+        ctx.action(
+            inputs = inputs,
+            outputs = ctx.outputs.outs,
+            arguments = args + import_flags + [s.path for s in srcs],
+            executable = ctx.executable.protoc,
+            mnemonic = "ProtoCompile",
+            use_default_shell_env = True,
+        )
+
+    return struct(
+        proto = struct(
+            srcs = srcs,
+            import_flags = import_flags,
+            deps = deps,
+        ),
+    )
+
+proto_gen = rule(
+    attrs = {
+        "srcs": attr.label_list(allow_files = True),
+        "deps": attr.label_list(providers = ["proto"]),
+        "includes": attr.string_list(),
+        "protoc": attr.label(
+            cfg = "host",
+            executable = True,
+            single_file = True,
+            mandatory = True,
+        ),
+        "plugin": attr.label(
+            cfg = "host",
+            allow_files = True,
+            executable = True,
+        ),
+        "plugin_language": attr.string(),
+        "plugin_options": attr.string_list(),
+        "gen_cc": attr.bool(),
+        "gen_py": attr.bool(),
+        "outs": attr.output_list(),
+    },
+    output_to_genfiles = True,
+    implementation = _proto_gen_impl,
+)
+"""Generates codes from Protocol Buffers definitions.
+
+This rule helps you to implement Skylark macros specific to the target
+language. You should prefer more specific `cc_proto_library `,
+`py_proto_library` and others unless you are adding such wrapper macros.
+
+Args:
+  srcs: Protocol Buffers definition files (.proto) to run the protocol compiler
+    against.
+  deps: a list of dependency labels; must be other proto libraries.
+  includes: a list of include paths to .proto files.
+  protoc: the label of the protocol compiler to generate the sources.
+  plugin: the label of the protocol compiler plugin to be passed to the protocol
+    compiler.
+  plugin_language: the language of the generated sources
+  plugin_options: a list of options to be passed to the plugin
+  gen_cc: generates C++ sources in addition to the ones from the plugin.
+  gen_py: generates Python sources in addition to the ones from the plugin.
+  outs: a list of labels of the expected outputs from the protocol compiler.
+"""
+
+def cc_proto_library(
+        name,
+        srcs = [],
+        deps = [],
+        cc_libs = [],
+        include = None,
+        protoc = "@com_google_protobuf//:protoc",
+        internal_bootstrap_hack = False,
+        use_grpc_plugin = False,
+        default_runtime = "@com_google_protobuf//:protobuf",
+        **kargs):
+    """Bazel rule to create a C++ protobuf library from proto source files
+
+    NOTE: the rule is only an internal workaround to generate protos. The
+    interface may change and the rule may be removed when bazel has introduced
+    the native rule.
+
+    Args:
+      name: the name of the cc_proto_library.
+      srcs: the .proto files of the cc_proto_library.
+      deps: a list of dependency labels; must be cc_proto_library.
+      cc_libs: a list of other cc_library targets depended by the generated
+          cc_library.
+      include: a string indicating the include path of the .proto files.
+      protoc: the label of the protocol compiler to generate the sources.
+      internal_bootstrap_hack: a flag indicate the cc_proto_library is used only
+          for bootstraping. When it is set to True, no files will be generated.
+          The rule will simply be a provider for .proto files, so that other
+          cc_proto_library can depend on it.
+      use_grpc_plugin: a flag to indicate whether to call the grpc C++ plugin
+          when processing the proto files.
+      default_runtime: the implicitly default runtime which will be depended on by
+          the generated cc_library target.
+      **kargs: other keyword arguments that are passed to cc_library.
+
+    """
+
+    includes = []
+    if include != None:
+        includes = [include]
+
+    if internal_bootstrap_hack:
+        # For pre-checked-in generated files, we add the internal_bootstrap_hack
+        # which will skip the codegen action.
+        proto_gen(
+            name = name + "_genproto",
+            srcs = srcs,
+            deps = [s + "_genproto" for s in deps],
+            includes = includes,
+            protoc = protoc,
+            visibility = ["//visibility:public"],
+        )
+
+        # An empty cc_library to make rule dependency consistent.
+        native.cc_library(
+            name = name,
+            **kargs
+        )
+        return
+
+    grpc_cpp_plugin = None
+    if use_grpc_plugin:
+        grpc_cpp_plugin = "//external:grpc_cpp_plugin"
+
+    gen_srcs = _CcSrcs(srcs, use_grpc_plugin)
+    gen_hdrs = _CcHdrs(srcs, use_grpc_plugin)
+    outs = gen_srcs + gen_hdrs
+
+    proto_gen(
+        name = name + "_genproto",
+        srcs = srcs,
+        deps = [s + "_genproto" for s in deps],
+        includes = includes,
+        protoc = protoc,
+        plugin = grpc_cpp_plugin,
+        plugin_language = "grpc",
+        gen_cc = 1,
+        outs = outs,
+        visibility = ["//visibility:public"],
+    )
+
+    if default_runtime and not default_runtime in cc_libs:
+        cc_libs = cc_libs + [default_runtime]
+    if use_grpc_plugin:
+        cc_libs = cc_libs + ["//external:grpc_lib"]
+
+    native.cc_library(
+        name = name,
+        srcs = gen_srcs,
+        hdrs = gen_hdrs,
+        deps = cc_libs + deps,
+        includes = includes,
+        **kargs
+    )
+
+def internal_gen_well_known_protos_java(srcs):
+    """Bazel rule to generate the gen_well_known_protos_java genrule
+
+    Args:
+      srcs: the well known protos
+    """
+    root = Label("%s//protobuf_java" % (REPOSITORY_NAME)).workspace_root
+    pkg = PACKAGE_NAME + "/" if PACKAGE_NAME else ""
+    if root == "":
+        include = " -I%ssrc " % pkg
+    else:
+        include = " -I%s/%ssrc " % (root, pkg)
+    native.genrule(
+        name = "gen_well_known_protos_java",
+        srcs = srcs,
+        outs = [
+            "wellknown.srcjar",
+        ],
+        cmd = "$(location :protoc) --java_out=$(@D)/wellknown.jar" +
+              " %s $(SRCS) " % include +
+              " && mv $(@D)/wellknown.jar $(@D)/wellknown.srcjar",
+        tools = [":protoc"],
+    )
+
+def internal_copied_filegroup(name, srcs, strip_prefix, dest, **kwargs):
+    """Macro to copy files to a different directory and then create a filegroup.
+
+    This is used by the //:protobuf_python py_proto_library target to work around
+    an issue caused by Python source files that are part of the same Python
+    package being in separate directories.
+
+    Args:
+      srcs: The source files to copy and add to the filegroup.
+      strip_prefix: Path to the root of the files to copy.
+      dest: The directory to copy the source files into.
+      **kwargs: extra arguments that will be passesd to the filegroup.
+    """
+    outs = [_RelativeOutputPath(s, strip_prefix, dest) for s in srcs]
+
+    native.genrule(
+        name = name + "_genrule",
+        srcs = srcs,
+        outs = outs,
+        cmd = " && ".join(
+            ["cp $(location %s) $(location %s)" %
+             (s, _RelativeOutputPath(s, strip_prefix, dest)) for s in srcs],
+        ),
+    )
+
+    native.filegroup(
+        name = name,
+        srcs = outs,
+        **kwargs
+    )
+
+def py_proto_library(
+        name,
+        srcs = [],
+        deps = [],
+        py_libs = [],
+        py_extra_srcs = [],
+        include = None,
+        default_runtime = "@com_google_protobuf//:protobuf_python",
+        protoc = "@com_google_protobuf//:protoc",
+        use_grpc_plugin = False,
+        **kargs):
+    """Bazel rule to create a Python protobuf library from proto source files
+
+    NOTE: the rule is only an internal workaround to generate protos. The
+    interface may change and the rule may be removed when bazel has introduced
+    the native rule.
+
+    Args:
+      name: the name of the py_proto_library.
+      srcs: the .proto files of the py_proto_library.
+      deps: a list of dependency labels; must be py_proto_library.
+      py_libs: a list of other py_library targets depended by the generated
+          py_library.
+      py_extra_srcs: extra source files that will be added to the output
+          py_library. This attribute is used for internal bootstrapping.
+      include: a string indicating the include path of the .proto files.
+      default_runtime: the implicitly default runtime which will be depended on by
+          the generated py_library target.
+      protoc: the label of the protocol compiler to generate the sources.
+      use_grpc_plugin: a flag to indicate whether to call the Python C++ plugin
+          when processing the proto files.
+      **kargs: other keyword arguments that are passed to cc_library.
+
+    """
+    outs = _PyOuts(srcs, use_grpc_plugin)
+
+    includes = []
+    if include != None:
+        includes = [include]
+
+    grpc_python_plugin = None
+    if use_grpc_plugin:
+        grpc_python_plugin = "//external:grpc_python_plugin"
+        # Note: Generated grpc code depends on Python grpc module. This dependency
+        # is not explicitly listed in py_libs. Instead, host system is assumed to
+        # have grpc installed.
+
+    proto_gen(
+        name = name + "_genproto",
+        srcs = srcs,
+        deps = [s + "_genproto" for s in deps],
+        includes = includes,
+        protoc = protoc,
+        gen_py = 1,
+        outs = outs,
+        visibility = ["//visibility:public"],
+        plugin = grpc_python_plugin,
+        plugin_language = "grpc",
+    )
+
+    if default_runtime and not default_runtime in py_libs + deps:
+        py_libs = py_libs + [default_runtime]
+
+    native.py_library(
+        name = name,
+        srcs = outs + py_extra_srcs,
+        deps = py_libs + deps,
+        imports = includes,
+        **kargs
+    )
+
+def internal_protobuf_py_tests(
+        name,
+        modules = [],
+        **kargs):
+    """Bazel rules to create batch tests for protobuf internal.
+
+    Args:
+      name: the name of the rule.
+      modules: a list of modules for tests. The macro will create a py_test for
+          each of the parameter with the source "google/protobuf/%s.py"
+      kargs: extra parameters that will be passed into the py_test.
+
+    """
+    for m in modules:
+        s = "python/google/protobuf/internal/%s.py" % m
+        native.py_test(
+            name = "py_%s" % m,
+            srcs = [s],
+            main = s,
+            **kargs
+        )
+
+def check_protobuf_required_bazel_version():
+    """For WORKSPACE files, to check the installed version of bazel.
+
+    This ensures bazel supports our approach to proto_library() depending on a
+    copied filegroup. (Fixed in bazel 0.5.4)
+    """
+    expected = apple_common.dotted_version("0.5.4")
+    current = apple_common.dotted_version(native.bazel_version)
+    if current.compare_to(expected) < 0:
+        fail("Bazel must be newer than 0.5.4")
diff --git a/third_party/systemlibs/syslibs_configure.bzl b/third_party/systemlibs/syslibs_configure.bzl
index dbf4fd6e32fe3ac91d2f553cac4176ca6c21961f..1b971eca8ad342063106de904b624b3e3a9a7706 100644
--- a/third_party/systemlibs/syslibs_configure.bzl
+++ b/third_party/systemlibs/syslibs_configure.bzl
@@ -15,6 +15,8 @@ VALID_LIBS = [
     "boringssl",
     "com_github_googleapis_googleapis",
     "com_github_googlecloudplatform_google_cloud_cpp",
+    "com_google_protobuf",
+    "com_google_protobuf_cc",
     "com_googlesource_code_re2",
     "curl",
     "cython",
@@ -26,12 +28,14 @@ VALID_LIBS = [
     "icu",
     "jpeg",
     "jsoncpp_git",
+    "keras_applications_archive",
     "lmdb",
     "nasm",
     "nsync",
     "org_sqlite",
     "pcre",
     "png_archive",
+    "protobuf_archive",
     "six_archive",
     "snappy",
     "swig",
diff --git a/third_party/tensorrt/remote.BUILD.tpl b/third_party/tensorrt/remote.BUILD.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..7598e7aa4bb32702307fe073a33903184b2dc70e
--- /dev/null
+++ b/third_party/tensorrt/remote.BUILD.tpl
@@ -0,0 +1,7 @@
+licenses(["notice"])  # Apache 2.0
+
+package(default_visibility = ["//visibility:public"])
+
+alias(name="LICENSE", actual = "%{target}:LICENSE")
+alias(name = "tensorrt_headers", actual = "%{target}:tensorrt_headers")
+alias(name = "nv_infer", actual = "%{target}:nv_infer")
diff --git a/third_party/tensorrt/tensorrt_configure.bzl b/third_party/tensorrt/tensorrt_configure.bzl
index 9b946505a615372aa7de317c8ee390a2cd4b60e9..77ee6622d17c77c4c55e4bcb6a645e8598e6497b 100644
--- a/third_party/tensorrt/tensorrt_configure.bzl
+++ b/third_party/tensorrt/tensorrt_configure.bzl
@@ -17,6 +17,7 @@ load(
 )
 
 _TENSORRT_INSTALL_PATH = "TENSORRT_INSTALL_PATH"
+_TF_TENSORRT_CONFIG_REPO = "TF_TENSORRT_CONFIG_REPO"
 _TF_TENSORRT_VERSION = "TF_TENSORRT_VERSION"
 
 _TF_TENSORRT_LIBS = ["nvinfer"]
@@ -154,6 +155,15 @@ def _create_dummy_repository(repository_ctx):
 
 def _tensorrt_configure_impl(repository_ctx):
   """Implementation of the tensorrt_configure repository rule."""
+  if _TF_TENSORRT_CONFIG_REPO in repository_ctx.os.environ:
+    # Forward to the pre-configured remote repository.
+    repository_ctx.template("BUILD", Label("//third_party/tensorrt:remote.BUILD.tpl"), {
+        "%{target}": repository_ctx.os.environ[_TF_TENSORRT_CONFIG_REPO],
+    })
+    # Set up config file.
+    _tpl(repository_ctx, "build_defs.bzl", {"%{tensorrt_is_configured}": "True"})
+    return
+
   if _TENSORRT_INSTALL_PATH not in repository_ctx.os.environ:
     _create_dummy_repository(repository_ctx)
     return
diff --git a/third_party/tflite_mobilenet_float.BUILD b/third_party/tflite_mobilenet_float.BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..de47ed61f9db9ad980468aa325e3c770e0aae4f1
--- /dev/null
+++ b/third_party/tflite_mobilenet_float.BUILD
@@ -0,0 +1,12 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(
+    glob(
+        ["**/*"],
+        exclude = [
+            "BUILD",
+        ],
+    ),
+)
diff --git a/third_party/tflite_mobilenet_quant.BUILD b/third_party/tflite_mobilenet_quant.BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..de47ed61f9db9ad980468aa325e3c770e0aae4f1
--- /dev/null
+++ b/third_party/tflite_mobilenet_quant.BUILD
@@ -0,0 +1,12 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(
+    glob(
+        ["**/*"],
+        exclude = [
+            "BUILD",
+        ],
+    ),
+)
diff --git a/third_party/toolchains/preconfig/generate/BUILD b/third_party/toolchains/preconfig/generate/BUILD
index 7e3e93d6004894029135f3151a282bcc43b8938f..b4c98dc94de7a0368efbce712e8a3b48c49f7841 100644
--- a/third_party/toolchains/preconfig/generate/BUILD
+++ b/third_party/toolchains/preconfig/generate/BUILD
@@ -3,33 +3,37 @@ licenses(["restricted"])
 load(":generate.bzl", "tensorflow_rbe_config")
 
 tensorflow_rbe_config(
-    name = "ubuntu14.04-py3-gcc-cuda9.0-cudnn7-nccl2",
+    name = "ubuntu14.04-py3-gcc-cuda9.0-cudnn7-tensorrt5",
     compiler = "gcc",
     cuda_version = "9.0",
     cudnn_version = "7",
     python_version = "3",
+    tensorrt_version = "5",
 )
 
 tensorflow_rbe_config(
-    name = "ubuntu14.04-py3-clang-cuda9.0-cudnn7-nccl2",
+    name = "ubuntu14.04-py3-clang-cuda9.0-cudnn7-tensorrt5",
     compiler = "clang",
     cuda_version = "9.0",
     cudnn_version = "7",
     python_version = "3",
+    tensorrt_version = "5",
 )
 
 tensorflow_rbe_config(
-    name = "ubuntu14.04-py3-gcc-cuda10.0-cudnn7-nccl2",
+    name = "ubuntu14.04-py3-gcc-cuda10.0-cudnn7-tensorrt5",
     compiler = "gcc",
     cuda_version = "10.0",
     cudnn_version = "7",
     python_version = "3",
+    tensorrt_version = "5",
 )
 
 tensorflow_rbe_config(
-    name = "ubuntu14.04-py3-clang-cuda10.0-cudnn7-nccl2",
+    name = "ubuntu14.04-py3-clang-cuda10.0-cudnn7-tensorrt5",
     compiler = "clang",
     cuda_version = "10.0",
     cudnn_version = "7",
     python_version = "3",
+    tensorrt_version = "5",
 )
diff --git a/third_party/toolchains/preconfig/generate/archives.bzl b/third_party/toolchains/preconfig/generate/archives.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..0850893589ba428c42a5faee9546686f049a46cf
--- /dev/null
+++ b/third_party/toolchains/preconfig/generate/archives.bzl
@@ -0,0 +1,27 @@
+load("//tensorflow:version_check.bzl", "parse_bazel_version")
+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+
+def bazel_toolchains_archive():
+    # Not all bazel versions have set native.bazel_version - if it is not set,
+    # fall back to the more compatible version of the toolchains archive.
+    if native.bazel_version and parse_bazel_version(native.bazel_version) >= parse_bazel_version("0.19"):
+        # This version of the toolchains repo is incompatible with older bazel
+        # versions - we can remove this once TensorFlow drops support for bazel
+        # before 0.19.
+        http_archive(
+            name = "bazel_toolchains",
+            sha256 = "41c48a189be489e2d15dec40e0057ea15b95ee5b39cc2a7e6cf663e31432c75e",
+            strip_prefix = "bazel-toolchains-3f8c58fe530fedc446de04673bc1e32985887dea",
+            urls = [
+                "https://github.com/nlopezgi/bazel-toolchains/archive/3f8c58fe530fedc446de04673bc1e32985887dea.tar.gz",
+            ],
+        )
+    else:
+        http_archive(
+            name = "bazel_toolchains",
+            sha256 = "15b5858b1b5541ec44df31b94c3b8672815b31d71215a98398761ea9f4c4eedb",
+            strip_prefix = "bazel-toolchains-6200b238c9c2d137c0d9a7262c80cc71d98e692b",
+            urls = [
+                "https://github.com/bazelbuild/bazel-toolchains/archive/6200b238c9c2d137c0d9a7262c80cc71d98e692b.tar.gz",
+            ],
+        )
diff --git a/third_party/toolchains/preconfig/generate/containers.bzl b/third_party/toolchains/preconfig/generate/containers.bzl
index 0309b8ffca19720e7912410655b4194ae257c672..c56c6f3346ac64d516fa08f02ba9a206571a35e3 100644
--- a/third_party/toolchains/preconfig/generate/containers.bzl
+++ b/third_party/toolchains/preconfig/generate/containers.bzl
@@ -1,4 +1,4 @@
 container_digests = {
-    "cuda9.0-cudnn7-ubuntu14.04": "sha256:c26138f4c38c754da2bad44a8a068523abf7fbd71d58a57ce92e5342c5431bf5",
-    "cuda10.0-cudnn7-ubuntu14.04": "sha256:7737d770599de8435115bfdf56977002319316a6735ab081f82506cb51443f9d",
+    "cuda9.0-cudnn7-ubuntu14.04": "sha256:c43ed5341dd765042e0bbd1bf50fadeedd649d1e0c34d81999cb6ce30916cb95",
+    "cuda10.0-cudnn7-ubuntu14.04": "sha256:919e75247743ae1244d5d72ee9f18090379d4a9035e5853010f6d59d87cd2e8b",
 }
diff --git a/third_party/toolchains/preconfig/generate/generate.bzl b/third_party/toolchains/preconfig/generate/generate.bzl
index 2fb3a94cdca7430b522939266a4b2b398a65df8d..75deea41b819d0deaf35af71587322f41ff095c0 100644
--- a/third_party/toolchains/preconfig/generate/generate.bzl
+++ b/third_party/toolchains/preconfig/generate/generate.bzl
@@ -3,15 +3,15 @@ load(
     "docker_toolchain_autoconfig",
 )
 
-def _tensorflow_rbe_config(name, cuda_version, cudnn_version, python_version, compiler):
+def _tensorflow_rbe_config(name, cuda_version, cudnn_version, python_version, compiler, tensorrt_version):
     docker_toolchain_autoconfig(
         name = name,
         base = "@cuda%s-cudnn%s-ubuntu14.04//image" % (cuda_version, cudnn_version),
-        bazel_version = "0.16.1",
+        bazel_version = "0.19.2",
         config_repos = [
             "local_config_cuda",
             "local_config_python",
-            "local_config_nccl",
+            "local_config_tensorrt",
         ],
         env = {
             "ABI_VERSION": "gcc",
@@ -31,14 +31,12 @@ def _tensorflow_rbe_config(name, cuda_version, cudnn_version, python_version, co
             "TF_ENABLE_XLA": "1",
             "TF_CUDNN_VERSION": cudnn_version,
             "TF_CUDA_VERSION": cuda_version,
-            "NCCL_INSTALL_PATH": "/usr/lib",
-            "NCCL_HDR_PATH": "/usr/include",
-            "TF_NCCL_VERSION": "2",
             "CUDNN_INSTALL_PATH": "/usr/lib/x86_64-linux-gnu",
+            "TF_NEED_TENSORRT" : "1",
+            "TF_TENSORRT_VERSION": tensorrt_version,
+            "TENSORRT_INSTALL_PATH": "/usr/lib/x86_64-linux-gnu",
         },
-        # TODO(klimek): We should use the sources that we currently work on, not
-        # just the latest snapshot of tensorflow that is checked in.
-        git_repo = "https://github.com/tensorflow/tensorflow",
+        mount_project = "$(mount_project)",
         tags = ["manual"],
         incompatible_changes_off = True,
     )
diff --git a/third_party/toolchains/preconfig/generate/generate.sh b/third_party/toolchains/preconfig/generate/generate.sh
index 37c5211278abf243ab388d83688e6c8c7888cea3..79407d59ac28cb9355f0f05360cf99908d95f89b 100755
--- a/third_party/toolchains/preconfig/generate/generate.sh
+++ b/third_party/toolchains/preconfig/generate/generate.sh
@@ -33,7 +33,9 @@ PY_VERSION="${PLATFORM[1]}"
 COMPILER="${PLATFORM[2]}"
 CUDA_VERSION="${PLATFORM[3]}"
 CUDNN_VERSION="${PLATFORM[4]}"
-NCCL_VERSION="${PLATFORM[5]}"
+TENSORRT_VERSION="${PLATFORM[5]}"
+
+# TODO(klimek): Put this into the name.
 
 if [[ "${COMPILER}" == "gcc" ]]; then
   COMPILER="gcc-nvcc-${CUDA_VERSION}"
@@ -44,9 +46,9 @@ echo "Python: ${PY_VERSION}"
 echo "Compiler: ${COMPILER}"
 echo "CUDA: ${CUDA_VERSION}"
 echo "CUDNN: ${CUDNN_VERSION}"
-echo "NCCL: ${NCCL_VERSION}"
+echo "TensorRT: ${TENSORRT_VERSION}"
 
-bazel build "${PKG}/generate:${TARGET}"
+bazel build --define=mount_project="${PWD}" "${PKG}/generate:${TARGET}"
 cd "${TEMPDIR}"
 tar xvf "${ROOT}/bazel-bin/${PKG}/generate/${TARGET}_outputs.tar"
 
@@ -58,8 +60,8 @@ find . -empty -delete
 # <OS>/
 #   <CUDA>-<CUDNN>/
 #   <COMPILER>/
-#   <NCCL>/
 #   <PYTHON>/
+#   <TENSORRT>/
 
 # Create our toplevel output directory for the OS.
 mkdir "${OS}"
@@ -67,15 +69,15 @@ mkdir "${OS}"
 # Python:
 mv local_config_python "${OS}/${PY_VERSION}"
 
-# NCCL:
-mv local_config_nccl "${OS}/${NCCL_VERSION}"
-
 # Compiler:
 mv local_config_cuda/crosstool "${OS}/${COMPILER}"
 
 # CUDA:
 mv local_config_cuda "${OS}/${CUDA_VERSION}-${CUDNN_VERSION}"
 
+# TensorRT:
+mv local_config_tensorrt "${OS}/${TENSORRT_VERSION}"
+
 # Cleanup for copybara.
 find "${OS}" -name 'BUILD' -o -name '*.bzl' |xargs buildifier
 find "${OS}" -name 'BUILD' -o -name '*.bzl' |xargs -I {} mv {} {}.oss
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/BUILD b/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/BUILD
new file mode 100755
index 0000000000000000000000000000000000000000..399d7c1463ae8524ba6ff3d57a3eed8b4e1cc031
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/BUILD
@@ -0,0 +1,56 @@
+# NVIDIA TensorRT
+# A high-performance deep learning inference optimizer and runtime.
+
+licenses(["notice"])
+
+exports_files(["LICENSE"])
+
+load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts")
+
+package(default_visibility = ["//visibility:public"])
+
+cc_library(
+    name = "tensorrt_headers",
+    hdrs = [":tensorrt_include"],
+    includes = [
+        "include",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "nv_infer",
+    srcs = ["tensorrt/lib/libnvinfer.so.5"],
+    copts = cuda_default_copts(),
+    data = ["tensorrt/lib/libnvinfer.so.5"],
+    includes = [
+        "include",
+    ],
+    linkstatic = 1,
+    visibility = ["//visibility:public"],
+    deps = [
+        ":tensorrt_headers",
+        "@local_config_cuda//cuda",
+    ],
+)
+
+genrule(
+    name = "tensorrt_lib",
+    outs = [
+        "tensorrt/lib/libnvinfer.so.5",
+    ],
+    cmd = """
+if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp -f "/usr/lib/x86_64-linux-gnu/libnvinfer.so.5.0.2" "$(@D)/libnvinfer.so.5"
+   """,
+)
+
+genrule(
+    name = "tensorrt_include",
+    outs = [
+        "tensorrt/include/NvInfer.h",
+        "tensorrt/include/NvUtils.h",
+    ],
+    cmd = """
+if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp -f "/usr/include/x86_64-linux-gnu/NvInfer.h" "$(@D)/tensorrt/include/NvInfer.h" && cp -f "/usr/include/x86_64-linux-gnu/NvUtils.h" "$(@D)/tensorrt/include/NvUtils.h"
+   """,
+)
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/WORKSPACE b/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/WORKSPACE
new file mode 100644
index 0000000000000000000000000000000000000000..ce47f14b91bf5249f9face7e486cde60b9d2d669
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/WORKSPACE
@@ -0,0 +1,2 @@
+# DO NOT EDIT: automatically generated WORKSPACE file for tensorrt_configure rule
+workspace(name = "local_config_tensorrt")
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/build_defs.bzl b/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/build_defs.bzl
new file mode 100755
index 0000000000000000000000000000000000000000..5c1c40361da2a20f4c504ec066784a615c454d12
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/build_defs.bzl
@@ -0,0 +1,7 @@
+# Build configurations for TensorRT.
+
+def if_tensorrt(if_true, if_false = []):
+    """Tests whether TensorRT was enabled during the configure process."""
+    if True:
+        return if_true
+    return if_false
diff --git a/third_party/toolchains/preconfig/win_1803/bazel_018/BUILD b/third_party/toolchains/preconfig/win_1803/bazel_018/BUILD
index c00f005e46cb727265886e98313c790875a85089..edd958364811d2e063b10f3c2e3a347b601794b5 100644
--- a/third_party/toolchains/preconfig/win_1803/bazel_018/BUILD
+++ b/third_party/toolchains/preconfig/win_1803/bazel_018/BUILD
@@ -39,6 +39,9 @@ cc_toolchain_suite(
         "x64_windows|msvc-cl": ":cc-compiler-x64_windows",
         "x64_windows|msys-gcc": ":cc-compiler-x64_windows_msys",
         "x64_windows|mingw-gcc": ":cc-compiler-x64_windows_mingw",
+        "x64_windows_msys": ":cc-compiler-x64_windows_msys",
+        "x64_windows": ":cc-compiler-x64_windows",
+        "armeabi-v7a": ":cc-compiler-armeabi-v7a",
     },
 )
 
@@ -54,6 +57,7 @@ cc_toolchain(
     static_runtime_libs = [":empty"],
     strip_files = ":empty",
     supports_param_files = 1,
+    toolchain_identifier = "msys_x64",
 )
 
 toolchain(
@@ -83,6 +87,7 @@ cc_toolchain(
     static_runtime_libs = [":empty"],
     strip_files = ":empty",
     supports_param_files = 0,
+    toolchain_identifier = "msys_x64_mingw",
 )
 
 toolchain(
@@ -112,6 +117,7 @@ cc_toolchain(
     static_runtime_libs = [":empty"],
     strip_files = ":empty",
     supports_param_files = 1,
+    toolchain_identifier = "msvc_x64",
 )
 
 toolchain(
@@ -140,6 +146,7 @@ cc_toolchain(
     static_runtime_libs = [":empty"],
     strip_files = ":empty",
     supports_param_files = 1,
+    toolchain_identifier = "stub_armeabi-v7a",
 )
 
 toolchain(
diff --git a/third_party/toolchains/preconfig/win_1803/bazel_018/CROSSTOOL b/third_party/toolchains/preconfig/win_1803/bazel_018/CROSSTOOL
index 04c8bcae456ad71e961a2a2f7dfa05875f666260..38a80c22da32de50a98b78da6e157db936d03040 100644
--- a/third_party/toolchains/preconfig/win_1803/bazel_018/CROSSTOOL
+++ b/third_party/toolchains/preconfig/win_1803/bazel_018/CROSSTOOL
@@ -14,42 +14,6 @@
 
 major_version: "local"
 minor_version: ""
-default_target_cpu: "same_as_host"
-
-default_toolchain {
-  cpu: "x64_windows"
-  toolchain_identifier: "msvc_x64"
-}
-
-default_toolchain {
-  cpu: "local"
-  toolchain_identifier: "stub_armeabi-v7a"
-}
-
-default_toolchain {
-  cpu: "armeabi-v7a"
-  toolchain_identifier: "stub_armeabi-v7a"
-}
-
-default_toolchain {
-  cpu: "x64_windows"
-  toolchain_identifier: "msvc_x64"
-}
-
-default_toolchain {
-  cpu: "x64_windows_msvc"
-  toolchain_identifier: "msvc_x64"
-}
-
-default_toolchain {
-  cpu: "x64_windows_msys"
-  toolchain_identifier: "msys_x64"
-}
-
-default_toolchain {
-  cpu: "s390x"
-  toolchain_identifier: "msys_x64"
-}
 
 # Android tooling requires a default toolchain for the armeabi-v7a cpu.
 toolchain {